Fixed whoosh index creation to work with 0.2.x. #15
[zzz-pokedex.git] / pokedex / lookup.py
1 # encoding: utf8
2 import os, os.path
3 import re
4
5 from sqlalchemy.sql import func
6 import whoosh
7 import whoosh.filedb.filestore
8 import whoosh.filedb.fileindex
9 import whoosh.index
10 from whoosh.qparser import QueryParser
11 import whoosh.spelling
12
13 import pokedex.db.tables as tables
14
# Maps each indexed table's name to its mapped class.  Index documents only
# carry the table name, so this is how a search hit is turned back into the
# class to query.
indexed_tables = {}
for cls in (
    tables.Ability,
    tables.Item,
    tables.Move,
    tables.Pokemon,
    tables.Type,
):
    indexed_tables[cls.__tablename__] = cls
27
# Additional keys to index rows under, per table class, on top of the row's
# name.  Each entry is a function that takes a row and returns one more key;
# e.g. a Pokémon can also be looked up purely by number.
def _move_id_key(row):
    # e.g. u"move 5"
    return u"move %d" % row.id


def _pokemon_id_key(row):
    # The bare id, as a unicode string
    return unicode(row.id)


extra_keys = {
    tables.Move: [
        _move_id_key,
    ],
    tables.Pokemon: [
        _pokemon_id_key,
    ],
}
38
# Per-process cache of what get_index() builds.  Empty until the first
# successful call; afterwards holds the 'index', 'speller' and 'store' keys.
index_bits = {}


def _build_name_index(session, index_directory, schema):
    """Creates an index in `index_directory` and adds every indexed name.

    Returns (index, names), where `names` is the list of lowercased row names
    that were indexed, for the caller to feed to the spell-checker.
    """
    # Create the directory and tolerate it already existing, rather than
    # checking first: an exists() check followed by mkdir() can race with
    # another process doing the same thing.  Anything else is a real error.
    try:
        os.makedirs(index_directory)
    except OSError:
        if not os.path.isdir(index_directory):
            raise

    index = whoosh.index.create_in(index_directory, schema=schema)
    writer = index.writer()

    names = []
    for cls in indexed_tables.values():
        q = session.query(cls)

        # Only index base Pokémon formes
        if hasattr(cls, 'forme_base_pokemon_id'):
            q = q.filter_by(forme_base_pokemon_id=None)

        for row in q.yield_per(5):
            row_key = dict(table=cls.__tablename__, row_id=row.id)

            # The lowercased name goes into the regular index as well as the
            # spelling index: the speller does not return exact matches, so
            # without a regular row an exactly-typed name would match
            # neither way.
            name = row.name.lower()
            writer.add_document(name=name, **row_key)
            names.append(name)

            for extra_key_func in extra_keys.get(cls, []):
                writer.add_document(name=extra_key_func(row), **row_key)

    writer.commit()
    return index, names


def _patch_spellchecker_schema():
    """Makes SpellChecker schemas carry a dummy TEXT field, at most once."""
    # XXX This monkey-patches whoosh.  Presumably needed for the same reason
    # as the dummy field in the main schema (Whoosh 0.2 and schemas with no
    # TEXT columns) -- confirm before removing.
    current = whoosh.spelling.SpellChecker._schema

    # If a previous get_index() attempt patched the class and then failed,
    # nothing was cached and we get here again.  Wrapping a second time would
    # add 'dummy' to the schema twice, so leave an existing patch alone.
    if getattr(current, '_pokedex_dummy_field', False):
        return

    def schema_with_dummy(self):
        schema = current(self)
        schema.add('dummy', whoosh.fields.TEXT)
        return schema
    schema_with_dummy._pokedex_dummy_field = True

    whoosh.spelling.SpellChecker._schema = schema_with_dummy


def _build_speller(storage, words):
    """Creates a SpellChecker on `storage` and indexes every one of `words`."""
    speller = whoosh.spelling.SpellChecker(storage)

    # An analyzer depends only on the gram size, so build one per size up
    # front instead of once per word per size
    analyzers = [
        (size, whoosh.analysis.NgramAnalyzer(size))
        for size in xrange(speller.mingram, speller.maxgram + 1)
    ]

    # WARNING: HERE BE DRAGONS
    # whoosh.spelling refuses to index things that don't look like words.
    # Unfortunately, this doesn't work so well for Pokémon (Mr. Mime,
    # Porygon-Z, etc.), and attempts to work around it lead to further
    # complications.
    # The below is copied from SpellChecker.add_scored_words without the check
    # for isalpha().  XXX get whoosh patched to make this unnecessary!
    # Everything is added through one writer with a single commit at the end,
    # as every call to add_* does a commit(), and those seem to be expensive.
    writer = speller.index(create=True).writer()
    for word in words:
        fields = {"word": word, "score": 1}
        for size, analyzer in analyzers:
            gramlist = [t.text for t in analyzer(word)]
            if gramlist:
                fields["start%s" % size] = gramlist[0]
                fields["end%s" % size] = gramlist[-1]
                fields["gram%s" % size] = " ".join(gramlist)
        writer.add_document(**fields)
    writer.commit()
    # end copy-pasta

    return speller


def get_index(session):
    """Returns (index, speller).

    The first call in a process builds both from the database behind
    `session` and caches them in `index_bits`; later calls return the cached
    objects and do not touch `session`.
    """

    if index_bits:
        return index_bits['index'], index_bits['speller']

    # NOTE(review): this RamStorage is only ever stashed in index_bits; the
    # index itself lives in the directory below and the speller uses
    # index.storage.  Kept so index_bits['store'] is unchanged for anything
    # that reads it.
    store = whoosh.filedb.filestore.RamStorage()

    # NOTE(review): whoosh.fields (and whoosh.analysis, used for the speller)
    # are not among this file's explicit imports; presumably they get loaded
    # by the other whoosh imports -- confirm.
    schema = whoosh.fields.Schema(
        name=whoosh.fields.ID(stored=True),
        table=whoosh.fields.STORED,
        row_id=whoosh.fields.STORED,
        language=whoosh.fields.STORED,

        # Whoosh 0.2 explodes when using a file-stored schema with no TEXT
        # columns.  Appease it
        dummy=whoosh.fields.TEXT,
    )

    # Index every name in all our tables of interest
    index, speller_entries = _build_name_index(
        session, '/var/tmp/pokedex', schema)

    # The patch has to be in place before the speller creates its own index
    _patch_spellchecker_schema()
    speller = _build_speller(index.storage, speller_entries)

    # Only cache once everything has been built, so that a failure part-way
    # through leaves index_bits empty and the next call starts over
    index_bits['index'] = index
    index_bits['speller'] = speller
    index_bits['store'] = store
    return index_bits['index'], index_bits['speller']
132
def lookup(session, name, exact_only=False):
    """Attempts to find some sort of object, given a database session and name.

    Returns (objects, exact) where `objects` is a list of database objects, and
    `exact` is False only when the objects came from fuzzy matching.  If
    `exact_only` is true and nothing matches, the result is an empty list with
    `exact` still True.

    This function ONLY does fuzzy matching if there are no exact matches, and
    never when `exact_only` is true.

    Matching is case-insensitive: the name is lowercased before both the exact
    and the fuzzy search, since names are lowercased when indexed.

    Formes are not returned; "Shaymin" will return only grass Shaymin.

    Currently recognizes:
    - Names of anything in `indexed_tables` (abilities, items, moves, Pokémon,
      types): "Eevee"
    - The extra keys from `extra_keys`: a move as "move" plus its id
      ("move 5"), a Pokémon as its bare id ("133")
    """

    index, speller = get_index(session)

    # Everything in the index is lowercase, so normalize once and use the
    # same form for the exact match and for spelling suggestions
    name = name.lower()

    # Look for exact name.  A Term object does an exact match, so we don't have
    # to worry about a query parser tripping on weird characters in the input
    searcher = index.searcher()
    results = searcher.search(whoosh.query.Term('name', name))

    exact = True
    if not exact_only and not results:
        # Nothing exact; fall back to whatever the top few spelling
        # suggestions match
        exact = False
        results = []
        for suggestion in speller.suggest(name, 3):
            query = whoosh.query.Term('name', suggestion)
            results.extend(searcher.search(query))

    # Convert results to db objects, keeping the first hit for each row.  The
    # same row can turn up more than once, e.g. via several suggestions.
    objects = []
    seen = set()
    for result in results:
        seen_key = result['table'], result['row_id']
        if seen_key in seen:
            continue
        seen.add(seen_key)

        cls = indexed_tables[result['table']]
        objects.append(session.query(cls).get(result['row_id']))

    return objects, exact