cb6d5f60be0de11942fd9bcd140666e145d434d3
[zzz-pokedex.git] / pokedex / lookup.py
1 # encoding: utf8
2 from collections import namedtuple
3 import os, os.path
4 import pkg_resources
5 import re
6 import shutil
7
8 from sqlalchemy.sql import func
9 import whoosh
10 import whoosh.filedb.filestore
11 import whoosh.filedb.fileindex
12 import whoosh.index
13 from whoosh.qparser import QueryParser
14 import whoosh.scoring
15 import whoosh.spelling
16
17 from pokedex.db import connect
18 import pokedex.db.tables as tables
19 from pokedex.roomaji import romanize
20
21 __all__ = ['open_index', 'lookup']
22
# Maps table name => mapped table class for every table whose names we index.
# Index documents only store the table name, so this mapping is what lets us
# get back to the class after retrieving something from the index.
indexed_tables = dict(
    (table_class.__tablename__, table_class)
    for table_class in (
        tables.Ability,
        tables.Item,
        tables.Move,
        tables.Pokemon,
        tables.Type,
    )
)
35
def open_index(directory=None, session=None, recreate=False):
    """Opens the whoosh index stored in the named directory and returns (index,
    speller).  If the index doesn't already exist, it will be created.

    `directory`
        Directory containing the index.  Defaults to a location within the
        `pokedex` egg directory.

    `session`
        If the index needs to be created, this database session will be used.
        Defaults to an attempt to connect to the default SQLite database
        installed by `pokedex setup`.  No connection is attempted when an
        existing index can simply be opened.

    `recreate`
        If set to True, the whoosh index will be created even if it already
        exists.
    """

    # Defaults
    if not directory:
        directory = pkg_resources.resource_filename('pokedex',
                                                    'data/whoosh-index')

    # Attempt to open or create the index
    directory_exists = os.path.exists(directory)
    if directory_exists and not recreate:
        # Already exists; should be an index!
        try:
            index = whoosh.index.open_dir(directory, indexname='MAIN')
            spell_store = whoosh.filedb.filestore.FileStorage(directory)
            speller = whoosh.spelling.SpellChecker(spell_store)
            return index, speller
        except whoosh.index.EmptyIndexError:
            # Apparently not a real index.  Fall out of the if and create it
            pass

    # Delete and start over if we were asked to rebuild.
    if directory_exists and recreate:
        # Be safe and only delete if it looks like a whoosh index, i.e.,
        # everything starts with _
        if all(f.startswith('_') for f in os.listdir(directory)):
            shutil.rmtree(directory)
            directory_exists = False

    if not directory_exists:
        os.mkdir(directory)

    # Only now do we actually need the database, so only now connect.
    if not session:
        session = connect()

    ### Create index
    schema = whoosh.fields.Schema(
        name=whoosh.fields.ID(stored=True),
        table=whoosh.fields.ID(stored=True),
        row_id=whoosh.fields.ID(stored=True),
        language=whoosh.fields.STORED,
        display_name=whoosh.fields.STORED,  # non-lowercased name
        forme_name=whoosh.fields.ID,
    )

    index = whoosh.index.create_in(directory, schema=schema, indexname='MAIN')
    writer = index.writer()

    # speller_entries becomes a list of (word, score) tuples.  The scores
    # passed below are 1 for the default (English) name, 3 for other foreign
    # names, and 8 for Roomaji.
    # NOTE(review): this comment used to claim 2 for English, 1.5 for Roomaji
    # and 1 for everything else, i.e. the opposite ordering.  Confirm which
    # ordering is intended before relying on these scores.
    speller_entries = []

    def add_name(row_key, name, language, score):
        # Index one name for the row described by row_key, and remember it
        # for the spell checker.  Names are indexed lowercased; the original
        # spelling is kept in display_name.
        writer.add_document(name=name.lower(), display_name=name,
                            language=language,
                            **row_key)
        speller_entries.append((name.lower(), score))

    # Index every name in all our tables of interest
    for cls in indexed_tables.values():
        for row in session.query(cls).yield_per(5):
            # XXX need to give forme_name a dummy value because I can't search
            # for explicitly empty fields.  boo.
            row_key = dict(table=unicode(cls.__tablename__),
                           row_id=unicode(row.id),
                           forme_name=u'XXX')

            # If this is a form, mark it as such
            if getattr(row, 'forme_base_pokemon_id', None):
                row_key['forme_name'] = row.forme_name

            name = row.name
            add_name(row_key, name, None, 1)

            # Pokemon also get other languages
            for foreign_name in getattr(row, 'foreign_names', []):
                moonspeak = foreign_name.name
                if name == moonspeak:
                    # Don't add the English name again as a different language;
                    # no point and it makes spell results confusing
                    continue

                add_name(row_key, moonspeak, foreign_name.language.name, 3)

                # Add Roomaji too
                if foreign_name.language.name == 'Japanese':
                    roomaji = romanize(foreign_name.name)
                    add_name(row_key, roomaji, u'Roomaji', 8)

    writer.commit()

    # Construct and populate a spell-checker index.  Quicker to do it all
    # at once, as every call to add_* does a commit(), and those seem to be
    # expensive
    speller = whoosh.spelling.SpellChecker(index.storage)
    speller.add_scored_words(speller_entries)

    return index, speller
153
154
class LanguageWeighting(whoosh.scoring.Weighting):
    """A scoring class that forces otherwise-equal English results to come
    before foreign results.
    """

    def score(self, searcher, fieldnum, text, docnum, weight, QTF=1):
        """Returns `weight` scaled according to the stored `language` of the
        document `docnum`: unchanged for the default language, 0.95x for
        Roomaji, and 0.9x for anything else.
        """
        language = searcher.stored_fields(docnum)['language']
        if language is None:
            # English (well, "default"); leave it at 1
            return weight
        if language == u'Roomaji':
            # Penalize Roomaji a little less than other foreign names, as
            # it's the most likely of them to be searched
            return weight * 0.95
        # Everything else can drop down the totem pole
        return weight * 0.9
171
# Matches input consisting solely of digits, which is treated as a row id.
# Raw string so that \d reaches the regex engine untouched rather than being
# an (invalid) string escape.
rx_is_number = re.compile(r'^\d+$')

# One lookup hit: the database object, the name it was found under, the
# language of that name (None for the default language), and whether the
# match was exact.
LookupResult = namedtuple('LookupResult',
                          ['object', 'name', 'language', 'exact'])
def lookup(input, valid_types=[], session=None, indices=None, exact_only=False):
    """Attempts to find some sort of object, given a database session and name.

    Returns a list of named (object, name, language, exact) tuples.  `object`
    is a database object, `name` is the name under which the object was found,
    `language` is the name of the language in which the name was found, and
    `exact` is True iff this was an exact match.  At most 10 results are
    returned.

    This function currently ONLY does fuzzy matching if there are no exact
    matches.

    Formes are not returned; "Shaymin" will return only grass Shaymin.

    Recognizes:
    - Names: "Eevee", "Surf", "Run Away", "Payapa Berry", etc.
    - Foreign names: "Iibui", "Eivui"
    - Fuzzy names in whatever language: "Evee", "Ibui"
    - IDs: "133", "192", "250"
    Also:
    - Type restrictions.  "type:psychic" will only return the type.  This is
      how to make ID lookup useful.  Multiple type specs can be entered with
      commas, as "move,item:1".  If `valid_types` are provided, any type prefix
      will be ignored.
    - Alternate formes can be specified merely like "wash rotom".

    `input`
        Name of the thing to look for.

    `valid_types`
        A list of table objects or names, e.g., `['pokemon', 'moves']`.  If
        this is provided, only results in one of the given tables will be
        returned; this applies to fuzzy matches as well as exact ones.
        (The default list is never mutated, only read or rebound.)

    `session`
        A database session to use for retrieving objects.  As with
        `open_index()`, if this is not provided, a connection to the default
        database will be attempted.

    `indices`
        Tuple of index, speller as returned from `open_index()`.  Defaults to
        a call to `open_index()`.

    `exact_only`
        If True, only exact matches are returned.  If set to False (the
        default), and the provided `input` doesn't match anything exactly,
        spelling correction will be attempted.
    """

    # Only return up to this many matches; beyond that, something is wrong.
    max_results = 10

    if not session:
        session = connect()

    if indices:
        index, speller = indices
    else:
        # Share our session rather than making open_index connect again
        index, speller = open_index(session=session)

    name = unicode(input).lower()
    exact = True

    # Remove any type prefix (pokemon:133) before constructing a query.
    # Split on the first colon only, so a name that itself contains a colon
    # can't break the two-way unpacking.
    if ':' in name:
        prefix_chunk, name = name.split(':', 1)
        if not valid_types:
            # Only use types from the query string if none were explicitly
            # provided
            valid_types = prefix_chunk.split(',')

    # If the input provided is a number, match it as an id.  Otherwise, name.
    # Term objects do an exact match, so we don't have to worry about a query
    # parser tripping on weird characters in the input
    if rx_is_number.match(name):
        # Don't spell-check numbers!
        exact_only = True
        query = whoosh.query.Term(u'row_id', name)
    else:
        # Not an integer
        query = whoosh.query.Term(u'name', name) \
              & whoosh.query.Term(u'forme_name', u'XXX')

        # If there's a space in the input, this might be a form.  Only the
        # first word can be the form; everything after it is the name, which
        # may well contain more spaces.
        if ' ' in name:
            form, formless_name = name.split(' ', 1)
            form_query = whoosh.query.Term(u'name', formless_name) \
                       & whoosh.query.Term(u'forme_name', form)
            query = query | form_query

    ### Filter by type of object
    type_terms = []
    for valid_type in valid_types:
        if hasattr(valid_type, '__tablename__'):
            table_name = getattr(valid_type, '__tablename__')
        elif valid_type in indexed_tables:
            table_name = valid_type
        elif valid_type + 's' in indexed_tables:
            table_name = valid_type + 's'
        else:
            # Bogus.  Be nice and ignore it
            continue

        type_terms.append(whoosh.query.Term(u'table', table_name))

    type_filter = None
    if type_terms:
        type_filter = whoosh.query.Or(type_terms)
        query = query & type_filter

    ### Actual searching
    searcher = index.searcher()
    searcher.weighting = LanguageWeighting()  # XXX kosher?  docs say search()
                                              # takes a weighting kw but it
                                              # certainly does not
    results = searcher.search(query)

    # Look for some fuzzy matches if necessary
    if not exact_only and not results:
        exact = False
        results = []

        # The speller returns 25 suggestions to give us some padding, since
        # duplicate entries are stripped out below.
        for suggestion in speller.suggest(name, 25):
            fuzzy_query = whoosh.query.Term('name', suggestion)
            if type_filter is not None:
                # Fuzzy matches must respect the type restriction too
                fuzzy_query = fuzzy_query & type_filter
            results.extend(searcher.search(fuzzy_query))

    ### Convert results to db objects
    objects = []
    seen = set()
    for result in results:
        # Skip dupe results
        seen_key = result['table'], result['row_id']
        if seen_key in seen:
            continue
        seen.add(seen_key)

        cls = indexed_tables[result['table']]
        obj = session.query(cls).get(result['row_id'])

        objects.append(LookupResult(object=obj,
                                    name=result['display_name'],
                                    language=result['language'],
                                    exact=exact))

        # No point hitting the database for results we would discard anyway
        if len(objects) >= max_results:
            break

    return objects