Significantly improved lookup fuzzy matching.
[zzz-pokedex.git] / pokedex / lookup.py
1 # encoding: utf8
2 import os, os.path
3 import random
4 import re
5 import shutil
6 import unicodedata
7
8 from sqlalchemy.sql import func
9 import whoosh
10 import whoosh.filedb.filestore
11 import whoosh.filedb.fileindex
12 import whoosh.index
13 from whoosh.qparser import QueryParser
14 import whoosh.scoring
15 import whoosh.spelling
16
17 from pokedex.util import namedtuple
18
19 from pokedex.db import connect
20 import pokedex.db.tables as tables
21 from pokedex.roomaji import romanize
22 from pokedex.defaults import get_default_index_dir
23
24 __all__ = ['PokedexLookup']
25
26
27 rx_is_number = re.compile('^\d+$')
28
29 LookupResult = namedtuple('LookupResult',
30 ['object', 'indexed_name', 'name', 'language', 'iso3166', 'exact'])
31
32 class UninitializedIndex(object):
33 class UninitializedIndexError(Exception):
34 pass
35
36 def __nonzero__(self):
37 """Dummy object should identify itself as False."""
38 return False
39
40 def __bool__(self):
41 """Python 3000 version of the above. Future-proofing rules!"""
42 return False
43
44 def __getattr__(self, *args, **kwargs):
45 raise self.UninitializedIndexError(
46 "The lookup index does not exist. Please use `pokedex setup` "
47 "or lookup.rebuild_index() to create it."
48 )
49
50 class LanguageWeighting(whoosh.scoring.Weighting):
51 """A scoring class that forces otherwise-equal English results to come
52 before foreign results.
53 """
54
55 def __init__(self, extra_weights={}, *args, **kwargs):
56 """`extra_weights` may be a dictionary of weights which will be
57 factored in.
58
59 Intended for use with spelling corrections, which come along with their
60 own weightings.
61 """
62 self.extra_weights = extra_weights
63 super(LanguageWeighting, self).__init__(*args, **kwargs)
64
65 def score(self, searcher, fieldnum, text, docnum, weight, QTF=1):
66 doc = searcher.stored_fields(docnum)
67
68 # Apply extra weight
69 weight = weight * self.extra_weights.get(text, 1.0)
70
71 if doc['language'] == None:
72 # English (well, "default"); leave it at 1
73 return weight
74 elif doc['language'] == u'Roomaji':
75 # Give Roomaji a little boost; it's most likely to be searched
76 return weight * 0.9
77 else:
78 # Everything else can drop down the totem pole
79 return weight * 0.8
80
81
82 class PokedexLookup(object):
83 INTERMEDIATE_LOOKUP_RESULTS = 25
84 MAX_LOOKUP_RESULTS = 10
85
86 # The speller only checks how much the input matches a word; there can be
87 # all manner of extra unmatched junk, and it won't affect the weighting.
88 # To compensate, greatly boost the weighting of matches at the beginning
89 # and end, so nearly-full-word-matches are much better
90 SPELLER_OPTIONS = dict(booststart=10.0, boostend=9.0)
91
92 # Dictionary of table name => table class.
93 # Need the table name so we can get the class from the table name after we
94 # retrieve something from the index
95 indexed_tables = dict(
96 (cls.__tablename__, cls)
97 for cls in (
98 tables.Ability,
99 tables.Item,
100 tables.Location,
101 tables.Move,
102 tables.Nature,
103 tables.Pokemon,
104 tables.Type,
105 )
106 )
107
108
109 def __init__(self, directory=None, session=None):
110 """Opens the whoosh index stored in the named directory. If the index
111 doesn't already exist, it will be created.
112
113 `directory`
114 Directory containing the index. Defaults to a location within the
115 `pokedex` egg directory.
116
117 `session`
118 Used for creating the index and retrieving objects. Defaults to an
119 attempt to connect to the default SQLite database installed by
120 `pokedex setup`.
121 """
122
123 # By the time this returns, self.index, self.speller, and self.session
124 # must be set
125
126 # If a directory was not given, use the default
127 if directory is None:
128 directory = get_default_index_dir()
129
130 self.directory = directory
131
132 if session:
133 self.session = session
134 else:
135 self.session = connect()
136
137 # Attempt to open or create the index
138 if not os.path.exists(directory) or not os.listdir(directory):
139 # Directory doesn't exist OR is empty; caller needs to use
140 # rebuild_index before doing anything. Provide a dummy object that
141 # complains when used
142 self.index = UninitializedIndex()
143 self.speller = UninitializedIndex()
144 return
145
146 # Otherwise, already exists; should be an index! Bam, done.
147 # Note that this will explode if the directory exists but doesn't
148 # contain an index; that's a feature
149 try:
150 self.index = whoosh.index.open_dir(directory, indexname='MAIN')
151 except whoosh.index.EmptyIndexError:
152 raise IOError(
153 "The index directory already contains files. "
154 "Please use a dedicated directory for the lookup index."
155 )
156
157 # Create speller, and done
158 spell_store = whoosh.filedb.filestore.FileStorage(directory)
159 self.speller = whoosh.spelling.SpellChecker(spell_store,
160 **self.SPELLER_OPTIONS)
161
162
163 def rebuild_index(self):
164 """Creates the index from scratch."""
165
166 schema = whoosh.fields.Schema(
167 name=whoosh.fields.ID(stored=True),
168 table=whoosh.fields.ID(stored=True),
169 row_id=whoosh.fields.ID(stored=True),
170 language=whoosh.fields.STORED,
171 iso3166=whoosh.fields.STORED,
172 display_name=whoosh.fields.STORED, # non-lowercased name
173 )
174
175 if not os.path.exists(self.directory):
176 os.mkdir(self.directory)
177
178 self.index = whoosh.index.create_in(self.directory, schema=schema,
179 indexname='MAIN')
180 writer = self.index.writer()
181
182 # Index every name in all our tables of interest
183 speller_entries = set()
184 for cls in self.indexed_tables.values():
185 q = self.session.query(cls)
186
187 for row in q.yield_per(5):
188 row_key = dict(table=unicode(cls.__tablename__),
189 row_id=unicode(row.id))
190
191 def add(name, language, iso3166):
192 normalized_name = self.normalize_name(name)
193
194 writer.add_document(
195 name=normalized_name, display_name=name,
196 language=language, iso3166=iso3166,
197 **row_key
198 )
199
200 speller_entries.add(normalized_name)
201
202
203 # Add the basic English name to the index
204 if cls == tables.Pokemon:
205 # Pokémon need their form name added
206 # XXX kinda kludgy
207 add(row.full_name, None, u'us')
208
209 # If this is a default form, ALSO add the unadorned name,
210 # so 'Deoxys' alone will still do the right thing
211 if row.forme_name and not row.forme_base_pokemon_id:
212 add(row.name, None, u'us')
213 else:
214 add(row.name, None, u'us')
215
216 # Some things also have other languages' names
217 # XXX other language form names..?
218 for foreign_name in getattr(row, 'foreign_names', []):
219 moonspeak = foreign_name.name
220 if row.name == moonspeak:
221 # Don't add the English name again as a different
222 # language; no point and it makes spell results
223 # confusing
224 continue
225
226 add(moonspeak, foreign_name.language.name,
227 foreign_name.language.iso3166)
228
229 # Add Roomaji too
230 if foreign_name.language.name == 'Japanese':
231 roomaji = romanize(foreign_name.name)
232 add(roomaji, u'Roomaji', u'jp')
233
234 writer.commit()
235
236 # Construct and populate a spell-checker index. Quicker to do it all
237 # at once, as every call to add_* does a commit(), and those seem to be
238 # expensive
239 self.speller = whoosh.spelling.SpellChecker(self.index.storage, mingram=2,
240 **self.SPELLER_OPTIONS)
241 self.speller.add_words(speller_entries)
242
243
244 def normalize_name(self, name):
245 """Strips irrelevant formatting junk from name input.
246
247 Specifically: everything is lowercased, and accents are removed.
248 """
249 # http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
250 # Makes sense to me. Decompose by Unicode rules, then remove combining
251 # characters, then recombine. I'm explicitly doing it this way instead
252 # of testing combining() because Korean characters apparently
253 # decompose! But the results are considered letters, not combining
254 # characters, so testing for Mn works well, and combining them again
255 # makes them look right.
256 nkfd_form = unicodedata.normalize('NFKD', unicode(name))
257 name = u"".join(c for c in nkfd_form
258 if unicodedata.category(c) != 'Mn')
259 name = unicodedata.normalize('NFC', name)
260
261 name = name.strip()
262 name = name.lower()
263
264 return name
265
266
267 def _apply_valid_types(self, name, valid_types):
268 """Combines the enforced `valid_types` with any from the search string
269 itself and updates the query.
270
271 For example, a name of 'a,b:foo' and valid_types of b,c will search for
272 only `b`s named "foo".
273
274 Returns `(name, merged_valid_types, term)`, where `name` has had any type
275 prefix stripped, `merged_valid_types` combines the original
276 `valid_types` with the type prefix, and `term` is a query term for
277 limited to just the allowed types. If there are no type restrictions
278 at all, `term` will be None.
279 """
280
281 # Remove any type prefix (pokemon:133) first
282 user_valid_types = []
283 if ':' in name:
284 prefix_chunk, name = name.split(':', 1)
285 name = name.strip()
286
287 prefixes = prefix_chunk.split(',')
288 user_valid_types = [_.strip() for _ in prefixes]
289
290 # Merge the valid types together. Only types that appear in BOTH lists
291 # may be used.
292 # As a special case, if the user asked for types that are explicitly
293 # forbidden, completely ignore what the user requested
294 combined_valid_types = []
295 if user_valid_types and valid_types:
296 combined_valid_types = list(
297 set(user_valid_types) & set(combined_valid_types)
298 )
299
300 if not combined_valid_types:
301 # No overlap! Just use the enforced ones
302 combined_valid_types = valid_types
303 else:
304 # One list or the other was blank, so just use the one that isn't
305 combined_valid_types = valid_types + user_valid_types
306
307 if not combined_valid_types:
308 # No restrictions
309 return name, [], None
310
311 # Construct the term
312 type_terms = []
313 final_valid_types = []
314 for valid_type in combined_valid_types:
315 table_name = self._parse_table_name(valid_type)
316
317 # Quietly ignore bogus valid_types; more likely to DTRT
318 if table_name:
319 final_valid_types.append(valid_type)
320 type_terms.append(whoosh.query.Term(u'table', table_name))
321
322 return name, final_valid_types, whoosh.query.Or(type_terms)
323
324
325 def _parse_table_name(self, name):
326 """Takes a singular table name, table name, or table object and returns
327 the table name.
328
329 Returns None for a bogus name.
330 """
331 # Table object
332 if hasattr(name, '__tablename__'):
333 return getattr(name, '__tablename__')
334
335 # Table name
336 for table in self.indexed_tables.values():
337 if name in (table.__tablename__, table.__singlename__):
338 return table.__tablename__
339
340 # Bogus. Be nice and return dummy
341 return None
342
343 def _whoosh_records_to_results(self, records, exact=True):
344 """Converts a list of whoosh's indexed records to LookupResult tuples
345 containing database objects.
346 """
347 # XXX this 'exact' thing is getting kinda leaky. would like a better
348 # way to handle it, since only lookup() cares about fuzzy results
349 seen = {}
350 results = []
351 for record in records:
352 # Skip dupes
353 seen_key = record['table'], record['row_id']
354 if seen_key in seen:
355 continue
356 seen[seen_key] = True
357
358 cls = self.indexed_tables[record['table']]
359 obj = self.session.query(cls).get(record['row_id'])
360
361 results.append(LookupResult(object=obj,
362 indexed_name=record['name'],
363 name=record['display_name'],
364 language=record['language'],
365 iso3166=record['iso3166'],
366 exact=exact))
367
368 return results
369
370
371 def lookup(self, input, valid_types=[], exact_only=False):
372 """Attempts to find some sort of object, given a name.
373
374 Returns a list of named (object, name, language, iso3166, exact)
375 tuples. `object` is a database object, `name` is the name under which
376 the object was found, `language` and `iso3166` are the name and country
377 code of the language in which the name was found, and `exact` is True
378 iff this was an
379 exact match.
380
381 This function currently ONLY does fuzzy matching if there are no exact
382 matches.
383
384 Formes are not returned unless requested; "Shaymin" will return only
385 grass Shaymin.
386
387 Extraneous whitespace is removed with extreme prejudice.
388
389 Recognizes:
390 - Names: "Eevee", "Surf", "Run Away", "Payapa Berry", etc.
391 - Foreign names: "Iibui", "Eivui"
392 - Fuzzy names in whatever language: "Evee", "Ibui"
393 - IDs: "133", "192", "250"
394 Also:
395 - Type restrictions. "type:psychic" will only return the type. This
396 is how to make ID lookup useful. Multiple type specs can be entered
397 with commas, as "move,item:1". If `valid_types` are provided, any
398 type prefix will be ignored.
399 - Alternate formes can be specified merely like "wash rotom".
400
401 `input`
402 Name of the thing to look for.
403
404 `valid_types`
405 A list of table objects or names, e.g., `['pokemon', 'moves']`. If
406 this is provided, only results in one of the given tables will be
407 returned.
408
409 `exact_only`
410 If True, only exact matches are returned. If set to False (the
411 default), and the provided `name` doesn't match anything exactly,
412 spelling correction will be attempted.
413 """
414
415 name = self.normalize_name(input)
416 exact = True
417 form = None
418
419 # Pop off any type prefix and merge with valid_types
420 name, merged_valid_types, type_term = \
421 self._apply_valid_types(name, valid_types)
422
423 # Random lookup
424 if name == 'random':
425 return self.random_lookup(valid_types=merged_valid_types)
426
427 # Do different things depending what the query looks like
428 # Note: Term objects do an exact match, so we don't have to worry about
429 # a query parser tripping on weird characters in the input
430 try:
431 # Let Python try to convert to a number, so 0xff works
432 name_as_number = int(name, base=0)
433 except ValueError:
434 # Oh well
435 name_as_number = None
436
437 if '*' in name or '?' in name:
438 exact_only = True
439 query = whoosh.query.Wildcard(u'name', name)
440 elif name_as_number is not None:
441 # Don't spell-check numbers!
442 exact_only = True
443 query = whoosh.query.Term(u'row_id', unicode(name_as_number))
444 else:
445 # Not an integer
446 query = whoosh.query.Term(u'name', name)
447
448 if type_term:
449 query = query & type_term
450
451
452 ### Actual searching
453 searcher = self.index.searcher()
454 # XXX is this kosher? docs say search() takes a weighting arg, but it
455 # certainly does not
456 searcher.weighting = LanguageWeighting()
457 results = searcher.search(query,
458 limit=self.INTERMEDIATE_LOOKUP_RESULTS)
459
460 # Look for some fuzzy matches if necessary
461 if not exact_only and not results:
462 exact = False
463 results = []
464
465 fuzzy_query_parts = []
466 fuzzy_weights = {}
467 min_weight = [None]
468 for suggestion, _, weight in self.speller.suggestions_and_scores(name):
469 # Only allow the top 50% of scores; otherwise there will always
470 # be a lot of trailing junk
471 if min_weight[0] is None:
472 min_weight[0] = weight * 0.5
473 elif weight < min_weight[0]:
474 break
475
476 fuzzy_query_parts.append(whoosh.query.Term('name', suggestion))
477 fuzzy_weights[suggestion] = weight
478
479 if not fuzzy_query_parts:
480 # Nothing at all; don't try querying
481 return []
482
483 fuzzy_query = whoosh.query.Or(fuzzy_query_parts)
484 if type_term:
485 fuzzy_query = fuzzy_query & type_term
486
487 searcher.weighting = LanguageWeighting(extra_weights=fuzzy_weights)
488 results = searcher.search(fuzzy_query)
489
490 ### Convert results to db objects
491 objects = self._whoosh_records_to_results(results, exact=exact)
492
493 # Only return up to 10 matches; beyond that, something is wrong. We
494 # strip out duplicate entries above, so it's remotely possible that we
495 # should have more than 10 here and lost a few. The speller returns 25
496 # to give us some padding, and should avoid that problem. Not a big
497 # deal if we lose the 25th-most-likely match anyway.
498 return objects[:self.MAX_LOOKUP_RESULTS]
499
500
501 def random_lookup(self, valid_types=[]):
502 """Returns a random lookup result from one of the provided
503 `valid_types`.
504 """
505
506 tables = []
507 for valid_type in valid_types:
508 table_name = self._parse_table_name(valid_type)
509 if table_name:
510 tables.append(self.indexed_tables[table_name])
511
512 if not tables:
513 # n.b.: It's possible we got a list of valid_types and none of them
514 # were valid, but this function is guaranteed to return
515 # *something*, so it politely selects from the entire index isntead
516 tables = self.indexed_tables.values()
517
518 # Rather than create an array of many hundred items and pick randomly
519 # from it, just pick a number up to the total number of potential
520 # items, then pick randomly from that, and partition the whole range
521 # into chunks. This also avoids the slight problem that the index
522 # contains more rows (for languages) for some items than others.
523 # XXX ought to cache this (in the index?) if possible
524 total = 0
525 partitions = []
526 for table in tables:
527 count = self.session.query(table).count()
528 total += count
529 partitions.append((table, count))
530
531 n = random.randint(1, total)
532 while n > partitions[0][1]:
533 n -= partitions[0][1]
534 partitions.pop(0)
535
536 return self.lookup(unicode(n), valid_types=[ partitions[0][0] ])
537
538 def prefix_lookup(self, prefix, valid_types=[]):
539 """Returns terms starting with the given exact prefix.
540
541 Type prefixes are recognized, but no other name munging is done.
542 """
543
544 # Pop off any type prefix and merge with valid_types
545 prefix, merged_valid_types, type_term = \
546 self._apply_valid_types(prefix, valid_types)
547
548 query = whoosh.query.Prefix(u'name', self.normalize_name(prefix))
549
550 if type_term:
551 query = query & type_term
552
553 searcher = self.index.searcher()
554 searcher.weighting = LanguageWeighting()
555 results = searcher.search(query) # XXX , limit=self.MAX_LOOKUP_RESULTS)
556
557 return self._whoosh_records_to_results(results)