Overhaul the Pokémon form schema. #286 #179 #379
[zzz-pokedex.git] / pokedex / lookup.py
1 # encoding: utf8
2 import os, os.path
3 import random
4 import re
5 import shutil
6 import unicodedata
7
8 from sqlalchemy.sql import func
9 import whoosh
10 import whoosh.filedb.filestore
11 import whoosh.filedb.fileindex
12 import whoosh.index
13 from whoosh.qparser import QueryParser
14 import whoosh.scoring
15 import whoosh.spelling
16
17 from pokedex.util import namedtuple
18
19 from pokedex.db import connect
20 import pokedex.db.tables as tables
21 from pokedex.roomaji import romanize
22 from pokedex.defaults import get_default_index_dir
23
24 __all__ = ['PokedexLookup']
25
26
# Matches a string consisting only of decimal digits.  Written as a raw
# string so that `\d` reaches the regex engine as-is instead of being treated
# as an (invalid) string escape sequence.
rx_is_number = re.compile(r'^\d+$')

# One lookup hit.  Built by PokedexLookup._whoosh_records_to_results():
# `object` is the database row, `indexed_name` the normalized name stored in
# the index, `name` the display name, `language`/`iso639`/`iso3166` describe
# the language the name was indexed under, and `exact` is the match flag.
LookupResult = namedtuple('LookupResult', [
    'object', 'indexed_name', 'name', 'language', 'iso639', 'iso3166', 'exact',
])
32
class UninitializedIndex(object):
    """Falsy placeholder used in place of a real index or speller.

    The object always evaluates as false, and any attribute that is not
    found through normal lookup raises `UninitializedIndexError`.
    """

    class UninitializedIndexError(Exception):
        """Raised when an attribute of the placeholder is requested."""

    def __nonzero__(self):
        """Python 2 truth hook; the placeholder is never true."""
        return False

    def __bool__(self):
        """Python 3 truth hook; the placeholder is never true."""
        return False

    def __getattr__(self, *args, **kwargs):
        # Only reached for attributes that normal lookup could not resolve,
        # so the nested exception class itself stays reachable via `self`.
        complaint = (
            "The lookup index does not exist. Please use `pokedex setup` "
            "or lookup.rebuild_index() to create it."
        )
        raise self.UninitializedIndexError(complaint)
50
class LanguageWeighting(whoosh.scoring.Weighting):
    """A scoring class that forces otherwise-equal English results to come
    before foreign results.
    """

    def __init__(self, extra_weights=None, *args, **kwargs):
        """`extra_weights` may be a dictionary mapping term text to a weight
        which will be factored into the score of documents matching that
        term.  Omitting it (or passing None) means no extra weights.

        Intended for use with spelling corrections, which come along with their
        own weightings.

        Any remaining arguments are passed to the base class constructor.
        """
        # Use a fresh dict per instance; a `{}` default in the signature would
        # be a single dict object shared by every instance created without
        # explicit weights.
        if extra_weights is None:
            extra_weights = {}
        self.extra_weights = extra_weights
        super(LanguageWeighting, self).__init__(*args, **kwargs)

    def score(self, searcher, fieldnum, text, docnum, weight, QTF=1):
        """Returns `weight`, scaled by any extra weight registered for `text`
        and by a factor depending on the matched document's stored language.
        """
        doc = searcher.stored_fields(docnum)

        # Apply extra weight; terms without one are left unchanged
        weight = weight * self.extra_weights.get(text, 1.0)

        language = doc.get('language')
        if language is None:
            # English (well, "default"); leave it at 1
            return weight
        elif language == u'Roomaji':
            # Roomaji is penalized less than other foreign names; it's the
            # most likely of them to be searched
            return weight * 0.9
        else:
            # Everything else can drop down the totem pole
            return weight * 0.8
82
83
class PokedexLookup(object):
    """Looks up database objects by name, backed by a whoosh index and a
    whoosh spell checker.
    """

    # Caps on the number of results lookup() returns: the first applies to
    # ordinary name searches, the second to exact-only searches
    MAX_FUZZY_RESULTS = 10
    MAX_EXACT_RESULTS = 43
    # lookup() asks whoosh for this many times the cap, since duplicate
    # records are stripped out before the final truncation
    INTERMEDIATE_FACTOR = 2

    # The speller only checks how much the input matches a word; there can be
    # all manner of extra unmatched junk, and it won't affect the weighting.
    # To compensate, greatly boost the weighting of matches at the beginning
    # and end, so nearly-full-word-matches are much better
    SPELLER_OPTIONS = {'booststart': 10.0, 'boostend': 9.0}

    # Dictionary of table name => table class.
    # Need the table name so we can get the class from the table name after we
    # retrieve something from the index
    indexed_tables = dict(
        (table_class.__tablename__, table_class)
        for table_class in (
            tables.Ability,
            tables.Item,
            tables.Location,
            tables.Move,
            tables.Nature,
            tables.Pokemon,
            tables.PokemonForm,
            tables.Type,
        )
    )
111
112
def __init__(self, directory=None, session=None):
    """Opens the whoosh index stored in the named directory.

    If the directory does not exist or is empty, nothing is opened or
    created: `self.index` and `self.speller` become `UninitializedIndex`
    placeholders, and `rebuild_index()` must be called before searching.

    `directory`
        Directory containing the index.  Defaults to whatever
        `get_default_index_dir()` returns.

    `session`
        Used for creating the index and retrieving objects.  When omitted
        (or falsy), `connect()` is called to obtain one.

    Raises IOError when the directory has contents but whoosh reports that
    it holds no index.
    """

    # By the time this returns, self.index, self.speller, and self.session
    # must be set

    self.directory = (
        get_default_index_dir() if directory is None else directory)
    directory = self.directory

    self.session = session if session else connect()

    # A missing directory and an empty one are treated alike: there is no
    # index yet, so hand out dummy objects that complain when used
    has_contents = os.path.exists(directory) and os.listdir(directory)
    if not has_contents:
        self.index = UninitializedIndex()
        self.speller = UninitializedIndex()
        return

    # Anything already in the directory is expected to be an index.  If it
    # isn't one, that is reported loudly rather than papered over
    try:
        self.index = whoosh.index.open_dir(directory, indexname='MAIN')
    except whoosh.index.EmptyIndexError:
        raise IOError(
            "The index directory already contains files. "
            "Please use a dedicated directory for the lookup index."
        )

    # The spell checker is opened from the same directory as the index
    self.speller = whoosh.spelling.SpellChecker(
        whoosh.filedb.filestore.FileStorage(directory),
        **self.SPELLER_OPTIONS)
165
166
def rebuild_index(self):
    """Creates the index from scratch.

    Removes existing MAIN/SPELL index files from `self.directory` (or
    creates the directory), writes one document per indexed name of every
    row in `indexed_tables`, then builds the spell checker from the same
    set of normalized names.  Replaces `self.index` and `self.speller`.
    """

    fields = whoosh.fields
    schema = fields.Schema(
        name=fields.ID(stored=True),
        table=fields.ID(stored=True),
        row_id=fields.ID(stored=True),
        language=fields.STORED,
        iso639=fields.ID(stored=True),
        iso3166=fields.ID(stored=True),
        display_name=fields.STORED,  # non-lowercased name
    )

    if not os.path.exists(self.directory):
        os.mkdir(self.directory)
    else:
        # create_in() isn't totally reliable, so just nuke whatever's there
        # manually.  Only files that look like index files are touched
        for filename in os.listdir(self.directory):
            if re.match('^_?(MAIN|SPELL)_', filename):
                os.remove(os.path.join(self.directory, filename))

    self.index = whoosh.index.create_in(self.directory, schema=schema,
                                        indexname='MAIN')
    writer = self.index.writer()

    # Every normalized name that goes into the index also goes to the
    # speller
    speller_entries = set()

    def index_name(row_key, name, language, iso639, iso3166):
        # Adds one name for the row identified by `row_key`
        normalized_name = self.normalize_name(name)

        writer.add_document(
            name=normalized_name, display_name=name,
            language=language, iso639=iso639, iso3166=iso3166,
            **row_key
        )

        speller_entries.add(normalized_name)

    # Index every name in all our tables of interest
    for table_class in self.indexed_tables.values():
        is_pokemon = table_class == tables.Pokemon
        is_pokemon_form = table_class == tables.PokemonForm
        table_name = unicode(table_class.__tablename__)

        for row in self.session.query(table_class).yield_per(5):
            row_key = dict(table=table_name, row_id=unicode(row.id))

            if is_pokemon_form:
                # Forms are indexed under their combined Pokemon name, and
                # only when the form itself has a name
                if row.name:
                    index_name(row_key, row.pokemon_name,
                               None, u'en', u'us')
                continue

            if is_pokemon and not row.is_base_form:
                # Don't re-add alternate forms of the same Pokemon; they'll
                # be added as Pokemon forms instead
                continue

            # Add the basic English name to the index
            index_name(row_key, row.name, None, u'en', u'us')

            # Some things also have other languages' names
            # XXX other language form names..?
            for foreign_name in getattr(row, 'foreign_names', []):
                foreign_text = foreign_name.name
                if row.name == foreign_text:
                    # Don't add the English name again as a different
                    # language; no point and it makes spell results
                    # confusing
                    continue

                language = foreign_name.language
                index_name(row_key, foreign_text, language.name,
                           language.iso639, language.iso3166)

                # Add Roomaji too
                if language.name == 'Japanese':
                    index_name(row_key, romanize(foreign_text),
                               u'Roomaji', u'ja', u'jp')

    writer.commit()

    # Construct and populate a spell-checker index.  Quicker to do it all
    # at once, as every call to add_* does a commit(), and those seem to be
    # expensive
    self.speller = whoosh.spelling.SpellChecker(
        self.index.storage, mingram=2, **self.SPELLER_OPTIONS)
    self.speller.add_words(speller_entries)
254
255
def normalize_name(self, name):
    """Strips irrelevant formatting junk from name input.

    The input is converted to unicode, accents are removed, surrounding
    whitespace is stripped, and the result is lowercased.
    """
    # http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
    # Decompose by Unicode rules, drop the combining marks, recombine.
    # Filtering on category Mn (rather than combining()) matters because
    # Korean characters apparently decompose too, but into letters, which
    # must survive so that recombining restores them.
    decomposed = unicodedata.normalize('NFKD', unicode(name))
    without_marks = u"".join(
        char for char in decomposed
        if unicodedata.category(char) != 'Mn')
    recombined = unicodedata.normalize('NFC', without_marks)

    return recombined.strip().lower()
277
278
279 def _apply_valid_types(self, name, valid_types):
280 """Combines the enforced `valid_types` with any from the search string
281 itself and updates the query.
282
283 For example, a name of 'a,b:foo' and valid_types of b,c will search for
284 only `b`s named "foo".
285
286 Returns `(name, merged_valid_types, term)`, where `name` has had any type
287 prefix stripped, `merged_valid_types` combines the original
288 `valid_types` with the type prefix, and `term` is a query term for
289 limited to just the allowed types. If there are no type restrictions
290 at all, `term` will be None.
291 """
292
293 # Remove any type prefix (pokemon:133) first
294 user_valid_types = []
295 if ':' in name:
296 prefix_chunk, name = name.split(':', 1)
297 name = name.strip()
298
299 prefixes = prefix_chunk.split(',')
300 user_valid_types = []
301 for prefix in prefixes:
302 prefix = prefix.strip()
303 if prefix:
304 user_valid_types.append(prefix)
305
306 # Merge the valid types together. Only types that appear in BOTH lists
307 # may be used.
308 # As a special case, if the user asked for types that are explicitly
309 # forbidden, completely ignore what the user requested.
310 # And, just to complicate matters: "type" and language need to be
311 # considered separately.
312 def merge_requirements(func):
313 user = filter(func, user_valid_types)
314 system = filter(func, valid_types)
315
316 if user and system:
317 merged = list(set(user) & set(system))
318 if merged:
319 return merged
320 else:
321 # No overlap; use the system restrictions
322 return system
323 else:
324 # One or the other is blank; use the one that's not
325 return user or system
326
327 # @foo means language must be foo; otherwise it's a table name
328 lang_requirements = merge_requirements(lambda req: req[0] == u'@')
329 type_requirements = merge_requirements(lambda req: req[0] != u'@')
330 all_requirements = lang_requirements + type_requirements
331
332 # Construct the term
333 lang_terms = []
334 for lang in lang_requirements:
335 # Allow for either country or language codes
336 lang_code = lang[1:]
337 lang_terms.append(whoosh.query.Term(u'iso639', lang_code))
338 lang_terms.append(whoosh.query.Term(u'iso3166', lang_code))
339
340 type_terms = []
341 for type in type_requirements:
342 table_name = self._parse_table_name(type)
343
344 # Quietly ignore bogus valid_types; more likely to DTRT
345 if table_name:
346 type_terms.append(whoosh.query.Term(u'table', table_name))
347
348 # Combine both kinds of restriction
349 all_terms = []
350 if type_terms:
351 all_terms.append(whoosh.query.Or(type_terms))
352 if lang_terms:
353 all_terms.append(whoosh.query.Or(lang_terms))
354
355 return name, all_requirements, whoosh.query.And(all_terms)
356
357
358 def _parse_table_name(self, name):
359 """Takes a singular table name, table name, or table object and returns
360 the table name.
361
362 Returns None for a bogus name.
363 """
364 # Table object
365 if hasattr(name, '__tablename__'):
366 return getattr(name, '__tablename__')
367
368 # Table name
369 for table in self.indexed_tables.values():
370 if name in (table.__tablename__, table.__singlename__):
371 return table.__tablename__
372
373 # Bogus. Be nice and return dummy
374 return None
375
def _whoosh_records_to_results(self, records, exact=True):
    """Converts whoosh's indexed records to a list of LookupResult tuples
    containing database objects.

    Only the first record for each (table, row id) pair is kept.  Every
    result is tagged with the given `exact` flag.
    """
    # XXX this 'exact' thing is getting kinda leaky.  would like a better
    # way to handle it, since only lookup() cares about fuzzy results
    already_seen = set()
    converted = []
    for record in records:
        identity = (record['table'], record['row_id'])
        if identity in already_seen:
            # Same row matched under another name; skip the dupe
            continue
        already_seen.add(identity)

        # Fetch the actual database object the record points at
        table_class = self.indexed_tables[record['table']]
        db_object = self.session.query(table_class).get(record['row_id'])

        converted.append(LookupResult(
            object=db_object,
            indexed_name=record['name'],
            name=record['display_name'],
            language=record.get('language'),
            iso639=record['iso639'],
            iso3166=record['iso3166'],
            exact=exact,
        ))

    return converted
403
404
def lookup(self, input, valid_types=None, exact_only=False):
    """Attempts to find some sort of object, given a name.

    Returns a list of named (object, indexed_name, name, language, iso639,
    iso3166, exact) tuples.  `object` is a database object, `indexed_name`
    is the normalized name stored in the index, `name` is the name under
    which the object was found, `language` and the two isos are the name
    and country codes of the language in which the name was found, and
    `exact` is True iff this was an exact match.

    This function currently ONLY does fuzzy matching if there are no exact
    matches.

    Formes are not returned unless requested; "Shaymin" will return only
    grass Shaymin.

    Extraneous whitespace is removed with extreme prejudice.

    Recognizes:
    - Names: "Eevee", "Surf", "Run Away", "Payapa Berry", etc.
    - Foreign names: "Iibui", "Eivui"
    - Fuzzy names in whatever language: "Evee", "Ibui"
    - IDs: "133", "192", "250"
    - Wildcards: a `*` or `?` anywhere in the name
    - "random", which returns a random result
    Also:
    - Type restrictions.  "type:psychic" will only return the type.  This
      is how to make ID lookup useful.  Multiple type specs can be entered
      with commas, as "move,item:1".
    - Language restrictions.  "@fr:charge" will only return Tackle, which
      is called "Charge" in French.  These can be combined with type
      restrictions, e.g., "@fr,move:charge".
    - Alternate formes can be specified merely like "wash rotom".

    `input`
        Name of the thing to look for.

    `valid_types`
        A list of type or language restrictions, e.g., `['pokemon',
        '@ja']`.  If this is provided, only results in one of the given
        tables will be returned.  Defaults to no restrictions.

    `exact_only`
        If True, only exact matches are returned.  If set to False (the
        default), and the provided `input` doesn't match anything exactly,
        spelling correction will be attempted.  Wildcard and ID lookups
        are always exact-only.
    """

    # None rather than a list literal in the signature, so that no list
    # object is shared between calls
    if valid_types is None:
        valid_types = []

    name = self.normalize_name(input)
    exact = True

    # Pop off any type prefix and merge with valid_types
    name, merged_valid_types, type_term = \
        self._apply_valid_types(name, valid_types)

    # Random lookup
    if name == 'random':
        return self.random_lookup(valid_types=merged_valid_types)

    # Do different things depending what the query looks like
    # Note: Term objects do an exact match, so we don't have to worry about
    # a query parser tripping on weird characters in the input
    try:
        # Let Python try to convert to a number, so 0xff works
        name_as_number = int(name, base=0)
    except ValueError:
        # Oh well
        name_as_number = None

    if '*' in name or '?' in name:
        exact_only = True
        query = whoosh.query.Wildcard(u'name', name)
    elif name_as_number is not None:
        # Don't spell-check numbers!
        exact_only = True
        query = whoosh.query.Term(u'row_id', unicode(name_as_number))
    else:
        # Not an integer
        query = whoosh.query.Term(u'name', name)

    if type_term:
        query = query & type_term

    ### Actual searching
    # Limits; result limits are constants, and intermediate results (before
    # duplicate items are stripped out) are capped at the result limit
    # times another constant.
    # Fuzzy are capped at MAX_FUZZY_RESULTS, beyond which something is
    # probably very wrong.  Exact matches -- that is, wildcards and ids --
    # are far less constrained.
    # Also, exact matches are sorted by name, since weight doesn't matter.
    sort_by = dict()
    if exact_only:
        max_results = self.MAX_EXACT_RESULTS
        sort_by['sortedby'] = (u'table', u'name')
    else:
        max_results = self.MAX_FUZZY_RESULTS

    searcher = self.index.searcher(weighting=LanguageWeighting())
    results = searcher.search(
        query,
        limit=int(max_results * self.INTERMEDIATE_FACTOR),
        **sort_by
    )

    # Look for some fuzzy matches if necessary
    if not exact_only and not results:
        exact = False

        fuzzy_query_parts = []
        fuzzy_weights = {}
        min_weight = None
        for suggestion, _, weight in self.speller.suggestions_and_scores(name):
            # Only allow suggestions scoring at least half as much as the
            # first one; otherwise there will always be a lot of trailing
            # junk
            if min_weight is None:
                min_weight = weight * 0.5
            elif weight < min_weight:
                break

            fuzzy_query_parts.append(whoosh.query.Term('name', suggestion))
            fuzzy_weights[suggestion] = weight

        if not fuzzy_query_parts:
            # Nothing at all; don't try querying
            return []

        fuzzy_query = whoosh.query.Or(fuzzy_query_parts)
        if type_term:
            fuzzy_query = fuzzy_query & type_term

        # Let each suggestion's own score influence the ranking
        searcher.weighting = LanguageWeighting(extra_weights=fuzzy_weights)
        results = searcher.search(fuzzy_query)

    ### Convert results to db objects
    objects = self._whoosh_records_to_results(results, exact=exact)

    # Truncate and return
    return objects[:max_results]
544
545
def random_lookup(self, valid_types=None):
    """Returns a random lookup result from one of the provided
    `valid_types`.

    `valid_types`
        Table names, singular table names, or table objects to pick from.
        Unrecognized entries (language codes, say) are skipped.  If nothing
        usable remains, every indexed table is a candidate.  Pokemon forms
        are never candidates.

    Returns whatever `lookup()` returns for the randomly chosen row id, or
    an empty list if all the candidate tables are empty.
    """

    table_names = []
    for valid_type in valid_types or []:
        table_name = self._parse_table_name(valid_type)
        # Skip anything not recognized.  Could be, say, a language code.
        # XXX The vast majority of Pokemon forms are unnamed and unindexed,
        # which can produce blank results.  So skip them too for now.
        if table_name and table_name != 'pokemon_forms':
            table_names.append(table_name)

    if not table_names:
        # n.b.: It's possible we got a list of valid_types and none of them
        # were valid, but this function tries hard to return *something*,
        # so it politely selects from the entire index instead
        table_names = [
            table_name for table_name in self.indexed_tables
            if table_name != 'pokemon_forms'
        ]

    # Rather than create an array of many hundred items and pick randomly
    # from it, just pick a number up to the total number of potential
    # items, then pick randomly from that, and partition the whole range
    # into chunks.  This also avoids the slight problem that the index
    # contains more rows (for languages) for some items than others.
    # XXX ought to cache this (in the index?) if possible
    total = 0
    partitions = []
    for table_name in table_names:
        count = self.session.query(self.indexed_tables[table_name]).count()
        total += count
        partitions.append((table_name, count))

    if total < 1:
        # Every candidate table is empty; there is nothing to pick, and
        # randint() would refuse the empty range
        return []

    # Walk the partitions until the one containing n is reached; n ends up
    # as an offset within that table.  n <= total guarantees a break.
    n = random.randint(1, total)
    for table_name, count in partitions:
        if n <= count:
            break
        n -= count

    # NOTE(review): n is used as a row id, which presumably relies on ids
    # running from 1 to the row count without gaps -- confirm against the
    # data
    return self.lookup(u'%d' % n, valid_types=[table_name])
586
def prefix_lookup(self, prefix, valid_types=None):
    """Returns terms starting with the given exact prefix.

    Type prefixes are recognized, and the prefix is normalized the same
    way indexed names are, but no other name munging is done.

    `prefix`
        Start of the name to look for, optionally preceded by a type
        prefix such as "move:".

    `valid_types`
        A list of type or language restrictions, as for `lookup()`.
        Defaults to no restrictions.

    Returns a list of LookupResult tuples, all flagged as exact.
    """

    # None rather than a list literal in the signature, so that no list
    # object is shared between calls
    if valid_types is None:
        valid_types = []

    # Pop off any type prefix and merge with valid_types; the merged list
    # itself is not needed here, only the query term
    prefix, _, type_term = self._apply_valid_types(prefix, valid_types)

    query = whoosh.query.Prefix(u'name', self.normalize_name(prefix))

    if type_term:
        query = query & type_term

    searcher = self.index.searcher()
    searcher.weighting = LanguageWeighting()
    # XXX no explicit result limit is passed here, unlike lookup()
    results = searcher.search(query)

    return self._whoosh_records_to_results(results)