Return more than just ten results for wildcard lookups. #90
[zzz-pokedex.git] / pokedex / lookup.py
1 # encoding: utf8
2 import os, os.path
3 import random
4 import re
5 import shutil
6 import unicodedata
7
8 from sqlalchemy.sql import func
9 import whoosh
10 import whoosh.filedb.filestore
11 import whoosh.filedb.fileindex
12 import whoosh.index
13 from whoosh.qparser import QueryParser
14 import whoosh.scoring
15 import whoosh.spelling
16
17 from pokedex.util import namedtuple
18
19 from pokedex.db import connect
20 import pokedex.db.tables as tables
21 from pokedex.roomaji import romanize
22 from pokedex.defaults import get_default_index_dir
23
24 __all__ = ['PokedexLookup']
25
26
# Matches strings consisting solely of ASCII/Unicode decimal digits.
# Raw string so that `\d` reaches the regex engine verbatim instead of being
# treated as an (invalid) string escape sequence.
rx_is_number = re.compile(r'^\d+$')

# One lookup hit.  Fields, in order:
#   object        the database object that was found
#   indexed_name  the normalized name stored in the index
#   name          the display (non-normalized) name the object was found under
#   language      name of the language of `name`; None for the default language
#   iso639        language code of that language
#   iso3166       country code of that language
#   exact         True iff this was an exact (non-fuzzy) match
LookupResult = namedtuple('LookupResult', [
    'object', 'indexed_name', 'name', 'language', 'iso639', 'iso3166', 'exact',
])
32
class UninitializedIndex(object):
    """Falsy stand-in used where a real index object is expected but none
    has been built yet.

    Truth-testing an instance yields False; looking up any attribute that
    is not found through the normal mechanism raises
    `UninitializedIndexError`.
    """

    class UninitializedIndexError(Exception):
        """Raised when the stand-in is used as though it were a real index."""

    def __bool__(self):
        """Python 3 truth hook; the stand-in is always false."""
        return False

    def __nonzero__(self):
        """Python 2 truth hook; the stand-in is always false."""
        return False

    def __getattr__(self, *args, **kwargs):
        # Only reached for names that ordinary lookup could not resolve, so
        # the nested exception class and the truth hooks still work.
        message = (
            "The lookup index does not exist. Please use `pokedex setup` "
            "or lookup.rebuild_index() to create it."
        )
        raise self.UninitializedIndexError(message)
50
class LanguageWeighting(whoosh.scoring.Weighting):
    """A scoring class that forces otherwise-equal English results to come
    before foreign results.
    """

    def __init__(self, extra_weights=None, *args, **kwargs):
        """`extra_weights` may be a dictionary of weights, keyed by term
        text, which will be factored in.  Terms missing from the dictionary
        get a factor of 1.0.

        Intended for use with spelling corrections, which come along with their
        own weightings.

        Remaining positional and keyword arguments are handed to the parent
        class unchanged.
        """
        # A fresh dict per instance; a `{}` default in the signature would be
        # one object shared by every instance created without the argument.
        if extra_weights is None:
            extra_weights = {}
        self.extra_weights = extra_weights
        super(LanguageWeighting, self).__init__(*args, **kwargs)

    def score(self, searcher, fieldnum, text, docnum, weight, QTF=1):
        """Returns `weight` scaled by the extra weight registered for `text`
        (if any) and by a factor depending on the language stored with the
        matched document.
        """
        doc = searcher.stored_fields(docnum)

        # Apply extra weight
        weight = weight * self.extra_weights.get(text, 1.0)

        language = doc['language']
        if language is None:
            # English (well, "default"); leave it at 1
            return weight
        elif language == u'Roomaji':
            # Roomaji is penalized less than other foreign names; it's the
            # most likely of them to be searched
            return weight * 0.9
        else:
            # Everything else can drop down the totem pole
            return weight * 0.8
81
82
class PokedexLookup(object):
    """Looks up database objects by name, using a whoosh index stored on
    disk plus a spell checker for fuzzy matching.
    """

    # Caps on the number of results returned by lookup().  Intermediate
    # result sets (before duplicates are stripped) are capped at the relevant
    # limit times INTERMEDIATE_FACTOR.
    MAX_FUZZY_RESULTS = 10
    MAX_EXACT_RESULTS = 43
    INTERMEDIATE_FACTOR = 2

    # The speller only checks how much the input matches a word; there can be
    # all manner of extra unmatched junk, and it won't affect the weighting.
    # To compensate, greatly boost the weighting of matches at the beginning
    # and end, so nearly-full-word-matches are much better
    SPELLER_OPTIONS = dict(booststart=10.0, boostend=9.0)

    # Dictionary of table name => table class.
    # Need the table name so we can get the class from the table name after we
    # retrieve something from the index
    indexed_tables = dict(
        (cls.__tablename__, cls)
        for cls in (
            tables.Ability,
            tables.Item,
            tables.Location,
            tables.Move,
            tables.Nature,
            tables.Pokemon,
            tables.Type,
        )
    )

    def __init__(self, directory=None, session=None):
        """Opens the whoosh index stored in the named directory.  If the
        directory is missing or empty, no index is created; `self.index` and
        `self.speller` are set to `UninitializedIndex` placeholders, and
        `rebuild_index()` must be called before any lookup.

        `directory`
            Directory containing the index.  Defaults to whatever
            `get_default_index_dir()` returns.

        `session`
            Used for creating the index and retrieving objects.  Defaults to
            the result of `connect()`.

        Raises IOError if the directory has contents but whoosh reports that
        it holds no index.
        """

        # By the time this returns, self.index, self.speller, and self.session
        # must be set

        # If a directory was not given, use the default
        if directory is None:
            directory = get_default_index_dir()

        self.directory = directory

        if session:
            self.session = session
        else:
            self.session = connect()

        # Attempt to open the index
        if not os.path.exists(directory) or not os.listdir(directory):
            # Directory doesn't exist OR is empty; caller needs to use
            # rebuild_index before doing anything.  Provide a dummy object that
            # complains when used
            self.index = UninitializedIndex()
            self.speller = UninitializedIndex()
            return

        # Otherwise, already exists; should be an index!  Bam, done.
        # Note that this will explode if the directory exists but doesn't
        # contain an index; that's a feature
        try:
            self.index = whoosh.index.open_dir(directory, indexname='MAIN')
        except whoosh.index.EmptyIndexError:
            raise IOError(
                "The index directory already contains files. "
                "Please use a dedicated directory for the lookup index."
            )

        # Create speller, and done
        spell_store = whoosh.filedb.filestore.FileStorage(directory)
        self.speller = self._make_speller(spell_store)

    def _make_speller(self, storage):
        """Returns a spell checker backed by `storage`.

        Both the freshly built and the reopened speller go through here so
        they are always configured identically (same minimum n-gram size and
        same boosts).
        """
        return whoosh.spelling.SpellChecker(storage, mingram=2,
                                            **self.SPELLER_OPTIONS)

    def rebuild_index(self):
        """Creates the index from scratch.

        Every row of every table in `indexed_tables` is indexed under its
        name, under any foreign names it has, and under the romanization of
        its Japanese name.  The spell checker is then rebuilt from the full
        set of indexed names.
        """

        schema = whoosh.fields.Schema(
            name=whoosh.fields.ID(stored=True),
            table=whoosh.fields.ID(stored=True),
            row_id=whoosh.fields.ID(stored=True),
            language=whoosh.fields.STORED,
            iso639=whoosh.fields.ID(stored=True),
            iso3166=whoosh.fields.ID(stored=True),
            display_name=whoosh.fields.STORED,  # non-lowercased name
        )

        if not os.path.exists(self.directory):
            # makedirs rather than mkdir so a missing parent is not fatal
            os.makedirs(self.directory)

        self.index = whoosh.index.create_in(self.directory, schema=schema,
                                            indexname='MAIN')
        writer = self.index.writer()

        speller_entries = set()

        def add(row_key, name, language, iso639, iso3166):
            """Indexes one name for the row identified by `row_key` and
            remembers the normalized form for the spell checker.
            """
            normalized_name = self.normalize_name(name)

            writer.add_document(
                name=normalized_name, display_name=name,
                language=language, iso639=iso639, iso3166=iso3166,
                **row_key
            )

            speller_entries.add(normalized_name)

        # Index every name in all our tables of interest
        for cls in self.indexed_tables.values():
            q = self.session.query(cls)

            for row in q.yield_per(5):
                row_key = dict(table=unicode(cls.__tablename__),
                               row_id=unicode(row.id))

                # Add the basic English name to the index
                if cls == tables.Pokemon:
                    # Pokemon need their form name added
                    # XXX kinda kludgy
                    add(row_key, row.full_name, None, u'en', u'us')

                    # If this is a default form, ALSO add the unadorned name,
                    # so 'Deoxys' alone will still do the right thing
                    if row.forme_name and not row.forme_base_pokemon_id:
                        add(row_key, row.name, None, u'en', u'us')
                else:
                    add(row_key, row.name, None, u'en', u'us')

                # Some things also have other languages' names
                # XXX other language form names..?
                for foreign_name in getattr(row, 'foreign_names', []):
                    moonspeak = foreign_name.name
                    if row.name == moonspeak:
                        # Don't add the English name again as a different
                        # language; no point and it makes spell results
                        # confusing
                        continue

                    add(row_key, moonspeak, foreign_name.language.name,
                        foreign_name.language.iso639,
                        foreign_name.language.iso3166)

                    # Add Roomaji too
                    if foreign_name.language.name == 'Japanese':
                        roomaji = romanize(foreign_name.name)
                        add(row_key, roomaji, u'Roomaji', u'ja', u'jp')

        writer.commit()

        # Construct and populate a spell-checker index.  Quicker to do it all
        # at once, as every call to add_* does a commit(), and those seem to be
        # expensive
        self.speller = self._make_speller(self.index.storage)
        self.speller.add_words(speller_entries)

    def normalize_name(self, name):
        """Strips irrelevant formatting junk from name input.

        Specifically: accents are removed, surrounding whitespace is
        stripped, and everything is lowercased.  Returns a unicode string.
        """
        # http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
        # Makes sense to me.  Decompose by Unicode rules, then remove combining
        # characters, then recombine.  I'm explicitly doing it this way instead
        # of testing combining() because Korean characters apparently
        # decompose!  But the results are considered letters, not combining
        # characters, so testing for Mn works well, and combining them again
        # makes them look right.
        nfkd_form = unicodedata.normalize('NFKD', unicode(name))
        name = u"".join(c for c in nfkd_form
                        if unicodedata.category(c) != 'Mn')
        name = unicodedata.normalize('NFC', name)

        name = name.strip()
        name = name.lower()

        return name

    def _apply_valid_types(self, name, valid_types):
        """Combines the enforced `valid_types` with any from the search string
        itself and builds the matching query term.

        For example, a name of 'a,b:foo' and valid_types of b,c will search for
        only `b`s named "foo".

        Returns `(name, merged_valid_types, term)`, where `name` has had any type
        prefix stripped, `merged_valid_types` combines the original
        `valid_types` with the type prefix, and `term` is a query term
        limited to just the allowed types.  If there are no usable type
        restrictions at all, `term` will be None.

        `valid_types` itself is never modified, and the returned list is
        never the same object as the one passed in.
        """

        # Work on a copy so neither the caller's list nor a shared default
        # can leak out through the return value
        valid_types = list(valid_types or [])

        # Remove any type prefix (pokemon:133) first
        user_valid_types = []
        if ':' in name:
            prefix_chunk, name = name.split(':', 1)
            name = name.strip()

            user_valid_types = [
                prefix.strip() for prefix in prefix_chunk.split(',')
            ]

        # Merge the valid types together.  Only types that appear in BOTH lists
        # may be used.
        # As a special case, if the user asked for types that are explicitly
        # forbidden, completely ignore what the user requested
        if user_valid_types and valid_types:
            # Intersect against the ENFORCED list, keeping the user's order
            # and dropping repeats
            combined_valid_types = []
            for valid_type in user_valid_types:
                if (valid_type in valid_types
                        and valid_type not in combined_valid_types):
                    combined_valid_types.append(valid_type)

            if not combined_valid_types:
                # No overlap!  Just use the enforced ones
                combined_valid_types = valid_types
        else:
            # One list or the other was blank, so just use the one that isn't
            combined_valid_types = valid_types + user_valid_types

        if not combined_valid_types:
            # No restrictions
            return name, [], None

        # Construct the term
        type_terms = []
        lang_terms = []
        for valid_type in combined_valid_types:
            if valid_type.startswith(u'@'):
                # @foo means: language must be foo.
                # Allow for either country or language codes
                lang_code = valid_type[1:]
                lang_terms.append(whoosh.query.Term(u'iso639', lang_code))
                lang_terms.append(whoosh.query.Term(u'iso3166', lang_code))
            else:
                # otherwise, this is a type/table name
                table_name = self._parse_table_name(valid_type)

                # Quietly ignore bogus valid_types; more likely to DTRT
                if table_name:
                    type_terms.append(whoosh.query.Term(u'table', table_name))

        # Combine both kinds of restriction
        all_terms = []
        if type_terms:
            all_terms.append(whoosh.query.Or(type_terms))
        if lang_terms:
            all_terms.append(whoosh.query.Or(lang_terms))

        if not all_terms:
            # Every restriction was bogus, so there is nothing to filter on
            return name, combined_valid_types, None

        return name, combined_valid_types, whoosh.query.And(all_terms)

    def _parse_table_name(self, name):
        """Takes a singular table name, table name, or table object and returns
        the table name.

        Returns None for a bogus name.
        """
        # Table object
        if hasattr(name, '__tablename__'):
            return name.__tablename__

        # Table name
        for table in self.indexed_tables.values():
            if name in (table.__tablename__, table.__singlename__):
                return table.__tablename__

        # Bogus.  Be nice and return dummy
        return None

    def _whoosh_records_to_results(self, records, exact=True):
        """Converts a list of whoosh's indexed records to LookupResult tuples
        containing database objects.

        Only the first record seen for each (table, row id) pair is kept.
        `exact` is copied into every result.
        """
        # XXX this 'exact' thing is getting kinda leaky.  would like a better
        # way to handle it, since only lookup() cares about fuzzy results
        seen = set()
        results = []
        for record in records:
            # Skip dupes
            seen_key = record['table'], record['row_id']
            if seen_key in seen:
                continue
            seen.add(seen_key)

            cls = self.indexed_tables[record['table']]
            obj = self.session.query(cls).get(record['row_id'])

            results.append(LookupResult(object=obj,
                                        indexed_name=record['name'],
                                        name=record['display_name'],
                                        language=record['language'],
                                        iso639=record['iso639'],
                                        iso3166=record['iso3166'],
                                        exact=exact))

        return results

    def lookup(self, input, valid_types=None, exact_only=False):
        """Attempts to find some sort of object, given a name.

        Returns a list of named (object, indexed_name, name, language,
        iso639, iso3166, exact) tuples.  `object` is a database object,
        `indexed_name` is the normalized name stored in the index, `name` is
        the name under which the object was found, `language` and the two
        isos are the name and country codes of the language in which the name
        was found, and `exact` is True iff this was an exact match.

        This function currently ONLY does fuzzy matching if there are no exact
        matches.

        Formes are not returned unless requested; "Shaymin" will return only
        grass Shaymin.

        Extraneous whitespace is removed with extreme prejudice.

        Recognizes:
        - Names: "Eevee", "Surf", "Run Away", "Payapa Berry", etc.
        - Foreign names: "Iibui", "Eivui"
        - Fuzzy names in whatever language: "Evee", "Ibui"
        - IDs: "133", "192", "250"
        - Wildcards: names containing `*` or `?`
        - The literal name "random", which returns a random result
        Also:
        - Type restrictions.  "type:psychic" will only return the type.  This
          is how to make ID lookup useful.  Multiple type specs can be entered
          with commas, as "move,item:1".
        - Language restrictions.  "@fr:charge" will only return Tackle, which
          is called "Charge" in French.  These can be combined with type
          restrictions, e.g., "@fr,move:charge".
        - Alternate formes can be specified merely like "wash rotom".

        `input`
            Name of the thing to look for.

        `valid_types`
            A list of type or language restrictions, e.g., `['pokemon',
            '@ja']`.  If this is provided, only results in one of the given
            tables will be returned.

        `exact_only`
            If True, only exact matches are returned.  If set to False (the
            default), and the provided `input` doesn't match anything exactly,
            spelling correction will be attempted.
        """

        name = self.normalize_name(input)
        exact = True

        # Pop off any type prefix and merge with valid_types
        name, merged_valid_types, type_term = \
            self._apply_valid_types(name, valid_types)

        # Random lookup
        if name == 'random':
            return self.random_lookup(valid_types=merged_valid_types)

        # Do different things depending what the query looks like
        # Note: Term objects do an exact match, so we don't have to worry about
        # a query parser tripping on weird characters in the input
        try:
            # Let Python try to convert to a number, so 0xff works
            # NOTE(review): with base 0, Python 2 reads a leading-zero string
            # such as "025" as octal -- confirm that is intended
            name_as_number = int(name, base=0)
        except ValueError:
            # Oh well
            name_as_number = None

        if '*' in name or '?' in name:
            exact_only = True
            query = whoosh.query.Wildcard(u'name', name)
        elif name_as_number is not None:
            # Don't spell-check numbers!
            exact_only = True
            query = whoosh.query.Term(u'row_id', unicode(name_as_number))
        else:
            # Not an integer
            query = whoosh.query.Term(u'name', name)

        if type_term:
            query = query & type_term

        ### Actual searching
        # Limits; result limits are constants, and intermediate results (before
        # duplicate items are stripped out) are capped at the result limit
        # times another constant.
        # Fuzzy are capped at 10, beyond which something is probably very
        # wrong.  Exact matches -- that is, wildcards and ids -- are far less
        # constrained.
        # Also, exact matches are sorted by name, since weight doesn't matter.
        sort_by = dict()
        if exact_only:
            max_results = self.MAX_EXACT_RESULTS
            sort_by['sortedby'] = (u'table', u'name')
        else:
            max_results = self.MAX_FUZZY_RESULTS

        searcher = self.index.searcher(weighting=LanguageWeighting())
        results = searcher.search(
            query,
            limit=int(max_results * self.INTERMEDIATE_FACTOR),
            **sort_by
        )

        # Look for some fuzzy matches if necessary
        if not exact_only and not results:
            exact = False

            fuzzy_query_parts = []
            fuzzy_weights = {}
            min_weight = None
            for suggestion, _, weight in self.speller.suggestions_and_scores(name):
                # Only allow the top 50% of scores; otherwise there will always
                # be a lot of trailing junk.  The cutoff is half the weight of
                # the first suggestion yielded.
                if min_weight is None:
                    min_weight = weight * 0.5
                elif weight < min_weight:
                    break

                fuzzy_query_parts.append(whoosh.query.Term('name', suggestion))
                fuzzy_weights[suggestion] = weight

            if not fuzzy_query_parts:
                # Nothing at all; don't try querying
                return []

            fuzzy_query = whoosh.query.Or(fuzzy_query_parts)
            if type_term:
                fuzzy_query = fuzzy_query & type_term

            searcher.weighting = LanguageWeighting(extra_weights=fuzzy_weights)
            results = searcher.search(fuzzy_query)

        ### Convert results to db objects
        objects = self._whoosh_records_to_results(results, exact=exact)

        # Truncate and return
        return objects[:max_results]

    def random_lookup(self, valid_types=None):
        """Returns a random lookup result from one of the provided
        `valid_types`.

        Entries of `valid_types` that are not recognized as tables are
        skipped; if none remain, every indexed table is eligible.  Returns an
        empty list if the eligible tables contain no rows at all.
        """

        table_names = []
        for valid_type in valid_types or ():
            table_name = self._parse_table_name(valid_type)
            # Skip anything not recognized.  Could be, say, a language code
            if table_name:
                table_names.append(table_name)

        if not table_names:
            # n.b.: It's possible we got a list of valid_types and none of them
            # were valid, but this function is guaranteed to return
            # *something*, so it politely selects from the entire index instead
            table_names = self.indexed_tables.keys()

        # Rather than create an array of many hundred items and pick randomly
        # from it, just pick a number up to the total number of potential
        # items, then pick randomly from that, and partition the whole range
        # into chunks.  This also avoids the slight problem that the index
        # contains more rows (for languages) for some items than others.
        # XXX ought to cache this (in the index?) if possible
        total = 0
        partitions = []
        for table_name in table_names:
            count = self.session.query(self.indexed_tables[table_name]).count()
            total += count
            partitions.append((table_name, count))

        if not total:
            # Nothing to choose from; randint(1, 0) would raise
            return []

        # Walk the partitions until the one containing n is reached; n ends
        # up as an offset within that table.
        # NOTE(review): n is then looked up as a row id, which presumably
        # relies on ids running from 1 to the row count -- confirm
        n = random.randint(1, total)
        for table_name, count in partitions:
            if n <= count:
                break
            n -= count

        return self.lookup(unicode(n), valid_types=[table_name])

    def prefix_lookup(self, prefix, valid_types=None):
        """Returns terms starting with the given exact prefix.

        Type prefixes are recognized, and the remaining text is passed through
        `normalize_name`; no other name munging is done.

        `valid_types` has the same meaning as for `lookup()`.
        """

        # Pop off any type prefix and merge with valid_types
        prefix, merged_valid_types, type_term = \
            self._apply_valid_types(prefix, valid_types)

        query = whoosh.query.Prefix(u'name', self.normalize_name(prefix))

        if type_term:
            query = query & type_term

        searcher = self.index.searcher()
        searcher.weighting = LanguageWeighting()
        # XXX no explicit result limit is passed here
        results = searcher.search(query)

        return self._whoosh_records_to_results(results)