Make user and code valid_types not interfere when one is language and the other is...
[zzz-pokedex.git] / pokedex / lookup.py
1 # encoding: utf8
2 import os, os.path
3 import random
4 import re
5 import shutil
6 import unicodedata
7
8 from sqlalchemy.sql import func
9 import whoosh
10 import whoosh.filedb.filestore
11 import whoosh.filedb.fileindex
12 import whoosh.index
13 from whoosh.qparser import QueryParser
14 import whoosh.scoring
15 import whoosh.spelling
16
17 from pokedex.util import namedtuple
18
19 from pokedex.db import connect
20 import pokedex.db.tables as tables
21 from pokedex.roomaji import romanize
22 from pokedex.defaults import get_default_index_dir
23
24 __all__ = ['PokedexLookup']
25
26
27 rx_is_number = re.compile('^\d+$')
28
29 LookupResult = namedtuple('LookupResult', [
30 'object', 'indexed_name', 'name', 'language', 'iso639', 'iso3166', 'exact',
31 ])
32
33 class UninitializedIndex(object):
34 class UninitializedIndexError(Exception):
35 pass
36
37 def __nonzero__(self):
38 """Dummy object should identify itself as False."""
39 return False
40
41 def __bool__(self):
42 """Python 3000 version of the above. Future-proofing rules!"""
43 return False
44
45 def __getattr__(self, *args, **kwargs):
46 raise self.UninitializedIndexError(
47 "The lookup index does not exist. Please use `pokedex setup` "
48 "or lookup.rebuild_index() to create it."
49 )
50
51 class LanguageWeighting(whoosh.scoring.Weighting):
52 """A scoring class that forces otherwise-equal English results to come
53 before foreign results.
54 """
55
56 def __init__(self, extra_weights={}, *args, **kwargs):
57 """`extra_weights` may be a dictionary of weights which will be
58 factored in.
59
60 Intended for use with spelling corrections, which come along with their
61 own weightings.
62 """
63 self.extra_weights = extra_weights
64 super(LanguageWeighting, self).__init__(*args, **kwargs)
65
66 def score(self, searcher, fieldnum, text, docnum, weight, QTF=1):
67 doc = searcher.stored_fields(docnum)
68
69 # Apply extra weight
70 weight = weight * self.extra_weights.get(text, 1.0)
71
72 if doc['language'] == None:
73 # English (well, "default"); leave it at 1
74 return weight
75 elif doc['language'] == u'Roomaji':
76 # Give Roomaji a little boost; it's most likely to be searched
77 return weight * 0.9
78 else:
79 # Everything else can drop down the totem pole
80 return weight * 0.8
81
82
83 class PokedexLookup(object):
84 MAX_FUZZY_RESULTS = 10
85 MAX_EXACT_RESULTS = 43
86 INTERMEDIATE_FACTOR = 2
87
88 # The speller only checks how much the input matches a word; there can be
89 # all manner of extra unmatched junk, and it won't affect the weighting.
90 # To compensate, greatly boost the weighting of matches at the beginning
91 # and end, so nearly-full-word-matches are much better
92 SPELLER_OPTIONS = dict(booststart=10.0, boostend=9.0)
93
94 # Dictionary of table name => table class.
95 # Need the table name so we can get the class from the table name after we
96 # retrieve something from the index
97 indexed_tables = dict(
98 (cls.__tablename__, cls)
99 for cls in (
100 tables.Ability,
101 tables.Item,
102 tables.Location,
103 tables.Move,
104 tables.Nature,
105 tables.Pokemon,
106 tables.Type,
107 )
108 )
109
110
111 def __init__(self, directory=None, session=None):
112 """Opens the whoosh index stored in the named directory. If the index
113 doesn't already exist, it will be created.
114
115 `directory`
116 Directory containing the index. Defaults to a location within the
117 `pokedex` egg directory.
118
119 `session`
120 Used for creating the index and retrieving objects. Defaults to an
121 attempt to connect to the default SQLite database installed by
122 `pokedex setup`.
123 """
124
125 # By the time this returns, self.index, self.speller, and self.session
126 # must be set
127
128 # If a directory was not given, use the default
129 if directory is None:
130 directory = get_default_index_dir()
131
132 self.directory = directory
133
134 if session:
135 self.session = session
136 else:
137 self.session = connect()
138
139 # Attempt to open or create the index
140 if not os.path.exists(directory) or not os.listdir(directory):
141 # Directory doesn't exist OR is empty; caller needs to use
142 # rebuild_index before doing anything. Provide a dummy object that
143 # complains when used
144 self.index = UninitializedIndex()
145 self.speller = UninitializedIndex()
146 return
147
148 # Otherwise, already exists; should be an index! Bam, done.
149 # Note that this will explode if the directory exists but doesn't
150 # contain an index; that's a feature
151 try:
152 self.index = whoosh.index.open_dir(directory, indexname='MAIN')
153 except whoosh.index.EmptyIndexError:
154 raise IOError(
155 "The index directory already contains files. "
156 "Please use a dedicated directory for the lookup index."
157 )
158
159 # Create speller, and done
160 spell_store = whoosh.filedb.filestore.FileStorage(directory)
161 self.speller = whoosh.spelling.SpellChecker(spell_store,
162 **self.SPELLER_OPTIONS)
163
164
165 def rebuild_index(self):
166 """Creates the index from scratch."""
167
168 schema = whoosh.fields.Schema(
169 name=whoosh.fields.ID(stored=True),
170 table=whoosh.fields.ID(stored=True),
171 row_id=whoosh.fields.ID(stored=True),
172 language=whoosh.fields.STORED,
173 iso639=whoosh.fields.ID(stored=True),
174 iso3166=whoosh.fields.ID(stored=True),
175 display_name=whoosh.fields.STORED, # non-lowercased name
176 )
177
178 if not os.path.exists(self.directory):
179 os.mkdir(self.directory)
180
181 self.index = whoosh.index.create_in(self.directory, schema=schema,
182 indexname='MAIN')
183 writer = self.index.writer()
184
185 # Index every name in all our tables of interest
186 speller_entries = set()
187 for cls in self.indexed_tables.values():
188 q = self.session.query(cls)
189
190 for row in q.yield_per(5):
191 row_key = dict(table=unicode(cls.__tablename__),
192 row_id=unicode(row.id))
193
194 def add(name, language, iso639, iso3166):
195 normalized_name = self.normalize_name(name)
196
197 writer.add_document(
198 name=normalized_name, display_name=name,
199 language=language, iso639=iso639, iso3166=iso3166,
200 **row_key
201 )
202
203 speller_entries.add(normalized_name)
204
205
206 # Add the basic English name to the index
207 if cls == tables.Pokemon:
208 # Pokémon need their form name added
209 # XXX kinda kludgy
210 add(row.full_name, None, u'en', u'us')
211
212 # If this is a default form, ALSO add the unadorned name,
213 # so 'Deoxys' alone will still do the right thing
214 if row.forme_name and not row.forme_base_pokemon_id:
215 add(row.name, None, u'en', u'us')
216 else:
217 add(row.name, None, u'en', u'us')
218
219 # Some things also have other languages' names
220 # XXX other language form names..?
221 for foreign_name in getattr(row, 'foreign_names', []):
222 moonspeak = foreign_name.name
223 if row.name == moonspeak:
224 # Don't add the English name again as a different
225 # language; no point and it makes spell results
226 # confusing
227 continue
228
229 add(moonspeak, foreign_name.language.name,
230 foreign_name.language.iso639,
231 foreign_name.language.iso3166)
232
233 # Add Roomaji too
234 if foreign_name.language.name == 'Japanese':
235 roomaji = romanize(foreign_name.name)
236 add(roomaji, u'Roomaji', u'ja', u'jp')
237
238 writer.commit()
239
240 # Construct and populate a spell-checker index. Quicker to do it all
241 # at once, as every call to add_* does a commit(), and those seem to be
242 # expensive
243 self.speller = whoosh.spelling.SpellChecker(self.index.storage, mingram=2,
244 **self.SPELLER_OPTIONS)
245 self.speller.add_words(speller_entries)
246
247
248 def normalize_name(self, name):
249 """Strips irrelevant formatting junk from name input.
250
251 Specifically: everything is lowercased, and accents are removed.
252 """
253 # http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
254 # Makes sense to me. Decompose by Unicode rules, then remove combining
255 # characters, then recombine. I'm explicitly doing it this way instead
256 # of testing combining() because Korean characters apparently
257 # decompose! But the results are considered letters, not combining
258 # characters, so testing for Mn works well, and combining them again
259 # makes them look right.
260 nkfd_form = unicodedata.normalize('NFKD', unicode(name))
261 name = u"".join(c for c in nkfd_form
262 if unicodedata.category(c) != 'Mn')
263 name = unicodedata.normalize('NFC', name)
264
265 name = name.strip()
266 name = name.lower()
267
268 return name
269
270
271 def _apply_valid_types(self, name, valid_types):
272 """Combines the enforced `valid_types` with any from the search string
273 itself and updates the query.
274
275 For example, a name of 'a,b:foo' and valid_types of b,c will search for
276 only `b`s named "foo".
277
278 Returns `(name, merged_valid_types, term)`, where `name` has had any type
279 prefix stripped, `merged_valid_types` combines the original
280 `valid_types` with the type prefix, and `term` is a query term for
281 limited to just the allowed types. If there are no type restrictions
282 at all, `term` will be None.
283 """
284
285 # Remove any type prefix (pokemon:133) first
286 user_valid_types = []
287 if ':' in name:
288 prefix_chunk, name = name.split(':', 1)
289 name = name.strip()
290
291 prefixes = prefix_chunk.split(',')
292 user_valid_types = [_.strip() for _ in prefixes]
293
294 # Merge the valid types together. Only types that appear in BOTH lists
295 # may be used.
296 # As a special case, if the user asked for types that are explicitly
297 # forbidden, completely ignore what the user requested.
298 # And, just to complicate matters: "type" and language need to be
299 # considered separately.
300 def merge_requirements(func):
301 user = filter(func, user_valid_types)
302 system = filter(func, valid_types)
303
304 if user and system:
305 merged = list(set(user) & set(system))
306 if merged:
307 return merged
308 else:
309 # No overlap; use the system restrictions
310 return system
311 else:
312 # One or the other is blank; use the one that's not
313 return user or system
314
315 # @foo means language must be foo; otherwise it's a table name
316 lang_requirements = merge_requirements(lambda req: req[0] == u'@')
317 type_requirements = merge_requirements(lambda req: req[0] != u'@')
318 all_requirements = lang_requirements + type_requirements
319
320 # Construct the term
321 lang_terms = []
322 for lang in lang_requirements:
323 # Allow for either country or language codes
324 lang_code = lang[1:]
325 lang_terms.append(whoosh.query.Term(u'iso639', lang_code))
326 lang_terms.append(whoosh.query.Term(u'iso3166', lang_code))
327
328 type_terms = []
329 for type in type_requirements:
330 table_name = self._parse_table_name(type)
331
332 # Quietly ignore bogus valid_types; more likely to DTRT
333 if table_name:
334 type_terms.append(whoosh.query.Term(u'table', table_name))
335
336 # Combine both kinds of restriction
337 all_terms = []
338 if type_terms:
339 all_terms.append(whoosh.query.Or(type_terms))
340 if lang_terms:
341 all_terms.append(whoosh.query.Or(lang_terms))
342
343 return name, all_requirements, whoosh.query.And(all_terms)
344
345
346 def _parse_table_name(self, name):
347 """Takes a singular table name, table name, or table object and returns
348 the table name.
349
350 Returns None for a bogus name.
351 """
352 # Table object
353 if hasattr(name, '__tablename__'):
354 return getattr(name, '__tablename__')
355
356 # Table name
357 for table in self.indexed_tables.values():
358 if name in (table.__tablename__, table.__singlename__):
359 return table.__tablename__
360
361 # Bogus. Be nice and return dummy
362 return None
363
364 def _whoosh_records_to_results(self, records, exact=True):
365 """Converts a list of whoosh's indexed records to LookupResult tuples
366 containing database objects.
367 """
368 # XXX this 'exact' thing is getting kinda leaky. would like a better
369 # way to handle it, since only lookup() cares about fuzzy results
370 seen = {}
371 results = []
372 for record in records:
373 # Skip dupes
374 seen_key = record['table'], record['row_id']
375 if seen_key in seen:
376 continue
377 seen[seen_key] = True
378
379 cls = self.indexed_tables[record['table']]
380 obj = self.session.query(cls).get(record['row_id'])
381
382 results.append(LookupResult(object=obj,
383 indexed_name=record['name'],
384 name=record['display_name'],
385 language=record['language'],
386 iso639=record['iso639'],
387 iso3166=record['iso3166'],
388 exact=exact))
389
390 return results
391
392
393 def lookup(self, input, valid_types=[], exact_only=False):
394 """Attempts to find some sort of object, given a name.
395
396 Returns a list of named (object, name, language, iso639, iso3166,
397 exact) tuples. `object` is a database object, `name` is the name under
398 which the object was found, `language` and the two isos are the name
399 and country codes of the language in which the name was found, and
400 `exact` is True iff this was an exact match.
401
402 This function currently ONLY does fuzzy matching if there are no exact
403 matches.
404
405 Formes are not returned unless requested; "Shaymin" will return only
406 grass Shaymin.
407
408 Extraneous whitespace is removed with extreme prejudice.
409
410 Recognizes:
411 - Names: "Eevee", "Surf", "Run Away", "Payapa Berry", etc.
412 - Foreign names: "Iibui", "Eivui"
413 - Fuzzy names in whatever language: "Evee", "Ibui"
414 - IDs: "133", "192", "250"
415 Also:
416 - Type restrictions. "type:psychic" will only return the type. This
417 is how to make ID lookup useful. Multiple type specs can be entered
418 with commas, as "move,item:1".
419 - Language restrictions. "@fr:charge" will only return Tackle, which
420 is called "Charge" in French. These can be combined with type
421 restrictions, e.g., "@fr,move:charge".
422 - Alternate formes can be specified merely like "wash rotom".
423
424 `input`
425 Name of the thing to look for.
426
427 `valid_types`
428 A list of type or language restrictions, e.g., `['pokemon',
429 '@ja']`. If this is provided, only results in one of the given
430 tables will be returned.
431
432 `exact_only`
433 If True, only exact matches are returned. If set to False (the
434 default), and the provided `name` doesn't match anything exactly,
435 spelling correction will be attempted.
436 """
437
438 name = self.normalize_name(input)
439 exact = True
440 form = None
441
442 # Pop off any type prefix and merge with valid_types
443 name, merged_valid_types, type_term = \
444 self._apply_valid_types(name, valid_types)
445
446 # Random lookup
447 if name == 'random':
448 return self.random_lookup(valid_types=merged_valid_types)
449
450 # Do different things depending what the query looks like
451 # Note: Term objects do an exact match, so we don't have to worry about
452 # a query parser tripping on weird characters in the input
453 try:
454 # Let Python try to convert to a number, so 0xff works
455 name_as_number = int(name, base=0)
456 except ValueError:
457 # Oh well
458 name_as_number = None
459
460 if '*' in name or '?' in name:
461 exact_only = True
462 query = whoosh.query.Wildcard(u'name', name)
463 elif name_as_number is not None:
464 # Don't spell-check numbers!
465 exact_only = True
466 query = whoosh.query.Term(u'row_id', unicode(name_as_number))
467 else:
468 # Not an integer
469 query = whoosh.query.Term(u'name', name)
470
471 if type_term:
472 query = query & type_term
473
474
475 ### Actual searching
476 # Limits; result limits are constants, and intermediate results (before
477 # duplicate items are stripped out) are capped at the result limit
478 # times another constant.
479 # Fuzzy are capped at 10, beyond which something is probably very
480 # wrong. Exact matches -- that is, wildcards and ids -- are far less
481 # constrained.
482 # Also, exact matches are sorted by name, since weight doesn't matter.
483 sort_by = dict()
484 if exact_only:
485 max_results = self.MAX_EXACT_RESULTS
486 sort_by['sortedby'] = (u'table', u'name')
487 else:
488 max_results = self.MAX_FUZZY_RESULTS
489
490 searcher = self.index.searcher(weighting=LanguageWeighting())
491 results = searcher.search(
492 query,
493 limit=int(max_results * self.INTERMEDIATE_FACTOR),
494 **sort_by
495 )
496
497 # Look for some fuzzy matches if necessary
498 if not exact_only and not results:
499 exact = False
500 results = []
501
502 fuzzy_query_parts = []
503 fuzzy_weights = {}
504 min_weight = [None]
505 for suggestion, _, weight in self.speller.suggestions_and_scores(name):
506 # Only allow the top 50% of scores; otherwise there will always
507 # be a lot of trailing junk
508 if min_weight[0] is None:
509 min_weight[0] = weight * 0.5
510 elif weight < min_weight[0]:
511 break
512
513 fuzzy_query_parts.append(whoosh.query.Term('name', suggestion))
514 fuzzy_weights[suggestion] = weight
515
516 if not fuzzy_query_parts:
517 # Nothing at all; don't try querying
518 return []
519
520 fuzzy_query = whoosh.query.Or(fuzzy_query_parts)
521 if type_term:
522 fuzzy_query = fuzzy_query & type_term
523
524 searcher.weighting = LanguageWeighting(extra_weights=fuzzy_weights)
525 results = searcher.search(fuzzy_query)
526
527 ### Convert results to db objects
528 objects = self._whoosh_records_to_results(results, exact=exact)
529
530 # Truncate and return
531 return objects[:max_results]
532
533
534 def random_lookup(self, valid_types=[]):
535 """Returns a random lookup result from one of the provided
536 `valid_types`.
537 """
538
539 table_names = []
540 for valid_type in valid_types:
541 table_name = self._parse_table_name(valid_type)
542 # Skip anything not recognized. Could be, say, a language code
543 if table_name:
544 table_names.append(table_name)
545
546 if not table_names:
547 # n.b.: It's possible we got a list of valid_types and none of them
548 # were valid, but this function is guaranteed to return
549 # *something*, so it politely selects from the entire index instead
550 table_names = self.indexed_tables.keys()
551
552 # Rather than create an array of many hundred items and pick randomly
553 # from it, just pick a number up to the total number of potential
554 # items, then pick randomly from that, and partition the whole range
555 # into chunks. This also avoids the slight problem that the index
556 # contains more rows (for languages) for some items than others.
557 # XXX ought to cache this (in the index?) if possible
558 total = 0
559 partitions = []
560 for table_name in table_names:
561 count = self.session.query(self.indexed_tables[table_name]).count()
562 total += count
563 partitions.append((table_name, count))
564
565 n = random.randint(1, total)
566 while n > partitions[0][1]:
567 n -= partitions[0][1]
568 partitions.pop(0)
569
570 return self.lookup(unicode(n), valid_types=[ partitions[0][0] ])
571
572 def prefix_lookup(self, prefix, valid_types=[]):
573 """Returns terms starting with the given exact prefix.
574
575 Type prefixes are recognized, but no other name munging is done.
576 """
577
578 # Pop off any type prefix and merge with valid_types
579 prefix, merged_valid_types, type_term = \
580 self._apply_valid_types(prefix, valid_types)
581
582 query = whoosh.query.Prefix(u'name', self.normalize_name(prefix))
583
584 if type_term:
585 query = query & type_term
586
587 searcher = self.index.searcher()
588 searcher.weighting = LanguageWeighting()
589 results = searcher.search(query) # XXX , limit=self.MAX_LOOKUP_RESULTS)
590
591 return self._whoosh_records_to_results(results)