Allow restricting lookup by language, with a @ja: prefix. #90
[zzz-pokedex.git] / pokedex / lookup.py
# encoding: utf8
import os, os.path
import random
import re
import shutil
import unicodedata

from sqlalchemy.sql import func
import whoosh
import whoosh.fields
import whoosh.filedb.filestore
import whoosh.filedb.fileindex
import whoosh.index
from whoosh.qparser import QueryParser
import whoosh.query
import whoosh.scoring
import whoosh.spelling

from pokedex.util import namedtuple

from pokedex.db import connect
import pokedex.db.tables as tables
from pokedex.roomaji import romanize
from pokedex.defaults import get_default_index_dir

__all__ = ['PokedexLookup']


rx_is_number = re.compile(r'^\d+$')

LookupResult = namedtuple('LookupResult', [
    'object', 'indexed_name', 'name', 'language', 'iso639', 'iso3166', 'exact',
])

class UninitializedIndex(object):
    class UninitializedIndexError(Exception):
        pass

    def __nonzero__(self):
        """Dummy object should identify itself as False."""
        return False

    def __bool__(self):
        """Python 3000 version of the above. Future-proofing rules!"""
        return False

    def __getattr__(self, *args, **kwargs):
        raise self.UninitializedIndexError(
            "The lookup index does not exist. Please use `pokedex setup` "
            "or lookup.rebuild_index() to create it."
        )

class LanguageWeighting(whoosh.scoring.Weighting):
    """A scoring class that forces otherwise-equal English results to come
    before foreign results.
    """

    def __init__(self, extra_weights={}, *args, **kwargs):
        """`extra_weights` may be a dictionary of weights which will be
        factored in.

        Intended for use with spelling corrections, which come along with their
        own weightings.
        """
        self.extra_weights = extra_weights
        super(LanguageWeighting, self).__init__(*args, **kwargs)

    def score(self, searcher, fieldnum, text, docnum, weight, QTF=1):
        doc = searcher.stored_fields(docnum)

        # Apply extra weight
        weight = weight * self.extra_weights.get(text, 1.0)

        if doc['language'] is None:
            # English (well, "default"); leave it at 1
            return weight
        elif doc['language'] == u'Roomaji':
            # Give Roomaji a little boost; it's most likely to be searched
            return weight * 0.9
        else:
            # Everything else can drop down the totem pole
            return weight * 0.8

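# A rough sense of the ordering this produces, using the multipliers in
# score() above (illustrative arithmetic, not real search output): for a raw
# term weight of 2.0, a default-language name scores 2.0, a Roomaji name 1.8,
# and any other language 1.6, so ties break toward English, then Roomaji.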

class PokedexLookup(object):
    INTERMEDIATE_LOOKUP_RESULTS = 25
    MAX_LOOKUP_RESULTS = 10

    # The speller only checks how much the input matches a word; there can be
    # all manner of extra unmatched junk, and it won't affect the weighting.
    # To compensate, greatly boost the weighting of matches at the beginning
    # and end, so nearly-full-word-matches are much better
    SPELLER_OPTIONS = dict(booststart=10.0, boostend=9.0)

    # Dictionary of table name => table class.
    # Need the table name so we can get the class from the table name after we
    # retrieve something from the index
    indexed_tables = dict(
        (cls.__tablename__, cls)
        for cls in (
            tables.Ability,
            tables.Item,
            tables.Location,
            tables.Move,
            tables.Nature,
            tables.Pokemon,
            tables.Type,
        )
    )


    def __init__(self, directory=None, session=None):
        """Opens the whoosh index stored in the named directory. If the index
        doesn't already exist, this provides a stub instead; call
        `rebuild_index()` before doing any lookups.

        `directory`
            Directory containing the index. Defaults to a location within the
            `pokedex` egg directory.

        `session`
            Used for creating the index and retrieving objects. Defaults to an
            attempt to connect to the default SQLite database installed by
            `pokedex setup`.
        """

        # By the time this returns, self.index, self.speller, and self.session
        # must be set

        # If a directory was not given, use the default
        if directory is None:
            directory = get_default_index_dir()

        self.directory = directory

        if session:
            self.session = session
        else:
            self.session = connect()

        # Attempt to open the index
        if not os.path.exists(directory) or not os.listdir(directory):
            # Directory doesn't exist OR is empty; caller needs to use
            # rebuild_index before doing anything. Provide a dummy object that
            # complains when used
            self.index = UninitializedIndex()
            self.speller = UninitializedIndex()
            return

        # Otherwise, already exists; should be an index! Bam, done.
        # Note that this will explode if the directory exists but doesn't
        # contain an index; that's a feature
        try:
            self.index = whoosh.index.open_dir(directory, indexname='MAIN')
        except whoosh.index.EmptyIndexError:
            raise IOError(
                "The index directory contains files that are not a lookup "
                "index. Please use a dedicated directory for the lookup index."
            )

        # Create speller, and done
        spell_store = whoosh.filedb.filestore.FileStorage(directory)
        self.speller = whoosh.spelling.SpellChecker(spell_store,
            **self.SPELLER_OPTIONS)

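    # Construction sketch (illustrative; the directory path here is a
    # placeholder, not a value from this file):
    #
    #     lookup = PokedexLookup()                    # default index dir and DB
    #     lookup = PokedexLookup(directory='/tmp/pokedex-index',
    #                            session=connect())   # explicit index dir + session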

    def rebuild_index(self):
        """Creates the index from scratch."""

        schema = whoosh.fields.Schema(
            name=whoosh.fields.ID(stored=True),
            table=whoosh.fields.ID(stored=True),
            row_id=whoosh.fields.ID(stored=True),
            language=whoosh.fields.STORED,
            iso639=whoosh.fields.ID(stored=True),
            iso3166=whoosh.fields.ID(stored=True),
            display_name=whoosh.fields.STORED,  # non-lowercased name
        )

        if not os.path.exists(self.directory):
            os.mkdir(self.directory)

        self.index = whoosh.index.create_in(self.directory, schema=schema,
            indexname='MAIN')
        writer = self.index.writer()

        # Index every name in all our tables of interest
        speller_entries = set()
        for cls in self.indexed_tables.values():
            q = self.session.query(cls)

            for row in q.yield_per(5):
                row_key = dict(table=unicode(cls.__tablename__),
                               row_id=unicode(row.id))

                def add(name, language, iso639, iso3166):
                    normalized_name = self.normalize_name(name)

                    writer.add_document(
                        name=normalized_name, display_name=name,
                        language=language, iso639=iso639, iso3166=iso3166,
                        **row_key
                    )

                    speller_entries.add(normalized_name)


                # Add the basic English name to the index
                if cls == tables.Pokemon:
                    # Pokémon need their form name added
                    # XXX kinda kludgy
                    add(row.full_name, None, u'en', u'us')

                    # If this is a default form, ALSO add the unadorned name,
                    # so 'Deoxys' alone will still do the right thing
                    if row.forme_name and not row.forme_base_pokemon_id:
                        add(row.name, None, u'en', u'us')
                else:
                    add(row.name, None, u'en', u'us')

                # Some things also have other languages' names
                # XXX other language form names..?
                for foreign_name in getattr(row, 'foreign_names', []):
                    moonspeak = foreign_name.name
                    if row.name == moonspeak:
                        # Don't add the English name again as a different
                        # language; no point and it makes spell results
                        # confusing
                        continue

                    add(moonspeak, foreign_name.language.name,
                        foreign_name.language.iso639,
                        foreign_name.language.iso3166)

                    # Add Roomaji too
                    if foreign_name.language.name == 'Japanese':
                        roomaji = romanize(foreign_name.name)
                        add(roomaji, u'Roomaji', u'ja', u'jp')

        writer.commit()

        # Construct and populate a spell-checker index. Quicker to do it all
        # at once, as every call to add_* does a commit(), and those seem to be
        # expensive
        self.speller = whoosh.spelling.SpellChecker(self.index.storage, mingram=2,
            **self.SPELLER_OPTIONS)
        self.speller.add_words(speller_entries)

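    # Typical first-run flow (a sketch; `index_dir` is a placeholder). An
    # empty or missing directory gives the UninitializedIndex stub, and
    # rebuild_index() then builds the whoosh index plus the spell-checker
    # from the database:
    #
    #     lookup = PokedexLookup(directory=index_dir)
    #     if not lookup.index:
    #         lookup.rebuild_index()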

    def normalize_name(self, name):
        """Strips irrelevant formatting junk from name input.

        Specifically: everything is lowercased, and accents are removed.
        """
        # http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
        # Makes sense to me. Decompose by Unicode rules, then remove combining
        # characters, then recombine. I'm explicitly doing it this way instead
        # of testing combining() because Korean characters apparently
        # decompose! But the results are considered letters, not combining
        # characters, so testing for Mn works well, and combining them again
        # makes them look right.
        nkfd_form = unicodedata.normalize('NFKD', unicode(name))
        name = u"".join(c for c in nkfd_form
                        if unicodedata.category(c) != 'Mn')
        name = unicodedata.normalize('NFC', name)

        name = name.strip()
        name = name.lower()

        return name

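    # What normalize_name does in practice (illustrative values, not cases
    # taken from this file; `lookup` stands for any PokedexLookup instance):
    #
    #     lookup.normalize_name(u'Pokémon')      # -> u'pokemon'
    #     lookup.normalize_name(u'  Mr. Mime ')  # -> u'mr. mime'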

    def _apply_valid_types(self, name, valid_types):
        """Combines the enforced `valid_types` with any from the search string
        itself and updates the query.

        For example, a name of 'a,b:foo' and valid_types of b,c will search for
        only `b`s named "foo".

        Returns `(name, merged_valid_types, term)`, where `name` has had any
        type prefix stripped, `merged_valid_types` combines the original
        `valid_types` with the type prefix, and `term` is a query term that
        restricts results to the allowed types. If there are no type
        restrictions at all, `term` will be None.
        """

        # Remove any type prefix (pokemon:133) first
        user_valid_types = []
        if ':' in name:
            prefix_chunk, name = name.split(':', 1)
            name = name.strip()

            prefixes = prefix_chunk.split(',')
            user_valid_types = [_.strip() for _ in prefixes]

        # Merge the valid types together. Only types that appear in BOTH lists
        # may be used.
        # As a special case, if the user asked for types that are explicitly
        # forbidden, completely ignore what the user requested
        combined_valid_types = []
        if user_valid_types and valid_types:
            combined_valid_types = list(
                set(user_valid_types) & set(valid_types)
            )

            if not combined_valid_types:
                # No overlap! Just use the enforced ones
                combined_valid_types = valid_types
        else:
            # One list or the other was blank, so just use the one that isn't
            combined_valid_types = valid_types + user_valid_types

        if not combined_valid_types:
            # No restrictions
            return name, [], None

        # Construct the term
        type_terms = []
        lang_terms = []
        final_valid_types = []
        for valid_type in combined_valid_types:
            if valid_type.startswith(u'@'):
                # @foo means: language must be foo.
                # Allow for either country or language codes
                lang_code = valid_type[1:]
                lang_terms.append(whoosh.query.Term(u'iso639', lang_code))
                lang_terms.append(whoosh.query.Term(u'iso3166', lang_code))
            else:
                # otherwise, this is a type/table name
                table_name = self._parse_table_name(valid_type)

                # Quietly ignore bogus valid_types; more likely to DTRT
                if table_name:
                    type_terms.append(whoosh.query.Term(u'table', table_name))

        # Combine both kinds of restriction
        all_terms = []
        if type_terms:
            all_terms.append(whoosh.query.Or(type_terms))
        if lang_terms:
            all_terms.append(whoosh.query.Or(lang_terms))

        return name, combined_valid_types, whoosh.query.And(all_terms)

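    # How the prefix parsing plays out (a sketch; these calls and their
    # results are illustrative, with the whoosh terms paraphrased):
    #
    #     self._apply_valid_types(u'move,item:charge', ['item', 'pokemon'])
    #     # -> (u'charge', ['item'], <term restricting results to items>)
    #
    #     self._apply_valid_types(u'@ja:foo', [])
    #     # -> (u'foo', ['@ja'], <term matching iso639 'ja' OR iso3166 'ja'>)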

    def _parse_table_name(self, name):
        """Takes a table name (singular or plural) or a table object, and
        returns the canonical table name.

        Returns None for a bogus name.
        """
        # Table object
        if hasattr(name, '__tablename__'):
            return getattr(name, '__tablename__')

        # Table name
        for table in self.indexed_tables.values():
            if name in (table.__tablename__, table.__singlename__):
                return table.__tablename__

        # Bogus. Be nice and return None instead of blowing up
        return None

    def _whoosh_records_to_results(self, records, exact=True):
        """Converts a list of whoosh's indexed records to LookupResult tuples
        containing database objects.
        """
        # XXX this 'exact' thing is getting kinda leaky. would like a better
        # way to handle it, since only lookup() cares about fuzzy results
        seen = {}
        results = []
        for record in records:
            # Skip dupes
            seen_key = record['table'], record['row_id']
            if seen_key in seen:
                continue
            seen[seen_key] = True

            cls = self.indexed_tables[record['table']]
            obj = self.session.query(cls).get(record['row_id'])

            results.append(LookupResult(object=obj,
                                        indexed_name=record['name'],
                                        name=record['display_name'],
                                        language=record['language'],
                                        iso639=record['iso639'],
                                        iso3166=record['iso3166'],
                                        exact=exact))

        return results


    def lookup(self, input, valid_types=[], exact_only=False):
        """Attempts to find some sort of object, given a name.

        Returns a list of named (object, indexed_name, name, language, iso639,
        iso3166, exact) tuples. `object` is a database object, `name` is the
        name under which the object was found, `language` and the two isos are
        the name and country codes of the language in which the name was
        found, and `exact` is True iff this was an exact match.

        This function currently ONLY does fuzzy matching if there are no exact
        matches.

        Formes are not returned unless requested; "Shaymin" will return only
        grass Shaymin.

        Extraneous whitespace is removed with extreme prejudice.

        Recognizes:
        - Names: "Eevee", "Surf", "Run Away", "Payapa Berry", etc.
        - Foreign names: "Iibui", "Eivui"
        - Fuzzy names in whatever language: "Evee", "Ibui"
        - IDs: "133", "192", "250"
        Also:
        - Type restrictions. "type:psychic" will only return the type. This
          is how to make ID lookup useful. Multiple type specs can be entered
          with commas, as "move,item:1".
        - Language restrictions. "@fr:charge" will only return Tackle, which
          is called "Charge" in French. These can be combined with type
          restrictions, e.g., "@fr,move:charge".
        - Alternate formes can be specified merely like "wash rotom".

        `input`
            Name of the thing to look for.

        `valid_types`
            A list of type or language restrictions, e.g., `['pokemon',
            '@ja']`. If this is provided, only results in one of the given
            tables (or languages) will be returned.

        `exact_only`
            If True, only exact matches are returned. If set to False (the
            default), and the provided `input` doesn't match anything exactly,
            spelling correction will be attempted.
        """

        name = self.normalize_name(input)
        exact = True
        form = None

        # Pop off any type prefix and merge with valid_types
        name, merged_valid_types, type_term = \
            self._apply_valid_types(name, valid_types)

        # Random lookup
        if name == 'random':
            return self.random_lookup(valid_types=merged_valid_types)

        # Do different things depending what the query looks like
        # Note: Term objects do an exact match, so we don't have to worry about
        # a query parser tripping on weird characters in the input
        try:
            # Let Python try to convert to a number, so 0xff works
            name_as_number = int(name, base=0)
        except ValueError:
            # Oh well
            name_as_number = None

        if '*' in name or '?' in name:
            exact_only = True
            query = whoosh.query.Wildcard(u'name', name)
        elif name_as_number is not None:
            # Don't spell-check numbers!
            exact_only = True
            query = whoosh.query.Term(u'row_id', unicode(name_as_number))
        else:
            # Not an integer
            query = whoosh.query.Term(u'name', name)

        if type_term:
            query = query & type_term


        ### Actual searching
        searcher = self.index.searcher()
        # XXX is this kosher? docs say search() takes a weighting arg, but it
        # certainly does not
        searcher.weighting = LanguageWeighting()
        results = searcher.search(query,
                                  limit=self.INTERMEDIATE_LOOKUP_RESULTS)

        # Look for some fuzzy matches if necessary
        if not exact_only and not results:
            exact = False
            results = []

            fuzzy_query_parts = []
            fuzzy_weights = {}
            min_weight = [None]
            for suggestion, _, weight in self.speller.suggestions_and_scores(name):
                # Only allow the top 50% of scores; otherwise there will always
                # be a lot of trailing junk
                if min_weight[0] is None:
                    min_weight[0] = weight * 0.5
                elif weight < min_weight[0]:
                    break

                fuzzy_query_parts.append(whoosh.query.Term('name', suggestion))
                fuzzy_weights[suggestion] = weight

            if not fuzzy_query_parts:
                # Nothing at all; don't try querying
                return []

            fuzzy_query = whoosh.query.Or(fuzzy_query_parts)
            if type_term:
                fuzzy_query = fuzzy_query & type_term

            searcher.weighting = LanguageWeighting(extra_weights=fuzzy_weights)
            results = searcher.search(fuzzy_query)

        ### Convert results to db objects
        objects = self._whoosh_records_to_results(results, exact=exact)

        # Only return up to 10 matches; beyond that, something is wrong. We
        # strip out duplicate entries above, so it's remotely possible that we
        # should have more than 10 here and lost a few. The speller returns 25
        # to give us some padding, and should avoid that problem. Not a big
        # deal if we lose the 25th-most-likely match anyway.
        return objects[:self.MAX_LOOKUP_RESULTS]

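    # Usage sketch, reusing the docstring's own examples (what comes back
    # depends entirely on the data loaded by `pokedex setup`):
    #
    #     lookup = PokedexLookup()
    #     lookup.lookup(u'Eevee')             # exact name match
    #     lookup.lookup(u'@fr,move:charge')   # language + table restriction
    #     lookup.lookup(u'pokemon:133')       # ID lookup, limited to one table
    #     lookup.lookup(u'Evee')              # falls back to spelling correction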

    def random_lookup(self, valid_types=[]):
        """Returns a random lookup result from one of the provided
        `valid_types`.
        """

        table_names = []
        for valid_type in valid_types:
            table_name = self._parse_table_name(valid_type)
            # Skip anything not recognized. Could be, say, a language code
            if table_name:
                table_names.append(table_name)

        if not table_names:
            # n.b.: It's possible we got a list of valid_types and none of them
            # were valid, but this function is guaranteed to return
            # *something*, so it politely selects from the entire index instead
            table_names = self.indexed_tables.keys()

        # Rather than create an array of many hundred items and pick randomly
        # from it, just count the potential items per table, pick a random
        # number up to the combined total, and walk the per-table chunks to
        # find which table (and which row within it) that number lands on.
        # This also avoids the slight problem that the index contains more
        # rows (for languages) for some items than others.
        # XXX ought to cache this (in the index?) if possible
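        # Worked example of the partition walk below (made-up counts, purely
        # illustrative): with partitions [('moves', 3), ('items', 5)] and
        # total 8, drawing n = 6 gives 6 > 3, so n becomes 3 and the 'moves'
        # partition is dropped; the result is row 3 of 'items'.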
        total = 0
        partitions = []
        for table_name in table_names:
            count = self.session.query(self.indexed_tables[table_name]).count()
            total += count
            partitions.append((table_name, count))

        n = random.randint(1, total)
        while n > partitions[0][1]:
            n -= partitions[0][1]
            partitions.pop(0)

        return self.lookup(unicode(n), valid_types=[partitions[0][0]])

    def prefix_lookup(self, prefix, valid_types=[]):
        """Returns lookup results for names starting with the given exact
        prefix.

        Type prefixes are recognized, but no other name munging is done.
        """

        # Pop off any type prefix and merge with valid_types
        prefix, merged_valid_types, type_term = \
            self._apply_valid_types(prefix, valid_types)

        query = whoosh.query.Prefix(u'name', self.normalize_name(prefix))

        if type_term:
            query = query & type_term

        searcher = self.index.searcher()
        searcher.weighting = LanguageWeighting()
        results = searcher.search(query)  # XXX , limit=self.MAX_LOOKUP_RESULTS)

        return self._whoosh_records_to_results(results)
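
    # Prefix-completion sketch (illustrative; actual hits depend on the
    # indexed data):
    #
    #     lookup = PokedexLookup()
    #     lookup.prefix_lookup(u'thunder')        # names starting with "thunder"
    #     lookup.prefix_lookup(u'@fr,move:char')  # restricted to French move names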