a75136b7a72ab372f1e4227e6982bbdf849fdab5
[zzz-pokedex.git] / pokedex / lookup.py
1 # encoding: utf8
2 from collections import namedtuple
3 import os, os.path
4 import pkg_resources
5 import random
6 import re
7 import shutil
8 import unicodedata
9
10 from sqlalchemy.sql import func
11 import whoosh
12 import whoosh.filedb.filestore
13 import whoosh.filedb.fileindex
14 import whoosh.index
15 from whoosh.qparser import QueryParser
16 import whoosh.scoring
17 import whoosh.spelling
18
19 from pokedex.db import connect
20 import pokedex.db.tables as tables
21 from pokedex.roomaji import romanize
22
23 __all__ = ['PokedexLookup']
24
25
# Matches strings made up solely of ASCII/Unicode decimal digits.  Raw string
# so that `\d` reaches the regex engine instead of being treated as a
# (deprecated, invalid) string escape.
rx_is_number = re.compile(r'^\d+$')

# One lookup hit.  `object` is the database row, `indexed_name` the normalized
# name stored in the index, `name` the display name it was found under,
# `language`/`iso3166` describe the language of that name, and `exact` tells
# whether the hit was an exact match rather than a spelling suggestion.
LookupResult = namedtuple('LookupResult',
    ['object', 'indexed_name', 'name', 'language', 'iso3166', 'exact'])
30
class UninitializedIndex(object):
    """Stand-in used where a real index (or speller) would normally live.

    Instances are falsy, and reading any attribute that is not defined on the
    class raises `UninitializedIndexError`.
    """

    class UninitializedIndexError(Exception):
        """Raised when the placeholder is used as if it were a real index."""

    def __nonzero__(self):
        """Dummy object should identify itself as False."""
        return False

    # Python 3 looks for __bool__ instead of __nonzero__; both hooks share the
    # same implementation.
    __bool__ = __nonzero__

    def __getattr__(self, *args, **kwargs):
        # Only reached for attributes that normal lookup could not find, i.e.
        # anything a caller expects from a real index object.
        message = (
            "The lookup index does not exist.  Please use `pokedex setup` "
            "or lookup.rebuild_index() to create it."
        )
        raise self.UninitializedIndexError(message)
48
class LanguageWeighting(whoosh.scoring.Weighting):
    """A scoring class that forces otherwise-equal English results to come
    before foreign results.
    """

    def score(self, searcher, fieldnum, text, docnum, weight, QTF=1):
        """Scales `weight` according to the language stored with the document.

        The document's stored fields are fetched through
        `searcher.stored_fields(docnum)` and its `language` field decides the
        multiplier: 1 for no language (English / "default"), 0.95 for Roomaji
        and 0.9 for anything else.  `fieldnum`, `text` and `QTF` are accepted
        for the weighting interface but not used here.
        """
        language = searcher.stored_fields(docnum)['language']

        if language is None:
            # English (well, "default"); leave it at 1
            return weight

        if language == u'Roomaji':
            # Roomaji is penalized less than other foreign names; it's the
            # most likely of them to be searched for
            return weight * 0.95

        # Everything else can drop down the totem pole
        return weight * 0.9
65
66
class PokedexLookup(object):
    """Looks up pokedex database objects by name through a whoosh index.

    An instance holds the index directory, a database session, the whoosh
    index and a spell checker.  If no index exists yet, `index` and `speller`
    are `UninitializedIndex` placeholders until `rebuild_index()` is called.
    """

    # Number of raw index hits (and spelling suggestions) fetched before
    # duplicate rows are collapsed
    INTERMEDIATE_LOOKUP_RESULTS = 25
    # Maximum number of results lookup() returns
    MAX_LOOKUP_RESULTS = 10

    # Dictionary of table name => table class.
    # Need the table name so we can get the class from the table name after we
    # retrieve something from the index
    indexed_tables = dict(
        (cls.__tablename__, cls)
        for cls in (
            tables.Ability,
            tables.Item,
            tables.Location,
            tables.Move,
            tables.Nature,
            tables.Pokemon,
            tables.Type,
        )
    )


    def __init__(self, directory=None, session=None):
        """Opens the whoosh index stored in the named directory.

        If the directory does not exist or is empty, nothing is created here;
        `index` and `speller` are set to `UninitializedIndex` placeholders
        that complain when used, and the caller must run `rebuild_index()`.

        `directory`
            Directory containing the index.  Defaults to the
            `POKEDEX_INDEX_DIR` environment variable, and failing that to a
            location within the `pokedex` egg directory.

        `session`
            Used for creating the index and retrieving objects.  Defaults to
            whatever `pokedex.db.connect()` returns when called without
            arguments.

        Raises IOError if the directory contains files but whoosh reports no
        index in it.
        """

        # By the time this returns, self.index, self.speller, and self.session
        # must be set

        # Defaults
        if not directory:
            directory = os.environ.get('POKEDEX_INDEX_DIR', None)

        if not directory:
            directory = pkg_resources.resource_filename('pokedex',
                                                        'data/whoosh-index')
        self.directory = directory

        if session:
            self.session = session
        else:
            self.session = connect()

        # Attempt to open the index
        if not os.path.exists(directory) or not os.listdir(directory):
            # Directory doesn't exist OR is empty; caller needs to use
            # rebuild_index before doing anything.  Provide a dummy object that
            # complains when used
            self.index = UninitializedIndex()
            self.speller = UninitializedIndex()
            return

        # Otherwise, already exists; should be an index!  Bam, done.
        # Note that this will explode if the directory exists but doesn't
        # contain an index; that's a feature
        try:
            self.index = whoosh.index.open_dir(directory, indexname='MAIN')
        except whoosh.index.EmptyIndexError:
            raise IOError(
                "The index directory already contains files.  "
                "Please use a dedicated directory for the lookup index."
            )

        # Create speller, and done
        spell_store = whoosh.filedb.filestore.FileStorage(directory)
        self.speller = whoosh.spelling.SpellChecker(spell_store)


    def rebuild_index(self):
        """Creates the index from scratch.

        Creates the index directory if needed, indexes every name of every
        row in `indexed_tables` (English names, foreign names and Roomaji
        transliterations of Japanese names), then builds the spell checker
        from the same names.  Replaces `self.index` and `self.speller`.
        """

        schema = whoosh.fields.Schema(
            name=whoosh.fields.ID(stored=True),
            table=whoosh.fields.ID(stored=True),
            row_id=whoosh.fields.ID(stored=True),
            language=whoosh.fields.STORED,
            iso3166=whoosh.fields.STORED,
            display_name=whoosh.fields.STORED,  # non-lowercased name
        )

        if not os.path.exists(self.directory):
            # makedirs rather than mkdir, so a nested index path also works
            os.makedirs(self.directory)

        self.index = whoosh.index.create_in(self.directory, schema=schema,
                                            indexname='MAIN')
        writer = self.index.writer()

        # Index every name in all our tables of interest.
        # speller_entries becomes a list of (word, score) tuples.
        # NOTE(review): this comment used to describe the scores as 2 for
        # English, 1.5 for Roomaji and 1 for everything else, but the values
        # actually passed below are 1 (English), 8 (Roomaji) and 3 (other
        # foreign names).  Confirm how SpellChecker interprets scores before
        # touching either.
        speller_entries = []

        def add(row_key, name, language, iso3166, score):
            """Indexes one name for the row identified by `row_key` and
            queues it for the spell checker.
            """
            normalized_name = self.normalize_name(name)

            writer.add_document(
                name=normalized_name, display_name=name,
                language=language, iso3166=iso3166,
                **row_key
            )

            speller_entries.append((normalized_name, score))

        for cls in self.indexed_tables.values():
            q = self.session.query(cls)

            for row in q.yield_per(5):
                row_key = dict(table=unicode(cls.__tablename__),
                               row_id=unicode(row.id))

                # Add the basic English name to the index
                if cls == tables.Pokemon:
                    # Pokemon need their form name added
                    # XXX kinda kludgy
                    add(row_key, row.full_name, None, u'us', 1)

                    # If this is a default form, ALSO add the unadorned name,
                    # so 'Deoxys' alone will still do the right thing
                    if row.forme_name and not row.forme_base_pokemon_id:
                        add(row_key, row.name, None, u'us', 1)
                else:
                    add(row_key, row.name, None, u'us', 1)

                # Some things also have other languages' names
                # XXX other language form names..?
                for foreign_name in getattr(row, 'foreign_names', []):
                    moonspeak = foreign_name.name
                    if row.name == moonspeak:
                        # Don't add the English name again as a different
                        # language; no point and it makes spell results
                        # confusing
                        continue

                    add(row_key, moonspeak, foreign_name.language.name,
                        foreign_name.language.iso3166, 3)

                    # Add Roomaji too
                    if foreign_name.language.name == 'Japanese':
                        roomaji = romanize(foreign_name.name)
                        add(row_key, roomaji, u'Roomaji', u'jp', 8)

        writer.commit()

        # Construct and populate a spell-checker index.  Quicker to do it all
        # at once, as every call to add_* does a commit(), and those seem to be
        # expensive
        self.speller = whoosh.spelling.SpellChecker(self.index.storage)
        self.speller.add_scored_words(speller_entries)


    def normalize_name(self, name):
        """Strips irrelevant formatting junk from name input.

        Specifically: accents are removed, surrounding whitespace is stripped,
        and everything is lowercased.  Returns the normalized unicode string.
        """
        # http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
        # Makes sense to me.  Decompose by Unicode rules, then remove combining
        # characters, then recombine.  I'm explicitly doing it this way instead
        # of testing combining() because Korean characters apparently
        # decompose!  But the results are considered letters, not combining
        # characters, so testing for Mn works well, and combining them again
        # makes them look right.
        nkfd_form = unicodedata.normalize('NFKD', unicode(name))
        name = u"".join(c for c in nkfd_form
                        if unicodedata.category(c) != 'Mn')
        name = unicodedata.normalize('NFC', name)

        name = name.strip()
        name = name.lower()

        return name


    def _apply_valid_types(self, name, valid_types):
        """Combines the enforced `valid_types` with any from the search string
        itself and builds the matching query term.

        For example, a name of 'a,b:foo' and valid_types of b,c will search for
        only `b`s named "foo".

        Returns `(name, merged_valid_types, term)`, where `name` has had any
        type prefix stripped, `merged_valid_types` combines the original
        `valid_types` with the type prefix, and `term` is a query term limited
        to just the allowed types.  If there are no type restrictions at all,
        `term` will be None.
        """

        # Remove any type prefix (pokemon:133) first
        user_valid_types = []
        if ':' in name:
            prefix_chunk, name = name.split(':', 1)
            name = name.strip()

            user_valid_types = [
                chunk.strip() for chunk in prefix_chunk.split(',')
            ]

        # Merge the valid types together.  Only types that appear in BOTH lists
        # may be used.
        # As a special case, if the user asked for types that are explicitly
        # forbidden, completely ignore what the user requested
        if user_valid_types and valid_types:
            # Intersect against the *enforced* list; intersecting against the
            # (still empty) result list would always discard the user's prefix
            combined_valid_types = list(
                set(user_valid_types) & set(valid_types)
            )

            if not combined_valid_types:
                # No overlap!  Just use the enforced ones
                combined_valid_types = valid_types
        else:
            # One list or the other was blank, so just use the one that isn't
            combined_valid_types = valid_types + user_valid_types

        if not combined_valid_types:
            # No restrictions
            return name, [], None

        # Construct the term
        type_terms = []
        final_valid_types = []
        for valid_type in combined_valid_types:
            table_name = self._parse_table_name(valid_type)

            # Quietly ignore bogus valid_types; more likely to DTRT
            if table_name:
                final_valid_types.append(valid_type)
                type_terms.append(whoosh.query.Term(u'table', table_name))

        return name, final_valid_types, whoosh.query.Or(type_terms)


    def _parse_table_name(self, name):
        """Takes a singular table name, table name, or table object and returns
        the table name.

        Returns None for a bogus name.
        """
        # Table object
        if hasattr(name, '__tablename__'):
            return name.__tablename__

        # Table name
        for table in self.indexed_tables.values():
            if name in (table.__tablename__, table.__singlename__):
                return table.__tablename__

        # Bogus.  Be nice and return dummy
        return None

    def _whoosh_records_to_results(self, records, exact=True):
        """Converts a list of whoosh's indexed records to LookupResult tuples
        containing database objects.

        Only the first record for each (table, row id) pair is kept.  `exact`
        is copied verbatim into every result.
        """
        # XXX this 'exact' thing is getting kinda leaky.  would like a better
        # way to handle it, since only lookup() cares about fuzzy results
        seen = set()
        results = []
        for record in records:
            # Skip dupes
            seen_key = record['table'], record['row_id']
            if seen_key in seen:
                continue
            seen.add(seen_key)

            cls = self.indexed_tables[record['table']]
            obj = self.session.query(cls).get(record['row_id'])

            results.append(LookupResult(object=obj,
                                        indexed_name=record['name'],
                                        name=record['display_name'],
                                        language=record['language'],
                                        iso3166=record['iso3166'],
                                        exact=exact))

        return results


    def lookup(self, input, valid_types=None, exact_only=False):
        """Attempts to find some sort of object, given a name.

        Returns a list of named (object, indexed_name, name, language,
        iso3166, exact) tuples.  `object` is a database object, `name` is the
        name under which the object was found, `language` and `iso3166` are
        the name and country code of the language in which the name was found,
        and `exact` is True iff this was an exact match.

        This function currently ONLY does fuzzy matching if there are no exact
        matches.

        Formes are not returned unless requested; "Shaymin" will return only
        grass Shaymin.

        Extraneous whitespace is removed with extreme prejudice.

        Recognizes:
        - Names: "Eevee", "Surf", "Run Away", "Payapa Berry", etc.
        - Foreign names: "Iibui", "Eivui"
        - Fuzzy names in whatever language: "Evee", "Ibui"
        - IDs: "133", "192", "250"
        - Wildcards: a name containing `*` or `?` is run as a wildcard query
        - "random": delegates to `random_lookup()`
        Also:
        - Type restrictions.  "type:psychic" will only return the type.  This
          is how to make ID lookup useful.  Multiple type specs can be entered
          with commas, as "move,item:1".  If `valid_types` are provided, a
          type prefix can only narrow them down; a prefix that shares nothing
          with `valid_types` is ignored.
        - Alternate formes can be specified merely like "wash rotom".

        `input`
            Name of the thing to look for.

        `valid_types`
            A list of table objects or names, e.g., `['pokemon', 'moves']`.  If
            this is provided, only results in one of the given tables will be
            returned.

        `exact_only`
            If True, only exact matches are returned.  If set to False (the
            default), and the provided `name` doesn't match anything exactly,
            spelling correction will be attempted.  Wildcard and numeric
            lookups are always exact.
        """

        name = self.normalize_name(input)
        exact = True

        # Pop off any type prefix and merge with valid_types
        name, merged_valid_types, type_term = \
            self._apply_valid_types(name, valid_types or [])

        # Random lookup
        if name == 'random':
            return self.random_lookup(valid_types=merged_valid_types)

        # Do different things depending what the query looks like
        # Note: Term objects do an exact match, so we don't have to worry about
        # a query parser tripping on weird characters in the input
        try:
            # Let Python try to convert to a number, so 0xff works
            name_as_number = int(name, base=0)
        except ValueError:
            # Oh well
            name_as_number = None

        if '*' in name or '?' in name:
            exact_only = True
            query = whoosh.query.Wildcard(u'name', name)
        elif name_as_number is not None:
            # Don't spell-check numbers!
            exact_only = True
            query = whoosh.query.Term(u'row_id', unicode(name_as_number))
        else:
            # Not an integer
            query = whoosh.query.Term(u'name', name)

        if type_term:
            query = query & type_term


        ### Actual searching
        searcher = self.index.searcher()
        # XXX is this kosher?  docs say search() takes a weighting arg, but it
        # certainly does not
        searcher.weighting = LanguageWeighting()
        results = searcher.search(query,
                                  limit=self.INTERMEDIATE_LOOKUP_RESULTS)

        # Look for some fuzzy matches if necessary
        if not exact_only and not results:
            exact = False
            results = []

            for suggestion in self.speller.suggest(
                    name, self.INTERMEDIATE_LOOKUP_RESULTS):

                query = whoosh.query.Term(u'name', suggestion)
                if type_term:
                    # Spelling suggestions must obey the type restriction too
                    query = query & type_term
                results.extend(searcher.search(query))

        ### Convert results to db objects
        objects = self._whoosh_records_to_results(results, exact=exact)

        # Only return up to 10 matches; beyond that, something is wrong.  We
        # strip out duplicate entries above, so it's remotely possible that we
        # should have more than 10 here and lost a few.  The speller returns 25
        # to give us some padding, and should avoid that problem.  Not a big
        # deal if we lose the 25th-most-likely match anyway.
        return objects[:self.MAX_LOOKUP_RESULTS]


    def random_lookup(self, valid_types=None):
        """Returns a random lookup result from one of the provided
        `valid_types`.

        `valid_types` entries that cannot be resolved to an indexed table are
        ignored; if none remain, every indexed table is a candidate.  Returns
        an empty list when the candidate tables contain no rows at all.
        """

        candidate_tables = []
        for valid_type in valid_types or []:
            table_name = self._parse_table_name(valid_type)
            if table_name in self.indexed_tables:
                candidate_tables.append(self.indexed_tables[table_name])

        if not candidate_tables:
            # n.b.: It's possible we got a list of valid_types and none of them
            # were valid, but this function is meant to return *something*, so
            # it politely selects from the entire index instead
            candidate_tables = list(self.indexed_tables.values())

        # Rather than create an array of many hundred items and pick randomly
        # from it, just pick a number up to the total number of potential
        # items, then pick randomly from that, and partition the whole range
        # into chunks.  This also avoids the slight problem that the index
        # contains more rows (for languages) for some items than others.
        # XXX ought to cache this (in the index?) if possible
        total = 0
        partitions = []
        for table in candidate_tables:
            count = self.session.query(table).count()
            total += count
            partitions.append((table, count))

        if not total:
            # Nothing to choose from; randint(1, 0) would raise ValueError
            return []

        # Walk the partitions until n falls inside one of them; n then
        # doubles as the row id to look up in that table
        n = random.randint(1, total)
        for table, count in partitions:
            if n <= count:
                break
            n -= count

        return self.lookup(unicode(n), valid_types=[table])

    def prefix_lookup(self, prefix, valid_types=None):
        """Returns terms starting with the given exact prefix.

        Type prefixes are recognized; apart from the usual name normalization
        (accent stripping, trimming, lowercasing) no other munging is done.
        """

        # Pop off any type prefix and merge with valid_types
        prefix, merged_valid_types, type_term = \
            self._apply_valid_types(prefix, valid_types or [])

        query = whoosh.query.Prefix(u'name', self.normalize_name(prefix))

        if type_term:
            query = query & type_term

        searcher = self.index.searcher()
        searcher.weighting = LanguageWeighting()
        results = searcher.search(query)  # XXX , limit=self.MAX_LOOKUP_RESULTS)

        return self._whoosh_records_to_results(results)