b00f5a794189f74c7a548d46a2dedbf452f7546a
[zzz-pokedex.git] / pokedex / lookup.py
1 # encoding: utf8
2 from collections import namedtuple
3 import os, os.path
4 import random
5 import re
6 import shutil
7 import unicodedata
8
9 from sqlalchemy.sql import func
10 import whoosh
11 import whoosh.filedb.filestore
12 import whoosh.filedb.fileindex
13 import whoosh.index
14 from whoosh.qparser import QueryParser
15 import whoosh.scoring
16 import whoosh.spelling
17
18 from pokedex.db import connect
19 import pokedex.db.tables as tables
20 from pokedex.roomaji import romanize
21 from pokedex.defaults import get_default_index_dir
22
23 __all__ = ['PokedexLookup']
24
25
# Matches strings made up solely of decimal digits.  Written as a raw string
# so that `\d` reaches the regex engine instead of being treated as a
# (nonexistent) string escape.
rx_is_number = re.compile(r'^\d+$')

# One lookup hit.  `object` is the database row, `indexed_name` is the
# normalized name stored in the index, `name` is the display (non-normalized)
# form of that name, `language` and `iso3166` describe the language the name
# is in, and `exact` tells whether the hit was an exact match rather than a
# spelling suggestion.
LookupResult = namedtuple('LookupResult',
    ['object', 'indexed_name', 'name', 'language', 'iso3166', 'exact'])
30
class UninitializedIndex(object):
    """Falsy placeholder used where a real index object is not available.

    Reading any attribute that is not defined on the class raises
    `UninitializedIndexError`, so accidental use of the placeholder fails
    with an explanatory message.
    """

    class UninitializedIndexError(Exception):
        """Raised when the placeholder is used as if it were a real index."""

    def __bool__(self):
        """The placeholder always tests as False."""
        return False

    # Python 2 consults __nonzero__ for truth testing instead of __bool__
    __nonzero__ = __bool__

    def __getattr__(self, *args, **kwargs):
        # Only reached for names that normal attribute lookup did not find
        message = (
            "The lookup index does not exist. Please use `pokedex setup` "
            "or lookup.rebuild_index() to create it."
        )
        raise self.UninitializedIndexError(message)
48
class LanguageWeighting(whoosh.scoring.Weighting):
    """A scoring class that forces otherwise-equal English results to come
    before foreign results.
    """

    def score(self, searcher, fieldnum, text, docnum, weight, QTF=1):
        """Returns `weight` scaled by the language stored with the document.

        Documents stored with no language keep the full weight, Roomaji
        names are scaled by 0.95, and every other language by 0.9.
        """
        language = searcher.stored_fields(docnum)['language']

        if language is None:
            # English (well, "default"); leave it at 1
            return weight
        elif language == u'Roomaji':
            # Roomaji is penalized less than the other foreign languages;
            # it's most likely to be searched
            return weight * 0.95
        else:
            # Everything else can drop down the totem pole
            return weight * 0.9
65
66
class PokedexLookup(object):
    """Looks up database objects by name using a whoosh index stored in a
    directory on disk.
    """

    # Limit on the hits fetched from the index for a query, and the number of
    # spelling suggestions asked for, before duplicates are stripped
    INTERMEDIATE_LOOKUP_RESULTS = 25
    # Limit on the number of results lookup() returns
    MAX_LOOKUP_RESULTS = 10

    # Dictionary of table name => table class.
    # Need the table name so we can get the class from the table name after we
    # retrieve something from the index
    indexed_tables = dict(
        (cls.__tablename__, cls)
        for cls in (
            tables.Ability,
            tables.Item,
            tables.Location,
            tables.Move,
            tables.Nature,
            tables.Pokemon,
            tables.Type,
        )
    )
86
87
def __init__(self, directory=None, session=None):
    """Opens the whoosh index stored in the named directory.

    Nothing is created here: if the directory is missing or empty, `index`
    and `speller` are set to `UninitializedIndex` placeholders and
    `rebuild_index` has to be called before anything can be searched.

    `directory`
        Directory containing the index.  Defaults to the value returned by
        `get_default_index_dir()`.

    `session`
        Used for creating the index and retrieving objects.  Defaults to
        the value returned by `connect()` called without arguments.

    Raises IOError if the directory has contents but whoosh reports that
    it holds no index.
    """

    # Postcondition: self.index, self.speller, and self.session are all set

    if directory is None:
        directory = get_default_index_dir()
    self.directory = directory

    self.session = session if session else connect()

    # listdir is only consulted when the directory actually exists
    has_contents = os.path.exists(directory) and os.listdir(directory)
    if not has_contents:
        # Nothing to open; hand out dummy objects that complain when used
        self.index = UninitializedIndex()
        self.speller = UninitializedIndex()
        return

    # The directory has something in it, which ought to be an index.
    # Blowing up when it is not is intentional.
    try:
        self.index = whoosh.index.open_dir(directory, indexname='MAIN')
    except whoosh.index.EmptyIndexError:
        raise IOError(
            "The index directory already contains files. "
            "Please use a dedicated directory for the lookup index."
        )

    # The spell checker lives in the same directory as the index
    self.speller = whoosh.spelling.SpellChecker(
        whoosh.filedb.filestore.FileStorage(directory))
139
140
def rebuild_index(self):
    """Builds a new index and spell checker from the database.

    Every row of every class in `indexed_tables` is indexed under its
    name, under each of its foreign names (when the row has any), and
    under the romanization of its Japanese names.
    """

    index_schema = whoosh.fields.Schema(
        name=whoosh.fields.ID(stored=True),
        table=whoosh.fields.ID(stored=True),
        row_id=whoosh.fields.ID(stored=True),
        language=whoosh.fields.STORED,
        iso3166=whoosh.fields.STORED,
        display_name=whoosh.fields.STORED,  # non-lowercased name
    )

    if not os.path.exists(self.directory):
        os.mkdir(self.directory)

    self.index = whoosh.index.create_in(
        self.directory, schema=index_schema, indexname='MAIN')
    writer = self.index.writer()

    # (normalized name, score) tuples, fed to the spell checker at the end.
    # NOTE(review): the comment that used to sit here described scores of 2
    # for English, 1.5 for Roomaji and 1 for everything else, but the values
    # actually passed below are 1, 8 and 3 -- confirm which is intended.
    speller_entries = []

    def add(row_key, name, language, iso3166, score):
        """Indexes one name for the row identified by `row_key`."""
        normalized_name = self.normalize_name(name)
        writer.add_document(
            name=normalized_name, display_name=name,
            language=language, iso3166=iso3166,
            **row_key
        )
        speller_entries.append((normalized_name, score))

    for cls in self.indexed_tables.values():
        is_pokemon = cls == tables.Pokemon
        table_name = unicode(cls.__tablename__)

        for row in self.session.query(cls).yield_per(5):
            row_key = dict(table=table_name, row_id=unicode(row.id))

            # Basic English name(s) first
            if is_pokemon:
                # Pokémon are indexed under their full (form) name...
                # XXX kinda kludgy
                add(row_key, row.full_name, None, u'us', 1)
                # ...and default forms ALSO get the unadorned name, so
                # 'Deoxys' alone will still do the right thing
                wants_plain_name = (
                    row.forme_name and not row.forme_base_pokemon_id)
            else:
                wants_plain_name = True

            if wants_plain_name:
                add(row_key, row.name, None, u'us', 1)

            # Names in other languages, for the rows that have them
            # XXX other language form names..?
            for foreign_name in getattr(row, 'foreign_names', []):
                moonspeak = foreign_name.name
                if row.name == moonspeak:
                    # Same as the English name; adding it again under a
                    # different language only confuses spell results
                    continue

                language = foreign_name.language
                add(row_key, moonspeak, language.name, language.iso3166, 3)

                # Japanese names are additionally indexed as Roomaji
                if language.name == 'Japanese':
                    add(row_key, romanize(moonspeak), u'Roomaji', u'jp', 8)

    writer.commit()

    # Populate the spell checker in one go; every call to add_* does a
    # commit(), and those seem to be expensive
    self.speller = whoosh.spelling.SpellChecker(self.index.storage)
    self.speller.add_scored_words(speller_entries)
224
225
def normalize_name(self, name):
    """Reduces a name to the form used for index keys and queries.

    The text is compatibility-decomposed (NFKD), stripped of nonspacing
    marks such as accents, recomposed (NFC), trimmed of surrounding
    whitespace and lowercased.
    """
    # http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
    # Filtering on category 'Mn' rather than on unicodedata.combining() is
    # deliberate: Korean characters apparently decompose too, but into
    # pieces that count as letters, so they survive the filter and the
    # final NFC pass puts them back together.
    decomposed = unicodedata.normalize('NFKD', unicode(name))
    without_marks = u"".join(
        char for char in decomposed
        if unicodedata.category(char) != 'Mn'
    )
    recomposed = unicodedata.normalize('NFC', without_marks)

    return recomposed.strip().lower()
247
248
249 def _apply_valid_types(self, name, valid_types):
250 """Combines the enforced `valid_types` with any from the search string
251 itself and updates the query.
252
253 For example, a name of 'a,b:foo' and valid_types of b,c will search for
254 only `b`s named "foo".
255
256 Returns `(name, merged_valid_types, term)`, where `name` has had any type
257 prefix stripped, `merged_valid_types` combines the original
258 `valid_types` with the type prefix, and `term` is a query term for
259 limited to just the allowed types. If there are no type restrictions
260 at all, `term` will be None.
261 """
262
263 # Remove any type prefix (pokemon:133) first
264 user_valid_types = []
265 if ':' in name:
266 prefix_chunk, name = name.split(':', 1)
267 name = name.strip()
268
269 prefixes = prefix_chunk.split(',')
270 user_valid_types = [_.strip() for _ in prefixes]
271
272 # Merge the valid types together. Only types that appear in BOTH lists
273 # may be used.
274 # As a special case, if the user asked for types that are explicitly
275 # forbidden, completely ignore what the user requested
276 combined_valid_types = []
277 if user_valid_types and valid_types:
278 combined_valid_types = list(
279 set(user_valid_types) & set(combined_valid_types)
280 )
281
282 if not combined_valid_types:
283 # No overlap! Just use the enforced ones
284 combined_valid_types = valid_types
285 else:
286 # One list or the other was blank, so just use the one that isn't
287 combined_valid_types = valid_types + user_valid_types
288
289 if not combined_valid_types:
290 # No restrictions
291 return name, [], None
292
293 # Construct the term
294 type_terms = []
295 final_valid_types = []
296 for valid_type in combined_valid_types:
297 table_name = self._parse_table_name(valid_type)
298
299 # Quietly ignore bogus valid_types; more likely to DTRT
300 if table_name:
301 final_valid_types.append(valid_type)
302 type_terms.append(whoosh.query.Term(u'table', table_name))
303
304 return name, final_valid_types, whoosh.query.Or(type_terms)
305
306
307 def _parse_table_name(self, name):
308 """Takes a singular table name, table name, or table object and returns
309 the table name.
310
311 Returns None for a bogus name.
312 """
313 # Table object
314 if hasattr(name, '__tablename__'):
315 return getattr(name, '__tablename__')
316
317 # Table name
318 for table in self.indexed_tables.values():
319 if name in (table.__tablename__, table.__singlename__):
320 return table.__tablename__
321
322 # Bogus. Be nice and return dummy
323 return None
324
def _whoosh_records_to_results(self, records, exact=True):
    """Turns whoosh's indexed records into `LookupResult` tuples holding
    the corresponding database objects.

    Only the first record for each (table, row id) pair is kept.  Every
    result is flagged with the given `exact` value.
    """
    # XXX this 'exact' thing is getting kinda leaky.  would like a better
    # way to handle it, since only lookup() cares about fuzzy results
    already_seen = set()
    results = []

    for record in records:
        identity = (record['table'], record['row_id'])
        if identity in already_seen:
            # Same object reached through another name; skip the dupe
            continue
        already_seen.add(identity)

        table_class = self.indexed_tables[record['table']]
        db_object = self.session.query(table_class).get(record['row_id'])

        result = LookupResult(
            object=db_object,
            indexed_name=record['name'],
            name=record['display_name'],
            language=record['language'],
            iso3166=record['iso3166'],
            exact=exact,
        )
        results.append(result)

    return results
351
352
def lookup(self, input, valid_types=[], exact_only=False):
    """Attempts to find some sort of object, given a name.

    Returns a list of named (object, indexed_name, name, language,
    iso3166, exact) tuples.  `object` is a database object, `indexed_name`
    is the normalized name stored in the index, `name` is the name under
    which the object was found, `language` and `iso3166` are the name and
    country code of the language in which the name was found, and `exact`
    is True iff this was an exact match.  At most `MAX_LOOKUP_RESULTS`
    tuples are returned.

    This function currently ONLY does fuzzy matching if there are no exact
    matches.

    Formes are not returned unless requested; "Shaymin" will return only
    grass Shaymin.

    Extraneous whitespace is removed with extreme prejudice.

    Recognizes:
    - Names: "Eevee", "Surf", "Run Away", "Payapa Berry", etc.
    - Foreign names: "Iibui", "Eivui"
    - Fuzzy names in whatever language: "Evee", "Ibui"
    - IDs: "133", "192", "250"
    - Wildcards: a name containing `*` or `?` is run as a wildcard query
    - "random", which defers to `random_lookup`
    Also:
    - Type restrictions.  "type:psychic" will only return the type.  This
      is how to make ID lookup useful.  Multiple type specs can be entered
      with commas, as "move,item:1".  A type prefix is combined with
      `valid_types` by `_apply_valid_types`.
    - Alternate formes can be specified merely like "wash rotom".

    `input`
        Name of the thing to look for.

    `valid_types`
        A list of table objects or names, e.g., `['pokemon', 'moves']`.  If
        this is provided, only results in one of the given tables will be
        returned.

    `exact_only`
        If True, only exact matches are returned.  If set to False (the
        default), and the provided `name` doesn't match anything exactly,
        spelling correction will be attempted.  Wildcard and numeric
        queries are never spell-corrected.
    """

    name = self.normalize_name(input)
    exact = True

    # Pop off any type prefix and merge with valid_types
    name, merged_valid_types, type_term = \
        self._apply_valid_types(name, valid_types)

    # Random lookup
    if name == 'random':
        return self.random_lookup(valid_types=merged_valid_types)

    # Do different things depending what the query looks like
    # Note: Term objects do an exact match, so we don't have to worry about
    # a query parser tripping on weird characters in the input
    try:
        # Let Python try to convert to a number, so 0xff works
        name_as_number = int(name, base=0)
    except ValueError:
        # Oh well
        name_as_number = None

    if '*' in name or '?' in name:
        exact_only = True
        query = whoosh.query.Wildcard(u'name', name)
    elif name_as_number is not None:
        # Don't spell-check numbers!
        exact_only = True
        query = whoosh.query.Term(u'row_id', unicode(name_as_number))
    else:
        # Not an integer
        query = whoosh.query.Term(u'name', name)

    if type_term:
        query = query & type_term

    ### Actual searching
    searcher = self.index.searcher()
    # XXX is this kosher?  docs say search() takes a weighting arg, but it
    # certainly does not
    searcher.weighting = LanguageWeighting()
    results = searcher.search(query,
                              limit=self.INTERMEDIATE_LOOKUP_RESULTS)

    # Look for some fuzzy matches if necessary
    if not exact_only and not results:
        exact = False
        results = []

        for suggestion in self.speller.suggest(
                name, self.INTERMEDIATE_LOOKUP_RESULTS):

            query = whoosh.query.Term('name', suggestion)
            # Suggestions come from every indexed name, so the type
            # restriction has to be applied here as well; otherwise a
            # misspelled name would return objects from forbidden tables
            if type_term:
                query = query & type_term

            results.extend(searcher.search(query))

    ### Convert results to db objects
    objects = self._whoosh_records_to_results(results, exact=exact)

    # Only return up to 10 matches; beyond that, something is wrong.  We
    # strip out duplicate entries above, so it's remotely possible that we
    # should have more than 10 here and lost a few.  The speller returns 25
    # to give us some padding, and should avoid that problem.  Not a big
    # deal if we lose the 25th-most-likely match anyway.
    return objects[:self.MAX_LOOKUP_RESULTS]
462
463
def random_lookup(self, valid_types=[]):
    """Returns the lookup result for a randomly chosen row id in one of
    the provided `valid_types`.

    Entries of `valid_types` that do not resolve to a table name are
    ignored; when none are left, every indexed table is a candidate.
    """

    parsed_names = (
        self._parse_table_name(valid_type) for valid_type in valid_types)
    candidate_tables = [
        self.indexed_tables[table_name]
        for table_name in parsed_names
        if table_name
    ]

    if not candidate_tables:
        # n.b.: It's possible we got a list of valid_types and none of them
        # were valid, but this function is guaranteed to return
        # *something*, so it politely selects from the entire index instead
        candidate_tables = self.indexed_tables.values()

    # Rather than build an array of many hundred items and pick from it,
    # draw one number up to the total row count and walk the per-table
    # counts until the number falls inside a table's share.  This also
    # sidesteps the index holding more rows (for languages) for some items
    # than others.
    # XXX ought to cache this (in the index?) if possible
    partitions = [
        (table, self.session.query(table).count())
        for table in candidate_tables
    ]
    total = sum(count for _, count in partitions)

    pick = random.randint(1, total)
    for chosen_table, count in partitions:
        if pick <= count:
            break
        pick -= count

    # What is left of the number is used as the row id within that table
    return self.lookup(unicode(pick), valid_types=[chosen_table])
500
def prefix_lookup(self, prefix, valid_types=[]):
    """Returns results for indexed names starting with the given prefix.

    A type prefix is recognized and merged with `valid_types`.  The rest
    of the input is normalized with `normalize_name` and matched as a
    literal prefix; there is no number, wildcard or fuzzy handling.
    """

    # Pop off any type prefix and merge with valid_types
    stripped_prefix, _merged_types, type_restriction = \
        self._apply_valid_types(prefix, valid_types)

    normalized_prefix = self.normalize_name(stripped_prefix)
    query = whoosh.query.Prefix(u'name', normalized_prefix)
    if type_restriction:
        query = query & type_restriction

    searcher = self.index.searcher()
    searcher.weighting = LanguageWeighting()
    # XXX , limit=self.MAX_LOOKUP_RESULTS)
    matches = searcher.search(query)

    return self._whoosh_records_to_results(matches)