Bump required SQLA to 0.6.
[zzz-pokedex.git] / pokedex / lookup.py
1 # encoding: utf8
2 import os, os.path
3 import random
4 import re
5 import shutil
6 import unicodedata
7
8 from sqlalchemy.sql import func
9 import whoosh
10 import whoosh.filedb.filestore
11 import whoosh.filedb.fileindex
12 import whoosh.index
13 from whoosh.qparser import QueryParser
14 import whoosh.scoring
15 import whoosh.spelling
16
17 from pokedex.util import namedtuple
18
19 from pokedex.db import connect
20 import pokedex.db.tables as tables
21 from pokedex.roomaji import romanize
22 from pokedex.defaults import get_default_index_dir
23
24 __all__ = ['PokedexLookup']
25
26
# Matches a string made up solely of decimal digits.  Written as a raw string
# so that ``\d`` reaches the regex engine untouched instead of relying on
# Python passing an unrecognised string escape through.
rx_is_number = re.compile(r'^\d+$')

# One lookup hit.  `object` is the database row, `indexed_name` the normalized
# name stored in the index, `name` the display name, `language`/`iso3166` the
# language the name was found in, and `exact` whether the match was exact.
LookupResult = namedtuple('LookupResult',
    ['object', 'indexed_name', 'name', 'language', 'iso3166', 'exact'])
31
class UninitializedIndex(object):
    """Placeholder used where a real index or speller would normally live.

    Instances are always falsy, and touching any attribute that is not
    defined on the class raises `UninitializedIndexError`, so code that
    forgets to build the index fails loudly with a helpful message.
    """

    class UninitializedIndexError(Exception):
        """Raised when the placeholder is used as if it were a real index."""

    def __bool__(self):
        """Truth hook (Python 3 spelling): the placeholder is always False."""
        return False

    # Python 2 looks up the truth hook under this name instead
    __nonzero__ = __bool__

    def __getattr__(self, *args, **kwargs):
        """Reject every attribute lookup that normal resolution missed."""
        message = (
            "The lookup index does not exist. Please use `pokedex setup` "
            "or lookup.rebuild_index() to create it."
        )
        raise self.UninitializedIndexError(message)
49
class LanguageWeighting(whoosh.scoring.Weighting):
    """A scoring class that forces otherwise-equal English results to come
    before foreign results.

    Hits in the default language keep their weight, Roomaji hits are scaled
    down slightly, and hits in any other language are scaled down a bit more.
    """

    def score(self, searcher, fieldnum, text, docnum, weight, QTF=1):
        """Returns `weight` scaled by a factor chosen from the hit's language.

        The language is read from the stored `language` field of document
        `docnum`; `fieldnum`, `text` and `QTF` are accepted for the weighting
        interface but not used here.
        """
        language = searcher.stored_fields(docnum)['language']

        if language is None:
            # English (well, "default"); leave it at 1
            return weight

        if language == u'Roomaji':
            # Penalize Roomaji less than other foreign names; it's the most
            # likely of them to be searched
            return weight * 0.95

        # Everything else can drop down the totem pole
        return weight * 0.9
66
67
68 class PokedexLookup(object):
69 INTERMEDIATE_LOOKUP_RESULTS = 25
70 MAX_LOOKUP_RESULTS = 10
71
72 # Dictionary of table name => table class.
73 # Need the table name so we can get the class from the table name after we
74 # retrieve something from the index
75 indexed_tables = dict(
76 (cls.__tablename__, cls)
77 for cls in (
78 tables.Ability,
79 tables.Item,
80 tables.Location,
81 tables.Move,
82 tables.Nature,
83 tables.Pokemon,
84 tables.Type,
85 )
86 )
87
88
89 def __init__(self, directory=None, session=None):
90 """Opens the whoosh index stored in the named directory. If the index
91 doesn't already exist, it will be created.
92
93 `directory`
94 Directory containing the index. Defaults to a location within the
95 `pokedex` egg directory.
96
97 `session`
98 Used for creating the index and retrieving objects. Defaults to an
99 attempt to connect to the default SQLite database installed by
100 `pokedex setup`.
101 """
102
103 # By the time this returns, self.index, self.speller, and self.session
104 # must be set
105
106 # If a directory was not given, use the default
107 if directory is None:
108 directory = get_default_index_dir()
109
110 self.directory = directory
111
112 if session:
113 self.session = session
114 else:
115 self.session = connect()
116
117 # Attempt to open or create the index
118 if not os.path.exists(directory) or not os.listdir(directory):
119 # Directory doesn't exist OR is empty; caller needs to use
120 # rebuild_index before doing anything. Provide a dummy object that
121 # complains when used
122 self.index = UninitializedIndex()
123 self.speller = UninitializedIndex()
124 return
125
126 # Otherwise, already exists; should be an index! Bam, done.
127 # Note that this will explode if the directory exists but doesn't
128 # contain an index; that's a feature
129 try:
130 self.index = whoosh.index.open_dir(directory, indexname='MAIN')
131 except whoosh.index.EmptyIndexError:
132 raise IOError(
133 "The index directory already contains files. "
134 "Please use a dedicated directory for the lookup index."
135 )
136
137 # Create speller, and done
138 spell_store = whoosh.filedb.filestore.FileStorage(directory)
139 self.speller = whoosh.spelling.SpellChecker(spell_store)
140
141
142 def rebuild_index(self):
143 """Creates the index from scratch."""
144
145 schema = whoosh.fields.Schema(
146 name=whoosh.fields.ID(stored=True),
147 table=whoosh.fields.ID(stored=True),
148 row_id=whoosh.fields.ID(stored=True),
149 language=whoosh.fields.STORED,
150 iso3166=whoosh.fields.STORED,
151 display_name=whoosh.fields.STORED, # non-lowercased name
152 )
153
154 if not os.path.exists(self.directory):
155 os.mkdir(self.directory)
156
157 self.index = whoosh.index.create_in(self.directory, schema=schema,
158 indexname='MAIN')
159 writer = self.index.writer()
160
161 # Index every name in all our tables of interest
162 # speller_entries becomes a list of (word, score) tuples; the score is
163 # 2 for English names, 1.5 for Roomaji, and 1 for everything else. I
164 # think this biases the results in the direction most people expect,
165 # especially when e.g. German names are very similar to English names
166 speller_entries = []
167 for cls in self.indexed_tables.values():
168 q = self.session.query(cls)
169
170 for row in q.yield_per(5):
171 row_key = dict(table=unicode(cls.__tablename__),
172 row_id=unicode(row.id))
173
174 def add(name, language, iso3166, score):
175 normalized_name = self.normalize_name(name)
176
177 writer.add_document(
178 name=normalized_name, display_name=name,
179 language=language, iso3166=iso3166,
180 **row_key
181 )
182
183 speller_entries.append((normalized_name, score))
184
185
186 # Add the basic English name to the index
187 if cls == tables.Pokemon:
188 # Pokémon need their form name added
189 # XXX kinda kludgy
190 add(row.full_name, None, u'us', 1)
191
192 # If this is a default form, ALSO add the unadorned name,
193 # so 'Deoxys' alone will still do the right thing
194 if row.forme_name and not row.forme_base_pokemon_id:
195 add(row.name, None, u'us', 1)
196 else:
197 add(row.name, None, u'us', 1)
198
199 # Some things also have other languages' names
200 # XXX other language form names..?
201 for foreign_name in getattr(row, 'foreign_names', []):
202 moonspeak = foreign_name.name
203 if row.name == moonspeak:
204 # Don't add the English name again as a different
205 # language; no point and it makes spell results
206 # confusing
207 continue
208
209 add(moonspeak, foreign_name.language.name,
210 foreign_name.language.iso3166,
211 3)
212
213 # Add Roomaji too
214 if foreign_name.language.name == 'Japanese':
215 roomaji = romanize(foreign_name.name)
216 add(roomaji, u'Roomaji', u'jp', 8)
217
218 writer.commit()
219
220 # Construct and populate a spell-checker index. Quicker to do it all
221 # at once, as every call to add_* does a commit(), and those seem to be
222 # expensive
223 self.speller = whoosh.spelling.SpellChecker(self.index.storage)
224 self.speller.add_scored_words(speller_entries)
225
226
227 def normalize_name(self, name):
228 """Strips irrelevant formatting junk from name input.
229
230 Specifically: everything is lowercased, and accents are removed.
231 """
232 # http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
233 # Makes sense to me. Decompose by Unicode rules, then remove combining
234 # characters, then recombine. I'm explicitly doing it this way instead
235 # of testing combining() because Korean characters apparently
236 # decompose! But the results are considered letters, not combining
237 # characters, so testing for Mn works well, and combining them again
238 # makes them look right.
239 nkfd_form = unicodedata.normalize('NFKD', unicode(name))
240 name = u"".join(c for c in nkfd_form
241 if unicodedata.category(c) != 'Mn')
242 name = unicodedata.normalize('NFC', name)
243
244 name = name.strip()
245 name = name.lower()
246
247 return name
248
249
250 def _apply_valid_types(self, name, valid_types):
251 """Combines the enforced `valid_types` with any from the search string
252 itself and updates the query.
253
254 For example, a name of 'a,b:foo' and valid_types of b,c will search for
255 only `b`s named "foo".
256
257 Returns `(name, merged_valid_types, term)`, where `name` has had any type
258 prefix stripped, `merged_valid_types` combines the original
259 `valid_types` with the type prefix, and `term` is a query term for
260 limited to just the allowed types. If there are no type restrictions
261 at all, `term` will be None.
262 """
263
264 # Remove any type prefix (pokemon:133) first
265 user_valid_types = []
266 if ':' in name:
267 prefix_chunk, name = name.split(':', 1)
268 name = name.strip()
269
270 prefixes = prefix_chunk.split(',')
271 user_valid_types = [_.strip() for _ in prefixes]
272
273 # Merge the valid types together. Only types that appear in BOTH lists
274 # may be used.
275 # As a special case, if the user asked for types that are explicitly
276 # forbidden, completely ignore what the user requested
277 combined_valid_types = []
278 if user_valid_types and valid_types:
279 combined_valid_types = list(
280 set(user_valid_types) & set(combined_valid_types)
281 )
282
283 if not combined_valid_types:
284 # No overlap! Just use the enforced ones
285 combined_valid_types = valid_types
286 else:
287 # One list or the other was blank, so just use the one that isn't
288 combined_valid_types = valid_types + user_valid_types
289
290 if not combined_valid_types:
291 # No restrictions
292 return name, [], None
293
294 # Construct the term
295 type_terms = []
296 final_valid_types = []
297 for valid_type in combined_valid_types:
298 table_name = self._parse_table_name(valid_type)
299
300 # Quietly ignore bogus valid_types; more likely to DTRT
301 if table_name:
302 final_valid_types.append(valid_type)
303 type_terms.append(whoosh.query.Term(u'table', table_name))
304
305 return name, final_valid_types, whoosh.query.Or(type_terms)
306
307
308 def _parse_table_name(self, name):
309 """Takes a singular table name, table name, or table object and returns
310 the table name.
311
312 Returns None for a bogus name.
313 """
314 # Table object
315 if hasattr(name, '__tablename__'):
316 return getattr(name, '__tablename__')
317
318 # Table name
319 for table in self.indexed_tables.values():
320 if name in (table.__tablename__, table.__singlename__):
321 return table.__tablename__
322
323 # Bogus. Be nice and return dummy
324 return None
325
326 def _whoosh_records_to_results(self, records, exact=True):
327 """Converts a list of whoosh's indexed records to LookupResult tuples
328 containing database objects.
329 """
330 # XXX this 'exact' thing is getting kinda leaky. would like a better
331 # way to handle it, since only lookup() cares about fuzzy results
332 seen = {}
333 results = []
334 for record in records:
335 # Skip dupes
336 seen_key = record['table'], record['row_id']
337 if seen_key in seen:
338 continue
339 seen[seen_key] = True
340
341 cls = self.indexed_tables[record['table']]
342 obj = self.session.query(cls).get(record['row_id'])
343
344 results.append(LookupResult(object=obj,
345 indexed_name=record['name'],
346 name=record['display_name'],
347 language=record['language'],
348 iso3166=record['iso3166'],
349 exact=exact))
350
351 return results
352
353
354 def lookup(self, input, valid_types=[], exact_only=False):
355 """Attempts to find some sort of object, given a name.
356
357 Returns a list of named (object, name, language, iso3166, exact)
358 tuples. `object` is a database object, `name` is the name under which
359 the object was found, `language` and `iso3166` are the name and country
360 code of the language in which the name was found, and `exact` is True
361 iff this was an
362 exact match.
363
364 This function currently ONLY does fuzzy matching if there are no exact
365 matches.
366
367 Formes are not returned unless requested; "Shaymin" will return only
368 grass Shaymin.
369
370 Extraneous whitespace is removed with extreme prejudice.
371
372 Recognizes:
373 - Names: "Eevee", "Surf", "Run Away", "Payapa Berry", etc.
374 - Foreign names: "Iibui", "Eivui"
375 - Fuzzy names in whatever language: "Evee", "Ibui"
376 - IDs: "133", "192", "250"
377 Also:
378 - Type restrictions. "type:psychic" will only return the type. This
379 is how to make ID lookup useful. Multiple type specs can be entered
380 with commas, as "move,item:1". If `valid_types` are provided, any
381 type prefix will be ignored.
382 - Alternate formes can be specified merely like "wash rotom".
383
384 `input`
385 Name of the thing to look for.
386
387 `valid_types`
388 A list of table objects or names, e.g., `['pokemon', 'moves']`. If
389 this is provided, only results in one of the given tables will be
390 returned.
391
392 `exact_only`
393 If True, only exact matches are returned. If set to False (the
394 default), and the provided `name` doesn't match anything exactly,
395 spelling correction will be attempted.
396 """
397
398 name = self.normalize_name(input)
399 exact = True
400 form = None
401
402 # Pop off any type prefix and merge with valid_types
403 name, merged_valid_types, type_term = \
404 self._apply_valid_types(name, valid_types)
405
406 # Random lookup
407 if name == 'random':
408 return self.random_lookup(valid_types=merged_valid_types)
409
410 # Do different things depending what the query looks like
411 # Note: Term objects do an exact match, so we don't have to worry about
412 # a query parser tripping on weird characters in the input
413 try:
414 # Let Python try to convert to a number, so 0xff works
415 name_as_number = int(name, base=0)
416 except ValueError:
417 # Oh well
418 name_as_number = None
419
420 if '*' in name or '?' in name:
421 exact_only = True
422 query = whoosh.query.Wildcard(u'name', name)
423 elif name_as_number is not None:
424 # Don't spell-check numbers!
425 exact_only = True
426 query = whoosh.query.Term(u'row_id', unicode(name_as_number))
427 else:
428 # Not an integer
429 query = whoosh.query.Term(u'name', name)
430
431 if type_term:
432 query = query & type_term
433
434
435 ### Actual searching
436 searcher = self.index.searcher()
437 # XXX is this kosher? docs say search() takes a weighting arg, but it
438 # certainly does not
439 searcher.weighting = LanguageWeighting()
440 results = searcher.search(query,
441 limit=self.INTERMEDIATE_LOOKUP_RESULTS)
442
443 # Look for some fuzzy matches if necessary
444 if not exact_only and not results:
445 exact = False
446 results = []
447
448 for suggestion in self.speller.suggest(
449 name, self.INTERMEDIATE_LOOKUP_RESULTS):
450
451 query = whoosh.query.Term('name', suggestion)
452 results.extend(searcher.search(query))
453
454 ### Convert results to db objects
455 objects = self._whoosh_records_to_results(results, exact=exact)
456
457 # Only return up to 10 matches; beyond that, something is wrong. We
458 # strip out duplicate entries above, so it's remotely possible that we
459 # should have more than 10 here and lost a few. The speller returns 25
460 # to give us some padding, and should avoid that problem. Not a big
461 # deal if we lose the 25th-most-likely match anyway.
462 return objects[:self.MAX_LOOKUP_RESULTS]
463
464
465 def random_lookup(self, valid_types=[]):
466 """Returns a random lookup result from one of the provided
467 `valid_types`.
468 """
469
470 tables = []
471 for valid_type in valid_types:
472 table_name = self._parse_table_name(valid_type)
473 if table_name:
474 tables.append(self.indexed_tables[table_name])
475
476 if not tables:
477 # n.b.: It's possible we got a list of valid_types and none of them
478 # were valid, but this function is guaranteed to return
479 # *something*, so it politely selects from the entire index isntead
480 tables = self.indexed_tables.values()
481
482 # Rather than create an array of many hundred items and pick randomly
483 # from it, just pick a number up to the total number of potential
484 # items, then pick randomly from that, and partition the whole range
485 # into chunks. This also avoids the slight problem that the index
486 # contains more rows (for languages) for some items than others.
487 # XXX ought to cache this (in the index?) if possible
488 total = 0
489 partitions = []
490 for table in tables:
491 count = self.session.query(table).count()
492 total += count
493 partitions.append((table, count))
494
495 n = random.randint(1, total)
496 while n > partitions[0][1]:
497 n -= partitions[0][1]
498 partitions.pop(0)
499
500 return self.lookup(unicode(n), valid_types=[ partitions[0][0] ])
501
502 def prefix_lookup(self, prefix, valid_types=[]):
503 """Returns terms starting with the given exact prefix.
504
505 Type prefixes are recognized, but no other name munging is done.
506 """
507
508 # Pop off any type prefix and merge with valid_types
509 prefix, merged_valid_types, type_term = \
510 self._apply_valid_types(prefix, valid_types)
511
512 query = whoosh.query.Prefix(u'name', self.normalize_name(prefix))
513
514 if type_term:
515 query = query & type_term
516
517 searcher = self.index.searcher()
518 searcher.weighting = LanguageWeighting()
519 results = searcher.search(query) # XXX , limit=self.MAX_LOOKUP_RESULTS)
520
521 return self._whoosh_records_to_results(results)