pokedex/lookup.py

   1 # encoding: utf8
   2 from collections import namedtuple
   3 import os, os.path
   4 import pkg_resources
   5 import random
   6 import re
   7 import shutil
   8 import unicodedata
   9
  10 from sqlalchemy.sql import func
  11 import whoosh
  12 import whoosh.filedb.filestore
  13 import whoosh.filedb.fileindex
  14 import whoosh.index
  15 from whoosh.qparser import QueryParser
  16 import whoosh.scoring
  17 import whoosh.spelling
  18
  19 from pokedex.db import connect
  20 import pokedex.db.tables as tables
  21 from pokedex.roomaji import romanize
  22
  23 __all__ = ['PokedexLookup']
  24
  25
  26 rx_is_number = re.compile('^\d+$')
  27
  28 LookupResult = namedtuple('LookupResult',
  29                           ['object', 'name', 'language', 'iso3166', 'exact'])
  30
  31 class LanguageWeighting(whoosh.scoring.Weighting):
  32     """A scoring class that forces otherwise-equal English results to come
  33     before foreign results.
  34     """
  35
  36     def score(self, searcher, fieldnum, text, docnum, weight, QTF=1):
  37         doc = searcher.stored_fields(docnum)
  38         if doc['language'] == None:
  39             # English (well, "default"); leave it at 1
  40             return weight
  41         elif doc['language'] == u'Roomaji':
  42             # Give Roomaji a little boost; it's most likely to be searched
  43             return weight * 0.95
  44         else:
  45             # Everything else can drop down the totem pole
  46             return weight * 0.9
  47
  48
  49 class PokedexLookup(object):
  50     INTERMEDIATE_LOOKUP_RESULTS = 25
  51     MAX_LOOKUP_RESULTS = 10
  52
  53     # Dictionary of table name => table class.
  54     # Need the table name so we can get the class from the table name after we
  55     # retrieve something from the index
  56     indexed_tables = dict(
  57         (cls.__tablename__, cls)
  58         for cls in (
  59             tables.Ability,
  60             tables.Item,
  61             tables.Move,
  62             tables.Pokemon,
  63             tables.Type,
  64         )
  65     )
  66
  67
  68     def __init__(self, directory=None, session=None, recreate=False):
  69         """Opens the whoosh index stored in the named directory.  If the index
  70         doesn't already exist, it will be created.
  71
  72         `directory`
  73             Directory containing the index.  Defaults to a location within the
  74             `pokedex` egg directory.
  75
  76         `session`
  77             If the index needs to be created, this database session will be
  78             used.  Defaults to an attempt to connect to the default SQLite
  79             database installed by `pokedex setup`.
  80
  81         `recreate`
  82             If set to True, the whoosh index will be created even if it already
  83             exists.
  84         """
  85
  86         # By the time this returns, self.index, self.speller, and self.session
  87         # must be set
  88
  89         # Defaults
  90         if not directory:
  91             directory = pkg_resources.resource_filename('pokedex',
  92                                                         'data/whoosh-index')
  93
  94         if session:
  95             self.session = session
  96         else:
  97             self.session = connect()
  98
  99         # Attempt to open or create the index
 100         directory_exists = os.path.exists(directory)
 101         if directory_exists and not recreate:
 102             # Already exists; should be an index!  Bam, done.
 103             try:
 104                 self.index = whoosh.index.open_dir(directory, indexname='MAIN')
 105                 spell_store = whoosh.filedb.filestore.FileStorage(directory)
 106                 self.speller = whoosh.spelling.SpellChecker(spell_store)
 107                 return
 108             except whoosh.index.EmptyIndexError as e:
 109                 # Apparently not a real index.  Fall out and create it
 110                 pass
 111
 112         # Delete and start over if we're going to bail anyway.
 113         if directory_exists and recreate:
 114             # Be safe and only delete if it looks like a whoosh index, i.e.,
 115             # everything starts with _
 116             if all(f[0] == '_' for f in os.listdir(directory)):
 117                 shutil.rmtree(directory)
 118                 directory_exists = False
 119
 120         if not directory_exists:
 121             os.mkdir(directory)
 122
 123
 124         ### Create index
 125         schema = whoosh.fields.Schema(
 126             name=whoosh.fields.ID(stored=True),
 127             table=whoosh.fields.ID(stored=True),
 128             row_id=whoosh.fields.ID(stored=True),
 129             language=whoosh.fields.STORED,
 130             iso3166=whoosh.fields.STORED,
 131             display_name=whoosh.fields.STORED,  # non-lowercased name
 132             forme_name=whoosh.fields.ID,
 133         )
 134
 135         self.index = whoosh.index.create_in(directory, schema=schema,
 136                                             indexname='MAIN')
 137         writer = self.index.writer()
 138
 139         # Index every name in all our tables of interest
 140         # speller_entries becomes a list of (word, score) tuples; the score is
 141         # 2 for English names, 1.5 for Roomaji, and 1 for everything else.  I
 142         # think this biases the results in the direction most people expect,
 143         # especially when e.g. German names are very similar to English names
 144         speller_entries = []
 145         for cls in self.indexed_tables.values():
 146             q = session.query(cls)
 147
 148             for row in q.yield_per(5):
 149                 # Need to give forme_name a dummy value because I can't
 150                 # search for explicitly empty fields.  Boo.
 151                 row_keys = [
 152                     dict(table=unicode(cls.__tablename__),
 153                          row_id=unicode(row.id),
 154                          forme_name=u'__empty__')
 155                 ]
 156
 157                 # If this is a form, mark it as such
 158                 # XXX foreign form names...?
 159                 if getattr(row, 'forme_name', None):
 160                     # ...but if it's also the *default* form, index the name
 161                     # bare too
 162                     if not getattr(row, 'forme_base_pokemon_id', None):
 163                         new_key = row_keys[0].copy()
 164                         row_keys.append(new_key)
 165
 166                     row_keys[0]['forme_name'] = row.forme_name
 167
 168                 def add(name, language, iso3166, score):
 169                     normalized_name = self.normalize_name(name)
 170                     for row_key in row_keys:
 171                         writer.add_document(
 172                             name=normalized_name, display_name=name,
 173                             language=language, iso3166=iso3166,
 174                             **row_key
 175                         )
 176
 177                     speller_entries.append((normalized_name, score))
 178
 179
 180                 name = row.name
 181                 add(name, None, u'us', 1)
 182
 183                 # Pokemon also get other languages
 184                 for foreign_name in getattr(row, 'foreign_names', []):
 185                     moonspeak = foreign_name.name
 186                     if name == moonspeak:
 187                         # Don't add the English name again as a different
 188                         # language; no point and it makes spell results
 189                         # confusing
 190                         continue
 191
 192                     add(moonspeak, foreign_name.language.name,
 193                                    foreign_name.language.iso3166,
 194                                    3)
 195
 196                     # Add Roomaji too
 197                     if foreign_name.language.name == 'Japanese':
 198                         roomaji = romanize(foreign_name.name)
 199                         add(roomaji, u'Roomaji', u'jp', 8)
 200
 201         writer.commit()
 202
 203         # Construct and populate a spell-checker index.  Quicker to do it all
 204         # at once, as every call to add_* does a commit(), and those seem to be
 205         # expensive
 206         self.speller = whoosh.spelling.SpellChecker(self.index.storage)
 207         self.speller.add_scored_words(speller_entries)
 208
 209
 210     def normalize_name(self, name):
 211         """Strips irrelevant formatting junk from name input.
 212
 213         Specifically: everything is lowercased, and accents are removed.
 214         """
 215         # http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
 216         # Makes sense to me.  Decompose by Unicode rules, then remove combining
 217         # characters, then recombine.  I'm explicitly doing it this way instead
 218         # of testing combining() because Korean characters apparently
 219         # decompose!  But the results are considered letters, not combining
 220         # characters, so testing for Mn works well, and combining them again
 221         # makes them look right.
 222         nkfd_form = unicodedata.normalize('NFKD', unicode(name))
 223         name = u"".join(c for c in nkfd_form
 224                         if unicodedata.category(c) != 'Mn')
 225         name = unicodedata.normalize('NFC', name)
 226
 227         name = name.strip()
 228         name = name.lower()
 229
 230         return name
 231
 232
 233     def _parse_table_name(self, name):
 234         """Takes a singular table name, table name, or table object and returns
 235         the table name.
 236
 237         Returns None for a bogus name.
 238         """
 239         if hasattr(name, '__tablename__'):
 240             return getattr(name, '__tablename__')
 241         elif name in self.indexed_tables:
 242             return name
 243         elif name + 's' in self.indexed_tables:
 244             return name + 's'
 245         else:
 246             # Bogus.  Be nice and return dummy
 247             return None
 248
 249     def _whoosh_records_to_results(self, records, exact=True):
 250         """Converts a list of whoosh's indexed records to LookupResult tuples
 251         containing database objects.
 252         """
 253         # XXX this 'exact' thing is getting kinda leaky.  would like a better
 254         # way to handle it, since only lookup() cares about fuzzy results
 255         seen = {}
 256         results = []
 257         for record in records:
 258             # Skip dupes
 259             seen_key = record['table'], record['row_id']
 260             if seen_key in seen:
 261                 continue
 262             seen[seen_key] = True
 263
 264             cls = self.indexed_tables[record['table']]
 265             obj = self.session.query(cls).get(record['row_id'])
 266
 267             results.append(LookupResult(object=obj,
 268                                         name=record['display_name'],
 269                                         language=record['language'],
 270                                         iso3166=record['iso3166'],
 271                                         exact=exact))
 272
 273         return results
 274
 275
 276     def lookup(self, input, valid_types=[], exact_only=False):
 277         """Attempts to find some sort of object, given a name.
 278
 279         Returns a list of named (object, name, language, iso3166, exact)
 280         tuples.  `object` is a database object, `name` is the name under which
 281         the object was found, `language` and `iso3166` are the name and country
 282         code of the language in which the name was found, and `exact` is True
 283         iff this was an
 284         exact match.
 285
 286         This function currently ONLY does fuzzy matching if there are no exact
 287         matches.
 288
 289         Formes are not returned unless requested; "Shaymin" will return only
 290         grass Shaymin.
 291
 292         Extraneous whitespace is removed with extreme prejudice.
 293
 294         Recognizes:
 295         - Names: "Eevee", "Surf", "Run Away", "Payapa Berry", etc.
 296         - Foreign names: "Iibui", "Eivui"
 297         - Fuzzy names in whatever language: "Evee", "Ibui"
 298         - IDs: "133", "192", "250"
 299         Also:
 300         - Type restrictions.  "type:psychic" will only return the type.  This
 301           is how to make ID lookup useful.  Multiple type specs can be entered
 302           with commas, as "move,item:1".  If `valid_types` are provided, any
 303           type prefix will be ignored.
 304         - Alternate formes can be specified merely like "wash rotom".
 305
 306         `input`
 307             Name of the thing to look for.
 308
 309         `valid_types`
 310             A list of table objects or names, e.g., `['pokemon', 'moves']`.  If
 311             this is provided, only results in one of the given tables will be
 312             returned.
 313
 314         `exact_only`
 315             If True, only exact matches are returned.  If set to False (the
 316             default), and the provided `name` doesn't match anything exactly,
 317             spelling correction will be attempted.
 318         """
 319
 320         name = self.normalize_name(input)
 321         exact = True
 322         form = None
 323
 324         # Remove any type prefix (pokemon:133) before constructing a query
 325         if ':' in name:
 326             prefix_chunk, name = name.split(':', 1)
 327             name = name.strip()
 328
 329             if not valid_types:
 330                 # Only use types from the query string if none were explicitly
 331                 # provided
 332                 prefixes = prefix_chunk.split(',')
 333                 valid_types = [_.strip() for _ in prefixes]
 334
 335         # Random lookup
 336         if name == 'random':
 337             return self.random_lookup(valid_types=valid_types)
 338
 339         # Do different things depending what the query looks like
 340         # Note: Term objects do an exact match, so we don't have to worry about
 341         # a query parser tripping on weird characters in the input
 342         if '*' in name or '?' in name:
 343             exact_only = True
 344             query = whoosh.query.Wildcard(u'name', name)
 345         elif rx_is_number.match(name):
 346             # Don't spell-check numbers!
 347             exact_only = True
 348             query = whoosh.query.Term(u'row_id', name)
 349         else:
 350             # Not an integer
 351             query = whoosh.query.Term(u'name', name) \
 352                   & whoosh.query.Term(u'forme_name', u'__empty__')
 353
 354             # If there's a space in the input, this might be a form
 355             if ' ' in name:
 356                 form, formless_name = name.split(' ', 1)
 357                 form_query = whoosh.query.Term(u'name', formless_name) \
 358                            & whoosh.query.Term(u'forme_name', form)
 359                 query = query | form_query
 360
 361         ### Filter by type of object
 362         type_terms = []
 363         for valid_type in valid_types:
 364             table_name = self._parse_table_name(valid_type)
 365             if table_name:
 366                 # Quietly ignore bogus valid_types; more likely to DTRT
 367                 type_terms.append(whoosh.query.Term(u'table', table_name))
 368
 369         if type_terms:
 370             query = query & whoosh.query.Or(type_terms)
 371
 372
 373         ### Actual searching
 374         searcher = self.index.searcher()
 375         # XXX is this kosher?  docs say search() takes a weighting arg, but it
 376         # certainly does not
 377         searcher.weighting = LanguageWeighting()
 378         results = searcher.search(query,
 379                                   limit=self.INTERMEDIATE_LOOKUP_RESULTS)
 380
 381         # Look for some fuzzy matches if necessary
 382         if not exact_only and not results:
 383             exact = False
 384             results = []
 385
 386             for suggestion in self.speller.suggest(
 387                 name, self.INTERMEDIATE_LOOKUP_RESULTS):
 388
 389                 query = whoosh.query.Term('name', suggestion)
 390                 results.extend(searcher.search(query))
 391
 392         ### Convert results to db objects
 393         objects = self._whoosh_records_to_results(results, exact=exact)
 394
 395         # Only return up to 10 matches; beyond that, something is wrong.  We
 396         # strip out duplicate entries above, so it's remotely possible that we
 397         # should have more than 10 here and lost a few.  The speller returns 25
 398         # to give us some padding, and should avoid that problem.  Not a big
 399         # deal if we lose the 25th-most-likely match anyway.
 400         return objects[:self.MAX_LOOKUP_RESULTS]
 401
 402
 403     def random_lookup(self, valid_types=[]):
 404         """Returns a random lookup result from one of the provided
 405         `valid_types`.
 406         """
 407
 408         tables = []
 409         for valid_type in valid_types:
 410             table_name = self._parse_table_name(valid_type)
 411             if table_name:
 412                 tables.append(self.indexed_tables[table_name])
 413
 414         if not tables:
 415             # n.b.: It's possible we got a list of valid_types and none of them
 416             # were valid, but this function is guaranteed to return
 417             # *something*, so it politely selects from the entire index isntead
 418             tables = self.indexed_tables.values()
 419
 420         # Rather than create an array of many hundred items and pick randomly
 421         # from it, just pick a number up to the total number of potential
 422         # items, then pick randomly from that, and partition the whole range
 423         # into chunks.  This also avoids the slight problem that the index
 424         # contains more rows (for languages) for some items than others.
 425         # XXX ought to cache this (in the index?) if possible
 426         total = 0
 427         partitions = []
 428         for table in tables:
 429             count = self.session.query(table).count()
 430             total += count
 431             partitions.append((table, count))
 432
 433         n = random.randint(1, total)
 434         while n > partitions[0][1]:
 435             n -= partitions[0][1]
 436             partitions.pop(0)
 437
 438         return self.lookup(unicode(n), valid_types=[ partitions[0][0] ])
 439
 440     def prefix_lookup(self, prefix):
 441         """Returns terms starting with the given exact prefix.
 442
 443         No special magic is currently done with the name; type prefixes are not
 444         recognized.
 445         """
 446
 447         query = whoosh.query.Prefix(u'name', self.normalize_name(prefix))
 448
 449         searcher = self.index.searcher()
 450         searcher.weighting = LanguageWeighting()
 451         results = searcher.search(query)  # XXX , limit=self.MAX_LOOKUP_RESULTS)
 452
 453         return self._whoosh_records_to_results(results)