pokedex/lookup.py

   1 # encoding: utf8
   2 from collections import namedtuple
   3 import os, os.path
   4 import pkg_resources
   5 import random
   6 import re
   7 import shutil
   8 import unicodedata
   9
  10 from sqlalchemy.sql import func
  11 import whoosh
  12 import whoosh.filedb.filestore
  13 import whoosh.filedb.fileindex
  14 import whoosh.index
  15 from whoosh.qparser import QueryParser
  16 import whoosh.scoring
  17 import whoosh.spelling
  18
  19 from pokedex.db import connect
  20 import pokedex.db.tables as tables
  21 from pokedex.roomaji import romanize
  22
  23 __all__ = ['PokedexLookup']
  24
  25
  26 rx_is_number = re.compile('^\d+$')
  27
  28 LookupResult = namedtuple('LookupResult',
  29     ['object', 'indexed_name', 'name', 'language', 'iso3166', 'exact'])
  30
  31 class LanguageWeighting(whoosh.scoring.Weighting):
  32     """A scoring class that forces otherwise-equal English results to come
  33     before foreign results.
  34     """
  35
  36     def score(self, searcher, fieldnum, text, docnum, weight, QTF=1):
  37         doc = searcher.stored_fields(docnum)
  38         if doc['language'] == None:
  39             # English (well, "default"); leave it at 1
  40             return weight
  41         elif doc['language'] == u'Roomaji':
  42             # Give Roomaji a little boost; it's most likely to be searched
  43             return weight * 0.95
  44         else:
  45             # Everything else can drop down the totem pole
  46             return weight * 0.9
  47
  48
  49 class PokedexLookup(object):
    # How many hits lookup() asks the index for, and how many spelling
    # suggestions it asks the speller for, before duplicates are dropped.
    INTERMEDIATE_LOOKUP_RESULTS = 25
    # Upper bound on the number of results lookup() returns.
    MAX_LOOKUP_RESULTS = 10

    # Dictionary of table name => table class.
    # Need the table name so we can get the class from the table name after we
    # retrieve something from the index.  Only the tables listed here are
    # written to the index when it is built.
    indexed_tables = dict(
        (cls.__tablename__, cls)
        for cls in (
            tables.Ability,
            tables.Item,
            tables.Location,
            tables.Move,
            tables.Pokemon,
            tables.Type,
        )
    )
  67
  68
  69     def __init__(self, directory=None, session=None, recreate=False):
  70         """Opens the whoosh index stored in the named directory.  If the index
  71         doesn't already exist, it will be created.
  72
  73         `directory`
  74             Directory containing the index.  Defaults to a location within the
  75             `pokedex` egg directory.
  76
  77         `session`
  78             If the index needs to be created, this database session will be
  79             used.  Defaults to an attempt to connect to the default SQLite
  80             database installed by `pokedex setup`.
  81
  82         `recreate`
  83             If set to True, the whoosh index will be created even if it already
  84             exists.
  85         """
  86
  87         # By the time this returns, self.index, self.speller, and self.session
  88         # must be set
  89
  90         # Defaults
  91         if not directory:
  92             directory = pkg_resources.resource_filename('pokedex',
  93                                                         'data/whoosh-index')
  94
  95         if session:
  96             self.session = session
  97         else:
  98             self.session = connect()
  99
 100         # Attempt to open or create the index
 101         directory_exists = os.path.exists(directory)
 102         if directory_exists and not recreate:
 103             # Already exists; should be an index!  Bam, done.
 104             try:
 105                 self.index = whoosh.index.open_dir(directory, indexname='MAIN')
 106                 spell_store = whoosh.filedb.filestore.FileStorage(directory)
 107                 self.speller = whoosh.spelling.SpellChecker(spell_store)
 108                 return
 109             except whoosh.index.EmptyIndexError as e:
 110                 # Apparently not a real index.  Fall out and create it
 111                 pass
 112
 113         # Delete and start over if we're going to bail anyway.
 114         if directory_exists and recreate:
 115             # Be safe and only delete if it looks like a whoosh index, i.e.,
 116             # everything starts with _
 117             if all(f[0] == '_' for f in os.listdir(directory)):
 118                 shutil.rmtree(directory)
 119                 directory_exists = False
 120
 121         if not directory_exists:
 122             os.mkdir(directory)
 123
 124
 125         ### Create index
 126         schema = whoosh.fields.Schema(
 127             name=whoosh.fields.ID(stored=True),
 128             table=whoosh.fields.ID(stored=True),
 129             row_id=whoosh.fields.ID(stored=True),
 130             language=whoosh.fields.STORED,
 131             iso3166=whoosh.fields.STORED,
 132             display_name=whoosh.fields.STORED,  # non-lowercased name
 133         )
 134
 135         self.index = whoosh.index.create_in(directory, schema=schema,
 136                                             indexname='MAIN')
 137         writer = self.index.writer()
 138
 139         # Index every name in all our tables of interest
 140         # speller_entries becomes a list of (word, score) tuples; the score is
 141         # 2 for English names, 1.5 for Roomaji, and 1 for everything else.  I
 142         # think this biases the results in the direction most people expect,
 143         # especially when e.g. German names are very similar to English names
 144         speller_entries = []
 145         for cls in self.indexed_tables.values():
 146             q = self.session.query(cls)
 147
 148             for row in q.yield_per(5):
 149                 row_key = dict(table=unicode(cls.__tablename__),
 150                                row_id=unicode(row.id))
 151
 152                 def add(name, language, iso3166, score):
 153                     normalized_name = self.normalize_name(name)
 154
 155                     writer.add_document(
 156                         name=normalized_name, display_name=name,
 157                         language=language, iso3166=iso3166,
 158                         **row_key
 159                     )
 160
 161                     speller_entries.append((normalized_name, score))
 162
 163
 164                 # Add the basic English name to the index
 165                 if cls == tables.Pokemon:
 166                     # Pokémon need their form name added
 167                     # XXX kinda kludgy
 168                     add(row.full_name, None, u'us', 1)
 169
 170                     # If this is a default form, ALSO add the unadorned name,
 171                     # so 'Deoxys' alone will still do the right thing
 172                     if row.forme_name and not row.forme_base_pokemon_id:
 173                         add(row.name, None, u'us', 1)
 174                 else:
 175                     add(row.name, None, u'us', 1)
 176
 177                 # Some things also have other languages' names
 178                 # XXX other language form names..?
 179                 for foreign_name in getattr(row, 'foreign_names', []):
 180                     moonspeak = foreign_name.name
 181                     if row.name == moonspeak:
 182                         # Don't add the English name again as a different
 183                         # language; no point and it makes spell results
 184                         # confusing
 185                         continue
 186
 187                     add(moonspeak, foreign_name.language.name,
 188                                    foreign_name.language.iso3166,
 189                                    3)
 190
 191                     # Add Roomaji too
 192                     if foreign_name.language.name == 'Japanese':
 193                         roomaji = romanize(foreign_name.name)
 194                         add(roomaji, u'Roomaji', u'jp', 8)
 195
 196         writer.commit()
 197
 198         # Construct and populate a spell-checker index.  Quicker to do it all
 199         # at once, as every call to add_* does a commit(), and those seem to be
 200         # expensive
 201         self.speller = whoosh.spelling.SpellChecker(self.index.storage)
 202         self.speller.add_scored_words(speller_entries)
 203
 204
 205     def normalize_name(self, name):
 206         """Strips irrelevant formatting junk from name input.
 207
 208         Specifically: everything is lowercased, and accents are removed.
 209         """
 210         # http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
 211         # Makes sense to me.  Decompose by Unicode rules, then remove combining
 212         # characters, then recombine.  I'm explicitly doing it this way instead
 213         # of testing combining() because Korean characters apparently
 214         # decompose!  But the results are considered letters, not combining
 215         # characters, so testing for Mn works well, and combining them again
 216         # makes them look right.
 217         nkfd_form = unicodedata.normalize('NFKD', unicode(name))
 218         name = u"".join(c for c in nkfd_form
 219                         if unicodedata.category(c) != 'Mn')
 220         name = unicodedata.normalize('NFC', name)
 221
 222         name = name.strip()
 223         name = name.lower()
 224
 225         return name
 226
 227
 228     def _parse_table_name(self, name):
 229         """Takes a singular table name, table name, or table object and returns
 230         the table name.
 231
 232         Returns None for a bogus name.
 233         """
 234         if hasattr(name, '__tablename__'):
 235             return getattr(name, '__tablename__')
 236         elif name in self.indexed_tables:
 237             return name
 238         elif name + 's' in self.indexed_tables:
 239             return name + 's'
 240         else:
 241             # Bogus.  Be nice and return dummy
 242             return None
 243
 244     def _whoosh_records_to_results(self, records, exact=True):
 245         """Converts a list of whoosh's indexed records to LookupResult tuples
 246         containing database objects.
 247         """
 248         # XXX this 'exact' thing is getting kinda leaky.  would like a better
 249         # way to handle it, since only lookup() cares about fuzzy results
 250         seen = {}
 251         results = []
 252         for record in records:
 253             # Skip dupes
 254             seen_key = record['table'], record['row_id']
 255             if seen_key in seen:
 256                 continue
 257             seen[seen_key] = True
 258
 259             cls = self.indexed_tables[record['table']]
 260             obj = self.session.query(cls).get(record['row_id'])
 261
 262             results.append(LookupResult(object=obj,
 263                                         indexed_name=record['name'],
 264                                         name=record['display_name'],
 265                                         language=record['language'],
 266                                         iso3166=record['iso3166'],
 267                                         exact=exact))
 268
 269         return results
 270
 271
 272     def lookup(self, input, valid_types=[], exact_only=False):
 273         """Attempts to find some sort of object, given a name.
 274
 275         Returns a list of named (object, name, language, iso3166, exact)
 276         tuples.  `object` is a database object, `name` is the name under which
 277         the object was found, `language` and `iso3166` are the name and country
 278         code of the language in which the name was found, and `exact` is True
 279         iff this was an
 280         exact match.
 281
 282         This function currently ONLY does fuzzy matching if there are no exact
 283         matches.
 284
 285         Formes are not returned unless requested; "Shaymin" will return only
 286         grass Shaymin.
 287
 288         Extraneous whitespace is removed with extreme prejudice.
 289
 290         Recognizes:
 291         - Names: "Eevee", "Surf", "Run Away", "Payapa Berry", etc.
 292         - Foreign names: "Iibui", "Eivui"
 293         - Fuzzy names in whatever language: "Evee", "Ibui"
 294         - IDs: "133", "192", "250"
 295         Also:
 296         - Type restrictions.  "type:psychic" will only return the type.  This
 297           is how to make ID lookup useful.  Multiple type specs can be entered
 298           with commas, as "move,item:1".  If `valid_types` are provided, any
 299           type prefix will be ignored.
 300         - Alternate formes can be specified merely like "wash rotom".
 301
 302         `input`
 303             Name of the thing to look for.
 304
 305         `valid_types`
 306             A list of table objects or names, e.g., `['pokemon', 'moves']`.  If
 307             this is provided, only results in one of the given tables will be
 308             returned.
 309
 310         `exact_only`
 311             If True, only exact matches are returned.  If set to False (the
 312             default), and the provided `name` doesn't match anything exactly,
 313             spelling correction will be attempted.
 314         """
 315
 316         name = self.normalize_name(input)
 317         exact = True
 318         form = None
 319
 320         # Remove any type prefix (pokemon:133) before constructing a query
 321         if ':' in name:
 322             prefix_chunk, name = name.split(':', 1)
 323             name = name.strip()
 324
 325             if not valid_types:
 326                 # Only use types from the query string if none were explicitly
 327                 # provided
 328                 prefixes = prefix_chunk.split(',')
 329                 valid_types = [_.strip() for _ in prefixes]
 330
 331         # Random lookup
 332         if name == 'random':
 333             return self.random_lookup(valid_types=valid_types)
 334
 335         # Do different things depending what the query looks like
 336         # Note: Term objects do an exact match, so we don't have to worry about
 337         # a query parser tripping on weird characters in the input
 338         if '*' in name or '?' in name:
 339             exact_only = True
 340             query = whoosh.query.Wildcard(u'name', name)
 341         elif rx_is_number.match(name):
 342             # Don't spell-check numbers!
 343             exact_only = True
 344             query = whoosh.query.Term(u'row_id', name)
 345         else:
 346             # Not an integer
 347             query = whoosh.query.Term(u'name', name)
 348
 349         ### Filter by type of object
 350         type_terms = []
 351         for valid_type in valid_types:
 352             table_name = self._parse_table_name(valid_type)
 353             if table_name:
 354                 # Quietly ignore bogus valid_types; more likely to DTRT
 355                 type_terms.append(whoosh.query.Term(u'table', table_name))
 356
 357         if type_terms:
 358             query = query & whoosh.query.Or(type_terms)
 359
 360
 361         ### Actual searching
 362         searcher = self.index.searcher()
 363         # XXX is this kosher?  docs say search() takes a weighting arg, but it
 364         # certainly does not
 365         searcher.weighting = LanguageWeighting()
 366         results = searcher.search(query,
 367                                   limit=self.INTERMEDIATE_LOOKUP_RESULTS)
 368
 369         # Look for some fuzzy matches if necessary
 370         if not exact_only and not results:
 371             exact = False
 372             results = []
 373
 374             for suggestion in self.speller.suggest(
 375                 name, self.INTERMEDIATE_LOOKUP_RESULTS):
 376
 377                 query = whoosh.query.Term('name', suggestion)
 378                 results.extend(searcher.search(query))
 379
 380         ### Convert results to db objects
 381         objects = self._whoosh_records_to_results(results, exact=exact)
 382
 383         # Only return up to 10 matches; beyond that, something is wrong.  We
 384         # strip out duplicate entries above, so it's remotely possible that we
 385         # should have more than 10 here and lost a few.  The speller returns 25
 386         # to give us some padding, and should avoid that problem.  Not a big
 387         # deal if we lose the 25th-most-likely match anyway.
 388         return objects[:self.MAX_LOOKUP_RESULTS]
 389
 390
 391     def random_lookup(self, valid_types=[]):
 392         """Returns a random lookup result from one of the provided
 393         `valid_types`.
 394         """
 395
 396         tables = []
 397         for valid_type in valid_types:
 398             table_name = self._parse_table_name(valid_type)
 399             if table_name:
 400                 tables.append(self.indexed_tables[table_name])
 401
 402         if not tables:
 403             # n.b.: It's possible we got a list of valid_types and none of them
 404             # were valid, but this function is guaranteed to return
 405             # *something*, so it politely selects from the entire index isntead
 406             tables = self.indexed_tables.values()
 407
 408         # Rather than create an array of many hundred items and pick randomly
 409         # from it, just pick a number up to the total number of potential
 410         # items, then pick randomly from that, and partition the whole range
 411         # into chunks.  This also avoids the slight problem that the index
 412         # contains more rows (for languages) for some items than others.
 413         # XXX ought to cache this (in the index?) if possible
 414         total = 0
 415         partitions = []
 416         for table in tables:
 417             count = self.session.query(table).count()
 418             total += count
 419             partitions.append((table, count))
 420
 421         n = random.randint(1, total)
 422         while n > partitions[0][1]:
 423             n -= partitions[0][1]
 424             partitions.pop(0)
 425
 426         return self.lookup(unicode(n), valid_types=[ partitions[0][0] ])
 427
 428     def prefix_lookup(self, prefix):
 429         """Returns terms starting with the given exact prefix.
 430
 431         No special magic is currently done with the name; type prefixes are not
 432         recognized.
 433         """
 434
 435         query = whoosh.query.Prefix(u'name', self.normalize_name(prefix))
 436
 437         searcher = self.index.searcher()
 438         searcher.weighting = LanguageWeighting()
 439         results = searcher.search(query)  # XXX , limit=self.MAX_LOOKUP_RESULTS)
 440
 441         return self._whoosh_records_to_results(results)