pokedex/lookup.py

   1 # encoding: utf8
   2 from collections import namedtuple
   3 import os, os.path
   4 import pkg_resources
   5 import random
   6 import re
   7 import shutil
   8 import unicodedata
   9
  10 from sqlalchemy.sql import func
  11 import whoosh
  12 import whoosh.filedb.filestore
  13 import whoosh.filedb.fileindex
  14 import whoosh.index
  15 from whoosh.qparser import QueryParser
  16 import whoosh.scoring
  17 import whoosh.spelling
  18
  19 from pokedex.db import connect
  20 import pokedex.db.tables as tables
  21 from pokedex.roomaji import romanize
  22
  23 __all__ = ['PokedexLookup']
  24
  25
  26 rx_is_number = re.compile('^\d+$')
  27
  28 LookupResult = namedtuple('LookupResult',
  29                           ['object', 'name', 'language', 'iso3166', 'exact'])
  30
  31 class LanguageWeighting(whoosh.scoring.Weighting):
  32     """A scoring class that forces otherwise-equal English results to come
  33     before foreign results.
  34     """
  35
  36     def score(self, searcher, fieldnum, text, docnum, weight, QTF=1):
  37         doc = searcher.stored_fields(docnum)
  38         if doc['language'] == None:
  39             # English (well, "default"); leave it at 1
  40             return weight
  41         elif doc['language'] == u'Roomaji':
  42             # Give Roomaji a little boost; it's most likely to be searched
  43             return weight * 0.95
  44         else:
  45             # Everything else can drop down the totem pole
  46             return weight * 0.9
  47
  48
  49 class PokedexLookup(object):
  50     INTERMEDIATE_LOOKUP_RESULTS = 25
  51     MAX_LOOKUP_RESULTS = 10
  52
  53     # Dictionary of table name => table class.
  54     # Need the table name so we can get the class from the table name after we
  55     # retrieve something from the index
  56     indexed_tables = dict(
  57         (cls.__tablename__, cls)
  58         for cls in (
  59             tables.Ability,
  60             tables.Item,
  61             tables.Move,
  62             tables.Pokemon,
  63             tables.Type,
  64         )
  65     )
  66
  67
  68     def __init__(self, directory=None, session=None, recreate=False):
  69         """Opens the whoosh index stored in the named directory.  If the index
  70         doesn't already exist, it will be created.
  71
  72         `directory`
  73             Directory containing the index.  Defaults to a location within the
  74             `pokedex` egg directory.
  75
  76         `session`
  77             If the index needs to be created, this database session will be
  78             used.  Defaults to an attempt to connect to the default SQLite
  79             database installed by `pokedex setup`.
  80
  81         `recreate`
  82             If set to True, the whoosh index will be created even if it already
  83             exists.
  84         """
  85
  86         # By the time this returns, self.index, self.speller, and self.session
  87         # must be set
  88
  89         # Defaults
  90         if not directory:
  91             directory = pkg_resources.resource_filename('pokedex',
  92                                                         'data/whoosh-index')
  93
  94         if session:
  95             self.session = session
  96         else:
  97             self.session = connect()
  98
  99         # Attempt to open or create the index
 100         directory_exists = os.path.exists(directory)
 101         if directory_exists and not recreate:
 102             # Already exists; should be an index!  Bam, done.
 103             try:
 104                 self.index = whoosh.index.open_dir(directory, indexname='MAIN')
 105                 spell_store = whoosh.filedb.filestore.FileStorage(directory)
 106                 self.speller = whoosh.spelling.SpellChecker(spell_store)
 107                 return
 108             except whoosh.index.EmptyIndexError as e:
 109                 # Apparently not a real index.  Fall out and create it
 110                 pass
 111
 112         # Delete and start over if we're going to bail anyway.
 113         if directory_exists and recreate:
 114             # Be safe and only delete if it looks like a whoosh index, i.e.,
 115             # everything starts with _
 116             if all(f[0] == '_' for f in os.listdir(directory)):
 117                 shutil.rmtree(directory)
 118                 directory_exists = False
 119
 120         if not directory_exists:
 121             os.mkdir(directory)
 122
 123
 124         ### Create index
 125         schema = whoosh.fields.Schema(
 126             name=whoosh.fields.ID(stored=True),
 127             table=whoosh.fields.ID(stored=True),
 128             row_id=whoosh.fields.ID(stored=True),
 129             language=whoosh.fields.STORED,
 130             iso3166=whoosh.fields.STORED,
 131             display_name=whoosh.fields.STORED,  # non-lowercased name
 132             forme_name=whoosh.fields.ID,
 133         )
 134
 135         self.index = whoosh.index.create_in(directory, schema=schema,
 136                                             indexname='MAIN')
 137         writer = self.index.writer()
 138
 139         # Index every name in all our tables of interest
 140         # speller_entries becomes a list of (word, score) tuples; the score is
 141         # 2 for English names, 1.5 for Roomaji, and 1 for everything else.  I
 142         # think this biases the results in the direction most people expect,
 143         # especially when e.g. German names are very similar to English names
 144         speller_entries = []
 145         for cls in self.indexed_tables.values():
 146             q = session.query(cls)
 147
 148             for row in q.yield_per(5):
 149                 # Need to give forme_name a dummy value because I can't
 150                 # search for explicitly empty fields.  Boo.
 151                 row_key = dict(table=unicode(cls.__tablename__),
 152                                row_id=unicode(row.id),
 153                                forme_name=u'__empty__')
 154
 155                 def add(name, language, iso3166, score):
 156                     normalized_name = self.normalize_name(name)
 157                     writer.add_document(
 158                         name=normalized_name, display_name=name,
 159                         language=language, iso3166=iso3166,
 160                         **row_key
 161                     )
 162                     speller_entries.append((normalized_name, score))
 163
 164                 # If this is a form, mark it as such
 165                 if getattr(row, 'forme_base_pokemon_id', None):
 166                     row_key['forme_name'] = row.forme_name
 167
 168                 name = row.name
 169                 add(name, None, u'us', 1)
 170
 171                 # Pokemon also get other languages
 172                 for foreign_name in getattr(row, 'foreign_names', []):
 173                     moonspeak = foreign_name.name
 174                     if name == moonspeak:
 175                         # Don't add the English name again as a different
 176                         # language; no point and it makes spell results
 177                         # confusing
 178                         continue
 179
 180                     add(moonspeak, foreign_name.language.name,
 181                                    foreign_name.language.iso3166,
 182                                    3)
 183
 184                     # Add Roomaji too
 185                     if foreign_name.language.name == 'Japanese':
 186                         roomaji = romanize(foreign_name.name)
 187                         add(roomaji, u'Roomaji', u'jp', 8)
 188
 189         writer.commit()
 190
 191         # Construct and populate a spell-checker index.  Quicker to do it all
 192         # at once, as every call to add_* does a commit(), and those seem to be
 193         # expensive
 194         self.speller = whoosh.spelling.SpellChecker(self.index.storage)
 195         self.speller.add_scored_words(speller_entries)
 196
 197
 198     def normalize_name(self, name):
 199         """Strips irrelevant formatting junk from name input.
 200
 201         Specifically: everything is lowercased, and accents are removed.
 202         """
 203         # http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
 204         # Makes sense to me.  Decompose by Unicode rules, then remove combining
 205         # characters, then recombine.  I'm explicitly doing it this way instead
 206         # of testing combining() because Korean characters apparently
 207         # decompose!  But the results are considered letters, not combining
 208         # characters, so testing for Mn works well, and combining them again
 209         # makes them look right.
 210         nkfd_form = unicodedata.normalize('NFKD', unicode(name))
 211         name = u"".join(c for c in nkfd_form
 212                         if unicodedata.category(c) != 'Mn')
 213         name = unicodedata.normalize('NFC', name)
 214
 215         name = name.strip()
 216         name = name.lower()
 217
 218         return name
 219
 220
 221     def _parse_table_name(self, name):
 222         """Takes a singular table name, table name, or table object and returns
 223         the table name.
 224
 225         Returns None for a bogus name.
 226         """
 227         if hasattr(name, '__tablename__'):
 228             return getattr(name, '__tablename__')
 229         elif name in self.indexed_tables:
 230             return name
 231         elif name + 's' in self.indexed_tables:
 232             return name + 's'
 233         else:
 234             # Bogus.  Be nice and return dummy
 235             return None
 236
 237     def _whoosh_records_to_results(self, records, exact=True):
 238         """Converts a list of whoosh's indexed records to LookupResult tuples
 239         containing database objects.
 240         """
 241         # XXX this 'exact' thing is getting kinda leaky.  would like a better
 242         # way to handle it, since only lookup() cares about fuzzy results
 243         seen = {}
 244         results = []
 245         for record in records:
 246             # Skip dupes
 247             seen_key = record['table'], record['row_id']
 248             if seen_key in seen:
 249                 continue
 250             seen[seen_key] = True
 251
 252             cls = self.indexed_tables[record['table']]
 253             obj = self.session.query(cls).get(record['row_id'])
 254
 255             results.append(LookupResult(object=obj,
 256                                         name=record['display_name'],
 257                                         language=record['language'],
 258                                         iso3166=record['iso3166'],
 259                                         exact=exact))
 260
 261         return results
 262
 263
 264     def lookup(self, input, valid_types=[], exact_only=False):
 265         """Attempts to find some sort of object, given a name.
 266
 267         Returns a list of named (object, name, language, iso3166, exact)
 268         tuples.  `object` is a database object, `name` is the name under which
 269         the object was found, `language` and `iso3166` are the name and country
 270         code of the language in which the name was found, and `exact` is True
 271         iff this was an
 272         exact match.
 273
 274         This function currently ONLY does fuzzy matching if there are no exact
 275         matches.
 276
 277         Formes are not returned unless requested; "Shaymin" will return only
 278         grass Shaymin.
 279
 280         Extraneous whitespace is removed with extreme prejudice.
 281
 282         Recognizes:
 283         - Names: "Eevee", "Surf", "Run Away", "Payapa Berry", etc.
 284         - Foreign names: "Iibui", "Eivui"
 285         - Fuzzy names in whatever language: "Evee", "Ibui"
 286         - IDs: "133", "192", "250"
 287         Also:
 288         - Type restrictions.  "type:psychic" will only return the type.  This
 289           is how to make ID lookup useful.  Multiple type specs can be entered
 290           with commas, as "move,item:1".  If `valid_types` are provided, any
 291           type prefix will be ignored.
 292         - Alternate formes can be specified merely like "wash rotom".
 293
 294         `input`
 295             Name of the thing to look for.
 296
 297         `valid_types`
 298             A list of table objects or names, e.g., `['pokemon', 'moves']`.  If
 299             this is provided, only results in one of the given tables will be
 300             returned.
 301
 302         `exact_only`
 303             If True, only exact matches are returned.  If set to False (the
 304             default), and the provided `name` doesn't match anything exactly,
 305             spelling correction will be attempted.
 306         """
 307
 308         name = self.normalize_name(input)
 309         exact = True
 310         form = None
 311
 312         # Remove any type prefix (pokemon:133) before constructing a query
 313         if ':' in name:
 314             prefix_chunk, name = name.split(':', 1)
 315             name = name.strip()
 316
 317             if not valid_types:
 318                 # Only use types from the query string if none were explicitly
 319                 # provided
 320                 prefixes = prefix_chunk.split(',')
 321                 valid_types = [_.strip() for _ in prefixes]
 322
 323         # Random lookup
 324         if name == 'random':
 325             return self.random_lookup(valid_types=valid_types)
 326
 327         # Do different things depending what the query looks like
 328         # Note: Term objects do an exact match, so we don't have to worry about
 329         # a query parser tripping on weird characters in the input
 330         if '*' in name or '?' in name:
 331             exact_only = True
 332             query = whoosh.query.Wildcard(u'name', name)
 333         elif rx_is_number.match(name):
 334             # Don't spell-check numbers!
 335             exact_only = True
 336             query = whoosh.query.Term(u'row_id', name)
 337         else:
 338             # Not an integer
 339             query = whoosh.query.Term(u'name', name) \
 340                   & whoosh.query.Term(u'forme_name', u'__empty__')
 341
 342             # If there's a space in the input, this might be a form
 343             if ' ' in name:
 344                 form, formless_name = name.split(' ', 1)
 345                 form_query = whoosh.query.Term(u'name', formless_name) \
 346                            & whoosh.query.Term(u'forme_name', form)
 347                 query = query | form_query
 348
 349         ### Filter by type of object
 350         type_terms = []
 351         for valid_type in valid_types:
 352             table_name = self._parse_table_name(valid_type)
 353             if table_name:
 354                 # Quietly ignore bogus valid_types; more likely to DTRT
 355                 type_terms.append(whoosh.query.Term(u'table', table_name))
 356
 357         if type_terms:
 358             query = query & whoosh.query.Or(type_terms)
 359
 360
 361         ### Actual searching
 362         searcher = self.index.searcher()
 363         # XXX is this kosher?  docs say search() takes a weighting arg, but it
 364         # certainly does not
 365         searcher.weighting = LanguageWeighting()
 366         results = searcher.search(query,
 367                                   limit=self.INTERMEDIATE_LOOKUP_RESULTS)
 368
 369         # Look for some fuzzy matches if necessary
 370         if not exact_only and not results:
 371             exact = False
 372             results = []
 373
 374             for suggestion in self.speller.suggest(
 375                 name, self.INTERMEDIATE_LOOKUP_RESULTS):
 376
 377                 query = whoosh.query.Term('name', suggestion)
 378                 results.extend(searcher.search(query))
 379
 380         ### Convert results to db objects
 381         objects = self._whoosh_records_to_results(results, exact=exact)
 382
 383         # Only return up to 10 matches; beyond that, something is wrong.  We
 384         # strip out duplicate entries above, so it's remotely possible that we
 385         # should have more than 10 here and lost a few.  The speller returns 25
 386         # to give us some padding, and should avoid that problem.  Not a big
 387         # deal if we lose the 25th-most-likely match anyway.
 388         return objects[:self.MAX_LOOKUP_RESULTS]
 389
 390
 391     def random_lookup(self, valid_types=[]):
 392         """Returns a random lookup result from one of the provided
 393         `valid_types`.
 394         """
 395
 396         tables = []
 397         for valid_type in valid_types:
 398             table_name = self._parse_table_name(valid_type)
 399             if table_name:
 400                 tables.append(self.indexed_tables[table_name])
 401
 402         if not tables:
 403             # n.b.: It's possible we got a list of valid_types and none of them
 404             # were valid, but this function is guaranteed to return
 405             # *something*, so it politely selects from the entire index isntead
 406             tables = self.indexed_tables.values()
 407
 408         # Rather than create an array of many hundred items and pick randomly
 409         # from it, just pick a number up to the total number of potential
 410         # items, then pick randomly from that, and partition the whole range
 411         # into chunks.  This also avoids the slight problem that the index
 412         # contains more rows (for languages) for some items than others.
 413         # XXX ought to cache this (in the index?) if possible
 414         total = 0
 415         partitions = []
 416         for table in tables:
 417             count = self.session.query(table).count()
 418             total += count
 419             partitions.append((table, count))
 420
 421         n = random.randint(1, total)
 422         while n > partitions[0][1]:
 423             n -= partitions[0][1]
 424             partitions.pop(0)
 425
 426         return self.lookup(unicode(n), valid_types=[ partitions[0][0] ])
 427
 428     def prefix_lookup(self, prefix):
 429         """Returns terms starting with the given exact prefix.
 430
 431         No special magic is currently done with the name; type prefixes are not
 432         recognized.
 433         """
 434
 435         query = whoosh.query.Prefix(u'name', self.normalize_name(prefix))
 436
 437         searcher = self.index.searcher()
 438         searcher.weighting = LanguageWeighting()
 439         results = searcher.search(query)  # XXX , limit=self.MAX_LOOKUP_RESULTS)
 440
 441         return self._whoosh_records_to_results(results)