pokedex/lookup.py

   1 # encoding: utf8
   2 from collections import namedtuple
   3 import os, os.path
   4 import pkg_resources
   5 import random
   6 import re
   7 import shutil
   8 import unicodedata
   9
  10 from sqlalchemy.sql import func
  11 import whoosh
  12 import whoosh.filedb.filestore
  13 import whoosh.filedb.fileindex
  14 import whoosh.index
  15 from whoosh.qparser import QueryParser
  16 import whoosh.scoring
  17 import whoosh.spelling
  18
  19 from pokedex.db import connect
  20 import pokedex.db.tables as tables
  21 from pokedex.roomaji import romanize
  22
  23 __all__ = ['open_index', 'lookup', 'random_lookup']
  24
  25 INTERMEDIATE_LOOKUP_RESULTS = 25
  26 MAX_LOOKUP_RESULTS = 10
  27
  28 # Dictionary of table name => table class.
  29 # Need the table name so we can get the class from the table name after we
  30 # retrieve something from the index
  31 indexed_tables = {}
  32 for cls in [
  33         tables.Ability,
  34         tables.Item,
  35         tables.Move,
  36         tables.Pokemon,
  37         tables.Type,
  38     ]:
  39     indexed_tables[cls.__tablename__] = cls
  40
  41 def normalize(name):
  42     """Strips irrelevant formatting junk from name input.
  43
  44     Specifically: everything is lowercased, and accents are removed.
  45     """
  46     # http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
  47     # Makes sense to me.  Decompose by Unicode rules, then remove combining
  48     # characters, then recombine.  I'm explicitly doing it this way instead of
  49     # testing combining() because Korean characters apparently decompose!  But
  50     # the results are considered letters, not combining characters, so testing
  51     # for Mn works well, and combining them again makes them look right.
  52     nkfd_form = unicodedata.normalize('NFKD', unicode(name))
  53     name = u"".join(c for c in nkfd_form
  54                     if unicodedata.category(c) != 'Mn')
  55     name = unicodedata.normalize('NFC', name)
  56
  57     name = name.strip()
  58     name = name.lower()
  59
  60     return name
  61
  62
  63 def open_index(directory=None, session=None, recreate=False):
  64     """Opens the whoosh index stored in the named directory and returns (index,
  65     speller).  If the index doesn't already exist, it will be created.
  66
  67     `directory`
  68         Directory containing the index.  Defaults to a location within the
  69         `pokedex` egg directory.
  70
  71     `session`
  72         If the index needs to be created, this database session will be used.
  73         Defaults to an attempt to connect to the default SQLite database
  74         installed by `pokedex setup`.
  75
  76     `recreate`
  77         If set to True, the whoosh index will be created even if it already
  78         exists.
  79     """
  80
  81     # Defaults
  82     if not directory:
  83         directory = pkg_resources.resource_filename('pokedex',
  84                                                     'data/whoosh-index')
  85
  86     if not session:
  87         session = connect()
  88
  89     # Attempt to open or create the index
  90     directory_exists = os.path.exists(directory)
  91     if directory_exists and not recreate:
  92         # Already exists; should be an index!
  93         try:
  94             index = whoosh.index.open_dir(directory, indexname='MAIN')
  95             spell_store = whoosh.filedb.filestore.FileStorage(directory)
  96             speller = whoosh.spelling.SpellChecker(spell_store)
  97             return index, speller
  98         except whoosh.index.EmptyIndexError as e:
  99             # Apparently not a real index.  Fall out of the if and create it
 100             pass
 101
 102     # Delete and start over if we're going to bail anyway.
 103     if directory_exists and recreate:
 104         # Be safe and only delete if it looks like a whoosh index, i.e.,
 105         # everything starts with _
 106         if all(f[0] == '_' for f in os.listdir(directory)):
 107             shutil.rmtree(directory)
 108             directory_exists = False
 109
 110     if not directory_exists:
 111         os.mkdir(directory)
 112
 113
 114     ### Create index
 115     schema = whoosh.fields.Schema(
 116         name=whoosh.fields.ID(stored=True),
 117         table=whoosh.fields.ID(stored=True),
 118         row_id=whoosh.fields.ID(stored=True),
 119         language=whoosh.fields.STORED,
 120         iso3166=whoosh.fields.STORED,
 121         display_name=whoosh.fields.STORED,  # non-lowercased name
 122         forme_name=whoosh.fields.ID,
 123     )
 124
 125     index = whoosh.index.create_in(directory, schema=schema, indexname='MAIN')
 126     writer = index.writer()
 127
 128     # Index every name in all our tables of interest
 129     # speller_entries becomes a list of (word, score) tuples; the score is 2
 130     # for English names, 1.5 for Roomaji, and 1 for everything else.  I think
 131     # this biases the results in the direction most people expect, especially
 132     # when e.g. German names are very similar to English names
 133     speller_entries = []
 134     for cls in indexed_tables.values():
 135         q = session.query(cls)
 136
 137         for row in q.yield_per(5):
 138             # XXX need to give forme_name a dummy value because I can't search
 139             # for explicitly empty fields.  boo.
 140             row_key = dict(table=unicode(cls.__tablename__),
 141                            row_id=unicode(row.id),
 142                            forme_name=u'XXX')
 143
 144             def add(name, language, iso3166, score):
 145                 normalized_name = normalize(name)
 146                 writer.add_document(name=normalized_name, display_name=name,
 147                                     language=language,
 148                                     iso3166=iso3166,
 149                                     **row_key)
 150                 speller_entries.append((normalized_name, score))
 151
 152             # If this is a form, mark it as such
 153             if getattr(row, 'forme_base_pokemon_id', None):
 154                 row_key['forme_name'] = row.forme_name
 155
 156             name = row.name
 157             add(name, None, u'us', 1)
 158
 159             # Pokemon also get other languages
 160             for foreign_name in getattr(row, 'foreign_names', []):
 161                 moonspeak = foreign_name.name
 162                 if name == moonspeak:
 163                     # Don't add the English name again as a different language;
 164                     # no point and it makes spell results confusing
 165                     continue
 166
 167                 add(moonspeak, foreign_name.language.name,
 168                                foreign_name.language.iso3166,
 169                                3)
 170
 171                 # Add Roomaji too
 172                 if foreign_name.language.name == 'Japanese':
 173                     roomaji = romanize(foreign_name.name)
 174                     add(roomaji, u'Roomaji', u'jp', 8)
 175
 176     writer.commit()
 177
 178     # Construct and populate a spell-checker index.  Quicker to do it all
 179     # at once, as every call to add_* does a commit(), and those seem to be
 180     # expensive
 181     speller = whoosh.spelling.SpellChecker(index.storage)
 182     speller.add_scored_words(speller_entries)
 183
 184     return index, speller
 185
 186
 187 class LanguageWeighting(whoosh.scoring.Weighting):
 188     """A scoring class that forces otherwise-equal English results to come
 189     before foreign results.
 190     """
 191
 192     def score(self, searcher, fieldnum, text, docnum, weight, QTF=1):
 193         doc = searcher.stored_fields(docnum)
 194         if doc['language'] == None:
 195             # English (well, "default"); leave it at 1
 196             return weight
 197         elif doc['language'] == u'Roomaji':
 198             # Give Roomaji a bit of a boost, as it's most likely to be searched
 199             return weight * 0.95
 200         else:
 201             # Everything else can drop down the totem pole
 202             return weight * 0.9
 203
 204 rx_is_number = re.compile('^\d+$')
 205
 206 LookupResult = namedtuple('LookupResult',
 207                           ['object', 'name', 'language', 'iso3166', 'exact'])
 208
 209 def _parse_table_name(name):
 210     """Takes a singular table name, table name, or table object and returns the
 211     table name.
 212
 213     Returns None for a bogus name.
 214     """
 215     if hasattr(name, '__tablename__'):
 216         return getattr(name, '__tablename__')
 217     elif name in indexed_tables:
 218         return name
 219     elif name + 's' in indexed_tables:
 220         return name + 's'
 221     else:
 222         # Bogus.  Be nice and return dummy
 223         return None
 224
 225 def _whoosh_records_to_results(records, session, exact=True):
 226     """Converts a list of whoosh's indexed records to LookupResult tuples
 227     containing database objects.
 228     """
 229     # XXX this 'exact' thing is getting kinda leaky.  would like a better way
 230     # to handle it, since only lookup() cares about fuzzy results
 231     seen = {}
 232     results = []
 233     for record in records:
 234         # Skip dupes
 235         seen_key = record['table'], record['row_id']
 236         if seen_key in seen:
 237             continue
 238         seen[seen_key] = True
 239
 240         cls = indexed_tables[record['table']]
 241         obj = session.query(cls).get(record['row_id'])
 242
 243         results.append(LookupResult(object=obj,
 244                                     name=record['display_name'],
 245                                     language=record['language'],
 246                                     iso3166=record['iso3166'],
 247                                     exact=exact))
 248
 249     return results
 250
 251
 252 def lookup(input, valid_types=[], session=None, indices=None, exact_only=False):
 253     """Attempts to find some sort of object, given a database session and name.
 254
 255     Returns a list of named (object, name, language, iso3166, exact) tuples.
 256     `object` is a database object, `name` is the name under which the object
 257     was found, `language` and `iso3166` are the name and country code of the
 258     language in which the name was found, and `exact` is True iff this was an
 259     exact match.
 260
 261     This function currently ONLY does fuzzy matching if there are no exact
 262     matches.
 263
 264     Formes are not returned unless requested; "Shaymin" will return only grass
 265     Shaymin.
 266
 267     Extraneous whitespace is removed with extreme prejudice.
 268
 269     Recognizes:
 270     - Names: "Eevee", "Surf", "Run Away", "Payapa Berry", etc.
 271     - Foreign names: "Iibui", "Eivui"
 272     - Fuzzy names in whatever language: "Evee", "Ibui"
 273     - IDs: "133", "192", "250"
 274     Also:
 275     - Type restrictions.  "type:psychic" will only return the type.  This is
 276       how to make ID lookup useful.  Multiple type specs can be entered with
 277       commas, as "move,item:1".  If `valid_types` are provided, any type prefix
 278       will be ignored.
 279     - Alternate formes can be specified merely like "wash rotom".
 280
 281     `input`
 282         Name of the thing to look for.
 283
 284     `valid_types`
 285         A list of table objects or names, e.g., `['pokemon', 'moves']`.  If
 286         this is provided, only results in one of the given tables will be
 287         returned.
 288
 289     `session`
 290         A database session to use for retrieving objects.  As with get_index,
 291         if this is not provided, a connection to the default database will be
 292         attempted.
 293
 294     `indices`
 295         Tuple of index, speller as returned from `open_index()`.  Defaults to
 296         a call to `open_index()`.
 297
 298     `exact_only`
 299         If True, only exact matches are returned.  If set to False (the
 300         default), and the provided `name` doesn't match anything exactly,
 301         spelling correction will be attempted.
 302     """
 303
 304     if not session:
 305         session = connect()
 306
 307     if indices:
 308         index, speller = indices
 309     else:
 310         index, speller = open_index()
 311
 312     name = normalize(input)
 313     exact = True
 314     form = None
 315
 316     # Remove any type prefix (pokemon:133) before constructing a query
 317     if ':' in name:
 318         prefix_chunk, name = name.split(':', 1)
 319         name = name.strip()
 320
 321         if not valid_types:
 322             # Only use types from the query string if none were explicitly
 323             # provided
 324             prefixes = prefix_chunk.split(',')
 325             valid_types = [_.strip() for _ in prefixes]
 326
 327     # Random lookup
 328     if name == 'random':
 329         return random_lookup(indices=(index, speller),
 330                              session=session,
 331                              valid_types=valid_types)
 332
 333     # Do different things depending what the query looks like
 334     # Note: Term objects do an exact match, so we don't have to worry about a
 335     # query parser tripping on weird characters in the input
 336     if '*' in name or '?' in name:
 337         exact_only = True
 338         query = whoosh.query.Wildcard(u'name', name)
 339     elif rx_is_number.match(name):
 340         # Don't spell-check numbers!
 341         exact_only = True
 342         query = whoosh.query.Term(u'row_id', name)
 343     else:
 344         # Not an integer
 345         query = whoosh.query.Term(u'name', name) \
 346               & whoosh.query.Term(u'forme_name', u'XXX')
 347
 348         # If there's a space in the input, this might be a form
 349         if ' ' in name:
 350             form, formless_name = name.split(' ', 1)
 351             form_query = whoosh.query.Term(u'name', formless_name) \
 352                        & whoosh.query.Term(u'forme_name', form)
 353             query = query | form_query
 354
 355     ### Filter by type of object
 356     type_terms = []
 357     for valid_type in valid_types:
 358         table_name = _parse_table_name(valid_type)
 359         if table_name:
 360             # Quietly ignore bogus valid_types; more likely to DTRT
 361             type_terms.append(whoosh.query.Term(u'table', table_name))
 362
 363     if type_terms:
 364         query = query & whoosh.query.Or(type_terms)
 365
 366
 367     ### Actual searching
 368     searcher = index.searcher()
 369     searcher.weighting = LanguageWeighting()  # XXX kosher?  docs say search()
 370                                               # takes a weighting kw but it
 371                                               # certainly does not
 372     results = searcher.search(query, limit=INTERMEDIATE_LOOKUP_RESULTS)
 373
 374     # Look for some fuzzy matches if necessary
 375     if not exact_only and not results:
 376         exact = False
 377         results = []
 378
 379         for suggestion in speller.suggest(name, INTERMEDIATE_LOOKUP_RESULTS):
 380             query = whoosh.query.Term('name', suggestion)
 381             results.extend(searcher.search(query))
 382
 383     ### Convert results to db objects
 384     objects = _whoosh_records_to_results(results, session, exact=exact)
 385
 386     # Only return up to 10 matches; beyond that, something is wrong.
 387     # We strip out duplicate entries above, so it's remotely possible that we
 388     # should have more than 10 here and lost a few.  The speller returns 25 to
 389     # give us some padding, and should avoid that problem.  Not a big deal if
 390     # we lose the 25th-most-likely match anyway.
 391     return objects[:MAX_LOOKUP_RESULTS]
 392
 393
 394 def random_lookup(valid_types=[], session=None, indices=None):
 395     """Takes similar arguments as `lookup()`, but returns a random lookup
 396     result from one of the provided `valid_types`.
 397     """
 398
 399     tables = []
 400     for valid_type in valid_types:
 401         table_name = _parse_table_name(valid_type)
 402         if table_name:
 403             tables.append(indexed_tables[table_name])
 404
 405     if not tables:
 406         # n.b.: It's possible we got a list of valid_types and none of them
 407         # were valid, but this function is guaranteed to return *something*, so
 408         # it politely selects from the entire index isntead
 409         tables = indexed_tables.values()
 410
 411     # Rather than create an array of many hundred items and pick randomly from
 412     # it, just pick a number up to the total number of potential items, then
 413     # pick randomly from that, and partition the whole range into chunks.
 414     # This also avoids the slight problem that the index contains more rows
 415     # (for languages) for some items than others.
 416     # XXX ought to cache this (in the index?) if possible
 417     total = 0
 418     partitions = []
 419     for table in tables:
 420         count = session.query(table).count()
 421         total += count
 422         partitions.append((table, count))
 423
 424     n = random.randint(1, total)
 425     while n > partitions[0][1]:
 426         n -= partitions[0][1]
 427         partitions.pop(0)
 428
 429     return lookup(unicode(n), valid_types=[ partitions[0][0] ],
 430                   indices=indices, session=session)
 431
 432 def prefix_lookup(prefix, session=None, indices=None):
 433     """Returns terms starting with the given exact prefix.
 434
 435     No special magic is currently done with the name; type prefixes are not
 436     recognized.
 437
 438     `session` and `indices` are treated as with `lookup()`.
 439     """
 440
 441     if not session:
 442         session = connect()
 443
 444     if indices:
 445         index, speller = indices
 446     else:
 447         index, speller = open_index()
 448
 449     query = whoosh.query.Prefix(u'name', normalize(prefix))
 450
 451     searcher = index.searcher()
 452     searcher.weighting = LanguageWeighting()
 453     results = searcher.search(query)  # XXX , limit=MAX_LOOKUP_RESULTS)
 454
 455     return _whoosh_records_to_results(results, session)