pokedex/lookup.py

   1 # encoding: utf8
   2 from collections import namedtuple
   3 import os, os.path
   4 import pkg_resources
   5 import random
   6 import re
   7 import shutil
   8
   9 from sqlalchemy.sql import func
  10 import whoosh
  11 import whoosh.filedb.filestore
  12 import whoosh.filedb.fileindex
  13 import whoosh.index
  14 from whoosh.qparser import QueryParser
  15 import whoosh.scoring
  16 import whoosh.spelling
  17
  18 from pokedex.db import connect
  19 import pokedex.db.tables as tables
  20 from pokedex.roomaji import romanize
  21
  22 __all__ = ['open_index', 'lookup', 'random_lookup']
  23
  24 INTERMEDIATE_LOOKUP_RESULTS = 25
  25 MAX_LOOKUP_RESULTS = 10
  26
  27 # Dictionary of table name => table class.
  28 # Need the table name so we can get the class from the table name after we
  29 # retrieve something from the index
  30 indexed_tables = {}
  31 for cls in [
  32         tables.Ability,
  33         tables.Item,
  34         tables.Move,
  35         tables.Pokemon,
  36         tables.Type,
  37     ]:
  38     indexed_tables[cls.__tablename__] = cls
  39
  40 def open_index(directory=None, session=None, recreate=False):
  41     """Opens the whoosh index stored in the named directory and returns (index,
  42     speller).  If the index doesn't already exist, it will be created.
  43
  44     `directory`
  45         Directory containing the index.  Defaults to a location within the
  46         `pokedex` egg directory.
  47
  48     `session`
  49         If the index needs to be created, this database session will be used.
  50         Defaults to an attempt to connect to the default SQLite database
  51         installed by `pokedex setup`.
  52
  53     `recreate`
  54         If set to True, the whoosh index will be created even if it already
  55         exists.
  56     """
  57
  58     # Defaults
  59     if not directory:
  60         directory = pkg_resources.resource_filename('pokedex',
  61                                                     'data/whoosh-index')
  62
  63     if not session:
  64         session = connect()
  65
  66     # Attempt to open or create the index
  67     directory_exists = os.path.exists(directory)
  68     if directory_exists and not recreate:
  69         # Already exists; should be an index!
  70         try:
  71             index = whoosh.index.open_dir(directory, indexname='MAIN')
  72             spell_store = whoosh.filedb.filestore.FileStorage(directory)
  73             speller = whoosh.spelling.SpellChecker(spell_store)
  74             return index, speller
  75         except whoosh.index.EmptyIndexError as e:
  76             # Apparently not a real index.  Fall out of the if and create it
  77             pass
  78
  79     # Delete and start over if we're going to bail anyway.
  80     if directory_exists and recreate:
  81         # Be safe and only delete if it looks like a whoosh index, i.e.,
  82         # everything starts with _
  83         if all(f[0] == '_' for f in os.listdir(directory)):
  84             shutil.rmtree(directory)
  85             directory_exists = False
  86
  87     if not directory_exists:
  88         os.mkdir(directory)
  89
  90
  91     ### Create index
  92     schema = whoosh.fields.Schema(
  93         name=whoosh.fields.ID(stored=True),
  94         table=whoosh.fields.ID(stored=True),
  95         row_id=whoosh.fields.ID(stored=True),
  96         language=whoosh.fields.STORED,
  97         iso3166=whoosh.fields.STORED,
  98         display_name=whoosh.fields.STORED,  # non-lowercased name
  99         forme_name=whoosh.fields.ID,
 100     )
 101
 102     index = whoosh.index.create_in(directory, schema=schema, indexname='MAIN')
 103     writer = index.writer()
 104
 105     # Index every name in all our tables of interest
 106     # speller_entries becomes a list of (word, score) tuples; the score is 2
 107     # for English names, 1.5 for Roomaji, and 1 for everything else.  I think
 108     # this biases the results in the direction most people expect, especially
 109     # when e.g. German names are very similar to English names
 110     speller_entries = []
 111     for cls in indexed_tables.values():
 112         q = session.query(cls)
 113
 114         for row in q.yield_per(5):
 115             # XXX need to give forme_name a dummy value because I can't search
 116             # for explicitly empty fields.  boo.
 117             row_key = dict(table=unicode(cls.__tablename__),
 118                            row_id=unicode(row.id),
 119                            forme_name=u'XXX')
 120
 121             def add(name, language, iso3166, score):
 122                 writer.add_document(name=name.lower(), display_name=name,
 123                                     language=language,
 124                                     iso3166=iso3166,
 125                                     **row_key)
 126                 speller_entries.append((name.lower(), score))
 127
 128             # If this is a form, mark it as such
 129             if getattr(row, 'forme_base_pokemon_id', None):
 130                 row_key['forme_name'] = row.forme_name
 131
 132             name = row.name
 133             add(name, None, u'us', 1)
 134
 135             # Pokemon also get other languages
 136             for foreign_name in getattr(row, 'foreign_names', []):
 137                 moonspeak = foreign_name.name
 138                 if name == moonspeak:
 139                     # Don't add the English name again as a different language;
 140                     # no point and it makes spell results confusing
 141                     continue
 142
 143                 add(moonspeak, foreign_name.language.name,
 144                                foreign_name.language.iso3166,
 145                                3)
 146
 147                 # Add Roomaji too
 148                 if foreign_name.language.name == 'Japanese':
 149                     roomaji = romanize(foreign_name.name)
 150                     add(roomaji, u'Roomaji', u'jp', 8)
 151
 152     writer.commit()
 153
 154     # Construct and populate a spell-checker index.  Quicker to do it all
 155     # at once, as every call to add_* does a commit(), and those seem to be
 156     # expensive
 157     speller = whoosh.spelling.SpellChecker(index.storage)
 158     speller.add_scored_words(speller_entries)
 159
 160     return index, speller
 161
 162
 163 class LanguageWeighting(whoosh.scoring.Weighting):
 164     """A scoring class that forces otherwise-equal English results to come
 165     before foreign results.
 166     """
 167
 168     def score(self, searcher, fieldnum, text, docnum, weight, QTF=1):
 169         doc = searcher.stored_fields(docnum)
 170         if doc['language'] == None:
 171             # English (well, "default"); leave it at 1
 172             return weight
 173         elif doc['language'] == u'Roomaji':
 174             # Give Roomaji a bit of a boost, as it's most likely to be searched
 175             return weight * 0.95
 176         else:
 177             # Everything else can drop down the totem pole
 178             return weight * 0.9
 179
 180 rx_is_number = re.compile('^\d+$')
 181
 182 LookupResult = namedtuple('LookupResult',
 183                           ['object', 'name', 'language', 'iso3166', 'exact'])
 184
 185 def _parse_table_name(name):
 186     """Takes a singular table name, table name, or table object and returns the
 187     table name.
 188
 189     Returns None for a bogus name.
 190     """
 191     if hasattr(name, '__tablename__'):
 192         return getattr(name, '__tablename__')
 193     elif name in indexed_tables:
 194         return name
 195     elif name + 's' in indexed_tables:
 196         return name + 's'
 197     else:
 198         # Bogus.  Be nice and return dummy
 199         return None
 200
 201 def _whoosh_records_to_results(records, session, exact=True):
 202     """Converts a list of whoosh's indexed records to LookupResult tuples
 203     containing database objects.
 204     """
 205     # XXX this 'exact' thing is getting kinda leaky.  would like a better way
 206     # to handle it, since only lookup() cares about fuzzy results
 207     seen = {}
 208     results = []
 209     for record in records:
 210         # Skip dupes
 211         seen_key = record['table'], record['row_id']
 212         if seen_key in seen:
 213             continue
 214         seen[seen_key] = True
 215
 216         cls = indexed_tables[record['table']]
 217         obj = session.query(cls).get(record['row_id'])
 218
 219         results.append(LookupResult(object=obj,
 220                                     name=record['display_name'],
 221                                     language=record['language'],
 222                                     iso3166=record['iso3166'],
 223                                     exact=exact))
 224
 225     return results
 226
 227
 228 def lookup(input, valid_types=[], session=None, indices=None, exact_only=False):
 229     """Attempts to find some sort of object, given a database session and name.
 230
 231     Returns a list of named (object, name, language, iso3166, exact) tuples.
 232     `object` is a database object, `name` is the name under which the object
 233     was found, `language` and `iso3166` are the name and country code of the
 234     language in which the name was found, and `exact` is True iff this was an
 235     exact match.
 236
 237     This function currently ONLY does fuzzy matching if there are no exact
 238     matches.
 239
 240     Formes are not returned unless requested; "Shaymin" will return only grass
 241     Shaymin.
 242
 243     Extraneous whitespace is removed with extreme prejudice.
 244
 245     Recognizes:
 246     - Names: "Eevee", "Surf", "Run Away", "Payapa Berry", etc.
 247     - Foreign names: "Iibui", "Eivui"
 248     - Fuzzy names in whatever language: "Evee", "Ibui"
 249     - IDs: "133", "192", "250"
 250     Also:
 251     - Type restrictions.  "type:psychic" will only return the type.  This is
 252       how to make ID lookup useful.  Multiple type specs can be entered with
 253       commas, as "move,item:1".  If `valid_types` are provided, any type prefix
 254       will be ignored.
 255     - Alternate formes can be specified merely like "wash rotom".
 256
 257     `input`
 258         Name of the thing to look for.
 259
 260     `valid_types`
 261         A list of table objects or names, e.g., `['pokemon', 'moves']`.  If
 262         this is provided, only results in one of the given tables will be
 263         returned.
 264
 265     `session`
 266         A database session to use for retrieving objects.  As with get_index,
 267         if this is not provided, a connection to the default database will be
 268         attempted.
 269
 270     `indices`
 271         Tuple of index, speller as returned from `open_index()`.  Defaults to
 272         a call to `open_index()`.
 273
 274     `exact_only`
 275         If True, only exact matches are returned.  If set to False (the
 276         default), and the provided `name` doesn't match anything exactly,
 277         spelling correction will be attempted.
 278     """
 279
 280     if not session:
 281         session = connect()
 282
 283     if indices:
 284         index, speller = indices
 285     else:
 286         index, speller = open_index()
 287
 288     name = unicode(input).strip().lower()
 289     exact = True
 290     form = None
 291
 292     # Remove any type prefix (pokemon:133) before constructing a query
 293     if ':' in name:
 294         prefix_chunk, name = name.split(':', 1)
 295         name = name.strip()
 296
 297         if not valid_types:
 298             # Only use types from the query string if none were explicitly
 299             # provided
 300             prefixes = prefix_chunk.split(',')
 301             valid_types = [_.strip() for _ in prefixes]
 302
 303     # Random lookup
 304     if name == 'random':
 305         return random_lookup(indices=(index, speller),
 306                              session=session,
 307                              valid_types=valid_types)
 308
 309     # Do different things depending what the query looks like
 310     # Note: Term objects do an exact match, so we don't have to worry about a
 311     # query parser tripping on weird characters in the input
 312     if '*' in name or '?' in name:
 313         exact_only = True
 314         query = whoosh.query.Wildcard(u'name', name)
 315     elif rx_is_number.match(name):
 316         # Don't spell-check numbers!
 317         exact_only = True
 318         query = whoosh.query.Term(u'row_id', name)
 319     else:
 320         # Not an integer
 321         query = whoosh.query.Term(u'name', name) \
 322               & whoosh.query.Term(u'forme_name', u'XXX')
 323
 324         # If there's a space in the input, this might be a form
 325         if ' ' in name:
 326             form, formless_name = name.split(' ', 1)
 327             form_query = whoosh.query.Term(u'name', formless_name) \
 328                        & whoosh.query.Term(u'forme_name', form)
 329             query = query | form_query
 330
 331     ### Filter by type of object
 332     type_terms = []
 333     for valid_type in valid_types:
 334         table_name = _parse_table_name(valid_type)
 335         if table_name:
 336             # Quietly ignore bogus valid_types; more likely to DTRT
 337             type_terms.append(whoosh.query.Term(u'table', table_name))
 338
 339     if type_terms:
 340         query = query & whoosh.query.Or(type_terms)
 341
 342
 343     ### Actual searching
 344     searcher = index.searcher()
 345     searcher.weighting = LanguageWeighting()  # XXX kosher?  docs say search()
 346                                               # takes a weighting kw but it
 347                                               # certainly does not
 348     results = searcher.search(query, limit=INTERMEDIATE_LOOKUP_RESULTS)
 349
 350     # Look for some fuzzy matches if necessary
 351     if not exact_only and not results:
 352         exact = False
 353         results = []
 354
 355         for suggestion in speller.suggest(name, INTERMEDIATE_LOOKUP_RESULTS):
 356             query = whoosh.query.Term('name', suggestion)
 357             results.extend(searcher.search(query))
 358
 359     ### Convert results to db objects
 360     objects = _whoosh_records_to_results(results, session, exact=exact)
 361
 362     # Only return up to 10 matches; beyond that, something is wrong.
 363     # We strip out duplicate entries above, so it's remotely possible that we
 364     # should have more than 10 here and lost a few.  The speller returns 25 to
 365     # give us some padding, and should avoid that problem.  Not a big deal if
 366     # we lose the 25th-most-likely match anyway.
 367     return objects[:MAX_LOOKUP_RESULTS]
 368
 369
 370 def random_lookup(valid_types=[], session=None, indices=None):
 371     """Takes similar arguments as `lookup()`, but returns a random lookup
 372     result from one of the provided `valid_types`.
 373     """
 374
 375     tables = []
 376     for valid_type in valid_types:
 377         table_name = _parse_table_name(valid_type)
 378         if table_name:
 379             tables.append(indexed_tables[table_name])
 380
 381     if not tables:
 382         # n.b.: It's possible we got a list of valid_types and none of them
 383         # were valid, but this function is guaranteed to return *something*, so
 384         # it politely selects from the entire index isntead
 385         tables = indexed_tables.values()
 386
 387     # Rather than create an array of many hundred items and pick randomly from
 388     # it, just pick a number up to the total number of potential items, then
 389     # pick randomly from that, and partition the whole range into chunks.
 390     # This also avoids the slight problem that the index contains more rows
 391     # (for languages) for some items than others.
 392     # XXX ought to cache this (in the index?) if possible
 393     total = 0
 394     partitions = []
 395     for table in tables:
 396         count = session.query(table).count()
 397         total += count
 398         partitions.append((table, count))
 399
 400     n = random.randint(1, total)
 401     while n > partitions[0][1]:
 402         n -= partitions[0][1]
 403         partitions.pop(0)
 404
 405     return lookup(unicode(n), valid_types=[ partitions[0][0] ],
 406                   indices=indices, session=session)
 407
 408 def prefix_lookup(prefix, session=None, indices=None):
 409     """Returns terms starting with the given exact prefix.
 410
 411     No special magic is currently done with the name; type prefixes are not
 412     recognized.
 413
 414     `session` and `indices` are treated as with `lookup()`.
 415     """
 416
 417     if not session:
 418         session = connect()
 419
 420     if indices:
 421         index, speller = indices
 422     else:
 423         index, speller = open_index()
 424
 425     query = whoosh.query.Prefix(u'name', prefix.lower())
 426
 427     searcher = index.searcher()
 428     searcher.weighting = LanguageWeighting()
 429     results = searcher.search(query)  # XXX , limit=MAX_LOOKUP_RESULTS)
 430
 431     return _whoosh_records_to_results(results, session)