pokedex/lookup.py

   1 # encoding: utf8
   2 from collections import namedtuple
   3 import os, os.path
   4 import pkg_resources
   5 import random
   6 import re
   7 import shutil
   8
   9 from sqlalchemy.sql import func
  10 import whoosh
  11 import whoosh.filedb.filestore
  12 import whoosh.filedb.fileindex
  13 import whoosh.index
  14 from whoosh.qparser import QueryParser
  15 import whoosh.scoring
  16 import whoosh.spelling
  17
  18 from pokedex.db import connect
  19 import pokedex.db.tables as tables
  20 from pokedex.roomaji import romanize
  21
  22 __all__ = ['open_index', 'lookup', 'random_lookup']
  23
  24 INTERMEDIATE_LOOKUP_RESULTS = 25
  25 MAX_LOOKUP_RESULTS = 10
  26
  27 # Dictionary of table name => table class.
  28 # Need the table name so we can get the class from the table name after we
  29 # retrieve something from the index
  30 indexed_tables = {}
  31 for cls in [
  32         tables.Ability,
  33         tables.Item,
  34         tables.Move,
  35         tables.Pokemon,
  36         tables.Type,
  37     ]:
  38     indexed_tables[cls.__tablename__] = cls
  39
  40 def open_index(directory=None, session=None, recreate=False):
  41     """Opens the whoosh index stored in the named directory and returns (index,
  42     speller).  If the index doesn't already exist, it will be created.
  43
  44     `directory`
  45         Directory containing the index.  Defaults to a location within the
  46         `pokedex` egg directory.
  47
  48     `session`
  49         If the index needs to be created, this database session will be used.
  50         Defaults to an attempt to connect to the default SQLite database
  51         installed by `pokedex setup`.
  52
  53     `recreate`
  54         If set to True, the whoosh index will be created even if it already
  55         exists.
  56     """
  57
  58     # Defaults
  59     if not directory:
  60         directory = pkg_resources.resource_filename('pokedex',
  61                                                     'data/whoosh-index')
  62
  63     if not session:
  64         session = connect()
  65
  66     # Attempt to open or create the index
  67     directory_exists = os.path.exists(directory)
  68     if directory_exists and not recreate:
  69         # Already exists; should be an index!
  70         try:
  71             index = whoosh.index.open_dir(directory, indexname='MAIN')
  72             spell_store = whoosh.filedb.filestore.FileStorage(directory)
  73             speller = whoosh.spelling.SpellChecker(spell_store)
  74             return index, speller
  75         except whoosh.index.EmptyIndexError as e:
  76             # Apparently not a real index.  Fall out of the if and create it
  77             pass
  78
  79     # Delete and start over if we're going to bail anyway.
  80     if directory_exists and recreate:
  81         # Be safe and only delete if it looks like a whoosh index, i.e.,
  82         # everything starts with _
  83         if all(f[0] == '_' for f in os.listdir(directory)):
  84             shutil.rmtree(directory)
  85             directory_exists = False
  86
  87     if not directory_exists:
  88         os.mkdir(directory)
  89
  90
  91     ### Create index
  92     schema = whoosh.fields.Schema(
  93         name=whoosh.fields.ID(stored=True),
  94         table=whoosh.fields.ID(stored=True),
  95         row_id=whoosh.fields.ID(stored=True),
  96         language=whoosh.fields.STORED,
  97         iso3166=whoosh.fields.STORED,
  98         display_name=whoosh.fields.STORED,  # non-lowercased name
  99         forme_name=whoosh.fields.ID,
 100     )
 101
 102     index = whoosh.index.create_in(directory, schema=schema, indexname='MAIN')
 103     writer = index.writer()
 104
 105     # Index every name in all our tables of interest
 106     # speller_entries becomes a list of (word, score) tuples; the score is 2
 107     # for English names, 1.5 for Roomaji, and 1 for everything else.  I think
 108     # this biases the results in the direction most people expect, especially
 109     # when e.g. German names are very similar to English names
 110     speller_entries = []
 111     for cls in indexed_tables.values():
 112         q = session.query(cls)
 113
 114         for row in q.yield_per(5):
 115             # XXX need to give forme_name a dummy value because I can't search
 116             # for explicitly empty fields.  boo.
 117             row_key = dict(table=unicode(cls.__tablename__),
 118                            row_id=unicode(row.id),
 119                            forme_name=u'XXX')
 120
 121             def add(name, language, iso3166, score):
 122                 writer.add_document(name=name.lower(), display_name=name,
 123                                     language=language,
 124                                     iso3166=iso3166,
 125                                     **row_key)
 126                 speller_entries.append((name.lower(), score))
 127
 128             # If this is a form, mark it as such
 129             if getattr(row, 'forme_base_pokemon_id', None):
 130                 row_key['forme_name'] = row.forme_name
 131
 132             name = row.name
 133             add(name, None, u'us', 1)
 134
 135             # Pokemon also get other languages
 136             for foreign_name in getattr(row, 'foreign_names', []):
 137                 moonspeak = foreign_name.name
 138                 if name == moonspeak:
 139                     # Don't add the English name again as a different language;
 140                     # no point and it makes spell results confusing
 141                     continue
 142
 143                 add(moonspeak, foreign_name.language.name,
 144                                foreign_name.language.iso3166,
 145                                3)
 146
 147                 # Add Roomaji too
 148                 if foreign_name.language.name == 'Japanese':
 149                     roomaji = romanize(foreign_name.name)
 150                     add(roomaji, u'Roomaji', u'jp', 8)
 151
 152     writer.commit()
 153
 154     # Construct and populate a spell-checker index.  Quicker to do it all
 155     # at once, as every call to add_* does a commit(), and those seem to be
 156     # expensive
 157     speller = whoosh.spelling.SpellChecker(index.storage)
 158     speller.add_scored_words(speller_entries)
 159
 160     return index, speller
 161
 162
 163 class LanguageWeighting(whoosh.scoring.Weighting):
 164     """A scoring class that forces otherwise-equal English results to come
 165     before foreign results.
 166     """
 167
 168     def score(self, searcher, fieldnum, text, docnum, weight, QTF=1):
 169         doc = searcher.stored_fields(docnum)
 170         if doc['language'] == None:
 171             # English (well, "default"); leave it at 1
 172             return weight
 173         elif doc['language'] == u'Roomaji':
 174             # Give Roomaji a bit of a boost, as it's most likely to be searched
 175             return weight * 0.95
 176         else:
 177             # Everything else can drop down the totem pole
 178             return weight * 0.9
 179
 180 rx_is_number = re.compile('^\d+$')
 181
 182 LookupResult = namedtuple('LookupResult',
 183                           ['object', 'name', 'language', 'iso3166', 'exact'])
 184
 185 def _parse_table_name(name):
 186     """Takes a singular table name, table name, or table object and returns the
 187     table name.
 188
 189     Returns None for a bogus name.
 190     """
 191     if hasattr(name, '__tablename__'):
 192         return getattr(name, '__tablename__')
 193     elif name in indexed_tables:
 194         return name
 195     elif name + 's' in indexed_tables:
 196         return name + 's'
 197     else:
 198         # Bogus.  Be nice and return dummy
 199         return None
 200
 201 def _whoosh_records_to_results(records, session, exact=True):
 202     """Converts a list of whoosh's indexed records to LookupResult tuples
 203     containing database objects.
 204     """
 205     # XXX this 'exact' thing is getting kinda leaky.  would like a better way
 206     # to handle it, since only lookup() cares about fuzzy results
 207     seen = {}
 208     results = []
 209     for record in records:
 210         # Skip dupes
 211         seen_key = record['table'], record['row_id']
 212         if seen_key in seen:
 213             continue
 214         seen[seen_key] = True
 215
 216         cls = indexed_tables[record['table']]
 217         obj = session.query(cls).get(record['row_id'])
 218
 219         results.append(LookupResult(object=obj,
 220                                     name=record['display_name'],
 221                                     language=record['language'],
 222                                     iso3166=record['iso3166'],
 223                                     exact=exact))
 224
 225     return results
 226
 227
 228 def lookup(input, valid_types=[], session=None, indices=None, exact_only=False):
 229     """Attempts to find some sort of object, given a database session and name.
 230
 231     Returns a list of named (object, name, language, iso3166, exact) tuples.
 232     `object` is a database object, `name` is the name under which the object
 233     was found, `language` and `iso3166` are the name and country code of the
 234     language in which the name was found, and `exact` is True iff this was an
 235     exact match.
 236
 237     This function currently ONLY does fuzzy matching if there are no exact
 238     matches.
 239
 240     Formes are not returned unless requested; "Shaymin" will return only grass
 241     Shaymin.
 242
 243     Extraneous whitespace is removed with extreme prejudice.
 244
 245     Recognizes:
 246     - Names: "Eevee", "Surf", "Run Away", "Payapa Berry", etc.
 247     - Foreign names: "Iibui", "Eivui"
 248     - Fuzzy names in whatever language: "Evee", "Ibui"
 249     - IDs: "133", "192", "250"
 250     Also:
 251     - Type restrictions.  "type:psychic" will only return the type.  This is
 252       how to make ID lookup useful.  Multiple type specs can be entered with
 253       commas, as "move,item:1".  If `valid_types` are provided, any type prefix
 254       will be ignored.
 255     - Alternate formes can be specified merely like "wash rotom".
 256
 257     `input`
 258         Name of the thing to look for.
 259
 260     `valid_types`
 261         A list of table objects or names, e.g., `['pokemon', 'moves']`.  If
 262         this is provided, only results in one of the given tables will be
 263         returned.
 264
 265     `session`
 266         A database session to use for retrieving objects.  As with get_index,
 267         if this is not provided, a connection to the default database will be
 268         attempted.
 269
 270     `indices`
 271         Tuple of index, speller as returned from `open_index()`.  Defaults to
 272         a call to `open_index()`.
 273
 274     `exact_only`
 275         If True, only exact matches are returned.  If set to False (the
 276         default), and the provided `name` doesn't match anything exactly,
 277         spelling correction will be attempted.
 278     """
 279
 280     if not session:
 281         session = connect()
 282
 283     if indices:
 284         index, speller = indices
 285     else:
 286         index, speller = open_index()
 287
 288     name = unicode(input).strip().lower()
 289     exact = True
 290     form = None
 291
 292     # Remove any type prefix (pokemon:133) before constructing a query
 293     if ':' in name:
 294         prefix_chunk, name = name.split(':', 1)
 295         name = name.strip()
 296
 297         if not valid_types:
 298             # Only use types from the query string if none were explicitly
 299             # provided
 300             prefixes = prefix_chunk.split(',')
 301             valid_types = [_.strip() for _ in prefixes]
 302
 303     # Random lookup
 304     if name == 'random':
 305         return random_lookup(indices=(index, speller),
 306                              session=session,
 307                              valid_types=valid_types)
 308
 309     # Do different things depending what the query looks like
 310     # Note: Term objects do an exact match, so we don't have to worry about a
 311     # query parser tripping on weird characters in the input
 312     if '*' in name or '?' in name:
 313         exact_only = True
 314         query = whoosh.query.Wildcard(u'name', name)
 315     elif rx_is_number.match(name):
 316         # Don't spell-check numbers!
 317         exact_only = True
 318         query = whoosh.query.Term(u'row_id', name)
 319     else:
 320         # Not an integer
 321         query = whoosh.query.Term(u'name', name) \
 322               & whoosh.query.Term(u'forme_name', u'XXX')
 323
 324         # If there's a space in the input, this might be a form
 325         if ' ' in name:
 326             form, formless_name = name.split(' ', 1)
 327             form_query = whoosh.query.Term(u'name', formless_name) \
 328                        & whoosh.query.Term(u'forme_name', form)
 329             query = query | form_query
 330
 331     ### Filter by type of object
 332     type_terms = []
 333     for valid_type in valid_types:
 334         table_name = _parse_table_name(valid_type)
 335         type_terms.append(whoosh.query.Term(u'table', table_name))
 336
 337     if type_terms:
 338         query = query & whoosh.query.Or(type_terms)
 339
 340
 341     ### Actual searching
 342     searcher = index.searcher()
 343     searcher.weighting = LanguageWeighting()  # XXX kosher?  docs say search()
 344                                               # takes a weighting kw but it
 345                                               # certainly does not
 346     results = searcher.search(query, limit=INTERMEDIATE_LOOKUP_RESULTS)
 347
 348     # Look for some fuzzy matches if necessary
 349     if not exact_only and not results:
 350         exact = False
 351         results = []
 352
 353         for suggestion in speller.suggest(name, INTERMEDIATE_LOOKUP_RESULTS):
 354             query = whoosh.query.Term('name', suggestion)
 355             results.extend(searcher.search(query))
 356
 357     ### Convert results to db objects
 358     objects = _whoosh_records_to_results(results, session, exact=exact)
 359
 360     # Only return up to 10 matches; beyond that, something is wrong.
 361     # We strip out duplicate entries above, so it's remotely possible that we
 362     # should have more than 10 here and lost a few.  The speller returns 25 to
 363     # give us some padding, and should avoid that problem.  Not a big deal if
 364     # we lose the 25th-most-likely match anyway.
 365     return objects[:MAX_LOOKUP_RESULTS]
 366
 367
 368 def random_lookup(valid_types=[], session=None, indices=None):
 369     """Takes similar arguments as `lookup()`, but returns a random lookup
 370     result from one of the provided `valid_types`.
 371     """
 372
 373     tables = []
 374     for valid_type in valid_types:
 375         table_name = _parse_table_name(valid_type)
 376         if table_name:
 377             tables.append(indexed_tables[table_name])
 378
 379     if not tables:
 380         tables = indexed_tables.values()
 381
 382     # Rather than create an array of many hundred items and pick randomly from
 383     # it, just pick a number up to the total number of potential items, then
 384     # pick randomly from that, and partition the whole range into chunks
 385     total = 0
 386     partitions = []
 387     for table in tables:
 388         count = session.query(table).count()
 389         total += count
 390         partitions.append((table, count))
 391
 392     n = random.randint(1, total)
 393     while n > partitions[0][1]:
 394         n -= partitions[0][1]
 395         partitions.pop(0)
 396
 397     return lookup(unicode(n), valid_types=[ partitions[0][0] ],
 398                   indices=indices, session=session)
 399
 400 def prefix_lookup(prefix, session=None, indices=None):
 401     """Returns terms starting with the given exact prefix.
 402
 403     No special magic is currently done with the name; type prefixes are not
 404     recognized.
 405
 406     `session` and `indices` are treated as with `lookup()`.
 407     """
 408
 409     if not session:
 410         session = connect()
 411
 412     if indices:
 413         index, speller = indices
 414     else:
 415         index, speller = open_index()
 416
 417     query = whoosh.query.Prefix(u'name', prefix.lower())
 418
 419     searcher = index.searcher()
 420     searcher.weighting = LanguageWeighting()
 421     results = searcher.search(query)  # XXX , limit=MAX_LOOKUP_RESULTS)
 422
 423     return _whoosh_records_to_results(results, session)