pokedex/lookup.py

   1 # encoding: utf8
   2 from collections import namedtuple
   3 import os, os.path
   4 import pkg_resources
   5 import random
   6 import re
   7 import shutil
   8
   9 from sqlalchemy.sql import func
  10 import whoosh
  11 import whoosh.filedb.filestore
  12 import whoosh.filedb.fileindex
  13 import whoosh.index
  14 from whoosh.qparser import QueryParser
  15 import whoosh.scoring
  16 import whoosh.spelling
  17
  18 from pokedex.db import connect
  19 import pokedex.db.tables as tables
  20 from pokedex.roomaji import romanize
  21
  22 __all__ = ['open_index', 'lookup', 'random_lookup']
  23
  24 INTERMEDIATE_LOOKUP_RESULTS = 25
  25 MAX_LOOKUP_RESULTS = 10
  26
  27 # Dictionary of table name => table class.
  28 # Need the table name so we can get the class from the table name after we
  29 # retrieve something from the index
  30 indexed_tables = {}
  31 for cls in [
  32         tables.Ability,
  33         tables.Item,
  34         tables.Move,
  35         tables.Pokemon,
  36         tables.Type,
  37     ]:
  38     indexed_tables[cls.__tablename__] = cls
  39
  40 def open_index(directory=None, session=None, recreate=False):
  41     """Opens the whoosh index stored in the named directory and returns (index,
  42     speller).  If the index doesn't already exist, it will be created.
  43
  44     `directory`
  45         Directory containing the index.  Defaults to a location within the
  46         `pokedex` egg directory.
  47
  48     `session`
  49         If the index needs to be created, this database session will be used.
  50         Defaults to an attempt to connect to the default SQLite database
  51         installed by `pokedex setup`.
  52
  53     `recreate`
  54         If set to True, the whoosh index will be created even if it already
  55         exists.
  56     """
  57
  58     # Defaults
  59     if not directory:
  60         directory = pkg_resources.resource_filename('pokedex',
  61                                                     'data/whoosh-index')
  62
  63     if not session:
  64         session = connect()
  65
  66     # Attempt to open or create the index
  67     directory_exists = os.path.exists(directory)
  68     if directory_exists and not recreate:
  69         # Already exists; should be an index!
  70         try:
  71             index = whoosh.index.open_dir(directory, indexname='MAIN')
  72             spell_store = whoosh.filedb.filestore.FileStorage(directory)
  73             speller = whoosh.spelling.SpellChecker(spell_store)
  74             return index, speller
  75         except whoosh.index.EmptyIndexError as e:
  76             # Apparently not a real index.  Fall out of the if and create it
  77             pass
  78
  79     # Delete and start over if we're going to bail anyway.
  80     if directory_exists and recreate:
  81         # Be safe and only delete if it looks like a whoosh index, i.e.,
  82         # everything starts with _
  83         if all(f[0] == '_' for f in os.listdir(directory)):
  84             shutil.rmtree(directory)
  85             directory_exists = False
  86
  87     if not directory_exists:
  88         os.mkdir(directory)
  89
  90
  91     ### Create index
  92     schema = whoosh.fields.Schema(
  93         name=whoosh.fields.ID(stored=True),
  94         table=whoosh.fields.ID(stored=True),
  95         row_id=whoosh.fields.ID(stored=True),
  96         language=whoosh.fields.STORED,
  97         iso3166=whoosh.fields.STORED,
  98         display_name=whoosh.fields.STORED,  # non-lowercased name
  99         forme_name=whoosh.fields.ID,
 100     )
 101
 102     index = whoosh.index.create_in(directory, schema=schema, indexname='MAIN')
 103     writer = index.writer()
 104
 105     # Index every name in all our tables of interest
 106     # speller_entries becomes a list of (word, score) tuples; the score is 2
 107     # for English names, 1.5 for Roomaji, and 1 for everything else.  I think
 108     # this biases the results in the direction most people expect, especially
 109     # when e.g. German names are very similar to English names
 110     speller_entries = []
 111     for cls in indexed_tables.values():
 112         q = session.query(cls)
 113
 114         for row in q.yield_per(5):
 115             # XXX need to give forme_name a dummy value because I can't search
 116             # for explicitly empty fields.  boo.
 117             row_key = dict(table=unicode(cls.__tablename__),
 118                            row_id=unicode(row.id),
 119                            forme_name=u'XXX')
 120
 121             def add(name, language, iso3166, score):
 122                 writer.add_document(name=name.lower(), display_name=name,
 123                                     language=language,
 124                                     iso3166=iso3166,
 125                                     **row_key)
 126                 speller_entries.append((name.lower(), score))
 127
 128             # If this is a form, mark it as such
 129             if getattr(row, 'forme_base_pokemon_id', None):
 130                 row_key['forme_name'] = row.forme_name
 131
 132             name = row.name
 133             add(name, None, u'us', 1)
 134
 135             # Pokemon also get other languages
 136             for foreign_name in getattr(row, 'foreign_names', []):
 137                 moonspeak = foreign_name.name
 138                 if name == moonspeak:
 139                     # Don't add the English name again as a different language;
 140                     # no point and it makes spell results confusing
 141                     continue
 142
 143                 add(moonspeak, foreign_name.language.name,
 144                                foreign_name.language.iso3166,
 145                                3)
 146
 147                 # Add Roomaji too
 148                 if foreign_name.language.name == 'Japanese':
 149                     roomaji = romanize(foreign_name.name)
 150                     add(roomaji, u'Roomaji', u'jp', 8)
 151
 152     writer.commit()
 153
 154     # Construct and populate a spell-checker index.  Quicker to do it all
 155     # at once, as every call to add_* does a commit(), and those seem to be
 156     # expensive
 157     speller = whoosh.spelling.SpellChecker(index.storage)
 158     speller.add_scored_words(speller_entries)
 159
 160     return index, speller
 161
 162
 163 class LanguageWeighting(whoosh.scoring.Weighting):
 164     """A scoring class that forces otherwise-equal English results to come
 165     before foreign results.
 166     """
 167
 168     def score(self, searcher, fieldnum, text, docnum, weight, QTF=1):
 169         doc = searcher.stored_fields(docnum)
 170         if doc['language'] == None:
 171             # English (well, "default"); leave it at 1
 172             return weight
 173         elif doc['language'] == u'Roomaji':
 174             # Give Roomaji a bit of a boost, as it's most likely to be searched
 175             return weight * 0.95
 176         else:
 177             # Everything else can drop down the totem pole
 178             return weight * 0.9
 179
 180 rx_is_number = re.compile('^\d+$')
 181
 182 LookupResult = namedtuple('LookupResult',
 183                           ['object', 'name', 'language', 'iso3166', 'exact'])
 184
 185 def _parse_table_name(name):
 186     """Takes a singular table name, table name, or table object and returns the
 187     table name.
 188
 189     Returns None for a bogus name.
 190     """
 191     if hasattr(name, '__tablename__'):
 192         return getattr(name, '__tablename__')
 193     elif name in indexed_tables:
 194         return name
 195     elif name + 's' in indexed_tables:
 196         return name + 's'
 197     else:
 198         # Bogus.  Be nice and return dummy
 199         return None
 200
 201 def _whoosh_records_to_results(records, session, exact=True):
 202     """Converts a list of whoosh's indexed records to LookupResult tuples
 203     containing database objects.
 204     """
 205     # XXX this 'exact' thing is getting kinda leaky.  would like a better way
 206     # to handle it, since only lookup() cares about fuzzy results
 207     seen = {}
 208     results = []
 209     for record in records:
 210         # Skip dupes
 211         seen_key = record['table'], record['row_id']
 212         if seen_key in seen:
 213             continue
 214         seen[seen_key] = True
 215
 216         cls = indexed_tables[record['table']]
 217         obj = session.query(cls).get(record['row_id'])
 218
 219         results.append(LookupResult(object=obj,
 220                                     name=record['display_name'],
 221                                     language=record['language'],
 222                                     iso3166=record['iso3166'],
 223                                     exact=exact))
 224
 225     return results
 226
 227
 228 def lookup(input, valid_types=[], session=None, indices=None, exact_only=False):
 229     """Attempts to find some sort of object, given a database session and name.
 230
 231     Returns a list of named (object, name, language, exact) tuples.  `object`
 232     is a database object, `name` is the name under which the object was found,
 233     `language` is the name of the language in which the name was found, and
 234     `exact` is True iff this was an exact match.
 235
 236     This function currently ONLY does fuzzy matching if there are no exact
 237     matches.
 238
 239     Formes are not returned unless requested; "Shaymin" will return only grass
 240     Shaymin.
 241
 242     Extraneous whitespace is removed with extreme prejudice.
 243
 244     Recognizes:
 245     - Names: "Eevee", "Surf", "Run Away", "Payapa Berry", etc.
 246     - Foreign names: "Iibui", "Eivui"
 247     - Fuzzy names in whatever language: "Evee", "Ibui"
 248     - IDs: "133", "192", "250"
 249     Also:
 250     - Type restrictions.  "type:psychic" will only return the type.  This is
 251       how to make ID lookup useful.  Multiple type specs can be entered with
 252       commas, as "move,item:1".  If `valid_types` are provided, any type prefix
 253       will be ignored.
 254     - Alternate formes can be specified merely like "wash rotom".
 255
 256     `input`
 257         Name of the thing to look for.
 258
 259     `valid_types`
 260         A list of table objects or names, e.g., `['pokemon', 'moves']`.  If
 261         this is provided, only results in one of the given tables will be
 262         returned.
 263
 264     `session`
 265         A database session to use for retrieving objects.  As with get_index,
 266         if this is not provided, a connection to the default database will be
 267         attempted.
 268
 269     `indices`
 270         Tuple of index, speller as returned from `open_index()`.  Defaults to
 271         a call to `open_index()`.
 272
 273     `exact_only`
 274         If True, only exact matches are returned.  If set to False (the
 275         default), and the provided `name` doesn't match anything exactly,
 276         spelling correction will be attempted.
 277     """
 278
 279     if not session:
 280         session = connect()
 281
 282     if indices:
 283         index, speller = indices
 284     else:
 285         index, speller = open_index()
 286
 287     name = unicode(input).strip().lower()
 288     exact = True
 289     form = None
 290
 291     # Remove any type prefix (pokemon:133) before constructing a query
 292     if ':' in name:
 293         prefix_chunk, name = name.split(':', 1)
 294         name = name.strip()
 295
 296         if not valid_types:
 297             # Only use types from the query string if none were explicitly
 298             # provided
 299             prefixes = prefix_chunk.split(',')
 300             valid_types = [_.strip() for _ in prefixes]
 301
 302     # Random lookup
 303     if name == 'random':
 304         return random_lookup(indices=(index, speller),
 305                              session=session,
 306                              valid_types=valid_types)
 307
 308     # Do different things depending what the query looks like
 309     # Note: Term objects do an exact match, so we don't have to worry about a
 310     # query parser tripping on weird characters in the input
 311     if '*' in name or '?' in name:
 312         exact_only = True
 313         query = whoosh.query.Wildcard(u'name', name)
 314     elif rx_is_number.match(name):
 315         # Don't spell-check numbers!
 316         exact_only = True
 317         query = whoosh.query.Term(u'row_id', name)
 318     else:
 319         # Not an integer
 320         query = whoosh.query.Term(u'name', name) \
 321               & whoosh.query.Term(u'forme_name', u'XXX')
 322
 323         # If there's a space in the input, this might be a form
 324         if ' ' in name:
 325             form, formless_name = name.split(' ', 1)
 326             form_query = whoosh.query.Term(u'name', formless_name) \
 327                        & whoosh.query.Term(u'forme_name', form)
 328             query = query | form_query
 329
 330     ### Filter by type of object
 331     type_terms = []
 332     for valid_type in valid_types:
 333         table_name = _parse_table_name(valid_type)
 334         type_terms.append(whoosh.query.Term(u'table', table_name))
 335
 336     if type_terms:
 337         query = query & whoosh.query.Or(type_terms)
 338
 339
 340     ### Actual searching
 341     searcher = index.searcher()
 342     searcher.weighting = LanguageWeighting()  # XXX kosher?  docs say search()
 343                                               # takes a weighting kw but it
 344                                               # certainly does not
 345     results = searcher.search(query, limit=INTERMEDIATE_LOOKUP_RESULTS)
 346
 347     # Look for some fuzzy matches if necessary
 348     if not exact_only and not results:
 349         exact = False
 350         results = []
 351
 352         for suggestion in speller.suggest(name, INTERMEDIATE_LOOKUP_RESULTS):
 353             query = whoosh.query.Term('name', suggestion)
 354             results.extend(searcher.search(query))
 355
 356     ### Convert results to db objects
 357     objects = _whoosh_records_to_results(results, session, exact=exact)
 358
 359     # Only return up to 10 matches; beyond that, something is wrong.
 360     # We strip out duplicate entries above, so it's remotely possible that we
 361     # should have more than 10 here and lost a few.  The speller returns 25 to
 362     # give us some padding, and should avoid that problem.  Not a big deal if
 363     # we lose the 25th-most-likely match anyway.
 364     return objects[:MAX_LOOKUP_RESULTS]
 365
 366
 367 def random_lookup(valid_types=[], session=None, indices=None):
 368     """Takes similar arguments as `lookup()`, but returns a random lookup
 369     result from one of the provided `valid_types`.
 370     """
 371
 372     tables = []
 373     for valid_type in valid_types:
 374         table_name = _parse_table_name(valid_type)
 375         if table_name:
 376             tables.append(indexed_tables[table_name])
 377
 378     if not tables:
 379         tables = indexed_tables.values()
 380
 381     # Rather than create an array of many hundred items and pick randomly from
 382     # it, just pick a number up to the total number of potential items, then
 383     # pick randomly from that, and partition the whole range into chunks
 384     total = 0
 385     partitions = []
 386     for table in tables:
 387         count = session.query(table).count()
 388         total += count
 389         partitions.append((table, count))
 390
 391     n = random.randint(1, total)
 392     while n > partitions[0][1]:
 393         n -= partitions[0][1]
 394         partitions.pop(0)
 395
 396     return lookup(unicode(n), valid_types=[ partitions[0][0] ],
 397                   indices=indices, session=session)
 398
 399 def prefix_lookup(prefix, session=None, indices=None):
 400     """Returns terms starting with the given exact prefix.
 401
 402     No special magic is currently done with the name; type prefixes are not
 403     recognized.
 404
 405     `session` and `indices` are treated as with `lookup()`.
 406     """
 407
 408     if not session:
 409         session = connect()
 410
 411     if indices:
 412         index, speller = indices
 413     else:
 414         index, speller = open_index()
 415
 416     query = whoosh.query.Prefix(u'name', prefix.lower())
 417
 418     searcher = index.searcher()
 419     searcher.weighting = LanguageWeighting()
 420     results = searcher.search(query)  # XXX , limit=MAX_LOOKUP_RESULTS)
 421
 422     return _whoosh_records_to_results(results, session)