pokedex/lookup.py

   1 # encoding: utf8
   2 from collections import namedtuple
   3 import os, os.path
   4 import pkg_resources
   5 import random
   6 import re
   7 import shutil
   8
   9 from sqlalchemy.sql import func
  10 import whoosh
  11 import whoosh.filedb.filestore
  12 import whoosh.filedb.fileindex
  13 import whoosh.index
  14 from whoosh.qparser import QueryParser
  15 import whoosh.scoring
  16 import whoosh.spelling
  17
  18 from pokedex.db import connect
  19 import pokedex.db.tables as tables
  20 from pokedex.roomaji import romanize
  21
  22 __all__ = ['open_index', 'lookup', 'random_lookup']
  23
  24 INTERMEDIATE_LOOKUP_RESULTS = 25
  25 MAX_LOOKUP_RESULTS = 10
  26
  27 # Dictionary of table name => table class.
  28 # Need the table name so we can get the class from the table name after we
  29 # retrieve something from the index
  30 indexed_tables = {}
  31 for cls in [
  32         tables.Ability,
  33         tables.Item,
  34         tables.Move,
  35         tables.Pokemon,
  36         tables.Type,
  37     ]:
  38     indexed_tables[cls.__tablename__] = cls
  39
  40 def open_index(directory=None, session=None, recreate=False):
  41     """Opens the whoosh index stored in the named directory and returns (index,
  42     speller).  If the index doesn't already exist, it will be created.
  43
  44     `directory`
  45         Directory containing the index.  Defaults to a location within the
  46         `pokedex` egg directory.
  47
  48     `session`
  49         If the index needs to be created, this database session will be used.
  50         Defaults to an attempt to connect to the default SQLite database
  51         installed by `pokedex setup`.
  52
  53     `recreate`
  54         If set to True, the whoosh index will be created even if it already
  55         exists.
  56     """
  57
  58     # Defaults
  59     if not directory:
  60         directory = pkg_resources.resource_filename('pokedex',
  61                                                     'data/whoosh-index')
  62
  63     if not session:
  64         session = connect()
  65
  66     # Attempt to open or create the index
  67     directory_exists = os.path.exists(directory)
  68     if directory_exists and not recreate:
  69         # Already exists; should be an index!
  70         try:
  71             index = whoosh.index.open_dir(directory, indexname='MAIN')
  72             spell_store = whoosh.filedb.filestore.FileStorage(directory)
  73             speller = whoosh.spelling.SpellChecker(spell_store)
  74             return index, speller
  75         except whoosh.index.EmptyIndexError as e:
  76             # Apparently not a real index.  Fall out of the if and create it
  77             pass
  78
  79     # Delete and start over if we're going to bail anyway.
  80     if directory_exists and recreate:
  81         # Be safe and only delete if it looks like a whoosh index, i.e.,
  82         # everything starts with _
  83         if all(f[0] == '_' for f in os.listdir(directory)):
  84             shutil.rmtree(directory)
  85             directory_exists = False
  86
  87     if not directory_exists:
  88         os.mkdir(directory)
  89
  90
  91     ### Create index
  92     schema = whoosh.fields.Schema(
  93         name=whoosh.fields.ID(stored=True),
  94         table=whoosh.fields.ID(stored=True),
  95         row_id=whoosh.fields.ID(stored=True),
  96         language=whoosh.fields.STORED,
  97         display_name=whoosh.fields.STORED,  # non-lowercased name
  98         forme_name=whoosh.fields.ID,
  99     )
 100
 101     index = whoosh.index.create_in(directory, schema=schema, indexname='MAIN')
 102     writer = index.writer()
 103
 104     # Index every name in all our tables of interest
 105     # speller_entries becomes a list of (word, score) tuples; the score is 2
 106     # for English names, 1.5 for Roomaji, and 1 for everything else.  I think
 107     # this biases the results in the direction most people expect, especially
 108     # when e.g. German names are very similar to English names
 109     speller_entries = []
 110     for cls in indexed_tables.values():
 111         q = session.query(cls)
 112
 113         for row in q.yield_per(5):
 114             # XXX need to give forme_name a dummy value because I can't search
 115             # for explicitly empty fields.  boo.
 116             row_key = dict(table=unicode(cls.__tablename__),
 117                            row_id=unicode(row.id),
 118                            forme_name=u'XXX')
 119
 120             def add(name, language, score):
 121                 writer.add_document(name=name.lower(), display_name=name,
 122                                     language=language,
 123                                     **row_key)
 124                 speller_entries.append((name.lower(), score))
 125
 126             # If this is a form, mark it as such
 127             if getattr(row, 'forme_base_pokemon_id', None):
 128                 row_key['forme_name'] = row.forme_name
 129
 130             name = row.name
 131             add(name, None, 1)
 132
 133             # Pokemon also get other languages
 134             for foreign_name in getattr(row, 'foreign_names', []):
 135                 moonspeak = foreign_name.name
 136                 if name == moonspeak:
 137                     # Don't add the English name again as a different language;
 138                     # no point and it makes spell results confusing
 139                     continue
 140
 141                 add(moonspeak, foreign_name.language.name, 3)
 142
 143                 # Add Roomaji too
 144                 if foreign_name.language.name == 'Japanese':
 145                     roomaji = romanize(foreign_name.name)
 146                     add(roomaji, u'Roomaji', 8)
 147
 148     writer.commit()
 149
 150     # Construct and populate a spell-checker index.  Quicker to do it all
 151     # at once, as every call to add_* does a commit(), and those seem to be
 152     # expensive
 153     speller = whoosh.spelling.SpellChecker(index.storage)
 154     speller.add_scored_words(speller_entries)
 155
 156     return index, speller
 157
 158
 159 class LanguageWeighting(whoosh.scoring.Weighting):
 160     """A scoring class that forces otherwise-equal English results to come
 161     before foreign results.
 162     """
 163
 164     def score(self, searcher, fieldnum, text, docnum, weight, QTF=1):
 165         doc = searcher.stored_fields(docnum)
 166         if doc['language'] == None:
 167             # English (well, "default"); leave it at 1
 168             return weight
 169         elif doc['language'] == u'Roomaji':
 170             # Give Roomaji a bit of a boost, as it's most likely to be searched
 171             return weight * 0.95
 172         else:
 173             # Everything else can drop down the totem pole
 174             return weight * 0.9
 175
 176 rx_is_number = re.compile('^\d+$')
 177
 178 LookupResult = namedtuple('LookupResult',
 179                           ['object', 'name', 'language', 'exact'])
 180
 181 def _parse_table_name(name):
 182     """Takes a singular table name, table name, or table object and returns the
 183     table name.
 184
 185     Returns None for a bogus name.
 186     """
 187     if hasattr(name, '__tablename__'):
 188         return getattr(name, '__tablename__')
 189     elif name in indexed_tables:
 190         return name
 191     elif name + 's' in indexed_tables:
 192         return name + 's'
 193     else:
 194         # Bogus.  Be nice and return dummy
 195         return None
 196
 197 def _whoosh_records_to_results(records, session, exact=True):
 198     """Converts a list of whoosh's indexed records to LookupResult tuples
 199     containing database objects.
 200     """
 201     # XXX this 'exact' thing is getting kinda leaky.  would like a better way
 202     # to handle it, since only lookup() cares about fuzzy results
 203     seen = {}
 204     results = []
 205     for record in records:
 206         # Skip dupes
 207         seen_key = record['table'], record['row_id']
 208         if seen_key in seen:
 209             continue
 210         seen[seen_key] = True
 211
 212         cls = indexed_tables[record['table']]
 213         obj = session.query(cls).get(record['row_id'])
 214
 215         results.append(LookupResult(object=obj,
 216                                     name=record['display_name'],
 217                                     language=record['language'],
 218                                     exact=exact))
 219
 220     return results
 221
 222
 223 def lookup(input, valid_types=[], session=None, indices=None, exact_only=False):
 224     """Attempts to find some sort of object, given a database session and name.
 225
 226     Returns a list of named (object, name, language, exact) tuples.  `object`
 227     is a database object, `name` is the name under which the object was found,
 228     `language` is the name of the language in which the name was found, and
 229     `exact` is True iff this was an exact match.
 230
 231     This function currently ONLY does fuzzy matching if there are no exact
 232     matches.
 233
 234     Formes are not returned unless requested; "Shaymin" will return only grass
 235     Shaymin.
 236
 237     Extraneous whitespace is removed with extreme prejudice.
 238
 239     Recognizes:
 240     - Names: "Eevee", "Surf", "Run Away", "Payapa Berry", etc.
 241     - Foreign names: "Iibui", "Eivui"
 242     - Fuzzy names in whatever language: "Evee", "Ibui"
 243     - IDs: "133", "192", "250"
 244     Also:
 245     - Type restrictions.  "type:psychic" will only return the type.  This is
 246       how to make ID lookup useful.  Multiple type specs can be entered with
 247       commas, as "move,item:1".  If `valid_types` are provided, any type prefix
 248       will be ignored.
 249     - Alternate formes can be specified merely like "wash rotom".
 250
 251     `input`
 252         Name of the thing to look for.
 253
 254     `valid_types`
 255         A list of table objects or names, e.g., `['pokemon', 'moves']`.  If
 256         this is provided, only results in one of the given tables will be
 257         returned.
 258
 259     `session`
 260         A database session to use for retrieving objects.  As with get_index,
 261         if this is not provided, a connection to the default database will be
 262         attempted.
 263
 264     `indices`
 265         Tuple of index, speller as returned from `open_index()`.  Defaults to
 266         a call to `open_index()`.
 267
 268     `exact_only`
 269         If True, only exact matches are returned.  If set to False (the
 270         default), and the provided `name` doesn't match anything exactly,
 271         spelling correction will be attempted.
 272     """
 273
 274     if not session:
 275         session = connect()
 276
 277     if indices:
 278         index, speller = indices
 279     else:
 280         index, speller = open_index()
 281
 282     name = unicode(input).strip().lower()
 283     exact = True
 284     form = None
 285
 286     # Remove any type prefix (pokemon:133) before constructing a query
 287     if ':' in name:
 288         prefix_chunk, name = name.split(':', 1)
 289         name = name.strip()
 290
 291         if not valid_types:
 292             # Only use types from the query string if none were explicitly
 293             # provided
 294             prefixes = prefix_chunk.split(',')
 295             valid_types = [_.strip() for _ in prefixes]
 296
 297     # Random lookup
 298     if name == 'random':
 299         return random_lookup(indices=(index, speller),
 300                              session=session,
 301                              valid_types=valid_types)
 302
 303     # Do different things depending what the query looks like
 304     # Note: Term objects do an exact match, so we don't have to worry about a
 305     # query parser tripping on weird characters in the input
 306     if '*' in name or '?' in name:
 307         exact_only = True
 308         query = whoosh.query.Wildcard(u'name', name)
 309     elif rx_is_number.match(name):
 310         # Don't spell-check numbers!
 311         exact_only = True
 312         query = whoosh.query.Term(u'row_id', name)
 313     else:
 314         # Not an integer
 315         query = whoosh.query.Term(u'name', name) \
 316               & whoosh.query.Term(u'forme_name', u'XXX')
 317
 318         # If there's a space in the input, this might be a form
 319         if ' ' in name:
 320             form, formless_name = name.split(' ', 1)
 321             form_query = whoosh.query.Term(u'name', formless_name) \
 322                        & whoosh.query.Term(u'forme_name', form)
 323             query = query | form_query
 324
 325     ### Filter by type of object
 326     type_terms = []
 327     for valid_type in valid_types:
 328         table_name = _parse_table_name(valid_type)
 329         type_terms.append(whoosh.query.Term(u'table', table_name))
 330
 331     if type_terms:
 332         query = query & whoosh.query.Or(type_terms)
 333
 334
 335     ### Actual searching
 336     searcher = index.searcher()
 337     searcher.weighting = LanguageWeighting()  # XXX kosher?  docs say search()
 338                                               # takes a weighting kw but it
 339                                               # certainly does not
 340     results = searcher.search(query, limit=INTERMEDIATE_LOOKUP_RESULTS)
 341
 342     # Look for some fuzzy matches if necessary
 343     if not exact_only and not results:
 344         exact = False
 345         results = []
 346
 347         for suggestion in speller.suggest(name, INTERMEDIATE_LOOKUP_RESULTS):
 348             query = whoosh.query.Term('name', suggestion)
 349             results.extend(searcher.search(query))
 350
 351     ### Convert results to db objects
 352     objects = _whoosh_records_to_results(results, session, exact=exact)
 353
 354     # Only return up to 10 matches; beyond that, something is wrong.
 355     # We strip out duplicate entries above, so it's remotely possible that we
 356     # should have more than 10 here and lost a few.  The speller returns 25 to
 357     # give us some padding, and should avoid that problem.  Not a big deal if
 358     # we lose the 25th-most-likely match anyway.
 359     return objects[:MAX_LOOKUP_RESULTS]
 360
 361
 362 def random_lookup(valid_types=[], session=None, indices=None):
 363     """Takes similar arguments as `lookup()`, but returns a random lookup
 364     result from one of the provided `valid_types`.
 365     """
 366
 367     tables = []
 368     for valid_type in valid_types:
 369         table_name = _parse_table_name(valid_type)
 370         if table_name:
 371             tables.append(indexed_tables[table_name])
 372
 373     if not tables:
 374         tables = indexed_tables.values()
 375
 376     # Rather than create an array of many hundred items and pick randomly from
 377     # it, just pick a number up to the total number of potential items, then
 378     # pick randomly from that, and partition the whole range into chunks
 379     total = 0
 380     partitions = []
 381     for table in tables:
 382         count = session.query(table).count()
 383         total += count
 384         partitions.append((table, count))
 385
 386     n = random.randint(1, total)
 387     while n > partitions[0][1]:
 388         n -= partitions[0][1]
 389         partitions.pop(0)
 390
 391     return lookup(unicode(n), valid_types=[ partitions[0][0] ],
 392                   indices=indices, session=session)
 393
 394 def prefix_lookup(prefix, session=None, indices=None):
 395     """Returns terms starting with the given exact prefix.
 396
 397     No special magic is currently done with the name; type prefixes are not
 398     recognized.
 399
 400     `session` and `indices` are treated as with `lookup()`.
 401     """
 402
 403     if not session:
 404         session = connect()
 405
 406     if indices:
 407         index, speller = indices
 408     else:
 409         index, speller = open_index()
 410
 411     query = whoosh.query.Prefix(u'name', prefix.lower())
 412
 413     searcher = index.searcher()
 414     searcher.weighting = LanguageWeighting()
 415     results = searcher.search(query)  # XXX , limit=MAX_LOOKUP_RESULTS)
 416
 417     return _whoosh_records_to_results(results, session)