pokedex/lookup.py

   1 # encoding: utf8
   2 from collections import namedtuple
   3 import os, os.path
   4 import pkg_resources
   5 import random
   6 import re
   7 import shutil
   8
   9 from sqlalchemy.sql import func
  10 import whoosh
  11 import whoosh.filedb.filestore
  12 import whoosh.filedb.fileindex
  13 import whoosh.index
  14 from whoosh.qparser import QueryParser
  15 import whoosh.scoring
  16 import whoosh.spelling
  17
  18 from pokedex.db import connect
  19 import pokedex.db.tables as tables
  20 from pokedex.roomaji import romanize
  21
  22 __all__ = ['open_index', 'lookup', 'random_lookup']
  23
  24 INTERMEDIATE_LOOKUP_RESULTS = 25
  25 MAX_LOOKUP_RESULTS = 10
  26
  27 # Dictionary of table name => table class.
  28 # Need the table name so we can get the class from the table name after we
  29 # retrieve something from the index
  30 indexed_tables = {}
  31 for cls in [
  32         tables.Ability,
  33         tables.Item,
  34         tables.Move,
  35         tables.Pokemon,
  36         tables.Type,
  37     ]:
  38     indexed_tables[cls.__tablename__] = cls
  39
  40 def open_index(directory=None, session=None, recreate=False):
  41     """Opens the whoosh index stored in the named directory and returns (index,
  42     speller).  If the index doesn't already exist, it will be created.
  43
  44     `directory`
  45         Directory containing the index.  Defaults to a location within the
  46         `pokedex` egg directory.
  47
  48     `session`
  49         If the index needs to be created, this database session will be used.
  50         Defaults to an attempt to connect to the default SQLite database
  51         installed by `pokedex setup`.
  52
  53     `recreate`
  54         If set to True, the whoosh index will be created even if it already
  55         exists.
  56     """
  57
  58     # Defaults
  59     if not directory:
  60         directory = pkg_resources.resource_filename('pokedex',
  61                                                     'data/whoosh-index')
  62
  63     if not session:
  64         session = connect()
  65
  66     # Attempt to open or create the index
  67     directory_exists = os.path.exists(directory)
  68     if directory_exists and not recreate:
  69         # Already exists; should be an index!
  70         try:
  71             index = whoosh.index.open_dir(directory, indexname='MAIN')
  72             spell_store = whoosh.filedb.filestore.FileStorage(directory)
  73             speller = whoosh.spelling.SpellChecker(spell_store)
  74             return index, speller
  75         except whoosh.index.EmptyIndexError as e:
  76             # Apparently not a real index.  Fall out of the if and create it
  77             pass
  78
  79     # Delete and start over if we're going to bail anyway.
  80     if directory_exists and recreate:
  81         # Be safe and only delete if it looks like a whoosh index, i.e.,
  82         # everything starts with _
  83         if all(f[0] == '_' for f in os.listdir(directory)):
  84             shutil.rmtree(directory)
  85             directory_exists = False
  86
  87     if not directory_exists:
  88         os.mkdir(directory)
  89
  90
  91     ### Create index
  92     schema = whoosh.fields.Schema(
  93         name=whoosh.fields.ID(stored=True),
  94         table=whoosh.fields.ID(stored=True),
  95         row_id=whoosh.fields.ID(stored=True),
  96         language=whoosh.fields.STORED,
  97         display_name=whoosh.fields.STORED,  # non-lowercased name
  98         forme_name=whoosh.fields.ID,
  99     )
 100
 101     index = whoosh.index.create_in(directory, schema=schema, indexname='MAIN')
 102     writer = index.writer()
 103
 104     # Index every name in all our tables of interest
 105     # speller_entries becomes a list of (word, score) tuples; the score is 2
 106     # for English names, 1.5 for Roomaji, and 1 for everything else.  I think
 107     # this biases the results in the direction most people expect, especially
 108     # when e.g. German names are very similar to English names
 109     speller_entries = []
 110     for cls in indexed_tables.values():
 111         q = session.query(cls)
 112
 113         for row in q.yield_per(5):
 114             # XXX need to give forme_name a dummy value because I can't search
 115             # for explicitly empty fields.  boo.
 116             row_key = dict(table=unicode(cls.__tablename__),
 117                            row_id=unicode(row.id),
 118                            forme_name=u'XXX')
 119
 120             def add(name, language, score):
 121                 writer.add_document(name=name.lower(), display_name=name,
 122                                     language=language,
 123                                     **row_key)
 124                 speller_entries.append((name.lower(), score))
 125
 126             # If this is a form, mark it as such
 127             if getattr(row, 'forme_base_pokemon_id', None):
 128                 row_key['forme_name'] = row.forme_name
 129
 130             name = row.name
 131             add(name, None, 1)
 132
 133             # Pokemon also get other languages
 134             for foreign_name in getattr(row, 'foreign_names', []):
 135                 moonspeak = foreign_name.name
 136                 if name == moonspeak:
 137                     # Don't add the English name again as a different language;
 138                     # no point and it makes spell results confusing
 139                     continue
 140
 141                 add(moonspeak, foreign_name.language.name, 3)
 142
 143                 # Add Roomaji too
 144                 if foreign_name.language.name == 'Japanese':
 145                     roomaji = romanize(foreign_name.name)
 146                     add(roomaji, u'Roomaji', 8)
 147
 148     writer.commit()
 149
 150     # Construct and populate a spell-checker index.  Quicker to do it all
 151     # at once, as every call to add_* does a commit(), and those seem to be
 152     # expensive
 153     speller = whoosh.spelling.SpellChecker(index.storage)
 154     speller.add_scored_words(speller_entries)
 155
 156     return index, speller
 157
 158
 159 class LanguageWeighting(whoosh.scoring.Weighting):
 160     """A scoring class that forces otherwise-equal English results to come
 161     before foreign results.
 162     """
 163
 164     def score(self, searcher, fieldnum, text, docnum, weight, QTF=1):
 165         doc = searcher.stored_fields(docnum)
 166         if doc['language'] == None:
 167             # English (well, "default"); leave it at 1
 168             return weight
 169         elif doc['language'] == u'Roomaji':
 170             # Give Roomaji a bit of a boost, as it's most likely to be searched
 171             return weight * 0.95
 172         else:
 173             # Everything else can drop down the totem pole
 174             return weight * 0.9
 175
 176 rx_is_number = re.compile('^\d+$')
 177
 178 LookupResult = namedtuple('LookupResult',
 179                           ['object', 'name', 'language', 'exact'])
 180
 181 def _parse_table_name(name):
 182     """Takes a singular table name, table name, or table object and returns the
 183     table name.
 184
 185     Returns None for a bogus name.
 186     """
 187     if hasattr(name, '__tablename__'):
 188         return getattr(name, '__tablename__')
 189     elif name in indexed_tables:
 190         return name
 191     elif name + 's' in indexed_tables:
 192         return name + 's'
 193     else:
 194         # Bogus.  Be nice and return dummy
 195         return None
 196
 197
 198 def lookup(input, valid_types=[], session=None, indices=None, exact_only=False):
 199     """Attempts to find some sort of object, given a database session and name.
 200
 201     Returns a list of named (object, name, language, exact) tuples.  `object`
 202     is a database object, `name` is the name under which the object was found,
 203     `language` is the name of the language in which the name was found, and
 204     `exact` is True iff this was an exact match.
 205
 206     This function currently ONLY does fuzzy matching if there are no exact
 207     matches.
 208
 209     Formes are not returned; "Shaymin" will return only grass Shaymin.
 210
 211     Recognizes:
 212     - Names: "Eevee", "Surf", "Run Away", "Payapa Berry", etc.
 213     - Foreign names: "Iibui", "Eivui"
 214     - Fuzzy names in whatever language: "Evee", "Ibui"
 215     - IDs: "133", "192", "250"
 216     Also:
 217     - Type restrictions.  "type:psychic" will only return the type.  This is
 218       how to make ID lookup useful.  Multiple type specs can be entered with
 219       commas, as "move,item:1".  If `valid_types` are provided, any type prefix
 220       will be ignored.
 221     - Alternate formes can be specified merely like "wash rotom".
 222
 223     `input`
 224         Name of the thing to look for.
 225
 226     `valid_types`
 227         A list of table objects or names, e.g., `['pokemon', 'moves']`.  If
 228         this is provided, only results in one of the given tables will be
 229         returned.
 230
 231     `session`
 232         A database session to use for retrieving objects.  As with get_index,
 233         if this is not provided, a connection to the default database will be
 234         attempted.
 235
 236     `indices`
 237         Tuple of index, speller as returned from `open_index()`.  Defaults to
 238         a call to `open_index()`.
 239
 240     `exact_only`
 241         If True, only exact matches are returned.  If set to False (the
 242         default), and the provided `name` doesn't match anything exactly,
 243         spelling correction will be attempted.
 244     """
 245
 246     if not session:
 247         session = connect()
 248
 249     if indices:
 250         index, speller = indices
 251     else:
 252         index, speller = open_index()
 253
 254     name = unicode(input).lower()
 255     exact = True
 256     form = None
 257
 258     # Remove any type prefix (pokemon:133) before constructing a query
 259     if ':' in name:
 260         prefix_chunk, name = name.split(':', 2)
 261         prefixes = prefix_chunk.split(',')
 262         if not valid_types:
 263             # Only use types from the query string if none were explicitly
 264             # provided
 265             valid_types = prefixes
 266
 267     # Random lookup
 268     if name == 'random':
 269         return random_lookup(indices=(index, speller),
 270                              session=session,
 271                              valid_types=valid_types)
 272
 273     # Do different things depending what the query looks like
 274     # Note: Term objects do an exact match, so we don't have to worry about a
 275     # query parser tripping on weird characters in the input
 276     if '*' in name or '?' in name:
 277         exact_only = True
 278         query = whoosh.query.Wildcard(u'name', name)
 279     elif rx_is_number.match(name):
 280         # Don't spell-check numbers!
 281         exact_only = True
 282         query = whoosh.query.Term(u'row_id', name)
 283     else:
 284         # Not an integer
 285         query = whoosh.query.Term(u'name', name) \
 286               & whoosh.query.Term(u'forme_name', u'XXX')
 287
 288         # If there's a space in the input, this might be a form
 289         if ' ' in name:
 290             form, formless_name = name.split(' ', 2)
 291             form_query = whoosh.query.Term(u'name', formless_name) \
 292                        & whoosh.query.Term(u'forme_name', form)
 293             query = query | form_query
 294
 295     ### Filter by type of object
 296     type_terms = []
 297     for valid_type in valid_types:
 298         table_name = _parse_table_name(valid_type)
 299         type_terms.append(whoosh.query.Term(u'table', table_name))
 300
 301     if type_terms:
 302         query = query & whoosh.query.Or(type_terms)
 303
 304
 305     ### Actual searching
 306     searcher = index.searcher()
 307     searcher.weighting = LanguageWeighting()  # XXX kosher?  docs say search()
 308                                               # takes a weighting kw but it
 309                                               # certainly does not
 310     results = searcher.search(query, limit=INTERMEDIATE_LOOKUP_RESULTS)
 311
 312     # Look for some fuzzy matches if necessary
 313     if not exact_only and not results:
 314         exact = False
 315         results = []
 316
 317         for suggestion in speller.suggest(name, INTERMEDIATE_LOOKUP_RESULTS):
 318             query = whoosh.query.Term('name', suggestion)
 319             results.extend(searcher.search(query))
 320
 321     ### Convert results to db objects
 322     objects = []
 323     seen = {}
 324     for result in results:
 325         # Skip dupe results
 326         seen_key = result['table'], result['row_id']
 327         if seen_key in seen:
 328             continue
 329         seen[seen_key] = True
 330
 331         cls = indexed_tables[result['table']]
 332         obj = session.query(cls).get(result['row_id'])
 333
 334         objects.append(LookupResult(object=obj,
 335                                     name=result['display_name'],
 336                                     language=result['language'],
 337                                     exact=exact))
 338
 339     # Only return up to 10 matches; beyond that, something is wrong.
 340     # We strip out duplicate entries above, so it's remotely possible that we
 341     # should have more than 10 here and lost a few.  The speller returns 25 to
 342     # give us some padding, and should avoid that problem.  Not a big deal if
 343     # we lose the 25th-most-likely match anyway.
 344     return objects[:MAX_LOOKUP_RESULTS]
 345
 346
 347 def random_lookup(valid_types=[], session=None, indices=None):
 348     """Takes similar arguments as `lookup()`, but returns a random lookup
 349     result from one of the provided `valid_types`.
 350     """
 351
 352     tables = []
 353     for valid_type in valid_types:
 354         table_name = _parse_table_name(valid_type)
 355         if table_name:
 356             tables.append(indexed_tables[table_name])
 357
 358     if not tables:
 359         tables = indexed_tables.values()
 360
 361     # Rather than create an array of many hundred items and pick randomly from
 362     # it, just pick a number up to the total number of potential items, then
 363     # pick randomly from that, and partition the whole range into chunks
 364     total = 0
 365     partitions = []
 366     for table in tables:
 367         count = session.query(table).count()
 368         total += count
 369         partitions.append((table, count))
 370
 371     n = random.randint(1, total)
 372     while n > partitions[0][1]:
 373         n -= partitions[0][1]
 374         partitions.pop(0)
 375
 376     return lookup(unicode(n), valid_types=[ partitions[0][0] ],
 377                   indices=indices, session=session)