pokedex/lookup.py

   1 # encoding: utf8
   2 from collections import namedtuple
   3 import os, os.path
   4 import pkg_resources
   5 import random
   6 import re
   7 import shutil
   8
   9 from sqlalchemy.sql import func
  10 import whoosh
  11 import whoosh.filedb.filestore
  12 import whoosh.filedb.fileindex
  13 import whoosh.index
  14 from whoosh.qparser import QueryParser
  15 import whoosh.scoring
  16 import whoosh.spelling
  17
  18 from pokedex.db import connect
  19 import pokedex.db.tables as tables
  20 from pokedex.roomaji import romanize
  21
  22 __all__ = ['open_index', 'lookup', 'random_lookup']
  23
  24 INTERMEDIATE_LOOKUP_RESULTS = 25
  25 MAX_LOOKUP_RESULTS = 10
  26
  27 # Dictionary of table name => table class.
  28 # Need the table name so we can get the class from the table name after we
  29 # retrieve something from the index
  30 indexed_tables = {}
  31 for cls in [
  32         tables.Ability,
  33         tables.Item,
  34         tables.Move,
  35         tables.Pokemon,
  36         tables.Type,
  37     ]:
  38     indexed_tables[cls.__tablename__] = cls
  39
  40 def open_index(directory=None, session=None, recreate=False):
  41     """Opens the whoosh index stored in the named directory and returns (index,
  42     speller).  If the index doesn't already exist, it will be created.
  43
  44     `directory`
  45         Directory containing the index.  Defaults to a location within the
  46         `pokedex` egg directory.
  47
  48     `session`
  49         If the index needs to be created, this database session will be used.
  50         Defaults to an attempt to connect to the default SQLite database
  51         installed by `pokedex setup`.
  52
  53     `recreate`
  54         If set to True, the whoosh index will be created even if it already
  55         exists.
  56     """
  57
  58     # Defaults
  59     if not directory:
  60         directory = pkg_resources.resource_filename('pokedex',
  61                                                     'data/whoosh-index')
  62
  63     if not session:
  64         session = connect()
  65
  66     # Attempt to open or create the index
  67     directory_exists = os.path.exists(directory)
  68     if directory_exists and not recreate:
  69         # Already exists; should be an index!
  70         try:
  71             index = whoosh.index.open_dir(directory, indexname='MAIN')
  72             spell_store = whoosh.filedb.filestore.FileStorage(directory)
  73             speller = whoosh.spelling.SpellChecker(spell_store)
  74             return index, speller
  75         except whoosh.index.EmptyIndexError as e:
  76             # Apparently not a real index.  Fall out of the if and create it
  77             pass
  78
  79     # Delete and start over if we're going to bail anyway.
  80     if directory_exists and recreate:
  81         # Be safe and only delete if it looks like a whoosh index, i.e.,
  82         # everything starts with _
  83         if all(f[0] == '_' for f in os.listdir(directory)):
  84             shutil.rmtree(directory)
  85             directory_exists = False
  86
  87     if not directory_exists:
  88         os.mkdir(directory)
  89
  90
  91     ### Create index
  92     schema = whoosh.fields.Schema(
  93         name=whoosh.fields.ID(stored=True),
  94         table=whoosh.fields.ID(stored=True),
  95         row_id=whoosh.fields.ID(stored=True),
  96         language=whoosh.fields.STORED,
  97         display_name=whoosh.fields.STORED,  # non-lowercased name
  98         forme_name=whoosh.fields.ID,
  99     )
 100
 101     index = whoosh.index.create_in(directory, schema=schema, indexname='MAIN')
 102     writer = index.writer()
 103
 104     # Index every name in all our tables of interest
 105     # speller_entries becomes a list of (word, score) tuples; the score is 2
 106     # for English names, 1.5 for Roomaji, and 1 for everything else.  I think
 107     # this biases the results in the direction most people expect, especially
 108     # when e.g. German names are very similar to English names
 109     speller_entries = []
 110     for cls in indexed_tables.values():
 111         q = session.query(cls)
 112
 113         for row in q.yield_per(5):
 114             # XXX need to give forme_name a dummy value because I can't search
 115             # for explicitly empty fields.  boo.
 116             row_key = dict(table=unicode(cls.__tablename__),
 117                            row_id=unicode(row.id),
 118                            forme_name=u'XXX')
 119
 120             def add(name, language, score):
 121                 writer.add_document(name=name.lower(), display_name=name,
 122                                     language=language,
 123                                     **row_key)
 124                 speller_entries.append((name.lower(), score))
 125
 126             # If this is a form, mark it as such
 127             if getattr(row, 'forme_base_pokemon_id', None):
 128                 row_key['forme_name'] = row.forme_name
 129
 130             name = row.name
 131             add(name, None, 1)
 132
 133             # Pokemon also get other languages
 134             for foreign_name in getattr(row, 'foreign_names', []):
 135                 moonspeak = foreign_name.name
 136                 if name == moonspeak:
 137                     # Don't add the English name again as a different language;
 138                     # no point and it makes spell results confusing
 139                     continue
 140
 141                 add(moonspeak, foreign_name.language.name, 3)
 142
 143                 # Add Roomaji too
 144                 if foreign_name.language.name == 'Japanese':
 145                     roomaji = romanize(foreign_name.name)
 146                     add(roomaji, u'Roomaji', 8)
 147
 148     writer.commit()
 149
 150     # Construct and populate a spell-checker index.  Quicker to do it all
 151     # at once, as every call to add_* does a commit(), and those seem to be
 152     # expensive
 153     speller = whoosh.spelling.SpellChecker(index.storage)
 154     speller.add_scored_words(speller_entries)
 155
 156     return index, speller
 157
 158
 159 class LanguageWeighting(whoosh.scoring.Weighting):
 160     """A scoring class that forces otherwise-equal English results to come
 161     before foreign results.
 162     """
 163
 164     def score(self, searcher, fieldnum, text, docnum, weight, QTF=1):
 165         doc = searcher.stored_fields(docnum)
 166         if doc['language'] == None:
 167             # English (well, "default"); leave it at 1
 168             return weight
 169         elif doc['language'] == u'Roomaji':
 170             # Give Roomaji a bit of a boost, as it's most likely to be searched
 171             return weight * 0.95
 172         else:
 173             # Everything else can drop down the totem pole
 174             return weight * 0.9
 175
 176 rx_is_number = re.compile('^\d+$')
 177
 178 LookupResult = namedtuple('LookupResult',
 179                           ['object', 'name', 'language', 'exact'])
 180
 181 def _parse_table_name(name):
 182     """Takes a singular table name, table name, or table object and returns the
 183     table name.
 184
 185     Returns None for a bogus name.
 186     """
 187     if hasattr(name, '__tablename__'):
 188         return getattr(name, '__tablename__')
 189     elif name in indexed_tables:
 190         return name
 191     elif name + 's' in indexed_tables:
 192         return name + 's'
 193     else:
 194         # Bogus.  Be nice and return dummy
 195         return None
 196
 197
 198 def lookup(input, valid_types=[], session=None, indices=None, exact_only=False):
 199     """Attempts to find some sort of object, given a database session and name.
 200
 201     Returns a list of named (object, name, language, exact) tuples.  `object`
 202     is a database object, `name` is the name under which the object was found,
 203     `language` is the name of the language in which the name was found, and
 204     `exact` is True iff this was an exact match.
 205
 206     This function currently ONLY does fuzzy matching if there are no exact
 207     matches.
 208
 209     Formes are not returned unless requested; "Shaymin" will return only grass
 210     Shaymin.
 211
 212     Extraneous whitespace is removed with extreme prejudice.
 213
 214     Recognizes:
 215     - Names: "Eevee", "Surf", "Run Away", "Payapa Berry", etc.
 216     - Foreign names: "Iibui", "Eivui"
 217     - Fuzzy names in whatever language: "Evee", "Ibui"
 218     - IDs: "133", "192", "250"
 219     Also:
 220     - Type restrictions.  "type:psychic" will only return the type.  This is
 221       how to make ID lookup useful.  Multiple type specs can be entered with
 222       commas, as "move,item:1".  If `valid_types` are provided, any type prefix
 223       will be ignored.
 224     - Alternate formes can be specified merely like "wash rotom".
 225
 226     `input`
 227         Name of the thing to look for.
 228
 229     `valid_types`
 230         A list of table objects or names, e.g., `['pokemon', 'moves']`.  If
 231         this is provided, only results in one of the given tables will be
 232         returned.
 233
 234     `session`
 235         A database session to use for retrieving objects.  As with get_index,
 236         if this is not provided, a connection to the default database will be
 237         attempted.
 238
 239     `indices`
 240         Tuple of index, speller as returned from `open_index()`.  Defaults to
 241         a call to `open_index()`.
 242
 243     `exact_only`
 244         If True, only exact matches are returned.  If set to False (the
 245         default), and the provided `name` doesn't match anything exactly,
 246         spelling correction will be attempted.
 247     """
 248
 249     if not session:
 250         session = connect()
 251
 252     if indices:
 253         index, speller = indices
 254     else:
 255         index, speller = open_index()
 256
 257     name = unicode(input).strip().lower()
 258     exact = True
 259     form = None
 260
 261     # Remove any type prefix (pokemon:133) before constructing a query
 262     if ':' in name:
 263         prefix_chunk, name = name.split(':', 1)
 264         name = name.strip()
 265
 266         if not valid_types:
 267             # Only use types from the query string if none were explicitly
 268             # provided
 269             prefixes = prefix_chunk.split(',')
 270             valid_types = [_.strip() for _ in prefixes]
 271
 272     # Random lookup
 273     if name == 'random':
 274         return random_lookup(indices=(index, speller),
 275                              session=session,
 276                              valid_types=valid_types)
 277
 278     # Do different things depending what the query looks like
 279     # Note: Term objects do an exact match, so we don't have to worry about a
 280     # query parser tripping on weird characters in the input
 281     if '*' in name or '?' in name:
 282         exact_only = True
 283         query = whoosh.query.Wildcard(u'name', name)
 284     elif rx_is_number.match(name):
 285         # Don't spell-check numbers!
 286         exact_only = True
 287         query = whoosh.query.Term(u'row_id', name)
 288     else:
 289         # Not an integer
 290         query = whoosh.query.Term(u'name', name) \
 291               & whoosh.query.Term(u'forme_name', u'XXX')
 292
 293         # If there's a space in the input, this might be a form
 294         if ' ' in name:
 295             form, formless_name = name.split(' ', 1)
 296             form_query = whoosh.query.Term(u'name', formless_name) \
 297                        & whoosh.query.Term(u'forme_name', form)
 298             query = query | form_query
 299
 300     ### Filter by type of object
 301     type_terms = []
 302     for valid_type in valid_types:
 303         table_name = _parse_table_name(valid_type)
 304         type_terms.append(whoosh.query.Term(u'table', table_name))
 305
 306     if type_terms:
 307         query = query & whoosh.query.Or(type_terms)
 308
 309
 310     ### Actual searching
 311     searcher = index.searcher()
 312     searcher.weighting = LanguageWeighting()  # XXX kosher?  docs say search()
 313                                               # takes a weighting kw but it
 314                                               # certainly does not
 315     results = searcher.search(query, limit=INTERMEDIATE_LOOKUP_RESULTS)
 316
 317     # Look for some fuzzy matches if necessary
 318     if not exact_only and not results:
 319         exact = False
 320         results = []
 321
 322         for suggestion in speller.suggest(name, INTERMEDIATE_LOOKUP_RESULTS):
 323             query = whoosh.query.Term('name', suggestion)
 324             results.extend(searcher.search(query))
 325
 326     ### Convert results to db objects
 327     objects = []
 328     seen = {}
 329     for result in results:
 330         # Skip dupe results
 331         seen_key = result['table'], result['row_id']
 332         if seen_key in seen:
 333             continue
 334         seen[seen_key] = True
 335
 336         cls = indexed_tables[result['table']]
 337         obj = session.query(cls).get(result['row_id'])
 338
 339         objects.append(LookupResult(object=obj,
 340                                     name=result['display_name'],
 341                                     language=result['language'],
 342                                     exact=exact))
 343
 344     # Only return up to 10 matches; beyond that, something is wrong.
 345     # We strip out duplicate entries above, so it's remotely possible that we
 346     # should have more than 10 here and lost a few.  The speller returns 25 to
 347     # give us some padding, and should avoid that problem.  Not a big deal if
 348     # we lose the 25th-most-likely match anyway.
 349     return objects[:MAX_LOOKUP_RESULTS]
 350
 351
 352 def random_lookup(valid_types=[], session=None, indices=None):
 353     """Takes similar arguments as `lookup()`, but returns a random lookup
 354     result from one of the provided `valid_types`.
 355     """
 356
 357     tables = []
 358     for valid_type in valid_types:
 359         table_name = _parse_table_name(valid_type)
 360         if table_name:
 361             tables.append(indexed_tables[table_name])
 362
 363     if not tables:
 364         tables = indexed_tables.values()
 365
 366     # Rather than create an array of many hundred items and pick randomly from
 367     # it, just pick a number up to the total number of potential items, then
 368     # pick randomly from that, and partition the whole range into chunks
 369     total = 0
 370     partitions = []
 371     for table in tables:
 372         count = session.query(table).count()
 373         total += count
 374         partitions.append((table, count))
 375
 376     n = random.randint(1, total)
 377     while n > partitions[0][1]:
 378         n -= partitions[0][1]
 379         partitions.pop(0)
 380
 381     return lookup(unicode(n), valid_types=[ partitions[0][0] ],
 382                   indices=indices, session=session)