pokedex/lookup.py

   1 # encoding: utf8
   2 from collections import namedtuple
   3 import os, os.path
   4 import pkg_resources
   5 import re
   6 import shutil
   7
   8 from sqlalchemy.sql import func
   9 import whoosh
  10 import whoosh.filedb.filestore
  11 import whoosh.filedb.fileindex
  12 import whoosh.index
  13 from whoosh.qparser import QueryParser
  14 import whoosh.scoring
  15 import whoosh.spelling
  16
  17 from pokedex.db import connect
  18 import pokedex.db.tables as tables
  19 from pokedex.roomaji import romanize
  20
  21 __all__ = ['open_index', 'lookup']
  22
  23 # Dictionary of table name => table class.
  24 # Need the table name so we can get the class from the table name after we
  25 # retrieve something from the index
  26 indexed_tables = {}
  27 for cls in [
  28         tables.Ability,
  29         tables.Item,
  30         tables.Move,
  31         tables.Pokemon,
  32         tables.Type,
  33     ]:
  34     indexed_tables[cls.__tablename__] = cls
  35
  36 def open_index(directory=None, session=None, recreate=False):
  37     """Opens the whoosh index stored in the named directory and returns (index,
  38     speller).  If the index doesn't already exist, it will be created.
  39
  40     `directory`
  41         Directory containing the index.  Defaults to a location within the
  42         `pokedex` egg directory.
  43
  44     `session`
  45         If the index needs to be created, this database session will be used.
  46         Defaults to an attempt to connect to the default SQLite database
  47         installed by `pokedex setup`.
  48
  49     `recreate`
  50         If set to True, the whoosh index will be created even if it already
  51         exists.
  52     """
  53
  54     # Defaults
  55     if not directory:
  56         directory = pkg_resources.resource_filename('pokedex',
  57                                                     'data/whoosh-index')
  58
  59     if not session:
  60         session = connect()
  61
  62     # Attempt to open or create the index
  63     directory_exists = os.path.exists(directory)
  64     if directory_exists and not recreate:
  65         # Already exists; should be an index!
  66         try:
  67             index = whoosh.index.open_dir(directory, indexname='MAIN')
  68             spell_store = whoosh.filedb.filestore.FileStorage(directory)
  69             speller = whoosh.spelling.SpellChecker(spell_store)
  70             return index, speller
  71         except whoosh.index.EmptyIndexError as e:
  72             # Apparently not a real index.  Fall out of the if and create it
  73             pass
  74
  75     # Delete and start over if we're going to bail anyway.
  76     if directory_exists and recreate:
  77         # Be safe and only delete if it looks like a whoosh index, i.e.,
  78         # everything starts with _
  79         if all(f[0] == '_' for f in os.listdir(directory)):
  80             shutil.rmtree(directory)
  81             directory_exists = False
  82
  83     if not directory_exists:
  84         os.mkdir(directory)
  85
  86
  87     ### Create index
  88     schema = whoosh.fields.Schema(
  89         name=whoosh.fields.ID(stored=True),
  90         table=whoosh.fields.ID(stored=True),
  91         row_id=whoosh.fields.ID(stored=True),
  92         language=whoosh.fields.STORED,
  93         display_name=whoosh.fields.STORED,  # non-lowercased name
  94         forme_name=whoosh.fields.ID,
  95     )
  96
  97     index = whoosh.index.create_in(directory, schema=schema, indexname='MAIN')
  98     writer = index.writer()
  99
 100     # Index every name in all our tables of interest
 101     # speller_entries becomes a list of (word, score) tuples; the score is 2
 102     # for English names, 1.5 for Roomaji, and 1 for everything else.  I think
 103     # this biases the results in the direction most people expect, especially
 104     # when e.g. German names are very similar to English names
 105     speller_entries = []
 106     for cls in indexed_tables.values():
 107         q = session.query(cls)
 108
 109         for row in q.yield_per(5):
 110             # XXX need to give forme_name a dummy value because I can't search
 111             # for explicitly empty fields.  boo.
 112             row_key = dict(table=unicode(cls.__tablename__),
 113                            row_id=unicode(row.id),
 114                            forme_name=u'XXX')
 115
 116             def add(name, language, score):
 117                 writer.add_document(name=name.lower(), display_name=name,
 118                                     language=language,
 119                                     **row_key)
 120                 speller_entries.append((name.lower(), score))
 121
 122             # If this is a form, mark it as such
 123             if getattr(row, 'forme_base_pokemon_id', None):
 124                 row_key['forme_name'] = row.forme_name
 125
 126             name = row.name
 127             add(name, None, 1)
 128
 129             # Pokemon also get other languages
 130             for foreign_name in getattr(row, 'foreign_names', []):
 131                 moonspeak = foreign_name.name
 132                 if name == moonspeak:
 133                     # Don't add the English name again as a different language;
 134                     # no point and it makes spell results confusing
 135                     continue
 136
 137                 add(moonspeak, foreign_name.language.name, 3)
 138
 139                 # Add Roomaji too
 140                 if foreign_name.language.name == 'Japanese':
 141                     roomaji = romanize(foreign_name.name)
 142                     add(roomaji, u'Roomaji', 8)
 143
 144     writer.commit()
 145
 146     # Construct and populate a spell-checker index.  Quicker to do it all
 147     # at once, as every call to add_* does a commit(), and those seem to be
 148     # expensive
 149     speller = whoosh.spelling.SpellChecker(index.storage)
 150     speller.add_scored_words(speller_entries)
 151
 152     return index, speller
 153
 154
 155 class LanguageWeighting(whoosh.scoring.Weighting):
 156     """A scoring class that forces otherwise-equal English results to come
 157     before foreign results.
 158     """
 159
 160     def score(self, searcher, fieldnum, text, docnum, weight, QTF=1):
 161         doc = searcher.stored_fields(docnum)
 162         if doc['language'] == None:
 163             # English (well, "default"); leave it at 1
 164             return weight
 165         elif doc['language'] == u'Roomaji':
 166             # Give Roomaji a bit of a boost, as it's most likely to be searched
 167             return weight * 0.95
 168         else:
 169             # Everything else can drop down the totem pole
 170             return weight * 0.9
 171
 172 rx_is_number = re.compile('^\d+$')
 173
 174 LookupResult = namedtuple('LookupResult',
 175                           ['object', 'name', 'language', 'exact'])
 176 def lookup(input, valid_types=[], session=None, indices=None, exact_only=False):
 177     """Attempts to find some sort of object, given a database session and name.
 178
 179     Returns a list of named (object, name, language, exact) tuples.  `object`
 180     is a database object, `name` is the name under which the object was found,
 181     `language` is the name of the language in which the name was found, and
 182     `exact` is True iff this was an exact match.
 183
 184     This function currently ONLY does fuzzy matching if there are no exact
 185     matches.
 186
 187     Formes are not returned; "Shaymin" will return only grass Shaymin.
 188
 189     Recognizes:
 190     - Names: "Eevee", "Surf", "Run Away", "Payapa Berry", etc.
 191     - Foreign names: "Iibui", "Eivui"
 192     - Fuzzy names in whatever language: "Evee", "Ibui"
 193     - IDs: "133", "192", "250"
 194     Also:
 195     - Type restrictions.  "type:psychic" will only return the type.  This is
 196       how to make ID lookup useful.  Multiple type specs can be entered with
 197       commas, as "move,item:1".  If `valid_types` are provided, any type prefix
 198       will be ignored.
 199     - Alternate formes can be specified merely like "wash rotom".
 200
 201     `input`
 202         Name of the thing to look for.
 203
 204     `valid_types`
 205         A list of table objects or names, e.g., `['pokemon', 'moves']`.  If
 206         this is provided, only results in one of the given tables will be
 207         returned.
 208
 209     `session`
 210         A database session to use for retrieving objects.  As with get_index,
 211         if this is not provided, a connection to the default database will be
 212         attempted.
 213
 214     `indices`
 215         Tuple of index, speller as returned from `open_index()`.  Defaults to
 216         a call to `open_index()`.
 217
 218     `exact_only`
 219         If True, only exact matches are returned.  If set to False (the
 220         default), and the provided `name` doesn't match anything exactly,
 221         spelling correction will be attempted.
 222     """
 223
 224     if not session:
 225         session = connect()
 226
 227     if indices:
 228         index, speller = indices
 229     else:
 230         index, speller = open_index()
 231
 232     name = unicode(input).lower()
 233     exact = True
 234     form = None
 235
 236     # Remove any type prefix (pokemon:133) before constructing a query
 237     if ':' in name:
 238         prefix_chunk, name = name.split(':', 2)
 239         prefixes = prefix_chunk.split(',')
 240         if not valid_types:
 241             # Only use types from the query string if none were explicitly
 242             # provided
 243             valid_types = prefixes
 244
 245     # If the input provided is a number, match it as an id.  Otherwise, name.
 246     # Term objects do an exact match, so we don't have to worry about a query
 247     # parser tripping on weird characters in the input
 248     if rx_is_number.match(name):
 249         # Don't spell-check numbers!
 250         exact_only = True
 251         query = whoosh.query.Term(u'row_id', name)
 252     else:
 253         # Not an integer
 254         query = whoosh.query.Term(u'name', name) \
 255               & whoosh.query.Term(u'forme_name', u'XXX')
 256
 257         # If there's a space in the input, this might be a form
 258         if ' ' in name:
 259             form, formless_name = name.split(' ', 2)
 260             form_query = whoosh.query.Term(u'name', formless_name) \
 261                        & whoosh.query.Term(u'forme_name', form)
 262             query = query | form_query
 263
 264     ### Filter by type of object
 265     type_terms = []
 266     for valid_type in valid_types:
 267         if hasattr(valid_type, '__tablename__'):
 268             table_name = getattr(valid_type, '__tablename__')
 269         elif valid_type in indexed_tables:
 270             table_name = valid_type
 271         elif valid_type + 's' in indexed_tables:
 272             table_name = valid_type + 's'
 273         else:
 274             # Bogus.  Be nice and ignore it
 275             continue
 276
 277         type_terms.append(whoosh.query.Term(u'table', table_name))
 278
 279     if type_terms:
 280         query = query & whoosh.query.Or(type_terms)
 281
 282
 283     ### Actual searching
 284     searcher = index.searcher()
 285     searcher.weighting = LanguageWeighting()  # XXX kosher?  docs say search()
 286                                               # takes a weighting kw but it
 287                                               # certainly does not
 288     results = searcher.search(query)
 289
 290     # Look for some fuzzy matches if necessary
 291     if not exact_only and not results:
 292         exact = False
 293         results = []
 294
 295         for suggestion in speller.suggest(name, 25):
 296             query = whoosh.query.Term('name', suggestion)
 297             results.extend(searcher.search(query))
 298
 299     ### Convert results to db objects
 300     objects = []
 301     seen = {}
 302     for result in results:
 303         # Skip dupe results
 304         seen_key = result['table'], result['row_id']
 305         if seen_key in seen:
 306             continue
 307         seen[seen_key] = True
 308
 309         cls = indexed_tables[result['table']]
 310         obj = session.query(cls).get(result['row_id'])
 311
 312         objects.append(LookupResult(object=obj,
 313                                     name=result['display_name'],
 314                                     language=result['language'],
 315                                     exact=exact))
 316
 317     # Only return up to 10 matches; beyond that, something is wrong.
 318     # We strip out duplicate entries above, so it's remotely possible that we
 319     # should have more than 10 here and lost a few.  The speller returns 25 to
 320     # give us some padding, and should avoid that problem.  Not a big deal if
 321     # we lose the 25th-most-likely match anyway.
 322     return objects[:10]