pokedex/lookup.py

   1 # encoding: utf8
   2 from collections import namedtuple
   3 import os, os.path
   4 import pkg_resources
   5 import re
   6 import shutil
   7
   8 from sqlalchemy.sql import func
   9 import whoosh
  10 import whoosh.filedb.filestore
  11 import whoosh.filedb.fileindex
  12 import whoosh.index
  13 from whoosh.qparser import QueryParser
  14 import whoosh.scoring
  15 import whoosh.spelling
  16
  17 from pokedex.db import connect
  18 import pokedex.db.tables as tables
  19 from pokedex.roomaji import romanize
  20
  21 __all__ = ['open_index', 'lookup']
  22
  23 INTERMEDIATE_LOOKUP_RESULTS = 25
  24 MAX_LOOKUP_RESULTS = 10
  25
  26 # Dictionary of table name => table class.
  27 # Need the table name so we can get the class from the table name after we
  28 # retrieve something from the index
  29 indexed_tables = {}
  30 for cls in [
  31         tables.Ability,
  32         tables.Item,
  33         tables.Move,
  34         tables.Pokemon,
  35         tables.Type,
  36     ]:
  37     indexed_tables[cls.__tablename__] = cls
  38
  39 def open_index(directory=None, session=None, recreate=False):
  40     """Opens the whoosh index stored in the named directory and returns (index,
  41     speller).  If the index doesn't already exist, it will be created.
  42
  43     `directory`
  44         Directory containing the index.  Defaults to a location within the
  45         `pokedex` egg directory.
  46
  47     `session`
  48         If the index needs to be created, this database session will be used.
  49         Defaults to an attempt to connect to the default SQLite database
  50         installed by `pokedex setup`.
  51
  52     `recreate`
  53         If set to True, the whoosh index will be created even if it already
  54         exists.
  55     """
  56
  57     # Defaults
  58     if not directory:
  59         directory = pkg_resources.resource_filename('pokedex',
  60                                                     'data/whoosh-index')
  61
  62     if not session:
  63         session = connect()
  64
  65     # Attempt to open or create the index
  66     directory_exists = os.path.exists(directory)
  67     if directory_exists and not recreate:
  68         # Already exists; should be an index!
  69         try:
  70             index = whoosh.index.open_dir(directory, indexname='MAIN')
  71             spell_store = whoosh.filedb.filestore.FileStorage(directory)
  72             speller = whoosh.spelling.SpellChecker(spell_store)
  73             return index, speller
  74         except whoosh.index.EmptyIndexError as e:
  75             # Apparently not a real index.  Fall out of the if and create it
  76             pass
  77
  78     # Delete and start over if we're going to bail anyway.
  79     if directory_exists and recreate:
  80         # Be safe and only delete if it looks like a whoosh index, i.e.,
  81         # everything starts with _
  82         if all(f[0] == '_' for f in os.listdir(directory)):
  83             shutil.rmtree(directory)
  84             directory_exists = False
  85
  86     if not directory_exists:
  87         os.mkdir(directory)
  88
  89
  90     ### Create index
  91     schema = whoosh.fields.Schema(
  92         name=whoosh.fields.ID(stored=True),
  93         table=whoosh.fields.ID(stored=True),
  94         row_id=whoosh.fields.ID(stored=True),
  95         language=whoosh.fields.STORED,
  96         display_name=whoosh.fields.STORED,  # non-lowercased name
  97         forme_name=whoosh.fields.ID,
  98     )
  99
 100     index = whoosh.index.create_in(directory, schema=schema, indexname='MAIN')
 101     writer = index.writer()
 102
 103     # Index every name in all our tables of interest
 104     # speller_entries becomes a list of (word, score) tuples; the score is 2
 105     # for English names, 1.5 for Roomaji, and 1 for everything else.  I think
 106     # this biases the results in the direction most people expect, especially
 107     # when e.g. German names are very similar to English names
 108     speller_entries = []
 109     for cls in indexed_tables.values():
 110         q = session.query(cls)
 111
 112         for row in q.yield_per(5):
 113             # XXX need to give forme_name a dummy value because I can't search
 114             # for explicitly empty fields.  boo.
 115             row_key = dict(table=unicode(cls.__tablename__),
 116                            row_id=unicode(row.id),
 117                            forme_name=u'XXX')
 118
 119             def add(name, language, score):
 120                 writer.add_document(name=name.lower(), display_name=name,
 121                                     language=language,
 122                                     **row_key)
 123                 speller_entries.append((name.lower(), score))
 124
 125             # If this is a form, mark it as such
 126             if getattr(row, 'forme_base_pokemon_id', None):
 127                 row_key['forme_name'] = row.forme_name
 128
 129             name = row.name
 130             add(name, None, 1)
 131
 132             # Pokemon also get other languages
 133             for foreign_name in getattr(row, 'foreign_names', []):
 134                 moonspeak = foreign_name.name
 135                 if name == moonspeak:
 136                     # Don't add the English name again as a different language;
 137                     # no point and it makes spell results confusing
 138                     continue
 139
 140                 add(moonspeak, foreign_name.language.name, 3)
 141
 142                 # Add Roomaji too
 143                 if foreign_name.language.name == 'Japanese':
 144                     roomaji = romanize(foreign_name.name)
 145                     add(roomaji, u'Roomaji', 8)
 146
 147     writer.commit()
 148
 149     # Construct and populate a spell-checker index.  Quicker to do it all
 150     # at once, as every call to add_* does a commit(), and those seem to be
 151     # expensive
 152     speller = whoosh.spelling.SpellChecker(index.storage)
 153     speller.add_scored_words(speller_entries)
 154
 155     return index, speller
 156
 157
 158 class LanguageWeighting(whoosh.scoring.Weighting):
 159     """A scoring class that forces otherwise-equal English results to come
 160     before foreign results.
 161     """
 162
 163     def score(self, searcher, fieldnum, text, docnum, weight, QTF=1):
 164         doc = searcher.stored_fields(docnum)
 165         if doc['language'] == None:
 166             # English (well, "default"); leave it at 1
 167             return weight
 168         elif doc['language'] == u'Roomaji':
 169             # Give Roomaji a bit of a boost, as it's most likely to be searched
 170             return weight * 0.95
 171         else:
 172             # Everything else can drop down the totem pole
 173             return weight * 0.9
 174
 175 rx_is_number = re.compile('^\d+$')
 176
 177 LookupResult = namedtuple('LookupResult',
 178                           ['object', 'name', 'language', 'exact'])
 179 def lookup(input, valid_types=[], session=None, indices=None, exact_only=False):
 180     """Attempts to find some sort of object, given a database session and name.
 181
 182     Returns a list of named (object, name, language, exact) tuples.  `object`
 183     is a database object, `name` is the name under which the object was found,
 184     `language` is the name of the language in which the name was found, and
 185     `exact` is True iff this was an exact match.
 186
 187     This function currently ONLY does fuzzy matching if there are no exact
 188     matches.
 189
 190     Formes are not returned; "Shaymin" will return only grass Shaymin.
 191
 192     Recognizes:
 193     - Names: "Eevee", "Surf", "Run Away", "Payapa Berry", etc.
 194     - Foreign names: "Iibui", "Eivui"
 195     - Fuzzy names in whatever language: "Evee", "Ibui"
 196     - IDs: "133", "192", "250"
 197     Also:
 198     - Type restrictions.  "type:psychic" will only return the type.  This is
 199       how to make ID lookup useful.  Multiple type specs can be entered with
 200       commas, as "move,item:1".  If `valid_types` are provided, any type prefix
 201       will be ignored.
 202     - Alternate formes can be specified merely like "wash rotom".
 203
 204     `input`
 205         Name of the thing to look for.
 206
 207     `valid_types`
 208         A list of table objects or names, e.g., `['pokemon', 'moves']`.  If
 209         this is provided, only results in one of the given tables will be
 210         returned.
 211
 212     `session`
 213         A database session to use for retrieving objects.  As with get_index,
 214         if this is not provided, a connection to the default database will be
 215         attempted.
 216
 217     `indices`
 218         Tuple of index, speller as returned from `open_index()`.  Defaults to
 219         a call to `open_index()`.
 220
 221     `exact_only`
 222         If True, only exact matches are returned.  If set to False (the
 223         default), and the provided `name` doesn't match anything exactly,
 224         spelling correction will be attempted.
 225     """
 226
 227     if not session:
 228         session = connect()
 229
 230     if indices:
 231         index, speller = indices
 232     else:
 233         index, speller = open_index()
 234
 235     name = unicode(input).lower()
 236     exact = True
 237     form = None
 238
 239     # Remove any type prefix (pokemon:133) before constructing a query
 240     if ':' in name:
 241         prefix_chunk, name = name.split(':', 2)
 242         prefixes = prefix_chunk.split(',')
 243         if not valid_types:
 244             # Only use types from the query string if none were explicitly
 245             # provided
 246             valid_types = prefixes
 247
 248     # Do different things depending what the query looks like
 249     # Note: Term objects do an exact match, so we don't have to worry about a
 250     # query parser tripping on weird characters in the input
 251     if '*' in name or '?' in name:
 252         exact_only = True
 253         query = whoosh.query.Wildcard(u'name', name)
 254     elif rx_is_number.match(name):
 255         # Don't spell-check numbers!
 256         exact_only = True
 257         query = whoosh.query.Term(u'row_id', name)
 258     else:
 259         # Not an integer
 260         query = whoosh.query.Term(u'name', name) \
 261               & whoosh.query.Term(u'forme_name', u'XXX')
 262
 263         # If there's a space in the input, this might be a form
 264         if ' ' in name:
 265             form, formless_name = name.split(' ', 2)
 266             form_query = whoosh.query.Term(u'name', formless_name) \
 267                        & whoosh.query.Term(u'forme_name', form)
 268             query = query | form_query
 269
 270     ### Filter by type of object
 271     type_terms = []
 272     for valid_type in valid_types:
 273         if hasattr(valid_type, '__tablename__'):
 274             table_name = getattr(valid_type, '__tablename__')
 275         elif valid_type in indexed_tables:
 276             table_name = valid_type
 277         elif valid_type + 's' in indexed_tables:
 278             table_name = valid_type + 's'
 279         else:
 280             # Bogus.  Be nice and ignore it
 281             continue
 282
 283         type_terms.append(whoosh.query.Term(u'table', table_name))
 284
 285     if type_terms:
 286         query = query & whoosh.query.Or(type_terms)
 287
 288
 289     ### Actual searching
 290     searcher = index.searcher()
 291     searcher.weighting = LanguageWeighting()  # XXX kosher?  docs say search()
 292                                               # takes a weighting kw but it
 293                                               # certainly does not
 294     results = searcher.search(query, limit=INTERMEDIATE_LOOKUP_RESULTS)
 295
 296     # Look for some fuzzy matches if necessary
 297     if not exact_only and not results:
 298         exact = False
 299         results = []
 300
 301         for suggestion in speller.suggest(name, INTERMEDIATE_LOOKUP_RESULTS):
 302             query = whoosh.query.Term('name', suggestion)
 303             results.extend(searcher.search(query))
 304
 305     ### Convert results to db objects
 306     objects = []
 307     seen = {}
 308     for result in results:
 309         # Skip dupe results
 310         seen_key = result['table'], result['row_id']
 311         if seen_key in seen:
 312             continue
 313         seen[seen_key] = True
 314
 315         cls = indexed_tables[result['table']]
 316         obj = session.query(cls).get(result['row_id'])
 317
 318         objects.append(LookupResult(object=obj,
 319                                     name=result['display_name'],
 320                                     language=result['language'],
 321                                     exact=exact))
 322
 323     # Only return up to 10 matches; beyond that, something is wrong.
 324     # We strip out duplicate entries above, so it's remotely possible that we
 325     # should have more than 10 here and lost a few.  The speller returns 25 to
 326     # give us some padding, and should avoid that problem.  Not a big deal if
 327     # we lose the 25th-most-likely match anyway.
 328     return objects[:MAX_LOOKUP_RESULTS]