pokedex/lookup.py

   1 # encoding: utf8
   2 from collections import namedtuple
   3 import os, os.path
   4 import pkg_resources
   5 import re
   6
   7 from sqlalchemy.sql import func
   8 import whoosh
   9 import whoosh.filedb.filestore
  10 import whoosh.filedb.fileindex
  11 import whoosh.index
  12 from whoosh.qparser import QueryParser
  13 import whoosh.scoring
  14 import whoosh.spelling
  15
  16 from pokedex.db import connect
  17 import pokedex.db.tables as tables
  18 from pokedex.roomaji import romanize
  19
  20 __all__ = ['open_index', 'lookup']
  21
  22 # Dictionary of table name => table class.
  23 # Need the table name so we can get the class from the table name after we
  24 # retrieve something from the index
  25 indexed_tables = {}
  26 for cls in [
  27         tables.Ability,
  28         tables.Item,
  29         tables.Move,
  30         tables.Pokemon,
  31         tables.Type,
  32     ]:
  33     indexed_tables[cls.__tablename__] = cls
  34
  35 def open_index(directory=None, session=None, recreate=False):
  36     """Opens the whoosh index stored in the named directory and returns (index,
  37     speller).  If the index doesn't already exist, it will be created.
  38
  39     `directory`
  40         Directory containing the index.  Defaults to a location within the
  41         `pokedex` egg directory.
  42
  43     `session`
  44         If the index needs to be created, this database session will be used.
  45         Defaults to an attempt to connect to the default SQLite database
  46         installed by `pokedex setup`.
  47
  48     `recreate`
  49         If set to True, the whoosh index will be created even if it already
  50         exists.
  51     """
  52
  53     # Defaults
  54     if not directory:
  55         directory = pkg_resources.resource_filename('pokedex',
  56                                                     'data/whoosh-index')
  57
  58     if not session:
  59         session = connect()
  60
  61     # Attempt to open or create the index
  62     directory_exists = os.path.exists(directory)
  63     if directory_exists and not recreate:
  64         # Already exists; should be an index!
  65         try:
  66             index = whoosh.index.open_dir(directory, indexname='MAIN')
  67             spell_store = whoosh.filedb.filestore.FileStorage(directory)
  68             speller = whoosh.spelling.SpellChecker(spell_store)
  69             return index, speller
  70         except whoosh.index.EmptyIndexError as e:
  71             # Apparently not a real index.  Fall out of the if and create it
  72             pass
  73
  74     if not directory_exists:
  75         os.mkdir(directory)
  76
  77
  78     ### Create index
  79     schema = whoosh.fields.Schema(
  80         name=whoosh.fields.ID(stored=True),
  81         table=whoosh.fields.ID(stored=True),
  82         row_id=whoosh.fields.ID(stored=True),
  83         language=whoosh.fields.STORED,
  84         display_name=whoosh.fields.STORED,  # non-lowercased name
  85         forme_name=whoosh.fields.ID,
  86     )
  87
  88     index = whoosh.index.create_in(directory, schema=schema, indexname='MAIN')
  89     writer = index.writer()
  90
  91     # Index every name in all our tables of interest
  92     # speller_entries becomes a list of (word, score) tuples; the score is 2
  93     # for English names, 1.5 for Roomaji, and 1 for everything else.  I think
  94     # this biases the results in the direction most people expect, especially
  95     # when e.g. German names are very similar to English names
  96     speller_entries = []
  97     for cls in indexed_tables.values():
  98         q = session.query(cls)
  99
 100         for row in q.yield_per(5):
 101             # XXX need to give forme_name a dummy value because I can't search
 102             # for explicitly empty fields.  boo.
 103             row_key = dict(table=unicode(cls.__tablename__),
 104                            row_id=unicode(row.id),
 105                            forme_name=u'XXX')
 106
 107             # If this is a form, mark it as such
 108             if getattr(row, 'forme_base_pokemon_id', None):
 109                 row_key['forme_name'] = row.forme_name
 110
 111             name = row.name
 112             writer.add_document(name=name.lower(),
 113                                 display_name=name,
 114                                 **row_key)
 115             speller_entries.append((name.lower(), 1))
 116
 117             # Pokemon also get other languages
 118             for foreign_name in getattr(row, 'foreign_names', []):
 119                 moonspeak = foreign_name.name
 120                 if name == moonspeak:
 121                     # Don't add the English name again as a different language;
 122                     # no point and it makes spell results confusing
 123                     continue
 124
 125                 writer.add_document(name=moonspeak.lower(),
 126                                     language=foreign_name.language.name,
 127                                     display_name=moonspeak,
 128                                     **row_key)
 129                 speller_entries.append((moonspeak.lower(), 3))
 130
 131                 # Add Roomaji too
 132                 if foreign_name.language.name == 'Japanese':
 133                     roomaji = romanize(foreign_name.name)
 134                     writer.add_document(name=roomaji.lower(),
 135                                         language='Roomaji',
 136                                         display_name=roomaji,
 137                                         **row_key)
 138                     speller_entries.append((roomaji.lower(), 8))
 139
 140
 141     writer.commit()
 142
 143     # Construct and populate a spell-checker index.  Quicker to do it all
 144     # at once, as every call to add_* does a commit(), and those seem to be
 145     # expensive
 146     speller = whoosh.spelling.SpellChecker(index.storage)
 147     speller.add_scored_words(speller_entries)
 148
 149     return index, speller
 150
 151
 152 class LanguageWeighting(whoosh.scoring.Weighting):
 153     """A scoring class that forces otherwise-equal English results to come
 154     before foreign results.
 155     """
 156
 157     def score(self, searcher, fieldnum, text, docnum, weight, QTF=1):
 158         doc = searcher.stored_fields(docnum)
 159         if doc['language'] == None:
 160             # English (well, "default"); leave it at 1
 161             return weight
 162         elif doc['language'] == u'Roomaji':
 163             # Give Roomaji a bit of a boost, as it's most likely to be searched
 164             return weight * 0.95
 165         else:
 166             # Everything else can drop down the totem pole
 167             return weight * 0.9
 168
 169 rx_is_number = re.compile('^\d+$')
 170
 171 LookupResult = namedtuple('LookupResult',
 172                           ['object', 'name', 'language', 'exact'])
 173 def lookup(input, valid_types=[], session=None, indices=None, exact_only=False):
 174     """Attempts to find some sort of object, given a database session and name.
 175
 176     Returns a list of named (object, name, language, exact) tuples.  `object`
 177     is a database object, `name` is the name under which the object was found,
 178     `language` is the name of the language in which the name was found, and
 179     `exact` is True iff this was an exact match.
 180
 181     This function currently ONLY does fuzzy matching if there are no exact
 182     matches.
 183
 184     Formes are not returned; "Shaymin" will return only grass Shaymin.
 185
 186     Recognizes:
 187     - Names: "Eevee", "Surf", "Run Away", "Payapa Berry", etc.
 188     - Foreign names: "Iibui", "Eivui"
 189     - Fuzzy names in whatever language: "Evee", "Ibui"
 190     - IDs: "133", "192", "250"
 191     Also:
 192     - Type restrictions.  "type:psychic" will only return the type.  This is
 193       how to make ID lookup useful.  Multiple type specs can be entered with
 194       commas, as "move,item:1".  If `valid_types` are provided, any type prefix
 195       will be ignored.
 196     - Alternate formes can be specified merely like "wash rotom".
 197
 198     `input`
 199         Name of the thing to look for.
 200
 201     `valid_types`
 202         A list of table objects or names, e.g., `['pokemon', 'moves']`.  If
 203         this is provided, only results in one of the given tables will be
 204         returned.
 205
 206     `session`
 207         A database session to use for retrieving objects.  As with get_index,
 208         if this is not provided, a connection to the default database will be
 209         attempted.
 210
 211     `indices`
 212         Tuple of index, speller as returned from `open_index()`.  Defaults to
 213         a call to `open_index()`.
 214
 215     `exact_only`
 216         If True, only exact matches are returned.  If set to False (the
 217         default), and the provided `name` doesn't match anything exactly,
 218         spelling correction will be attempted.
 219     """
 220
 221     if not session:
 222         session = connect()
 223
 224     if indices:
 225         index, speller = indices
 226     else:
 227         index, speller = open_index()
 228
 229     name = unicode(input).lower()
 230     exact = True
 231     form = None
 232
 233     # Remove any type prefix (pokemon:133) before constructing a query
 234     if ':' in name:
 235         prefix_chunk, name = name.split(':', 2)
 236         prefixes = prefix_chunk.split(',')
 237         if not valid_types:
 238             # Only use types from the query string if none were explicitly
 239             # provided
 240             valid_types = prefixes
 241
 242     # If the input provided is a number, match it as an id.  Otherwise, name.
 243     # Term objects do an exact match, so we don't have to worry about a query
 244     # parser tripping on weird characters in the input
 245     if rx_is_number.match(name):
 246         # Don't spell-check numbers!
 247         exact_only = True
 248         query = whoosh.query.Term(u'row_id', name)
 249     else:
 250         # Not an integer
 251         query = whoosh.query.Term(u'name', name) \
 252               & whoosh.query.Term(u'forme_name', u'XXX')
 253
 254         # If there's a space in the input, this might be a form
 255         if ' ' in name:
 256             form, formless_name = name.split(' ', 2)
 257             form_query = whoosh.query.Term(u'name', formless_name) \
 258                        & whoosh.query.Term(u'forme_name', form)
 259             query = query | form_query
 260
 261     ### Filter by type of object
 262     type_terms = []
 263     for valid_type in valid_types:
 264         if hasattr(valid_type, '__tablename__'):
 265             table_name = getattr(valid_type, '__tablename__')
 266         elif valid_type in indexed_tables:
 267             table_name = valid_type
 268         elif valid_type + 's' in indexed_tables:
 269             table_name = valid_type + 's'
 270         else:
 271             # Bogus.  Be nice and ignore it
 272             continue
 273
 274         type_terms.append(whoosh.query.Term(u'table', table_name))
 275
 276     if type_terms:
 277         query = query & whoosh.query.Or(type_terms)
 278
 279
 280     ### Actual searching
 281     searcher = index.searcher()
 282     searcher.weighting = LanguageWeighting()  # XXX kosher?  docs say search()
 283                                               # takes a weighting kw but it
 284                                               # certainly does not
 285     results = searcher.search(query)
 286
 287     # Look for some fuzzy matches if necessary
 288     if not exact_only and not results:
 289         exact = False
 290         results = []
 291
 292         for suggestion in speller.suggest(name, 25):
 293             query = whoosh.query.Term('name', suggestion)
 294             results.extend(searcher.search(query))
 295
 296     ### Convert results to db objects
 297     objects = []
 298     seen = {}
 299     for result in results:
 300         # Skip dupe results
 301         seen_key = result['table'], result['row_id']
 302         if seen_key in seen:
 303             continue
 304         seen[seen_key] = True
 305
 306         cls = indexed_tables[result['table']]
 307         obj = session.query(cls).get(result['row_id'])
 308
 309         objects.append(LookupResult(object=obj,
 310                                     name=result['display_name'],
 311                                     language=result['language'],
 312                                     exact=exact))
 313
 314     # Only return up to 10 matches; beyond that, something is wrong.
 315     # We strip out duplicate entries above, so it's remotely possible that we
 316     # should have more than 10 here and lost a few.  The speller returns 25 to
 317     # give us some padding, and should avoid that problem.  Not a big deal if
 318     # we lose the 25th-most-likely match anyway.
 319     return objects[:10]