pokedex/lookup.py

   1 # encoding: utf8
   2 from collections import namedtuple
   3 import os, os.path
   4 import pkg_resources
   5 import re
   6
   7 from sqlalchemy.sql import func
   8 import whoosh
   9 import whoosh.filedb.filestore
  10 import whoosh.filedb.fileindex
  11 import whoosh.index
  12 from whoosh.qparser import QueryParser
  13 import whoosh.scoring
  14 import whoosh.spelling
  15
  16 from pokedex.db import connect
  17 import pokedex.db.tables as tables
  18 from pokedex.roomaji import romanize
  19
  20 __all__ = ['open_index', 'lookup']
  21
  22 # Dictionary of table name => table class.
  23 # Need the table name so we can get the class from the table name after we
  24 # retrieve something from the index
  25 indexed_tables = {}
  26 for cls in [
  27         tables.Ability,
  28         tables.Item,
  29         tables.Move,
  30         tables.Pokemon,
  31         tables.Type,
  32     ]:
  33     indexed_tables[cls.__tablename__] = cls
  34
  35 def open_index(directory=None, session=None, recreate=False):
  36     """Opens the whoosh index stored in the named directory and returns (index,
  37     speller).  If the index doesn't already exist, it will be created.
  38
  39     `directory`
  40         Directory containing the index.  Defaults to a location within the
  41         `pokedex` egg directory.
  42
  43     `session`
  44         If the index needs to be created, this database session will be used.
  45         Defaults to an attempt to connect to the default SQLite database
  46         installed by `pokedex setup`.
  47
  48     `recreate`
  49         If set to True, the whoosh index will be created even if it already
  50         exists.
  51     """
  52
  53     # Defaults
  54     if not directory:
  55         directory = pkg_resources.resource_filename('pokedex',
  56                                                     'data/whoosh_index')
  57
  58     if not session:
  59         session = connect()
  60
  61     # Attempt to open or create the index
  62     directory_exists = os.path.exists(directory)
  63     if directory_exists and not recreate:
  64         # Already exists; should be an index!
  65         try:
  66             index = whoosh.index.open_dir(directory, indexname='MAIN')
  67             spell_store = whoosh.filedb.filestore.FileStorage(directory)
  68             speller = whoosh.spelling.SpellChecker(spell_store)
  69             return index, speller
  70         except whoosh.index.EmptyIndexError as e:
  71             # Apparently not a real index.  Fall out of the if and create it
  72             pass
  73
  74     if not directory_exists:
  75         os.mkdir(directory)
  76
  77
  78     # Create index
  79     schema = whoosh.fields.Schema(
  80         name=whoosh.fields.ID(stored=True),
  81         table=whoosh.fields.STORED,
  82         row_id=whoosh.fields.ID(stored=True),
  83         language=whoosh.fields.STORED,
  84     )
  85
  86     index = whoosh.index.create_in(directory, schema=schema, indexname='MAIN')
  87     writer = index.writer()
  88
  89     # Index every name in all our tables of interest
  90     # speller_entries becomes a list of (word, score) tuples; the score is 2
  91     # for English names, 1.5 for Roomaji, and 1 for everything else.  I think
  92     # this biases the results in the direction most people expect, especially
  93     # when e.g. German names are very similar to English names
  94     speller_entries = []
  95     for cls in indexed_tables.values():
  96         q = session.query(cls)
  97
  98         # Only index base Pokémon formes
  99         if hasattr(cls, 'forme_base_pokemon_id'):
 100             q = q.filter_by(forme_base_pokemon_id=None)
 101
 102         for row in q.yield_per(5):
 103             row_key = dict(table=cls.__tablename__, row_id=unicode(row.id))
 104
 105             name = row.name.lower()
 106             writer.add_document(name=name, **row_key)
 107             speller_entries.append((name, 1))
 108
 109             # Pokemon also get other languages
 110             for foreign_name in getattr(row, 'foreign_names', []):
 111                 moonspeak = foreign_name.name.lower()
 112                 if name == moonspeak:
 113                     # Don't add the English name again as a different language;
 114                     # no point and it makes spell results confusing
 115                     continue
 116
 117                 writer.add_document(name=moonspeak,
 118                                     language=foreign_name.language.name,
 119                                     **row_key)
 120                 speller_entries.append((moonspeak, 3))
 121
 122                 # Add Roomaji too
 123                 if foreign_name.language.name == 'Japanese':
 124                     roomaji = romanize(foreign_name.name).lower()
 125                     writer.add_document(name=roomaji, language='Roomaji',
 126                                         **row_key)
 127                     speller_entries.append((roomaji, 8))
 128
 129
 130     writer.commit()
 131
 132     # Construct and populate a spell-checker index.  Quicker to do it all
 133     # at once, as every call to add_* does a commit(), and those seem to be
 134     # expensive
 135     speller = whoosh.spelling.SpellChecker(index.storage)
 136     speller.add_scored_words(speller_entries)
 137
 138     return index, speller
 139
 140
 141 class LanguageWeighting(whoosh.scoring.Weighting):
 142     """A scoring class that forces otherwise-equal English results to come
 143     before foreign results.
 144     """
 145
 146     def score(self, searcher, fieldnum, text, docnum, weight, QTF=1):
 147         doc = searcher.stored_fields(docnum)
 148         if doc['language'] == None:
 149             # English (well, "default"); leave it at 1
 150             return weight
 151         elif doc['language'] == u'Roomaji':
 152             # Give Roomaji a bit of a boost, as it's most likely to be searched
 153             return weight * 0.95
 154         else:
 155             # Everything else can drop down the totem pole
 156             return weight * 0.9
 157
 158 rx_is_number = re.compile('^\d+$')
 159
 160 LookupResult = namedtuple('LookupResult',
 161                           ['object', 'name', 'language', 'exact'])
 162 def lookup(input, session=None, indices=None, exact_only=False):
 163     """Attempts to find some sort of object, given a database session and name.
 164
 165     Returns a list of named (object, name, language, exact) tuples.  `object`
 166     is a database object, `name` is the name under which the object was found,
 167     `language` is the name of the language in which the name was found, and
 168     `exact` is True iff this was an exact match.
 169
 170     This function currently ONLY does fuzzy matching if there are no exact
 171     matches.
 172
 173     Formes are not returned; "Shaymin" will return only grass Shaymin.
 174
 175     Recognizes:
 176     - Names: "Eevee", "Surf", "Run Away", "Payapa Berry", etc.
 177     - Foreign names: "Iibui", "Eivui"
 178     - Fuzzy names in whatever language: "Evee", "Ibui"
 179     - IDs: "pokemon 133", "move 192", "item 250"
 180     - Dex numbers: "sinnoh 55", "133", "johto 180"
 181
 182     `input`
 183         Name of the thing to look for.
 184
 185     `session`
 186         A database session to use for retrieving objects.  As with get_index,
 187         if this is not provided, a connection to the default database will be
 188         attempted.
 189
 190     `indices`
 191         Tuple of index, speller as returned from `open_index()`.  Defaults to
 192         a call to `open_index()`.
 193
 194     `exact_only`
 195         If True, only exact matches are returned.  If set to False (the
 196         default), and the provided `name` doesn't match anything exactly,
 197         spelling correction will be attempted.
 198     """
 199
 200     if not session:
 201         session = connect()
 202
 203     if indices:
 204         index, speller = indices
 205     else:
 206         index, speller = open_index()
 207
 208     name = unicode(input).lower()
 209     exact = True
 210
 211     # If the input provided is a number, match it as an id.  Otherwise, name.
 212     # Term objects do an exact match, so we don't have to worry about a query
 213     # parser tripping on weird characters in the input
 214     if rx_is_number.match(name):
 215         # Don't spell-check numbers!
 216         exact_only = True
 217         query = whoosh.query.Term(u'row_id', name)
 218     else:
 219         # Not an integer
 220         query = whoosh.query.Term(u'name', name)
 221
 222     ### Actual searching
 223     searcher = index.searcher()
 224     searcher.weighting = LanguageWeighting()  # XXX kosher?  docs say search()
 225                                               # takes a weighting kw but it
 226                                               # certainly does not
 227     results = searcher.search(query)
 228
 229     # Look for some fuzzy matches if necessary
 230     if not exact_only and not results:
 231         exact = False
 232         results = []
 233
 234         for suggestion in speller.suggest(name, 25):
 235             query = whoosh.query.Term('name', suggestion)
 236             results.extend(searcher.search(query))
 237
 238     ### Convert results to db objects
 239     objects = []
 240     seen = {}
 241     for result in results:
 242         # Skip dupe results
 243         seen_key = result['table'], result['row_id']
 244         if seen_key in seen:
 245             continue
 246         seen[seen_key] = True
 247
 248         cls = indexed_tables[result['table']]
 249         obj = session.query(cls).get(result['row_id'])
 250         objects.append(LookupResult(object=obj,
 251                                     name=result['name'],
 252                                     language=result['language'],
 253                                     exact=exact))
 254
 255     # Only return up to 10 matches; beyond that, something is wrong.
 256     # We strip out duplicate entries above, so it's remotely possible that we
 257     # should have more than 10 here and lost a few.  The speller returns 25 to
 258     # give us some padding, and should avoid that problem.  Not a big deal if
 259     # we lose the 25th-most-likely match anyway.
 260     return objects[:10]