pokedex/lookup.py

   1 # encoding: utf8
   2 from collections import namedtuple
   3 import os, os.path
   4 import pkg_resources
   5 import re
   6
   7 from sqlalchemy.sql import func
   8 import whoosh
   9 import whoosh.filedb.filestore
  10 import whoosh.filedb.fileindex
  11 import whoosh.index
  12 from whoosh.qparser import QueryParser
  13 import whoosh.scoring
  14 import whoosh.spelling
  15
  16 from pokedex.db import connect
  17 import pokedex.db.tables as tables
  18 from pokedex.roomaji import romanize
  19
  20 __all__ = ['open_index', 'lookup']
  21
  22 # Dictionary of table name => table class.
  23 # Need the table name so we can get the class from the table name after we
  24 # retrieve something from the index
  25 indexed_tables = {}
  26 for cls in [
  27         tables.Ability,
  28         tables.Item,
  29         tables.Move,
  30         tables.Pokemon,
  31         tables.Type,
  32     ]:
  33     indexed_tables[cls.__tablename__] = cls
  34
  35 def open_index(directory=None, session=None, recreate=False):
  36     """Opens the whoosh index stored in the named directory and returns (index,
  37     speller).  If the index doesn't already exist, it will be created.
  38
  39     `directory`
  40         Directory containing the index.  Defaults to a location within the
  41         `pokedex` egg directory.
  42
  43     `session`
  44         If the index needs to be created, this database session will be used.
  45         Defaults to an attempt to connect to the default SQLite database
  46         installed by `pokedex setup`.
  47
  48     `recreate`
  49         If set to True, the whoosh index will be created even if it already
  50         exists.
  51     """
  52
  53     # Defaults
  54     if not directory:
  55         directory = pkg_resources.resource_filename('pokedex',
  56                                                     'data/whoosh_index')
  57
  58     if not session:
  59         session = connect()
  60
  61     # Attempt to open or create the index
  62     directory_exists = os.path.exists(directory)
  63     if directory_exists and not recreate:
  64         # Already exists; should be an index!
  65         try:
  66             index = whoosh.index.open_dir(directory, indexname='MAIN')
  67             spell_store = whoosh.filedb.filestore.FileStorage(directory)
  68             speller = whoosh.spelling.SpellChecker(spell_store)
  69             return index, speller
  70         except whoosh.index.EmptyIndexError as e:
  71             # Apparently not a real index.  Fall out of the if and create it
  72             pass
  73
  74     if not directory_exists:
  75         os.mkdir(directory)
  76
  77
  78     # Create index
  79     schema = whoosh.fields.Schema(
  80         name=whoosh.fields.ID(stored=True),
  81         table=whoosh.fields.STORED,
  82         row_id=whoosh.fields.ID(stored=True),
  83         language=whoosh.fields.STORED,
  84         display_name=whoosh.fields.STORED,  # non-lowercased name
  85     )
  86
  87     index = whoosh.index.create_in(directory, schema=schema, indexname='MAIN')
  88     writer = index.writer()
  89
  90     # Index every name in all our tables of interest
  91     # speller_entries becomes a list of (word, score) tuples; the score is 2
  92     # for English names, 1.5 for Roomaji, and 1 for everything else.  I think
  93     # this biases the results in the direction most people expect, especially
  94     # when e.g. German names are very similar to English names
  95     speller_entries = []
  96     for cls in indexed_tables.values():
  97         q = session.query(cls)
  98
  99         # Only index base Pokémon formes
 100         if hasattr(cls, 'forme_base_pokemon_id'):
 101             q = q.filter_by(forme_base_pokemon_id=None)
 102
 103         for row in q.yield_per(5):
 104             row_key = dict(table=cls.__tablename__, row_id=unicode(row.id))
 105
 106             name = row.name
 107             writer.add_document(name=name.lower(),
 108                                 display_name=name,
 109                                 **row_key)
 110             speller_entries.append((name.lower(), 1))
 111
 112             # Pokemon also get other languages
 113             for foreign_name in getattr(row, 'foreign_names', []):
 114                 moonspeak = foreign_name.name
 115                 if name == moonspeak:
 116                     # Don't add the English name again as a different language;
 117                     # no point and it makes spell results confusing
 118                     continue
 119
 120                 writer.add_document(name=moonspeak.lower(),
 121                                     language=foreign_name.language.name,
 122                                     display_name=moonspeak,
 123                                     **row_key)
 124                 speller_entries.append((moonspeak.lower(), 3))
 125
 126                 # Add Roomaji too
 127                 if foreign_name.language.name == 'Japanese':
 128                     roomaji = romanize(foreign_name.name)
 129                     writer.add_document(name=roomaji.lower(),
 130                                         language='Roomaji',
 131                                         display_name=roomaji,
 132                                         **row_key)
 133                     speller_entries.append((roomaji.lower(), 8))
 134
 135
 136     writer.commit()
 137
 138     # Construct and populate a spell-checker index.  Quicker to do it all
 139     # at once, as every call to add_* does a commit(), and those seem to be
 140     # expensive
 141     speller = whoosh.spelling.SpellChecker(index.storage)
 142     speller.add_scored_words(speller_entries)
 143
 144     return index, speller
 145
 146
 147 class LanguageWeighting(whoosh.scoring.Weighting):
 148     """A scoring class that forces otherwise-equal English results to come
 149     before foreign results.
 150     """
 151
 152     def score(self, searcher, fieldnum, text, docnum, weight, QTF=1):
 153         doc = searcher.stored_fields(docnum)
 154         if doc['language'] == None:
 155             # English (well, "default"); leave it at 1
 156             return weight
 157         elif doc['language'] == u'Roomaji':
 158             # Give Roomaji a bit of a boost, as it's most likely to be searched
 159             return weight * 0.95
 160         else:
 161             # Everything else can drop down the totem pole
 162             return weight * 0.9
 163
 164 rx_is_number = re.compile('^\d+$')
 165
 166 LookupResult = namedtuple('LookupResult',
 167                           ['object', 'name', 'language', 'exact'])
 168 def lookup(input, session=None, indices=None, exact_only=False):
 169     """Attempts to find some sort of object, given a database session and name.
 170
 171     Returns a list of named (object, name, language, exact) tuples.  `object`
 172     is a database object, `name` is the name under which the object was found,
 173     `language` is the name of the language in which the name was found, and
 174     `exact` is True iff this was an exact match.
 175
 176     This function currently ONLY does fuzzy matching if there are no exact
 177     matches.
 178
 179     Formes are not returned; "Shaymin" will return only grass Shaymin.
 180
 181     Recognizes:
 182     - Names: "Eevee", "Surf", "Run Away", "Payapa Berry", etc.
 183     - Foreign names: "Iibui", "Eivui"
 184     - Fuzzy names in whatever language: "Evee", "Ibui"
 185     - IDs: "pokemon 133", "move 192", "item 250"
 186     - Dex numbers: "sinnoh 55", "133", "johto 180"
 187
 188     `input`
 189         Name of the thing to look for.
 190
 191     `session`
 192         A database session to use for retrieving objects.  As with get_index,
 193         if this is not provided, a connection to the default database will be
 194         attempted.
 195
 196     `indices`
 197         Tuple of index, speller as returned from `open_index()`.  Defaults to
 198         a call to `open_index()`.
 199
 200     `exact_only`
 201         If True, only exact matches are returned.  If set to False (the
 202         default), and the provided `name` doesn't match anything exactly,
 203         spelling correction will be attempted.
 204     """
 205
 206     if not session:
 207         session = connect()
 208
 209     if indices:
 210         index, speller = indices
 211     else:
 212         index, speller = open_index()
 213
 214     name = unicode(input).lower()
 215     exact = True
 216
 217     # If the input provided is a number, match it as an id.  Otherwise, name.
 218     # Term objects do an exact match, so we don't have to worry about a query
 219     # parser tripping on weird characters in the input
 220     if rx_is_number.match(name):
 221         # Don't spell-check numbers!
 222         exact_only = True
 223         query = whoosh.query.Term(u'row_id', name)
 224     else:
 225         # Not an integer
 226         query = whoosh.query.Term(u'name', name)
 227
 228     ### Actual searching
 229     searcher = index.searcher()
 230     searcher.weighting = LanguageWeighting()  # XXX kosher?  docs say search()
 231                                               # takes a weighting kw but it
 232                                               # certainly does not
 233     results = searcher.search(query)
 234
 235     # Look for some fuzzy matches if necessary
 236     if not exact_only and not results:
 237         exact = False
 238         results = []
 239
 240         for suggestion in speller.suggest(name, 25):
 241             query = whoosh.query.Term('name', suggestion)
 242             results.extend(searcher.search(query))
 243
 244     ### Convert results to db objects
 245     objects = []
 246     seen = {}
 247     for result in results:
 248         # Skip dupe results
 249         seen_key = result['table'], result['row_id']
 250         if seen_key in seen:
 251             continue
 252         seen[seen_key] = True
 253
 254         cls = indexed_tables[result['table']]
 255         obj = session.query(cls).get(result['row_id'])
 256         objects.append(LookupResult(object=obj,
 257                                     name=result['display_name'],
 258                                     language=result['language'],
 259                                     exact=exact))
 260
 261     # Only return up to 10 matches; beyond that, something is wrong.
 262     # We strip out duplicate entries above, so it's remotely possible that we
 263     # should have more than 10 here and lost a few.  The speller returns 25 to
 264     # give us some padding, and should avoid that problem.  Not a big deal if
 265     # we lose the 25th-most-likely match anyway.
 266     return objects[:10]