pokedex/lookup.py

   1 # encoding: utf8
   2 from collections import namedtuple
   3 import os, os.path
   4 import pkg_resources
   5 import re
   6
   7 from sqlalchemy.sql import func
   8 import whoosh
   9 import whoosh.filedb.filestore
  10 import whoosh.filedb.fileindex
  11 import whoosh.index
  12 from whoosh.qparser import QueryParser
  13 import whoosh.spelling
  14
  15 from pokedex.db import connect
  16 import pokedex.db.tables as tables
  17 from pokedex.roomaji import romanize
  18
  19 # Dictionary of table name => table class.
  20 # Need the table name so we can get the class from the table name after we
  21 # retrieve something from the index
  22 indexed_tables = {}
  23 for cls in [
  24         tables.Ability,
  25         tables.Item,
  26         tables.Move,
  27         tables.Pokemon,
  28         tables.Type,
  29     ]:
  30     indexed_tables[cls.__tablename__] = cls
  31
  32 def open_index(directory=None, session=None, recreate=False):
  33     """Opens the whoosh index stored in the named directory and returns (index,
  34     speller).  If the index doesn't already exist, it will be created.
  35
  36     `directory`
  37         Directory containing the index.  Defaults to a location within the
  38         `pokedex` egg directory.
  39
  40     `session`
  41         If the index needs to be created, this database session will be used.
  42         Defaults to an attempt to connect to the default SQLite database
  43         installed by `pokedex setup`.
  44
  45     `recreate`
  46         If set to True, the whoosh index will be created even if it already
  47         exists.
  48     """
  49
  50     # Defaults
  51     if not directory:
  52         directory = pkg_resources.resource_filename('pokedex',
  53                                                     'data/whoosh_index')
  54
  55     if not session:
  56         session = connect()
  57
  58     # Attempt to open or create the index
  59     directory_exists = os.path.exists(directory)
  60     if directory_exists and not recreate:
  61         # Already exists; should be an index!
  62         try:
  63             index = whoosh.index.open_dir(directory, indexname='MAIN')
  64             spell_store = whoosh.filedb.filestore.FileStorage(directory)
  65             speller = whoosh.spelling.SpellChecker(spell_store)
  66             return index, speller
  67         except whoosh.index.EmptyIndexError as e:
  68             # Apparently not a real index.  Fall out of the if and create it
  69             pass
  70
  71     if not directory_exists:
  72         os.mkdir(directory)
  73
  74
  75     # Create index
  76     schema = whoosh.fields.Schema(
  77         name=whoosh.fields.ID(stored=True),
  78         table=whoosh.fields.STORED,
  79         row_id=whoosh.fields.ID(stored=True),
  80         language=whoosh.fields.STORED,
  81     )
  82
  83     index = whoosh.index.create_in(directory, schema=schema, indexname='MAIN')
  84     writer = index.writer()
  85
  86     # Index every name in all our tables of interest
  87     # speller_entries becomes a list of (word, score) tuples; the score is 2
  88     # for English names, 1.5 for Roomaji, and 1 for everything else.  I think
  89     # this biases the results in the direction most people expect, especially
  90     # when e.g. German names are very similar to English names
  91     speller_entries = []
  92     for cls in indexed_tables.values():
  93         q = session.query(cls)
  94
  95         # Only index base Pokémon formes
  96         if hasattr(cls, 'forme_base_pokemon_id'):
  97             q = q.filter_by(forme_base_pokemon_id=None)
  98
  99         for row in q.yield_per(5):
 100             row_key = dict(table=cls.__tablename__, row_id=unicode(row.id))
 101
 102             name = row.name.lower()
 103             writer.add_document(name=name, **row_key)
 104             speller_entries.append((name, 1))
 105
 106             # Pokemon also get other languages
 107             for foreign_name in getattr(row, 'foreign_names', []):
 108                 moonspeak = foreign_name.name.lower()
 109                 if name == moonspeak:
 110                     # Don't add the English name again as a different language;
 111                     # no point and it makes spell results confusing
 112                     continue
 113
 114                 writer.add_document(name=moonspeak,
 115                                     language=foreign_name.language.name,
 116                                     **row_key)
 117                 speller_entries.append((moonspeak, 3))
 118
 119                 # Add Roomaji too
 120                 if foreign_name.language.name == 'Japanese':
 121                     roomaji = romanize(foreign_name.name).lower()
 122                     writer.add_document(name=roomaji, language='Roomaji',
 123                                         **row_key)
 124                     speller_entries.append((roomaji, 8))
 125
 126
 127     writer.commit()
 128
 129     # Construct and populate a spell-checker index.  Quicker to do it all
 130     # at once, as every call to add_* does a commit(), and those seem to be
 131     # expensive
 132     speller = whoosh.spelling.SpellChecker(index.storage)
 133     speller.add_scored_words(speller_entries)
 134
 135     return index, speller
 136
 137
 138 rx_is_number = re.compile('^\d+$')
 139
 140 LookupResult = namedtuple('LookupResult',
 141                           ['object', 'name', 'language', 'exact'])
 142 def lookup(input, session=None, indices=None, exact_only=False):
 143     """Attempts to find some sort of object, given a database session and name.
 144
 145     Returns a list of named (object, name, language, exact) tuples.  `object`
 146     is a database object, `name` is the name under which the object was found,
 147     `language` is the name of the language in which the name was found, and
 148     `exact` is True iff this was an exact match.
 149
 150     This function currently ONLY does fuzzy matching if there are no exact
 151     matches.
 152
 153     Formes are not returned; "Shaymin" will return only grass Shaymin.
 154
 155     Recognizes:
 156     - Names: "Eevee", "Surf", "Run Away", "Payapa Berry", etc.
 157     - Foreign names: "Iibui", "Eivui"
 158     - Fuzzy names in whatever language: "Evee", "Ibui"
 159     - IDs: "pokemon 133", "move 192", "item 250"
 160     - Dex numbers: "sinnoh 55", "133", "johto 180"
 161
 162     `input`
 163         Name of the thing to look for.
 164
 165     `session`
 166         A database session to use for retrieving objects.  As with get_index,
 167         if this is not provided, a connection to the default database will be
 168         attempted.
 169
 170     `indices`
 171         Tuple of index, speller as returned from `open_index()`.  Defaults to
 172         a call to `open_index()`.
 173
 174     `exact_only`
 175         If True, only exact matches are returned.  If set to False (the
 176         default), and the provided `name` doesn't match anything exactly,
 177         spelling correction will be attempted.
 178     """
 179
 180     if not session:
 181         session = connect()
 182
 183     if indices:
 184         index, speller = indices
 185     else:
 186         index, speller = open_index()
 187
 188     name = unicode(input).lower()
 189     exact = True
 190
 191     # If the input provided is a number, match it as an id.  Otherwise, name
 192     if rx_is_number.match(input):
 193         query_column = 'row_id'
 194         exact_only = True  # don't spell-check numbers!
 195     else:
 196         # Not an integer
 197         query_column = 'name'
 198
 199     # Look for exact name.  A Term object does an exact match, so we don't have
 200     # to worry about a query parser tripping on weird characters in the input
 201     searcher = index.searcher()
 202     query = whoosh.query.Term(query_column, name)
 203     print query
 204     results = searcher.search(query)
 205
 206     # Look for some fuzzy matches if necessary
 207     if not exact_only and not results:
 208         exact = False
 209         results = []
 210
 211         for suggestion in speller.suggest(name, 25):
 212             query = whoosh.query.Term('name', suggestion)
 213             results.extend(searcher.search(query))
 214
 215     ### Convert results to db objects
 216     objects = []
 217     seen = {}
 218     for result in results:
 219         # Skip dupe results
 220         # Note!  The speller prefers English names, but the query does not.  So
 221         # "latias" comes over "ratiasu".  "latias" matches only the English
 222         # row, comes out first, and all is well.
 223         # However!  The speller could then return "foo" which happens to be the
 224         # name for two different things in different languages, and the
 225         # non-English one could appear preferred.  This is not very likely.
 226         seen_key = result['table'], result['row_id']
 227         if seen_key in seen:
 228             continue
 229         seen[seen_key] = True
 230
 231         cls = indexed_tables[result['table']]
 232         obj = session.query(cls).get(result['row_id'])
 233         objects.append(LookupResult(object=obj,
 234                                     name=result['name'],
 235                                     language=result['language'],
 236                                     exact=exact))
 237
 238     # Only return up to 10 matches; beyond that, something is wrong.
 239     # We strip out duplicate entries above, so it's remotely possible that we
 240     # should have more than 10 here and lost a few.  The speller returns 25 to
 241     # give us some padding, and should avoid that problem.  Not a big deal if
 242     # we lose the 25th-most-likely match anyway.
 243     return objects[:10]