pokedex/lookup.py

   1 # encoding: utf8
   2 import os, os.path
   3 import random
   4 import re
   5 import shutil
   6 import unicodedata
   7
   8 from sqlalchemy.sql import func
   9 import whoosh
  10 import whoosh.filedb.filestore
  11 import whoosh.filedb.fileindex
  12 import whoosh.index
  13 from whoosh.qparser import QueryParser
  14 import whoosh.scoring
  15 import whoosh.spelling
  16
  17 from pokedex.util import namedtuple
  18
  19 from pokedex.db import connect
  20 import pokedex.db.tables as tables
  21 from pokedex.roomaji import romanize
  22 from pokedex.defaults import get_default_index_dir
  23
  24 __all__ = ['PokedexLookup']
  25
  26
  27 rx_is_number = re.compile('^\d+$')
  28
  29 LookupResult = namedtuple('LookupResult',
  30     ['object', 'indexed_name', 'name', 'language', 'iso3166', 'exact'])
  31
  32 class UninitializedIndex(object):
  33     class UninitializedIndexError(Exception):
  34         pass
  35
  36     def __nonzero__(self):
  37         """Dummy object should identify itself as False."""
  38         return False
  39
  40     def __bool__(self):
  41         """Python 3000 version of the above.  Future-proofing rules!"""
  42         return False
  43
  44     def __getattr__(self, *args, **kwargs):
  45         raise self.UninitializedIndexError(
  46             "The lookup index does not exist.  Please use `pokedex setup` "
  47             "or lookup.rebuild_index() to create it."
  48         )
  49
  50 class LanguageWeighting(whoosh.scoring.Weighting):
  51     """A scoring class that forces otherwise-equal English results to come
  52     before foreign results.
  53     """
  54
  55     def score(self, searcher, fieldnum, text, docnum, weight, QTF=1):
  56         doc = searcher.stored_fields(docnum)
  57         if doc['language'] == None:
  58             # English (well, "default"); leave it at 1
  59             return weight
  60         elif doc['language'] == u'Roomaji':
  61             # Give Roomaji a little boost; it's most likely to be searched
  62             return weight * 0.95
  63         else:
  64             # Everything else can drop down the totem pole
  65             return weight * 0.9
  66
  67
class PokedexLookup(object):
    """Looks up database objects by name, using a whoosh index stored in a
    directory plus a spell checker for fuzzy matches.
    """

    # How many hits are requested from the index search and from the
    # speller's suggestions, before duplicates are removed
    INTERMEDIATE_LOOKUP_RESULTS = 25
    # Upper bound on the number of results lookup() returns
    MAX_LOOKUP_RESULTS = 10

    # Dictionary of table name => table class.
    # Need the table name so we can get the class from the table name after we
    # retrieve something from the index
    indexed_tables = dict(
        (cls.__tablename__, cls)
        for cls in (
            tables.Ability,
            tables.Item,
            tables.Location,
            tables.Move,
            tables.Nature,
            tables.Pokemon,
            tables.Type,
        )
    )
  87
  88
  89     def __init__(self, directory=None, session=None):
  90         """Opens the whoosh index stored in the named directory.  If the index
  91         doesn't already exist, it will be created.
  92
  93         `directory`
  94             Directory containing the index.  Defaults to a location within the
  95             `pokedex` egg directory.
  96
  97         `session`
  98             Used for creating the index and retrieving objects.  Defaults to an
  99             attempt to connect to the default SQLite database installed by
 100             `pokedex setup`.
 101         """
 102
 103         # By the time this returns, self.index, self.speller, and self.session
 104         # must be set
 105
 106         # If a directory was not given, use the default
 107         if directory is None:
 108             directory = get_default_index_dir()
 109
 110         self.directory = directory
 111
 112         if session:
 113             self.session = session
 114         else:
 115             self.session = connect()
 116
 117         # Attempt to open or create the index
 118         if not os.path.exists(directory) or not os.listdir(directory):
 119             # Directory doesn't exist OR is empty; caller needs to use
 120             # rebuild_index before doing anything.  Provide a dummy object that
 121             # complains when used
 122             self.index = UninitializedIndex()
 123             self.speller = UninitializedIndex()
 124             return
 125
 126         # Otherwise, already exists; should be an index!  Bam, done.
 127         # Note that this will explode if the directory exists but doesn't
 128         # contain an index; that's a feature
 129         try:
 130             self.index = whoosh.index.open_dir(directory, indexname='MAIN')
 131         except whoosh.index.EmptyIndexError:
 132             raise IOError(
 133                 "The index directory already contains files.  "
 134                 "Please use a dedicated directory for the lookup index."
 135             )
 136
 137         # Create speller, and done
 138         spell_store = whoosh.filedb.filestore.FileStorage(directory)
 139         self.speller = whoosh.spelling.SpellChecker(spell_store)
 140
 141
 142     def rebuild_index(self):
 143         """Creates the index from scratch."""
 144
 145         schema = whoosh.fields.Schema(
 146             name=whoosh.fields.ID(stored=True),
 147             table=whoosh.fields.ID(stored=True),
 148             row_id=whoosh.fields.ID(stored=True),
 149             language=whoosh.fields.STORED,
 150             iso3166=whoosh.fields.STORED,
 151             display_name=whoosh.fields.STORED,  # non-lowercased name
 152         )
 153
 154         if not os.path.exists(self.directory):
 155             os.mkdir(self.directory)
 156
 157         self.index = whoosh.index.create_in(self.directory, schema=schema,
 158                                                             indexname='MAIN')
 159         writer = self.index.writer()
 160
 161         # Index every name in all our tables of interest
 162         # speller_entries becomes a list of (word, score) tuples; the score is
 163         # 2 for English names, 1.5 for Roomaji, and 1 for everything else.  I
 164         # think this biases the results in the direction most people expect,
 165         # especially when e.g. German names are very similar to English names
 166         speller_entries = []
 167         for cls in self.indexed_tables.values():
 168             q = self.session.query(cls)
 169
 170             for row in q.yield_per(5):
 171                 row_key = dict(table=unicode(cls.__tablename__),
 172                                row_id=unicode(row.id))
 173
 174                 def add(name, language, iso3166, score):
 175                     normalized_name = self.normalize_name(name)
 176
 177                     writer.add_document(
 178                         name=normalized_name, display_name=name,
 179                         language=language, iso3166=iso3166,
 180                         **row_key
 181                     )
 182
 183                     speller_entries.append((normalized_name, score))
 184
 185
 186                 # Add the basic English name to the index
 187                 if cls == tables.Pokemon:
 188                     # Pokémon need their form name added
 189                     # XXX kinda kludgy
 190                     add(row.full_name, None, u'us', 1)
 191
 192                     # If this is a default form, ALSO add the unadorned name,
 193                     # so 'Deoxys' alone will still do the right thing
 194                     if row.forme_name and not row.forme_base_pokemon_id:
 195                         add(row.name, None, u'us', 1)
 196                 else:
 197                     add(row.name, None, u'us', 1)
 198
 199                 # Some things also have other languages' names
 200                 # XXX other language form names..?
 201                 for foreign_name in getattr(row, 'foreign_names', []):
 202                     moonspeak = foreign_name.name
 203                     if row.name == moonspeak:
 204                         # Don't add the English name again as a different
 205                         # language; no point and it makes spell results
 206                         # confusing
 207                         continue
 208
 209                     add(moonspeak, foreign_name.language.name,
 210                                    foreign_name.language.iso3166,
 211                                    3)
 212
 213                     # Add Roomaji too
 214                     if foreign_name.language.name == 'Japanese':
 215                         roomaji = romanize(foreign_name.name)
 216                         add(roomaji, u'Roomaji', u'jp', 8)
 217
 218         writer.commit()
 219
 220         # Construct and populate a spell-checker index.  Quicker to do it all
 221         # at once, as every call to add_* does a commit(), and those seem to be
 222         # expensive
 223         self.speller = whoosh.spelling.SpellChecker(self.index.storage)
 224         self.speller.add_scored_words(speller_entries)
 225
 226
 227     def normalize_name(self, name):
 228         """Strips irrelevant formatting junk from name input.
 229
 230         Specifically: everything is lowercased, and accents are removed.
 231         """
 232         # http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
 233         # Makes sense to me.  Decompose by Unicode rules, then remove combining
 234         # characters, then recombine.  I'm explicitly doing it this way instead
 235         # of testing combining() because Korean characters apparently
 236         # decompose!  But the results are considered letters, not combining
 237         # characters, so testing for Mn works well, and combining them again
 238         # makes them look right.
 239         nkfd_form = unicodedata.normalize('NFKD', unicode(name))
 240         name = u"".join(c for c in nkfd_form
 241                         if unicodedata.category(c) != 'Mn')
 242         name = unicodedata.normalize('NFC', name)
 243
 244         name = name.strip()
 245         name = name.lower()
 246
 247         return name
 248
 249
 250     def _apply_valid_types(self, name, valid_types):
 251         """Combines the enforced `valid_types` with any from the search string
 252         itself and updates the query.
 253
 254         For example, a name of 'a,b:foo' and valid_types of b,c will search for
 255         only `b`s named "foo".
 256
 257         Returns `(name, merged_valid_types, term)`, where `name` has had any type
 258         prefix stripped, `merged_valid_types` combines the original
 259         `valid_types` with the type prefix, and `term` is a query term for
 260         limited to just the allowed types.  If there are no type restrictions
 261         at all, `term` will be None.
 262         """
 263
 264         # Remove any type prefix (pokemon:133) first
 265         user_valid_types = []
 266         if ':' in name:
 267             prefix_chunk, name = name.split(':', 1)
 268             name = name.strip()
 269
 270             prefixes = prefix_chunk.split(',')
 271             user_valid_types = [_.strip() for _ in prefixes]
 272
 273         # Merge the valid types together.  Only types that appear in BOTH lists
 274         # may be used.
 275         # As a special case, if the user asked for types that are explicitly
 276         # forbidden, completely ignore what the user requested
 277         combined_valid_types = []
 278         if user_valid_types and valid_types:
 279             combined_valid_types = list(
 280                 set(user_valid_types) & set(combined_valid_types)
 281             )
 282
 283             if not combined_valid_types:
 284                 # No overlap!  Just use the enforced ones
 285                 combined_valid_types = valid_types
 286         else:
 287             # One list or the other was blank, so just use the one that isn't
 288             combined_valid_types = valid_types + user_valid_types
 289
 290         if not combined_valid_types:
 291             # No restrictions
 292             return name, [], None
 293
 294         # Construct the term
 295         type_terms = []
 296         final_valid_types = []
 297         for valid_type in combined_valid_types:
 298             table_name = self._parse_table_name(valid_type)
 299
 300             # Quietly ignore bogus valid_types; more likely to DTRT
 301             if table_name:
 302                 final_valid_types.append(valid_type)
 303                 type_terms.append(whoosh.query.Term(u'table', table_name))
 304
 305         return name, final_valid_types, whoosh.query.Or(type_terms)
 306
 307
 308     def _parse_table_name(self, name):
 309         """Takes a singular table name, table name, or table object and returns
 310         the table name.
 311
 312         Returns None for a bogus name.
 313         """
 314         # Table object
 315         if hasattr(name, '__tablename__'):
 316             return getattr(name, '__tablename__')
 317
 318         # Table name
 319         for table in self.indexed_tables.values():
 320             if name in (table.__tablename__, table.__singlename__):
 321                 return table.__tablename__
 322
 323         # Bogus.  Be nice and return dummy
 324         return None
 325
 326     def _whoosh_records_to_results(self, records, exact=True):
 327         """Converts a list of whoosh's indexed records to LookupResult tuples
 328         containing database objects.
 329         """
 330         # XXX this 'exact' thing is getting kinda leaky.  would like a better
 331         # way to handle it, since only lookup() cares about fuzzy results
 332         seen = {}
 333         results = []
 334         for record in records:
 335             # Skip dupes
 336             seen_key = record['table'], record['row_id']
 337             if seen_key in seen:
 338                 continue
 339             seen[seen_key] = True
 340
 341             cls = self.indexed_tables[record['table']]
 342             obj = self.session.query(cls).get(record['row_id'])
 343
 344             results.append(LookupResult(object=obj,
 345                                         indexed_name=record['name'],
 346                                         name=record['display_name'],
 347                                         language=record['language'],
 348                                         iso3166=record['iso3166'],
 349                                         exact=exact))
 350
 351         return results
 352
 353
 354     def lookup(self, input, valid_types=[], exact_only=False):
 355         """Attempts to find some sort of object, given a name.
 356
 357         Returns a list of named (object, name, language, iso3166, exact)
 358         tuples.  `object` is a database object, `name` is the name under which
 359         the object was found, `language` and `iso3166` are the name and country
 360         code of the language in which the name was found, and `exact` is True
 361         iff this was an
 362         exact match.
 363
 364         This function currently ONLY does fuzzy matching if there are no exact
 365         matches.
 366
 367         Formes are not returned unless requested; "Shaymin" will return only
 368         grass Shaymin.
 369
 370         Extraneous whitespace is removed with extreme prejudice.
 371
 372         Recognizes:
 373         - Names: "Eevee", "Surf", "Run Away", "Payapa Berry", etc.
 374         - Foreign names: "Iibui", "Eivui"
 375         - Fuzzy names in whatever language: "Evee", "Ibui"
 376         - IDs: "133", "192", "250"
 377         Also:
 378         - Type restrictions.  "type:psychic" will only return the type.  This
 379           is how to make ID lookup useful.  Multiple type specs can be entered
 380           with commas, as "move,item:1".  If `valid_types` are provided, any
 381           type prefix will be ignored.
 382         - Alternate formes can be specified merely like "wash rotom".
 383
 384         `input`
 385             Name of the thing to look for.
 386
 387         `valid_types`
 388             A list of table objects or names, e.g., `['pokemon', 'moves']`.  If
 389             this is provided, only results in one of the given tables will be
 390             returned.
 391
 392         `exact_only`
 393             If True, only exact matches are returned.  If set to False (the
 394             default), and the provided `name` doesn't match anything exactly,
 395             spelling correction will be attempted.
 396         """
 397
 398         name = self.normalize_name(input)
 399         exact = True
 400         form = None
 401
 402         # Pop off any type prefix and merge with valid_types
 403         name, merged_valid_types, type_term = \
 404             self._apply_valid_types(name, valid_types)
 405
 406         # Random lookup
 407         if name == 'random':
 408             return self.random_lookup(valid_types=merged_valid_types)
 409
 410         # Do different things depending what the query looks like
 411         # Note: Term objects do an exact match, so we don't have to worry about
 412         # a query parser tripping on weird characters in the input
 413         try:
 414             # Let Python try to convert to a number, so 0xff works
 415             name_as_number = int(name, base=0)
 416         except ValueError:
 417             # Oh well
 418             name_as_number = None
 419
 420         if '*' in name or '?' in name:
 421             exact_only = True
 422             query = whoosh.query.Wildcard(u'name', name)
 423         elif name_as_number is not None:
 424             # Don't spell-check numbers!
 425             exact_only = True
 426             query = whoosh.query.Term(u'row_id', unicode(name_as_number))
 427         else:
 428             # Not an integer
 429             query = whoosh.query.Term(u'name', name)
 430
 431         if type_term:
 432             query = query & type_term
 433
 434
 435         ### Actual searching
 436         searcher = self.index.searcher()
 437         # XXX is this kosher?  docs say search() takes a weighting arg, but it
 438         # certainly does not
 439         searcher.weighting = LanguageWeighting()
 440         results = searcher.search(query,
 441                                   limit=self.INTERMEDIATE_LOOKUP_RESULTS)
 442
 443         # Look for some fuzzy matches if necessary
 444         if not exact_only and not results:
 445             exact = False
 446             results = []
 447
 448             for suggestion in self.speller.suggest(
 449                 name, self.INTERMEDIATE_LOOKUP_RESULTS):
 450
 451                 query = whoosh.query.Term('name', suggestion)
 452                 results.extend(searcher.search(query))
 453
 454         ### Convert results to db objects
 455         objects = self._whoosh_records_to_results(results, exact=exact)
 456
 457         # Only return up to 10 matches; beyond that, something is wrong.  We
 458         # strip out duplicate entries above, so it's remotely possible that we
 459         # should have more than 10 here and lost a few.  The speller returns 25
 460         # to give us some padding, and should avoid that problem.  Not a big
 461         # deal if we lose the 25th-most-likely match anyway.
 462         return objects[:self.MAX_LOOKUP_RESULTS]
 463
 464
 465     def random_lookup(self, valid_types=[]):
 466         """Returns a random lookup result from one of the provided
 467         `valid_types`.
 468         """
 469
 470         tables = []
 471         for valid_type in valid_types:
 472             table_name = self._parse_table_name(valid_type)
 473             if table_name:
 474                 tables.append(self.indexed_tables[table_name])
 475
 476         if not tables:
 477             # n.b.: It's possible we got a list of valid_types and none of them
 478             # were valid, but this function is guaranteed to return
 479             # *something*, so it politely selects from the entire index isntead
 480             tables = self.indexed_tables.values()
 481
 482         # Rather than create an array of many hundred items and pick randomly
 483         # from it, just pick a number up to the total number of potential
 484         # items, then pick randomly from that, and partition the whole range
 485         # into chunks.  This also avoids the slight problem that the index
 486         # contains more rows (for languages) for some items than others.
 487         # XXX ought to cache this (in the index?) if possible
 488         total = 0
 489         partitions = []
 490         for table in tables:
 491             count = self.session.query(table).count()
 492             total += count
 493             partitions.append((table, count))
 494
 495         n = random.randint(1, total)
 496         while n > partitions[0][1]:
 497             n -= partitions[0][1]
 498             partitions.pop(0)
 499
 500         return self.lookup(unicode(n), valid_types=[ partitions[0][0] ])
 501
 502     def prefix_lookup(self, prefix, valid_types=[]):
 503         """Returns terms starting with the given exact prefix.
 504
 505         Type prefixes are recognized, but no other name munging is done.
 506         """
 507
 508         # Pop off any type prefix and merge with valid_types
 509         prefix, merged_valid_types, type_term = \
 510             self._apply_valid_types(prefix, valid_types)
 511
 512         query = whoosh.query.Prefix(u'name', self.normalize_name(prefix))
 513
 514         if type_term:
 515             query = query & type_term
 516
 517         searcher = self.index.searcher()
 518         searcher.weighting = LanguageWeighting()
 519         results = searcher.search(query)  # XXX , limit=self.MAX_LOOKUP_RESULTS)
 520
 521         return self._whoosh_records_to_results(results)