pokedex/lookup.py

   1 # encoding: utf8
   2 import os, os.path
   3 import random
   4 import re
   5 import shutil
   6 import unicodedata
   7
   8 from sqlalchemy.sql import func
   9 import whoosh
  10 import whoosh.filedb.filestore
  11 import whoosh.filedb.fileindex
  12 import whoosh.index
  13 from whoosh.qparser import QueryParser
  14 import whoosh.scoring
  15 import whoosh.spelling
  16
  17 from pokedex.util import namedtuple
  18
  19 from pokedex.db import connect
  20 import pokedex.db.tables as tables
  21 from pokedex.roomaji import romanize
  22 from pokedex.defaults import get_default_index_dir
  23
  24 __all__ = ['PokedexLookup']
  25
  26
  27 rx_is_number = re.compile('^\d+$')
  28
  29 LookupResult = namedtuple('LookupResult',
  30     ['object', 'indexed_name', 'name', 'language', 'iso3166', 'exact'])
  31
  32 class UninitializedIndex(object):
  33     class UninitializedIndexError(Exception):
  34         pass
  35
  36     def __nonzero__(self):
  37         """Dummy object should identify itself as False."""
  38         return False
  39
  40     def __bool__(self):
  41         """Python 3000 version of the above.  Future-proofing rules!"""
  42         return False
  43
  44     def __getattr__(self, *args, **kwargs):
  45         raise self.UninitializedIndexError(
  46             "The lookup index does not exist.  Please use `pokedex setup` "
  47             "or lookup.rebuild_index() to create it."
  48         )
  49
  50 class LanguageWeighting(whoosh.scoring.Weighting):
  51     """A scoring class that forces otherwise-equal English results to come
  52     before foreign results.
  53     """
  54
  55     def __init__(self, extra_weights={}, *args, **kwargs):
  56         """`extra_weights` may be a dictionary of weights which will be
  57         factored in.
  58
  59         Intended for use with spelling corrections, which come along with their
  60         own weightings.
  61         """
  62         self.extra_weights = extra_weights
  63         super(LanguageWeighting, self).__init__(*args, **kwargs)
  64
  65     def score(self, searcher, fieldnum, text, docnum, weight, QTF=1):
  66         doc = searcher.stored_fields(docnum)
  67
  68         # Apply extra weight
  69         weight = weight * self.extra_weights.get(text, 1.0)
  70
  71         if doc['language'] == None:
  72             # English (well, "default"); leave it at 1
  73             return weight
  74         elif doc['language'] == u'Roomaji':
  75             # Give Roomaji a little boost; it's most likely to be searched
  76             return weight * 0.9
  77         else:
  78             # Everything else can drop down the totem pole
  79             return weight * 0.8
  80
  81
  82 class PokedexLookup(object):
  83     INTERMEDIATE_LOOKUP_RESULTS = 25
  84     MAX_LOOKUP_RESULTS = 10
  85
  86     # The speller only checks how much the input matches a word; there can be
  87     # all manner of extra unmatched junk, and it won't affect the weighting.
  88     # To compensate, greatly boost the weighting of matches at the beginning
  89     # and end, so nearly-full-word-matches are much better
  90     SPELLER_OPTIONS = dict(booststart=10.0, boostend=9.0)
  91
  92     # Dictionary of table name => table class.
  93     # Need the table name so we can get the class from the table name after we
  94     # retrieve something from the index
  95     indexed_tables = dict(
  96         (cls.__tablename__, cls)
  97         for cls in (
  98             tables.Ability,
  99             tables.Item,
 100             tables.Location,
 101             tables.Move,
 102             tables.Nature,
 103             tables.Pokemon,
 104             tables.Type,
 105         )
 106     )
 107
 108
 109     def __init__(self, directory=None, session=None):
 110         """Opens the whoosh index stored in the named directory.  If the index
 111         doesn't already exist, it will be created.
 112
 113         `directory`
 114             Directory containing the index.  Defaults to a location within the
 115             `pokedex` egg directory.
 116
 117         `session`
 118             Used for creating the index and retrieving objects.  Defaults to an
 119             attempt to connect to the default SQLite database installed by
 120             `pokedex setup`.
 121         """
 122
 123         # By the time this returns, self.index, self.speller, and self.session
 124         # must be set
 125
 126         # If a directory was not given, use the default
 127         if directory is None:
 128             directory = get_default_index_dir()
 129
 130         self.directory = directory
 131
 132         if session:
 133             self.session = session
 134         else:
 135             self.session = connect()
 136
 137         # Attempt to open or create the index
 138         if not os.path.exists(directory) or not os.listdir(directory):
 139             # Directory doesn't exist OR is empty; caller needs to use
 140             # rebuild_index before doing anything.  Provide a dummy object that
 141             # complains when used
 142             self.index = UninitializedIndex()
 143             self.speller = UninitializedIndex()
 144             return
 145
 146         # Otherwise, already exists; should be an index!  Bam, done.
 147         # Note that this will explode if the directory exists but doesn't
 148         # contain an index; that's a feature
 149         try:
 150             self.index = whoosh.index.open_dir(directory, indexname='MAIN')
 151         except whoosh.index.EmptyIndexError:
 152             raise IOError(
 153                 "The index directory already contains files.  "
 154                 "Please use a dedicated directory for the lookup index."
 155             )
 156
 157         # Create speller, and done
 158         spell_store = whoosh.filedb.filestore.FileStorage(directory)
 159         self.speller = whoosh.spelling.SpellChecker(spell_store,
 160             **self.SPELLER_OPTIONS)
 161
 162
 163     def rebuild_index(self):
 164         """Creates the index from scratch."""
 165
 166         schema = whoosh.fields.Schema(
 167             name=whoosh.fields.ID(stored=True),
 168             table=whoosh.fields.ID(stored=True),
 169             row_id=whoosh.fields.ID(stored=True),
 170             language=whoosh.fields.STORED,
 171             iso3166=whoosh.fields.STORED,
 172             display_name=whoosh.fields.STORED,  # non-lowercased name
 173         )
 174
 175         if not os.path.exists(self.directory):
 176             os.mkdir(self.directory)
 177
 178         self.index = whoosh.index.create_in(self.directory, schema=schema,
 179                                                             indexname='MAIN')
 180         writer = self.index.writer()
 181
 182         # Index every name in all our tables of interest
 183         speller_entries = set()
 184         for cls in self.indexed_tables.values():
 185             q = self.session.query(cls)
 186
 187             for row in q.yield_per(5):
 188                 row_key = dict(table=unicode(cls.__tablename__),
 189                                row_id=unicode(row.id))
 190
 191                 def add(name, language, iso3166):
 192                     normalized_name = self.normalize_name(name)
 193
 194                     writer.add_document(
 195                         name=normalized_name, display_name=name,
 196                         language=language, iso3166=iso3166,
 197                         **row_key
 198                     )
 199
 200                     speller_entries.add(normalized_name)
 201
 202
 203                 # Add the basic English name to the index
 204                 if cls == tables.Pokemon:
 205                     # Pokémon need their form name added
 206                     # XXX kinda kludgy
 207                     add(row.full_name, None, u'us')
 208
 209                     # If this is a default form, ALSO add the unadorned name,
 210                     # so 'Deoxys' alone will still do the right thing
 211                     if row.forme_name and not row.forme_base_pokemon_id:
 212                         add(row.name, None, u'us')
 213                 else:
 214                     add(row.name, None, u'us')
 215
 216                 # Some things also have other languages' names
 217                 # XXX other language form names..?
 218                 for foreign_name in getattr(row, 'foreign_names', []):
 219                     moonspeak = foreign_name.name
 220                     if row.name == moonspeak:
 221                         # Don't add the English name again as a different
 222                         # language; no point and it makes spell results
 223                         # confusing
 224                         continue
 225
 226                     add(moonspeak, foreign_name.language.name,
 227                                    foreign_name.language.iso3166)
 228
 229                     # Add Roomaji too
 230                     if foreign_name.language.name == 'Japanese':
 231                         roomaji = romanize(foreign_name.name)
 232                         add(roomaji, u'Roomaji', u'jp')
 233
 234         writer.commit()
 235
 236         # Construct and populate a spell-checker index.  Quicker to do it all
 237         # at once, as every call to add_* does a commit(), and those seem to be
 238         # expensive
 239         self.speller = whoosh.spelling.SpellChecker(self.index.storage, mingram=2,
 240             **self.SPELLER_OPTIONS)
 241         self.speller.add_words(speller_entries)
 242
 243
 244     def normalize_name(self, name):
 245         """Strips irrelevant formatting junk from name input.
 246
 247         Specifically: everything is lowercased, and accents are removed.
 248         """
 249         # http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
 250         # Makes sense to me.  Decompose by Unicode rules, then remove combining
 251         # characters, then recombine.  I'm explicitly doing it this way instead
 252         # of testing combining() because Korean characters apparently
 253         # decompose!  But the results are considered letters, not combining
 254         # characters, so testing for Mn works well, and combining them again
 255         # makes them look right.
 256         nkfd_form = unicodedata.normalize('NFKD', unicode(name))
 257         name = u"".join(c for c in nkfd_form
 258                         if unicodedata.category(c) != 'Mn')
 259         name = unicodedata.normalize('NFC', name)
 260
 261         name = name.strip()
 262         name = name.lower()
 263
 264         return name
 265
 266
 267     def _apply_valid_types(self, name, valid_types):
 268         """Combines the enforced `valid_types` with any from the search string
 269         itself and updates the query.
 270
 271         For example, a name of 'a,b:foo' and valid_types of b,c will search for
 272         only `b`s named "foo".
 273
 274         Returns `(name, merged_valid_types, term)`, where `name` has had any type
 275         prefix stripped, `merged_valid_types` combines the original
 276         `valid_types` with the type prefix, and `term` is a query term for
 277         limited to just the allowed types.  If there are no type restrictions
 278         at all, `term` will be None.
 279         """
 280
 281         # Remove any type prefix (pokemon:133) first
 282         user_valid_types = []
 283         if ':' in name:
 284             prefix_chunk, name = name.split(':', 1)
 285             name = name.strip()
 286
 287             prefixes = prefix_chunk.split(',')
 288             user_valid_types = [_.strip() for _ in prefixes]
 289
 290         # Merge the valid types together.  Only types that appear in BOTH lists
 291         # may be used.
 292         # As a special case, if the user asked for types that are explicitly
 293         # forbidden, completely ignore what the user requested
 294         combined_valid_types = []
 295         if user_valid_types and valid_types:
 296             combined_valid_types = list(
 297                 set(user_valid_types) & set(combined_valid_types)
 298             )
 299
 300             if not combined_valid_types:
 301                 # No overlap!  Just use the enforced ones
 302                 combined_valid_types = valid_types
 303         else:
 304             # One list or the other was blank, so just use the one that isn't
 305             combined_valid_types = valid_types + user_valid_types
 306
 307         if not combined_valid_types:
 308             # No restrictions
 309             return name, [], None
 310
 311         # Construct the term
 312         type_terms = []
 313         final_valid_types = []
 314         for valid_type in combined_valid_types:
 315             table_name = self._parse_table_name(valid_type)
 316
 317             # Quietly ignore bogus valid_types; more likely to DTRT
 318             if table_name:
 319                 final_valid_types.append(valid_type)
 320                 type_terms.append(whoosh.query.Term(u'table', table_name))
 321
 322         return name, final_valid_types, whoosh.query.Or(type_terms)
 323
 324
 325     def _parse_table_name(self, name):
 326         """Takes a singular table name, table name, or table object and returns
 327         the table name.
 328
 329         Returns None for a bogus name.
 330         """
 331         # Table object
 332         if hasattr(name, '__tablename__'):
 333             return getattr(name, '__tablename__')
 334
 335         # Table name
 336         for table in self.indexed_tables.values():
 337             if name in (table.__tablename__, table.__singlename__):
 338                 return table.__tablename__
 339
 340         # Bogus.  Be nice and return dummy
 341         return None
 342
 343     def _whoosh_records_to_results(self, records, exact=True):
 344         """Converts a list of whoosh's indexed records to LookupResult tuples
 345         containing database objects.
 346         """
 347         # XXX this 'exact' thing is getting kinda leaky.  would like a better
 348         # way to handle it, since only lookup() cares about fuzzy results
 349         seen = {}
 350         results = []
 351         for record in records:
 352             # Skip dupes
 353             seen_key = record['table'], record['row_id']
 354             if seen_key in seen:
 355                 continue
 356             seen[seen_key] = True
 357
 358             cls = self.indexed_tables[record['table']]
 359             obj = self.session.query(cls).get(record['row_id'])
 360
 361             results.append(LookupResult(object=obj,
 362                                         indexed_name=record['name'],
 363                                         name=record['display_name'],
 364                                         language=record['language'],
 365                                         iso3166=record['iso3166'],
 366                                         exact=exact))
 367
 368         return results
 369
 370
 371     def lookup(self, input, valid_types=[], exact_only=False):
 372         """Attempts to find some sort of object, given a name.
 373
 374         Returns a list of named (object, name, language, iso3166, exact)
 375         tuples.  `object` is a database object, `name` is the name under which
 376         the object was found, `language` and `iso3166` are the name and country
 377         code of the language in which the name was found, and `exact` is True
 378         iff this was an
 379         exact match.
 380
 381         This function currently ONLY does fuzzy matching if there are no exact
 382         matches.
 383
 384         Formes are not returned unless requested; "Shaymin" will return only
 385         grass Shaymin.
 386
 387         Extraneous whitespace is removed with extreme prejudice.
 388
 389         Recognizes:
 390         - Names: "Eevee", "Surf", "Run Away", "Payapa Berry", etc.
 391         - Foreign names: "Iibui", "Eivui"
 392         - Fuzzy names in whatever language: "Evee", "Ibui"
 393         - IDs: "133", "192", "250"
 394         Also:
 395         - Type restrictions.  "type:psychic" will only return the type.  This
 396           is how to make ID lookup useful.  Multiple type specs can be entered
 397           with commas, as "move,item:1".  If `valid_types` are provided, any
 398           type prefix will be ignored.
 399         - Alternate formes can be specified merely like "wash rotom".
 400
 401         `input`
 402             Name of the thing to look for.
 403
 404         `valid_types`
 405             A list of table objects or names, e.g., `['pokemon', 'moves']`.  If
 406             this is provided, only results in one of the given tables will be
 407             returned.
 408
 409         `exact_only`
 410             If True, only exact matches are returned.  If set to False (the
 411             default), and the provided `name` doesn't match anything exactly,
 412             spelling correction will be attempted.
 413         """
 414
 415         name = self.normalize_name(input)
 416         exact = True
 417         form = None
 418
 419         # Pop off any type prefix and merge with valid_types
 420         name, merged_valid_types, type_term = \
 421             self._apply_valid_types(name, valid_types)
 422
 423         # Random lookup
 424         if name == 'random':
 425             return self.random_lookup(valid_types=merged_valid_types)
 426
 427         # Do different things depending what the query looks like
 428         # Note: Term objects do an exact match, so we don't have to worry about
 429         # a query parser tripping on weird characters in the input
 430         try:
 431             # Let Python try to convert to a number, so 0xff works
 432             name_as_number = int(name, base=0)
 433         except ValueError:
 434             # Oh well
 435             name_as_number = None
 436
 437         if '*' in name or '?' in name:
 438             exact_only = True
 439             query = whoosh.query.Wildcard(u'name', name)
 440         elif name_as_number is not None:
 441             # Don't spell-check numbers!
 442             exact_only = True
 443             query = whoosh.query.Term(u'row_id', unicode(name_as_number))
 444         else:
 445             # Not an integer
 446             query = whoosh.query.Term(u'name', name)
 447
 448         if type_term:
 449             query = query & type_term
 450
 451
 452         ### Actual searching
 453         searcher = self.index.searcher()
 454         # XXX is this kosher?  docs say search() takes a weighting arg, but it
 455         # certainly does not
 456         searcher.weighting = LanguageWeighting()
 457         results = searcher.search(query,
 458                                   limit=self.INTERMEDIATE_LOOKUP_RESULTS)
 459
 460         # Look for some fuzzy matches if necessary
 461         if not exact_only and not results:
 462             exact = False
 463             results = []
 464
 465             fuzzy_query_parts = []
 466             fuzzy_weights = {}
 467             min_weight = [None]
 468             for suggestion, _, weight in self.speller.suggestions_and_scores(name):
 469                 # Only allow the top 50% of scores; otherwise there will always
 470                 # be a lot of trailing junk
 471                 if min_weight[0] is None:
 472                     min_weight[0] = weight * 0.5
 473                 elif weight < min_weight[0]:
 474                     break
 475
 476                 fuzzy_query_parts.append(whoosh.query.Term('name', suggestion))
 477                 fuzzy_weights[suggestion] = weight
 478
 479             if not fuzzy_query_parts:
 480                 # Nothing at all; don't try querying
 481                 return []
 482
 483             fuzzy_query = whoosh.query.Or(fuzzy_query_parts)
 484             if type_term:
 485                 fuzzy_query = fuzzy_query & type_term
 486
 487             searcher.weighting = LanguageWeighting(extra_weights=fuzzy_weights)
 488             results = searcher.search(fuzzy_query)
 489
 490         ### Convert results to db objects
 491         objects = self._whoosh_records_to_results(results, exact=exact)
 492
 493         # Only return up to 10 matches; beyond that, something is wrong.  We
 494         # strip out duplicate entries above, so it's remotely possible that we
 495         # should have more than 10 here and lost a few.  The speller returns 25
 496         # to give us some padding, and should avoid that problem.  Not a big
 497         # deal if we lose the 25th-most-likely match anyway.
 498         return objects[:self.MAX_LOOKUP_RESULTS]
 499
 500
 501     def random_lookup(self, valid_types=[]):
 502         """Returns a random lookup result from one of the provided
 503         `valid_types`.
 504         """
 505
 506         tables = []
 507         for valid_type in valid_types:
 508             table_name = self._parse_table_name(valid_type)
 509             if table_name:
 510                 tables.append(self.indexed_tables[table_name])
 511
 512         if not tables:
 513             # n.b.: It's possible we got a list of valid_types and none of them
 514             # were valid, but this function is guaranteed to return
 515             # *something*, so it politely selects from the entire index isntead
 516             tables = self.indexed_tables.values()
 517
 518         # Rather than create an array of many hundred items and pick randomly
 519         # from it, just pick a number up to the total number of potential
 520         # items, then pick randomly from that, and partition the whole range
 521         # into chunks.  This also avoids the slight problem that the index
 522         # contains more rows (for languages) for some items than others.
 523         # XXX ought to cache this (in the index?) if possible
 524         total = 0
 525         partitions = []
 526         for table in tables:
 527             count = self.session.query(table).count()
 528             total += count
 529             partitions.append((table, count))
 530
 531         n = random.randint(1, total)
 532         while n > partitions[0][1]:
 533             n -= partitions[0][1]
 534             partitions.pop(0)
 535
 536         return self.lookup(unicode(n), valid_types=[ partitions[0][0] ])
 537
 538     def prefix_lookup(self, prefix, valid_types=[]):
 539         """Returns terms starting with the given exact prefix.
 540
 541         Type prefixes are recognized, but no other name munging is done.
 542         """
 543
 544         # Pop off any type prefix and merge with valid_types
 545         prefix, merged_valid_types, type_term = \
 546             self._apply_valid_types(prefix, valid_types)
 547
 548         query = whoosh.query.Prefix(u'name', self.normalize_name(prefix))
 549
 550         if type_term:
 551             query = query & type_term
 552
 553         searcher = self.index.searcher()
 554         searcher.weighting = LanguageWeighting()
 555         results = searcher.search(query)  # XXX , limit=self.MAX_LOOKUP_RESULTS)
 556
 557         return self._whoosh_records_to_results(results)