pokedex/lookup.py

   1 # encoding: utf8
   2 import os, os.path
   3 import random
   4 import re
   5 import shutil
   6 import unicodedata
   7
   8 from sqlalchemy.sql import func
   9 import whoosh
  10 import whoosh.filedb.filestore
  11 import whoosh.filedb.fileindex
  12 import whoosh.index
  13 from whoosh.qparser import QueryParser
  14 import whoosh.scoring
  15 import whoosh.spelling
  16
  17 from pokedex.util import namedtuple
  18
  19 from pokedex.db import connect
  20 import pokedex.db.tables as tables
  21 from pokedex.roomaji import romanize
  22 from pokedex.defaults import get_default_index_dir
  23
# Names exported by a star-import of this module: only the lookup class.
__all__ = ['PokedexLookup']
  25
  26
  27 rx_is_number = re.compile('^\d+$')
  28
  29 LookupResult = namedtuple('LookupResult', [
  30     'object', 'indexed_name', 'name', 'language', 'iso639', 'iso3166', 'exact',
  31 ])
  32
  33 class UninitializedIndex(object):
  34     class UninitializedIndexError(Exception):
  35         pass
  36
  37     def __nonzero__(self):
  38         """Dummy object should identify itself as False."""
  39         return False
  40
  41     def __bool__(self):
  42         """Python 3000 version of the above.  Future-proofing rules!"""
  43         return False
  44
  45     def __getattr__(self, *args, **kwargs):
  46         raise self.UninitializedIndexError(
  47             "The lookup index does not exist.  Please use `pokedex setup` "
  48             "or lookup.rebuild_index() to create it."
  49         )
  50
  51 class LanguageWeighting(whoosh.scoring.Weighting):
  52     """A scoring class that forces otherwise-equal English results to come
  53     before foreign results.
  54     """
  55
  56     def __init__(self, extra_weights={}, *args, **kwargs):
  57         """`extra_weights` may be a dictionary of weights which will be
  58         factored in.
  59
  60         Intended for use with spelling corrections, which come along with their
  61         own weightings.
  62         """
  63         self.extra_weights = extra_weights
  64         super(LanguageWeighting, self).__init__(*args, **kwargs)
  65
  66     def score(self, searcher, fieldnum, text, docnum, weight, QTF=1):
  67         doc = searcher.stored_fields(docnum)
  68
  69         # Apply extra weight
  70         weight = weight * self.extra_weights.get(text, 1.0)
  71
  72         language = doc.get('language')
  73         if language is None:
  74             # English (well, "default"); leave it at 1
  75             return weight
  76         elif language == u'Roomaji':
  77             # Give Roomaji a little boost; it's most likely to be searched
  78             return weight * 0.9
  79         else:
  80             # Everything else can drop down the totem pole
  81             return weight * 0.8
  82
  83
  84 class PokedexLookup(object):
  85     MAX_FUZZY_RESULTS = 10
  86     MAX_EXACT_RESULTS = 43
  87     INTERMEDIATE_FACTOR = 2
  88
  89     # The speller only checks how much the input matches a word; there can be
  90     # all manner of extra unmatched junk, and it won't affect the weighting.
  91     # To compensate, greatly boost the weighting of matches at the beginning
  92     # and end, so nearly-full-word-matches are much better
  93     SPELLER_OPTIONS = dict(booststart=10.0, boostend=9.0)
  94
  95     # Dictionary of table name => table class.
  96     # Need the table name so we can get the class from the table name after we
  97     # retrieve something from the index
  98     indexed_tables = dict(
  99         (cls.__tablename__, cls)
 100         for cls in (
 101             tables.Ability,
 102             tables.Item,
 103             tables.Location,
 104             tables.Move,
 105             tables.Nature,
 106             tables.Pokemon,
 107             tables.PokemonForm,
 108             tables.Type,
 109         )
 110     )
 111
 112
 113     def __init__(self, directory=None, session=None):
 114         """Opens the whoosh index stored in the named directory.  If the index
 115         doesn't already exist, it will be created.
 116
 117         `directory`
 118             Directory containing the index.  Defaults to a location within the
 119             `pokedex` egg directory.
 120
 121         `session`
 122             Used for creating the index and retrieving objects.  Defaults to an
 123             attempt to connect to the default SQLite database installed by
 124             `pokedex setup`.
 125         """
 126
 127         # By the time this returns, self.index, self.speller, and self.session
 128         # must be set
 129
 130         # If a directory was not given, use the default
 131         if directory is None:
 132             directory = get_default_index_dir()
 133
 134         self.directory = directory
 135
 136         if session:
 137             self.session = session
 138         else:
 139             self.session = connect()
 140
 141         # Attempt to open or create the index
 142         if not os.path.exists(directory) or not os.listdir(directory):
 143             # Directory doesn't exist OR is empty; caller needs to use
 144             # rebuild_index before doing anything.  Provide a dummy object that
 145             # complains when used
 146             self.index = UninitializedIndex()
 147             self.speller = UninitializedIndex()
 148             return
 149
 150         # Otherwise, already exists; should be an index!  Bam, done.
 151         # Note that this will explode if the directory exists but doesn't
 152         # contain an index; that's a feature
 153         try:
 154             self.index = whoosh.index.open_dir(directory, indexname='MAIN')
 155         except whoosh.index.EmptyIndexError:
 156             raise IOError(
 157                 "The index directory already contains files.  "
 158                 "Please use a dedicated directory for the lookup index."
 159             )
 160
 161         # Create speller, and done
 162         spell_store = whoosh.filedb.filestore.FileStorage(directory)
 163         self.speller = whoosh.spelling.SpellChecker(spell_store,
 164             **self.SPELLER_OPTIONS)
 165
 166
 167     def rebuild_index(self):
 168         """Creates the index from scratch."""
 169
 170         schema = whoosh.fields.Schema(
 171             name=whoosh.fields.ID(stored=True),
 172             table=whoosh.fields.ID(stored=True),
 173             row_id=whoosh.fields.ID(stored=True),
 174             language=whoosh.fields.STORED,
 175             iso639=whoosh.fields.ID(stored=True),
 176             iso3166=whoosh.fields.ID(stored=True),
 177             display_name=whoosh.fields.STORED,  # non-lowercased name
 178         )
 179
 180         if os.path.exists(self.directory):
 181             # create_in() isn't totally reliable, so just nuke whatever's there
 182             # manually.  Try to be careful about this...
 183             for f in os.listdir(self.directory):
 184                 if re.match('^_?(MAIN|SPELL)_', f):
 185                     os.remove(os.path.join(self.directory, f))
 186         else:
 187             os.mkdir(self.directory)
 188
 189         self.index = whoosh.index.create_in(self.directory, schema=schema,
 190                                                             indexname='MAIN')
 191         writer = self.index.writer()
 192
 193         # Index every name in all our tables of interest
 194         speller_entries = set()
 195         for cls in self.indexed_tables.values():
 196             q = self.session.query(cls)
 197
 198             for row in q.yield_per(5):
 199                 row_key = dict(table=unicode(cls.__tablename__),
 200                                row_id=unicode(row.id))
 201
 202                 def add(name, language, iso639, iso3166):
 203                     normalized_name = self.normalize_name(name)
 204
 205                     writer.add_document(
 206                         name=normalized_name, display_name=name,
 207                         language=language, iso639=iso639, iso3166=iso3166,
 208                         **row_key
 209                     )
 210
 211                     speller_entries.add(normalized_name)
 212
 213
 214                 # Add the basic English name to the index
 215                 if cls == tables.Pokemon:
 216                     # Don't re-add alternate forms of the same Pokémon; they'll
 217                     # be added as Pokémon forms instead
 218                     if not row.is_base_form:
 219                         continue
 220                 elif cls == tables.PokemonForm:
 221                     if row.name:
 222                         add(row.pokemon_name, None, u'en', u'us')
 223                     continue
 224
 225                 add(row.name, None, u'en', u'us')
 226
 227                 # Some things also have other languages' names
 228                 # XXX other language form names..?
 229                 for foreign_name in getattr(row, 'foreign_names', []):
 230                     moonspeak = foreign_name.name
 231                     if row.name == moonspeak:
 232                         # Don't add the English name again as a different
 233                         # language; no point and it makes spell results
 234                         # confusing
 235                         continue
 236
 237                     add(moonspeak, foreign_name.language.name,
 238                                    foreign_name.language.iso639,
 239                                    foreign_name.language.iso3166)
 240
 241                     # Add Roomaji too
 242                     if foreign_name.language.name == 'Japanese':
 243                         roomaji = romanize(foreign_name.name)
 244                         add(roomaji, u'Roomaji', u'ja', u'jp')
 245
 246         writer.commit()
 247
 248         # Construct and populate a spell-checker index.  Quicker to do it all
 249         # at once, as every call to add_* does a commit(), and those seem to be
 250         # expensive
 251         self.speller = whoosh.spelling.SpellChecker(self.index.storage, mingram=2,
 252             **self.SPELLER_OPTIONS)
 253         self.speller.add_words(speller_entries)
 254
 255
 256     def normalize_name(self, name):
 257         """Strips irrelevant formatting junk from name input.
 258
 259         Specifically: everything is lowercased, and accents are removed.
 260         """
 261         # http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
 262         # Makes sense to me.  Decompose by Unicode rules, then remove combining
 263         # characters, then recombine.  I'm explicitly doing it this way instead
 264         # of testing combining() because Korean characters apparently
 265         # decompose!  But the results are considered letters, not combining
 266         # characters, so testing for Mn works well, and combining them again
 267         # makes them look right.
 268         nkfd_form = unicodedata.normalize('NFKD', unicode(name))
 269         name = u"".join(c for c in nkfd_form
 270                         if unicodedata.category(c) != 'Mn')
 271         name = unicodedata.normalize('NFC', name)
 272
 273         name = name.strip()
 274         name = name.lower()
 275
 276         return name
 277
 278
 279     def _apply_valid_types(self, name, valid_types):
 280         """Combines the enforced `valid_types` with any from the search string
 281         itself and updates the query.
 282
 283         For example, a name of 'a,b:foo' and valid_types of b,c will search for
 284         only `b`s named "foo".
 285
 286         Returns `(name, merged_valid_types, term)`, where `name` has had any type
 287         prefix stripped, `merged_valid_types` combines the original
 288         `valid_types` with the type prefix, and `term` is a query term for
 289         limited to just the allowed types.  If there are no type restrictions
 290         at all, `term` will be None.
 291         """
 292
 293         # Remove any type prefix (pokemon:133) first
 294         user_valid_types = []
 295         if ':' in name:
 296             prefix_chunk, name = name.split(':', 1)
 297             name = name.strip()
 298
 299             prefixes = prefix_chunk.split(',')
 300             user_valid_types = []
 301             for prefix in prefixes:
 302                 prefix = prefix.strip()
 303                 if prefix:
 304                     user_valid_types.append(prefix)
 305
 306         # Merge the valid types together.  Only types that appear in BOTH lists
 307         # may be used.
 308         # As a special case, if the user asked for types that are explicitly
 309         # forbidden, completely ignore what the user requested.
 310         # And, just to complicate matters: "type" and language need to be
 311         # considered separately.
 312         def merge_requirements(func):
 313             user = filter(func, user_valid_types)
 314             system = filter(func, valid_types)
 315
 316             if user and system:
 317                 merged = list(set(user) & set(system))
 318                 if merged:
 319                     return merged
 320                 else:
 321                     # No overlap; use the system restrictions
 322                     return system
 323             else:
 324                 # One or the other is blank; use the one that's not
 325                 return user or system
 326
 327         # @foo means language must be foo; otherwise it's a table name
 328         lang_requirements = merge_requirements(lambda req: req[0] == u'@')
 329         type_requirements = merge_requirements(lambda req: req[0] != u'@')
 330         all_requirements = lang_requirements + type_requirements
 331
 332         # Construct the term
 333         lang_terms = []
 334         for lang in lang_requirements:
 335             # Allow for either country or language codes
 336             lang_code = lang[1:]
 337             lang_terms.append(whoosh.query.Term(u'iso639', lang_code))
 338             lang_terms.append(whoosh.query.Term(u'iso3166', lang_code))
 339
 340         type_terms = []
 341         for type in type_requirements:
 342             table_name = self._parse_table_name(type)
 343
 344             # Quietly ignore bogus valid_types; more likely to DTRT
 345             if table_name:
 346                 type_terms.append(whoosh.query.Term(u'table', table_name))
 347
 348         # Combine both kinds of restriction
 349         all_terms = []
 350         if type_terms:
 351             all_terms.append(whoosh.query.Or(type_terms))
 352         if lang_terms:
 353             all_terms.append(whoosh.query.Or(lang_terms))
 354
 355         return name, all_requirements, whoosh.query.And(all_terms)
 356
 357
 358     def _parse_table_name(self, name):
 359         """Takes a singular table name, table name, or table object and returns
 360         the table name.
 361
 362         Returns None for a bogus name.
 363         """
 364         # Table object
 365         if hasattr(name, '__tablename__'):
 366             return getattr(name, '__tablename__')
 367
 368         # Table name
 369         for table in self.indexed_tables.values():
 370             if name in (table.__tablename__, table.__singlename__):
 371                 return table.__tablename__
 372
 373         # Bogus.  Be nice and return dummy
 374         return None
 375
 376     def _whoosh_records_to_results(self, records, exact=True):
 377         """Converts a list of whoosh's indexed records to LookupResult tuples
 378         containing database objects.
 379         """
 380         # XXX this 'exact' thing is getting kinda leaky.  would like a better
 381         # way to handle it, since only lookup() cares about fuzzy results
 382         seen = {}
 383         results = []
 384         for record in records:
 385             # Skip dupes
 386             seen_key = record['table'], record['row_id']
 387             if seen_key in seen:
 388                 continue
 389             seen[seen_key] = True
 390
 391             cls = self.indexed_tables[record['table']]
 392             obj = self.session.query(cls).get(record['row_id'])
 393
 394             results.append(LookupResult(object=obj,
 395                                         indexed_name=record['name'],
 396                                         name=record['display_name'],
 397                                         language=record.get('language'),
 398                                         iso639=record['iso639'],
 399                                         iso3166=record['iso3166'],
 400                                         exact=exact))
 401
 402         return results
 403
 404
 405     def lookup(self, input, valid_types=[], exact_only=False):
 406         """Attempts to find some sort of object, given a name.
 407
 408         Returns a list of named (object, name, language, iso639, iso3166,
 409         exact) tuples.  `object` is a database object, `name` is the name under
 410         which the object was found, `language` and the two isos are the name
 411         and country codes of the language in which the name was found, and
 412         `exact` is True iff this was an exact match.
 413
 414         This function currently ONLY does fuzzy matching if there are no exact
 415         matches.
 416
 417         Formes are not returned unless requested; "Shaymin" will return only
 418         grass Shaymin.
 419
 420         Extraneous whitespace is removed with extreme prejudice.
 421
 422         Recognizes:
 423         - Names: "Eevee", "Surf", "Run Away", "Payapa Berry", etc.
 424         - Foreign names: "Iibui", "Eivui"
 425         - Fuzzy names in whatever language: "Evee", "Ibui"
 426         - IDs: "133", "192", "250"
 427         Also:
 428         - Type restrictions.  "type:psychic" will only return the type.  This
 429           is how to make ID lookup useful.  Multiple type specs can be entered
 430           with commas, as "move,item:1".
 431         - Language restrictions.  "@fr:charge" will only return Tackle, which
 432           is called "Charge" in French.  These can be combined with type
 433           restrictions, e.g., "@fr,move:charge".
 434         - Alternate formes can be specified merely like "wash rotom".
 435
 436         `input`
 437             Name of the thing to look for.
 438
 439         `valid_types`
 440             A list of type or language restrictions, e.g., `['pokemon',
 441             '@ja']`.  If this is provided, only results in one of the given
 442             tables will be returned.
 443
 444         `exact_only`
 445             If True, only exact matches are returned.  If set to False (the
 446             default), and the provided `name` doesn't match anything exactly,
 447             spelling correction will be attempted.
 448         """
 449
 450         name = self.normalize_name(input)
 451         exact = True
 452         form = None
 453
 454         # Pop off any type prefix and merge with valid_types
 455         name, merged_valid_types, type_term = \
 456             self._apply_valid_types(name, valid_types)
 457
 458         # Random lookup
 459         if name == 'random':
 460             return self.random_lookup(valid_types=merged_valid_types)
 461
 462         # Do different things depending what the query looks like
 463         # Note: Term objects do an exact match, so we don't have to worry about
 464         # a query parser tripping on weird characters in the input
 465         try:
 466             # Let Python try to convert to a number, so 0xff works
 467             name_as_number = int(name, base=0)
 468         except ValueError:
 469             # Oh well
 470             name_as_number = None
 471
 472         if '*' in name or '?' in name:
 473             exact_only = True
 474             query = whoosh.query.Wildcard(u'name', name)
 475         elif name_as_number is not None:
 476             # Don't spell-check numbers!
 477             exact_only = True
 478             query = whoosh.query.Term(u'row_id', unicode(name_as_number))
 479         else:
 480             # Not an integer
 481             query = whoosh.query.Term(u'name', name)
 482
 483         if type_term:
 484             query = query & type_term
 485
 486
 487         ### Actual searching
 488         # Limits; result limits are constants, and intermediate results (before
 489         # duplicate items are stripped out) are capped at the result limit
 490         # times another constant.
 491         # Fuzzy are capped at 10, beyond which something is probably very
 492         # wrong.  Exact matches -- that is, wildcards and ids -- are far less
 493         # constrained.
 494         # Also, exact matches are sorted by name, since weight doesn't matter.
 495         sort_by = dict()
 496         if exact_only:
 497             max_results = self.MAX_EXACT_RESULTS
 498             sort_by['sortedby'] = (u'table', u'name')
 499         else:
 500             max_results = self.MAX_FUZZY_RESULTS
 501
 502         searcher = self.index.searcher(weighting=LanguageWeighting())
 503         results = searcher.search(
 504             query,
 505             limit=int(max_results * self.INTERMEDIATE_FACTOR),
 506             **sort_by
 507         )
 508
 509         # Look for some fuzzy matches if necessary
 510         if not exact_only and not results:
 511             exact = False
 512             results = []
 513
 514             fuzzy_query_parts = []
 515             fuzzy_weights = {}
 516             min_weight = [None]
 517             for suggestion, _, weight in self.speller.suggestions_and_scores(name):
 518                 # Only allow the top 50% of scores; otherwise there will always
 519                 # be a lot of trailing junk
 520                 if min_weight[0] is None:
 521                     min_weight[0] = weight * 0.5
 522                 elif weight < min_weight[0]:
 523                     break
 524
 525                 fuzzy_query_parts.append(whoosh.query.Term('name', suggestion))
 526                 fuzzy_weights[suggestion] = weight
 527
 528             if not fuzzy_query_parts:
 529                 # Nothing at all; don't try querying
 530                 return []
 531
 532             fuzzy_query = whoosh.query.Or(fuzzy_query_parts)
 533             if type_term:
 534                 fuzzy_query = fuzzy_query & type_term
 535
 536             searcher.weighting = LanguageWeighting(extra_weights=fuzzy_weights)
 537             results = searcher.search(fuzzy_query)
 538
 539         ### Convert results to db objects
 540         objects = self._whoosh_records_to_results(results, exact=exact)
 541
 542         # Truncate and return
 543         return objects[:max_results]
 544
 545
 546     def random_lookup(self, valid_types=[]):
 547         """Returns a random lookup result from one of the provided
 548         `valid_types`.
 549         """
 550
 551         table_names = []
 552         for valid_type in valid_types:
 553             table_name = self._parse_table_name(valid_type)
 554             # Skip anything not recognized.  Could be, say, a language code.
 555             # XXX The vast majority of Pokémon forms are unnamed and unindexed,
 556             #     which can produce blank results.  So skip them too for now.
 557             if table_name and table_name != 'pokemon_forms':
 558                 table_names.append(table_name)
 559
 560         if not table_names:
 561             # n.b.: It's possible we got a list of valid_types and none of them
 562             # were valid, but this function is guaranteed to return
 563             # *something*, so it politely selects from the entire index instead
 564             table_names = self.indexed_tables.keys()
 565             table_names.remove('pokemon_forms')
 566
 567         # Rather than create an array of many hundred items and pick randomly
 568         # from it, just pick a number up to the total number of potential
 569         # items, then pick randomly from that, and partition the whole range
 570         # into chunks.  This also avoids the slight problem that the index
 571         # contains more rows (for languages) for some items than others.
 572         # XXX ought to cache this (in the index?) if possible
 573         total = 0
 574         partitions = []
 575         for table_name in table_names:
 576             count = self.session.query(self.indexed_tables[table_name]).count()
 577             total += count
 578             partitions.append((table_name, count))
 579
 580         n = random.randint(1, total)
 581         while n > partitions[0][1]:
 582             n -= partitions[0][1]
 583             partitions.pop(0)
 584
 585         return self.lookup(unicode(n), valid_types=[ partitions[0][0] ])
 586
 587     def prefix_lookup(self, prefix, valid_types=[]):
 588         """Returns terms starting with the given exact prefix.
 589
 590         Type prefixes are recognized, but no other name munging is done.
 591         """
 592
 593         # Pop off any type prefix and merge with valid_types
 594         prefix, merged_valid_types, type_term = \
 595             self._apply_valid_types(prefix, valid_types)
 596
 597         query = whoosh.query.Prefix(u'name', self.normalize_name(prefix))
 598
 599         if type_term:
 600             query = query & type_term
 601
 602         searcher = self.index.searcher()
 603         searcher.weighting = LanguageWeighting()
 604         results = searcher.search(query)  # XXX , limit=self.MAX_LOOKUP_RESULTS)
 605
 606         return self._whoosh_records_to_results(results)