Allow restricting lookup by language, with a @ja: prefix. #90
[zzz-pokedex.git] / pokedex / lookup.py
# encoding: utf8
import os, os.path
import random
import re
import shutil
import unicodedata

from sqlalchemy.sql import func
import whoosh
import whoosh.fields
import whoosh.filedb.filestore
import whoosh.filedb.fileindex
import whoosh.index
from whoosh.qparser import QueryParser
import whoosh.query
import whoosh.scoring
import whoosh.spelling

from pokedex.util import namedtuple

from pokedex.db import connect
import pokedex.db.tables as tables
from pokedex.roomaji import romanize
from pokedex.defaults import get_default_index_dir

__all__ = ['PokedexLookup']


rx_is_number = re.compile(r'^\d+$')

LookupResult = namedtuple('LookupResult', [
    'object', 'indexed_name', 'name', 'language', 'iso639', 'iso3166', 'exact',
])

class UninitializedIndex(object):
    class UninitializedIndexError(Exception):
        pass

    def __nonzero__(self):
        """Dummy object should identify itself as False."""
        return False

    def __bool__(self):
        """Python 3000 version of the above. Future-proofing rules!"""
        return False

    def __getattr__(self, *args, **kwargs):
        raise self.UninitializedIndexError(
            "The lookup index does not exist. Please use `pokedex setup` "
            "or lookup.rebuild_index() to create it."
        )

class LanguageWeighting(whoosh.scoring.Weighting):
    """A scoring class that forces otherwise-equal English results to come
    before foreign results.
    """

    def __init__(self, extra_weights={}, *args, **kwargs):
        """`extra_weights` may be a dictionary of weights which will be
        factored in.

        Intended for use with spelling corrections, which come along with their
        own weightings.
        """
        self.extra_weights = extra_weights
        super(LanguageWeighting, self).__init__(*args, **kwargs)

    def score(self, searcher, fieldnum, text, docnum, weight, QTF=1):
        doc = searcher.stored_fields(docnum)

        # Apply extra weight
        weight = weight * self.extra_weights.get(text, 1.0)

        if doc['language'] is None:
            # English (well, "default"); leave it at 1
            return weight
        elif doc['language'] == u'Roomaji':
            # Give Roomaji a little boost; it's most likely to be searched
            return weight * 0.9
        else:
            # Everything else can drop down the totem pole
            return weight * 0.8

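# A rough sense of the ordering this produces, using the multipliers in
# score() above (illustrative arithmetic, not real search output): for a raw
# term weight of 2.0, a default-language name scores 2.0, a Roomaji name 1.8,
# and any other language 1.6, so ties break toward English, then Roomaji.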

class PokedexLookup(object):
    INTERMEDIATE_LOOKUP_RESULTS = 25
    MAX_LOOKUP_RESULTS = 10

    # The speller only checks how much the input matches a word; there can be
    # all manner of extra unmatched junk, and it won't affect the weighting.
    # To compensate, greatly boost the weighting of matches at the beginning
    # and end, so nearly-full-word-matches are much better
    SPELLER_OPTIONS = dict(booststart=10.0, boostend=9.0)

    # Dictionary of table name => table class.
    # Need the table name so we can get the class from the table name after we
    # retrieve something from the index
    indexed_tables = dict(
        (cls.__tablename__, cls)
        for cls in (
            tables.Ability,
            tables.Item,
            tables.Location,
            tables.Move,
            tables.Nature,
            tables.Pokemon,
            tables.Type,
        )
    )


    def __init__(self, directory=None, session=None):
        """Opens the whoosh index stored in the named directory. If the index
        doesn't already exist, this provides a stub instead; call
        `rebuild_index()` before doing any lookups.

        `directory`
            Directory containing the index. Defaults to a location within the
            `pokedex` egg directory.

        `session`
            Used for creating the index and retrieving objects. Defaults to an
            attempt to connect to the default SQLite database installed by
            `pokedex setup`.
        """

        # By the time this returns, self.index, self.speller, and self.session
        # must be set

        # If a directory was not given, use the default
        if directory is None:
            directory = get_default_index_dir()

        self.directory = directory

        if session:
            self.session = session
        else:
            self.session = connect()

        # Attempt to open the index
        if not os.path.exists(directory) or not os.listdir(directory):
            # Directory doesn't exist OR is empty; caller needs to use
            # rebuild_index before doing anything. Provide a dummy object that
            # complains when used
            self.index = UninitializedIndex()
            self.speller = UninitializedIndex()
            return

        # Otherwise, already exists; should be an index! Bam, done.
        # Note that this will explode if the directory exists but doesn't
        # contain an index; that's a feature
        try:
            self.index = whoosh.index.open_dir(directory, indexname='MAIN')
        except whoosh.index.EmptyIndexError:
            raise IOError(
                "The index directory contains files that are not a lookup "
                "index. Please use a dedicated directory for the lookup index."
            )

        # Create speller, and done
        spell_store = whoosh.filedb.filestore.FileStorage(directory)
        self.speller = whoosh.spelling.SpellChecker(spell_store,
            **self.SPELLER_OPTIONS)

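    # Construction sketch (illustrative; the directory path here is a
    # placeholder, not a value from this file):
    #
    #     lookup = PokedexLookup()                    # default index dir and DB
    #     lookup = PokedexLookup(directory='/tmp/pokedex-index',
    #                            session=connect())   # explicit index dir + session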

    def rebuild_index(self):
        """Creates the index from scratch."""

        schema = whoosh.fields.Schema(
            name=whoosh.fields.ID(stored=True),
            table=whoosh.fields.ID(stored=True),
            row_id=whoosh.fields.ID(stored=True),
            language=whoosh.fields.STORED,
            iso639=whoosh.fields.ID(stored=True),
            iso3166=whoosh.fields.ID(stored=True),
            display_name=whoosh.fields.STORED,  # non-lowercased name
        )

        if not os.path.exists(self.directory):
            os.mkdir(self.directory)

        self.index = whoosh.index.create_in(self.directory, schema=schema,
            indexname='MAIN')
        writer = self.index.writer()

        # Index every name in all our tables of interest
        speller_entries = set()
        for cls in self.indexed_tables.values():
            q = self.session.query(cls)

            for row in q.yield_per(5):
                row_key = dict(table=unicode(cls.__tablename__),
                               row_id=unicode(row.id))

                def add(name, language, iso639, iso3166):
                    normalized_name = self.normalize_name(name)

                    writer.add_document(
                        name=normalized_name, display_name=name,
                        language=language, iso639=iso639, iso3166=iso3166,
                        **row_key
                    )

                    speller_entries.add(normalized_name)


                # Add the basic English name to the index
                if cls == tables.Pokemon:
                    # Pokémon need their form name added
                    # XXX kinda kludgy
                    add(row.full_name, None, u'en', u'us')

                    # If this is a default form, ALSO add the unadorned name,
                    # so 'Deoxys' alone will still do the right thing
                    if row.forme_name and not row.forme_base_pokemon_id:
                        add(row.name, None, u'en', u'us')
                else:
                    add(row.name, None, u'en', u'us')

                # Some things also have other languages' names
                # XXX other language form names..?
                for foreign_name in getattr(row, 'foreign_names', []):
                    moonspeak = foreign_name.name
                    if row.name == moonspeak:
                        # Don't add the English name again as a different
                        # language; no point and it makes spell results
                        # confusing
                        continue

                    add(moonspeak, foreign_name.language.name,
                        foreign_name.language.iso639,
                        foreign_name.language.iso3166)

                    # Add Roomaji too
                    if foreign_name.language.name == 'Japanese':
                        roomaji = romanize(foreign_name.name)
                        add(roomaji, u'Roomaji', u'ja', u'jp')

        writer.commit()

        # Construct and populate a spell-checker index. Quicker to do it all
        # at once, as every call to add_* does a commit(), and those seem to be
        # expensive
        self.speller = whoosh.spelling.SpellChecker(self.index.storage, mingram=2,
            **self.SPELLER_OPTIONS)
        self.speller.add_words(speller_entries)

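    # Typical first-run flow (a sketch; `index_dir` is a placeholder). An
    # empty or missing directory gives the UninitializedIndex stub, and
    # rebuild_index() then builds the whoosh index plus the spell-checker
    # from the database:
    #
    #     lookup = PokedexLookup(directory=index_dir)
    #     if not lookup.index:
    #         lookup.rebuild_index()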

    def normalize_name(self, name):
        """Strips irrelevant formatting junk from name input.

        Specifically: everything is lowercased, and accents are removed.
        """
        # http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
        # Makes sense to me. Decompose by Unicode rules, then remove combining
        # characters, then recombine. I'm explicitly doing it this way instead
        # of testing combining() because Korean characters apparently
        # decompose! But the results are considered letters, not combining
        # characters, so testing for Mn works well, and combining them again
        # makes them look right.
        nkfd_form = unicodedata.normalize('NFKD', unicode(name))
        name = u"".join(c for c in nkfd_form
                        if unicodedata.category(c) != 'Mn')
        name = unicodedata.normalize('NFC', name)

        name = name.strip()
        name = name.lower()

        return name

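    # What normalize_name does in practice (illustrative values, not cases
    # taken from this file; `lookup` stands for any PokedexLookup instance):
    #
    #     lookup.normalize_name(u'Pokémon')      # -> u'pokemon'
    #     lookup.normalize_name(u'  Mr. Mime ')  # -> u'mr. mime'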

    def _apply_valid_types(self, name, valid_types):
        """Combines the enforced `valid_types` with any from the search string
        itself and updates the query.

        For example, a name of 'a,b:foo' and valid_types of b,c will search for
        only `b`s named "foo".

        Returns `(name, merged_valid_types, term)`, where `name` has had any
        type prefix stripped, `merged_valid_types` combines the original
        `valid_types` with the type prefix, and `term` is a query term that
        restricts results to the allowed types. If there are no type
        restrictions at all, `term` will be None.
        """

        # Remove any type prefix (pokemon:133) first
        user_valid_types = []
        if ':' in name:
            prefix_chunk, name = name.split(':', 1)
            name = name.strip()

            prefixes = prefix_chunk.split(',')
            user_valid_types = [_.strip() for _ in prefixes]

        # Merge the valid types together. Only types that appear in BOTH lists
        # may be used.
        # As a special case, if the user asked for types that are explicitly
        # forbidden, completely ignore what the user requested
        combined_valid_types = []
        if user_valid_types and valid_types:
            combined_valid_types = list(
                set(user_valid_types) & set(valid_types)
            )

            if not combined_valid_types:
                # No overlap! Just use the enforced ones
                combined_valid_types = valid_types
        else:
            # One list or the other was blank, so just use the one that isn't
            combined_valid_types = valid_types + user_valid_types

        if not combined_valid_types:
            # No restrictions
            return name, [], None

        # Construct the term
        type_terms = []
        lang_terms = []
        final_valid_types = []
        for valid_type in combined_valid_types:
            if valid_type.startswith(u'@'):
                # @foo means: language must be foo.
                # Allow for either country or language codes
                lang_code = valid_type[1:]
                lang_terms.append(whoosh.query.Term(u'iso639', lang_code))
                lang_terms.append(whoosh.query.Term(u'iso3166', lang_code))
            else:
                # otherwise, this is a type/table name
                table_name = self._parse_table_name(valid_type)

                # Quietly ignore bogus valid_types; more likely to DTRT
                if table_name:
                    type_terms.append(whoosh.query.Term(u'table', table_name))

        # Combine both kinds of restriction
        all_terms = []
        if type_terms:
            all_terms.append(whoosh.query.Or(type_terms))
        if lang_terms:
            all_terms.append(whoosh.query.Or(lang_terms))

        return name, combined_valid_types, whoosh.query.And(all_terms)

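    # How the prefix parsing plays out (a sketch; these calls and their
    # results are illustrative, with the whoosh terms paraphrased):
    #
    #     self._apply_valid_types(u'move,item:charge', ['item', 'pokemon'])
    #     # -> (u'charge', ['item'], <term restricting results to items>)
    #
    #     self._apply_valid_types(u'@ja:foo', [])
    #     # -> (u'foo', ['@ja'], <term matching iso639 'ja' OR iso3166 'ja'>)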

    def _parse_table_name(self, name):
        """Takes a table name (singular or plural) or a table object, and
        returns the canonical table name.

        Returns None for a bogus name.
        """
        # Table object
        if hasattr(name, '__tablename__'):
            return getattr(name, '__tablename__')

        # Table name
        for table in self.indexed_tables.values():
            if name in (table.__tablename__, table.__singlename__):
                return table.__tablename__

        # Bogus. Be nice and return None instead of blowing up
        return None

    def _whoosh_records_to_results(self, records, exact=True):
        """Converts a list of whoosh's indexed records to LookupResult tuples
        containing database objects.
        """
        # XXX this 'exact' thing is getting kinda leaky. would like a better
        # way to handle it, since only lookup() cares about fuzzy results
        seen = {}
        results = []
        for record in records:
            # Skip dupes
            seen_key = record['table'], record['row_id']
            if seen_key in seen:
                continue
            seen[seen_key] = True

            cls = self.indexed_tables[record['table']]
            obj = self.session.query(cls).get(record['row_id'])

            results.append(LookupResult(object=obj,
                                        indexed_name=record['name'],
                                        name=record['display_name'],
                                        language=record['language'],
                                        iso639=record['iso639'],
                                        iso3166=record['iso3166'],
                                        exact=exact))

        return results


    def lookup(self, input, valid_types=[], exact_only=False):
        """Attempts to find some sort of object, given a name.

        Returns a list of named (object, indexed_name, name, language, iso639,
        iso3166, exact) tuples. `object` is a database object, `name` is the
        name under which the object was found, `language` and the two isos are
        the name and country codes of the language in which the name was
        found, and `exact` is True iff this was an exact match.

        This function currently ONLY does fuzzy matching if there are no exact
        matches.

        Formes are not returned unless requested; "Shaymin" will return only
        grass Shaymin.

        Extraneous whitespace is removed with extreme prejudice.

        Recognizes:
        - Names: "Eevee", "Surf", "Run Away", "Payapa Berry", etc.
        - Foreign names: "Iibui", "Eivui"
        - Fuzzy names in whatever language: "Evee", "Ibui"
        - IDs: "133", "192", "250"
        Also:
        - Type restrictions. "type:psychic" will only return the type. This
          is how to make ID lookup useful. Multiple type specs can be entered
          with commas, as "move,item:1".
        - Language restrictions. "@fr:charge" will only return Tackle, which
          is called "Charge" in French. These can be combined with type
          restrictions, e.g., "@fr,move:charge".
        - Alternate formes can be specified merely like "wash rotom".

        `input`
            Name of the thing to look for.

        `valid_types`
            A list of type or language restrictions, e.g., `['pokemon',
            '@ja']`. If this is provided, only results in one of the given
            tables (or languages) will be returned.

        `exact_only`
            If True, only exact matches are returned. If set to False (the
            default), and the provided `input` doesn't match anything exactly,
            spelling correction will be attempted.
        """

        name = self.normalize_name(input)
        exact = True
        form = None

        # Pop off any type prefix and merge with valid_types
        name, merged_valid_types, type_term = \
            self._apply_valid_types(name, valid_types)

        # Random lookup
        if name == 'random':
            return self.random_lookup(valid_types=merged_valid_types)

        # Do different things depending what the query looks like
        # Note: Term objects do an exact match, so we don't have to worry about
        # a query parser tripping on weird characters in the input
        try:
            # Let Python try to convert to a number, so 0xff works
            name_as_number = int(name, base=0)
        except ValueError:
            # Oh well
            name_as_number = None

        if '*' in name or '?' in name:
            exact_only = True
            query = whoosh.query.Wildcard(u'name', name)
        elif name_as_number is not None:
            # Don't spell-check numbers!
            exact_only = True
            query = whoosh.query.Term(u'row_id', unicode(name_as_number))
        else:
            # Not an integer
            query = whoosh.query.Term(u'name', name)

        if type_term:
            query = query & type_term


        ### Actual searching
        searcher = self.index.searcher()
        # XXX is this kosher? docs say search() takes a weighting arg, but it
        # certainly does not
        searcher.weighting = LanguageWeighting()
        results = searcher.search(query,
                                  limit=self.INTERMEDIATE_LOOKUP_RESULTS)

        # Look for some fuzzy matches if necessary
        if not exact_only and not results:
            exact = False
            results = []

            fuzzy_query_parts = []
            fuzzy_weights = {}
            min_weight = [None]
            for suggestion, _, weight in self.speller.suggestions_and_scores(name):
                # Only allow the top 50% of scores; otherwise there will always
                # be a lot of trailing junk
                if min_weight[0] is None:
                    min_weight[0] = weight * 0.5
                elif weight < min_weight[0]:
                    break

                fuzzy_query_parts.append(whoosh.query.Term('name', suggestion))
                fuzzy_weights[suggestion] = weight

            if not fuzzy_query_parts:
                # Nothing at all; don't try querying
                return []

            fuzzy_query = whoosh.query.Or(fuzzy_query_parts)
            if type_term:
                fuzzy_query = fuzzy_query & type_term

            searcher.weighting = LanguageWeighting(extra_weights=fuzzy_weights)
            results = searcher.search(fuzzy_query)

        ### Convert results to db objects
        objects = self._whoosh_records_to_results(results, exact=exact)

        # Only return up to 10 matches; beyond that, something is wrong. We
        # strip out duplicate entries above, so it's remotely possible that we
        # should have more than 10 here and lost a few. The speller returns 25
        # to give us some padding, and should avoid that problem. Not a big
        # deal if we lose the 25th-most-likely match anyway.
        return objects[:self.MAX_LOOKUP_RESULTS]

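    # Usage sketch, reusing the docstring's own examples (what comes back
    # depends entirely on the data loaded by `pokedex setup`):
    #
    #     lookup = PokedexLookup()
    #     lookup.lookup(u'Eevee')             # exact name match
    #     lookup.lookup(u'@fr,move:charge')   # language + table restriction
    #     lookup.lookup(u'pokemon:133')       # ID lookup, limited to one table
    #     lookup.lookup(u'Evee')              # falls back to spelling correction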

    def random_lookup(self, valid_types=[]):
        """Returns a random lookup result from one of the provided
        `valid_types`.
        """

        table_names = []
        for valid_type in valid_types:
            table_name = self._parse_table_name(valid_type)
            # Skip anything not recognized. Could be, say, a language code
            if table_name:
                table_names.append(table_name)

        if not table_names:
            # n.b.: It's possible we got a list of valid_types and none of them
            # were valid, but this function is guaranteed to return
            # *something*, so it politely selects from the entire index instead
            table_names = self.indexed_tables.keys()

        # Rather than create an array of many hundred items and pick randomly
        # from it, just count the potential items per table, pick a random
        # number up to the combined total, and walk the per-table chunks to
        # find which table (and which row within it) that number lands on.
        # This also avoids the slight problem that the index contains more
        # rows (for languages) for some items than others.
        # XXX ought to cache this (in the index?) if possible
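        # Worked example of the partition walk below (made-up counts, purely
        # illustrative): with partitions [('moves', 3), ('items', 5)] and
        # total 8, drawing n = 6 gives 6 > 3, so n becomes 3 and the 'moves'
        # partition is dropped; the result is row 3 of 'items'.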
        total = 0
        partitions = []
        for table_name in table_names:
            count = self.session.query(self.indexed_tables[table_name]).count()
            total += count
            partitions.append((table_name, count))

        n = random.randint(1, total)
        while n > partitions[0][1]:
            n -= partitions[0][1]
            partitions.pop(0)

        return self.lookup(unicode(n), valid_types=[partitions[0][0]])

    def prefix_lookup(self, prefix, valid_types=[]):
        """Returns lookup results for names starting with the given exact
        prefix.

        Type prefixes are recognized, but no other name munging is done.
        """

        # Pop off any type prefix and merge with valid_types
        prefix, merged_valid_types, type_term = \
            self._apply_valid_types(prefix, valid_types)

        query = whoosh.query.Prefix(u'name', self.normalize_name(prefix))

        if type_term:
            query = query & type_term

        searcher = self.index.searcher()
        searcher.weighting = LanguageWeighting()
        results = searcher.search(query)  # XXX , limit=self.MAX_LOOKUP_RESULTS)

        return self._whoosh_records_to_results(results)
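
    # Prefix-completion sketch (illustrative; actual hits depend on the
    # indexed data):
    #
    #     lookup = PokedexLookup()
    #     lookup.prefix_lookup(u'thunder')        # names starting with "thunder"
    #     lookup.prefix_lookup(u'@fr,move:char')  # restricted to French move names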