pokedex/lookup.py

   1 # encoding: utf8
   2 import os, os.path
   3 import random
   4 import re
   5 import shutil
   6 import unicodedata
   7
   8 from sqlalchemy.sql import func
   9 import whoosh
  10 import whoosh.filedb.filestore
  11 import whoosh.filedb.fileindex
  12 import whoosh.index
  13 from whoosh.qparser import QueryParser
  14 import whoosh.scoring
  15 import whoosh.spelling
  16
  17 from pokedex.util import namedtuple
  18
  19 from pokedex.db import connect
  20 import pokedex.db.tables as tables
  21 from pokedex.roomaji import romanize
  22 from pokedex.defaults import get_default_index_dir
  23
  24 __all__ = ['PokedexLookup']
  25
  26
  27 rx_is_number = re.compile('^\d+$')
  28
  29 LookupResult = namedtuple('LookupResult', [
  30     'object', 'indexed_name', 'name', 'language', 'iso639', 'iso3166', 'exact',
  31 ])
  32
  33 class UninitializedIndex(object):
  34     class UninitializedIndexError(Exception):
  35         pass
  36
  37     def __nonzero__(self):
  38         """Dummy object should identify itself as False."""
  39         return False
  40
  41     def __bool__(self):
  42         """Python 3000 version of the above.  Future-proofing rules!"""
  43         return False
  44
  45     def __getattr__(self, *args, **kwargs):
  46         raise self.UninitializedIndexError(
  47             "The lookup index does not exist.  Please use `pokedex setup` "
  48             "or lookup.rebuild_index() to create it."
  49         )
  50
  51 class LanguageWeighting(whoosh.scoring.Weighting):
  52     """A scoring class that forces otherwise-equal English results to come
  53     before foreign results.
  54     """
  55
  56     def __init__(self, extra_weights={}, *args, **kwargs):
  57         """`extra_weights` may be a dictionary of weights which will be
  58         factored in.
  59
  60         Intended for use with spelling corrections, which come along with their
  61         own weightings.
  62         """
  63         self.extra_weights = extra_weights
  64         super(LanguageWeighting, self).__init__(*args, **kwargs)
  65
  66     def score(self, searcher, fieldnum, text, docnum, weight, QTF=1):
  67         doc = searcher.stored_fields(docnum)
  68
  69         # Apply extra weight
  70         weight = weight * self.extra_weights.get(text, 1.0)
  71
  72         language = doc.get('language')
  73         if language is None:
  74             # English (well, "default"); leave it at 1
  75             return weight
  76         elif language == u'Roomaji':
  77             # Give Roomaji a little boost; it's most likely to be searched
  78             return weight * 0.9
  79         else:
  80             # Everything else can drop down the totem pole
  81             return weight * 0.8
  82
  83
  84 class PokedexLookup(object):
  85     MAX_FUZZY_RESULTS = 10
  86     MAX_EXACT_RESULTS = 43
  87     INTERMEDIATE_FACTOR = 2
  88
  89     # The speller only checks how much the input matches a word; there can be
  90     # all manner of extra unmatched junk, and it won't affect the weighting.
  91     # To compensate, greatly boost the weighting of matches at the beginning
  92     # and end, so nearly-full-word-matches are much better
  93     SPELLER_OPTIONS = dict(booststart=10.0, boostend=9.0)
  94
  95     # Dictionary of table name => table class.
  96     # Need the table name so we can get the class from the table name after we
  97     # retrieve something from the index
  98     indexed_tables = dict(
  99         (cls.__tablename__, cls)
 100         for cls in (
 101             tables.Ability,
 102             tables.Item,
 103             tables.Location,
 104             tables.Move,
 105             tables.Nature,
 106             tables.Pokemon,
 107             tables.PokemonForm,
 108             tables.Type,
 109         )
 110     )
 111
 112
 113     def __init__(self, directory=None, session=None):
 114         """Opens the whoosh index stored in the named directory.  If the index
 115         doesn't already exist, it will be created.
 116
 117         `directory`
 118             Directory containing the index.  Defaults to a location within the
 119             `pokedex` egg directory.
 120
 121         `session`
 122             Used for creating the index and retrieving objects.  Defaults to an
 123             attempt to connect to the default SQLite database installed by
 124             `pokedex setup`.
 125         """
 126
 127         # By the time this returns, self.index, self.speller, and self.session
 128         # must be set
 129
 130         # If a directory was not given, use the default
 131         if directory is None:
 132             directory = get_default_index_dir()
 133
 134         self.directory = directory
 135
 136         if session:
 137             self.session = session
 138         else:
 139             self.session = connect()
 140
 141         # Attempt to open or create the index
 142         if not os.path.exists(directory) or not os.listdir(directory):
 143             # Directory doesn't exist OR is empty; caller needs to use
 144             # rebuild_index before doing anything.  Provide a dummy object that
 145             # complains when used
 146             self.index = UninitializedIndex()
 147             self.speller = UninitializedIndex()
 148             return
 149
 150         # Otherwise, already exists; should be an index!  Bam, done.
 151         # Note that this will explode if the directory exists but doesn't
 152         # contain an index; that's a feature
 153         try:
 154             self.index = whoosh.index.open_dir(directory, indexname='MAIN')
 155         except whoosh.index.EmptyIndexError:
 156             raise IOError(
 157                 "The index directory already contains files.  "
 158                 "Please use a dedicated directory for the lookup index."
 159             )
 160
 161         # Create speller, and done
 162         spell_store = whoosh.filedb.filestore.FileStorage(directory)
 163         self.speller = whoosh.spelling.SpellChecker(spell_store,
 164             **self.SPELLER_OPTIONS)
 165
 166
 167     def rebuild_index(self):
 168         """Creates the index from scratch."""
 169
 170         schema = whoosh.fields.Schema(
 171             name=whoosh.fields.ID(stored=True),
 172             table=whoosh.fields.ID(stored=True),
 173             row_id=whoosh.fields.ID(stored=True),
 174             language=whoosh.fields.STORED,
 175             iso639=whoosh.fields.ID(stored=True),
 176             iso3166=whoosh.fields.ID(stored=True),
 177             display_name=whoosh.fields.STORED,  # non-lowercased name
 178         )
 179
 180         if os.path.exists(self.directory):
 181             # create_in() isn't totally reliable, so just nuke whatever's there
 182             # manually.  Try to be careful about this...
 183             for f in os.listdir(self.directory):
 184                 if re.match('^_?(MAIN|SPELL)_', f):
 185                     os.remove(os.path.join(self.directory, f))
 186         else:
 187             os.mkdir(self.directory)
 188
 189         self.index = whoosh.index.create_in(self.directory, schema=schema,
 190                                                             indexname='MAIN')
 191         writer = self.index.writer()
 192
 193         # Index every name in all our tables of interest
 194         speller_entries = set()
 195         for cls in self.indexed_tables.values():
 196             q = self.session.query(cls)
 197
 198             for row in q.yield_per(5):
 199                 row_key = dict(table=unicode(cls.__tablename__),
 200                                row_id=unicode(row.id))
 201
 202                 def add(name, language, iso639, iso3166):
 203                     normalized_name = self.normalize_name(name)
 204
 205                     writer.add_document(
 206                         name=normalized_name, display_name=name,
 207                         language=language, iso639=iso639, iso3166=iso3166,
 208                         **row_key
 209                     )
 210
 211                     speller_entries.add(normalized_name)
 212
 213
 214                 # Add the basic English name to the index
 215                 if cls == tables.Pokemon:
 216                     # Don't re-add alternate forms of the same Pokémon; they'll
 217                     # be added as Pokémon forms instead
 218                     if not row.is_base_form:
 219                         continue
 220                 elif cls == tables.PokemonForm:
 221                     if row.name:
 222                         add(row.pokemon_name, None, u'en', u'us')
 223                     continue
 224
 225                 # Some things also have other languages' names
 226                 # XXX other language form names..?
 227                 seen = set()
 228                 for language, name in getattr(row, 'names', []).items():
 229                     if name in seen:
 230                         # Don't add the name again as a different
 231                         # language; no point and it makes spell results
 232                         # confusing
 233                         continue
 234                     seen.add(name)
 235
 236                     add(name, language.name,
 237                               language.iso639,
 238                               language.iso3166)
 239
 240                     # Add Roomaji too
 241                     if language.identifier == 'ja':
 242                         roomaji = romanize(name)
 243                         add(roomaji, u'Roomaji', u'ja', u'jp')
 244
 245         writer.commit()
 246
 247         # Construct and populate a spell-checker index.  Quicker to do it all
 248         # at once, as every call to add_* does a commit(), and those seem to be
 249         # expensive
 250         self.speller = whoosh.spelling.SpellChecker(self.index.storage, mingram=2,
 251             **self.SPELLER_OPTIONS)
 252         self.speller.add_words(speller_entries)
 253
 254
 255     def normalize_name(self, name):
 256         """Strips irrelevant formatting junk from name input.
 257
 258         Specifically: everything is lowercased, and accents are removed.
 259         """
 260         # http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
 261         # Makes sense to me.  Decompose by Unicode rules, then remove combining
 262         # characters, then recombine.  I'm explicitly doing it this way instead
 263         # of testing combining() because Korean characters apparently
 264         # decompose!  But the results are considered letters, not combining
 265         # characters, so testing for Mn works well, and combining them again
 266         # makes them look right.
 267         nkfd_form = unicodedata.normalize('NFKD', unicode(name))
 268         name = u"".join(c for c in nkfd_form
 269                         if unicodedata.category(c) != 'Mn')
 270         name = unicodedata.normalize('NFC', name)
 271
 272         name = name.strip()
 273         name = name.lower()
 274
 275         return name
 276
 277
 278     def _apply_valid_types(self, name, valid_types):
 279         """Combines the enforced `valid_types` with any from the search string
 280         itself and updates the query.
 281
 282         For example, a name of 'a,b:foo' and valid_types of b,c will search for
 283         only `b`s named "foo".
 284
 285         Returns `(name, merged_valid_types, term)`, where `name` has had any type
 286         prefix stripped, `merged_valid_types` combines the original
 287         `valid_types` with the type prefix, and `term` is a query term for
 288         limited to just the allowed types.  If there are no type restrictions
 289         at all, `term` will be None.
 290         """
 291
 292         # Remove any type prefix (pokemon:133) first
 293         user_valid_types = []
 294         if ':' in name:
 295             prefix_chunk, name = name.split(':', 1)
 296             name = name.strip()
 297
 298             prefixes = prefix_chunk.split(',')
 299             user_valid_types = []
 300             for prefix in prefixes:
 301                 prefix = prefix.strip()
 302                 if prefix:
 303                     user_valid_types.append(prefix)
 304
 305         # Merge the valid types together.  Only types that appear in BOTH lists
 306         # may be used.
 307         # As a special case, if the user asked for types that are explicitly
 308         # forbidden, completely ignore what the user requested.
 309         # And, just to complicate matters: "type" and language need to be
 310         # considered separately.
 311         def merge_requirements(func):
 312             user = filter(func, user_valid_types)
 313             system = filter(func, valid_types)
 314
 315             if user and system:
 316                 merged = list(set(user) & set(system))
 317                 if merged:
 318                     return merged
 319                 else:
 320                     # No overlap; use the system restrictions
 321                     return system
 322             else:
 323                 # One or the other is blank; use the one that's not
 324                 return user or system
 325
 326         # @foo means language must be foo; otherwise it's a table name
 327         lang_requirements = merge_requirements(lambda req: req[0] == u'@')
 328         type_requirements = merge_requirements(lambda req: req[0] != u'@')
 329         all_requirements = lang_requirements + type_requirements
 330
 331         # Construct the term
 332         lang_terms = []
 333         for lang in lang_requirements:
 334             # Allow for either country or language codes
 335             lang_code = lang[1:]
 336             lang_terms.append(whoosh.query.Term(u'iso639', lang_code))
 337             lang_terms.append(whoosh.query.Term(u'iso3166', lang_code))
 338
 339         type_terms = []
 340         for type in type_requirements:
 341             table_name = self._parse_table_name(type)
 342
 343             # Quietly ignore bogus valid_types; more likely to DTRT
 344             if table_name:
 345                 type_terms.append(whoosh.query.Term(u'table', table_name))
 346
 347         # Combine both kinds of restriction
 348         all_terms = []
 349         if type_terms:
 350             all_terms.append(whoosh.query.Or(type_terms))
 351         if lang_terms:
 352             all_terms.append(whoosh.query.Or(lang_terms))
 353
 354         return name, all_requirements, whoosh.query.And(all_terms)
 355
 356
 357     def _parse_table_name(self, name):
 358         """Takes a singular table name, table name, or table object and returns
 359         the table name.
 360
 361         Returns None for a bogus name.
 362         """
 363         # Table object
 364         if hasattr(name, '__tablename__'):
 365             return getattr(name, '__tablename__')
 366
 367         # Table name
 368         for table in self.indexed_tables.values():
 369             if name in (table.__tablename__, table.__singlename__):
 370                 return table.__tablename__
 371
 372         # Bogus.  Be nice and return dummy
 373         return None
 374
 375     def _whoosh_records_to_results(self, records, exact=True):
 376         """Converts a list of whoosh's indexed records to LookupResult tuples
 377         containing database objects.
 378         """
 379         # XXX this 'exact' thing is getting kinda leaky.  would like a better
 380         # way to handle it, since only lookup() cares about fuzzy results
 381         seen = {}
 382         results = []
 383         for record in records:
 384             # Skip dupes
 385             seen_key = record['table'], record['row_id']
 386             if seen_key in seen:
 387                 continue
 388             seen[seen_key] = True
 389
 390             cls = self.indexed_tables[record['table']]
 391             obj = self.session.query(cls).get(record['row_id'])
 392
 393             results.append(LookupResult(object=obj,
 394                                         indexed_name=record['name'],
 395                                         name=record['display_name'],
 396                                         language=record.get('language'),
 397                                         iso639=record['iso639'],
 398                                         iso3166=record['iso3166'],
 399                                         exact=exact))
 400
 401         return results
 402
 403
 404     def lookup(self, input, valid_types=[], exact_only=False):
 405         """Attempts to find some sort of object, given a name.
 406
 407         Returns a list of named (object, name, language, iso639, iso3166,
 408         exact) tuples.  `object` is a database object, `name` is the name under
 409         which the object was found, `language` and the two isos are the name
 410         and country codes of the language in which the name was found, and
 411         `exact` is True iff this was an exact match.
 412
 413         This function currently ONLY does fuzzy matching if there are no exact
 414         matches.
 415
 416         Formes are not returned unless requested; "Shaymin" will return only
 417         grass Shaymin.
 418
 419         Extraneous whitespace is removed with extreme prejudice.
 420
 421         Recognizes:
 422         - Names: "Eevee", "Surf", "Run Away", "Payapa Berry", etc.
 423         - Foreign names: "Iibui", "Eivui"
 424         - Fuzzy names in whatever language: "Evee", "Ibui"
 425         - IDs: "133", "192", "250"
 426         Also:
 427         - Type restrictions.  "type:psychic" will only return the type.  This
 428           is how to make ID lookup useful.  Multiple type specs can be entered
 429           with commas, as "move,item:1".
 430         - Language restrictions.  "@fr:charge" will only return Tackle, which
 431           is called "Charge" in French.  These can be combined with type
 432           restrictions, e.g., "@fr,move:charge".
 433         - Alternate formes can be specified merely like "wash rotom".
 434
 435         `input`
 436             Name of the thing to look for.
 437
 438         `valid_types`
 439             A list of type or language restrictions, e.g., `['pokemon',
 440             '@ja']`.  If this is provided, only results in one of the given
 441             tables will be returned.
 442
 443         `exact_only`
 444             If True, only exact matches are returned.  If set to False (the
 445             default), and the provided `name` doesn't match anything exactly,
 446             spelling correction will be attempted.
 447         """
 448
 449         name = self.normalize_name(input)
 450         exact = True
 451         form = None
 452
 453         # Pop off any type prefix and merge with valid_types
 454         name, merged_valid_types, type_term = \
 455             self._apply_valid_types(name, valid_types)
 456
 457         # Random lookup
 458         if name == 'random':
 459             return self.random_lookup(valid_types=merged_valid_types)
 460
 461         # Do different things depending what the query looks like
 462         # Note: Term objects do an exact match, so we don't have to worry about
 463         # a query parser tripping on weird characters in the input
 464         try:
 465             # Let Python try to convert to a number, so 0xff works
 466             name_as_number = int(name, base=0)
 467         except ValueError:
 468             # Oh well
 469             name_as_number = None
 470
 471         if '*' in name or '?' in name:
 472             exact_only = True
 473             query = whoosh.query.Wildcard(u'name', name)
 474         elif name_as_number is not None:
 475             # Don't spell-check numbers!
 476             exact_only = True
 477             query = whoosh.query.Term(u'row_id', unicode(name_as_number))
 478         else:
 479             # Not an integer
 480             query = whoosh.query.Term(u'name', name)
 481
 482         if type_term:
 483             query = query & type_term
 484
 485
 486         ### Actual searching
 487         # Limits; result limits are constants, and intermediate results (before
 488         # duplicate items are stripped out) are capped at the result limit
 489         # times another constant.
 490         # Fuzzy are capped at 10, beyond which something is probably very
 491         # wrong.  Exact matches -- that is, wildcards and ids -- are far less
 492         # constrained.
 493         # Also, exact matches are sorted by name, since weight doesn't matter.
 494         sort_by = dict()
 495         if exact_only:
 496             max_results = self.MAX_EXACT_RESULTS
 497             sort_by['sortedby'] = (u'table', u'name')
 498         else:
 499             max_results = self.MAX_FUZZY_RESULTS
 500
 501         searcher = self.index.searcher(weighting=LanguageWeighting())
 502         results = searcher.search(
 503             query,
 504             limit=int(max_results * self.INTERMEDIATE_FACTOR),
 505             **sort_by
 506         )
 507
 508         # Look for some fuzzy matches if necessary
 509         if not exact_only and not results:
 510             exact = False
 511             results = []
 512
 513             fuzzy_query_parts = []
 514             fuzzy_weights = {}
 515             min_weight = [None]
 516             for suggestion, _, weight in self.speller.suggestions_and_scores(name):
 517                 # Only allow the top 50% of scores; otherwise there will always
 518                 # be a lot of trailing junk
 519                 if min_weight[0] is None:
 520                     min_weight[0] = weight * 0.5
 521                 elif weight < min_weight[0]:
 522                     break
 523
 524                 fuzzy_query_parts.append(whoosh.query.Term('name', suggestion))
 525                 fuzzy_weights[suggestion] = weight
 526
 527             if not fuzzy_query_parts:
 528                 # Nothing at all; don't try querying
 529                 return []
 530
 531             fuzzy_query = whoosh.query.Or(fuzzy_query_parts)
 532             if type_term:
 533                 fuzzy_query = fuzzy_query & type_term
 534
 535             searcher.weighting = LanguageWeighting(extra_weights=fuzzy_weights)
 536             results = searcher.search(fuzzy_query)
 537
 538         ### Convert results to db objects
 539         objects = self._whoosh_records_to_results(results, exact=exact)
 540
 541         # Truncate and return
 542         return objects[:max_results]
 543
 544
 545     def random_lookup(self, valid_types=[]):
 546         """Returns a random lookup result from one of the provided
 547         `valid_types`.
 548         """
 549
 550         table_names = []
 551         for valid_type in valid_types:
 552             table_name = self._parse_table_name(valid_type)
 553             # Skip anything not recognized.  Could be, say, a language code.
 554             # XXX The vast majority of Pokémon forms are unnamed and unindexed,
 555             #     which can produce blank results.  So skip them too for now.
 556             if table_name and table_name != 'pokemon_forms':
 557                 table_names.append(table_name)
 558
 559         if not table_names:
 560             # n.b.: It's possible we got a list of valid_types and none of them
 561             # were valid, but this function is guaranteed to return
 562             # *something*, so it politely selects from the entire index instead
 563             table_names = self.indexed_tables.keys()
 564             table_names.remove('pokemon_forms')
 565
 566         # Rather than create an array of many hundred items and pick randomly
 567         # from it, just pick a number up to the total number of potential
 568         # items, then pick randomly from that, and partition the whole range
 569         # into chunks.  This also avoids the slight problem that the index
 570         # contains more rows (for languages) for some items than others.
 571         # XXX ought to cache this (in the index?) if possible
 572         total = 0
 573         partitions = []
 574         for table_name in table_names:
 575             count = self.session.query(self.indexed_tables[table_name]).count()
 576             total += count
 577             partitions.append((table_name, count))
 578
 579         n = random.randint(1, total)
 580         while n > partitions[0][1]:
 581             n -= partitions[0][1]
 582             partitions.pop(0)
 583
 584         return self.lookup(unicode(n), valid_types=[ partitions[0][0] ])
 585
 586     def prefix_lookup(self, prefix, valid_types=[]):
 587         """Returns terms starting with the given exact prefix.
 588
 589         Type prefixes are recognized, but no other name munging is done.
 590         """
 591
 592         # Pop off any type prefix and merge with valid_types
 593         prefix, merged_valid_types, type_term = \
 594             self._apply_valid_types(prefix, valid_types)
 595
 596         query = whoosh.query.Prefix(u'name', self.normalize_name(prefix))
 597
 598         if type_term:
 599             query = query & type_term
 600
 601         searcher = self.index.searcher()
 602         searcher.weighting = LanguageWeighting()
 603         results = searcher.search(query)  # XXX , limit=self.MAX_LOOKUP_RESULTS)
 604
 605         return self._whoosh_records_to_results(results)