pokedex/lookup.py

   1 # encoding: utf8
   2 import os, os.path
   3 import random
   4 import re
   5 import shutil
   6 import unicodedata
   7
   8 from sqlalchemy.sql import func
   9 import whoosh
  10 import whoosh.filedb.filestore
  11 import whoosh.filedb.fileindex
  12 import whoosh.index
  13 from whoosh.qparser import QueryParser
  14 import whoosh.scoring
  15 import whoosh.spelling
  16
  17 from pokedex.util import namedtuple
  18
  19 from pokedex.db import connect
  20 import pokedex.db.tables as tables
  21 from pokedex.roomaji import romanize
  22 from pokedex.defaults import get_default_index_dir
  23
# Names exported by `from <this module> import *`: only the lookup class.
__all__ = ['PokedexLookup']
  25
  26
  27 rx_is_number = re.compile('^\d+$')
  28
# Record describing a single lookup hit:
#   object       -- the database row the hit refers to
#   indexed_name -- the name as stored in the index
#   name         -- the display (non-lowercased) name
#   language     -- name of the language the hit was indexed under; None for
#                   the default English names
#   iso639       -- language code stored with the indexed name
#   iso3166      -- country code stored with the indexed name
#   exact        -- False when the hit came from spelling correction
LookupResult = namedtuple('LookupResult', [
    'object', 'indexed_name', 'name', 'language', 'iso639', 'iso3166', 'exact',
])
  32
  33 class UninitializedIndex(object):
  34     class UninitializedIndexError(Exception):
  35         pass
  36
  37     def __nonzero__(self):
  38         """Dummy object should identify itself as False."""
  39         return False
  40
  41     def __bool__(self):
  42         """Python 3000 version of the above.  Future-proofing rules!"""
  43         return False
  44
  45     def __getattr__(self, *args, **kwargs):
  46         raise self.UninitializedIndexError(
  47             "The lookup index does not exist.  Please use `pokedex setup` "
  48             "or lookup.rebuild_index() to create it."
  49         )
  50
  51 class LanguageWeighting(whoosh.scoring.Weighting):
  52     """A scoring class that forces otherwise-equal English results to come
  53     before foreign results.
  54     """
  55
  56     def __init__(self, extra_weights={}, *args, **kwargs):
  57         """`extra_weights` may be a dictionary of weights which will be
  58         factored in.
  59
  60         Intended for use with spelling corrections, which come along with their
  61         own weightings.
  62         """
  63         self.extra_weights = extra_weights
  64         super(LanguageWeighting, self).__init__(*args, **kwargs)
  65
  66     def score(self, searcher, fieldnum, text, docnum, weight, QTF=1):
  67         doc = searcher.stored_fields(docnum)
  68
  69         # Apply extra weight
  70         weight = weight * self.extra_weights.get(text, 1.0)
  71
  72         language = doc.get('language')
  73         if language is None:
  74             # English (well, "default"); leave it at 1
  75             return weight
  76         elif language == u'Roomaji':
  77             # Give Roomaji a little boost; it's most likely to be searched
  78             return weight * 0.9
  79         else:
  80             # Everything else can drop down the totem pole
  81             return weight * 0.8
  82
  83
class PokedexLookup(object):
    """Looks up database objects by name, ID, or prefix using a whoosh index
    and spell-checker stored in a directory on disk.
    """

    # Caps on the number of results lookup() returns: the fuzzy cap applies
    # when spelling correction is allowed, the exact cap when only exact
    # matches (including wildcards and IDs) are wanted
    MAX_FUZZY_RESULTS = 10
    MAX_EXACT_RESULTS = 43
    # lookup() asks the searcher for this many times the result cap, since
    # duplicate hits on the same row are dropped afterwards
    INTERMEDIATE_FACTOR = 2

    # The speller only checks how much the input matches a word; there can be
    # all manner of extra unmatched junk, and it won't affect the weighting.
    # To compensate, greatly boost the weighting of matches at the beginning
    # and end, so nearly-full-word-matches are much better
    SPELLER_OPTIONS = dict(booststart=10.0, boostend=9.0)

    # Dictionary of table name => table class.
    # Need the table name so we can get the class from the table name after we
    # retrieve something from the index
    indexed_tables = dict(
        (cls.__tablename__, cls)
        for cls in (
            tables.Ability,
            tables.Item,
            tables.Location,
            tables.Move,
            tables.Nature,
            tables.Pokemon,
            tables.Type,
        )
    )
 110
 111
 112     def __init__(self, directory=None, session=None):
 113         """Opens the whoosh index stored in the named directory.  If the index
 114         doesn't already exist, it will be created.
 115
 116         `directory`
 117             Directory containing the index.  Defaults to a location within the
 118             `pokedex` egg directory.
 119
 120         `session`
 121             Used for creating the index and retrieving objects.  Defaults to an
 122             attempt to connect to the default SQLite database installed by
 123             `pokedex setup`.
 124         """
 125
 126         # By the time this returns, self.index, self.speller, and self.session
 127         # must be set
 128
 129         # If a directory was not given, use the default
 130         if directory is None:
 131             directory = get_default_index_dir()
 132
 133         self.directory = directory
 134
 135         if session:
 136             self.session = session
 137         else:
 138             self.session = connect()
 139
 140         # Attempt to open or create the index
 141         if not os.path.exists(directory) or not os.listdir(directory):
 142             # Directory doesn't exist OR is empty; caller needs to use
 143             # rebuild_index before doing anything.  Provide a dummy object that
 144             # complains when used
 145             self.index = UninitializedIndex()
 146             self.speller = UninitializedIndex()
 147             return
 148
 149         # Otherwise, already exists; should be an index!  Bam, done.
 150         # Note that this will explode if the directory exists but doesn't
 151         # contain an index; that's a feature
 152         try:
 153             self.index = whoosh.index.open_dir(directory, indexname='MAIN')
 154         except whoosh.index.EmptyIndexError:
 155             raise IOError(
 156                 "The index directory already contains files.  "
 157                 "Please use a dedicated directory for the lookup index."
 158             )
 159
 160         # Create speller, and done
 161         spell_store = whoosh.filedb.filestore.FileStorage(directory)
 162         self.speller = whoosh.spelling.SpellChecker(spell_store,
 163             **self.SPELLER_OPTIONS)
 164
 165
 166     def rebuild_index(self):
 167         """Creates the index from scratch."""
 168
 169         schema = whoosh.fields.Schema(
 170             name=whoosh.fields.ID(stored=True),
 171             table=whoosh.fields.ID(stored=True),
 172             row_id=whoosh.fields.ID(stored=True),
 173             language=whoosh.fields.STORED,
 174             iso639=whoosh.fields.ID(stored=True),
 175             iso3166=whoosh.fields.ID(stored=True),
 176             display_name=whoosh.fields.STORED,  # non-lowercased name
 177         )
 178
 179         if os.path.exists(self.directory):
 180             # create_in() isn't totally reliable, so just nuke whatever's there
 181             # manually.  Try to be careful about this...
 182             for f in os.listdir(self.directory):
 183                 if re.match('^_?(MAIN|SPELL)_', f):
 184                     os.remove(os.path.join(self.directory, f))
 185         else:
 186             os.mkdir(self.directory)
 187
 188         self.index = whoosh.index.create_in(self.directory, schema=schema,
 189                                                             indexname='MAIN')
 190         writer = self.index.writer()
 191
 192         # Index every name in all our tables of interest
 193         speller_entries = set()
 194         for cls in self.indexed_tables.values():
 195             q = self.session.query(cls)
 196
 197             for row in q.yield_per(5):
 198                 row_key = dict(table=unicode(cls.__tablename__),
 199                                row_id=unicode(row.id))
 200
 201                 def add(name, language, iso639, iso3166):
 202                     normalized_name = self.normalize_name(name)
 203
 204                     writer.add_document(
 205                         name=normalized_name, display_name=name,
 206                         language=language, iso639=iso639, iso3166=iso3166,
 207                         **row_key
 208                     )
 209
 210                     speller_entries.add(normalized_name)
 211
 212
 213                 # Add the basic English name to the index
 214                 if cls == tables.Pokemon:
 215                     # Pokémon need their form name added
 216                     # XXX kinda kludgy
 217                     add(row.full_name, None, u'en', u'us')
 218
 219                     # If this is a default form, ALSO add the unadorned name,
 220                     # so 'Deoxys' alone will still do the right thing
 221                     if row.forme_name and not row.forme_base_pokemon_id:
 222                         add(row.name, None, u'en', u'us')
 223                 else:
 224                     add(row.name, None, u'en', u'us')
 225
 226                 # Some things also have other languages' names
 227                 # XXX other language form names..?
 228                 for foreign_name in getattr(row, 'foreign_names', []):
 229                     moonspeak = foreign_name.name
 230                     if row.name == moonspeak:
 231                         # Don't add the English name again as a different
 232                         # language; no point and it makes spell results
 233                         # confusing
 234                         continue
 235
 236                     add(moonspeak, foreign_name.language.name,
 237                                    foreign_name.language.iso639,
 238                                    foreign_name.language.iso3166)
 239
 240                     # Add Roomaji too
 241                     if foreign_name.language.name == 'Japanese':
 242                         roomaji = romanize(foreign_name.name)
 243                         add(roomaji, u'Roomaji', u'ja', u'jp')
 244
 245         writer.commit()
 246
 247         # Construct and populate a spell-checker index.  Quicker to do it all
 248         # at once, as every call to add_* does a commit(), and those seem to be
 249         # expensive
 250         self.speller = whoosh.spelling.SpellChecker(self.index.storage, mingram=2,
 251             **self.SPELLER_OPTIONS)
 252         self.speller.add_words(speller_entries)
 253
 254
 255     def normalize_name(self, name):
 256         """Strips irrelevant formatting junk from name input.
 257
 258         Specifically: everything is lowercased, and accents are removed.
 259         """
 260         # http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
 261         # Makes sense to me.  Decompose by Unicode rules, then remove combining
 262         # characters, then recombine.  I'm explicitly doing it this way instead
 263         # of testing combining() because Korean characters apparently
 264         # decompose!  But the results are considered letters, not combining
 265         # characters, so testing for Mn works well, and combining them again
 266         # makes them look right.
 267         nkfd_form = unicodedata.normalize('NFKD', unicode(name))
 268         name = u"".join(c for c in nkfd_form
 269                         if unicodedata.category(c) != 'Mn')
 270         name = unicodedata.normalize('NFC', name)
 271
 272         name = name.strip()
 273         name = name.lower()
 274
 275         return name
 276
 277
 278     def _apply_valid_types(self, name, valid_types):
 279         """Combines the enforced `valid_types` with any from the search string
 280         itself and updates the query.
 281
 282         For example, a name of 'a,b:foo' and valid_types of b,c will search for
 283         only `b`s named "foo".
 284
 285         Returns `(name, merged_valid_types, term)`, where `name` has had any type
 286         prefix stripped, `merged_valid_types` combines the original
 287         `valid_types` with the type prefix, and `term` is a query term for
 288         limited to just the allowed types.  If there are no type restrictions
 289         at all, `term` will be None.
 290         """
 291
 292         # Remove any type prefix (pokemon:133) first
 293         user_valid_types = []
 294         if ':' in name:
 295             prefix_chunk, name = name.split(':', 1)
 296             name = name.strip()
 297
 298             prefixes = prefix_chunk.split(',')
 299             user_valid_types = [_.strip() for _ in prefixes]
 300
 301         # Merge the valid types together.  Only types that appear in BOTH lists
 302         # may be used.
 303         # As a special case, if the user asked for types that are explicitly
 304         # forbidden, completely ignore what the user requested.
 305         # And, just to complicate matters: "type" and language need to be
 306         # considered separately.
 307         def merge_requirements(func):
 308             user = filter(func, user_valid_types)
 309             system = filter(func, valid_types)
 310
 311             if user and system:
 312                 merged = list(set(user) & set(system))
 313                 if merged:
 314                     return merged
 315                 else:
 316                     # No overlap; use the system restrictions
 317                     return system
 318             else:
 319                 # One or the other is blank; use the one that's not
 320                 return user or system
 321
 322         # @foo means language must be foo; otherwise it's a table name
 323         lang_requirements = merge_requirements(lambda req: req[0] == u'@')
 324         type_requirements = merge_requirements(lambda req: req[0] != u'@')
 325         all_requirements = lang_requirements + type_requirements
 326
 327         # Construct the term
 328         lang_terms = []
 329         for lang in lang_requirements:
 330             # Allow for either country or language codes
 331             lang_code = lang[1:]
 332             lang_terms.append(whoosh.query.Term(u'iso639', lang_code))
 333             lang_terms.append(whoosh.query.Term(u'iso3166', lang_code))
 334
 335         type_terms = []
 336         for type in type_requirements:
 337             table_name = self._parse_table_name(type)
 338
 339             # Quietly ignore bogus valid_types; more likely to DTRT
 340             if table_name:
 341                 type_terms.append(whoosh.query.Term(u'table', table_name))
 342
 343         # Combine both kinds of restriction
 344         all_terms = []
 345         if type_terms:
 346             all_terms.append(whoosh.query.Or(type_terms))
 347         if lang_terms:
 348             all_terms.append(whoosh.query.Or(lang_terms))
 349
 350         return name, all_requirements, whoosh.query.And(all_terms)
 351
 352
 353     def _parse_table_name(self, name):
 354         """Takes a singular table name, table name, or table object and returns
 355         the table name.
 356
 357         Returns None for a bogus name.
 358         """
 359         # Table object
 360         if hasattr(name, '__tablename__'):
 361             return getattr(name, '__tablename__')
 362
 363         # Table name
 364         for table in self.indexed_tables.values():
 365             if name in (table.__tablename__, table.__singlename__):
 366                 return table.__tablename__
 367
 368         # Bogus.  Be nice and return dummy
 369         return None
 370
 371     def _whoosh_records_to_results(self, records, exact=True):
 372         """Converts a list of whoosh's indexed records to LookupResult tuples
 373         containing database objects.
 374         """
 375         # XXX this 'exact' thing is getting kinda leaky.  would like a better
 376         # way to handle it, since only lookup() cares about fuzzy results
 377         seen = {}
 378         results = []
 379         for record in records:
 380             # Skip dupes
 381             seen_key = record['table'], record['row_id']
 382             if seen_key in seen:
 383                 continue
 384             seen[seen_key] = True
 385
 386             cls = self.indexed_tables[record['table']]
 387             obj = self.session.query(cls).get(record['row_id'])
 388
 389             results.append(LookupResult(object=obj,
 390                                         indexed_name=record['name'],
 391                                         name=record['display_name'],
 392                                         language=record.get('language'),
 393                                         iso639=record['iso639'],
 394                                         iso3166=record['iso3166'],
 395                                         exact=exact))
 396
 397         return results
 398
 399
 400     def lookup(self, input, valid_types=[], exact_only=False):
 401         """Attempts to find some sort of object, given a name.
 402
 403         Returns a list of named (object, name, language, iso639, iso3166,
 404         exact) tuples.  `object` is a database object, `name` is the name under
 405         which the object was found, `language` and the two isos are the name
 406         and country codes of the language in which the name was found, and
 407         `exact` is True iff this was an exact match.
 408
 409         This function currently ONLY does fuzzy matching if there are no exact
 410         matches.
 411
 412         Formes are not returned unless requested; "Shaymin" will return only
 413         grass Shaymin.
 414
 415         Extraneous whitespace is removed with extreme prejudice.
 416
 417         Recognizes:
 418         - Names: "Eevee", "Surf", "Run Away", "Payapa Berry", etc.
 419         - Foreign names: "Iibui", "Eivui"
 420         - Fuzzy names in whatever language: "Evee", "Ibui"
 421         - IDs: "133", "192", "250"
 422         Also:
 423         - Type restrictions.  "type:psychic" will only return the type.  This
 424           is how to make ID lookup useful.  Multiple type specs can be entered
 425           with commas, as "move,item:1".
 426         - Language restrictions.  "@fr:charge" will only return Tackle, which
 427           is called "Charge" in French.  These can be combined with type
 428           restrictions, e.g., "@fr,move:charge".
 429         - Alternate formes can be specified merely like "wash rotom".
 430
 431         `input`
 432             Name of the thing to look for.
 433
 434         `valid_types`
 435             A list of type or language restrictions, e.g., `['pokemon',
 436             '@ja']`.  If this is provided, only results in one of the given
 437             tables will be returned.
 438
 439         `exact_only`
 440             If True, only exact matches are returned.  If set to False (the
 441             default), and the provided `name` doesn't match anything exactly,
 442             spelling correction will be attempted.
 443         """
 444
 445         name = self.normalize_name(input)
 446         exact = True
 447         form = None
 448
 449         # Pop off any type prefix and merge with valid_types
 450         name, merged_valid_types, type_term = \
 451             self._apply_valid_types(name, valid_types)
 452
 453         # Random lookup
 454         if name == 'random':
 455             return self.random_lookup(valid_types=merged_valid_types)
 456
 457         # Do different things depending what the query looks like
 458         # Note: Term objects do an exact match, so we don't have to worry about
 459         # a query parser tripping on weird characters in the input
 460         try:
 461             # Let Python try to convert to a number, so 0xff works
 462             name_as_number = int(name, base=0)
 463         except ValueError:
 464             # Oh well
 465             name_as_number = None
 466
 467         if '*' in name or '?' in name:
 468             exact_only = True
 469             query = whoosh.query.Wildcard(u'name', name)
 470         elif name_as_number is not None:
 471             # Don't spell-check numbers!
 472             exact_only = True
 473             query = whoosh.query.Term(u'row_id', unicode(name_as_number))
 474         else:
 475             # Not an integer
 476             query = whoosh.query.Term(u'name', name)
 477
 478         if type_term:
 479             query = query & type_term
 480
 481
 482         ### Actual searching
 483         # Limits; result limits are constants, and intermediate results (before
 484         # duplicate items are stripped out) are capped at the result limit
 485         # times another constant.
 486         # Fuzzy are capped at 10, beyond which something is probably very
 487         # wrong.  Exact matches -- that is, wildcards and ids -- are far less
 488         # constrained.
 489         # Also, exact matches are sorted by name, since weight doesn't matter.
 490         sort_by = dict()
 491         if exact_only:
 492             max_results = self.MAX_EXACT_RESULTS
 493             sort_by['sortedby'] = (u'table', u'name')
 494         else:
 495             max_results = self.MAX_FUZZY_RESULTS
 496
 497         searcher = self.index.searcher(weighting=LanguageWeighting())
 498         results = searcher.search(
 499             query,
 500             limit=int(max_results * self.INTERMEDIATE_FACTOR),
 501             **sort_by
 502         )
 503
 504         # Look for some fuzzy matches if necessary
 505         if not exact_only and not results:
 506             exact = False
 507             results = []
 508
 509             fuzzy_query_parts = []
 510             fuzzy_weights = {}
 511             min_weight = [None]
 512             for suggestion, _, weight in self.speller.suggestions_and_scores(name):
 513                 # Only allow the top 50% of scores; otherwise there will always
 514                 # be a lot of trailing junk
 515                 if min_weight[0] is None:
 516                     min_weight[0] = weight * 0.5
 517                 elif weight < min_weight[0]:
 518                     break
 519
 520                 fuzzy_query_parts.append(whoosh.query.Term('name', suggestion))
 521                 fuzzy_weights[suggestion] = weight
 522
 523             if not fuzzy_query_parts:
 524                 # Nothing at all; don't try querying
 525                 return []
 526
 527             fuzzy_query = whoosh.query.Or(fuzzy_query_parts)
 528             if type_term:
 529                 fuzzy_query = fuzzy_query & type_term
 530
 531             searcher.weighting = LanguageWeighting(extra_weights=fuzzy_weights)
 532             results = searcher.search(fuzzy_query)
 533
 534         ### Convert results to db objects
 535         objects = self._whoosh_records_to_results(results, exact=exact)
 536
 537         # Truncate and return
 538         return objects[:max_results]
 539
 540
 541     def random_lookup(self, valid_types=[]):
 542         """Returns a random lookup result from one of the provided
 543         `valid_types`.
 544         """
 545
 546         table_names = []
 547         for valid_type in valid_types:
 548             table_name = self._parse_table_name(valid_type)
 549             # Skip anything not recognized.  Could be, say, a language code
 550             if table_name:
 551                 table_names.append(table_name)
 552
 553         if not table_names:
 554             # n.b.: It's possible we got a list of valid_types and none of them
 555             # were valid, but this function is guaranteed to return
 556             # *something*, so it politely selects from the entire index instead
 557             table_names = self.indexed_tables.keys()
 558
 559         # Rather than create an array of many hundred items and pick randomly
 560         # from it, just pick a number up to the total number of potential
 561         # items, then pick randomly from that, and partition the whole range
 562         # into chunks.  This also avoids the slight problem that the index
 563         # contains more rows (for languages) for some items than others.
 564         # XXX ought to cache this (in the index?) if possible
 565         total = 0
 566         partitions = []
 567         for table_name in table_names:
 568             count = self.session.query(self.indexed_tables[table_name]).count()
 569             total += count
 570             partitions.append((table_name, count))
 571
 572         n = random.randint(1, total)
 573         while n > partitions[0][1]:
 574             n -= partitions[0][1]
 575             partitions.pop(0)
 576
 577         return self.lookup(unicode(n), valid_types=[ partitions[0][0] ])
 578
 579     def prefix_lookup(self, prefix, valid_types=[]):
 580         """Returns terms starting with the given exact prefix.
 581
 582         Type prefixes are recognized, but no other name munging is done.
 583         """
 584
 585         # Pop off any type prefix and merge with valid_types
 586         prefix, merged_valid_types, type_term = \
 587             self._apply_valid_types(prefix, valid_types)
 588
 589         query = whoosh.query.Prefix(u'name', self.normalize_name(prefix))
 590
 591         if type_term:
 592             query = query & type_term
 593
 594         searcher = self.index.searcher()
 595         searcher.weighting = LanguageWeighting()
 596         results = searcher.search(query)  # XXX , limit=self.MAX_LOOKUP_RESULTS)
 597
 598         return self._whoosh_records_to_results(results)