pokedex/lookup.py

   1 # encoding: utf8
   2 import os, os.path
   3 import random
   4 import re
   5 import shutil
   6 import unicodedata
   7
   8 from sqlalchemy.sql import func
   9 import whoosh
  10 import whoosh.filedb.filestore
  11 import whoosh.filedb.fileindex
  12 import whoosh.index
  13 from whoosh.qparser import QueryParser
  14 import whoosh.scoring
  15 import whoosh.spelling
  16
  17 from pokedex.util import namedtuple
  18
  19 from pokedex.db import connect
  20 import pokedex.db.tables as tables
  21 from pokedex.roomaji import romanize
  22 from pokedex.defaults import get_default_index_dir
  23
# Public API of this module: only the lookup class is exported by
# `from pokedex.lookup import *`.
__all__ = ['PokedexLookup']
  25
  26
  27 rx_is_number = re.compile('^\d+$')
  28
  29 LookupResult = namedtuple('LookupResult', [
  30     'object', 'indexed_name', 'name', 'language', 'iso639', 'iso3166', 'exact',
  31 ])
  32
  33 class UninitializedIndex(object):
  34     class UninitializedIndexError(Exception):
  35         pass
  36
  37     def __nonzero__(self):
  38         """Dummy object should identify itself as False."""
  39         return False
  40
  41     def __bool__(self):
  42         """Python 3000 version of the above.  Future-proofing rules!"""
  43         return False
  44
  45     def __getattr__(self, *args, **kwargs):
  46         raise self.UninitializedIndexError(
  47             "The lookup index does not exist.  Please use `pokedex setup` "
  48             "or lookup.rebuild_index() to create it."
  49         )
  50
  51 class LanguageWeighting(whoosh.scoring.Weighting):
  52     """A scoring class that forces otherwise-equal English results to come
  53     before foreign results.
  54     """
  55
  56     def __init__(self, extra_weights={}, *args, **kwargs):
  57         """`extra_weights` may be a dictionary of weights which will be
  58         factored in.
  59
  60         Intended for use with spelling corrections, which come along with their
  61         own weightings.
  62         """
  63         self.extra_weights = extra_weights
  64         super(LanguageWeighting, self).__init__(*args, **kwargs)
  65
  66     def score(self, searcher, fieldnum, text, docnum, weight, QTF=1):
  67         doc = searcher.stored_fields(docnum)
  68
  69         # Apply extra weight
  70         weight = weight * self.extra_weights.get(text, 1.0)
  71
  72         if doc['language'] == None:
  73             # English (well, "default"); leave it at 1
  74             return weight
  75         elif doc['language'] == u'Roomaji':
  76             # Give Roomaji a little boost; it's most likely to be searched
  77             return weight * 0.9
  78         else:
  79             # Everything else can drop down the totem pole
  80             return weight * 0.8
  81
  82
  83 class PokedexLookup(object):
  84     MAX_FUZZY_RESULTS = 10
  85     MAX_EXACT_RESULTS = 43
  86     INTERMEDIATE_FACTOR = 2
  87
  88     # The speller only checks how much the input matches a word; there can be
  89     # all manner of extra unmatched junk, and it won't affect the weighting.
  90     # To compensate, greatly boost the weighting of matches at the beginning
  91     # and end, so nearly-full-word-matches are much better
  92     SPELLER_OPTIONS = dict(booststart=10.0, boostend=9.0)
  93
  94     # Dictionary of table name => table class.
  95     # Need the table name so we can get the class from the table name after we
  96     # retrieve something from the index
  97     indexed_tables = dict(
  98         (cls.__tablename__, cls)
  99         for cls in (
 100             tables.Ability,
 101             tables.Item,
 102             tables.Location,
 103             tables.Move,
 104             tables.Nature,
 105             tables.Pokemon,
 106             tables.Type,
 107         )
 108     )
 109
 110
 111     def __init__(self, directory=None, session=None):
 112         """Opens the whoosh index stored in the named directory.  If the index
 113         doesn't already exist, it will be created.
 114
 115         `directory`
 116             Directory containing the index.  Defaults to a location within the
 117             `pokedex` egg directory.
 118
 119         `session`
 120             Used for creating the index and retrieving objects.  Defaults to an
 121             attempt to connect to the default SQLite database installed by
 122             `pokedex setup`.
 123         """
 124
 125         # By the time this returns, self.index, self.speller, and self.session
 126         # must be set
 127
 128         # If a directory was not given, use the default
 129         if directory is None:
 130             directory = get_default_index_dir()
 131
 132         self.directory = directory
 133
 134         if session:
 135             self.session = session
 136         else:
 137             self.session = connect()
 138
 139         # Attempt to open or create the index
 140         if not os.path.exists(directory) or not os.listdir(directory):
 141             # Directory doesn't exist OR is empty; caller needs to use
 142             # rebuild_index before doing anything.  Provide a dummy object that
 143             # complains when used
 144             self.index = UninitializedIndex()
 145             self.speller = UninitializedIndex()
 146             return
 147
 148         # Otherwise, already exists; should be an index!  Bam, done.
 149         # Note that this will explode if the directory exists but doesn't
 150         # contain an index; that's a feature
 151         try:
 152             self.index = whoosh.index.open_dir(directory, indexname='MAIN')
 153         except whoosh.index.EmptyIndexError:
 154             raise IOError(
 155                 "The index directory already contains files.  "
 156                 "Please use a dedicated directory for the lookup index."
 157             )
 158
 159         # Create speller, and done
 160         spell_store = whoosh.filedb.filestore.FileStorage(directory)
 161         self.speller = whoosh.spelling.SpellChecker(spell_store,
 162             **self.SPELLER_OPTIONS)
 163
 164
 165     def rebuild_index(self):
 166         """Creates the index from scratch."""
 167
 168         schema = whoosh.fields.Schema(
 169             name=whoosh.fields.ID(stored=True),
 170             table=whoosh.fields.ID(stored=True),
 171             row_id=whoosh.fields.ID(stored=True),
 172             language=whoosh.fields.STORED,
 173             iso639=whoosh.fields.ID(stored=True),
 174             iso3166=whoosh.fields.ID(stored=True),
 175             display_name=whoosh.fields.STORED,  # non-lowercased name
 176         )
 177
 178         if not os.path.exists(self.directory):
 179             os.mkdir(self.directory)
 180
 181         self.index = whoosh.index.create_in(self.directory, schema=schema,
 182                                                             indexname='MAIN')
 183         writer = self.index.writer()
 184
 185         # Index every name in all our tables of interest
 186         speller_entries = set()
 187         for cls in self.indexed_tables.values():
 188             q = self.session.query(cls)
 189
 190             for row in q.yield_per(5):
 191                 row_key = dict(table=unicode(cls.__tablename__),
 192                                row_id=unicode(row.id))
 193
 194                 def add(name, language, iso639, iso3166):
 195                     normalized_name = self.normalize_name(name)
 196
 197                     writer.add_document(
 198                         name=normalized_name, display_name=name,
 199                         language=language, iso639=iso639, iso3166=iso3166,
 200                         **row_key
 201                     )
 202
 203                     speller_entries.add(normalized_name)
 204
 205
 206                 # Add the basic English name to the index
 207                 if cls == tables.Pokemon:
 208                     # Pokémon need their form name added
 209                     # XXX kinda kludgy
 210                     add(row.full_name, None, u'en', u'us')
 211
 212                     # If this is a default form, ALSO add the unadorned name,
 213                     # so 'Deoxys' alone will still do the right thing
 214                     if row.forme_name and not row.forme_base_pokemon_id:
 215                         add(row.name, None, u'en', u'us')
 216                 else:
 217                     add(row.name, None, u'en', u'us')
 218
 219                 # Some things also have other languages' names
 220                 # XXX other language form names..?
 221                 for foreign_name in getattr(row, 'foreign_names', []):
 222                     moonspeak = foreign_name.name
 223                     if row.name == moonspeak:
 224                         # Don't add the English name again as a different
 225                         # language; no point and it makes spell results
 226                         # confusing
 227                         continue
 228
 229                     add(moonspeak, foreign_name.language.name,
 230                                    foreign_name.language.iso639,
 231                                    foreign_name.language.iso3166)
 232
 233                     # Add Roomaji too
 234                     if foreign_name.language.name == 'Japanese':
 235                         roomaji = romanize(foreign_name.name)
 236                         add(roomaji, u'Roomaji', u'ja', u'jp')
 237
 238         writer.commit()
 239
 240         # Construct and populate a spell-checker index.  Quicker to do it all
 241         # at once, as every call to add_* does a commit(), and those seem to be
 242         # expensive
 243         self.speller = whoosh.spelling.SpellChecker(self.index.storage, mingram=2,
 244             **self.SPELLER_OPTIONS)
 245         self.speller.add_words(speller_entries)
 246
 247
 248     def normalize_name(self, name):
 249         """Strips irrelevant formatting junk from name input.
 250
 251         Specifically: everything is lowercased, and accents are removed.
 252         """
 253         # http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
 254         # Makes sense to me.  Decompose by Unicode rules, then remove combining
 255         # characters, then recombine.  I'm explicitly doing it this way instead
 256         # of testing combining() because Korean characters apparently
 257         # decompose!  But the results are considered letters, not combining
 258         # characters, so testing for Mn works well, and combining them again
 259         # makes them look right.
 260         nkfd_form = unicodedata.normalize('NFKD', unicode(name))
 261         name = u"".join(c for c in nkfd_form
 262                         if unicodedata.category(c) != 'Mn')
 263         name = unicodedata.normalize('NFC', name)
 264
 265         name = name.strip()
 266         name = name.lower()
 267
 268         return name
 269
 270
 271     def _apply_valid_types(self, name, valid_types):
 272         """Combines the enforced `valid_types` with any from the search string
 273         itself and updates the query.
 274
 275         For example, a name of 'a,b:foo' and valid_types of b,c will search for
 276         only `b`s named "foo".
 277
 278         Returns `(name, merged_valid_types, term)`, where `name` has had any type
 279         prefix stripped, `merged_valid_types` combines the original
 280         `valid_types` with the type prefix, and `term` is a query term for
 281         limited to just the allowed types.  If there are no type restrictions
 282         at all, `term` will be None.
 283         """
 284
 285         # Remove any type prefix (pokemon:133) first
 286         user_valid_types = []
 287         if ':' in name:
 288             prefix_chunk, name = name.split(':', 1)
 289             name = name.strip()
 290
 291             prefixes = prefix_chunk.split(',')
 292             user_valid_types = [_.strip() for _ in prefixes]
 293
 294         # Merge the valid types together.  Only types that appear in BOTH lists
 295         # may be used.
 296         # As a special case, if the user asked for types that are explicitly
 297         # forbidden, completely ignore what the user requested.
 298         # And, just to complicate matters: "type" and language need to be
 299         # considered separately.
 300         def merge_requirements(func):
 301             user = filter(func, user_valid_types)
 302             system = filter(func, valid_types)
 303
 304             if user and system:
 305                 merged = list(set(user) & set(system))
 306                 if merged:
 307                     return merged
 308                 else:
 309                     # No overlap; use the system restrictions
 310                     return system
 311             else:
 312                 # One or the other is blank; use the one that's not
 313                 return user or system
 314
 315         # @foo means language must be foo; otherwise it's a table name
 316         lang_requirements = merge_requirements(lambda req: req[0] == u'@')
 317         type_requirements = merge_requirements(lambda req: req[0] != u'@')
 318         all_requirements = lang_requirements + type_requirements
 319
 320         # Construct the term
 321         lang_terms = []
 322         for lang in lang_requirements:
 323             # Allow for either country or language codes
 324             lang_code = lang[1:]
 325             lang_terms.append(whoosh.query.Term(u'iso639', lang_code))
 326             lang_terms.append(whoosh.query.Term(u'iso3166', lang_code))
 327
 328         type_terms = []
 329         for type in type_requirements:
 330             table_name = self._parse_table_name(type)
 331
 332             # Quietly ignore bogus valid_types; more likely to DTRT
 333             if table_name:
 334                 type_terms.append(whoosh.query.Term(u'table', table_name))
 335
 336         # Combine both kinds of restriction
 337         all_terms = []
 338         if type_terms:
 339             all_terms.append(whoosh.query.Or(type_terms))
 340         if lang_terms:
 341             all_terms.append(whoosh.query.Or(lang_terms))
 342
 343         return name, all_requirements, whoosh.query.And(all_terms)
 344
 345
 346     def _parse_table_name(self, name):
 347         """Takes a singular table name, table name, or table object and returns
 348         the table name.
 349
 350         Returns None for a bogus name.
 351         """
 352         # Table object
 353         if hasattr(name, '__tablename__'):
 354             return getattr(name, '__tablename__')
 355
 356         # Table name
 357         for table in self.indexed_tables.values():
 358             if name in (table.__tablename__, table.__singlename__):
 359                 return table.__tablename__
 360
 361         # Bogus.  Be nice and return dummy
 362         return None
 363
 364     def _whoosh_records_to_results(self, records, exact=True):
 365         """Converts a list of whoosh's indexed records to LookupResult tuples
 366         containing database objects.
 367         """
 368         # XXX this 'exact' thing is getting kinda leaky.  would like a better
 369         # way to handle it, since only lookup() cares about fuzzy results
 370         seen = {}
 371         results = []
 372         for record in records:
 373             # Skip dupes
 374             seen_key = record['table'], record['row_id']
 375             if seen_key in seen:
 376                 continue
 377             seen[seen_key] = True
 378
 379             cls = self.indexed_tables[record['table']]
 380             obj = self.session.query(cls).get(record['row_id'])
 381
 382             results.append(LookupResult(object=obj,
 383                                         indexed_name=record['name'],
 384                                         name=record['display_name'],
 385                                         language=record['language'],
 386                                         iso639=record['iso639'],
 387                                         iso3166=record['iso3166'],
 388                                         exact=exact))
 389
 390         return results
 391
 392
 393     def lookup(self, input, valid_types=[], exact_only=False):
 394         """Attempts to find some sort of object, given a name.
 395
 396         Returns a list of named (object, name, language, iso639, iso3166,
 397         exact) tuples.  `object` is a database object, `name` is the name under
 398         which the object was found, `language` and the two isos are the name
 399         and country codes of the language in which the name was found, and
 400         `exact` is True iff this was an exact match.
 401
 402         This function currently ONLY does fuzzy matching if there are no exact
 403         matches.
 404
 405         Formes are not returned unless requested; "Shaymin" will return only
 406         grass Shaymin.
 407
 408         Extraneous whitespace is removed with extreme prejudice.
 409
 410         Recognizes:
 411         - Names: "Eevee", "Surf", "Run Away", "Payapa Berry", etc.
 412         - Foreign names: "Iibui", "Eivui"
 413         - Fuzzy names in whatever language: "Evee", "Ibui"
 414         - IDs: "133", "192", "250"
 415         Also:
 416         - Type restrictions.  "type:psychic" will only return the type.  This
 417           is how to make ID lookup useful.  Multiple type specs can be entered
 418           with commas, as "move,item:1".
 419         - Language restrictions.  "@fr:charge" will only return Tackle, which
 420           is called "Charge" in French.  These can be combined with type
 421           restrictions, e.g., "@fr,move:charge".
 422         - Alternate formes can be specified merely like "wash rotom".
 423
 424         `input`
 425             Name of the thing to look for.
 426
 427         `valid_types`
 428             A list of type or language restrictions, e.g., `['pokemon',
 429             '@ja']`.  If this is provided, only results in one of the given
 430             tables will be returned.
 431
 432         `exact_only`
 433             If True, only exact matches are returned.  If set to False (the
 434             default), and the provided `name` doesn't match anything exactly,
 435             spelling correction will be attempted.
 436         """
 437
 438         name = self.normalize_name(input)
 439         exact = True
 440         form = None
 441
 442         # Pop off any type prefix and merge with valid_types
 443         name, merged_valid_types, type_term = \
 444             self._apply_valid_types(name, valid_types)
 445
 446         # Random lookup
 447         if name == 'random':
 448             return self.random_lookup(valid_types=merged_valid_types)
 449
 450         # Do different things depending what the query looks like
 451         # Note: Term objects do an exact match, so we don't have to worry about
 452         # a query parser tripping on weird characters in the input
 453         try:
 454             # Let Python try to convert to a number, so 0xff works
 455             name_as_number = int(name, base=0)
 456         except ValueError:
 457             # Oh well
 458             name_as_number = None
 459
 460         if '*' in name or '?' in name:
 461             exact_only = True
 462             query = whoosh.query.Wildcard(u'name', name)
 463         elif name_as_number is not None:
 464             # Don't spell-check numbers!
 465             exact_only = True
 466             query = whoosh.query.Term(u'row_id', unicode(name_as_number))
 467         else:
 468             # Not an integer
 469             query = whoosh.query.Term(u'name', name)
 470
 471         if type_term:
 472             query = query & type_term
 473
 474
 475         ### Actual searching
 476         # Limits; result limits are constants, and intermediate results (before
 477         # duplicate items are stripped out) are capped at the result limit
 478         # times another constant.
 479         # Fuzzy are capped at 10, beyond which something is probably very
 480         # wrong.  Exact matches -- that is, wildcards and ids -- are far less
 481         # constrained.
 482         # Also, exact matches are sorted by name, since weight doesn't matter.
 483         sort_by = dict()
 484         if exact_only:
 485             max_results = self.MAX_EXACT_RESULTS
 486             sort_by['sortedby'] = (u'table', u'name')
 487         else:
 488             max_results = self.MAX_FUZZY_RESULTS
 489
 490         searcher = self.index.searcher(weighting=LanguageWeighting())
 491         results = searcher.search(
 492             query,
 493             limit=int(max_results * self.INTERMEDIATE_FACTOR),
 494             **sort_by
 495         )
 496
 497         # Look for some fuzzy matches if necessary
 498         if not exact_only and not results:
 499             exact = False
 500             results = []
 501
 502             fuzzy_query_parts = []
 503             fuzzy_weights = {}
 504             min_weight = [None]
 505             for suggestion, _, weight in self.speller.suggestions_and_scores(name):
 506                 # Only allow the top 50% of scores; otherwise there will always
 507                 # be a lot of trailing junk
 508                 if min_weight[0] is None:
 509                     min_weight[0] = weight * 0.5
 510                 elif weight < min_weight[0]:
 511                     break
 512
 513                 fuzzy_query_parts.append(whoosh.query.Term('name', suggestion))
 514                 fuzzy_weights[suggestion] = weight
 515
 516             if not fuzzy_query_parts:
 517                 # Nothing at all; don't try querying
 518                 return []
 519
 520             fuzzy_query = whoosh.query.Or(fuzzy_query_parts)
 521             if type_term:
 522                 fuzzy_query = fuzzy_query & type_term
 523
 524             searcher.weighting = LanguageWeighting(extra_weights=fuzzy_weights)
 525             results = searcher.search(fuzzy_query)
 526
 527         ### Convert results to db objects
 528         objects = self._whoosh_records_to_results(results, exact=exact)
 529
 530         # Truncate and return
 531         return objects[:max_results]
 532
 533
 534     def random_lookup(self, valid_types=[]):
 535         """Returns a random lookup result from one of the provided
 536         `valid_types`.
 537         """
 538
 539         table_names = []
 540         for valid_type in valid_types:
 541             table_name = self._parse_table_name(valid_type)
 542             # Skip anything not recognized.  Could be, say, a language code
 543             if table_name:
 544                 table_names.append(table_name)
 545
 546         if not table_names:
 547             # n.b.: It's possible we got a list of valid_types and none of them
 548             # were valid, but this function is guaranteed to return
 549             # *something*, so it politely selects from the entire index instead
 550             table_names = self.indexed_tables.keys()
 551
 552         # Rather than create an array of many hundred items and pick randomly
 553         # from it, just pick a number up to the total number of potential
 554         # items, then pick randomly from that, and partition the whole range
 555         # into chunks.  This also avoids the slight problem that the index
 556         # contains more rows (for languages) for some items than others.
 557         # XXX ought to cache this (in the index?) if possible
 558         total = 0
 559         partitions = []
 560         for table_name in table_names:
 561             count = self.session.query(self.indexed_tables[table_name]).count()
 562             total += count
 563             partitions.append((table_name, count))
 564
 565         n = random.randint(1, total)
 566         while n > partitions[0][1]:
 567             n -= partitions[0][1]
 568             partitions.pop(0)
 569
 570         return self.lookup(unicode(n), valid_types=[ partitions[0][0] ])
 571
 572     def prefix_lookup(self, prefix, valid_types=[]):
 573         """Returns terms starting with the given exact prefix.
 574
 575         Type prefixes are recognized, but no other name munging is done.
 576         """
 577
 578         # Pop off any type prefix and merge with valid_types
 579         prefix, merged_valid_types, type_term = \
 580             self._apply_valid_types(prefix, valid_types)
 581
 582         query = whoosh.query.Prefix(u'name', self.normalize_name(prefix))
 583
 584         if type_term:
 585             query = query & type_term
 586
 587         searcher = self.index.searcher()
 588         searcher.weighting = LanguageWeighting()
 589         results = searcher.search(query)  # XXX , limit=self.MAX_LOOKUP_RESULTS)
 590
 591         return self._whoosh_records_to_results(results)