pokedex/lookup.py

   1 # encoding: utf8
   2 import os, os.path
   3 import random
   4 import re
   5 import shutil
   6 import unicodedata
   7
   8 from sqlalchemy.sql import func
   9 import whoosh
  10 import whoosh.filedb.filestore
  11 import whoosh.filedb.fileindex
  12 import whoosh.index
  13 from whoosh.qparser import QueryParser
  14 import whoosh.scoring
  15 import whoosh.spelling
  16
  17 from pokedex.util import namedtuple
  18
  19 from pokedex.db import connect
  20 import pokedex.db.tables as tables
  21 from pokedex.roomaji import romanize
  22 from pokedex.defaults import get_default_index_dir
  23
  24 __all__ = ['PokedexLookup']
  25
  26
  27 rx_is_number = re.compile('^\d+$')
  28
  29 LookupResult = namedtuple('LookupResult', [
  30     'object', 'indexed_name', 'name', 'language', 'iso639', 'iso3166', 'exact',
  31 ])
  32
  33 class UninitializedIndex(object):
  34     class UninitializedIndexError(Exception):
  35         pass
  36
  37     def __nonzero__(self):
  38         """Dummy object should identify itself as False."""
  39         return False
  40
  41     def __bool__(self):
  42         """Python 3000 version of the above.  Future-proofing rules!"""
  43         return False
  44
  45     def __getattr__(self, *args, **kwargs):
  46         raise self.UninitializedIndexError(
  47             "The lookup index does not exist.  Please use `pokedex setup` "
  48             "or lookup.rebuild_index() to create it."
  49         )
  50
  51 class LanguageWeighting(whoosh.scoring.Weighting):
  52     """A scoring class that forces otherwise-equal English results to come
  53     before foreign results.
  54     """
  55
  56     def __init__(self, extra_weights={}, *args, **kwargs):
  57         """`extra_weights` may be a dictionary of weights which will be
  58         factored in.
  59
  60         Intended for use with spelling corrections, which come along with their
  61         own weightings.
  62         """
  63         self.extra_weights = extra_weights
  64         super(LanguageWeighting, self).__init__(*args, **kwargs)
  65
  66     def score(self, searcher, fieldnum, text, docnum, weight, QTF=1):
  67         doc = searcher.stored_fields(docnum)
  68
  69         # Apply extra weight
  70         weight = weight * self.extra_weights.get(text, 1.0)
  71
  72         language = doc.get('language')
  73         if language is None:
  74             # English (well, "default"); leave it at 1
  75             return weight
  76         elif language == u'Roomaji':
  77             # Give Roomaji a little boost; it's most likely to be searched
  78             return weight * 0.9
  79         else:
  80             # Everything else can drop down the totem pole
  81             return weight * 0.8
  82
  83
  84 class PokedexLookup(object):
  85     MAX_FUZZY_RESULTS = 10
  86     MAX_EXACT_RESULTS = 43
  87     INTERMEDIATE_FACTOR = 2
  88
  89     # The speller only checks how much the input matches a word; there can be
  90     # all manner of extra unmatched junk, and it won't affect the weighting.
  91     # To compensate, greatly boost the weighting of matches at the beginning
  92     # and end, so nearly-full-word-matches are much better
  93     SPELLER_OPTIONS = dict(booststart=10.0, boostend=9.0)
  94
  95     # Dictionary of table name => table class.
  96     # Need the table name so we can get the class from the table name after we
  97     # retrieve something from the index
  98     indexed_tables = dict(
  99         (cls.__tablename__, cls)
 100         for cls in (
 101             tables.Ability,
 102             tables.Item,
 103             tables.Location,
 104             tables.Move,
 105             tables.Nature,
 106             tables.Pokemon,
 107             tables.Type,
 108         )
 109     )
 110
 111
 112     def __init__(self, directory=None, session=None):
 113         """Opens the whoosh index stored in the named directory.  If the index
 114         doesn't already exist, it will be created.
 115
 116         `directory`
 117             Directory containing the index.  Defaults to a location within the
 118             `pokedex` egg directory.
 119
 120         `session`
 121             Used for creating the index and retrieving objects.  Defaults to an
 122             attempt to connect to the default SQLite database installed by
 123             `pokedex setup`.
 124         """
 125
 126         # By the time this returns, self.index, self.speller, and self.session
 127         # must be set
 128
 129         # If a directory was not given, use the default
 130         if directory is None:
 131             directory = get_default_index_dir()
 132
 133         self.directory = directory
 134
 135         if session:
 136             self.session = session
 137         else:
 138             self.session = connect()
 139
 140         # Attempt to open or create the index
 141         if not os.path.exists(directory) or not os.listdir(directory):
 142             # Directory doesn't exist OR is empty; caller needs to use
 143             # rebuild_index before doing anything.  Provide a dummy object that
 144             # complains when used
 145             self.index = UninitializedIndex()
 146             self.speller = UninitializedIndex()
 147             return
 148
 149         # Otherwise, already exists; should be an index!  Bam, done.
 150         # Note that this will explode if the directory exists but doesn't
 151         # contain an index; that's a feature
 152         try:
 153             self.index = whoosh.index.open_dir(directory, indexname='MAIN')
 154         except whoosh.index.EmptyIndexError:
 155             raise IOError(
 156                 "The index directory already contains files.  "
 157                 "Please use a dedicated directory for the lookup index."
 158             )
 159
 160         # Create speller, and done
 161         spell_store = whoosh.filedb.filestore.FileStorage(directory)
 162         self.speller = whoosh.spelling.SpellChecker(spell_store,
 163             **self.SPELLER_OPTIONS)
 164
 165
 166     def rebuild_index(self):
 167         """Creates the index from scratch."""
 168
 169         schema = whoosh.fields.Schema(
 170             name=whoosh.fields.ID(stored=True),
 171             table=whoosh.fields.ID(stored=True),
 172             row_id=whoosh.fields.ID(stored=True),
 173             language=whoosh.fields.STORED,
 174             iso639=whoosh.fields.ID(stored=True),
 175             iso3166=whoosh.fields.ID(stored=True),
 176             display_name=whoosh.fields.STORED,  # non-lowercased name
 177         )
 178
 179         if os.path.exists(self.directory):
 180             # create_in() isn't totally reliable, so just nuke whatever's there
 181             # manually.  Try to be careful about this...
 182             for f in os.listdir(self.directory):
 183                 if re.match('^_?(MAIN|SPELL)_', f):
 184                     os.remove(os.path.join(self.directory, f))
 185         else:
 186             os.mkdir(self.directory)
 187
 188         self.index = whoosh.index.create_in(self.directory, schema=schema,
 189                                                             indexname='MAIN')
 190         writer = self.index.writer()
 191
 192         # Index every name in all our tables of interest
 193         speller_entries = set()
 194         for cls in self.indexed_tables.values():
 195             q = self.session.query(cls)
 196
 197             for row in q.yield_per(5):
 198                 row_key = dict(table=unicode(cls.__tablename__),
 199                                row_id=unicode(row.id))
 200
 201                 def add(name, language, iso639, iso3166):
 202                     normalized_name = self.normalize_name(name)
 203
 204                     writer.add_document(
 205                         name=normalized_name, display_name=name,
 206                         language=language, iso639=iso639, iso3166=iso3166,
 207                         **row_key
 208                     )
 209
 210                     speller_entries.add(normalized_name)
 211
 212
 213                 # Add the basic English name to the index
 214                 if cls == tables.Pokemon:
 215                     # Pokémon need their form name added
 216                     # XXX kinda kludgy
 217                     add(row.full_name, None, u'en', u'us')
 218
 219                     # If this is a default form, ALSO add the unadorned name,
 220                     # so 'Deoxys' alone will still do the right thing
 221                     if row.forme_name and not row.forme_base_pokemon_id:
 222                         add(row.name, None, u'en', u'us')
 223                 else:
 224                     add(row.name, None, u'en', u'us')
 225
 226                 # Some things also have other languages' names
 227                 # XXX other language form names..?
 228                 for foreign_name in getattr(row, 'foreign_names', []):
 229                     moonspeak = foreign_name.name
 230                     if row.name == moonspeak:
 231                         # Don't add the English name again as a different
 232                         # language; no point and it makes spell results
 233                         # confusing
 234                         continue
 235
 236                     add(moonspeak, foreign_name.language.name,
 237                                    foreign_name.language.iso639,
 238                                    foreign_name.language.iso3166)
 239
 240                     # Add Roomaji too
 241                     if foreign_name.language.name == 'Japanese':
 242                         roomaji = romanize(foreign_name.name)
 243                         add(roomaji, u'Roomaji', u'ja', u'jp')
 244
 245         writer.commit()
 246
 247         # Construct and populate a spell-checker index.  Quicker to do it all
 248         # at once, as every call to add_* does a commit(), and those seem to be
 249         # expensive
 250         self.speller = whoosh.spelling.SpellChecker(self.index.storage, mingram=2,
 251             **self.SPELLER_OPTIONS)
 252         self.speller.add_words(speller_entries)
 253
 254
 255     def normalize_name(self, name):
 256         """Strips irrelevant formatting junk from name input.
 257
 258         Specifically: everything is lowercased, and accents are removed.
 259         """
 260         # http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
 261         # Makes sense to me.  Decompose by Unicode rules, then remove combining
 262         # characters, then recombine.  I'm explicitly doing it this way instead
 263         # of testing combining() because Korean characters apparently
 264         # decompose!  But the results are considered letters, not combining
 265         # characters, so testing for Mn works well, and combining them again
 266         # makes them look right.
 267         nkfd_form = unicodedata.normalize('NFKD', unicode(name))
 268         name = u"".join(c for c in nkfd_form
 269                         if unicodedata.category(c) != 'Mn')
 270         name = unicodedata.normalize('NFC', name)
 271
 272         name = name.strip()
 273         name = name.lower()
 274
 275         return name
 276
 277
 278     def _apply_valid_types(self, name, valid_types):
 279         """Combines the enforced `valid_types` with any from the search string
 280         itself and updates the query.
 281
 282         For example, a name of 'a,b:foo' and valid_types of b,c will search for
 283         only `b`s named "foo".
 284
 285         Returns `(name, merged_valid_types, term)`, where `name` has had any type
 286         prefix stripped, `merged_valid_types` combines the original
 287         `valid_types` with the type prefix, and `term` is a query term for
 288         limited to just the allowed types.  If there are no type restrictions
 289         at all, `term` will be None.
 290         """
 291
 292         # Remove any type prefix (pokemon:133) first
 293         user_valid_types = []
 294         if ':' in name:
 295             prefix_chunk, name = name.split(':', 1)
 296             name = name.strip()
 297
 298             prefixes = prefix_chunk.split(',')
 299             user_valid_types = []
 300             for prefix in prefixes:
 301                 prefix = prefix.strip()
 302                 if prefix:
 303                     user_valid_types.append(prefix)
 304
 305         # Merge the valid types together.  Only types that appear in BOTH lists
 306         # may be used.
 307         # As a special case, if the user asked for types that are explicitly
 308         # forbidden, completely ignore what the user requested.
 309         # And, just to complicate matters: "type" and language need to be
 310         # considered separately.
 311         def merge_requirements(func):
 312             user = filter(func, user_valid_types)
 313             system = filter(func, valid_types)
 314
 315             if user and system:
 316                 merged = list(set(user) & set(system))
 317                 if merged:
 318                     return merged
 319                 else:
 320                     # No overlap; use the system restrictions
 321                     return system
 322             else:
 323                 # One or the other is blank; use the one that's not
 324                 return user or system
 325
 326         # @foo means language must be foo; otherwise it's a table name
 327         lang_requirements = merge_requirements(lambda req: req[0] == u'@')
 328         type_requirements = merge_requirements(lambda req: req[0] != u'@')
 329         all_requirements = lang_requirements + type_requirements
 330
 331         # Construct the term
 332         lang_terms = []
 333         for lang in lang_requirements:
 334             # Allow for either country or language codes
 335             lang_code = lang[1:]
 336             lang_terms.append(whoosh.query.Term(u'iso639', lang_code))
 337             lang_terms.append(whoosh.query.Term(u'iso3166', lang_code))
 338
 339         type_terms = []
 340         for type in type_requirements:
 341             table_name = self._parse_table_name(type)
 342
 343             # Quietly ignore bogus valid_types; more likely to DTRT
 344             if table_name:
 345                 type_terms.append(whoosh.query.Term(u'table', table_name))
 346
 347         # Combine both kinds of restriction
 348         all_terms = []
 349         if type_terms:
 350             all_terms.append(whoosh.query.Or(type_terms))
 351         if lang_terms:
 352             all_terms.append(whoosh.query.Or(lang_terms))
 353
 354         return name, all_requirements, whoosh.query.And(all_terms)
 355
 356
 357     def _parse_table_name(self, name):
 358         """Takes a singular table name, table name, or table object and returns
 359         the table name.
 360
 361         Returns None for a bogus name.
 362         """
 363         # Table object
 364         if hasattr(name, '__tablename__'):
 365             return getattr(name, '__tablename__')
 366
 367         # Table name
 368         for table in self.indexed_tables.values():
 369             if name in (table.__tablename__, table.__singlename__):
 370                 return table.__tablename__
 371
 372         # Bogus.  Be nice and return dummy
 373         return None
 374
 375     def _whoosh_records_to_results(self, records, exact=True):
 376         """Converts a list of whoosh's indexed records to LookupResult tuples
 377         containing database objects.
 378         """
 379         # XXX this 'exact' thing is getting kinda leaky.  would like a better
 380         # way to handle it, since only lookup() cares about fuzzy results
 381         seen = {}
 382         results = []
 383         for record in records:
 384             # Skip dupes
 385             seen_key = record['table'], record['row_id']
 386             if seen_key in seen:
 387                 continue
 388             seen[seen_key] = True
 389
 390             cls = self.indexed_tables[record['table']]
 391             obj = self.session.query(cls).get(record['row_id'])
 392
 393             results.append(LookupResult(object=obj,
 394                                         indexed_name=record['name'],
 395                                         name=record['display_name'],
 396                                         language=record.get('language'),
 397                                         iso639=record['iso639'],
 398                                         iso3166=record['iso3166'],
 399                                         exact=exact))
 400
 401         return results
 402
 403
 404     def lookup(self, input, valid_types=[], exact_only=False):
 405         """Attempts to find some sort of object, given a name.
 406
 407         Returns a list of named (object, name, language, iso639, iso3166,
 408         exact) tuples.  `object` is a database object, `name` is the name under
 409         which the object was found, `language` and the two isos are the name
 410         and country codes of the language in which the name was found, and
 411         `exact` is True iff this was an exact match.
 412
 413         This function currently ONLY does fuzzy matching if there are no exact
 414         matches.
 415
 416         Formes are not returned unless requested; "Shaymin" will return only
 417         grass Shaymin.
 418
 419         Extraneous whitespace is removed with extreme prejudice.
 420
 421         Recognizes:
 422         - Names: "Eevee", "Surf", "Run Away", "Payapa Berry", etc.
 423         - Foreign names: "Iibui", "Eivui"
 424         - Fuzzy names in whatever language: "Evee", "Ibui"
 425         - IDs: "133", "192", "250"
 426         Also:
 427         - Type restrictions.  "type:psychic" will only return the type.  This
 428           is how to make ID lookup useful.  Multiple type specs can be entered
 429           with commas, as "move,item:1".
 430         - Language restrictions.  "@fr:charge" will only return Tackle, which
 431           is called "Charge" in French.  These can be combined with type
 432           restrictions, e.g., "@fr,move:charge".
 433         - Alternate formes can be specified merely like "wash rotom".
 434
 435         `input`
 436             Name of the thing to look for.
 437
 438         `valid_types`
 439             A list of type or language restrictions, e.g., `['pokemon',
 440             '@ja']`.  If this is provided, only results in one of the given
 441             tables will be returned.
 442
 443         `exact_only`
 444             If True, only exact matches are returned.  If set to False (the
 445             default), and the provided `name` doesn't match anything exactly,
 446             spelling correction will be attempted.
 447         """
 448
 449         name = self.normalize_name(input)
 450         exact = True
 451         form = None
 452
 453         # Pop off any type prefix and merge with valid_types
 454         name, merged_valid_types, type_term = \
 455             self._apply_valid_types(name, valid_types)
 456
 457         # Random lookup
 458         if name == 'random':
 459             return self.random_lookup(valid_types=merged_valid_types)
 460
 461         # Do different things depending what the query looks like
 462         # Note: Term objects do an exact match, so we don't have to worry about
 463         # a query parser tripping on weird characters in the input
 464         try:
 465             # Let Python try to convert to a number, so 0xff works
 466             name_as_number = int(name, base=0)
 467         except ValueError:
 468             # Oh well
 469             name_as_number = None
 470
 471         if '*' in name or '?' in name:
 472             exact_only = True
 473             query = whoosh.query.Wildcard(u'name', name)
 474         elif name_as_number is not None:
 475             # Don't spell-check numbers!
 476             exact_only = True
 477             query = whoosh.query.Term(u'row_id', unicode(name_as_number))
 478         else:
 479             # Not an integer
 480             query = whoosh.query.Term(u'name', name)
 481
 482         if type_term:
 483             query = query & type_term
 484
 485
 486         ### Actual searching
 487         # Limits; result limits are constants, and intermediate results (before
 488         # duplicate items are stripped out) are capped at the result limit
 489         # times another constant.
 490         # Fuzzy are capped at 10, beyond which something is probably very
 491         # wrong.  Exact matches -- that is, wildcards and ids -- are far less
 492         # constrained.
 493         # Also, exact matches are sorted by name, since weight doesn't matter.
 494         sort_by = dict()
 495         if exact_only:
 496             max_results = self.MAX_EXACT_RESULTS
 497             sort_by['sortedby'] = (u'table', u'name')
 498         else:
 499             max_results = self.MAX_FUZZY_RESULTS
 500
 501         searcher = self.index.searcher(weighting=LanguageWeighting())
 502         results = searcher.search(
 503             query,
 504             limit=int(max_results * self.INTERMEDIATE_FACTOR),
 505             **sort_by
 506         )
 507
 508         # Look for some fuzzy matches if necessary
 509         if not exact_only and not results:
 510             exact = False
 511             results = []
 512
 513             fuzzy_query_parts = []
 514             fuzzy_weights = {}
 515             min_weight = [None]
 516             for suggestion, _, weight in self.speller.suggestions_and_scores(name):
 517                 # Only allow the top 50% of scores; otherwise there will always
 518                 # be a lot of trailing junk
 519                 if min_weight[0] is None:
 520                     min_weight[0] = weight * 0.5
 521                 elif weight < min_weight[0]:
 522                     break
 523
 524                 fuzzy_query_parts.append(whoosh.query.Term('name', suggestion))
 525                 fuzzy_weights[suggestion] = weight
 526
 527             if not fuzzy_query_parts:
 528                 # Nothing at all; don't try querying
 529                 return []
 530
 531             fuzzy_query = whoosh.query.Or(fuzzy_query_parts)
 532             if type_term:
 533                 fuzzy_query = fuzzy_query & type_term
 534
 535             searcher.weighting = LanguageWeighting(extra_weights=fuzzy_weights)
 536             results = searcher.search(fuzzy_query)
 537
 538         ### Convert results to db objects
 539         objects = self._whoosh_records_to_results(results, exact=exact)
 540
 541         # Truncate and return
 542         return objects[:max_results]
 543
 544
 545     def random_lookup(self, valid_types=[]):
 546         """Returns a random lookup result from one of the provided
 547         `valid_types`.
 548         """
 549
 550         table_names = []
 551         for valid_type in valid_types:
 552             table_name = self._parse_table_name(valid_type)
 553             # Skip anything not recognized.  Could be, say, a language code
 554             if table_name:
 555                 table_names.append(table_name)
 556
 557         if not table_names:
 558             # n.b.: It's possible we got a list of valid_types and none of them
 559             # were valid, but this function is guaranteed to return
 560             # *something*, so it politely selects from the entire index instead
 561             table_names = self.indexed_tables.keys()
 562
 563         # Rather than create an array of many hundred items and pick randomly
 564         # from it, just pick a number up to the total number of potential
 565         # items, then pick randomly from that, and partition the whole range
 566         # into chunks.  This also avoids the slight problem that the index
 567         # contains more rows (for languages) for some items than others.
 568         # XXX ought to cache this (in the index?) if possible
 569         total = 0
 570         partitions = []
 571         for table_name in table_names:
 572             count = self.session.query(self.indexed_tables[table_name]).count()
 573             total += count
 574             partitions.append((table_name, count))
 575
 576         n = random.randint(1, total)
 577         while n > partitions[0][1]:
 578             n -= partitions[0][1]
 579             partitions.pop(0)
 580
 581         return self.lookup(unicode(n), valid_types=[ partitions[0][0] ])
 582
 583     def prefix_lookup(self, prefix, valid_types=[]):
 584         """Returns terms starting with the given exact prefix.
 585
 586         Type prefixes are recognized, but no other name munging is done.
 587         """
 588
 589         # Pop off any type prefix and merge with valid_types
 590         prefix, merged_valid_types, type_term = \
 591             self._apply_valid_types(prefix, valid_types)
 592
 593         query = whoosh.query.Prefix(u'name', self.normalize_name(prefix))
 594
 595         if type_term:
 596             query = query & type_term
 597
 598         searcher = self.index.searcher()
 599         searcher.weighting = LanguageWeighting()
 600         results = searcher.search(query)  # XXX , limit=self.MAX_LOOKUP_RESULTS)
 601
 602         return self._whoosh_records_to_results(results)