pokedex/lookup.py

   1 # encoding: utf8
   2 import os, os.path
   3 import random
   4 import re
   5 import shutil
   6 import unicodedata
   7
   8 from sqlalchemy.sql import func
   9 import whoosh
  10 import whoosh.filedb.filestore
  11 import whoosh.filedb.fileindex
  12 import whoosh.index
  13 from whoosh.qparser import QueryParser
  14 import whoosh.scoring
  15 import whoosh.spelling
  16
  17 from pokedex.util import namedtuple
  18
  19 from pokedex.db import connect
  20 import pokedex.db.tables as tables
  21 from pokedex.roomaji import romanize
  22 from pokedex.defaults import get_default_index_dir
  23
  24 __all__ = ['PokedexLookup']
  25
  26
  27 rx_is_number = re.compile('^\d+$')
  28
  29 LookupResult = namedtuple('LookupResult', [
  30     'object', 'indexed_name', 'name', 'language', 'iso639', 'iso3166', 'exact',
  31 ])
  32
  33 class UninitializedIndex(object):
  34     class UninitializedIndexError(Exception):
  35         pass
  36
  37     def __nonzero__(self):
  38         """Dummy object should identify itself as False."""
  39         return False
  40
  41     def __bool__(self):
  42         """Python 3000 version of the above.  Future-proofing rules!"""
  43         return False
  44
  45     def __getattr__(self, *args, **kwargs):
  46         raise self.UninitializedIndexError(
  47             "The lookup index does not exist.  Please use `pokedex setup` "
  48             "or lookup.rebuild_index() to create it."
  49         )
  50
  51 class LanguageWeighting(whoosh.scoring.Weighting):
  52     """A scoring class that forces otherwise-equal English results to come
  53     before foreign results.
  54     """
  55
  56     def __init__(self, extra_weights={}, *args, **kwargs):
  57         """`extra_weights` may be a dictionary of weights which will be
  58         factored in.
  59
  60         Intended for use with spelling corrections, which come along with their
  61         own weightings.
  62         """
  63         self.extra_weights = extra_weights
  64         super(LanguageWeighting, self).__init__(*args, **kwargs)
  65
  66     def score(self, searcher, fieldnum, text, docnum, weight, QTF=1):
  67         doc = searcher.stored_fields(docnum)
  68
  69         # Apply extra weight
  70         weight = weight * self.extra_weights.get(text, 1.0)
  71
  72         if doc['language'] == None:
  73             # English (well, "default"); leave it at 1
  74             return weight
  75         elif doc['language'] == u'Roomaji':
  76             # Give Roomaji a little boost; it's most likely to be searched
  77             return weight * 0.9
  78         else:
  79             # Everything else can drop down the totem pole
  80             return weight * 0.8
  81
  82
  83 class PokedexLookup(object):
  84     MAX_FUZZY_RESULTS = 10
  85     MAX_EXACT_RESULTS = 43
  86     INTERMEDIATE_FACTOR = 2
  87
  88     # The speller only checks how much the input matches a word; there can be
  89     # all manner of extra unmatched junk, and it won't affect the weighting.
  90     # To compensate, greatly boost the weighting of matches at the beginning
  91     # and end, so nearly-full-word-matches are much better
  92     SPELLER_OPTIONS = dict(booststart=10.0, boostend=9.0)
  93
  94     # Dictionary of table name => table class.
  95     # Need the table name so we can get the class from the table name after we
  96     # retrieve something from the index
  97     indexed_tables = dict(
  98         (cls.__tablename__, cls)
  99         for cls in (
 100             tables.Ability,
 101             tables.Item,
 102             tables.Location,
 103             tables.Move,
 104             tables.Nature,
 105             tables.Pokemon,
 106             tables.Type,
 107         )
 108     )
 109
 110
 111     def __init__(self, directory=None, session=None):
 112         """Opens the whoosh index stored in the named directory.  If the index
 113         doesn't already exist, it will be created.
 114
 115         `directory`
 116             Directory containing the index.  Defaults to a location within the
 117             `pokedex` egg directory.
 118
 119         `session`
 120             Used for creating the index and retrieving objects.  Defaults to an
 121             attempt to connect to the default SQLite database installed by
 122             `pokedex setup`.
 123         """
 124
 125         # By the time this returns, self.index, self.speller, and self.session
 126         # must be set
 127
 128         # If a directory was not given, use the default
 129         if directory is None:
 130             directory = get_default_index_dir()
 131
 132         self.directory = directory
 133
 134         if session:
 135             self.session = session
 136         else:
 137             self.session = connect()
 138
 139         # Attempt to open or create the index
 140         if not os.path.exists(directory) or not os.listdir(directory):
 141             # Directory doesn't exist OR is empty; caller needs to use
 142             # rebuild_index before doing anything.  Provide a dummy object that
 143             # complains when used
 144             self.index = UninitializedIndex()
 145             self.speller = UninitializedIndex()
 146             return
 147
 148         # Otherwise, already exists; should be an index!  Bam, done.
 149         # Note that this will explode if the directory exists but doesn't
 150         # contain an index; that's a feature
 151         try:
 152             self.index = whoosh.index.open_dir(directory, indexname='MAIN')
 153         except whoosh.index.EmptyIndexError:
 154             raise IOError(
 155                 "The index directory already contains files.  "
 156                 "Please use a dedicated directory for the lookup index."
 157             )
 158
 159         # Create speller, and done
 160         spell_store = whoosh.filedb.filestore.FileStorage(directory)
 161         self.speller = whoosh.spelling.SpellChecker(spell_store,
 162             **self.SPELLER_OPTIONS)
 163
 164
 165     def rebuild_index(self):
 166         """Creates the index from scratch."""
 167
 168         schema = whoosh.fields.Schema(
 169             name=whoosh.fields.ID(stored=True),
 170             table=whoosh.fields.ID(stored=True),
 171             row_id=whoosh.fields.ID(stored=True),
 172             language=whoosh.fields.STORED,
 173             iso639=whoosh.fields.ID(stored=True),
 174             iso3166=whoosh.fields.ID(stored=True),
 175             display_name=whoosh.fields.STORED,  # non-lowercased name
 176         )
 177
 178         if not os.path.exists(self.directory):
 179             os.mkdir(self.directory)
 180
 181         self.index = whoosh.index.create_in(self.directory, schema=schema,
 182                                                             indexname='MAIN')
 183         writer = self.index.writer()
 184
 185         # Index every name in all our tables of interest
 186         speller_entries = set()
 187         for cls in self.indexed_tables.values():
 188             q = self.session.query(cls)
 189
 190             for row in q.yield_per(5):
 191                 row_key = dict(table=unicode(cls.__tablename__),
 192                                row_id=unicode(row.id))
 193
 194                 def add(name, language, iso639, iso3166):
 195                     normalized_name = self.normalize_name(name)
 196
 197                     writer.add_document(
 198                         name=normalized_name, display_name=name,
 199                         language=language, iso639=iso639, iso3166=iso3166,
 200                         **row_key
 201                     )
 202
 203                     speller_entries.add(normalized_name)
 204
 205
 206                 # Add the basic English name to the index
 207                 if cls == tables.Pokemon:
 208                     # Pokémon need their form name added
 209                     # XXX kinda kludgy
 210                     add(row.full_name, None, u'en', u'us')
 211
 212                     # If this is a default form, ALSO add the unadorned name,
 213                     # so 'Deoxys' alone will still do the right thing
 214                     if row.forme_name and not row.forme_base_pokemon_id:
 215                         add(row.name, None, u'en', u'us')
 216                 else:
 217                     add(row.name, None, u'en', u'us')
 218
 219                 # Some things also have other languages' names
 220                 # XXX other language form names..?
 221                 for foreign_name in getattr(row, 'foreign_names', []):
 222                     moonspeak = foreign_name.name
 223                     if row.name == moonspeak:
 224                         # Don't add the English name again as a different
 225                         # language; no point and it makes spell results
 226                         # confusing
 227                         continue
 228
 229                     add(moonspeak, foreign_name.language.name,
 230                                    foreign_name.language.iso639,
 231                                    foreign_name.language.iso3166)
 232
 233                     # Add Roomaji too
 234                     if foreign_name.language.name == 'Japanese':
 235                         roomaji = romanize(foreign_name.name)
 236                         add(roomaji, u'Roomaji', u'ja', u'jp')
 237
 238         writer.commit()
 239
 240         # Construct and populate a spell-checker index.  Quicker to do it all
 241         # at once, as every call to add_* does a commit(), and those seem to be
 242         # expensive
 243         self.speller = whoosh.spelling.SpellChecker(self.index.storage, mingram=2,
 244             **self.SPELLER_OPTIONS)
 245         self.speller.add_words(speller_entries)
 246
 247
 248     def normalize_name(self, name):
 249         """Strips irrelevant formatting junk from name input.
 250
 251         Specifically: everything is lowercased, and accents are removed.
 252         """
 253         # http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
 254         # Makes sense to me.  Decompose by Unicode rules, then remove combining
 255         # characters, then recombine.  I'm explicitly doing it this way instead
 256         # of testing combining() because Korean characters apparently
 257         # decompose!  But the results are considered letters, not combining
 258         # characters, so testing for Mn works well, and combining them again
 259         # makes them look right.
 260         nkfd_form = unicodedata.normalize('NFKD', unicode(name))
 261         name = u"".join(c for c in nkfd_form
 262                         if unicodedata.category(c) != 'Mn')
 263         name = unicodedata.normalize('NFC', name)
 264
 265         name = name.strip()
 266         name = name.lower()
 267
 268         return name
 269
 270
 271     def _apply_valid_types(self, name, valid_types):
 272         """Combines the enforced `valid_types` with any from the search string
 273         itself and updates the query.
 274
 275         For example, a name of 'a,b:foo' and valid_types of b,c will search for
 276         only `b`s named "foo".
 277
 278         Returns `(name, merged_valid_types, term)`, where `name` has had any type
 279         prefix stripped, `merged_valid_types` combines the original
 280         `valid_types` with the type prefix, and `term` is a query term for
 281         limited to just the allowed types.  If there are no type restrictions
 282         at all, `term` will be None.
 283         """
 284
 285         # Remove any type prefix (pokemon:133) first
 286         user_valid_types = []
 287         if ':' in name:
 288             prefix_chunk, name = name.split(':', 1)
 289             name = name.strip()
 290
 291             prefixes = prefix_chunk.split(',')
 292             user_valid_types = [_.strip() for _ in prefixes]
 293
 294         # Merge the valid types together.  Only types that appear in BOTH lists
 295         # may be used.
 296         # As a special case, if the user asked for types that are explicitly
 297         # forbidden, completely ignore what the user requested
 298         combined_valid_types = []
 299         if user_valid_types and valid_types:
 300             combined_valid_types = list(
 301                 set(user_valid_types) & set(combined_valid_types)
 302             )
 303
 304             if not combined_valid_types:
 305                 # No overlap!  Just use the enforced ones
 306                 combined_valid_types = valid_types
 307         else:
 308             # One list or the other was blank, so just use the one that isn't
 309             combined_valid_types = valid_types + user_valid_types
 310
 311         if not combined_valid_types:
 312             # No restrictions
 313             return name, [], None
 314
 315         # Construct the term
 316         type_terms = []
 317         lang_terms = []
 318         final_valid_types = []
 319         for valid_type in combined_valid_types:
 320             if valid_type.startswith(u'@'):
 321                 # @foo means: language must be foo.
 322                 # Allow for either country or language codes
 323                 lang_code = valid_type[1:]
 324                 lang_terms.append(whoosh.query.Term(u'iso639', lang_code))
 325                 lang_terms.append(whoosh.query.Term(u'iso3166', lang_code))
 326             else:
 327                 # otherwise, this is a type/table name
 328                 table_name = self._parse_table_name(valid_type)
 329
 330                 # Quietly ignore bogus valid_types; more likely to DTRT
 331                 if table_name:
 332                     type_terms.append(whoosh.query.Term(u'table', table_name))
 333
 334         # Combine both kinds of restriction
 335         all_terms = []
 336         if type_terms:
 337             all_terms.append(whoosh.query.Or(type_terms))
 338         if lang_terms:
 339             all_terms.append(whoosh.query.Or(lang_terms))
 340
 341         return name, combined_valid_types, whoosh.query.And(all_terms)
 342
 343
 344     def _parse_table_name(self, name):
 345         """Takes a singular table name, table name, or table object and returns
 346         the table name.
 347
 348         Returns None for a bogus name.
 349         """
 350         # Table object
 351         if hasattr(name, '__tablename__'):
 352             return getattr(name, '__tablename__')
 353
 354         # Table name
 355         for table in self.indexed_tables.values():
 356             if name in (table.__tablename__, table.__singlename__):
 357                 return table.__tablename__
 358
 359         # Bogus.  Be nice and return dummy
 360         return None
 361
 362     def _whoosh_records_to_results(self, records, exact=True):
 363         """Converts a list of whoosh's indexed records to LookupResult tuples
 364         containing database objects.
 365         """
 366         # XXX this 'exact' thing is getting kinda leaky.  would like a better
 367         # way to handle it, since only lookup() cares about fuzzy results
 368         seen = {}
 369         results = []
 370         for record in records:
 371             # Skip dupes
 372             seen_key = record['table'], record['row_id']
 373             if seen_key in seen:
 374                 continue
 375             seen[seen_key] = True
 376
 377             cls = self.indexed_tables[record['table']]
 378             obj = self.session.query(cls).get(record['row_id'])
 379
 380             results.append(LookupResult(object=obj,
 381                                         indexed_name=record['name'],
 382                                         name=record['display_name'],
 383                                         language=record['language'],
 384                                         iso639=record['iso639'],
 385                                         iso3166=record['iso3166'],
 386                                         exact=exact))
 387
 388         return results
 389
 390
 391     def lookup(self, input, valid_types=[], exact_only=False):
 392         """Attempts to find some sort of object, given a name.
 393
 394         Returns a list of named (object, name, language, iso639, iso3166,
 395         exact) tuples.  `object` is a database object, `name` is the name under
 396         which the object was found, `language` and the two isos are the name
 397         and country codes of the language in which the name was found, and
 398         `exact` is True iff this was an exact match.
 399
 400         This function currently ONLY does fuzzy matching if there are no exact
 401         matches.
 402
 403         Formes are not returned unless requested; "Shaymin" will return only
 404         grass Shaymin.
 405
 406         Extraneous whitespace is removed with extreme prejudice.
 407
 408         Recognizes:
 409         - Names: "Eevee", "Surf", "Run Away", "Payapa Berry", etc.
 410         - Foreign names: "Iibui", "Eivui"
 411         - Fuzzy names in whatever language: "Evee", "Ibui"
 412         - IDs: "133", "192", "250"
 413         Also:
 414         - Type restrictions.  "type:psychic" will only return the type.  This
 415           is how to make ID lookup useful.  Multiple type specs can be entered
 416           with commas, as "move,item:1".
 417         - Language restrictions.  "@fr:charge" will only return Tackle, which
 418           is called "Charge" in French.  These can be combined with type
 419           restrictions, e.g., "@fr,move:charge".
 420         - Alternate formes can be specified merely like "wash rotom".
 421
 422         `input`
 423             Name of the thing to look for.
 424
 425         `valid_types`
 426             A list of type or language restrictions, e.g., `['pokemon',
 427             '@ja']`.  If this is provided, only results in one of the given
 428             tables will be returned.
 429
 430         `exact_only`
 431             If True, only exact matches are returned.  If set to False (the
 432             default), and the provided `name` doesn't match anything exactly,
 433             spelling correction will be attempted.
 434         """
 435
 436         name = self.normalize_name(input)
 437         exact = True
 438         form = None
 439
 440         # Pop off any type prefix and merge with valid_types
 441         name, merged_valid_types, type_term = \
 442             self._apply_valid_types(name, valid_types)
 443
 444         # Random lookup
 445         if name == 'random':
 446             return self.random_lookup(valid_types=merged_valid_types)
 447
 448         # Do different things depending what the query looks like
 449         # Note: Term objects do an exact match, so we don't have to worry about
 450         # a query parser tripping on weird characters in the input
 451         try:
 452             # Let Python try to convert to a number, so 0xff works
 453             name_as_number = int(name, base=0)
 454         except ValueError:
 455             # Oh well
 456             name_as_number = None
 457
 458         if '*' in name or '?' in name:
 459             exact_only = True
 460             query = whoosh.query.Wildcard(u'name', name)
 461         elif name_as_number is not None:
 462             # Don't spell-check numbers!
 463             exact_only = True
 464             query = whoosh.query.Term(u'row_id', unicode(name_as_number))
 465         else:
 466             # Not an integer
 467             query = whoosh.query.Term(u'name', name)
 468
 469         if type_term:
 470             query = query & type_term
 471
 472
 473         ### Actual searching
 474         # Limits; result limits are constants, and intermediate results (before
 475         # duplicate items are stripped out) are capped at the result limit
 476         # times another constant.
 477         # Fuzzy are capped at 10, beyond which something is probably very
 478         # wrong.  Exact matches -- that is, wildcards and ids -- are far less
 479         # constrained.
 480         # Also, exact matches are sorted by name, since weight doesn't matter.
 481         sort_by = dict()
 482         if exact_only:
 483             max_results = self.MAX_EXACT_RESULTS
 484             sort_by['sortedby'] = (u'table', u'name')
 485         else:
 486             max_results = self.MAX_FUZZY_RESULTS
 487
 488         searcher = self.index.searcher(weighting=LanguageWeighting())
 489         results = searcher.search(
 490             query,
 491             limit=int(max_results * self.INTERMEDIATE_FACTOR),
 492             **sort_by
 493         )
 494
 495         # Look for some fuzzy matches if necessary
 496         if not exact_only and not results:
 497             exact = False
 498             results = []
 499
 500             fuzzy_query_parts = []
 501             fuzzy_weights = {}
 502             min_weight = [None]
 503             for suggestion, _, weight in self.speller.suggestions_and_scores(name):
 504                 # Only allow the top 50% of scores; otherwise there will always
 505                 # be a lot of trailing junk
 506                 if min_weight[0] is None:
 507                     min_weight[0] = weight * 0.5
 508                 elif weight < min_weight[0]:
 509                     break
 510
 511                 fuzzy_query_parts.append(whoosh.query.Term('name', suggestion))
 512                 fuzzy_weights[suggestion] = weight
 513
 514             if not fuzzy_query_parts:
 515                 # Nothing at all; don't try querying
 516                 return []
 517
 518             fuzzy_query = whoosh.query.Or(fuzzy_query_parts)
 519             if type_term:
 520                 fuzzy_query = fuzzy_query & type_term
 521
 522             searcher.weighting = LanguageWeighting(extra_weights=fuzzy_weights)
 523             results = searcher.search(fuzzy_query)
 524
 525         ### Convert results to db objects
 526         objects = self._whoosh_records_to_results(results, exact=exact)
 527
 528         # Truncate and return
 529         return objects[:max_results]
 530
 531
 532     def random_lookup(self, valid_types=[]):
 533         """Returns a random lookup result from one of the provided
 534         `valid_types`.
 535         """
 536
 537         table_names = []
 538         for valid_type in valid_types:
 539             table_name = self._parse_table_name(valid_type)
 540             # Skip anything not recognized.  Could be, say, a language code
 541             if table_name:
 542                 table_names.append(table_name)
 543
 544         if not table_names:
 545             # n.b.: It's possible we got a list of valid_types and none of them
 546             # were valid, but this function is guaranteed to return
 547             # *something*, so it politely selects from the entire index instead
 548             table_names = self.indexed_tables.keys()
 549
 550         # Rather than create an array of many hundred items and pick randomly
 551         # from it, just pick a number up to the total number of potential
 552         # items, then pick randomly from that, and partition the whole range
 553         # into chunks.  This also avoids the slight problem that the index
 554         # contains more rows (for languages) for some items than others.
 555         # XXX ought to cache this (in the index?) if possible
 556         total = 0
 557         partitions = []
 558         for table_name in table_names:
 559             count = self.session.query(self.indexed_tables[table_name]).count()
 560             total += count
 561             partitions.append((table_name, count))
 562
 563         n = random.randint(1, total)
 564         while n > partitions[0][1]:
 565             n -= partitions[0][1]
 566             partitions.pop(0)
 567
 568         return self.lookup(unicode(n), valid_types=[ partitions[0][0] ])
 569
 570     def prefix_lookup(self, prefix, valid_types=[]):
 571         """Returns terms starting with the given exact prefix.
 572
 573         Type prefixes are recognized, but no other name munging is done.
 574         """
 575
 576         # Pop off any type prefix and merge with valid_types
 577         prefix, merged_valid_types, type_term = \
 578             self._apply_valid_types(prefix, valid_types)
 579
 580         query = whoosh.query.Prefix(u'name', self.normalize_name(prefix))
 581
 582         if type_term:
 583             query = query & type_term
 584
 585         searcher = self.index.searcher()
 586         searcher.weighting = LanguageWeighting()
 587         results = searcher.search(query)  # XXX , limit=self.MAX_LOOKUP_RESULTS)
 588
 589         return self._whoosh_records_to_results(results)