pokedex/lookup.py

   1 # encoding: utf8
   2 from collections import namedtuple
   3 import os, os.path
   4 import pkg_resources
   5 import random
   6 import re
   7 import shutil
   8 import unicodedata
   9
  10 from sqlalchemy.sql import func
  11 import whoosh
  12 import whoosh.filedb.filestore
  13 import whoosh.filedb.fileindex
  14 import whoosh.index
  15 from whoosh.qparser import QueryParser
  16 import whoosh.scoring
  17 import whoosh.spelling
  18
  19 from pokedex.db import connect
  20 import pokedex.db.tables as tables
  21 from pokedex.roomaji import romanize
  22
  23 __all__ = ['PokedexLookup']
  24
  25
  26 rx_is_number = re.compile('^\d+$')
  27
  28 LookupResult = namedtuple('LookupResult',
  29     ['object', 'indexed_name', 'name', 'language', 'iso3166', 'exact'])
  30
  31 class UninitializedIndex(object):
  32     class UninitializedIndexError(Exception):
  33         pass
  34
  35     def __nonzero__(self):
  36         """Dummy object should identify itself as False."""
  37         return False
  38
  39     def __bool__(self):
  40         """Python 3000 version of the above.  Future-proofing rules!"""
  41         return False
  42
  43     def __getattr__(self, *args, **kwargs):
  44         raise self.UninitializedIndexError(
  45             "The lookup index does not exist.  Please use `pokedex setup` "
  46             "or lookup.rebuild_index() to create it."
  47         )
  48
  49 class LanguageWeighting(whoosh.scoring.Weighting):
  50     """A scoring class that forces otherwise-equal English results to come
  51     before foreign results.
  52     """
  53
  54     def score(self, searcher, fieldnum, text, docnum, weight, QTF=1):
  55         doc = searcher.stored_fields(docnum)
  56         if doc['language'] == None:
  57             # English (well, "default"); leave it at 1
  58             return weight
  59         elif doc['language'] == u'Roomaji':
  60             # Give Roomaji a little boost; it's most likely to be searched
  61             return weight * 0.95
  62         else:
  63             # Everything else can drop down the totem pole
  64             return weight * 0.9
  65
  66
  67 class PokedexLookup(object):
  68     INTERMEDIATE_LOOKUP_RESULTS = 25
  69     MAX_LOOKUP_RESULTS = 10
  70
  71     # Dictionary of table name => table class.
  72     # Need the table name so we can get the class from the table name after we
  73     # retrieve something from the index
  74     indexed_tables = dict(
  75         (cls.__tablename__, cls)
  76         for cls in (
  77             tables.Ability,
  78             tables.Item,
  79             tables.Location,
  80             tables.Move,
  81             tables.Nature,
  82             tables.Pokemon,
  83             tables.Type,
  84         )
  85     )
  86
  87
  88     def __init__(self, directory=None, session=None):
  89         """Opens the whoosh index stored in the named directory.  If the index
  90         doesn't already exist, it will be created.
  91
  92         `directory`
  93             Directory containing the index.  Defaults to a location within the
  94             `pokedex` egg directory.
  95
  96         `session`
  97             Used for creating the index and retrieving objects.  Defaults to an
  98             attempt to connect to the default SQLite database installed by
  99             `pokedex setup`.
 100         """
 101
 102         # By the time this returns, self.index, self.speller, and self.session
 103         # must be set
 104
 105         # Defaults
 106         if not directory:
 107             directory = pkg_resources.resource_filename('pokedex',
 108                                                         'data/whoosh-index')
 109         self.directory = directory
 110
 111         if session:
 112             self.session = session
 113         else:
 114             self.session = connect()
 115
 116         # Attempt to open or create the index
 117         if not os.path.exists(directory) or not os.listdir(directory):
 118             # Directory doesn't exist OR is empty; caller needs to use
 119             # rebuild_index before doing anything.  Provide a dummy object that
 120             # complains when used
 121             self.index = UninitializedIndex()
 122             self.speller = UninitializedIndex()
 123             return
 124
 125         # Otherwise, already exists; should be an index!  Bam, done.
 126         # Note that this will explode if the directory exists but doesn't
 127         # contain an index; that's a feature
 128         try:
 129             self.index = whoosh.index.open_dir(directory, indexname='MAIN')
 130         except whoosh.index.EmptyIndexError:
 131             raise IOError(
 132                 "The index directory already contains files.  "
 133                 "Please use a dedicated directory for the lookup index."
 134             )
 135
 136         # Create speller, and done
 137         spell_store = whoosh.filedb.filestore.FileStorage(directory)
 138         self.speller = whoosh.spelling.SpellChecker(spell_store)
 139
 140
 141     def rebuild_index(self):
 142         """Creates the index from scratch."""
 143
 144         schema = whoosh.fields.Schema(
 145             name=whoosh.fields.ID(stored=True),
 146             table=whoosh.fields.ID(stored=True),
 147             row_id=whoosh.fields.ID(stored=True),
 148             language=whoosh.fields.STORED,
 149             iso3166=whoosh.fields.STORED,
 150             display_name=whoosh.fields.STORED,  # non-lowercased name
 151         )
 152
 153         if not os.path.exists(self.directory):
 154             os.mkdir(self.directory)
 155
 156         self.index = whoosh.index.create_in(self.directory, schema=schema,
 157                                                             indexname='MAIN')
 158         writer = self.index.writer()
 159
 160         # Index every name in all our tables of interest
 161         # speller_entries becomes a list of (word, score) tuples; the score is
 162         # 2 for English names, 1.5 for Roomaji, and 1 for everything else.  I
 163         # think this biases the results in the direction most people expect,
 164         # especially when e.g. German names are very similar to English names
 165         speller_entries = []
 166         for cls in self.indexed_tables.values():
 167             q = self.session.query(cls)
 168
 169             for row in q.yield_per(5):
 170                 row_key = dict(table=unicode(cls.__tablename__),
 171                                row_id=unicode(row.id))
 172
 173                 def add(name, language, iso3166, score):
 174                     normalized_name = self.normalize_name(name)
 175
 176                     writer.add_document(
 177                         name=normalized_name, display_name=name,
 178                         language=language, iso3166=iso3166,
 179                         **row_key
 180                     )
 181
 182                     speller_entries.append((normalized_name, score))
 183
 184
 185                 # Add the basic English name to the index
 186                 if cls == tables.Pokemon:
 187                     # Pokémon need their form name added
 188                     # XXX kinda kludgy
 189                     add(row.full_name, None, u'us', 1)
 190
 191                     # If this is a default form, ALSO add the unadorned name,
 192                     # so 'Deoxys' alone will still do the right thing
 193                     if row.forme_name and not row.forme_base_pokemon_id:
 194                         add(row.name, None, u'us', 1)
 195                 else:
 196                     add(row.name, None, u'us', 1)
 197
 198                 # Some things also have other languages' names
 199                 # XXX other language form names..?
 200                 for foreign_name in getattr(row, 'foreign_names', []):
 201                     moonspeak = foreign_name.name
 202                     if row.name == moonspeak:
 203                         # Don't add the English name again as a different
 204                         # language; no point and it makes spell results
 205                         # confusing
 206                         continue
 207
 208                     add(moonspeak, foreign_name.language.name,
 209                                    foreign_name.language.iso3166,
 210                                    3)
 211
 212                     # Add Roomaji too
 213                     if foreign_name.language.name == 'Japanese':
 214                         roomaji = romanize(foreign_name.name)
 215                         add(roomaji, u'Roomaji', u'jp', 8)
 216
 217         writer.commit()
 218
 219         # Construct and populate a spell-checker index.  Quicker to do it all
 220         # at once, as every call to add_* does a commit(), and those seem to be
 221         # expensive
 222         self.speller = whoosh.spelling.SpellChecker(self.index.storage)
 223         self.speller.add_scored_words(speller_entries)
 224
 225
 226     def normalize_name(self, name):
 227         """Strips irrelevant formatting junk from name input.
 228
 229         Specifically: everything is lowercased, and accents are removed.
 230         """
 231         # http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
 232         # Makes sense to me.  Decompose by Unicode rules, then remove combining
 233         # characters, then recombine.  I'm explicitly doing it this way instead
 234         # of testing combining() because Korean characters apparently
 235         # decompose!  But the results are considered letters, not combining
 236         # characters, so testing for Mn works well, and combining them again
 237         # makes them look right.
 238         nkfd_form = unicodedata.normalize('NFKD', unicode(name))
 239         name = u"".join(c for c in nkfd_form
 240                         if unicodedata.category(c) != 'Mn')
 241         name = unicodedata.normalize('NFC', name)
 242
 243         name = name.strip()
 244         name = name.lower()
 245
 246         return name
 247
 248
 249     def _apply_valid_types(self, name, valid_types):
 250         """Combines the enforced `valid_types` with any from the search string
 251         itself and updates the query.
 252
 253         For example, a name of 'a,b:foo' and valid_types of b,c will search for
 254         only `b`s named "foo".
 255
 256         Returns `(name, merged_valid_types, term)`, where `name` has had any type
 257         prefix stripped, `merged_valid_types` combines the original
 258         `valid_types` with the type prefix, and `term` is a query term for
 259         limited to just the allowed types.  If there are no type restrictions
 260         at all, `term` will be None.
 261         """
 262
 263         # Remove any type prefix (pokemon:133) first
 264         user_valid_types = []
 265         if ':' in name:
 266             prefix_chunk, name = name.split(':', 1)
 267             name = name.strip()
 268
 269             prefixes = prefix_chunk.split(',')
 270             user_valid_types = [_.strip() for _ in prefixes]
 271
 272         # Merge the valid types together.  Only types that appear in BOTH lists
 273         # may be used.
 274         # As a special case, if the user asked for types that are explicitly
 275         # forbidden, completely ignore what the user requested
 276         combined_valid_types = []
 277         if user_valid_types and valid_types:
 278             combined_valid_types = list(
 279                 set(user_valid_types) & set(combined_valid_types)
 280             )
 281
 282             if not combined_valid_types:
 283                 # No overlap!  Just use the enforced ones
 284                 combined_valid_types = valid_types
 285         else:
 286             # One list or the other was blank, so just use the one that isn't
 287             combined_valid_types = valid_types + user_valid_types
 288
 289         if not combined_valid_types:
 290             # No restrictions
 291             return name, [], None
 292
 293         # Construct the term
 294         type_terms = []
 295         final_valid_types = []
 296         for valid_type in combined_valid_types:
 297             table_name = self._parse_table_name(valid_type)
 298
 299             # Quietly ignore bogus valid_types; more likely to DTRT
 300             if table_name:
 301                 final_valid_types.append(valid_type)
 302                 type_terms.append(whoosh.query.Term(u'table', table_name))
 303
 304         return name, final_valid_types, whoosh.query.Or(type_terms)
 305
 306
 307     def _parse_table_name(self, name):
 308         """Takes a singular table name, table name, or table object and returns
 309         the table name.
 310
 311         Returns None for a bogus name.
 312         """
 313         # Table object
 314         if hasattr(name, '__tablename__'):
 315             return getattr(name, '__tablename__')
 316
 317         # Table name
 318         for table in self.indexed_tables.values():
 319             if name in (table.__tablename__, table.__singlename__):
 320                 return table.__tablename__
 321
 322         # Bogus.  Be nice and return dummy
 323         return None
 324
 325     def _whoosh_records_to_results(self, records, exact=True):
 326         """Converts a list of whoosh's indexed records to LookupResult tuples
 327         containing database objects.
 328         """
 329         # XXX this 'exact' thing is getting kinda leaky.  would like a better
 330         # way to handle it, since only lookup() cares about fuzzy results
 331         seen = {}
 332         results = []
 333         for record in records:
 334             # Skip dupes
 335             seen_key = record['table'], record['row_id']
 336             if seen_key in seen:
 337                 continue
 338             seen[seen_key] = True
 339
 340             cls = self.indexed_tables[record['table']]
 341             obj = self.session.query(cls).get(record['row_id'])
 342
 343             results.append(LookupResult(object=obj,
 344                                         indexed_name=record['name'],
 345                                         name=record['display_name'],
 346                                         language=record['language'],
 347                                         iso3166=record['iso3166'],
 348                                         exact=exact))
 349
 350         return results
 351
 352
 353     def lookup(self, input, valid_types=[], exact_only=False):
 354         """Attempts to find some sort of object, given a name.
 355
 356         Returns a list of named (object, name, language, iso3166, exact)
 357         tuples.  `object` is a database object, `name` is the name under which
 358         the object was found, `language` and `iso3166` are the name and country
 359         code of the language in which the name was found, and `exact` is True
 360         iff this was an
 361         exact match.
 362
 363         This function currently ONLY does fuzzy matching if there are no exact
 364         matches.
 365
 366         Formes are not returned unless requested; "Shaymin" will return only
 367         grass Shaymin.
 368
 369         Extraneous whitespace is removed with extreme prejudice.
 370
 371         Recognizes:
 372         - Names: "Eevee", "Surf", "Run Away", "Payapa Berry", etc.
 373         - Foreign names: "Iibui", "Eivui"
 374         - Fuzzy names in whatever language: "Evee", "Ibui"
 375         - IDs: "133", "192", "250"
 376         Also:
 377         - Type restrictions.  "type:psychic" will only return the type.  This
 378           is how to make ID lookup useful.  Multiple type specs can be entered
 379           with commas, as "move,item:1".  If `valid_types` are provided, any
 380           type prefix will be ignored.
 381         - Alternate formes can be specified merely like "wash rotom".
 382
 383         `input`
 384             Name of the thing to look for.
 385
 386         `valid_types`
 387             A list of table objects or names, e.g., `['pokemon', 'moves']`.  If
 388             this is provided, only results in one of the given tables will be
 389             returned.
 390
 391         `exact_only`
 392             If True, only exact matches are returned.  If set to False (the
 393             default), and the provided `name` doesn't match anything exactly,
 394             spelling correction will be attempted.
 395         """
 396
 397         name = self.normalize_name(input)
 398         exact = True
 399         form = None
 400
 401         # Pop off any type prefix and merge with valid_types
 402         name, merged_valid_types, type_term = \
 403             self._apply_valid_types(name, valid_types)
 404
 405         # Random lookup
 406         if name == 'random':
 407             return self.random_lookup(valid_types=merged_valid_types)
 408
 409         # Do different things depending what the query looks like
 410         # Note: Term objects do an exact match, so we don't have to worry about
 411         # a query parser tripping on weird characters in the input
 412         try:
 413             # Let Python try to convert to a number, so 0xff works
 414             name_as_number = int(name, base=0)
 415         except ValueError:
 416             # Oh well
 417             name_as_number = None
 418
 419         if '*' in name or '?' in name:
 420             exact_only = True
 421             query = whoosh.query.Wildcard(u'name', name)
 422         elif name_as_number is not None:
 423             # Don't spell-check numbers!
 424             exact_only = True
 425             query = whoosh.query.Term(u'row_id', unicode(name_as_number))
 426         else:
 427             # Not an integer
 428             query = whoosh.query.Term(u'name', name)
 429
 430         if type_term:
 431             query = query & type_term
 432
 433
 434         ### Actual searching
 435         searcher = self.index.searcher()
 436         # XXX is this kosher?  docs say search() takes a weighting arg, but it
 437         # certainly does not
 438         searcher.weighting = LanguageWeighting()
 439         results = searcher.search(query,
 440                                   limit=self.INTERMEDIATE_LOOKUP_RESULTS)
 441
 442         # Look for some fuzzy matches if necessary
 443         if not exact_only and not results:
 444             exact = False
 445             results = []
 446
 447             for suggestion in self.speller.suggest(
 448                 name, self.INTERMEDIATE_LOOKUP_RESULTS):
 449
 450                 query = whoosh.query.Term('name', suggestion)
 451                 results.extend(searcher.search(query))
 452
 453         ### Convert results to db objects
 454         objects = self._whoosh_records_to_results(results, exact=exact)
 455
 456         # Only return up to 10 matches; beyond that, something is wrong.  We
 457         # strip out duplicate entries above, so it's remotely possible that we
 458         # should have more than 10 here and lost a few.  The speller returns 25
 459         # to give us some padding, and should avoid that problem.  Not a big
 460         # deal if we lose the 25th-most-likely match anyway.
 461         return objects[:self.MAX_LOOKUP_RESULTS]
 462
 463
 464     def random_lookup(self, valid_types=[]):
 465         """Returns a random lookup result from one of the provided
 466         `valid_types`.
 467         """
 468
 469         tables = []
 470         for valid_type in valid_types:
 471             table_name = self._parse_table_name(valid_type)
 472             if table_name:
 473                 tables.append(self.indexed_tables[table_name])
 474
 475         if not tables:
 476             # n.b.: It's possible we got a list of valid_types and none of them
 477             # were valid, but this function is guaranteed to return
 478             # *something*, so it politely selects from the entire index isntead
 479             tables = self.indexed_tables.values()
 480
 481         # Rather than create an array of many hundred items and pick randomly
 482         # from it, just pick a number up to the total number of potential
 483         # items, then pick randomly from that, and partition the whole range
 484         # into chunks.  This also avoids the slight problem that the index
 485         # contains more rows (for languages) for some items than others.
 486         # XXX ought to cache this (in the index?) if possible
 487         total = 0
 488         partitions = []
 489         for table in tables:
 490             count = self.session.query(table).count()
 491             total += count
 492             partitions.append((table, count))
 493
 494         n = random.randint(1, total)
 495         while n > partitions[0][1]:
 496             n -= partitions[0][1]
 497             partitions.pop(0)
 498
 499         return self.lookup(unicode(n), valid_types=[ partitions[0][0] ])
 500
 501     def prefix_lookup(self, prefix, valid_types=[]):
 502         """Returns terms starting with the given exact prefix.
 503
 504         Type prefixes are recognized, but no other name munging is done.
 505         """
 506
 507         # Pop off any type prefix and merge with valid_types
 508         prefix, merged_valid_types, type_term = \
 509             self._apply_valid_types(prefix, valid_types)
 510
 511         query = whoosh.query.Prefix(u'name', self.normalize_name(prefix))
 512
 513         if type_term:
 514             query = query & type_term
 515
 516         searcher = self.index.searcher()
 517         searcher.weighting = LanguageWeighting()
 518         results = searcher.search(query)  # XXX , limit=self.MAX_LOOKUP_RESULTS)
 519
 520         return self._whoosh_records_to_results(results)