pokedex/lookup.py

   1 # encoding: utf8
   2 import os, os.path
   3 import re
   4
   5 from sqlalchemy.sql import func
   6 import whoosh
   7 import whoosh.filedb.filestore
   8 import whoosh.filedb.fileindex
   9 import whoosh.index
  10 from whoosh.qparser import QueryParser
  11 import whoosh.spelling
  12
  13 import pokedex.db.tables as tables
  14
  15 # Dictionary of table name => table class.
  16 # Need the table name so we can get the class from the table name after we
  17 # retrieve something from the index
  18 indexed_tables = {}
  19 for cls in [
  20         tables.Ability,
  21         tables.Item,
  22         tables.Move,
  23         tables.Pokemon,
  24         tables.Type,
  25     ]:
  26     indexed_tables[cls.__tablename__] = cls
  27
  28 # Dictionary of extra keys to file types of objects under, e.g. Pokémon can
  29 # also be looked up purely by number
  30 extra_keys = {
  31     tables.Move: [
  32         lambda row: u"move %d" % row.id,
  33     ],
  34     tables.Pokemon: [
  35         lambda row: unicode(row.id),
  36     ],
  37 }
  38
  39 index_bits = {}
  40 def get_index(session):
  41     """Returns (index, speller).
  42
  43     Creates an index if one does not exist.
  44     """
  45
  46     if index_bits:
  47         return index_bits['index'], index_bits['speller']
  48
  49     store = whoosh.filedb.filestore.RamStorage()
  50     schema = whoosh.fields.Schema(
  51         name=whoosh.fields.ID(stored=True),
  52         table=whoosh.fields.STORED,
  53         row_id=whoosh.fields.STORED,
  54         language=whoosh.fields.STORED,
  55
  56         # Whoosh 0.2 explodes when using a file-stored schema with no TEXT
  57         # columns.  Appease it
  58         dummy=whoosh.fields.TEXT,
  59     )
  60
  61     index_directory = '/var/tmp/pokedex'
  62     if not os.path.exists(index_directory):
  63         os.mkdir(index_directory)
  64     index = whoosh.index.create_in(index_directory, schema=schema)
  65     writer = index.writer()
  66
  67     # Index every name in all our tables of interest
  68     speller_entries = []
  69     for cls in indexed_tables.values():
  70         q = session.query(cls)
  71
  72         # Only index base Pokémon formes
  73         if hasattr(cls, 'forme_base_pokemon_id'):
  74             q = q.filter_by(forme_base_pokemon_id=None)
  75
  76         for row in q.yield_per(5):
  77             row_key = dict(table=cls.__tablename__, row_id=row.id)
  78
  79             # Spelling index only indexes strings of letters, alas, so we
  80             # reduce every name to this to make the index work.  However, exact
  81             # matches are not returned, so e.g. 'nidoran' would neither match
  82             # exactly nor fuzzy-match.  Solution: add the spelling-munged name
  83             # as a regular index row too.
  84             name = row.name.lower()
  85             writer.add_document(name=name, **row_key)
  86
  87             speller_entries.append(name)
  88
  89             for extra_key_func in extra_keys.get(cls, []):
  90                 extra_key = extra_key_func(row)
  91                 writer.add_document(name=extra_key, **row_key)
  92
  93     writer.commit()
  94
  95     # XXX GIHWEGREHKG
  96     old__schema = whoosh.spelling.SpellChecker._schema
  97     def new__schema(self):
  98         schema = old__schema(self)
  99         schema.add('dummy', whoosh.fields.TEXT)
 100         return schema
 101     whoosh.spelling.SpellChecker._schema = new__schema
 102
 103     # Construct and populate a spell-checker index.  Quicker to do it all
 104     # at once, as every call to add_* does a commit(), and those seem to be
 105     # expensive
 106     speller = whoosh.spelling.SpellChecker(index.storage)
 107     # WARNING: HERE BE DRAGONS
 108     # whoosh.spelling refuses to index things that don't look like words.
 109     # Unfortunately, this doesn't work so well for Pokémon (Mr. Mime,
 110     # Porygon-Z, etc.), and attempts to work around it lead to further
 111     # complications.
 112     # The below is copied from SpellChecker.add_scored_words without the check
 113     # for isalpha().  XXX get whoosh patched to make this unnecessary!
 114     writer = speller.index(create=True).writer()
 115     for word in speller_entries:
 116         fields = {"word": word, "score": 1}
 117         for size in xrange(speller.mingram, speller.maxgram + 1):
 118             nga = whoosh.analysis.NgramAnalyzer(size)
 119             gramlist = [t.text for t in nga(word)]
 120             if len(gramlist) > 0:
 121                 fields["start%s" % size] = gramlist[0]
 122                 fields["end%s" % size] = gramlist[-1]
 123                 fields["gram%s" % size] = " ".join(gramlist)
 124         writer.add_document(**fields)
 125     writer.commit()
 126     # end copy-pasta
 127
 128     index_bits['index'] = index
 129     index_bits['speller'] = speller
 130     index_bits['store'] = store
 131     return index_bits['index'], index_bits['speller']
 132
 133 def lookup(session, name, exact_only=False):
 134     """Attempts to find some sort of object, given a database session and name.
 135
 136     Returns (objects, exact) where `objects` is a list of database objects, and
 137     `exact` is True iff the given name matched the returned objects exactly.
 138
 139     This function ONLY does fuzzy matching if there are no exact matches.
 140
 141     Formes are not returned; "Shaymin" will return only grass Shaymin.
 142
 143     Currently recognizes:
 144     - Pokémon names: "Eevee"
 145     """
 146
 147     exact = True
 148
 149     index, speller = get_index(session)
 150
 151     # Look for exact name.  A Term object does an exact match, so we don't have
 152     # to worry about a query parser tripping on weird characters in the input
 153     searcher = index.searcher()
 154     query = whoosh.query.Term('name', name.lower())
 155     results = searcher.search(query)
 156
 157     if not exact_only:
 158         # Look for some fuzzy matches
 159         if not results:
 160             exact = False
 161             results = []
 162
 163             for suggestion in speller.suggest(name, 3):
 164                 query = whoosh.query.Term('name', suggestion)
 165                 results.extend(searcher.search(query))
 166
 167     # Convert results to db objects
 168     objects = []
 169     seen = {}
 170     for result in results:
 171         # Skip dupe results
 172         seen_key = result['table'], result['row_id']
 173         if seen_key in seen:
 174             continue
 175         seen[seen_key] = True
 176
 177         cls = indexed_tables[result['table']]
 178         obj = session.query(cls).get(result['row_id'])
 179         objects.append(obj)
 180
 181     return objects, exact