Pokédex lookup now uses a whoosh index and spell-checker. #15
[zzz-pokedex.git] / pokedex / lookup.py
1 # encoding: utf8
2 import re
3
4 from sqlalchemy.sql import func
5 import whoosh
6 from whoosh.qparser import QueryParser
7 import whoosh.spelling
8
9 import pokedex.db.tables as tables
10
# Maps each indexed table's name to its mapped class.
# Index documents only store the table name, so this mapping is how a search
# result gets turned back into the class it came from.
indexed_tables = dict(
    (table_class.__tablename__, table_class)
    for table_class in (
        tables.Pokemon,
    )
)
19
# Module-level cache for the pieces built by get_index(); it stays empty until
# the first successful build.
index_bits = {}
def get_index(session):
    """Returns (index, speller).

    Creates an index if one does not exist.

    `session` is a database session; it is only queried when the index has
    to be built.  The index is kept in RAM storage and gets one document per
    row of every class in `indexed_tables` (base formes only, for classes
    that have a `forme_base_pokemon_id` attribute).  Once built, the index,
    speller and store are cached in `index_bits` and reused by later calls.
    """

    # Already built by an earlier call
    if index_bits:
        return index_bits['index'], index_bits['speller']

    store = whoosh.store.RamStorage()
    schema = whoosh.fields.Schema(
        name=whoosh.fields.ID(stored=True),
        spelling_name=whoosh.fields.ID(stored=True),
        table=whoosh.fields.STORED,
        row_id=whoosh.fields.STORED,
        # NOTE(review): declared here, but add_document() below never sets it
        language_id=whoosh.fields.STORED,
    )

    index = whoosh.index.Index(store, schema=schema, create=True)
    writer = index.writer()

    # Index every name in all our tables of interest
    for cls in indexed_tables.values():
        q = session.query(cls)

        # Only index base Pokémon formes
        if hasattr(cls, 'forme_base_pokemon_id'):
            q = q.filter_by(forme_base_pokemon_id=None)

        for row in q.yield_per(5):
            name = row.name.lower()
            # The spell-checker only deals in [a-z]+ words, so keep a copy of
            # the name with everything else stripped out
            spelling_name = re.sub('[^a-z]', '', name)
            writer.add_document(name=name,
                                spelling_name=spelling_name,
                                table=cls.__tablename__,
                                row_id=row.id)

    writer.commit()

    ### Construct a spell-checker index
    speller = whoosh.spelling.SpellChecker(index.storage)

    # Can't use speller.add_field because it tries to intuit a frequency, and
    # names are in an ID field, which seems to be immune to frequency.
    # Not hard to add everything ourselves, though
    reader = index.doc_reader()
    try:
        speller.add_words([doc['spelling_name'] for doc in reader])
    finally:
        # Release the reader even if feeding the speller blows up
        reader.close()

    # Only cache once everything has been built successfully, so a failed
    # build is retried from scratch on the next call
    index_bits['index'] = index
    index_bits['speller'] = speller
    index_bits['store'] = store
    return index_bits['index'], index_bits['speller']
74
def lookup(session, name, exact_only=False):
    """Attempts to find some sort of object, given a database session and name.

    Returns (objects, exact) where `objects` is a list of database objects, and
    `exact` is True iff the given name matched the returned objects exactly.

    This function ONLY does fuzzy matching if there are no exact matches.

    If `exact_only` is true, only the exact-name match is attempted.  `exact`
    is only ever cleared by the fuzzy step, so an `exact_only` lookup that
    finds nothing returns ([], True).

    Formes are not returned; "Shaymin" will return only grass Shaymin.

    Currently recognizes:
    - Pokémon names: "Eevee"
    """

    exact = True

    # Alas!  We have to make three attempts to find anything with this index.
    # First: Try an exact match for a name in the index.
    # Second: Try an exact match for a stripped-down name in the index.
    # Third: Get spelling suggestions.
    # The spelling module apparently only indexes *words* -- that is, [a-z]+.
    # So we have a separate field that contains the same name, stripped down to
    # just [a-z]+.
    # Unfortunately, exact matches aren't returned as spelling suggestions, so
    # we also have to do a regular index match against this separate field.
    # Otherwise, 'nidoran' will never match anything
    index, speller = get_index(session)

    # Everything in the index is lowercased, so the query has to be as well
    lowered_name = name.lower()

    # Look for exact name
    parser = QueryParser('name', schema=index.schema)
    results = index.find(lowered_name, parser=parser)

    if not exact_only and not results:
        # Reduce the query the same way get_index() reduces spelling_name;
        # otherwise something like "Mr. Mime" can never match "mrmime", and
        # the speller (which only knows lowercase a-z words) gets asked about
        # a string it cannot possibly contain
        spelling_name = re.sub('[^a-z]', '', lowered_name)
        parser = QueryParser('spelling_name', schema=index.schema)

        # Look for a match with a reduced a-z name.  Nothing left after
        # reducing means there is nothing sensible to ask the index for
        if spelling_name:
            results = index.find(spelling_name, parser=parser)

        # Look for some fuzzy matches
        if not results:
            results = []
            exact = False

            if spelling_name:
                for suggestion in speller.suggest(spelling_name, 3):
                    results.extend(index.find(suggestion, parser=parser))

    # Convert results to db objects
    objects = []
    seen = set()
    for result in results:
        # Skip dupe results; several suggestions can lead to the same row
        seen_key = result['table'], result['row_id']
        if seen_key in seen:
            continue
        seen.add(seen_key)

        # The index only stores the table name, so map it back to the class
        cls = indexed_tables[result['table']]
        obj = session.query(cls).get(result['row_id'])
        objects.append(obj)

    return objects, exact