open_index was returning an index instead of a SpellChecker. #15
[zzz-pokedex.git] / pokedex / lookup.py
1 # encoding: utf8
2 import os, os.path
3 import pkg_resources
4 import re
5
6 from sqlalchemy.sql import func
7 import whoosh
8 import whoosh.filedb.filestore
9 import whoosh.filedb.fileindex
10 import whoosh.index
11 from whoosh.qparser import QueryParser
12 import whoosh.spelling
13
14 from pokedex.db import connect
15 import pokedex.db.tables as tables
16
# Maps each indexed table's name to its mapped class.  The whoosh index only
# stores the table name alongside each entry, so this mapping is how a search
# hit is turned back into the class to query.
_indexed_classes = (
    tables.Ability,
    tables.Item,
    tables.Move,
    tables.Pokemon,
    tables.Type,
)
indexed_tables = dict(
    (table_cls.__tablename__, table_cls) for table_cls in _indexed_classes
)
29
# Extra index keys, per table class.  Each entry is a list of functions that
# take a row and return an additional key under which that row is filed in
# the index, e.g. a Pokémon can also be looked up purely by its number.
def _move_number_key(row):
    # Moves are additionally filed as "move <id>".
    return u"move %d" % row.id


def _pokemon_number_key(row):
    # Pokémon are additionally filed under their bare id.
    return unicode(row.id)


extra_keys = {
    tables.Move: [
        _move_number_key,
    ],
    tables.Pokemon: [
        _pokemon_number_key,
    ],
}
40
def open_index(directory=None, session=None, recreate=False):
    """Opens the whoosh index stored in the named directory and returns
    (index, speller).  If the index doesn't already exist, it will be created.

    `directory`
        Directory containing the index.  Defaults to a location within the
        `pokedex` egg directory.

    `session`
        Database session used to read the rows to index.  It is only needed
        when the index has to be (re)built; in that case, if no session is
        given, `connect()` is called to obtain one.  When an existing index
        is simply opened, no database connection is made.

    `recreate`
        If set to True, the whoosh index will be created even if it already
        exists.

    Returns a tuple of the whoosh index named 'pokedex' and a
    `whoosh.spelling.SpellChecker` backed by the 'spelling' index stored in
    the same directory.
    """

    # Defaults
    if not directory:
        directory = pkg_resources.resource_filename('pokedex',
                                                    'data/whoosh_index')

    # Attempt to open an existing index
    directory_exists = os.path.exists(directory)
    if directory_exists and not recreate:
        # Already exists; should be an index!
        try:
            index = whoosh.index.open_dir(directory, indexname='pokedex')
            spell_store = whoosh.filedb.filestore.FileStorage(directory)
            speller = whoosh.spelling.SpellChecker(spell_store,
                                                   indexname='spelling')
            return index, speller
        except whoosh.index.EmptyIndexError:
            # Apparently not a real index.  Fall out of the if and create it
            pass

    # From here on the index is being built, which is the only path that
    # reads from the database -- so only connect now.  Done before touching
    # the filesystem so a failed connection doesn't leave an empty directory.
    if not session:
        session = connect()

    if not directory_exists:
        os.mkdir(directory)

    # Create index
    schema = whoosh.fields.Schema(
        name=whoosh.fields.ID(stored=True),
        table=whoosh.fields.STORED,
        row_id=whoosh.fields.STORED,
        language=whoosh.fields.STORED,
    )

    index = whoosh.index.create_in(directory, schema=schema,
                                   indexname='pokedex')
    writer = index.writer()

    # Index every name in all our tables of interest
    speller_entries = []
    for cls in indexed_tables.values():
        q = session.query(cls)

        # Only index base Pokémon formes
        if hasattr(cls, 'forme_base_pokemon_id'):
            q = q.filter_by(forme_base_pokemon_id=None)

        for row in q.yield_per(5):
            row_key = dict(table=cls.__tablename__, row_id=row.id)

            # The lowercased name is filed both as a regular index row (so
            # exact lookups find it) and as a spell-checker entry (so
            # misspellings can be corrected to it).
            name = row.name.lower()
            writer.add_document(name=name, **row_key)

            speller_entries.append(name)

            # Extra keys are only exact-match aliases; they are deliberately
            # not fed to the spell-checker.
            for extra_key_func in extra_keys.get(cls, []):
                extra_key = extra_key_func(row)
                writer.add_document(name=extra_key, **row_key)

    writer.commit()

    # Construct and populate a spell-checker index.  Quicker to do it all
    # at once, as every call to add_* does a commit(), and those seem to be
    # expensive
    speller = whoosh.spelling.SpellChecker(index.storage, indexname='spelling')
    speller.add_words(speller_entries)

    return index, speller
132
133
def lookup(name, session=None, indices=None, exact_only=False):
    """Attempts to find some sort of object, given a database session and name.

    Returns (objects, exact) where `objects` is a list of database objects, and
    `exact` is True iff the given name matched the returned objects exactly.

    This function ONLY does fuzzy matching if there are no exact matches.

    Formes are not returned; "Shaymin" will return only grass Shaymin.

    Matching is case-insensitive.  Anything filed in the index can be found:
    the names of rows from every table in `indexed_tables`, plus the
    additional keys produced by `extra_keys`.

    `name`
        Name of the thing to look for.

    `session`
        A database session to use for retrieving objects.  As with
        `open_index()`, if this is not provided, a connection to the default
        database will be attempted.

    `indices`
        Tuple of index, speller as returned from `open_index()`.  Defaults to
        a call to `open_index()`.

    `exact_only`
        If True, only exact matches are returned.  If set to False (the
        default), and the provided `name` doesn't match anything exactly,
        spelling correction will be attempted.
    """

    if not session:
        session = connect()

    if indices:
        index, speller = indices
    else:
        # Reuse our session rather than having open_index() make its own
        index, speller = open_index(session=session)

    # Everything in the index and the spell-checker is stored lowercased, so
    # normalize once and use the same form for both exact and fuzzy matching
    name = name.lower()

    exact = True

    # Look for exact name.  A Term object does an exact match, so we don't have
    # to worry about a query parser tripping on weird characters in the input
    searcher = index.searcher()
    query = whoosh.query.Term('name', name)
    results = searcher.search(query)

    if not results and not exact_only:
        # Nothing matched exactly; look for some fuzzy matches
        exact = False
        results = []

        for suggestion in speller.suggest(name, 3):
            query = whoosh.query.Term('name', suggestion)
            results.extend(searcher.search(query))

    # Convert results to db objects
    objects = []
    seen = set()
    for result in results:
        # Skip dupe results; the same row can be filed under several keys
        seen_key = result['table'], result['row_id']
        if seen_key in seen:
            continue
        seen.add(seen_key)

        cls = indexed_tables[result['table']]
        obj = session.query(cls).get(result['row_id'])
        objects.append(obj)

    return objects, exact
204
205 return objects, exact