Allow lookup() callers to pass in their own whoosh indices.
[zzz-pokedex.git] / pokedex / lookup.py
1 # encoding: utf8
2 import os, os.path
3 import pkg_resources
4 import re
5
6 from sqlalchemy.sql import func
7 import whoosh
8 import whoosh.filedb.filestore
9 import whoosh.filedb.fileindex
10 import whoosh.index
11 from whoosh.qparser import QueryParser
12 import whoosh.spelling
13
14 from pokedex.db import connect
15 import pokedex.db.tables as tables
16
# Maps each indexed table's name to its table class.  Index documents only
# store the table name, so this mapping is how a search hit is turned back
# into the class to query.
indexed_tables = {}
for cls in (tables.Ability, tables.Item, tables.Move, tables.Pokemon,
            tables.Type):
    indexed_tables[cls.__tablename__] = cls
29
# Additional index keys, per table class.  Each callable takes a row and
# returns one more name the row should be findable under: Pokémon can also be
# looked up purely by number, and moves as "move <id>".
extra_keys = {
    tables.Pokemon: [lambda row: unicode(row.id)],
    tables.Move: [lambda row: u"move %d" % row.id],
}
40
def open_index(directory=None, session=None, recreate=False):
    """Opens the whoosh index stored in the named directory and returns (index,
    speller).  If the index doesn't already exist, it will be created.

    `index` is the whoosh index of names; `speller` is a
    `whoosh.spelling.SpellChecker` backed by the same directory.  Both code
    paths (opening and creating) return the same kinds of objects.

    `directory`
        Directory containing the index.  Defaults to a location within the
        `pokedex` egg directory.

    `session`
        If the index needs to be created, this database session will be used.
        Defaults to an attempt to connect to the default SQLite database
        installed by `pokedex setup`.  No connection is attempted when an
        existing index is simply opened.

    `recreate`
        If set to True, the whoosh index will be created even if it already
        exists.
    """

    # Defaults
    if not directory:
        directory = pkg_resources.resource_filename('pokedex',
                                                    'data/whoosh_index')

    # Attempt to open or create the index
    directory_exists = os.path.exists(directory)
    if directory_exists and not recreate:
        # Already exists; should be an index!
        try:
            index = whoosh.index.open_dir(directory, indexname='pokedex')

            # Opening the spelling index is only a sanity check: it raises
            # EmptyIndexError if the spelling data is missing, in which case
            # everything is rebuilt below.
            whoosh.index.open_dir(directory, indexname='spelling')

            # The speller handed back must be a SpellChecker, not a bare
            # index -- callers rely on speller.suggest().  Build it exactly
            # the way the creation path below does.
            speller = whoosh.spelling.SpellChecker(index.storage,
                                                   indexname='spelling')
            return index, speller
        except whoosh.index.EmptyIndexError:
            # Apparently not a real index.  Fall out of the if and create it
            pass

    if not directory_exists:
        os.mkdir(directory)

    # The database is only needed from here on, so don't connect any earlier
    if not session:
        session = connect()

    # Create index
    schema = whoosh.fields.Schema(
        name=whoosh.fields.ID(stored=True),
        table=whoosh.fields.STORED,
        row_id=whoosh.fields.STORED,
        language=whoosh.fields.STORED,
    )

    index = whoosh.index.create_in(directory, schema=schema,
                                   indexname='pokedex')
    writer = index.writer()

    # Index every name in all our tables of interest
    speller_entries = []
    for cls in indexed_tables.values():
        q = session.query(cls)

        # Only index base Pokémon formes
        if hasattr(cls, 'forme_base_pokemon_id'):
            q = q.filter_by(forme_base_pokemon_id=None)

        for row in q.yield_per(5):
            row_key = dict(table=cls.__tablename__, row_id=row.id)

            # The same lowercased name goes into both the main index and the
            # spelling index, so that a suggestion coming back from the
            # speller can be looked up verbatim as an exact term.
            name = row.name.lower()
            writer.add_document(name=name, **row_key)

            speller_entries.append(name)

            # Extra lookup keys (e.g. Pokémon number) are exact-match only;
            # they are deliberately not fed to the speller.
            for extra_key_func in extra_keys.get(cls, []):
                extra_key = extra_key_func(row)
                writer.add_document(name=extra_key, **row_key)

    writer.commit()

    # Construct and populate a spell-checker index.  Quicker to do it all
    # at once, as every call to add_* does a commit(), and those seem to be
    # expensive.
    # NOTE(review): when recreating over an existing directory, the words are
    # presumably added to the already-present spelling index rather than a
    # fresh one -- confirm against the whoosh version in use.
    speller = whoosh.spelling.SpellChecker(index.storage, indexname='spelling')
    speller.add_words(speller_entries)

    return index, speller
130
131
def lookup(name, session=None, indices=None, exact_only=False):
    """Attempts to find some sort of object, given a database session and name.

    Returns (objects, exact) where `objects` is a list of database objects, and
    `exact` is True iff the given name matched the returned objects exactly.

    This function ONLY does fuzzy matching if there are no exact matches.

    Formes are not returned; "Shaymin" will return only grass Shaymin.

    Matching is case-insensitive.  Currently recognizes:
    - Names of anything in `indexed_tables` (abilities, items, moves,
      Pokémon, types): "Eevee"
    - The extra keys defined in `extra_keys`: a bare Pokémon number such as
      "133", or "move 1"

    `name`
        Name of the thing to look for.

    `session`
        A database session to use for retrieving objects.  As with
        `open_index()`, if this is not provided, a connection to the default
        database will be attempted.

    `indices`
        Tuple of index, speller as returned from `open_index()`.  Defaults to
        a call to `open_index()`.

    `exact_only`
        If True, only exact matches are returned.  If set to False (the
        default), and the provided `name` doesn't match anything exactly,
        spelling correction will be attempted.
    """

    if not session:
        session = connect()

    if indices:
        index, speller = indices
    else:
        # Hand over our session so that, should the index need building, it
        # is built from the same database the objects are fetched from
        index, speller = open_index(session=session)

    # Everything in the index and in the speller is stored lowercased, so
    # normalize once and use the same form for exact and fuzzy matching
    name = name.lower()

    exact = True

    searcher = index.searcher()
    try:
        # Look for exact name.  A Term object does an exact match, so we don't
        # have to worry about a query parser tripping on weird characters in
        # the input
        results = searcher.search(whoosh.query.Term('name', name))

        # Look for some fuzzy matches, but only if nothing matched exactly
        if not results and not exact_only:
            exact = False
            results = []

            for suggestion in speller.suggest(name, 3):
                query = whoosh.query.Term('name', suggestion)
                results.extend(searcher.search(query))

        # Convert results to db objects.  This happens while the searcher is
        # still open, since hits are read from it.
        objects = []
        seen = set()
        for result in results:
            # Skip dupe results; several suggestions can lead to the same row
            seen_key = result['table'], result['row_id']
            if seen_key in seen:
                continue
            seen.add(seen_key)

            cls = indexed_tables[result['table']]
            obj = session.query(cls).get(result['row_id'])
            objects.append(obj)
    finally:
        # Release the reader held by the searcher
        searcher.close()

    return objects, exact