Added setup command and made lookup work sanely. #15
[zzz-pokedex.git] / pokedex / lookup.py
1 # encoding: utf8
2 import os, os.path
3 import pkg_resources
4 import re
5
6 from sqlalchemy.sql import func
7 import whoosh
8 import whoosh.filedb.filestore
9 import whoosh.filedb.fileindex
10 import whoosh.index
11 from whoosh.qparser import QueryParser
12 import whoosh.spelling
13
14 from pokedex.db import connect
15 import pokedex.db.tables as tables
16
# Maps table name => table class for every table whose names are indexed.
# The table name is what lets us get back to the class after we retrieve
# something from the index.
indexed_tables = dict(
    (table_class.__tablename__, table_class)
    for table_class in (
        tables.Ability,
        tables.Item,
        tables.Move,
        tables.Pokemon,
        tables.Type,
    )
)
29
# Extra keys under which objects of a given table class are also filed, in
# addition to their names; e.g. Pokémon can also be looked up purely by
# number.  Each value is a list of functions that take a row and return the
# additional key for it.
extra_keys = {}
extra_keys[tables.Move] = [
    lambda row: u"move %d" % row.id,
]
extra_keys[tables.Pokemon] = [
    lambda row: unicode(row.id),
]
40
def open_index(directory=None, session=None, recreate=False):
    """Opens the whoosh index stored in the named directory and returns
    (index, speller).  If the index doesn't already exist, it will be created.

    `index` is the whoosh index named 'pokedex'.  `speller` is always a
    `whoosh.spelling.SpellChecker` over the index named 'spelling' in the same
    storage, whether the index was freshly built or merely opened.

    `directory`
        Directory containing the index.  Defaults to a location within the
        `pokedex` egg directory.

    `session`
        If the index needs to be created, this database session will be used.
        Defaults to an attempt to connect to the default SQLite database
        installed by `pokedex setup`.  No connection is attempted when an
        existing index is simply opened.

    `recreate`
        If set to True, the whoosh index will be created even if it already
        exists.
    """

    # Defaults
    if not directory:
        directory = pkg_resources.resource_filename('pokedex',
                                                    'data/whoosh_index')

    # Attempt to open an existing index
    directory_exists = os.path.exists(directory)
    if directory_exists and not recreate:
        # Already exists; should be an index!
        try:
            index = whoosh.index.open_dir(directory, indexname='pokedex')
            # Probe the spelling index as well, so that a missing one raises
            # EmptyIndexError here and everything gets rebuilt below
            whoosh.index.open_dir(directory, indexname='spelling')
        except whoosh.index.EmptyIndexError:
            # Apparently not a real index.  Fall out of the if and create it
            pass
        else:
            # Hand back a real SpellChecker, same as the creation path does;
            # callers rely on SpellChecker methods such as suggest()
            speller = whoosh.spelling.SpellChecker(index.storage,
                                                   indexname='spelling')
            return index, speller

    # Only now is a database session actually required
    if not session:
        session = connect()

    if not directory_exists:
        os.mkdir(directory)

    # Create index
    schema = whoosh.fields.Schema(
        name=whoosh.fields.ID(stored=True),
        table=whoosh.fields.STORED,
        row_id=whoosh.fields.STORED,
        language=whoosh.fields.STORED,

        # Whoosh 0.2 explodes when using a file-stored schema with no TEXT
        # columns.  Appease it
        dummy=whoosh.fields.TEXT,
    )

    index = whoosh.index.create_in(directory, schema=schema,
                                   indexname='pokedex')
    writer = index.writer()

    # Index every name in all our tables of interest
    speller_entries = []
    for cls in indexed_tables.values():
        q = session.query(cls)

        # Only index base Pokémon formes
        if hasattr(cls, 'forme_base_pokemon_id'):
            q = q.filter_by(forme_base_pokemon_id=None)

        for row in q.yield_per(5):
            row_key = dict(table=cls.__tablename__, row_id=row.id)

            # The lowercased name goes into the regular index as well as
            # the spelling index: the spell checker does not return exact
            # matches, so without a regular row e.g. 'nidoran' would
            # neither match exactly nor fuzzy-match.
            name = row.name.lower()
            writer.add_document(name=name, **row_key)

            speller_entries.append(name)

            # Also file the row under any alternative keys for its table
            for extra_key_func in extra_keys.get(cls, []):
                extra_key = extra_key_func(row)
                writer.add_document(name=extra_key, **row_key)

    writer.commit()

    # XXX Monkeypatch SpellChecker._schema so the spelling schema also gets a
    # TEXT column (see the 'dummy' field above).  Patch only once per
    # process: re-wrapping on every rebuild would stack the wrappers and try
    # to add 'dummy' to the same schema more than once.
    old__schema = whoosh.spelling.SpellChecker._schema
    if not getattr(old__schema, '_pokedex_patched', False):
        def new__schema(self):
            schema = old__schema(self)
            schema.add('dummy', whoosh.fields.TEXT)
            return schema
        new__schema._pokedex_patched = True
        whoosh.spelling.SpellChecker._schema = new__schema

    # Construct and populate a spell-checker index.  Quicker to do it all
    # at once, as every call to add_* does a commit(), and those seem to be
    # expensive
    speller = whoosh.spelling.SpellChecker(index.storage,
                                           indexname='spelling')
    # WARNING: HERE BE DRAGONS
    # whoosh.spelling refuses to index things that don't look like words.
    # Unfortunately, this doesn't work so well for Pokémon (Mr. Mime,
    # Porygon-Z, etc.), and attempts to work around it lead to further
    # complications.
    # The below is copied from SpellChecker.add_scored_words without the check
    # for isalpha().  XXX get whoosh patched to make this unnecessary!
    writer = speller.index(create=True).writer()
    for word in speller_entries:
        fields = {"word": word, "score": 1}
        for size in xrange(speller.mingram, speller.maxgram + 1):
            nga = whoosh.analysis.NgramAnalyzer(size)
            gramlist = [t.text for t in nga(word)]
            if gramlist:
                fields["start%s" % size] = gramlist[0]
                fields["end%s" % size] = gramlist[-1]
                fields["gram%s" % size] = " ".join(gramlist)
        writer.add_document(**fields)
    writer.commit()
    # end copy-pasta

    return index, speller
161
162
def lookup(name, session=None, exact_only=False):
    """Attempts to find some sort of object, given a database session and name.

    Returns (objects, exact) where `objects` is a list of database objects, and
    `exact` is True iff the given name matched the returned objects exactly.

    This function ONLY does fuzzy matching if there are no exact matches.

    Formes are not returned; "Shaymin" will return only grass Shaymin.

    Matching is case-insensitive: the name is lowercased before searching.

    Currently recognizes:
    - Names of abilities, items, moves, Pokémon, and types: "Eevee"
    - Pokémon by number: "133"
    - Moves by number, as "move N": "move 33"

    `name`
        Name of the thing to look for.

    `session`
        A database session to use for retrieving objects.  As with
        open_index, if this is not provided, a connection to the default
        database will be attempted.

    `exact_only`
        If True, only exact matches are returned.  If set to False (the
        default), and the provided `name` doesn't match anything exactly,
        spelling correction will be attempted.
    """

    if not session:
        session = connect()

    # Pass our session along so that, should the index need building, it is
    # built from the same database the objects are fetched from
    index, speller = open_index(session=session)

    # Everything in the index is stored lowercased
    normalized_name = name.lower()

    exact = True

    # Look for exact name.  A Term object does an exact match, so we don't have
    # to worry about a query parser tripping on weird characters in the input
    searcher = index.searcher()
    query = whoosh.query.Term('name', normalized_name)
    results = searcher.search(query)

    if not exact_only and not results:
        # Nothing matched exactly; look for some fuzzy matches instead
        exact = False
        results = []

        # Suggest against the lowercased name, since that is the form the
        # spelling entries were indexed in
        for suggestion in speller.suggest(normalized_name, 3):
            query = whoosh.query.Term('name', suggestion)
            results.extend(searcher.search(query))

    # Convert results to db objects
    objects = []
    seen = set()
    for result in results:
        # Skip dupe results; several index rows may point at the same object
        seen_key = result['table'], result['row_id']
        if seen_key in seen:
            continue
        seen.add(seen_key)

        cls = indexed_tables[result['table']]
        obj = session.query(cls).get(result['row_id'])
        objects.append(obj)

    return objects, exact