Improved CSV import speed by several orders of magnitude.
[zzz-pokedex.git] / pokedex / lookup.py
1 # encoding: utf8
2 import re
3
4 from sqlalchemy.sql import func
5 import whoosh
6 from whoosh.qparser import QueryParser
7 import whoosh.spelling
8
9 import pokedex.db.tables as tables
10
# Maps each indexed table's name to its mapped class.  Index documents only
# store the table name, so this is what turns a search hit back into the
# class to query.
indexed_tables = dict(
    (table_class.__tablename__, table_class)
    for table_class in (
        tables.Ability,
        tables.Item,
        tables.Move,
        tables.Pokemon,
        tables.Type,
    )
)
23
# Additional index keys per table class, on top of each row's name.  Every
# function listed here takes a row and returns one more string to index that
# row under; e.g. a Pokémon can also be looked up purely by its number.
def _move_number_key(row):
    return u"move %d" % row.id

def _pokemon_number_key(row):
    return unicode(row.id)

extra_keys = {
    tables.Move: [_move_number_key],
    tables.Pokemon: [_pokemon_number_key],
}
34
# Cache for the objects built by get_index(); empty until the first call.
index_bits = {}
def get_index(session):
    """Returns (index, speller).

    On the first call, builds an in-memory lookup index and a spell-checker
    from every row of the tables in `indexed_tables`, read through `session`,
    and caches them in the module-level `index_bits`.  Later calls return the
    cached objects and do not touch `session`.
    """

    if index_bits:
        return index_bits['index'], index_bits['speller']

    store = whoosh.store.RamStorage()
    # NOTE: language_id is declared in the schema but nothing below fills it
    # in.
    schema = whoosh.fields.Schema(
        name=whoosh.fields.ID(stored=True),
        table=whoosh.fields.STORED,
        row_id=whoosh.fields.STORED,
        language_id=whoosh.fields.STORED,
    )

    # Construct a straight lookup index
    index = whoosh.index.Index(store, schema=schema, create=True)
    writer = index.writer()

    # Index every name in all our tables of interest
    speller_entries = []
    for cls in indexed_tables.values():
        q = session.query(cls)

        # Only index base Pokémon formes
        if hasattr(cls, 'forme_base_pokemon_id'):
            q = q.filter_by(forme_base_pokemon_id=None)

        for row in q.yield_per(5):
            row_key = dict(table=cls.__tablename__, row_id=row.id)

            # Names are indexed lowercased, and the very same string goes
            # into the spell-checker, so that a suggestion coming back from
            # the speller can be looked up verbatim in this index.
            name = row.name.lower()
            writer.add_document(name=name, **row_key)

            speller_entries.append(name)

            # Extra keys are exact-match only; they are not fed to the
            # spell-checker.
            for extra_key_func in extra_keys.get(cls, []):
                extra_key = extra_key_func(row)
                writer.add_document(name=extra_key, **row_key)

    writer.commit()

    # Construct and populate a spell-checker index.  Quicker to do it all
    # at once, as every call to add_* does a commit(), and those seem to be
    # expensive
    speller = whoosh.spelling.SpellChecker(index.storage)
    # WARNING: HERE BE DRAGONS
    # whoosh.spelling refuses to index things that don't look like words.
    # Unfortunately, this doesn't work so well for Pokémon (Mr. Mime,
    # Porygon-Z, etc.), and attempts to work around it lead to further
    # complications.
    # The below is copied from SpellChecker.add_scored_words without the check
    # for isalpha().  XXX get whoosh patched to make this unnecessary!
    writer = whoosh.writing.IndexWriter(speller.index())

    # The analyzer depends only on the gram size, so build one per size up
    # front rather than once per word per size.
    analyzers = [
        (size, whoosh.analysis.NgramAnalyzer(size))
        for size in xrange(speller.mingram, speller.maxgram + 1)
    ]

    for word in speller_entries:
        fields = {"word": word, "score": 1}
        for size, nga in analyzers:
            # Pull the text out while iterating; don't hold on to the tokens.
            gramlist = [t.text for t in nga(word)]
            # Words shorter than `size` produce no grams of that size.
            if len(gramlist) > 0:
                fields["start%s" % size] = gramlist[0]
                fields["end%s" % size] = gramlist[-1]
                fields["gram%s" % size] = " ".join(gramlist)
        writer.add_document(**fields)
    writer.commit()
    # end copy-pasta

    index_bits['index'] = index
    index_bits['speller'] = speller
    index_bits['store'] = store
    return index_bits['index'], index_bits['speller']
114
def lookup(session, name, exact_only=False):
    """Attempts to find some sort of object, given a database session and name.

    Returns (objects, exact) where `objects` is a list of database objects, and
    `exact` is True iff the given name matched the returned objects exactly.
    `exact` is only ever False when fuzzy matching was actually attempted; with
    `exact_only` set and no match, the result is ([], True).

    This function ONLY does fuzzy matching if there are no exact matches, and
    never if `exact_only` is true.  Matching is case-insensitive.

    Formes are not returned; "Shaymin" will return only grass Shaymin.

    Currently recognizes:
    - Names of anything in `indexed_tables`: "Eevee"
    - The extra keys defined in `extra_keys` (exact matches only)
    """

    exact = True

    index, speller = get_index(session)

    # Every indexed name and every spell-checker entry is lowercased, so
    # normalize the input once and use it for both the exact and the fuzzy
    # lookup.
    name = name.lower()

    # Look for exact name.  A Term object does an exact match, so we don't have
    # to worry about a query parser tripping on weird characters in the input
    searcher = index.searcher()
    query = whoosh.query.Term('name', name)
    results = searcher.search(query)

    if not results and not exact_only:
        # Look for some fuzzy matches: take the speller's top suggestions and
        # fetch the index rows for each of them
        exact = False
        results = []

        for suggestion in speller.suggest(name, 3):
            query = whoosh.query.Term('name', suggestion)
            results.extend(searcher.search(query))

    # Convert results to db objects
    objects = []
    seen = set()
    for result in results:
        # Skip dupe results; the same row can be indexed under several keys
        seen_key = result['table'], result['row_id']
        if seen_key in seen:
            continue
        seen.add(seen_key)

        cls = indexed_tables[result['table']]
        obj = session.query(cls).get(result['row_id'])
        objects.append(obj)

    return objects, exact