2 from collections
import namedtuple
9 from sqlalchemy
.sql
import func
11 import whoosh
.filedb
.filestore
12 import whoosh
.filedb
.fileindex
14 from whoosh
.qparser
import QueryParser
16 import whoosh
.spelling
18 from pokedex
.db
import connect
19 import pokedex
.db
.tables
as tables
20 from pokedex
.roomaji
import romanize
21 from pokedex
.defaults
import get_default_index_dir
23 __all__
= ['PokedexLookup']
# Matches text consisting solely of one or more decimal digits.  Written as a
# raw string so that `\d` reaches the regex engine untouched instead of being
# parsed as an (invalid, deprecated) string escape sequence.
rx_is_number = re.compile(r'^\d+$')

# One lookup hit.  Fields:
#   object        -- the database row the hit refers to
#   indexed_name  -- the name as stored in the index
#   name          -- the display (non-lowercased) form of that name
#   language      -- language the name belongs to
#   iso3166       -- country code associated with that language
#   exact         -- whether the hit was an exact match
LookupResult = namedtuple(
    'LookupResult',
    ['object', 'indexed_name', 'name', 'language', 'iso3166', 'exact'],
)
class UninitializedIndex(object):
    """Falsy stand-in for an index (or speller) that has not been built yet.

    The object evaluates as false, and any attribute that ordinary lookup
    cannot find raises `UninitializedIndexError`, so accidental use fails
    loudly with instructions instead of misbehaving quietly.
    """

    class UninitializedIndexError(Exception):
        """Raised when the placeholder is used as if it were a real index."""

    def __nonzero__(self):
        """Dummy object should identify itself as False."""
        return False

    def __bool__(self):
        """Python 3 spelling of the truth test above."""
        return False

    def __getattr__(self, *args, **kwargs):
        """Reject the attribute lookup with an explanatory error."""
        message = (
            "The lookup index does not exist. Please use `pokedex setup` "
            "or lookup.rebuild_index() to create it."
        )
        raise self.UninitializedIndexError(message)
49 class LanguageWeighting(whoosh
.scoring
.Weighting
):
50 """A scoring class that forces otherwise-equal English results to come
51 before foreign results.
54 def score(self
, searcher
, fieldnum
, text
, docnum
, weight
, QTF
=1):
55 doc
= searcher
.stored_fields(docnum
)
56 if doc
['language'] == None:
57 # English (well, "default"); leave it at 1
59 elif doc
['language'] == u
'Roomaji':
60 # Give Roomaji a little boost; it's most likely to be searched
63 # Everything else can drop down the totem pole
67 class PokedexLookup(object):
68 INTERMEDIATE_LOOKUP_RESULTS
= 25
69 MAX_LOOKUP_RESULTS
= 10
71 # Dictionary of table name => table class.
72 # Need the table name so we can get the class from the table name after we
73 # retrieve something from the index
74 indexed_tables
= dict(
75 (cls
.__tablename__
, cls
)
88 def __init__(self
, directory
=None, session
=None):
89 """Opens the whoosh index stored in the named directory. If the index
90 doesn't already exist, it will be created.
93 Directory containing the index. Defaults to a location within the
94 `pokedex` egg directory.
97 Used for creating the index and retrieving objects. Defaults to an
98 attempt to connect to the default SQLite database installed by
102 # By the time this returns, self.index, self.speller, and self.session
105 # If a directory was not given, use the default
106 if directory
is None:
107 directory
= get_default_index_dir()
109 self
.directory
= directory
112 self
.session
= session
114 self
.session
= connect()
116 # Attempt to open or create the index
117 if not os
.path
.exists(directory
) or not os
.listdir(directory
):
118 # Directory doesn't exist OR is empty; caller needs to use
119 # rebuild_index before doing anything. Provide a dummy object that
120 # complains when used
121 self
.index
= UninitializedIndex()
122 self
.speller
= UninitializedIndex()
125 # Otherwise, already exists; should be an index! Bam, done.
126 # Note that this will explode if the directory exists but doesn't
127 # contain an index; that's a feature
129 self
.index
= whoosh
.index
.open_dir(directory
, indexname
='MAIN')
130 except whoosh
.index
.EmptyIndexError
:
132 "The index directory already contains files. "
133 "Please use a dedicated directory for the lookup index."
136 # Create speller, and done
137 spell_store
= whoosh
.filedb
.filestore
.FileStorage(directory
)
138 self
.speller
= whoosh
.spelling
.SpellChecker(spell_store
)
141 def rebuild_index(self
):
142 """Creates the index from scratch."""
144 schema
= whoosh
.fields
.Schema(
145 name
=whoosh
.fields
.ID(stored
=True),
146 table
=whoosh
.fields
.ID(stored
=True),
147 row_id
=whoosh
.fields
.ID(stored
=True),
148 language
=whoosh
.fields
.STORED
,
149 iso3166
=whoosh
.fields
.STORED
,
150 display_name
=whoosh
.fields
.STORED
, # non-lowercased name
153 if not os
.path
.exists(self
.directory
):
154 os
.mkdir(self
.directory
)
156 self
.index
= whoosh
.index
.create_in(self
.directory
, schema
=schema
,
158 writer
= self
.index
.writer()
160 # Index every name in all our tables of interest
161 # speller_entries becomes a list of (word, score) tuples; the score is
162 # 2 for English names, 1.5 for Roomaji, and 1 for everything else. I
163 # think this biases the results in the direction most people expect,
164 # especially when e.g. German names are very similar to English names
166 for cls
in self
.indexed_tables
.values():
167 q
= self
.session
.query(cls
)
169 for row
in q
.yield_per(5):
170 row_key
= dict(table
=unicode(cls
.__tablename__
),
171 row_id
=unicode(row
.id))
173 def add(name
, language
, iso3166
, score
):
174 normalized_name
= self
.normalize_name(name
)
177 name
=normalized_name
, display_name
=name
,
178 language
=language
, iso3166
=iso3166
,
182 speller_entries
.append((normalized_name
, score
))
185 # Add the basic English name to the index
186 if cls
== tables
.Pokemon
:
187 # Pokémon need their form name added
189 add(row
.full_name
, None, u
'us', 1)
191 # If this is a default form, ALSO add the unadorned name,
192 # so 'Deoxys' alone will still do the right thing
193 if row
.forme_name
and not row
.forme_base_pokemon_id
:
194 add(row
.name
, None, u
'us', 1)
196 add(row
.name
, None, u
'us', 1)
198 # Some things also have other languages' names
199 # XXX other language form names..?
200 for foreign_name
in getattr(row
, 'foreign_names', []):
201 moonspeak
= foreign_name
.name
202 if row
.name
== moonspeak
:
203 # Don't add the English name again as a different
204 # language; no point and it makes spell results
208 add(moonspeak
, foreign_name
.language
.name
,
209 foreign_name
.language
.iso3166
,
213 if foreign_name
.language
.name
== 'Japanese':
214 roomaji
= romanize(foreign_name
.name
)
215 add(roomaji
, u
'Roomaji', u
'jp', 8)
219 # Construct and populate a spell-checker index. Quicker to do it all
220 # at once, as every call to add_* does a commit(), and those seem to be
222 self
.speller
= whoosh
.spelling
.SpellChecker(self
.index
.storage
)
223 self
.speller
.add_scored_words(speller_entries
)
def normalize_name(self, name):
    """Strips irrelevant formatting junk from name input.

    Specifically: everything is lowercased, accents are removed, and
    surrounding whitespace is dropped.

    `name` is converted to text first, so non-string input such as an
    integer is accepted.  Returns the normalized text.
    """
    # `unicode` only exists on Python 2; fall back to `str` elsewhere so the
    # same code runs on both.
    try:
        text_type = unicode
    except NameError:
        text_type = str

    # http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
    # Decompose by Unicode rules, drop the nonspacing marks, then recombine.
    # The test is on category 'Mn' rather than on combining(): Korean
    # characters also decompose, but their pieces are letters, not marks, so
    # they survive the filter and the final NFC pass puts them back together.
    decomposed = unicodedata.normalize('NFKD', text_type(name))
    without_marks = u"".join(
        char for char in decomposed
        if unicodedata.category(char) != 'Mn')
    name = unicodedata.normalize('NFC', without_marks)

    # NOTE(review): the statements following the NFC step were not visible
    # when this was reviewed.  Lowercasing follows the docstring; whitespace
    # stripping is presumed -- confirm against the original.
    return name.strip().lower()
249 def _apply_valid_types(self
, name
, valid_types
):
250 """Combines the enforced `valid_types` with any from the search string
251 itself and updates the query.
253 For example, a name of 'a,b:foo' and valid_types of b,c will search for
254 only `b`s named "foo".
256 Returns `(name, merged_valid_types, term)`, where `name` has had any type
257 prefix stripped, `merged_valid_types` combines the original
258 `valid_types` with the type prefix, and `term` is a query term for
259 limited to just the allowed types. If there are no type restrictions
260 at all, `term` will be None.
263 # Remove any type prefix (pokemon:133) first
264 user_valid_types
= []
266 prefix_chunk
, name
= name
.split(':', 1)
269 prefixes
= prefix_chunk
.split(',')
270 user_valid_types
= [_
.strip() for _
in prefixes
]
272 # Merge the valid types together. Only types that appear in BOTH lists
274 # As a special case, if the user asked for types that are explicitly
275 # forbidden, completely ignore what the user requested
276 combined_valid_types
= []
277 if user_valid_types
and valid_types
:
278 combined_valid_types
= list(
279 set(user_valid_types
) & set(combined_valid_types
)
282 if not combined_valid_types
:
283 # No overlap! Just use the enforced ones
284 combined_valid_types
= valid_types
286 # One list or the other was blank, so just use the one that isn't
287 combined_valid_types
= valid_types
+ user_valid_types
289 if not combined_valid_types
:
291 return name
, [], None
295 final_valid_types
= []
296 for valid_type
in combined_valid_types
:
297 table_name
= self
._parse_table_name(valid_type
)
299 # Quietly ignore bogus valid_types; more likely to DTRT
301 final_valid_types
.append(valid_type
)
302 type_terms
.append(whoosh
.query
.Term(u
'table', table_name
))
304 return name
, final_valid_types
, whoosh
.query
.Or(type_terms
)
307 def _parse_table_name(self
, name
):
308 """Takes a singular table name, table name, or table object and returns
311 Returns None for a bogus name.
314 if hasattr(name
, '__tablename__'):
315 return getattr(name
, '__tablename__')
318 for table
in self
.indexed_tables
.values():
319 if name
in (table
.__tablename__
, table
.__singlename__
):
320 return table
.__tablename__
322 # Bogus. Be nice and return dummy
def _whoosh_records_to_results(self, records, exact=True):
    """Turn whoosh index records into a list of LookupResult tuples.

    Each record is resolved to its database object through `self.session`.
    Only the first record seen for a given (table, row id) pair is kept.
    `exact` is copied verbatim into every result.
    """
    # XXX this 'exact' thing is getting kinda leaky.  would like a better
    # way to handle it, since only lookup() cares about fuzzy results
    already_returned = set()
    results = []

    for record in records:
        table_name = record['table']
        row_id = record['row_id']

        # Skip dupes
        if (table_name, row_id) in already_returned:
            continue
        already_returned.add((table_name, row_id))

        table_class = self.indexed_tables[table_name]
        row = self.session.query(table_class).get(row_id)

        result = LookupResult(
            object=row,
            indexed_name=record['name'],
            name=record['display_name'],
            language=record['language'],
            iso3166=record['iso3166'],
            exact=exact,
        )
        results.append(result)

    return results
353 def lookup(self
, input, valid_types
=[], exact_only
=False):
354 """Attempts to find some sort of object, given a name.
356 Returns a list of named (object, name, language, iso3166, exact)
357 tuples. `object` is a database object, `name` is the name under which
358 the object was found, `language` and `iso3166` are the name and country
359 code of the language in which the name was found, and `exact` is True
363 This function currently ONLY does fuzzy matching if there are no exact
366 Formes are not returned unless requested; "Shaymin" will return only
369 Extraneous whitespace is removed with extreme prejudice.
372 - Names: "Eevee", "Surf", "Run Away", "Payapa Berry", etc.
373 - Foreign names: "Iibui", "Eivui"
374 - Fuzzy names in whatever language: "Evee", "Ibui"
375 - IDs: "133", "192", "250"
377 - Type restrictions. "type:psychic" will only return the type. This
378 is how to make ID lookup useful. Multiple type specs can be entered
379 with commas, as "move,item:1". If `valid_types` are provided, any
380 type prefix will be ignored.
381 - Alternate formes can be specified merely like "wash rotom".
384 Name of the thing to look for.
387 A list of table objects or names, e.g., `['pokemon', 'moves']`. If
388 this is provided, only results in one of the given tables will be
392 If True, only exact matches are returned. If set to False (the
393 default), and the provided `name` doesn't match anything exactly,
394 spelling correction will be attempted.
397 name
= self
.normalize_name(input)
401 # Pop off any type prefix and merge with valid_types
402 name
, merged_valid_types
, type_term
= \
403 self
._apply_valid_types(name
, valid_types
)
407 return self
.random_lookup(valid_types
=merged_valid_types
)
409 # Do different things depending what the query looks like
410 # Note: Term objects do an exact match, so we don't have to worry about
411 # a query parser tripping on weird characters in the input
413 # Let Python try to convert to a number, so 0xff works
414 name_as_number
= int(name
, base
=0)
417 name_as_number
= None
419 if '*' in name
or '?' in name
:
421 query
= whoosh
.query
.Wildcard(u
'name', name
)
422 elif name_as_number
is not None:
423 # Don't spell-check numbers!
425 query
= whoosh
.query
.Term(u
'row_id', unicode(name_as_number
))
428 query
= whoosh
.query
.Term(u
'name', name
)
431 query
= query
& type_term
435 searcher
= self
.index
.searcher()
436 # XXX is this kosher? docs say search() takes a weighting arg, but it
438 searcher
.weighting
= LanguageWeighting()
439 results
= searcher
.search(query
,
440 limit
=self
.INTERMEDIATE_LOOKUP_RESULTS
)
442 # Look for some fuzzy matches if necessary
443 if not exact_only
and not results
:
447 for suggestion
in self
.speller
.suggest(
448 name
, self
.INTERMEDIATE_LOOKUP_RESULTS
):
450 query
= whoosh
.query
.Term('name', suggestion
)
451 results
.extend(searcher
.search(query
))
453 ### Convert results to db objects
454 objects
= self
._whoosh_records_to_results(results
, exact
=exact
)
456 # Only return up to 10 matches; beyond that, something is wrong. We
457 # strip out duplicate entries above, so it's remotely possible that we
458 # should have more than 10 here and lost a few. The speller returns 25
459 # to give us some padding, and should avoid that problem. Not a big
460 # deal if we lose the 25th-most-likely match anyway.
461 return objects
[:self
.MAX_LOOKUP_RESULTS
]
def random_lookup(self, valid_types=None):
    """Returns a random lookup result from one of the provided
    `valid_types`.

    `valid_types` is an optional list of table references; entries that do
    not resolve to an indexed table are ignored.  If nothing usable is
    given, every indexed table is a candidate.

    Returns whatever `lookup()` returns for the chosen row, or an empty
    list when the candidate tables contain no rows at all.
    """
    # `unicode` only exists on Python 2; fall back to `str` elsewhere.
    try:
        text_type = unicode
    except NameError:
        text_type = str

    candidate_tables = []
    for valid_type in valid_types or ():
        table_name = self._parse_table_name(valid_type)
        if table_name:
            candidate_tables.append(self.indexed_tables[table_name])

    if not candidate_tables:
        # n.b.: It's possible we got a list of valid_types and none of them
        # were valid, but this function is guaranteed to return
        # *something*, so it politely selects from the entire index instead
        candidate_tables = list(self.indexed_tables.values())

    # Rather than create an array of many hundred items and pick randomly
    # from it, count the rows of each table, pick a number up to the grand
    # total, and work out which table's slice of the range it falls in.
    # This also avoids the slight problem that the index contains more rows
    # (for languages) for some items than others.
    # XXX ought to cache this (in the index?) if possible
    partitions = [
        (table, self.session.query(table).count())
        for table in candidate_tables
    ]
    total = sum(count for _, count in partitions)

    if total < 1:
        # Nothing to choose from; randint(1, 0) would raise ValueError
        return []

    n = random.randint(1, total)
    for table, count in partitions:
        if n <= count:
            break
        n -= count

    # NOTE(review): this looks up row id `n`, which presumes ids in each
    # table run 1..count without gaps -- confirm.
    return self.lookup(text_type(n), valid_types=[table])
def prefix_lookup(self, prefix, valid_types=[]):
    """Find indexed entries whose name starts with the given prefix.

    A type prefix inside `prefix` is recognized and merged with
    `valid_types`.  The remaining text is passed through
    `normalize_name()` and then matched as an exact prefix of the indexed
    names.

    Returns the converted results as produced by
    `_whoosh_records_to_results()`.
    """
    # Pop off any type prefix and merge with valid_types
    bare_prefix, merged_valid_types, type_term = self._apply_valid_types(
        prefix, valid_types)

    normalized_prefix = self.normalize_name(bare_prefix)
    query = whoosh.query.Prefix(u'name', normalized_prefix)

    # Narrow to the allowed tables when there is a type restriction
    if type_term:
        query = query & type_term

    searcher = self.index.searcher()
    searcher.weighting = LanguageWeighting()
    hits = searcher.search(query)  # XXX , limit=self.MAX_LOOKUP_RESULTS)

    return self._whoosh_records_to_results(hits)