8 from sqlalchemy
.sql
import func
10 import whoosh
.filedb
.filestore
11 import whoosh
.filedb
.fileindex
13 from whoosh
.qparser
import QueryParser
15 import whoosh
.spelling
17 from pokedex
.util
import namedtuple
19 from pokedex
.db
import connect
20 import pokedex
.db
.tables
as tables
21 from pokedex
.roomaji
import romanize
22 from pokedex
.defaults
import get_default_index_dir
24 __all__
= ['PokedexLookup']
# Matches a string consisting solely of one or more decimal digits.  Written
# as a raw string so that `\d` reaches the regex engine untouched instead of
# being treated as an (invalid) string escape sequence.
rx_is_number = re.compile(r'^\d+$')

# One search hit.  Fields, in order:
#   object        the database row the hit refers to
#   indexed_name  the name as stored in the index
#   name          the display (non-normalized) form of that name
#   language      name of the language the matched name is in
#   iso3166       country code associated with that language
#   exact         whether the hit was reported as an exact match
LookupResult = namedtuple(
    'LookupResult',
    ['object', 'indexed_name', 'name', 'language', 'iso3166', 'exact'],
)
class UninitializedIndex(object):
    """Placeholder used where a lookup index has not been created yet.

    An instance is falsy, and any attribute access that falls through to
    `__getattr__` raises `UninitializedIndexError` with instructions for
    building the index.
    """

    class UninitializedIndexError(Exception):
        """Raised when the placeholder is used as if it were a real index."""

    def __nonzero__(self):
        """Dummy object should identify itself as False."""
        return False

    def __bool__(self):
        """Python 3000 version of the above. Future-proofing rules!"""
        return False

    def __getattr__(self, *args, **kwargs):
        """Refuse every attribute that normal lookup did not find.

        Raises:
            UninitializedIndexError: always.
        """
        raise self.UninitializedIndexError(
            "The lookup index does not exist. Please use `pokedex setup` "
            "or lookup.rebuild_index() to create it."
        )
50 class LanguageWeighting(whoosh
.scoring
.Weighting
):
51 """A scoring class that forces otherwise-equal English results to come
52 before foreign results.
def __init__(self, extra_weights=None, *args, **kwargs):
    """Create the weighting, optionally with per-term multipliers.

    `extra_weights` may be a dictionary of weights, keyed by term text, which
    will be factored in when scoring; terms without an entry are multiplied
    by 1.0. Omitting it (or passing None) means no extra weights.

    Intended for use with spelling corrections, which come along with their
    own weightings.

    Any remaining positional and keyword arguments are handed to the parent
    class constructor unchanged.
    """
    # Build a fresh dict per instance; a `{}` default in the signature would
    # be a single object shared by every instance created without weights.
    if extra_weights is None:
        extra_weights = {}
    self.extra_weights = extra_weights
    super(LanguageWeighting, self).__init__(*args, **kwargs)
65 def score(self
, searcher
, fieldnum
, text
, docnum
, weight
, QTF
=1):
66 doc
= searcher
.stored_fields(docnum
)
69 weight
= weight
* self
.extra_weights
.get(text
, 1.0)
71 if doc
['language'] == None:
72 # English (well, "default"); leave it at 1
74 elif doc
['language'] == u
'Roomaji':
75 # Give Roomaji a little boost; it's most likely to be searched
78 # Everything else can drop down the totem pole
82 class PokedexLookup(object):
83 INTERMEDIATE_LOOKUP_RESULTS
= 25
84 MAX_LOOKUP_RESULTS
= 10
86 # The speller only checks how much the input matches a word; there can be
87 # all manner of extra unmatched junk, and it won't affect the weighting.
88 # To compensate, greatly boost the weighting of matches at the beginning
89 # and end, so nearly-full-word-matches are much better
90 SPELLER_OPTIONS
= dict(booststart
=10.0, boostend
=9.0)
92 # Dictionary of table name => table class.
93 # Need the table name so we can get the class from the table name after we
94 # retrieve something from the index
95 indexed_tables
= dict(
96 (cls
.__tablename__
, cls
)
109 def __init__(self
, directory
=None, session
=None):
110 """Opens the whoosh index stored in the named directory. If the index
111 doesn't already exist, it will be created.
114 Directory containing the index. Defaults to a location within the
115 `pokedex` egg directory.
118 Used for creating the index and retrieving objects. Defaults to an
119 attempt to connect to the default SQLite database installed by
123 # By the time this returns, self.index, self.speller, and self.session
126 # If a directory was not given, use the default
127 if directory
is None:
128 directory
= get_default_index_dir()
130 self
.directory
= directory
133 self
.session
= session
135 self
.session
= connect()
137 # Attempt to open or create the index
138 if not os
.path
.exists(directory
) or not os
.listdir(directory
):
139 # Directory doesn't exist OR is empty; caller needs to use
140 # rebuild_index before doing anything. Provide a dummy object that
141 # complains when used
142 self
.index
= UninitializedIndex()
143 self
.speller
= UninitializedIndex()
146 # Otherwise, already exists; should be an index! Bam, done.
147 # Note that this will explode if the directory exists but doesn't
148 # contain an index; that's a feature
150 self
.index
= whoosh
.index
.open_dir(directory
, indexname
='MAIN')
151 except whoosh
.index
.EmptyIndexError
:
153 "The index directory already contains files. "
154 "Please use a dedicated directory for the lookup index."
157 # Create speller, and done
158 spell_store
= whoosh
.filedb
.filestore
.FileStorage(directory
)
159 self
.speller
= whoosh
.spelling
.SpellChecker(spell_store
,
160 **self
.SPELLER_OPTIONS
)
163 def rebuild_index(self
):
164 """Creates the index from scratch."""
166 schema
= whoosh
.fields
.Schema(
167 name
=whoosh
.fields
.ID(stored
=True),
168 table
=whoosh
.fields
.ID(stored
=True),
169 row_id
=whoosh
.fields
.ID(stored
=True),
170 language
=whoosh
.fields
.STORED
,
171 iso3166
=whoosh
.fields
.STORED
,
172 display_name
=whoosh
.fields
.STORED
, # non-lowercased name
175 if not os
.path
.exists(self
.directory
):
176 os
.mkdir(self
.directory
)
178 self
.index
= whoosh
.index
.create_in(self
.directory
, schema
=schema
,
180 writer
= self
.index
.writer()
182 # Index every name in all our tables of interest
183 speller_entries
= set()
184 for cls
in self
.indexed_tables
.values():
185 q
= self
.session
.query(cls
)
187 for row
in q
.yield_per(5):
188 row_key
= dict(table
=unicode(cls
.__tablename__
),
189 row_id
=unicode(row
.id))
191 def add(name
, language
, iso3166
):
192 normalized_name
= self
.normalize_name(name
)
195 name
=normalized_name
, display_name
=name
,
196 language
=language
, iso3166
=iso3166
,
200 speller_entries
.add(normalized_name
)
203 # Add the basic English name to the index
204 if cls
== tables
.Pokemon
:
205 # Pokémon need their form name added
207 add(row
.full_name
, None, u
'us')
209 # If this is a default form, ALSO add the unadorned name,
210 # so 'Deoxys' alone will still do the right thing
211 if row
.forme_name
and not row
.forme_base_pokemon_id
:
212 add(row
.name
, None, u
'us')
214 add(row
.name
, None, u
'us')
216 # Some things also have other languages' names
217 # XXX other language form names..?
218 for foreign_name
in getattr(row
, 'foreign_names', []):
219 moonspeak
= foreign_name
.name
220 if row
.name
== moonspeak
:
221 # Don't add the English name again as a different
222 # language; no point and it makes spell results
226 add(moonspeak
, foreign_name
.language
.name
,
227 foreign_name
.language
.iso3166
)
230 if foreign_name
.language
.name
== 'Japanese':
231 roomaji
= romanize(foreign_name
.name
)
232 add(roomaji
, u
'Roomaji', u
'jp')
236 # Construct and populate a spell-checker index. Quicker to do it all
237 # at once, as every call to add_* does a commit(), and those seem to be
239 self
.speller
= whoosh
.spelling
.SpellChecker(self
.index
.storage
, mingram
=2,
240 **self
.SPELLER_OPTIONS
)
241 self
.speller
.add_words(speller_entries
)
def normalize_name(self, name):
    """Strips irrelevant formatting junk from name input.

    Specifically: everything is lowercased, accents are removed, and
    surrounding whitespace is dropped.

    `name` may be any object; it is converted to text first. Returns the
    normalized text.
    """
    # `unicode` only exists on Python 2; on Python 3 the text type is `str`.
    try:
        text_type = unicode
    except NameError:
        text_type = str

    # http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
    # Decompose by Unicode rules, then remove combining characters, then
    # recombine. This is done explicitly instead of testing combining()
    # because Korean characters apparently decompose! But the results are
    # considered letters, not combining characters, so testing for Mn works
    # well, and combining them again makes them look right.
    decomposed = unicodedata.normalize('NFKD', text_type(name))
    without_marks = u"".join(
        char for char in decomposed
        if unicodedata.category(char) != 'Mn'
    )
    recomposed = unicodedata.normalize('NFC', without_marks)

    return recomposed.strip().lower()
def _apply_valid_types(self, name, valid_types):
    """Combines the enforced `valid_types` with any from the search string
    itself and builds the matching query term.

    For example, a name of 'a,b:foo' and valid_types of b,c will search for
    only `b`s named "foo".

    Returns `(name, merged_valid_types, term)`, where `name` has had any type
    prefix stripped, `merged_valid_types` lists the types that survived the
    merge and were recognized by `_parse_table_name`, and `term` is a query
    term limiting results to just those types. If there are no type
    restrictions at all, `term` will be None. If restrictions were given but
    none of them is a recognized type, `term` is an `Or` over no subterms.
    """
    # Remove any type prefix (pokemon:133) first
    user_valid_types = []
    if ':' in name:
        prefix_chunk, name = name.split(':', 1)
        # The prefixes are stripped below, so strip the remainder as well
        name = name.strip()
        stripped = [chunk.strip() for chunk in prefix_chunk.split(',')]
        # Drop empty chunks such as the one in ':foo' or 'a,,b:foo'
        user_valid_types = [chunk for chunk in stripped if chunk]

    # Accept None or any iterable, and never alias the caller's list
    valid_types = list(valid_types or [])

    # Merge the valid types together. Only types that appear in BOTH lists
    # may be used.
    # As a special case, if the user asked for types that are explicitly
    # forbidden, completely ignore what the user requested
    if user_valid_types and valid_types:
        # Compare by resolved table name, so that a table object in
        # `valid_types` still matches the user's textual prefix
        requested = set(
            self._parse_table_name(user_type)
            for user_type in user_valid_types
        )
        requested.discard(None)

        combined_valid_types = [
            valid_type for valid_type in valid_types
            if self._parse_table_name(valid_type) in requested
        ]

        if not combined_valid_types:
            # No overlap! Just use the enforced ones
            combined_valid_types = valid_types
    else:
        # One list or the other was blank, so just use the one that isn't
        combined_valid_types = valid_types + user_valid_types

    if not combined_valid_types:
        # No restrictions
        return name, [], None

    # Construct the term
    type_terms = []
    final_valid_types = []
    for valid_type in combined_valid_types:
        table_name = self._parse_table_name(valid_type)

        # Quietly ignore bogus valid_types; more likely to DTRT
        if table_name:
            final_valid_types.append(valid_type)
            type_terms.append(whoosh.query.Term(u'table', table_name))

    return name, final_valid_types, whoosh.query.Or(type_terms)
325 def _parse_table_name(self
, name
):
326 """Takes a singular table name, table name, or table object and returns
329 Returns None for a bogus name.
332 if hasattr(name
, '__tablename__'):
333 return getattr(name
, '__tablename__')
336 for table
in self
.indexed_tables
.values():
337 if name
in (table
.__tablename__
, table
.__singlename__
):
338 return table
.__tablename__
340 # Bogus. Be nice and return dummy
343 def _whoosh_records_to_results(self
, records
, exact
=True):
344 """Converts a list of whoosh's indexed records to LookupResult tuples
345 containing database objects.
347 # XXX this 'exact' thing is getting kinda leaky. would like a better
348 # way to handle it, since only lookup() cares about fuzzy results
351 for record
in records
:
353 seen_key
= record
['table'], record
['row_id']
356 seen
[seen_key
] = True
358 cls
= self
.indexed_tables
[record
['table']]
359 obj
= self
.session
.query(cls
).get(record
['row_id'])
361 results
.append(LookupResult(object=obj
,
362 indexed_name
=record
['name'],
363 name
=record
['display_name'],
364 language
=record
['language'],
365 iso3166
=record
['iso3166'],
371 def lookup(self
, input, valid_types
=[], exact_only
=False):
372 """Attempts to find some sort of object, given a name.
374 Returns a list of named (object, name, language, iso3166, exact)
375 tuples. `object` is a database object, `name` is the name under which
376 the object was found, `language` and `iso3166` are the name and country
377 code of the language in which the name was found, and `exact` is True
381 This function currently ONLY does fuzzy matching if there are no exact
384 Formes are not returned unless requested; "Shaymin" will return only
387 Extraneous whitespace is removed with extreme prejudice.
390 - Names: "Eevee", "Surf", "Run Away", "Payapa Berry", etc.
391 - Foreign names: "Iibui", "Eivui"
392 - Fuzzy names in whatever language: "Evee", "Ibui"
393 - IDs: "133", "192", "250"
395 - Type restrictions. "type:psychic" will only return the type. This
396 is how to make ID lookup useful. Multiple type specs can be entered
397 with commas, as "move,item:1". If `valid_types` are provided, any
398 type prefix will be ignored.
399 - Alternate formes can be specified merely like "wash rotom".
402 Name of the thing to look for.
405 A list of table objects or names, e.g., `['pokemon', 'moves']`. If
406 this is provided, only results in one of the given tables will be
410 If True, only exact matches are returned. If set to False (the
411 default), and the provided `name` doesn't match anything exactly,
412 spelling correction will be attempted.
415 name
= self
.normalize_name(input)
419 # Pop off any type prefix and merge with valid_types
420 name
, merged_valid_types
, type_term
= \
421 self
._apply_valid_types(name
, valid_types
)
425 return self
.random_lookup(valid_types
=merged_valid_types
)
427 # Do different things depending what the query looks like
428 # Note: Term objects do an exact match, so we don't have to worry about
429 # a query parser tripping on weird characters in the input
431 # Let Python try to convert to a number, so 0xff works
432 name_as_number
= int(name
, base
=0)
435 name_as_number
= None
437 if '*' in name
or '?' in name
:
439 query
= whoosh
.query
.Wildcard(u
'name', name
)
440 elif name_as_number
is not None:
441 # Don't spell-check numbers!
443 query
= whoosh
.query
.Term(u
'row_id', unicode(name_as_number
))
446 query
= whoosh
.query
.Term(u
'name', name
)
449 query
= query
& type_term
453 searcher
= self
.index
.searcher()
454 # XXX is this kosher? docs say search() takes a weighting arg, but it
456 searcher
.weighting
= LanguageWeighting()
457 results
= searcher
.search(query
,
458 limit
=self
.INTERMEDIATE_LOOKUP_RESULTS
)
460 # Look for some fuzzy matches if necessary
461 if not exact_only
and not results
:
465 fuzzy_query_parts
= []
468 for suggestion
, _
, weight
in self
.speller
.suggestions_and_scores(name
):
469 # Only allow the top 50% of scores; otherwise there will always
470 # be a lot of trailing junk
471 if min_weight
[0] is None:
472 min_weight
[0] = weight
* 0.5
473 elif weight
< min_weight
[0]:
476 fuzzy_query_parts
.append(whoosh
.query
.Term('name', suggestion
))
477 fuzzy_weights
[suggestion
] = weight
479 if not fuzzy_query_parts
:
480 # Nothing at all; don't try querying
483 fuzzy_query
= whoosh
.query
.Or(fuzzy_query_parts
)
485 fuzzy_query
= fuzzy_query
& type_term
487 searcher
.weighting
= LanguageWeighting(extra_weights
=fuzzy_weights
)
488 results
= searcher
.search(fuzzy_query
)
490 ### Convert results to db objects
491 objects
= self
._whoosh_records_to_results(results
, exact
=exact
)
493 # Only return up to 10 matches; beyond that, something is wrong. We
494 # strip out duplicate entries above, so it's remotely possible that we
495 # should have more than 10 here and lost a few. The speller returns 25
496 # to give us some padding, and should avoid that problem. Not a big
497 # deal if we lose the 25th-most-likely match anyway.
498 return objects
[:self
.MAX_LOOKUP_RESULTS
]
def random_lookup(self, valid_types=None):
    """Returns a random lookup result from one of the provided
    `valid_types`.

    `valid_types` may hold table names or table objects; entries that do not
    resolve to an indexed table are ignored. With nothing usable, every
    indexed table is a candidate.

    Returns whatever `lookup` returns for the chosen row number, or an empty
    list when the candidate tables contain no rows at all.
    """
    # Not called `tables`: that would shadow the module-level `tables` alias
    candidate_tables = []
    for valid_type in valid_types or ():
        table_name = self._parse_table_name(valid_type)
        table = self.indexed_tables.get(table_name) if table_name else None
        if table is not None:
            candidate_tables.append(table)

    if not candidate_tables:
        # n.b.: It's possible we got a list of valid_types and none of them
        # were valid, but this function is guaranteed to return
        # *something*, so it politely selects from the entire index instead
        candidate_tables = list(self.indexed_tables.values())

    # Rather than create an array of many hundred items and pick randomly
    # from it, just pick a number up to the total number of potential
    # items, then pick randomly from that, and partition the whole range
    # into chunks. This also avoids the slight problem that the index
    # contains more rows (for languages) for some items than others.
    # XXX ought to cache this (in the index?) if possible
    total = 0
    partitions = []
    for table in candidate_tables:
        count = self.session.query(table).count()
        total += count
        partitions.append((table, count))

    if not total:
        # randint(1, 0) would raise; there is simply nothing to pick
        return []

    # Walk the partitions until n falls inside one; n then serves as the row
    # number within that table.
    # NOTE(review): presumes row ids run 1..count in each table -- confirm
    n = random.randint(1, total)
    for table, count in partitions:
        if n <= count:
            break
        n -= count

    # u'%d' yields text on both Python 2 and 3, unlike unicode(n)
    return self.lookup(u'%d' % n, valid_types=[table])
def prefix_lookup(self, prefix, valid_types=None):
    """Returns terms starting with the given exact prefix.

    Type prefixes are recognized and the remaining text is passed through
    `normalize_name`, but no other name munging is done.

    `valid_types` optionally restricts the tables searched. Returns a list
    of LookupResult tuples.
    """
    # Hand over a real list: the merge step concatenates it with another list
    enforced_types = list(valid_types) if valid_types else []

    # Pop off any type prefix and merge with valid_types
    prefix, merged_valid_types, type_term = \
        self._apply_valid_types(prefix, enforced_types)

    query = whoosh.query.Prefix(u'name', self.normalize_name(prefix))

    # A term of None means "no type restriction"
    if type_term is not None:
        query = query & type_term

    searcher = self.index.searcher()
    searcher.weighting = LanguageWeighting()
    results = searcher.search(query)  # XXX , limit=self.MAX_LOOKUP_RESULTS)

    return self._whoosh_records_to_results(results)