8 from sqlalchemy
.sql
import func
10 import whoosh
.filedb
.filestore
11 import whoosh
.filedb
.fileindex
13 from whoosh
.qparser
import QueryParser
15 import whoosh
.spelling
17 from pokedex
.util
import namedtuple
19 from pokedex
.db
import connect
20 import pokedex
.db
.tables
as tables
21 from pokedex
.roomaji
import romanize
22 from pokedex
.defaults
import get_default_index_dir
24 __all__
= ['PokedexLookup']
# Matches text made up of one or more digits and nothing else.  Written as a
# raw string so that `\d` reaches the regex engine instead of being parsed as
# an (invalid) string escape.
rx_is_number = re.compile(r'^\d+$')
# Record type for one lookup hit.  The field order is part of the tuple's
# interface, so it is spelled out one field per line.
LookupResult = namedtuple(
    'LookupResult',
    [
        'object',
        'indexed_name',
        'name',
        'language',
        'iso639',
        'iso3166',
        'exact',
    ]
)
class UninitializedIndex(object):
    """Stand-in for an index that has not been built yet.

    The object is falsy, and touching any ordinary attribute raises
    `UninitializedIndexError` with instructions for creating the index.
    """

    class UninitializedIndexError(Exception):
        """Raised when an attribute of the missing index is accessed."""

    def __nonzero__(self):
        """Dummy object should identify itself as False."""
        return False

    def __bool__(self):
        """Python 3 spelling of `__nonzero__`."""
        return False

    def __getattr__(self, *args, **kwargs):
        # Python itself probes for special methods (copy looks up
        # `__deepcopy__`, hasattr() checks `__iter__`, ...) and expects an
        # AttributeError when they are absent.  Answer those probes normally
        # so only real use of the index triggers the custom error.
        name = args[0] if args else ''
        if name[:2] == '__' and name[-2:] == '__':
            raise AttributeError(name)

        raise self.UninitializedIndexError(
            "The lookup index does not exist. Please use `pokedex setup` "
            "or lookup.rebuild_index() to create it."
        )
51 class LanguageWeighting(whoosh
.scoring
.Weighting
):
52 """A scoring class that forces otherwise-equal English results to come
53 before foreign results.
def __init__(self, extra_weights=None, *args, **kwargs):
    """Set up the weighting, optionally with per-term multipliers.

    `extra_weights`
        Optional dictionary mapping term text to a multiplier.  It is
        intended for spelling corrections, which come with their own
        weightings.  Omitting it (or passing None) means no extra weights.

    Any remaining arguments are handed to the parent class unchanged.
    """
    # Build a fresh dict per instance; a `{}` default in the signature would
    # be one object shared by every LanguageWeighting ever created.
    self.extra_weights = {} if extra_weights is None else extra_weights
    super(LanguageWeighting, self).__init__(*args, **kwargs)
66 def score(self
, searcher
, fieldnum
, text
, docnum
, weight
, QTF
=1):
67 doc
= searcher
.stored_fields(docnum
)
70 weight
= weight
* self
.extra_weights
.get(text
, 1.0)
72 language
= doc
.get('language')
74 # English (well, "default"); leave it at 1
76 elif language
== u
'Roomaji':
77 # Give Roomaji a little boost; it's most likely to be searched
80 # Everything else can drop down the totem pole
84 class PokedexLookup(object):
85 MAX_FUZZY_RESULTS
= 10
86 MAX_EXACT_RESULTS
= 43
87 INTERMEDIATE_FACTOR
= 2
89 # The speller only checks how much the input matches a word; there can be
90 # all manner of extra unmatched junk, and it won't affect the weighting.
91 # To compensate, greatly boost the weighting of matches at the beginning
92 # and end, so nearly-full-word-matches are much better
93 SPELLER_OPTIONS
= dict(booststart
=10.0, boostend
=9.0)
95 # Dictionary of table name => table class.
96 # Need the table name so we can get the class from the table name after we
97 # retrieve something from the index
98 indexed_tables
= dict(
99 (cls
.__tablename__
, cls
)
def __init__(self, directory=None, session=None):
    """Attach to the whoosh lookup index stored in `directory`.

    When `directory` does not exist or is empty, nothing is opened or
    created: `index` and `speller` become `UninitializedIndex` placeholders
    and `rebuild_index()` has to be called before searching.

    `directory`
        Directory containing the index.  Defaults to whatever
        `get_default_index_dir()` returns.

    `session`
        Database session used for building the index and for loading
        objects.  When not given, `connect()` is called to obtain one.
    """
    # Whichever path is taken below, self.index, self.speller and
    # self.session are all assigned before this method returns.
    if directory is None:
        directory = get_default_index_dir()
    self.directory = directory

    # NOTE(review): a plain truth test is assumed here — confirm.
    self.session = session if session else connect()

    have_files = os.path.exists(directory) and os.listdir(directory)
    if not have_files:
        # Missing or empty directory: the caller must run rebuild_index()
        # first.  Until then, hand out dummies that complain when used.
        self.index = UninitializedIndex()
        self.speller = UninitializedIndex()
        return

    # The directory has contents, so it ought to hold an index.  Failing
    # loudly when it does not is intentional.
    try:
        self.index = whoosh.index.open_dir(directory, indexname='MAIN')
    except whoosh.index.EmptyIndexError:
        # NOTE(review): the exception class raised here is assumed to be
        # IOError — confirm.
        raise IOError(
            "The index directory already contains files. "
            "Please use a dedicated directory for the lookup index."
        )

    # The spell checker shares the index directory
    storage = whoosh.filedb.filestore.FileStorage(directory)
    self.speller = whoosh.spelling.SpellChecker(
        storage, **self.SPELLER_OPTIONS)
def rebuild_index(self):
    """Rebuild the search index and the spelling index from the database."""
    fields = whoosh.fields
    schema = fields.Schema(
        name=fields.ID(stored=True),
        table=fields.ID(stored=True),
        row_id=fields.ID(stored=True),
        language=fields.STORED,
        iso639=fields.ID(stored=True),
        iso3166=fields.ID(stored=True),
        display_name=fields.STORED,  # name as written, not lowercased
    )

    if not os.path.exists(self.directory):
        os.mkdir(self.directory)
    else:
        # create_in() is not fully reliable about clearing old data, so
        # delete stale index files by hand -- but only files that look like
        # ours.
        for filename in os.listdir(self.directory):
            if re.match('^_?(MAIN|SPELL)_', filename):
                os.remove(os.path.join(self.directory, filename))

    self.index = whoosh.index.create_in(
        self.directory, schema=schema, indexname='MAIN')
    writer = self.index.writer()

    speller_entries = set()

    def index_name(row_key, name, language, iso639, iso3166):
        # Store one name for one row, and remember it for the spell checker
        normalized_name = self.normalize_name(name)
        writer.add_document(
            name=normalized_name, display_name=name,
            language=language, iso639=iso639, iso3166=iso3166,
            **row_key
        )
        speller_entries.add(normalized_name)

    # Walk every row of every indexed table
    for cls in self.indexed_tables.values():
        for row in self.session.query(cls).yield_per(5):
            row_key = dict(
                table=unicode(cls.__tablename__),
                row_id=unicode(row.id),
            )

            if cls == tables.Pokemon:
                # Alternate forms of a Pokémon are indexed through their
                # form rows instead, so skip them here
                if not row.is_base_form:
                    continue
            elif cls == tables.PokemonForm:
                # NOTE(review): the guard on this branch is assumed to be
                # "the form has a name" — confirm.
                if row.name:
                    index_name(row_key, row.pokemon_name, None, u'en', u'us')
                continue

            # The basic English name
            index_name(row_key, row.name, None, u'en', u'us')

            # Names in other languages, where the row has them
            # XXX other language form names..?
            for foreign_name in getattr(row, 'foreign_names', []):
                moonspeak = foreign_name.name
                if row.name == moonspeak:
                    # Same as the English name; indexing it again under
                    # another language only muddies spelling results
                    continue

                language = foreign_name.language
                index_name(row_key, moonspeak, language.name,
                           language.iso639, language.iso3166)

                # Japanese names are also indexed in romanized form
                if language.name == 'Japanese':
                    roomaji = romanize(foreign_name.name)
                    index_name(row_key, roomaji, u'Roomaji', u'ja', u'jp')

    writer.commit()

    # Fill the spell checker in one go; each add_* call commits on its own,
    # so feeding it word by word would be slow.
    self.speller = whoosh.spelling.SpellChecker(
        self.index.storage, mingram=2, **self.SPELLER_OPTIONS)
    self.speller.add_words(speller_entries)
def normalize_name(self, name):
    """Strips irrelevant formatting junk from name input.

    Specifically: accents are removed, surrounding whitespace is stripped,
    and everything is lowercased.  `name` is converted to text first, so
    non-string input such as an integer is accepted.
    """
    # `unicode` only exists on Python 2; fall back to `str` on Python 3 so
    # the method works on both.
    try:
        text_type = unicode
    except NameError:
        text_type = str

    # http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
    # Decompose by Unicode rules, drop the combining marks, then recombine.
    # The test is on category 'Mn' rather than combining() because Korean
    # characters also decompose, but into pieces that count as letters --
    # so they survive the filter and recombine correctly afterwards.
    decomposed = unicodedata.normalize('NFKD', text_type(name))
    without_marks = u"".join(
        c for c in decomposed if unicodedata.category(c) != 'Mn')
    recomposed = unicodedata.normalize('NFC', without_marks)

    return recomposed.strip().lower()
def _apply_valid_types(self, name, valid_types):
    """Split any type prefix off `name` and merge it with `valid_types`.

    For example, a name of 'a,b:foo' and valid_types of b,c will search for
    only `b`s named "foo".

    Restrictions starting with '@' are language restrictions and are
    matched against both the iso639 and iso3166 fields.  Anything else is
    passed to `_parse_table_name()` and quietly dropped if unrecognized.

    Returns `(name, merged_valid_types, term)`: `name` with the prefix
    removed, the merged list of restrictions (languages first, then types),
    and a `whoosh.query.And` of the restriction terms.  The `And` is built
    from an empty list when nothing restricts the search.
    """
    # Remove any type prefix (pokemon:133) first
    user_valid_types = []
    if ':' in name:
        prefix_chunk, name = name.split(':', 1)
        name = name.strip()

        for prefix in prefix_chunk.split(','):
            prefix = prefix.strip()
            if prefix:
                user_valid_types.append(prefix)

    def is_language(req):
        # startswith() copes with empty strings (where req[0] would raise)
        # and anything without it, e.g. a table object, is not a language.
        return hasattr(req, 'startswith') and req.startswith(u'@')

    # Merge the two lists.  Only restrictions present in BOTH may be used.
    # As a special case, if the user asked only for things the caller has
    # forbidden, the user's request is ignored completely.  Languages and
    # table types are merged separately.
    def merge_requirements(wanted):
        # Real lists, not filter(): on Python 3 filter() is a lazy iterator
        # that is always truthy and cannot be concatenated.
        user = [req for req in user_valid_types if wanted(req)]
        system = [req for req in valid_types if wanted(req)]

        if user and system:
            # No overlap means the system restrictions win
            return list(set(user) & set(system)) or system

        # One or the other is blank; use the one that's not
        return user or system

    lang_requirements = merge_requirements(is_language)
    type_requirements = merge_requirements(lambda req: not is_language(req))
    all_requirements = lang_requirements + type_requirements

    # Language terms: allow for either country or language codes
    lang_terms = []
    for lang in lang_requirements:
        lang_code = lang[1:]
        lang_terms.append(whoosh.query.Term(u'iso639', lang_code))
        lang_terms.append(whoosh.query.Term(u'iso3166', lang_code))

    # Table terms: quietly ignore bogus valid_types; more likely to DTRT
    type_terms = []
    for requirement in type_requirements:
        table_name = self._parse_table_name(requirement)
        if table_name:
            type_terms.append(whoosh.query.Term(u'table', table_name))

    # Combine both kinds of restriction
    all_terms = []
    if type_terms:
        all_terms.append(whoosh.query.Or(type_terms))
    if lang_terms:
        all_terms.append(whoosh.query.Or(lang_terms))

    return name, all_requirements, whoosh.query.And(all_terms)
def _parse_table_name(self, name):
    """Resolve a table object, table name, or singular table name to the
    table name.

    Returns None for a bogus name.
    """
    # A table object already knows its own name
    if hasattr(name, '__tablename__'):
        return name.__tablename__

    # Otherwise match against the plural or singular name of each indexed
    # table; None is the polite answer when nothing matches
    candidates = (
        table.__tablename__
        for table in self.indexed_tables.values()
        if name in (table.__tablename__, table.__singlename__)
    )
    return next(candidates, None)
def _whoosh_records_to_results(self, records, exact=True):
    """Turn whoosh's indexed records into a list of LookupResult tuples
    holding the matching database objects.

    Only the first record for each (table, row_id) pair is kept.  `exact`
    is copied into every result.
    """
    # XXX passing 'exact' through here is leaky; only lookup() cares about
    # fuzzy results, so a better arrangement would be welcome
    already_seen = set()
    converted = []
    for hit in records:
        identity = (hit['table'], hit['row_id'])
        if identity in already_seen:
            # Same object found under another name; skip the dupe
            continue
        already_seen.add(identity)

        table_class = self.indexed_tables[hit['table']]
        db_object = self.session.query(table_class).get(hit['row_id'])

        result = LookupResult(
            object=db_object,
            indexed_name=hit['name'],
            name=hit['display_name'],
            language=hit.get('language'),
            iso639=hit['iso639'],
            iso3166=hit['iso3166'],
            exact=exact,
        )
        converted.append(result)

    return converted
def lookup(self, input, valid_types=[], exact_only=False):
    """Try to find database objects matching a name.

    Returns a list of LookupResult tuples.  `object` is the database
    object, `name` is the name under which it was found, `language` and the
    two iso codes describe the language that name is in, and `exact` is
    True iff the match was exact.

    Fuzzy matching is currently ONLY attempted when there are no exact
    matches at all.

    Surrounding whitespace in the input is discarded.

    Recognized input:
    - Names: "Eevee", "Surf", "Run Away", "Payapa Berry", etc.
    - Foreign names: "Iibui", "Eivui"
    - Fuzzy names in whatever language: "Evee", "Ibui"
    - IDs: "133", "192", "250"
    - Wildcards using `*` and `?`
    - Type restrictions.  "type:psychic" will only return the type.  This
      is what makes ID lookup useful.  Several can be given with commas,
      as "move,item:1".
    - Language restrictions.  "@fr:charge" will only return Tackle, which
      is called "Charge" in French.  These combine with type restrictions,
      e.g., "@fr,move:charge".
    - Alternate formes, written like "wash rotom".

    `input`
        Name of the thing to look for.

    `valid_types`
        A list of type or language restrictions, e.g., `['pokemon',
        '@ja']`.  If given, only results from one of the named tables are
        returned.

    `exact_only`
        If True, only exact matches are returned.  If False (the default)
        and nothing matches exactly, spelling correction is attempted.
    """
    exact = True

    # Normalize, then pop off any type prefix and merge it with valid_types
    name, merged_valid_types, type_term = self._apply_valid_types(
        self.normalize_name(input), valid_types)

    # NOTE(review): the literal that triggers a random lookup is assumed to
    # be 'random' — confirm.
    if name == 'random':
        return self.random_lookup(valid_types=merged_valid_types)

    # Pick a query by the shape of the input.  Term queries match exactly,
    # so odd characters in the input cannot trip up a query parser.
    try:
        # base=0 lets Python accept forms like 0xff as well
        name_as_number = int(name, base=0)
    except ValueError:
        name_as_number = None

    has_wildcard = '*' in name or '?' in name
    if has_wildcard:
        exact_only = True
        query = whoosh.query.Wildcard(u'name', name)
    elif name_as_number is not None:
        # Numbers are ids and are never spell-checked
        exact_only = True
        query = whoosh.query.Term(u'row_id', unicode(name_as_number))
    else:
        query = whoosh.query.Term(u'name', name)

    if type_term:
        query = query & type_term

    # Result limits are class constants; the intermediate search (before
    # duplicates are removed) is allowed INTERMEDIATE_FACTOR times as many.
    # Exact matches -- wildcards and ids -- get the larger limit and are
    # sorted by name, since weight means nothing for them.
    search_options = {}
    if exact_only:
        max_results = self.MAX_EXACT_RESULTS
        search_options['sortedby'] = (u'table', u'name')
    else:
        max_results = self.MAX_FUZZY_RESULTS

    searcher = self.index.searcher(weighting=LanguageWeighting())
    results = searcher.search(
        query,
        limit=int(max_results * self.INTERMEDIATE_FACTOR),
        **search_options
    )

    # Fall back to spelling suggestions if nothing matched exactly
    if not (exact_only or results):
        exact = False

        fuzzy_weights = {}
        fuzzy_query_parts = []
        cutoff = None
        suggestions = self.speller.suggestions_and_scores(name)
        for suggestion, _, weight in suggestions:
            # Keep only suggestions scoring at least half as well as the
            # first one; the rest is trailing junk.
            if cutoff is None:
                cutoff = weight * 0.5
            elif weight < cutoff:
                # NOTE(review): assumes suggestions arrive best-first, so
                # everything after this one can be dropped — confirm.
                break

            fuzzy_query_parts.append(whoosh.query.Term('name', suggestion))
            fuzzy_weights[suggestion] = weight

        if not fuzzy_query_parts:
            # No suggestions at all, so there is nothing to search for
            return []

        fuzzy_query = whoosh.query.Or(fuzzy_query_parts)
        if type_term:
            fuzzy_query = fuzzy_query & type_term

        searcher.weighting = LanguageWeighting(extra_weights=fuzzy_weights)
        results = searcher.search(fuzzy_query)

    # Convert to database objects, then trim to the final limit
    found = self._whoosh_records_to_results(results, exact=exact)
    return found[:max_results]
def random_lookup(self, valid_types=[]):
    """Returns a random lookup result from one of the provided
    `valid_types`.

    Entries of `valid_types` that `_parse_table_name()` does not recognize
    (language codes, for instance) are skipped.  If that leaves nothing,
    the whole index is used instead.  Pokémon forms are never picked.

    The result is whatever `lookup()` returns for the chosen id, or an
    empty list when the candidate tables hold no rows at all.
    """
    table_names = []
    for valid_type in valid_types:
        table_name = self._parse_table_name(valid_type)
        # XXX The vast majority of Pokémon forms are unnamed and unindexed,
        # which can produce blank results.  So skip them too for now.
        if table_name and table_name != 'pokemon_forms':
            table_names.append(table_name)

    if not table_names:
        # Nothing usable was requested; politely select from the entire
        # index instead.  Built as a list because keys() is a view on
        # Python 3 and has no remove().
        table_names = [
            table_name for table_name in self.indexed_tables
            if table_name != 'pokemon_forms'
        ]

    # Rather than list every candidate row, count the rows per table, draw
    # one number from the grand total, and find the table it falls in.
    # This also avoids the bias of the index holding more rows (one per
    # language) for some items than for others.
    # XXX ought to cache this (in the index?) if possible
    total = 0
    partitions = []
    for table_name in table_names:
        count = self.session.query(self.indexed_tables[table_name]).count()
        total += count
        partitions.append((table_name, count))

    if total == 0:
        # randint(1, 0) would raise; there is simply nothing to pick
        return []

    n = random.randint(1, total)
    for table_name, count in partitions:
        if n <= count:
            break
        n -= count

    # u'%d' yields text on both Python 2 and 3, unlike unicode()
    return self.lookup(u'%d' % n, valid_types=[table_name])
def prefix_lookup(self, prefix, valid_types=[]):
    """Find indexed names that begin with exactly the given prefix.

    A type prefix on the input is honored and merged with `valid_types`;
    beyond the usual name normalization, nothing else is done to it.
    """
    # Separate the type prefix from the text to search for
    prefix, merged_valid_types, type_term = self._apply_valid_types(
        prefix, valid_types)

    normalized = self.normalize_name(prefix)
    query = whoosh.query.Prefix(u'name', normalized)
    if type_term:
        query = query & type_term

    searcher = self.index.searcher()
    searcher.weighting = LanguageWeighting()
    # XXX , limit=self.MAX_LOOKUP_RESULTS)
    hits = searcher.search(query)

    return self._whoosh_records_to_results(hits)