8 from sqlalchemy
.sql
import func
10 import whoosh
.filedb
.filestore
11 import whoosh
.filedb
.fileindex
13 from whoosh
.qparser
import QueryParser
15 import whoosh
.spelling
17 from pokedex
.util
import namedtuple
19 from pokedex
.db
import connect
20 import pokedex
.db
.tables
as tables
21 from pokedex
.roomaji
import romanize
22 from pokedex
.defaults
import get_default_index_dir
24 __all__
= ['PokedexLookup']
# Matches a string consisting solely of decimal digits.  Raw string, so the
# `\d` reaches the regex engine instead of being parsed as a string escape.
rx_is_number = re.compile(r'^\d+$')

# Record type produced by the lookup methods.  Field order is part of the
# interface, since results can be unpacked positionally.
LookupResult = namedtuple('LookupResult', [
    'object', 'indexed_name', 'name', 'language', 'iso639', 'iso3166', 'exact',
])
class UninitializedIndex(object):
    """Stand-in for a lookup index that has not been built yet.

    Instances are falsy, and any attribute that is not found through normal
    lookup raises `UninitializedIndexError` instead of `AttributeError`.
    """

    class UninitializedIndexError(Exception):
        """Raised when the stand-in is used as though it were a real index."""

    def __nonzero__(self):
        """Dummy object should identify itself as False."""
        return False

    def __bool__(self):
        """Python 3000 version of the above.  Future-proofing rules!"""
        return False

    def __getattr__(self, *args, **kwargs):
        # Only reached for attributes missing from the instance and class, so
        # the lookup of `self.UninitializedIndexError` below does not recurse.
        raise self.UninitializedIndexError(
            "The lookup index does not exist. Please use `pokedex setup` "
            "or lookup.rebuild_index() to create it."
        )
class LanguageWeighting(whoosh.scoring.Weighting):
    """A scoring class that forces otherwise-equal English results to come
    before foreign results.
    """

    # NOTE(review): the two multipliers applied in score() were not visible in
    # the reviewed excerpt; only the comments describing each branch survived.
    # The values below are a reconstruction -- confirm them against the
    # upstream file.  The comments establish only the intent: the default
    # language keeps its weight, Roomaji is favoured over other foreign
    # languages, and everything else ranks lowest.
    ROOMAJI_FACTOR = 0.9
    FOREIGN_FACTOR = 0.8

    def __init__(self, extra_weights=None, *args, **kwargs):
        """`extra_weights` may be a dictionary mapping a term's text to a
        factor that is multiplied into that term's weight.  Terms without an
        entry are left alone.

        Intended for use with spelling corrections, which come along with their
        own weightings.

        Any remaining arguments are passed through to the base class.
        """
        # Build a fresh dict per instance rather than sharing one default
        # object between every instance created without `extra_weights`.
        if extra_weights is None:
            extra_weights = {}

        self.extra_weights = extra_weights
        super(LanguageWeighting, self).__init__(*args, **kwargs)

    def score(self, searcher, fieldnum, text, docnum, weight, QTF=1):
        """Returns `weight` adjusted by the per-term extra weight and by the
        language stored with document `docnum`.
        """
        doc = searcher.stored_fields(docnum)

        # Apply extra weight
        weight = weight * self.extra_weights.get(text, 1.0)

        language = doc.get('language')
        if language is None:
            # English (well, "default"); leave it at 1
            return weight
        elif language == u'Roomaji':
            # Give Roomaji a little boost; it's most likely to be searched
            return weight * self.ROOMAJI_FACTOR
        else:
            # Everything else can drop down the totem pole
            return weight * self.FOREIGN_FACTOR
class PokedexLookup(object):
    """Finds database objects by name, using a whoosh index of normalized
    names plus a whoosh spell-checker for fuzzy matching.
    """

    # NOTE(review): the excerpt this class was reviewed from had dropped
    # lines.  Control-flow scaffolding (try/else/return/continue, list and
    # counter initialisers) was restored from the surrounding comments and
    # usages.  The following pieces had NO visible counterpart and must be
    # confirmed against the upstream file:
    #   - the table classes listed in `indexed_tables`
    #   - the exception type raised for a non-index directory in __init__
    #   - `indexname='MAIN'` in rebuild_index (mirrors the open_dir() call)
    #   - the literal name that triggers random_lookup() from lookup()

    MAX_FUZZY_RESULTS = 10
    MAX_EXACT_RESULTS = 43
    INTERMEDIATE_FACTOR = 2

    # The speller only checks how much the input matches a word; there can be
    # all manner of extra unmatched junk, and it won't affect the weighting.
    # To compensate, greatly boost the weighting of matches at the beginning
    # and end, so nearly-full-word-matches are much better
    SPELLER_OPTIONS = dict(booststart=10.0, boostend=9.0)

    # Dictionary of table name => table class.
    # Need the table name so we can get the class from the table name after we
    # retrieve something from the index
    indexed_tables = dict(
        (cls.__tablename__, cls)
        for cls in (
            # NOTE(review): reconstructed list; see the note at the top of
            # the class.
            tables.Ability,
            tables.Item,
            tables.Location,
            tables.Move,
            tables.Nature,
            tables.Pokemon,
            tables.Type,
        )
    )

    def __init__(self, directory=None, session=None):
        """Opens the whoosh index stored in the named directory.

        If the directory does not exist or is empty, nothing is created here:
        `self.index` and `self.speller` are set to `UninitializedIndex`
        placeholders, and `rebuild_index()` must be called before any lookup.

        `directory`
            Directory containing the index.  Defaults to the result of
            `get_default_index_dir()`.

        `session`
            Used for creating the index and retrieving objects.  Defaults to
            the result of `connect()`.
        """

        # By the time this returns, self.index, self.speller, and self.session
        # must be set

        # If a directory was not given, use the default
        if directory is None:
            directory = get_default_index_dir()

        self.directory = directory

        if session:
            self.session = session
        else:
            self.session = connect()

        # Attempt to open the index
        if not os.path.exists(directory) or not os.listdir(directory):
            # Directory doesn't exist OR is empty; caller needs to use
            # rebuild_index before doing anything.  Provide a dummy object that
            # complains when used
            self.index = UninitializedIndex()
            self.speller = UninitializedIndex()
            return

        # Otherwise, already exists; should be an index!  Bam, done.
        # Note that this will explode if the directory exists but doesn't
        # contain an index; that's a feature
        try:
            self.index = whoosh.index.open_dir(directory, indexname='MAIN')
        except whoosh.index.EmptyIndexError:
            raise IOError(
                "The index directory already contains files. "
                "Please use a dedicated directory for the lookup index."
            )

        # Create speller, and done
        spell_store = whoosh.filedb.filestore.FileStorage(directory)
        self.speller = whoosh.spelling.SpellChecker(spell_store,
            **self.SPELLER_OPTIONS)

    def rebuild_index(self):
        """Creates the index from scratch.

        Deletes existing MAIN/SPELL index files from `self.directory` (or
        creates the directory), indexes the names of every row of every class
        in `indexed_tables`, then builds the spell-checker from those names.
        Replaces `self.index` and `self.speller`.
        """

        schema = whoosh.fields.Schema(
            name=whoosh.fields.ID(stored=True),
            table=whoosh.fields.ID(stored=True),
            row_id=whoosh.fields.ID(stored=True),
            language=whoosh.fields.STORED,
            iso639=whoosh.fields.ID(stored=True),
            iso3166=whoosh.fields.ID(stored=True),
            display_name=whoosh.fields.STORED,  # non-lowercased name
        )

        if os.path.exists(self.directory):
            # create_in() isn't totally reliable, so just nuke whatever's there
            # manually.  Try to be careful about this...
            for f in os.listdir(self.directory):
                if re.match('^_?(MAIN|SPELL)_', f):
                    os.remove(os.path.join(self.directory, f))
        else:
            os.mkdir(self.directory)

        self.index = whoosh.index.create_in(self.directory, schema=schema,
                                            indexname='MAIN')
        writer = self.index.writer()

        speller_entries = set()

        def add(row_key, name, language, iso639, iso3166):
            # Index one name for the row identified by `row_key`, and remember
            # its normalized form for the spell-checker.
            normalized_name = self.normalize_name(name)

            writer.add_document(
                name=normalized_name, display_name=name,
                language=language, iso639=iso639, iso3166=iso3166,
                **row_key
            )

            speller_entries.add(normalized_name)

        # Index every name in all our tables of interest
        for cls in self.indexed_tables.values():
            q = self.session.query(cls)

            for row in q.yield_per(5):
                row_key = dict(table=unicode(cls.__tablename__),
                               row_id=unicode(row.id))

                # Add the basic English name to the index
                if cls == tables.Pokemon:
                    # Pokémon need their form name added
                    add(row_key, row.full_name, None, u'en', u'us')

                    # If this is a default form, ALSO add the unadorned name,
                    # so 'Deoxys' alone will still do the right thing
                    if row.forme_name and not row.forme_base_pokemon_id:
                        add(row_key, row.name, None, u'en', u'us')
                else:
                    add(row_key, row.name, None, u'en', u'us')

                # Some things also have other languages' names
                # XXX other language form names..?
                for foreign_name in getattr(row, 'foreign_names', []):
                    moonspeak = foreign_name.name
                    if row.name == moonspeak:
                        # Don't add the English name again as a different
                        # language; no point and it makes spell results
                        # confusing
                        continue

                    add(row_key, moonspeak,
                        foreign_name.language.name,
                        foreign_name.language.iso639,
                        foreign_name.language.iso3166)

                    # Add Roomaji too
                    if foreign_name.language.name == 'Japanese':
                        roomaji = romanize(foreign_name.name)
                        add(row_key, roomaji, u'Roomaji', u'ja', u'jp')

        writer.commit()

        # Construct and populate a spell-checker index.  Quicker to do it all
        # at once, as every call to add_* does a commit(), and those seem to be
        # expensive
        self.speller = whoosh.spelling.SpellChecker(
            self.index.storage, mingram=2, **self.SPELLER_OPTIONS)
        self.speller.add_words(speller_entries)

    def normalize_name(self, name):
        """Strips irrelevant formatting junk from name input.

        Specifically: everything is lowercased, accents are removed, and
        surrounding whitespace is dropped.  Returns a unicode string.
        """
        # http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
        # Makes sense to me.  Decompose by Unicode rules, then remove combining
        # characters, then recombine.  I'm explicitly doing it this way instead
        # of testing combining() because Korean characters apparently
        # decompose!  But the results are considered letters, not combining
        # characters, so testing for Mn works well, and combining them again
        # makes them look right.
        nkfd_form = unicodedata.normalize('NFKD', unicode(name))
        name = u"".join(c for c in nkfd_form
                        if unicodedata.category(c) != 'Mn')
        name = unicodedata.normalize('NFC', name)

        return name.strip().lower()

    def _apply_valid_types(self, name, valid_types):
        """Combines the enforced `valid_types` with any from the search string
        itself and builds the matching query restriction.

        For example, a name of 'a,b:foo' and valid_types of b,c will search for
        only `b`s named "foo".

        Returns `(name, merged_valid_types, term)`, where `name` has had any
        type prefix stripped, `merged_valid_types` combines the original
        `valid_types` with the type prefix, and `term` is a `whoosh.query.And`
        limiting results to just the allowed types and languages.  `term` is
        always a query object; with no restrictions at all it wraps an empty
        list of subqueries.
        """

        # Remove any type prefix (pokemon:133) first
        user_valid_types = []
        if ':' in name:
            prefix_chunk, name = name.split(':', 1)
            name = name.strip()

            for prefix in prefix_chunk.split(','):
                prefix = prefix.strip()
                if prefix:
                    user_valid_types.append(prefix)

        # Merge the valid types together.  Only types that appear in BOTH lists
        # may be used.
        # As a special case, if the user asked for types that are explicitly
        # forbidden, completely ignore what the user requested.
        # And, just to complicate matters: "type" and language need to be
        # considered separately.
        def merge_requirements(func):
            # Real lists (not filter() results) so that truth-testing and
            # concatenation below behave the same on every Python version.
            user = [req for req in user_valid_types if func(req)]
            system = [req for req in valid_types if func(req)]

            if user and system:
                merged = list(set(user) & set(system))
                # No overlap; use the system restrictions
                return merged or system

            # One or the other is blank; use the one that's not
            return user or system

        # @foo means language must be foo; otherwise it's a table name.
        # Slicing instead of indexing keeps an empty string from raising.
        lang_requirements = merge_requirements(lambda req: req[:1] == u'@')
        type_requirements = merge_requirements(lambda req: req[:1] != u'@')
        all_requirements = lang_requirements + type_requirements

        # Construct the term
        lang_terms = []
        for lang in lang_requirements:
            # Allow for either country or language codes
            lang_code = lang[1:]
            lang_terms.append(whoosh.query.Term(u'iso639', lang_code))
            lang_terms.append(whoosh.query.Term(u'iso3166', lang_code))

        type_terms = []
        for type_requirement in type_requirements:
            table_name = self._parse_table_name(type_requirement)

            # Quietly ignore bogus valid_types; more likely to DTRT
            if table_name:
                type_terms.append(whoosh.query.Term(u'table', table_name))

        # Combine both kinds of restriction
        all_terms = []
        if type_terms:
            all_terms.append(whoosh.query.Or(type_terms))
        if lang_terms:
            all_terms.append(whoosh.query.Or(lang_terms))

        return name, all_requirements, whoosh.query.And(all_terms)

    def _parse_table_name(self, name):
        """Takes a singular table name, table name, or table object and returns
        the table name.

        Returns None for a bogus name.
        """
        # Table object
        if hasattr(name, '__tablename__'):
            return name.__tablename__

        # Table name, plural or singular
        for table in self.indexed_tables.values():
            if name in (table.__tablename__, table.__singlename__):
                return table.__tablename__

        # Bogus.  Be nice and return dummy
        return None

    def _whoosh_records_to_results(self, records, exact=True):
        """Converts a list of whoosh's indexed records to LookupResult tuples
        containing database objects.

        Only the first record for each (table, row_id) pair is kept.  `exact`
        is copied into every result.
        """
        # XXX this 'exact' thing is getting kinda leaky.  would like a better
        # way to handle it, since only lookup() cares about fuzzy results
        seen = set()
        results = []
        for record in records:
            # Skip dupes
            seen_key = record['table'], record['row_id']
            if seen_key in seen:
                continue
            seen.add(seen_key)

            cls = self.indexed_tables[record['table']]
            obj = self.session.query(cls).get(record['row_id'])

            results.append(LookupResult(object=obj,
                                        indexed_name=record['name'],
                                        name=record['display_name'],
                                        language=record.get('language'),
                                        iso639=record['iso639'],
                                        iso3166=record['iso3166'],
                                        exact=exact))

        return results

    def lookup(self, input, valid_types=None, exact_only=False):
        """Attempts to find some sort of object, given a name.

        Returns a list of `LookupResult` named tuples: (object, indexed_name,
        name, language, iso639, iso3166, exact).  `object` is a database
        object, `indexed_name` is the normalized name stored in the index,
        `name` is the name under which the object was found, `language` and
        the two isos are the name and country codes of the language in which
        the name was found, and `exact` is True iff this was an exact match.

        This function currently ONLY does fuzzy matching if there are no exact
        matches.

        Formes are not returned unless requested; only a default forme is
        indexed under the unadorned name.

        Extraneous whitespace is removed with extreme prejudice.

        Recognizes:
        - Names: "Eevee", "Surf", "Run Away", "Payapa Berry", etc.
        - Foreign names: "Iibui", "Eivui"
        - Fuzzy names in whatever language: "Evee", "Ibui"
        - IDs: "133", "192", "250"
        - Wildcards, using `*` and `?`.  These are never spell-corrected.
        - Type restrictions.  "type:psychic" will only return the type.  This
          is how to make ID lookup useful.  Multiple type specs can be entered
          with commas, as "move,item:1".
        - Language restrictions.  "@fr:charge" will only return Tackle, which
          is called "Charge" in French.  These can be combined with type
          restrictions, e.g., "@fr,move:charge".
        - Alternate formes can be specified merely like "wash rotom".

        `input`
            Name of the thing to look for.

        `valid_types`
            A list of type or language restrictions, e.g., `['pokemon',
            '@ja']`.  If this is provided, only results in one of the given
            tables will be returned.

        `exact_only`
            If True, only exact matches are returned.  If set to False (the
            default), and the provided `name` doesn't match anything exactly,
            spelling correction will be attempted.
        """
        if valid_types is None:
            valid_types = []

        name = self.normalize_name(input)
        exact = True

        # Pop off any type prefix and merge with valid_types
        name, merged_valid_types, type_term = \
            self._apply_valid_types(name, valid_types)

        # Random lookup
        if name == 'random':
            return self.random_lookup(valid_types=merged_valid_types)

        # Do different things depending what the query looks like
        # Note: Term objects do an exact match, so we don't have to worry about
        # a query parser tripping on weird characters in the input
        try:
            # Let Python try to convert to a number, so 0xff works
            name_as_number = int(name, base=0)
        except ValueError:
            name_as_number = None

        if '*' in name or '?' in name:
            exact_only = True
            query = whoosh.query.Wildcard(u'name', name)
        elif name_as_number is not None:
            # Don't spell-check numbers!
            exact_only = True
            query = whoosh.query.Term(u'row_id', unicode(name_as_number))
        else:
            # Not an integer
            query = whoosh.query.Term(u'name', name)

        if type_term:
            query = query & type_term

        ### Actual searching
        # Limits; result limits are constants, and intermediate results (before
        # duplicate items are stripped out) are capped at the result limit
        # times another constant.
        # Fuzzy are capped at 10, beyond which something is probably very
        # wrong.  Exact matches -- that is, wildcards and ids -- are far less
        # constrained.
        # Also, exact matches are sorted by name, since weight doesn't matter.
        sort_by = {}
        if exact_only:
            max_results = self.MAX_EXACT_RESULTS
            sort_by['sortedby'] = (u'table', u'name')
        else:
            max_results = self.MAX_FUZZY_RESULTS

        searcher = self.index.searcher(weighting=LanguageWeighting())
        results = searcher.search(
            query,
            limit=int(max_results * self.INTERMEDIATE_FACTOR),
            **sort_by
        )

        # Look for some fuzzy matches if necessary
        if not exact_only and not results:
            exact = False

            fuzzy_query_parts = []
            fuzzy_weights = {}
            min_weight = None
            for suggestion, _, weight in \
                    self.speller.suggestions_and_scores(name):
                # Only allow the top 50% of scores; otherwise there will always
                # be a lot of trailing junk.  The cutoff is derived from the
                # first score seen -- presumably suggestions arrive
                # best-first; TODO confirm.
                if min_weight is None:
                    min_weight = weight * 0.5
                elif weight < min_weight:
                    break

                fuzzy_query_parts.append(whoosh.query.Term('name', suggestion))
                fuzzy_weights[suggestion] = weight

            if not fuzzy_query_parts:
                # Nothing at all; don't try querying
                return []

            fuzzy_query = whoosh.query.Or(fuzzy_query_parts)
            if type_term:
                fuzzy_query = fuzzy_query & type_term

            searcher.weighting = LanguageWeighting(extra_weights=fuzzy_weights)
            results = searcher.search(fuzzy_query)

        ### Convert results to db objects
        objects = self._whoosh_records_to_results(results, exact=exact)

        # Truncate and return
        return objects[:max_results]

    def random_lookup(self, valid_types=None):
        """Returns a random lookup result from one of the provided
        `valid_types`.

        Entries of `valid_types` that are not recognized as tables are
        skipped; if none remain, every indexed table is a candidate.  Returns
        an empty list if the candidate tables contain no rows at all.
        """
        if valid_types is None:
            valid_types = []

        table_names = []
        for valid_type in valid_types:
            table_name = self._parse_table_name(valid_type)
            # Skip anything not recognized.  Could be, say, a language code
            if table_name:
                table_names.append(table_name)

        if not table_names:
            # n.b.: It's possible we got a list of valid_types and none of them
            # were valid, but this function is guaranteed to return
            # *something*, so it politely selects from the entire index instead
            table_names = list(self.indexed_tables)

        # Rather than create an array of many hundred items and pick randomly
        # from it, just pick a number up to the total number of potential
        # items, then pick randomly from that, and partition the whole range
        # into chunks.  This also avoids the slight problem that the index
        # contains more rows (for languages) for some items than others.
        # XXX ought to cache this (in the index?) if possible
        total = 0
        partitions = []
        for table_name in table_names:
            count = self.session.query(self.indexed_tables[table_name]).count()
            total += count
            partitions.append((table_name, count))

        if not total:
            # randint(1, 0) would raise; there is simply nothing to pick
            return []

        # Walk the partitions until `n` falls inside one.  Since n <= total,
        # the loop always breaks with `table_name` set to the chosen table.
        # NOTE(review): the final id lookup presumably relies on ids in each
        # table running from 1 to count -- verify.
        n = random.randint(1, total)
        for table_name, count in partitions:
            if n <= count:
                break
            n -= count

        return self.lookup(unicode(n), valid_types=[table_name])

    def prefix_lookup(self, prefix, valid_types=None):
        """Returns terms starting with the given exact prefix.

        Type prefixes are recognized, and the remaining text is passed through
        `normalize_name()`; no wildcard, id, or fuzzy handling is done.
        """
        if valid_types is None:
            valid_types = []

        # Pop off any type prefix and merge with valid_types
        prefix, merged_valid_types, type_term = \
            self._apply_valid_types(prefix, valid_types)

        query = whoosh.query.Prefix(u'name', self.normalize_name(prefix))

        if type_term:
            query = query & type_term

        searcher = self.index.searcher()
        searcher.weighting = LanguageWeighting()
        # XXX no result limit is applied here; the MAX_LOOKUP_RESULTS name the
        # old comment referred to is not defined on this class
        results = searcher.search(query)

        return self._whoosh_records_to_results(results)