8 from sqlalchemy
.sql
import func
10 import whoosh
.filedb
.filestore
11 import whoosh
.filedb
.fileindex
13 from whoosh
.qparser
import QueryParser
15 import whoosh
.spelling
17 from pokedex
.util
import namedtuple
19 from pokedex
.db
import connect
20 import pokedex
.db
.tables
as tables
21 from pokedex
.roomaji
import romanize
22 from pokedex
.defaults
import get_default_index_dir
24 __all__
= ['PokedexLookup']
# Matches input made up of one or more digits and nothing else.
# Raw string so that `\d` reaches the regex engine as written instead of
# relying on Python passing an unknown string escape through untouched.
rx_is_number = re.compile(r'^\d+$')
# One lookup hit.  `object` is the database row, `indexed_name` is the name
# as stored in the index, `name` is the stored display form of that name,
# `language`/`iso639`/`iso3166` describe the language the name belongs to,
# and `exact` tells whether the hit came from an exact (non-fuzzy) match.
LookupResult = namedtuple('LookupResult', [
    'object', 'indexed_name', 'name', 'language', 'iso639', 'iso3166', 'exact',
])
class UninitializedIndex(object):
    """Placeholder used in place of an index that has not been built yet.

    The object is falsy, and touching any attribute it does not define
    raises `UninitializedIndexError` with instructions for building the
    index.
    """

    class UninitializedIndexError(Exception):
        """Raised when the placeholder index is actually used."""
        pass

    def __nonzero__(self):
        """Dummy object should identify itself as False."""
        return False

    def __bool__(self):
        """Python 3000 version of the above. Future-proofing rules!"""
        return False

    def __getattr__(self, *args, **kwargs):
        # Only reached for attributes that normal lookup cannot find, so
        # `self.UninitializedIndexError` itself resolves without recursion.
        raise self.UninitializedIndexError(
            "The lookup index does not exist. Please use `pokedex setup` "
            "or lookup.rebuild_index() to create it."
        )
51 class LanguageWeighting(whoosh
.scoring
.Weighting
):
52 """A scoring class that forces otherwise-equal English results to come
53 before foreign results.
def __init__(self, extra_weights=None, *args, **kwargs):
    """Set up the weighting, optionally with per-term multipliers.

    `extra_weights` may be a dictionary keyed by term text; `score()`
    multiplies a term's weight by the matching entry, and terms without an
    entry are left unscaled.  Intended for use with spelling corrections.

    Any remaining positional and keyword arguments are passed through to
    the parent class's initializer.
    """
    # Build a fresh dict per instance rather than sharing one mutable
    # default object between every instance created without the argument.
    self.extra_weights = {} if extra_weights is None else extra_weights
    super(LanguageWeighting, self).__init__(*args, **kwargs)
66 def score(self
, searcher
, fieldnum
, text
, docnum
, weight
, QTF
=1):
67 doc
= searcher
.stored_fields(docnum
)
70 weight
= weight
* self
.extra_weights
.get(text
, 1.0)
72 if doc
['language'] == None:
73 # English (well, "default"); leave it at 1
75 elif doc
['language'] == u
'Roomaji':
76 # Give Roomaji a little boost; it's most likely to be searched
79 # Everything else can drop down the totem pole
83 class PokedexLookup(object):
84 INTERMEDIATE_LOOKUP_RESULTS
= 25
85 MAX_LOOKUP_RESULTS
= 10
87 # The speller only checks how much the input matches a word; there can be
88 # all manner of extra unmatched junk, and it won't affect the weighting.
89 # To compensate, greatly boost the weighting of matches at the beginning
90 # and end, so nearly-full-word-matches are much better
91 SPELLER_OPTIONS
= dict(booststart
=10.0, boostend
=9.0)
93 # Dictionary of table name => table class.
94 # Need the table name so we can get the class from the table name after we
95 # retrieve something from the index
96 indexed_tables
= dict(
97 (cls
.__tablename__
, cls
)
110 def __init__(self
, directory
=None, session
=None):
111 """Opens the whoosh index stored in the named directory. If the index
112 doesn't already exist, it will be created.
115 Directory containing the index. Defaults to a location within the
116 `pokedex` egg directory.
119 Used for creating the index and retrieving objects. Defaults to an
120 attempt to connect to the default SQLite database installed by
124 # By the time this returns, self.index, self.speller, and self.session
127 # If a directory was not given, use the default
128 if directory
is None:
129 directory
= get_default_index_dir()
131 self
.directory
= directory
134 self
.session
= session
136 self
.session
= connect()
138 # Attempt to open or create the index
139 if not os
.path
.exists(directory
) or not os
.listdir(directory
):
140 # Directory doesn't exist OR is empty; caller needs to use
141 # rebuild_index before doing anything. Provide a dummy object that
142 # complains when used
143 self
.index
= UninitializedIndex()
144 self
.speller
= UninitializedIndex()
147 # Otherwise, already exists; should be an index! Bam, done.
148 # Note that this will explode if the directory exists but doesn't
149 # contain an index; that's a feature
151 self
.index
= whoosh
.index
.open_dir(directory
, indexname
='MAIN')
152 except whoosh
.index
.EmptyIndexError
:
154 "The index directory already contains files. "
155 "Please use a dedicated directory for the lookup index."
158 # Create speller, and done
159 spell_store
= whoosh
.filedb
.filestore
.FileStorage(directory
)
160 self
.speller
= whoosh
.spelling
.SpellChecker(spell_store
,
161 **self
.SPELLER_OPTIONS
)
def rebuild_index(self):
    """Creates the index from scratch.

    Builds a new whoosh index in `self.directory` (creating the directory
    if needed), indexes every name of every row in `self.indexed_tables`,
    and then builds the spell-checker from the same set of normalized
    names.  On return `self.index` and `self.speller` are replaced.
    """
    schema = whoosh.fields.Schema(
        name=whoosh.fields.ID(stored=True),
        table=whoosh.fields.ID(stored=True),
        row_id=whoosh.fields.ID(stored=True),
        language=whoosh.fields.STORED,
        iso639=whoosh.fields.ID(stored=True),
        iso3166=whoosh.fields.ID(stored=True),
        display_name=whoosh.fields.STORED,  # non-lowercased name
    )

    if not os.path.exists(self.directory):
        os.mkdir(self.directory)

    # NOTE(review): the tail of this call is elided in the listing this was
    # restored from; `indexname='MAIN'` is what __init__ passes to
    # open_dir(), so the index must be created under the same name.
    self.index = whoosh.index.create_in(self.directory, schema=schema,
                                        indexname='MAIN')
    writer = self.index.writer()

    # Every normalized name also feeds the spell-checker
    speller_entries = set()

    def add(row_key, name, language, iso639, iso3166):
        """Index one name for the row identified by `row_key`."""
        normalized_name = self.normalize_name(name)

        writer.add_document(
            name=normalized_name, display_name=name,
            language=language, iso639=iso639, iso3166=iso3166,
            **row_key
        )

        speller_entries.add(normalized_name)

    # Index every name in all our tables of interest
    for cls in self.indexed_tables.values():
        q = self.session.query(cls)

        for row in q.yield_per(5):
            # Need the table name and row id so the object can be fetched
            # again after something is retrieved from the index
            row_key = dict(table=unicode(cls.__tablename__),
                           row_id=unicode(row.id))

            # Add the basic English name to the index
            if cls == tables.Pokemon:
                # Pokémon need their form name added
                add(row_key, row.full_name, None, u'en', u'us')

                # If this is a default form, ALSO add the unadorned name,
                # so 'Deoxys' alone will still do the right thing
                if row.forme_name and not row.forme_base_pokemon_id:
                    add(row_key, row.name, None, u'en', u'us')
            else:
                add(row_key, row.name, None, u'en', u'us')

            # Some things also have other languages' names
            # XXX other language form names..?
            for foreign_name in getattr(row, 'foreign_names', []):
                moonspeak = foreign_name.name
                if row.name == moonspeak:
                    # Don't add the English name again as a different
                    # language; no point and it makes spell results
                    # confusing
                    continue

                add(row_key, moonspeak, foreign_name.language.name,
                    foreign_name.language.iso639,
                    foreign_name.language.iso3166)

                # Japanese names are additionally indexed in romanized form
                if foreign_name.language.name == 'Japanese':
                    roomaji = romanize(foreign_name.name)
                    add(row_key, roomaji, u'Roomaji', u'ja', u'jp')

    # NOTE(review): the commit is elided in the listing; without it the
    # documents added above would never be written to the index.
    writer.commit()

    # Construct and populate a spell-checker index.  Quicker to do it all
    # at once, as every call to add_* does a commit()
    self.speller = whoosh.spelling.SpellChecker(self.index.storage, mingram=2,
                                                **self.SPELLER_OPTIONS)
    self.speller.add_words(speller_entries)
def normalize_name(self, name):
    """Strips irrelevant formatting junk from name input.

    Specifically: everything is lowercased, accents are removed, and
    surrounding whitespace is trimmed.  `name` is converted to text first,
    so non-string input such as an integer is accepted.
    """
    # `unicode` only exists as a builtin on Python 2; fall back to `str`
    # elsewhere so the conversion below works on both.
    try:
        text_type = unicode
    except NameError:
        text_type = str

    # http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
    # Decompose by Unicode rules, then remove combining characters, then
    # recombine.  This explicitly tests for category Mn instead of using
    # combining() because Korean characters apparently decompose!  But the
    # results are considered letters, not combining characters, so testing
    # for Mn works well, and combining them again makes them look right.
    nkfd_form = unicodedata.normalize('NFKD', text_type(name))
    name = u"".join(c for c in nkfd_form
                    if unicodedata.category(c) != 'Mn')
    name = unicodedata.normalize('NFC', name)

    name = name.strip()
    name = name.lower()

    return name
270 def _apply_valid_types(self
, name
, valid_types
):
271 """Combines the enforced `valid_types` with any from the search string
272 itself and updates the query.
274 For example, a name of 'a,b:foo' and valid_types of b,c will search for
275 only `b`s named "foo".
277 Returns `(name, merged_valid_types, term)`, where `name` has had any type
278 prefix stripped, `merged_valid_types` combines the original
279 `valid_types` with the type prefix, and `term` is a query term for
280 limited to just the allowed types. If there are no type restrictions
281 at all, `term` will be None.
284 # Remove any type prefix (pokemon:133) first
285 user_valid_types
= []
287 prefix_chunk
, name
= name
.split(':', 1)
290 prefixes
= prefix_chunk
.split(',')
291 user_valid_types
= [_
.strip() for _
in prefixes
]
293 # Merge the valid types together. Only types that appear in BOTH lists
295 # As a special case, if the user asked for types that are explicitly
296 # forbidden, completely ignore what the user requested
297 combined_valid_types
= []
298 if user_valid_types
and valid_types
:
299 combined_valid_types
= list(
300 set(user_valid_types
) & set(combined_valid_types
)
303 if not combined_valid_types
:
304 # No overlap! Just use the enforced ones
305 combined_valid_types
= valid_types
307 # One list or the other was blank, so just use the one that isn't
308 combined_valid_types
= valid_types
+ user_valid_types
310 if not combined_valid_types
:
312 return name
, [], None
317 final_valid_types
= []
318 for valid_type
in combined_valid_types
:
319 if valid_type
.startswith(u
'@'):
320 # @foo means: language must be foo.
321 # Allow for either country or language codes
322 lang_code
= valid_type
[1:]
323 lang_terms
.append(whoosh
.query
.Term(u
'iso639', lang_code
))
324 lang_terms
.append(whoosh
.query
.Term(u
'iso3166', lang_code
))
326 # otherwise, this is a type/table name
327 table_name
= self
._parse_table_name(valid_type
)
329 # Quietly ignore bogus valid_types; more likely to DTRT
331 type_terms
.append(whoosh
.query
.Term(u
'table', table_name
))
333 # Combine both kinds of restriction
336 all_terms
.append(whoosh
.query
.Or(type_terms
))
338 all_terms
.append(whoosh
.query
.Or(lang_terms
))
340 return name
, combined_valid_types
, whoosh
.query
.And(all_terms
)
343 def _parse_table_name(self
, name
):
344 """Takes a singular table name, table name, or table object and returns
347 Returns None for a bogus name.
350 if hasattr(name
, '__tablename__'):
351 return getattr(name
, '__tablename__')
354 for table
in self
.indexed_tables
.values():
355 if name
in (table
.__tablename__
, table
.__singlename__
):
356 return table
.__tablename__
358 # Bogus. Be nice and return dummy
def _whoosh_records_to_results(self, records, exact=True):
    """Converts a list of whoosh's indexed records to LookupResult tuples
    containing database objects.

    Each record's 'table' and 'row_id' are used to fetch the object through
    `self.session`.  Only the first record seen for a given (table, row_id)
    pair is kept.  `exact` is copied verbatim into every result.
    """
    # XXX this 'exact' thing is getting kinda leaky.  would like a better
    # way to handle it, since only lookup() cares about fuzzy results
    seen = set()
    results = []
    for record in records:
        # Skip dupes
        seen_key = record['table'], record['row_id']
        if seen_key in seen:
            continue
        seen.add(seen_key)

        cls = self.indexed_tables[record['table']]
        obj = self.session.query(cls).get(record['row_id'])

        results.append(LookupResult(object=obj,
                                    indexed_name=record['name'],
                                    name=record['display_name'],
                                    language=record['language'],
                                    iso639=record['iso639'],
                                    iso3166=record['iso3166'],
                                    exact=exact))

    return results
390 def lookup(self
, input, valid_types
=[], exact_only
=False):
391 """Attempts to find some sort of object, given a name.
393 Returns a list of named (object, name, language, iso639, iso3166,
394 exact) tuples. `object` is a database object, `name` is the name under
395 which the object was found, `language` and the two isos are the name
396 and country codes of the language in which the name was found, and
397 `exact` is True iff this was an exact match.
399 This function currently ONLY does fuzzy matching if there are no exact
402 Formes are not returned unless requested; "Shaymin" will return only
405 Extraneous whitespace is removed with extreme prejudice.
408 - Names: "Eevee", "Surf", "Run Away", "Payapa Berry", etc.
409 - Foreign names: "Iibui", "Eivui"
410 - Fuzzy names in whatever language: "Evee", "Ibui"
411 - IDs: "133", "192", "250"
413 - Type restrictions. "type:psychic" will only return the type. This
414 is how to make ID lookup useful. Multiple type specs can be entered
415 with commas, as "move,item:1".
416 - Language restrictions. "@fr:charge" will only return Tackle, which
417 is called "Charge" in French. These can be combined with type
418 restrictions, e.g., "@fr,move:charge".
419 - Alternate formes can be specified merely like "wash rotom".
422 Name of the thing to look for.
425 A list of type or language restrictions, e.g., `['pokemon',
426 '@ja']`. If this is provided, only results in one of the given
427 tables will be returned.
430 If True, only exact matches are returned. If set to False (the
431 default), and the provided `name` doesn't match anything exactly,
432 spelling correction will be attempted.
435 name
= self
.normalize_name(input)
439 # Pop off any type prefix and merge with valid_types
440 name
, merged_valid_types
, type_term
= \
441 self
._apply_valid_types(name
, valid_types
)
445 return self
.random_lookup(valid_types
=merged_valid_types
)
447 # Do different things depending what the query looks like
448 # Note: Term objects do an exact match, so we don't have to worry about
449 # a query parser tripping on weird characters in the input
451 # Let Python try to convert to a number, so 0xff works
452 name_as_number
= int(name
, base
=0)
455 name_as_number
= None
457 if '*' in name
or '?' in name
:
459 query
= whoosh
.query
.Wildcard(u
'name', name
)
460 elif name_as_number
is not None:
461 # Don't spell-check numbers!
463 query
= whoosh
.query
.Term(u
'row_id', unicode(name_as_number
))
466 query
= whoosh
.query
.Term(u
'name', name
)
469 query
= query
& type_term
473 searcher
= self
.index
.searcher()
474 # XXX is this kosher? docs say search() takes a weighting arg, but it
476 searcher
.weighting
= LanguageWeighting()
477 results
= searcher
.search(query
,
478 limit
=self
.INTERMEDIATE_LOOKUP_RESULTS
)
480 # Look for some fuzzy matches if necessary
481 if not exact_only
and not results
:
485 fuzzy_query_parts
= []
488 for suggestion
, _
, weight
in self
.speller
.suggestions_and_scores(name
):
489 # Only allow the top 50% of scores; otherwise there will always
490 # be a lot of trailing junk
491 if min_weight
[0] is None:
492 min_weight
[0] = weight
* 0.5
493 elif weight
< min_weight
[0]:
496 fuzzy_query_parts
.append(whoosh
.query
.Term('name', suggestion
))
497 fuzzy_weights
[suggestion
] = weight
499 if not fuzzy_query_parts
:
500 # Nothing at all; don't try querying
503 fuzzy_query
= whoosh
.query
.Or(fuzzy_query_parts
)
505 fuzzy_query
= fuzzy_query
& type_term
507 searcher
.weighting
= LanguageWeighting(extra_weights
=fuzzy_weights
)
508 results
= searcher
.search(fuzzy_query
)
510 ### Convert results to db objects
511 objects
= self
._whoosh_records_to_results(results
, exact
=exact
)
513 # Only return up to 10 matches; beyond that, something is wrong. We
514 # strip out duplicate entries above, so it's remotely possible that we
515 # should have more than 10 here and lost a few. The speller returns 25
516 # to give us some padding, and should avoid that problem. Not a big
517 # deal if we lose the 25th-most-likely match anyway.
518 return objects
[:self
.MAX_LOOKUP_RESULTS
]
def random_lookup(self, valid_types=None):
    """Returns a random lookup result from one of the provided
    `valid_types`.

    Entries of `valid_types` that `_parse_table_name` does not recognize
    are ignored.  If none are recognized (or none are given), the choice is
    made across every indexed table.  Returns whatever `lookup()` returns
    for the chosen row number, or an empty list when the candidate tables
    contain no rows at all.
    """
    table_names = []
    for valid_type in valid_types or []:
        table_name = self._parse_table_name(valid_type)
        # Skip anything not recognized.  Could be, say, a language code
        if table_name:
            table_names.append(table_name)

    if not table_names:
        # n.b.: It's possible we got a list of valid_types and none of them
        # were valid, but this function is guaranteed to return
        # *something*, so it politely selects from the entire index instead
        table_names = self.indexed_tables.keys()

    # Rather than create an array of many hundred items and pick randomly
    # from it, just pick a number up to the total number of potential
    # items, then pick randomly from that, and partition the whole range
    # into chunks.  This also avoids the slight problem that the index
    # contains more rows (for languages) for some items than others.
    # XXX ought to cache this (in the index?) if possible
    total = 0
    partitions = []
    for table_name in table_names:
        count = self.session.query(self.indexed_tables[table_name]).count()
        total += count
        partitions.append((table_name, count))

    if total == 0:
        # Nothing to pick from; randint(1, 0) would raise ValueError
        return []

    # Walk the partitions, subtracting each table's size until the number
    # falls inside one of them
    n = random.randint(1, total)
    for table_name, count in partitions:
        if n <= count:
            break
        n -= count

    # u'%d' yields text on both Python 2 and 3, like unicode(n) did
    return self.lookup(u'%d' % n, valid_types=[table_name])
def prefix_lookup(self, prefix, valid_types=None):
    """Returns results whose indexed name starts with the given prefix.

    Type prefixes are recognized and merged with `valid_types`.  The rest
    of `prefix` is passed through `normalize_name` and matched as an exact
    prefix; no spelling correction is attempted.

    Returns the list produced by `_whoosh_records_to_results`.
    """
    # Pop off any type prefix and merge with valid_types.  None stands in
    # for "no restrictions" so that no mutable default is shared.
    prefix, _, type_term = \
        self._apply_valid_types(prefix, valid_types or [])

    query = whoosh.query.Prefix(u'name', self.normalize_name(prefix))

    # type_term is None when there are no type restrictions at all
    if type_term:
        query = query & type_term

    searcher = self.index.searcher()
    searcher.weighting = LanguageWeighting()
    results = searcher.search(query)  # XXX , limit=self.MAX_LOOKUP_RESULTS)

    return self._whoosh_records_to_results(results)