8 from sqlalchemy
.sql
import func
10 import whoosh
.filedb
.filestore
11 import whoosh
.filedb
.fileindex
13 from whoosh
.qparser
import QueryParser
15 import whoosh
.spelling
17 from pokedex
.util
import namedtuple
19 from pokedex
.db
import connect
20 import pokedex
.db
.tables
as tables
21 from pokedex
.roomaji
import romanize
22 from pokedex
.defaults
import get_default_index_dir
# Names exported by a star-import of this module: only the lookup class.
24 __all__
= ['PokedexLookup']
# Matches strings made up solely of decimal digits.  Written as a raw string
# so that `\d` reaches the regex engine untouched instead of being parsed as
# an (invalid) string escape.
rx_is_number = re.compile(r'^\d+$')

# One lookup hit.  `object` is the database object, `indexed_name` is the
# name as stored in the index, `name` is the display form of that name,
# `language`/`iso639`/`iso3166` identify the language the name belongs to,
# and `exact` tells whether the hit came from an exact match.
LookupResult = namedtuple('LookupResult', [
    'object', 'indexed_name', 'name', 'language', 'iso639', 'iso3166', 'exact',
])
class UninitializedIndex(object):
    """Stand-in for an index that has not been created yet.

    The object is falsy, and any attribute that normal lookup cannot find
    raises `UninitializedIndexError`.
    """

    class UninitializedIndexError(Exception):
        """Raised when the stand-in is used as if it were a real index."""

    def __nonzero__(self):
        """Python 2 truth hook; the stand-in is always false."""
        return False

    def __bool__(self):
        """Python 3 truth hook; the stand-in is always false."""
        return False

    def __getattr__(self, *args, **kwargs):
        """Complain about every attribute that is not otherwise defined."""
        message = (
            "The lookup index does not exist. Please use `pokedex setup` "
            "or lookup.rebuild_index() to create it."
        )
        raise self.UninitializedIndexError(message)
class LanguageWeighting(whoosh.scoring.Weighting):
    """A scoring class that forces otherwise-equal English results to come
    before foreign results.
    """

    def __init__(self, extra_weights=None, *args, **kwargs):
        """Create the weighting.

        `extra_weights` may be a dictionary of weights, keyed by term text,
        which will be multiplied into the score of the matching term.
        Omitting it (or passing None) means no extra weights.

        Intended for use with spelling corrections, which come along with
        their own weightings.

        Any remaining arguments are passed on to the base class.
        """
        # Build a fresh dict per instance: a `{}` default in the signature
        # would be a single dict shared by every default-constructed instance.
        self.extra_weights = {} if extra_weights is None else extra_weights
        super(LanguageWeighting, self).__init__(*args, **kwargs)
# Scores one matching term of one document: the incoming weight is scaled by
# any extra weight registered for the term text, then adjusted according to
# the 'language' value stored with the document.
66 def score(self
, searcher
, fieldnum
, text
, docnum
, weight
, QTF
=1):
# Stored fields of the matched document; only 'language' is read below.
67 doc
= searcher
.stored_fields(docnum
)
# Terms without an entry in extra_weights keep their weight (factor 1.0).
70 weight
= weight
* self
.extra_weights
.get(text
, 1.0)
# A language of None marks the default (English) name of a row.
72 if doc
['language'] == None:
73 # English (well, "default"); leave it at 1
75 elif doc
['language'] == u
'Roomaji':
76 # Give Roomaji a little boost; it's most likely to be searched
79 # Everything else can drop down the totem pole
# Looks up database objects by name through a whoosh index and spell checker.
83 class PokedexLookup(object):
# Caps on the number of results returned by fuzzy and by exact lookups.
84 MAX_FUZZY_RESULTS
= 10
85 MAX_EXACT_RESULTS
= 43
# The searcher is asked for (result cap * this factor) records, leaving room
# for duplicate rows to be dropped before the final truncation.
86 INTERMEDIATE_FACTOR
= 2
88 # The speller only checks how much the input matches a word; there can be
89 # all manner of extra unmatched junk, and it won't affect the weighting.
90 # To compensate, greatly boost the weighting of matches at the beginning
91 # and end, so nearly-full-word-matches are much better
92 SPELLER_OPTIONS
= dict(booststart
=10.0, boostend
=9.0)
94 # Dictionary of table name => table class.
95 # Need the table name so we can get the class from the table name after we
96 # retrieve something from the index
97 indexed_tables
= dict(
98 (cls
.__tablename__
, cls
)
# Constructor.  Records the index directory and the database session, then
# sets self.index and self.speller: to UninitializedIndex placeholders when
# the directory is missing or empty, otherwise to the whoosh index and spell
# checker opened from that directory.
111 def __init__(self
, directory
=None, session
=None):
112 """Opens the whoosh index stored in the named directory. If the index
113 doesn't already exist, it will be created.
116 Directory containing the index. Defaults to a location within the
117 `pokedex` egg directory.
120 Used for creating the index and retrieving objects. Defaults to an
121 attempt to connect to the default SQLite database installed by
125 # By the time this returns, self.index, self.speller, and self.session
128 # If a directory was not given, use the default
129 if directory
is None:
130 directory
= get_default_index_dir()
# Remembered so that rebuild_index() can create the index in the same place.
132 self
.directory
= directory
# Either the caller's session is kept, or a new connection is made.
135 self
.session
= session
137 self
.session
= connect()
139 # Attempt to open or create the index
140 if not os
.path
.exists(directory
) or not os
.listdir(directory
):
141 # Directory doesn't exist OR is empty; caller needs to use
142 # rebuild_index before doing anything. Provide a dummy object that
143 # complains when used
144 self
.index
= UninitializedIndex()
145 self
.speller
= UninitializedIndex()
148 # Otherwise, already exists; should be an index! Bam, done.
149 # Note that this will explode if the directory exists but doesn't
150 # contain an index; that's a feature
152 self
.index
= whoosh
.index
.open_dir(directory
, indexname
='MAIN')
# A non-empty directory without an index is reported with the message below.
153 except whoosh
.index
.EmptyIndexError
:
155 "The index directory already contains files. "
156 "Please use a dedicated directory for the lookup index."
159 # Create speller, and done
160 spell_store
= whoosh
.filedb
.filestore
.FileStorage(directory
)
161 self
.speller
= whoosh
.spelling
.SpellChecker(spell_store
,
162 **self
.SPELLER_OPTIONS
)
def rebuild_index(self):
    """Creates the index from scratch.

    Every row of every class in `indexed_tables` is read through
    `self.session` and its names are written to a new whoosh index in
    `self.directory`, which is created first if it does not exist.  A
    spell checker is then built over all the normalized names.  On return
    `self.index` and `self.speller` refer to the new index and speller.
    """
    stored_id = whoosh.fields.ID
    schema = whoosh.fields.Schema(
        name=stored_id(stored=True),
        table=stored_id(stored=True),
        row_id=stored_id(stored=True),
        language=whoosh.fields.STORED,
        iso639=stored_id(stored=True),
        iso3166=stored_id(stored=True),
        display_name=whoosh.fields.STORED,  # non-lowercased name
    )

    if not os.path.exists(self.directory):
        os.mkdir(self.directory)

    # Same index name that the constructor opens.
    self.index = whoosh.index.create_in(
        self.directory, schema=schema, indexname='MAIN')
    writer = self.index.writer()

    # Every normalized name, collected for the spell checker.
    speller_entries = set()

    def index_name(row_key, name, language, iso639, iso3166):
        # Store one name for the row identified by `row_key`.
        normalized_name = self.normalize_name(name)
        writer.add_document(
            name=normalized_name, display_name=name,
            language=language, iso639=iso639, iso3166=iso3166,
            **row_key
        )
        speller_entries.add(normalized_name)

    # Index every name in all our tables of interest
    for cls in self.indexed_tables.values():
        is_pokemon = cls == tables.Pokemon

        for row in self.session.query(cls).yield_per(5):
            row_key = dict(table=unicode(cls.__tablename__),
                           row_id=unicode(row.id))

            # The basic English name comes first
            if is_pokemon:
                # Pokémon need their form name added
                index_name(row_key, row.full_name, None, u'en', u'us')

                # If this is a default form, ALSO add the unadorned name,
                # so 'Deoxys' alone will still do the right thing
                if row.forme_name and not row.forme_base_pokemon_id:
                    index_name(row_key, row.name, None, u'en', u'us')
            else:
                index_name(row_key, row.name, None, u'en', u'us')

            # Some things also have other languages' names
            # XXX other language form names..?
            for foreign_name in getattr(row, 'foreign_names', []):
                moonspeak = foreign_name.name
                if row.name == moonspeak:
                    # Identical to the English name: adding it again under
                    # another language only muddles the spell results
                    continue

                index_name(row_key, moonspeak,
                           foreign_name.language.name,
                           foreign_name.language.iso639,
                           foreign_name.language.iso3166)

                # Japanese names are indexed a second time, romanized
                if foreign_name.language.name == 'Japanese':
                    roomaji = romanize(foreign_name.name)
                    index_name(row_key, roomaji, u'Roomaji', u'ja', u'jp')

    writer.commit()

    # Construct and populate a spell-checker index in one go: every call to
    # add_* does a commit(), so adding the words individually would be slow
    self.speller = whoosh.spelling.SpellChecker(
        self.index.storage, mingram=2, **self.SPELLER_OPTIONS)
    self.speller.add_words(speller_entries)
def normalize_name(self, name):
    """Strips irrelevant formatting junk from name input.

    Specifically: everything is lowercased, and accents are removed.
    Returns the cleaned-up name as a unicode string.
    """
    # Accent removal as described at
    # http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
    # Decompose by Unicode rules, drop the combining marks, recombine.
    # The test is on category 'Mn' rather than combining() because Korean
    # characters decompose too, into pieces that count as letters; those
    # must survive and be put back together by the final NFC pass.
    decomposed = unicodedata.normalize('NFKD', unicode(name))
    without_marks = u"".join(
        char for char in decomposed
        if unicodedata.category(char) != 'Mn'
    )
    recomposed = unicodedata.normalize('NFC', without_marks)

    return recomposed.strip().lower()
271 def _apply_valid_types(self
, name
, valid_types
):
272 """Combines the enforced `valid_types` with any from the search string
273 itself and updates the query.
275 For example, a name of 'a,b:foo' and valid_types of b,c will search for
276 only `b`s named "foo".
278 Returns `(name, merged_valid_types, term)`, where `name` has had any type
279 prefix stripped, `merged_valid_types` combines the original
280 `valid_types` with the type prefix, and `term` is a query term for
281 limited to just the allowed types. If there are no type restrictions
282 at all, `term` will be None.
285 # Remove any type prefix (pokemon:133) first
286 user_valid_types
= []
288 prefix_chunk
, name
= name
.split(':', 1)
291 prefixes
= prefix_chunk
.split(',')
292 user_valid_types
= [_
.strip() for _
in prefixes
]
294 # Merge the valid types together. Only types that appear in BOTH lists
296 # As a special case, if the user asked for types that are explicitly
297 # forbidden, completely ignore what the user requested
298 combined_valid_types
= []
299 if user_valid_types
and valid_types
:
300 combined_valid_types
= list(
301 set(user_valid_types
) & set(combined_valid_types
)
304 if not combined_valid_types
:
305 # No overlap! Just use the enforced ones
306 combined_valid_types
= valid_types
308 # One list or the other was blank, so just use the one that isn't
309 combined_valid_types
= valid_types
+ user_valid_types
311 if not combined_valid_types
:
313 return name
, [], None
318 final_valid_types
= []
319 for valid_type
in combined_valid_types
:
320 if valid_type
.startswith(u
'@'):
321 # @foo means: language must be foo.
322 # Allow for either country or language codes
323 lang_code
= valid_type
[1:]
324 lang_terms
.append(whoosh
.query
.Term(u
'iso639', lang_code
))
325 lang_terms
.append(whoosh
.query
.Term(u
'iso3166', lang_code
))
327 # otherwise, this is a type/table name
328 table_name
= self
._parse_table_name(valid_type
)
330 # Quietly ignore bogus valid_types; more likely to DTRT
332 type_terms
.append(whoosh
.query
.Term(u
'table', table_name
))
334 # Combine both kinds of restriction
337 all_terms
.append(whoosh
.query
.Or(type_terms
))
339 all_terms
.append(whoosh
.query
.Or(lang_terms
))
341 return name
, combined_valid_types
, whoosh
.query
.And(all_terms
)
344 def _parse_table_name(self
, name
):
345 """Takes a singular table name, table name, or table object and returns
348 Returns None for a bogus name.
351 if hasattr(name
, '__tablename__'):
352 return getattr(name
, '__tablename__')
355 for table
in self
.indexed_tables
.values():
356 if name
in (table
.__tablename__
, table
.__singlename__
):
357 return table
.__tablename__
359 # Bogus. Be nice and return dummy
def _whoosh_records_to_results(self, records, exact=True):
    """Converts a list of whoosh's indexed records to LookupResult tuples
    containing database objects.

    Only the first record for any given (table, row id) pair is kept.
    `exact` is copied onto every result.
    """
    # XXX this 'exact' thing is getting kinda leaky.  would like a better
    # way to handle it, since only lookup() cares about fuzzy results
    already_returned = set()
    results = []
    for record in records:
        # One result per database row, however many names matched
        identity = (record['table'], record['row_id'])
        if identity in already_returned:
            continue
        already_returned.add(identity)

        table_class = self.indexed_tables[record['table']]
        db_object = self.session.query(table_class).get(record['row_id'])

        results.append(LookupResult(
            object=db_object,
            indexed_name=record['name'],
            name=record['display_name'],
            language=record['language'],
            iso639=record['iso639'],
            iso3166=record['iso3166'],
            exact=exact,
        ))

    return results
# Main entry point: normalizes the input, merges any type/language prefix
# with valid_types, runs an exact whoosh query, and falls back to spelling
# suggestions when nothing matched and exact_only is false.
391 def lookup(self
, input, valid_types
=[], exact_only
=False):
392 """Attempts to find some sort of object, given a name.
394 Returns a list of named (object, name, language, iso639, iso3166,
395 exact) tuples. `object` is a database object, `name` is the name under
396 which the object was found, `language` and the two isos are the name
397 and country codes of the language in which the name was found, and
398 `exact` is True iff this was an exact match.
400 This function currently ONLY does fuzzy matching if there are no exact
403 Formes are not returned unless requested; "Shaymin" will return only
406 Extraneous whitespace is removed with extreme prejudice.
409 - Names: "Eevee", "Surf", "Run Away", "Payapa Berry", etc.
410 - Foreign names: "Iibui", "Eivui"
411 - Fuzzy names in whatever language: "Evee", "Ibui"
412 - IDs: "133", "192", "250"
414 - Type restrictions. "type:psychic" will only return the type. This
415 is how to make ID lookup useful. Multiple type specs can be entered
416 with commas, as "move,item:1".
417 - Language restrictions. "@fr:charge" will only return Tackle, which
418 is called "Charge" in French. These can be combined with type
419 restrictions, e.g., "@fr,move:charge".
420 - Alternate formes can be specified merely like "wash rotom".
423 Name of the thing to look for.
426 A list of type or language restrictions, e.g., `['pokemon',
427 '@ja']`. If this is provided, only results in one of the given
428 tables will be returned.
431 If True, only exact matches are returned. If set to False (the
432 default), and the provided `name` doesn't match anything exactly,
433 spelling correction will be attempted.
436 name
= self
.normalize_name(input)
440 # Pop off any type prefix and merge with valid_types
441 name
, merged_valid_types
, type_term
= \
442 self
._apply_valid_types(name
, valid_types
)
# Random lookups are delegated, carrying over the merged restrictions.
446 return self
.random_lookup(valid_types
=merged_valid_types
)
448 # Do different things depending what the query looks like
449 # Note: Term objects do an exact match, so we don't have to worry about
450 # a query parser tripping on weird characters in the input
452 # Let Python try to convert to a number, so 0xff works
453 name_as_number
= int(name
, base
=0)
# None marks input that is not a number.
456 name_as_number
= None
# Input containing * or ? is run as a wildcard query on the name field.
458 if '*' in name
or '?' in name
:
460 query
= whoosh
.query
.Wildcard(u
'name', name
)
461 elif name_as_number
is not None:
462 # Don't spell-check numbers!
# Numbers are matched against the row id rather than the name.
464 query
= whoosh
.query
.Term(u
'row_id', unicode(name_as_number
))
467 query
= whoosh
.query
.Term(u
'name', name
)
# Narrow the query to the allowed types/languages.
470 query
= query
& type_term
474 # Limits; result limits are constants, and intermediate results (before
475 # duplicate items are stripped out) are capped at the result limit
476 # times another constant.
477 # Fuzzy are capped at 10, beyond which something is probably very
478 # wrong. Exact matches -- that is, wildcards and ids -- are far less
480 # Also, exact matches are sorted by name, since weight doesn't matter.
483 max_results
= self
.MAX_EXACT_RESULTS
484 sort_by
['sortedby'] = (u
'table', u
'name')
486 max_results
= self
.MAX_FUZZY_RESULTS
488 searcher
= self
.index
.searcher(weighting
=LanguageWeighting())
489 results
= searcher
.search(
491 limit
=int(max_results
* self
.INTERMEDIATE_FACTOR
),
495 # Look for some fuzzy matches if necessary
496 if not exact_only
and not results
:
# One Term per spelling suggestion that is good enough to try.
500 fuzzy_query_parts
= []
503 for suggestion
, _
, weight
in self
.speller
.suggestions_and_scores(name
):
504 # Only allow the top 50% of scores; otherwise there will always
505 # be a lot of trailing junk
# min_weight[0] starts as None; the first suggestion sets the cutoff at half
# of its own weight.
506 if min_weight
[0] is None:
507 min_weight
[0] = weight
* 0.5
508 elif weight
< min_weight
[0]:
511 fuzzy_query_parts
.append(whoosh
.query
.Term('name', suggestion
))
# Remember each suggestion's speller score so it can be factored into ranking.
512 fuzzy_weights
[suggestion
] = weight
514 if not fuzzy_query_parts
:
515 # Nothing at all; don't try querying
518 fuzzy_query
= whoosh
.query
.Or(fuzzy_query_parts
)
520 fuzzy_query
= fuzzy_query
& type_term
# Swap in a weighting that also applies the speller scores gathered above.
522 searcher
.weighting
= LanguageWeighting(extra_weights
=fuzzy_weights
)
523 results
= searcher
.search(fuzzy_query
)
525 ### Convert results to db objects
526 objects
= self
._whoosh_records_to_results(results
, exact
=exact
)
528 # Truncate and return
529 return objects
[:max_results
]
def random_lookup(self, valid_types=[]):
    """Returns a random lookup result from one of the provided
    `valid_types`.

    Entries of `valid_types` that do not name an indexed table are
    skipped; if none remain, every indexed table is a candidate.
    """
    # Keep only what parses as a table.  Anything else could be, say, a
    # language code
    parsed_names = (
        self._parse_table_name(valid_type) for valid_type in valid_types
    )
    table_names = [table_name for table_name in parsed_names if table_name]

    if not table_names:
        # n.b.: It's possible we got a list of valid_types and none of them
        # were valid, but this function is guaranteed to return
        # *something*, so it politely selects from the entire index instead
        table_names = self.indexed_tables.keys()

    # Rather than build a list of many hundred items and pick from it, draw
    # one number up to the total row count and find which table's slice of
    # that range it lands in.  This also sidesteps the fact that the index
    # holds more entries (for languages) for some items than for others.
    # XXX ought to cache this (in the index?) if possible
    partitions = [
        (table_name,
         self.session.query(self.indexed_tables[table_name]).count())
        for table_name in table_names
    ]
    total = sum(count for _, count in partitions)

    n = random.randint(1, total)
    for chosen_table, count in partitions:
        if n <= count:
            break
        n -= count

    # n is now a 1-based position within the chosen table
    return self.lookup(unicode(n), valid_types=[chosen_table])
def prefix_lookup(self, prefix, valid_types=[]):
    """Returns terms starting with the given exact prefix.

    Type prefixes are recognized, but no other name munging is done.
    """
    # Pop off any type prefix and merge with valid_types; the merged list
    # itself is not needed here, only the stripped prefix and the term
    bare_prefix, _, type_term = self._apply_valid_types(prefix, valid_types)

    query = whoosh.query.Prefix(u'name', self.normalize_name(bare_prefix))
    if type_term:
        query = query & type_term

    searcher = self.index.searcher()
    searcher.weighting = LanguageWeighting()
    # XXX , limit=self.MAX_LOOKUP_RESULTS)
    matches = searcher.search(query)

    return self._whoosh_records_to_results(matches)