8 from sqlalchemy
.sql
import func
10 import whoosh
.filedb
.filestore
11 import whoosh
.filedb
.fileindex
13 from whoosh
.qparser
import QueryParser
15 import whoosh
.spelling
17 from pokedex
.util
import namedtuple
19 from pokedex
.db
import connect
20 import pokedex
.db
.tables
as tables
21 from pokedex
.roomaji
import romanize
22 from pokedex
.defaults
import get_default_index_dir
24 __all__
= ['PokedexLookup']
27 rx_is_number
= re
.compile('^\d+$')
29 LookupResult
= namedtuple('LookupResult',
30 ['object', 'indexed_name', 'name', 'language', 'iso3166', 'exact'])
class UninitializedIndex(object):
    """Stand-in for a lookup index that has not been built yet.

    Instances are falsy, and any attribute lookup that reaches `__getattr__`
    raises `UninitializedIndexError` with instructions for creating the index.
    """

    class UninitializedIndexError(Exception):
        """Raised when the dummy index is used as if it were real."""

    def __nonzero__(self):
        """Dummy object should identify itself as False."""
        return False

    def __bool__(self):
        """Python 3000 version of the above. Future-proofing rules!"""
        return False

    def __getattr__(self, *args, **kwargs):
        # Only called for attributes that normal lookup cannot find, so the
        # nested exception class itself is still reachable through `self`.
        message = (
            "The lookup index does not exist. Please use `pokedex setup` "
            "or lookup.rebuild_index() to create it."
        )
        raise self.UninitializedIndexError(message)
class LanguageWeighting(whoosh.scoring.Weighting):
    """A scoring class that forces otherwise-equal English results to come
    before foreign results.
    """

    def score(self, searcher, fieldnum, text, docnum, weight, QTF=1):
        """Scale `weight` by the language stored with document `docnum`.

        The document's stored fields are fetched from `searcher`; only the
        'language' field is consulted.  `fieldnum`, `text` and `QTF` are
        accepted for the weighting interface but not used.
        """
        language = searcher.stored_fields(docnum)['language']

        # Identity test: None is the marker for the default language
        if language is None:
            # English (well, "default"); leave it at 1
            return weight
        elif language == u'Roomaji':
            # Give Roomaji a little boost; it's most likely to be searched
            # NOTE(review): the multiplier is not visible in this extract;
            # 0.9 is reconstructed -- confirm against the original file.
            return weight * 0.9
        else:
            # Everything else can drop down the totem pole
            # NOTE(review): multiplier reconstructed as 0.8 -- confirm.
            return weight * 0.8
68 class PokedexLookup(object):
69 INTERMEDIATE_LOOKUP_RESULTS
= 25
70 MAX_LOOKUP_RESULTS
= 10
72 # Dictionary of table name => table class.
73 # Need the table name so we can get the class from the table name after we
74 # retrieve something from the index
75 indexed_tables
= dict(
76 (cls
.__tablename__
, cls
)
def __init__(self, directory=None, session=None):
    """Opens the whoosh index stored in the named directory.

    If the directory is missing or empty, no index is opened: `self.index`
    and `self.speller` are set to `UninitializedIndex` dummies that complain
    when used, and the caller has to run `rebuild_index()` first.

    `directory`
        Directory containing the index.  Defaults to the value returned by
        `get_default_index_dir()`.

    `session`
        Used for creating the index and retrieving objects.  Defaults to
        the result of `connect()`.
    """
    # By the time this returns, self.index, self.speller, and self.session
    # have all been assigned.

    # If a directory was not given, use the default
    if directory is None:
        directory = get_default_index_dir()
    self.directory = directory

    # NOTE(review): the test guarding these two assignments is elided in the
    # extract; a plain truthiness check on `session` is assumed.
    self.session = session or connect()

    # Attempt to open the index
    has_contents = os.path.exists(directory) and os.listdir(directory)
    if not has_contents:
        # Directory doesn't exist OR is empty; caller needs to use
        # rebuild_index before doing anything.  Provide a dummy object that
        # complains when used
        self.index = UninitializedIndex()
        self.speller = UninitializedIndex()
        return

    # Otherwise, already exists; should be an index!
    # Note that this will explode if the directory exists but doesn't
    # contain an index; that's a feature
    try:
        self.index = whoosh.index.open_dir(directory, indexname='MAIN')
    except whoosh.index.EmptyIndexError:
        # NOTE(review): the exception type raised here is elided in the
        # extract; IOError is reconstructed -- confirm.
        raise IOError(
            "The index directory already contains files. "
            "Please use a dedicated directory for the lookup index."
        )

    # Create speller, and done
    self.speller = whoosh.spelling.SpellChecker(
        whoosh.filedb.filestore.FileStorage(directory))
def rebuild_index(self):
    """Creates the index from scratch.

    Creates `self.directory` if it does not exist, creates a new whoosh
    index named 'MAIN' in it, writes one document for every name of every
    row in `self.indexed_tables`, and then builds the spell checker from the
    same names.  `self.index` and `self.speller` are replaced.
    """
    schema = whoosh.fields.Schema(
        name=whoosh.fields.ID(stored=True),
        table=whoosh.fields.ID(stored=True),
        row_id=whoosh.fields.ID(stored=True),
        language=whoosh.fields.STORED,
        iso3166=whoosh.fields.STORED,
        display_name=whoosh.fields.STORED,  # non-lowercased name
    )

    if not os.path.exists(self.directory):
        os.mkdir(self.directory)

    # Same index name that __init__ passes to open_dir()
    self.index = whoosh.index.create_in(self.directory, schema=schema,
                                        indexname='MAIN')
    writer = self.index.writer()

    # Index every name in all our tables of interest.
    # speller_entries becomes a list of (word, score) tuples.  The scores
    # passed below are 1 for English names and 8 for Roomaji.
    # NOTE(review): the score for other foreign names is elided in the
    # extract; 3 is reconstructed -- confirm.
    speller_entries = []

    def add(row_key, name, language, iso3166, score):
        # Index one name for the row identified by row_key, and remember
        # its normalized form for the spell checker.
        normalized_name = self.normalize_name(name)

        writer.add_document(
            name=normalized_name, display_name=name,
            language=language, iso3166=iso3166,
            **row_key
        )

        speller_entries.append((normalized_name, score))

    for cls in self.indexed_tables.values():
        table_name = unicode(cls.__tablename__)

        # yield_per(5) asks for the rows in batches of five
        for row in self.session.query(cls).yield_per(5):
            row_key = dict(table=table_name, row_id=unicode(row.id))

            # Add the basic English name to the index
            if cls == tables.Pokemon:
                # Pokemon need their form name added
                add(row_key, row.full_name, None, u'us', 1)

                # If this is a default form, ALSO add the unadorned name,
                # so 'Deoxys' alone will still do the right thing
                if row.forme_name and not row.forme_base_pokemon_id:
                    add(row_key, row.name, None, u'us', 1)
            else:
                add(row_key, row.name, None, u'us', 1)

            # Some things also have other languages' names
            # XXX other language form names..?
            for foreign_name in getattr(row, 'foreign_names', []):
                moonspeak = foreign_name.name
                if row.name == moonspeak:
                    # Don't add the English name again as a different
                    # language; no point and it makes spell results
                    # confusing
                    continue

                add(row_key, moonspeak,
                    foreign_name.language.name,
                    foreign_name.language.iso3166,
                    3)

                # Japanese names are additionally indexed in romanized form
                if foreign_name.language.name == 'Japanese':
                    roomaji = romanize(foreign_name.name)
                    add(row_key, roomaji, u'Roomaji', u'jp', 8)

    writer.commit()

    # Construct and populate a spell-checker index.  Quicker to do it all
    # at once, as every call to add_* does a commit(), and those seem to be
    # expensive
    self.speller = whoosh.spelling.SpellChecker(self.index.storage)
    self.speller.add_scored_words(speller_entries)
def normalize_name(self, name):
    """Strips irrelevant formatting junk from name input.

    Specifically: everything is lowercased, and accents are removed.
    `name` is passed through `unicode()` first, so non-string input is
    accepted.  Returns the cleaned-up unicode string.
    """
    # http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
    # Decompose by Unicode rules, drop the nonspacing marks, then recombine.
    # The check is on category 'Mn' rather than combining() because Korean
    # characters decompose too, but into pieces that count as letters;
    # recomposing afterwards makes them look right again.
    decomposed = unicodedata.normalize('NFKD', unicode(name))
    without_marks = [
        char for char in decomposed
        if unicodedata.category(char) != 'Mn'
    ]
    recomposed = unicodedata.normalize('NFC', u"".join(without_marks))

    # NOTE(review): the tail of this method is elided in the extract; the
    # lowercasing is stated by the docstring, the whitespace strip is
    # reconstructed -- confirm.
    return recomposed.lower().strip()
250 def _apply_valid_types(self
, name
, valid_types
):
251 """Combines the enforced `valid_types` with any from the search string
252 itself and updates the query.
254 For example, a name of 'a,b:foo' and valid_types of b,c will search for
255 only `b`s named "foo".
257 Returns `(name, merged_valid_types, term)`, where `name` has had any type
258 prefix stripped, `merged_valid_types` combines the original
259 `valid_types` with the type prefix, and `term` is a query term for
260 limited to just the allowed types. If there are no type restrictions
261 at all, `term` will be None.
264 # Remove any type prefix (pokemon:133) first
265 user_valid_types
= []
267 prefix_chunk
, name
= name
.split(':', 1)
270 prefixes
= prefix_chunk
.split(',')
271 user_valid_types
= [_
.strip() for _
in prefixes
]
273 # Merge the valid types together. Only types that appear in BOTH lists
275 # As a special case, if the user asked for types that are explicitly
276 # forbidden, completely ignore what the user requested
277 combined_valid_types
= []
278 if user_valid_types
and valid_types
:
279 combined_valid_types
= list(
280 set(user_valid_types
) & set(combined_valid_types
)
283 if not combined_valid_types
:
284 # No overlap! Just use the enforced ones
285 combined_valid_types
= valid_types
287 # One list or the other was blank, so just use the one that isn't
288 combined_valid_types
= valid_types
+ user_valid_types
290 if not combined_valid_types
:
292 return name
, [], None
296 final_valid_types
= []
297 for valid_type
in combined_valid_types
:
298 table_name
= self
._parse_table_name(valid_type
)
300 # Quietly ignore bogus valid_types; more likely to DTRT
302 final_valid_types
.append(valid_type
)
303 type_terms
.append(whoosh
.query
.Term(u
'table', table_name
))
305 return name
, final_valid_types
, whoosh
.query
.Or(type_terms
)
308 def _parse_table_name(self
, name
):
309 """Takes a singular table name, table name, or table object and returns
312 Returns None for a bogus name.
315 if hasattr(name
, '__tablename__'):
316 return getattr(name
, '__tablename__')
319 for table
in self
.indexed_tables
.values():
320 if name
in (table
.__tablename__
, table
.__singlename__
):
321 return table
.__tablename__
323 # Bogus. Be nice and return dummy
def _whoosh_records_to_results(self, records, exact=True):
    """Converts a list of whoosh's indexed records to LookupResult tuples
    containing database objects.

    Records pointing at a (table, row_id) pair that has already been
    converted are skipped.  `exact` is copied into every result.
    """
    # XXX this 'exact' thing is getting kinda leaky.  would like a better
    # way to handle it, since only lookup() cares about fuzzy results
    already_seen = set()
    converted = []
    for hit in records:
        # Skip dupes
        key = (hit['table'], hit['row_id'])
        if key in already_seen:
            continue
        already_seen.add(key)

        table_class = self.indexed_tables[hit['table']]
        db_object = self.session.query(table_class).get(hit['row_id'])

        converted.append(LookupResult(
            object=db_object,
            indexed_name=hit['name'],
            name=hit['display_name'],
            language=hit['language'],
            iso3166=hit['iso3166'],
            exact=exact,
        ))

    return converted
def lookup(self, input, valid_types=None, exact_only=False):
    """Attempts to find some sort of object, given a name.

    Returns a list of `LookupResult` named tuples
    (object, indexed_name, name, language, iso3166, exact).  `object` is a
    database object, `indexed_name` is the normalized name stored in the
    index, `name` is the name under which the object was found, `language`
    and `iso3166` are the name and country code of the language in which
    the name was found, and `exact` is True unless the result came from
    spelling correction.  At most `MAX_LOOKUP_RESULTS` results are returned.

    This function currently ONLY does fuzzy matching if there are no exact
    matches.

    Formes are not returned unless requested.

    Extraneous whitespace is removed with extreme prejudice.

    Recognizes:
    - Names: "Eevee", "Surf", "Run Away", "Payapa Berry", etc.
    - Foreign names: "Iibui", "Eivui"
    - Fuzzy names in whatever language: "Evee", "Ibui"
    - IDs: "133", "192", "250"
    - Wildcards: a name containing `*` or `?` is run as a wildcard query.
    - Type restrictions.  "type:psychic" will only return the type.  This
      is how to make ID lookup useful.  Multiple type specs can be entered
      with commas, as "move,item:1".  A type prefix is merged with
      `valid_types` by `_apply_valid_types`.
    - Alternate formes can be specified merely like "wash rotom".

    `input`
        Name of the thing to look for.

    `valid_types`
        A list of table objects or names, e.g., `['pokemon', 'moves']`.  If
        this is provided, only results in one of the given tables will be
        returned.

    `exact_only`
        If True, only exact matches are returned.  If set to False (the
        default), and the provided `name` doesn't match anything exactly,
        spelling correction will be attempted.
    """
    # A fresh list per call instead of a shared mutable default
    if valid_types is None:
        valid_types = []

    name = self.normalize_name(input)
    exact = True

    # Pop off any type prefix and merge with valid_types
    name, merged_valid_types, type_term = \
        self._apply_valid_types(name, valid_types)

    # Random lookup
    # NOTE(review): the condition guarding this return is elided in the
    # extract; the literal name 'random' is reconstructed -- confirm.
    if name == 'random':
        return self.random_lookup(valid_types=merged_valid_types)

    # Do different things depending what the query looks like
    # Note: Term objects do an exact match, so we don't have to worry about
    # a query parser tripping on weird characters in the input
    try:
        # Let Python try to convert to a number, so 0xff works
        name_as_number = int(name, base=0)
    except ValueError:
        # Oh well
        name_as_number = None

    if '*' in name or '?' in name:
        # Spelling suggestions make no sense for a pattern
        exact_only = True
        query = whoosh.query.Wildcard(u'name', name)
    elif name_as_number is not None:
        # Don't spell-check numbers!
        exact_only = True
        query = whoosh.query.Term(u'row_id', unicode(name_as_number))
    else:
        # Not an integer
        query = whoosh.query.Term(u'name', name)

    if type_term:
        query = query & type_term

    ### Actual searching
    searcher = self.index.searcher()
    # XXX is this kosher?  docs say search() takes a weighting arg, but
    # the weighting is assigned on the searcher here instead
    searcher.weighting = LanguageWeighting()
    results = searcher.search(query,
                              limit=self.INTERMEDIATE_LOOKUP_RESULTS)

    # Look for some fuzzy matches if necessary
    if not exact_only and not results:
        exact = False
        results = []

        for suggestion in self.speller.suggest(
                name, self.INTERMEDIATE_LOOKUP_RESULTS):
            # Same unicode field name as the exact-match queries above
            query = whoosh.query.Term(u'name', suggestion)
            results.extend(searcher.search(query))

    ### Convert results to db objects
    objects = self._whoosh_records_to_results(results, exact=exact)

    # Only return up to 10 matches; beyond that, something is wrong.  We
    # strip out duplicate entries above, so it's remotely possible that we
    # should have more than 10 here and lost a few.  The speller returns 25
    # to give us some padding, and should avoid that problem.  Not a big
    # deal if we lose the 25th-most-likely match anyway.
    return objects[:self.MAX_LOOKUP_RESULTS]
def random_lookup(self, valid_types=None):
    """Returns a random lookup result from one of the provided
    `valid_types`.

    `valid_types` may hold table objects or names; entries that
    `_parse_table_name` does not recognise are ignored.  If nothing usable
    remains, every indexed table is a candidate.  The result is whatever
    `lookup()` returns for the chosen row id, or an empty list when the
    candidate tables hold no rows at all.
    """
    # A fresh list per call instead of a shared mutable default
    if valid_types is None:
        valid_types = []

    # Named so as not to hide the module-level `tables` alias
    candidate_tables = []
    for valid_type in valid_types:
        table_name = self._parse_table_name(valid_type)
        if table_name:
            candidate_tables.append(self.indexed_tables[table_name])

    if not candidate_tables:
        # n.b.: It's possible we got a list of valid_types and none of them
        # were valid, but this function is guaranteed to return
        # *something*, so it politely selects from the entire index instead
        candidate_tables = self.indexed_tables.values()

    # Rather than create an array of many hundred items and pick randomly
    # from it, just pick a number up to the total number of potential
    # items, then pick randomly from that, and partition the whole range
    # into chunks.  This also avoids the slight problem that the index
    # contains more rows (for languages) for some items than others.
    # XXX ought to cache this (in the index?) if possible
    total = 0
    partitions = []
    for table in candidate_tables:
        count = self.session.query(table).count()
        total += count
        partitions.append((table, count))

    if not total:
        # Nothing to choose from; randint(1, 0) would raise ValueError
        return []

    # Walk the partitions until n falls inside one; n is then used as the
    # row id to look up within that partition's table
    n = random.randint(1, total)
    while n > partitions[0][1]:
        n -= partitions[0][1]
        partitions.pop(0)

    return self.lookup(unicode(n), valid_types=[partitions[0][0]])
def prefix_lookup(self, prefix, valid_types=None):
    """Returns terms starting with the given exact prefix.

    Type prefixes are recognized, but no other name munging is done beyond
    `normalize_name`.  `valid_types` restricts the tables searched, as for
    `lookup()`.  Returns a list of `LookupResult` tuples; no result limit is
    applied.
    """
    # A fresh list per call instead of a shared mutable default
    if valid_types is None:
        valid_types = []

    # Pop off any type prefix and merge with valid_types; the merged type
    # list itself is not needed here, only the query term
    prefix, _, type_term = self._apply_valid_types(prefix, valid_types)

    query = whoosh.query.Prefix(u'name', self.normalize_name(prefix))

    # type_term is None when there are no type restrictions
    if type_term:
        query = query & type_term

    searcher = self.index.searcher()
    searcher.weighting = LanguageWeighting()
    results = searcher.search(query)  # XXX , limit=self.MAX_LOOKUP_RESULTS)

    return self._whoosh_records_to_results(results)