2 from collections
import namedtuple
10 from sqlalchemy
.sql
import func
12 import whoosh
.filedb
.filestore
13 import whoosh
.filedb
.fileindex
15 from whoosh
.qparser
import QueryParser
17 import whoosh
.spelling
19 from pokedex
.db
import connect
20 import pokedex
.db
.tables
as tables
21 from pokedex
.roomaji
import romanize
23 __all__
= ['PokedexLookup']
# Matches text made up of one or more digits and nothing else.  Written as a
# raw string so that `\d` reaches the regex engine instead of being treated
# as an (invalid) string escape.
rx_is_number = re.compile(r'^\d+$')

# One search hit, as built by PokedexLookup._whoosh_records_to_results:
#   object       -- the row fetched from the database for this hit
#   indexed_name -- the `name` field stored in the index
#   name         -- the `display_name` field stored in the index
#   language     -- the `language` field stored in the index
#   iso3166      -- the `iso3166` field stored in the index
#   exact        -- flag passed through from the caller
LookupResult = namedtuple(
    'LookupResult',
    ['object', 'indexed_name', 'name', 'language', 'iso3166', 'exact'],
)
class UninitializedIndex(object):
    """Placeholder used where a real index object is not available.

    Instances are falsy, and reading any attribute that is not defined on the
    class raises `UninitializedIndexError`.
    """

    class UninitializedIndexError(Exception):
        """Raised when the placeholder is used as if it were an index."""

    def __bool__(self):
        """Truth test (Python 3 spelling): always False."""
        return False

    def __nonzero__(self):
        """Truth test (Python 2 spelling): always False."""
        return False

    def __getattr__(self, *args, **kwargs):
        # Only reached for attributes that normal lookup did not find, so the
        # exception class itself is still resolved without recursion.
        message = (
            "The lookup index does not exist. Please use `pokedex setup` "
            "or lookup.rebuild_index() to create it."
        )
        raise self.UninitializedIndexError(message)
49 class LanguageWeighting(whoosh
.scoring
.Weighting
):
50 """A scoring class that forces otherwise-equal English results to come
51 before foreign results.
54 def score(self
, searcher
, fieldnum
, text
, docnum
, weight
, QTF
=1):
55 doc
= searcher
.stored_fields(docnum
)
56 if doc
['language'] == None:
57 # English (well, "default"); leave it at 1
59 elif doc
['language'] == u
'Roomaji':
60 # Give Roomaji a little boost; it's most likely to be searched
63 # Everything else can drop down the totem pole
67 class PokedexLookup(object):
68 INTERMEDIATE_LOOKUP_RESULTS
= 25
69 MAX_LOOKUP_RESULTS
= 10
71 # Dictionary of table name => table class.
72 # Need the table name so we can get the class from the table name after we
73 # retrieve something from the index
74 indexed_tables
= dict(
75 (cls
.__tablename__
, cls
)
88 def __init__(self
, directory
=None, session
=None):
89 """Opens the whoosh index stored in the named directory. If the index
90 doesn't already exist, it is NOT created here: call rebuild_index().
93 Directory containing the index. Defaults to a location within the
94 `pokedex` egg directory.
97 Used for creating the index and retrieving objects. Defaults to an
98 attempt to connect to the default SQLite database installed by
102 # By the time this returns, self.index, self.speller, and self.session
107 directory
= os
.environ
.get('POKEDEX_INDEX_DIR', None)
110 directory
= pkg_resources
.resource_filename('pokedex',
112 self
.directory
= directory
115 self
.session
= session
117 self
.session
= connect()
119 # Attempt to open or create the index
120 if not os
.path
.exists(directory
) or not os
.listdir(directory
):
121 # Directory doesn't exist OR is empty; caller needs to use
122 # rebuild_index before doing anything. Provide a dummy object that
123 # complains when used
124 self
.index
= UninitializedIndex()
125 self
.speller
= UninitializedIndex()
128 # Otherwise, already exists; should be an index! Bam, done.
129 # Note that this will explode if the directory exists but doesn't
130 # contain an index; that's a feature
132 self
.index
= whoosh
.index
.open_dir(directory
, indexname
='MAIN')
133 except whoosh
.index
.EmptyIndexError
:
135 "The index directory already contains files. "
136 "Please use a dedicated directory for the lookup index."
139 # Create speller, and done
140 spell_store
= whoosh
.filedb
.filestore
.FileStorage(directory
)
141 self
.speller
= whoosh
.spelling
.SpellChecker(spell_store
)
144 def rebuild_index(self
):
145 """Creates the index from scratch."""
147 schema
= whoosh
.fields
.Schema(
148 name
=whoosh
.fields
.ID(stored
=True),
149 table
=whoosh
.fields
.ID(stored
=True),
150 row_id
=whoosh
.fields
.ID(stored
=True),
151 language
=whoosh
.fields
.STORED
,
152 iso3166
=whoosh
.fields
.STORED
,
153 display_name
=whoosh
.fields
.STORED
, # non-lowercased name
156 if not os
.path
.exists(self
.directory
):
157 os
.mkdir(self
.directory
)
159 self
.index
= whoosh
.index
.create_in(self
.directory
, schema
=schema
,
161 writer
= self
.index
.writer()
163 # Index every name in all our tables of interest
164 # speller_entries becomes a list of (word, score) tuples; the score is
165 # 2 for English names, 1.5 for Roomaji, and 1 for everything else. I
166 # think this biases the results in the direction most people expect,
167 # especially when e.g. German names are very similar to English names
169 for cls
in self
.indexed_tables
.values():
170 q
= self
.session
.query(cls
)
172 for row
in q
.yield_per(5):
173 row_key
= dict(table
=unicode(cls
.__tablename__
),
174 row_id
=unicode(row
.id))
176 def add(name
, language
, iso3166
, score
):
177 normalized_name
= self
.normalize_name(name
)
180 name
=normalized_name
, display_name
=name
,
181 language
=language
, iso3166
=iso3166
,
185 speller_entries
.append((normalized_name
, score
))
188 # Add the basic English name to the index
189 if cls
== tables
.Pokemon
:
190 # Pokémon need their form name added
192 add(row
.full_name
, None, u
'us', 1)
194 # If this is a default form, ALSO add the unadorned name,
195 # so 'Deoxys' alone will still do the right thing
196 if row
.forme_name
and not row
.forme_base_pokemon_id
:
197 add(row
.name
, None, u
'us', 1)
199 add(row
.name
, None, u
'us', 1)
201 # Some things also have other languages' names
202 # XXX other language form names..?
203 for foreign_name
in getattr(row
, 'foreign_names', []):
204 moonspeak
= foreign_name
.name
205 if row
.name
== moonspeak
:
206 # Don't add the English name again as a different
207 # language; no point and it makes spell results
211 add(moonspeak
, foreign_name
.language
.name
,
212 foreign_name
.language
.iso3166
,
216 if foreign_name
.language
.name
== 'Japanese':
217 roomaji
= romanize(foreign_name
.name
)
218 add(roomaji
, u
'Roomaji', u
'jp', 8)
222 # Construct and populate a spell-checker index. Quicker to do it all
223 # at once, as every call to add_* does a commit(), and those seem to be
225 self
.speller
= whoosh
.spelling
.SpellChecker(self
.index
.storage
)
226 self
.speller
.add_scored_words(speller_entries
)
def normalize_name(self, name):
    """Strips irrelevant formatting junk from name input.

    Specifically: surrounding whitespace is dropped, everything is
    lowercased, and accents are removed.

    `name`
        The text to clean up; it is coerced to text first.

    Returns the normalized text.
    """
    # `unicode` only exists on Python 2; use `str` where it is missing so the
    # method does not die with a NameError on Python 3.
    try:
        text_type = unicode
    except NameError:
        text_type = str

    # http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
    # Decompose by Unicode rules, drop the combining marks, then recombine.
    # The filter tests for category Mn rather than combining() because
    # Korean characters also decompose, but into pieces that are letters,
    # not marks -- so they survive the filter and recompose correctly.
    decomposed = unicodedata.normalize('NFKD', text_type(name))
    without_marks = u"".join(
        char for char in decomposed
        if unicodedata.category(char) != 'Mn'
    )
    recomposed = unicodedata.normalize('NFC', without_marks)

    return recomposed.strip().lower()
252 def _apply_valid_types(self
, name
, valid_types
):
253 """Combines the enforced `valid_types` with any from the search string
254 itself and updates the query.
256 For example, a name of 'a,b:foo' and valid_types of b,c will search for
257 only `b`s named "foo".
259 Returns `(name, merged_valid_types, term)`, where `name` has had any type
260 prefix stripped, `merged_valid_types` combines the original
261 `valid_types` with the type prefix, and `term` is a query term for
262 limited to just the allowed types. If there are no type restrictions
263 at all, `term` will be None.
266 # Remove any type prefix (pokemon:133) first
267 user_valid_types
= []
269 prefix_chunk
, name
= name
.split(':', 1)
272 prefixes
= prefix_chunk
.split(',')
273 user_valid_types
= [_
.strip() for _
in prefixes
]
275 # Merge the valid types together. Only types that appear in BOTH lists
277 # As a special case, if the user asked for types that are explicitly
278 # forbidden, completely ignore what the user requested
279 combined_valid_types
= []
280 if user_valid_types
and valid_types
:
281 combined_valid_types
= list(
282 set(user_valid_types
) & set(combined_valid_types
)
285 if not combined_valid_types
:
286 # No overlap! Just use the enforced ones
287 combined_valid_types
= valid_types
289 # One list or the other was blank, so just use the one that isn't
290 combined_valid_types
= valid_types
+ user_valid_types
292 if not combined_valid_types
:
294 return name
, [], None
298 final_valid_types
= []
299 for valid_type
in combined_valid_types
:
300 table_name
= self
._parse_table_name(valid_type
)
302 # Quietly ignore bogus valid_types; more likely to DTRT
304 final_valid_types
.append(valid_type
)
305 type_terms
.append(whoosh
.query
.Term(u
'table', table_name
))
307 return name
, final_valid_types
, whoosh
.query
.Or(type_terms
)
310 def _parse_table_name(self
, name
):
311 """Takes a singular table name, table name, or table object and returns
314 Returns None for a bogus name.
317 if hasattr(name
, '__tablename__'):
318 return getattr(name
, '__tablename__')
321 for table
in self
.indexed_tables
.values():
322 if name
in (table
.__tablename__
, table
.__singlename__
):
323 return table
.__tablename__
325 # Bogus. Be nice and return dummy
def _whoosh_records_to_results(self, records, exact=True):
    """Converts a list of whoosh's indexed records to LookupResult tuples
    containing database objects.

    `records`
        Iterable of index records; each is read by the keys `table`,
        `row_id`, `name`, `display_name`, `language` and `iso3166`.

    `exact`
        Copied unchanged into every result's `exact` field.

    Returns a list with one LookupResult per distinct (table, row_id) pair,
    in the order those pairs first appear.
    """
    # XXX this 'exact' thing is getting kinda leaky.  would like a better
    # way to handle it, since only lookup() cares about fuzzy results
    already_returned = set()
    results = []

    for record in records:
        # Only the first record for any given row is kept
        row_key = (record['table'], record['row_id'])
        if row_key in already_returned:
            continue
        already_returned.add(row_key)

        table_class = self.indexed_tables[record['table']]
        row = self.session.query(table_class).get(record['row_id'])

        results.append(LookupResult(
            object=row,
            indexed_name=record['name'],
            name=record['display_name'],
            language=record['language'],
            iso3166=record['iso3166'],
            exact=exact,
        ))

    return results
356 def lookup(self
, input, valid_types
=[], exact_only
=False):
357 """Attempts to find some sort of object, given a name.
359 Returns a list of named (object, name, language, iso3166, exact)
360 tuples. `object` is a database object, `name` is the name under which
361 the object was found, `language` and `iso3166` are the name and country
362 code of the language in which the name was found, and `exact` is True
366 This function currently ONLY does fuzzy matching if there are no exact
369 Formes are not returned unless requested; "Shaymin" will return only
372 Extraneous whitespace is removed with extreme prejudice.
375 - Names: "Eevee", "Surf", "Run Away", "Payapa Berry", etc.
376 - Foreign names: "Iibui", "Eivui"
377 - Fuzzy names in whatever language: "Evee", "Ibui"
378 - IDs: "133", "192", "250"
380 - Type restrictions. "type:psychic" will only return the type. This
381 is how to make ID lookup useful. Multiple type specs can be entered
382 with commas, as "move,item:1". If `valid_types` are provided, any
383 type prefix will be ignored.
384 - Alternate formes can be specified merely like "wash rotom".
387 Name of the thing to look for.
390 A list of table objects or names, e.g., `['pokemon', 'moves']`. If
391 this is provided, only results in one of the given tables will be
395 If True, only exact matches are returned. If set to False (the
396 default), and the provided `name` doesn't match anything exactly,
397 spelling correction will be attempted.
400 name
= self
.normalize_name(input)
404 # Pop off any type prefix and merge with valid_types
405 name
, merged_valid_types
, type_term
= \
406 self
._apply_valid_types(name
, valid_types
)
410 return self
.random_lookup(valid_types
=merged_valid_types
)
412 # Do different things depending what the query looks like
413 # Note: Term objects do an exact match, so we don't have to worry about
414 # a query parser tripping on weird characters in the input
416 # Let Python try to convert to a number, so 0xff works
417 name_as_number
= int(name
, base
=0)
420 name_as_number
= None
422 if '*' in name
or '?' in name
:
424 query
= whoosh
.query
.Wildcard(u
'name', name
)
425 elif name_as_number
is not None:
426 # Don't spell-check numbers!
428 query
= whoosh
.query
.Term(u
'row_id', unicode(name_as_number
))
431 query
= whoosh
.query
.Term(u
'name', name
)
434 query
= query
& type_term
438 searcher
= self
.index
.searcher()
439 # XXX is this kosher? docs say search() takes a weighting arg, but it
441 searcher
.weighting
= LanguageWeighting()
442 results
= searcher
.search(query
,
443 limit
=self
.INTERMEDIATE_LOOKUP_RESULTS
)
445 # Look for some fuzzy matches if necessary
446 if not exact_only
and not results
:
450 for suggestion
in self
.speller
.suggest(
451 name
, self
.INTERMEDIATE_LOOKUP_RESULTS
):
453 query
= whoosh
.query
.Term('name', suggestion
)
454 results
.extend(searcher
.search(query
))
456 ### Convert results to db objects
457 objects
= self
._whoosh_records_to_results(results
, exact
=exact
)
459 # Only return up to 10 matches; beyond that, something is wrong. We
460 # strip out duplicate entries above, so it's remotely possible that we
461 # should have more than 10 here and lost a few. The speller returns 25
462 # to give us some padding, and should avoid that problem. Not a big
463 # deal if we lose the 25th-most-likely match anyway.
464 return objects
[:self
.MAX_LOOKUP_RESULTS
]
def random_lookup(self, valid_types=[]):
    """Returns a random lookup result from one of the provided `valid_types`.

    `valid_types`
        Table objects or names to choose from.  Entries that
        `_parse_table_name` does not recognize are dropped; if nothing is
        left, every indexed table is a candidate.  The list is only read,
        never modified.

    Returns whatever `lookup` returns for the chosen row number, or an empty
    list when the candidate tables hold no rows at all.
    """
    # Named so as not to hide the imported `tables` module
    candidate_tables = []
    for valid_type in valid_types:
        table_name = self._parse_table_name(valid_type)
        if table_name:
            candidate_tables.append(self.indexed_tables[table_name])

    if not candidate_tables:
        # n.b.: It's possible we got a list of valid_types and none of them
        # were valid, but this function is guaranteed to return
        # *something*, so it politely selects from the entire index instead
        candidate_tables = list(self.indexed_tables.values())

    # Rather than create an array of many hundred items and pick randomly
    # from it, just pick a number up to the total number of potential
    # items, then pick randomly from that, and partition the whole range
    # into chunks.  This also avoids the slight problem that the index
    # contains more rows (for languages) for some items than others.
    # XXX ought to cache this (in the index?) if possible
    total = 0
    partitions = []
    for table in candidate_tables:
        count = self.session.query(table).count()
        total += count
        partitions.append((table, count))

    if not total:
        # Nothing to pick from; randint(1, 0) would raise ValueError
        return []

    # Walk the partitions until the pick falls inside one; what remains of
    # `n` is then used as the row number within that table.
    # NOTE(review): this assumes row ids run from 1 to count without gaps --
    # confirm against the schema.
    n = random.randint(1, total)
    for table, count in partitions:
        if n <= count:
            break
        n -= count

    # u'%d' yields text on both Python 2 and Python 3
    return self.lookup(u'%d' % n, valid_types=[table])
def prefix_lookup(self, prefix, valid_types=[]):
    """Returns terms starting with the given exact prefix.

    Type prefixes are recognized, but no other name munging is done.

    `prefix`
        Leading text to match, optionally preceded by a type prefix.

    `valid_types`
        Types to restrict the search to; handed to `_apply_valid_types`.

    Returns the list built by `_whoosh_records_to_results`.
    """
    # Pop off any type prefix and merge with valid_types
    prefix, merged_valid_types, type_term = self._apply_valid_types(
        prefix, valid_types)

    name_query = whoosh.query.Prefix(u'name', self.normalize_name(prefix))

    # Only narrow by table when there is a restriction to apply
    if type_term:
        query = name_query & type_term
    else:
        query = name_query

    searcher = self.index.searcher()
    searcher.weighting = LanguageWeighting()
    # XXX , limit=self.MAX_LOOKUP_RESULTS)
    matches = searcher.search(query)

    return self._whoosh_records_to_results(matches)