X-Git-Url: http://git.veekun.com/zzz-pokedex.git/blobdiff_plain/85e220b097f5bdc47cd4fa702ddf9924aba3022d..b5fde0da1952ae91f8addce211b3e967841fffd7:/pokedex/lookup.py diff --git a/pokedex/lookup.py b/pokedex/lookup.py index f419c1f..16c4e5e 100644 --- a/pokedex/lookup.py +++ b/pokedex/lookup.py @@ -10,12 +10,15 @@ import whoosh.filedb.filestore import whoosh.filedb.fileindex import whoosh.index from whoosh.qparser import QueryParser +import whoosh.scoring import whoosh.spelling from pokedex.db import connect import pokedex.db.tables as tables from pokedex.roomaji import romanize +__all__ = ['open_index', 'lookup'] + # Dictionary of table name => table class. # Need the table name so we can get the class from the table name after we # retrieve something from the index @@ -78,6 +81,7 @@ def open_index(directory=None, session=None, recreate=False): table=whoosh.fields.STORED, row_id=whoosh.fields.ID(stored=True), language=whoosh.fields.STORED, + display_name=whoosh.fields.STORED, # non-lowercased name ) index = whoosh.index.create_in(directory, schema=schema, indexname='MAIN') @@ -99,29 +103,34 @@ def open_index(directory=None, session=None, recreate=False): for row in q.yield_per(5): row_key = dict(table=cls.__tablename__, row_id=unicode(row.id)) - name = row.name.lower() - writer.add_document(name=name, **row_key) - speller_entries.append((name, 1)) + name = row.name + writer.add_document(name=name.lower(), + display_name=name, + **row_key) + speller_entries.append((name.lower(), 1)) # Pokemon also get other languages for foreign_name in getattr(row, 'foreign_names', []): - moonspeak = foreign_name.name.lower() + moonspeak = foreign_name.name if name == moonspeak: # Don't add the English name again as a different language; # no point and it makes spell results confusing continue - writer.add_document(name=moonspeak, + writer.add_document(name=moonspeak.lower(), language=foreign_name.language.name, + display_name=moonspeak, **row_key) - 
class LanguageWeighting(whoosh.scoring.Weighting):
    """A scoring class that forces otherwise-equal English results to come
    before foreign results.

    A document with no ``language`` stored value is treated as the default
    (English) name and keeps its full weight.  Every other language is scaled
    down slightly so that it sorts after an equally-weighted default hit.
    """

    def score(self, searcher, fieldnum, text, docnum, weight, QTF=1):
        """Return `weight`, scaled down for non-default-language documents.

        Only the ``language`` value of the document's stored fields (fetched
        with ``searcher.stored_fields(docnum)``) is consulted.  `fieldnum`,
        `text` and `QTF` are accepted but not used by this implementation.

        Returns `weight` unchanged for the default language, ``weight * 0.95``
        for Roomaji, and ``weight * 0.9`` for anything else.
        """
        doc = searcher.stored_fields(docnum)

        # Use .get() rather than indexing so a document stored without a
        # language value is treated as the default instead of raising
        # KeyError.  Assumes stored_fields() returns a dict-like mapping.
        language = doc.get('language')

        if language is None:
            # English (well, "default"); leave it at 1
            return weight
        elif language == u'Roomaji':
            # Penalize Roomaji less than other foreign names, as it's the
            # most likely of them to be searched
            return weight * 0.95
        else:
            # Everything else can drop down the totem pole
            return weight * 0.9
+ exact_only = True + query = whoosh.query.Term(u'row_id', name) else: # Not an integer - query_column = 'name' + query = whoosh.query.Term(u'name', name) - # Look for exact name. A Term object does an exact match, so we don't have - # to worry about a query parser tripping on weird characters in the input + ### Actual searching searcher = index.searcher() - query = whoosh.query.Term(query_column, name) - print query + searcher.weighting = LanguageWeighting() # XXX kosher? docs say search() + # takes a weighting kw but it + # certainly does not results = searcher.search(query) # Look for some fuzzy matches if necessary @@ -217,12 +246,6 @@ def lookup(input, session=None, indices=None, exact_only=False): seen = {} for result in results: # Skip dupe results - # Note! The speller prefers English names, but the query does not. So - # "latias" comes over "ratiasu". "latias" matches only the English - # row, comes out first, and all is well. - # However! The speller could then return "foo" which happens to be the - # name for two different things in different languages, and the - # non-English one could appear preferred. This is not very likely. seen_key = result['table'], result['row_id'] if seen_key in seen: continue @@ -231,7 +254,7 @@ def lookup(input, session=None, indices=None, exact_only=False): cls = indexed_tables[result['table']] obj = session.query(cls).get(result['row_id']) objects.append(LookupResult(object=obj, - name=result['name'], + name=result['display_name'], language=result['language'], exact=exact))