From: Eevee <git@veekun.com>
Date: Sat, 22 Aug 2009 08:13:34 +0000 (-0700)
Subject: Added support for lookup by other language name.  #15
X-Git-Tag: veekun-promotions/2010050901~203
X-Git-Url: http://git.veekun.com/zzz-pokedex.git/commitdiff_plain/1a34d3c566e747bbff9335b34369529eb1c585c7?ds=sidebyside

Added support for lookup by other language name.  #15

English fuzzy matches are preferred, followed by Roomaji and then
everything else.

The return tuple from lookup() now has a `name` parameter for the actual
name that was matched.
---

diff --git a/pokedex/__init__.py b/pokedex/__init__.py
index 4af3fa0..73391dc 100644
--- a/pokedex/__init__.py
+++ b/pokedex/__init__.py
@@ -66,8 +66,12 @@ def command_lookup(name):
     else:
         print "Fuzzy-matched:"
 
-    for object, language, exact in results:
-        print object.__tablename__, object.name, language
+    for result in results:
+        print "%s: %s" % (result.object.__tablename__, result.object.name),
+        if result.language:
+            print "(%s in %s)" % (result.name, result.language)
+        else:
+            print
 
 
 def command_help():
diff --git a/pokedex/lookup.py b/pokedex/lookup.py
index 0bf1c18..2653b9a 100644
--- a/pokedex/lookup.py
+++ b/pokedex/lookup.py
@@ -95,6 +95,10 @@ def open_index(directory=None, session=None, recreate=False):
     writer = index.writer()
 
     # Index every name in all our tables of interest
+    # speller_entries becomes a list of (word, score) tuples; the score is 2
+    # for English names, 1.5 for Roomaji, and 1 for everything else.  I think
+    # this biases the results in the direction most people expect, especially
+    # when e.g. German names are very similar to English names
     speller_entries = []
     for cls in indexed_tables.values():
         q = session.query(cls)
@@ -108,28 +112,31 @@ def open_index(directory=None, session=None, recreate=False):
 
             name = row.name.lower()
             writer.add_document(name=name, **row_key)
-            speller_entries.append(name)
+            speller_entries.append((name, 1))
 
             for extra_key_func in extra_keys.get(cls, []):
                 extra_key = extra_key_func(row)
                 writer.add_document(name=extra_key, **row_key)
 
             # Pokemon also get other languages
-            if cls == tables.Pokemon:
-                for foreign_name in row.foreign_names:
-                    name = foreign_name.name.lower()
-                    writer.add_document(name=name,
-                                        language=foreign_name.language.name,
+            for foreign_name in getattr(row, 'foreign_names', []):
+                moonspeak = foreign_name.name.lower()
+                if name == moonspeak:
+                    # Don't add the English name again as a different language;
+                    # no point and it makes spell results confusing
+                    continue
+
+                writer.add_document(name=moonspeak,
+                                    language=foreign_name.language.name,
+                                    **row_key)
+                speller_entries.append((moonspeak, 3))
+
+                # Add Roomaji too
+                if foreign_name.language.name == 'Japanese':
+                    roomaji = romanize(foreign_name.name).lower()
+                    writer.add_document(name=roomaji, language='Roomaji',
                                         **row_key)
-                    speller_entries.append(name)
-
-                    if foreign_name.language.name == 'Japanese':
-                        # Add Roomaji too
-                        roomaji = romanize(foreign_name.name).lower()
-                        writer.add_document(name=roomaji,
-                                            language='Roomaji',
-                                            **row_key)
-                        speller_entries.append(roomaji)
+                    speller_entries.append((roomaji, 8))
 
 
     writer.commit()
@@ -138,18 +145,20 @@ def open_index(directory=None, session=None, recreate=False):
     # at once, as every call to add_* does a commit(), and those seem to be
     # expensive
     speller = whoosh.spelling.SpellChecker(index.storage)
-    speller.add_words(speller_entries)
+    speller.add_scored_words(speller_entries)
 
     return index, speller
 
 
-LookupResult = namedtuple('LookupResult', ['object', 'language', 'exact'])
+LookupResult = namedtuple('LookupResult',
+                          ['object', 'name', 'language', 'exact'])
 def lookup(name, session=None, indices=None, exact_only=False):
     """Attempts to find some sort of object, given a database session and name.
 
-    Returns a list of named (object, language, exact) tuples.  `object` is a
-    database object, `language` is the name of the language in which the name
-    was found, and `exact` is True iff this was an exact match.
+    Returns a list of named (object, name, language, exact) tuples.  `object`
+    is a database object, `name` is the name under which the object was found,
+    `language` is the name of the language in which the name was found, and
+    `exact` is True iff this was an exact match.
 
     This function currently ONLY does fuzzy matching if there are no exact
     matches.
@@ -209,6 +218,12 @@ def lookup(name, session=None, indices=None, exact_only=False):
     seen = {}
     for result in results:
         # Skip dupe results
+        # Note!  The speller prefers English names, but the query does not.  So
+        # "latias" comes over "ratiasu".  "latias" matches only the English
+        # row, comes out first, and all is well.
+        # However!  The speller could then return "foo" which happens to be the
+        # name for two different things in different languages, and the
+        # non-English one could appear preferred.  This is not very likely.
         seen_key = result['table'], result['row_id']
         if seen_key in seen:
             continue
@@ -216,6 +231,9 @@ def lookup(name, session=None, indices=None, exact_only=False):
 
         cls = indexed_tables[result['table']]
         obj = session.query(cls).get(result['row_id'])
-        objects.append(LookupResult(obj, result['language'], exact))
+        objects.append(LookupResult(object=obj,
+                                    name=result['name'],
+                                    language=result['language'],
+                                    exact=exact))
 
-    return objects
+    return objects[:5]