Added support for lookup by other language name. #15
author Eevee <git@veekun.com>
Sat, 22 Aug 2009 08:13:34 +0000 (01:13 -0700)
committer Eevee <git@veekun.com>
Sat, 22 Aug 2009 08:13:34 +0000 (01:13 -0700)
English fuzzy matches are preferred, followed by Roomaji and then
everything else.

The named tuple returned from lookup() now has a `name` field holding the
actual name that was matched.

pokedex/__init__.py
pokedex/lookup.py

index 4af3fa0..73391dc 100644 (file)
@@ -66,8 +66,12 @@ def command_lookup(name):
     else:
         print "Fuzzy-matched:"
 
     else:
         print "Fuzzy-matched:"
 
-    for object, language, exact in results:
-        print object.__tablename__, object.name, language
+    for result in results:
+        print "%s: %s" % (result.object.__tablename__, result.object.name),
+        if result.language:
+            print "(%s in %s)" % (result.name, result.language)
+        else:
+            print
 
 
 def command_help():
 
 
 def command_help():
index 0bf1c18..2653b9a 100644 (file)
@@ -95,6 +95,10 @@ def open_index(directory=None, session=None, recreate=False):
     writer = index.writer()
 
     # Index every name in all our tables of interest
     writer = index.writer()
 
     # Index every name in all our tables of interest
+    # speller_entries becomes a list of (word, score) tuples.  The intent is
+    # to prefer English names, then Roomaji, then everything else, biasing
+    # results the way most people expect (e.g. German names resemble English).
+    # NOTE(review): scores used below are 1 (English), 3 (other), 8 (Roomaji)
     speller_entries = []
     for cls in indexed_tables.values():
         q = session.query(cls)
     speller_entries = []
     for cls in indexed_tables.values():
         q = session.query(cls)
@@ -108,28 +112,31 @@ def open_index(directory=None, session=None, recreate=False):
 
             name = row.name.lower()
             writer.add_document(name=name, **row_key)
 
             name = row.name.lower()
             writer.add_document(name=name, **row_key)
-            speller_entries.append(name)
+            speller_entries.append((name, 1))
 
             for extra_key_func in extra_keys.get(cls, []):
                 extra_key = extra_key_func(row)
                 writer.add_document(name=extra_key, **row_key)
 
             # Pokemon also get other languages
 
             for extra_key_func in extra_keys.get(cls, []):
                 extra_key = extra_key_func(row)
                 writer.add_document(name=extra_key, **row_key)
 
             # Pokemon also get other languages
-            if cls == tables.Pokemon:
-                for foreign_name in row.foreign_names:
-                    name = foreign_name.name.lower()
-                    writer.add_document(name=name,
-                                        language=foreign_name.language.name,
+            for foreign_name in getattr(row, 'foreign_names', []):
+                moonspeak = foreign_name.name.lower()
+                if name == moonspeak:
+                    # Don't add the English name again as a different language;
+                    # no point and it makes spell results confusing
+                    continue
+
+                writer.add_document(name=moonspeak,
+                                    language=foreign_name.language.name,
+                                    **row_key)
+                speller_entries.append((moonspeak, 3))
+
+                # Add Roomaji too
+                if foreign_name.language.name == 'Japanese':
+                    roomaji = romanize(foreign_name.name).lower()
+                    writer.add_document(name=roomaji, language='Roomaji',
                                         **row_key)
                                         **row_key)
-                    speller_entries.append(name)
-
-                    if foreign_name.language.name == 'Japanese':
-                        # Add Roomaji too
-                        roomaji = romanize(foreign_name.name).lower()
-                        writer.add_document(name=roomaji,
-                                            language='Roomaji',
-                                            **row_key)
-                        speller_entries.append(roomaji)
+                    speller_entries.append((roomaji, 8))
 
 
     writer.commit()
 
 
     writer.commit()
@@ -138,18 +145,20 @@ def open_index(directory=None, session=None, recreate=False):
     # at once, as every call to add_* does a commit(), and those seem to be
     # expensive
     speller = whoosh.spelling.SpellChecker(index.storage)
     # at once, as every call to add_* does a commit(), and those seem to be
     # expensive
     speller = whoosh.spelling.SpellChecker(index.storage)
-    speller.add_words(speller_entries)
+    speller.add_scored_words(speller_entries)
 
     return index, speller
 
 
 
     return index, speller
 
 
-LookupResult = namedtuple('LookupResult', ['object', 'language', 'exact'])
+LookupResult = namedtuple('LookupResult',
+                          ['object', 'name', 'language', 'exact'])
 def lookup(name, session=None, indices=None, exact_only=False):
     """Attempts to find some sort of object, given a database session and name.
 
 def lookup(name, session=None, indices=None, exact_only=False):
     """Attempts to find some sort of object, given a database session and name.
 
-    Returns a list of named (object, language, exact) tuples.  `object` is a
-    database object, `language` is the name of the language in which the name
-    was found, and `exact` is True iff this was an exact match.
+    Returns a list of named (object, name, language, exact) tuples.  `object`
+    is a database object, `name` is the name under which the object was found,
+    `language` is the name of the language in which the name was found, and
+    `exact` is True iff this was an exact match.
 
     This function currently ONLY does fuzzy matching if there are no exact
     matches.
 
     This function currently ONLY does fuzzy matching if there are no exact
     matches.
@@ -209,6 +218,12 @@ def lookup(name, session=None, indices=None, exact_only=False):
     seen = {}
     for result in results:
         # Skip dupe results
     seen = {}
     for result in results:
         # Skip dupe results
+        # Note!  The speller prefers English names, but the query does not.  So
+        # the speller suggests "latias" ahead of "ratiasu"; "latias" then
+        # matches only the English row, comes out first, and all is well.
+        # However!  The speller could then return "foo", which happens to be
+        # the name of two different things in different languages, and the
+        # non-English one could appear preferred.  This is not very likely.
         seen_key = result['table'], result['row_id']
         if seen_key in seen:
             continue
         seen_key = result['table'], result['row_id']
         if seen_key in seen:
             continue
@@ -216,6 +231,9 @@ def lookup(name, session=None, indices=None, exact_only=False):
 
         cls = indexed_tables[result['table']]
         obj = session.query(cls).get(result['row_id'])
 
         cls = indexed_tables[result['table']]
         obj = session.query(cls).get(result['row_id'])
-        objects.append(LookupResult(obj, result['language'], exact))
+        objects.append(LookupResult(object=obj,
+                                    name=result['name'],
+                                    language=result['language'],
+                                    exact=exact))
 
 
-    return objects
+    return objects[:5]