Move pokémon media to a pokemon/ directory
[zzz-pokedex.git] / pokedex / lookup.py
index 8c3e444..8488f21 100644 (file)
@@ -26,8 +26,9 @@ __all__ = ['PokedexLookup']
 
 rx_is_number = re.compile('^\d+$')
 
 
 rx_is_number = re.compile('^\d+$')
 
-LookupResult = namedtuple('LookupResult',
-    ['object', 'indexed_name', 'name', 'language', 'iso3166', 'exact'])
+LookupResult = namedtuple('LookupResult', [
+    'object', 'indexed_name', 'name', 'language', 'iso639', 'iso3166', 'exact',
+])
 
 class UninitializedIndex(object):
     class UninitializedIndexError(Exception):
 
 class UninitializedIndex(object):
     class UninitializedIndexError(Exception):
@@ -68,10 +69,11 @@ class LanguageWeighting(whoosh.scoring.Weighting):
         # Apply extra weight
         weight = weight * self.extra_weights.get(text, 1.0)
 
         # Apply extra weight
         weight = weight * self.extra_weights.get(text, 1.0)
 
-        if doc['language'] == None:
+        language = doc.get('language')
+        if language is None:
             # English (well, "default"); leave it at 1
             return weight
             # English (well, "default"); leave it at 1
             return weight
-        elif doc['language'] == u'Roomaji':
+        elif language == u'Roomaji':
             # Give Roomaji a little boost; it's most likely to be searched
             return weight * 0.9
         else:
             # Give Roomaji a little boost; it's most likely to be searched
             return weight * 0.9
         else:
@@ -80,8 +82,9 @@ class LanguageWeighting(whoosh.scoring.Weighting):
 
 
 class PokedexLookup(object):
 
 
 class PokedexLookup(object):
-    INTERMEDIATE_LOOKUP_RESULTS = 25
-    MAX_LOOKUP_RESULTS = 10
+    MAX_FUZZY_RESULTS = 10
+    MAX_EXACT_RESULTS = 43
+    INTERMEDIATE_FACTOR = 2
 
     # The speller only checks how much the input matches a word; there can be
     # all manner of extra unmatched junk, and it won't affect the weighting.
 
     # The speller only checks how much the input matches a word; there can be
     # all manner of extra unmatched junk, and it won't affect the weighting.
@@ -101,6 +104,7 @@ class PokedexLookup(object):
             tables.Move,
             tables.Nature,
             tables.Pokemon,
             tables.Move,
             tables.Nature,
             tables.Pokemon,
+            tables.PokemonForm,
             tables.Type,
         )
     )
             tables.Type,
         )
     )
@@ -168,11 +172,18 @@ class PokedexLookup(object):
             table=whoosh.fields.ID(stored=True),
             row_id=whoosh.fields.ID(stored=True),
             language=whoosh.fields.STORED,
             table=whoosh.fields.ID(stored=True),
             row_id=whoosh.fields.ID(stored=True),
             language=whoosh.fields.STORED,
-            iso3166=whoosh.fields.STORED,
+            iso639=whoosh.fields.ID(stored=True),
+            iso3166=whoosh.fields.ID(stored=True),
             display_name=whoosh.fields.STORED,  # non-lowercased name
         )
 
             display_name=whoosh.fields.STORED,  # non-lowercased name
         )
 
-        if not os.path.exists(self.directory):
+        if os.path.exists(self.directory):
+            # create_in() isn't totally reliable, so just nuke whatever's there
+            # manually.  Try to be careful about this...
+            for f in os.listdir(self.directory):
+                if re.match('^_?(MAIN|SPELL)_', f):
+                    os.remove(os.path.join(self.directory, f))
+        else:
             os.mkdir(self.directory)
 
         self.index = whoosh.index.create_in(self.directory, schema=schema,
             os.mkdir(self.directory)
 
         self.index = whoosh.index.create_in(self.directory, schema=schema,
@@ -188,12 +199,12 @@ class PokedexLookup(object):
                 row_key = dict(table=unicode(cls.__tablename__),
                                row_id=unicode(row.id))
 
                 row_key = dict(table=unicode(cls.__tablename__),
                                row_id=unicode(row.id))
 
-                def add(name, language, iso3166):
+                def add(name, language, iso639, iso3166):
                     normalized_name = self.normalize_name(name)
 
                     writer.add_document(
                         name=normalized_name, display_name=name,
                     normalized_name = self.normalize_name(name)
 
                     writer.add_document(
                         name=normalized_name, display_name=name,
-                        language=language, iso3166=iso3166,
+                        language=language, iso639=iso639, iso3166=iso3166,
                         **row_key
                     )
 
                         **row_key
                     )
 
@@ -202,34 +213,34 @@ class PokedexLookup(object):
 
                 # Add the basic English name to the index
                 if cls == tables.Pokemon:
 
                 # Add the basic English name to the index
                 if cls == tables.Pokemon:
-                    # Pokémon need their form name added
-                    # XXX kinda kludgy
-                    add(row.full_name, None, u'us')
-
-                    # If this is a default form, ALSO add the unadorned name,
-                    # so 'Deoxys' alone will still do the right thing
-                    if row.forme_name and not row.forme_base_pokemon_id:
-                        add(row.name, None, u'us')
-                else:
-                    add(row.name, None, u'us')
+                    # Don't re-add alternate forms of the same Pokémon; they'll
+                    # be added as Pokémon forms instead
+                    if not row.is_base_form:
+                        continue
+                elif cls == tables.PokemonForm:
+                    if row.name:
+                        add(row.pokemon_name, None, u'en', u'us')
+                    continue
 
                 # Some things also have other languages' names
                 # XXX other language form names..?
 
                 # Some things also have other languages' names
                 # XXX other language form names..?
-                for foreign_name in getattr(row, 'foreign_names', []):
-                    moonspeak = foreign_name.name
-                    if row.name == moonspeak:
-                        # Don't add the English name again as a different
+                seen = set()
+                for language, name in getattr(row, 'name_map', {}).items():
+                    if name in seen:
+                        # Don't add the name again as a different
                         # language; no point and it makes spell results
                         # confusing
                         continue
                         # language; no point and it makes spell results
                         # confusing
                         continue
+                    seen.add(name)
 
 
-                    add(moonspeak, foreign_name.language.name,
-                                   foreign_name.language.iso3166)
+                    add(name, language.name,
+                              language.iso639,
+                              language.iso3166)
 
                     # Add Roomaji too
 
                     # Add Roomaji too
-                    if foreign_name.language.name == 'Japanese':
-                        roomaji = romanize(foreign_name.name)
-                        add(roomaji, u'Roomaji', u'jp')
+                    if language.identifier == 'ja':
+                        roomaji = romanize(name)
+                        add(roomaji, u'Roomaji', u'ja', u'jp')
 
         writer.commit()
 
 
         writer.commit()
 
@@ -285,41 +296,62 @@ class PokedexLookup(object):
             name = name.strip()
 
             prefixes = prefix_chunk.split(',')
             name = name.strip()
 
             prefixes = prefix_chunk.split(',')
-            user_valid_types = [_.strip() for _ in prefixes]
+            user_valid_types = []
+            for prefix in prefixes:
+                prefix = prefix.strip()
+                if prefix:
+                    user_valid_types.append(prefix)
 
         # Merge the valid types together.  Only types that appear in BOTH lists
         # may be used.
         # As a special case, if the user asked for types that are explicitly
 
         # Merge the valid types together.  Only types that appear in BOTH lists
         # may be used.
         # As a special case, if the user asked for types that are explicitly
-        # forbidden, completely ignore what the user requested
-        combined_valid_types = []
-        if user_valid_types and valid_types:
-            combined_valid_types = list(
-                set(user_valid_types) & set(combined_valid_types)
-            )
-
-            if not combined_valid_types:
-                # No overlap!  Just use the enforced ones
-                combined_valid_types = valid_types
-        else:
-            # One list or the other was blank, so just use the one that isn't
-            combined_valid_types = valid_types + user_valid_types
+        # forbidden, completely ignore what the user requested.
+        # And, just to complicate matters: "type" and language need to be
+        # considered separately.
+        def merge_requirements(func):
+            user = filter(func, user_valid_types)
+            system = filter(func, valid_types)
+
+            if user and system:
+                merged = list(set(user) & set(system))
+                if merged:
+                    return merged
+                else:
+                    # No overlap; use the system restrictions
+                    return system
+            else:
+                # One or the other is blank; use the one that's not
+                return user or system
 
 
-        if not combined_valid_types:
-            # No restrictions
-            return name, [], None
+        # @foo means language must be foo; otherwise it's a table name
+        lang_requirements = merge_requirements(lambda req: req[0] == u'@')
+        type_requirements = merge_requirements(lambda req: req[0] != u'@')
+        all_requirements = lang_requirements + type_requirements
 
         # Construct the term
 
         # Construct the term
+        lang_terms = []
+        for lang in lang_requirements:
+            # Allow for either country or language codes
+            lang_code = lang[1:]
+            lang_terms.append(whoosh.query.Term(u'iso639', lang_code))
+            lang_terms.append(whoosh.query.Term(u'iso3166', lang_code))
+
         type_terms = []
         type_terms = []
-        final_valid_types = []
-        for valid_type in combined_valid_types:
-            table_name = self._parse_table_name(valid_type)
+        for type in type_requirements:
+            table_name = self._parse_table_name(type)
 
             # Quietly ignore bogus valid_types; more likely to DTRT
             if table_name:
 
             # Quietly ignore bogus valid_types; more likely to DTRT
             if table_name:
-                final_valid_types.append(valid_type)
                 type_terms.append(whoosh.query.Term(u'table', table_name))
 
                 type_terms.append(whoosh.query.Term(u'table', table_name))
 
-        return name, final_valid_types, whoosh.query.Or(type_terms)
+        # Combine both kinds of restriction
+        all_terms = []
+        if type_terms:
+            all_terms.append(whoosh.query.Or(type_terms))
+        if lang_terms:
+            all_terms.append(whoosh.query.Or(lang_terms))
+
+        return name, all_requirements, whoosh.query.And(all_terms)
 
 
     def _parse_table_name(self, name):
 
 
     def _parse_table_name(self, name):
@@ -361,7 +393,8 @@ class PokedexLookup(object):
             results.append(LookupResult(object=obj,
                                         indexed_name=record['name'],
                                         name=record['display_name'],
             results.append(LookupResult(object=obj,
                                         indexed_name=record['name'],
                                         name=record['display_name'],
-                                        language=record['language'],
+                                        language=record.get('language'),
+                                        iso639=record['iso639'],
                                         iso3166=record['iso3166'],
                                         exact=exact))
 
                                         iso3166=record['iso3166'],
                                         exact=exact))
 
@@ -371,12 +404,11 @@ class PokedexLookup(object):
     def lookup(self, input, valid_types=[], exact_only=False):
         """Attempts to find some sort of object, given a name.
 
     def lookup(self, input, valid_types=[], exact_only=False):
         """Attempts to find some sort of object, given a name.
 
-        Returns a list of named (object, name, language, iso3166, exact)
-        tuples.  `object` is a database object, `name` is the name under which
-        the object was found, `language` and `iso3166` are the name and country
-        code of the language in which the name was found, and `exact` is True
-        iff this was an
-        exact match.
+        Returns a list of named (object, indexed_name, name, language,
+        iso639, iso3166, exact) tuples.  `object` is a database object,
+        `indexed_name` is the normalized name stored in the index, `name` is
+        the name under which the object was found, `language`, `iso639` and
+        `iso3166` are the name, language code and country code of the language
+        in which the name was found, and `exact` is True iff this was an exact
+        match.
 
         This function currently ONLY does fuzzy matching if there are no exact
         matches.
 
         This function currently ONLY does fuzzy matching if there are no exact
         matches.
@@ -394,17 +426,19 @@ class PokedexLookup(object):
         Also:
         - Type restrictions.  "type:psychic" will only return the type.  This
           is how to make ID lookup useful.  Multiple type specs can be entered
         Also:
         - Type restrictions.  "type:psychic" will only return the type.  This
           is how to make ID lookup useful.  Multiple type specs can be entered
-          with commas, as "move,item:1".  If `valid_types` are provided, any
-          type prefix will be ignored.
+          with commas, as "move,item:1".
+        - Language restrictions.  "@fr:charge" will only return Tackle, which
+          is called "Charge" in French.  These can be combined with type
+          restrictions, e.g., "@fr,move:charge".
         - Alternate formes can be specified merely like "wash rotom".
 
         `input`
             Name of the thing to look for.
 
         `valid_types`
         - Alternate formes can be specified merely like "wash rotom".
 
         `input`
             Name of the thing to look for.
 
         `valid_types`
-            A list of table objects or names, e.g., `['pokemon', 'moves']`.  If
-            this is provided, only results in one of the given tables will be
-            returned.
+            A list of type or language restrictions, e.g., `['pokemon',
+            '@ja']`.  If this is provided, only results that match one of the
+            given tables and/or one of the given languages will be returned.
 
         `exact_only`
             If True, only exact matches are returned.  If set to False (the
 
         `exact_only`
             If True, only exact matches are returned.  If set to False (the
@@ -450,12 +484,26 @@ class PokedexLookup(object):
 
 
         ### Actual searching
 
 
         ### Actual searching
-        searcher = self.index.searcher()
-        # XXX is this kosher?  docs say search() takes a weighting arg, but it
-        # certainly does not
-        searcher.weighting = LanguageWeighting()
-        results = searcher.search(query,
-                                  limit=self.INTERMEDIATE_LOOKUP_RESULTS)
+        # Limits; result limits are constants, and intermediate results (before
+        # duplicate items are stripped out) are capped at the result limit
+        # times another constant.
+        # Fuzzy matches are capped at 10, beyond which something is probably
+        # very wrong.  Exact matches -- that is, wildcards and ids -- are far
+        # less constrained.
+        # Also, exact matches are sorted by table and then name, since weight
+        # doesn't matter.
+        sort_by = dict()
+        if exact_only:
+            max_results = self.MAX_EXACT_RESULTS
+            sort_by['sortedby'] = (u'table', u'name')
+        else:
+            max_results = self.MAX_FUZZY_RESULTS
+
+        searcher = self.index.searcher(weighting=LanguageWeighting())
+        results = searcher.search(
+            query,
+            limit=int(max_results * self.INTERMEDIATE_FACTOR),
+            **sort_by
+        )
 
         # Look for some fuzzy matches if necessary
         if not exact_only and not results:
 
         # Look for some fuzzy matches if necessary
         if not exact_only and not results:
@@ -490,12 +538,8 @@ class PokedexLookup(object):
         ### Convert results to db objects
         objects = self._whoosh_records_to_results(results, exact=exact)
 
         ### Convert results to db objects
         objects = self._whoosh_records_to_results(results, exact=exact)
 
-        # Only return up to 10 matches; beyond that, something is wrong.  We
-        # strip out duplicate entries above, so it's remotely possible that we
-        # should have more than 10 here and lost a few.  The speller returns 25
-        # to give us some padding, and should avoid that problem.  Not a big
-        # deal if we lose the 25th-most-likely match anyway.
-        return objects[:self.MAX_LOOKUP_RESULTS]
+        # Truncate and return
+        return objects[:max_results]
 
 
     def random_lookup(self, valid_types=[]):
 
 
     def random_lookup(self, valid_types=[]):
@@ -503,37 +547,34 @@ class PokedexLookup(object):
         `valid_types`.
         """
 
         `valid_types`.
         """
 
-        tables = []
+        table_names = []
         for valid_type in valid_types:
             table_name = self._parse_table_name(valid_type)
         for valid_type in valid_types:
             table_name = self._parse_table_name(valid_type)
-            if table_name:
-                tables.append(self.indexed_tables[table_name])
+            # Skip anything not recognized.  Could be, say, a language code.
+            # XXX The vast majority of Pokémon forms are unnamed and unindexed,
+            #     which can produce blank results.  So skip them too for now.
+            if table_name and table_name != 'pokemon_forms':
+                table_names.append(table_name)
 
 
-        if not tables:
+        if not table_names:
             # n.b.: It's possible we got a list of valid_types and none of them
             # were valid, but this function is guaranteed to return
             # n.b.: It's possible we got a list of valid_types and none of them
             # were valid, but this function is guaranteed to return
-            # *something*, so it politely selects from the entire index isntead
-            tables = self.indexed_tables.values()
-
-        # Rather than create an array of many hundred items and pick randomly
-        # from it, just pick a number up to the total number of potential
-        # items, then pick randomly from that, and partition the whole range
-        # into chunks.  This also avoids the slight problem that the index
-        # contains more rows (for languages) for some items than others.
-        # XXX ought to cache this (in the index?) if possible
-        total = 0
-        partitions = []
-        for table in tables:
-            count = self.session.query(table).count()
-            total += count
-            partitions.append((table, count))
-
-        n = random.randint(1, total)
-        while n > partitions[0][1]:
-            n -= partitions[0][1]
-            partitions.pop(0)
-
-        return self.lookup(unicode(n), valid_types=[ partitions[0][0] ])
+            # *something*, so it politely selects from the entire index instead
+            table_names = self.indexed_tables.keys()
+            table_names.remove('pokemon_forms')
+
+        # Pick a random table, then pick a random item from it.  Small tables
+        # like Type will have an unnatural bias.  The alternative is that a
+        # simple search for "random" will do some eight queries, counting the
+        # rows in every single indexed table, and that's awful.
+        # XXX Can we improve on this, reasonably?
+        table_name = random.choice(table_names)
+        count = self.session.query(self.indexed_tables[table_name]).count()
+        id, = self.session.query(self.indexed_tables[table_name].id) \
+            .offset(random.randint(0, count - 1)) \
+            .first()
+
+        return self.lookup(unicode(id), valid_types=[table_name])
 
     def prefix_lookup(self, prefix, valid_types=[]):
         """Returns terms starting with the given exact prefix.
 
     def prefix_lookup(self, prefix, valid_types=[]):
         """Returns terms starting with the given exact prefix.