X-Git-Url: http://git.veekun.com/zzz-pokedex.git/blobdiff_plain/282096cd8d7897d85f89767091f9d486cc6dfd39..40f283a7a673d7e52cd13c29ebecea8a4e370ed4:/pokedex/lookup.py diff --git a/pokedex/lookup.py b/pokedex/lookup.py index 60c1b85..8488f21 100644 --- a/pokedex/lookup.py +++ b/pokedex/lookup.py @@ -69,10 +69,11 @@ class LanguageWeighting(whoosh.scoring.Weighting): # Apply extra weight weight = weight * self.extra_weights.get(text, 1.0) - if doc['language'] == None: + language = doc.get('language') + if language is None: # English (well, "default"); leave it at 1 return weight - elif doc['language'] == u'Roomaji': + elif language == u'Roomaji': # Give Roomaji a little boost; it's most likely to be searched return weight * 0.9 else: @@ -103,6 +104,7 @@ class PokedexLookup(object): tables.Move, tables.Nature, tables.Pokemon, + tables.PokemonForm, tables.Type, ) ) @@ -175,7 +177,13 @@ class PokedexLookup(object): display_name=whoosh.fields.STORED, # non-lowercased name ) - if not os.path.exists(self.directory): + if os.path.exists(self.directory): + # create_in() isn't totally reliable, so just nuke whatever's there + # manually. Try to be careful about this... + for f in os.listdir(self.directory): + if re.match('^_?(MAIN|SPELL)_', f): + os.remove(os.path.join(self.directory, f)) + else: os.mkdir(self.directory) self.index = whoosh.index.create_in(self.directory, schema=schema, @@ -205,34 +213,33 @@ class PokedexLookup(object): # Add the basic English name to the index if cls == tables.Pokemon: - # Pokémon need their form name added - # XXX kinda kludgy - add(row.full_name, None, u'en', u'us') - - # If this is a default form, ALSO add the unadorned name, - # so 'Deoxys' alone will still do the right thing - if row.forme_name and not row.forme_base_pokemon_id: - add(row.name, None, u'en', u'us') - else: - add(row.name, None, u'en', u'us') + # Don't re-add alternate forms of the same Pokémon; they'll + # be added as Pokémon forms instead + if not row.is_base_form: + continue + elif cls == tables.PokemonForm: + if row.name: + add(row.pokemon_name, None, u'en', u'us') + continue # Some things also have other languages' names # XXX other language form names..? - for foreign_name in getattr(row, 'foreign_names', []): - moonspeak = foreign_name.name - if row.name == moonspeak: - # Don't add the English name again as a different + seen = set() + for language, name in getattr(row, 'name_map', {}).items(): + if name in seen: + # Don't add the name again as a different # language; no point and it makes spell results # confusing continue + seen.add(name) - add(moonspeak, foreign_name.language.name, - foreign_name.language.iso639, - foreign_name.language.iso3166) + add(name, language.name, + language.iso639, + language.iso3166) # Add Roomaji too - if foreign_name.language.name == 'Japanese': - roomaji = romanize(foreign_name.name) + if language.identifier == 'ja': + roomaji = romanize(name) add(roomaji, u'Roomaji', u'ja', u'jp') writer.commit() @@ -289,7 +296,11 @@ class PokedexLookup(object): name = name.strip() prefixes = prefix_chunk.split(',') - user_valid_types = [_.strip() for _ in prefixes] + user_valid_types = [] + for prefix in prefixes: + prefix = prefix.strip() + if prefix: + user_valid_types.append(prefix) # Merge the valid types together. Only types that appear in BOTH lists # may be used. @@ -382,7 +393,7 @@ class PokedexLookup(object): results.append(LookupResult(object=obj, indexed_name=record['name'], name=record['display_name'], - language=record['language'], + language=record.get('language'), iso639=record['iso639'], iso3166=record['iso3166'], exact=exact)) @@ -539,8 +550,10 @@ class PokedexLookup(object): table_names = [] for valid_type in valid_types: table_name = self._parse_table_name(valid_type) - # Skip anything not recognized. Could be, say, a language code - if table_name: + # Skip anything not recognized. Could be, say, a language code. + # XXX The vast majority of Pokémon forms are unnamed and unindexed, + # which can produce blank results. So skip them too for now. + if table_name and table_name != 'pokemon_forms': table_names.append(table_name) if not table_names: @@ -548,26 +561,20 @@ class PokedexLookup(object): # were valid, but this function is guaranteed to return # *something*, so it politely selects from the entire index instead table_names = self.indexed_tables.keys() - - # Rather than create an array of many hundred items and pick randomly - # from it, just pick a number up to the total number of potential - # items, then pick randomly from that, and partition the whole range - # into chunks. This also avoids the slight problem that the index - # contains more rows (for languages) for some items than others. - # XXX ought to cache this (in the index?) if possible - total = 0 - partitions = [] - for table_name in table_names: - count = self.session.query(self.indexed_tables[table_name]).count() - total += count - partitions.append((table_name, count)) - - n = random.randint(1, total) - while n > partitions[0][1]: - n -= partitions[0][1] - partitions.pop(0) - - return self.lookup(unicode(n), valid_types=[ partitions[0][0] ]) + table_names.remove('pokemon_forms') + + # Pick a random table, then pick a random item from it. Small tables + # like Type will have an unnatural bias. The alternative is that a + # simple search for "random" will do some eight queries, counting the + # rows in every single indexed table, and that's awful. + # XXX Can we improve on this, reasonably? + table_name = random.choice(table_names) + count = self.session.query(self.indexed_tables[table_name]).count() + id, = self.session.query(self.indexed_tables[table_name].id) \ + .offset(random.randint(0, count - 1)) \ + .first() + + return self.lookup(unicode(id), valid_types=[table_name]) def prefix_lookup(self, prefix, valid_types=[]): """Returns terms starting with the given exact prefix.