From b14e23c787f1177d09d358fea683c6d72ce3d757 Mon Sep 17 00:00:00 2001
From: Petr Viktorin <encukou@gmail.com>
Date: Thu, 7 Apr 2011 01:28:54 +0300
Subject: [PATCH 1/1] Reading, merging, and writing translations

---
 bin/poupdate                       | 368 +++++++++++++++++++++
 pokedex/db/translations.py         | 659 +++++++++++++++++++++++++++++++++++++++++
 pokedex/tests/test_translations.py | 183 ++++++++++
 3 files changed, 1210 insertions(+)
 create mode 100755 bin/poupdate
 create mode 100755 pokedex/db/translations.py
 create mode 100644 pokedex/tests/test_translations.py

diff --git a/bin/poupdate b/bin/poupdate
new file mode 100755
index 0000000..06ccdf2
--- /dev/null
+++ b/bin/poupdate
@@ -0,0 +1,368 @@
+#! /usr/bin/env python
+# Encoding: UTF-8
+
+u"""Creation and loading of GNU Gettext language files.
+
+poupdate [options] [file1.csv file2.csv ...]
+
+Use this script to
+- Create .pot files (in pokedex/i18n/)
+- Update the .po files (in pokedex/i18n/)
+- Update the pokedex .csv files (in pokedex/data/csv/translations/)
+
+To make .po files for a new language, make sure it is in the database, make
+a directory for it in pokedex/i18n/, and run this.
+
+You can also give one or more translation CSVs as arguments.
+These are in the same format as veekun's main database CSVs, for example
+pokedex/data/csv/ability_prose.csv. Be sure to set the correct language
+ID (which implies the language must be in the database).
+Also be sure to use the correct column order: first an appropriately named
+foreign key, then local_language_id, and then the text columns.
+
+"""
+
+# Everything related to Gettext files, and the CLI interface, is here.
+# General message handling and CSV I/O is in the pokedex library.
+
+# Notes on how we use the PO format:
+# The source information is stored in the occurrences fields, using
+# "table_name.column_name" for the file name and the object ID for the line
+# number. This is used as a message key, instead of the source string. So it's
+# important not to discard location information. It also means "obsolete" and
+# "fuzzy" mean pretty much the same in our context.
+#
+# Also note that a pot file is just a po file with all strings untranslated.
+# So some functions here will work on either.
+#
+# Gettext context (msgctxt) is written to the files so that tools don't merge
+# unrelated strings together. It is ignored when reading the PO files.
+
+# Also of note, "polib" means "(do) kiss!" in Czech.
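+
+# As a concrete illustration of the conventions above (the ID 12 is made up),
+# an entry in one of our pot files looks roughly like this:
+#
+#     #: AbilityChangelog.effect:12
+#     msgctxt "AbilityChangelog.effect"
+#     msgid "Has no overworld effect."
+#     msgstr ""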
+
+import os
+import re
+import sys
+from datetime import datetime
+from optparse import OptionParser
+from collections import defaultdict
+
+import pkg_resources
+
+from pokedex.db import tables, translations
+from pokedex.defaults import get_default_csv_dir
+
+try:
+    import polib
+except ImportError:
+    if __name__ == '__main__':
+        exit('This utility needs polib installed.\n$ pip install polib')
+    raise
+
+number_replacement_flag = '-pokedex-number-replacement'
+
+default_gettext_directory = pkg_resources.resource_filename('pokedex', 'i18n')
+
+mapped_class_dict = dict((c.__name__, c) for c in tables.mapped_classes)
+for cls in tables.mapped_classes:
+    mapped_class_dict.update(dict((c.__name__, cls) for c in cls.translation_classes))
+
+class PokedexPot(polib.POFile):
+    def __init__(self, name):
+        super(PokedexPot, self).__init__()
+        self.metadata = {
+            'Project-Id-Version': 'pokedex-%s 0.1' % name,
+            'Report-Msgid-Bugs-To': 'encukou@gmail.com',
+            'POT-Creation-Date': datetime.now().isoformat(),
+            'PO-Revision-Date': 'YEAR-MO-DA HO:MI+ZONE',
+            'MIME-Version': '1.0',
+            'Content-Type': 'text/plain; charset=utf-8',
+            'Content-Transfer-Encoding': '8bit',
+            'Generated-By': "The pokedex",
+        }
+        self.seen_entries = {}
+
+    def append(self, entry):
+        """Append an entry. POEntries that only differ in numbers are merged.
+
+        For example "Route 1", "Route 2", etc. are replaced by a single
+        "Route {num}".
+
+        Multiple numbers might be replaced, for example in "{num}--{num}
+        different Unown caught".
+
+        Entries without numbers are merged as well (e.g. "Has no overworld
+        effect" appears quite a few times in AbilityChangelog).
+        """
+        replaced = translations.number_re.sub('{num}', entry.msgid)
+        try:
+            common_entry = self.seen_entries[(entry.msgctxt, replaced)]
+        except KeyError:
+            self.seen_entries[(entry.msgctxt, replaced)] = entry
+        else:
+            common_entry.occurrences += entry.occurrences
+            # Only now is the actual entry replaced. So we get
+            # "Route {num}", but "Porygon2" because there's no Porygon3.
+            common_entry.msgid = replaced
+            common_entry.msgstr = translations.number_re.sub('{num}', common_entry.msgstr)
+            if replaced != entry.msgid and number_replacement_flag not in common_entry.flags:
+                common_entry.flags.append(number_replacement_flag)
+            return
+        self += [entry]
+
+class PotDict(dict):
+    """A defaultdict of pot files"""
+    def __missing__(self, name):
+        pot = PokedexPot(name)
+        self[name] = pot
+        return pot
+
+def yield_po_messages(pos):
+    """Yield messages from all given .po files
+    """
+    merger = translations.Merge()
+    for po in pos.values():
+        merger.add_iterator(_yield_one_po_messages(po, merger))
+    return merger
+
+def entry_sort_key(entry):
+    try:
+        cls_col, line = entry.occurrences[0]
+    except IndexError:
+        return
+    else:
+        if line:
+            classname, col = cls_col.split('.')
+            fuzzy = entry.obsolete or 'fuzzy' in entry.flags
+            try:
+                cls = mapped_class_dict[classname]
+            except KeyError:
+                # Renamed table?
+                print 'Warning: Unknown class %s' % classname
+                return '', int(line), col, fuzzy
+            else:
+                return cls.__name__, int(line), col, fuzzy
+
+def _yield_one_po_messages(pofile, merger):
+    # Yield messages from one po file
+    #
+    # Messages in our po files are ordered by the first occurrence.
+    # The occurrences of a single message are also ordered.
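+    # (An entry with several occurrences yields its first message here and
+    # hands the rest back to the merger as one extra, already-sorted
+    # iterator.)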
+ # So just merge all the subsequences as we go + for entry in sorted(pofile, key=entry_sort_key): + if entry.msgstr: + fuzzy = (entry.obsolete or 'fuzzy' in entry.flags) + messages = [] + for occurrence in entry.occurrences: + cls_colname, id = occurrence + if id: + clsname, colname = cls_colname.split('.') + cls = mapped_class_dict[clsname] + messages.append(translations.Message( + mapped_class_dict[clsname].__name__, + int(id), + colname, + entry.msgstr, + source=entry.msgid, + number_replacement=number_replacement_flag in entry.flags, + origin='PO file', + fuzzy=fuzzy, + )) + if messages[1:]: + # Spawn extra iterators before yielding + merger.add_iterator(messages[1:]) + if messages: + yield messages[0] + +def create_pots(source, *translation_streams): + """Convert an iterator of Messages to a dictionary of pot/po files + + If translations are given, they're merged, and any exact matches are put + in the po file. Give some for po files, don't give any for pot files. + """ + obsolete = [] + pots = PotDict() + merged = translations.merge_translations(source, *translation_streams, unused=obsolete.append) + for source, sourcehash, string, exact in merged: + ctxt = '.'.join((source.cls, source.colname)) + entry = polib.POEntry( + msgid=source.string, + occurrences=[(ctxt, source.id)], + msgctxt=ctxt, + ) + if string: + entry.msgstr = string + if not exact: + entry.flags.append('fuzzy') + pots[source.pot].append(entry) + for message in obsolete: + ctxt = '.'.join((message.cls, message.colname)) + entry = polib.POEntry( + msgid=message.source or '???', + occurrences=[(ctxt, message.id)], + msgctxt=ctxt, + obsolete=True, + ) + return pots + +def save_pots(pots, gettext_directory=default_gettext_directory): + """Save pot files to a directory.""" + for name, pot in pots.items(): + pot.save(os.path.join(gettext_directory, 'pokedex-%s.pot' % name)) + +def save_pos(pos, lang, gettext_directory=default_gettext_directory): + """Save po files to the appropriate directory.""" + for name, po in pos.items(): + po.save(os.path.join(gettext_directory, lang, 'pokedex-%s.po' % name)) + +def read_pots(directory=default_gettext_directory, extension='.pot'): + """Read all files from the given directory with the given extension as pofiles + + Works on pos or pots. + """ + pots = {} + for filename in os.listdir(directory): + basename, ext = os.path.splitext(filename) + if ext == extension: + pots[basename] = polib.pofile(os.path.join(directory, filename)) + + return pots + +def all_langs(gettext_directory=default_gettext_directory): + return [ + d for d in os.listdir(gettext_directory) + if os.path.isdir(os.path.join(gettext_directory, d)) + ] + +def merge_pos(transl, lang, language_directory): + """Update all po files for the given language + + Takes into account the source, the official translations from the database, + the existing PO files, and the current translation CSV, in that order. 
+ + Returns a name -> pofile dict + """ + return create_pots( + transl.source, + transl.official_messages(lang), + yield_po_messages(pos=read_pots(language_directory, '.po')), + transl.yield_target_messages(lang), + ) + +def bar(fraction, size, done_char='=', split_char='|', notdone_char='-'): + """Build an ASCII art progress bar + """ + size -= 1 + if fraction == 1: + split_char = done_char + completed = int(round(size * fraction)) + bar = [done_char] * completed + bar.append(split_char) + bar += notdone_char * (size - completed) + return ''.join(bar) + +def print_stats(pos): + """Print out some fun stats about a set of po files + """ + template = u"{0:>10}: {1:4}/{2:4} {3:6.2f}% [{4}]" + total_translated = 0 + total = 0 + for name, po in pos.items(): + num_translated = len(po.translated_entries()) + total_translated += num_translated + fraction_translated = 1. * num_translated / len(po) + total += len(po) + print template.format( + name, + num_translated, + len(po), + 100 * fraction_translated, + bar(fraction_translated, 47), + ).encode('utf-8') + fraction_translated = 1. * total_translated / total + print template.format( + 'Total', + total_translated, + total, + 100 * fraction_translated, + bar(fraction_translated, 47), + ).encode('utf-8') + + +if __name__ == '__main__': + parser = OptionParser(__doc__) + + parser.add_option('-l', '--langs', dest='langs', + help="List of languages to handle, separated by commas (example: -l 'en,de,ja') (default: all in gettext directory)") + parser.add_option('-P', '--no-pots', dest='pots', action='store_false', default=True, + help='Do not create POT files (templates)') + parser.add_option('-p', '--no-pos', dest='pos', action='store_false', default=True, + help='Do not update PO files (message catalogs)') + + parser.add_option('-c', '--no-csv', dest='csv', action='store_false', default=True, + help='Do not update pokedex translations files') + + parser.add_option('-d', '--directory', dest='directory', + help='Veekun data directory') + parser.add_option('-L', '--source-language', dest='source_lang', + help="Source language identifier (default: 'en')") + + parser.add_option('-g', '--gettext-dir', dest='gettext_directory', default=default_gettext_directory, + help='Gettext directory (default: pokedex/i18n/)') + + parser.add_option('-q', '--quiet', dest='verbose', default=True, action='store_false', + help="Don't print what's going on") + + options, arguments = parser.parse_args() + + transl = translations.Translations.from_parsed_options(options) + + gettext_directory = options.gettext_directory + if options.pots: + if options.verbose: + print 'Creating pots in', gettext_directory + save_pots(create_pots(transl.source), gettext_directory=gettext_directory) + + if options.pos or options.csv: + # Merge in CSV files from command line + csv_streams = defaultdict(translations.Merge) + for argument in arguments: + # Add each message in its own stream, to sort them. 
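+            # (translations.Merge is a heap-based merge of sorted iterators,
+            # so feeding it many one-message iterators effectively sorts the
+            # whole stream.)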
+            file = open(argument, 'rb')
+            with file:
+                for message in translations.yield_guessed_csv_messages(file):
+                    lang = transl.language_identifiers[message.language_id]
+                    csv_streams[lang].add_iterator([message])
+        streams = defaultdict(list)
+        for lang, stream in csv_streams.items():
+            streams[lang].append(stream)
+
+        # Merge in the PO files
+        if options.langs:
+            langs = options.langs.split(',')
+        else:
+            langs = all_langs(gettext_directory)
+
+        for lang in langs:
+            language_directory = os.path.join(gettext_directory, lang)
+            if options.verbose:
+                print 'Merging translations for %s in %s' % (lang, language_directory)
+            pos = merge_pos(transl, lang, language_directory)
+
+            if options.pos:
+                if options.verbose:
+                    print 'Writing POs for %s' % lang
+                save_pos(pos, lang, gettext_directory=gettext_directory)
+
+            if options.verbose:
+                print_stats(pos)
+
+            streams[lang].append(yield_po_messages(pos))
+
+    if options.csv:
+        for lang, lang_streams in streams.items():
+            if options.verbose:
+                print "Merging %s translation stream(s) for '%s'" % (len(lang_streams), lang)
+            existing_messages = list(transl.yield_target_messages(lang))
+            lang_streams.append(existing_messages)
+            transl.write_translations(lang, *lang_streams)
diff --git a/pokedex/db/translations.py b/pokedex/db/translations.py
new file mode 100755
index 0000000..5d1bda6
--- /dev/null
+++ b/pokedex/db/translations.py
@@ -0,0 +1,659 @@
+#! /usr/bin/env python
+u"""General handling of translations
+
+The general idea is to get messages from somewhere: the source pokedex CSVs,
+or the translation CSVs, etc., then merge them together in some way, and shove
+them into the database.
+
+If a message is translated, it has a source string attached to it, with the
+original English version. Or at least it has a CRC of the original.
+When that doesn't match, it means the English string changed and the
+translation has to be updated.
+Also this is why we can't dump translations from the database: there's no
+original string info.
+
+Some complications:
+
+Flavor text is so repetitive that we take strings from all the versions,
+separate the unique ones by blank lines, let translators work on that, and
+then put it in flavor_summary tables.
+
+Route names and other repetitive numeric things are replaced by e.g.
+"Route {num}" so translators only have to work on each set once.
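+
+For example, one row of a translation CSV (as written by
+Translations.write_translations) looks like this; the fifth field is the CRC
+of the English source, or one of the special values UNKNOWN and OFFICIAL:
+
+    1,Version,3,name,UNKNOWN,othername3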
+
+"""
+
+import binascii
+import csv
+import heapq
+import itertools
+import os
+import re
+import sys
+from collections import defaultdict
+
+from pokedex.db import tables
+from pokedex.defaults import get_default_csv_dir
+
+default_source_lang = 'en'
+
+# Top-level classes we want translations for: in order, and by name
+# These are all mapped_classes that have translatable texts and aren't summarized
+toplevel_classes = []
+toplevel_class_by_name = {}
+
+# summary_map[pokemon_prose]['flavor_summary'] == PokemonFlavorTexts
+summary_map = {}
+
+# translation_class_by_column[mapped class, column_name] == translation_class
+translation_class_by_column = {}
+
+for cls in tables.mapped_classes:
+    try:
+        summary_class, col = cls.summary_column
+    except AttributeError:
+        if cls.translation_classes:
+            toplevel_classes.append(cls)
+            toplevel_class_by_name[cls.__name__] = cls
+            for translation_class in cls.translation_classes:
+                for column in translation_class.__table__.c:
+                    translation_class_by_column[cls, column.name] = translation_class
+    else:
+        summary_map.setdefault(summary_class, {})[col] = cls
+
+number_re = re.compile("[0-9]+")
+
+def crc(string):
+    """Return the hash that we use in translation CSV files"""
+    return "%08x" % (binascii.crc32(string.encode('utf-8')) & 0xffffffff)
+    # Two special values are also used in source_crc:
+    # UNKNOWN: no source string was available
+    # OFFICIAL: an official string from the main database
+
+class Message(object):
+    """Holds all info about a translatable or translated string
+
+    cls: Name of the mapped class the message belongs to
+    id: The id of the thing the message belongs to
+    colname: Name of the database column
+    strings: A list of strings in the message, usually of length 1.
+
+    Optional attributes (None if not set):
+    colsize: Max length of the database column
+    source: The string this was translated from
+    number_replacement: True if this is a translation with {num} placeholders
+    pot: Name of the pot the message goes to (see pot_for_column)
+    source_crc: CRC of the source
+    origin: Some indication of where the string came from (CSV, PO, ...)
+    fuzzy: True for fuzzy translations
+    language_id: ID of the language
+    official: True if this is a known-good translation
+    """
+    __slots__ = 'cls id colname strings colsize source number_replacement pot source_crc origin fuzzy language_id official'.split()
+    def __init__(self, cls, id, colname, string,
+            colsize=None, source=None, number_replacement=None, pot=None,
+            source_crc=None, origin=None, fuzzy=None, language_id=None,
+            official=None,
+        ):
+        self.cls = cls
+        self.id = id
+        self.colname = colname
+        self.strings = [string]
+        self.colsize = colsize
+        self.source = source
+        self.number_replacement = number_replacement
+        self.pot = pot
+        self.source_crc = source_crc
+        if source and not source_crc:
+            self.source_crc = crc(source)
+        self.origin = origin
+        self.fuzzy = fuzzy
+        self.language_id = language_id
+        self.official = official
+
+    def merge(self, other):
+        """Merge two messages, as required for flavor text summarizing
+        """
+        assert self.merge_key == other.merge_key
+        for string in other.strings:
+            if string not in self.strings:
+                self.strings.append(string)
+        self.colsize = self.colsize or other.colsize
+        self.pot = self.pot or other.pot
+        self.source = None
+        self.source_crc = None
+        self.number_replacement = None
+
+    @property
+    def string(self):
+        return '\n\n'.join(self.strings)
+
+    @property
+    def merge_key(self):
+        return self.cls, self.id, self.colname
+
+    @property
+    def sort_key(self):
+        return self.merge_key, self.language_id, self.fuzzy
+
+    @property
+    def eq_key(self):
+        return self.sort_key, self.strings
+
+    def __eq__(self, other): return self.eq_key == other.eq_key
+    def __ne__(self, other): return self.eq_key != other.eq_key
+    def __gt__(self, other): return self.sort_key > other.sort_key
+    def __lt__(self, other): return self.sort_key < other.sort_key
+    def __ge__(self, other): return self.sort_key >= other.sort_key
+    def __le__(self, other): return self.sort_key <= other.sort_key
+
+    def __unicode__(self):
+        string = '"%s"' % self.string
+        if len(string) > 20:
+            string = string[:15] + u'"...'
+        template = u'<Message from {self.origin}: {self.cls}.{self.colname}:{self.id} {string}>'
+        return template.format(self=self, string=string)
+
+    def __str__(self):
+        return unicode(self).encode('utf-8')
+
+    def __repr__(self):
+        return unicode(self).encode('utf-8')
+
+class Translations(object):
+    """Data and operations specific to a location on disk (and a source language)
+    """
+    def __init__(self, source_lang=default_source_lang, csv_directory=None, translation_directory=None):
+        if csv_directory is None:
+            csv_directory = get_default_csv_dir()
+
+        if translation_directory is None:
+            translation_directory = os.path.join(csv_directory, 'translations')
+
+        self.source_lang = source_lang
+        self.csv_directory = csv_directory
+        self.translation_directory = translation_directory
+
+        self.language_ids = {}
+        self.language_identifiers = {}
+        self.official_langs = []
+        for row in self.reader_for_class(tables.Language, reader_class=csv.DictReader):
+            self.language_ids[row['identifier']] = int(row['id'])
+            self.language_identifiers[int(row['id'])] = row['identifier']
+            if row['official'] and int(row['official']):
+                self.official_langs.append(row['identifier'])
+
+        self.source_lang_id = self.language_ids[self.source_lang]
+
+    @classmethod
+    def from_parsed_options(cls, options):
+        return cls(options.source_lang, options.directory)
+
+    @property
+    def source(self):
+        """All source (i.e. English) messages
+        """
+        return self.official_messages(self.source_lang)
+
+    def official_messages(self, lang):
+        """All official messages (i.e.
from main database) for the given lang + """ + # Cached as tuples, since they're used pretty often + lang_id = self.language_ids[lang] + try: + return self._sources[lang_id] + except AttributeError: + self._sources = {} + for message in self.yield_source_messages(): + self._sources.setdefault(message.language_id, []).append(message) + self._sources = dict((k, tuple(merge_adjacent(v))) for k, v in self._sources.items()) + return self.official_messages(lang) + except KeyError: + # Looks like there are no messages in the DB for this language + # This should only happen for non-official languages + assert lang not in self.official_langs + return () + + def write_translations(self, lang, *streams): + """Write a translation CSV containing messages from streams. + + Streams should be ordered by priority, from highest to lowest. + + Any official translations (from the main database) are added automatically. + """ + writer = self.writer_for_lang(lang) + + writer.writerow('language_id table id column source_crc string'.split()) + + messages = merge_translations(self.source, self.official_messages(lang), *streams) + + warnings = {} + for source, sourcehash, string, exact in messages: + if string and sourcehash != 'OFFICIAL': + utf8len = len(string.encode('utf-8')) + if source.colsize and utf8len > source.colsize: + key = source.cls, source.colname + warnings[key] = max(warnings.get(key, (0,)), (utf8len, source, string)) + else: + writer.writerow(( + self.language_ids[lang], + source.cls, + source.id, + source.colname, + sourcehash, + string.encode('utf-8'), + )) + for utf8len, source, string in warnings.values(): + template = u'Error: {size}B value for {colsize}B column! {key[0]}.{key[2]}:{key[1]}: {string}' + warning = template.format( + key=source.merge_key, + string=string, + size=utf8len, + colsize=source.colsize, + ) + if len(warning) > 79: + warning = warning[:76] + u'...' + print warning.encode('utf-8') + + def reader_for_class(self, cls, reader_class=csv.reader): + tablename = cls.__table__.name + csvpath = os.path.join(self.csv_directory, tablename + '.csv') + return reader_class(open(csvpath, 'rb'), lineterminator='\n') + + def writer_for_lang(self, lang): + csvpath = os.path.join(self.translation_directory, '%s.csv' % lang) + return csv.writer(open(csvpath, 'wb'), lineterminator='\n') + + def yield_source_messages(self, language_id=None): + """Yield all messages from source CSV files + + Messages from all languages are returned. The messages are not ordered + properly, but splitting the stream by language (and filtering results + by merge_adjacent) will produce proper streams. 
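+
+        For example (using the ids from the test fixtures), a version_names
+        row "3,1,othername3" becomes a Message with merge_key
+        ('Version', 3, 'name') and language_id 1.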
+ """ + if language_id is None: + language_id = self.source_lang_id + + for cls in sorted(toplevel_classes, key=lambda c: c.__name__): + streams = [] + for translation_class in cls.translation_classes: + streams.append(yield_source_csv_messages( + translation_class, + cls, + self.reader_for_class(translation_class), + )) + try: + colmap = summary_map[translation_class] + except KeyError: + pass + else: + for colname, summary_class in colmap.items(): + column = translation_class.__table__.c[colname] + streams.append(yield_source_csv_messages( + summary_class, + cls, + self.reader_for_class(summary_class), + force_column=column, + )) + for message in Merge(*streams): + yield message + + def yield_target_messages(self, lang): + """Yield messages from the data/csv/translations/.csv file + """ + path = os.path.join(self.csv_directory, 'translations', '%s.csv' % lang) + try: + file = open(path, 'rb') + except IOError: + return () + return yield_translation_csv_messages(file) + + def yield_all_translations(self): + stream = Merge() + for lang in self.language_identifiers.values(): + stream.add_iterator(self.yield_target_messages(lang)) + return (message for message in stream if not message.official) + + def get_load_data(self, langs=None): + """Yield (translation_class, data for INSERT) pairs for loading into the DB + + langs is either a list of language identifiers or None + """ + if langs is None: + langs = self.language_identifiers.values() + stream = Merge() + for lang in self.language_identifiers.values(): + stream.add_iterator(self.yield_target_messages(lang)) + stream = (message for message in stream if not message.official) + count = 0 + class GroupDict(dict): + """Dict to automatically set the foreign_id and local_language_id for new items + """ + def __missing__(self, key): + # depends on `cls` from outside scope + id, language_id = key + data = self[key] = defaultdict(lambda: None) + column_names = (c.name for c in translation_class.__table__.columns) + data.update(dict.fromkeys(column_names)) + data.update({ + '%s_id' % cls.__singlename__: id, + 'local_language_id': language_id, + }) + return data + # Nested dict: + # translation_class -> (lang, id) -> column -> value + everything = defaultdict(GroupDict) + # Group by object so we always have all of the messages for one DB row + for (cls_name, id), group in group_by_object(stream): + cls = toplevel_class_by_name[cls_name] + for message in group: + translation_class = translation_class_by_column[cls, message.colname] + key = id, message.language_id + colname = str(message.colname) + everything[translation_class][key][colname] = message.string + count += 1 + if count > 1000: + for translation_class, key_data in everything.items(): + yield translation_class, key_data.values() + count = 0 + everything.clear() + for translation_class, data_dict in everything.items(): + yield translation_class, data_dict.values() + +def group_by_object(stream): + """Group stream by object + + Yields ((class name, object ID), (list of messages)) pairs. + """ + stream = iter(stream) + current = stream.next() + current_key = current.cls, current.id + group = [current] + for message in stream: + if (message.cls, message.id) != current_key: + yield current_key, group + group = [] + group.append(message) + current = message + current_key = current.cls, current.id + yield current_key, group + +class Merge(object): + """Merge several sorted iterators together + + Additional iterators may be added at any time with add_iterator. 
+ Accepts None for the initial iterators + If the same value appears in more iterators, there will be duplicates in + the output. + """ + def __init__(self, *iterators): + self.next_values = [] + for iterator in iterators: + if iterator is not None: + self.add_iterator(iterator) + + def add_iterator(self, iterator): + iterator = iter(iterator) + try: + value = iterator.next() + except StopIteration: + return + else: + heapq.heappush(self.next_values, (value, iterator)) + + def __iter__(self): + return self + + def next(self): + if self.next_values: + value, iterator = heapq.heappop(self.next_values) + self.add_iterator(iterator) + return value + else: + raise StopIteration + +def merge_adjacent(gen): + """Merge adjacent messages that compare equal""" + gen = iter(gen) + last = gen.next() + for this in gen: + if this.merge_key == last.merge_key: + last.merge(this) + elif last < this: + yield last + last = this + else: + raise AssertionError('Bad order, %s > %s' % (last, this)) + yield last + +def leftjoin(left_stream, right_stream, key=lambda x: x, unused=None): + """A "left join" operation on sorted iterators + + Yields (left, right) pairs, where left comes from left_stream and right + is the corresponding item from right, or None + + Note that if there are duplicates in right_stream, you won't get duplicate + rows for them. + + If given, unused should be a one-arg function that will get called on all + unused items in right_stream. + """ + left_stream = iter(left_stream) + right_stream = iter(right_stream) + try: + right = right_stream.next() + for left in left_stream: + while right and key(left) > key(right): + if unused is not None: + unused(right) + right = right_stream.next() + if key(left) == key(right): + yield left, right + del left + right = right_stream.next() + else: + yield left, None + except StopIteration: + try: + yield left, None + except NameError: + pass + for left in left_stream: + yield left, None + else: + if unused is not None: + try: + unused(right) + except NameError: + pass + for right in right_stream: + unused(right) + +def synchronize(reference, stream, key=lambda x: x, unused=None): + """Just the right side part of leftjoin(), Nones included""" + for left, right in leftjoin(reference, stream, key, unused): + yield right + +def yield_source_csv_messages(cls, foreign_cls, csvreader, force_column=None): + """Yield all messages from one source CSV file. + """ + columns = list(cls.__table__.c) + column_names = csvreader.next() + # Assumptions: rows are in lexicographic order + # (taking numeric values as numbers of course) + # Assumptions about the order of columns: + # 1. It's the same in the table and in CSV + # 2. Primary key is at the beginning + # 3. First thing in the PK is the object id + # 4. Last thing in the PK is the language + # 5. 
Everything that follows is some translatable text
+    assert [cls.__table__.c[name] for name in column_names] == columns, ','.join(c.name for c in columns)
+    pk = columns[:len(cls.__table__.primary_key.columns)]
+    first_string_index = len(pk)
+    return _yield_csv_messages(foreign_cls, columns, first_string_index, csvreader, force_column=force_column)
+
+def _yield_csv_messages(foreign_cls, columns, first_string_index, csvreader, origin='source CSV', crc_value='OFFICIAL', force_column=None):
+    language_index = first_string_index - 1
+    assert 'language' in columns[language_index].name, columns[language_index].name
+    string_columns = columns[first_string_index:]
+    if force_column is not None:
+        assert len(string_columns) == 1
+        string_columns = [force_column]
+    for values in csvreader:
+        id = int(values[0])
+        messages = []
+        for string, column in zip(values[first_string_index:], string_columns):
+            message = Message(
+                foreign_cls.__name__,
+                id,
+                column.name,
+                string.decode('utf-8'),
+                column.type.length,
+                # pot_for_column only inspects the column, so the foreign
+                # class is enough here
+                pot=pot_for_column(foreign_cls, column, force_column is not None),
+                origin=origin,
+                official=True,
+                source_crc=crc_value,
+                language_id=int(values[language_index]),
+            )
+            messages.append(message)
+        messages.sort()
+        for message in messages:
+            yield message
+
+def yield_guessed_csv_messages(file):
+    """Yield messages from a CSV file, using the header to figure out what the data means.
+    """
+    csvreader = csv.reader(file, lineterminator='\n')
+    column_names = csvreader.next()
+    if column_names == 'language_id,table,id,column,source_crc,string'.split(','):
+        # A translation CSV
+        return yield_translation_csv_messages(file, True)
+    # Not a translation CSV, figure out what the columns mean
+    assert column_names[0].endswith('_id')
+    assert column_names[1] == 'local_language_id'
+    first_string_index = 2
+    foreign_singlename = column_names[0][:-len('_id')]
+    columns = [None] * len(column_names)
+    column_indexes = dict((name, i) for i, name in enumerate(column_names))
+    for foreign_cls in toplevel_classes:
+        if foreign_cls.__singlename__ == foreign_singlename:
+            break
+    else:
+        raise ValueError("Foreign key column name %s in %s doesn't correspond to a table" % (column_names[0], file))
+    for translation_class in foreign_cls.translation_classes:
+        for column in translation_class.__table__.c:
+            column_index = column_indexes.get(column.name)
+            if column_index is not None:
+                columns[column_index] = column
+    assert all([c is not None for c in columns[first_string_index:]])
+    return _yield_csv_messages(foreign_cls, columns, first_string_index, csvreader, origin=file.name, crc_value='UNKNOWN')
+
+def yield_translation_csv_messages(file, no_header=False):
+    """Yield messages from a translation CSV file
+    """
+    csvreader = csv.reader(file, lineterminator='\n')
+    if not no_header:
+        columns = csvreader.next()
+        assert columns == 'language_id,table,id,column,source_crc,string'.split(',')
+    for language_id, table, id, column, source_crc, string in csvreader:
+        yield Message(
+            table,
+            int(id),
+            column,
+            string.decode('utf-8'),
+            origin='target CSV',
+            source_crc=source_crc,
+            language_id=int(language_id),
+        )
+
+def pot_for_column(cls, column, summary=False):
+    """Translatable texts get categorized into different POT files to help
+    translators prioritize. The pots are:
+
+    - flavor: Flavor texts; here, strings from multiple versions are summarized
+    - ripped: Strings ripped from the games; translators for "official"
+      languages don't need to bother with these
+    - effects: Fanon descriptions of things; they usually use technical
+      language
+    - misc: Everything else; usually small texts
+
+    Set summary to True if this is a flavor summary column. The others are
+    determined by the column itself.
+    """
+    if summary:
+        return 'flavor'
+    elif column.info.get('ripped'):
+        return 'ripped'
+    elif column.name.endswith('effect'):
+        return 'effects'
+    else:
+        return 'misc'
+
+def number_replace(source, string):
+    numbers_iter = iter(number_re.findall(source))
+    next_number = lambda match: numbers_iter.next()
+    return re.sub(r'\{num\}', next_number, string)
+
+def match_to_source(source, *translations):
+    """Matches translated string(s) to source
+
+    The first translation whose source string matches the source message, or
+    whose CRC matches, or which is official, and which is not fuzzy, is used.
+    If there's no such translation, the first translation is used.
+
+    Returns (source, source string CRC, string for CSV file, exact match?)
+    If there are no translations, returns (source, None, None, None)
+
+    Handles translations where numbers have been replaced by {num}, if they
+    have source information.
+    """
+    first = True
+    best_crc = None
+    for translation in translations:
+        if translation is None:
+            continue
+        if translation.number_replacement:
+            current_string = number_replace(source.string, translation.string)
+            current_source = number_replace(source.string, translation.source)
+            current_crc = crc(current_source)
+        elif '{num}' in translation.string:
+            print (u'Warning: {num} appears in %s, but not marked for number replacement. Discarding!' % translation).encode('utf-8')
+            continue
+        else:
+            current_string = translation.string
+            current_source = translation.source
+            current_crc = translation.source_crc
+        if translation.fuzzy:
+            match = False
+        elif translation.official:
+            match = True
+        elif current_source:
+            match = source.string == current_source
+        else:
+            match = current_crc == crc(source.string)
+        if first or match:
+            best_string = current_string
+            best_crc = current_crc
+            best_message = translation
+        if match:
+            break
+        first = False
+    if best_crc:
+        return source, best_crc, best_string, match
+    else:
+        return source, None, None, None
+
+def merge_translations(source_stream, *translation_streams, **kwargs):
+    """For each source message, get its best translation from the translations.
+
+    Translation streams should be ordered by priority, highest to lowest.
+
+    Translations that don't match any source message aren't included; if an
+    `unused` callback is given in kwargs, they are passed to it.
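+
+    For example, a source message whose streams contain a matching translation
+    yields roughly (source, crc_of_source, translated_string, True), while a
+    source message with no usable translation yields
+    (source, None, None, None).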
+    """
+    source = tuple(source_stream)
+    streams = [
+        synchronize(source, t, key=lambda m: m.merge_key, unused=kwargs.get('unused'))
+        for t in translation_streams
+    ]
+    for messages in itertools.izip(source, *streams):
+        yield match_to_source(*messages)
diff --git a/pokedex/tests/test_translations.py b/pokedex/tests/test_translations.py
new file mode 100644
index 0000000..af96331
--- /dev/null
+++ b/pokedex/tests/test_translations.py
@@ -0,0 +1,183 @@
+# Encoding: UTF-8
+
+import csv
+
+from nose.tools import *
+
+from pokedex.db import translations, tables
+
+fake_version_names = (
+    'version_id,local_language_id,name',
+    '1,0,name1', '2,0,name2', '3,0,name3', '3,1,othername3',
+    )
+
+fake_translation_csv = (
+    'language_id,table,id,column,source_crc,string',
+    '0,Version,1,name,,name1',
+    '0,Version,2,name,,name2',
+    '0,Version,3,name,,name3',
+    '1,Version,3,name,,othername3',
+    )
+
+def test_yield_source_csv_messages():
+    check_version_message_stream(translations.yield_source_csv_messages(
+        tables.Version.names_table,
+        tables.Version,
+        csv.reader(iter(fake_version_names)),
+    ))
+
+def test_yield_guessed_csv_messages():
+    check_version_message_stream(translations.yield_guessed_csv_messages(
+        iter(fake_translation_csv),
+    ))
+
+def test_yield_translation_csv_messages():
+    check_version_message_stream(translations.yield_translation_csv_messages(
+        iter(fake_translation_csv),
+    ))
+
+def check_version_message_stream(messages):
+    messages = list(messages)
+    assert messages[0].string == 'name1'
+    assert messages[1].string == 'name2'
+    assert messages[2].string == 'name3'
+    assert messages[3].string == 'othername3'
+    for message in messages[:3]:
+        assert message.language_id == 0
+    assert messages[3].language_id == 1
+    for id, message in zip((1, 2, 3, 3), messages):
+        assert message.merge_key == ('Version', id, 'name'), message.merge_key
+
+def get_messages(*rows):
+    return list(translations.yield_translation_csv_messages(iter(rows), True))
+
+def test_merge_translations():
+    source = get_messages(
+        '0,Table,1,col,,none',
+        '0,Table,2,col,,new',
+        '0,Table,3,col,,existing',
+        '0,Table,4,col,,both',
+        '0,Table,5,col,,(gap)',
+        '0,Table,6,col,,new-bad',
+        '0,Table,7,col,,existing-bad',
+        '0,Table,8,col,,both-bad',
+        '0,Table,9,col,,new-bad-ex-good',
+        '0,Table,10,col,,new-good-ex-bad',
+        '0,Table,11,col,,(gap)',
+        '0,Table,12,col,,"Numbers: 1, 2, and 003"',
+        '0,Table,13,col,,"Numbers: 3, 2, and 001"',
+    )
+    new = get_messages(
+        '0,Table,2,col,%s,new' % translations.crc('new'),
+        '0,Table,4,col,%s,new' % translations.crc('both'),
+        '0,Table,6,col,%s,new' % translations.crc('----'),
+        '0,Table,8,col,%s,new' % translations.crc('----'),
+        '0,Table,9,col,%s,new' % translations.crc('----'),
+        '0,Table,10,col,%s,new' % translations.crc('new-good-ex-bad'),
+        '0,Table,12,col,%s,{num} {num} {num}' % translations.crc('Numbers: {num}, {num}, and {num}'),
+        '0,Table,13,col,%s,{num} {num} {num}' % translations.crc('----'),
+        '0,Table,100,col,%s,unused' % translations.crc('----'),
+    )
+    new[-3].number_replacement = True
+    new[-3].source = 'Numbers: 1, 2, and 003'
+    new[-2].number_replacement = True
+    new[-2].source = '----'
+    existing = get_messages(
+        '0,Table,3,col,%s,existing' % translations.crc('existing'),
+        '0,Table,4,col,%s,existing' % translations.crc('both'),
+        '0,Table,7,col,%s,existing' % translations.crc('----'),
+        '0,Table,8,col,%s,existing' % translations.crc('----'),
+        '0,Table,9,col,%s,existing' % translations.crc('new-bad-ex-good'),
+        '0,Table,10,col,%s,existing' % translations.crc('----'),
'0,Table,100,col,%s,unused' % translations.crc('----'), + ) + expected_list = ( + ('none', None, None), + ('new', True, 'new'), + ('existing', True, 'existing'), + ('both', True, 'new'), + ('(gap)', None, None), + ('new-bad', False, 'new'), + ('existing-bad', False, 'existing'), + ('both-bad', False, 'new'), + ('new-bad-ex-good', True, 'existing'), + ('new-good-ex-bad', True, 'new'), + ('(gap)', None, None), + ('Numbers: 1, 2, and 003', True, '1 2 003'), + ('Numbers: 3, 2, and 001', False, '3 2 001'), + ) + unused = [] + result_stream = list(translations.merge_translations(source, new, [], existing, unused=unused.append)) + for result, expected in zip(result_stream, expected_list): + res_src, res_crc, res_str, res_match = result + exp_src, exp_match, exp_str = expected + print result, expected + assert res_src.string == exp_src + assert res_str == exp_str, (res_str, exp_str) + if exp_match is None: + assert res_crc is None + elif exp_match is True: + assert res_crc == translations.crc(res_src.string) + elif exp_match is False: + assert res_crc == translations.crc('----') + assert res_match == exp_match + print 'unused:', unused + for message in unused: + assert message.string == 'unused' + assert message.id == 100 + +def test_merge(): + check_merge((0, 1, 2, 3)) + check_merge((0, 1), (2, 3)) + check_merge((2, 3), (0, 1)) + check_merge((0, 2), (1, 3)) + check_merge((0, 3), (1, 2)) + check_merge((0, 1), (2, 3), (2, 3)) + +def check_merge(*sequences): + merged = list(translations.Merge(*sequences)) + concatenated = [val for seq in sequences for val in seq] + assert merged == sorted(concatenated) + +def test_merge_dynamic_add(): + merge = translations.Merge((1, 2, 3)) + def adder(): + for val in (1, 2, 3): + yield val + merge.add_iterator([4]) + merge.add_iterator(adder()) + assert tuple(merge) == (1, 1, 2, 2, 3, 3, 4, 4, 4) + +def test_merge_adjacent(): + messages = get_messages( + '0,Table,1,col,,strA', + '0,Table,2,col,,strB', + '0,Table,2,col,,strC', + '0,Table,2,col,,strB', + '0,Table,2,col,,strD', + '0,Table,3,col,,strE', + ) + result = [m.string for m in translations.merge_adjacent(messages)] + expected = ['strA', 'strB\n\nstrC\n\nstrD', 'strE'] + assert result == expected + +def test_leftjoin(): + check_leftjoin([], [], [], []) + check_leftjoin([], [1], [], [1]) + check_leftjoin([], [1, 2], [], [1, 2]) + check_leftjoin([1], [], [(1, None)], []) + check_leftjoin([1], [1], [(1, 1)], []) + check_leftjoin([1], [2], [(1, None)], [2]) + check_leftjoin([1, 2], [1], [(1, 1), (2, None)], []) + check_leftjoin([1, 2], [1, 2], [(1, 1), (2, 2)], []) + check_leftjoin([1], [1, 2], [(1, 1)], [2]) + check_leftjoin([1, 2], [1, 3], [(1, 1), (2, None)], [3]) + check_leftjoin([1, 2, 3], [1, 3], [(1, 1), (2, None), (3, 3)], []) + check_leftjoin([1, 2, 2, 3], [1, 3], [(1, 1), (2, None), (2, None), (3, 3)], []) + check_leftjoin([1, 2, 2, 3], [2, 2, 2], [(1, None), (2, 2), (2, 2), (3, None)], [2]) + +def check_leftjoin(seqa, seqb, expected, expected_unused): + unused = [] + result = list(translations.leftjoin(seqa, seqb, unused=unused.append)) + assert result == list(expected) + assert unused == list(expected_unused) -- 2.7.4
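
Usage sketch (illustrative, separate from the patch itself): one way to drive
the new module from Python once the patch is applied. The language code 'de'
and the printed report are arbitrary choices.

    from pokedex.db import translations

    # Use the default CSV directory and its translations/ subdirectory.
    transl = translations.Translations()

    # Merge the English source messages with the current German translation
    # CSV, reporting which strings still match their source exactly.
    for source, source_crc, string, exact in translations.merge_translations(
            transl.source, transl.yield_target_messages('de')):
        if string:
            print source.merge_key, 'exact' if exact else 'needs review'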