Reading, merging, and writing translations
author     Petr Viktorin <encukou@gmail.com>
           Wed, 6 Apr 2011 22:28:54 +0000 (01:28 +0300)
committer  Petr Viktorin <encukou@gmail.com>
           Mon, 11 Apr 2011 14:03:27 +0000 (17:03 +0300)
bin/poupdate [new file with mode: 0755]
pokedex/db/translations.py [new file with mode: 0755]
pokedex/tests/test_translations.py [new file with mode: 0644]

diff --git a/bin/poupdate b/bin/poupdate
new file mode 100755 (executable)
index 0000000..06ccdf2
--- /dev/null
@@ -0,0 +1,368 @@
+#! /usr/bin/env python
+# Encoding: UTF-8
+
+u"""Creation and loading of GNU Gettext language files.
+
+poupdate [options] [file1.csv file2.csv ...]
+
+Use this script to:
+- Create .pot files (in pokedex/i18n/)
+- Update the .po files (in pokedex/i18n/<lang>)
+- Update the pokedex .csv files (in pokedex/data/csv/translations/)
+
+To make .po files for a new language, make sure the language is in the
+database, make a directory for it in pokedex/i18n/, and run this script.
+
+You can also give one or more translation CSVs as arguments.
+These are in the same format as veekun's main database CSVs, for example
+pokedex/data/csv/ability_prose.csv. Be sure to set the correct language
+ID (which implies the language must be in the database).
+Also be sure to have the correct column order: first an appropriately named
+foreign key, then local_language_id, and then the text columns.
+
+"""
+
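+# Example invocations (sketches; the CSV file name is made up):
+#     poupdate                  # rebuild the pots, pos, and translation CSVs
+#     poupdate -l cs fixes.csv  # update the Czech files, merging in fixes.csv
+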
+# Everything related to Gettext files, and the CLI interface, is here.
+# General message handling and CSV I/O is in the pokedex library.
+
+# Notes on how we use the PO format:
+# The source information is stored in the occurrences fields, using
+# "MappedClass.column_name" as the file and the object ID as the line number.
+# This is used as the message key, instead of the source string, so it's
+# important not to discard location information. It also means that "obsolete"
+# and "fuzzy" mean pretty much the same thing in our context.
+#
+# Also note that a pot file is just a po file with all strings untranslated.
+# So some functions here will work on either.
+#
+# Gettext context (msgctxt) is written to the files so that tools don't merge
+# unrelated strings together. It is ignored when reading the PO files.
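+#
+# An entry might then look like this (an illustrative sketch; the ID is
+# made up):
+#
+#     #: AbilityChangelog.effect:42
+#     msgctxt "AbilityChangelog.effect"
+#     msgid "Has no overworld effect"
+#     msgstr ""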
+
+# Also of note, "polib" means "(do) kiss!" in Czech.
+
+import os
+import re
+import sys
+from datetime import datetime
+from optparse import OptionParser
+from collections import defaultdict
+
+import pkg_resources
+
+from pokedex.db import tables, translations
+from pokedex.defaults import get_default_csv_dir
+
+try:
+    import polib
+except ImportError:
+    if __name__ == '__main__':
+        exit('This utility needs polib installed.\n$ pip install polib')
+    raise
+
+number_replacement_flag = '-pokedex-number-replacement'
+
+default_gettext_directory = pkg_resources.resource_filename('pokedex', 'i18n')
+
+mapped_class_dict = dict((c.__name__, c) for c in tables.mapped_classes)
+for cls in tables.mapped_classes:
+    mapped_class_dict.update(dict((c.__name__, cls) for c in cls.translation_classes))
+
+class PokedexPot(polib.POFile):
+    def __init__(self, name):
+        super(PokedexPot, self).__init__()
+        self.metadata = {
+                'Project-Id-Version': 'pokedex-%s 0.1' % name,
+                'Report-Msgid-Bugs-To': 'encukou@gmail.com',
+                'POT-Creation-Date': datetime.now().isoformat(),
+                'PO-Revision-Date': 'YEAR-MO-DA HO:MI+ZONE',
+                'MIME-Version': '1.0',
+                'Content-Type': 'text/plain; charset=utf-8',
+                'Content-Transfer-Encoding': '8bit',
+                'Generated-By': "The pokedex",
+            }
+        self.seen_entries = {}
+
+    def append(self, entry):
+        """Append an entry. POEntries that only differ in numbers are merged.
+
+        For example "Route 1", "Route 2", etc. are replaced by a single
+        "Route {num}".
+
+        Multiple numbers might be replaced, for example in "{num}--{num}
+        different Unown caught"
+
+        Entries without numbers are merged as well (e.g. "Has no overworld
+        effect" appears quite a few times in AbilityChangelog).
+        """
+        replaced = translations.number_re.sub('{num}', entry.msgid)
+        try:
+            common_entry = self.seen_entries[(entry.msgctxt, replaced)]
+        except KeyError:
+            self.seen_entries[(entry.msgctxt, replaced)] = entry
+        else:
+            common_entry.occurrences += entry.occurrences
+            # The number is only replaced once a second entry turns up, so we
+            # get "Route {num}", but keep "Porygon2": there's no Porygon3.
+            common_entry.msgid = replaced
+            common_entry.msgstr = translations.number_re.sub('{num}', common_entry.msgstr)
+            if replaced != entry.msgid and number_replacement_flag not in common_entry.flags:
+                common_entry.flags.append(number_replacement_flag)
+            return
+        self += [entry]
+
+class PotDict(dict):
+    """A defaultdict of pot files"""
+    def __missing__(self, name):
+        pot = PokedexPot(name)
+        self[name] = pot
+        return pot
+
+def yield_po_messages(pos):
+    """Yield messages from all given .po files
+    """
+    merger = translations.Merge()
+    for po in pos.values():
+        merger.add_iterator(_yield_one_po_messages(po, merger))
+    return merger
+
+def entry_sort_key(entry):
+    try:
+        cls_col, line = entry.occurrences[0]
+    except IndexError:
+        return
+    else:
+        if line:
+            classname, col = cls_col.split('.')
+            fuzzy = entry.obsolete or 'fuzzy' in entry.flags
+            try:
+                cls = mapped_class_dict[classname]
+            except KeyError:
+                # Renamed table?
+                print 'Warning: Unknown class %s' % classname
+                return '', int(line), col, fuzzy
+            else:
+                return cls.__name__, int(line), col, fuzzy
+
+def _yield_one_po_messages(pofile, merger):
+    # Yield messages from one po file
+    #
+    # Messages in our po files are ordered by the first occurrence.
+    # The occurrences of a single message are also ordered.
+    # So just merge all the subsequences as we go
+    for entry in sorted(pofile, key=entry_sort_key):
+        if entry.msgstr:
+            fuzzy = (entry.obsolete or 'fuzzy' in entry.flags)
+            messages = []
+            for occurrence in entry.occurrences:
+                cls_colname, id = occurrence
+                if id:
+                    clsname, colname = cls_colname.split('.')
+                    cls = mapped_class_dict[clsname]
+                    messages.append(translations.Message(
+                            cls.__name__,
+                            int(id),
+                            colname,
+                            entry.msgstr,
+                            source=entry.msgid,
+                            number_replacement=number_replacement_flag in entry.flags,
+                            origin='PO file',
+                            fuzzy=fuzzy,
+                        ))
+            if messages[1:]:
+                # Spawn extra iterators before yielding
+                merger.add_iterator(messages[1:])
+            if messages:
+                yield messages[0]
+
+def create_pots(source, *translation_streams):
+    """Convert an iterator of Messages to a dictionary of pot/po files
+
+    If translation streams are given, they're merged, and any exact matches
+    are put in the po files: pass streams to build .po files, and none at all
+    to build .pot templates.
+    """
+    obsolete = []
+    pots = PotDict()
+    merged = translations.merge_translations(source, *translation_streams, unused=obsolete.append)
+    for source, sourcehash, string, exact in merged:
+        ctxt = '.'.join((source.cls, source.colname))
+        entry = polib.POEntry(
+                msgid=source.string,
+                occurrences=[(ctxt, source.id)],
+                msgctxt=ctxt,
+            )
+        if string:
+            entry.msgstr = string
+            if not exact:
+                entry.flags.append('fuzzy')
+        pots[source.pot].append(entry)
+    for message in obsolete:
+        ctxt = '.'.join((message.cls, message.colname))
+        entry = polib.POEntry(
+                msgid=message.source or '???',
+                occurrences=[(ctxt, message.id)],
+                msgctxt=ctxt,
+                obsolete=True,
+            )
+        # File the entry in a pot ('misc' when the message doesn't say which)
+        # so that obsolete entries actually end up in the output
+        pots[message.pot or 'misc'].append(entry)
+    return pots
+
+def save_pots(pots, gettext_directory=default_gettext_directory):
+    """Save pot files to a directory."""
+    for name, pot in pots.items():
+        pot.save(os.path.join(gettext_directory, 'pokedex-%s.pot' % name))
+
+def save_pos(pos, lang, gettext_directory=default_gettext_directory):
+    """Save po files to the appropriate directory."""
+    for name, po in pos.items():
+        po.save(os.path.join(gettext_directory, lang, 'pokedex-%s.po' % name))
+
+def read_pots(directory=default_gettext_directory, extension='.pot'):
+    """Read all files from the given directory with the given extension as pofiles
+
+    Works on pos or pots.
+    """
+    pots = {}
+    for filename in os.listdir(directory):
+        basename, ext = os.path.splitext(filename)
+        if ext == extension:
+            pots[basename] = polib.pofile(os.path.join(directory, filename))
+
+    return pots
+
+def all_langs(gettext_directory=default_gettext_directory):
+    return [
+            d for d in os.listdir(gettext_directory)
+            if os.path.isdir(os.path.join(gettext_directory, d))
+        ]
+
+def merge_pos(transl, lang, language_directory):
+    """Update all po files for the given language
+
+    Takes into account the source, the official translations from the database,
+    the existing PO files, and the current translation CSV, in that order.
+
+    Returns a name -> pofile dict
+    """
+    return create_pots(
+            transl.source,
+            transl.official_messages(lang),
+            yield_po_messages(pos=read_pots(language_directory, '.po')),
+            transl.yield_target_messages(lang),
+        )
+
+def bar(fraction, size, done_char='=', split_char='|', notdone_char='-'):
+    """Build an ASCII art progress bar
+    """
+    size -= 1
+    if fraction == 1:
+        split_char = done_char
+    completed = int(round(size * fraction))
+    bar = [done_char] * completed
+    bar.append(split_char)
+    bar += notdone_char * (size - completed)
+    return ''.join(bar)
+
+def print_stats(pos):
+    """Print out some fun stats about a set of po files
+    """
+    template = u"{0:>10}: {1:4}/{2:4} {3:6.2f}% [{4}]"
+    total_translated = 0
+    total = 0
+    for name, po in pos.items():
+        num_translated = len(po.translated_entries())
+        total_translated += num_translated
+        fraction_translated = 1. * num_translated / len(po)
+        total += len(po)
+        print template.format(
+                name,
+                num_translated,
+                len(po),
+                100 * fraction_translated,
+                bar(fraction_translated, 47),
+            ).encode('utf-8')
+    fraction_translated = 1. * total_translated / total
+    print template.format(
+            'Total',
+            total_translated,
+            total,
+            100 * fraction_translated,
+            bar(fraction_translated, 47),
+        ).encode('utf-8')
+
+
+if __name__ == '__main__':
+    parser = OptionParser(__doc__)
+
+    parser.add_option('-l', '--langs', dest='langs',
+            help="List of languages to handle, separated by commas (example: -l 'en,de,ja') (default: all in gettext directory)")
+    parser.add_option('-P', '--no-pots', dest='pots', action='store_false', default=True,
+            help='Do not create POT files (templates)')
+    parser.add_option('-p', '--no-pos', dest='pos', action='store_false', default=True,
+            help='Do not update PO files (message catalogs)')
+
+    parser.add_option('-c', '--no-csv', dest='csv', action='store_false', default=True,
+            help='Do not update pokedex translations files')
+
+    parser.add_option('-d', '--directory', dest='directory',
+            help='Veekun data directory')
+    parser.add_option('-L', '--source-language', dest='source_lang',
+            help="Source language identifier (default: 'en')")
+
+    parser.add_option('-g', '--gettext-dir', dest='gettext_directory', default=default_gettext_directory,
+            help='Gettext directory (default: pokedex/i18n/)')
+
+    parser.add_option('-q', '--quiet', dest='verbose', default=True, action='store_false',
+            help="Don't print what's going on")
+
+    options, arguments = parser.parse_args()
+
+    transl = translations.Translations.from_parsed_options(options)
+
+    gettext_directory = options.gettext_directory
+    if options.pots:
+        if options.verbose:
+            print 'Creating pots in', gettext_directory
+        save_pots(create_pots(transl.source), gettext_directory=gettext_directory)
+
+    if options.pos or options.csv:
+        # Merge in CSV files from command line
+        csv_streams = defaultdict(translations.Merge)
+        for argument in arguments:
+            # Add each message in its own stream, to sort them.
+            file = open(argument, 'rb')
+            with file:
+                for message in translations.yield_guessed_csv_messages(file):
+                    lang = transl.language_identifiers[message.language_id]
+                    csv_streams[lang].add_iterator([message])
+        streams = defaultdict(list)
+        for lang, stream in csv_streams.items():
+            streams[lang].append(stream)
+
+        # Merge in the PO files
+        if options.langs:
+            langs = options.langs.split(',')
+        else:
+            langs = all_langs(gettext_directory)
+
+        for lang in langs:
+            language_directory = os.path.join(gettext_directory, lang)
+            if options.verbose:
+                print 'Merging translations for %s in %s' % (lang, language_directory)
+            pos = merge_pos(transl, lang, language_directory)
+
+            if options.pos:
+                if options.verbose:
+                    print 'Writing POs for %s' % lang
+                save_pos(pos, lang, gettext_directory=gettext_directory)
+
+                if options.verbose:
+                    print_stats(pos)
+
+            streams[lang].append(yield_po_messages(pos))
+
+    if options.csv:
+        for lang, lang_streams in streams.items():
+            if options.verbose:
+                print "Merging %s translation stream/s for '%s'" % (len(lang_streams), lang)
+            existing_messages = list(transl.yield_target_messages(lang))
+            lang_streams.append(existing_messages)
+            transl.write_translations(lang, *lang_streams)
diff --git a/pokedex/db/translations.py b/pokedex/db/translations.py
new file mode 100755 (executable)
index 0000000..5d1bda6
--- /dev/null
@@ -0,0 +1,659 @@
+#! /usr/bin/env python
+u"""General handling of translations
+
+The general idea is to get messages from somewhere: the source pokedex CSVs,
+or the translation CSVs, etc., then merge them together in some way, and shove
+them into the database.
+
+If a message is translated, it has a source string attached to it, with the
+original English version. Or at least it has a CRC of the original.
+When that doesn't match, it means the English string changed and the
+translation has to be updated.
+Also this is why we can't dump translations from the database: there's no
+original string info.
+
+Some complications:
+
+Flavor text is so repetitive that we take the strings from all the versions,
+separate the unique ones by blank lines, let translators work on that, and
+then put the result in the flavor_summary tables.
+
+Route names and other repetitive numeric things are replaced by e.g.
+"Route {num}", so translators only have to work on each set once.
+
+"""
+
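+# Typical in-process use, as a sketch (assuming 'cs' is a language present in
+# the database):
+#
+#     transl = Translations()
+#     transl.write_translations('cs', transl.yield_target_messages('cs'))
+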
+import binascii
+import csv
+import heapq
+import itertools
+import os
+import re
+import sys
+from collections import defaultdict
+
+from pokedex.db import tables
+from pokedex.defaults import get_default_csv_dir
+
+default_source_lang = 'en'
+
+# Top-level classes we want translations for: in order, and by name
+# These are all mapped_classes that have translatable texts and aren't summarized
+toplevel_classes = []
+toplevel_class_by_name = {}
+
+# summary_map[pokemon_prose]['flavor_summary'] == PokemonFlavorTexts
+summary_map = {}
+
+# translation_class_by_column[class_name, column_name] == translation_class
+translation_class_by_column = {}
+
+for cls in tables.mapped_classes:
+    try:
+        summary_class, col = cls.summary_column
+    except AttributeError:
+        if cls.translation_classes:
+            toplevel_classes.append(cls)
+            toplevel_class_by_name[cls.__name__] = cls
+            for translation_class in cls.translation_classes:
+                for column in translation_class.__table__.c:
+                    translation_class_by_column[cls, column.name] = translation_class
+    else:
+        summary_map.setdefault(summary_class, {})[col] = cls
+
+number_re = re.compile("[0-9]+")
+
+def crc(string):
+    """Return a hash to we use in translation CSV files"""
+    return "%08x" % (binascii.crc32(string.encode('utf-8')) & 0xffffffff)
+    # Two special values are also used in source_crc:
+    # UNKNOWN: no source string was available
+    # OFFICIAL: an official string from the main database
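+    # For instance, crc(u'') == '00000000', since the CRC-32 of the empty
+    # string is zero.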
+
+class Message(object):
+    """Holds all info about a translatable or translated string
+
+    cls: Name of the mapped class the message belongs to
+    id: The id of the thing the message belongs to
+    colname: name of the database column
+    strings: A list of strings in the message, usually of length 1.
+
+    Optional attributes (None if not set):
+    colsize: Max length of the database column
+    source: The string this was translated from
+    number_replacement: True if this is a translation with {num} placeholders
+    pot: Name of the pot the message goes to (see pot_for_column)
+    source_crc: CRC of the source
+    origin: Some indication of where the string came from (CSV, PO, ...)
+    fuzzy: True for fuzzy translations
+    language_id: ID of the language
+    official: True if this is a known-good translation
+    """
+    __slots__ = 'cls id colname strings colsize source number_replacement pot source_crc origin fuzzy language_id official'.split()
+    def __init__(self, cls, id, colname, string,
+            colsize=None, source=None, number_replacement=None, pot=None,
+            source_crc=None, origin=None, fuzzy=None, language_id=None,
+            official=None,
+        ):
+        self.cls = cls
+        self.id = id
+        self.colname = colname
+        self.strings = [string]
+        self.colsize = colsize
+        self.source = source
+        self.number_replacement = number_replacement
+        self.pot = pot
+        self.source_crc = source_crc
+        if source and not source_crc:
+            self.source_crc = crc(source)
+        self.origin = origin
+        self.fuzzy = fuzzy
+        self.language_id = language_id
+        self.official = official
+
+    def merge(self, other):
+        """Merge two messages, as required for flavor text summarizing
+        """
+        assert self.merge_key == other.merge_key
+        for string in other.strings:
+            if string not in self.strings:
+                self.strings.append(string)
+        self.colsize = self.colsize or other.colsize
+        self.pot = self.pot or other.pot
+        self.source = None
+        self.source_crc = None
+        self.number_replacement = None
+
+    @property
+    def string(self):
+        return '\n\n'.join(self.strings)
+
+    @property
+    def merge_key(self):
+        return self.cls, self.id, self.colname
+
+    @property
+    def sort_key(self):
+        return self.merge_key, self.language_id, self.fuzzy
+
+    @property
+    def eq_key(self):
+        return self.sort_key, self.strings
+
+    def __eq__(self, other): return self.eq_key == other.eq_key
+    def __ne__(self, other): return self.eq_key != other.eq_key
+    def __gt__(self, other): return self.sort_key > other.sort_key
+    def __lt__(self, other): return self.sort_key < other.sort_key
+    def __ge__(self, other): return self.sort_key >= other.sort_key
+    def __le__(self, other): return self.sort_key <= other.sort_key
+
+    def __unicode__(self):
+        string = '"%s"' % self.string
+        if len(string) > 20:
+            string = string[:15] + u'"...'
+        template = u'<Message from {self.origin} for {self.cls}.{self.colname}:{self.id} -- {string}>'
+        return template.format(self=self, string=string)
+
+    def __str__(self):
+        return unicode(self).encode('utf-8')
+
+    def __repr__(self):
+        return unicode(self).encode('utf-8')
+
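+# A Message might be built like this (a sketch with made-up values):
+#
+#     Message('Ability', 1, 'effect', u'Some effect text.', origin='example')
+#
+# Two Messages merge (for flavor summaries) when they share a merge_key,
+# i.e. the same (cls, id, colname).
+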
+class Translations(object):
+    """Data and opertaions specific to a location on disk (and a source language)
+    """
+    def __init__(self, source_lang=default_source_lang, csv_directory=None, translation_directory=None):
+        if csv_directory is None:
+            csv_directory = get_default_csv_dir()
+
+        if translation_directory is None:
+            translation_directory = os.path.join(csv_directory, 'translations')
+
+        self.source_lang = source_lang
+        self.csv_directory = csv_directory
+        self.translation_directory = translation_directory
+
+        self.language_ids = {}
+        self.language_identifiers = {}
+        self.official_langs = []
+        for row in self.reader_for_class(tables.Language, reader_class=csv.DictReader):
+            self.language_ids[row['identifier']] = int(row['id'])
+            self.language_identifiers[int(row['id'])] = row['identifier']
+            if row['official'] and int(row['official']):
+                self.official_langs.append(row['identifier'])
+
+        self.source_lang_id = self.language_ids[self.source_lang]
+
+    @classmethod
+    def from_parsed_options(cls, options):
+        return cls(options.source_lang, options.directory)
+
+    @property
+    def source(self):
+        """All source (i.e. English) messages
+        """
+        return self.official_messages(self.source_lang)
+
+    def official_messages(self, lang):
+        """All official messages (i.e. from main database) for the given lang
+        """
+        # Cached as tuples, since they're used pretty often
+        lang_id = self.language_ids[lang]
+        try:
+            return self._sources[lang_id]
+        except AttributeError:
+            self._sources = {}
+            for message in self.yield_source_messages():
+                self._sources.setdefault(message.language_id, []).append(message)
+            self._sources = dict((k, tuple(merge_adjacent(v))) for k, v in self._sources.items())
+            return self.official_messages(lang)
+        except KeyError:
+            # Looks like there are no messages in the DB for this language
+            # This should only happen for non-official languages
+            assert lang not in self.official_langs
+            return ()
+
+    def write_translations(self, lang, *streams):
+        """Write a translation CSV containing messages from streams.
+
+        Streams should be ordered by priority, from highest to lowest.
+
+        Any official translations (from the main database) are added automatically.
+        """
+        writer = self.writer_for_lang(lang)
+
+        writer.writerow('language_id table id column source_crc string'.split())
+
+        messages = merge_translations(self.source, self.official_messages(lang), *streams)
+
+        warnings = {}
+        for source, sourcehash, string, exact in messages:
+            if string and sourcehash != 'OFFICIAL':
+                utf8len = len(string.encode('utf-8'))
+                if source.colsize and utf8len > source.colsize:
+                    key = source.cls, source.colname
+                    warnings[key] = max(warnings.get(key, (0,)), (utf8len, source, string))
+                else:
+                    writer.writerow((
+                            self.language_ids[lang],
+                            source.cls,
+                            source.id,
+                            source.colname,
+                            sourcehash,
+                            string.encode('utf-8'),
+                        ))
+        for utf8len, source, string in warnings.values():
+            template = u'Error: {size}B value for {colsize}B column! {key[0]}.{key[2]}:{key[1]}: {string}'
+            warning = template.format(
+                    key=source.merge_key,
+                    string=string,
+                    size=utf8len,
+                    colsize=source.colsize,
+                )
+            if len(warning) > 79:
+                warning = warning[:76] + u'...'
+            print warning.encode('utf-8')
+
+    def reader_for_class(self, cls, reader_class=csv.reader):
+        tablename = cls.__table__.name
+        csvpath = os.path.join(self.csv_directory, tablename + '.csv')
+        return reader_class(open(csvpath, 'rb'), lineterminator='\n')
+
+    def writer_for_lang(self, lang):
+        csvpath = os.path.join(self.translation_directory, '%s.csv' % lang)
+        return csv.writer(open(csvpath, 'wb'), lineterminator='\n')
+
+    def yield_source_messages(self, language_id=None):
+        """Yield all messages from source CSV files
+
+        Messages from all languages are returned. The messages are not ordered
+        properly, but splitting the stream by language (and filtering results
+        by merge_adjacent) will produce proper streams.
+        """
+        if language_id is None:
+            language_id = self.source_lang_id
+
+        for cls in sorted(toplevel_classes, key=lambda c: c.__name__):
+            streams = []
+            for translation_class in cls.translation_classes:
+                streams.append(yield_source_csv_messages(
+                        translation_class,
+                        cls,
+                        self.reader_for_class(translation_class),
+                    ))
+                try:
+                    colmap = summary_map[translation_class]
+                except KeyError:
+                    pass
+                else:
+                    for colname, summary_class in colmap.items():
+                        column = translation_class.__table__.c[colname]
+                        streams.append(yield_source_csv_messages(
+                                summary_class,
+                                cls,
+                                self.reader_for_class(summary_class),
+                                force_column=column,
+                            ))
+            for message in Merge(*streams):
+                yield message
+
+    def yield_target_messages(self, lang):
+        """Yield messages from the data/csv/translations/<lang>.csv file
+        """
+        path = os.path.join(self.csv_directory, 'translations', '%s.csv' % lang)
+        try:
+            file = open(path, 'rb')
+        except IOError:
+            return ()
+        return yield_translation_csv_messages(file)
+
+    def yield_all_translations(self):
+        stream = Merge()
+        for lang in self.language_identifiers.values():
+            stream.add_iterator(self.yield_target_messages(lang))
+        return (message for message in stream if not message.official)
+
+    def get_load_data(self, langs=None):
+        """Yield (translation_class, data for INSERT) pairs for loading into the DB
+
+        langs is either a list of language identifiers or None
+        """
+        if langs is None:
+            langs = self.language_identifiers.values()
+        stream = Merge()
+        for lang in self.language_identifiers.values():
+            stream.add_iterator(self.yield_target_messages(lang))
+        stream = (message for message in stream if not message.official)
+        count = 0
+        class GroupDict(dict):
+            """Dict to automatically set the foreign_id and local_language_id for new items
+            """
+            def __missing__(self, key):
+                # Depends on `cls` and `translation_class` from the enclosing
+                # loop below
+                id, language_id = key
+                data = self[key] = defaultdict(lambda: None)
+                column_names = (c.name for c in translation_class.__table__.columns)
+                data.update(dict.fromkeys(column_names))
+                data.update({
+                        '%s_id' % cls.__singlename__: id,
+                        'local_language_id': language_id,
+                    })
+                return data
+        # Nested dict:
+        # translation_class -> (lang, id) -> column -> value
+        everything = defaultdict(GroupDict)
+        # Group by object so we always have all of the messages for one DB row
+        for (cls_name, id), group in group_by_object(stream):
+            cls = toplevel_class_by_name[cls_name]
+            for message in group:
+                translation_class = translation_class_by_column[cls, message.colname]
+                key = id, message.language_id
+                colname = str(message.colname)
+                everything[translation_class][key][colname] = message.string
+                count += 1
+            if count > 1000:
+                for translation_class, key_data in everything.items():
+                    yield translation_class, key_data.values()
+                count = 0
+                everything.clear()
+        for translation_class, data_dict in everything.items():
+            yield translation_class, data_dict.values()
+
+def group_by_object(stream):
+    """Group stream by object
+
+    Yields ((class name, object ID), (list of messages)) pairs.
+    """
+    stream = iter(stream)
+    current = stream.next()
+    current_key = current.cls, current.id
+    group = [current]
+    for message in stream:
+        if (message.cls, message.id) != current_key:
+            yield current_key, group
+            group = []
+        group.append(message)
+        current = message
+        current_key = current.cls, current.id
+    yield current_key, group
+
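+# Sketch: a stream keyed ('Ability', 1), ('Ability', 1), ('Ability', 2)
+# yields (('Ability', 1), [the first two messages]) and then
+# (('Ability', 2), [the last message]).
+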
+class Merge(object):
+    """Merge several sorted iterators together
+
+    Additional iterators may be added at any time with add_iterator.
+    None entries among the initial iterators are ignored.
+    If the same value appears in several iterators, the output will contain
+    duplicates.
+    """
+    def __init__(self, *iterators):
+        self.next_values = []
+        for iterator in iterators:
+            if iterator is not None:
+                self.add_iterator(iterator)
+
+    def add_iterator(self, iterator):
+        iterator = iter(iterator)
+        try:
+            value = iterator.next()
+        except StopIteration:
+            return
+        else:
+            heapq.heappush(self.next_values, (value, iterator))
+
+    def __iter__(self):
+        return self
+
+    def next(self):
+        if self.next_values:
+            value, iterator = heapq.heappop(self.next_values)
+            self.add_iterator(iterator)
+            return value
+        else:
+            raise StopIteration
+
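+# For example, list(Merge([1, 3], [2, 4])) == [1, 2, 3, 4]; see test_merge in
+# pokedex/tests/test_translations.py for more.
+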
+def merge_adjacent(gen):
+    """Merge adjacent messages that compare equal"""
+    gen = iter(gen)
+    last = gen.next()
+    for this in gen:
+        if this.merge_key == last.merge_key:
+            last.merge(this)
+        elif last < this:
+            yield last
+            last = this
+        else:
+            raise AssertionError('Bad order, %s > %s' % (last, this))
+    yield last
+
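+# Sketch: three messages with equal merge_keys and strings 'a', 'b', 'a'
+# collapse into a single message whose string is 'a\n\nb'
+# (see test_merge_adjacent).
+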
+def leftjoin(left_stream, right_stream, key=lambda x: x, unused=None):
+    """A "left join" operation on sorted iterators
+
+    Yields (left, right) pairs, where left comes from left_stream and right
+    is the corresponding item from right_stream, or None.
+
+    Note that if there are duplicates in right_stream, you won't get duplicate
+    rows for them.
+
+    If given, unused should be a one-arg function that will get called on all
+    unused items in right_stream.
+    """
+    left_stream = iter(left_stream)
+    right_stream = iter(right_stream)
+    try:
+        right = right_stream.next()
+        for left in left_stream:
+            while right and key(left) > key(right):
+                if unused is not None:
+                    unused(right)
+                right = right_stream.next()
+            if key(left) == key(right):
+                yield left, right
+                # Unbind left so the StopIteration handler below knows this
+                # one was already yielded
+                del left
+                right = right_stream.next()
+            else:
+                yield left, None
+    except StopIteration:
+        # right_stream ran out. If `left` is still bound it hasn't been
+        # yielded yet (a matched left is del'd above), so yield it first,
+        # then flush the rest of left_stream.
+        try:
+            yield left, None
+        except NameError:
+            pass
+        for left in left_stream:
+            yield left, None
+    else:
+        # left_stream ran out; report any remaining right items as unused.
+        if unused is not None:
+            try:
+                unused(right)
+            except NameError:
+                pass
+            for right in right_stream:
+                unused(right)
+
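+# Sketch: with u = [], list(leftjoin([1, 2], [2, 3], unused=u.append)) gives
+# [(1, None), (2, 2)], and u == [3] afterwards.
+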
+def synchronize(reference, stream, key=lambda x: x, unused=None):
+    """Just the right side part of leftjoin(), Nones included"""
+    for left, right in leftjoin(reference, stream, key, unused):
+        yield right
+
+def yield_source_csv_messages(cls, foreign_cls, csvreader, force_column=None):
+    """Yield all messages from one source CSV file.
+    """
+    columns = list(cls.__table__.c)
+    column_names = csvreader.next()
+    # Assumptions: rows are in lexicographic order
+    #  (taking numeric values as numbers of course)
+    # Assumptions about the order of columns:
+    # 1. It's the same in the table and in CSV
+    # 2. Primary key is at the beginning
+    # 3. First thing in the PK is the object id
+    # 4. Last thing in the PK is the language
+    # 5. Everything that follows is some translatable text
+    assert [cls.__table__.c[name] for name in column_names] == columns, ','.join(c.name for c in columns)
+    pk = columns[:len(cls.__table__.primary_key.columns)]
+    first_string_index = len(pk)
+    return _yield_csv_messages(foreign_cls, columns, first_string_index, csvreader, force_column=force_column)
+
+def _yield_csv_messages(foreign_cls, columns, first_string_index, csvreader, origin='source CSV', crc_value='OFFICIAL', force_column=None):
+    language_index = first_string_index - 1
+    assert 'language' in columns[language_index].name, columns[language_index].name
+    string_columns = columns[first_string_index:]
+    if force_column is not None:
+        assert len(string_columns) == 1
+        string_columns = [force_column]
+    for values in csvreader:
+        id = int(values[0])
+        messages = []
+        for string, column in zip(values[first_string_index:], string_columns):
+            message = Message(
+                    foreign_cls.__name__,
+                    id,
+                    column.name,
+                    string.decode('utf-8'),
+                    column.type.length,
+                    pot=pot_for_column(foreign_cls, column, force_column is not None),
+                    origin=origin,
+                    official=True,
+                    source_crc=crc_value,
+                    language_id=int(values[language_index]),
+                )
+            messages.append(message)
+        messages.sort()
+        for message in messages:
+            yield message
+
+def yield_guessed_csv_messages(file):
+    """Yield messages from a CSV file, using the header to figure out what the data means.
+    """
+    csvreader = csv.reader(file, lineterminator='\n')
+    column_names = csvreader.next()
+    if column_names == 'language_id,table,id,column,source_crc,string'.split(','):
+        # A translation CSV
+        return yield_translation_csv_messages(file, True)
+    # Not a translation CSV, figure out what the columns mean
+    assert column_names[0].endswith('_id')
+    assert column_names[1] == 'local_language_id'
+    first_string_index = 2
+    foreign_singlename = column_names[0][:-len('_id')]
+    columns = [None] * len(column_names)
+    column_indexes = dict((name, i) for i, name in enumerate(column_names))
+    for foreign_cls in toplevel_classes:
+        if foreign_cls.__singlename__ == foreign_singlename:
+            break
+    else:
+        raise ValueError("Foreign key column name %s in %s doesn't correspond to a table" % (column_names[0], file))
+    for translation_class in foreign_cls.translation_classes:
+        for column in translation_class.__table__.c:
+            column_index = column_indexes.get(column.name)
+            if column_index is not None:
+                columns[column_index] = column
+    assert all([c is not None for c in columns[first_string_index:]])
+    return _yield_csv_messages(foreign_cls, columns, first_string_index, csvreader, origin=file.name, crc_value='UNKNOWN')
+
+def yield_translation_csv_messages(file, no_header=False):
+    """Yield messages from a translation CSV file
+    """
+    csvreader = csv.reader(file, lineterminator='\n')
+    if not no_header:
+        columns = csvreader.next()
+        assert columns == 'language_id,table,id,column,source_crc,string'.split(',')
+    for language_id, table, id, column, source_crc, string in csvreader:
+        yield Message(
+                table,
+                int(id),
+                column,
+                string.decode('utf-8'),
+                origin='target CSV',
+                source_crc=source_crc,
+                language_id=int(language_id),
+            )
+
+def pot_for_column(cls, column, summary=False):
+    """Translatable texts get categorized into different POT files to help
+       translators prioritize. The pots are:
+
+    - flavor: Flavor texts: here, strings from multiple versions are summarized
+    - ripped: Strings ripped from the games; translators for "official"
+      languages don't need to bother with these
+    - effects: Fanon descriptions of things; they usually use technical
+      language
+    - misc: Everything else; usually small texts
+
+    Set `summary` to True if this is a flavor summary column. The others are
+    determined by the column itself.
+    """
+    if summary:
+        return 'flavor'
+    elif column.info.get('ripped'):
+        return 'ripped'
+    elif column.name.endswith('effect'):
+        return 'effects'
+    else:
+        return 'misc'
+
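+# Sketch: a column named 'short_effect' lands in 'effects' (its name ends
+# with 'effect'); a column whose info has 'ripped' set lands in 'ripped'.
+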
+def number_replace(source, string):
+    numbers_iter = iter(number_re.findall(source))
+    next_number = lambda match: numbers_iter.next()
+    return re.sub(r'\{num\}', next_number, string)
+
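+# Sketch: number_replace(u'Route 101', u'Route {num}') == u'Route 101'; the
+# numbers found in the source fill the {num} placeholders in order.
+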
+def match_to_source(source, *translations):
+    """Matches translated string(s) to source
+
+    The first translation that is not fuzzy and whose source matches the
+    source message (by full string, by CRC, or by being official) is used.
+    If there's no such translation, the first translation is used.
+
+    Returns (source, source string CRC, string for CSV file, exact match?)
+    If there are no translations, returns (source, None, None, None)
+
+    Handles translations where numbers have been replaced by {num}, if they
+    have source information.
+    """
+    first = True
+    best_crc = None
+    for translation in translations:
+        if translation is None:
+            continue
+        if translation.number_replacement:
+            current_string = number_replace(source.string, translation.string)
+            current_source = number_replace(source.string, translation.source)
+            current_crc = crc(current_source)
+        elif '{num}' in translation.string:
+            print (u'Warning: {num} appears in %s, but not marked for number replacement. Discarding!' % translation).encode('utf-8')
+            continue
+        else:
+            current_string = translation.string
+            current_source = translation.source
+            current_crc = translation.source_crc
+        if translation.fuzzy:
+            match = False
+        elif translation.official:
+            match = True
+        elif current_source:
+            match = source.string == current_source
+        else:
+            match = current_crc == crc(source.string)
+        if first or match:
+            best_string = current_string
+            best_crc = current_crc
+            best_message = translation
+        if match:
+            break
+        first = False
+    if best_crc:
+        return source, best_crc, best_string, match
+    else:
+        return source, None, None, None
+
+def merge_translations(source_stream, *translation_streams, **kwargs):
+    """For each source message, get its best translation from translations.
+
+    Translations should be ordered by priority, highest to lowest.
+
+    Messages that don't appear in translations at all aren't included.
+    """
+    source = tuple(source_stream)
+    streams = [
+            synchronize(source, t, key=lambda m: m.merge_key, unused=kwargs.get('unused'))
+            for t in translation_streams
+        ]
+    for messages in itertools.izip(source, *streams):
+        yield match_to_source(*messages)
diff --git a/pokedex/tests/test_translations.py b/pokedex/tests/test_translations.py
new file mode 100644 (file)
index 0000000..af96331
--- /dev/null
@@ -0,0 +1,183 @@
+# Encoding: UTF-8
+
+import csv
+
+from nose.tools import *
+
+from pokedex.db import translations, tables
+
+fake_version_names = (
+        'version_id,local_language_id,name',
+        '1,0,name1', '2,0,name2', '3,0,name3', '3,1,othername3',
+    )
+
+fake_translation_csv = (
+        'language_id,table,id,column,source_crc,string',
+        '0,Version,1,name,,name1',
+        '0,Version,2,name,,name2',
+        '0,Version,3,name,,name3',
+        '1,Version,3,name,,othername3',
+    )
+
+def test_yield_source_csv_messages():
+    check_version_message_stream(translations.yield_source_csv_messages(
+            tables.Version.names_table,
+            tables.Version,
+            csv.reader(iter(fake_version_names)),
+        ))
+
+def test_yield_guessed_csv_messages():
+    check_version_message_stream(translations.yield_guessed_csv_messages(
+            iter(fake_translation_csv),
+        ))
+
+def test_yield_translation_csv_messages():
+    check_version_message_stream(translations.yield_translation_csv_messages(
+            iter(fake_translation_csv),
+        ))
+
+def check_version_message_stream(messages):
+    messages = list(messages)
+    assert messages[0].string == 'name1'
+    assert messages[1].string == 'name2'
+    assert messages[2].string == 'name3'
+    assert messages[3].string == 'othername3'
+    for message in messages[:3]:
+        assert message.language_id == 0
+    assert messages[3].language_id == 1
+    for id, message in zip((1, 2, 3, 3), messages):
+        assert message.merge_key == ('Version', id, 'name'), message.merge_key
+
+def get_messages(*rows):
+    return list(translations.yield_translation_csv_messages(iter(rows), True))
+
+def test_merge_translations():
+    source = get_messages(
+            '0,Table,1,col,,none',
+            '0,Table,2,col,,new',
+            '0,Table,3,col,,existing',
+            '0,Table,4,col,,both',
+            '0,Table,5,col,,(gap)',
+            '0,Table,6,col,,new-bad',
+            '0,Table,7,col,,existing-bad',
+            '0,Table,8,col,,both-bad',
+            '0,Table,9,col,,new-bad-ex-good',
+            '0,Table,10,col,,new-good-ex-bad',
+            '0,Table,11,col,,(gap)',
+            '0,Table,12,col,,"Numbers: 1, 2, and 003"',
+            '0,Table,13,col,,"Numbers: 3, 2, and 001"',
+        )
+    new = get_messages(
+            '0,Table,2,col,%s,new' % translations.crc('new'),
+            '0,Table,4,col,%s,new' % translations.crc('both'),
+            '0,Table,6,col,%s,new' % translations.crc('----'),
+            '0,Table,8,col,%s,new' % translations.crc('----'),
+            '0,Table,9,col,%s,new' % translations.crc('----'),
+            '0,Table,10,col,%s,new' % translations.crc('new-good-ex-bad'),
+            '0,Table,12,col,%s,{num} {num} {num}' % translations.crc('Numbers: {num}, {num}, and {num}'),
+            '0,Table,13,col,%s,{num} {num} {num}' % translations.crc('----'),
+            '0,Table,100,col,%s,unused' % translations.crc('----'),
+        )
+    new[-3].number_replacement = True
+    new[-3].source = 'Numbers: 1, 2, and 003'
+    new[-2].number_replacement = True
+    new[-2].source = '----'
+    existing = get_messages(
+            '0,Table,3,col,%s,existing' % translations.crc('existing'),
+            '0,Table,4,col,%s,existing' % translations.crc('both'),
+            '0,Table,7,col,%s,existing' % translations.crc('----'),
+            '0,Table,8,col,%s,existing' % translations.crc('----'),
+            '0,Table,9,col,%s,existing' % translations.crc('new-bad-ex-good'),
+            '0,Table,10,col,%s,existing' % translations.crc('----'),
+            '0,Table,100,col,%s,unused' % translations.crc('----'),
+        )
+    expected_list = (
+            ('none', None, None),
+            ('new', True, 'new'),
+            ('existing', True, 'existing'),
+            ('both', True, 'new'),
+            ('(gap)', None, None),
+            ('new-bad', False, 'new'),
+            ('existing-bad', False, 'existing'),
+            ('both-bad', False, 'new'),
+            ('new-bad-ex-good', True, 'existing'),
+            ('new-good-ex-bad', True, 'new'),
+            ('(gap)', None, None),
+            ('Numbers: 1, 2, and 003', True, '1 2 003'),
+            ('Numbers: 3, 2, and 001', False, '3 2 001'),
+        )
+    unused = []
+    result_stream = list(translations.merge_translations(source, new, [], existing, unused=unused.append))
+    for result, expected in zip(result_stream, expected_list):
+        res_src, res_crc, res_str, res_match = result
+        exp_src, exp_match, exp_str = expected
+        print result, expected
+        assert res_src.string == exp_src
+        assert res_str == exp_str, (res_str, exp_str)
+        if exp_match is None:
+            assert res_crc is None
+        elif exp_match is True:
+            assert res_crc == translations.crc(res_src.string)
+        elif exp_match is False:
+            assert res_crc == translations.crc('----')
+        assert res_match == exp_match
+    print 'unused:', unused
+    for message in unused:
+        assert message.string == 'unused'
+        assert message.id == 100
+
+def test_merge():
+    check_merge((0, 1, 2, 3))
+    check_merge((0, 1), (2, 3))
+    check_merge((2, 3), (0, 1))
+    check_merge((0, 2), (1, 3))
+    check_merge((0, 3), (1, 2))
+    check_merge((0, 1), (2, 3), (2, 3))
+
+def check_merge(*sequences):
+    merged = list(translations.Merge(*sequences))
+    concatenated = [val for seq in sequences for val in seq]
+    assert merged == sorted(concatenated)
+
+def test_merge_dynamic_add():
+    merge = translations.Merge((1, 2, 3))
+    def adder():
+        for val in (1, 2, 3):
+            yield val
+            merge.add_iterator([4])
+    merge.add_iterator(adder())
+    assert tuple(merge) == (1, 1, 2, 2, 3, 3, 4, 4, 4)
+
+def test_merge_adjacent():
+    messages = get_messages(
+            '0,Table,1,col,,strA',
+            '0,Table,2,col,,strB',
+            '0,Table,2,col,,strC',
+            '0,Table,2,col,,strB',
+            '0,Table,2,col,,strD',
+            '0,Table,3,col,,strE',
+        )
+    result = [m.string for m in translations.merge_adjacent(messages)]
+    expected = ['strA', 'strB\n\nstrC\n\nstrD', 'strE']
+    assert result == expected
+
+def test_leftjoin():
+    check_leftjoin([], [], [], [])
+    check_leftjoin([], [1], [], [1])
+    check_leftjoin([], [1, 2], [], [1, 2])
+    check_leftjoin([1], [], [(1, None)], [])
+    check_leftjoin([1], [1], [(1, 1)], [])
+    check_leftjoin([1], [2], [(1, None)], [2])
+    check_leftjoin([1, 2], [1], [(1, 1), (2, None)], [])
+    check_leftjoin([1, 2], [1, 2], [(1, 1), (2, 2)], [])
+    check_leftjoin([1], [1, 2], [(1, 1)], [2])
+    check_leftjoin([1, 2], [1, 3], [(1, 1), (2, None)], [3])
+    check_leftjoin([1, 2, 3], [1, 3], [(1, 1), (2, None), (3, 3)], [])
+    check_leftjoin([1, 2, 2, 3], [1, 3], [(1, 1), (2, None), (2, None), (3, 3)], [])
+    check_leftjoin([1, 2, 2, 3], [2, 2, 2], [(1, None), (2, 2), (2, 2), (3, None)], [2])
+
+def check_leftjoin(seqa, seqb, expected, expected_unused):
+    unused = []
+    result = list(translations.leftjoin(seqa, seqb, unused=unused.append))
+    assert result == list(expected)
+    assert unused == list(expected_unused)