Reading, merging, and writing translations

[zzz-pokedex.git] / bin / poupdate
diff --git a/bin/poupdate b/bin/poupdate

new file mode 100755 (executable)

index 0000000..06ccdf2
--- /dev/null
+++ b/bin/poupdate
@@ -0,0 +1,368 @@
+#! /usr/bin/env python
+# Encoding: UTF-8
+
+u"""Creation and loading of GNU Gettext language files.
+
+poupdate [options] [file1.csv file2.csv ...]
+
+Use this script to
+- Create .pot files (in pokedex/i18n/)
+- Update the .po files (in pokedex/i18n/<lang>)
+- Update the pokedex .csv files in (pokedex/data/csv/translations)
+
+To make pos for a new language, make sure it is in the database, make
+a directory for it in pokedex/i18n/, and run this.
+
+You can also give one or more translation CSVs as arguments.
+These are in the same format as veekun's main database CSVs, for example
+pokedex/data/csv/ability_prose.csv. Be sure to set the correct language
+ID (which implies the language must be in the database).
+Also be sure to have the correct column order: first an appropriately named
+foreign key, then local_language_id, and then the text columns.
+
+"""
+
+# Everything related to Gettext files, and the CLI interface, is here.
+# General message handling and CSV I/O is in the pokedex library.
+
+# Notes on how we use PO format:
+# The source information is stored in the occurrences fields, using
+# "table_name.column_name" for file and object ID for line number. This is used
+# as a message key, instead of the source string. So it's important not to
+# discard location information. It also means "obsolete" and "fuzzy" mean
+# pretty much the same in our context.
+#
+# Also note that a pot file is just a po file with all strings untranslated.
+# So some functions here will work on either.
+#
+# Gettext context (msgctxt) is written to the files so that tools don't merge
+# unrelated strings together. It is ignored when reading the PO files.
+
+# Also of note, "polib" means "(do) kiss!" in Czech.
+
+import os
+import re
+import sys
+from datetime import datetime
+from optparse import OptionParser
+from collections import defaultdict
+
+import pkg_resources
+
+from pokedex.db import tables, translations
+from pokedex.defaults import get_default_csv_dir
+
+try:
+    import polib
+except ImportError:
+    if __name__ == '__main__':
+        exit('This utility needs polib installed.\n$ pip install polib')
+    raise
+
+number_replacement_flag = '-pokedex-number-replacement'
+
+default_gettext_directory = pkg_resources.resource_filename('pokedex', 'i18n')
+
+mapped_class_dict = dict((c.__name__, c) for c in tables.mapped_classes)
+for cls in tables.mapped_classes:
+    mapped_class_dict.update(dict((c.__name__, cls) for c in cls.translation_classes))
+
+class PokedexPot(polib.POFile):
+    def __init__(self, name):
+        super(PokedexPot, self).__init__()
+        self.metadata = {
+                'Project-Id-Version': 'pokedex-%s 0.1' % name,
+                'Report-Msgid-Bugs-To': 'encukou@gmail.com',
+                'POT-Creation-Date': datetime.now().isoformat(),
+                'PO-Revision-Date': 'YEAR-MO-DA HO:MI+ZONE',
+                'MIME-Version': '1.0',
+                'Content-Type': 'text/plain; charset=utf-8',
+                'Content-Transfer-Encoding': '8bit',
+                'Generated-By': "The pokedex",
+            }
+        self.seen_entries = {}
+
+    def append(self, entry):
+        """Append an entry. POEntries that only differ in numbers are merged.
+
+        For example "Route 1", "Route 2", etc. are replaced by a single
+        "Route {num}".
+
+        Multiple numbers might be replaced, for example in "{num}--{num}
+        different Unown caught"
+
+        Entries without numbers are merged as well (e.g. "Has no overworld
+        effect" appears quite a few times in in AbilityChangelog)
+        """
+        replaced = translations.number_re.sub('{num}', entry.msgid)
+        try:
+            common_entry = self.seen_entries[(entry.msgctxt, replaced)]
+        except KeyError:
+            self.seen_entries[(entry.msgctxt, replaced)] = entry
+        else:
+            common_entry.occurrences += entry.occurrences
+            # Only now is the actual entry replaced. So we get
+            # "Route {num}", but "Porygon2" because there's no Porygon3.
+            common_entry.msgid = replaced
+            common_entry.msgstr = translations.number_re.sub('{num}', common_entry.msgstr)
+            if replaced != entry.msgid and number_replacement_flag not in common_entry.flags:
+                common_entry.flags.append(number_replacement_flag)
+            return
+        self += [entry]
+
+class PotDict(dict):
+    """A defaultdict of pot files"""
+    def __missing__(self, name):
+        pot = PokedexPot(name)
+        self[name] = pot
+        return pot
+
+def yield_po_messages(pos):
+    """Yield messages from all given .po files
+    """
+    merger = translations.Merge()
+    for po in pos.values():
+        merger.add_iterator(_yield_one_po_messages(po, merger))
+    return merger
+
+def entry_sort_key(entry):
+    try:
+        cls_col, line = entry.occurrences[0]
+    except IndexError:
+        return
+    else:
+        if line:
+            classname, col = cls_col.split('.')
+            fuzzy = entry.obsolete or 'fuzzy' in entry.flags
+            try:
+                cls = mapped_class_dict[classname]
+            except KeyError, k:
+                # Renamed table?
+                print 'Warning: Unknown class %s' % classname
+                return '', int(line), col, fuzzy
+            else:
+                return cls.__name__, int(line), col, fuzzy
+
+def _yield_one_po_messages(pofile, merger):
+    # Yield messages from one po file
+    #
+    # Messages in our po files are ordered by the first occurrence.
+    # The occurrences of a single message are also ordered.
+    # So just merge all the subsequences as we go
+    for entry in sorted(pofile, key=entry_sort_key):
+        if entry.msgstr:
+            fuzzy = (entry.obsolete or 'fuzzy' in entry.flags)
+            messages = []
+            for occurrence in entry.occurrences:
+                cls_colname, id = occurrence
+                if id:
+                    clsname, colname = cls_colname.split('.')
+                    cls = mapped_class_dict[clsname]
+                    messages.append(translations.Message(
+                            mapped_class_dict[clsname].__name__,
+                            int(id),
+                            colname,
+                            entry.msgstr,
+                            source=entry.msgid,
+                            number_replacement=number_replacement_flag in entry.flags,
+                            origin='PO file',
+                            fuzzy=fuzzy,
+                        ))
+            if messages[1:]:
+                # Spawn extra iterators before yielding
+                merger.add_iterator(messages[1:])
+            if messages:
+                yield messages[0]
+
+def create_pots(source, *translation_streams):
+    """Convert an iterator of Messages to a dictionary of pot/po files
+
+    If translations are given, they're merged, and any exact matches are put
+    in the po file. Give some for po files, don't give any for pot files.
+
+    Non-exact matches are written as well, but flagged as 'fuzzy'.
+
+    Returns a PotDict (name -> PokedexPot), keyed by each source message's
+    `pot` attribute.
+    """
+    # Translated messages that merge_translations reports through its
+    # `unused` callback are collected here.
+    obsolete = []
+    pots = PotDict()
+    merged = translations.merge_translations(source, *translation_streams, unused=obsolete.append)
+    # Note that the loop variable rebinds the `source` argument to the
+    # current source message.
+    for source, sourcehash, string, exact in merged:
+        # "ClassName.column_name" serves both as the gettext context and as
+        # the "file" part of the occurrence; the object ID is the "line".
+        ctxt = '.'.join((source.cls, source.colname))
+        entry = polib.POEntry(
+                msgid=source.string,
+                occurrences=[(ctxt, source.id)],
+                msgctxt=ctxt,
+            )
+        if string:
+            entry.msgstr = string
+            if not exact:
+                entry.flags.append('fuzzy')
+        pots[source.pot].append(entry)
+    # NOTE(review): the entries built in this loop are never appended to any
+    # pot, nor is a msgstr set on them, so obsolete messages currently do not
+    # show up in the result. This looks unfinished -- confirm which pot they
+    # were meant to go to.
+    for message in obsolete:
+        ctxt = '.'.join((message.cls, message.colname))
+        entry = polib.POEntry(
+                msgid=message.source or '???',
+                occurrences=[(ctxt, message.id)],
+                msgctxt=ctxt,
+                obsolete=True,
+            )
+    return pots
+
+def save_pots(pots, gettext_directory=default_gettext_directory):
+    """Save pot files to a directory."""
+    for name, pot in pots.items():
+        pot.save(os.path.join(gettext_directory, 'pokedex-%s.pot' % name))
+
+def save_pos(pos, lang, gettext_directory=default_gettext_directory):
+    """Save po files to the appropriate directory."""
+    for name, po in pos.items():
+        po.save(os.path.join(gettext_directory, lang, 'pokedex-%s.po' % name))
+
+def read_pots(directory=default_gettext_directory, extension='.pot'):
+    """Read all files from the given directory with the given extension as pofiles
+
+    Works on pos or pots.
+    """
+    pots = {}
+    for filename in os.listdir(directory):
+        basename, ext = os.path.splitext(filename)
+        if ext == extension:
+            pots[basename] = polib.pofile(os.path.join(directory, filename))
+
+    return pots
+
+def all_langs(gettext_directory=default_gettext_directory):
+    return [
+            d for d in os.listdir(gettext_directory)
+            if os.path.isdir(os.path.join(gettext_directory, d))
+        ]
+
+def merge_pos(transl, lang, language_directory):
+    """Update all po files for the given language
+
+    Takes into account the source, the official translations from the database,
+    the existing PO files, and the current translation CSV, in that order.
+
+    Returns a name -> pofile dict
+    """
+    return create_pots(
+            transl.source,
+            transl.official_messages(lang),
+            yield_po_messages(pos=read_pots(language_directory, '.po')),
+            transl.yield_target_messages(lang),
+        )
+
+def bar(fraction, size, done_char='=', split_char='|', notdone_char='-'):
+    """Build an ASCII art progress bar
+    """
+    size -= 1
+    if fraction == 1:
+        split_char = done_char
+    completed = int(round(size * fraction))
+    bar = [done_char] * completed
+    bar.append(split_char)
+    bar += notdone_char * (size - completed)
+    return ''.join(bar)
+
+def print_stats(pos):
+    """Print out some fun stats about a set of po files
+    """
+    template = u"{0:>10}: {1:4}/{2:4} {3:6.2f}% [{4}]"
+    total_translated = 0
+    total = 0
+    for name, po in pos.items():
+        num_translated = len(po.translated_entries())
+        total_translated += num_translated
+        fraction_translated = 1. * num_translated / len(po)
+        total += len(po)
+        print template.format(
+                name,
+                num_translated,
+                len(po),
+                100 * fraction_translated,
+                bar(fraction_translated, 47),
+            ).encode('utf-8')
+    fraction_translated = 1. * total_translated / total
+    print template.format(
+            'Total',
+            total_translated,
+            total,
+            100 * fraction_translated,
+            bar(fraction_translated, 47),
+        ).encode('utf-8')
+
+
+if __name__ == '__main__':
+    # The module docstring doubles as the usage text.
+    parser = OptionParser(__doc__)
+
+    parser.add_option('-l', '--langs', dest='langs',
+            help="List of languages to handle, separated by commas (example: -l 'en,de,ja') (default: all in gettext directory)")
+    parser.add_option('-P', '--no-pots', dest='pots', action='store_false', default=True,
+            help='Do not create POT files (templates)')
+    parser.add_option('-p', '--no-pos', dest='pos', action='store_false', default=True,
+            help='Do not update PO files (message catalogs)')
+
+    parser.add_option('-c', '--no-csv', dest='csv', action='store_false', default=True,
+            help='Do not update pokedex translations files')
+
+    # NOTE(review): 'directory' and 'source_lang' are not read anywhere in
+    # this script; presumably Translations.from_parsed_options consumes
+    # them -- confirm.
+    parser.add_option('-d', '--directory', dest='directory',
+            help='Veekun data directory')
+    parser.add_option('-L', '--source-language', dest='source_lang',
+            help="Source language identifier (default: 'en')")
+
+    parser.add_option('-g', '--gettext-dir', dest='gettext_directory', default=default_gettext_directory,
+            help='Gettext directory (default: pokedex/i18n/)')
+
+    parser.add_option('-q', '--quiet', dest='verbose', default=True, action='store_false',
+            help="Don't print what's going on")
+
+    # Positional arguments are paths of translation CSV files.
+    options, arguments = parser.parse_args()
+
+    transl = translations.Translations.from_parsed_options(options)
+
+    # Step 1: write the pot templates, built from the source messages only.
+    gettext_directory = options.gettext_directory
+    if options.pots:
+        if options.verbose:
+            print 'Creating pots in', gettext_directory
+        save_pots(create_pots(transl.source), gettext_directory=gettext_directory)
+
+    # Step 2: collect translation streams per language. `streams` is only
+    # defined inside this branch; step 3 can rely on it because options.csv
+    # being true also makes this condition true.
+    if options.pos or options.csv:
+        # Merge in CSV files from command line
+        csv_streams = defaultdict(translations.Merge)
+        for argument in arguments:
+            # Add each message in its own stream, to sort them.
+            file = open(argument, 'rb')
+            with file:
+                for message in translations.yield_guessed_csv_messages(file):
+                    # Group messages by the language identifier that
+                    # belongs to their language_id.
+                    lang = transl.language_identifiers[message.language_id]
+                    csv_streams[lang].add_iterator([message])
+        # language -> list of message streams; starts with the CSV stream
+        # for every language found in the given files.
+        streams = defaultdict(list)
+        for lang, stream in csv_streams.items():
+            streams[lang].append(stream)
+
+        # Merge in the PO files
+        if options.langs:
+            langs = options.langs.split(',')
+        else:
+            langs = all_langs(gettext_directory)
+
+        for lang in langs:
+            language_directory = os.path.join(gettext_directory, lang)
+            if options.verbose:
+                print 'Merging translations for %s in %s' % (lang, language_directory)
+            pos = merge_pos(transl, lang, language_directory)
+
+            if options.pos:
+                if options.verbose:
+                    print 'Writing POs for %s' % lang
+                save_pos(pos, lang, gettext_directory=gettext_directory)
+
+                if options.verbose:
+                    print_stats(pos)
+
+            # The merged po files are added as a stream even if they were
+            # not written to disk.
+            streams[lang].append(yield_po_messages(pos))
+
+    # Step 3: write the translations for every language that has a stream.
+    if options.csv:
+        for lang, lang_streams in streams.items():
+            if options.verbose:
+                print "Merging %s translation stream/s for '%s'" % (len(lang_streams), lang)
+            # The existing target messages are read into a list and passed
+            # as the last stream.
+            existing_messages = list(transl.yield_target_messages(lang))
+            lang_streams.append(existing_messages)
+            transl.write_translations(lang, *lang_streams)