X-Git-Url: http://git.veekun.com/zzz-pokedex.git/blobdiff_plain/b82af79bb9da4c033e3891bba40e913e19393be3..b14e23c787f1177d09d358fea683c6d72ce3d757:/pokedex/db/translations.py diff --git a/pokedex/db/translations.py b/pokedex/db/translations.py new file mode 100755 index 0000000..5d1bda6 --- /dev/null +++ b/pokedex/db/translations.py @@ -0,0 +1,659 @@ +#! /usr/bin/env python +u"""General handling of translations + +The general idea is to get messages from somewhere: the source pokedex CSVs, +or the translation CSVs, etc., then merge them together in some way, and shove +them into the database. + +If a message is translated, it has a source string attached to it, with the +original English version. Or at least it has a CRC of the original. +When that doesn't match, it means the English string changed and the +translation has to be updated. +Also this is why we can't dump translations from the database: there's no +original string info. + +Some complications: + +Flavor text is so repetitive that we take strings from all the version, +separate the unique ones by blank lines, let translators work on that, and then +put it in flavor_summary tables. + +Routes names and other repetitive numeric things are replaced by e.g. +"Route {num}" so translators only have to work on each set once. + +""" + +import binascii +import csv +import heapq +import itertools +import os +import re +import sys +from collections import defaultdict + +from pokedex.db import tables +from pokedex.defaults import get_default_csv_dir + +default_source_lang = 'en' + +# Top-level classes we want translations for: in order, and by name +# These are all mapped_classes that have translatable texts and aren't summarized +toplevel_classes = [] +toplevel_class_by_name = {} + +# summary_map[pokemon_prose]['flavor_summary'] == PokemonFlavorTexts +summary_map = {} + +# translation_class_by_column[class_name, column_name] == translation_class +translation_class_by_column = {} + +for cls in tables.mapped_classes: + try: + summary_class, col = cls.summary_column + except AttributeError: + if cls.translation_classes: + toplevel_classes.append(cls) + toplevel_class_by_name[cls.__name__] = cls + for translation_class in cls.translation_classes: + for column in translation_class.__table__.c: + translation_class_by_column[cls, column.name] = translation_class + else: + summary_map.setdefault(summary_class, {})[col] = cls + +number_re = re.compile("[0-9]+") + +def crc(string): + """Return a hash to we use in translation CSV files""" + return "%08x" % (binascii.crc32(string.encode('utf-8')) & 0xffffffff) + # Two special values are also used in source_crc: + # UNKNOWN: no source string was available + # OFFICIAL: an official string from the main database + +class Message(object): + """Holds all info about a translatable or translated string + + cls: Name of the mapped class the message belongs to + id: The id of the thing the message belongs to + colname: name of the database column + strings: A list of strings in the message, usualy of length 1. + + Optional attributes (None if not set): + colsize: Max length of the database column + source: The string this was translated from + number_replacement: True if this is a translation with {num} placeholders + pot: Name of the pot the message goes to (see pot_for_column) + source_crc: CRC of the source + origin: Some indication of where the string came from (CSV, PO, ...) + fuzzy: True for fuzzy translations + language_id: ID of the language + official: True if this is a known-good translation + """ + __slots__ = 'cls id colname strings colsize source number_replacement pot source_crc origin fuzzy language_id official'.split() + def __init__(self, cls, id, colname, string, + colsize=None, source=None, number_replacement=None, pot=None, + source_crc=None, origin=None, fuzzy=None, language_id=None, + official=None, + ): + self.cls = cls + self.id = id + self.colname = colname + self.strings = [string] + self.colsize = colsize + self.source = source + self.number_replacement = number_replacement + self.pot = pot + self.source_crc = source_crc + if source and not source_crc: + self.source_crc = crc(source) + self.origin = origin + self.fuzzy = fuzzy + self.language_id = language_id + self.official = official + + def merge(self, other): + """Merge two messages, as required for flavor text summarizing + """ + assert self.merge_key == other.merge_key + for string in other.strings: + if string not in self.strings: + self.strings.append(string) + self.colsize = self.colsize or other.colsize + self.pot = self.pot or other.pot + self.source = None + self.source_crc = None + self.number_replacement = None + + @property + def string(self): + return '\n\n'.join(self.strings) + + @property + def merge_key(self): + return self.cls, self.id, self.colname + + @property + def sort_key(self): + return self.merge_key, self.language_id, self.fuzzy + + @property + def eq_key(self): + return self.sort_key, self.strings + + def __eq__(self, other): return self.eq_key == other.eq_key + def __ne__(self, other): return self.eq_key != other.eq_key + def __gt__(self, other): return self.sort_key > other.sort_key + def __lt__(self, other): return self.sort_key < other.sort_key + def __ge__(self, other): return self.sort_key >= other.sort_key + def __le__(self, other): return self.sort_key <= other.sort_key + + def __unicode__(self): + string = '"%s"' % self.string + if len(string) > 20: + string = string[:15] + u'"...' + template = u'' + return template.format(self=self, string=string) + + def __str__(self): + return unicode(self).encode('utf-8') + + def __repr__(self): + return unicode(self).encode('utf-8') + +class Translations(object): + """Data and opertaions specific to a location on disk (and a source language) + """ + def __init__(self, source_lang=default_source_lang, csv_directory=None, translation_directory=None): + if csv_directory is None: + csv_directory = get_default_csv_dir() + + if translation_directory is None: + translation_directory = os.path.join(csv_directory, 'translations') + + self.source_lang = default_source_lang + self.csv_directory = csv_directory + self.translation_directory = translation_directory + + self.language_ids = {} + self.language_identifiers = {} + self.official_langs = [] + for row in self.reader_for_class(tables.Language, reader_class=csv.DictReader): + self.language_ids[row['identifier']] = int(row['id']) + self.language_identifiers[int(row['id'])] = row['identifier'] + if row['official'] and int(row['official']): + self.official_langs.append(row['identifier']) + + self.source_lang_id = self.language_ids[self.source_lang] + + @classmethod + def from_parsed_options(cls, options): + return cls(options.source_lang, options.directory) + + @property + def source(self): + """All source (i.e. English) messages + """ + return self.official_messages(self.source_lang) + + def official_messages(self, lang): + """All official messages (i.e. from main database) for the given lang + """ + # Cached as tuples, since they're used pretty often + lang_id = self.language_ids[lang] + try: + return self._sources[lang_id] + except AttributeError: + self._sources = {} + for message in self.yield_source_messages(): + self._sources.setdefault(message.language_id, []).append(message) + self._sources = dict((k, tuple(merge_adjacent(v))) for k, v in self._sources.items()) + return self.official_messages(lang) + except KeyError: + # Looks like there are no messages in the DB for this language + # This should only happen for non-official languages + assert lang not in self.official_langs + return () + + def write_translations(self, lang, *streams): + """Write a translation CSV containing messages from streams. + + Streams should be ordered by priority, from highest to lowest. + + Any official translations (from the main database) are added automatically. + """ + writer = self.writer_for_lang(lang) + + writer.writerow('language_id table id column source_crc string'.split()) + + messages = merge_translations(self.source, self.official_messages(lang), *streams) + + warnings = {} + for source, sourcehash, string, exact in messages: + if string and sourcehash != 'OFFICIAL': + utf8len = len(string.encode('utf-8')) + if source.colsize and utf8len > source.colsize: + key = source.cls, source.colname + warnings[key] = max(warnings.get(key, (0,)), (utf8len, source, string)) + else: + writer.writerow(( + self.language_ids[lang], + source.cls, + source.id, + source.colname, + sourcehash, + string.encode('utf-8'), + )) + for utf8len, source, string in warnings.values(): + template = u'Error: {size}B value for {colsize}B column! {key[0]}.{key[2]}:{key[1]}: {string}' + warning = template.format( + key=source.merge_key, + string=string, + size=utf8len, + colsize=source.colsize, + ) + if len(warning) > 79: + warning = warning[:76] + u'...' + print warning.encode('utf-8') + + def reader_for_class(self, cls, reader_class=csv.reader): + tablename = cls.__table__.name + csvpath = os.path.join(self.csv_directory, tablename + '.csv') + return reader_class(open(csvpath, 'rb'), lineterminator='\n') + + def writer_for_lang(self, lang): + csvpath = os.path.join(self.translation_directory, '%s.csv' % lang) + return csv.writer(open(csvpath, 'wb'), lineterminator='\n') + + def yield_source_messages(self, language_id=None): + """Yield all messages from source CSV files + + Messages from all languages are returned. The messages are not ordered + properly, but splitting the stream by language (and filtering results + by merge_adjacent) will produce proper streams. + """ + if language_id is None: + language_id = self.source_lang_id + + for cls in sorted(toplevel_classes, key=lambda c: c.__name__): + streams = [] + for translation_class in cls.translation_classes: + streams.append(yield_source_csv_messages( + translation_class, + cls, + self.reader_for_class(translation_class), + )) + try: + colmap = summary_map[translation_class] + except KeyError: + pass + else: + for colname, summary_class in colmap.items(): + column = translation_class.__table__.c[colname] + streams.append(yield_source_csv_messages( + summary_class, + cls, + self.reader_for_class(summary_class), + force_column=column, + )) + for message in Merge(*streams): + yield message + + def yield_target_messages(self, lang): + """Yield messages from the data/csv/translations/.csv file + """ + path = os.path.join(self.csv_directory, 'translations', '%s.csv' % lang) + try: + file = open(path, 'rb') + except IOError: + return () + return yield_translation_csv_messages(file) + + def yield_all_translations(self): + stream = Merge() + for lang in self.language_identifiers.values(): + stream.add_iterator(self.yield_target_messages(lang)) + return (message for message in stream if not message.official) + + def get_load_data(self, langs=None): + """Yield (translation_class, data for INSERT) pairs for loading into the DB + + langs is either a list of language identifiers or None + """ + if langs is None: + langs = self.language_identifiers.values() + stream = Merge() + for lang in self.language_identifiers.values(): + stream.add_iterator(self.yield_target_messages(lang)) + stream = (message for message in stream if not message.official) + count = 0 + class GroupDict(dict): + """Dict to automatically set the foreign_id and local_language_id for new items + """ + def __missing__(self, key): + # depends on `cls` from outside scope + id, language_id = key + data = self[key] = defaultdict(lambda: None) + column_names = (c.name for c in translation_class.__table__.columns) + data.update(dict.fromkeys(column_names)) + data.update({ + '%s_id' % cls.__singlename__: id, + 'local_language_id': language_id, + }) + return data + # Nested dict: + # translation_class -> (lang, id) -> column -> value + everything = defaultdict(GroupDict) + # Group by object so we always have all of the messages for one DB row + for (cls_name, id), group in group_by_object(stream): + cls = toplevel_class_by_name[cls_name] + for message in group: + translation_class = translation_class_by_column[cls, message.colname] + key = id, message.language_id + colname = str(message.colname) + everything[translation_class][key][colname] = message.string + count += 1 + if count > 1000: + for translation_class, key_data in everything.items(): + yield translation_class, key_data.values() + count = 0 + everything.clear() + for translation_class, data_dict in everything.items(): + yield translation_class, data_dict.values() + +def group_by_object(stream): + """Group stream by object + + Yields ((class name, object ID), (list of messages)) pairs. + """ + stream = iter(stream) + current = stream.next() + current_key = current.cls, current.id + group = [current] + for message in stream: + if (message.cls, message.id) != current_key: + yield current_key, group + group = [] + group.append(message) + current = message + current_key = current.cls, current.id + yield current_key, group + +class Merge(object): + """Merge several sorted iterators together + + Additional iterators may be added at any time with add_iterator. + Accepts None for the initial iterators + If the same value appears in more iterators, there will be duplicates in + the output. + """ + def __init__(self, *iterators): + self.next_values = [] + for iterator in iterators: + if iterator is not None: + self.add_iterator(iterator) + + def add_iterator(self, iterator): + iterator = iter(iterator) + try: + value = iterator.next() + except StopIteration: + return + else: + heapq.heappush(self.next_values, (value, iterator)) + + def __iter__(self): + return self + + def next(self): + if self.next_values: + value, iterator = heapq.heappop(self.next_values) + self.add_iterator(iterator) + return value + else: + raise StopIteration + +def merge_adjacent(gen): + """Merge adjacent messages that compare equal""" + gen = iter(gen) + last = gen.next() + for this in gen: + if this.merge_key == last.merge_key: + last.merge(this) + elif last < this: + yield last + last = this + else: + raise AssertionError('Bad order, %s > %s' % (last, this)) + yield last + +def leftjoin(left_stream, right_stream, key=lambda x: x, unused=None): + """A "left join" operation on sorted iterators + + Yields (left, right) pairs, where left comes from left_stream and right + is the corresponding item from right, or None + + Note that if there are duplicates in right_stream, you won't get duplicate + rows for them. + + If given, unused should be a one-arg function that will get called on all + unused items in right_stream. + """ + left_stream = iter(left_stream) + right_stream = iter(right_stream) + try: + right = right_stream.next() + for left in left_stream: + while right and key(left) > key(right): + if unused is not None: + unused(right) + right = right_stream.next() + if key(left) == key(right): + yield left, right + del left + right = right_stream.next() + else: + yield left, None + except StopIteration: + try: + yield left, None + except NameError: + pass + for left in left_stream: + yield left, None + else: + if unused is not None: + try: + unused(right) + except NameError: + pass + for right in right_stream: + unused(right) + +def synchronize(reference, stream, key=lambda x: x, unused=None): + """Just the right side part of leftjoin(), Nones included""" + for left, right in leftjoin(reference, stream, key, unused): + yield right + +def yield_source_csv_messages(cls, foreign_cls, csvreader, force_column=None): + """Yield all messages from one source CSV file. + """ + columns = list(cls.__table__.c) + column_names = csvreader.next() + # Assumptions: rows are in lexicographic order + # (taking numeric values as numbers of course) + # Assumptions about the order of columns: + # 1. It's the same in the table and in CSV + # 2. Primary key is at the beginning + # 3. First thing in the PK is the object id + # 4. Last thing in the PK is the language + # 5. Everything that follows is some translatable text + assert [cls.__table__.c[name] for name in column_names] == columns, ','.join(c.name for c in columns) + pk = columns[:len(cls.__table__.primary_key.columns)] + first_string_index = len(pk) + return _yield_csv_messages(foreign_cls, columns, first_string_index, csvreader, force_column=force_column) + +def _yield_csv_messages(foreign_cls, columns, first_string_index, csvreader, origin='source CSV', crc_value='OFFICIAL', force_column=None): + language_index = first_string_index - 1 + assert 'language' in columns[language_index].name, columns[language_index].name + string_columns = columns[first_string_index:] + if force_column is not None: + assert len(string_columns) == 1 + string_columns = [force_column] + for values in csvreader: + id = int(values[0]) + messages = [] + for string, column in zip(values[first_string_index:], string_columns): + message = Message( + foreign_cls.__name__, + id, + column.name, + string.decode('utf-8'), + column.type.length, + pot=pot_for_column(cls, column, force_column is not None), + origin=origin, + official=True, + source_crc=crc_value, + language_id=int(values[language_index]), + ) + messages.append(message) + messages.sort() + for message in messages: + yield message + +def yield_guessed_csv_messages(file): + """Yield messages from a CSV file, using the header to figure out what the data means. + """ + csvreader = csv.reader(file, lineterminator='\n') + column_names = csvreader.next() + if column_names == 'language_id,table,id,column,source_crc,string'.split(','): + # A translation CSV + return yield_translation_csv_messages(file, True) + # Not a translation CSV, figure out what the columns mean + assert column_names[0].endswith('_id') + assert column_names[1] == 'local_language_id' + first_string_index = 2 + foreign_singlename = column_names[0][:-len('_id')] + columns = [None] * len(column_names) + column_indexes = dict((name, i) for i, name in enumerate(column_names)) + for foreign_cls in toplevel_classes: + if foreign_cls.__singlename__ == foreign_singlename: + break + else: + raise ValueError("Foreign key column name %s in %s doesn't correspond to a table" % (column_names[0], file)) + for translation_class in foreign_cls.translation_classes: + for column in translation_class.__table__.c: + column_index = column_indexes.get(column.name) + if column_index is not None: + columns[column_index] = column + assert all([c is not None for c in columns[first_string_index:]]) + return _yield_csv_messages(foreign_cls, columns, first_string_index, csvreader, origin=file.name, crc_value='UNKNOWN') + +def yield_translation_csv_messages(file, no_header=False): + """Yield messages from a translation CSV file + """ + csvreader = csv.reader(file, lineterminator='\n') + if not no_header: + columns = csvreader.next() + assert columns == 'language_id,table,id,column,source_crc,string'.split(',') + for language_id, table, id, column, source_crc, string in csvreader: + yield Message( + table, + int(id), + column, + string.decode('utf-8'), + origin='target CSV', + source_crc=source_crc, + language_id=int(language_id), + ) + +def pot_for_column(cls, column, summary=False): + """Translatable texts get categorized into different POT files to help + translators prioritize. The pots are: + + - flavor: Flavor texts: here, strings from multiple versions are summarized + - ripped: Strings ripped from the games; translators for "official" + languages don't need to bother with these + - effects: Fanon descriptions of things; they usually use technical + language + - misc: Everything else; usually small texts + + Set source to true if this is a flavor summary column. Others are + determined by the column itself. + """ + if summary: + return 'flavor' + elif column.info.get('ripped'): + return 'ripped' + elif column.name.endswith('effect'): + return 'effects' + else: + return 'misc' + +def number_replace(source, string): + numbers_iter = iter(number_re.findall(source)) + next_number = lambda match: numbers_iter.next() + return re.sub(r'\{num\}', next_number, string) + +def match_to_source(source, *translations): + """Matches translated string(s) to source + + The first translation whose source matches the source message, or whose CRC + matches, or which is official, and which is not fuzzy, it is used. + If thre's no such translation, the first translation is used. + + Returns (source, source string CRC, string for CSV file, exact match?) + If there are no translations, returns (source, None, None, None) + + Handles translations where numbers have been replaced by {num}, if they + have source information. + """ + first = True + best_crc = None + for translation in translations: + if translation is None: + continue + if translation.number_replacement: + current_string = number_replace(source.string, translation.string) + current_source = number_replace(source.string, translation.source) + current_crc = crc(current_source) + elif '{num}' in translation.string: + print (u'Warning: {num} appears in %s, but not marked for number replacement. Discarding!' % translation).encode('utf-8') + continue + else: + current_string = translation.string + current_source = translation.source + current_crc = translation.source_crc + if translation.fuzzy: + match = False + elif translation.official: + match = True + elif current_source: + match = source.string == current_source + else: + match = current_crc == crc(source.string) + if first or match: + best_string = current_string + best_crc = current_crc + best_message = translation + if match: + break + first = False + if best_crc: + return source, best_crc, best_string, match + else: + return source, None, None, None + +def merge_translations(source_stream, *translation_streams, **kwargs): + """For each source message, get its best translation from translations. + + Translations should be ordered by priority, highest to lowest. + + Messages that don't appear in translations at all aren't included. + """ + source = tuple(source_stream) + streams = [ + synchronize(source, t, key=lambda m: m.merge_key, unused=kwargs.get('unused')) + for t in translation_streams + ] + for messages in itertools.izip(source, *streams): + yield match_to_source(*messages)