pokedex/db/translations.py

   1 #! /usr/bin/env python
   2 u"""General handling of translations
   3
   4 The general idea is to get messages from somewhere: the source pokedex CSVs,
   5 or the translation CSVs, etc., then merge them together in some way, and shove
   6 them into the database.
   7
   8 If a message is translated, it has a source string attached to it, with the
   9 original English version. Or at least it has a CRC of the original.
  10 When that doesn't match, it means the English string changed and the
  11 translation has to be updated.
  12 Also this is why we can't dump translations from the database: there's no
  13 original string info.
  14
  15 Some complications:
  16
  17 Flavor text is so repetitive that we take strings from all the versions,
  18 separate the unique ones by blank lines, let translators work on that, and then
  19 put it in flavor_summary tables.
  20
  21 Route names and other repetitive numeric things are replaced by e.g.
  22 "Route {num}" so translators only have to work on each set once.
  23
  24 """
  25
  26 import binascii
  27 import csv
  28 import heapq
  29 import itertools
  30 import os
  31 import re
  32 import sys
  33 from collections import defaultdict
  34
  35 from pokedex.db import tables
  36 from pokedex.defaults import get_default_csv_dir
  37
  38 default_source_lang = 'en'
  39
  40 # Top-level classes we want translations for: in order, and by name
  41 # These are all mapped_classes that have translatable texts and aren't summarized
  42 toplevel_classes = []
  43 toplevel_class_by_name = {}
  44
  45 # summary_map[pokemon_prose]['flavor_summary'] == PokemonFlavorTexts
  46 summary_map = {}
  47
  48 # translation_class_by_column[class_name, column_name] == translation_class
  49 translation_class_by_column = {}
  50
  51 for cls in tables.mapped_classes:
  52     try:
  53         summary_class, col = cls.summary_column
  54     except AttributeError:
  55         if cls.translation_classes:
  56             toplevel_classes.append(cls)
  57             toplevel_class_by_name[cls.__name__] = cls
  58             for translation_class in cls.translation_classes:
  59                 for column in translation_class.__table__.c:
  60                     translation_class_by_column[cls, column.name] = translation_class
  61     else:
  62         summary_map.setdefault(summary_class, {})[col] = cls
  63
  64 number_re = re.compile("[0-9]+")
  65
  66 def crc(string):
  67     """Return a hash to we use in translation CSV files"""
  68     return "%08x" % (binascii.crc32(string.encode('utf-8')) & 0xffffffff)
  69     # Two special values are also used in source_crc:
  70     # UNKNOWN: no source string was available
  71     # OFFICIAL: an official string from the main database
  72
  73 class Message(object):
  74     """Holds all info about a translatable or translated string
  75
  76     cls: Name of the mapped class the message belongs to
  77     id: The id of the thing the message belongs to
  78     colname: name of the database column
  79     strings: A list of strings in the message, usualy of length 1.
  80
  81     Optional attributes (None if not set):
  82     colsize: Max length of the database column
  83     source: The string this was translated from
  84     number_replacement: True if this is a translation with {num} placeholders
  85     pot: Name of the pot the message goes to (see pot_for_column)
  86     source_crc: CRC of the source
  87     origin: Some indication of where the string came from (CSV, PO, ...)
  88     fuzzy: True for fuzzy translations
  89     language_id: ID of the language
  90     official: True if this is a known-good translation
  91     """
  92     __slots__ = 'cls id colname strings colsize source number_replacement pot source_crc origin fuzzy language_id official'.split()
  93     def __init__(self, cls, id, colname, string,
  94             colsize=None, source=None, number_replacement=None, pot=None,
  95             source_crc=None, origin=None, fuzzy=None, language_id=None,
  96             official=None,
  97         ):
  98         self.cls = cls
  99         self.id = id
 100         self.colname = colname
 101         self.strings = [string]
 102         self.colsize = colsize
 103         self.source = source
 104         self.number_replacement = number_replacement
 105         self.pot = pot
 106         self.source_crc = source_crc
 107         if source and not source_crc:
 108              self.source_crc = crc(source)
 109         self.origin = origin
 110         self.fuzzy = fuzzy
 111         self.language_id = language_id
 112         self.official = official
 113
 114     def merge(self, other):
 115         """Merge two messages, as required for flavor text summarizing
 116         """
 117         assert self.merge_key == other.merge_key
 118         for string in other.strings:
 119             if string not in self.strings:
 120                 self.strings.append(string)
 121         self.colsize = self.colsize or other.colsize
 122         self.pot = self.pot or other.pot
 123         self.source = None
 124         self.source_crc = None
 125         self.number_replacement = None
 126
 127     @property
 128     def string(self):
 129         return '\n\n'.join(self.strings)
 130
 131     @property
 132     def merge_key(self):
 133         return self.cls, self.id, self.colname
 134
 135     @property
 136     def sort_key(self):
 137         return self.merge_key, self.language_id, self.fuzzy
 138
 139     @property
 140     def eq_key(self):
 141         return self.sort_key, self.strings
 142
 143     def __eq__(self, other): return self.eq_key == other.eq_key
 144     def __ne__(self, other): return self.eq_key != other.eq_key
 145     def __gt__(self, other): return self.sort_key > other.sort_key
 146     def __lt__(self, other): return self.sort_key < other.sort_key
 147     def __ge__(self, other): return self.sort_key >= other.sort_key
 148     def __le__(self, other): return self.sort_key <= other.sort_key
 149
 150     def __unicode__(self):
 151         string = '"%s"' % self.string
 152         if len(string) > 20:
 153             string = string[:15] + u'"...'
 154         template = u'<Message from {self.origin} for {self.cls}.{self.colname}:{self.id} -- {string}>'
 155         return template.format(self=self, string=string)
 156
 157     def __str__(self):
 158         return unicode(self).encode('utf-8')
 159
 160     def __repr__(self):
 161         return unicode(self).encode('utf-8')
 162
 163 class Translations(object):
 164     """Data and opertaions specific to a location on disk (and a source language)
 165     """
 166     def __init__(self, source_lang=default_source_lang, csv_directory=None, translation_directory=None):
 167         if csv_directory is None:
 168             csv_directory = get_default_csv_dir()
 169
 170         if translation_directory is None:
 171             translation_directory = os.path.join(csv_directory, 'translations')
 172
 173         self.source_lang = default_source_lang
 174         self.csv_directory = csv_directory
 175         self.translation_directory = translation_directory
 176
 177         self.language_ids = {}
 178         self.language_identifiers = {}
 179         self.official_langs = []
 180         for row in self.reader_for_class(tables.Language, reader_class=csv.DictReader):
 181             self.language_ids[row['identifier']] = int(row['id'])
 182             self.language_identifiers[int(row['id'])] = row['identifier']
 183             if row['official'] and int(row['official']):
 184                 self.official_langs.append(row['identifier'])
 185
 186         self.source_lang_id = self.language_ids[self.source_lang]
 187
 188     @classmethod
 189     def from_parsed_options(cls, options):
 190         return cls(options.source_lang, options.directory)
 191
 192     @property
 193     def source(self):
 194         """All source (i.e. English) messages
 195         """
 196         return self.official_messages(self.source_lang)
 197
 198     def official_messages(self, lang):
 199         """All official messages (i.e. from main database) for the given lang
 200         """
 201         # Cached as tuples, since they're used pretty often
 202         lang_id = self.language_ids[lang]
 203         try:
 204             return self._sources[lang_id]
 205         except AttributeError:
 206             self._sources = {}
 207             for message in self.yield_source_messages():
 208                 self._sources.setdefault(message.language_id, []).append(message)
 209             self._sources = dict((k, tuple(merge_adjacent(v))) for k, v in self._sources.items())
 210             return self.official_messages(lang)
 211         except KeyError:
 212             # Looks like there are no messages in the DB for this language
 213             # This should only happen for non-official languages
 214             assert lang not in self.official_langs
 215             return ()
 216
 217     def write_translations(self, lang, *streams):
 218         """Write a translation CSV containing messages from streams.
 219
 220         Streams should be ordered by priority, from highest to lowest.
 221
 222         Any official translations (from the main database) are added automatically.
 223         """
 224         writer = self.writer_for_lang(lang)
 225
 226         writer.writerow('language_id table id column source_crc string'.split())
 227
 228         messages = merge_translations(self.source, self.official_messages(lang), *streams)
 229
 230         warnings = {}
 231         for source, sourcehash, string, exact in messages:
 232             if string and sourcehash != 'OFFICIAL':
 233                 utf8len = len(string.encode('utf-8'))
 234                 if source.colsize and utf8len > source.colsize:
 235                     key = source.cls, source.colname
 236                     warnings[key] = max(warnings.get(key, (0,)), (utf8len, source, string))
 237                 else:
 238                     writer.writerow((
 239                             self.language_ids[lang],
 240                             source.cls,
 241                             source.id,
 242                             source.colname,
 243                             sourcehash,
 244                             string.encode('utf-8'),
 245                         ))
 246         for utf8len, source, string in warnings.values():
 247             template = u'Error: {size}B value for {colsize}B column! {key[0]}.{key[2]}:{key[1]}: {string}'
 248             warning = template.format(
 249                     key=source.merge_key,
 250                     string=string,
 251                     size=utf8len,
 252                     colsize=source.colsize,
 253                 )
 254             if len(warning) > 79:
 255                 warning = warning[:76] + u'...'
 256             print warning.encode('utf-8')
 257
 258     def reader_for_class(self, cls, reader_class=csv.reader):
 259         tablename = cls.__table__.name
 260         csvpath = os.path.join(self.csv_directory, tablename + '.csv')
 261         return reader_class(open(csvpath, 'rb'), lineterminator='\n')
 262
 263     def writer_for_lang(self, lang):
 264         csvpath = os.path.join(self.translation_directory, '%s.csv' % lang)
 265         return csv.writer(open(csvpath, 'wb'), lineterminator='\n')
 266
 267     def yield_source_messages(self, language_id=None):
 268         """Yield all messages from source CSV files
 269
 270         Messages from all languages are returned. The messages are not ordered
 271         properly, but splitting the stream by language (and filtering results
 272         by merge_adjacent) will produce proper streams.
 273         """
 274         if language_id is None:
 275             language_id = self.source_lang_id
 276
 277         for cls in sorted(toplevel_classes, key=lambda c: c.__name__):
 278             streams = []
 279             for translation_class in cls.translation_classes:
 280                 streams.append(yield_source_csv_messages(
 281                         translation_class,
 282                         cls,
 283                         self.reader_for_class(translation_class),
 284                     ))
 285                 try:
 286                     colmap = summary_map[translation_class]
 287                 except KeyError:
 288                     pass
 289                 else:
 290                     for colname, summary_class in colmap.items():
 291                         column = translation_class.__table__.c[colname]
 292                         streams.append(yield_source_csv_messages(
 293                                 summary_class,
 294                                 cls,
 295                                 self.reader_for_class(summary_class),
 296                                 force_column=column,
 297                             ))
 298             for message in Merge(*streams):
 299                 yield message
 300
 301     def yield_target_messages(self, lang):
 302         """Yield messages from the data/csv/translations/<lang>.csv file
 303         """
 304         path = os.path.join(self.csv_directory, 'translations', '%s.csv' % lang)
 305         try:
 306             file = open(path, 'rb')
 307         except IOError:
 308             return ()
 309         return yield_translation_csv_messages(file)
 310
 311     def yield_all_translations(self):
 312         stream = Merge()
 313         for lang in self.language_identifiers.values():
 314             stream.add_iterator(self.yield_target_messages(lang))
 315         return (message for message in stream if not message.official)
 316
 317     def get_load_data(self, langs=None):
 318         """Yield (translation_class, data for INSERT) pairs for loading into the DB
 319
 320         langs is either a list of language identifiers or None
 321         """
 322         if langs is None:
 323             langs = self.language_identifiers.values()
 324         stream = Merge()
 325         for lang in self.language_identifiers.values():
 326             stream.add_iterator(self.yield_target_messages(lang))
 327         stream = (message for message in stream if not message.official)
 328         count = 0
 329         class GroupDict(dict):
 330             """Dict to automatically set the foreign_id and local_language_id for new items
 331             """
 332             def __missing__(self, key):
 333                 # depends on `cls` from outside scope
 334                 id, language_id = key
 335                 data = self[key] = defaultdict(lambda: None)
 336                 column_names = (c.name for c in translation_class.__table__.columns)
 337                 data.update(dict.fromkeys(column_names))
 338                 data.update({
 339                         '%s_id' % cls.__singlename__: id,
 340                         'local_language_id': language_id,
 341                     })
 342                 return data
 343         # Nested dict:
 344         # translation_class -> (lang, id) -> column -> value
 345         everything = defaultdict(GroupDict)
 346         # Group by object so we always have all of the messages for one DB row
 347         for (cls_name, id), group in group_by_object(stream):
 348             cls = toplevel_class_by_name[cls_name]
 349             for message in group:
 350                 translation_class = translation_class_by_column[cls, message.colname]
 351                 key = id, message.language_id
 352                 colname = str(message.colname)
 353                 everything[translation_class][key][colname] = message.string
 354                 count += 1
 355             if count > 1000:
 356                 for translation_class, key_data in everything.items():
 357                     yield translation_class, key_data.values()
 358                 count = 0
 359                 everything.clear()
 360         for translation_class, data_dict in everything.items():
 361             yield translation_class, data_dict.values()
 362
 363 def group_by_object(stream):
 364     """Group stream by object
 365
 366     Yields ((class name, object ID), (list of messages)) pairs.
 367     """
 368     stream = iter(stream)
 369     current = stream.next()
 370     current_key = current.cls, current.id
 371     group = [current]
 372     for message in stream:
 373         if (message.cls, message.id) != current_key:
 374             yield current_key, group
 375             group = []
 376         group.append(message)
 377         current = message
 378         current_key = current.cls, current.id
 379     yield current_key, group
 380
 381 class Merge(object):
 382     """Merge several sorted iterators together
 383
 384     Additional iterators may be added at any time with add_iterator.
 385     Accepts None for the initial iterators
 386     If the same value appears in more iterators, there will be duplicates in
 387     the output.
 388     """
 389     def __init__(self, *iterators):
 390         self.next_values = []
 391         for iterator in iterators:
 392             if iterator is not None:
 393                 self.add_iterator(iterator)
 394
 395     def add_iterator(self, iterator):
 396         iterator = iter(iterator)
 397         try:
 398             value = iterator.next()
 399         except StopIteration:
 400             return
 401         else:
 402             heapq.heappush(self.next_values, (value, iterator))
 403
 404     def __iter__(self):
 405         return self
 406
 407     def next(self):
 408         if self.next_values:
 409             value, iterator = heapq.heappop(self.next_values)
 410             self.add_iterator(iterator)
 411             return value
 412         else:
 413             raise StopIteration
 414
 415 def merge_adjacent(gen):
 416     """Merge adjacent messages that compare equal"""
 417     gen = iter(gen)
 418     last = gen.next()
 419     for this in gen:
 420         if this.merge_key == last.merge_key:
 421             last.merge(this)
 422         elif last < this:
 423             yield last
 424             last = this
 425         else:
 426             raise AssertionError('Bad order, %s > %s' % (last, this))
 427     yield last
 428
 429 def leftjoin(left_stream, right_stream, key=lambda x: x, unused=None):
 430     """A "left join" operation on sorted iterators
 431
 432     Yields (left, right) pairs, where left comes from left_stream and right
 433     is the corresponding item from right, or None
 434
 435     Note that if there are duplicates in right_stream, you won't get duplicate
 436     rows for them.
 437
 438     If given, unused should be a one-arg function that will get called on all
 439     unused items in right_stream.
 440     """
 441     left_stream = iter(left_stream)
 442     right_stream = iter(right_stream)
 443     try:
 444         right = right_stream.next()
 445         for left in left_stream:
 446             while right and key(left) > key(right):
 447                 if unused is not None:
 448                     unused(right)
 449                 right = right_stream.next()
 450             if key(left) == key(right):
 451                 yield left, right
 452                 del left
 453                 right = right_stream.next()
 454             else:
 455                 yield left, None
 456     except StopIteration:
 457         try:
 458             yield left, None
 459         except NameError:
 460             pass
 461         for left in left_stream:
 462             yield left, None
 463     else:
 464         if unused is not None:
 465             try:
 466                 unused(right)
 467             except NameError:
 468                 pass
 469             for right in right_stream:
 470                 unused(right)
 471
 472 def synchronize(reference, stream, key=lambda x: x, unused=None):
 473     """Just the right side part of leftjoin(), Nones included"""
 474     for left, right in leftjoin(reference, stream, key, unused):
 475         yield right
 476
 477 def yield_source_csv_messages(cls, foreign_cls, csvreader, force_column=None):
 478     """Yield all messages from one source CSV file.
 479     """
 480     columns = list(cls.__table__.c)
 481     column_names = csvreader.next()
 482     # Assumptions: rows are in lexicographic order
 483     #  (taking numeric values as numbers of course)
 484     # Assumptions about the order of columns:
 485     # 1. It's the same in the table and in CSV
 486     # 2. Primary key is at the beginning
 487     # 3. First thing in the PK is the object id
 488     # 4. Last thing in the PK is the language
 489     # 5. Everything that follows is some translatable text
 490     assert [cls.__table__.c[name] for name in column_names] == columns, ','.join(c.name for c in columns)
 491     pk = columns[:len(cls.__table__.primary_key.columns)]
 492     first_string_index = len(pk)
 493     return _yield_csv_messages(foreign_cls, columns, first_string_index, csvreader, force_column=force_column)
 494
 495 def _yield_csv_messages(foreign_cls, columns, first_string_index, csvreader, origin='source CSV', crc_value='OFFICIAL', force_column=None):
 496     language_index = first_string_index - 1
 497     assert 'language' in columns[language_index].name, columns[language_index].name
 498     string_columns = columns[first_string_index:]
 499     if force_column is not None:
 500         assert len(string_columns) == 1
 501         string_columns = [force_column]
 502     for values in csvreader:
 503         id = int(values[0])
 504         messages = []
 505         for string, column in zip(values[first_string_index:], string_columns):
 506             message = Message(
 507                     foreign_cls.__name__,
 508                     id,
 509                     column.name,
 510                     string.decode('utf-8'),
 511                     column.type.length,
 512                     pot=pot_for_column(cls, column, force_column is not None),
 513                     origin=origin,
 514                     official=True,
 515                     source_crc=crc_value,
 516                     language_id=int(values[language_index]),
 517                 )
 518             messages.append(message)
 519         messages.sort()
 520         for message in messages:
 521             yield message
 522
 523 def yield_guessed_csv_messages(file):
 524     """Yield messages from a CSV file, using the header to figure out what the data means.
 525     """
 526     csvreader = csv.reader(file, lineterminator='\n')
 527     column_names = csvreader.next()
 528     if column_names == 'language_id,table,id,column,source_crc,string'.split(','):
 529         # A translation CSV
 530         return yield_translation_csv_messages(file, True)
 531     # Not a translation CSV, figure out what the columns mean
 532     assert column_names[0].endswith('_id')
 533     assert column_names[1] == 'local_language_id'
 534     first_string_index = 2
 535     foreign_singlename = column_names[0][:-len('_id')]
 536     columns = [None] * len(column_names)
 537     column_indexes = dict((name, i) for i, name in enumerate(column_names))
 538     for foreign_cls in toplevel_classes:
 539         if foreign_cls.__singlename__ == foreign_singlename:
 540             break
 541     else:
 542         raise ValueError("Foreign key column name %s in %s doesn't correspond to a table" % (column_names[0], file))
 543     for translation_class in foreign_cls.translation_classes:
 544         for column in translation_class.__table__.c:
 545             column_index = column_indexes.get(column.name)
 546             if column_index is not None:
 547                 columns[column_index] = column
 548     assert all([c is not None for c in columns[first_string_index:]])
 549     return _yield_csv_messages(foreign_cls, columns, first_string_index, csvreader, origin=file.name, crc_value='UNKNOWN')
 550
 551 def yield_translation_csv_messages(file, no_header=False):
 552     """Yield messages from a translation CSV file
 553     """
 554     csvreader = csv.reader(file, lineterminator='\n')
 555     if not no_header:
 556         columns = csvreader.next()
 557         assert columns == 'language_id,table,id,column,source_crc,string'.split(',')
 558     for language_id, table, id, column, source_crc, string in csvreader:
 559         yield Message(
 560                 table,
 561                 int(id),
 562                 column,
 563                 string.decode('utf-8'),
 564                 origin='target CSV',
 565                 source_crc=source_crc,
 566                 language_id=int(language_id),
 567             )
 568
 569 def pot_for_column(cls, column, summary=False):
 570     """Translatable texts get categorized into different POT files to help
 571        translators prioritize. The pots are:
 572
 573     - flavor: Flavor texts: here, strings from multiple versions are summarized
 574     - ripped: Strings ripped from the games; translators for "official"
 575       languages don't need to bother with these
 576     - effects: Fanon descriptions of things; they usually use technical
 577       language
 578     - misc: Everything else; usually small texts
 579
 580     Set source to true if this is a flavor summary column. Others are
 581     determined by the column itself.
 582     """
 583     if summary:
 584         return 'flavor'
 585     elif column.info.get('ripped'):
 586         return 'ripped'
 587     elif column.name.endswith('effect'):
 588         return 'effects'
 589     else:
 590         return 'misc'
 591
 592 def number_replace(source, string):
 593     numbers_iter = iter(number_re.findall(source))
 594     next_number = lambda match: numbers_iter.next()
 595     return re.sub(r'\{num\}', next_number, string)
 596
 597 def match_to_source(source, *translations):
 598     """Matches translated string(s) to source
 599
 600     The first translation whose source matches the source message, or whose CRC
 601     matches, or which is official, and which is not fuzzy, it is used.
 602     If thre's no such translation, the first translation is used.
 603
 604     Returns (source, source string CRC, string for CSV file, exact match?)
 605     If there are no translations, returns (source, None, None, None)
 606
 607     Handles translations where numbers have been replaced by {num}, if they
 608     have source information.
 609     """
 610     first = True
 611     best_crc = None
 612     for translation in translations:
 613         if translation is None:
 614             continue
 615         if translation.number_replacement:
 616             current_string = number_replace(source.string, translation.string)
 617             current_source = number_replace(source.string, translation.source)
 618             current_crc = crc(current_source)
 619         elif '{num}' in translation.string:
 620             print (u'Warning: {num} appears in %s, but not marked for number replacement. Discarding!' % translation).encode('utf-8')
 621             continue
 622         else:
 623             current_string = translation.string
 624             current_source = translation.source
 625             current_crc = translation.source_crc
 626         if translation.fuzzy:
 627             match = False
 628         elif translation.official:
 629             match = True
 630         elif current_source:
 631             match = source.string == current_source
 632         else:
 633             match = current_crc == crc(source.string)
 634         if first or match:
 635             best_string = current_string
 636             best_crc = current_crc
 637             best_message = translation
 638         if match:
 639             break
 640         first = False
 641     if best_crc:
 642         return source, best_crc, best_string, match
 643     else:
 644         return source, None, None, None
 645
 646 def merge_translations(source_stream, *translation_streams, **kwargs):
 647     """For each source message, get its best translation from translations.
 648
 649     Translations should be ordered by priority, highest to lowest.
 650
 651     Messages that don't appear in translations at all aren't included.
 652     """
 653     source = tuple(source_stream)
 654     streams = [
 655             synchronize(source, t, key=lambda m: m.merge_key, unused=kwargs.get('unused'))
 656             for t in translation_streams
 657         ]
 658     for messages in itertools.izip(source, *streams):
 659         yield match_to_source(*messages)