Reading, merging, and writing translations
[zzz-pokedex.git] / pokedex / db /
1 #! /usr/bin/env python
2 u"""General handling of translations
4 The general idea is to get messages from somewhere: the source pokedex CSVs,
5 or the translation CSVs, etc., then merge them together in some way, and shove
6 them into the database.
8 If a message is translated, it has a source string attached to it, with the
9 original English version. Or at least it has a CRC of the original.
10 When that doesn't match, it means the English string changed and the
11 translation has to be updated.
12 Also this is why we can't dump translations from the database: there's no
13 original string info.
15 Some complications:
17 Flavor text is so repetitive that we take strings from all the versions,
18 separate the unique ones by blank lines, let translators work on that, and then
19 put it in flavor_summary tables.
21 Route names and other repetitive numeric things are replaced by e.g.
22 "Route {num}" so translators only have to work on each set once.
24 """
26 import binascii
27 import csv
28 import heapq
29 import itertools
30 import os
31 import re
32 import sys
33 from collections import defaultdict
35 from pokedex.db import tables
36 from pokedex.defaults import get_default_csv_dir
# Language that translations are made from, unless told otherwise
default_source_lang = 'en'

# Top-level classes we want translations for: in order, and by name
# These are all mapped_classes that have translatable texts and aren't summarized
toplevel_classes = []
toplevel_class_by_name = {}

# summary_map[pokemon_prose]['flavor_summary'] == PokemonFlavorTexts
summary_map = {}

# translation_class_by_column[cls, column_name] == translation_class
# (the first part of the key is the mapped class itself, not its name)
translation_class_by_column = {}

for cls in tables.mapped_classes:
    try:
        summary_class, col = cls.summary_column
    except AttributeError:
        # Not a summary class: it is "top-level" if it has translatable texts
        if cls.translation_classes:
            toplevel_classes.append(cls)
            toplevel_class_by_name[cls.__name__] = cls
            for translation_class in cls.translation_classes:
                for column in translation_class.__table__.c:
                    translation_class_by_column[cls, column.name] = translation_class
    else:
        # A summary class: remember which column of which class it summarizes
        summary_map.setdefault(summary_class, {})[col] = cls

# Matches a run of digits; used for the {num} placeholder handling
number_re = re.compile("[0-9]+")
def crc(string):
    """Return the hash that identifies a source string in translation CSV files.

    The hash is the CRC-32 of the UTF-8 encoded text, written as eight
    lowercase hexadecimal digits.
    """
    # Two special values are also used in source_crc instead of a real hash:
    #   UNKNOWN: no source string was available
    #   OFFICIAL: an official string from the main database
    encoded = string.encode('utf-8')
    checksum = binascii.crc32(encoded) & 0xffffffff  # keep the value unsigned
    return format(checksum, '08x')
class Message(object):
    """Holds all info about a translatable or translated string

    cls: Name of the mapped class the message belongs to
    id: The id of the thing the message belongs to
    colname: name of the database column
    strings: A list of strings in the message, usually of length 1.

    Optional attributes (None if not set):
    colsize: Max length of the database column
    source: The string this was translated from
    number_replacement: True if this is a translation with {num} placeholders
    pot: Name of the pot the message goes to (see pot_for_column)
    source_crc: CRC of the source
    origin: Some indication of where the string came from (CSV, PO, ...)
    fuzzy: True for fuzzy translations
    language_id: ID of the language
    official: True if this is a known-good translation

    Messages order by sort_key (merge_key, language_id, fuzzy); equality
    additionally takes the strings into account.
    """
    __slots__ = 'cls id colname strings colsize source number_replacement pot source_crc origin fuzzy language_id official'.split()

    def __init__(self, cls, id, colname, string,
            colsize=None, source=None, number_replacement=None, pot=None,
            source_crc=None, origin=None, fuzzy=None, language_id=None,
            official=None,
        ):
        self.cls = cls
        self.id = id
        self.colname = colname
        self.strings = [string]
        self.colsize = colsize
        self.source = source
        self.number_replacement = number_replacement
        self.pot = pot
        self.source_crc = source_crc
        if source and not source_crc:
            # No explicit CRC given: derive it from the source string
            self.source_crc = crc(source)
        self.origin = origin
        self.fuzzy = fuzzy
        self.language_id = language_id
        self.official = official

    def merge(self, other):
        """Merge two messages, as required for flavor text summarizing

        Strings of `other` that this message doesn't have yet are appended.
        The source-related attributes are reset, since they no longer
        describe the combined text.
        """
        assert self.merge_key == other.merge_key
        for string in other.strings:
            if string not in self.strings:
                self.strings.append(string)
        self.colsize = self.colsize or other.colsize
        self.pot = self.pot or other.pot
        self.source = None
        self.source_crc = None
        self.number_replacement = None

    @property
    def string(self):
        """All strings of the message, separated by blank lines"""
        return '\n\n'.join(self.strings)

    @property
    def merge_key(self):
        """(class name, object id, column name): identifies the DB cell"""
        return self.cls, self.id, self.colname

    @property
    def sort_key(self):
        """Key the messages are ordered by"""
        return self.merge_key, self.language_id, self.fuzzy

    @property
    def eq_key(self):
        """Key the messages are compared for equality by"""
        return self.sort_key, self.strings

    def __eq__(self, other): return self.eq_key == other.eq_key
    def __ne__(self, other): return self.eq_key != other.eq_key
    def __gt__(self, other): return self.sort_key > other.sort_key
    def __lt__(self, other): return self.sort_key < other.sort_key
    def __ge__(self, other): return self.sort_key >= other.sort_key
    def __le__(self, other): return self.sort_key <= other.sort_key

    def __unicode__(self):
        string = '"%s"' % self.string
        if len(string) > 20:
            # Keep the description short: abbreviate long texts
            string = string[:15] + u'"...'
        template = u'<Message from {self.origin} for {self.cls}.{self.colname}:{self.id} -- {string}>'
        return template.format(self=self, string=string)

    def __str__(self):
        text = self.__unicode__()
        if str is bytes:
            # Python 2: str() and repr() have to return byte strings
            return text.encode('utf-8')
        return text

    __repr__ = __str__
class Translations(object):
    """Data and operations specific to a location on disk (and a source language)

    csv_directory holds the source CSV files (one per table);
    translation_directory holds one translation CSV per language.
    """
    def __init__(self, source_lang=default_source_lang, csv_directory=None, translation_directory=None):
        """Remember the directories and read the language table.

        source_lang: identifier of the language that is translated from;
            None or an empty value selects default_source_lang
        csv_directory: directory with the source CSVs; when None,
            get_default_csv_dir() is used
        translation_directory: directory with the translation CSVs; when
            None, the 'translations' subdirectory of csv_directory is used
        """
        if csv_directory is None:
            csv_directory = get_default_csv_dir()

        if translation_directory is None:
            translation_directory = os.path.join(csv_directory, 'translations')

        # Use the requested source language, not unconditionally the default
        self.source_lang = source_lang or default_source_lang
        self.csv_directory = csv_directory
        self.translation_directory = translation_directory

        # identifier -> id, id -> identifier, and identifiers of official languages
        self.language_ids = {}
        self.language_identifiers = {}
        self.official_langs = []
        for row in self.reader_for_class(tables.Language, reader_class=csv.DictReader):
            self.language_ids[row['identifier']] = int(row['id'])
            self.language_identifiers[int(row['id'])] = row['identifier']
            if row['official'] and int(row['official']):
                self.official_langs.append(row['identifier'])

        self.source_lang_id = self.language_ids[self.source_lang]

    @classmethod
    def from_parsed_options(cls, options):
        """Alternate constructor: build an instance from a parsed options object.

        NOTE(review): everything after `options.source_lang` was cut off in
        the reviewed source. The CSV directory is assumed to be available as
        `options.directory` -- confirm against the code that builds `options`.
        """
        return cls(options.source_lang, getattr(options, 'directory', None))

    @property
    def source(self):
        """All source (i.e. English) messages
        """
        return self.official_messages(self.source_lang)

    def official_messages(self, lang):
        """All official messages (i.e. from main database) for the given lang

        Returns a tuple of messages; an empty one if there are none.
        """
        # Cached as tuples, since they're used pretty often
        lang_id = self.language_ids[lang]
        try:
            return self._sources[lang_id]
        except AttributeError:
            # First call: read the messages of all languages and cache them
            self._sources = {}
            for message in self.yield_source_messages():
                self._sources.setdefault(message.language_id, []).append(message)
            self._sources = dict((k, tuple(merge_adjacent(v))) for k, v in self._sources.items())
            return self.official_messages(lang)
        except KeyError:
            # Looks like there are no messages in the DB for this language
            # This should only happen for non-official languages
            assert lang not in self.official_langs
            return ()

    def write_translations(self, lang, *streams):
        """Write a translation CSV containing messages from streams.

        Streams should be ordered by priority, from highest to lowest.

        Any official translations (from the main database) are added automatically.

        Strings marked OFFICIAL are not written. Strings that are too long
        for their database column are not written either; for each
        (class, column) the longest offender is reported on standard output.
        """
        writer = self.writer_for_lang(lang)

        writer.writerow('language_id table id column source_crc string'.split())

        messages = merge_translations(self.source, self.official_messages(lang), *streams)

        # (class name, column name) -> (length, source message, string)
        warnings = {}
        for source, sourcehash, string, exact in messages:
            if string and sourcehash != 'OFFICIAL':
                utf8len = len(string.encode('utf-8'))
                if source.colsize and utf8len > source.colsize:
                    key = source.cls, source.colname
                    warnings[key] = max(warnings.get(key, (0,)), (utf8len, source, string))
                else:
                    writer.writerow((
                            self.language_ids[lang],
                            source.cls,
                            source.id,
                            source.colname,
                            sourcehash,
                            string.encode('utf-8'),
                        ))
        for utf8len, source, string in warnings.values():
            template = u'Error: {size}B value for {colsize}B column! {key[0]}.{key[2]}:{key[1]}: {string}'
            warning = template.format(
                    key=source.merge_key,
                    string=string,
                    size=utf8len,
                    colsize=source.colsize,
                )
            if len(warning) > 79:
                warning = warning[:76] + u'...'
            print(warning.encode('utf-8'))

    def reader_for_class(self, cls, reader_class=csv.reader):
        """Return a CSV reader for the source CSV file of the given mapped class

        The file is <csv_directory>/<table name>.csv.
        """
        tablename = cls.__table__.name
        csvpath = os.path.join(self.csv_directory, tablename + '.csv')
        return reader_class(open(csvpath, 'rb'), lineterminator='\n')

    def writer_for_lang(self, lang):
        """Return a CSV writer for the translation CSV of the given language

        The file is <translation_directory>/<lang>.csv; it is overwritten.
        """
        csvpath = os.path.join(self.translation_directory, '%s.csv' % lang)
        return csv.writer(open(csvpath, 'wb'), lineterminator='\n')

    def yield_source_messages(self, language_id=None):
        """Yield all messages from source CSV files

        Messages from all languages are returned. The messages are not ordered
        properly, but splitting the stream by language (and filtering results
        by merge_adjacent) will produce proper streams.

        language_id is accepted, but it is not used to filter the output.
        """
        if language_id is None:
            language_id = self.source_lang_id

        for cls in sorted(toplevel_classes, key=lambda c: c.__name__):
            streams = []
            for translation_class in cls.translation_classes:
                streams.append(yield_source_csv_messages(
                        translation_class,
                        cls,
                        self.reader_for_class(translation_class),
                    ))
                try:
                    colmap = summary_map[translation_class]
                except KeyError:
                    pass
                else:
                    # Texts from summary classes are attributed to the
                    # summarized column of the translation class
                    for colname, summary_class in colmap.items():
                        column = translation_class.__table__.c[colname]
                        streams.append(yield_source_csv_messages(
                                summary_class,
                                cls,
                                self.reader_for_class(summary_class),
                                force_column=column,
                            ))
            for message in Merge(*streams):
                yield message

    def yield_target_messages(self, lang):
        """Yield messages from the <translation_directory>/<lang>.csv file

        If the file can't be opened, there are no messages.
        """
        # Read from the same place writer_for_lang() writes to
        path = os.path.join(self.translation_directory, '%s.csv' % lang)
        try:
            file = open(path, 'rb')
        except IOError:
            return ()
        return yield_translation_csv_messages(file)

    def yield_all_translations(self):
        """Return an iterator over the non-official messages of all languages"""
        stream = Merge()
        for lang in self.language_identifiers.values():
            stream.add_iterator(self.yield_target_messages(lang))
        return (message for message in stream if not message.official)

    def get_load_data(self, langs=None):
        """Yield (translation_class, data for INSERT) pairs for loading into the DB

        langs is either a list of language identifiers or None; None means
        all known languages.
        """
        if langs is None:
            langs = self.language_identifiers.values()
        stream = Merge()
        for lang in langs:
            stream.add_iterator(self.yield_target_messages(lang))
        stream = (message for message in stream if not message.official)
        count = 0
        class GroupDict(dict):
            """Dict to automatically set the foreign_id and local_language_id for new items
            """
            def __missing__(self, key):
                # depends on `cls` and `translation_class` from outside scope
                id, language_id = key
                data = self[key] = defaultdict(lambda: None)
                column_names = (c.name for c in translation_class.__table__.columns)
                data.update(dict.fromkeys(column_names))
                data.update({
                        '%s_id' % cls.__singlename__: id,
                        'local_language_id': language_id,
                    })
                return data
        # Nested dict:
        # translation_class -> (lang, id) -> column -> value
        everything = defaultdict(GroupDict)
        # Group by object so we always have all of the messages for one DB row
        for (cls_name, id), group in group_by_object(stream):
            cls = toplevel_class_by_name[cls_name]
            for message in group:
                translation_class = translation_class_by_column[cls, message.colname]
                key = id, message.language_id
                colname = str(message.colname)
                everything[translation_class][key][colname] = message.string
                count += 1
            if count > 1000:
                # Hand the accumulated rows over in batches
                for translation_class, key_data in everything.items():
                    yield translation_class, key_data.values()
                count = 0
                everything.clear()
        for translation_class, data_dict in everything.items():
            yield translation_class, data_dict.values()
def group_by_object(stream):
    """Group stream by object

    Yields ((class name, object ID), (list of messages)) pairs.

    Only adjacent messages are grouped, so the stream should be ordered by
    object. An empty stream yields nothing.
    """
    def object_key(message):
        return message.cls, message.id

    for current_key, group in itertools.groupby(stream, key=object_key):
        yield current_key, list(group)
class Merge(object):
    """Merge several sorted iterators together

    Additional iterators may be added at any time with add_iterator.
    Accepts None for the initial iterators
    If the same value appears in more iterators, there will be duplicates in
    the output.
    """
    def __init__(self, *iterators):
        # Heap of (next value, iterator index, iterator) entries
        self.next_values = []
        self._iterator_count = 0
        for iterator in iterators:
            if iterator is not None:
                self.add_iterator(iterator)

    def add_iterator(self, iterator):
        """Add another sorted iterable to the merge"""
        self._iterator_count += 1
        self._push(iter(iterator), self._iterator_count)

    def _push(self, iterator, index):
        """Put the iterator's next value on the heap; drop exhausted iterators"""
        try:
            value = next(iterator)
        except StopIteration:
            return
        else:
            # The index breaks ties between equal values, so the iterator
            # objects themselves never get compared
            heapq.heappush(self.next_values, (value, index, iterator))

    def __iter__(self):
        return self

    def __next__(self):
        if self.next_values:
            value, index, iterator = heapq.heappop(self.next_values)
            self._push(iterator, index)
            return value
        else:
            raise StopIteration

    # Python 2 spelling of the iterator protocol
    next = __next__
def merge_adjacent(gen):
    """Merge adjacent messages that share a merge_key

    Each run of messages with the same merge_key is folded into its first
    message (using its merge() method), which is then yielded.
    The input has to be sorted; AssertionError is raised if it is not.
    An empty input yields nothing.
    """
    gen = iter(gen)
    try:
        last = next(gen)
    except StopIteration:
        # Nothing to merge
        return
    for this in gen:
        if this.merge_key == last.merge_key:
            last.merge(this)
        elif last < this:
            yield last
            last = this
        else:
            raise AssertionError('Bad order, %s > %s' % (last, this))
    yield last
def leftjoin(left_stream, right_stream, key=lambda x: x, unused=None):
    """A "left join" operation on sorted iterators

    Yields (left, right) pairs, where left comes from left_stream and right
    is the corresponding item from right, or None

    Note that if there are duplicates in right_stream, you won't get duplicate
    rows for them.

    If given, unused should be a one-arg function that will get called on all
    unused items in right_stream.

    Items are matched by comparing key(left) with key(right).
    """
    left_stream = iter(left_stream)
    right_stream = iter(right_stream)
    # Marks the end of right_stream; a fresh object can't clash with any item
    exhausted = object()

    right = next(right_stream, exhausted)
    for left in left_stream:
        # Skip (and report) right items that sort before the current left one
        while right is not exhausted and right and key(left) > key(right):
            if unused is not None:
                unused(right)
            right = next(right_stream, exhausted)
        if right is not exhausted and key(left) == key(right):
            yield left, right
            right = next(right_stream, exhausted)
        else:
            yield left, None

    # Whatever is left over on the right side wasn't matched by anything
    if unused is not None:
        if right is not exhausted:
            unused(right)
        for right in right_stream:
            unused(right)
def synchronize(reference, stream, key=lambda x: x, unused=None):
    """Yield, for each item of reference, its partner from stream, or None

    This is leftjoin() with the left half of every pair dropped.
    """
    joined = leftjoin(reference, stream, key, unused)
    for pair in joined:
        yield pair[1]
def yield_source_csv_messages(cls, foreign_cls, csvreader, force_column=None):
    """Yield all messages from one source CSV file.

    cls: mapped class whose table the CSV file holds
    foreign_cls: class the resulting messages are attributed to
    csvreader: CSV reader positioned at the header row of the file
    force_column: passed on to _yield_csv_messages
    """
    columns = list(cls.__table__.c)
    # The header row; consuming it leaves only data rows in the reader
    column_names = next(csvreader)
    # Assumptions: rows are in lexicographic order
    #  (taking numeric values as numbers of course)
    # Assumptions about the order of columns:
    #  1. It's the same in the table and in CSV
    #  2. Primary key is at the beginning
    #  3. First thing in the PK is the object id
    #  4. Last thing in the PK is the language
    #  5. Everything that follows is some translatable text
    assert [cls.__table__.c[name] for name in column_names] == columns, ','.join(c.name for c in columns)
    pk = columns[:len(cls.__table__.primary_key.columns)]
    first_string_index = len(pk)
    return _yield_csv_messages(foreign_cls, columns, first_string_index, csvreader, force_column=force_column)
def _yield_csv_messages(foreign_cls, columns, first_string_index, csvreader, origin='source CSV', crc_value='OFFICIAL', force_column=None):
    """Yield a Message for each translatable value in the rows of csvreader

    foreign_cls: class the messages are attributed to
    columns: table columns, in the order of the CSV fields
    first_string_index: index of the first translatable field; the field
        just before it has to be the language id
    csvreader: iterable of data rows (header already consumed)
    origin: stored as the origin of the messages
    crc_value: stored as the source_crc of the messages
    force_column: if given, the single text column of the file is treated as
        this column instead (used for summaries)

    The messages of each row are yielded in sorted order.
    """
    language_index = first_string_index - 1
    assert 'language' in columns[language_index].name, columns[language_index].name
    string_columns = columns[first_string_index:]
    if force_column is not None:
        assert len(string_columns) == 1
        string_columns = [force_column]
    for values in csvreader:
        id = int(values[0])
        messages = []
        for string, column in zip(values[first_string_index:], string_columns):
            message = Message(
                    foreign_cls.__name__,
                    id,
                    column.name,
                    string.decode('utf-8'),
                    column.type.length,
                    # Pass the class these messages belong to, not whatever
                    # the module-level name `cls` happens to be bound to
                    pot=pot_for_column(foreign_cls, column, force_column is not None),
                    origin=origin,
                    official=True,
                    source_crc=crc_value,
                    language_id=int(values[language_index]),
                )
            messages.append(message)
        messages.sort()
        for message in messages:
            yield message
def yield_guessed_csv_messages(file):
    """Yield messages from a CSV file, using the header to figure out what the data means.

    A file with the translation CSV header is read as a translation CSV.
    Otherwise the first column has to be named <singlename>_id, for one of
    the toplevel classes, and the second one local_language_id; the
    remaining columns are looked up in that class's translation tables.

    Raises ValueError if the first column doesn't correspond to a table.
    """
    csvreader = csv.reader(file, lineterminator='\n')
    column_names = next(csvreader)
    if column_names == 'language_id,table,id,column,source_crc,string'.split(','):
        # A translation CSV
        return yield_translation_csv_messages(file, True)
    # Not a translation CSV, figure out what the columns mean
    assert column_names[0].endswith('_id')
    assert column_names[1] == 'local_language_id'
    first_string_index = 2
    foreign_singlename = column_names[0][:-len('_id')]
    columns = [None] * len(column_names)
    column_indexes = dict((name, i) for i, name in enumerate(column_names))
    for foreign_cls in toplevel_classes:
        if foreign_cls.__singlename__ == foreign_singlename:
            break
    else:
        raise ValueError("Foreign key column name %s in %s doesn't correspond to a table" % (column_names[0], file))
    for translation_class in foreign_cls.translation_classes:
        for column in translation_class.__table__.c:
            column_index = column_indexes.get(column.name)
            if column_index is not None:
                columns[column_index] = column
    assert all([c is not None for c in columns[first_string_index:]])
    # NOTE(review): the argument between `csvreader` and `crc_value` was lost
    # in the reviewed source; the file's name is assumed to be the intended
    # origin of the messages -- confirm.
    origin = getattr(file, 'name', None) or repr(file)
    return _yield_csv_messages(foreign_cls, columns, first_string_index, csvreader, origin=origin, crc_value='UNKNOWN')
def yield_translation_csv_messages(file, no_header=False):
    """Yield messages from a translation CSV file

    file: open translation CSV file
    no_header: true if the header row has already been consumed; otherwise
        the header is read and checked first
    """
    csvreader = csv.reader(file, lineterminator='\n')
    if not no_header:
        columns = next(csvreader)
        assert columns == 'language_id,table,id,column,source_crc,string'.split(',')
    for language_id, table, id, column, source_crc, string in csvreader:
        yield Message(
                table,
                int(id),
                column,
                string.decode('utf-8'),
                origin='target CSV',
                source_crc=source_crc,
                language_id=int(language_id),
            )
def pot_for_column(cls, column, summary=False):
    """Translatable texts get categorized into different POT files to help
       translators prioritize. The pots are:

    - flavor: Flavor texts: here, strings from multiple versions are summarized
    - ripped: Strings ripped from the games; translators for "official"
      languages don't need to bother with these
    - effects: Fanon descriptions of things; they usually use technical
      language
    - misc: Everything else; usually small texts

    Set summary to true if this is a flavor summary column. Others are
    determined by the column itself.

    cls is accepted, but it does not influence the result.
    """
    # NOTE(review): the conditions of both `elif` branches were lost in the
    # reviewed source (only `'ripped')` and `'effect')` survived). They are
    # assumed to be a 'ripped' flag in the column's info dict and an 'effect'
    # suffix of the column name -- confirm against the table definitions.
    if summary:
        return 'flavor'
    elif column.info.get('ripped'):
        return 'ripped'
    elif column.name.endswith('effect'):
        return 'effects'
    else:
        return 'misc'
def number_replace(source, string):
    """Fill the {num} placeholders of string with the numbers found in source

    The n-th {num} is replaced by the n-th run of digits in source.
    """
    numbers_iter = iter(number_re.findall(source))

    def next_number(match):
        # Each placeholder consumes one number, in order of appearance
        return next(numbers_iter)

    return re.sub(r'\{num\}', next_number, string)
def match_to_source(source, *translations):
    """Matches translated string(s) to source

    The first translation whose source matches the source message, or whose CRC
    matches, or which is official, and which is not fuzzy, is used.
    If there's no such translation, the first translation is used.

    Returns (source, source string CRC, string for CSV file, exact match?)
    If there are no translations, returns (source, None, None, None)

    Handles translations where numbers have been replaced by {num}, if they
    have source information.

    None entries in translations are skipped.
    """
    first = True
    best_crc = None
    for translation in translations:
        if translation is None:
            continue
        if translation.number_replacement:
            # Put the numbers of the source back in before comparing
            current_string = number_replace(source.string, translation.string)
            current_source = number_replace(source.string, translation.source)
            current_crc = crc(current_source)
        elif '{num}' in translation.string:
            print((u'Warning: {num} appears in %s, but not marked for number replacement. Discarding!' % translation).encode('utf-8'))
            continue
        else:
            current_string = translation.string
            current_source = translation.source
            current_crc = translation.source_crc
        if translation.fuzzy:
            match = False
        elif translation.official:
            match = True
        elif current_source:
            match = source.string == current_source
        else:
            # No source string to compare: fall back to comparing CRCs
            match = current_crc == crc(source.string)
        if first or match:
            best_string = current_string
            best_crc = current_crc
        if match:
            break
        first = False
    if best_crc:
        return source, best_crc, best_string, match
    else:
        return source, None, None, None
def merge_translations(source_stream, *translation_streams, **kwargs):
    """Pick the best translation for every source message.

    Translation streams should be ordered by priority, highest to lowest.
    Each of them is aligned to the source messages by merge_key, and the
    aligned messages are handed to match_to_source, whose results are
    yielded.

    The only keyword argument looked at is `unused`: a callback for
    translations that don't correspond to any source message.

    Messages that don't appear in translations at all aren't included.
    """
    source = tuple(source_stream)
    on_unused = kwargs.get('unused')

    def by_merge_key(message):
        return message.merge_key

    aligned = []
    for translations in translation_streams:
        aligned.append(synchronize(source, translations, key=by_merge_key, unused=on_unused))

    for row in itertools.izip(source, *aligned):
        yield match_to_source(*row)