2 u
"""General handling of translations
4 The general idea is to get messages from somewhere: the source pokedex CSVs,
5 or the translation CSVs, etc., then merge them together in some way, and shove
6 them into the database.
8 If a message is translated, it has a source string attached to it, with the
9 original English version. Or at least it has a CRC of the original.
10 When that doesn't match, it means the English string changed and the
11 translation has to be updated.
12 Also this is why we can't dump translations from the database: there's no
17 Flavor text is so repetitive that we take strings from all the versions,
18 separate the unique ones by blank lines, let translators work on that, and then
19 put it in flavor_summary tables.
21 Route names and other repetitive numeric things are replaced by e.g.
22 "Route {num}" so translators only have to work on each set once.
33 from collections
import defaultdict
35 from pokedex
.db
import tables
36 from pokedex
.defaults
import get_default_csv_dir
38 default_source_lang
= 'en'
40 # Top-level classes we want translations for: in order, and by name
41 # These are all mapped_classes that have translatable texts and aren't summarized
43 toplevel_class_by_name
= {}
45 # summary_map[pokemon_prose]['flavor_summary'] == PokemonFlavorTexts
48 # translation_class_by_column[class_name, column_name] == translation_class
49 translation_class_by_column
= {}
51 for cls
in tables
.mapped_classes
:
53 summary_class
, col
= cls
.summary_column
54 except AttributeError:
55 if cls
.translation_classes
:
56 toplevel_classes
.append(cls
)
57 toplevel_class_by_name
[cls
.__name__
] = cls
58 for translation_class
in cls
.translation_classes
:
59 for column
in translation_class
.__table__
.c
:
60 translation_class_by_column
[cls
, column
.name
] = translation_class
62 summary_map
.setdefault(summary_class
, {})[col
] = cls
64 number_re
= re
.compile("[0-9]+")
67 """Return a hash to we use in translation CSV files"""
68 return "%08x" %
(binascii
.crc32(string
.encode('utf-8')) & 0xffffffff)
69 # Two special values are also used in source_crc:
70 # UNKNOWN: no source string was available
71 # OFFICIAL: an official string from the main database
73 class Message(object):
74 """Holds all info about a translatable or translated string
76 cls: Name of the mapped class the message belongs to
77 id: The id of the thing the message belongs to
78 colname: name of the database column
79 strings: A list of strings in the message, usually of length 1.
81 Optional attributes (None if not set):
82 colsize: Max length of the database column
83 source: The string this was translated from
84 number_replacement: True if this is a translation with {num} placeholders
85 pot: Name of the pot the message goes to (see pot_for_column)
86 source_crc: CRC of the source
87 origin: Some indication of where the string came from (CSV, PO, ...)
88 fuzzy: True for fuzzy translations
89 language_id: ID of the language
90 official: True if this is a known-good translation
92 __slots__
= 'cls id colname strings colsize source number_replacement pot source_crc origin fuzzy language_id official'.split()
93 def __init__(self
, cls
, id, colname
, string
,
94 colsize
=None, source
=None, number_replacement
=None, pot
=None,
95 source_crc
=None, origin
=None, fuzzy
=None, language_id
=None,
100 self
.colname
= colname
101 self
.strings
= [string
]
102 self
.colsize
= colsize
104 self
.number_replacement
= number_replacement
106 self
.source_crc
= source_crc
107 if source
and not source_crc
:
108 self
.source_crc
= crc(source
)
111 self
.language_id
= language_id
112 self
.official
= official
114 def merge(self
, other
):
115 """Merge two messages, as required for flavor text summarizing
117 assert self
.merge_key
== other
.merge_key
118 for string
in other
.strings
:
119 if string
not in self
.strings
:
120 self
.strings
.append(string
)
121 self
.colsize
= self
.colsize
or other
.colsize
122 self
.pot
= self
.pot
or other
.pot
124 self
.source_crc
= None
125 self
.number_replacement
= None
129 return '\n\n'.join(self
.strings
)
133 return self
.cls
, self
.id, self
.colname
137 return self
.merge_key
, self
.language_id
, self
.fuzzy
141 return self
.sort_key
, self
.strings
143 def __eq__(self
, other
): return self
.eq_key
== other
.eq_key
144 def __ne__(self
, other
): return self
.eq_key
!= other
.eq_key
145 def __gt__(self
, other
): return self
.sort_key
> other
.sort_key
146 def __lt__(self
, other
): return self
.sort_key
< other
.sort_key
147 def __ge__(self
, other
): return self
.sort_key
>= other
.sort_key
148 def __le__(self
, other
): return self
.sort_key
<= other
.sort_key
150 def __unicode__(self
):
151 string
= '"%s"' % self
.string
153 string
= string
[:15] + u
'"...'
154 template
= u
'<Message from {self.origin} for {self.cls}.{self.colname}:{self.id} -- {string}>'
155 return template
.format(self
=self
, string
=string
)
158 return unicode(self
).encode('utf-8')
161 return unicode(self
).encode('utf-8')
163 class Translations(object):
164 """Data and operations specific to a location on disk (and a source language)
def __init__(self, source_lang=default_source_lang, csv_directory=None,
        translation_directory=None):
    """Remember where the CSV files live and load the language table.

    source_lang: identifier of the language translations are made from.
        None falls back to default_source_lang.
    csv_directory: directory holding the source CSVs; defaults to
        get_default_csv_dir().
    translation_directory: directory holding the translation CSVs;
        defaults to the 'translations' subdirectory of csv_directory.

    Raises KeyError if source_lang is not an identifier listed in the
    languages CSV.
    """
    if source_lang is None:
        source_lang = default_source_lang

    if csv_directory is None:
        csv_directory = get_default_csv_dir()

    if translation_directory is None:
        translation_directory = os.path.join(csv_directory, 'translations')

    # Use the language the caller asked for, not the module default;
    # source_lang_id below is derived from this value.
    self.source_lang = source_lang
    self.csv_directory = csv_directory
    self.translation_directory = translation_directory

    # identifier -> numeric id, numeric id -> identifier, and the
    # identifiers of languages flagged as official in the CSV
    self.language_ids = {}
    self.language_identifiers = {}
    self.official_langs = []
    for row in self.reader_for_class(tables.Language, reader_class=csv.DictReader):
        language_id = int(row['id'])
        identifier = row['identifier']
        self.language_ids[identifier] = language_id
        self.language_identifiers[language_id] = identifier
        # An empty 'official' cell counts as not official
        if row['official'] and int(row['official']):
            self.official_langs.append(identifier)

    self.source_lang_id = self.language_ids[self.source_lang]
def from_parsed_options(cls, options):
    """Build an instance from an already-parsed options object.

    cls: the class (or any callable) to instantiate.
    options: object exposing `source_lang` and `directory` attributes;
        they are handed to `cls` positionally, in that order.
    """
    source_lang = options.source_lang
    directory = options.directory
    return cls(source_lang, directory)
194 """All source (i.e. English) messages
196 return self
.official_messages(self
.source_lang
)
198 def official_messages(self
, lang
):
199 """All official messages (i.e. from main database) for the given lang
201 # Cached as tuples, since they're used pretty often
202 lang_id
= self
.language_ids
[lang
]
204 return self
._sources
[lang_id
]
205 except AttributeError:
207 for message
in self
.yield_source_messages():
208 self
._sources
.setdefault(message
.language_id
, []).append(message
)
209 self
._sources
= dict((k
, tuple(merge_adjacent(v
))) for k
, v
in self
._sources
.items())
210 return self
.official_messages(lang
)
212 # Looks like there are no messages in the DB for this language
213 # This should only happen for non-official languages
214 assert lang
not in self
.official_langs
217 def write_translations(self
, lang
, *streams
):
218 """Write a translation CSV containing messages from streams.
220 Streams should be ordered by priority, from highest to lowest.
222 Any official translations (from the main database) are added automatically.
224 writer
= self
.writer_for_lang(lang
)
226 writer
.writerow('language_id table id column source_crc string'.split())
228 messages
= merge_translations(self
.source
, self
.official_messages(lang
), *streams
)
231 for source
, sourcehash
, string
, exact
in messages
:
232 if string
and sourcehash
!= 'OFFICIAL':
233 utf8len
= len(string
.encode('utf-8'))
234 if source
.colsize
and utf8len
> source
.colsize
:
235 key
= source
.cls
, source
.colname
236 warnings
[key
] = max(warnings
.get(key
, (0,)), (utf8len
, source
, string
))
239 self
.language_ids
[lang
],
244 string
.encode('utf-8'),
246 for utf8len
, source
, string
in warnings
.values():
247 template
= u
'Error: {size}B value for {colsize}B column! {key[0]}.{key[2]}:{key[1]}: {string}'
248 warning
= template
.format(
249 key
=source
.merge_key
,
252 colsize
=source
.colsize
,
254 if len(warning
) > 79:
255 warning
= warning
[:76] + u
'...'
256 print warning
.encode('utf-8')
def reader_for_class(self, cls, reader_class=csv.reader):
    """Open the CSV file backing a mapped class and wrap it in a reader.

    cls: mapped class; the file read is <csv_directory>/<table name>.csv,
        where the table name comes from cls.__table__.name.
    reader_class: callable invoked as reader_class(file, lineterminator='\n');
        csv.reader unless overridden.

    The file is opened in binary mode and is not closed here; it stays
    open for as long as the returned reader keeps a reference to it.
    """
    filename = cls.__table__.name + '.csv'
    source_file = open(os.path.join(self.csv_directory, filename), 'rb')
    return reader_class(source_file, lineterminator='\n')
def writer_for_lang(self, lang):
    """Create (or truncate) the translation CSV for `lang` and return a writer.

    The file is <translation_directory>/<lang>.csv, opened in binary write
    mode. The handle is not closed here; it lives as long as the returned
    csv.writer does.
    """
    filename = '%s.csv' % lang
    target_path = os.path.join(self.translation_directory, filename)
    target_file = open(target_path, 'wb')
    return csv.writer(target_file, lineterminator='\n')
267 def yield_source_messages(self
, language_id
=None):
268 """Yield all messages from source CSV files
270 Messages from all languages are returned. The messages are not ordered
271 properly, but splitting the stream by language (and filtering results
272 by merge_adjacent) will produce proper streams.
274 if language_id
is None:
275 language_id
= self
.source_lang_id
277 for cls
in sorted(toplevel_classes
, key
=lambda c
: c
.__name__
):
279 for translation_class
in cls
.translation_classes
:
280 streams
.append(yield_source_csv_messages(
283 self
.reader_for_class(translation_class
),
286 colmap
= summary_map
[translation_class
]
290 for colname
, summary_class
in colmap
.items():
291 column
= translation_class
.__table__
.c
[colname
]
292 streams
.append(yield_source_csv_messages(
295 self
.reader_for_class(summary_class
),
298 for message
in Merge(*streams
):
301 def yield_target_messages(self
, lang
):
302 """Yield messages from the data/csv/translations/<lang>.csv file
304 path
= os
.path
.join(self
.csv_directory
, 'translations', '%s.csv' % lang
)
306 file = open(path
, 'rb')
309 return yield_translation_csv_messages(file)
311 def yield_all_translations(self
):
313 for lang
in self
.language_identifiers
.values():
314 stream
.add_iterator(self
.yield_target_messages(lang
))
315 return (message
for message
in stream
if not message
.official
)
317 def get_load_data(self
, langs
=None):
318 """Yield (translation_class, data for INSERT) pairs for loading into the DB
320 langs is either a list of language identifiers or None
323 langs
= self
.language_identifiers
.values()
325 for lang
in self
.language_identifiers
.values():
326 stream
.add_iterator(self
.yield_target_messages(lang
))
327 stream
= (message
for message
in stream
if not message
.official
)
329 class GroupDict(dict):
330 """Dict to automatically set the foreign_id and local_language_id for new items
332 def __missing__(self
, key
):
333 # depends on `cls` from outside scope
334 id, language_id
= key
335 data
= self
[key
] = defaultdict(lambda: None)
336 column_names
= (c
.name
for c
in translation_class
.__table__
.columns
)
337 data
.update(dict.fromkeys(column_names
))
339 '%s_id' % cls
.__singlename__
: id,
340 'local_language_id': language_id
,
344 # translation_class -> (lang, id) -> column -> value
345 everything
= defaultdict(GroupDict
)
346 # Group by object so we always have all of the messages for one DB row
347 for (cls_name
, id), group
in group_by_object(stream
):
348 cls
= toplevel_class_by_name
[cls_name
]
349 for message
in group
:
350 translation_class
= translation_class_by_column
[cls
, message
.colname
]
351 key
= id, message
.language_id
352 colname
= str(message
.colname
)
353 everything
[translation_class
][key
][colname
] = message
.string
356 for translation_class
, key_data
in everything
.items():
357 yield translation_class
, key_data
.values()
360 for translation_class
, data_dict
in everything
.items():
361 yield translation_class
, data_dict
.values()
363 def group_by_object(stream
):
364 """Group stream by object
366 Yields ((class name, object ID), (list of messages)) pairs.
368 stream
= iter(stream
)
369 current
= stream
.next()
370 current_key
= current
.cls
, current
.id
372 for message
in stream
:
373 if (message
.cls
, message
.id) != current_key
:
374 yield current_key
, group
376 group
.append(message
)
378 current_key
= current
.cls
, current
.id
379 yield current_key
, group
382 """Merge several sorted iterators together
384 Additional iterators may be added at any time with add_iterator.
385 Accepts None for the initial iterators
386 If the same value appears in more iterators, there will be duplicates in
389 def __init__(self
, *iterators
):
390 self
.next_values
= []
391 for iterator
in iterators
:
392 if iterator
is not None:
393 self
.add_iterator(iterator
)
395 def add_iterator(self
, iterator
):
396 iterator
= iter(iterator
)
398 value
= iterator
.next()
399 except StopIteration:
402 heapq
.heappush(self
.next_values
, (value
, iterator
))
409 value
, iterator
= heapq
.heappop(self
.next_values
)
410 self
.add_iterator(iterator
)
415 def merge_adjacent(gen
):
416 """Merge adjacent messages that compare equal"""
420 if this
.merge_key
== last
.merge_key
:
426 raise AssertionError('Bad order, %s > %s' %
(last
, this
))
429 def leftjoin(left_stream
, right_stream
, key
=lambda x
: x
, unused
=None):
430 """A "left join" operation on sorted iterators
432 Yields (left, right) pairs, where left comes from left_stream and right
433 is the corresponding item from right, or None
435 Note that if there are duplicates in right_stream, you won't get duplicate
438 If given, unused should be a one-arg function that will get called on all
439 unused items in right_stream.
441 left_stream
= iter(left_stream
)
442 right_stream
= iter(right_stream
)
444 right
= right_stream
.next()
445 for left
in left_stream
:
446 while right
and key(left
) > key(right
):
447 if unused
is not None:
449 right
= right_stream
.next()
450 if key(left
) == key(right
):
453 right
= right_stream
.next()
456 except StopIteration:
461 for left
in left_stream
:
464 if unused
is not None:
469 for right
in right_stream
:
472 def synchronize(reference
, stream
, key
=lambda x
: x
, unused
=None):
473 """Just the right side part of leftjoin(), Nones included"""
474 for left
, right
in leftjoin(reference
, stream
, key
, unused
):
def yield_source_csv_messages(cls, foreign_cls, csvreader, force_column=None):
    """Return the messages read from a single source CSV file.

    cls: mapped class whose table the CSV mirrors.
    foreign_cls: passed on to _yield_csv_messages as the owning class.
    csvreader: row reader positioned at the header row; the header is
        consumed here and the remaining rows are left for
        _yield_csv_messages.
    force_column: passed through unchanged to _yield_csv_messages.

    Assumptions: rows are in lexicographic order (taking numeric values
    as numbers, of course), and the columns are laid out as follows:
     1. Same order in the table and in the CSV (asserted below)
     2. Primary key is at the beginning
     3. First thing in the PK is the object id
     4. Last thing in the PK is the language
     5. Everything that follows is some translatable text
    """
    table = cls.__table__
    table_columns = list(table.c)
    header = next(csvreader)
    csv_columns = [table.c[name] for name in header]
    assert csv_columns == table_columns, ','.join(c.name for c in table_columns)
    # The translatable strings start right after the primary-key columns
    primary_key = table_columns[:len(table.primary_key.columns)]
    return _yield_csv_messages(
        foreign_cls,
        table_columns,
        len(primary_key),
        csvreader,
        force_column=force_column
    )
495 def _yield_csv_messages(foreign_cls
, columns
, first_string_index
, csvreader
, origin
='source CSV', crc_value
='OFFICIAL', force_column
=None):
496 language_index
= first_string_index
- 1
497 assert 'language' in columns
[language_index
].name
, columns
[language_index
].name
498 string_columns
= columns
[first_string_index
:]
499 if force_column
is not None:
500 assert len(string_columns
) == 1
501 string_columns
= [force_column
]
502 for values
in csvreader
:
505 for string
, column
in zip(values
[first_string_index
:], string_columns
):
507 foreign_cls
.__name__
,
510 string
.decode('utf-8'),
512 pot
=pot_for_column(cls
, column
, force_column
is not None),
515 source_crc
=crc_value
,
516 language_id
=int(values
[language_index
]),
518 messages
.append(message
)
520 for message
in messages
:
523 def yield_guessed_csv_messages(file):
524 """Yield messages from a CSV file, using the header to figure out what the data means.
526 csvreader
= csv
.reader(file, lineterminator
='\n')
527 column_names
= csvreader
.next()
528 if column_names
== 'language_id,table,id,column,source_crc,string'.split(','):
530 return yield_translation_csv_messages(file, True)
531 # Not a translation CSV, figure out what the columns mean
532 assert column_names
[0].endswith('_id')
533 assert column_names
[1] == 'local_language_id'
534 first_string_index
= 2
535 foreign_singlename
= column_names
[0][:-len('_id')]
536 columns
= [None] * len(column_names
)
537 column_indexes
= dict((name
, i
) for i
, name
in enumerate(column_names
))
538 for foreign_cls
in toplevel_classes
:
539 if foreign_cls
.__singlename__
== foreign_singlename
:
542 raise ValueError("Foreign key column name %s in %s doesn't correspond to a table" %
(column_names
[0], file))
543 for translation_class
in foreign_cls
.translation_classes
:
544 for column
in translation_class
.__table__
.c
:
545 column_index
= column_indexes
.get(column
.name
)
546 if column_index
is not None:
547 columns
[column_index
] = column
548 assert all([c
is not None for c
in columns
[first_string_index
:]])
549 return _yield_csv_messages(foreign_cls
, columns
, first_string_index
, csvreader
, origin
=file.name
, crc_value
='UNKNOWN')
551 def yield_translation_csv_messages(file, no_header
=False):
552 """Yield messages from a translation CSV file
554 csvreader
= csv
.reader(file, lineterminator
='\n')
556 columns
= csvreader
.next()
557 assert columns
== 'language_id,table,id,column,source_crc,string'.split(',')
558 for language_id
, table
, id, column
, source_crc
, string
in csvreader
:
563 string
.decode('utf-8'),
565 source_crc
=source_crc
,
566 language_id
=int(language_id
),
569 def pot_for_column(cls
, column
, summary
=False):
570 """Translatable texts get categorized into different POT files to help
571 translators prioritize. The pots are:
573 - flavor: Flavor texts: here, strings from multiple versions are summarized
574 - ripped: Strings ripped from the games; translators for "official"
575 languages don't need to bother with these
576 - effects: Fanon descriptions of things; they usually use technical
578 - misc: Everything else; usually small texts
580 Set summary to true if this is a flavor summary column. Others are
581 determined by the column itself.
585 elif column
.info
.get('ripped'):
587 elif column
.name
.endswith('effect'):
def number_replace(source, string):
    """Fill the {num} placeholders in `string` with numbers taken from `source`.

    The numbers are the matches of number_re in `source`, used left to
    right, one per placeholder. If `string` has more placeholders than
    `source` has numbers, the resulting StopIteration propagates to the
    caller.
    """
    pending_numbers = iter(number_re.findall(source))

    def take_next_number(_match):
        # The matched placeholder itself is irrelevant; just hand out the
        # next number in source order.
        return next(pending_numbers)

    return re.sub(r'\{num\}', take_next_number, string)
597 def match_to_source(source
, *translations
):
598 """Matches translated string(s) to source
600 The first translation whose source matches the source message, or whose CRC
601 matches, or which is official, and which is not fuzzy, is used.
602 If there's no such translation, the first translation is used.
604 Returns (source, source string CRC, string for CSV file, exact match?)
605 If there are no translations, returns (source, None, None, None)
607 Handles translations where numbers have been replaced by {num}, if they
608 have source information.
612 for translation
in translations
:
613 if translation
is None:
615 if translation
.number_replacement
:
616 current_string
= number_replace(source
.string
, translation
.string
)
617 current_source
= number_replace(source
.string
, translation
.source
)
618 current_crc
= crc(current_source
)
619 elif '{num}' in translation
.string
:
620 print (u
'Warning: {num} appears in %s, but not marked for number replacement. Discarding!' % translation
).encode('utf-8')
623 current_string
= translation
.string
624 current_source
= translation
.source
625 current_crc
= translation
.source_crc
626 if translation
.fuzzy
:
628 elif translation
.official
:
631 match
= source
.string
== current_source
633 match
= current_crc
== crc(source
.string
)
635 best_string
= current_string
636 best_crc
= current_crc
637 best_message
= translation
642 return source
, best_crc
, best_string
, match
644 return source
, None, None, None
646 def merge_translations(source_stream
, *translation_streams
, **kwargs
):
647 """For each source message, get its best translation from translations.
649 Translations should be ordered by priority, highest to lowest.
651 Messages that don't appear in translations at all aren't included.
653 source
= tuple(source_stream
)
655 synchronize(source
, t
, key
=lambda m
: m
.merge_key
, unused
=kwargs
.get('unused'))
656 for t
in translation_streams
658 for messages
in itertools
.izip(source
, *streams
):
659 yield match_to_source(*messages
)