Add migration script to move English texts to language-specific tables
[zzz-pokedex.git] / scripts / migration-i18n.py
1 # Encoding: UTF-8
2
3 """Moves/transforms values in CSVs in an ad-hoc way, based mainly on column name
4
5 Auto-creates identifiers from names
6 Auto-creates names from identifiers
7 Copies IDs for foreign keys
8 Creates autoincrement-style IDs when missing
9 Sets text language to 9 (en), except when it sets to 1 (jp)
10
11 And looks good doing it!
12 """
13
14 import csv
15 import re
16 import os
17 from StringIO import StringIO
18 from collections import namedtuple, defaultdict
19
20 from sqlalchemy.orm import class_mapper
21
22 from pokedex.db import tables, load
23
# Language ids used when tagging migrated text rows (see the module
# docstring: texts become English except the B/W case, which is Japanese).
english_id = 9
japanese_id = 1

# Version group whose texts are tagged Japanese instead of English --
# presumably the untranslated Black/White data; confirm against the CSVs.
bw_version_group_id = 11

# Directory holding the CSV files that are rewritten in place.
# NOTE(review): `dir` shadows the builtin of the same name.
dir = load.get_default_csv_dir()
30
def tuple_key(tup):
    """Build a sort key for a tuple mixing numeric strings and plain strings.

    Every element that parses as an integer becomes ``(1, int(element))``;
    anything else becomes ``(0, element)``.  Hence non-numeric strings order
    before numbers, and numeric strings compare by value, not lexically.
    """
    key = []
    for element in tup:
        try:
            key.append((1, int(element)))
        except ValueError:
            key.append((0, element))
    return tuple(key)
43
class MakeFieldFuncs:
    """Namespace of converters that derive one output CSV value from a source row.

    Each converter is invoked with keyword arguments -- ``source`` (the row as
    a dict), ``field_name`` (the output column), and ``i`` (an auto-increment
    counter) -- and returns the value for a single output column.
    """
    @staticmethod
    def copy(field_name, source, **kwargs):
        """Take the value unchanged from the same-named source column."""
        return source[field_name]

    @staticmethod
    def main(field_name, source, **kwargs):
        """Fill an aux-table column from the main table's row, unchanged."""
        return source[field_name]

    @staticmethod
    def Main(field_name, source, **kwargs):
        """Like main(), but with the first character upper-cased."""
        return source[field_name].capitalize()

    @staticmethod
    def ident(source, **kwargs):
        """Derive an identifier, preferring 'identifier' over 'name'."""
        raw = source.get('identifier', source.get('name'))
        return name2ident(raw)

    @staticmethod
    def Name(source, **kwargs):
        """Title-case the 'name' column (falling back to 'identifier')."""
        raw = source.get('name', source.get('identifier', None))
        words = [word.capitalize() for word in raw.split(' ')]
        return ' '.join(words)

    @staticmethod
    def f_id(source, **kwargs):
        """Capitalized copy of the 'identifier' column."""
        return source['identifier'].capitalize()

    @staticmethod
    def name(source, **kwargs):
        """Verbatim copy of the 'name' column."""
        return source['name']

    @staticmethod
    def newid(i, source, **kwargs):
        """Hand out a fresh auto-increment id.

        The id is also written back onto the source row so that srcid()
        can pick it up when filling the aux table's foreign key.
        """
        source['id'] = i
        return i

    @staticmethod
    def en(source, **kwargs):
        """Language id for the row: English, unless it is the B/W version group."""
        if source.get('version_group_id', None) == str(bw_version_group_id):
            return japanese_id
        return english_id

    @staticmethod
    def srcid(source, field_name, **kwargs):
        """Id of the originating row, for the aux table's foreign-key column."""
        try:
            return source['id']
        except KeyError:
            # pokemon_form_groups reuses the pokemon table's ids.
            if field_name == 'pokemon_form_group_id':
                return source['pokemon_id']
            raise
107
def name2ident(name):
    """Turn a display name into a lowercase, ASCII, dash-separated identifier.

    Accepts either UTF-8 encoded bytes or already-decoded text.  (The
    original unconditionally called ``.decode('utf-8')``, which breaks on
    text that is already decoded -- e.g. unicode strings with non-ASCII
    characters.)  Raises AssertionError if any character other than
    ``[a-z0-9-]`` survives the transformations.
    """
    if isinstance(name, bytes):
        name = name.decode('utf-8')
    ident = name.lower()
    ident = ident.replace(u'+', ' plus ')
    # Collapse runs of spaces, underscores and en-dashes into single hyphens.
    ident = re.sub(u'[ _–]+', u'-', ident)
    # Drop punctuation that carries no meaning in an identifier.
    ident = re.sub(u'[\'./;’(),:]', u'', ident)
    ident = ident.replace(u'é', 'e')
    ident = ident.replace(u'♀', '-f')
    ident = ident.replace(u'♂', '-m')
    # Special-case names that would otherwise be emptied out entirely.
    if ident in ('???', '????'):
        ident = 'unknown'
    elif ident == '!':
        ident = 'exclamation'
    elif ident == '?':
        ident = 'question'
    for c in ident:
        assert c in "abcdefghijklmnopqrstuvwxyz0123456789-", repr(ident)
    return ident
125
126
127 FieldSpec = namedtuple('FieldSpec', 'out name func')
128
def main():
    """Rewrite each main table's CSV, moving texts into language-tagged aux CSVs.

    For every non-aux table: build a FieldSpec per output column (choosing
    how each value is derived), stream the old CSV through those converters
    into per-file row buffers, fold any old ``*_names.csv`` into the aux
    table that holds names, then write every output file back with a header
    and rows sorted by tuple_key.
    """
    for table in sorted(tables.all_tables(), key=lambda t: t.__name__):
        datafilename = dir + '/' + table.__tablename__ + '.csv'
        classname = table.__name__
        if hasattr(table, 'object_table'):
            # This is an auxilliary table; it'll be processed with the main one
            continue
        else:
            print "%s: %s" % (classname, table.__tablename__)
        with open(datafilename) as datafile:
            datacsv = csv.reader(datafile, lineterminator='\n')
            orig_fields = datacsv.next()
            columns = class_mapper(table).c
            new_fields = []
            main_out = []
            # outputs: output filename -> list of finished rows for that file
            outputs = {datafilename: main_out}
            name_out = None
            # NOTE(review): srcfiles is built up but never read (and the aux
            # branch below appends datafilename rather than aux_datafilename);
            # this looks like dead code -- confirm before removing.
            srcfiles = [datafilename]
            # Set new_fields to a list of FieldSpec object, one for each field we want in the csv
            for column in columns:
                name = column.name
                if name == 'identifier':
                    new_fields.append(FieldSpec(datafilename, column.name, MakeFieldFuncs.ident))
                elif name in orig_fields:
                    new_fields.append(FieldSpec(datafilename, column.name, MakeFieldFuncs.copy))
                elif name == 'id':
                    new_fields.append(FieldSpec(datafilename, column.name, MakeFieldFuncs.newid))
                elif name == 'language_id':
                    # Slot language_id in right after the id columns
                    new_fields.insert(2, FieldSpec(datafilename, column.name, MakeFieldFuncs.en))
                else:
                    raise AssertionError(name)
            # Remember headers
            headers = {datafilename: list(field.name for field in new_fields)}
            # Pretty print :)
            for field in new_fields:
                print ' [{0.func.func_name:5}] {0.name}'.format(field)
            # Do pretty much the same for aux tables
            aux_tables = []
            for attrname in 'text_table prose_table'.split():
                aux_table = getattr(table, attrname, None)
                if aux_table:
                    aux_datafilename = dir + '/' + aux_table.__tablename__ + '.csv'
                    print " %s: %s" % (aux_table.__name__, aux_table.__tablename__)
                    srcfiles.append(datafilename)
                    aux_tables.append(aux_table)
                    columns = class_mapper(aux_table).c
                    aux_out = []
                    outputs[aux_datafilename] = aux_out
                    aux_fields = []
                    for column in columns:
                        name = column.name
                        if name == 'language_id':
                            aux_fields.insert(1, FieldSpec(aux_datafilename, column.name, MakeFieldFuncs.en))
                        elif name == 'name' and table.__name__ == 'ItemFlag':
                            # ItemFlag names come from the identifier, capitalized
                            aux_fields.append(FieldSpec(aux_datafilename, column.name, MakeFieldFuncs.f_id))
                        elif name == 'description' and table.__name__ == 'ItemFlag':
                            # ...and its description is the old 'name' column
                            aux_fields.append(FieldSpec(aux_datafilename, column.name, MakeFieldFuncs.name))
                        elif name in orig_fields and name == 'name' and table.__name__ in 'PokemonColor ContestType BerryFirmness'.split():
                            # Capitalize these names
                            aux_fields.append(FieldSpec(aux_datafilename, column.name, MakeFieldFuncs.Name))
                        elif name in orig_fields and name in 'color flavor'.split() and table.__name__ == 'ContestType':
                            aux_fields.append(FieldSpec(aux_datafilename, column.name, MakeFieldFuncs.Main))
                        elif name in orig_fields:
                            aux_fields.append(FieldSpec(aux_datafilename, column.name, MakeFieldFuncs.main))
                        elif name == table.__singlename__ + '_id':
                            # Foreign key back to the main table's row
                            aux_fields.append(FieldSpec(aux_datafilename, column.name, MakeFieldFuncs.srcid))
                        elif name == 'name':
                            aux_fields.append(FieldSpec(aux_datafilename, column.name, MakeFieldFuncs.Name))
                        elif name == 'lang_id':
                            aux_fields.append(FieldSpec(aux_datafilename, column.name, MakeFieldFuncs.srcid))
                        else:
                            print orig_fields
                            raise AssertionError(name)
                        if name == 'name':
                            # If this table contains the name, remember that
                            name_fields = aux_fields
                            name_out = aux_out
                    # Sort aux tables nicely
                    def key(f):
                        if f.func == MakeFieldFuncs.srcid:
                            return 0
                        elif f.name == 'language_id':
                            return 1
                        elif f.name == 'name':
                            return 2
                        else:
                            return 10
                    aux_fields.sort(key=key)
                    new_fields += aux_fields
                    headers[aux_datafilename] = list(field.name for field in aux_fields)
                    # Pretty print :)
                    for field in aux_fields:
                        print ' [{0.func.func_name:5}] {0.name}'.format(field)
            # Do nothing if the table's the same
            if all(field.func == MakeFieldFuncs.copy for field in new_fields):
                print u' → skipping'
                continue
            # Otherwise read the file
            # outputs will be a (filename -> list of rows) dict
            print u' → reading'
            for autoincrement_id, src_row in enumerate(datacsv, start=1):
                row = dict(zip(orig_fields, src_row))
                new_rows = defaultdict(list)
                # Run every converter; rows are grouped by their output file
                for field in new_fields:
                    new_rows[field.out].append(field.func(
                            source=row,
                            field_name=field.name,
                            i=autoincrement_id,
                        ))
                for name, row in new_rows.items():
                    outputs[name].append(row)
            # If there was a _names table, read that and append it to the
            # aux table that has names
            try:
                name_datafilename = dir + '/' + table.__singlename__ + '_names.csv'
                name_file = open(name_datafilename)
            except (AttributeError, IOError):
                # No __singlename__ or no _names.csv: nothing to fold in
                pass
            else:
                # NOTE(review): name_fields is only bound when an aux table had
                # a 'name' column; a _names.csv without one would NameError here
                # -- presumably that never happens in this dataset.
                print u' → reading foreign names'
                with name_file:
                    namecsv = csv.reader(name_file, lineterminator='\n')
                    src_fields = namecsv.next()
                    obj_id_fieldname = table.__singlename__ + '_id'
                    assert src_fields == [obj_id_fieldname, 'language_id', 'name']
                    for name_row in namecsv:
                        name_dict = dict(zip(src_fields, name_row))
                        row = []
                        for field in name_fields:
                            # Columns the _names file lacks become empty strings
                            row.append(name_dict.get(field.name, ''))
                        name_out.append(row)
                # The _names file is fully merged; delete it
                os.unlink(name_datafilename)
            # For all out files, write a header & sorted rows
            print u' → writing'
            for filename, rows in outputs.items():
                with open(filename, 'w') as outfile:
                    outcsv = csv.writer(outfile, lineterminator='\n')
                    outcsv.writerow(headers[filename])
                    rows.sort(key=tuple_key)
                    for row in rows:
                        outcsv.writerow(row)
270
# Script entry point: run the whole in-place migration.
if __name__ == '__main__':
    main()