2 """Provides `romanize()` for romanizing simple Japanese text.
4 Also provides available romanizers in a dictionary keyed by language identifier.
class Romanizer(object):
    """Table-driven converter from kana to Latin script (roomaji).

    Every keyword argument given to the constructor is a lookup table that
    becomes an attribute of the instance.  `romanize()` reads these tables:
    `roomaji_kana`, `roomaji_youon`, `roomaji_small_kana`,
    `roomaji_small_kana_combos`, `lengthened_vowels` and `y_drop`.
    """

    # Latin letters treated as vowels when checking what may follow a sokuon,
    # a syllabic n, or precede a long-vowel mark.
    _VOWELS = frozenset('aeiouy')

    def __init__(self, parent=None, **tables):
        """Create a Romanizer.

        parent: Another Romanizer to base this one on
        tables: Dicts that become the object's attributes. If a parent is
            given, its tables are used, and updated with the given ones
        """
        self.parent = parent
        if parent is not None:
            # Copy the mapping itself, not just the individual tables:
            # assigning into a shared mapping would rebind the parent's
            # entries to the merged (child) tables.
            self.tables = dict(parent.tables)
            for name, table in tables.items():
                # Take a copy -- don't want to clobber the parent's tables.
                # A table the parent does not have simply starts out empty.
                merged = dict(self.tables.get(name, {}))
                merged.update(table)
                self.tables[name] = merged
        else:
            self.tables = tables

        for name, table in self.tables.items():
            setattr(self, name, table)

    def romanize(self, string):
        """Convert a string of kana to roomaji.

        string: Text to convert.  Full-width Latin characters are narrowed
            to their ASCII counterparts; characters found in none of the
            tables are copied through unchanged.

        Returns the romanized text.

        Raises ValueError when the kana sequence cannot be romanized: a
        sokuon that is not followed by a consonant-initial kana, a youon
        that does not follow an -i sound, or a 'ー' that does not follow a
        vowel.
        """
        vowels = self._VOWELS
        # `unicode` on Python 2, `str` on Python 3.
        text_type = type(u'')

        characters = []
        last_kana = None  # Used for ー; っ or ッ; ん or ン
        last_char = None  # Used for small kana combos
        for char in string:
            # Full-width Latin
            if 0xff01 <= ord(char) <= 0xff5e:
                if last_kana == 'sokuon':
                    raise ValueError("Sokuon cannot precede Latin characters.")

                # XXX Real Unicode decomposition would be nicer
                char = chr(ord(char) - 0xff01 + 0x21)
                characters.append(char)

                last_kana = None

            # Small vowel kana
            elif char in self.roomaji_small_kana:
                # A small kana that opens the string has no preceding
                # character to combine with, so it cannot form a combo.
                combo = None if last_char is None else last_char + char
                if characters and combo in self.roomaji_small_kana_combos:
                    # The combo replaces the sound emitted for last_char
                    characters[-1] = self.roomaji_small_kana_combos[combo]
                else:
                    # If we don't know what it is... act dumb and treat it
                    # as a full-size vowel. Better than bailing, and seems
                    # to occur a lot, e.g. ピィ is "pii"
                    characters.append(self.roomaji_small_kana[char])

                last_kana = self.roomaji_small_kana[char]

            # Youon
            elif char in self.roomaji_youon:
                if not last_kana or last_kana[-1] != 'i' or last_kana == 'i':
                    raise ValueError("Youon must follow an -i sound.")

                # Drop the -i and append the ya/yu/yo sound
                new_sound = self.roomaji_youon[char]
                if last_kana in self.y_drop:
                    # Strip the leading y- as well, e.g. shi + ya -> sha
                    new_char = self.y_drop[last_kana] + new_sound[1:]
                else:
                    new_char = last_kana[:-1] + new_sound

                characters[-1] = new_char
                last_kana = new_char

            # Sokuon
            elif char in (u'っ', u'ッ'):
                # Remember it and double the consonant next time around
                last_kana = 'sokuon'

            # Long-vowel mark
            elif char == u'ー':
                # `not last_kana` covers the start of the string and any
                # position where the previous character was not a kana.
                if not last_kana or last_kana[-1] not in vowels:
                    raise ValueError(u"'ー' must follow by a vowel.")
                if last_kana[-1] in self.lengthened_vowels:
                    # Swap the plain vowel for its lengthened form
                    characters[-1] = characters[-1][:-1]
                    characters.append(self.lengthened_vowels[last_kana[-1]])
                else:
                    characters.append(last_kana[-1])

                last_kana = None

            # Regular kana
            elif char in self.roomaji_kana:
                kana = self.roomaji_kana[char]

                if last_kana == 'sokuon':
                    if kana[0] in vowels:
                        raise ValueError("Sokuon cannot precede a vowel.")

                    # Double the consonant
                    characters.append(kana[0])
                elif last_kana == 'n' and kana[0] in vowels:
                    # Keep a syllabic n apart from the following vowel
                    characters.append("'")

                # Special characters for doubled kana
                if (kana[0] in self.lengthened_vowels and characters
                        and kana == characters[-1][-1]):
                    kana = self.lengthened_vowels[kana[0]]
                    characters[-1] = characters[-1][:-1]

                characters.append(kana)

                last_kana = kana

            # Not Japanese?
            else:
                if last_kana == 'sokuon':
                    raise ValueError("Sokuon must be followed by another kana.")

                characters.append(char)

                last_kana = None

            last_char = char

        if last_kana == 'sokuon':
            raise ValueError("Sokuon cannot be the last character.")

        return text_type(''.join(characters))
# Register the English ('en') romanizer: the lookup tables below are handed to
# Romanizer as keyword arguments and become attributes of the instance.
# NOTE(review): this view of the literal looks incomplete (the keyword names
# that open the first tables and the closing brackets are not visible) --
# confirm against the full file before editing entries.
136 romanizers
['en'] = Romanizer(
# Hiragana, unvoiced syllables
139 u
'あ': 'a', u
'い': 'i', u
'う': 'u', u
'え': 'e', u
'お': 'o',
140 u
'か': 'ka', u
'き': 'ki', u
'く': 'ku', u
'け': 'ke', u
'こ': 'ko',
141 u
'さ': 'sa', u
'し': 'shi', u
'す': 'su', u
'せ': 'se', u
'そ': 'so',
142 u
'た': 'ta', u
'ち': 'chi', u
'つ': 'tsu', u
'て': 'te', u
'と': 'to',
143 u
'な': 'na', u
'に': 'ni', u
'ぬ': 'nu', u
'ね': 'ne', u
'の': 'no',
144 u
'は': 'ha', u
'ひ': 'hi', u
'ふ': 'fu', u
'へ': 'he', u
'ほ': 'ho',
145 u
'ま': 'ma', u
'み': 'mi', u
'む': 'mu', u
'め': 'me', u
'も': 'mo',
146 u
'や': 'ya', u
'ゆ': 'yu', u
'よ': 'yo',
147 u
'ら': 'ra', u
'り': 'ri', u
'る': 'ru', u
'れ': 're', u
'ろ': 'ro',
148 u
'わ': 'wa', u
'ゐ': 'wi', u
'ゑ': 'we', u
'を': 'wo',
# Hiragana, voiced (g/z/d/b) and p- syllables
150 u
'が': 'ga', u
'ぎ': 'gi', u
'ぐ': 'gu', u
'げ': 'ge', u
'ご': 'go',
151 u
'ざ': 'za', u
'じ': 'ji', u
'ず': 'zu', u
'ぜ': 'ze', u
'ぞ': 'zo',
152 u
'だ': 'da', u
'ぢ': 'ji', u
'づ': 'dzu', u
'で': 'de', u
'ど': 'do',
153 u
'ば': 'ba', u
'び': 'bi', u
'ぶ': 'bu', u
'べ': 'be', u
'ぼ': 'bo',
154 u
'ぱ': 'pa', u
'ぴ': 'pi', u
'ぷ': 'pu', u
'ぺ': 'pe', u
'ぽ': 'po',
# Katakana, unvoiced syllables (same readings as the hiragana rows)
157 u
'ア': 'a', u
'イ': 'i', u
'ウ': 'u', u
'エ': 'e', u
'オ': 'o',
158 u
'カ': 'ka', u
'キ': 'ki', u
'ク': 'ku', u
'ケ': 'ke', u
'コ': 'ko',
159 u
'サ': 'sa', u
'シ': 'shi', u
'ス': 'su', u
'セ': 'se', u
'ソ': 'so',
160 u
'タ': 'ta', u
'チ': 'chi', u
'ツ': 'tsu', u
'テ': 'te', u
'ト': 'to',
161 u
'ナ': 'na', u
'ニ': 'ni', u
'ヌ': 'nu', u
'ネ': 'ne', u
'ノ': 'no',
162 u
'ハ': 'ha', u
'ヒ': 'hi', u
'フ': 'fu', u
'ヘ': 'he', u
'ホ': 'ho',
163 u
'マ': 'ma', u
'ミ': 'mi', u
'ム': 'mu', u
'メ': 'me', u
'モ': 'mo',
164 u
'ヤ': 'ya', u
'ユ': 'yu', u
'ヨ': 'yo',
165 u
'ラ': 'ra', u
'リ': 'ri', u
'ル': 'ru', u
'レ': 're', u
'ロ': 'ro',
166 u
'ワ': 'wa', u
'ヰ': 'wi', u
'ヱ': 'we', u
'ヲ': 'wo',
# Katakana, voiced (g/z/d/b) and p- syllables
168 u
'ガ': 'ga', u
'ギ': 'gi', u
'グ': 'gu', u
'ゲ': 'ge', u
'ゴ': 'go',
169 u
'ザ': 'za', u
'ジ': 'ji', u
'ズ': 'zu', u
'ゼ': 'ze', u
'ゾ': 'zo',
170 u
'ダ': 'da', u
'ヂ': 'ji', u
'ヅ': 'dzu', u
'デ': 'de', u
'ド': 'do',
171 u
'バ': 'ba', u
'ビ': 'bi', u
'ブ': 'bu', u
'ベ': 'be', u
'ボ': 'bo',
172 u
'パ': 'pa', u
'ピ': 'pi', u
'プ': 'pu', u
'ペ': 'pe', u
'ポ': 'po',
# Small ya/yu/yo, hiragana then katakana
# (presumably the roomaji_youon table read by Romanizer.romanize -- confirm)
178 u
'ゃ': 'ya', u
'ゅ': 'yu', u
'ょ': 'yo',
181 u
'ャ': 'ya', u
'ュ': 'yu', u
'ョ': 'yo',
184 # XXX If romanize() ever handles hiragana, it will need to make sure that the
185 # preceding character was a katakana
186 # This does not include every small kana combination, but should include every
187 # one used in a Pokémon name. An exhaustive list would be.. very long
# Small katakana vowels
# (presumably the roomaji_small_kana table read by Romanizer.romanize -- confirm)
189 u
'ァ': 'a', u
'ィ': 'i', u
'ゥ': 'u', u
'ェ': 'e', u
'ォ': 'o',
# Two-character sequences (full-size kana + small vowel) that romanize as a
# unit; Romanizer.romanize looks up the previous and current character together.
191 roomaji_small_kana_combos
={
192 # These are, by the way, fairly arbitrary. "shi xi" to mean "sy" is
193 # particularly weird, but it seems to be what GF intends
195 # Simple vowel replacement
196 u
'ウィ': 'wi', u
'ウゥ': 'wu', u
'ウェ': 'we', u
'ウォ': 'wo',
197 u
'ヴァ': 'va', u
'ヴィ': 'vi', u
'ヴェ': 've', u
'ヴォ': 'vo',
201 u
'テァ': 'tha', u
'ティ': 'ti', u
'テゥ': 'thu', u
'テェ': 'tye', u
'テォ': 'tho',
202 u
'デァ': 'dha', u
'ディ': 'di', u
'デゥ': 'dhu', u
'デェ': 'dye', u
'デォ': 'dho',
203 u
'ファ': 'fa', u
'フィ': 'fi', u
'ホゥ': 'hu', u
'フェ': 'fe', u
'フォ': 'fo',
# Empty for English: with no entry for a vowel, romanize() writes a long
# vowel by repeating the vowel letter.
211 lengthened_vowels
={},
# Sounds whose trailing -i AND the following y- are both dropped before a
# small ya/yu/yo (e.g. 'shi' + 'ya' gives 'sha').
212 y_drop
={'chi': 'ch', 'shi': 'sh', 'ji': 'j'},
# Register the 'cs' romanizer (Czech, going by the language code and the
# diacritics used). It is built with parent=romanizers['en'], so the tables
# below only list the entries that differ from the English ones.
# NOTE(review): this view of the literal looks incomplete (the keyword names
# that open the first tables and the closing brackets are not visible) --
# confirm against the full file before editing entries.
215 romanizers
['cs'] = Romanizer(parent
=romanizers
['en'],
# Hiragana overrides
217 u
'し': u
'ši', u
'ち': u
'či', u
'つ': u
'cu',
218 u
'や': u
'ja', u
'ゆ': u
'ju', u
'よ': u
'jo',
219 u
'じ': u
'dži', u
'ぢ': u
'dži',
# Katakana overrides (same readings as the hiragana rows above)
220 u
'シ': u
'ši', u
'チ': u
'či', u
'ツ': u
'cu',
221 u
'ヤ': u
'ja', u
'ユ': u
'ju', u
'ヨ': 'jo',
222 u
'ジ': u
'dži', u
'ヂ': u
'dži',
# Small ya/yu/yo, written with j- instead of y-
225 u
'ゃ': 'ja', u
'ゅ': 'ju', u
'ょ': 'jo',
226 u
'ャ': 'ja', u
'ュ': 'ju', u
'ョ': 'jo',
# Combo overrides; the last row maps kana + small ィ to a long í
228 roomaji_small_kana_combos
={
229 u
'チェ': u
'če', u
'シェ': u
'še', u
'ジェ': u
'dže',
230 u
'テェ': u
'tje', u
'デェ': u
'dje',
231 u
'シィ': u
'sí', u
'ミィ': u
'mí', u
'ビィ': u
'bí', u
'ピィ': u
'pí',
# Long vowels are written with an acute accent instead of a doubled letter
233 lengthened_vowels
={'a': u
'á', 'e': u
'é', 'i': u
'í', 'o': u
'ó', 'u': u
'ú'},
# Replacement stems used before a small ya/yu/yo; keys are the sounds as
# spelled by this romanizer's own tables
234 y_drop
={u
'či': u
'č', u
'ši': u
'š', u
'dži': u
'dž', u
'ni': u
'ňj'},
def romanize(string, lang='en'):
    """Convert a string of kana to roomaji.

    string: The kana text to convert.
    lang: Language identifier used as the key into `romanizers`.  An
        identifier with no registered romanizer falls back to English.

    Returns the result of the selected romanizer's `romanize()` method, and
    propagates any ValueError that method raises.
    """
    # Get the correct romanizer; fall back to English.  The fallback has to
    # be the romanizer object itself: passing the key 'en' as the default to
    # dict.get() would return a plain string, which has no romanize() method.
    romanizer = romanizers.get(lang)
    if romanizer is None:
        romanizer = romanizers['en']

    return romanizer.romanize(string)