# -*- coding: utf-8 -*-
"""Provides `romanize()` for romanizing simple Japanese text."""

# Full-size kana -> roomaji syllable.
_roomaji_kana = {
    # Hiragana
    u'あ': 'a',     u'い': 'i',     u'う': 'u',     u'え': 'e',     u'お': 'o',
    u'か': 'ka',    u'き': 'ki',    u'く': 'ku',    u'け': 'ke',    u'こ': 'ko',
    u'さ': 'sa',    u'し': 'shi',   u'す': 'su',    u'せ': 'se',    u'そ': 'so',
    u'た': 'ta',    u'ち': 'chi',   u'つ': 'tsu',   u'て': 'te',    u'と': 'to',
    u'な': 'na',    u'に': 'ni',    u'ぬ': 'nu',    u'ね': 'ne',    u'の': 'no',
    u'は': 'ha',    u'ひ': 'hi',    u'ふ': 'fu',    u'へ': 'he',    u'ほ': 'ho',
    u'ま': 'ma',    u'み': 'mi',    u'む': 'mu',    u'め': 'me',    u'も': 'mo',
    u'や': 'ya',                    u'ゆ': 'yu',                    u'よ': 'yo',
    u'ら': 'ra',    u'り': 'ri',    u'る': 'ru',    u'れ': 're',    u'ろ': 'ro',
    u'わ': 'wa',    u'ゐ': 'wi',                    u'ゑ': 'we',    u'を': 'wo',
    # NOTE(review): romanize() special-cases a preceding 'n', which can only
    # come from this table, so the moraic nasal is listed here -- confirm
    # against the upstream table.
    u'ん': 'n',
    u'が': 'ga',    u'ぎ': 'gi',    u'ぐ': 'gu',    u'げ': 'ge',    u'ご': 'go',
    u'ざ': 'za',    u'じ': 'ji',    u'ず': 'zu',    u'ぜ': 'ze',    u'ぞ': 'zo',
    u'だ': 'da',    u'ぢ': 'ji',    u'づ': 'dzu',   u'で': 'de',    u'ど': 'do',
    u'ば': 'ba',    u'び': 'bi',    u'ぶ': 'bu',    u'べ': 'be',    u'ぼ': 'bo',
    u'ぱ': 'pa',    u'ぴ': 'pi',    u'ぷ': 'pu',    u'ぺ': 'pe',    u'ぽ': 'po',

    # Katakana
    u'ア': 'a',     u'イ': 'i',     u'ウ': 'u',     u'エ': 'e',     u'オ': 'o',
    u'カ': 'ka',    u'キ': 'ki',    u'ク': 'ku',    u'ケ': 'ke',    u'コ': 'ko',
    u'サ': 'sa',    u'シ': 'shi',   u'ス': 'su',    u'セ': 'se',    u'ソ': 'so',
    u'タ': 'ta',    u'チ': 'chi',   u'ツ': 'tsu',   u'テ': 'te',    u'ト': 'to',
    u'ナ': 'na',    u'ニ': 'ni',    u'ヌ': 'nu',    u'ネ': 'ne',    u'ノ': 'no',
    u'ハ': 'ha',    u'ヒ': 'hi',    u'フ': 'fu',    u'ヘ': 'he',    u'ホ': 'ho',
    u'マ': 'ma',    u'ミ': 'mi',    u'ム': 'mu',    u'メ': 'me',    u'モ': 'mo',
    u'ヤ': 'ya',                    u'ユ': 'yu',                    u'ヨ': 'yo',
    u'ラ': 'ra',    u'リ': 'ri',    u'ル': 'ru',    u'レ': 're',    u'ロ': 'ro',
    u'ワ': 'wa',    u'ヰ': 'wi',                    u'ヱ': 'we',    u'ヲ': 'wo',
    u'ン': 'n',     # see the NOTE(review) on the hiragana entry above
    u'ガ': 'ga',    u'ギ': 'gi',    u'グ': 'gu',    u'ゲ': 'ge',    u'ゴ': 'go',
    u'ザ': 'za',    u'ジ': 'ji',    u'ズ': 'zu',    u'ゼ': 'ze',    u'ゾ': 'zo',
    u'ダ': 'da',    u'ヂ': 'ji',    u'ヅ': 'dzu',   u'デ': 'de',    u'ド': 'do',
    u'バ': 'ba',    u'ビ': 'bi',    u'ブ': 'bu',    u'ベ': 'be',    u'ボ': 'bo',
    u'パ': 'pa',    u'ピ': 'pi',    u'プ': 'pu',    u'ペ': 'pe',    u'ポ': 'po',
}

# Small ya/yu/yo, which merge with a preceding -i syllable (e.g. ki + ya = kya).
_roomaji_youon = {
    # Hiragana
    u'ゃ': 'ya',    u'ゅ': 'yu',    u'ょ': 'yo',

    # Katakana
    u'ャ': 'ya',    u'ュ': 'yu',    u'ョ': 'yo',
}

# XXX If the small-vowel handling in romanize() is ever extended to hiragana,
# it will need to make sure that the preceding character was a katakana.
# This does not include every small kana combination, but should include every
# one used in a Pokémon name.  An exhaustive list would be very long.
_roomaji_small_kana = {
    u'ァ': 'a',     u'ィ': 'i',     u'ゥ': 'u',     u'ェ': 'e',     u'ォ': 'o',
}

# Two-character sequences (full-size kana + small vowel) that are romanized as
# a single syllable instead of "syllable + vowel".
_roomaji_small_kana_combos = {
    u'テァ': 'tha', u'ティ': 'ti',  u'テゥ': 'thu', u'テェ': 'tye', u'テォ': 'tho',
    u'デァ': 'dha', u'ディ': 'di',  u'デゥ': 'dhu', u'デェ': 'dye', u'デォ': 'dho',
    u'ファ': 'fa',  u'フィ': 'fi',  u'ホゥ': 'hu',  u'フェ': 'fe',  u'フォ': 'fo',
}


def romanize(string):
    """Converts a string of kana to roomaji.

    Kana are looked up in the module tables; full-width ASCII forms
    (U+FF01..U+FF5E) are folded to plain ASCII; any other character is copied
    through unchanged.

    :param string: the text to romanize (a unicode string).
    :returns: the romanized text as a unicode string; ``u''`` for empty input.
    :raises ValueError: if a sokuon (っ/ッ) is not followed by a kana that
        starts with a consonant, if a youon (small ya/yu/yo) does not follow
        an -i syllable, or if 'ー' does not follow a vowel sound.
    """
    # 'y' counts as a vowel here so that ん + や becomes "n'ya" and a sokuon
    # before や/ゆ/よ is rejected like one before a real vowel.
    vowels = ('a', 'e', 'i', 'o', 'u', 'y')

    characters = []
    last_kana = None  # Used for ー; っ or ッ; ん or ン
    last_char = None  # Used for small kana combos
    for char in string:
        # Full-width Latin: the whole U+FF01..U+FF5E range sits at a fixed
        # offset (0xFEE0) from ASCII U+0021..U+007E.
        if 0xff01 <= ord(char) <= 0xff5e:
            if last_kana == 'sokuon':
                raise ValueError("Sokuon cannot precede Latin characters.")

            char = chr(ord(char) - 0xff01 + 0x21)
            characters.append(char)

            last_kana = None

        # Small vowel kana
        elif char in _roomaji_small_kana:
            # last_char is None at the start of the string; an empty prefix
            # simply fails the combo lookup and takes the fallback below.
            combo = (last_char or u'') + char
            if combo in _roomaji_small_kana_combos:
                # Replace the syllable emitted for the preceding kana
                characters[-1] = _roomaji_small_kana_combos[combo]
            else:
                # If we don't know what it is... act dumb and treat it as a
                # full-size vowel.  Better than bailing, and seems to occur a
                # lot, e.g. ピィ is "pii"
                characters.append(_roomaji_small_kana[char])

            last_kana = _roomaji_small_kana[char]

        # Youon
        elif char in _roomaji_youon:
            if not last_kana or last_kana[-1] != 'i' or last_kana == 'i':
                raise ValueError("Youon must follow an -i sound.")

            # Drop the -i and append the ya/yu/yo sound
            new_sound = _roomaji_youon[char]
            if last_kana in ('chi', 'shi', 'ji'):
                # Strip the y- as well: "sha", not "shya"
                new_char = last_kana[:-1] + new_sound[1:]
            else:
                new_char = last_kana[:-1] + new_sound

            characters[-1] = new_char
            last_kana = new_char

        # Sokuon
        elif char in (u'っ', u'ッ'):
            # Remember it and double the consonant next time around
            last_kana = 'sokuon'

        # Extended vowel
        elif char == u'ー':
            # last_kana is None at the start of the string and after any
            # non-kana character; there is no vowel to extend in that case.
            if not last_kana or last_kana[-1] not in vowels:
                raise ValueError(u"'ー' must follow by a vowel.")
            characters.append(last_kana[-1])

            last_kana = None

        # Regular ol' kana
        elif char in _roomaji_kana:
            kana = _roomaji_kana[char]

            if last_kana == 'sokuon':
                if kana[0] in vowels:
                    raise ValueError("Sokuon cannot precede a vowel.")

                # Double the leading consonant
                characters.append(kana[0])
            elif last_kana == 'n' and kana[0] in vowels:
                # Keep n + vowel distinct from the na/ni/nu/ne/no syllables
                characters.append("'")

            characters.append(kana)

            last_kana = kana

        # Not Japanese?
        else:
            if last_kana == 'sokuon':
                raise ValueError("Sokuon must be followed by another kana.")

            characters.append(char)

            last_kana = None

        last_char = char

    if last_kana == 'sokuon':
        raise ValueError("Sokuon cannot be the last character.")

    # u''.join yields a unicode string on Python 2 and a str on Python 3.
    return u''.join(characters)