Let lookup accept hex/octal/binary numbers.
[zzz-pokedex.git] / pokedex / roomaji.py
1 # encoding: utf8
2 """Provides `romanize()` for romanizing simple Japanese text."""
3
# Romanization of every full-size kana, keyed by the kana itself.  The
# hiragana and katakana halves run in parallel: the plain rows first, then
# the voiced (dakuten) and half-voiced (handakuten) rows.
_roomaji_kana = {
    # Hiragana -- plain rows
    u"あ": "a", u"い": "i", u"う": "u", u"え": "e", u"お": "o",
    u"か": "ka", u"き": "ki", u"く": "ku", u"け": "ke", u"こ": "ko",
    u"さ": "sa", u"し": "shi", u"す": "su", u"せ": "se", u"そ": "so",
    u"た": "ta", u"ち": "chi", u"つ": "tsu", u"て": "te", u"と": "to",
    u"な": "na", u"に": "ni", u"ぬ": "nu", u"ね": "ne", u"の": "no",
    u"は": "ha", u"ひ": "hi", u"ふ": "fu", u"へ": "he", u"ほ": "ho",
    u"ま": "ma", u"み": "mi", u"む": "mu", u"め": "me", u"も": "mo",
    u"や": "ya", u"ゆ": "yu", u"よ": "yo",
    u"ら": "ra", u"り": "ri", u"る": "ru", u"れ": "re", u"ろ": "ro",
    u"わ": "wa", u"ゐ": "wi", u"ゑ": "we", u"を": "wo",
    u"ん": "n",
    # Hiragana -- voiced and half-voiced rows
    u"が": "ga", u"ぎ": "gi", u"ぐ": "gu", u"げ": "ge", u"ご": "go",
    u"ざ": "za", u"じ": "ji", u"ず": "zu", u"ぜ": "ze", u"ぞ": "zo",
    u"だ": "da", u"ぢ": "ji", u"づ": "dzu", u"で": "de", u"ど": "do",
    u"ば": "ba", u"び": "bi", u"ぶ": "bu", u"べ": "be", u"ぼ": "bo",
    u"ぱ": "pa", u"ぴ": "pi", u"ぷ": "pu", u"ぺ": "pe", u"ぽ": "po",

    # Katakana -- plain rows
    u"ア": "a", u"イ": "i", u"ウ": "u", u"エ": "e", u"オ": "o",
    u"カ": "ka", u"キ": "ki", u"ク": "ku", u"ケ": "ke", u"コ": "ko",
    u"サ": "sa", u"シ": "shi", u"ス": "su", u"セ": "se", u"ソ": "so",
    u"タ": "ta", u"チ": "chi", u"ツ": "tsu", u"テ": "te", u"ト": "to",
    u"ナ": "na", u"ニ": "ni", u"ヌ": "nu", u"ネ": "ne", u"ノ": "no",
    u"ハ": "ha", u"ヒ": "hi", u"フ": "fu", u"ヘ": "he", u"ホ": "ho",
    u"マ": "ma", u"ミ": "mi", u"ム": "mu", u"メ": "me", u"モ": "mo",
    u"ヤ": "ya", u"ユ": "yu", u"ヨ": "yo",
    u"ラ": "ra", u"リ": "ri", u"ル": "ru", u"レ": "re", u"ロ": "ro",
    u"ワ": "wa", u"ヰ": "wi", u"ヱ": "we", u"ヲ": "wo",
    u"ン": "n",
    # Katakana -- voiced and half-voiced rows
    u"ガ": "ga", u"ギ": "gi", u"グ": "gu", u"ゲ": "ge", u"ゴ": "go",
    u"ザ": "za", u"ジ": "ji", u"ズ": "zu", u"ゼ": "ze", u"ゾ": "zo",
    u"ダ": "da", u"ヂ": "ji", u"ヅ": "dzu", u"デ": "de", u"ド": "do",
    u"バ": "ba", u"ビ": "bi", u"ブ": "bu", u"ベ": "be", u"ボ": "bo",
    u"パ": "pa", u"ピ": "pi", u"プ": "pu", u"ペ": "pe", u"ポ": "po",
}
41
# Small ya/yu/yo kana (youon), which fuse with a preceding -i kana.
_roomaji_youon = {
    # Hiragana
    u"ゃ": "ya",
    u"ゅ": "yu",
    u"ょ": "yo",

    # Katakana
    u"ャ": "ya",
    u"ュ": "yu",
    u"ョ": "yo",
}
49
# XXX Only katakana small vowels are listed here.  If romanize() ever handles
# the hiragana ones, it will need to make sure that the preceding character
# was a katakana.
# The combination table that follows is not exhaustive -- a complete list
# would be very long -- but it should cover every small kana combination used
# in a Pokémon name.
_roomaji_small_kana = {
    u"ァ": "a",
    u"ィ": "i",
    u"ゥ": "u",
    u"ェ": "e",
    u"ォ": "o",
}
# Two-character sequences (full-size kana + small vowel) that romanize as a
# single unit.  The choices are fairly arbitrary; "shi xi" standing for "sy"
# is particularly odd, but it seems to be what GF intends.
_roomaji_small_kana_combos = {
    # The small vowel simply replaces the vowel of the leading kana
    u"ウィ": "wi",
    u"ウゥ": "wu",
    u"ウェ": "we",
    u"チェ": "che",
    u"シェ": "she",
    u"テァ": "tha",
    u"ティ": "ti",
    u"テゥ": "thu",
    u"テェ": "tye",
    u"テォ": "tho",
    u"デァ": "dha",
    u"ディ": "di",
    u"デゥ": "dhu",
    u"デェ": "dye",
    u"デォ": "dho",
    u"ファ": "fa",
    u"フィ": "fi",
    u"ホゥ": "hu",
    u"フェ": "fe",
    u"フォ": "fo",

    # Less regular: a small i turns the syllable into consonant + "y"
    u"シィ": "sy",
    u"ビィ": "by",
    u"ピィ": "py",
}
74
def romanize(string):
    """Converts a string of kana to roomaji.

    `string` is walked one character at a time:

    * Full-width Latin forms (U+FF01 through U+FF5E) are replaced by their
      ASCII counterparts.
    * A small katakana vowel fuses with the character before it when the pair
      is listed in `_roomaji_small_kana_combos`; otherwise it is written out
      as an ordinary vowel.
    * Youon (small ya/yu/yo) replace the trailing -i of the preceding kana.
    * Sokuon (small tsu) doubles the first letter of the following kana.
    * The long-vowel mark repeats the preceding vowel.
    * An apostrophe separates a syllabic n from a following kana whose
      romanization starts with a vowel or y.
    * Any other character is copied through untouched.

    Returns the romanized text as a unicode string.

    Raises ValueError when the input cannot be romanized: a sokuon that is
    the last character or is followed by a full-width Latin character, a
    non-kana character, or a kana starting with a vowel or y; a youon that
    does not follow an -i kana; or a long-vowel mark that does not follow a
    vowel.
    """

    vowels = ('a', 'e', 'i', 'o', 'u', 'y')

    characters = []
    last_kana = None  # Used for ー; っ or ッ; ん or ン
    last_char = None  # Used for small kana combos
    for char in string:
        codepoint = ord(char)

        # Full-width Latin.  U+FF01..U+FF5E mirror ASCII 0x21..0x7E one for
        # one, so the whole range (digits from '０', punctuation from '！')
        # is shifted down by the same offset.
        if 0xff01 <= codepoint <= 0xff5e:
            if last_kana == 'sokuon':
                raise ValueError("Sokuon cannot precede Latin characters.")

            characters.append(chr(codepoint - 0xff01 + 0x21))

            last_kana = None

        # Small vowel kana
        elif char in _roomaji_small_kana:
            vowel = _roomaji_small_kana[char]

            # At the very start of the string there is no previous character
            # to pair with, so no combo can apply.
            combo = None if last_char is None else last_char + char
            if combo in _roomaji_small_kana_combos:
                # The pair is a single unit; overwrite what the leading kana
                # produced on its own.
                characters[-1] = _roomaji_small_kana_combos[combo]
            else:
                # If we don't know what it is... act dumb and treat it as a
                # full-size vowel.  Better than bailing.
                characters.append(vowel)

            last_kana = vowel

        # Youon
        elif char in _roomaji_youon:
            # A bare "i" does not count: a consonant is needed in front.
            if not last_kana or last_kana[-1] != 'i' or last_kana == 'i':
                raise ValueError("Youon must follow an -i sound.")

            # Drop the -i and append the ya/yu/yo sound
            new_sound = _roomaji_youon[char]
            if last_kana in ['chi', 'shi', 'ji']:
                # Strip the y-
                new_char = last_kana[:-1] + new_sound[1:]
            else:
                new_char = last_kana[:-1] + new_sound

            characters[-1] = new_char
            last_kana = new_char

        # Sokuon
        elif char in (u'っ', u'ッ'):
            # Remember it and double the consonant next time around
            last_kana = 'sokuon'

        # Long-vowel mark
        elif char == u'ー':
            # last_kana is None at the start of the string and after any
            # non-kana character; that is an input error, not a crash.
            if not last_kana or last_kana[-1] not in vowels:
                raise ValueError(u"'ー' must follow by a vowel.")
            characters.append(last_kana[-1])

            last_kana = None

        # Regular ol' kana
        elif char in _roomaji_kana:
            kana = _roomaji_kana[char]

            if last_kana == 'sokuon':
                if kana[0] in vowels:
                    raise ValueError("Sokuon cannot precede a vowel.")

                characters.append(kana[0])
            elif last_kana == 'n' and kana[0] in vowels:
                # Keep "n'a" distinct from the single kana "na"
                characters.append("'")

            characters.append(kana)

            last_kana = kana

        # Not Japanese?
        else:
            if last_kana == 'sokuon':
                raise ValueError("Sokuon must be followed by another kana.")

            characters.append(char)

            last_kana = None

        last_char = char

    if last_kana == 'sokuon':
        raise ValueError("Sokuon cannot be the last character.")

    # Joining on a unicode separator always yields unicode, including for
    # empty or all-ASCII output, without needing the unicode() builtin.
    return u''.join(characters)