X-Git-Url: http://git.veekun.com/zzz-pokedex.git/blobdiff_plain/40300f9ea7fc541bd5f3878d98d4ff0ca4873d9c..012083841930091acb27446ad0c8a320b5f6fecf:/pokedex/roomaji.py?ds=sidebyside

diff --git a/pokedex/roomaji.py b/pokedex/roomaji.py
index d1bae0f..aa378ac 100644
--- a/pokedex/roomaji.py
+++ b/pokedex/roomaji.py
@@ -1,134 +1,244 @@
 # encoding: utf8
-"""Provides `romanize()` for romanizing simple Japanese text."""
-
-_roomaji_kana = {
-    u'ã¢': 'a',     u'ã¤': 'i',     u'ã¦': 'u',     u'ã¨': 'e',     u'ãª': 'o',
-    u'ã«': 'ka',    u'ã­': 'ki',    u'ã¯': 'ku',    u'ã±': 'ke',    u'ã³': 'ko',
-    u'ãµ': 'sa',    u'ã·': 'shi',   u'ã¹': 'su',    u'ã»': 'se',    u'ã½': 'so',
-    u'ã¿': 'ta',    u'ã': 'chi',   u'ã': 'tsu',   u'ã': 'te',    u'ã': 'to',
-    u'ã': 'na',    u'ã': 'ni',    u'ã': 'nu',    u'ã': 'ne',    u'ã': 'no',
-    u'ã': 'ha',    u'ã': 'hi',    u'ã': 'fu',    u'ã': 'he',    u'ã': 'ho',
-    u'ã': 'ma',    u'ã': 'mi',    u'ã ': 'mu',    u'ã¡': 'me',    u'ã¢': 'mo',
-    u'ã¤': 'ya',                    u'ã¦': 'yu',                    u'ã¨': 'yo',
-    u'ã©': 'ra',    u'ãª': 'ri',    u'ã«': 'ru',    u'ã¬': 're',    u'ã­': 'ro',
-    u'ã¯': 'wa',    u'ã°': 'wi',                    u'ã±': 'we',    u'ã²': 'wo',
-                                                                    u'ã³': 'n',
-    u'ã¬': 'ga',    u'ã®': 'gi',    u'ã°': 'gu',    u'ã²': 'ge',    u'ã´': 'go',
-    u'ã¶': 'za',    u'ã¸': 'ji',    u'ãº': 'zu',    u'ã¼': 'ze',    u'ã¾': 'zo',
-    u'ã': 'da',    u'ã': 'ji',    u'ã': 'dzu',   u'ã': 'de',    u'ã': 'do',
-    u'ã': 'ba',    u'ã': 'bi',    u'ã': 'bu',    u'ã': 'be',    u'ã': 'bo',
-    u'ã': 'pa',    u'ã': 'pi',    u'ã': 'pu',    u'ã': 'pe',    u'ã': 'po',
-}
-
-_roomaji_youon = {
-    u'ã£': 'ya',                    u'ã¥': 'yu',                    u'ã§': 'yo',
-    u'ã': 'ya',                    u'ã': 'yu',                    u'ã': 'yo',
-}
-
-# XXX If romanize() ever handles hiragana, it will need to make sure that the
-# preceding character was a katakana
-# This does not include every small kana combination, but should include every
-# one used in a PokÃ©mon name.  An exhaustive list would be..  very long
-_roomaji_small_kana = {
-    u'ã¡': 'a',     u'ã£': 'i',     u'ã¥': 'u',     u'ã§': 'e',     u'ã©': 'o',
-}
-_roomaji_small_kana_combos = {
-                                                    u'ãã§': 'che',
-                                                    u'ã·ã§': 'she',
-    u'ãã¡': 'tha', u'ãã£': 'ti',  u'ãã¥': 'thu', u'ãã§': 'tye', u'ãã©': 'tho',
-    u'ãã¡': 'dha', u'ãã£': 'di',  u'ãã¥': 'dhu', u'ãã§': 'dye', u'ãã©': 'dho',
-    u'ãã¡': 'fa',  u'ãã£': 'fi',  u'ãã¥': 'hu',  u'ãã§': 'fe',  u'ãã©': 'fo',
-}
-
-def romanize(string):
-    """Converts a string of kana to roomaji."""
-
-    vowels = ['a', 'e', 'i', 'o', 'u', 'y']
-
-    characters = []
-    last_kana = None  # Used for ã¼; ã£ or ã; ã or ã³
-    last_char = None  # Used for small kana combos
-    for char in string:
-        # Full-width Latin
-        if ord(char) >= 0xff11 and ord(char) <= 0xff5e:
-            if last_kana == 'sokuon':
-                raise ValueError("Sokuon cannot precede Latin characters.")
-
-            char = chr(ord(char) - 0xff11 + 0x31)
-            characters.append(char)
-
-            last_kana = None
-
-        # Small vowel kana
-        elif char in _roomaji_small_kana:
-            combo = last_char + char
-            if combo in _roomaji_small_kana_combos:
-                characters[-1] = _roomaji_small_kana_combos[combo]
-
-            else:
-                # If we don't know what it is...  act dumb and treat it as a
-                # full-size vowel.  Better than bailing, and seems to occur a
-                # lot, e.g. ãã£ is "pii"
-                characters.append(_roomaji_small_kana[char])
-
-        # Youon
-        elif char in _roomaji_youon:
-            if last_kana[-1] != 'i' or last_kana == 'i':
-                raise ValueError("Youon must follow an -i sound.")
-
-            # Drop the -i and append the ya/yu/yo sound
-            new_sound = _roomaji_youon[char]
-            if last_kana in ['shi', 'ji']:
-                # Strip the y-
-                new_char = last_kana[:-1] + new_sound[1:]
-            else:
-                new_char = last_kana[:-1] + new_sound
-
-            characters[-1] = new_char
-            last_kana = new_char
-
-        # Sokuon
-        elif char in (u'ã£', u'ã'):
-            # Remember it and double the consonant next time around
-            last_kana = 'sokuon'
-
-        # Extended vowel or n
-        elif char == u'ã¼':
-            if last_kana[-1] not in vowels:
-                raise ValueError(u"'ã¼' must follow by a vowel.")
-            characters.append(last_kana[-1])
-
-            last_kana = None
-
-        # Regular ol' kana
-        elif char in _roomaji_kana:
-            kana = _roomaji_kana[char]
-
-            if last_kana == 'sokuon':
-                if kana[0] in vowels:
-                    raise ValueError("Sokuon cannot precede a vowel.")
-
-                characters.append(kana[0])
-            elif last_kana == 'n' and kana[0] in vowels:
-                characters.append("'")
-
-            characters.append(kana)
-
-            last_kana = kana
-
-        # Not Japanese?
+"""Provides `romanize()` for romanizing simple Japanese text.
+
+Also provides available romanizers in a dictionary keyed by language identifier.
+"""
+
+class Romanizer(object):
+    def __init__(self, parent=None, **tables):
+        """Create a Romanizer.
+
+        parent: A Romanizer to base this one on (optional)
+        tables: Dicts that become the object's attributes. If a parent is given,
+            its tables are inherited and updated with the given ones
+        """
+        self.parent = parent
+        if parent:
+            self.tables = dict(parent.tables)  # own mapping; parent's stays intact
+            for name, table in tables.items():
+                # Copy each overridden table too, so the parent's dicts are not mutated
+                self.tables[name] = dict(self.tables.get(name, {}))
+                self.tables[name].update(table)
         else:
-            if last_kana == 'sokuon':
-                raise ValueError("Sokuon must be followed by another kana.")
-
-            characters.append(char)
-
-            last_kana = None
-
-        last_char = char
-
-
-    if last_kana == 'sokuon':
-        raise ValueError("Sokuon cannot be the last character.")
-
-    return unicode(''.join(characters))
+            self.tables = tables
+
+        for name, table in self.tables.items():
+            setattr(self, name, table)
+
+    def romanize(self, string):
+        """Convert a string of kana to roomaji."""
+
+        vowels = ['a', 'e', 'i', 'o', 'u', 'y']
+
+        characters = []
+        last_kana = None  # Used for ã¼; ã£ or ã; ã or ã³
+        last_char = None  # Used for small kana combos
+        for char in string:
+            # Full-width Latin
+            if 0xff01 <= ord(char) <= 0xff5e:
+                if last_kana == 'sokuon':
+                    raise ValueError("Sokuon cannot precede Latin characters.")
+
+                # XXX Real Unicode decomposition would be nicer
+                char = chr(ord(char) - 0xff01 + 0x21)
+                characters.append(char)
+
+                last_kana = None
+
+            # Small vowel kana
+            elif char in self.roomaji_small_kana:
+                combo = last_char + char
+                if combo in self.roomaji_small_kana_combos:
+                    characters[-1] = self.roomaji_small_kana_combos[combo]
+
+                else:
+                    # If we don't know what it is...  act dumb and treat it as a
+                    # full-size vowel.  Better than bailing, and seems to occur a
+                    # lot, e.g. ãã£ is "pii"
+                    characters.append(self.roomaji_small_kana[char])
+
+                last_kana = self.roomaji_small_kana[char]
+
+            # Youon
+            elif char in self.roomaji_youon:
+                if not last_kana or last_kana[-1] != 'i' or last_kana == 'i':
+                    raise ValueError("Youon must follow an -i sound.")
+
+                # Drop the -i and append the ya/yu/yo sound
+                new_sound = self.roomaji_youon[char]
+                if last_kana in self.y_drop:
+                    # Strip the y-
+                    new_char = self.y_drop[last_kana] + new_sound[1:]
+                else:
+                    new_char = last_kana[:-1] + new_sound
+
+                characters[-1] = new_char
+                last_kana = new_char
+
+            # Sokuon
+            elif char in (u'ã£', u'ã'):
+                # Remember it and double the consonant next time around
+                last_kana = 'sokuon'
+
+            # Extended vowel or n
+            elif char == u'ã¼':
+                if last_kana[-1] not in vowels:
+                    raise ValueError(u"'ã¼' must follow by a vowel.")
+                if last_kana[-1] in self.lengthened_vowels:
+                    characters[-1] = characters[-1][:-1]
+                    characters.append(self.lengthened_vowels[last_kana[-1]])
+                else:
+                    characters.append(last_kana[-1])
+
+                last_kana = None
+
+            # Regular ol' kana
+            elif char in self.roomaji_kana:
+                kana = self.roomaji_kana[char]
+
+                if last_kana == 'sokuon':
+                    if kana[0] in vowels:
+                        raise ValueError("Sokuon cannot precede a vowel.")
+
+                    characters.append(kana[0])
+                elif last_kana == 'n' and kana[0] in vowels:
+                    characters.append("'")
+
+                # Special characters for doubled kana
+                if kana[0] in self.lengthened_vowels and characters and kana == characters[-1][-1]:
+                    kana = self.lengthened_vowels[kana[0]]
+                    characters[-1] = characters[-1][:-1]
+
+                characters.append(kana)
+
+                last_kana = kana
+
+            # Not Japanese?
+            else:
+                if last_kana == 'sokuon':
+                    raise ValueError("Sokuon must be followed by another kana.")
+
+                characters.append(char)
+
+                last_kana = None
+
+            last_char = char
+
+
+        if last_kana == 'sokuon':
+            raise ValueError("Sokuon cannot be the last character.")
+
+        return unicode(''.join(characters))
+
+
+romanizers = dict()
+
+romanizers['en'] = Romanizer(
+    roomaji_kana={
+        # Hiragana
+        u'ã': 'a',     u'ã': 'i',     u'ã': 'u',     u'ã': 'e',     u'ã': 'o',
+        u'ã': 'ka',    u'ã': 'ki',    u'ã': 'ku',    u'ã': 'ke',    u'ã': 'ko',
+        u'ã': 'sa',    u'ã': 'shi',   u'ã': 'su',    u'ã': 'se',    u'ã': 'so',
+        u'ã': 'ta',    u'ã¡': 'chi',   u'ã¤': 'tsu',   u'ã¦': 'te',    u'ã¨': 'to',
+        u'ãª': 'na',    u'ã«': 'ni',    u'ã¬': 'nu',    u'ã­': 'ne',    u'ã®': 'no',
+        u'ã¯': 'ha',    u'ã²': 'hi',    u'ãµ': 'fu',    u'ã¸': 'he',    u'ã»': 'ho',
+        u'ã¾': 'ma',    u'ã¿': 'mi',    u'ã': 'mu',    u'ã': 'me',    u'ã': 'mo',
+        u'ã': 'ya',                    u'ã': 'yu',                    u'ã': 'yo',
+        u'ã': 'ra',    u'ã': 'ri',    u'ã': 'ru',    u'ã': 're',    u'ã': 'ro',
+        u'ã': 'wa',    u'ã': 'wi',                    u'ã': 'we',    u'ã': 'wo',
+                                                                        u'ã': 'n',
+        u'ã': 'ga',    u'ã': 'gi',    u'ã': 'gu',    u'ã': 'ge',    u'ã': 'go',
+        u'ã': 'za',    u'ã': 'ji',    u'ã': 'zu',    u'ã': 'ze',    u'ã': 'zo',
+        u'ã ': 'da',    u'ã¢': 'ji',    u'ã¥': 'dzu',   u'ã§': 'de',    u'ã©': 'do',
+        u'ã°': 'ba',    u'ã³': 'bi',    u'ã¶': 'bu',    u'ã¹': 'be',    u'ã¼': 'bo',
+        u'ã±': 'pa',    u'ã´': 'pi',    u'ã·': 'pu',    u'ãº': 'pe',    u'ã½': 'po',
+
+        # Katakana
+        u'ã¢': 'a',     u'ã¤': 'i',     u'ã¦': 'u',     u'ã¨': 'e',     u'ãª': 'o',
+        u'ã«': 'ka',    u'ã­': 'ki',    u'ã¯': 'ku',    u'ã±': 'ke',    u'ã³': 'ko',
+        u'ãµ': 'sa',    u'ã·': 'shi',   u'ã¹': 'su',    u'ã»': 'se',    u'ã½': 'so',
+        u'ã¿': 'ta',    u'ã': 'chi',   u'ã': 'tsu',   u'ã': 'te',    u'ã': 'to',
+        u'ã': 'na',    u'ã': 'ni',    u'ã': 'nu',    u'ã': 'ne',    u'ã': 'no',
+        u'ã': 'ha',    u'ã': 'hi',    u'ã': 'fu',    u'ã': 'he',    u'ã': 'ho',
+        u'ã': 'ma',    u'ã': 'mi',    u'ã ': 'mu',    u'ã¡': 'me',    u'ã¢': 'mo',
+        u'ã¤': 'ya',                    u'ã¦': 'yu',                    u'ã¨': 'yo',
+        u'ã©': 'ra',    u'ãª': 'ri',    u'ã«': 'ru',    u'ã¬': 're',    u'ã­': 'ro',
+        u'ã¯': 'wa',    u'ã°': 'wi',                    u'ã±': 'we',    u'ã²': 'wo',
+                                                                        u'ã³': 'n',
+        u'ã¬': 'ga',    u'ã®': 'gi',    u'ã°': 'gu',    u'ã²': 'ge',    u'ã´': 'go',
+        u'ã¶': 'za',    u'ã¸': 'ji',    u'ãº': 'zu',    u'ã¼': 'ze',    u'ã¾': 'zo',
+        u'ã': 'da',    u'ã': 'ji',    u'ã': 'dzu',   u'ã': 'de',    u'ã': 'do',
+        u'ã': 'ba',    u'ã': 'bi',    u'ã': 'bu',    u'ã': 'be',    u'ã': 'bo',
+        u'ã': 'pa',    u'ã': 'pi',    u'ã': 'pu',    u'ã': 'pe',    u'ã': 'po',
+                                        u'ã´': 'vu',
+    },
+
+    roomaji_youon={
+        # Hiragana
+        u'ã': 'ya',                    u'ã': 'yu',                    u'ã': 'yo',
+
+        # Katakana
+        u'ã£': 'ya',                    u'ã¥': 'yu',                    u'ã§': 'yo',
+    },
+
+    # XXX If romanize() ever handles hiragana, it will need to make sure that the
+    # preceding character was a katakana
+    # This does not include every small kana combination, but should include every
+    # one used in a PokÃ©mon name.  An exhaustive list would be..  very long
+    roomaji_small_kana={
+        u'ã¡': 'a',     u'ã£': 'i',     u'ã¥': 'u',     u'ã§': 'e',     u'ã©': 'o',
+    },
+    roomaji_small_kana_combos={
+        # These are, by the way, fairly arbitrary.  "shi xi" to mean "sy" is
+        # particularly weird, but it seems to be what GF intends
+
+        # Simple vowel replacement
+                        u'ã¦ã£': 'wi',  u'ã¦ã¥': 'wu',  u'ã¦ã§': 'we',  u'ã¦ã©': 'wo',
+        u'ã´ã¡': 'va',  u'ã´ã£': 'vi',                  u'ã´ã§': 've',  u'ã´ã©': 'vo',
+                                                        u'ãã§': 'che',
+                                                        u'ã·ã§': 'she',
+                                                        u'ã¸ã§': 'je',
+        u'ãã¡': 'tha', u'ãã£': 'ti',  u'ãã¥': 'thu', u'ãã§': 'tye', u'ãã©': 'tho',
+        u'ãã¡': 'dha', u'ãã£': 'di',  u'ãã¥': 'dhu', u'ãã§': 'dye', u'ãã©': 'dho',
+        u'ãã¡': 'fa',  u'ãã£': 'fi',  u'ãã¥': 'hu',  u'ãã§': 'fe',  u'ãã©': 'fo',
+
+        # Not so much
+        u'ã·ã£': 'sy',
+        u'ãã£': 'my',
+        u'ãã£': 'by',
+        u'ãã£': 'py',
+    },
+    lengthened_vowels={},
+    y_drop={'chi': 'ch', 'shi': 'sh', 'ji': 'j'},
+)
+
+romanizers['cs'] = Romanizer(parent=romanizers['en'],
+    roomaji_kana={
+        u'ã': u'Å¡i', u'ã¡': u'Äi', u'ã¤': u'cu',
+        u'ã': u'ja', u'ã': u'ju', u'ã': u'jo',
+        u'ã': u'dÅ¾i', u'ã¢': u'dÅ¾i',
+        u'ã·': u'Å¡i', u'ã': u'Äi', u'ã': u'cu',
+        u'ã¤': u'ja', u'ã¦': u'ju', u'ã¨': 'jo',
+        u'ã¸': u'dÅ¾i', u'ã': u'dÅ¾i',
+    },
+    roomaji_youon={
+        u'ã': 'ja', u'ã': 'ju', u'ã': 'jo',
+        u'ã£': 'ja', u'ã¥': 'ju', u'ã§': 'jo',
+    },
+    roomaji_small_kana_combos={
+        u'ãã§': u'Äe', u'ã·ã§': u'Å¡e', u'ã¸ã§': u'dÅ¾e',
+        u'ãã§': u'tje', u'ãã§': u'dje',
+        u'ã·ã£': u'sÃ­', u'ãã£': u'mÃ­', u'ãã£': u'bÃ­', u'ãã£': u'pÃ­',
+    },
+    lengthened_vowels={'a': u'Ã¡', 'e': u'Ã©', 'i': u'Ã­', 'o': u'Ã³', 'u': u'Ãº'},
+    y_drop={u'Äi': u'Ä', u'Å¡i': u'Å¡', u'dÅ¾i': u'dÅ¾', u'ni': u'Åj'},
+)
+
+def romanize(string, lang='en'):
+    """Convert a string of kana to roomaji, using the romanizer for `lang`."""
+
+    # Unknown languages fall back to the English romanizer object (not its key)
+    romanizer = romanizers.get(lang, romanizers['en'])
+
+    # Romanize away!
+    return romanizer.romanize(string)