Utilisateur:Ftiercel/ja-kana2romaji.py

ja-kana2romaji.py
#!/usr/bin/python
# -*- coding: utf-8  -*-

# Author : Fabrice TIERCELIN
# Date : 2008-02-16
# Version : 1.1
# Licence : GPL

import sys
import re
import time

# kana2romaji returns the romaji of the given word in kana (either hiragana or katakana)
#
# How it works:
# The mechanism is based on the big characters (for instance: いらっしゃる -> いらしる)
# because the romaji of those characters are all separated by dots.
# Each iteration creates the romaji of one syllable.
# Then, the romaji of each big character is divided into two parts (1):
#  - the consonant (if any)
#  - the vowel
# For the first consonant, there are several romaji, depending on the context (2):
#  - there is a ん before
#  - there is a っ before
# For the vowel, there are several romaji too (2):
#  - there is a ゃ or similar after
#  - there is no small character around
# Last, we check if there is any ん or small character around and we choose the right romaji (3).
    
def pop(kana):
    """Split a string into its first character and the rest.

    kana -- a non-empty string.

    Returns a ``(first character, remaining characters)`` tuple; the second
    item is the empty string when ``kana`` holds a single character.

    Raises IndexError when ``kana`` is empty.
    """
    # Index first so that an empty string still raises IndexError, exactly
    # as callers of the previous implementation could observe.
    return kana[0], kana[1:]
    
def _expand(groups):
    """Flatten ``(characters, value)`` pairs into a ``{character: value}`` dict."""
    table = {}
    for characters, value in groups:
        for character in characters:
            table[character] = value
    return table


# Big character -> (consonant, consonant when a っ precedes it).
# Characters missing from this table (plain vowels, を, and anything that is
# not a known kana) have no consonant at all.
_CONSONANTS = _expand([
    (u'かきくけこカキクケコ', (u'k', u'kk')),
    (u'がぎぐげごガギグゲゴ', (u'g', u'gg')),
    (u'さすせそサスセソ', (u's', u'ss')),
    (u'しシ', (u'sh', u'ssh')),
    (u'たてとタテト', (u't', u'tt')),
    (u'ちチ', (u'ch', u'tch')),
    (u'つツ', (u'ts', u'tts')),
    (u'ざずぜぞづザズゼゾヅ', (u'z', u'zz')),
    (u'だでどダデド', (u'd', u'dd')),
    (u'じぢジヂ', (u'j', u'jj')),
    (u'なにぬねのナニヌネノ', (u'n', u'nn')),
    (u'はひへほハヒヘホ', (u'h', u'hh')),
    (u'ふフ', (u'f', u'ff')),
    (u'ばびぶべぼバビブベボ', (u'b', u'bb')),
    (u'ぱぴぷぺぽパピプペポ', (u'p', u'pp')),
    (u'まみむめもマミムメモ', (u'm', u'mm')),
    (u'やゆよヤユヨ', (u'y', u'yy')),
    (u'らりるれろラリルレロ', (u'r', u'rr')),
    (u'わゐゑワヰヱ', (u'w', u'ww')),
    # The v-row used to have vowels but no consonant, so ヴ came out as "u".
    (u'ゔヴヷヸヹヺ', (u'v', u'vv')),
])

# Big character -> (vowel, vowel when it is lengthened).
# The lengthened i and e differ between hiragana (ii, ei) and katakana (ī, ē).
_VOWELS = _expand([
    (u'あかがさざただなはばぱまやらわアカガサザタダナハバパマヤラワヷ', (u'a', u'ā')),
    (u'いきぎしじちぢにひびぴみりゐ', (u'i', u'ii')),
    (u'イキギシジチヂニヒビピミリヰヸ', (u'i', u'ī')),
    (u'うゔくぐすずつづぬふぶぷむゆるウヴクグスズツヅヌフブプムユル', (u'u', u'ū')),
    (u'えけげせぜてでねへべぺめれゑ', (u'e', u'ei')),
    (u'エケゲセゼテデネヘベペメレヱヹ', (u'e', u'ē')),
    (u'おこごそぞとどのほぼぽもよろをオコゴソゾトドノホボポモヨロヲヺ', (u'o', u'ō')),
])

# Small character -> (vowel, vowel when a ー follows the small character).
_SMALL_VOWELS = _expand([
    (u'ゃぁャァ', (u'a', u'ā')),
    (u'ゅぅュゥ', (u'u', u'ū')),
    (u'ぇ', (u'e', u'ei')),
    (u'ェ', (u'e', u'ē')),
    (u'ょぉョォ', (u'o', u'ō')),
])

_SOKUON = frozenset(u'っッ')
_MORAIC_N = frozenset(u'んン')
# After ん, these characters need an apostrophe ("n'") to stay unambiguous.
_APOSTROPHE_AFTER_N = frozenset(u'あいうえおをやゆよアイウエオヲヤユヨ')
# Big characters that insert a "y" before a following small character.
_GLIDE_ROW = frozenset(u'いきぎにひびぴみりゐイキギニヒビピミリヰ')
# Small characters that combine with any big character.
_YOON = frozenset(u'ゃゅょャュョ')
# Big characters that also combine with the small vowels listed just below.
_PALATALS = frozenset(u'しじちぢシジチヂ')
_SMALL_AFTER_PALATAL = frozenset(u'ぁぅぇぉァゥェォ')
# Characters that, in any other position, lengthen the previous vowel.
_LENGTHENERS = frozenset(u'ぁぃぅぇぉァィゥェォー')


def kana2romaji(kana):
    """Return the romaji of a word written in hiragana and/or katakana.

    kana -- the word to transcribe, as a (unicode) string.

    Returns the romaji as a string; an empty word gives an empty string.
    Each syllable is written as a consonant followed by a vowel:

    - ん becomes "n", or "n'" when a vowel or a y-syllable follows;
    - っ doubles the consonant of the following big character; a っ with no
      big character after it (end of word, or followed by ん / っ) is dropped;
    - ゃ, ゅ, ょ (and small vowels after し, じ, ち, ぢ) replace the vowel of
      the big character, with a "y" in between for the other i-row characters;
    - any other small vowel, or ー, lengthens the vowel of the big character;
    - a character that is not a known big kana is written "?".
    """
    parts = []
    length = len(kana)
    pos = 0
    while pos < length:
        char = kana[pos]
        pos += 1

        # ん stands on its own: the next character is deliberately left for
        # the next iteration so that a following ん or っ is not mistaken for
        # a big character.
        if char in _MORAIC_N:
            if pos < length and kana[pos] in _APOSTROPHE_AFTER_N:
                parts.append(u"n'")
            else:
                parts.append(u'n')
            continue

        # っ is attached to the big character that follows it.
        doubled = False
        if char in _SOKUON:
            if pos >= length or kana[pos] in _SOKUON or kana[pos] in _MORAIC_N:
                # Nothing to double: drop the っ.
                continue
            doubled = True
            char = kana[pos]
            pos += 1

        # 1. The big character
        consonant, doubledConsonant = _CONSONANTS.get(char, (u'', u''))
        vowel, longVowel = _VOWELS.get(char, (u'?', u'?'))
        if char in _GLIDE_ROW:
            glide = u'y'
        else:
            glide = u''

        # 2. Romaji writing: the consonant
        if doubled:
            parts.append(doubledConsonant)
        else:
            parts.append(consonant)

        # 3. Romaji writing: the vowel, which depends on the next character
        if pos < length:
            following = kana[pos]
        else:
            following = None

        combines = (following in _YOON) or (
            char in _PALATALS and following in _SMALL_AFTER_PALATAL)
        if combines:
            smallVowel, longSmallVowel = _SMALL_VOWELS[following]
            pos += 1
            if pos < length and kana[pos] == u'ー':
                # The ー lengthens the small character, not the big one.
                pos += 1
                parts.append(glide + longSmallVowel)
            else:
                parts.append(glide + smallVowel)
        elif following in _LENGTHENERS:
            pos += 1
            parts.append(longVowel)
        else:
            parts.append(vowel)
    return u''.join(parts)

if __name__ == "__main__":
    # Script entry point: reads ./japaneseWords.txt (UTF-8, one word per
    # line) and writes "<word><TAB><romaji>" lines, terminated by CR LF, to
    # ./romaji-<hour>-<minute>-<second>.txt (UTF-8).

    # io.open decodes/encodes at the file boundary and behaves the same on
    # Python 2.6+ and Python 3, unlike str.decode / byte concatenation.
    import io

    # The `with` blocks close both files even if a line fails to convert.
    with io.open('./japaneseWords.txt', 'r', encoding='utf-8') as inputFile:
        wordList = inputFile.readlines()

    now = time.localtime()
    filename = './romaji-' + str(now.tm_hour) + '-' + str(now.tm_min) + '-' + str(now.tm_sec) + '.txt'

    # newline='' writes the explicit "\r\n" verbatim on every platform
    # instead of letting text mode translate the "\n" a second time.
    with io.open(filename, 'w', encoding='utf-8', newline='') as outputFile:
        for rawWord in wordList:
            word = rawWord.strip(u'\r\n ')
            outputFile.write(word + u'\t' + kana2romaji(word) + u'\r\n')