|
|
import re
|
|
|
from janome.tokenizer import Tokenizer
|
|
|
|
|
|
|
|
|
def normalize_ipa_string(raw_phoneme_str: str) -> str:
    """
    Cleans a raw phoneme string from any G2P system into a standardized format
    with IPA prosody markers.

    Ideographic/ASCII commas become the minor break ' | ', sentence-final
    punctuation becomes the major break ' ‖ ', and runs of whitespace are
    collapsed to single spaces with the ends trimmed.
    """
    # Both comma variants mark a minor prosodic break.
    marked = re.sub(r'[、,]', ' | ', raw_phoneme_str)
    # Periods and question/exclamation marks (ASCII or full-width) mark a major break.
    marked = re.sub(r'[.。??!!]', ' ‖ ', marked)
    # Collapse whitespace runs and trim the result.
    return re.sub(r'\s+', ' ', marked).strip()
|
|
|
|
|
|
|
|
|
class JapaneseToIPA:
    """
    V5.0: A G2P engine improved with a flexible phonetic chart.

    - Implements detailed phonetic transcriptions (palatalization, dentals).
    - Handles contextual assimilation of the moraic nasal 'ん' (/N/).
    - Models allophonic variations like bidakuon (nasal 'g') and fricative 'b'.
    """

    def __init__(self, silent=False):
        """Set up the Janome tokenizer and the phonetic lookup tables.

        Args:
            silent: When True, suppress initialization progress messages.
        """
        if not silent:
            print("Initializing Janome tokenizer...")

        self.tokenizer = Tokenizer()

        if not silent:
            print("Janome initialized.")

        # Pitch-accent patterns keyed by katakana reading:
        #   'a' -> atamadaka (accent on the first mora),
        #   'o' -> odaka    (accent on the last mora),
        #   'h' -> heiban   (flat; handled like an unknown word, i.e. the
        #                    default second-mora rule in _phonemize_line).
        self.accent_dict = {
            'ワタシ': 'h', 'ネコ': 'h', 'イヌ': 'o', 'ハシ': 'a', 'コトバ': 'h', 'オンガク': 'a', 'トウキョウ': 'h',
            'ニホン': 'o', 'センセイ': 'h', 'ガクセイ': 'h', 'ケイザイ': 'h', 'カゾク': 'o', 'ジカン': 'h',
            'クルマ': 'h', 'デンワ': 'a', 'トモダチ': 'h', 'オイシイ': 'h', 'タノシイ': 'h', 'タカイ': 'o',
            'ヤスイ': 'o', 'アタラシイ': 'h', 'オオキイ': 'h', 'チイサイ': 'o', 'イイ': 'o', 'ワルイ': 'o',
            'ムズカシイ': 'h', 'キレイ': 'o', 'ゲンキ': 'o', 'ベンリ': 'o', 'スキ': 'h', 'イク': 'h',
            'タベル': 'o', 'ミル': 'o', 'スル': 'h', 'クル': 'o', 'カク': 'o', 'ヨム': 'o', 'ハナス': 'o',
            'アル': 'o', 'イル': 'h', 'アリガトウ': 'o', 'デス': 'h', 'マス': 'o',
            'オッカア': 'o', 'オレ': 'h', 'フシギ': 'h', 'クリ': 'o', 'マツタケ': 'o', 'マイニチ': 'h'
        }

        # Katakana -> romaji. Two-character youon keys ('キャ', ...) are
        # looked up before single characters in _kata_to_romaji. 'ー' maps to
        # ':' (vowel-length marker); the 'ッ' entry is never reached because
        # gemination is handled explicitly in _kata_to_romaji.
        self.kana_to_romaji_map = {
            'ア': 'a', 'イ': 'i', 'ウ': 'u', 'エ': 'e', 'オ': 'o', 'カ': 'ka', 'キ': 'ki', 'ク': 'ku', 'ケ': 'ke',
            'コ': 'ko', 'サ': 'sa', 'シ': 'shi', 'ス': 'su', 'セ': 'se', 'ソ': 'so', 'タ': 'ta', 'チ': 'chi',
            'ツ': 'tsu', 'テ': 'te', 'ト': 'to', 'ナ': 'na', 'ニ': 'ni', 'ヌ': 'nu', 'ネ': 'ne', 'ノ': 'no',
            'ハ': 'ha', 'ヒ': 'hi', 'フ': 'fu', 'ヘ': 'he', 'ホ': 'ho', 'マ': 'ma', 'ミ': 'mi', 'ム': 'mu',
            'メ': 'me', 'モ': 'mo', 'ヤ': 'ya', 'ユ': 'yu', 'ヨ': 'yo', 'ラ': 'ra', 'リ': 'ri', 'ル': 'ru',
            'レ': 're', 'ロ': 'ro', 'ワ': 'wa', 'ヲ': 'o', 'ン': 'n', 'ガ': 'ga', 'ギ': 'gi', 'グ': 'gu',
            'ゲ': 'ge', 'ゴ': 'go', 'ザ': 'za', 'ジ': 'ji', 'ズ': 'zu', 'ゼ': 'ze', 'ゾ': 'zo', 'ダ': 'da',
            'ヂ': 'ji', 'ヅ': 'zu', 'デ': 'de', 'ド': 'do', 'バ': 'ba', 'ビ': 'bi', 'ブ': 'bu', 'ベ': 'be',
            'ボ': 'bo', 'パ': 'pa', 'ピ': 'pi', 'プ': 'pu', 'ペ': 'pe', 'ポ': 'po', 'キャ': 'kya', 'キュ': 'kyu',
            'キョ': 'kyo', 'シャ': 'sha', 'シュ': 'shu', 'ショ': 'sho', 'チャ': 'cha', 'チュ': 'chu', 'チョ': 'cho',
            'ニャ': 'nya', 'ニュ': 'nyu', 'ニョ': 'nyo', 'ヒャ': 'hya', 'ヒュ': 'hyu', 'ヒョ': 'hyo', 'ミャ': 'mya',
            'ミュ': 'myu', 'ミョ': 'myo', 'リャ': 'rya', 'リュ': 'ryu', 'リョ': 'ryo', 'ギャ': 'gya', 'ギュ': 'gyu',
            'ギョ': 'gyo', 'ジャ': 'ja', 'ジュ': 'ju', 'ジョ': 'jo', 'ビャ': 'bya', 'ビュ': 'byu', 'ビョ': 'byo',
            'ピャ': 'pya', 'ピュ': 'pyu', 'ピョ': 'pyo', 'ー': ':', 'ッ': 'っ'
        }

        # Romaji mora -> IPA. Japanese /u/ is the unrounded [ɯ] throughout,
        # including after palatalized onsets (e.g. 'kyu' -> 'kʲɯ'), matching
        # the plain-row entries ('ku' -> 'kɯ').
        self.romaji_to_ipa_map = {
            'a': 'a', 'i': 'i', 'u': 'ɯ', 'e': 'e', 'o': 'o',
            'ka': 'ka', 'ki': 'kʲi', 'ku': 'kɯ', 'ke': 'ke', 'ko': 'ko', 'kya': 'kʲa', 'kyu': 'kʲɯ', 'kyo': 'kʲo',
            'ga': 'ɡa', 'gi': 'ɡʲi', 'gu': 'ɡɯ', 'ge': 'ɡe', 'go': 'ɡo', 'gya': 'ɡʲa', 'gyu': 'ɡʲɯ', 'gyo': 'ɡʲo',
            'sa': 'sa', 'shi': 'ɕi', 'su': 'sɯ', 'se': 'se', 'so': 'so', 'sha': 'ɕa', 'shu': 'ɕɯ', 'sho': 'ɕo',
            'za': 'za', 'ji': 'ʑi', 'zu': 'zɯ', 'ze': 'ze', 'zo': 'zo', 'ja': 'ʑa', 'ju': 'ʑɯ', 'jo': 'ʑo',
            'ta': 't̪a', 'chi': 'tɕi', 'tsu': 't̪sɯ', 'te': 't̪e', 'to': 't̪o', 'cha': 'tɕa', 'chu': 'tɕɯ',
            'cho': 'tɕo',
            'da': 'd̪a', 'de': 'd̪e', 'do': 'd̪o',
            'na': 'n̪a', 'ni': 'ɲi', 'nu': 'n̪ɯ', 'ne': 'n̪e', 'no': 'n̪o', 'nya': 'ɲa', 'nyu': 'ɲɯ', 'nyo': 'ɲo',
            'ha': 'ha', 'hi': 'çi', 'fu': 'ɸɯ', 'he': 'he', 'ho': 'ho', 'hya': 'ça', 'hyu': 'çɯ', 'hyo': 'ço',
            'ba': 'ba', 'bi': 'bʲi', 'bu': 'bɯ', 'be': 'be', 'bo': 'bo', 'bya': 'bʲa', 'byu': 'bʲɯ', 'byo': 'bʲo',
            'pa': 'pa', 'pi': 'pʲi', 'pu': 'pɯ', 'pe': 'pe', 'po': 'po', 'pya': 'pʲa', 'pyu': 'pʲɯ', 'pyo': 'pʲo',
            'ma': 'ma', 'mi': 'mʲi', 'mu': 'mɯ', 'me': 'me', 'mo': 'mo', 'mya': 'mʲa', 'myu': 'mʲɯ', 'myo': 'mʲo',
            'ya': 'ja', 'yu': 'jɯ', 'yo': 'jo',
            'ra': 'ɾa', 'ri': 'ɾʲi', 'ru': 'ɾɯ', 're': 'ɾe', 'ro': 'ɾo', 'rya': 'ɾʲa', 'ryu': 'ɾʲɯ', 'ryo': 'ɾʲo',
            'wa': 'wa', 'n': 'ɴ', ':': 'ː'
        }

    def _preprocess_text(self, text):
        """Normalize dialect spellings and long vowels before tokenizing."""
        # Dialect/colloquial forms the tokenizer would otherwise misread.
        text = text.replace('おれあ', 'おれは').replace('知らんが', 'しらないが').replace('おっ母', 'おっかあ')
        # Collapse repeated vowels ('ああ' -> 'あー') into explicit long vowels.
        text = re.sub(r'([あいうえおアイウエオ])\1+', r'\1ー', text)
        text = re.sub(r'んん+', 'んー', text)
        # Turn the ideographic comma into a '|' token, used as a phrase break.
        text = text.replace('、', ' | ')
        return text

    def _kata_to_romaji(self, katakana_text):
        """Convert a katakana reading to romaji, handling youon and sokuon."""
        romaji = ""
        i = 0
        while i < len(katakana_text):
            # Youon digraphs ('キャ', 'シュ', ...) take priority over singles.
            if i + 1 < len(katakana_text) and katakana_text[i:i + 2] in self.kana_to_romaji_map:
                romaji += self.kana_to_romaji_map[katakana_text[i:i + 2]]
                i += 2

            # Sokuon 'ッ': geminate by doubling the next mora's first letter.
            elif katakana_text[i] == 'ッ':
                if i + 1 < len(katakana_text):
                    next_char_combo = katakana_text[i + 1:i + 3]
                    next_char = katakana_text[i + 1]

                    if next_char_combo in self.kana_to_romaji_map:
                        next_romaji = self.kana_to_romaji_map.get(next_char_combo)
                    else:
                        next_romaji = self.kana_to_romaji_map.get(next_char, "")
                    if next_romaji: romaji += next_romaji[0]
                i += 1

            elif katakana_text[i] in self.kana_to_romaji_map:
                romaji += self.kana_to_romaji_map[katakana_text[i]]
                i += 1
            else:
                # Unknown character (stray symbol, etc.): skip it.
                i += 1
        return romaji

    def _split_moras(self, romaji):
        """Split a romaji word into moras.

        Recognizes the digraph onsets 'ch', 'sh' and 'ts' so that e.g.
        'shinkansen' yields ['shi', 'n', 'ka', 'n', 'se', 'n'] instead of
        dropping the fricative (the old single-character consonant class
        turned 'shi' into 'hi' and 'tsu' into 'su'). A trailing ':'
        (long-vowel marker) stays attached to its mora; a bare 'n' is the
        moraic nasal.

        NOTE(review): a doubled consonant produced by sokuon ('ッ') does not
        match and is silently dropped here — confirm whether gemination
        should be carried into the IPA output.
        """
        return re.findall(r'(?:ch|sh|ts|[bcdfghjklmnpqrstvwxyz])?y?[aiueo]:?|n', romaji)

    def _romaji_to_ipa(self, romaji: str) -> str:
        """Map one romaji mora to IPA; unknown moras pass through unchanged."""
        if len(romaji) > 1 and romaji.endswith(':'):
            # Long-vowel moras like 'ka:' are not map keys themselves:
            # transcribe the base mora and append the IPA length mark.
            return self.romaji_to_ipa_map.get(romaji[:-1], romaji[:-1]) + 'ː'
        return self.romaji_to_ipa_map.get(romaji, romaji)

    def _apply_allophonic_rules(self, ipa_word, speed, is_word_initial):
        """Applies contextual phonetic rules like bidakuon and fricative 'b'.

        Args:
            ipa_word: IPA transcription of a single word.
            speed: Current speaking-rate multiplier for the line.
            is_word_initial: True when the word begins a phrase; bidakuon
                applies only to non-initial /g/.
        """
        # Bidakuon: non-initial /g/ nasalizes to [ŋ] (palatalized first so
        # 'ɡʲ' is not split by the plain replacement).
        if not is_word_initial and 'ɡ' in ipa_word:
            ipa_word = ipa_word.replace('ɡʲ', 'ŋʲ').replace('ɡ', 'ŋ')

        # Fast speech: intervocalic /b/ lenites to the fricative [β].
        if speed > 1.15:
            vowels = "aiɯeo"
            ipa_word = re.sub(f'([{vowels}])bʲ', fr'\1βʲ', ipa_word)
            ipa_word = re.sub(f'([{vowels}])b([{vowels}])', fr'\1β\2', ipa_word)

        return ipa_word

    def _get_nasal_allophone(self, next_mora):
        """Determines the correct allophone for 'ん' based on the following mora."""
        if not next_mora:
            # Utterance-final: uvular [ɴ].
            return 'ɴ'

        first_char = next_mora[0]
        # Bilabial before p/b/m.
        if first_char in ['b', 'p', 'm']: return 'm'
        # Velar before k/g.
        if first_char in ['k', 'g']: return 'ŋ'

        # Alveolo-palatal before ch/j/ny/sh.
        if next_mora.startswith(('ch', 'j', 'ny')): return 'n̠'
        if next_mora.startswith('sh'): return 'n̠'

        # Dental before d/t/r/ts.
        if first_char in ['d', 't', 'r'] or next_mora.startswith('ts'):
            return 'n̪'

        # Nasalized vowel [ɰ̃] before vowels, glides and continuants.
        if first_char in ['a', 'i', 'u', 'e', 'o', 'y', 'w', 'h', 's', 'z']:
            return 'ɰ̃'

        return 'ɴ'

    def _phonemize_line(self, line):
        """Phonemize one line of Japanese text.

        Returns:
            A tuple (ipa_string, speed) where speed is a speaking-rate
            multiplier clamped to [0.7, 1.5], heuristically derived from
            punctuation, interjections and sentence-final particles.
        """
        if not line.strip(): return ("", 1.0)
        speed_score = 1.0

        # --- Speaking-rate heuristics from surface cues. ---
        clean_line = line.strip()
        if '!!' in clean_line or '?!' in clean_line:
            speed_score *= 1.25
        elif '!' in clean_line:
            speed_score *= 1.15
        if '…' in clean_line or '...' in clean_line: speed_score *= 0.85
        if '~' in clean_line or 'ー' in clean_line: speed_score *= 0.90
        if '、' in clean_line: speed_score *= 0.95
        if 'うーん' in clean_line or 'ええと' in clean_line or 'あのー' in clean_line or 'そのー' in clean_line:
            speed_score *= 0.80
        elif 'まあ' in clean_line:
            speed_score *= 0.90
        if 'きゃー' in clean_line or 'うわー' in clean_line:
            speed_score *= 1.20
        elif 'よし' in clean_line or 'ほら' in clean_line:
            speed_score *= 1.15
        if clean_line.endswith(('ぞ', 'ぜ', 'さ')):
            speed_score *= 1.10
        elif clean_line.endswith(('だよ', 'もん')):
            speed_score *= 1.05
        if clean_line.endswith(('かな', 'かしら')):
            speed_score *= 0.90
        elif clean_line.endswith(('なあ', 'なー', 'ねえ', 'ねー')):
            speed_score *= 0.85
        elif clean_line.endswith('ね'):
            speed_score *= 0.95
        if clean_line.endswith('っ'): speed_score *= 1.15
        if len(clean_line.replace('「', '').replace('」', '')) < 8: speed_score *= 1.10

        # Question detection: explicit '?', or a sentence-final particle
        # か/の (tagged 終助詞) on the last non-symbol token.
        is_question = '?' in line
        if not is_question:
            tokens_for_check = list(self.tokenizer.tokenize(line))
            for token in reversed(tokens_for_check):
                pos_major = token.part_of_speech.split(',')[0]
                if pos_major == '記号': continue
                is_sf_particle = '終助詞' in token.part_of_speech
                if (token.surface in ['か', 'の']) and is_sf_particle: is_question = True
                break

        processed_line = self._preprocess_text(line)
        tokens = self.tokenizer.tokenize(processed_line)
        line_ipa_parts = []
        is_phrase_initial = True

        for token in tokens:
            surface = token.surface
            if surface == '|':
                # Phrase boundary inserted by _preprocess_text for '、'.
                line_ipa_parts.append(',')
                is_phrase_initial = True
                continue

            pos_full = token.part_of_speech
            pos_major = pos_full.split(',')[0]
            reading = token.reading if token.reading != '*' else surface

            if pos_major == '記号':
                # Keep symbols except quote brackets; drop the question marker
                # itself (re-expressed as the rising arrow below).
                if not (is_question and surface in ['?', 'か']):
                    if surface not in ['「', '」']: line_ipa_parts.append(surface)
                continue

            # Topic particle は is pronounced 'wa', not 'ha'.
            romaji = self._kata_to_romaji(reading) if surface != 'は' or '係助詞' not in pos_full else 'wa'
            moras = self._split_moras(romaji)

            # --- Lexical pitch accent -> IPA stress marks. ---
            num_moras = len(moras)
            primary_accent_mora = -1
            if pos_major == '助詞':
                pass  # particles carry no accent of their own
            else:
                known_pattern = self.accent_dict.get(reading)
                if known_pattern == 'a':
                    primary_accent_mora = 0
                elif known_pattern == 'o':
                    primary_accent_mora = num_moras - 1
                else:
                    # heiban / unknown word: default accent on the second mora.
                    primary_accent_mora = 1 if num_moras > 1 else -1
            apply_secondary_stress = (num_moras >= 4 and primary_accent_mora != 0)

            ipa_parts = []
            for i, mora in enumerate(moras):
                ipa_mora = ""
                if mora == 'n':
                    # Moraic nasal: assimilates to the following mora.
                    next_mora = moras[i + 1] if i + 1 < len(moras) else None
                    ipa_mora = self._get_nasal_allophone(next_mora)
                else:
                    ipa_mora = self._romaji_to_ipa(mora)

                if i == primary_accent_mora:
                    ipa_parts.append('ˈ' + ipa_mora)
                elif i == 0 and apply_secondary_stress:
                    ipa_parts.append('ˌ' + ipa_mora)
                else:
                    ipa_parts.append(ipa_mora)

            word_ipa = "".join(ipa_parts)

            word_ipa = self._apply_allophonic_rules(word_ipa, speed_score, is_phrase_initial)

            line_ipa_parts.append(word_ipa)
            # Particles attach to the preceding word, so they do not reset
            # the phrase-initial flag used by the bidakuon rule.
            if pos_major != '助詞': is_phrase_initial = False

        line_output = " ".join(part for part in line_ipa_parts if part)
        if is_question:
            line_output += ' ↗'
            speed_score *= 1.05

        final_speed = max(0.7, min(1.5, speed_score))
        return (normalize_ipa_string(line_output), final_speed)

    def __call__(self, text_block):
        """Phonemize a (possibly multi-line) block of text.

        Returns:
            A list of (ipa_string, speed) tuples, one per non-blank line.
        """
        return [self._phonemize_line(line) for line in text_block.splitlines() if line.strip()]
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Quick demonstration of the G2P engine on two sample inputs.
    engine = JapaneseToIPA()

    print("\n--- Example 1: Assimilation and Allophones ---")
    sample_text = "銀河(ぎんが)と新幹線(しんかんせん)と天ぷら(てんぷら)。"
    print(f"Input: {sample_text}")
    print(f"Output: {engine(sample_text)}")

    print("\n--- Example 2: Gongitsune Block ---")
    story_text = """「ああん?」
「おれあ、このごろ、とてもふしぎなことがあるんだ」
「何が?」
「おっ母が死んでからは、だれだか知らんが、おれに栗やまつたけなんかを、まいにちまいにちくれるんだよ」"""
    print(f"Input:\n{story_text}")
    print(f"Output: {engine(story_text)}")