# Source: Hugging Face repository upload by Pedro13543 ("Upload 5 files", commit 8cc9d61, verified).
import re
from janome.tokenizer import Tokenizer
def normalize_ipa_string(raw_phoneme_str: str) -> str:
    """
    Clean a raw phoneme string from any G2P system into a standardized
    format with IPA prosody markers.

    Commas (full- and half-width) become the short-pause marker '|';
    sentence-ending punctuation ('.', '。', '?', '?', '!', '!') becomes
    the long-pause marker '‖'. Runs of whitespace are collapsed to a
    single space and the result is stripped.
    """
    # Single-pass punctuation -> prosody-marker mapping. str.translate
    # accepts multi-character replacement strings, so the padded markers
    # go straight into the table.
    pause_table = str.maketrans({
        '、': ' | ',
        ',': ' | ',
        '.': ' ‖ ',
        '。': ' ‖ ',
        '?': ' ‖ ',
        '?': ' ‖ ',
        '!': ' ‖ ',
        '!': ' ‖ ',
    })
    marked = raw_phoneme_str.translate(pause_table)
    # split()/join() collapses any whitespace runs and trims both ends.
    return ' '.join(marked.split())
class JapaneseToIPA:
    """
    V5.0: A G2P engine improved with a flexible phonetic chart.
    - Implements detailed phonetic transcriptions (palatalization, dentals).
    - Handles contextual assimilation of the moraic nasal 'ん' (/N/).
    - Models allophonic variations like bidakuon (nasal 'g') and fricative 'b'.

    Calling an instance with a text block returns a list of
    (ipa_string, speed) tuples, one per non-empty input line.
    """
    # Mora segmentation pattern: an optional onset (one of the romaji
    # digraphs 'sh', 'ch', 'ts', or a single consonant letter), an optional
    # 'y' glide, and a vowel with an optional ':' length mark; a bare 'n'
    # is the moraic nasal.  The digraph alternatives are required: a
    # single-consonant-only pattern would drop the first onset letter,
    # splitting 'shi' as 'hi', 'chi' as 'hi' and 'tsu' as 'su', which also
    # broke the 'sh'/'ch' lookahead in _get_nasal_allophone.
    # NOTE(review): geminate consonants in romaji (e.g. 'kk' in 'gakko:')
    # are still not captured as part of a mora — TODO if gemination must
    # surface in the IPA output.
    _MORA_PATTERN = re.compile(r'(?:sh|ch|ts|[bcdfghjklmnpqrstvwxyz])?y?[aiueo]:?|n')
    def __init__(self, silent=False):
        # silent: suppress the tokenizer-initialization progress prints.
        if not silent:
            print("Initializing Janome tokenizer...")
        self.tokenizer = Tokenizer()
        if not silent:
            print("Janome initialized.")
        # Dictionary of known accent patterns, keyed by katakana reading.
        # 'h' = heiban (flat), 'o' = odaka (accent on last mora),
        # 'a' = atamadaka (accent on first mora).
        self.accent_dict = {
            'ワタシ': 'h', 'ネコ': 'h', 'イヌ': 'o', 'ハシ': 'a', 'コトバ': 'h', 'オンガク': 'a', 'トウキョウ': 'h',
            'ニホン': 'o', 'センセイ': 'h', 'ガクセイ': 'h', 'ケイザイ': 'h', 'カゾク': 'o', 'ジカン': 'h',
            'クルマ': 'h', 'デンワ': 'a', 'トモダチ': 'h', 'オイシイ': 'h', 'タノシイ': 'h', 'タカイ': 'o',
            'ヤスイ': 'o', 'アタラシイ': 'h', 'オオキイ': 'h', 'チイサイ': 'o', 'イイ': 'o', 'ワルイ': 'o',
            'ムズカシイ': 'h', 'キレイ': 'o', 'ゲンキ': 'o', 'ベンリ': 'o', 'スキ': 'h', 'イク': 'h',
            'タベル': 'o', 'ミル': 'o', 'スル': 'h', 'クル': 'o', 'カク': 'o', 'ヨム': 'o', 'ハナス': 'o',
            'アル': 'o', 'イル': 'h', 'アリガトウ': 'o', 'デス': 'h', 'マス': 'o',
            'オッカア': 'o', 'オレ': 'h', 'フシギ': 'h', 'クリ': 'o', 'マツタケ': 'o', 'マイニチ': 'h'
        }
        # Katakana to Romaji Conversion Map ('ー' becomes the ':' length
        # mark; 'ッ' normally triggers the gemination branch in
        # _kata_to_romaji before this map is consulted).
        self.kana_to_romaji_map = {
            'ア': 'a', 'イ': 'i', 'ウ': 'u', 'エ': 'e', 'オ': 'o', 'カ': 'ka', 'キ': 'ki', 'ク': 'ku', 'ケ': 'ke',
            'コ': 'ko', 'サ': 'sa', 'シ': 'shi', 'ス': 'su', 'セ': 'se', 'ソ': 'so', 'タ': 'ta', 'チ': 'chi',
            'ツ': 'tsu', 'テ': 'te', 'ト': 'to', 'ナ': 'na', 'ニ': 'ni', 'ヌ': 'nu', 'ネ': 'ne', 'ノ': 'no',
            'ハ': 'ha', 'ヒ': 'hi', 'フ': 'fu', 'ヘ': 'he', 'ホ': 'ho', 'マ': 'ma', 'ミ': 'mi', 'ム': 'mu',
            'メ': 'me', 'モ': 'mo', 'ヤ': 'ya', 'ユ': 'yu', 'ヨ': 'yo', 'ラ': 'ra', 'リ': 'ri', 'ル': 'ru',
            'レ': 're', 'ロ': 'ro', 'ワ': 'wa', 'ヲ': 'o', 'ン': 'n', 'ガ': 'ga', 'ギ': 'gi', 'グ': 'gu',
            'ゲ': 'ge', 'ゴ': 'go', 'ザ': 'za', 'ジ': 'ji', 'ズ': 'zu', 'ゼ': 'ze', 'ゾ': 'zo', 'ダ': 'da',
            'ヂ': 'ji', 'ヅ': 'zu', 'デ': 'de', 'ド': 'do', 'バ': 'ba', 'ビ': 'bi', 'ブ': 'bu', 'ベ': 'be',
            'ボ': 'bo', 'パ': 'pa', 'ピ': 'pi', 'プ': 'pu', 'ペ': 'pe', 'ポ': 'po', 'キャ': 'kya', 'キュ': 'kyu',
            'キョ': 'kyo', 'シャ': 'sha', 'シュ': 'shu', 'ショ': 'sho', 'チャ': 'cha', 'チュ': 'chu', 'チョ': 'cho',
            'ニャ': 'nya', 'ニュ': 'nyu', 'ニョ': 'nyo', 'ヒャ': 'hya', 'ヒュ': 'hyu', 'ヒョ': 'hyo', 'ミャ': 'mya',
            'ミュ': 'myu', 'ミョ': 'myo', 'リャ': 'rya', 'リュ': 'ryu', 'リョ': 'ryo', 'ギャ': 'gya', 'ギュ': 'gyu',
            'ギョ': 'gyo', 'ジャ': 'ja', 'ジュ': 'ju', 'ジョ': 'jo', 'ビャ': 'bya', 'ビュ': 'byu', 'ビョ': 'byo',
            'ピャ': 'pya', 'ピュ': 'pyu', 'ピョ': 'pyo', 'ー': ':', 'ッ': 'っ'
        }
        # Romaji to IPA Map - V5.0 Updated based on the provided phonetic chart
        self.romaji_to_ipa_map = {
            'a': 'a', 'i': 'i', 'u': 'ɯ', 'e': 'e', 'o': 'o',
            'ka': 'ka', 'ki': 'kʲi', 'ku': 'kɯ', 'ke': 'ke', 'ko': 'ko', 'kya': 'kʲa', 'kyu': 'kʲu', 'kyo': 'kʲo',
            'ga': 'ɡa', 'gi': 'ɡʲi', 'gu': 'ɡɯ', 'ge': 'ɡe', 'go': 'ɡo', 'gya': 'ɡʲa', 'gyu': 'ɡʲu', 'gyo': 'ɡʲo',
            'sa': 'sa', 'shi': 'ɕi', 'su': 'sɯ', 'se': 'se', 'so': 'so', 'sha': 'ɕa', 'shu': 'ɕu', 'sho': 'ɕo',
            'za': 'za', 'ji': 'ʑi', 'zu': 'zɯ', 'ze': 'ze', 'zo': 'zo', 'ja': 'ʑa', 'ju': 'ʑu', 'jo': 'ʑo',
            'ta': 't̪a', 'chi': 'tɕi', 'tsu': 't̪sɯ', 'te': 't̪e', 'to': 't̪o', 'cha': 'tɕa', 'chu': 'tɕu',
            'cho': 'tɕo',
            'da': 'd̪a', 'de': 'd̪e', 'do': 'd̪o',
            'na': 'n̪a', 'ni': 'ɲi', 'nu': 'n̪ɯ', 'ne': 'n̪e', 'no': 'n̪o', 'nya': 'ɲa', 'nyu': 'ɲu', 'nyo': 'ɲo',
            'ha': 'ha', 'hi': 'çi', 'fu': 'ɸɯ', 'he': 'he', 'ho': 'ho', 'hya': 'ça', 'hyu': 'çu', 'hyo': 'ço',
            'ba': 'ba', 'bi': 'bʲi', 'bu': 'bɯ', 'be': 'be', 'bo': 'bo', 'bya': 'bʲa', 'byu': 'bʲu', 'byo': 'bʲo',
            'pa': 'pa', 'pi': 'pʲi', 'pu': 'pɯ', 'pe': 'pe', 'po': 'po', 'pya': 'pʲa', 'pyu': 'pʲu', 'pyo': 'pʲo',
            'ma': 'ma', 'mi': 'mʲi', 'mu': 'mɯ', 'me': 'me', 'mo': 'mo', 'mya': 'mʲa', 'myu': 'mʲu', 'myo': 'mʲo',
            'ya': 'ja', 'yu': 'jɯ', 'yo': 'jo',
            'ra': 'ɾa', 'ri': 'ɾʲi', 'ru': 'ɾɯ', 're': 'ɾe', 'ro': 'ɾo', 'rya': 'ɾʲa', 'ryu': 'ɾʲu', 'ryo': 'ɾʲo',
            'wa': 'wa', 'n': 'ɴ', ':': 'ː'
        }
    def _preprocess_text(self, text):
        """Normalize a raw line before tokenization: fix known dialectal
        spellings, fold repeated vowels / nasals into long-vowel marks,
        and turn the Japanese comma into the '|' pause token."""
        text = text.replace('おれあ', 'おれは').replace('知らんが', 'しらないが').replace('おっ母', 'おっかあ')
        text = re.sub(r'([あいうえおアイウエオ])\1+', r'\1ー', text)
        text = re.sub(r'んん+', 'んー', text)
        text = text.replace('、', ' | ')
        return text
    def _kata_to_romaji(self, katakana_text):
        """Convert a katakana reading to romaji using kana_to_romaji_map.

        Two-character yōon kana (e.g. キャ) are matched before single
        kana; the sokuon 'ッ' doubles the first letter of the following
        mora. Unknown characters are skipped.
        """
        romaji = ""
        i = 0
        while i < len(katakana_text):
            # Check for 2-char kana first (e.g., キャ)
            if i + 1 < len(katakana_text) and katakana_text[i:i + 2] in self.kana_to_romaji_map:
                romaji += self.kana_to_romaji_map[katakana_text[i:i + 2]]
                i += 2
            # Handle gemination (ッ)
            elif katakana_text[i] == 'ッ':
                if i + 1 < len(katakana_text):
                    next_char_combo = katakana_text[i + 1:i + 3]
                    next_char = katakana_text[i + 1]
                    # Prioritize 2-char yōon sounds for gemination
                    if next_char_combo in self.kana_to_romaji_map:
                        next_romaji = self.kana_to_romaji_map.get(next_char_combo)
                    else:
                        next_romaji = self.kana_to_romaji_map.get(next_char, "")
                    if next_romaji: romaji += next_romaji[0]
                i += 1
            # Handle 1-char kana
            elif katakana_text[i] in self.kana_to_romaji_map:
                romaji += self.kana_to_romaji_map[katakana_text[i]]
                i += 1
            else:
                i += 1
        return romaji
    def _split_moras(self, romaji: str) -> list:
        """Split a romaji string into moras.

        e.g. 'shinkansen' -> ['shi', 'n', 'ka', 'n', 'se', 'n'],
             'to:kyo:'    -> ['to:', 'kyo:'].
        """
        return self._MORA_PATTERN.findall(romaji)
    def _romaji_to_ipa(self, romaji: str) -> str:
        """Map a single romaji mora to IPA; unknown input passes through."""
        return self.romaji_to_ipa_map.get(romaji, romaji)
    def _apply_allophonic_rules(self, ipa_word, speed, is_word_initial):
        """Applies contextual phonetic rules like bidakuon and fricative 'b'."""
        # Rule 1: Bidakuon (Intervocalic /g/ becomes [ŋ])
        # We apply this to any non-word-initial 'g' as a strong heuristic.
        if not is_word_initial and 'ɡ' in ipa_word:
            ipa_word = ipa_word.replace('ɡʲ', 'ŋʲ').replace('ɡ', 'ŋ')
        # Rule 2: Fricative 'b' in fast speech
        # /b/ becomes [β] between vowels.
        if speed > 1.15:  # Threshold for "fast speech"
            vowels = "aiɯeo"
            ipa_word = re.sub(f'([{vowels}])bʲ', fr'\1βʲ', ipa_word)
            ipa_word = re.sub(f'([{vowels}])b([{vowels}])', fr'\1β\2', ipa_word)
        return ipa_word
    def _get_nasal_allophone(self, next_mora):
        """Determines the correct allophone for 'ん' based on the following mora."""
        if not next_mora:
            return 'ɴ'  # End of utterance
        # Based on the /N/ chart
        first_char = next_mora[0]
        if first_char in ['b', 'p', 'm']: return 'm'
        if first_char in ['k', 'g']: return 'ŋ'
        # Romaji lookahead for more complex cases
        if next_mora.startswith(('ch', 'j', 'ny')): return 'n̠'  # Before [tɕ, ʑ, ɲ]
        if next_mora.startswith('sh'): return 'n̠'  # Before [ɕ]
        if first_char in ['d', 't', 'r'] or next_mora.startswith('ts'):
            return 'n̪'
        # Before vowels and most fricatives/approximants
        if first_char in ['a', 'i', 'u', 'e', 'o', 'y', 'w', 'h', 's', 'z']:
            return 'ɰ̃'
        return 'ɴ'  # Default fallback
    def _phonemize_line(self, line):
        """Phonemize one line of Japanese text.

        Returns (ipa_string, speed) where speed is a heuristic rate
        multiplier clamped to [0.7, 1.5].
        """
        if not line.strip(): return ("", 1.0)
        speed_score = 1.0
        # ... (Speed inference logic remains the same as original) ...
        # --- SPEED INFERENCE LOGIC (V2 - More Expressive) ---
        clean_line = line.strip()
        if '!!' in clean_line or '?!' in clean_line:
            speed_score *= 1.25
        elif '!' in clean_line:
            speed_score *= 1.15
        if '…' in clean_line or '...' in clean_line: speed_score *= 0.85
        if '~' in clean_line or 'ー' in clean_line: speed_score *= 0.90
        if '、' in clean_line: speed_score *= 0.95
        if 'うーん' in clean_line or 'ええと' in clean_line or 'あのー' in clean_line or 'そのー' in clean_line:
            speed_score *= 0.80
        elif 'まあ' in clean_line:
            speed_score *= 0.90
        if 'きゃー' in clean_line or 'うわー' in clean_line:
            speed_score *= 1.20
        elif 'よし' in clean_line or 'ほら' in clean_line:
            speed_score *= 1.15
        if clean_line.endswith(('ぞ', 'ぜ', 'さ')):
            speed_score *= 1.10
        elif clean_line.endswith(('だよ', 'もん')):
            speed_score *= 1.05
        if clean_line.endswith(('かな', 'かしら')):
            speed_score *= 0.90
        elif clean_line.endswith(('なあ', 'なー', 'ねえ', 'ねー')):
            speed_score *= 0.85
        elif clean_line.endswith('ね'):
            speed_score *= 0.95
        if clean_line.endswith('っ'): speed_score *= 1.15
        if len(clean_line.replace('「', '').replace('」', '')) < 8: speed_score *= 1.10
        # --- INTELLIGENT QUESTION DETECTION ---
        is_question = '?' in line
        if not is_question:
            tokens_for_check = list(self.tokenizer.tokenize(line))
            # Walk backwards past trailing symbols to the last real token;
            # a sentence-final particle か/の marks a question.
            for token in reversed(tokens_for_check):
                pos_major = token.part_of_speech.split(',')[0]
                if pos_major == '記号': continue
                is_sf_particle = '終助詞' in token.part_of_speech
                if (token.surface in ['か', 'の']) and is_sf_particle: is_question = True
                break
        # --- PHONEMIZATION LOGIC ---
        processed_line = self._preprocess_text(line)
        tokens = self.tokenizer.tokenize(processed_line)
        line_ipa_parts = []
        is_phrase_initial = True
        for token in tokens:
            surface = token.surface
            if surface == '|':
                line_ipa_parts.append(',')
                is_phrase_initial = True
                continue
            pos_full = token.part_of_speech
            pos_major = pos_full.split(',')[0]
            reading = token.reading if token.reading != '*' else surface
            if pos_major == '記号':
                if not (is_question and surface in ['?', 'か']):
                    if surface not in ['「', '」']: line_ipa_parts.append(surface)
                continue
            # Topic-marker は is read 'wa', not 'ha'.
            romaji = self._kata_to_romaji(reading) if surface != 'は' or '係助詞' not in pos_full else 'wa'
            moras = self._split_moras(romaji)
            # Accentuation logic
            num_moras = len(moras)
            primary_accent_mora = -1
            if pos_major == '助詞':
                pass  # Particles are unaccented.
            else:
                known_pattern = self.accent_dict.get(reading)
                if known_pattern == 'a':
                    primary_accent_mora = 0
                elif known_pattern == 'o':
                    primary_accent_mora = num_moras - 1
                else:
                    # Default heuristic: accent the second mora.
                    primary_accent_mora = 1 if num_moras > 1 else -1
            apply_secondary_stress = (num_moras >= 4 and primary_accent_mora != 0)
            # Mora to IPA conversion with 'ん' assimilation
            ipa_parts = []
            for i, mora in enumerate(moras):
                ipa_mora = ""
                if mora == 'n':
                    next_mora = moras[i + 1] if i + 1 < len(moras) else None
                    # NOTE: This handles within-word assimilation. Cross-word is a further complexity.
                    ipa_mora = self._get_nasal_allophone(next_mora)
                else:
                    ipa_mora = self._romaji_to_ipa(mora)
                # Apply stress markers
                if i == primary_accent_mora:
                    ipa_parts.append('ˈ' + ipa_mora)
                elif i == 0 and apply_secondary_stress:
                    ipa_parts.append('ˌ' + ipa_mora)
                else:
                    ipa_parts.append(ipa_mora)
            word_ipa = "".join(ipa_parts)
            # Apply allophonic rules (e.g., bidakuon)
            word_ipa = self._apply_allophonic_rules(word_ipa, speed_score, is_phrase_initial)
            line_ipa_parts.append(word_ipa)
            if pos_major != '助詞': is_phrase_initial = False
        line_output = " ".join(part for part in line_ipa_parts if part)
        if is_question:
            line_output += ' ↗'
            speed_score *= 1.05
        final_speed = max(0.7, min(1.5, speed_score))
        return (normalize_ipa_string(line_output), final_speed)
    def __call__(self, text_block):
        """Phonemize a multi-line text block.

        Returns a list of (ipa_string, speed) tuples, one per
        non-empty line of text_block.
        """
        lines = text_block.splitlines()
        full_output = []
        for line in lines:
            if line.strip():
                ipa_for_line, speed_for_line = self._phonemize_line(line)
                full_output.append((ipa_for_line, speed_for_line))
        return full_output
# --- EXAMPLE USAGE ---
if __name__ == "__main__":
    # Build the G2P engine (prints Janome initialization progress).
    g2p = JapaneseToIPA()
    print("\n--- Example 1: Assimilation and Allophones ---")
    # Exercises moraic-nasal assimilation (/N/ -> [ŋ] before k/g, [m] before p/b/m).
    test_text_1 = "銀河(ぎんが)と新幹線(しんかんせん)と天ぷら(てんぷら)。"
    print(f"Input: {test_text_1}")
    # __call__ returns a list of (ipa_string, speed) tuples, one per non-empty line.
    result_1 = g2p(test_text_1)
    print(f"Output: {result_1}")
    # Expected: ぎんが [ɡiŋŋa], しんかんせん [ɕiŋkan̪sen̪], てんぷら [t̪empɯɾa]
    print("\n--- Example 2: Gongitsune Block ---")
    # Multi-line dialogue excerpt; each line is phonemized independently
    # and gets its own inferred speed score.
    gongitsune_text = """「ああん?」
「おれあ、このごろ、とてもふしぎなことがあるんだ」
「何が?」
「おっ母が死んでからは、だれだか知らんが、おれに栗やまつたけなんかを、まいにちまいにちくれるんだよ」"""
    print(f"Input:\n{gongitsune_text}")
    result_2 = g2p(gongitsune_text)
    print(f"Output: {result_2}")