parakeet-tdt-0.6b-v2

Running

App Files Files Community

sungo-ganpare committed on May 25

Commit

c9be4ad

1 Parent(s): f67cd0b

音声処理の設定を強化し、セグメント分割機能を改善。自然な区切り点を探す関数を追加し、VTTファイルのサイズ制限を設定。バッチサイズを2に変更し、エラーハンドリングを強化。

Browse files

Files changed (1) hide show

app.py +138 -22

app.py CHANGED Viewed

@@ -19,7 +19,23 @@ except ImportError:
 # グローバル設定
 MODEL_NAME = "nvidia/parakeet-tdt-0.6b-v2"
-LONG_AUDIO_THRESHOLD_SECONDS = 480  # 8分 (この秒数を超えると長尺用設定を試みる)
 device = "cuda" if torch.cuda.is_available() else "cpu" # スクリプト起動時のデバイス検出
 # モデルの初期化 (グローバルに一度だけ行う)
@@ -31,6 +47,67 @@ model.eval()
 model.cpu()
 print("ASR model initialized and moved to CPU.")
 def transcribe_audio_core(
     audio_path: str,
@@ -62,7 +139,7 @@ def transcribe_audio_core(
             gr.Info(f"Audio duration ({duration_sec:.2f}s) exceeds threshold. Applying long audio settings.", duration=3)
             try:
                 print("Applying long audio settings: Local Attention and Chunking.")
-                model.change_attention_model("rel_pos_local_attn", [256,256])
                 model.change_subsampling_conv_chunking_factor(1)
                 long_audio_settings_applied = True
                 print("Successfully applied long audio settings.")
@@ -79,7 +156,7 @@ def transcribe_audio_core(
         # 文字起こし実行
         print(f"Transcribing {audio_path}...")
-        output = model.transcribe([audio_path], timestamps=True)
         print("Transcription API call finished.")
         if not output or not isinstance(output, list) or not output[0] or \
@@ -91,14 +168,65 @@ def transcribe_audio_core(
             return None, None, None
         segment_timestamps = output[0].timestamp['segment']
-        vis_data = [[f"{ts['start']:.2f}", f"{ts['end']:.2f}", ts['segment']] for ts in segment_timestamps]
-        raw_times_data = [[ts['start'], ts['end']] for ts in segment_timestamps]
         word_timestamps_raw = output[0].timestamp.get("word", [])
-        word_vis_data = [
-            [f"{w['start']:.2f}", f"{w['end']:.2f}", w["word"]]
-            for w in word_timestamps_raw if isinstance(w, dict) and 'start' in w and 'end' in w and 'word' in w
-        ]
         gr.Info("Transcription successful!", duration=3)
         return vis_data, raw_times_data, word_vis_data
@@ -162,7 +290,6 @@ def process_audio_file(audio_filepath: str) -> dict: # Gradioから渡される
             # pydubが失敗しても、NeMoは処理を試みることができるので、duration_sec = 0 で続行
             duration_sec = 0
     # 文字起こしコア処理を呼び出し
     vis_data, raw_times_data, word_vis_data = transcribe_audio_core(audio_filepath, duration_sec, current_processing_device)
@@ -200,12 +327,6 @@ with gr.Blocks() as demo:
     file_input = gr.File(label="Upload Audio File", type="filepath") # type="filepath" を明示
     output_json = gr.JSON(label="Transcription Result")
-    # .change() は非推奨なので .upload() を使うか、ボタンを使うのが一般的
-    # ここでは元の構造に合わせて .change() を使うが、ファイル選択後すぐに処理が走る
-    # 大容量ファイルの場合、アップロード完了を待つボタンの方がUXが良い
-    # transcribe_button = gr.Button("Transcribe File")
-    # transcribe_button.click(
     file_input.change( # ファイルがアップロード/変更されたら実行
         fn=process_audio_file,
         inputs=[file_input],
@@ -214,15 +335,11 @@ with gr.Blocks() as demo:
     gr.Examples(
         examples=[
             [os.path.join(os.path.dirname(__file__), "audio_example.wav") if os.path.exists(os.path.join(os.path.dirname(__file__), "audio_example.wav")) else "https://www.kozco.com/tech/piano2-CoolEdit.mp3"]
-            # ダミーの音声ファイルパスまたはURL。実際に存在するファイルパスに置き換えてください。
-            # Hugging Face Spacesで使う場合、リポジトリにサンプル音声を含めてそのパスを指定するのが良いでしょう。
-            # 例: ["sample_audio.wav"] (リポジトリのルートにsample_audio.wavを置く場合)
         ],
         inputs=[file_input],
         label="Example Audio (Click to load)"
     )
 if __name__ == "__main__":
     # ダミーの音声ファイルを作成 (Examples用、もし存在しなければ)
     example_dir = os.path.dirname(__file__)
@@ -242,7 +359,6 @@ if __name__ == "__main__":
     elif not PYDUB_AVAILABLE:
          print("Skipping dummy audio file creation as pydub is not available.")
     print("Launching Gradio demo...")
     demo.queue()  # リクエストキューを有効化
     demo.launch()

 # グローバル設定
 MODEL_NAME = "nvidia/parakeet-tdt-0.6b-v2"
+TARGET_SAMPLE_RATE = 16000
+# 音声の長さに関する閾値 (秒)
+LONG_AUDIO_THRESHOLD_SECONDS = 480  # 8分
+VERY_LONG_AUDIO_THRESHOLD_SECONDS = 10800 # 3時間
+# チャンク分割時の設定
+CHUNK_LENGTH_SECONDS = 1800 # 30分
+CHUNK_OVERLAP_SECONDS = 60  # 1分
+# セグメント処理の設定
+MAX_SEGMENT_LENGTH_SECONDS = 15  # 最大セグメント長（秒）を15秒に短縮
+MAX_SEGMENT_CHARS = 100  # 最大セグメント文字数を100文字に短縮
+MIN_SEGMENT_GAP_SECONDS = 0.3  # 最小セグメント間隔（秒）
+# VTTファイルの最大サイズ（バイト）
+MAX_VTT_SIZE_BYTES = 10 * 1024 * 1024  # 10MB
+# 文の区切り文字
+SENTENCE_ENDINGS = ['.', '!', '?', '。', '！', '？']
+SENTENCE_PAUSES = [',', '、', ';', '；', ':', '：']
 device = "cuda" if torch.cuda.is_available() else "cpu" # スクリプト起動時のデバイス検出
 # モデルの初期化 (グローバルに一度だけ行う)
 model.cpu()
 print("ASR model initialized and moved to CPU.")
def find_natural_break_point(text: str, max_length: int) -> int:
    """Return the index at which to cut ``text`` into a head and a tail.

    The head ``text[:index]`` is never longer than ``max_length`` characters
    (except that a non-empty text always yields an index of at least 1, so a
    caller that repeatedly slices off the head is guaranteed to make progress).

    Boundaries are preferred in this order, each searched from right to left
    so the head is as long as possible:

    1. just after a sentence-ending character (``SENTENCE_ENDINGS``),
    2. just after a pause character (``SENTENCE_PAUSES``),
    3. just after a whitespace character,
    4. a hard cut at ``max_length``.

    Args:
        text: The text to split.
        max_length: Maximum number of characters allowed in the head.

    Returns:
        The cut index: ``len(text)`` when the whole text already fits,
        otherwise a value in ``1..max_length`` (``0`` only for empty text).
    """
    text_len = len(text)
    if text_len <= max_length:
        return text_len
    if max_length <= 1:
        # No room to search for a boundary. Returning 0 (or a negative
        # limit) would make callers loop forever on the same text, so take
        # a single character whenever there is one.
        return min(text_len, 1)

    # ``i`` is the index of the last character kept in the head, so the cut
    # index ``i + 1`` never exceeds ``max_length``. Index 0 is skipped: a
    # one-character head consisting only of a delimiter is not a useful break.
    candidates = range(max_length - 1, 0, -1)

    # Sentence endings take priority over pauses anywhere in the window.
    for delimiters in (SENTENCE_ENDINGS, SENTENCE_PAUSES):
        for i in candidates:
            if text[i] in delimiters:
                return i + 1

    # Fall back to the right-most whitespace.
    for i in candidates:
        if text[i].isspace():
            return i + 1

    # No natural boundary found: hard cut at the limit.
    return max_length
def split_segment(segment: dict, max_length_seconds: float, max_chars: int) -> List[dict]:
    """Split a transcript segment into pieces at natural text boundaries.

    A segment that already satisfies both limits is returned unchanged (the
    same dict object, wrapped in a list). Otherwise the text is cut
    repeatedly with ``find_natural_break_point`` and each piece is given a
    share of the segment's duration proportional to its share of the
    characters, capped at ``max_length_seconds``.

    Args:
        segment: Dict with ``'start'`` and ``'end'`` (numbers, seconds) and
            ``'segment'`` (the text).
        max_length_seconds: Maximum duration of one piece.
        max_chars: Maximum number of characters in one piece.

    Returns:
        A list of dicts with ``'start'``, ``'end'`` and ``'segment'`` keys.
        Pieces are contiguous in time and never end after ``segment['end']``.
        An over-limit segment whose text is empty or whitespace-only yields
        an empty list.
    """
    seg_start = segment['start']
    seg_end = segment['end']
    total_duration = seg_end - seg_start
    if total_duration <= max_length_seconds and len(segment['segment']) <= max_chars:
        return [segment]

    # Strip up front so no piece can end up with empty text.
    remaining = segment['segment'].strip()
    total_chars = len(remaining)
    pieces = []
    piece_start = seg_start

    while remaining:
        # Character-based cut. ``max(1, ...)`` guarantees the loop always
        # consumes text, even if the helper returns 0 for a degenerate limit.
        cut = max(1, find_natural_break_point(remaining, max_chars))
        piece_duration = total_duration * cut / total_chars

        if total_duration > 0 and piece_duration > max_length_seconds:
            # Too long in time: shrink the character budget to what fits in
            # ``max_length_seconds``. The budget can round down to 0 for
            # short text over a long duration, hence the clamp to 1.
            char_budget = max(1, int(total_chars * max_length_seconds / total_duration))
            cut = max(1, find_natural_break_point(remaining, char_budget))
            # Re-derive the duration from the characters actually taken; a
            # natural break may use less than the full budget, and charging
            # the full cap every time would push later pieces past the end.
            piece_duration = min(max_length_seconds, total_duration * cut / total_chars)

        # Never run past the end of the original segment.
        piece_end = min(piece_start + piece_duration, seg_end)
        pieces.append({
            'start': piece_start,
            'end': piece_end,
            'segment': remaining[:cut].strip(),
        })

        remaining = remaining[cut:].strip()
        piece_start = piece_end

    return pieces
 def transcribe_audio_core(
     audio_path: str,
             gr.Info(f"Audio duration ({duration_sec:.2f}s) exceeds threshold. Applying long audio settings.", duration=3)
             try:
                 print("Applying long audio settings: Local Attention and Chunking.")
+                model.change_attention_model("rel_pos_local_attn", [128, 128])  # 256,256から128,128に変更
                 model.change_subsampling_conv_chunking_factor(1)
                 long_audio_settings_applied = True
                 print("Successfully applied long audio settings.")
         # 文字起こし実行
         print(f"Transcribing {audio_path}...")
+        output = model.transcribe([audio_path], timestamps=True, batch_size=2)  # バッチサイズを2に設定
         print("Transcription API call finished.")
         if not output or not isinstance(output, list) or not output[0] or \
             return None, None, None
         segment_timestamps = output[0].timestamp['segment']
+        # セグメントの前処理：より適切なセグメント分割
+        processed_segments = []
+        current_segment = None
+        for ts in segment_timestamps:
+            if current_segment is None:
+                current_segment = ts
+            else:
+                # セグメント結合の条件を厳格化
+                time_gap = ts['start'] - current_segment['end']
+                current_text = current_segment['segment']
+                next_text = ts['segment']
+                # 結合条件のチェック
+                should_merge = (
+                    time_gap < MIN_SEGMENT_GAP_SECONDS and  # 時間間隔が短い
+                    len(current_text) + len(next_text) < MAX_SEGMENT_CHARS and  # 文字数制限
+                    (current_segment['end'] - current_segment['start']) < MAX_SEGMENT_LENGTH_SECONDS and  # 現在のセグメントが短い
+                    (ts['end'] - ts['start']) < MAX_SEGMENT_LENGTH_SECONDS and  # 次のセグメントが短い
+                    not any(current_text.strip().endswith(p) for p in SENTENCE_ENDINGS)  # 文の区切りでない
+                )
+                if should_merge:
+                    current_segment['end'] = ts['end']
+                    current_segment['segment'] += ' ' + ts['segment']
+                else:
+                    # 現在のセグメントを分割
+                    split_segments = split_segment(current_segment, MAX_SEGMENT_LENGTH_SECONDS, MAX_SEGMENT_CHARS)
+                    processed_segments.extend(split_segments)
+                    current_segment = ts
+        if current_segment is not None:
+            # 最後のセグメントも分割
+            split_segments = split_segment(current_segment, MAX_SEGMENT_LENGTH_SECONDS, MAX_SEGMENT_CHARS)
+            processed_segments.extend(split_segments)
+        # 処理済みセグメントからデータを生成
+        vis_data = [[f"{ts['start']:.2f}", f"{ts['end']:.2f}", ts['segment']] for ts in processed_segments]
+        raw_times_data = [[ts['start'], ts['end']] for ts in processed_segments]
+        # 単語タイムスタンプの処理を改善
         word_timestamps_raw = output[0].timestamp.get("word", [])
+        word_vis_data = []
+        for w in word_timestamps_raw:
+            if not isinstance(w, dict) or not all(k in w for k in ['start', 'end', 'word']):
+                continue
+            # 単語のタイムスタンプを最も近いセグメントに割り当て
+            word_start = float(w['start'])
+            word_end = float(w['end'])
+            # 単語が完全に含まれるセグメントを探す
+            for seg in processed_segments:
+                if word_start >= seg['start'] - 0.05 and word_end <= seg['end'] + 0.05:
+                    word_vis_data.append([f"{word_start:.2f}", f"{word_end:.2f}", w["word"]])
+                    break
         gr.Info("Transcription successful!", duration=3)
         return vis_data, raw_times_data, word_vis_data
             # pydubが失敗しても、NeMoは処理を試みることができるので、duration_sec = 0 で続行
             duration_sec = 0
     # 文字起こしコア処理を呼び出し
     vis_data, raw_times_data, word_vis_data = transcribe_audio_core(audio_filepath, duration_sec, current_processing_device)
     file_input = gr.File(label="Upload Audio File", type="filepath") # type="filepath" を明示
     output_json = gr.JSON(label="Transcription Result")
     file_input.change( # ファイルがアップロード/変更されたら実行
         fn=process_audio_file,
         inputs=[file_input],
     gr.Examples(
         examples=[
             [os.path.join(os.path.dirname(__file__), "audio_example.wav") if os.path.exists(os.path.join(os.path.dirname(__file__), "audio_example.wav")) else "https://www.kozco.com/tech/piano2-CoolEdit.mp3"]
         ],
         inputs=[file_input],
         label="Example Audio (Click to load)"
     )
 if __name__ == "__main__":
     # ダミーの音声ファイルを作成 (Examples用、もし存在しなければ)
     example_dir = os.path.dirname(__file__)
     elif not PYDUB_AVAILABLE:
          print("Skipping dummy audio file creation as pydub is not available.")
     print("Launching Gradio demo...")
     demo.queue()  # リクエストキューを有効化
     demo.launch()