parakeet-tdt-0.6b-v2

Running

App Files Files Community

sungo-ganpare committed on May 25

Commit

f67cd0b

1 Parent(s): aaa4bd6

音声処理の設定を改善し、セグメント分割機能を強化。VTTファイルのサイズ制限を追加し、エラーハンドリングを強化。自然な区切り点を探す関数を追加し、文字起こしの精度を向上。

Browse files

Files changed (1) hide show

transcribe_cli.py +224 -64

transcribe_cli.py CHANGED Viewed

@@ -20,11 +20,20 @@ import shutil
 MODEL_NAME = "nvidia/parakeet-tdt-0.6b-v2"
 TARGET_SAMPLE_RATE = 16000
 # 音声の長さに関する閾値 (秒)
-LONG_AUDIO_THRESHOLD_SECONDS = 480  # 8分 (この長さを超えると長尺設定を試みる)
-VERY_LONG_AUDIO_THRESHOLD_SECONDS = 10800 # 3時間 (この長さを超えるとチャンク分割処理)
 # チャンク分割時の設定
-CHUNK_LENGTH_SECONDS = 3600 # 1時間
-CHUNK_OVERLAP_SECONDS = 30  # 30秒
 # ★ 入力ファイルの優先順位付き拡張子リスト
 INPUT_PRIORITY_EXTENSIONS: List[str] = ['.wav', '.mp3', '.mp4']
 # ★ デフォルトで出力するフォーマットリスト
@@ -168,47 +177,112 @@ def get_audio_duration_with_ffprobe(audio_path_str: str) -> Optional[float]:
         return None
 # --- 文字起こしコア関数 ---
 def transcribe_audio_cli(
     transcribe_path_str: str,
     model: ASRModel,
-    duration_sec: float, # この音声セグメントの長さ
     device: str
 ) -> Tuple[Optional[List], Optional[List], Optional[List]]:
-    """
-    指定されたオーディオファイルをNeMo ASRモデルで文字起こしします。
-    成功した場合、(セグメント情報リスト, RAWタイムスタンプリスト, 単語情報リスト) を返します。
-    失敗した場合、(None, None, None) を返します。
-    """
-    long_audio_settings_applied = False # 長尺設定が適用されたかどうかのフラグ
-    original_model_dtype = model.dtype # モデルの元のデータ型を保存 (通常は torch.float32)
     try:
-        # CUDAデバイス使用時はメモリキャッシュをクリア
         if device == 'cuda':
             torch.cuda.empty_cache()
             gc.collect()
-        model.to(device) # モデルを推論に使用するデバイスへ移動
-        # 音声長に応じてモデル設定を変更 (長尺音声対応)
         if duration_sec > LONG_AUDIO_THRESHOLD_SECONDS:
             try:
                 print(f"  情報: 音声長 ({duration_sec:.0f}s) が閾値 ({LONG_AUDIO_THRESHOLD_SECONDS}s) を超えるため、長尺音声向け設定を適用します。")
-                model.change_attention_model(self_attention_model="rel_pos_local_attn", att_context_size=[256, 256])
-                model.change_subsampling_conv_chunking_factor(1)
                 long_audio_settings_applied = True
-                if device == 'cuda': torch.cuda.empty_cache(); gc.collect()
             except Exception as setting_e:
                 print(f"  警告: 長尺音声向け設定の適用に失敗しました: {setting_e}。デフォルト設定で続行します。")
         if device == 'cuda' and torch.cuda.is_bf16_supported():
             print("  情報: モデルを bfloat16 に変換して推論を実行します。")
             model.to(torch.bfloat16)
-        elif model.dtype != original_model_dtype:
-             model.to(original_model_dtype)
         print(f"  文字起こしを実行中 (デバイス: {device}, モデルdtype: {model.dtype})...")
-        output = model.transcribe([transcribe_path_str], timestamps=True, batch_size=4)
         if not output or not isinstance(output, list) or not output[0] or \
            not hasattr(output[0], 'timestamp') or not output[0].timestamp or \
@@ -217,13 +291,65 @@ def transcribe_audio_cli(
             return None, None, None
         segment_timestamps = output[0].timestamp['segment']
-        vis_data = [[f"{ts['start']:.2f}", f"{ts['end']:.2f}", ts['segment']] for ts in segment_timestamps]
-        raw_times_data = [[ts['start'], ts['end']] for ts in segment_timestamps]
-        word_timestamps_raw = output[0].timestamp.get("word", [])
-        word_vis_data = [
-            [f"{w['start']:.2f}", f"{w['end']:.2f}", w["word"]]
-            for w in word_timestamps_raw if isinstance(w, dict) and 'start' in w and 'end' in w and 'word' in w
-        ]
         print("  文字起こし完了。")
         return vis_data, raw_times_data, word_vis_data
@@ -280,8 +406,16 @@ def save_transcripts_cli(output_dir_str: str, audio_file_stem: str,
             print(f"    SRTファイルを保存: {srt_file_path.name}"); saved_files_count +=1
         if "vtt" in formats_to_save:
             vtt_file_path = output_dir_path / f"{audio_file_stem}.vtt"
-            write_vtt(vis_data, word_vis_data, vtt_file_path)
-            print(f"    VTTファイルを保存: {vtt_file_path.name}"); saved_files_count +=1
         if "json" in formats_to_save:
             json_file_path = output_dir_path / f"{audio_file_stem}.json"
             write_json(vis_data, word_vis_data, json_file_path)
@@ -295,6 +429,7 @@ def save_transcripts_cli(output_dir_str: str, audio_file_stem: str,
             print(f"  警告: 指定されたフォーマット {formats_to_save} でのファイルの保存は行われませんでした。")
     except Exception as e:
         print(f"  エラー: 文字起こしファイルの保存中にエラーが発生しました: {e}")
 # --- 書き出しヘルパー関数群 (SRT, VTT, JSON, LRC) ---
 def write_srt(segments: List, path: Path):
@@ -308,7 +443,8 @@ def write_srt(segments: List, path: Path):
 def write_vtt(segments: List, words: List, path: Path):
     def sec2vtt(t_float: float) -> str:
-        h, rem = divmod(int(t_float), 3600); m, s = divmod(rem, 60)
         ms = int((t_float - int(t_float)) * 1000)
         return f"{h:02}:{m:02}:{s:02}.{ms:03}"
@@ -321,13 +457,19 @@ def write_vtt(segments: List, words: List, path: Path):
         f.write("::cue(.line) { background: rgba(0,0,0,0.7); padding: 4px; }\n\n")
         if not words:
-            # フォールバック処理は同じ
             for i, seg_list in enumerate(segments, 1):
                 f.write(f"NOTE Segment {i}\n")
                 f.write(f"{sec2vtt(float(seg_list[0]))} --> {sec2vtt(float(seg_list[1]))}\n{seg_list[2]}\n\n")
             return
-        # セグメント単位でグループ化してカラオケ風に
         for seg_data in segments:
             seg_start = float(seg_data[0])
             seg_end = float(seg_data[1])
@@ -342,56 +484,74 @@ def write_vtt(segments: List, words: List, path: Path):
             if not segment_words:
                 continue
-            # セグメント開始時刻から最初の単語開始まで（全て未来色）
             first_word_start = float(segment_words[0][1][0])
             if seg_start < first_word_start - 0.05:
-                line_parts = [f'<c.future>{w_data[2]}</c>' for _, w_data in segment_words]
                 f.write(f"{sec2vtt(seg_start)} --> {sec2vtt(first_word_start)}\n")
-                f.write(f'<c.line>{" ".join(line_parts)}</c>\n\n')
             # 各単語の処理
-            for local_idx, (global_word_idx, word_data) in enumerate(segment_words):
                 w_start = float(word_data[0])
                 w_end = float(word_data[1])
-                # 単語再生中：現在の単語をハイライト
                 line_parts = []
-                for i, (_, w_data) in enumerate(segment_words):
-                    w_text = w_data[2]
                     if i == local_idx:
-                        line_parts.append(f'<c.current>{w_text}</c>')
                     elif i < local_idx:
-                        line_parts.append(f'<c.past>{w_text}</c>')
                     else:
-                        line_parts.append(f'<c.future>{w_text}</c>')
-                f.write(f"{sec2vtt(w_start)} --> {sec2vtt(w_end)}\n")
                 f.write(f'<c.line>{" ".join(line_parts)}</c>\n\n')
-                # 単語終了から次の単語開始まで（無音期間）：過去・未来のみ
-                if local_idx < len(segment_words) - 1:  # 最後の単語でない場合
                     next_word_start = float(segment_words[local_idx + 1][1][0])
                     gap_duration = next_word_start - w_end
                     if gap_duration > 0.05:  # 50ms以上の無音期間がある場合
-                        gap_line_parts = []
-                        for i, (_, w_data) in enumerate(segment_words):
-                            w_text = w_data[2]
-                            if i <= local_idx:  # 現在の単語まで（過去）
-                                gap_line_parts.append(f'<c.past>{w_text}</c>')
-                            else:  # 未来の単語
-                                gap_line_parts.append(f'<c.future>{w_text}</c>')
                         f.write(f"{sec2vtt(w_end)} --> {sec2vtt(next_word_start)}\n")
-                        f.write(f'<c.line>{" ".join(gap_line_parts)}</c>\n\n')
-                else:
-                    # 最後の単語終了からセグメント終了まで（全て過去色）
-                    if w_end < seg_end - 0.05:
-                        line_parts = [f'<c.past>{w_data[2]}</c>' for _, w_data in segment_words]
-                        f.write(f"{sec2vtt(w_end)} --> {sec2vtt(seg_end)}\n")
-                        f.write(f'<c.line>{" ".join(line_parts)}</c>\n\n')
 def write_json(segments: List, words: List, path: Path):
     result = {"segments": []}; word_idx = 0

 MODEL_NAME = "nvidia/parakeet-tdt-0.6b-v2"
 TARGET_SAMPLE_RATE = 16000
 # 音声の長さに関する閾値 (秒)
+LONG_AUDIO_THRESHOLD_SECONDS = 480  # 8分
+VERY_LONG_AUDIO_THRESHOLD_SECONDS = 10800 # 3時間
 # チャンク分割時の設定
+CHUNK_LENGTH_SECONDS = 1800 # 30分
+CHUNK_OVERLAP_SECONDS = 60  # 1分
+# セグメント処理の設定
+MAX_SEGMENT_LENGTH_SECONDS = 15  # 最大セグメント長（秒）を15秒に短縮
+MAX_SEGMENT_CHARS = 100  # 最大セグメント文字数を100文字に短縮
+MIN_SEGMENT_GAP_SECONDS = 0.3  # 最小セグメント間隔（秒）
+# VTTファイルの最大サイズ（バイト）
+MAX_VTT_SIZE_BYTES = 10 * 1024 * 1024  # 10MB
+# 文の区切り文字
+SENTENCE_ENDINGS = ['.', '!', '?', '。', '！', '？']
+SENTENCE_PAUSES = [',', '、', ';', '；', ':', '：']
 # ★ 入力ファイルの優先順位付き拡張子リスト
 INPUT_PRIORITY_EXTENSIONS: List[str] = ['.wav', '.mp3', '.mp4']
 # ★ デフォルトで出力するフォーマットリスト
         return None
 # --- 文字起こしコア関数 ---
def find_natural_break_point(text: str, max_length: int) -> int:
    """Return the index at which ``text`` should be cut to fit ``max_length``.

    Break candidates are tried in priority order: directly after a
    sentence-ending mark (``SENTENCE_ENDINGS``), directly after a pause mark
    (``SENTENCE_PAUSES``), then at whitespace. Within each category the
    candidate closest to ``max_length`` wins. When no candidate exists the
    text is cut hard at ``max_length``.

    Args:
        text: Text to be split.
        max_length: Maximum number of characters allowed in the head.
            Values below 1 are treated as 1 so the result always makes
            progress on non-empty text.

    Returns:
        The cut index ``n`` such that ``text[:n]`` is the head and
        ``text[n:]`` the remainder. ``len(text)`` when the whole text fits,
        otherwise a value in ``1..max_length``.
    """
    # A limit below 1 would yield an empty head, and a caller looping on the
    # remainder would never advance.
    limit = max(max_length, 1)
    if len(text) <= limit:
        return len(text)

    # Punctuation stays attached to the head, so the mark itself must sit
    # below index ``limit`` for the head to remain within the limit.
    for marks in (SENTENCE_ENDINGS, SENTENCE_PAUSES):
        for i in range(limit - 1, 0, -1):
            if text[i] in marks:
                return i + 1

    # Whitespace at exactly index ``limit`` is also a valid break: cutting
    # there leaves a head of exactly ``limit`` characters.
    for i in range(limit, 0, -1):
        if text[i].isspace():
            return min(i + 1, limit)

    # No natural break point found: cut hard at the limit.
    return limit
def split_segment(segment: dict, max_length_seconds: float, max_chars: int) -> List[dict]:
    """Split a transcript segment into pieces that respect length limits.

    The text is cut at natural break points (via ``find_natural_break_point``)
    and each piece receives a share of the segment's time proportional to the
    characters it consumes, capped at ``max_length_seconds``.

    Args:
        segment: Mapping with ``'start'`` and ``'end'`` times in seconds and
            the transcript text under ``'segment'``.
        max_length_seconds: Maximum duration of a single piece.
        max_chars: Maximum number of characters in a single piece.

    Returns:
        ``[segment]`` unchanged when it already satisfies both limits.
        Otherwise a list of new dicts with ``'start'``, ``'end'`` and
        ``'segment'`` keys, contiguous in time and never extending past the
        original ``'end'``. A segment whose text is empty or whitespace-only
        but whose duration exceeds the limit yields an empty list.
    """
    seg_start = segment['start']
    seg_end = segment['end']
    if (seg_end - seg_start) <= max_length_seconds and len(segment['segment']) <= max_chars:
        return [segment]

    pieces: List[dict] = []
    remaining_text = segment['segment'].strip()
    piece_start = seg_start
    # Guard against a non-positive limit producing negative durations.
    duration_cap = max(max_length_seconds, 0)

    while remaining_text:
        remaining_duration = max(seg_end - piece_start, 0)

        # Character budget: the hard character limit, tightened so that the
        # piece's proportional share of time stays within the duration cap.
        char_limit = max_chars
        if remaining_duration > 0 and remaining_duration > max_length_seconds:
            time_chars = int(len(remaining_text) * max_length_seconds / remaining_duration)
            char_limit = min(char_limit, time_chars)
        # Always consume at least one character; a zero budget previously
        # produced empty pieces forever without shrinking the text.
        char_limit = max(char_limit, 1)

        break_point = find_natural_break_point(remaining_text, char_limit)
        break_point = min(max(break_point, 1), len(remaining_text))

        # Time share is proportional to the characters consumed, measured
        # against what is left, so pieces never run past the segment end.
        piece_duration = remaining_duration * break_point / len(remaining_text)
        piece_duration = min(piece_duration, duration_cap)

        piece_end = piece_start + piece_duration
        pieces.append({
            'start': piece_start,
            'end': piece_end,
            'segment': remaining_text[:break_point].strip(),
        })

        remaining_text = remaining_text[break_point:].strip()
        piece_start = piece_end

    return pieces
 def transcribe_audio_cli(
     transcribe_path_str: str,
     model: ASRModel,
+    duration_sec: float,
     device: str
 ) -> Tuple[Optional[List], Optional[List], Optional[List]]:
+    long_audio_settings_applied = False
+    original_model_dtype = model.dtype
     try:
         if device == 'cuda':
             torch.cuda.empty_cache()
             gc.collect()
+        model.to(device)
+        # 音声長に応じてモデル設定を変更
         if duration_sec > LONG_AUDIO_THRESHOLD_SECONDS:
             try:
                 print(f"  情報: 音声長 ({duration_sec:.0f}s) が閾値 ({LONG_AUDIO_THRESHOLD_SECONDS}s) を超えるため、長尺音声向け設定を適用します。")
+                model.change_attention_model(
+                    self_attention_model="rel_pos_local_attn",
+                    att_context_size=[128, 128]
+                )
+                model.change_subsampling_conv_chunking_factor(1)
                 long_audio_settings_applied = True
+                if device == 'cuda':
+                    torch.cuda.empty_cache()
+                    gc.collect()
             except Exception as setting_e:
                 print(f"  警告: 長尺音声向け設定の適用に失敗しました: {setting_e}。デフォルト設定で続行します。")
         if device == 'cuda' and torch.cuda.is_bf16_supported():
             print("  情報: モデルを bfloat16 に変換して推論を実行します。")
             model.to(torch.bfloat16)
+        elif model.dtype != original_model_dtype:
+            model.to(original_model_dtype)
         print(f"  文字起こしを実行中 (デバイス: {device}, モデルdtype: {model.dtype})...")
+        output = model.transcribe(
+            [transcribe_path_str],
+            timestamps=True,
+            batch_size=2
+        )
         if not output or not isinstance(output, list) or not output[0] or \
            not hasattr(output[0], 'timestamp') or not output[0].timestamp or \
             return None, None, None
         segment_timestamps = output[0].timestamp['segment']
+        # セグメントの前処理：より適切なセグメント分割
+        processed_segments = []
+        current_segment = None
+        for ts in segment_timestamps:
+            if current_segment is None:
+                current_segment = ts
+            else:
+                # セグメント結合の条件を厳格化
+                time_gap = ts['start'] - current_segment['end']
+                current_text = current_segment['segment']
+                next_text = ts['segment']
+                # 結合条件のチェック
+                should_merge = (
+                    time_gap < MIN_SEGMENT_GAP_SECONDS and  # 時間間隔が短い
+                    len(current_text) + len(next_text) < MAX_SEGMENT_CHARS and  # 文字数制限
+                    (current_segment['end'] - current_segment['start']) < MAX_SEGMENT_LENGTH_SECONDS and  # 現在のセグメントが短い
+                    (ts['end'] - ts['start']) < MAX_SEGMENT_LENGTH_SECONDS and  # 次のセグメントが短い
+                    not any(current_text.strip().endswith(p) for p in SENTENCE_ENDINGS)  # 文の区切りでない
+                )
+                if should_merge:
+                    current_segment['end'] = ts['end']
+                    current_segment['segment'] += ' ' + ts['segment']
+                else:
+                    # 現在のセグメントを分割
+                    split_segments = split_segment(current_segment, MAX_SEGMENT_LENGTH_SECONDS, MAX_SEGMENT_CHARS)
+                    processed_segments.extend(split_segments)
+                    current_segment = ts
+        if current_segment is not None:
+            # 最後のセグメントも分割
+            split_segments = split_segment(current_segment, MAX_SEGMENT_LENGTH_SECONDS, MAX_SEGMENT_CHARS)
+            processed_segments.extend(split_segments)
+        # 処理済みセグメントからデータを生成
+        vis_data = [[f"{ts['start']:.2f}", f"{ts['end']:.2f}", ts['segment']] for ts in processed_segments]
+        raw_times_data = [[ts['start'], ts['end']] for ts in processed_segments]
+        # 単語タイムスタンプの処理を改善
+        word_timestamps_raw = output[0].timestamp.get("word", [])
+        word_vis_data = []
+        for w in word_timestamps_raw:
+            if not isinstance(w, dict) or not all(k in w for k in ['start', 'end', 'word']):
+                continue
+            # 単語のタイムスタンプを最も近いセグメントに割り当て
+            word_start = float(w['start'])
+            word_end = float(w['end'])
+            # 単語が完全に含まれるセグメントを探す
+            for seg in processed_segments:
+                if word_start >= seg['start'] - 0.05 and word_end <= seg['end'] + 0.05:
+                    word_vis_data.append([f"{word_start:.2f}", f"{word_end:.2f}", w["word"]])
+                    break
         print("  文字起こし完了。")
         return vis_data, raw_times_data, word_vis_data
             print(f"    SRTファイルを保存: {srt_file_path.name}"); saved_files_count +=1
         if "vtt" in formats_to_save:
             vtt_file_path = output_dir_path / f"{audio_file_stem}.vtt"
+            try:
+                write_vtt(vis_data, word_vis_data, vtt_file_path)
+                print(f"    VTTファイルを保存: {vtt_file_path.name}"); saved_files_count +=1
+            except ValueError as e:
+                if "VTTファイルサイズが制限を超えました" in str(e):
+                    print(f"  エラー: {e}")
+                    # 既に作成されたVTTファイルを削除
+                    if vtt_file_path.exists():
+                        vtt_file_path.unlink()
+                    raise  # エラーを上位に伝播
         if "json" in formats_to_save:
             json_file_path = output_dir_path / f"{audio_file_stem}.json"
             write_json(vis_data, word_vis_data, json_file_path)
             print(f"  警告: 指定されたフォーマット {formats_to_save} でのファイルの保存は行われませんでした。")
     except Exception as e:
         print(f"  エラー: 文字起こしファイルの保存中にエラーが発生しました: {e}")
+        raise  # エラーを上位に伝播
 # --- 書き出しヘルパー関数群 (SRT, VTT, JSON, LRC) ---
 def write_srt(segments: List, path: Path):
 def write_vtt(segments: List, words: List, path: Path):
     def sec2vtt(t_float: float) -> str:
+        h, rem = divmod(int(t_float), 3600)
+        m, s = divmod(rem, 60)
         ms = int((t_float - int(t_float)) * 1000)
         return f"{h:02}:{m:02}:{s:02}.{ms:03}"
         f.write("::cue(.line) { background: rgba(0,0,0,0.7); padding: 4px; }\n\n")
         if not words:
+            # 単語タイムスタンプがない場合は、セグメント単位で出力
             for i, seg_list in enumerate(segments, 1):
                 f.write(f"NOTE Segment {i}\n")
                 f.write(f"{sec2vtt(float(seg_list[0]))} --> {sec2vtt(float(seg_list[1]))}\n{seg_list[2]}\n\n")
+                # ファイルサイズをチェック
+                current_size = f.tell()
+                if current_size > MAX_VTT_SIZE_BYTES:
+                    print(f"警告: VTTファイルが{MAX_VTT_SIZE_BYTES/1024/1024:.1f}MBを超えました。処理を中止します。")
+                    raise ValueError("VTTファイルサイズが制限を超えました")
             return
+        # セグメント単位で処理
         for seg_data in segments:
             seg_start = float(seg_data[0])
             seg_end = float(seg_data[1])
             if not segment_words:
                 continue
+            # セグメント内の全単語のテキストを一度だけ生成
+            all_words = [w_data[2] for _, w_data in segment_words]
+            # セグメント開始から最初の単語まで
             first_word_start = float(segment_words[0][1][0])
             if seg_start < first_word_start - 0.05:
                 f.write(f"{sec2vtt(seg_start)} --> {sec2vtt(first_word_start)}\n")
+                f.write(f'<c.line>{" ".join(f"<c.future>{w}</c>" for w in all_words)}</c>\n\n')
+                # ファイルサイズをチェック
+                current_size = f.tell()
+                if current_size > MAX_VTT_SIZE_BYTES:
+                    print(f"警告: VTTファイルが{MAX_VTT_SIZE_BYTES/1024/1024:.1f}MBを超えました。処理を中止します。")
+                    raise ValueError("VTTファイルサイズが制限を超えました")
             # 各単語の処理
+            for local_idx, (_, word_data) in enumerate(segment_words):
                 w_start = float(word_data[0])
                 w_end = float(word_data[1])
+                # 単語の表示時間を出力
+                f.write(f"{sec2vtt(w_start)} --> {sec2vtt(w_end)}\n")
+                # 現在の単語をハイライトしたテキストを生成
                 line_parts = []
+                for i, w in enumerate(all_words):
                     if i == local_idx:
+                        line_parts.append(f'<c.current>{w}</c>')
                     elif i < local_idx:
+                        line_parts.append(f'<c.past>{w}</c>')
                     else:
+                        line_parts.append(f'<c.future>{w}</c>')
                 f.write(f'<c.line>{" ".join(line_parts)}</c>\n\n')
+                # ファイルサイズをチェック
+                current_size = f.tell()
+                if current_size > MAX_VTT_SIZE_BYTES:
+                    print(f"警告: VTTファイルが{MAX_VTT_SIZE_BYTES/1024/1024:.1f}MBを超えました。処理を中止します。")
+                    raise ValueError("VTTファイルサイズが制限を超えました")
+                # 単語間の無音期間の処理
+                if local_idx < len(segment_words) - 1:
                     next_word_start = float(segment_words[local_idx + 1][1][0])
                     gap_duration = next_word_start - w_end
                     if gap_duration > 0.05:  # 50ms以上の無音期間がある場合
                         f.write(f"{sec2vtt(w_end)} --> {sec2vtt(next_word_start)}\n")
+                        f.write(f'<c.line>{" ".join(f"<c.past>{w}</c>" if i <= local_idx else f"<c.future>{w}</c>" for i, w in enumerate(all_words))}</c>\n\n')
+                        # ファイルサイズをチェック
+                        current_size = f.tell()
+                        if current_size > MAX_VTT_SIZE_BYTES:
+                            print(f"警告: VTTファイルが{MAX_VTT_SIZE_BYTES/1024/1024:.1f}MBを超えました。処理を中止します。")
+                            raise ValueError("VTTファイルサイズが制限を超えました")
+            # 最後の単語からセグメント終了まで
+            last_word_end = float(segment_words[-1][1][1])
+            if last_word_end < seg_end - 0.05:
+                f.write(f"{sec2vtt(last_word_end)} --> {sec2vtt(seg_end)}\n")
+                f.write(f'<c.line>{" ".join(f"<c.past>{w}</c>" for w in all_words)}</c>\n\n')
+                # ファイルサイズをチェック
+                current_size = f.tell()
+                if current_size > MAX_VTT_SIZE_BYTES:
+                    print(f"警告: VTTファイルが{MAX_VTT_SIZE_BYTES/1024/1024:.1f}MBを超えました。処理を中止します。")
+                    raise ValueError("VTTファイルサイズが制限を超えました")
 def write_json(segments: List, words: List, path: Path):
     result = {"segments": []}; word_idx = 0