parakeet-tdt-0.6b-v2

Running

App Files Community

sungo-ganpare committed on May 25

Commit

af69235

1 Parent(s): 29074da

GPUタイムアウトを300秒から60秒に変更。gradio_client.fileをインポートし、chunk_pathをgradio_file()でラップ。音声ファイルの前処理を改善し、モノラル変換とリサンプリングを追加。新しいトランスクリプトファイルを作成。

Browse files

Files changed (3) hide show

app.py +1 -1
local_controller.py +12 -13
transcribe_cli.py +116 -70

app.py CHANGED Viewed

@@ -262,7 +262,7 @@ def transcribe_audio_core(
             print("CUDA cache cleared.")
         print("Transcription cleanup finished.")
-@spaces.GPU(duration=300) # GPUリソースを要求し、タイムアウトを300秒に設定
 def process_audio_file(audio_filepath: str) -> dict: # Gradioから渡されるのは一時ファイルのパス
     """
     アップロードされた音声ファイルを処理し、文字起こし結果をJSONで返す。

             print("CUDA cache cleared.")
         print("Transcription cleanup finished.")
+@spaces.GPU(duration=60) # GPUリソースを要求し、タイムアウトを60秒に設定
 def process_audio_file(audio_filepath: str) -> dict: # Gradioから渡されるのは一時ファイルのパス
     """
     アップロードされた音声ファイルを処理し、文字起こし結果をJSONで返す。

local_controller.py CHANGED Viewed

@@ -10,7 +10,7 @@ import shutil
 import subprocess
 try:
-    from gradio_client import Client
     GRADIO_CLIENT_AVAILABLE = True
 except ImportError:
     GRADIO_CLIENT_AVAILABLE = False
@@ -137,12 +137,11 @@ def process_chunk(chunk_path: str) -> Optional[Dict]:
         return None
     try:
-        from gradio_client import Client
         import time
         print(f"Connecting to Space: {SPACE_URL}")
-        # 複数の接続方法を試す
         client = None
         for attempt in range(3):
             try:
@@ -167,26 +166,26 @@ def process_chunk(chunk_path: str) -> Optional[Dict]:
         # Method 1: fn_index=1 を使用 (change イベント用)
         try:
-            result = client.predict(chunk_path, fn_index=1)
-            print("Successfully used fn_index=1")
         except Exception as e:
-            print(f"fn_index=1 method failed: {e}")
         # Method 2: fn_index=0 を使用
         if result is None:
             try:
-                result = client.predict(chunk_path, fn_index=0)
-                print("Successfully used fn_index=0")
             except Exception as e:
-                print(f"fn_index=0 method failed: {e}")
         # Method 3: デフォルトで呼び出し
         if result is None:
             try:
-                result = client.predict(chunk_path)
-                print("Successfully used default method")
             except Exception as e:
-                print(f"Default method failed: {e}")
         if result is None:
             print("All API call methods failed")
@@ -324,4 +323,4 @@ def main():
     process_audio_file(args.input_path, args.output_dir)
 if __name__ == "__main__":
-    main()

 import subprocess
 try:
+    from gradio_client import Client, file as gradio_file # gradio_client.file をインポート
     GRADIO_CLIENT_AVAILABLE = True
 except ImportError:
     GRADIO_CLIENT_AVAILABLE = False
         return None
     try:
+        # from gradio_client import Client # Client は既に上でインポートされているので不要
         import time
         print(f"Connecting to Space: {SPACE_URL}")
         client = None
         for attempt in range(3):
             try:
         # Method 1: fn_index=1 を使用 (change イベント用)
         try:
+            result = client.predict(gradio_file(chunk_path), fn_index=1) # chunk_path を gradio_file() でラップ
+            print("Successfully used fn_index=1 with gradio_file()")
         except Exception as e:
+            print(f"fn_index=1 method with gradio_file() failed: {e}")
         # Method 2: fn_index=0 を使用
         if result is None:
             try:
+                result = client.predict(gradio_file(chunk_path), fn_index=0) # chunk_path を gradio_file() でラップ
+                print("Successfully used fn_index=0 with gradio_file()")
             except Exception as e:
+                print(f"fn_index=0 method with gradio_file() failed: {e}")
         # Method 3: デフォルトで呼び出し
         if result is None:
             try:
+                result = client.predict(gradio_file(chunk_path)) # chunk_path を gradio_file() でラップ
+                print("Successfully used default method with gradio_file()")
             except Exception as e:
+                print(f"Default method with gradio_file() failed: {e}")
         if result is None:
             print("All API call methods failed")
     process_audio_file(args.input_path, args.output_dir)
 if __name__ == "__main__":
+    main()

transcribe_cli.py CHANGED Viewed

@@ -70,7 +70,20 @@ def preprocess_audio_cli(audio_path_str: str, output_dir_for_temp_files: str) ->
         # 4GB以上またはVERY_LONG_AUDIO_THRESHOLD_SECONDS以上の場合は直接ffmpegでチャンク分割
         if file_size > 4 * 1024**3 or duration_sec > VERY_LONG_AUDIO_THRESHOLD_SECONDS:
             print(f"  大容量ファイル（{file_size_gb:.2f}GB, {duration_sec/3600:.2f}時間）のため、ffmpegで直接チャンク分割処理を行います。")
-            return audio_path_str, f"{original_path_name} (大容量)", duration_sec
         # 4GB未満の場合は従来のpydub処理
         try:
@@ -82,6 +95,62 @@ def preprocess_audio_cli(audio_path_str: str, output_dir_for_temp_files: str) ->
             else:
                 raise pydub_e
     except FileNotFoundError:
         print(f"エラー: 音声ファイルが見つかりません: {audio_path_str}")
         return None, None, None
@@ -89,65 +158,6 @@ def preprocess_audio_cli(audio_path_str: str, output_dir_for_temp_files: str) ->
         print(f"エラー: 音声ファイル '{original_path_name}' のロード/デコードに失敗しました: {load_e}")
         return None, None, None
-    resampled = False
-    mono_converted = False
-    # リサンプリング処理
-    if audio.frame_rate != TARGET_SAMPLE_RATE:
-        try:
-            print(f"  リサンプリング中: {audio.frame_rate}Hz -> {TARGET_SAMPLE_RATE}Hz")
-            audio = audio.set_frame_rate(TARGET_SAMPLE_RATE)
-            resampled = True
-        except Exception as resample_e:
-            print(f"エラー: 音声のリサンプリングに失敗しました: {resample_e}")
-            return None, None, None
-    # モノラル変換処理
-    if audio.channels == 2:
-        try:
-            print("  モノラルに変換中 (2ch -> 1ch)")
-            audio = audio.set_channels(1)
-            mono_converted = True
-        except Exception as mono_e:
-            print(f"エラー: 音声のモノラル変換に失敗しました: {mono_e}")
-            return None, None, None
-    elif audio.channels > 2:
-        print(f"エラー: 音声チャンネルが {audio.channels} です。1ch(モノラル)または2ch(ステレオ)のみサポートしています。")
-        return None, None, None
-    elif audio.channels == 1:
-        print("  音声は既にモノラルです。")
-    processed_temp_file_path_obj = None
-    # 前処理が行われた場合、一時ファイルに保存
-    if resampled or mono_converted:
-        try:
-            # ファイル名から特殊文字を除去してより安全な名前を生成
-            import re
-            safe_stem = re.sub(r'[^\w\-_\.]', '_', audio_name_stem)
-            temp_suffix = "_preprocessed_temp.wav"
-            processed_temp_file_path_obj = Path(output_dir_for_temp_files, f"{safe_stem}{temp_suffix}")
-            print(f"  前処理済み音声の一時保存先: {processed_temp_file_path_obj.name}")
-            audio.export(processed_temp_file_path_obj, format="wav")
-            path_for_transcription = processed_temp_file_path_obj.as_posix()
-            display_name_for_info = f"{original_path_name} (前処理済み)"
-        except Exception as export_e:
-            print(f"エラー: 前処理済み音声のエクスポートに失敗しました: {export_e}")
-            if processed_temp_file_path_obj and processed_temp_file_path_obj.exists():
-                try:
-                    os.remove(processed_temp_file_path_obj)
-                except OSError:
-                    pass
-            return None, None, None
-    else:
-        # 前処理が不要だった場合
-        print("  前処理は不要でした。元のファイルを使用します。")
-        path_for_transcription = audio_path_str
-        display_name_for_info = original_path_name
-    return path_for_transcription, display_name_for_info, duration_sec
 def get_audio_duration_with_ffprobe(audio_path_str: str) -> Optional[float]:
     """ffprobeを使用して音声ファイルの長さを取得（4GB制限なし）"""
     try:
@@ -750,42 +760,78 @@ def process_single_file(
             all_word_vis_data_merged: List[List[str]] = []
             current_global_time_offset_sec = 0.0
             last_global_segment_end_time_sec = 0.0
             for i, chunk_file_path_str in enumerate(temp_chunk_file_paths_str_list):
                 print(f"  チャンク {i+1}/{len(temp_chunk_file_paths_str_list)} ({Path(chunk_file_path_str).name}) を処理中...")
                 try:
                     estimated_chunk_duration_for_asr_settings = CHUNK_LENGTH_SECONDS + CHUNK_OVERLAP_SECONDS
                     vis_data_chunk, _, word_vis_data_chunk = transcribe_audio_cli(
                         chunk_file_path_str, asr_model_instance,
                         estimated_chunk_duration_for_asr_settings, device_to_use
                     )
                     if not vis_data_chunk:
                         print(f"  警告: チャンク {Path(chunk_file_path_str).name} の文字起こしに失敗。スキップします。")
                         current_global_time_offset_sec += CHUNK_LENGTH_SECONDS - (CHUNK_OVERLAP_SECONDS if i < len(temp_chunk_file_paths_str_list) - 1 else 0)
                         continue
                     for seg_row_list in vis_data_chunk:
-                        s_local_sec = float(seg_row_list[0]); e_local_sec = float(seg_row_list[1]); text_seg = seg_row_list[2]
-                        s_global_sec = s_local_sec + current_global_time_offset_sec; e_global_sec = e_local_sec + current_global_time_offset_sec
-                        if s_global_sec >= last_global_segment_end_time_sec - 0.1 :
                             all_vis_data_merged.append([f"{s_global_sec:.2f}", f"{e_global_sec:.2f}", text_seg])
                             last_global_segment_end_time_sec = max(last_global_segment_end_time_sec, e_global_sec)
                     temp_last_word_global_end_time_sec = float(all_word_vis_data_merged[-1][1]) if all_word_vis_data_merged else 0.0
                     if word_vis_data_chunk:
                         for word_row_list in word_vis_data_chunk:
-                            w_s_local_sec = float(word_row_list[0]); w_e_local_sec = float(word_row_list[1]); text_word = word_row_list[2]
-                            w_s_global_sec = w_s_local_sec + current_global_time_offset_sec; w_e_global_sec = w_e_local_sec + current_global_time_offset_sec
                             if w_s_global_sec >= temp_last_word_global_end_time_sec - 0.05:
-                                 all_word_vis_data_merged.append([f"{w_s_global_sec:.2f}", f"{w_e_global_sec:.2f}", text_word])
-                                 temp_last_word_global_end_time_sec = max(temp_last_word_global_end_time_sec, w_e_global_sec)
                     if i < len(temp_chunk_file_paths_str_list) - 1:
                         current_global_time_offset_sec += (CHUNK_LENGTH_SECONDS - CHUNK_OVERLAP_SECONDS)
-                except Exception as chunk_proc_e:
                     print(f"  エラー: チャンク {Path(chunk_file_path_str).name} の処理中にエラー: {chunk_proc_e}")
-                    if i < len(temp_chunk_file_paths_str_list) - 1:
                         current_global_time_offset_sec += (CHUNK_LENGTH_SECONDS - CHUNK_OVERLAP_SECONDS)
             final_vis_data = all_vis_data_merged
             final_word_vis_data = all_word_vis_data_merged
             if not final_vis_data:
-                 raise Exception("チャンク処理後、有効な文字起こしデータが得られませんでした。")
         else:
             vis_data_single, _, word_vis_data_single = transcribe_audio_cli(
                 processed_path_for_asr, asr_model_instance, actual_audio_duration_sec, device_to_use

         # 4GB以上またはVERY_LONG_AUDIO_THRESHOLD_SECONDS以上の場合は直接ffmpegでチャンク分割
         if file_size > 4 * 1024**3 or duration_sec > VERY_LONG_AUDIO_THRESHOLD_SECONDS:
             print(f"  大容量ファイル（{file_size_gb:.2f}GB, {duration_sec/3600:.2f}時間）のため、ffmpegで直接チャンク分割処理を行います。")
+            # 大容量ファイルの場合もモノラル変換を行う
+            temp_mono_path = Path(output_dir_for_temp_files) / f"{audio_name_stem}_mono_temp.wav"
+            try:
+                cmd = [
+                    'ffmpeg', '-y', '-i', audio_path_str,
+                    '-ac', '1',  # モノラルに変換
+                    '-ar', str(TARGET_SAMPLE_RATE),  # サンプルレートを設定
+                    str(temp_mono_path)
+                ]
+                subprocess.run(cmd, capture_output=True, check=True)
+                return temp_mono_path.as_posix(), f"{original_path_name} (大容量・モノラル)", duration_sec
+            except subprocess.CalledProcessError as e:
+                print(f"  ffmpegでのモノラル変換に失敗: {e}")
+                return audio_path_str, f"{original_path_name} (大容量)", duration_sec
         # 4GB未満の場合は従来のpydub処理
         try:
             else:
                 raise pydub_e
+        resampled = False
+        mono_converted = False
+        # リサンプリング処理
+        if audio.frame_rate != TARGET_SAMPLE_RATE:
+            try:
+                print(f"  リサンプリング中: {audio.frame_rate}Hz -> {TARGET_SAMPLE_RATE}Hz")
+                audio = audio.set_frame_rate(TARGET_SAMPLE_RATE)
+                resampled = True
+            except Exception as resample_e:
+                print(f"エラー: 音声のリサンプリングに失敗しました: {resample_e}")
+                return None, None, None
+        # モノラル変換処理
+        if audio.channels > 1:
+            try:
+                print(f"  モノラルに変換中 ({audio.channels}ch -> 1ch)")
+                audio = audio.set_channels(1)
+                mono_converted = True
+            except Exception as mono_e:
+                print(f"エラー: 音声のモノラル変換に失敗しました: {mono_e}")
+                return None, None, None
+        elif audio.channels == 1:
+            print("  音声は既にモノラルです。")
+        processed_temp_file_path_obj = None
+        # 前処理が行われた場合、一時ファイルに保存
+        if resampled or mono_converted:
+            try:
+                # ファイル名から特殊文字を除去してより安全な名前を生成
+                import re
+                safe_stem = re.sub(r'[^\w\-_\.]', '_', audio_name_stem)
+                temp_suffix = "_preprocessed_temp.wav"
+                processed_temp_file_path_obj = Path(output_dir_for_temp_files, f"{safe_stem}{temp_suffix}")
+                print(f"  前処理済み音声の一時保存先: {processed_temp_file_path_obj.name}")
+                audio.export(processed_temp_file_path_obj, format="wav")
+                path_for_transcription = processed_temp_file_path_obj.as_posix()
+                display_name_for_info = f"{original_path_name} (前処理済み)"
+            except Exception as export_e:
+                print(f"エラー: 前処理済み音声のエクスポートに失敗しました: {export_e}")
+                if processed_temp_file_path_obj and processed_temp_file_path_obj.exists():
+                    try:
+                        os.remove(processed_temp_file_path_obj)
+                    except OSError:
+                        pass
+                return None, None, None
+        else:
+            # 前処理が不要だった場合
+            print("  前処理は不要でした。元のファイルを使用します。")
+            path_for_transcription = audio_path_str
+            display_name_for_info = original_path_name
+        return path_for_transcription, display_name_for_info, duration_sec
     except FileNotFoundError:
         print(f"エラー: 音声ファイルが見つかりません: {audio_path_str}")
         return None, None, None
         print(f"エラー: 音声ファイル '{original_path_name}' のロード/デコードに失敗しました: {load_e}")
         return None, None, None
 def get_audio_duration_with_ffprobe(audio_path_str: str) -> Optional[float]:
     """ffprobeを使用して音声ファイルの長さを取得（4GB制限なし）"""
     try:
             all_word_vis_data_merged: List[List[str]] = []
             current_global_time_offset_sec = 0.0
             last_global_segment_end_time_sec = 0.0
+            # チャンク処理前にGPUメモリをクリア
+            if device_to_use == 'cuda':
+                torch.cuda.empty_cache()
+                gc.collect()
+                print(f"  初期GPUメモリ使用量: {torch.cuda.memory_allocated() / 1024**2:.1f}MB")
             for i, chunk_file_path_str in enumerate(temp_chunk_file_paths_str_list):
                 print(f"  チャンク {i+1}/{len(temp_chunk_file_paths_str_list)} ({Path(chunk_file_path_str).name}) を処理中...")
                 try:
+                    # 各チャンク処理前にGPUメモリをクリア
+                    if device_to_use == 'cuda':
+                        torch.cuda.empty_cache()
+                        gc.collect()
+                        print(f"    チャンク処理前のGPUメモリ使用量: {torch.cuda.memory_allocated() / 1024**2:.1f}MB")
                     estimated_chunk_duration_for_asr_settings = CHUNK_LENGTH_SECONDS + CHUNK_OVERLAP_SECONDS
                     vis_data_chunk, _, word_vis_data_chunk = transcribe_audio_cli(
                         chunk_file_path_str, asr_model_instance,
                         estimated_chunk_duration_for_asr_settings, device_to_use
                     )
+                    # チャンク処理後のGPUメモリ使用量を確認
+                    if device_to_use == 'cuda':
+                        print(f"    チャンク処理後のGPUメモリ使用量: {torch.cuda.memory_allocated() / 1024**2:.1f}MB")
                     if not vis_data_chunk:
                         print(f"  警告: チャンク {Path(chunk_file_path_str).name} の文字起こしに失敗。スキップします。")
                         current_global_time_offset_sec += CHUNK_LENGTH_SECONDS - (CHUNK_OVERLAP_SECONDS if i < len(temp_chunk_file_paths_str_list) - 1 else 0)
                         continue
+                    # データのマージ処理
                     for seg_row_list in vis_data_chunk:
+                        s_local_sec = float(seg_row_list[0])
+                        e_local_sec = float(seg_row_list[1])
+                        text_seg = seg_row_list[2]
+                        s_global_sec = s_local_sec + current_global_time_offset_sec
+                        e_global_sec = e_local_sec + current_global_time_offset_sec
+                        if s_global_sec >= last_global_segment_end_time_sec - 0.1:
                             all_vis_data_merged.append([f"{s_global_sec:.2f}", f"{e_global_sec:.2f}", text_seg])
                             last_global_segment_end_time_sec = max(last_global_segment_end_time_sec, e_global_sec)
                     temp_last_word_global_end_time_sec = float(all_word_vis_data_merged[-1][1]) if all_word_vis_data_merged else 0.0
                     if word_vis_data_chunk:
                         for word_row_list in word_vis_data_chunk:
+                            w_s_local_sec = float(word_row_list[0])
+                            w_e_local_sec = float(word_row_list[1])
+                            text_word = word_row_list[2]
+                            w_s_global_sec = w_s_local_sec + current_global_time_offset_sec
+                            w_e_global_sec = w_e_local_sec + current_global_time_offset_sec
                             if w_s_global_sec >= temp_last_word_global_end_time_sec - 0.05:
+                                all_word_vis_data_merged.append([f"{w_s_global_sec:.2f}", f"{w_e_global_sec:.2f}", text_word])
+                                temp_last_word_global_end_time_sec = max(temp_last_word_global_end_time_sec, w_e_global_sec)
                     if i < len(temp_chunk_file_paths_str_list) - 1:
                         current_global_time_offset_sec += (CHUNK_LENGTH_SECONDS - CHUNK_OVERLAP_SECONDS)
+                    # チャンク処理後にGPUメモリをクリア
+                    if device_to_use == 'cuda':
+                        torch.cuda.empty_cache()
+                        gc.collect()
+                        print(f"    メモリクリア後のGPUメモリ使用量: {torch.cuda.memory_allocated() / 1024**2:.1f}MB")
+                except Exception as chunk_proc_e:
                     print(f"  エラー: チャンク {Path(chunk_file_path_str).name} の処理中にエラー: {chunk_proc_e}")
+                    if i < len(temp_chunk_file_paths_str_list) - 1:
                         current_global_time_offset_sec += (CHUNK_LENGTH_SECONDS - CHUNK_OVERLAP_SECONDS)
             final_vis_data = all_vis_data_merged
             final_word_vis_data = all_word_vis_data_merged
             if not final_vis_data:
+                raise Exception("チャンク処理後、有効な文字起こしデータが得られませんでした。")
         else:
             vis_data_single, _, word_vis_data_single = transcribe_audio_cli(
                 processed_path_for_asr, asr_model_instance, actual_audio_duration_sec, device_to_use