parakeet-tdt-0.6b-v2

Running

App Files Files Community

sungo-ganpare committed on May 17

Commit

d4575dc

1 Parent(s): 21b4fcb

オーディオファイルの前処理と文字起こし機能を改善し、エラーハンドリングを強化

Browse files

Files changed (1) hide show

app.py +194 -137

app.py CHANGED Viewed

@@ -69,27 +69,28 @@ def get_audio_segment(audio_path, start_second, end_second):
         print(f"Error clipping audio {audio_path} from {start_second}s to {end_second}s: {e}")
         return None
-@spaces.GPU
-def get_transcripts_and_raw_times(audio_path, session_dir):
-    if not audio_path:
-        gr.Error("No audio file path provided for transcription.", duration=None)
-        return [], [], [], None, gr.DownloadButton(visible=False)
-    vis_data = [["N/A", "N/A", "Processing failed"]]
-    raw_times_data = [[0.0, 0.0]]
-    char_vis_data = []
-    processed_audio_path = None
-    original_path_name = Path(audio_path).name
-    audio_name = Path(audio_path).stem
     try:
         try:
             gr.Info(f"Loading audio: {original_path_name}", duration=2)
             audio = AudioSegment.from_file(audio_path)
             duration_sec = audio.duration_seconds
         except Exception as load_e:
             gr.Error(f"Failed to load audio file {original_path_name}: {load_e}", duration=None)
-            return [["Error", "Error", "Load failed"]], [[0.0, 0.0]], [], audio_path, gr.DownloadButton(visible=False)
         resampled = False
         mono = False
@@ -101,7 +102,7 @@ def get_transcripts_and_raw_times(audio_path, session_dir):
                 resampled = True
             except Exception as resample_e:
                 gr.Error(f"Failed to resample audio: {resample_e}", duration=None)
-                return [["Error", "Error", "Resample failed"]], [[0.0, 0.0]], [], audio_path, gr.DownloadButton(visible=False)
         if audio.channels == 2:
             try:
@@ -109,11 +110,12 @@ def get_transcripts_and_raw_times(audio_path, session_dir):
                 mono = True
             except Exception as mono_e:
                 gr.Error(f"Failed to convert audio to mono: {mono_e}", duration=None)
-                return [["Error", "Error", "Mono conversion failed"]], [[0.0, 0.0]], [], audio_path, gr.DownloadButton(visible=False)
         elif audio.channels > 2:
             gr.Error(f"Audio has {audio.channels} channels. Only mono (1) or stereo (2) supported.", duration=None)
-            return [["Error", "Error", f"{audio.channels}-channel audio not supported"]], [[0.0, 0.0]], [], audio_path, gr.DownloadButton(visible=False)
         if resampled or mono:
             try:
                 processed_audio_path = Path(session_dir, f"{audio_name}_resampled.wav")
@@ -124,134 +126,189 @@ def get_transcripts_and_raw_times(audio_path, session_dir):
                 gr.Error(f"Failed to export processed audio: {export_e}", duration=None)
                 if processed_audio_path and os.path.exists(processed_audio_path):
                     os.remove(processed_audio_path)
-                return [["Error", "Error", "Export failed"]], [[0.0, 0.0]], [], audio_path, gr.DownloadButton(visible=False)
         else:
             transcribe_path = audio_path
             info_path_name = original_path_name
-        long_audio_settings_applied = False
-        try:
-            model.to(device)
-            model.to(torch.float32)
-            gr.Info(f"Transcribing {info_path_name} on {device}...", duration=2)
-            if duration_sec > 480:
-                try:
-                    gr.Info("Audio longer than 8 minutes. Applying optimized settings for long transcription.", duration=3)
-                    print("Applying long audio settings: Local Attention and Chunking.")
-                    model.change_attention_model("rel_pos_local_attn", [256,256])
-                    model.change_subsampling_conv_chunking_factor(1)
-                    long_audio_settings_applied = True
-                except Exception as setting_e:
-                    gr.Warning(f"Could not apply long audio settings: {setting_e}", duration=5)
-                    print(f"Warning: Failed to apply long audio settings: {setting_e}")
-            model.to(torch.bfloat16)
-            output = model.transcribe([transcribe_path], timestamps=True)
-            if not output or not isinstance(output, list) or not output[0] or not hasattr(output[0], 'timestamp') or not output[0].timestamp or 'segment' not in output[0].timestamp:
-                gr.Error("Transcription failed or produced unexpected output format.", duration=None)
-                return [["Error", "Error", "Transcription Format Issue"]], [[0.0, 0.0]], [], audio_path, gr.DownloadButton(visible=False)
-            segment_timestamps = output[0].timestamp['segment']
-            csv_headers = ["Start (s)", "End (s)", "Segment"]
-            vis_data = [[f"{ts['start']:.2f}", f"{ts['end']:.2f}", ts['segment']] for ts in segment_timestamps]
-            raw_times_data = [[ts['start'], ts['end']] for ts in segment_timestamps]
-            char_timestamps_raw = output[0].timestamp.get("char", [])
-            if not isinstance(char_timestamps_raw, list):
-                print(f"Warning: char_timestamps_raw is not a list, but {type(char_timestamps_raw)}. Defaulting to empty.")
-                char_timestamps_raw = []
-            char_vis_data = [
-                [f"{c['start']:.2f}", f"{c['end']:.2f}", c["char"]]
-                for c in char_timestamps_raw if isinstance(c, dict) and 'start' in c and 'end' in c and 'char' in c
-            ]
-            word_timestamps_raw = output[0].timestamp.get("word", [])
-            word_vis_data = [
-                [f"{w['start']:.2f}", f"{w['end']:.2f}", w["word"]]
-                for w in word_timestamps_raw if isinstance(w, dict) and 'start' in w and 'end' in w and 'word' in w
-            ]
-            button_update = gr.DownloadButton(visible=False)
-            srt_file_path = None
-            vtt_file_path = None
-            json_file_path = None
-            lrc_file_path = None
             try:
-                csv_file_path = Path(session_dir, f"transcription_{audio_name}.csv")
-                with open(csv_file_path, 'w', newline='', encoding='utf-8') as f:
-                    writer = csv.writer(f)
-                    writer.writerow(csv_headers)
-                    writer.writerows(vis_data)
-                print(f"CSV transcript saved to temporary file: {csv_file_path}")
-                button_update = gr.DownloadButton(value=csv_file_path.as_posix(), visible=True)
-                srt_file_path = Path(session_dir, f"transcription_{audio_name}.srt")
-                vtt_file_path = Path(session_dir, f"transcription_{audio_name}.vtt")
-                json_file_path = Path(session_dir, f"transcription_{audio_name}.json")
-                write_srt(vis_data, srt_file_path)
-                write_vtt(vis_data, word_vis_data, vtt_file_path)
-                write_json(vis_data, word_vis_data, json_file_path)
-                print(f"SRT, VTT, JSON transcript saved to temporary files: {srt_file_path}, {vtt_file_path}, {json_file_path}")
-                lrc_file_path = Path(session_dir, f"transcription_{audio_name}.lrc")
-                write_lrc(vis_data, lrc_file_path)
-                print(f"LRC transcript saved to temporary file: {lrc_file_path}")
-            except Exception as csv_e:
-                gr.Error(f"Failed to create transcript files: {csv_e}", duration=None)
-                print(f"Error writing transcript files: {csv_e}")
-            gr.Info("Transcription complete.", duration=2)
-            return (
-                vis_data,
-                raw_times_data,
-                word_vis_data,
-                audio_path,
-                gr.DownloadButton(value=csv_file_path.as_posix(), visible=True),
-                gr.DownloadButton(value=srt_file_path.as_posix(), visible=True),
-                gr.DownloadButton(value=vtt_file_path.as_posix(), visible=True),
-                gr.DownloadButton(value=json_file_path.as_posix(), visible=True),
-                gr.DownloadButton(value=lrc_file_path.as_posix(), visible=True)
-            )
-        except torch.cuda.OutOfMemoryError as e:
-            error_msg = 'CUDA out of memory. Please try a shorter audio or reduce GPU load.'
-            print(f"CUDA OutOfMemoryError: {e}")
-            gr.Error(error_msg, duration=None)
-            return [["OOM", "OOM", error_msg]], [[0.0, 0.0]], [], audio_path, gr.DownloadButton(visible=False)
-        except FileNotFoundError:
-            error_msg = f"Audio file for transcription not found: {Path(transcribe_path).name}."
-            print(f"Error: Transcribe audio file not found at path: {transcribe_path}")
-            gr.Error(error_msg, duration=None)
-            return [["Error", "Error", "File not found for transcription"]], [[0.0, 0.0]], [], audio_path, gr.DownloadButton(visible=False)
-        except Exception as e:
-            error_msg = f"Transcription failed: {e}"
-            print(f"Error during transcription processing: {e}")
-            gr.Error(error_msg, duration=None)
-            return [["Error", "Error", error_msg]], [[0.0, 0.0]], [], audio_path, gr.DownloadButton(visible=False)
-        finally:
-            try:
-                if long_audio_settings_applied:
-                    try:
-                        print("Reverting long audio settings.")
-                        model.change_attention_model("rel_pos")
-                        model.change_subsampling_conv_chunking_factor(-1)
-                    except Exception as revert_e:
-                        print(f"Warning: Failed to revert long audio settings: {revert_e}")
-                        gr.Warning(f"Issue reverting model settings after long transcription: {revert_e}", duration=5)
-                if 'model' in locals() and hasattr(model, 'cpu'):
-                    if device == 'cuda':
-                        model.cpu()
-                gc.collect()
-                if device == 'cuda':
-                    torch.cuda.empty_cache()
-            except Exception as cleanup_e:
-                print(f"Error during model cleanup: {cleanup_e}")
-                gr.Warning(f"Issue during model cleanup: {cleanup_e}", duration=5)
     finally:
         if processed_audio_path and os.path.exists(processed_audio_path):
             try:
@@ -489,4 +546,4 @@ with gr.Blocks(theme=nvidia_theme) as demo:
 if __name__ == "__main__":
     print("Launching Gradio Demo...")
     demo.queue()
-    demo.launch()

         print(f"Error clipping audio {audio_path} from {start_second}s to {end_second}s: {e}")
         return None
+def preprocess_audio(audio_path, session_dir):
+    """
+    オーディオファイルの前処理（リサンプリング、モノラル変換）を行う。
+    Args:
+        audio_path (str): 入力オーディオファイルのパス。
+        session_dir (str): セッションディレクトリのパス。
+    Returns:
+        tuple: (processed_path, info_path_name, duration_sec) のタプル、または None（処理に失敗した場合）。
+    """
     try:
+        original_path_name = Path(audio_path).name
+        audio_name = Path(audio_path).stem
         try:
             gr.Info(f"Loading audio: {original_path_name}", duration=2)
             audio = AudioSegment.from_file(audio_path)
             duration_sec = audio.duration_seconds
         except Exception as load_e:
             gr.Error(f"Failed to load audio file {original_path_name}: {load_e}", duration=None)
+            return None, None, None
         resampled = False
         mono = False
                 resampled = True
             except Exception as resample_e:
                 gr.Error(f"Failed to resample audio: {resample_e}", duration=None)
+                return None, None, None
         if audio.channels == 2:
             try:
                 mono = True
             except Exception as mono_e:
                 gr.Error(f"Failed to convert audio to mono: {mono_e}", duration=None)
+                return None, None, None
         elif audio.channels > 2:
             gr.Error(f"Audio has {audio.channels} channels. Only mono (1) or stereo (2) supported.", duration=None)
+            return None, None, None
+        processed_audio_path = None
         if resampled or mono:
             try:
                 processed_audio_path = Path(session_dir, f"{audio_name}_resampled.wav")
                 gr.Error(f"Failed to export processed audio: {export_e}", duration=None)
                 if processed_audio_path and os.path.exists(processed_audio_path):
                     os.remove(processed_audio_path)
+                return None, None, None
         else:
             transcribe_path = audio_path
             info_path_name = original_path_name
+        return transcribe_path, info_path_name, duration_sec
+    except Exception as e:
+        gr.Error(f"Audio preprocessing failed: {e}", duration=None)
+        return None, None, None
def transcribe_audio(transcribe_path, model, duration_sec, device):
    """
    Transcribe an audio file and collect segment- and word-level timestamps.

    Args:
        transcribe_path (str): Path of the audio file passed to ``model.transcribe``.
        model (ASRModel): ASR model used for transcription.
        duration_sec (float): Length of the audio in seconds. Above 480 s the
            long-audio settings (local attention + subsampling chunking) are applied.
        device (str): Device to run on ('cuda' or 'cpu').
    Returns:
        tuple | None: ``(vis_data, raw_times_data, word_vis_data)`` on success, where
        ``vis_data`` is a list of ``[start, end, segment]`` rows with the times
        formatted to two decimals, ``raw_times_data`` is a list of raw
        ``[start, end]`` pairs and ``word_vis_data`` is a list of
        ``[start, end, word]`` rows. ``None`` when transcription fails, so that a
        caller's ``if not result`` check detects the failure.
    """
    # NOTE(review): the gr.Error(...) calls below only construct the object and do
    # not raise it -- confirm whether the message is expected to reach the UI.
    long_audio_settings_applied = False
    try:
        model.to(device)
        model.to(torch.float32)
        gr.Info(f"Transcribing on {device}...", duration=2)
        if duration_sec > 480:
            try:
                gr.Info("Audio longer than 8 minutes. Applying optimized settings for long transcription.", duration=3)
                print("Applying long audio settings: Local Attention and Chunking.")
                model.change_attention_model("rel_pos_local_attn", [256,256])
                model.change_subsampling_conv_chunking_factor(1)
                # Only flagged once both changes succeeded; the finally block
                # reverts the settings based on this flag.
                long_audio_settings_applied = True
            except Exception as setting_e:
                # Best effort: transcription continues with the default settings.
                gr.Warning(f"Could not apply long audio settings: {setting_e}", duration=5)
                print(f"Warning: Failed to apply long audio settings: {setting_e}")
        model.to(torch.bfloat16)
        output = model.transcribe([transcribe_path], timestamps=True)
        # Require a non-empty list whose first item carries segment timestamps.
        if not output or not isinstance(output, list) or not output[0] or not hasattr(output[0], 'timestamp') or not output[0].timestamp or 'segment' not in output[0].timestamp:
            gr.Error("Transcription failed or produced unexpected output format.", duration=None)
            return None
        segment_timestamps = output[0].timestamp['segment']
        vis_data = [[f"{ts['start']:.2f}", f"{ts['end']:.2f}", ts['segment']] for ts in segment_timestamps]
        raw_times_data = [[ts['start'], ts['end']] for ts in segment_timestamps]
        word_timestamps_raw = output[0].timestamp.get("word", [])
        # Word entries that are not dicts or lack a required key are dropped.
        word_vis_data = [
            [f"{w['start']:.2f}", f"{w['end']:.2f}", w["word"]]
            for w in word_timestamps_raw if isinstance(w, dict) and 'start' in w and 'end' in w and 'word' in w
        ]
        gr.Info("Transcription complete.", duration=2)
        return vis_data, raw_times_data, word_vis_data
    except torch.cuda.OutOfMemoryError as e:
        error_msg = 'CUDA out of memory. Please try a shorter audio or reduce GPU load.'
        print(f"CUDA OutOfMemoryError: {e}")
        gr.Error(error_msg, duration=None)
        return None
    except Exception as e:
        error_msg = f"Transcription failed: {e}"
        print(f"Error during transcription processing: {e}")
        gr.Error(error_msg, duration=None)
        return None
    finally:
        # Runs on success and failure alike: restore the model configuration
        # and release memory.
        try:
            if long_audio_settings_applied:
                try:
                    print("Reverting long audio settings.")
                    model.change_attention_model("rel_pos")
                    model.change_subsampling_conv_chunking_factor(-1)
                except Exception as revert_e:
                    print(f"Warning: Failed to revert long audio settings: {revert_e}")
                    gr.Warning(f"Issue reverting model settings after long transcription: {revert_e}", duration=5)
            if device == 'cuda':
                model.cpu()
            gc.collect()
            if device == 'cuda':
                torch.cuda.empty_cache()
        except Exception as cleanup_e:
            print(f"Error during model cleanup: {cleanup_e}")
            gr.Warning(f"Issue during model cleanup: {cleanup_e}", duration=5)
def save_transcripts(session_dir, audio_name, vis_data, word_vis_data):
    """
    Write the transcription out as CSV, SRT, VTT, JSON and LRC files.

    Args:
        session_dir (str): Directory in which the transcript files are created.
        audio_name (str): Audio name used in the ``transcription_<name>.<ext>`` file names.
        vis_data (list): Segment rows written to the CSV and passed to every writer.
        word_vis_data (list): Word-level rows passed to the VTT and JSON writers.
    Returns:
        tuple: Five ``gr.DownloadButton`` updates in CSV, SRT, VTT, JSON, LRC order.
        On success each is visible and points at its file; if any step raises,
        all five are hidden.
    """
    try:
        # Insertion order doubles as the order of the returned buttons.
        targets = {
            ext: Path(session_dir, f"transcription_{audio_name}.{ext}")
            for ext in ("csv", "srt", "vtt", "json", "lrc")
        }

        with open(targets["csv"], 'w', newline='', encoding='utf-8') as csv_file:
            csv_out = csv.writer(csv_file)
            csv_out.writerow(["Start (s)", "End (s)", "Segment"])
            csv_out.writerows(vis_data)
        print(f"CSV transcript saved to temporary file: {targets['csv']}")

        write_srt(vis_data, targets["srt"])
        write_vtt(vis_data, word_vis_data, targets["vtt"])
        write_json(vis_data, word_vis_data, targets["json"])
        print(f"SRT, VTT, JSON transcript saved to temporary files: {targets['srt']}, {targets['vtt']}, {targets['json']}")

        write_lrc(vis_data, targets["lrc"])
        print(f"LRC transcript saved to temporary file: {targets['lrc']}")

        return tuple(
            gr.DownloadButton(value=target.as_posix(), visible=True)
            for target in targets.values()
        )
    except Exception as save_error:
        gr.Error(f"Failed to create transcript files: {save_error}", duration=None)
        print(f"Error writing transcript files: {save_error}")
        # A single hidden-button update is reused for all five outputs.
        hidden_button = gr.DownloadButton(visible=False)
        return (hidden_button,) * 5
+@spaces.GPU
+def get_transcripts_and_raw_times(audio_path, session_dir):
+    """
+    オーディオファイルを処理し、文字起こし結果を生成する。
+    Args:
+        audio_path (str): 入力オーディオファイルのパス。
+        session_dir (str): セッションディレクトリのパス。
+    Returns:
+        tuple: 文字起こし結果と関連データを含むタプル。
+    """
+    if not audio_path:
+        gr.Error("No audio file path provided for transcription.", duration=None)
+        return [], [], [], None, gr.DownloadButton(visible=False), gr.DownloadButton(visible=False), gr.DownloadButton(visible=False), gr.DownloadButton(visible=False), gr.DownloadButton(visible=False)
+    audio_name = Path(audio_path).stem
+    processed_audio_path = None
+    try:
+        # オーディオの前処理
+        transcribe_path, info_path_name, duration_sec = preprocess_audio(audio_path, session_dir)
+        if not transcribe_path or not duration_sec:
+            return [], [], [], audio_path, gr.DownloadButton(visible=False), gr.DownloadButton(visible=False), gr.DownloadButton(visible=False), gr.DownloadButton(visible=False), gr.DownloadButton(visible=False)
+        processed_audio_path = transcribe_path if transcribe_path != audio_path else None
+        # 文字起こしの実行
+        result = transcribe_audio(transcribe_path, model, duration_sec, device)
+        if not result:
+            return [], [], [], audio_path, gr.DownloadButton(visible=False), gr.DownloadButton(visible=False), gr.DownloadButton(visible=False), gr.DownloadButton(visible=False), gr.DownloadButton(visible=False)
+        vis_data, raw_times_data, word_vis_data = result
+        # ファイルの保存
+        button_updates = save_transcripts(session_dir, audio_name, vis_data, word_vis_data)
+        return (
+            vis_data,
+            raw_times_data,
+            word_vis_data,
+            audio_path,
+            *button_updates
+        )
     finally:
         if processed_audio_path and os.path.exists(processed_audio_path):
             try:
 if __name__ == "__main__":
     print("Launching Gradio Demo...")
     demo.queue()
+    demo.launch()