parakeet-tdt-0.6b-v2

Running

App Files Files Community

sungo-ganpare committed on May 12

Commit

9eb42b2

1 Parent(s): 5179d0a

いったんセーブ

Browse files

Files changed (3) hide show

app.py +152 -16
app_space.py +498 -0
requirements.txt +3 -1

app.py CHANGED Viewed

@@ -1,7 +1,7 @@
 from nemo.collections.asr.models import ASRModel
 import torch
 import gradio as gr
-import spaces
 import gc
 import shutil
 from pathlib import Path
@@ -10,6 +10,7 @@ import numpy as np
 import os
 import gradio.themes as gr_themes
 import csv
 device = "cuda" if torch.cuda.is_available() else "cpu"
 MODEL_NAME="nvidia/parakeet-tdt-0.6b-v2"
@@ -72,7 +73,7 @@ def get_audio_segment(audio_path, start_second, end_second):
         print(f"Error clipping audio {audio_path} from {start_second}s to {end_second}s: {e}")
         return None
-@spaces.GPU
 def get_transcripts_and_raw_times(audio_path, session_dir):
     if not audio_path:
         gr.Error("No audio file path provided for transcription.", duration=None)
@@ -172,21 +173,57 @@ def get_transcripts_and_raw_times(audio_path, session_dir):
                 for c in char_timestamps_raw if isinstance(c, dict) and 'start' in c and 'end' in c and 'char' in c
             ]
             button_update = gr.DownloadButton(visible=False)
             try:
                 csv_file_path = Path(session_dir, f"transcription_{audio_name}.csv")
-                with open(csv_file_path, 'w', newline='', encoding='utf-8') as f: # Added newline and encoding
                     writer = csv.writer(f)
                     writer.writerow(csv_headers)
                     writer.writerows(vis_data)
                 print(f"CSV transcript saved to temporary file: {csv_file_path}")
-                button_update = gr.DownloadButton(value=csv_file_path.as_posix(), visible=True) # Use as_posix() for path
             except Exception as csv_e:
-                gr.Error(f"Failed to create transcript CSV file: {csv_e}", duration=None)
-                print(f"Error writing CSV: {csv_e}")
             gr.Info("Transcription complete.", duration=2)
-            return vis_data, raw_times_data, char_vis_data, audio_path, button_update
         except torch.cuda.OutOfMemoryError as e:
             error_msg = 'CUDA out of memory. Please try a shorter audio or reduce GPU load.'
@@ -266,6 +303,102 @@ def play_segment(evt: gr.SelectData, raw_ts_list, current_audio_path):
         print("Failed to get audio segment data.")
         return gr.Audio(value=None, label="Selected Segment")
 article = (
     "<p style='font-size: 1.1em;'>"
     "This demo showcases <code><a href='https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2'>parakeet-tdt-0.6b-v2</a></code>, a 600-million-parameter model designed for high-quality English speech recognition."
@@ -328,6 +461,10 @@ with gr.Blocks(theme=nvidia_theme) as demo:
     gr.Markdown("<p><strong style='color: #FF0000; font-size: 1.2em;'>Transcription Results</strong></p>")
     download_btn = gr.DownloadButton(label="Download Segment Transcript (CSV)", visible=False)
     with gr.Tabs(): # Tabs for result views
         with gr.TabItem("Segment View (Click row to play segment)"):
@@ -339,30 +476,30 @@ with gr.Blocks(theme=nvidia_theme) as demo:
             )
             selected_segment_player = gr.Audio(label="Selected Segment", interactive=False)
-        with gr.TabItem("Character View"):
-            char_vis_df = gr.DataFrame( # Define char_vis_df here
-                headers=["Start (s)", "End (s)", "Char"],
                 datatype=["number", "number", "str"],
                 wrap=False, # As specified in diff
-                # label="Character Timestamps" # Label provided by tab
             )
     # Ensure outputs list matches the return order of get_transcripts_and_raw_times:
     # vis_data, raw_times_data, char_vis_data, audio_path, button_update
     # maps to:
-    # vis_timestamps_df, raw_timestamps_list_state, char_vis_df, current_audio_path_state, download_btn
     mic_transcribe_btn.click(
         fn=get_transcripts_and_raw_times,
         inputs=[mic_input, session_dir_state],
-        outputs=[vis_timestamps_df, raw_timestamps_list_state, char_vis_df, current_audio_path_state, download_btn],
         api_name="transcribe_mic"
     )
     file_transcribe_btn.click(
         fn=get_transcripts_and_raw_times,
         inputs=[file_input, session_dir_state],
-        outputs=[vis_timestamps_df, raw_timestamps_list_state, char_vis_df, current_audio_path_state, download_btn],
         api_name="transcribe_file"
     )
@@ -372,8 +509,7 @@ with gr.Blocks(theme=nvidia_theme) as demo:
         outputs=[selected_segment_player],
     )
-    demo.unload(end_session, inputs=[session_dir_state]) # Pass session_dir_state to end_session if it needs it (original didn't but good practice)
-                                                     # Corrected: end_session takes no inputs from gr.State directly from unload signature based on original code.
 if __name__ == "__main__":
     print("Launching Gradio Demo...")

 from nemo.collections.asr.models import ASRModel
 import torch
 import gradio as gr
+# import spaces
 import gc
 import shutil
 from pathlib import Path
 import os
 import gradio.themes as gr_themes
 import csv
+import json
 device = "cuda" if torch.cuda.is_available() else "cpu"
 MODEL_NAME="nvidia/parakeet-tdt-0.6b-v2"
         print(f"Error clipping audio {audio_path} from {start_second}s to {end_second}s: {e}")
         return None
+# @spaces.GPU
 def get_transcripts_and_raw_times(audio_path, session_dir):
     if not audio_path:
         gr.Error("No audio file path provided for transcription.", duration=None)
                 for c in char_timestamps_raw if isinstance(c, dict) and 'start' in c and 'end' in c and 'char' in c
             ]
+            # 単語タイムスタンプ（word）を追加で抽出
+            word_timestamps_raw = output[0].timestamp.get("word", [])
+            word_vis_data = [
+                [f"{w['start']:.2f}", f"{w['end']:.2f}", w["word"]]
+                for w in word_timestamps_raw if isinstance(w, dict) and 'start' in w and 'end' in w and 'word' in w
+            ]
             button_update = gr.DownloadButton(visible=False)
+            srt_file_path = None
+            vtt_file_path = None
+            json_file_path = None
+            lrc_file_path = None
             try:
                 csv_file_path = Path(session_dir, f"transcription_{audio_name}.csv")
+                with open(csv_file_path, 'w', newline='', encoding='utf-8') as f:
                     writer = csv.writer(f)
                     writer.writerow(csv_headers)
                     writer.writerows(vis_data)
                 print(f"CSV transcript saved to temporary file: {csv_file_path}")
+                button_update = gr.DownloadButton(value=csv_file_path.as_posix(), visible=True)
+                # SRT, VTT, JSON も保存
+                srt_file_path = Path(session_dir, f"transcription_{audio_name}.srt")
+                vtt_file_path = Path(session_dir, f"transcription_{audio_name}.vtt")
+                json_file_path = Path(session_dir, f"transcription_{audio_name}.json")
+                write_srt(vis_data, srt_file_path)
+                write_vtt(vis_data, word_vis_data, vtt_file_path)
+                write_json(vis_data, word_vis_data, json_file_path)
+                print(f"SRT, VTT, JSON transcript saved to temporary files: {srt_file_path}, {vtt_file_path}, {json_file_path}")
+                # LRC も保存
+                lrc_file_path = Path(session_dir, f"transcription_{audio_name}.lrc")
+                write_lrc(vis_data, lrc_file_path)
+                print(f"LRC transcript saved to temporary file: {lrc_file_path}")
             except Exception as csv_e:
+                gr.Error(f"Failed to create transcript files: {csv_e}", duration=None)
+                print(f"Error writing transcript files: {csv_e}")
             gr.Info("Transcription complete.", duration=2)
+            # 4つのファイルパスを返す
+            return (
+                vis_data,
+                raw_times_data,
+                word_vis_data,
+                audio_path,
+                gr.DownloadButton(value=csv_file_path.as_posix(), visible=True),
+                gr.DownloadButton(value=srt_file_path.as_posix(), visible=True),
+                gr.DownloadButton(value=vtt_file_path.as_posix(), visible=True),
+                gr.DownloadButton(value=json_file_path.as_posix(), visible=True),
+                gr.DownloadButton(value=lrc_file_path.as_posix(), visible=True)
+            )
         except torch.cuda.OutOfMemoryError as e:
             error_msg = 'CUDA out of memory. Please try a shorter audio or reduce GPU load.'
         print("Failed to get audio segment data.")
         return gr.Audio(value=None, label="Selected Segment")
+def write_srt(segments, path):
+    """Write ``segments`` to ``path`` as a SubRip (.srt) subtitle file.
+
+    Args:
+        segments: Iterable of ``[start, end, text]`` rows. ``start`` and
+            ``end`` are seconds, given as numbers or numeric strings.
+        path: Destination file; created or overwritten, encoded as UTF-8.
+    """
+    def sec2srt(t):
+        # Work in whole milliseconds. Subtracting the integer seconds from the
+        # float and truncating loses a millisecond for values such as 2.30
+        # (2.3 - 2 == 0.2999999999999998, which truncates to 299 ms).
+        total_ms = round(float(t) * 1000)
+        h, rem = divmod(total_ms, 3600 * 1000)
+        m, rem = divmod(rem, 60 * 1000)
+        s, ms = divmod(rem, 1000)
+        return f"{h:02}:{m:02}:{s:02},{ms:03}"
+    with open(path, "w", encoding="utf-8") as f:
+        # SRT cues are numbered from 1 and separated by a blank line.
+        for i, seg in enumerate(segments, 1):
+            f.write(f"{i}\n{sec2srt(seg[0])} --> {sec2srt(seg[1])}\n{seg[2]}\n\n")
+def write_vtt(segments, words, path):
+    """Write a WebVTT file with one cue per word, highlighting that word.
+
+    Each cue spans a single word's time range and shows every word of the
+    enclosing segment, with the current word wrapped in
+    ``<c.yellow><b>...</b></c>``.
+
+    Args:
+        segments: ``[start, end, text]`` rows in chronological order.
+        words: ``[start, end, word]`` rows in chronological order.
+        path: Destination file; created or overwritten, encoded as UTF-8.
+
+    A word is attached to a segment only when it lies entirely inside the
+    segment's time range; words that fit no segment are omitted.
+    """
+    def sec2vtt(t):
+        # Integer milliseconds avoid the truncation error of
+        # ``int((t - int(t)) * 1000)`` (e.g. 2.30 -> 299 ms).
+        total_ms = round(float(t) * 1000)
+        h, rem = divmod(total_ms, 3600 * 1000)
+        m, rem = divmod(rem, 60 * 1000)
+        s, ms = divmod(rem, 1000)
+        return f"{h:02}:{m:02}:{s:02}.{ms:03}"
+    with open(path, "w", encoding="utf-8") as f:
+        f.write("WEBVTT\n\n")
+        word_idx = 0
+        for seg in segments:
+            s_start = float(seg[0])
+            s_end = float(seg[1])
+            # Collect the words contained in this segment.
+            segment_words = []
+            while word_idx < len(words):
+                w = words[word_idx]
+                w_start = float(w[0])
+                w_end = float(w[1])
+                if w_start >= s_start and w_end <= s_end:
+                    segment_words.append(w)
+                    word_idx += 1
+                elif w_start < s_start:
+                    # The word began before this segment, so it can match
+                    # neither this segment nor a later one. Skipping it keeps
+                    # the cursor moving; testing only ``w_end < s_start`` left
+                    # the cursor stuck on a word straddling a boundary and
+                    # dropped every word after it.
+                    word_idx += 1
+                else:
+                    # The word belongs to a later segment.
+                    break
+            # Emit one cue per word with that word highlighted.
+            for i, w in enumerate(segment_words):
+                cue_text = " ".join(
+                    f"<c.yellow><b>{other_w[2]}</b></c>" if j == i else f"{other_w[2]}"
+                    for j, other_w in enumerate(segment_words)
+                ).strip()
+                f.write(f"{sec2vtt(w[0])} --> {sec2vtt(w[1])}\n{cue_text}\n\n")
+def write_json(segments, words, path):
+    """Write segments and their contained words to ``path`` as JSON.
+
+    The output has the shape
+    ``{"segments": [{"start", "end", "text", "words": [{"start", "end", "word"}]}]}``
+    with all times converted to floats.
+
+    Args:
+        segments: ``[start, end, text]`` rows in chronological order.
+        words: ``[start, end, word]`` rows in chronological order.
+        path: Destination file; created or overwritten, encoded as UTF-8.
+
+    A word is attached to a segment only when it lies entirely inside the
+    segment's time range; words that fit no segment are omitted.
+    """
+    result = {"segments": []}
+    word_idx = 0
+    for s in segments:
+        s_start = float(s[0])
+        s_end = float(s[1])
+        word_list = []
+        # Collect the words contained in this segment.
+        while word_idx < len(words):
+            w = words[word_idx]
+            w_start = float(w[0])
+            w_end = float(w[1])
+            if w_start >= s_start and w_end <= s_end:
+                word_list.append({"start": w_start, "end": w_end, "word": w[2]})
+                word_idx += 1
+            elif w_start < s_start:
+                # The word began before this segment and therefore cannot
+                # match it or any later one. Skipping it prevents the cursor
+                # from stalling on a word that straddles a segment boundary,
+                # which previously dropped every following word.
+                word_idx += 1
+            else:
+                # The word belongs to a later segment.
+                break
+        result["segments"].append({
+            "start": s_start,
+            "end": s_end,
+            "text": s[2],
+            "words": word_list,
+        })
+    with open(path, "w", encoding="utf-8") as f:
+        json.dump(result, f, ensure_ascii=False, indent=2)
+def write_lrc(segments, path):
+    """Write ``segments`` to ``path`` as LRC lines of the form ``[mm:ss.cc]text``.
+
+    Args:
+        segments: ``[start, end, text]`` rows; only ``start`` and ``text``
+            are written. ``start`` is seconds as a number or numeric string.
+        path: Destination file; created or overwritten, encoded as UTF-8.
+    """
+    def sec2lrc(t):
+        # Round to whole centiseconds before splitting into minutes and
+        # seconds, so a value such as 59.999 rolls over to [01:00.00] instead
+        # of being printed as [00:60.00].
+        total_cs = round(float(t) * 100)
+        m, rem = divmod(total_cs, 60 * 100)
+        s, cs = divmod(rem, 100)
+        return f"[{m:02}:{s:02}.{cs:02}]"
+    with open(path, "w", encoding="utf-8") as f:
+        for seg in segments:
+            f.write(f"{sec2lrc(seg[0])}{seg[2]}\n")
 article = (
     "<p style='font-size: 1.1em;'>"
     "This demo showcases <code><a href='https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2'>parakeet-tdt-0.6b-v2</a></code>, a 600-million-parameter model designed for high-quality English speech recognition."
     gr.Markdown("<p><strong style='color: #FF0000; font-size: 1.2em;'>Transcription Results</strong></p>")
     download_btn = gr.DownloadButton(label="Download Segment Transcript (CSV)", visible=False)
+    srt_btn = gr.DownloadButton(label="Download SRT", visible=False)
+    vtt_btn = gr.DownloadButton(label="Download VTT", visible=False)
+    json_btn = gr.DownloadButton(label="Download JSON", visible=False)
+    lrc_btn = gr.DownloadButton(label="Download LRC", visible=False)
     with gr.Tabs(): # Tabs for result views
         with gr.TabItem("Segment View (Click row to play segment)"):
             )
             selected_segment_player = gr.Audio(label="Selected Segment", interactive=False)
+        with gr.TabItem("Word View"):
+            word_vis_df = gr.DataFrame( # Define word_vis_df here
+                headers=["Start (s)", "End (s)", "Word"],
                 datatype=["number", "number", "str"],
                 wrap=False, # As specified in diff
+                # label="Word Timestamps" # Label provided by tab
             )
     # Ensure outputs list matches the return order of get_transcripts_and_raw_times:
     # vis_data, raw_times_data, char_vis_data, audio_path, button_update
     # maps to:
+    # vis_timestamps_df, raw_timestamps_list_state, word_vis_df, current_audio_path_state, download_btn
     mic_transcribe_btn.click(
         fn=get_transcripts_and_raw_times,
         inputs=[mic_input, session_dir_state],
+        outputs=[vis_timestamps_df, raw_timestamps_list_state, word_vis_df, current_audio_path_state, download_btn, srt_btn, vtt_btn, json_btn, lrc_btn],
         api_name="transcribe_mic"
     )
     file_transcribe_btn.click(
         fn=get_transcripts_and_raw_times,
         inputs=[file_input, session_dir_state],
+        outputs=[vis_timestamps_df, raw_timestamps_list_state, word_vis_df, current_audio_path_state, download_btn, srt_btn, vtt_btn, json_btn, lrc_btn],
         api_name="transcribe_file"
     )
         outputs=[selected_segment_player],
     )
+    demo.unload(end_session)
 if __name__ == "__main__":
     print("Launching Gradio Demo...")

app_space.py ADDED Viewed

	@@ -0,0 +1,498 @@

+from nemo.collections.asr.models import ASRModel
+import torch
+import gradio as gr
+import spaces
+import gc
+import shutil
+from pathlib import Path
+from pydub import AudioSegment
+import numpy as np
+import os
+import gradio.themes as gr_themes
+import csv
+import json
+# Use the GPU when torch reports CUDA as available, otherwise fall back to CPU.
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# Name of the pretrained checkpoint passed to ASRModel.from_pretrained below.
+MODEL_NAME="nvidia/parakeet-tdt-0.6b-v2"
+# The model is loaded once at import time and shared by every request.
+model = ASRModel.from_pretrained(model_name=MODEL_NAME)
+# Put the shared model in eval mode right after loading.
+model.eval()
+def start_session(request: gr.Request):
+    """Create the scratch directory for this session and return its path.
+
+    The directory is ``/tmp/<session_hash>``; it is created together with any
+    missing parents and an existing directory is reused.
+
+    Returns:
+        The directory path as a POSIX-style string.
+    """
+    token = request.session_hash
+    workdir = Path(f'/tmp/{token}')
+    workdir.mkdir(parents=True, exist_ok=True)
+    print(f"Session with hash {token} started.")
+    return workdir.as_posix()
+def end_session(request: gr.Request):
+    """Delete this session's scratch directory, if it still exists.
+
+    The directory is ``/tmp/<session_hash>``. The end-of-session message is
+    printed whether or not anything was removed.
+    """
+    token = request.session_hash
+    workdir = Path(f'/tmp/{token}')
+    if workdir.exists():
+        shutil.rmtree(workdir)
+    print(f"Session with hash {token} ended.")
+def get_audio_segment(audio_path, start_second, end_second):
+    """Cut ``[start_second, end_second]`` out of an audio file.
+
+    Args:
+        audio_path: Path of the audio file to clip.
+        start_second: Clip start in seconds; negative values are clamped to 0.
+        end_second: Clip end in seconds. If it is not after the start, a
+            100 ms clip beginning at the start is used instead.
+
+    Returns:
+        ``(frame_rate, samples)`` where ``samples`` is a NumPy array (stereo
+        input is averaged down to one channel), or ``None`` when the path is
+        missing, the clip is empty, or any error occurs while clipping.
+    """
+    if not (audio_path and Path(audio_path).exists()):
+        print(f"Warning: Audio path '{audio_path}' not found or invalid for clipping.")
+        return None
+    try:
+        begin_ms = max(0, int(start_second * 1000))
+        finish_ms = int(end_second * 1000)
+        if finish_ms <= begin_ms:
+            print(f"Warning: End time ({end_second}s) is not after start time ({start_second}s). Adjusting end time.")
+            finish_ms = begin_ms + 100
+        full_audio = AudioSegment.from_file(audio_path)
+        clip = full_audio[begin_ms:finish_ms]
+        pcm = np.array(clip.get_array_of_samples())
+        if clip.channels == 2:
+            # Interleaved stereo: pair up left/right and average them,
+            # keeping the original sample dtype.
+            stereo_pairs = pcm.reshape((-1, 2))
+            pcm = stereo_pairs.mean(axis=1).astype(pcm.dtype)
+        rate = clip.frame_rate
+        if rate <= 0:
+            print(f"Warning: Invalid frame rate ({rate}) detected for clipped audio.")
+            rate = full_audio.frame_rate
+        if pcm.size == 0:
+            print(f"Warning: Clipped audio resulted in empty samples array ({start_second}s to {end_second}s).")
+            return None
+        return (rate, pcm)
+    except FileNotFoundError:
+        print(f"Error: Audio file not found at path: {audio_path}")
+        return None
+    except Exception as e:
+        print(f"Error clipping audio {audio_path} from {start_second}s to {end_second}s: {e}")
+        return None
+def _transcription_failure(vis_rows, raw_times, audio_path):
+    """Build the 9-value result used by every failure path.
+
+    Mirrors the shape of the success result: segment rows, raw times, an empty
+    word table, the audio path and five hidden download buttons.
+    """
+    return (
+        vis_rows,
+        raw_times,
+        [],
+        audio_path,
+        *(gr.DownloadButton(visible=False) for _ in range(5)),
+    )
+@spaces.GPU
+def get_transcripts_and_raw_times(audio_path, session_dir):
+    """Transcribe an audio file and export the transcript in several formats.
+
+    The audio is converted to 16 kHz mono when needed (via a temporary WAV in
+    ``session_dir`` that is removed afterwards), transcribed with timestamps,
+    and written to ``session_dir`` as CSV, SRT, VTT, JSON and LRC files.
+
+    Args:
+        audio_path: Path of the audio file to transcribe.
+        session_dir: Directory that receives temporary and transcript files.
+
+    Returns:
+        A 9-tuple: segment rows ``[start, end, text]`` (times formatted to two
+        decimals), raw ``[start, end]`` pairs, word rows ``[start, end, word]``,
+        the original ``audio_path``, and five ``gr.DownloadButton`` values in
+        the order CSV, SRT, VTT, JSON, LRC. A button is visible only if its
+        file was written. Failure paths return placeholder rows, an empty word
+        table and five hidden buttons, so every path yields nine values.
+
+    NOTE(review): ``gr.Error(...)`` is constructed here but never raised; per
+    the Gradio docs it is an exception class, so these calls probably do not
+    show anything to the user. Left unchanged because raising would bypass the
+    placeholder results returned below; confirm the intended behaviour.
+    """
+    if not audio_path:
+        gr.Error("No audio file path provided for transcription.", duration=None)
+        return _transcription_failure([], [], None)
+    def failed(first, second, message):
+        # Single placeholder row plus a dummy time range, as in the original.
+        return _transcription_failure([[first, second, message]], [[0.0, 0.0]], audio_path)
+    def ready(path):
+        return gr.DownloadButton(value=path.as_posix(), visible=True)
+    processed_audio_path = None
+    original_path_name = Path(audio_path).name
+    audio_name = Path(audio_path).stem
+    try:
+        try:
+            gr.Info(f"Loading audio: {original_path_name}", duration=2)
+            audio = AudioSegment.from_file(audio_path)
+            duration_sec = audio.duration_seconds
+        except Exception as load_e:
+            gr.Error(f"Failed to load audio file {original_path_name}: {load_e}", duration=None)
+            return failed("Error", "Error", "Load failed")
+        resampled = False
+        mono = False
+        target_sr = 16000
+        if audio.frame_rate != target_sr:
+            try:
+                audio = audio.set_frame_rate(target_sr)
+                resampled = True
+            except Exception as resample_e:
+                gr.Error(f"Failed to resample audio: {resample_e}", duration=None)
+                return failed("Error", "Error", "Resample failed")
+        if audio.channels == 2:
+            try:
+                audio = audio.set_channels(1)
+                mono = True
+            except Exception as mono_e:
+                gr.Error(f"Failed to convert audio to mono: {mono_e}", duration=None)
+                return failed("Error", "Error", "Mono conversion failed")
+        elif audio.channels > 2:
+            gr.Error(f"Audio has {audio.channels} channels. Only mono (1) or stereo (2) supported.", duration=None)
+            return failed("Error", "Error", f"{audio.channels}-channel audio not supported")
+        if resampled or mono:
+            # The audio was modified, so transcribe a temporary WAV copy.
+            try:
+                processed_audio_path = Path(session_dir, f"{audio_name}_resampled.wav")
+                audio.export(processed_audio_path, format="wav")
+                transcribe_path = processed_audio_path.as_posix()
+                info_path_name = f"{original_path_name} (processed)"
+            except Exception as export_e:
+                gr.Error(f"Failed to export processed audio: {export_e}", duration=None)
+                if processed_audio_path and os.path.exists(processed_audio_path):
+                    os.remove(processed_audio_path)
+                return failed("Error", "Error", "Export failed")
+        else:
+            transcribe_path = audio_path
+            info_path_name = original_path_name
+        long_audio_settings_applied = False
+        try:
+            model.to(device)
+            model.to(torch.float32)
+            gr.Info(f"Transcribing {info_path_name} on {device}...", duration=2)
+            if duration_sec > 480:
+                try:
+                    gr.Info("Audio longer than 8 minutes. Applying optimized settings for long transcription.", duration=3)
+                    print("Applying long audio settings: Local Attention and Chunking.")
+                    model.change_attention_model("rel_pos_local_attn", [256,256])
+                    model.change_subsampling_conv_chunking_factor(1)
+                    long_audio_settings_applied = True
+                except Exception as setting_e:
+                    gr.Warning(f"Could not apply long audio settings: {setting_e}", duration=5)
+                    print(f"Warning: Failed to apply long audio settings: {setting_e}")
+            model.to(torch.bfloat16)
+            output = model.transcribe([transcribe_path], timestamps=True)
+            if (
+                not output
+                or not isinstance(output, list)
+                or not output[0]
+                or not hasattr(output[0], 'timestamp')
+                or not output[0].timestamp
+                or 'segment' not in output[0].timestamp
+            ):
+                gr.Error("Transcription failed or produced unexpected output format.", duration=None)
+                return failed("Error", "Error", "Transcription Format Issue")
+            segment_timestamps = output[0].timestamp['segment']
+            csv_headers = ["Start (s)", "End (s)", "Segment"]
+            vis_data = [[f"{ts['start']:.2f}", f"{ts['end']:.2f}", ts['segment']] for ts in segment_timestamps]
+            raw_times_data = [[ts['start'], ts['end']] for ts in segment_timestamps]
+            word_timestamps_raw = output[0].timestamp.get("word", [])
+            word_vis_data = [
+                [f"{w['start']:.2f}", f"{w['end']:.2f}", w["word"]]
+                for w in word_timestamps_raw if isinstance(w, dict) and 'start' in w and 'end' in w and 'word' in w
+            ]
+            # Every button starts hidden and is revealed only after its file
+            # has been written. Previously the return statement called
+            # ``.as_posix()`` on paths that were still ``None`` whenever an
+            # earlier write failed, turning a partial export into a failure.
+            csv_btn, srt_btn, vtt_btn, json_btn, lrc_btn = (
+                gr.DownloadButton(visible=False) for _ in range(5)
+            )
+            try:
+                csv_file_path = Path(session_dir, f"transcription_{audio_name}.csv")
+                with open(csv_file_path, 'w', newline='', encoding='utf-8') as f:
+                    writer = csv.writer(f)
+                    writer.writerow(csv_headers)
+                    writer.writerows(vis_data)
+                print(f"CSV transcript saved to temporary file: {csv_file_path}")
+                csv_btn = ready(csv_file_path)
+                srt_file_path = Path(session_dir, f"transcription_{audio_name}.srt")
+                vtt_file_path = Path(session_dir, f"transcription_{audio_name}.vtt")
+                json_file_path = Path(session_dir, f"transcription_{audio_name}.json")
+                write_srt(vis_data, srt_file_path)
+                srt_btn = ready(srt_file_path)
+                write_vtt(vis_data, word_vis_data, vtt_file_path)
+                vtt_btn = ready(vtt_file_path)
+                write_json(vis_data, word_vis_data, json_file_path)
+                json_btn = ready(json_file_path)
+                print(f"SRT, VTT, JSON transcript saved to temporary files: {srt_file_path}, {vtt_file_path}, {json_file_path}")
+                lrc_file_path = Path(session_dir, f"transcription_{audio_name}.lrc")
+                write_lrc(vis_data, word_vis_data, lrc_file_path)
+                lrc_btn = ready(lrc_file_path)
+                print(f"LRC transcript saved to temporary file: {lrc_file_path}")
+            except Exception as csv_e:
+                # Best effort: the transcript tables are still returned and
+                # only the formats written so far are offered for download.
+                gr.Error(f"Failed to create transcript files: {csv_e}", duration=None)
+                print(f"Error writing transcript files: {csv_e}")
+            gr.Info("Transcription complete.", duration=2)
+            return (
+                vis_data,
+                raw_times_data,
+                word_vis_data,
+                audio_path,
+                csv_btn,
+                srt_btn,
+                vtt_btn,
+                json_btn,
+                lrc_btn,
+            )
+        except torch.cuda.OutOfMemoryError as e:
+            error_msg = 'CUDA out of memory. Please try a shorter audio or reduce GPU load.'
+            print(f"CUDA OutOfMemoryError: {e}")
+            gr.Error(error_msg, duration=None)
+            return failed("OOM", "OOM", error_msg)
+        except FileNotFoundError:
+            error_msg = f"Audio file for transcription not found: {Path(transcribe_path).name}."
+            print(f"Error: Transcribe audio file not found at path: {transcribe_path}")
+            gr.Error(error_msg, duration=None)
+            return failed("Error", "Error", "File not found for transcription")
+        except Exception as e:
+            error_msg = f"Transcription failed: {e}"
+            print(f"Error during transcription processing: {e}")
+            gr.Error(error_msg, duration=None)
+            return failed("Error", "Error", error_msg)
+        finally:
+            try:
+                if long_audio_settings_applied:
+                    try:
+                        print("Reverting long audio settings.")
+                        model.change_attention_model("rel_pos")
+                        model.change_subsampling_conv_chunking_factor(-1)
+                    except Exception as revert_e:
+                        print(f"Warning: Failed to revert long audio settings: {revert_e}")
+                        gr.Warning(f"Issue reverting model settings after long transcription: {revert_e}", duration=5)
+                # NOTE(review): ``model`` is a module-level name, so
+                # ``'model' in locals()`` is False inside this function and
+                # the ``model.cpu()`` branch never runs. Left as-is because
+                # enabling it would change device placement between calls.
+                if 'model' in locals() and hasattr(model, 'cpu'):
+                    if device == 'cuda':
+                        model.cpu()
+                gc.collect()
+                if device == 'cuda':
+                    torch.cuda.empty_cache()
+            except Exception as cleanup_e:
+                print(f"Error during model cleanup: {cleanup_e}")
+                gr.Warning(f"Issue during model cleanup: {cleanup_e}", duration=5)
+    finally:
+        # Always remove the temporary resampled/mono WAV, on every exit path.
+        if processed_audio_path and os.path.exists(processed_audio_path):
+            try:
+                os.remove(processed_audio_path)
+                print(f"Temporary audio file {processed_audio_path} removed.")
+            except Exception as e:
+                print(f"Error removing temporary audio file {processed_audio_path}: {e}")
+def play_segment(evt: gr.SelectData, raw_ts_list, current_audio_path):
+    """Return an audio player for the table row that was clicked.
+
+    Args:
+        evt: Selection event; ``evt.index[0]`` is the selected row.
+        raw_ts_list: List of ``[start, end]`` times in seconds, one per row.
+        current_audio_path: Path of the audio the rows refer to.
+
+    Returns:
+        A ``gr.Audio`` that autoplays the clipped segment, or an empty
+        ``gr.Audio`` labelled "Selected Segment" when the inputs are invalid
+        or the clip could not be produced.
+    """
+    def _empty_player():
+        return gr.Audio(value=None, label="Selected Segment")
+    if not isinstance(raw_ts_list, list):
+        print(f"Warning: raw_ts_list is not a list ({type(raw_ts_list)}). Cannot play segment.")
+        return _empty_player()
+    if not current_audio_path:
+        print("No audio path available to play segment from.")
+        return _empty_player()
+    row = evt.index[0]
+    if not (0 <= row < len(raw_ts_list)):
+        print(f"Invalid index {row} selected for list of length {len(raw_ts_list)}.")
+        return _empty_player()
+    entry = raw_ts_list[row]
+    if not (isinstance(entry, (list, tuple)) and len(entry) == 2):
+        print(f"Warning: Data at index {row} is not in the expected format [start, end].")
+        return _empty_player()
+    start_time_s, end_time_s = entry
+    print(f"Attempting to play segment: {current_audio_path} from {start_time_s:.2f}s to {end_time_s:.2f}s")
+    segment_data = get_audio_segment(current_audio_path, start_time_s, end_time_s)
+    if not segment_data:
+        print("Failed to get audio segment data.")
+        return _empty_player()
+    print("Segment data retrieved successfully.")
+    return gr.Audio(
+        value=segment_data,
+        autoplay=True,
+        label=f"Segment: {start_time_s:.2f}s - {end_time_s:.2f}s",
+        interactive=False,
+    )
+def write_srt(segments, path):
+    """Write ``segments`` to ``path`` as a SubRip (.srt) subtitle file.
+
+    Args:
+        segments: Iterable of ``[start, end, text]`` rows. ``start`` and
+            ``end`` are seconds, given as numbers or numeric strings.
+        path: Destination file; created or overwritten, encoded as UTF-8.
+    """
+    def sec2srt(t):
+        # Work in whole milliseconds. Subtracting the integer seconds from the
+        # float and truncating loses a millisecond for values such as 2.30
+        # (2.3 - 2 == 0.2999999999999998, which truncates to 299 ms).
+        total_ms = round(float(t) * 1000)
+        h, rem = divmod(total_ms, 3600 * 1000)
+        m, rem = divmod(rem, 60 * 1000)
+        s, ms = divmod(rem, 1000)
+        return f"{h:02}:{m:02}:{s:02},{ms:03}"
+    with open(path, "w", encoding="utf-8") as f:
+        # SRT cues are numbered from 1 and separated by a blank line.
+        for i, seg in enumerate(segments, 1):
+            f.write(f"{i}\n{sec2srt(seg[0])} --> {sec2srt(seg[1])}\n{seg[2]}\n\n")
+def write_vtt(segments, words, path):
+    """Write a WebVTT file that highlights each word while it is spoken.
+
+    For every segment, one cue is written per contained word, showing all of
+    the segment's words with the current one wrapped in
+    ``<c.yellow><b>...</b></c>``. Time inside the segment that no word covers
+    (before the first word, between words, after the last word) is filled
+    with cues showing the plain segment text.
+
+    Args:
+        segments: ``[start, end, text]`` rows in chronological order.
+        words: ``[start, end, word]`` rows in chronological order.
+        path: Destination file; created or overwritten, encoded as UTF-8.
+
+    A word is attached to a segment only when it lies entirely inside the
+    segment's time range; words that fit no segment are omitted.
+    """
+    def sec2vtt(t):
+        # Integer milliseconds avoid the truncation error of
+        # ``int((t - int(t)) * 1000)`` (e.g. 2.30 -> 299 ms).
+        total_ms = round(float(t) * 1000)
+        h, rem = divmod(total_ms, 3600 * 1000)
+        m, rem = divmod(rem, 60 * 1000)
+        s, ms = divmod(rem, 1000)
+        return f"{h:02}:{m:02}:{s:02}.{ms:03}"
+    def cue(start, end, text):
+        return f"{sec2vtt(start)} --> {sec2vtt(end)}\n{text}\n\n"
+    with open(path, "w", encoding="utf-8") as f:
+        f.write("WEBVTT\n\n")
+        word_idx = 0
+        for seg in segments:
+            s_start = float(seg[0])
+            s_end = float(seg[1])
+            s_text = seg[2]
+            # Collect the words contained in this segment.
+            segment_words = []
+            while word_idx < len(words):
+                w = words[word_idx]
+                w_start = float(w[0])
+                w_end = float(w[1])
+                if w_start >= s_start and w_end <= s_end:
+                    segment_words.append(w)
+                    word_idx += 1
+                elif w_start < s_start:
+                    # The word began before this segment, so it can match
+                    # neither this segment nor a later one. Skipping it keeps
+                    # the cursor moving; testing only ``w_end < s_start`` left
+                    # the cursor stuck on a word straddling a boundary and
+                    # dropped every word after it.
+                    word_idx += 1
+                else:
+                    # The word belongs to a later segment.
+                    break
+            # ``prev_end`` marks how far the segment has been covered by cues.
+            prev_end = s_start
+            for i, w in enumerate(segment_words):
+                w_start = float(w[0])
+                w_end = float(w[1])
+                # Fill the silence before this word with the plain text. This
+                # single check covers both the leading gap and gaps between
+                # words, so no separate look-ahead at the next word is needed.
+                if prev_end < w_start:
+                    f.write(cue(prev_end, w_start, s_text))
+                highlighted = " ".join(
+                    f"<c.yellow><b>{other_w[2]}</b></c>" if j == i else f"{other_w[2]}"
+                    for j, other_w in enumerate(segment_words)
+                ).strip()
+                f.write(cue(w_start, w_end, highlighted))
+                prev_end = w_end
+            # Fill from the last word's end to the end of the segment.
+            if prev_end < s_end:
+                f.write(cue(prev_end, s_end, s_text))
+def write_json(segments, words, path):
+    """Write segments and their contained words to ``path`` as JSON.
+
+    The output has the shape
+    ``{"segments": [{"start", "end", "text", "words": [{"start", "end", "word"}]}]}``
+    with all times converted to floats.
+
+    Args:
+        segments: ``[start, end, text]`` rows in chronological order.
+        words: ``[start, end, word]`` rows in chronological order.
+        path: Destination file; created or overwritten, encoded as UTF-8.
+
+    A word is attached to a segment only when it lies entirely inside the
+    segment's time range; words that fit no segment are omitted.
+    """
+    result = {"segments": []}
+    word_idx = 0
+    for s in segments:
+        s_start = float(s[0])
+        s_end = float(s[1])
+        word_list = []
+        # Collect the words contained in this segment.
+        while word_idx < len(words):
+            w = words[word_idx]
+            w_start = float(w[0])
+            w_end = float(w[1])
+            if w_start >= s_start and w_end <= s_end:
+                word_list.append({"start": w_start, "end": w_end, "word": w[2]})
+                word_idx += 1
+            elif w_start < s_start:
+                # The word began before this segment and therefore cannot
+                # match it or any later one. Skipping it prevents the cursor
+                # from stalling on a word that straddles a segment boundary,
+                # which previously dropped every following word.
+                word_idx += 1
+            else:
+                # The word belongs to a later segment.
+                break
+        result["segments"].append({
+            "start": s_start,
+            "end": s_end,
+            "text": s[2],
+            "words": word_list,
+        })
+    with open(path, "w", encoding="utf-8") as f:
+        json.dump(result, f, ensure_ascii=False, indent=2)
+def write_lrc(segments, words, path):
+    def sec2lrc(t):
+        m, s = divmod(float(t), 60)
+        return f"[{int(m):02}:{s:05.2f}]"
+    with open(path, "w", encoding="utf-8") as f:
+        for w in words:
+            f.write(f"{sec2lrc(w[0])}{w[2]}\n")
+# HTML blurb rendered near the top of the page via gr.HTML(article): model
+# summary, feature list, licensing note, and a row of reference links.
+# Built from adjacent string literals, which Python joins into one string.
+article = (
+    "<p style='font-size: 1.1em;'>"
+    "This demo showcases <code><a href='https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2'>parakeet-tdt-0.6b-v2</a></code>, a 600-million-parameter model designed for high-quality English speech recognition."
+    "</p>"
+    "<p><strong style='color: red; font-size: 1.2em;'>Key Features:</strong></p>"
+    "<ul style='font-size: 1.1em;'>"
+    "    <li>Automatic punctuation and capitalization</li>"
+    "    <li>Accurate word-level timestamps (click on a segment in the table below to play it!)</li>"
+    "    <li>Character-level timestamps now available in the 'Character View' tab.</li>"
+    "    <li>Efficiently transcribes long audio segments (<strong>updated to support upto 3 hours</strong>) <small>(For even longer audios, see <a href='https://github.com/NVIDIA/NeMo/blob/main/examples/asr/asr_chunked_inference/rnnt/speech_to_text_buffered_infer_rnnt.py' target='_blank'>this script</a>)</small></li>"
+    "    <li>Robust performance on spoken numbers, and song lyrics transcription </li>"
+    "</ul>"
+    "<p style='font-size: 1.1em;'>"
+    "This model is <strong>available for commercial and non-commercial use</strong>."
+    "</p>"
+    "<p style='text-align: center;'>"
+    "<a href='https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2' target='_blank'>🎙️ Learn more about the Model</a> | "
+    "<a href='https://arxiv.org/abs/2305.05084' target='_blank'>📄 Fast Conformer paper</a> | "
+    "<a href='https://arxiv.org/abs/2304.06795' target='_blank'>📚 TDT paper</a> | "
+    "<a href='https://github.com/NVIDIA/NeMo' target='_blank'>🧑‍💻 NeMo Repository</a>"
+    "</p>"
+)
+# Sample inputs offered by gr.Examples in the "Audio File" tab; each inner list
+# holds the value for the single input component (the file upload widget).
+examples = [
+    ["data/example-yt_saTD1u8PorI.mp3"],
+]
+# Theme passed to gr.Blocks(theme=...): a custom green primary-hue ramp
+# (c50 lightest .. c950 darkest), gray neutrals, and the Inter Google font with
+# generic sans-serif fallbacks. `.set()` is called with no overrides.
+nvidia_theme = gr_themes.Default(
+    primary_hue=gr_themes.Color(
+        c50="#E6F1D9", c100="#CEE3B3", c200="#B5D58C", c300="#9CC766",
+        c400="#84B940", c500="#76B900", c600="#68A600", c700="#5A9200",
+        c800="#4C7E00", c900="#3E6A00", c950="#2F5600"
+    ),
+    neutral_hue="gray",
+    font=[gr_themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"],
+).set()
+# Build the Gradio UI: header, audio inputs, result tables, download buttons,
+# and the event wiring that connects them to the transcription callbacks.
+with gr.Blocks(theme=nvidia_theme) as demo:
+    # Show only the part of MODEL_NAME after the last "/" in the page title.
+    model_display_name = MODEL_NAME.split('/')[-1] if '/' in MODEL_NAME else MODEL_NAME
+    gr.Markdown(f"<h1 style='text-align: center; margin: 0 auto;'>Speech Transcription with {model_display_name}</h1>")
+    gr.HTML(article)
+    # State holders filled in by callbacks: the audio path and raw timestamps
+    # come from the transcription handler's outputs (see the click wiring
+    # below); the session directory comes from start_session.
+    current_audio_path_state = gr.State(None)
+    raw_timestamps_list_state = gr.State([])
+    session_dir_state = gr.State()
+    # On page load, store start_session's return value in session_dir_state.
+    # NOTE(review): start_session is defined elsewhere in the file; presumably it
+    # creates a per-session working directory — confirm.
+    demo.load(start_session, outputs=[session_dir_state])
+    # Input tabs: upload an audio file, or record from the microphone. Both
+    # widgets hand the callback a file path (type="filepath").
+    with gr.Tabs():
+        with gr.TabItem("Audio File"):
+            file_input = gr.Audio(sources=["upload"], type="filepath", label="Upload Audio File")
+            gr.Examples(examples=examples, inputs=[file_input], label="Example Audio Files (Click to Load)")
+            file_transcribe_btn = gr.Button("Transcribe Uploaded File", variant="primary")
+        with gr.TabItem("Microphone"):
+            mic_input = gr.Audio(sources=["microphone"], type="filepath", label="Record Audio")
+            mic_transcribe_btn = gr.Button("Transcribe Microphone Input", variant="primary")
+    gr.Markdown("---")
+    gr.Markdown("<p><strong style='color: #FF0000; font-size: 1.2em;'>Transcription Results</strong></p>")
+    # Download buttons start hidden. They are outputs of the transcription
+    # handler, which presumably reveals each one once its file exists.
+    download_btn = gr.DownloadButton(label="Download Segment Transcript (CSV)", visible=False)
+    srt_btn = gr.DownloadButton(label="Download SRT", visible=False)
+    vtt_btn = gr.DownloadButton(label="Download VTT", visible=False)
+    json_btn = gr.DownloadButton(label="Download JSON", visible=False)
+    lrc_btn = gr.DownloadButton(label="Download LRC", visible=False)
+    # Result tabs: segment-level table with a clip player, and a word-level table.
+    with gr.Tabs():
+        with gr.TabItem("Segment View (Click row to play segment)"):
+            vis_timestamps_df = gr.DataFrame(
+                headers=["Start (s)", "End (s)", "Segment"],
+                datatype=["number", "number", "str"],
+                wrap=True,
+            )
+            selected_segment_player = gr.Audio(label="Selected Segment", interactive=False)
+        with gr.TabItem("Word View"):
+            word_vis_df = gr.DataFrame(
+                headers=["Start (s)", "End (s)", "Word"],
+                datatype=["number", "number", "str"],
+                wrap=False,
+            )
+    # Both transcribe buttons call the same handler with the same outputs; they
+    # differ only in the input widget and the exposed API endpoint name.
+    mic_transcribe_btn.click(
+        fn=get_transcripts_and_raw_times,
+        inputs=[mic_input, session_dir_state],
+        outputs=[vis_timestamps_df, raw_timestamps_list_state, word_vis_df, current_audio_path_state, download_btn, srt_btn, vtt_btn, json_btn, lrc_btn],
+        api_name="transcribe_mic"
+    )
+    file_transcribe_btn.click(
+        fn=get_transcripts_and_raw_times,
+        inputs=[file_input, session_dir_state],
+        outputs=[vis_timestamps_df, raw_timestamps_list_state, word_vis_df, current_audio_path_state, download_btn, srt_btn, vtt_btn, json_btn, lrc_btn],
+        api_name="transcribe_file"
+    )
+    # Selecting a cell in the segment table calls play_segment with the stored
+    # timestamps and audio path; its result is loaded into the segment player.
+    vis_timestamps_df.select(
+        fn=play_segment,
+        inputs=[raw_timestamps_list_state, current_audio_path_state],
+        outputs=[selected_segment_player],
+    )
+    # Run end_session when the session unloads.
+    # NOTE(review): end_session is defined elsewhere; presumably it removes the
+    # session's temporary files — confirm.
+    demo.unload(end_session)
+# Script entry point: enable Gradio's request queue, then start the server.
+if __name__ == "__main__":
+    print("Launching Gradio Demo...")
+    demo.queue()
+    demo.launch()

requirements.txt CHANGED Viewed

@@ -1,4 +1,6 @@
 Cython
 git+https://github.com/NVIDIA/NeMo.git@main#egg=nemo_toolkit[asr]
 numpy<2.0
-pydub

 Cython
 git+https://github.com/NVIDIA/NeMo.git@main#egg=nemo_toolkit[asr]
 numpy<2.0
+pydub
+gradio
+spaces