Commit af69235
Parent(s): 29074da

Changed the GPU timeout from 300 seconds to 60 seconds. Imported gradio_client.file and wrapped chunk_path with gradio_file(). Improved audio file preprocessing by adding mono conversion and resampling. Created a new transcript file.

Files changed:
- app.py +1 -1
- local_controller.py +12 -13
- transcribe_cli.py +116 -70
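
The gradio_file() change is the heart of this commit: recent gradio_client versions generally expect file inputs to be wrapped with file() so the client uploads the file rather than passing a bare path string to the Space. A minimal sketch of the calling pattern, with a placeholder Space ID and endpoint name (this commit's local_controller.py uses fn_index instead of api_name):

from gradio_client import Client, file as gradio_file

client = Client("user/space-name")    # placeholder Space ID
result = client.predict(
    gradio_file("chunk_000.wav"),     # wrap the local path so the file is uploaded
    api_name="/predict",              # placeholder endpoint; fn_index=... also works
)
print(result)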
app.py
CHANGED

@@ -262,7 +262,7 @@ def transcribe_audio_core(
     print("CUDA cache cleared.")
     print("Transcription cleanup finished.")

-@spaces.GPU(duration=300)
+@spaces.GPU(duration=60)  # request GPU resources, with the timeout set to 60 seconds
 def process_audio_file(audio_filepath: str) -> dict:  # Gradio passes in the path of a temporary file
     """
     Process the uploaded audio file and return the transcription result as JSON.
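
The duration argument above comes from Hugging Face's ZeroGPU spaces package: it caps how long a single decorated call may hold the GPU, and calls that run longer can be cut off. A minimal sketch of the pattern, assuming the spaces package is available (it is provided on Spaces hardware); the shorter 60-second cap is workable here because audio is now sent in small chunks:

import spaces  # Hugging Face ZeroGPU helper, assumed available on Spaces

@spaces.GPU(duration=60)  # the GPU is requested per call and held for at most ~60 s
def transcribe(audio_filepath: str) -> dict:
    # ... load the ASR model and transcribe one chunk here (illustrative stub) ...
    return {"text": ""}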
local_controller.py
CHANGED

@@ -10,7 +10,7 @@ import shutil
 import subprocess

 try:
-    from gradio_client import Client
+    from gradio_client import Client, file as gradio_file  # also import gradio_client.file
     GRADIO_CLIENT_AVAILABLE = True
 except ImportError:
     GRADIO_CLIENT_AVAILABLE = False

@@ -137,12 +137,11 @@ def process_chunk(chunk_path: str) -> Optional[Dict]:
         return None

     try:
-        from gradio_client import Client
+        # from gradio_client import Client  # Client is already imported above, so this is unnecessary
         import time

         print(f"Connecting to Space: {SPACE_URL}")

-        # Try several connection methods
         client = None
         for attempt in range(3):
             try:

@@ -167,26 +166,26 @@ def process_chunk(chunk_path: str) -> Optional[Dict]:

         # Method 1: use fn_index=1 (for the change event)
         try:
-            result = client.predict(chunk_path, fn_index=1)
-            print("Successfully used fn_index=1")
+            result = client.predict(gradio_file(chunk_path), fn_index=1)  # wrap chunk_path with gradio_file()
+            print("Successfully used fn_index=1 with gradio_file()")
         except Exception as e:
-            print(f"fn_index=1 method failed: {e}")
+            print(f"fn_index=1 method with gradio_file() failed: {e}")

         # Method 2: use fn_index=0
         if result is None:
             try:
-                result = client.predict(chunk_path, fn_index=0)
-                print("Successfully used fn_index=0")
+                result = client.predict(gradio_file(chunk_path), fn_index=0)  # wrap chunk_path with gradio_file()
+                print("Successfully used fn_index=0 with gradio_file()")
             except Exception as e:
-                print(f"fn_index=0 method failed: {e}")
+                print(f"fn_index=0 method with gradio_file() failed: {e}")

         # Method 3: call with the default signature
         if result is None:
             try:
-                result = client.predict(chunk_path)
-                print("Successfully used default method")
+                result = client.predict(gradio_file(chunk_path))  # wrap chunk_path with gradio_file()
+                print("Successfully used default method with gradio_file()")
             except Exception as e:
-                print(f"Default method failed: {e}")
+                print(f"Default method with gradio_file() failed: {e}")

         if result is None:
             print("All API call methods failed")

@@ -324,4 +323,4 @@ def main():
     process_audio_file(args.input_path, args.output_dir)

 if __name__ == "__main__":
-    main()
+    main()
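
The connection loop above (for attempt in range(3)) is shown truncated in this view; it retries the Space connection before giving up. A sketch of its likely shape, assuming gradio_client and a SPACE_URL constant as in this file; the backoff interval is illustrative, not taken from the source:

import time
from gradio_client import Client

SPACE_URL = "user/space-name"  # placeholder

client = None
for attempt in range(3):
    try:
        client = Client(SPACE_URL)
        break  # connected
    except Exception as e:
        print(f"Connection attempt {attempt + 1}/3 failed: {e}")
        time.sleep(5 * (attempt + 1))  # illustrative linear backoff
if client is None:
    raise RuntimeError("Could not connect to the Space")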
transcribe_cli.py
CHANGED

@@ -70,7 +70,20 @@ def preprocess_audio_cli(audio_path_str: str, output_dir_for_temp_files: str) ->
     # For files of 4GB or more, or longer than VERY_LONG_AUDIO_THRESHOLD_SECONDS, chunk directly with ffmpeg
     if file_size > 4 * 1024**3 or duration_sec > VERY_LONG_AUDIO_THRESHOLD_SECONDS:
         print(f" Large file ({file_size_gb:.2f}GB, {duration_sec/3600:.2f} hours); splitting into chunks directly with ffmpeg.")
-
+        # Convert large files to mono as well
+        temp_mono_path = Path(output_dir_for_temp_files) / f"{audio_name_stem}_mono_temp.wav"
+        try:
+            cmd = [
+                'ffmpeg', '-y', '-i', audio_path_str,
+                '-ac', '1',  # convert to mono
+                '-ar', str(TARGET_SAMPLE_RATE),  # set the sample rate
+                str(temp_mono_path)
+            ]
+            subprocess.run(cmd, capture_output=True, check=True)
+            return temp_mono_path.as_posix(), f"{original_path_name} (large file, mono)", duration_sec
+        except subprocess.CalledProcessError as e:
+            print(f" ffmpeg mono conversion failed: {e}")
+            return audio_path_str, f"{original_path_name} (large file)", duration_sec

     # Files under 4GB go through the existing pydub path
     try:

@@ -82,6 +95,62 @@ def preprocess_audio_cli(audio_path_str: str, output_dir_for_temp_files: str) ->
         else:
             raise pydub_e

+        resampled = False
+        mono_converted = False
+
+        # Resampling
+        if audio.frame_rate != TARGET_SAMPLE_RATE:
+            try:
+                print(f" Resampling: {audio.frame_rate}Hz -> {TARGET_SAMPLE_RATE}Hz")
+                audio = audio.set_frame_rate(TARGET_SAMPLE_RATE)
+                resampled = True
+            except Exception as resample_e:
+                print(f"Error: failed to resample the audio: {resample_e}")
+                return None, None, None
+
+        # Mono conversion
+        if audio.channels > 1:
+            try:
+                print(f" Converting to mono ({audio.channels}ch -> 1ch)")
+                audio = audio.set_channels(1)
+                mono_converted = True
+            except Exception as mono_e:
+                print(f"Error: failed to convert the audio to mono: {mono_e}")
+                return None, None, None
+        elif audio.channels == 1:
+            print(" The audio is already mono.")
+
+        processed_temp_file_path_obj = None
+        # If any preprocessing was applied, save the result to a temporary file
+        if resampled or mono_converted:
+            try:
+                # Strip special characters from the file name to build a safer name
+                import re
+                safe_stem = re.sub(r'[^\w\-_\.]', '_', audio_name_stem)
+                temp_suffix = "_preprocessed_temp.wav"
+                processed_temp_file_path_obj = Path(output_dir_for_temp_files, f"{safe_stem}{temp_suffix}")
+
+                print(f" Temporary file for preprocessed audio: {processed_temp_file_path_obj.name}")
+                audio.export(processed_temp_file_path_obj, format="wav")
+
+                path_for_transcription = processed_temp_file_path_obj.as_posix()
+                display_name_for_info = f"{original_path_name} (preprocessed)"
+            except Exception as export_e:
+                print(f"Error: failed to export the preprocessed audio: {export_e}")
+                if processed_temp_file_path_obj and processed_temp_file_path_obj.exists():
+                    try:
+                        os.remove(processed_temp_file_path_obj)
+                    except OSError:
+                        pass
+                return None, None, None
+        else:
+            # No preprocessing was needed
+            print(" No preprocessing was needed; using the original file.")
+            path_for_transcription = audio_path_str
+            display_name_for_info = original_path_name
+
+        return path_for_transcription, display_name_for_info, duration_sec
+
     except FileNotFoundError:
         print(f"Error: audio file not found: {audio_path_str}")
         return None, None, None

@@ -89,65 +158,6 @@ def preprocess_audio_cli(audio_path_str: str, output_dir_for_temp_files: str) ->
         print(f"Error: failed to load/decode audio file '{original_path_name}': {load_e}")
         return None, None, None

-    resampled = False
-    mono_converted = False
-
-    # Resampling
-    if audio.frame_rate != TARGET_SAMPLE_RATE:
-        try:
-            print(f" Resampling: {audio.frame_rate}Hz -> {TARGET_SAMPLE_RATE}Hz")
-            audio = audio.set_frame_rate(TARGET_SAMPLE_RATE)
-            resampled = True
-        except Exception as resample_e:
-            print(f"Error: failed to resample the audio: {resample_e}")
-            return None, None, None
-
-    # Mono conversion
-    if audio.channels == 2:
-        try:
-            print(" Converting to mono (2ch -> 1ch)")
-            audio = audio.set_channels(1)
-            mono_converted = True
-        except Exception as mono_e:
-            print(f"Error: failed to convert the audio to mono: {mono_e}")
-            return None, None, None
-    elif audio.channels > 2:
-        print(f"Error: the audio has {audio.channels} channels; only 1ch (mono) or 2ch (stereo) is supported.")
-        return None, None, None
-    elif audio.channels == 1:
-        print(" The audio is already mono.")
-
-    processed_temp_file_path_obj = None
-    # If any preprocessing was applied, save the result to a temporary file
-    if resampled or mono_converted:
-        try:
-            # Strip special characters from the file name to build a safer name
-            import re
-            safe_stem = re.sub(r'[^\w\-_\.]', '_', audio_name_stem)
-            temp_suffix = "_preprocessed_temp.wav"
-            processed_temp_file_path_obj = Path(output_dir_for_temp_files, f"{safe_stem}{temp_suffix}")
-
-            print(f" Temporary file for preprocessed audio: {processed_temp_file_path_obj.name}")
-            audio.export(processed_temp_file_path_obj, format="wav")
-
-            path_for_transcription = processed_temp_file_path_obj.as_posix()
-            display_name_for_info = f"{original_path_name} (preprocessed)"
-        except Exception as export_e:
-            print(f"Error: failed to export the preprocessed audio: {export_e}")
-            if processed_temp_file_path_obj and processed_temp_file_path_obj.exists():
-                try:
-                    os.remove(processed_temp_file_path_obj)
-                except OSError:
-                    pass
-            return None, None, None
-    else:
-        # No preprocessing was needed
-        print(" No preprocessing was needed; using the original file.")
-        path_for_transcription = audio_path_str
-        display_name_for_info = original_path_name
-
-    return path_for_transcription, display_name_for_info, duration_sec
-
 def get_audio_duration_with_ffprobe(audio_path_str: str) -> Optional[float]:
     """Get the audio file's duration using ffprobe (no 4GB limit)"""
     try:
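
The body of get_audio_duration_with_ffprobe is cut off above at try:. For reference, a minimal sketch of how such a helper is commonly written, assuming ffprobe is on PATH; this is an illustration, not the file's actual implementation:

import subprocess
from typing import Optional

def get_audio_duration_with_ffprobe(audio_path_str: str) -> Optional[float]:
    """Illustrative sketch: query the container duration via ffprobe."""
    try:
        out = subprocess.run(
            ["ffprobe", "-v", "error",
             "-show_entries", "format=duration",
             "-of", "default=noprint_wrappers=1:nokey=1",
             audio_path_str],
            capture_output=True, text=True, check=True,
        )
        return float(out.stdout.strip())
    except (subprocess.CalledProcessError, ValueError):
        return None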
@@ -750,42 +760,78 @@ def process_single_file(
         all_word_vis_data_merged: List[List[str]] = []
         current_global_time_offset_sec = 0.0
         last_global_segment_end_time_sec = 0.0
+
+        # Clear GPU memory before chunk processing
+        if device_to_use == 'cuda':
+            torch.cuda.empty_cache()
+            gc.collect()
+            print(f" Initial GPU memory usage: {torch.cuda.memory_allocated() / 1024**2:.1f}MB")
+
         for i, chunk_file_path_str in enumerate(temp_chunk_file_paths_str_list):
             print(f" Processing chunk {i+1}/{len(temp_chunk_file_paths_str_list)} ({Path(chunk_file_path_str).name})...")
             try:
+                # Clear GPU memory before each chunk
+                if device_to_use == 'cuda':
+                    torch.cuda.empty_cache()
+                    gc.collect()
+                    print(f" GPU memory usage before this chunk: {torch.cuda.memory_allocated() / 1024**2:.1f}MB")
+
                 estimated_chunk_duration_for_asr_settings = CHUNK_LENGTH_SECONDS + CHUNK_OVERLAP_SECONDS
                 vis_data_chunk, _, word_vis_data_chunk = transcribe_audio_cli(
                     chunk_file_path_str, asr_model_instance,
                     estimated_chunk_duration_for_asr_settings, device_to_use
                 )
+
+                # Check GPU memory usage after the chunk
+                if device_to_use == 'cuda':
+                    print(f" GPU memory usage after this chunk: {torch.cuda.memory_allocated() / 1024**2:.1f}MB")
+
                 if not vis_data_chunk:
                     print(f" Warning: transcription of chunk {Path(chunk_file_path_str).name} failed; skipping.")
                     current_global_time_offset_sec += CHUNK_LENGTH_SECONDS - (CHUNK_OVERLAP_SECONDS if i < len(temp_chunk_file_paths_str_list) - 1 else 0)
                     continue
+
+                # Merge the data
                 for seg_row_list in vis_data_chunk:
-                    s_local_sec = float(seg_row_list[0])
-
-
+                    s_local_sec = float(seg_row_list[0])
+                    e_local_sec = float(seg_row_list[1])
+                    text_seg = seg_row_list[2]
+                    s_global_sec = s_local_sec + current_global_time_offset_sec
+                    e_global_sec = e_local_sec + current_global_time_offset_sec
+                    if s_global_sec >= last_global_segment_end_time_sec - 0.1:
                         all_vis_data_merged.append([f"{s_global_sec:.2f}", f"{e_global_sec:.2f}", text_seg])
                         last_global_segment_end_time_sec = max(last_global_segment_end_time_sec, e_global_sec)
+
                 temp_last_word_global_end_time_sec = float(all_word_vis_data_merged[-1][1]) if all_word_vis_data_merged else 0.0
                 if word_vis_data_chunk:
                     for word_row_list in word_vis_data_chunk:
-                        w_s_local_sec = float(word_row_list[0])
-
+                        w_s_local_sec = float(word_row_list[0])
+                        w_e_local_sec = float(word_row_list[1])
+                        text_word = word_row_list[2]
+                        w_s_global_sec = w_s_local_sec + current_global_time_offset_sec
+                        w_e_global_sec = w_e_local_sec + current_global_time_offset_sec
                         if w_s_global_sec >= temp_last_word_global_end_time_sec - 0.05:
-
-
+                            all_word_vis_data_merged.append([f"{w_s_global_sec:.2f}", f"{w_e_global_sec:.2f}", text_word])
+                            temp_last_word_global_end_time_sec = max(temp_last_word_global_end_time_sec, w_e_global_sec)
+
                 if i < len(temp_chunk_file_paths_str_list) - 1:
                     current_global_time_offset_sec += (CHUNK_LENGTH_SECONDS - CHUNK_OVERLAP_SECONDS)
-
+
+                # Clear GPU memory after the chunk
+                if device_to_use == 'cuda':
+                    torch.cuda.empty_cache()
+                    gc.collect()
+                    print(f" GPU memory usage after cleanup: {torch.cuda.memory_allocated() / 1024**2:.1f}MB")
+
+            except Exception as chunk_proc_e:
                 print(f" Error while processing chunk {Path(chunk_file_path_str).name}: {chunk_proc_e}")
-                if i < len(temp_chunk_file_paths_str_list) - 1:
+                if i < len(temp_chunk_file_paths_str_list) - 1:
                     current_global_time_offset_sec += (CHUNK_LENGTH_SECONDS - CHUNK_OVERLAP_SECONDS)
+
         final_vis_data = all_vis_data_merged
         final_word_vis_data = all_word_vis_data_merged
         if not final_vis_data:
-
+            raise Exception("No valid transcription data was obtained after chunk processing.")
         else:
             vis_data_single, _, word_vis_data_single = transcribe_audio_cli(
                 processed_path_for_asr, asr_model_instance, actual_audio_duration_sec, device_to_use
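
For clarity on the merge loop above: chunk-local timestamps are converted to global ones by adding an offset that advances by CHUNK_LENGTH_SECONDS - CHUNK_OVERLAP_SECONDS after every chunk except the last, so overlapping audio is not double-counted. A worked example with illustrative values for the two constants (the source does not show their actual values):

CHUNK_LENGTH_SECONDS = 600.0   # assumed value, for illustration only
CHUNK_OVERLAP_SECONDS = 5.0    # assumed value, for illustration only

offset = 0.0
for i in range(3):
    # a segment starting 12.0 s into chunk i maps to this global time:
    print(f"chunk {i}: local 12.0s -> global {12.0 + offset:.1f}s")
    offset += CHUNK_LENGTH_SECONDS - CHUNK_OVERLAP_SECONDS
# prints: chunk 0 -> 12.0s, chunk 1 -> 607.0s, chunk 2 -> 1202.0s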
|