import gradio as gr
import edge_tts
import asyncio
import tempfile
import nltk
import srt
from pydub import AudioSegment, silence
import datetime
import nest_asyncio

# Sentence tokenizer data; newer NLTK releases also need "punkt_tab".
nltk.download("punkt")
nltk.download("punkt_tab")

# Allow asyncio.run() inside Gradio's already-running event loop.
nest_asyncio.apply()


async def text_to_speech(text, voice, rate, pitch):
    """Synthesize `text` with edge-tts and return (audio_path, srt_path, status)."""
    if not text.strip():
        return None, None, "Please enter some text."
    if not voice:
        return None, None, "Please select a voice."

    # Dropdown entries look like "en-US-AriaNeural - en-US (Female)"; keep only the ShortName.
    voice_short = voice.split(" - ")[0]
    # Gradio sliders return floats; edge-tts expects strings like "+10%" / "-5Hz".
    rate_str = f"{int(rate):+d}%"
    pitch_str = f"{int(pitch):+d}Hz"

    communicate = edge_tts.Communicate(text, voice_short, rate=rate_str, pitch=pitch_str)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name
    await communicate.save(tmp_path)

    srt_path = generate_srt(tmp_path, text)
    return tmp_path, srt_path, ""


def generate_srt(audio_path, text):
    """Build an .srt file by aligning sentences to detected pauses in the audio."""
    audio = AudioSegment.from_file(audio_path)
    silences = silence.detect_silence(audio, min_silence_len=400, silence_thresh=audio.dBFS - 16)
    silences = [(start / 1000.0, end / 1000.0) for start, end in silences]  # ms -> seconds

    sentences = nltk.tokenize.sent_tokenize(text)
    subtitles = []
    last_time = 0.0
    for i, sentence in enumerate(sentences):
        if i < len(silences):
            # Each sentence runs from the end of the previous pause to the start of the next one.
            start = last_time
            end = silences[i][0]
            last_time = silences[i][1]
        else:
            # No more pauses detected: fall back to a fixed 2.5 s window per sentence.
            start = last_time
            end = start + 2.5
            last_time = end
        subtitles.append(srt.Subtitle(
            index=i + 1,
            start=datetime.timedelta(seconds=start),
            end=datetime.timedelta(seconds=end),
            content=sentence
        ))

    srt_data = srt.compose(subtitles)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".srt", mode="w", encoding="utf-8") as srt_file:
        srt_file.write(srt_data)
    return srt_file.name


async def tts_interface(text, voice, rate, pitch):
    return await text_to_speech(text, voice, rate, pitch)


# ⬇️ Create demo synchronously (run async functions in loop)
voices = asyncio.run(edge_tts.list_voices())
voice_dict = {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}

with gr.Blocks() as demo:
    gr.Markdown("# 🎙️ Text-to-Speech + Subtitle Generator")
    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(label="Input Text", lines=5)
            voice_dropdown = gr.Dropdown(choices=[""] + list(voice_dict.keys()), label="Select Voice")
            rate_slider = gr.Slider(minimum=-50, maximum=50, value=0, step=1, label="Speech Rate (%)")
            pitch_slider = gr.Slider(minimum=-20, maximum=20, value=0, step=1, label="Pitch (Hz)")
            generate_btn = gr.Button("🎧 Generate Audio + SRT")
        with gr.Column():
            audio_output = gr.Audio(label="Generated Audio", type="filepath")
            srt_output = gr.File(label="Download Subtitle (.srt)")
            message_output = gr.Textbox(label="Status", interactive=False)

    # nest_asyncio lets asyncio.run() work even though Gradio runs its own loop;
    # Gradio can also accept the async `tts_interface` directly as `fn`.
    generate_btn.click(
        fn=lambda text, voice, rate, pitch: asyncio.run(tts_interface(text, voice, rate, pitch)),
        inputs=[text_input, voice_dropdown, rate_slider, pitch_slider],
        outputs=[audio_output, srt_output, message_output]
    )
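
# Optional smoke test without the UI (a sketch, assuming network access for edge-tts
# and a valid voice name such as "en-US-AriaNeural"; not part of the original app flow).
# Uncomment to generate a sample clip plus its matching .srt file:
# audio_path, srt_path, status = asyncio.run(
#     text_to_speech("Hello there. This is a quick test.", "en-US-AriaNeural", 0, 0)
# )
# print(audio_path, srt_path, status)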
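
# Minimal launch block (an assumption: the script is run directly; a hosting platform
# that imports `demo` may handle launching itself).
if __name__ == "__main__":
    demo.launch()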