import gradio as gr
from transformers import pipeline
import edge_tts
import numpy as np
import tempfile
import os
# Load the speech-to-text and text-generation pipelines
stt = pipeline("automatic-speech-recognition", model="openai/whisper-small")
chatbot = pipeline("text-generation", model="HooshvareLab/gpt2-fa")
async def tts(text: str, voice: str = "fa-IR-FaridNeural") -> str:
    """Synthesize speech with edge-tts and return the path to an MP3 file."""
    communicate = edge_tts.Communicate(text, voice)
    audio_data = b""
    async for chunk in communicate.stream():
        if chunk["type"] == "audio":
            audio_data += chunk["data"]
    # edge-tts streams MP3 frames, not raw PCM, so the bytes cannot simply be
    # reinterpreted with np.frombuffer; write them to a file and let Gradio decode.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
        f.write(audio_data)
        return f.name
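# Quick standalone check (a minimal sketch, run outside Gradio, which otherwise
# drives the coroutine itself); the Farsi sample sentence is arbitrary:
#   import asyncio
#   mp3_path = asyncio.run(tts("سلام، خوش آمدید"))  # "hello, welcome"
#   print("Synthesized audio written to", mp3_path)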
async def audio_to_audio(audio_input):
    if audio_input is None:
        raise gr.Error("No audio input received.")
    sample_rate_in, data_in = audio_input
    # Gradio delivers int16 samples; Whisper expects mono float32 in [-1, 1].
    if data_in.ndim > 1:
        data_in = data_in.mean(axis=1)
    audio = {"array": data_in.astype(np.float32) / 32768.0, "sampling_rate": sample_rate_in}
    # 1. ASR → text
    text = stt(audio)["text"]
    # 2. Generate a response; return_full_text=False drops the echoed prompt
    #    so the TTS step does not read the user's words back first
    response = chatbot(text, max_new_tokens=50, num_return_sequences=1,
                       return_full_text=False)[0]["generated_text"]
    # 3. TTS → path to the synthesized MP3
    return await tts(response)
# Gradio interface
demo = gr.Interface(
    fn=audio_to_audio,
    inputs=gr.Audio(
        sources=["microphone"],  # 'sources' replaces the deprecated 'source' argument
        type="numpy",
        label="Speak in Farsi"
    ),
    # tts() returns a file path, so the output component expects a filepath
    outputs=gr.Audio(type="filepath", label="Response in Farsi"),
    title="Farsi Audio Chatbot",
    description="Speak in Farsi, and the app will respond in Farsi audio.",
    allow_flagging="never"
)
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=int(os.environ.get("PORT", 7860))
    )
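For the Space to build, a requirements.txt along these lines is assumed (package names are the standard PyPI ones; torchaudio is included because the transformers ASR pipeline uses it to resample microphone audio to Whisper's 16 kHz, and exact version pins are left to the reader):

gradio
transformers
torch
torchaudio
edge-tts
numpy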