from transformers import pipeline
import scipy.io.wavfile
import numpy as np

# Initialize the pipeline (load once globally)
pipe = pipeline("text-to-speech", model="maya-research/Veena")
| def text_to_speech_with_veena(input_text, output_filepath="doctor_voice.wav"): | |
| """Generate voice using Hugging Face Veena model and save as WAV""" | |
| result = pipe(input_text) | |
| audio = result["audio"] | |
| # Fix: fallback sample rate | |
| sr = result.get("sampling_rate") | |
| if sr is None: | |
| sr = 16000 # default 16kHz | |
| # Ensure numpy array | |
| if not isinstance(audio, np.ndarray): | |
| audio = np.array(audio) | |
| # Normalize to int16 if not already | |
| if audio.dtype != np.int16: | |
| audio = (audio / np.max(np.abs(audio)) * 32767).astype(np.int16) | |
| scipy.io.wavfile.write(output_filepath, rate=sr, data=audio) | |
| print(f"[DEBUG] Saved {output_filepath} with sample_rate={sr}, shape={audio.shape}") | |
| return output_filepath | |
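
# Minimal usage sketch (assumption: run as a script; the sample text and output
# filename below are illustrative, and the model weights download on first use).
if __name__ == "__main__":
    sample_text = "Hello, this is a reminder about your appointment tomorrow."
    path = text_to_speech_with_veena(sample_text, output_filepath="doctor_voice.wav")
    print(f"Generated audio saved to {path}")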