from transformers import pipeline
import scipy.io.wavfile
import numpy as np

# Initialize the pipeline once at import time so repeated calls reuse the
# loaded model (the model load is the expensive step).
pipe = pipeline("text-to-speech", model="maya-research/Veena")


def text_to_speech_with_veena(input_text, output_filepath="doctor_voice.wav"):
    """Generate voice using Hugging Face Veena model and save as WAV.

    Runs the globally-initialized TTS pipeline on *input_text*, converts the
    resulting waveform to 16-bit PCM, and writes it to *output_filepath*.

    Args:
        input_text: The text to synthesize.
        output_filepath: Destination WAV path (default "doctor_voice.wav").

    Returns:
        The path the WAV file was written to (same as *output_filepath*).
    """
    result = pipe(input_text)
    audio = result["audio"]

    # Fix: fallback sample rate when the pipeline does not report one.
    sr = result.get("sampling_rate")
    if sr is None:
        sr = 16000  # default 16kHz

    # Ensure numpy array
    if not isinstance(audio, np.ndarray):
        audio = np.array(audio)

    # NOTE(review): HF TTS pipelines often return shape (1, n_samples);
    # scipy.io.wavfile.write treats a 2-D array as (n_samples, n_channels),
    # so a (1, N) array would be misread as N channels of one sample each.
    # Squeeze the unit dim — assumes mono output; confirm for Veena.
    audio = np.squeeze(audio)

    # Peak-normalize floats to int16. Guard the degenerate cases: an empty
    # array makes np.max raise, and a silent (all-zero) waveform would
    # divide by zero and fill the file with NaN-derived garbage.
    if audio.dtype != np.int16:
        peak = np.max(np.abs(audio)) if audio.size else 0.0
        if peak > 0:
            audio = (audio / peak * 32767).astype(np.int16)
        else:
            audio = np.zeros(audio.shape, dtype=np.int16)

    scipy.io.wavfile.write(output_filepath, rate=sr, data=audio)
    print(f"[DEBUG] Saved {output_filepath} with sample_rate={sr}, shape={audio.shape}")
    return output_filepath