"""Minimal Gradio app demonstrating the Nari Labs Dia-1.6B text-to-dialogue model."""

import gradio as gr
import os
import numpy as np
import soundfile as sf
import tempfile
import requests
from huggingface_hub import InferenceClient


class SimpleInference:
    """Thin wrapper around the Hugging Face Inference API for Dia-1.6B."""

    def __init__(self):
        # In HF Spaces HF_TOKEN is injected automatically; locally it may be unset,
        # in which case the client falls back to anonymous (rate-limited) access.
        self.hf_token = os.environ.get("HF_TOKEN")
        self.api = InferenceClient(model="nari-labs/Dia-1.6B", token=self.hf_token)

    def generate_audio(self, text, seed=None):
        """Generate speech audio for *text* via the Dia model.

        Args:
            text: Script with optional [S1]/[S2] speaker tags and (laughs)-style cues.
            seed: Optional positive int for reproducible voices; 0/None means random.

        Returns:
            (sample_rate, np.ndarray) tuple, the format gr.Audio expects.
            On any failure a short silent clip is returned so the UI keeps working.
        """
        temp_filename = None
        try:
            # Only forward a positive seed; the UI default of 0 means "random".
            effective_seed = int(seed) if seed and seed > 0 else None

            # InferenceClient exposes text_to_speech(), which returns the raw
            # encoded audio bytes (the original called a nonexistent
            # audio_generation() and assumed a decoded-audio return object).
            # NOTE(review): seed forwarding assumes the endpoint accepts a
            # `seed` generation parameter — confirm against the deployed API.
            extra = {"seed": effective_seed} if effective_seed is not None else {}
            audio_bytes = self.api.text_to_speech(text, **extra)

            # Round-trip through a temp file so soundfile can decode the
            # container format into a (data, sample_rate) pair.
            with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
                temp_filename = f.name
                f.write(audio_bytes)

            audio_data, sample_rate = sf.read(temp_filename)
            return (sample_rate, audio_data)
        except Exception as e:
            print(f"Error generating audio: {str(e)}")
            # Return a silent audio sample to avoid breaking the UI
            return (44100, np.zeros(1000))
        finally:
            # Always remove the temp file, even when generation fails mid-way
            # (the original leaked it on any error after creation).
            if temp_filename and os.path.exists(temp_filename):
                os.unlink(temp_filename)


# Initialize the inference model once at import time (shared across requests).
model = SimpleInference()

# Example scripts demonstrating speaker tags and non-verbal cues.
examples = [
    ["[S1] Dia is an open weights text to dialogue model. [S2] You get full control over scripts and voices. [S1] Wow. Amazing. (laughs) [S2] Try it now on Git hub or Hugging Face."],
    ["[S1] The weather is so nice today! [S2] I know, it's perfect for a walk in the park."],
    ["[S1] Did you hear about the new text to speech model? [S2] Yes, it's called Dia and it's really impressive! [S1] (laughs) That's amazing! Can it do different voices? \n[S2] Absolutely, and you can even clone your own voice."],
]

# Define the Gradio interface
demo = gr.Interface(
    fn=lambda text, seed=0: model.generate_audio(text, seed),
    inputs=[
        gr.Textbox(
            label="Text",
            placeholder="Enter text with [S1] and [S2] tags for different speakers, include (laughs), (coughs), etc. for non-verbal sounds",
            lines=5,
        ),
        gr.Number(
            label="Seed (Optional)",
            precision=0,
            value=0,
        ),
    ],
    outputs=gr.Audio(label="Generated Speech"),
    title="Nari Labs Dia-1.6B Text-to-Speech",
    description="""
    # Dia 1.6B Text-to-Speech Model

    Dia is a 1.6B parameter text-to-speech model created by Nari Labs that directly generates realistic dialogue from text.

    ## Features:
    - Generate dialogue using [S1] and [S2] tags for different speakers
    - Include non-verbal communication like (laughs), (coughs), etc.

    ## Usage Tips:
    - Use [S1] and [S2] tags to indicate different speakers
    - Include non-verbal sounds in parentheses: (laughs), (clears throat), (sighs), etc.
    - Set a seed value for consistent voices across multiple generations
    """,
    examples=examples,
    allow_flagging="never",
)

if __name__ == "__main__":
    demo.launch()