"""Minimal Gradio app demonstrating the Nari Labs Dia-1.6B text-to-dialogue model."""

import gradio as gr
import os
import numpy as np
import soundfile as sf
import tempfile
import requests
from huggingface_hub import InferenceClient


class SimpleInference:
    """Thin wrapper around the Hugging Face Inference API for Dia-1.6B."""

    def __init__(self):
        # In HF Spaces HF_TOKEN is injected automatically; locally it may be unset,
        # in which case the client falls back to anonymous (rate-limited) access.
        self.hf_token = os.environ.get("HF_TOKEN")
        self.api = InferenceClient(model="nari-labs/Dia-1.6B", token=self.hf_token)

    def generate_audio(self, text, seed=None):
        """Generate speech audio for *text* via the Dia model.

        Args:
            text: Script with optional [S1]/[S2] speaker tags and (laughs)-style cues.
            seed: Optional positive int for reproducible voices; 0/None means random.

        Returns:
            (sample_rate, np.ndarray) tuple, the format gr.Audio expects.
            On any failure a short silent clip is returned so the UI keeps working.
        """
        temp_filename = None
        try:
            # Only forward a positive seed; the UI default of 0 means "random".
            effective_seed = int(seed) if seed and seed > 0 else None

            # InferenceClient exposes text_to_speech(), which returns the raw
            # encoded audio bytes (the original called a nonexistent
            # audio_generation() and assumed a decoded-audio return object).
            # NOTE(review): seed forwarding assumes the endpoint accepts a
            # `seed` generation parameter — confirm against the deployed API.
            extra = {"seed": effective_seed} if effective_seed is not None else {}
            audio_bytes = self.api.text_to_speech(text, **extra)

            # Round-trip through a temp file so soundfile can decode the
            # container format into a (data, sample_rate) pair.
            with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
                temp_filename = f.name
                f.write(audio_bytes)

            audio_data, sample_rate = sf.read(temp_filename)
            return (sample_rate, audio_data)
        except Exception as e:
            print(f"Error generating audio: {str(e)}")
            # Return a silent audio sample to avoid breaking the UI
            return (44100, np.zeros(1000))
        finally:
            # Always remove the temp file, even when generation fails mid-way
            # (the original leaked it on any error after creation).
            if temp_filename and os.path.exists(temp_filename):
                os.unlink(temp_filename)


# Initialize the inference model once at import time (shared across requests).
model = SimpleInference()

# Example scripts demonstrating speaker tags and non-verbal cues.
examples = [
    ["[S1] Dia is an open weights text to dialogue model. [S2] You get full control over scripts and voices. [S1] Wow. Amazing. (laughs) [S2] Try it now on Git hub or Hugging Face."],
    ["[S1] The weather is so nice today! [S2] I know, it's perfect for a walk in the park."],
    ["[S1] Did you hear about the new text to speech model? [S2] Yes, it's called Dia and it's really impressive! [S1] (laughs) That's amazing! Can it do different voices? \n[S2] Absolutely, and you can even clone your own voice."],
]

# Define the Gradio interface
demo = gr.Interface(
    fn=lambda text, seed=0: model.generate_audio(text, seed),
    inputs=[
        gr.Textbox(
            label="Text",
            placeholder="Enter text with [S1] and [S2] tags for different speakers, include (laughs), (coughs), etc. for non-verbal sounds",
            lines=5,
        ),
        gr.Number(
            label="Seed (Optional)",
            precision=0,
            value=0,
        ),
    ],
    outputs=gr.Audio(label="Generated Speech"),
    title="Nari Labs Dia-1.6B Text-to-Speech",
    description="""
    # Dia 1.6B Text-to-Speech Model

    Dia is a 1.6B parameter text-to-speech model created by Nari Labs that directly generates realistic dialogue from text.

    ## Features:
    - Generate dialogue using [S1] and [S2] tags for different speakers
    - Include non-verbal communication like (laughs), (coughs), etc.

    ## Usage Tips:
    - Use [S1] and [S2] tags to indicate different speakers
    - Include non-verbal sounds in parentheses: (laughs), (clears throat), (sighs), etc.
    - Set a seed value for consistent voices across multiple generations
    """,
    examples=examples,
    allow_flagging="never",
)

if __name__ == "__main__":
    demo.launch()