Spaces:

laloadrianmorales
/

nari-labs-Dia-1.6B

Running

App Files Files Community

laloadrianmorales committed on May 2

Commit

e9f8271

1 Parent(s): e50eb7e

update space

Browse files

Files changed (2) hide show

app.py +51 -79
requirements.txt +3 -5

app.py CHANGED Viewed

@@ -1,102 +1,76 @@
 import gradio as gr
-import torch
 import os
 import numpy as np
-from huggingface_hub import login, HfApi, InferenceClient
-# Set environment variables for Hugging Face API
-os.environ["TOKENIZERS_PARALLELISM"] = "false"
-# Get API token from Hugging Face Spaces environment
-# This will use the HF_TOKEN that's automatically available in your private Space
-hf_token = os.environ.get("HF_TOKEN")
-# Login to Hugging Face Hub
-if hf_token:
-    login(token=hf_token, add_to_git_credential=False)
-    print("Successfully logged in to Hugging Face Hub")
-else:
-    print("Warning: No HF_TOKEN found. API access may be limited.")
-# Initialize the Inference Client with authentication
-client = InferenceClient(
-    model="nari-labs/Dia-1.6B",
-    token=hf_token
-)
-def generate_speech(
-    text,
-    audio_prompt=None,
-    seed=None
-):
-    """
-    Generate speech from text using Dia-1.6B model via Inference API
-    Args:
-        text (str): Input text to convert to speech
-        audio_prompt (file): Optional audio file for voice cloning
-        seed (int): Optional seed for reproducibility
-    Returns:
-        tuple: Audio sample rate (44100) and audio data
-    """
-    # Prepare payload for the API
-    payload = {
-        "inputs": text
-    }
-    # Add seed if provided
-    if seed is not None and seed > 0:
-        payload["parameters"] = {"seed": int(seed)}
-    # Handle audio prompt if provided
-    if audio_prompt is not None:
-        # For audio prompt, we'd need to handle file upload and include it in the payload
-        # This may require a different approach depending on the API's capabilities
-        print(f"Audio prompt provided: {audio_prompt}")
-    # Call the inference API
-    try:
-        # Using audio_generation API endpoint
-        audio_output = client.audio_generation(
-            text,
-            seed=int(seed) if seed and seed > 0 else None
-        )
-        return 44100, np.array(audio_output.audio)
-    except Exception as e:
-        # If there's an error, print it and return an error message
-        print(f"Error calling API: {str(e)}")
-        # Return a silent audio sample to avoid breaking the UI
-        return 44100, np.zeros(1000)
-# Define examples
 examples = [
     ["[S1] Dia is an open weights text to dialogue model. [S2] You get full control over scripts and voices. [S1] Wow. Amazing. (laughs) [S2] Try it now on Git hub or Hugging Face."],
     ["[S1] The weather is so nice today! [S2] I know, it's perfect for a walk in the park."],
     ["[S1] Did you hear about the new text to speech model? [S2] Yes, it's called Dia and it's really impressive! [S1] (laughs) That's amazing! Can it do different voices? [S2] Absolutely, and you can even clone your own voice."]
 ]
-# Create Gradio Interface
 demo = gr.Interface(
-    fn=generate_speech,
     inputs=[
         gr.Textbox(
-            label="Text",
             placeholder="Enter text with [S1] and [S2] tags for different speakers, include (laughs), (coughs), etc. for non-verbal sounds",
             lines=5
         ),
-        gr.Audio(
-            label="Audio Prompt (Optional)",
-            type="filepath",
-            value=None
-        ),
         gr.Number(
-            label="Seed (Optional)",
-            precision=0,
             value=0
         )
     ],
-    outputs=gr.Audio(label="Generated Speech", type="numpy"),
     title="Nari Labs Dia-1.6B Text-to-Speech",
     description="""
     # Dia 1.6B Text-to-Speech Model
@@ -106,12 +80,10 @@ demo = gr.Interface(
     ## Features:
     - Generate dialogue using [S1] and [S2] tags for different speakers
     - Include non-verbal communication like (laughs), (coughs), etc.
-    - Voice cloning with audio prompt upload
     ## Usage Tips:
     - Use [S1] and [S2] tags to indicate different speakers
     - Include non-verbal sounds in parentheses: (laughs), (clears throat), (sighs), etc.
-    - For voice cloning, upload an audio sample and include its transcript before your script
     - Set a seed value for consistent voices across multiple generations
     """,
     examples=examples,

 import gradio as gr
 import os
 import numpy as np
+import soundfile as sf
+import tempfile
+import requests
+from huggingface_hub import InferenceClient
+# Minimal app demonstrating the Nari Labs Dia-1.6B TTS model
+class SimpleInference:
+    def __init__(self):
+        # Get API token from environment (in HF Spaces, this is automatically provided)
+        self.hf_token = os.environ.get("HF_TOKEN")
+        self.api = InferenceClient(model="nari-labs/Dia-1.6B", token=self.hf_token)
+    def generate_audio(self, text, seed=None):
+        """Generate audio from text using the Dia model"""
+        try:
+            # Create payload for the API
+            payload = {"text": text}
+            if seed is not None and seed > 0:
+                payload["seed"] = int(seed)
+            # Call the Hugging Face Inference API
+            # API will return audio data
+            with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
+                temp_filename = f.name
+            # Call the API to generate audio
+            audio_output = self.api.audio_generation(
+                text,
+                seed=int(seed) if seed and seed > 0 else None
+            )
+            # Save the audio to a temporary file and read it back
+            sf.write(temp_filename, audio_output.audio, audio_output.sampling_rate)
+            audio_data, sample_rate = sf.read(temp_filename)
+            # Clean up the temporary file
+            os.unlink(temp_filename)
+            return (sample_rate, audio_data)
+        except Exception as e:
+            print(f"Error generating audio: {str(e)}")
+            # Return a silent audio sample to avoid breaking the UI
+            return (44100, np.zeros(1000))
+# Initialize the inference model
+model = SimpleInference()
+# Define example inputs
 examples = [
     ["[S1] Dia is an open weights text to dialogue model. [S2] You get full control over scripts and voices. [S1] Wow. Amazing. (laughs) [S2] Try it now on Git hub or Hugging Face."],
     ["[S1] The weather is so nice today! [S2] I know, it's perfect for a walk in the park."],
     ["[S1] Did you hear about the new text to speech model? [S2] Yes, it's called Dia and it's really impressive! [S1] (laughs) That's amazing! Can it do different voices? [S2] Absolutely, and you can even clone your own voice."]
 ]
+# Define the Gradio interface
 demo = gr.Interface(
+    fn=lambda text, seed=0: model.generate_audio(text, seed),
     inputs=[
         gr.Textbox(
+            label="Text",
             placeholder="Enter text with [S1] and [S2] tags for different speakers, include (laughs), (coughs), etc. for non-verbal sounds",
             lines=5
         ),
         gr.Number(
+            label="Seed (Optional)",
+            precision=0,
             value=0
         )
     ],
+    outputs=gr.Audio(label="Generated Speech"),
     title="Nari Labs Dia-1.6B Text-to-Speech",
     description="""
     # Dia 1.6B Text-to-Speech Model
     ## Features:
     - Generate dialogue using [S1] and [S2] tags for different speakers
     - Include non-verbal communication like (laughs), (coughs), etc.
     ## Usage Tips:
     - Use [S1] and [S2] tags to indicate different speakers
     - Include non-verbal sounds in parentheses: (laughs), (clears throat), (sighs), etc.
     - Set a seed value for consistent voices across multiple generations
     """,
     examples=examples,

requirements.txt CHANGED Viewed

@@ -1,7 +1,5 @@
 gradio>=5.0.1
-huggingface_hub>=0.25.2
-transformers
-torch>=2.0.0
 soundfile
-accelerate
-git+https://github.com/nari-labs/dia.git

 gradio>=5.0.1
+huggingface_hub>=0.21.0
+numpy>=1.20.0
 soundfile
+requests