Commit e9f8271 by laloadrianmorales
Parent: e50eb7e

update space

Files changed (2):
  1. app.py +51 -79
  2. requirements.txt +3 -5
app.py CHANGED

```diff
@@ -1,102 +1,76 @@
 import gradio as gr
-import torch
 import os
 import numpy as np
-from huggingface_hub import login, HfApi, InferenceClient
-
-# Set environment variables for Hugging Face API
-os.environ["TOKENIZERS_PARALLELISM"] = "false"
-
-# Get API token from Hugging Face Spaces environment
-# This will use the HF_TOKEN that's automatically available in your private Space
-hf_token = os.environ.get("HF_TOKEN")
-
-# Login to Hugging Face Hub
-if hf_token:
-    login(token=hf_token, add_to_git_credential=False)
-    print("Successfully logged in to Hugging Face Hub")
-else:
-    print("Warning: No HF_TOKEN found. API access may be limited.")
-
-# Initialize the Inference Client with authentication
-client = InferenceClient(
-    model="nari-labs/Dia-1.6B",
-    token=hf_token
-)
-
-def generate_speech(
-    text,
-    audio_prompt=None,
-    seed=None
-):
-    """
-    Generate speech from text using Dia-1.6B model via Inference API
-
-    Args:
-        text (str): Input text to convert to speech
-        audio_prompt (file): Optional audio file for voice cloning
-        seed (int): Optional seed for reproducibility
-
-    Returns:
-        tuple: Audio sample rate (44100) and audio data
-    """
-    # Prepare payload for the API
-    payload = {
-        "inputs": text
-    }
-
-    # Add seed if provided
-    if seed is not None and seed > 0:
-        payload["parameters"] = {"seed": int(seed)}
-
-    # Handle audio prompt if provided
-    if audio_prompt is not None:
-        # For audio prompt, we'd need to handle file upload and include it in the payload
-        # This may require a different approach depending on the API's capabilities
-        print(f"Audio prompt provided: {audio_prompt}")
-
-    # Call the inference API
-    try:
-        # Using audio_generation API endpoint
-        audio_output = client.audio_generation(
-            text,
-            seed=int(seed) if seed and seed > 0 else None
-        )
-        return 44100, np.array(audio_output.audio)
-    except Exception as e:
-        # If there's an error, print it and return an error message
-        print(f"Error calling API: {str(e)}")
-        # Return a silent audio sample to avoid breaking the UI
-        return 44100, np.zeros(1000)
-
-# Define examples
+import soundfile as sf
+import tempfile
+import requests
+from huggingface_hub import InferenceClient
+
+# Simple minimal app to demonstrate Nari Labs Dia-1.6B TTS model
+class SimpleInference:
+    def __init__(self):
+        # Get API token from environment (in HF Spaces, this is automatically provided)
+        self.hf_token = os.environ.get("HF_TOKEN")
+        self.api = InferenceClient(model="nari-labs/Dia-1.6B", token=self.hf_token)
+
+    def generate_audio(self, text, seed=None):
+        """Generate audio from text using the Dia model"""
+        try:
+            # Create payload for the API
+            payload = {"text": text}
+            if seed is not None and seed > 0:
+                payload["seed"] = int(seed)
+
+            # Call the Hugging Face Inference API
+            # API will return audio data
+            with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
+                temp_filename = f.name
+
+            # Call the API to generate audio
+            audio_output = self.api.audio_generation(
+                text,
+                seed=int(seed) if seed and seed > 0 else None
+            )
+
+            # Save the audio to a temporary file and read it back
+            sf.write(temp_filename, audio_output.audio, audio_output.sampling_rate)
+            audio_data, sample_rate = sf.read(temp_filename)
+
+            # Clean up the temporary file
+            os.unlink(temp_filename)
+
+            return (sample_rate, audio_data)
+        except Exception as e:
+            print(f"Error generating audio: {str(e)}")
+            # Return a silent audio sample to avoid breaking the UI
+            return (44100, np.zeros(1000))
+
+# Initialize the inference model
+model = SimpleInference()
+
+# Define example inputs
 examples = [
     ["[S1] Dia is an open weights text to dialogue model. [S2] You get full control over scripts and voices. [S1] Wow. Amazing. (laughs) [S2] Try it now on Git hub or Hugging Face."],
     ["[S1] The weather is so nice today! [S2] I know, it's perfect for a walk in the park."],
     ["[S1] Did you hear about the new text to speech model? [S2] Yes, it's called Dia and it's really impressive! [S1] (laughs) That's amazing! Can it do different voices? [S2] Absolutely, and you can even clone your own voice."]
 ]

-# Create Gradio Interface
+# Define the Gradio interface
 demo = gr.Interface(
-    fn=generate_speech,
+    fn=lambda text, seed=0: model.generate_audio(text, seed),
     inputs=[
         gr.Textbox(
             label="Text",
             placeholder="Enter text with [S1] and [S2] tags for different speakers, include (laughs), (coughs), etc. for non-verbal sounds",
             lines=5
         ),
-        gr.Audio(
-            label="Audio Prompt (Optional)",
-            type="filepath",
-            value=None
-        ),
         gr.Number(
             label="Seed (Optional)",
             precision=0,
             value=0
         )
     ],
-    outputs=gr.Audio(label="Generated Speech", type="numpy"),
+    outputs=gr.Audio(label="Generated Speech"),
     title="Nari Labs Dia-1.6B Text-to-Speech",
     description="""
     # Dia 1.6B Text-to-Speech Model
@@ -106,12 +80,10 @@ demo = gr.Interface(
     ## Features:
     - Generate dialogue using [S1] and [S2] tags for different speakers
     - Include non-verbal communication like (laughs), (coughs), etc.
-    - Voice cloning with audio prompt upload

     ## Usage Tips:
     - Use [S1] and [S2] tags to indicate different speakers
     - Include non-verbal sounds in parentheses: (laughs), (clears throat), (sighs), etc.
-    - For voice cloning, upload an audio sample and include its transcript before your script
     - Set a seed value for consistent voices across multiple generations
     """,
     examples=examples,
```
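One caveat on the new `generate_audio` path: as far as the documented `huggingface_hub` API goes, `InferenceClient` exposes `text_to_speech()` for TTS models, returning the generated audio as raw encoded bytes; an `audio_generation()` method with `.audio`/`.sampling_rate` attributes does not appear in that API, so the committed call would likely raise and fall through to the silent fallback. Below is a minimal sketch of the same generate-and-decode step using `text_to_speech`, decoding in memory so the temp-file round trip is unnecessary. Seed forwarding is omitted because whether the hosted endpoint accepts a seed parameter is an assumption this sketch does not verify.

```python
import io
import os

import numpy as np
import soundfile as sf
from huggingface_hub import InferenceClient

client = InferenceClient(model="nari-labs/Dia-1.6B", token=os.environ.get("HF_TOKEN"))

def generate_audio(text):
    """Return (sample_rate, samples), the shape Gradio's Audio output expects."""
    try:
        # text_to_speech returns raw encoded audio bytes (e.g. FLAC/WAV),
        # which soundfile can decode straight from an in-memory buffer.
        audio_bytes = client.text_to_speech(text)
        samples, sample_rate = sf.read(io.BytesIO(audio_bytes))
        return sample_rate, samples
    except Exception as e:
        print(f"Error generating audio: {e}")
        # Silent fallback so the Gradio UI keeps working.
        return 44100, np.zeros(1000)
```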
requirements.txt CHANGED

```diff
@@ -1,7 +1,5 @@
 gradio>=5.0.1
-huggingface_hub>=0.25.2
-transformers
-torch>=2.0.0
+huggingface_hub>=0.21.0
+numpy>=1.20.0
 soundfile
-accelerate
-git+https://github.com/nari-labs/dia.git
+requests
```
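With inference delegated to the hosted API, the local-inference stack (`torch`, `transformers`, `accelerate`, and the `git+...dia.git` package) drops out, leaving only client-side dependencies. A hypothetical import smoke test for the trimmed set, run after `pip install -r requirements.txt`:

```python
# Verifies the five remaining dependencies resolve and import cleanly.
import gradio
import huggingface_hub
import numpy
import requests
import soundfile

print("gradio", gradio.__version__)
print("huggingface_hub", huggingface_hub.__version__)
```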