Gapeleon committed on
Commit 8c00aff · 1 Parent(s): 2759e04

Voice Cloning

Files changed (2):
  1. app.py +383 -133
  2. requirements.txt +1 -1
app.py CHANGED
@@ -1,10 +1,8 @@
 import os
 import subprocess
 import sys
-
 # Fix OMP_NUM_THREADS issue before any imports
 os.environ["OMP_NUM_THREADS"] = "4"
-
 # Install dependencies programmatically to avoid conflicts
 def setup_dependencies():
     try:
@@ -24,160 +22,412 @@ def setup_dependencies():

     except Exception as e:
         print(f"Dependencies setup error: {e}")
-
 # Run setup
 setup_dependencies()

 import spaces
-import gradio as gr
-from util import Config, NemoAudioPlayer, KaniModel
-import numpy as np
 import torch

-# Get HuggingFace token
-token_ = os.getenv('HF_TOKEN')
-
-# Model configurations
-models_configs = {
-    'base': Config(),
-    'female': Config(
-        model_name='nineninesix/kani-tts-450m-0.2-ft',
-    ),
-    'male': Config(
-        model_name='nineninesix/kani-tts-450m-0.1-ft',
-    )
-}

-# Global variables for models (loaded once)
-player = NemoAudioPlayer(Config())
-models = {}
-for model_name, config in models_configs.items():
-    print(f"Loading {model_name}...")
-    models[model_name] = KaniModel(config, player, token_)
-    print(f"{model_name} loaded!")
-print("All models loaded!")

-@spaces.GPU
-def generate_speech_gpu(text, model_choice, t, top_p, rp, max_tok):
     """
-    Generate speech from text using the selected model on GPU
     """

-    if not text.strip():
-        return None, "Please enter text for speech generation."

-    if not model_choice:
-        return None, "Please select a model."

     try:
-        # Check GPU availability
-        device = "cuda" if torch.cuda.is_available() else "cpu"
-        print(f"Using device: {device}")
-
-        # Get selected model
-        selected_model = models[model_choice]
-
-        # Generate audio
-        print(f"Generating speech with {model_choice}...")
-        audio, _, time_report = selected_model.run_model(text, t, top_p, rp, max_tok)
-
-        sample_rate = 22050
-        print("Speech generation completed!")
-
-        return (sample_rate, audio), time_report  #, f"✅ Audio generated successfully using {model_choice} on {device}"
-
     except Exception as e:
-        print(f"Error during generation: {str(e)}")
-        return None, f"❌ Error during generation: {str(e)}"

-# Create Gradio interface
-with gr.Blocks(title="😻 KaniTTS - Text to Speech", theme=gr.themes.Default()) as demo:
-    gr.Markdown("# 😻 KaniTTS: Fast and Expressive Speech Generation Model")
-    gr.Markdown("Select a model and enter text to generate emotional speech")

-    with gr.Row():
-        with gr.Column(scale=1):
-            model_dropdown = gr.Dropdown(
-                choices=list(models_configs.keys()),
-                value=list(models_configs.keys())[0],
-                label="Selected Model",
-                info="Base generates random voices"
-            )
-
-            text_input = gr.Textbox(
-                label="Text",
-                placeholder="Enter your text ...",
-                lines=3,
-                max_lines=10
-            )

-            with gr.Accordion("Settings", open=False):
-                temp = gr.Slider(
-                    minimum=0.1, maximum=1.5, value=1.4, step=0.05,
-                    label="Temp",
-                )
-                top_p = gr.Slider(
-                    minimum=0.1, maximum=1.0, value=0.95, step=0.05,
-                    label="Top P",
-                )
-                rp = gr.Slider(
-                    minimum=1.0, maximum=2.0, value=1.1, step=0.05,
-                    label="Repetition Penalty",
-                )
-                max_tok = gr.Slider(
-                    minimum=100, maximum=2000, value=1200, step=100,
-                    label="Max Tokens",
-                )
-
-            generate_btn = gr.Button("Run", variant="primary", size="lg")

-        with gr.Column(scale=1):
-            audio_output = gr.Audio(
-                label="Generated Audio",
-                type="numpy"
-            )
-
-            time_report_output = gr.Textbox(
-                label="Time Report",
-                interactive=False,
-                value="Ready to generate speech",
-                lines=3
             )
-
-    # GPU generation event
-    generate_btn.click(
-        fn=generate_speech_gpu,
-        inputs=[text_input, model_dropdown, temp, top_p, rp, max_tok],
-        outputs=[audio_output, time_report_output]
-    )
-
     with gr.Row():

-        examples = [
-            ["Anyway, um, so, um, tell me, tell me all about her. I mean, what's she like? Is she really, you know, pretty?", "male", 1.4, 0.95, 1.1, 1200],
-            ["No, that does not make you a failure. No, sweetie, no. It just, uh, it just means that you're having a tough time...", "male", 1.4, 0.95, 1.1, 1200],
-            ["I-- Oh, I am such an idiot sometimes. I'm so sorry. Um, I-I don't know where my head's at.", "male", 1.4, 0.95, 1.1, 1200],
-            ["Got it. $300,000. I can definitely help you get a very good price for your property by selecting a realtor.", "female", 1.4, 0.95, 1.1, 1200],
-            ["Holy fu- Oh my God! Don't you understand how dangerous it is?", "male", 1.4, 0.95, 1.1, 1200],
-            ["You make my days brighter, and my wildest dreams feel like reality. How do you do that?", "female", 1.4, 0.95, 1.1, 1200],
-            ["Great, and just a couple quick questions so we can match you with the right buyer. Is your home address still 330 East Charleston Road?", "base", 1.4, 0.95, 1.1, 1200],
-            ["Oh, yeah. I mean did you want to get a quick snack together or maybe something before you go?", "female", 1.4, 0.95, 1.1, 1200],
-        ]
-
-        gr.Examples(
-            examples=examples,
-            inputs=[text_input, model_dropdown, temp, top_p, rp, max_tok],
-            fn=generate_speech_gpu,
-            outputs=[audio_output, time_report_output],
-            cache_examples=True,
         )

 if __name__ == "__main__":
-    demo.launch(
-        server_name="0.0.0.0",
-        server_port=7860,
-        show_error=True
-    )
 import os
 import subprocess
 import sys
 # Fix OMP_NUM_THREADS issue before any imports
 os.environ["OMP_NUM_THREADS"] = "4"
 # Install dependencies programmatically to avoid conflicts
 def setup_dependencies():
     try:

     except Exception as e:
         print(f"Dependencies setup error: {e}")
 # Run setup
 setup_dependencies()

 import spaces
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 import torch
+import librosa
+import gradio as gr
+from nemo.collections.tts.models import AudioCodecModel
+import os
+import sys

+# Add the parent directory to sys.path to import kanitts
+sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+from kanitts import Config
+
+# Load configuration
+config = Config.default()
+
+# Load KaniTTS model and tokenizer
+kani_model_id = config.model.model_name
+
+tokenizer = AutoTokenizer.from_pretrained(
+    kani_model_id,
+    trust_remote_code=True,
+    use_fast=True
+)

+model = AutoModelForCausalLM.from_pretrained(
+    kani_model_id,
+    torch_dtype=torch.bfloat16,
+    device_map="cuda",
+    trust_remote_code=True,
+)
+model.eval()

+# Load Nemo codec
+nemo_model_id = config.audio.nemo_model_name
+nemo_codec = AudioCodecModel.from_pretrained(nemo_model_id).eval().cuda()

+# Load Whisper for transcription
+whisper_turbo_pipe = pipeline(
+    "automatic-speech-recognition",
+    model="openai/whisper-large-v3-turbo",
+    torch_dtype=torch.float16,
+    device='cuda',
+)
+
+# KaniTTS token IDs from config
+tokens = config.tokens
+SOH_ID = tokens.start_of_human
+EOH_ID = tokens.end_of_human
+SOA_ID = tokens.start_of_ai
+EOA_ID = tokens.end_of_ai
+SOT_ID = tokens.start_of_text
+EOT_ID = tokens.end_of_text
+SOS_ID = tokens.start_of_speech
+EOS_ID = tokens.end_of_speech
+
+def tokenize_audio(waveform, target_sample_rate=22050):
     """
+    Tokenize audio using the Nemo codec for KaniTTS.
     """
+    # Convert to mono if stereo
+    if waveform.shape[0] > 1:
+        waveform = waveform.mean(dim=0, keepdim=True)
+
+    # Resample if needed (simplified - in practice you'd use proper resampling)
+    waveform = waveform.to(dtype=torch.float32)

+    # Ensure we have the right shape: [batch, samples]
+    if waveform.dim() == 1:
+        waveform = waveform.unsqueeze(0)

+    waveform = waveform.to(nemo_codec.device)

+    # Calculate audio length in samples
+    audio_len = torch.tensor([waveform.shape[-1]], dtype=torch.int64).to(waveform.device)
+
+    # Encode audio to get token codes
+    with torch.inference_mode():
+        encoded_tokens, _ = nemo_codec.encode(audio=waveform, audio_len=audio_len)
+
+    # encoded_tokens shape: [batch, num_codebooks, sequence_length]
+    # For nemo-nano-codec: [1, 4, seq_len]
+    codes = encoded_tokens[0]  # Remove batch dimension -> [4, seq_len]
+    seq_len = codes.shape[1]
+
+    # Flatten the 4 codebook levels per frame (KaniTTS uses 4 tokens per frame)
+    all_codes = []
+
+    for i in range(seq_len):
+        # Extract one frame across all 4 codebook levels
+        for level in range(4):
+            token_id = codes[level, i].item()
+            # Add offset for each codebook level
+            offset_token = token_id + config.tokens.audio_tokens_start + (level * config.tokens.codebook_size)
+            all_codes.append(offset_token)
+
+    return all_codes
+
+def redistribute_codes(code_list):
+    """
+    Decode audio codes back to a waveform using the Nemo codec.
+    """
+    if len(code_list) % 4 != 0:
+        print(f"Warning: Code list length {len(code_list)} is not divisible by 4")
+        return None
+
+    num_frames = len(code_list) // 4
+    codebook_size = config.tokens.codebook_size
+
+    # Separate the 4 codebook levels
+    level_0 = []
+    level_1 = []
+    level_2 = []
+    level_3 = []
+
+    for i in range(num_frames):
+        # Extract each level and remove offsets
+        level_0.append((code_list[4*i] - config.tokens.audio_tokens_start) % codebook_size)
+        level_1.append((code_list[4*i + 1] - config.tokens.audio_tokens_start - codebook_size) % codebook_size)
+        level_2.append((code_list[4*i + 2] - config.tokens.audio_tokens_start - 2*codebook_size) % codebook_size)
+        level_3.append((code_list[4*i + 3] - config.tokens.audio_tokens_start - 3*codebook_size) % codebook_size)
+
+    # Convert to tensors in the format expected by Nemo: [batch, num_codebooks, sequence_length]
+    codes = torch.stack([
+        torch.tensor(level_0, dtype=torch.long),
+        torch.tensor(level_1, dtype=torch.long),
+        torch.tensor(level_2, dtype=torch.long),
+        torch.tensor(level_3, dtype=torch.long)
+    ]).unsqueeze(0)  # Add batch dimension
+
     try:
+        # Move to codec device
+        codes = codes.to(nemo_codec.device)
+
+        # Calculate length
+        tokens_len = torch.tensor([codes.shape[-1]], dtype=torch.int64).to(nemo_codec.device)
+
+        # Decode
+        with torch.no_grad():
+            audio_hat, _ = nemo_codec.decode(tokens=codes, tokens_len=tokens_len)
+
+        return audio_hat.cpu()
+
     except Exception as e:
+        print(f"Error decoding audio: {e}")
+        return None
+
+def transcribe_audio(sample_audio_path, progress=gr.Progress()):
+    """Transcribe uploaded audio using Whisper."""
+    if not sample_audio_path:
+        gr.Warning("Please upload an audio file first.")
+        return ""
+
+    try:
+        progress(0, 'Loading audio...')
+        audio_array, sample_rate = librosa.load(sample_audio_path, sr=config.audio.sample_rate)

+        # Trim audio to max 15 seconds for transcription
+        if len(audio_array) / sample_rate > 15:
+            num_samples_to_keep = int(sample_rate * 15)
+            audio_array = audio_array[:num_samples_to_keep]
+
+        progress(0.5, 'Transcribing...')
+        transcript = whisper_turbo_pipe(audio_array)['text'].strip()
+        progress(1, 'Transcription complete!')
+
+        return transcript
+    except Exception as e:
+        gr.Error(f"Transcription failed: {str(e)}")
+        return ""
+
+@spaces.GPU(duration=60)
+def infer(sample_audio_path, ref_transcript, target_text, temperature, top_p, repetition_penalty, progress=gr.Progress()):
+    if not target_text or not target_text.strip():
+        gr.Warning("Please input text to generate audio.")
+        return None
+
+    if len(target_text) > 500:
+        gr.Warning("Text is too long. Please keep it under 500 characters.")
+        target_text = target_text[:500]
+
+    target_text = target_text.strip()
+
+    if sample_audio_path and (not ref_transcript or not ref_transcript.strip()):
+        gr.Warning("Please provide a transcript for the reference audio or use the transcribe button.")
+        return None

+    with torch.no_grad():
+        if sample_audio_path and ref_transcript:
+            progress(0, 'Loading and trimming audio...')
+            audio_array, sample_rate = librosa.load(sample_audio_path, sr=config.audio.sample_rate)

+            # Trim audio to max 15 seconds
+            if len(audio_array) / sample_rate > 15:
+                gr.Warning("Trimming audio to the first 15 seconds.")
+                num_samples_to_keep = int(sample_rate * 15)
+                audio_array = audio_array[:num_samples_to_keep]

+            prompt_wav = torch.from_numpy(audio_array).unsqueeze(0)
+            prompt_wav = prompt_wav.to(dtype=torch.float32)
+
+            progress(0.4, 'Encoding reference audio...')
+
+            # Encode the prompt wav
+            voice_tokens = tokenize_audio(prompt_wav)
+
+            # Use the provided transcript instead of auto-transcribing
+            prompt_text = ref_transcript.strip()
+
+            progress(0.6, "Generating audio...")
+
+            # Tokenize target text
+            target_text_ids = tokenizer.encode(target_text, add_special_tokens=False)
+
+            # Create the complete sentence (reference + target)
+            complete_text = prompt_text + " " + target_text
+            complete_text_ids = tokenizer.encode(complete_text, add_special_tokens=False)
+
+            # Create prompt: human says the complete sentence, AI provides partial audio and continues
+            prompt_ids = (
+                [SOH_ID]
+                + complete_text_ids  # Full sentence as human input
+                + [EOT_ID]
+                + [EOH_ID]
+                + [SOA_ID]
+                + [SOS_ID]
+                + voice_tokens  # Audio only for the reference part
+                # Model should continue generating audio for the target part
             )
+        else:
+            # No reference audio case
+            prompt_ids = []
+            progress(0.6, "Generating audio...")
+
+            # Tokenize target text
+            target_text_ids = tokenizer.encode(target_text, add_special_tokens=False)
+
+            # Simple generation without reference
+            prompt_ids.extend([SOH_ID])
+            prompt_ids.extend(target_text_ids)
+            prompt_ids.extend([EOT_ID])
+            prompt_ids.extend([EOH_ID])
+            prompt_ids.extend([SOA_ID])
+            prompt_ids.extend([SOS_ID])
+
+        print(f"Prompt length: {len(prompt_ids)} tokens")
+
+        input_ids = torch.tensor([prompt_ids], dtype=torch.int64).cuda()
+
+        # Generate the speech autoregressively
+        outputs = model.generate(
+            input_ids,
+            max_new_tokens=config.model.max_new_tokens,
+            eos_token_id=EOS_ID,
+            do_sample=True,
+            top_p=top_p,
+            temperature=temperature,
+            repetition_penalty=repetition_penalty,
+            pad_token_id=config.tokens.pad_token,
+            use_cache=True,
+        )
+        generated_ids = outputs[0].tolist()
+        print(f"Generated {len(generated_ids)} total tokens")
+
+        progress(0.8, "Decoding generated audio...")
+
+        # Since the prompt ends with SOS_ID, the generated tokens should be audio tokens directly.
+        # Find where the input prompt ends and the generated tokens begin.
+        input_length = len(prompt_ids)
+        speech_tokens = generated_ids[input_length:]
+
+        print(f"Input prompt length: {input_length}, generated tokens: {len(speech_tokens)}")
+
+        # Remove end-of-speech token if present
+        if EOS_ID in speech_tokens:
+            speech_tokens = speech_tokens[:speech_tokens.index(EOS_ID)]
+
+        if not speech_tokens:
+            gr.Error("Audio generation failed: No speech tokens were generated.")
+            return None
+
+        # Filter out non-audio tokens
+        audio_tokens = [token for token in speech_tokens if token >= config.tokens.audio_tokens_start]
+
+        if not audio_tokens:
+            gr.Error("Audio generation failed: No valid audio tokens found.")
+            return None
+
+        print(f"Decoding {len(audio_tokens)} audio tokens")
+        gen_wav_tensor = redistribute_codes(audio_tokens)
+
+        if gen_wav_tensor is None:
+            gr.Error("Audio decoding failed.")
+            return None
+
+        gen_wav = gen_wav_tensor.squeeze()
+
+        progress(1, 'Synthesized!')
+        return (config.audio.sample_rate, gen_wav.numpy())
+
+with gr.Blocks(title="KaniTTS Zero-Shot Voice Cloning") as app_tts:
+    gr.Markdown("# KaniTTS Zero-Shot Voice Cloning")
+    gr.Markdown("Upload reference audio, provide its transcript, and enter text to generate speech in the reference voice.")
+
+    ref_audio_input = gr.Audio(label="Reference Audio", type="filepath")
+
     with gr.Row():
+        ref_transcript_input = gr.Textbox(
+            label="Reference Audio Transcript",
+            lines=3,
+            placeholder="Enter what the reference audio says, or use the transcribe button...",
+            info="This should match exactly what is said in the reference audio"
+        )
+        transcribe_btn = gr.Button("Transcribe", variant="secondary", size="sm")

+    gen_text_input = gr.Textbox(
+        label="Text to Generate",
+        lines=10,
+        placeholder="Enter the text you want to generate in the reference voice..."
+    )
+
+    with gr.Row():
+        temperature_slider = gr.Slider(
+            minimum=0.0, maximum=2.0, value=1.4, step=0.05,
+            label="Temperature",
+            info="Higher values make output more random"
         )
+        top_p_slider = gr.Slider(
+            minimum=0.0, maximum=1.0, value=0.9, step=0.05,
+            label="Top-p",
+            info="Nucleus sampling threshold"
+        )
+        repetition_penalty_slider = gr.Slider(
+            minimum=1.0, maximum=1.5, value=1.1, step=0.05,
+            label="Repetition Penalty",
+            info="Penalty for repeating tokens"
+        )
+
+    generate_btn = gr.Button("Generate Speech", variant="primary")
+
+    audio_output = gr.Audio(label="Generated Audio")
+
+    # Connect transcribe button
+    transcribe_btn.click(
+        transcribe_audio,
+        inputs=[ref_audio_input],
+        outputs=[ref_transcript_input],
+    )
+
+    # Connect generate button
+    generate_btn.click(
+        infer,
+        inputs=[
+            ref_audio_input,
+            ref_transcript_input,
+            gen_text_input,
+            temperature_slider,
+            top_p_slider,
+            repetition_penalty_slider,
+        ],
+        outputs=[audio_output],
+    )
+
+with gr.Blocks() as app_info:
+    gr.Markdown("""
+# About KaniTTS
+
+KaniTTS is a conversational text-to-speech model that can perform zero-shot voice cloning.
+
+## How to use:
+1. Upload a reference audio file (WAV or MP3, max 15 seconds)
+2. Either enter the transcript manually or click "Transcribe" to auto-transcribe it
+3. Edit the transcript if needed to ensure accuracy
+4. Enter the text you want to generate in that voice
+5. Adjust the generation parameters if needed
+6. Click "Generate Speech"
+
+The model uses your provided transcript to anchor the reference voice and generates the target text in the same voice.
+
+## Tips:
+- Use clear, high-quality reference audio
+- Keep reference audio under 15 seconds
+- The model works best with conversational speech
+- Try different temperature settings for varied results
+
+## Credits:
+- KaniTTS model by the KaniTTS team
+- Nemo codec by NVIDIA
+- Interface adapted from the Orpheus TTS demo
+""")
+
+with gr.Blocks() as app:
+    gr.Markdown(
+        """
+# KaniTTS Zero-Shot Voice Cloning
+
+This is a web interface for KaniTTS zero-shot voice cloning. Upload reference audio and generate speech in any voice!
+
+**Note:** This model requires significant GPU resources. Generation may take some time.
+"""
+    )
+    gr.TabbedInterface([app_tts, app_info], ["Voice Cloning", "About"])

 if __name__ == "__main__":
+    app.launch()
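The new app.py reads everything model-specific from a local `kanitts` Config module that is not part of this commit. For orientation only, below is a hypothetical sketch of the fields the code above consumes; the attribute names are taken from the call sites in the diff (config.model.*, config.audio.*, config.tokens.*), while every concrete value is a placeholder and not the real kanitts defaults.

# Hypothetical sketch of the kanitts Config consumed above; all values are placeholders.
from dataclasses import dataclass, field

@dataclass
class ModelConfig:
    model_name: str = "<kani-tts-checkpoint-id>"   # placeholder checkpoint id
    max_new_tokens: int = 1200                     # placeholder generation budget

@dataclass
class AudioConfig:
    sample_rate: int = 22050                       # matches target_sample_rate used in tokenize_audio
    nemo_model_name: str = "<nemo-audio-codec-id>" # placeholder codec id

@dataclass
class TokenConfig:
    # Special-token IDs; the real values come from the KaniTTS vocabulary (placeholders here).
    start_of_human: int = 0
    end_of_human: int = 0
    start_of_ai: int = 0
    end_of_ai: int = 0
    start_of_text: int = 0
    end_of_text: int = 0
    start_of_speech: int = 0
    end_of_speech: int = 0
    pad_token: int = 0
    audio_tokens_start: int = 0  # first ID of the flattened audio-token range (placeholder)
    codebook_size: int = 0       # entries per codec codebook level (placeholder)

@dataclass
class Config:
    model: ModelConfig = field(default_factory=ModelConfig)
    audio: AudioConfig = field(default_factory=AudioConfig)
    tokens: TokenConfig = field(default_factory=TokenConfig)

    @classmethod
    def default(cls) -> "Config":
        return cls()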
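`tokenize_audio` above flattens the codec's four codebook levels into a single token stream by shifting each level into its own ID range, and `redistribute_codes` undoes that shift before decoding. Here is a minimal, self-contained round-trip sketch of that mapping, using made-up stand-ins for `audio_tokens_start` and `codebook_size` (the real values live in the kanitts Config and are not shown in this diff):

# Round-trip sketch of the 4-level interleaving used by tokenize_audio / redistribute_codes.
# AUDIO_TOKENS_START and CODEBOOK_SIZE are made-up stand-ins for config.tokens values.
AUDIO_TOKENS_START = 10_000
CODEBOOK_SIZE = 4_000

def flatten(frames):
    """frames: list of [c0, c1, c2, c3] codec codes, one entry per frame."""
    out = []
    for frame in frames:
        for level, code in enumerate(frame):
            out.append(code + AUDIO_TOKENS_START + level * CODEBOOK_SIZE)
    return out

def unflatten(tokens):
    """Inverse of flatten(): recover per-frame [c0, c1, c2, c3] codec codes."""
    assert len(tokens) % 4 == 0
    frames = []
    for i in range(0, len(tokens), 4):
        frames.append([tokens[i + level] - AUDIO_TOKENS_START - level * CODEBOOK_SIZE
                       for level in range(4)])
    return frames

frames = [[12, 7, 903, 15], [0, 3999, 42, 1]]
flat = flatten(frames)
# One token per (frame, level); e.g. frame 0, level 2 -> 903 + 10000 + 2*4000 = 18903
assert flat[2] == 18_903
assert unflatten(flat) == frames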
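The cloning path in `infer` builds a continuation prompt: the human turn carries the reference transcript concatenated with the target text, and the AI turn is primed with the reference audio tokens so generation continues in that voice. A schematic of the two prompt shapes assembled above, using placeholder IDs rather than the real config values:

# Schematic of the two prompt layouts assembled in infer(); all IDs below are made up.
SOH_ID, EOH_ID, SOA_ID, SOS_ID, EOT_ID = 1, 2, 3, 4, 5

complete_text_ids = [101, 102, 103, 104]         # stands in for tokenizer.encode(ref_transcript + " " + target_text)
target_text_ids = [103, 104]                     # stands in for tokenizer.encode(target_text)
voice_tokens = [10_000, 14_007, 18_903, 22_015]  # stands in for tokenize_audio(reference_wav)

# Zero-shot cloning: prime the AI turn with the reference audio so the model
# continues speaking the remaining (target) text in the same voice.
cloning_prompt = [SOH_ID] + complete_text_ids + [EOT_ID, EOH_ID, SOA_ID, SOS_ID] + voice_tokens

# Plain TTS (no reference audio): the AI turn starts empty right after SOS_ID.
plain_prompt = [SOH_ID] + target_text_ids + [EOT_ID, EOH_ID, SOA_ID, SOS_ID]

print(cloning_prompt)
print(plain_prompt)

Everything the model emits after the primed prompt is then sliced off at `input_length`, trimmed at `EOS_ID`, filtered to the audio-token range, and decoded back to a waveform by `redistribute_codes`.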
requirements.txt CHANGED
@@ -2,4 +2,4 @@ torch==2.8.0
 librosa==0.11.0
 nemo_toolkit[tts]==2.4.0
 numpy==1.26.4
-gradio>=4.0.0
+gradio>=4.0.0