KaniTTS_Voice_Cloning_dev

Running on Zero

App Files Files Community

Gapeleon committed on Sep 22

Commit

8c00aff

1 Parent(s): 2759e04

Voice Cloning

Browse files

Files changed (2) hide show

app.py +383 -133
requirements.txt +1 -1

app.py CHANGED Viewed

@@ -1,10 +1,8 @@
 import os
 import subprocess
 import sys
 # Fix OMP_NUM_THREADS issue before any imports
 os.environ["OMP_NUM_THREADS"] = "4"
 # Install dependencies programmatically to avoid conflicts
 def setup_dependencies():
     try:
@@ -24,160 +22,412 @@ def setup_dependencies():
     except Exception as e:
         print(f"Dependencies setup error: {e}")
 # Run setup
 setup_dependencies()
 import spaces
-import gradio as gr
-from util import Config, NemoAudioPlayer, KaniModel
-import numpy as np
 import torch
-# Get HuggingFace token
-token_ = os.getenv('HF_TOKEN')
-# Model configurations
-models_configs = {
-    'base': Config(),
-    'female': Config(
-        model_name='nineninesix/kani-tts-450m-0.2-ft',
-    ),
-    'male': Config(
-        model_name='nineninesix/kani-tts-450m-0.1-ft',
-    )
-}
-# Global variables for models (loaded once)
-player = NemoAudioPlayer(Config())
-models = {}
-for model_name, config in models_configs.items():
-    print(f"Loading {model_name}...")
-    models[model_name] = KaniModel(config, player, token_)
-    print(f"{model_name} loaded!")
-print("All models loaded!")
-@spaces.GPU
-def generate_speech_gpu(text, model_choice, t, top_p, rp, max_tok):
     """
-    Generate speech from text using the selected model on GPU
     """
-    if not text.strip():
-        return None, "Please enter text for speech generation."
-    if not model_choice:
-        return None, "Please select a model."
     try:
-        # Check GPU availability
-        device = "cuda" if torch.cuda.is_available() else "cpu"
-        print(f"Using device: {device}")
-        # Get selected model
-        selected_model = models[model_choice]
-        # Generate audio
-        print(f"Generating speech with {model_choice}...")
-        audio, _, time_report = selected_model.run_model(text, t, top_p, rp, max_tok)
-        sample_rate = 22050
-        print("Speech generation completed!")
-        return (sample_rate, audio), time_report   #, f"✅ Audio generated successfully using {model_choice} on {device}"
     except Exception as e:
-        print(f"Error during generation: {str(e)}")
-        return None, f"❌ Error during generation: {str(e)}"
-# Create Gradio interface
-with gr.Blocks(title="😻 KaniTTS - Text to Speech", theme=gr.themes.Default()) as demo:
-    gr.Markdown("# 😻 KaniTTS: Fast and Expressive Speech Generation Model")
-    gr.Markdown("Select a model and enter text to generate emotional speech")
-    with gr.Row():
-        with gr.Column(scale=1):
-            model_dropdown = gr.Dropdown(
-                choices=list(models_configs.keys()),
-                value=list(models_configs.keys())[0],
-                label="Selected Model",
-                info="Base generates random voices"
-            )
-            text_input = gr.Textbox(
-                label="Text",
-                placeholder="Enter your text ...",
-                lines=3,
-                max_lines=10
-            )
-            with gr.Accordion("Settings", open=False):
-                temp = gr.Slider(
-                    minimum=0.1, maximum=1.5, value=1.4, step=0.05,
-                    label="Temp",
-                )
-                top_p = gr.Slider(
-                    minimum=0.1, maximum=1.0, value=0.95, step=0.05,
-                    label="Top P",
-                )
-                rp = gr.Slider(
-                    minimum=1.0, maximum=2.0, value=1.1, step=0.05,
-                    label="Repetition Penalty",
-                )
-                max_tok = gr.Slider(
-                    minimum=100, maximum=2000, value=1200, step=100,
-                    label="Max Tokens",
-                )
-            generate_btn = gr.Button("Run", variant="primary", size="lg")
-        with gr.Column(scale=1):
-            audio_output = gr.Audio(
-                label="Generated Audio",
-                type="numpy"
-            )
-            time_report_output = gr.Textbox(
-                label="Time Report",
-                interactive=False,
-                value="Ready to generate speech",
-                lines=3
             )
-    # GPU generation event
-    generate_btn.click(
-        fn=generate_speech_gpu,
-        inputs=[text_input, model_dropdown, temp, top_p, rp, max_tok],
-        outputs=[audio_output, time_report_output]
-    )
     with gr.Row():
-        examples = [
-            ["Anyway, um, so, um, tell me, tell me all about her. I mean, what's she like? Is she really, you know, pretty?", "male", 1.4, 0.95, 1.1, 1200],
-            ["No, that does not make you a failure. No, sweetie, no. It just, uh, it just means that you're having a tough time...", "male", 1.4, 0.95, 1.1, 1200],
-            ["I-- Oh, I am such an idiot sometimes. I'm so sorry. Um, I-I don't know where my head's at.", "male", 1.4, 0.95, 1.1, 1200],
-            ["Got it. $300,000. I can definitely help you get a very good price for your property by selecting a realtor.", "female", 1.4, 0.95, 1.1, 1200],
-            ["Holy fu- Oh my God! Don't you understand how dangerous it is?", "male", 1.4, 0.95, 1.1, 1200],
-            ["You make my days brighter, and my wildest dreams feel like reality. How do you do that?", "female", 1.4, 0.95, 1.1, 1200],
-            ["Great, and just a couple quick questions so we can match you with the right buyer. Is your home address still 330 East Charleston Road?", "base", 1.4, 0.95, 1.1, 1200],
-            ["Oh, yeah. I mean did you want to get a quick snack together or maybe something before you go?", "female", 1.4, 0.95, 1.1, 1200],
-        ]
-        gr.Examples(
-            examples=examples,
-            inputs=[text_input, model_dropdown, temp, top_p, rp, max_tok],
-            fn=generate_speech_gpu,
-            outputs=[audio_output, time_report_output],
-            cache_examples=True,
         )
 if __name__ == "__main__":
-    demo.launch(
-        server_name="0.0.0.0",
-        server_port=7860,
-        show_error=True
-    )

 import os
 import subprocess
 import sys
 # Fix OMP_NUM_THREADS issue before any imports
 os.environ["OMP_NUM_THREADS"] = "4"
 # Install dependencies programmatically to avoid conflicts
 def setup_dependencies():
     try:
     except Exception as e:
         print(f"Dependencies setup error: {e}")
 # Run setup
 setup_dependencies()
 import spaces
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 import torch
+import librosa
+import gradio as gr
+from nemo.collections.tts.models import AudioCodecModel
+import os
+import sys
+# Add the parent directory to sys.path to import kanitts
+sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+from kanitts import Config
+# ---------------------------------------------------------------------------
+# Module-level setup: everything below runs once, at import time, and the
+# resulting objects (config, tokenizer, model, nemo_codec, whisper_turbo_pipe,
+# *_ID constants) are used as globals by the functions further down.
+# ---------------------------------------------------------------------------
+# Load configuration
+config = Config.default()
+# Load KaniTTS model and tokenizer
+kani_model_id = config.model.model_name
+# NOTE(review): trust_remote_code=True lets code shipped in the model repo run
+# locally -- confirm the repo behind `kani_model_id` is trusted.
+tokenizer = AutoTokenizer.from_pretrained(
+    kani_model_id,
+    trust_remote_code=True,
+    use_fast=True
+)
+# The language model is loaded in bfloat16 and placed on the CUDA device.
+model = AutoModelForCausalLM.from_pretrained(
+    kani_model_id,
+    torch_dtype=torch.bfloat16,
+    device_map="cuda",
+    trust_remote_code=True,
+)
+model.eval()
+# Load Nemo codec
+# Used by tokenize_audio() to encode waveforms and by redistribute_codes() to
+# decode generated codes back to audio.
+nemo_model_id = config.audio.nemo_model_name
+nemo_codec = AudioCodecModel.from_pretrained(nemo_model_id).eval().cuda()
+# Load Whisper for transcription
+# Only used by transcribe_audio() to pre-fill the reference transcript box.
+whisper_turbo_pipe = pipeline(
+    "automatic-speech-recognition",
+    model="openai/whisper-large-v3-turbo",
+    torch_dtype=torch.float16,
+    device='cuda',
+)
+# KaniTTS token IDs from config
+# Short aliases for the special-token IDs used when assembling prompts in
+# infer(). SOT_ID and EOA_ID are defined here but not referenced by the
+# functions visible in this file.
+tokens = config.tokens
+SOH_ID = tokens.start_of_human
+EOH_ID = tokens.end_of_human
+SOA_ID = tokens.start_of_ai
+EOA_ID = tokens.end_of_ai
+SOT_ID = tokens.start_of_text
+EOT_ID = tokens.end_of_text
+SOS_ID = tokens.start_of_speech
+EOS_ID = tokens.end_of_speech
+def tokenize_audio(waveform, target_sample_rate=22050):
     """
+    Tokenize audio using Nemo codec for KaniTTS.
+
+    Args:
+        waveform: torch tensor of audio samples, either 1-D ``[samples]`` or
+            2-D ``[channels, samples]``. Multi-channel input is averaged to
+            mono.
+        target_sample_rate: Accepted for interface compatibility only. No
+            resampling happens here, so the caller must already supply audio
+            at the rate the codec expects.
+
+    Returns:
+        A flat list of int token IDs laid out frame by frame, four IDs per
+        frame (one per codebook level). Each ID is the codec code shifted by
+        ``config.tokens.audio_tokens_start + level * config.tokens.codebook_size``.
+
+    Raises:
+        ValueError: if the codec returns fewer than four codebook levels.
     """
+    num_codebooks = 4  # KaniTTS uses 4 tokens per frame
+    # Promote 1-D input to [1, samples] *before* the channel check. Doing the
+    # channel check first would treat the sample axis of a 1-D waveform as
+    # channels and average the whole clip down to a single sample.
+    if waveform.dim() == 1:
+        waveform = waveform.unsqueeze(0)
+    if waveform.shape[0] > 1:
+        waveform = waveform.mean(dim=0, keepdim=True)  # Convert to mono if stereo
+    waveform = waveform.to(dtype=torch.float32)
+    waveform = waveform.to(nemo_codec.device)
+    # Calculate audio length in samples
+    audio_len = torch.tensor([waveform.shape[-1]], dtype=torch.int64).to(waveform.device)
+    with torch.inference_mode():
+        # Encode audio to get token codes
+        encoded_tokens, _ = nemo_codec.encode(audio=waveform, audio_len=audio_len)
+        # encoded_tokens shape: [batch, num_codebooks, sequence_length]
+        codes = encoded_tokens[0, :num_codebooks].to(torch.int64)
+        if codes.shape[0] < num_codebooks:
+            raise ValueError(
+                f"Codec returned {codes.shape[0]} codebook levels, expected {num_codebooks}"
+            )
+        # One offset per codebook level, applied to the whole sequence at once
+        # instead of calling .item() on every single code.
+        level_offsets = config.tokens.audio_tokens_start + config.tokens.codebook_size * torch.arange(
+            num_codebooks, device=codes.device, dtype=torch.int64
+        )
+        shifted = codes + level_offsets.unsqueeze(1)  # [4, seq_len]
+        # Transpose to [seq_len, 4] so flattening yields frame-major order:
+        # frame0-level0, frame0-level1, ..., frame1-level0, ...
+        return shifted.t().reshape(-1).tolist()
+def redistribute_codes(code_list):
+    """
+    Turn a flat list of KaniTTS audio token IDs back into a waveform.
+
+    The list is read as consecutive frames of four IDs, one per codebook
+    level. Returns the decoded audio tensor moved to the CPU, or None when the
+    list length is not a multiple of 4 or the codec raises while decoding.
+    """
+    leftover = len(code_list) % 4
+    if leftover != 0:
+        print(f"Warning: Code list length {len(code_list)} is not divisible by 4")
+        return None
+    frame_count = len(code_list) // 4
+    size = config.tokens.codebook_size
+    base = config.tokens.audio_tokens_start
+    # Row `lvl` holds that codebook level for every frame, with the per-level
+    # offset removed and the result reduced modulo the codebook size.
+    per_level = [
+        [(code_list[4 * frame + lvl] - base - lvl * size) % size for frame in range(frame_count)]
+        for lvl in range(4)
+    ]
+    # Format expected by Nemo: [batch, num_codebooks, sequence_length]
+    codes = torch.tensor(per_level, dtype=torch.long).unsqueeze(0)
     try:
+        device = nemo_codec.device
+        codes = codes.to(device)
+        tokens_len = torch.tensor([codes.shape[-1]], dtype=torch.int64).to(device)
+        with torch.no_grad():
+            audio_hat, _ = nemo_codec.decode(tokens=codes, tokens_len=tokens_len)
+        return audio_hat.cpu()
     except Exception as e:
+        print(f"Error decoding audio: {e}")
+        return None
+def transcribe_audio(sample_audio_path, progress=gr.Progress()):
+    """Transcribe uploaded audio using Whisper.
+
+    Args:
+        sample_audio_path: Path of the uploaded reference audio; falsy when
+            nothing has been uploaded.
+        progress: Gradio progress tracker, updated at each stage.
+
+    Returns:
+        The stripped transcript text, or "" (after a warning) when no audio
+        was provided.
+
+    Raises:
+        gr.Error: if loading or transcription fails, so the failure is shown
+            in the UI.
+    """
+    if not sample_audio_path:
+        gr.Warning("Please upload an audio file first.")
+        return ""
+    try:
+        progress(0, 'Loading audio...')
+        # A bare array handed to the ASR pipeline is assumed to already be at
+        # the feature extractor's sampling rate, so load at that rate rather
+        # than at the TTS rate.
+        asr_rate = whisper_turbo_pipe.feature_extractor.sampling_rate
+        audio_array, sample_rate = librosa.load(sample_audio_path, sr=asr_rate)
+        # Trim audio to max 15 seconds for transcription
+        audio_array = audio_array[:int(sample_rate * 15)]
+        progress(0.5, 'Transcribing...')
+        transcript = whisper_turbo_pipe(audio_array)['text'].strip()
+        progress(1, 'Transcription complete!')
+        return transcript
+    except Exception as e:
+        # gr.Error is an exception: it only reaches the user when raised.
+        raise gr.Error(f"Transcription failed: {str(e)}") from e
+@spaces.GPU(duration=60)
+def infer(sample_audio_path, ref_transcript, target_text, temperature, top_p, repetition_penalty, progress=gr.Progress()):
+    """
+    Generate speech for ``target_text``, optionally continuing a reference voice.
+
+    Args:
+        sample_audio_path: Path to reference audio, or falsy for plain TTS.
+        ref_transcript: Transcript of the reference audio; required whenever
+            reference audio is given.
+        target_text: Text to synthesize; truncated to 500 characters.
+        temperature, top_p, repetition_penalty: Sampling settings forwarded to
+            ``model.generate``.
+        progress: Gradio progress tracker.
+
+    Returns:
+        ``(config.audio.sample_rate, numpy_waveform)``, or None when the inputs
+        are rejected with a warning.
+
+    Raises:
+        gr.Error: when generation yields no usable audio tokens or decoding
+            fails.
+    """
+    if not target_text or not target_text.strip():
+        gr.Warning("Please input text to generate audio.")
+        return None
+    if len(target_text) > 500:
+        gr.Warning("Text is too long. Please keep it under 500 characters.")
+        target_text = target_text[:500]
+    target_text = target_text.strip()
+    if sample_audio_path and (not ref_transcript or not ref_transcript.strip()):
+        gr.Warning("Please provide a transcript for the reference audio or use the transcribe button.")
+        return None
+    with torch.no_grad():
+        if sample_audio_path and ref_transcript:
+            progress(0, 'Loading and trimming audio...')
+            audio_array, sample_rate = librosa.load(sample_audio_path, sr=config.audio.sample_rate)
+            # Trim audio to max 15 seconds
+            if len(audio_array) / sample_rate > 15:
+                gr.Warning("Trimming audio to first 15secs.")
+                num_samples_to_keep = int(sample_rate * 15)
+                audio_array = audio_array[:num_samples_to_keep]
+            prompt_wav = torch.from_numpy(audio_array).unsqueeze(0)
+            prompt_wav = prompt_wav.to(dtype=torch.float32)
+            progress(0.4, 'Encoding reference audio...')
+            # Encode the prompt wav
+            voice_tokens = tokenize_audio(prompt_wav)
+            # Use the provided transcript instead of auto-transcribing
+            prompt_text = ref_transcript.strip()
+            progress(0.6, "Generating audio...")
+            # Create complete sentence (reference + target)
+            complete_text = prompt_text + " " + target_text
+            complete_text_ids = tokenizer.encode(complete_text, add_special_tokens=False)
+            # Create prompt: Human says complete sentence, AI provides partial audio + continues
+            prompt_ids = (
+                [SOH_ID]
+                + complete_text_ids  # Full sentence as human input
+                + [EOT_ID]
+                + [EOH_ID]
+                + [SOA_ID]
+                + [SOS_ID]
+                + voice_tokens        # Audio only for reference part
+                # Model should continue generating audio for the target part
             )
+        else:
+            # No reference audio case: simple generation without reference
+            progress(0.6, "Generating audio...")
+            target_text_ids = tokenizer.encode(target_text, add_special_tokens=False)
+            prompt_ids = [SOH_ID, *target_text_ids, EOT_ID, EOH_ID, SOA_ID, SOS_ID]
+        print(f"Prompt length: {len(prompt_ids)} tokens")
+        input_ids = torch.tensor([prompt_ids], dtype=torch.int64).cuda()
+        # Generate the speech autoregressively.
+        # Sampling values are cast to float because sliders may deliver whole
+        # numbers as int, and transformers' temperature / repetition-penalty
+        # processors insist on float instances.
+        outputs = model.generate(
+            input_ids,
+            max_new_tokens=config.model.max_new_tokens,
+            eos_token_id=EOS_ID,
+            do_sample=True,
+            top_p=float(top_p),
+            temperature=float(temperature),
+            repetition_penalty=float(repetition_penalty),
+            pad_token_id=config.tokens.pad_token,
+            use_cache=True,
+        )
+        generated_ids = outputs[0].tolist()
+        print(f"Generated {len(generated_ids)} total tokens")
+        progress(0.8, "Decoding generated audio...")
+        # The prompt ends with SOS_ID, so everything after the prompt is the
+        # newly generated speech.
+        input_length = len(prompt_ids)
+        speech_tokens = generated_ids[input_length:]
+        print(f"Input prompt length: {input_length}, generated tokens: {len(speech_tokens)}")
+        # Remove end of speech token if present
+        if EOS_ID in speech_tokens:
+            speech_tokens = speech_tokens[:speech_tokens.index(EOS_ID)]
+        # gr.Error is an exception class: it must be raised to reach the user.
+        if not speech_tokens:
+            raise gr.Error("Audio generation failed: No speech tokens were generated.")
+        # Filter out non-audio tokens
+        audio_tokens = [token for token in speech_tokens if token >= config.tokens.audio_tokens_start]
+        # Generation can stop mid-frame (e.g. on hitting max_new_tokens); drop
+        # the incomplete trailing frame rather than failing the whole decode.
+        partial = len(audio_tokens) % 4
+        if partial:
+            audio_tokens = audio_tokens[:-partial]
+        if not audio_tokens:
+            raise gr.Error("Audio generation failed: No valid audio tokens found.")
+        print(f"Decoding {len(audio_tokens)} audio tokens")
+        gen_wav_tensor = redistribute_codes(audio_tokens)
+        if gen_wav_tensor is None:
+            raise gr.Error("Audio decoding failed.")
+        gen_wav = gen_wav_tensor.squeeze()
+        progress(1, 'Synthesized!')
+        return (config.audio.sample_rate, gen_wav.numpy())
+# Voice-cloning tab: reference audio + transcript, target text, sampling
+# controls, and the two buttons wired to transcribe_audio() and infer().
+with gr.Blocks(title="KaniTTS Zero-Shot Voice Cloning") as app_tts:
+    gr.Markdown("# KaniTTS Zero-Shot Voice Cloning")
+    gr.Markdown("Upload reference audio, provide its transcript, and enter text to generate speech in the reference voice.")
+    ref_audio_input = gr.Audio(label="Reference Audio", type="filepath")
     with gr.Row():
+        ref_transcript_input = gr.Textbox(
+            label="Reference Audio Transcript",
+            lines=3,
+            placeholder="Enter what the reference audio says, or use the transcribe button...",
+            info="This should match exactly what is said in the reference audio"
+        )
+        transcribe_btn = gr.Button("Transcribe", variant="secondary", size="sm")
+    gen_text_input = gr.Textbox(
+        label="Text to Generate",
+        lines=10,
+        placeholder="Enter the text you want to generate in the reference voice..."
+    )
+    with gr.Row():
+        # infer() always samples (do_sample=True), and transformers rejects a
+        # temperature that is not strictly positive, so the slider must not
+        # reach 0.
+        temperature_slider = gr.Slider(
+            minimum=0.05, maximum=2.0, value=1.4, step=0.05,
+            label="Temperature",
+            info="Higher values make output more random"
         )
+        top_p_slider = gr.Slider(
+            minimum=0.0, maximum=1.0, value=0.9, step=0.05,
+            label="Top-p",
+            info="Nucleus sampling threshold"
+        )
+        repetition_penalty_slider = gr.Slider(
+            minimum=1.0, maximum=1.5, value=1.1, step=0.05,
+            label="Repetition Penalty",
+            info="Penalty for repeating tokens"
+        )
+    generate_btn = gr.Button("Generate Speech", variant="primary")
+    audio_output = gr.Audio(label="Generated Audio")
+    # Connect transcribe button: fills the transcript box from the uploaded audio
+    transcribe_btn.click(
+        transcribe_audio,
+        inputs=[ref_audio_input],
+        outputs=[ref_transcript_input],
+    )
+    # Connect generate button: input order must match infer()'s parameters
+    generate_btn.click(
+        infer,
+        inputs=[
+            ref_audio_input,
+            ref_transcript_input,
+            gen_text_input,
+            temperature_slider,
+            top_p_slider,
+            repetition_penalty_slider,
+        ],
+        outputs=[audio_output],
+    )
+# "About" tab: a single static Markdown panel with usage steps, tips and
+# credits. It has no inputs, outputs or event handlers.
+with gr.Blocks() as app_info:
+    gr.Markdown("""
+# About KaniTTS
+KaniTTS is a conversational text-to-speech model that can perform zero-shot voice cloning.
+## How to use:
+1. Upload a reference audio file (WAV or MP3, max 15 seconds)
+2. Either enter the transcript manually or click "Transcribe" to auto-transcribe
+3. Edit the transcript if needed to ensure accuracy
+4. Enter the text you want to generate in that voice
+5. Adjust generation parameters if needed
+6. Click "Generate Speech"
+The model will use your provided transcript to understand the reference voice and generate the target text in the same voice.
+## Tips:
+- Use clear, high-quality reference audio
+- Keep reference audio under 15 seconds
+- The model works best with conversational speech
+- Try different temperature settings for varied results
+## Credits:
+- KaniTTS model by the KaniTTS team
+- Nemo codec by NVIDIA
+- Interface adapted from Orpheus TTS demo
+""")
+# Top-level application: a header followed by a tabbed view that hosts the
+# voice-cloning UI (app_tts) and the About page (app_info).
+with gr.Blocks() as app:
+    gr.Markdown(
+        """
+# KaniTTS Zero-Shot Voice Cloning
+This is a web interface for KaniTTS zero-shot voice cloning. Upload reference audio and generate speech in any voice!
+**Note:** This model requires significant GPU resources. Generation may take some time.
+"""
+    )
+    # Tab labels are given in the same order as the Blocks they refer to.
+    gr.TabbedInterface([app_tts, app_info], ["Voice Cloning", "About"])
+# Start the Gradio server only when this file is run as a script.
 if __name__ == "__main__":
+    app.launch()

requirements.txt CHANGED Viewed

@@ -2,4 +2,4 @@ torch==2.8.0
 librosa==0.11.0
 nemo_toolkit[tts]==2.4.0
 numpy==1.26.4
-gradio>=4.0.0

 librosa==0.11.0
 nemo_toolkit[tts]==2.4.0
 numpy==1.26.4
+gradio>=4.0.0