A quick fix for the broken results

#8
by J22 - opened

Replace these two values in the demo code:

128266 -> 128256
156937 -> 156938

Like so? I still get the same error:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from snac import SNAC
import soundfile as sf

def main(text, output_path, description, device=None):
    """Synthesize ``text`` with the Maya1 model and save the audio to a file.

    The prompt is built from ``description`` and ``text``, the model samples
    up to 500 new tokens, the generated token IDs that fall inside the SNAC
    window are regrouped into the three SNAC code streams, and the decoded
    waveform is written to ``output_path`` with a sample rate of 24000.

    Args:
        text: Text to be spoken.
        output_path: Destination path handed to ``soundfile.write``.
        description: Voice description embedded in the prompt.
        device: Torch device for the SNAC decoder and the code tensors.
            Defaults to ``"cuda"`` when CUDA is available, else ``"cpu"``.

    Raises:
        ValueError: If generation yields fewer than seven SNAC tokens, i.e.
            not even one complete audio frame.
    """
    # Pick a usable device instead of assuming a GPU is present.
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    # Token-ID window treated as SNAC audio codes; the lower bound is also the
    # offset subtracted before folding each ID into a codebook index.
    # NOTE(review): this window holds 28683 IDs, which is not a multiple of
    # 4096 (7 * 4096 = 28672) — confirm the bounds/offset against the model
    # card before relying on the decoded audio.
    snac_min_id = 128256
    snac_max_id = 156938
    codebook_size = 4096
    tokens_per_frame = 7

    # Load the language model and its tokenizer.
    model = AutoModelForCausalLM.from_pretrained(
        "maya-research/maya1",
        torch_dtype=torch.bfloat16,
        device_map="auto"
    )
    tokenizer = AutoTokenizer.from_pretrained("maya-research/maya1")

    # Load the SNAC audio decoder in eval mode.
    snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval().to(device)

    # Create prompt with voice design.
    prompt = f'<description="{description}"> {text}'

    # device_map="auto" decides where the model lives, so the inputs follow
    # the model rather than a hard-coded device.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            max_new_tokens=500,
            temperature=0.4,
            top_p=0.9,
            do_sample=True
        )

    # Drop the prompt, then keep only IDs inside the SNAC window. Converting
    # to a Python list once avoids a tensor comparison per token.
    prompt_len = inputs['input_ids'].shape[1]
    generated_ids = outputs[0, prompt_len:].tolist()
    snac_tokens = [t for t in generated_ids if snac_min_id <= t <= snac_max_id]

    # Only whole frames are decoded; a trailing partial frame is ignored.
    frames = len(snac_tokens) // tokens_per_frame
    if frames == 0:
        # Without this guard, empty tensors would be passed to the decoder.
        raise ValueError(
            f"Generation produced {len(snac_tokens)} SNAC tokens; at least "
            f"{tokens_per_frame} are needed to decode one audio frame."
        )

    # Each frame of seven tokens contributes 1, 2 and 4 codes to the three
    # code streams, in the slot order below.
    codes = [[], [], []]
    for start in range(0, frames * tokens_per_frame, tokens_per_frame):
        frame = snac_tokens[start:start + tokens_per_frame]
        s = [(tok - snac_min_id) % codebook_size for tok in frame]
        codes[0].append(s[0])
        codes[1].extend((s[1], s[4]))
        codes[2].extend((s[2], s[3], s[5], s[6]))

    # Decode the code streams to a waveform with the SNAC decoder.
    codes_tensor = [
        torch.tensor(c, dtype=torch.long, device=device).unsqueeze(0)
        for c in codes
    ]
    with torch.inference_mode():
        quantized = snac_model.quantizer.from_codes(codes_tensor)
        audio = snac_model.decoder(quantized)[0, 0].cpu().numpy()

    # Save the waveform.
    sf.write(output_path, audio, 24000)

if __name__ == "__main__":
    import argparse

    # Command-line entry point: collect the three string options and pass
    # them straight to main().
    cli = argparse.ArgumentParser(
        description="Generate emotional speech using Maya1 model."
    )
    option_specs = (
        ("--text", {"required": True, "help": "Input text to synthesize."}),
        ("--output_path", {"required": True, "help": "Path to save the output audio file."}),
        ("--description", {"default": "A friendly and warm voice", "help": "Voice description for synthesis."}),
    )
    for flag, extra in option_specs:
        cli.add_argument(flag, type=str, **extra)

    options = cli.parse_args()
    main(options.text, options.output_path, options.description)

I am still facing the issue.

It still creates garbled voices with some seeds/descriptions.

Maya Research org
J22 changed discussion status to closed

Sign up or log in to comment