A quick fix for the broken results
#8
by
J22
- opened
Replace these two values in the demo code:
128266 -> 128256
156937 -> 156938
Like so? I still get the same error:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from snac import SNAC
import soundfile as sf
# Token-id window used to pick SNAC audio codes out of the generated sequence.
# NOTE(review): this window holds 28683 ids, which is not a multiple of the
# 4096-entry codebook (7 * 4096 = 28672) -- confirm the offset and the upper
# bound against the model's published token layout before relying on them.
_SNAC_MIN_ID = 128256
_SNAC_MAX_ID = 156938
_SNAC_TOKEN_OFFSET = 128256
_SNAC_CODEBOOK_SIZE = 4096
_SNAC_TOKENS_PER_FRAME = 7
_SAMPLE_RATE = 24000  # Hz; passed to sf.write for the snac_24khz decoder output


def _unpack_snac_frames(snac_tokens):
    """Split a flat list of SNAC token ids into three per-level code lists.

    Tokens are consumed in groups of seven; a trailing partial group is
    dropped. Within each group, slot 0 goes to the first list, slots 1 and 4
    to the second, and slots 2, 3, 5 and 6 to the third.
    """
    codes = [[], [], []]
    frame_count = len(snac_tokens) // _SNAC_TOKENS_PER_FRAME
    for frame_index in range(frame_count):
        start = frame_index * _SNAC_TOKENS_PER_FRAME
        frame = [
            (token - _SNAC_TOKEN_OFFSET) % _SNAC_CODEBOOK_SIZE
            for token in snac_tokens[start:start + _SNAC_TOKENS_PER_FRAME]
        ]
        codes[0].append(frame[0])
        codes[1].extend((frame[1], frame[4]))
        codes[2].extend((frame[2], frame[3], frame[5], frame[6]))
    return codes


def main(text: str, output_path: str, description: str) -> None:
    """Synthesize ``text`` with the Maya1 model and write the audio to disk.

    Args:
        text: The text to speak.
        output_path: Destination path handed to ``soundfile.write``.
        description: Voice description embedded in the prompt.

    Raises:
        RuntimeError: If generation yields fewer than seven tokens inside the
            SNAC id window, i.e. not even one complete audio frame.
    """
    # Load the language model; device placement is delegated to device_map.
    model = AutoModelForCausalLM.from_pretrained(
        "maya-research/maya1",
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    tokenizer = AutoTokenizer.from_pretrained("maya-research/maya1")

    # Load the SNAC audio decoder on the GPU when there is one, else the CPU,
    # instead of assuming "cuda" exists.
    snac_device = "cuda" if torch.cuda.is_available() else "cpu"
    snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval().to(snac_device)

    # Build the prompt with the voice description in front of the text.
    prompt = f'<description="{description}"> {text}'

    # Inputs must live on the model's own device, which device_map="auto"
    # chooses; hard-coding "cuda" breaks on CPU-only hosts.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            max_new_tokens=500,
            temperature=0.4,
            top_p=0.9,
            do_sample=True,
        )

    # Keep only the newly generated ids, converted to plain ints once so the
    # range filter does not compare 0-d tensors element by element.
    prompt_length = inputs["input_ids"].shape[1]
    generated_ids = outputs[0, prompt_length:].tolist()
    snac_tokens = [t for t in generated_ids if _SNAC_MIN_ID <= t <= _SNAC_MAX_ID]

    if len(snac_tokens) < _SNAC_TOKENS_PER_FRAME:
        # Fail with a clear message rather than feeding empty code tensors
        # to the decoder.
        raise RuntimeError(
            f"Generation produced {len(snac_tokens)} SNAC tokens; at least "
            f"{_SNAC_TOKENS_PER_FRAME} are needed to decode one audio frame."
        )

    # Regroup the flat token stream into the three code lists the decoder takes.
    codes = _unpack_snac_frames(snac_tokens)
    codes_tensor = [
        torch.tensor(level, dtype=torch.long, device=snac_device).unsqueeze(0)
        for level in codes
    ]
    with torch.inference_mode():
        quantized = snac_model.quantizer.from_codes(codes_tensor)
        audio = snac_model.decoder(quantized)[0, 0].cpu().numpy()

    # Write the decoded waveform to the requested path.
    sf.write(output_path, audio, _SAMPLE_RATE)
if __name__ == "__main__":
    import argparse

    # One row per command-line option: (flag, help text, extra keyword args).
    option_specs = (
        ("--text", "Input text to synthesize.", {"required": True}),
        ("--output_path", "Path to save the output audio file.", {"required": True}),
        ("--description", "Voice description for synthesis.", {"default": "A friendly and warm voice"}),
    )

    cli = argparse.ArgumentParser(description="Generate emotional speech using Maya1 model.")
    for flag, help_text, extra in option_specs:
        cli.add_argument(flag, type=str, help=help_text, **extra)

    # Parse the command line and hand the three values to main() in order.
    opts = cli.parse_args()
    main(opts.text, opts.output_path, opts.description)
I am still facing the issue:
the model still produces garbled voices with some seeds and descriptions.
HF-Space - https://huggingface.co/spaces/maya-research/maya1
Repo with fastAPI implementation - https://github.com/MayaResearch/maya1-fastapi
J22
changed discussion status to
closed