maya1 / app.py
akhaliq's picture
akhaliq HF Staff
Update app.py
f021bd5 verified
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from snac import SNAC
import soundfile as sf
import numpy as np
import tempfile
import spaces
# Load models at startup
# Both the language model and its tokenizer come from the same hub repo.
_MAYA_REPO_ID = "maya-research/maya1"

print("Loading Maya1 model...")
model = AutoModelForCausalLM.from_pretrained(
    _MAYA_REPO_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(_MAYA_REPO_ID, trust_remote_code=True)

# generate_speech() hands tokenizer.pad_token_id to model.generate(), so make
# sure a pad token exists by falling back to the EOS token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Loading SNAC audio decoder...")
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
snac_model = snac_model.eval()
# Inclusive range of generated token ids that are kept as SNAC audio tokens;
# every other generated id is dropped before decoding.
_SNAC_MIN_ID = 128266
_SNAC_MAX_ID = 156937
# Each kept id is shifted down by _SNAC_MIN_ID and reduced modulo this value
# to obtain the code handed to the SNAC quantizer.
_SNAC_CODE_MODULUS = 4096
# Tokens are consumed in groups of seven: 1 code for level 0, 2 for level 1
# and 4 for level 2.
_SNAC_TOKENS_PER_FRAME = 7
# Sample rate written into the output WAV file.
_SAMPLE_RATE = 24000


def _unpack_snac_frames(snac_tokens):
    """Regroup a flat list of SNAC token ids into three per-level code lists.

    Only complete groups of seven tokens are used; a trailing partial group
    is ignored. Within each group, position 0 goes to level 0, positions 1
    and 4 go to level 1, and positions 2, 3, 5 and 6 go to level 2.

    Args:
        snac_tokens: List of int token ids, already filtered to the SNAC range.

    Returns:
        A list ``[level0, level1, level2]`` of int code lists.
    """
    level0, level1, level2 = [], [], []
    usable = len(snac_tokens) - len(snac_tokens) % _SNAC_TOKENS_PER_FRAME
    for start in range(0, usable, _SNAC_TOKENS_PER_FRAME):
        frame = [
            (token - _SNAC_MIN_ID) % _SNAC_CODE_MODULUS
            for token in snac_tokens[start:start + _SNAC_TOKENS_PER_FRAME]
        ]
        level0.append(frame[0])
        level1.extend((frame[1], frame[4]))
        level2.extend((frame[2], frame[3], frame[5], frame[6]))
    return [level0, level1, level2]


@spaces.GPU(duration=180)
def generate_speech(text, voice_description):
    """Generate speech from text using Maya1 model with ZeroGPU.

    Args:
        text: The text to speak. Must contain at least one non-whitespace
            character.
        voice_description: Free-form description of the voice. When empty
            or ``None`` a neutral default description is used.

    Returns:
        Path of a temporary ``.wav`` file (24 kHz) holding the decoded audio.
        The file is created with ``delete=False`` and is not removed here.

    Raises:
        gr.Error: If ``text`` is empty, if fewer than seven audio tokens were
            generated, or if any other step fails (the underlying exception
            is chained as the cause).
    """
    # Guard against None as well as blank strings so validation never crashes.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to convert to speech!")

    if not voice_description or not voice_description.strip():
        voice_description = "Realistic voice with neutral tone and conversational pacing."

    try:
        # Move models to GPU
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model.to(device)
        snac_model.to(device)

        # Create prompt
        prompt = f'<description="{voice_description}"> {text}'

        # Tokenize input
        inputs = tokenizer(prompt, return_tensors="pt").to(device)

        # Generate
        # NOTE(review): eos_token_id=None is passed explicitly; presumably this
        # means generation always runs for the full max_new_tokens - confirm
        # against the transformers generate() documentation.
        with torch.inference_mode():
            outputs = model.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs.get('attention_mask', None),
                max_new_tokens=1000,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=None,
            )

        # Extract SNAC audio tokens. Drop the echoed prompt, then convert the
        # remainder to Python ints in one transfer instead of per element.
        prompt_length = inputs['input_ids'].shape[1]
        generated_ids = outputs[0, prompt_length:].tolist()
        snac_tokens = [
            token for token in generated_ids
            if _SNAC_MIN_ID <= token <= _SNAC_MAX_ID
        ]

        if len(snac_tokens) < _SNAC_TOKENS_PER_FRAME:
            raise gr.Error(
                f"Not enough audio tokens generated ({len(snac_tokens)}). "
                f"Try using longer text or different voice description."
            )

        # Decode SNAC tokens to audio frames
        codes = _unpack_snac_frames(snac_tokens)

        # Generate final audio with SNAC decoder
        codes_tensor = [
            torch.tensor(level, dtype=torch.long, device=device).unsqueeze(0)
            for level in codes
        ]
        with torch.inference_mode():
            audio = snac_model.decoder(
                snac_model.quantizer.from_codes(codes_tensor)
            )[0, 0].cpu().numpy()

        # Save to temporary file. Reserve the name, close our own handle, and
        # only then let soundfile open the path for writing.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
            wav_path = f.name
        sf.write(wav_path, audio, _SAMPLE_RATE)
        return wav_path

    except gr.Error:
        # Already a user-facing message (e.g. too few audio tokens); do not
        # wrap it a second time.
        raise
    except Exception as e:
        import traceback
        traceback.print_exc()
        raise gr.Error(f"Error generating speech: {str(e)}") from e
# Examples
# Each entry is (text to speak, voice description), in the same order as the
# two input textboxes of the interface.
_EXAMPLE_PAIRS = (
    (
        "Hello! This is Maya1 <laugh> the best open source voice AI model with emotions.",
        "Realistic male voice in the 30s age with american accent. Normal pitch, warm timbre, conversational pacing.",
    ),
    (
        "I'm so excited to share this amazing news with you! This is incredible and wonderful!",
        "Energetic female voice with enthusiastic tone. Higher pitch, bright timbre, upbeat pacing.",
    ),
    (
        "In a world of constant change, one thing remains certain: the power of human connection and understanding.",
        "Deep male voice with authoritative tone. Low pitch, resonant timbre, steady pacing.",
    ),
)

# Expose the rows as a list of two-item lists.
examples = [list(pair) for pair in _EXAMPLE_PAIRS]
# Create Gradio interface
# Layout: a header, then one row with the two input textboxes and the generate
# button on the left and the audio player on the right, followed by the click
# wiring and the clickable example prompts.
with gr.Blocks(theme=gr.themes.Soft(), title="Maya1 Text-to-Speech") as demo:
    gr.Markdown(
        """
# 🎙️ Maya1 Text-to-Speech
Generate emotional and realistic speech with natural language voice design
Built with [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
"""
    )

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Text to Speak",
                placeholder="Enter your text here... You can use <laugh>, <sigh>, and other emotion tags!",
                lines=5,
                value="Hello! This is Maya1 <laugh> the best open source voice AI model with emotions."
            )
            voice_description = gr.Textbox(
                label="Voice Description",
                placeholder="Describe the voice characteristics (age, gender, accent, pitch, timbre, pacing)...",
                lines=3,
                value="Realistic male voice in the 30s age with american accent. Normal pitch, warm timbre, conversational pacing."
            )
            generate_btn = gr.Button("🎤 Generate Speech", variant="primary", size="lg")

        with gr.Column():
            # generate_speech returns a path to a .wav file, hence "filepath".
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath"
            )
            # NOTE(review): nesting was reconstructed; confirm the tips panel
            # belongs in this right-hand column rather than below the row.
            gr.Markdown("""
### 💡 Tips
- Use emotion tags: `<laugh>`, `<sigh>`, `<whisper>`, `<shout>`
- Describe voice with: age, gender, accent, pitch, timbre, pacing
- Longer text works better (20+ words recommended)
### About
Maya1 is an open-source voice AI model that generates realistic, emotional speech from text using natural language voice descriptions.
""")

    # Generate speech button
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, voice_description],
        outputs=[audio_output]
    )

    # Examples section
    # cache_examples=False: example outputs are not pre-computed at startup.
    gr.Examples(
        examples=examples,
        inputs=[text_input, voice_description],
        outputs=[audio_output],
        fn=generate_speech,
        cache_examples=False,
        label="Example Prompts"
    )

# Start the web server only when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()