Spaces:

akhaliq
/

maya1

Running on Zero

App Files Files Community

maya1 / app.py

akhaliq HF Staff

Update app.py

f021bd5 verified 10 days ago

raw

history blame contribute delete

6.57 kB

	import gradio as gr
	import torch
	from transformers import AutoModelForCausalLM, AutoTokenizer
	from snac import SNAC
	import soundfile as sf
	import numpy as np
	import tempfile
	import spaces

	# Startup: load the Maya1 language model, its tokenizer, and the SNAC
	# decoder once at import time so that requests reuse the same objects.
	_MAYA1_REPO_ID = "maya-research/maya1"

	print("Loading Maya1 model...")
	model = AutoModelForCausalLM.from_pretrained(
	    _MAYA1_REPO_ID,
	    trust_remote_code=True,
	    device_map="auto",
	    torch_dtype=torch.bfloat16,
	)
	tokenizer = AutoTokenizer.from_pretrained(_MAYA1_REPO_ID, trust_remote_code=True)

	# generate() below is given tokenizer.pad_token_id, so make sure a pad
	# token exists by falling back to the end-of-sequence token.
	if tokenizer.pad_token is None:
	    tokenizer.pad_token = tokenizer.eos_token

	print("Loading SNAC audio decoder...")
	snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
	snac_model = snac_model.eval()

	# Inclusive range of token ids that are treated as audio codes when
	# filtering the model output.
	_SNAC_TOKEN_MIN = 128266
	_SNAC_TOKEN_MAX = 156937
	# Every audio token is mapped to a code via (token - _SNAC_TOKEN_MIN) % this.
	_SNAC_CODEBOOK_SIZE = 4096
	# Number of consecutive audio tokens consumed per frame.
	_SNAC_FRAME_SIZE = 7
	# Sample rate handed to soundfile when the waveform is written.
	_SNAC_SAMPLE_RATE = 24000


	def _unpack_snac_frames(snac_tokens):
	    """Split a flat list of audio token ids into three per-level code lists.

	    Tokens are consumed in groups of ``_SNAC_FRAME_SIZE``; a trailing
	    partial group is ignored. Inside each group, position 0 goes to the
	    first list, positions 1 and 4 to the second, and positions 2, 3, 5, 6
	    to the third.

	    Args:
	        snac_tokens: list of int token ids, already filtered to the audio
	            token range.

	    Returns:
	        A list of three lists of ints (the codes for each level).
	    """
	    codes = [[], [], []]
	    frame_count = len(snac_tokens) // _SNAC_FRAME_SIZE

	    for i in range(frame_count):
	        start = i * _SNAC_FRAME_SIZE
	        frame = [
	            (token - _SNAC_TOKEN_MIN) % _SNAC_CODEBOOK_SIZE
	            for token in snac_tokens[start:start + _SNAC_FRAME_SIZE]
	        ]
	        codes[0].append(frame[0])
	        codes[1].extend([frame[1], frame[4]])
	        codes[2].extend([frame[2], frame[3], frame[5], frame[6]])

	    return codes


	@spaces.GPU(duration=180)
	def generate_speech(text, voice_description):
	    """Generate speech from text using Maya1 model with ZeroGPU.

	    Args:
	        text: the text to speak. Empty, whitespace-only, or ``None`` input
	            is rejected.
	        voice_description: free-form description of the voice. When empty,
	            whitespace-only, or ``None``, a neutral default is used.

	    Returns:
	        Path of a temporary ``.wav`` file holding the generated audio. The
	        file is created with ``delete=False`` and is not removed here.

	    Raises:
	        gr.Error: if ``text`` is empty, if fewer than one full frame of
	            audio tokens was generated, or if any other exception occurs
	            during generation or decoding (wrapped with its message).
	    """
	    if not text or not text.strip():
	        raise gr.Error("Please enter some text to convert to speech!")

	    if not voice_description or not voice_description.strip():
	        voice_description = "Realistic voice with neutral tone and conversational pacing."

	    try:
	        # Move models to GPU when one is available
	        device = "cuda" if torch.cuda.is_available() else "cpu"
	        model.to(device)
	        snac_model.to(device)

	        # Create prompt
	        prompt = f'<description="{voice_description}"> {text}'

	        # Tokenize input
	        inputs = tokenizer(prompt, return_tensors="pt").to(device)

	        # Generate
	        # NOTE(review): eos_token_id is explicitly None, so stopping
	        # presumably relies on max_new_tokens (or the model's own
	        # generation config) - confirm whether an end-of-speech token id
	        # should be passed instead.
	        with torch.inference_mode():
	            outputs = model.generate(
	                input_ids=inputs['input_ids'],
	                attention_mask=inputs.get('attention_mask', None),
	                max_new_tokens=1000,
	                temperature=0.7,
	                top_p=0.9,
	                do_sample=True,
	                pad_token_id=tokenizer.pad_token_id,
	                eos_token_id=None,
	            )

	        # Extract SNAC audio tokens: drop the prompt, then keep only ids in
	        # the audio range. One .tolist() avoids a per-element tensor
	        # comparison and .item() call.
	        prompt_length = inputs['input_ids'].shape[1]
	        generated_ids = outputs[0, prompt_length:].tolist()
	        snac_tokens = [
	            token for token in generated_ids
	            if _SNAC_TOKEN_MIN <= token <= _SNAC_TOKEN_MAX
	        ]

	        if len(snac_tokens) < _SNAC_FRAME_SIZE:
	            raise gr.Error(
	                f"Not enough audio tokens generated ({len(snac_tokens)}). "
	                f"Try using longer text or different voice description."
	            )

	        # Decode SNAC tokens to audio frames
	        codes = _unpack_snac_frames(snac_tokens)

	        # Generate final audio with SNAC decoder; each level becomes a
	        # tensor with a leading dimension of size 1.
	        codes_tensor = [
	            torch.tensor(c, dtype=torch.long, device=device).unsqueeze(0)
	            for c in codes
	        ]

	        with torch.inference_mode():
	            audio = snac_model.decoder(
	                snac_model.quantizer.from_codes(codes_tensor)
	            )[0, 0].cpu().numpy()

	        # Save to temporary file. Only the name is reserved inside the
	        # `with`; the handle is closed before soundfile opens the path.
	        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
	            wav_path = f.name
	        sf.write(wav_path, audio, _SNAC_SAMPLE_RATE)
	        return wav_path

	    except gr.Error:
	        # Deliberate user-facing errors raised above must reach the UI
	        # unchanged instead of being re-wrapped by the handler below.
	        raise
	    except Exception as e:
	        import traceback
	        traceback.print_exc()
	        raise gr.Error(f"Error generating speech: {str(e)}") from e

	# Example prompts offered in the UI. Each row is
	# [text to speak, voice description], matching the two input boxes.
	examples = [
	    [spoken_text, voice_style]
	    for spoken_text, voice_style in (
	        (
	            "Hello! This is Maya1 <laugh> the best open source voice AI model with emotions.",
	            "Realistic male voice in the 30s age with american accent. Normal pitch, warm timbre, conversational pacing.",
	        ),
	        (
	            "I'm so excited to share this amazing news with you! This is incredible and wonderful!",
	            "Energetic female voice with enthusiastic tone. Higher pitch, bright timbre, upbeat pacing.",
	        ),
	        (
	            "In a world of constant change, one thing remains certain: the power of human connection and understanding.",
	            "Deep male voice with authoritative tone. Low pitch, resonant timbre, steady pacing.",
	        ),
	    )
	]

	# Build the Gradio UI: header, two-column input/output row, event wiring,
	# and clickable example prompts.
	with gr.Blocks(title="Maya1 Text-to-Speech", theme=gr.themes.Soft()) as demo:
	    # Page header
	    gr.Markdown(
	        """
	# 🎙️ Maya1 Text-to-Speech
	Generate emotional and realistic speech with natural language voice design

	Built with [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
	"""
	    )

	    # NOTE(review): the nesting below was reconstructed from a source whose
	    # indentation had been flattened; in particular, placing the tips
	    # markdown inside the right-hand column is an assumption - confirm.
	    with gr.Row():
	        # Left column: the two text inputs and the trigger button
	        with gr.Column():
	            text_input = gr.Textbox(
	                value="Hello! This is Maya1 <laugh> the best open source voice AI model with emotions.",
	                label="Text to Speak",
	                lines=5,
	                placeholder="Enter your text here... You can use <laugh>, <sigh>, and other emotion tags!",
	            )

	            voice_description = gr.Textbox(
	                value="Realistic male voice in the 30s age with american accent. Normal pitch, warm timbre, conversational pacing.",
	                label="Voice Description",
	                lines=3,
	                placeholder="Describe the voice characteristics (age, gender, accent, pitch, timbre, pacing)...",
	            )

	            generate_btn = gr.Button(
	                "🎤 Generate Speech",
	                size="lg",
	                variant="primary",
	            )

	        # Right column: audio player fed by a file path, plus usage notes
	        with gr.Column():
	            audio_output = gr.Audio(type="filepath", label="Generated Speech")

	            gr.Markdown("""
	### 💡 Tips
	- Use emotion tags: `<laugh>`, `<sigh>`, `<whisper>`, `<shout>`
	- Describe voice with: age, gender, accent, pitch, timbre, pacing
	- Longer text works better (20+ words recommended)

	### About
	Maya1 is an open-source voice AI model that generates realistic, emotional speech from text using natural language voice descriptions.
	""")

	    # Clicking the button runs generate_speech on both text boxes and
	    # sends the returned file path to the audio player.
	    generate_btn.click(
	        outputs=[audio_output],
	        inputs=[text_input, voice_description],
	        fn=generate_speech,
	    )

	    # Example prompts that fill the two inputs; results are not cached.
	    gr.Examples(
	        label="Example Prompts",
	        examples=examples,
	        fn=generate_speech,
	        inputs=[text_input, voice_description],
	        outputs=[audio_output],
	        cache_examples=False,
	    )

	# Start the app only when this file is executed as a script.
	if __name__ == "__main__":
	    demo.launch()