"""Gradio demo for Maya1 text-to-speech running on Hugging Face ZeroGPU.

The script loads the Maya1 causal language model and the SNAC 24 kHz audio
decoder at import time, exposes ``generate_speech`` as the GPU-backed
handler, and builds a small Blocks UI around it.
"""

import tempfile
import traceback

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from snac import SNAC
import soundfile as sf
import numpy as np
import spaces

MODEL_ID = "maya-research/maya1"
SNAC_MODEL_ID = "hubertsiuzdak/snac_24khz"

# Inclusive range of token ids that are treated as SNAC audio codes.
SNAC_TOKEN_MIN = 128266
SNAC_TOKEN_MAX = 156937
# Each audio token is reduced to a codebook index with ``(id - min) % size``.
SNAC_CODEBOOK_SIZE = 4096
# Audio tokens are consumed in groups of seven; trailing leftovers are dropped.
SNAC_TOKENS_PER_FRAME = 7
# Sample rate passed to soundfile when writing the decoded waveform.
SAMPLE_RATE = 24000

DEFAULT_VOICE_DESCRIPTION = (
    "Realistic voice with neutral tone and conversational pacing."
)

# Load models at startup
print("Loading Maya1 model...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Loading SNAC audio decoder...")
snac_model = SNAC.from_pretrained(SNAC_MODEL_ID).eval()


def _unpack_snac_frames(snac_tokens):
    """Split a flat list of SNAC token ids into three code lists.

    Tokens are read seven at a time. Within each group, position 0 goes to
    the first list, positions 1 and 4 to the second, and positions 2, 3, 5
    and 6 to the third. Every id is mapped to a codebook index with
    ``(id - SNAC_TOKEN_MIN) % SNAC_CODEBOOK_SIZE``. Tokens beyond the last
    complete group of seven are ignored.

    Args:
        snac_tokens: Token ids already filtered to the SNAC id range.

    Returns:
        A list of three lists of ints, one per code level.
    """
    level_0, level_1, level_2 = [], [], []
    usable = (len(snac_tokens) // SNAC_TOKENS_PER_FRAME) * SNAC_TOKENS_PER_FRAME
    for start in range(0, usable, SNAC_TOKENS_PER_FRAME):
        frame = [
            (token - SNAC_TOKEN_MIN) % SNAC_CODEBOOK_SIZE
            for token in snac_tokens[start:start + SNAC_TOKENS_PER_FRAME]
        ]
        level_0.append(frame[0])
        level_1.extend([frame[1], frame[4]])
        level_2.extend([frame[2], frame[3], frame[5], frame[6]])
    return [level_0, level_1, level_2]


@spaces.GPU(duration=180)
def generate_speech(text, voice_description):
    """Generate speech from text using the Maya1 model with ZeroGPU.

    Args:
        text: The text to speak. Must contain non-whitespace characters.
        voice_description: Natural-language description of the voice. When
            empty, ``DEFAULT_VOICE_DESCRIPTION`` is used instead.

    Returns:
        Path to a temporary ``.wav`` file containing the generated audio.

    Raises:
        gr.Error: If ``text`` is empty, if fewer than seven audio tokens were
            generated, or if any other failure occurs during generation
            (the original exception is chained as the cause).
    """
    if not text or not text.strip():
        raise gr.Error("Please enter some text to convert to speech!")

    if not voice_description or not voice_description.strip():
        voice_description = DEFAULT_VOICE_DESCRIPTION

    try:
        # Move models to GPU
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model.to(device)
        snac_model.to(device)

        # Create prompt. The voice description must be part of the prompt,
        # otherwise the "Voice Description" input has no effect on the output.
        prompt = f'<description="{voice_description}"> {text}'

        # Tokenize input
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        prompt_length = inputs["input_ids"].shape[1]

        # Generate
        # NOTE(review): eos_token_id=None is kept from the original code;
        # confirm whether generation should stop at an end-of-speech token.
        with torch.inference_mode():
            outputs = model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs.get("attention_mask", None),
                max_new_tokens=1000,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=None,
            )

        # Extract SNAC audio tokens. One bulk .tolist() avoids a device
        # round-trip per generated token.
        generated_ids = outputs[0, prompt_length:].tolist()
        snac_tokens = [
            token_id
            for token_id in generated_ids
            if SNAC_TOKEN_MIN <= token_id <= SNAC_TOKEN_MAX
        ]

        if len(snac_tokens) < SNAC_TOKENS_PER_FRAME:
            raise gr.Error(
                f"Not enough audio tokens generated ({len(snac_tokens)}). "
                f"Try using longer text or different voice description."
            )

        # Decode SNAC tokens to audio frames
        codes = _unpack_snac_frames(snac_tokens)

        # Generate final audio with SNAC decoder
        codes_tensor = [
            torch.tensor(level, dtype=torch.long, device=device).unsqueeze(0)
            for level in codes
        ]

        with torch.inference_mode():
            latents = snac_model.quantizer.from_codes(codes_tensor)
            audio = snac_model.decoder(latents)[0, 0].cpu().numpy()

        # Reserve a temporary path, then write after the handle is closed so
        # the file is not opened twice at the same time.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
            output_path = tmp.name
        sf.write(output_path, audio, SAMPLE_RATE)
        return output_path

    except gr.Error:
        # User-facing errors raised above are already well-formed; do not
        # wrap them in a second "Error generating speech" message.
        raise
    except Exception as e:
        traceback.print_exc()
        raise gr.Error(f"Error generating speech: {str(e)}") from e


# Examples
examples = [
    [
        "Hello! This is Maya1 the best open source voice AI model with emotions.",
        "Realistic male voice in the 30s age with american accent. Normal pitch, warm timbre, conversational pacing.",
    ],
    [
        "I'm so excited to share this amazing news with you! This is incredible and wonderful!",
        "Energetic female voice with enthusiastic tone. Higher pitch, bright timbre, upbeat pacing.",
    ],
    [
        "In a world of constant change, one thing remains certain: the power of human connection and understanding.",
        "Deep male voice with authoritative tone. Low pitch, resonant timbre, steady pacing.",
    ],
]

# Create Gradio interface
with gr.Blocks(theme=gr.themes.Soft(), title="Maya1 Text-to-Speech") as demo:
    gr.Markdown(
        """
        # 🎙️ Maya1 Text-to-Speech
        Generate emotional and realistic speech with natural language voice design

        Built with [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
        """
    )

    with gr.Row():
        with gr.Column():
            # NOTE(review): the emotion tag names in the placeholder and in
            # the tips below were restored after being stripped from the
            # source; confirm them against the Maya1 model card.
            text_input = gr.Textbox(
                label="Text to Speak",
                placeholder="Enter your text here... You can use <laugh>, <cry>, and other emotion tags!",
                lines=5,
                value="Hello! This is Maya1 the best open source voice AI model with emotions.",
            )
            voice_description = gr.Textbox(
                label="Voice Description",
                placeholder="Describe the voice characteristics (age, gender, accent, pitch, timbre, pacing)...",
                lines=3,
                value="Realistic male voice in the 30s age with american accent. Normal pitch, warm timbre, conversational pacing.",
            )
            generate_btn = gr.Button("🎤 Generate Speech", variant="primary", size="lg")

        with gr.Column():
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath",
            )

    gr.Markdown("""
    ### 💡 Tips
    - Use emotion tags: `<laugh>`, `<sigh>`, `<whisper>`, `<cry>`
    - Describe voice with: age, gender, accent, pitch, timbre, pacing
    - Longer text works better (20+ words recommended)

    ### About
    Maya1 is an open-source voice AI model that generates realistic, emotional speech from text using natural language voice descriptions.
    """)

    # Generate speech button
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, voice_description],
        outputs=[audio_output],
    )

    # Examples section
    gr.Examples(
        examples=examples,
        inputs=[text_input, voice_description],
        outputs=[audio_output],
        fn=generate_speech,
        cache_examples=False,
        label="Example Prompts",
    )

if __name__ == "__main__":
    demo.launch()