"""Gradio demo that runs spoken-audio instructions through Phi-4 multimodal.

The module loads the Phi-4 multimodal processor and model once at import
time, activates the ``speech`` LoRA adapter, and exposes a small Gradio UI
where a user uploads an audio file, edits an instruction, and reads the
model's text response.
"""

# NOTE: `io` and `urlopen` are not used anywhere in this file; they are kept
# so that this edit does not drop a file-level import.
import io
from urllib.request import urlopen

# NOTE(review): `spaces` is deliberately kept ahead of `torch`, as in the
# original import order. ZeroGPU Spaces are understood to require `spaces`
# to be imported before anything initialises CUDA -- confirm before letting
# an import sorter reorder this group.
import spaces
import gradio as gr
import soundfile as sf
import torch
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig

MODEL_ID = "microsoft/Phi-4-multimodal-instruct"

# Evaluated once and shared by the base model and the adapter so that both
# are always placed on the same device.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Default text shown in the instruction box. The `<sep>` token names the
# separator the model is asked to emit between transcript and translation;
# without it the sentence requests a separator but never says which one.
DEFAULT_INSTRUCTION = (
    "Transcribe the audio to text, and then translate the audio to French. "
    "Use <sep> as a separator between the original transcript and the translation."
)

processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map=DEVICE,
    torch_dtype="auto",
    trust_remote_code=True,
)

# Load the adapter stored in the `speech-lora` subfolder of the same
# repository and make it the active adapter.
model.load_adapter(
    MODEL_ID,
    adapter_name="speech",
    device_map=DEVICE,
    adapter_kwargs={"subfolder": "speech-lora"},
)
model.set_adapter("speech")

generation_config = GenerationConfig.from_pretrained(MODEL_ID)
# NOTE(review): presumably limits logits computation to the final position
# during generation -- confirm against the model's remote code.
generation_config.num_logits_to_keep = 1


def _build_prompt(instruction: str) -> str:
    """Wrap *instruction* in the chat markup for one user turn with one audio clip."""
    return f"<|user|><|audio_1|>{instruction}<|end|><|assistant|>"


@spaces.GPU
def run_phi4(audio_path: str, instruction: str) -> str:
    """Run a text instruction against an audio file and return the model's reply.

    Args:
        audio_path: Filesystem path of the audio file to process. A falsy
            value (``None`` or ``""``) means nothing was uploaded.
        instruction: Text instruction placed after the audio placeholder in
            the prompt.

    Returns:
        The decoded text generated by the model, or a short message asking
        for an upload when ``audio_path`` is falsy.
    """
    if not audio_path:
        return "Please upload an audio file."

    audio, samplerate = sf.read(audio_path)

    inputs = processor(
        text=_build_prompt(instruction),
        audios=[(audio, samplerate)],
        return_tensors="pt",
    ).to(model.device)

    output_ids = model.generate(
        **inputs,
        max_new_tokens=4096,
        generation_config=generation_config,
    )

    # The generated sequence starts with the prompt tokens; slice them off so
    # only the newly generated tokens are decoded.
    prompt_length = inputs["input_ids"].shape[1]
    completion_ids = output_ids[:, prompt_length:]

    decoded = processor.batch_decode(
        completion_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    # A single prompt was submitted, so the first entry is the whole answer.
    return decoded[0]


with gr.Blocks(title="Phi-4 Multimodal Audio Demo") as demo:
    gr.Markdown("# Phi-4 Multimodal (Audio) Demo")
    gr.Markdown("Upload an audio file and run instructions with Phi-4.")

    with gr.Row():
        # Left column: inputs and the trigger button.
        with gr.Column():
            audio_input = gr.Audio(type="filepath", label="Upload Audio")
            instruction = gr.Textbox(
                label="Instruction",
                value=DEFAULT_INSTRUCTION,
            )
            submit_btn = gr.Button("Run", variant="primary")
        # Right column: model output.
        with gr.Column():
            output_text = gr.Textbox(label="Model Response", lines=14)

    submit_btn.click(run_phi4, [audio_input, instruction], output_text)

if __name__ == "__main__":
    demo.queue().launch(share=False, ssr_mode=False)