import gradio as gr
import numpy as np
import torch
import soundfile as sf
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
import spaces

# Define model path
model_path = "microsoft/Phi-4-multimodal-instruct"

# Load model and processor
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True,
    _attn_implementation="eager",
)

# Define prompt structure
user_prompt = '<|user|>'
assistant_prompt = '<|assistant|>'
prompt_suffix = '<|end|>'


# Define inference functions for each input type
@spaces.GPU
def process_image(image, question):
    if not image or not question:
        return "Please upload an image and provide a question."
    prompt = f'{user_prompt}<|image_1|>{question}{prompt_suffix}{assistant_prompt}'
    inputs = processor(text=prompt, images=image, return_tensors='pt').to(model.device)
    with torch.no_grad():
        generate_ids = model.generate(
            **inputs,
            max_new_tokens=200,
            num_logits_to_keep=0,
        )
    # Drop the prompt tokens so only the newly generated answer is decoded
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    response = processor.batch_decode(
        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    return response


@spaces.GPU
def process_audio(audio, question):
    if not audio or not question:
        return "Please upload an audio file and provide a question."
    prompt = f'{user_prompt}<|audio_1|>{question}{prompt_suffix}{assistant_prompt}'
    samplerate, audio_data = audio  # Gradio Audio returns (samplerate, data)
    # Gradio delivers integer PCM samples; convert to mono float32 before feature extraction
    if np.issubdtype(audio_data.dtype, np.integer):
        audio_data = audio_data.astype(np.float32) / np.iinfo(audio_data.dtype).max
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)
    inputs = processor(text=prompt, audios=[(audio_data, samplerate)], return_tensors='pt').to(model.device)
    with torch.no_grad():
        generate_ids = model.generate(
            **inputs,
            max_new_tokens=200,
            num_logits_to_keep=0,
        )
    # Drop the prompt tokens so only the newly generated answer is decoded
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    response = processor.batch_decode(
        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    return response


# Gradio interface
with gr.Blocks(
    title="Phi-4 Multimodal Demo",
    theme=gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="gray",
        radius_size="lg",
    ),
) as demo:
    gr.Markdown(
        """
# Phi-4 Multimodal Demo
Select a tab below to upload an **image** or **audio** file, ask a question, and get a response from the model!
Built with Microsoft's `microsoft/Phi-4-multimodal-instruct` model.
        """
    )
    with gr.Tabs():
        # Image Tab
        with gr.TabItem("Image"):
            with gr.Row():
                with gr.Column(scale=1):
                    image_input = gr.Image(label="Upload Your Image", type="pil")
                    image_question = gr.Textbox(
                        label="Your Question",
                        placeholder="e.g., 'What is shown in this image?'",
                        lines=2,
                    )
                    image_submit = gr.Button("Submit", variant="primary")
                with gr.Column(scale=2):
                    image_output = gr.Textbox(
                        label="Model Response",
                        placeholder="Response will appear here...",
                        lines=10,
                        interactive=False,
                    )
            image_submit.click(
                fn=process_image,
                inputs=[image_input, image_question],
                outputs=image_output,
            )

        # Audio Tab
        with gr.TabItem("Audio"):
            with gr.Row():
                with gr.Column(scale=1):
                    audio_input = gr.Audio(label="Upload Your Audio", type="numpy")
                    audio_question = gr.Textbox(
                        label="Your Question",
                        placeholder="e.g., 'Transcribe this audio.'",
                        lines=2,
                    )
                    audio_submit = gr.Button("Submit", variant="primary")
                with gr.Column(scale=2):
                    audio_output = gr.Textbox(
                        label="Model Response",
                        placeholder="Response will appear here...",
                        lines=10,
                        interactive=False,
                    )
            audio_submit.click(
                fn=process_audio,
                inputs=[audio_input, audio_question],
                outputs=audio_output,
            )

# Launch the demo
demo.launch()