import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoProcessor
import spaces  # ZeroGPU helper: provides the @spaces.GPU decorator
# Define model path
model_path = "microsoft/Phi-4-multimodal-instruct"
# Load model and processor
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True,
    _attn_implementation="eager",
)
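# Note: "eager" attention works on any hardware; the model card also supports
# _attn_implementation="flash_attention_2" where flash-attn is installed.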
# Define prompt structure
user_prompt = '<|user|>'
assistant_prompt = '<|assistant|>'
prompt_suffix = '<|end|>'
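# Phi-4's chat format wraps a turn as <|user|>...<|end|><|assistant|>, with media
# referenced by positional placeholders such as <|image_1|> or <|audio_1|> in the text.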
# Define inference functions for each input type
@spaces.GPU  # request a ZeroGPU slot for the duration of the call
def process_image(image, question):
    if not image or not question:
        return "Please upload an image and provide a question."
    prompt = f'{user_prompt}<|image_1|>{question}{prompt_suffix}{assistant_prompt}'
    inputs = processor(text=prompt, images=image, return_tensors='pt').to(model.device)
    with torch.no_grad():
        generate_ids = model.generate(
            **inputs,
            max_new_tokens=200,
            num_logits_to_keep=0,
        )
    # Drop the prompt tokens so only the newly generated text is decoded
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    response = processor.batch_decode(
        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    return response
@spaces.GPU  # request a ZeroGPU slot for the duration of the call
def process_audio(audio, question):
    if not audio or not question:
        return "Please upload an audio file and provide a question."
    prompt = f'{user_prompt}<|audio_1|>{question}{prompt_suffix}{assistant_prompt}'
    samplerate, audio_data = audio  # Gradio's numpy Audio component returns (samplerate, data)
    inputs = processor(text=prompt, audios=[(audio_data, samplerate)], return_tensors='pt').to(model.device)
    with torch.no_grad():
        generate_ids = model.generate(
            **inputs,
            max_new_tokens=200,
            num_logits_to_keep=0,
        )
    # Drop the prompt tokens so only the newly generated text is decoded
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    response = processor.batch_decode(
        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    return response
# Gradio interface
with gr.Blocks(
    title="Phi-4 Multimodal Demo",
    theme=gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="gray",
        radius_size="lg",
    ),
) as demo:
    gr.Markdown(
        """
        # Phi-4 Multimodal Demo
        Select a tab below to upload an **image** or **audio** file, ask a question, and get a response from the model!
        Built with Microsoft's `microsoft/Phi-4-multimodal-instruct` model.
        """
    )
    with gr.Tabs():
        # Image Tab
        with gr.TabItem("Image"):
            with gr.Row():
                with gr.Column(scale=1):
                    image_input = gr.Image(label="Upload Your Image", type="pil")
                    image_question = gr.Textbox(
                        label="Your Question",
                        placeholder="e.g., 'What is shown in this image?'",
                        lines=2,
                    )
                    image_submit = gr.Button("Submit", variant="primary")
                with gr.Column(scale=2):
                    image_output = gr.Textbox(
                        label="Model Response",
                        placeholder="Response will appear here...",
                        lines=10,
                        interactive=False,
                    )
            image_submit.click(
                fn=process_image,
                inputs=[image_input, image_question],
                outputs=image_output,
            )
        # Audio Tab
        with gr.TabItem("Audio"):
            with gr.Row():
                with gr.Column(scale=1):
                    audio_input = gr.Audio(label="Upload Your Audio", type="numpy")
                    audio_question = gr.Textbox(
                        label="Your Question",
                        placeholder="e.g., 'Transcribe this audio.'",
                        lines=2,
                    )
                    audio_submit = gr.Button("Submit", variant="primary")
                with gr.Column(scale=2):
                    audio_output = gr.Textbox(
                        label="Model Response",
                        placeholder="Response will appear here...",
                        lines=10,
                        interactive=False,
                    )
            audio_submit.click(
                fn=process_audio,
                inputs=[audio_input, audio_question],
                outputs=audio_output,
            )
# Launch the demo
demo.launch()
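# Note: on a busy Space, demo.queue().launch() serializes concurrent GPU requests;
# plain launch() is sufficient for light traffic.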