# phi4-multimodal / app.py
import gradio as gr
from PIL import Image
import torch
import soundfile as sf
from transformers import AutoModelForCausalLM, AutoProcessor
import spaces
# Define model path
model_path = "microsoft/Phi-4-multimodal-instruct"
# Load model and processor
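# (trust_remote_code=True is required: the checkpoint ships custom processor
# and model code on the Hub that transformers loads at runtime)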
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True,
    _attn_implementation="eager",
)
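# Note: the model card also supports _attn_implementation="flash_attention_2"
# when flash-attn is installed and the GPU supports it (assumption: varies by
# environment); "eager" is the dependency-free fallback used here.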
# Define prompt structure
user_prompt = '<|user|>'
assistant_prompt = '<|assistant|>'
prompt_suffix = '<|end|>'
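# For reference, a fully assembled image prompt looks like:
#   '<|user|><|image_1|>What is shown in this image?<|end|><|assistant|>'
# and an audio prompt swaps in the <|audio_1|> placeholder.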
# Define inference function
@spaces.GPU
def process_input(input_type, file, question):
    if not file or not question:
        return "Please upload a file and provide a question.", None, None

    # Prepare the prompt with the matching media placeholder token
    if input_type == "Image":
        prompt = f'{user_prompt}<|image_1|>{question}{prompt_suffix}{assistant_prompt}'
        # Open image from the uploaded file path
        image = Image.open(file)
        inputs = processor(text=prompt, images=image, return_tensors='pt').to(model.device)
        image_out, audio_out = image, None  # echo the image back for the preview
    elif input_type == "Audio":
        prompt = f'{user_prompt}<|audio_1|>{question}{prompt_suffix}{assistant_prompt}'
        # Read audio from the uploaded file path
        audio, samplerate = sf.read(file)
        inputs = processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to(model.device)
        image_out, audio_out = None, (samplerate, audio)  # (samplerate, data) is Gradio's audio format
    else:
        return "Invalid input type selected.", None, None

    # Generate response
    with torch.no_grad():
        generate_ids = model.generate(
            **inputs,
            max_new_tokens=200,
            num_logits_to_keep=0,
        )
    # Strip the prompt tokens so only the newly generated text is decoded
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    response = processor.batch_decode(
        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    return response, image_out, audio_out
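# Quick smoke test outside Gradio (a sketch; 'sample.jpg' is a placeholder —
# point it at a real local image before uncommenting):
# print(process_input("Image", "sample.jpg", "What is shown in this image?")[0])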
# Gradio interface
with gr.Blocks(
    title="Phi-4 Multimodal Demo",
    theme=gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="gray",
        radius_size="lg",
    ),
) as demo:
    gr.Markdown(
        """
        # Phi-4 Multimodal Demo
        Upload an **image** or **audio** file, ask a question, and get a response from the model!
        Built with the `microsoft/Phi-4-multimodal-instruct` model by Microsoft.
        """
    )
    with gr.Row():
        with gr.Column(scale=1):
            input_type = gr.Radio(
                choices=["Image", "Audio"],
                label="Select Input Type",
                value="Image",
            )
            file_input = gr.File(
                label="Upload Your File",
                file_types=["image", "audio"],
            )
            question_input = gr.Textbox(
                label="Your Question",
                placeholder="e.g., 'What is shown in this image?' or 'Transcribe this audio.'",
                lines=2,
            )
            submit_btn = gr.Button("Submit", variant="primary")
        with gr.Column(scale=2):
            with gr.Tab("Preview"):
                image_preview = gr.Image(label="Uploaded Image", visible=True)  # Shown for the default "Image" type
                audio_preview = gr.Audio(label="Uploaded Audio", visible=False)  # Hidden until "Audio" is selected
            with gr.Tab("Response"):
                output_text = gr.Textbox(
                    label="Model Response",
                    placeholder="Response will appear here...",
                    lines=10,
                    interactive=False,
                )
    # Dynamically toggle preview visibility based on the selected input type
    def update_media_visibility(input_type):
        if input_type == "Image":
            return gr.update(visible=True), gr.update(visible=False)
        elif input_type == "Audio":
            return gr.update(visible=False), gr.update(visible=True)
        return gr.update(visible=False), gr.update(visible=False)

    input_type.change(
        fn=update_media_visibility,
        inputs=input_type,
        outputs=[image_preview, audio_preview],
    )
    # Connect the submit button; process_input returns (text, image, audio)
    submit_btn.click(
        fn=process_input,
        inputs=[input_type, file_input, question_input],
        outputs=[output_text, image_preview, audio_preview],
    )
    # Example section
    with gr.Accordion("Examples", open=False):
        gr.Markdown("Try these examples:")
        gr.Examples(
            examples=[
                ["Image", "https://www.ilankelman.org/stopsigns/australia.jpg", "What is shown in this image?"],
                ["Audio", "https://upload.wikimedia.org/wikipedia/commons/b/b0/Barbara_Sahakian_BBC_Radio4_The_Life_Scientific_29_May_2012_b01j5j24.flac", "Transcribe the audio to text."],
            ],
            inputs=[input_type, file_input, question_input],
            outputs=[output_text, image_preview, audio_preview],
            fn=process_input,
            cache_examples=False,
        )
# Launch the demo
demo.launch()
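# launch() also accepts optional flags — e.g. share=True for a temporary public
# URL, or server_name="0.0.0.0" to listen on all interfaces (a sketch; neither
# is needed when running on Hugging Face Spaces):
# demo.launch(share=True)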