Spaces: Running on Zero
File size: 4,713 Bytes
import gradio as gr
from PIL import Image
import torch
import soundfile as sf
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
import spaces
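# `spaces` provides the @spaces.GPU decorator used below; on Hugging Face ZeroGPU
# hardware it requests a GPU for the duration of each decorated call.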
# Define model path
model_path = "microsoft/Phi-4-multimodal-instruct"
# Load model and processor
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True,
    _attn_implementation="eager",
)
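# Note: _attn_implementation="eager" avoids a flash-attention dependency;
# flash_attention_2 is also an option on supported GPUs (assumption based on the model card).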
# Define prompt structure
user_prompt = '<|user|>'
assistant_prompt = '<|assistant|>'
prompt_suffix = '<|end|>'
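# An assembled prompt for the image task looks like, e.g.:
#   '<|user|><|image_1|>What is shown in this image?<|end|><|assistant|>'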
# Define inference functions for each input type
@spaces.GPU
def process_image(image, question):
    if not image or not question:
        return "Please upload an image and provide a question."
    prompt = f'{user_prompt}<|image_1|>{question}{prompt_suffix}{assistant_prompt}'
    inputs = processor(text=prompt, images=image, return_tensors='pt').to(model.device)
    with torch.no_grad():
        generate_ids = model.generate(
            **inputs,
            max_new_tokens=200,
            num_logits_to_keep=0,
        )
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    response = processor.batch_decode(
        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    return response
@spaces.GPU
def process_audio(audio, question):
    if not audio or not question:
        return "Please upload an audio file and provide a question."
    prompt = f'{user_prompt}<|audio_1|>{question}{prompt_suffix}{assistant_prompt}'
    samplerate, audio_data = audio  # Gradio Audio returns (samplerate, data)
    inputs = processor(text=prompt, audios=[(audio_data, samplerate)], return_tensors='pt').to(model.device)
    with torch.no_grad():
        generate_ids = model.generate(
            **inputs,
            max_new_tokens=200,
            num_logits_to_keep=0,
        )
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    response = processor.batch_decode(
        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    return response
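# Note: gr.Audio(type="numpy") typically yields int16 samples; if the processor's audio
# feature extractor expects float input, a conversion may be needed first
# (an assumption, e.g. audio_data.astype("float32") / 32768.0).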
# Gradio interface
with gr.Blocks(
    title="Phi-4 Multimodal Demo",
    theme=gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="gray",
        radius_size="lg",
    ),
) as demo:
    gr.Markdown(
        """
        # Phi-4 Multimodal Demo
        Select a tab below to upload an **image** or **audio** file, ask a question, and get a response from the model!
        Built with the `microsoft/Phi-4-multimodal-instruct` model by Microsoft.
        """
    )
    with gr.Tabs():
        # Image Tab
        with gr.TabItem("Image"):
            with gr.Row():
                with gr.Column(scale=1):
                    image_input = gr.Image(label="Upload Your Image", type="pil")
                    image_question = gr.Textbox(
                        label="Your Question",
                        placeholder="e.g., 'What is shown in this image?'",
                        lines=2,
                    )
                    image_submit = gr.Button("Submit", variant="primary")
                with gr.Column(scale=2):
                    image_output = gr.Textbox(
                        label="Model Response",
                        placeholder="Response will appear here...",
                        lines=10,
                        interactive=False,
                    )
            image_submit.click(
                fn=process_image,
                inputs=[image_input, image_question],
                outputs=image_output,
            )
        # Audio Tab
        with gr.TabItem("Audio"):
            with gr.Row():
                with gr.Column(scale=1):
                    audio_input = gr.Audio(label="Upload Your Audio", type="numpy")
                    audio_question = gr.Textbox(
                        label="Your Question",
                        placeholder="e.g., 'Transcribe this audio.'",
                        lines=2,
                    )
                    audio_submit = gr.Button("Submit", variant="primary")
                with gr.Column(scale=2):
                    audio_output = gr.Textbox(
                        label="Model Response",
                        placeholder="Response will appear here...",
                        lines=10,
                        interactive=False,
                    )
            audio_submit.click(
                fn=process_audio,
                inputs=[audio_input, audio_question],
                outputs=audio_output,
            )
# Launch the demo
demo.launch()
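# To run locally (assuming gradio, spaces, torch, transformers, pillow, and soundfile
# are installed): `python app.py`. Outside a ZeroGPU Space the @spaces.GPU decorator
# is expected to have no effect (assumption).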