# phi4-multimodal / app.py
import gradio as gr
from PIL import Image
import torch
import soundfile as sf
from transformers import AutoModelForCausalLM, AutoProcessor
import spaces
# Define model path
model_path = "microsoft/Phi-4-multimodal-instruct"
# Load model and processor
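# (trust_remote_code=True is required: the checkpoint ships custom processor
# and model code on the Hub that transformers loads at runtime)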
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True,
    _attn_implementation="eager",
)
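# Note: the model card also supports _attn_implementation="flash_attention_2"
# when flash-attn is installed and the GPU supports it (assumption: varies by
# environment); "eager" is the dependency-free fallback used here.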
# Define prompt structure
user_prompt = '<|user|>'
assistant_prompt = '<|assistant|>'
prompt_suffix = '<|end|>'
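# For reference, a fully assembled image prompt looks like:
#   '<|user|><|image_1|>What is shown in this image?<|end|><|assistant|>'
# and an audio prompt swaps in the <|audio_1|> placeholder.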
# Define inference function
@spaces.GPU
def process_input(input_type, file, question):
    if not file or not question:
        return "Please upload a file and provide a question.", None, None

    # Prepare the prompt with the matching media placeholder token
    if input_type == "Image":
        prompt = f'{user_prompt}<|image_1|>{question}{prompt_suffix}{assistant_prompt}'
        # Open image from the uploaded file path
        image = Image.open(file)
        inputs = processor(text=prompt, images=image, return_tensors='pt').to(model.device)
        image_out, audio_out = image, None  # echo the image back for the preview
    elif input_type == "Audio":
        prompt = f'{user_prompt}<|audio_1|>{question}{prompt_suffix}{assistant_prompt}'
        # Read audio from the uploaded file path
        audio, samplerate = sf.read(file)
        inputs = processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to(model.device)
        image_out, audio_out = None, (samplerate, audio)  # (samplerate, data) is Gradio's audio format
    else:
        return "Invalid input type selected.", None, None

    # Generate response
    with torch.no_grad():
        generate_ids = model.generate(
            **inputs,
            max_new_tokens=200,
            num_logits_to_keep=0,
        )
    # Strip the prompt tokens so only the newly generated text is decoded
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    response = processor.batch_decode(
        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    return response, image_out, audio_out
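# Quick smoke test outside Gradio (a sketch; 'sample.jpg' is a placeholder —
# point it at a real local image before uncommenting):
# print(process_input("Image", "sample.jpg", "What is shown in this image?")[0])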
# Gradio interface
with gr.Blocks(
    title="Phi-4 Multimodal Demo",
    theme=gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="gray",
        radius_size="lg",
    ),
) as demo:
    gr.Markdown(
        """
        # Phi-4 Multimodal Demo
        Upload an **image** or **audio** file, ask a question, and get a response from the model!
        Built with the `microsoft/Phi-4-multimodal-instruct` model by Microsoft.
        """
    )
    with gr.Row():
        with gr.Column(scale=1):
            input_type = gr.Radio(
                choices=["Image", "Audio"],
                label="Select Input Type",
                value="Image",
            )
            file_input = gr.File(
                label="Upload Your File",
                file_types=["image", "audio"],
            )
            question_input = gr.Textbox(
                label="Your Question",
                placeholder="e.g., 'What is shown in this image?' or 'Transcribe this audio.'",
                lines=2,
            )
            submit_btn = gr.Button("Submit", variant="primary")
        with gr.Column(scale=2):
            with gr.Tab("Preview"):
                image_preview = gr.Image(label="Uploaded Image", visible=True)  # Shown for the default "Image" type
                audio_preview = gr.Audio(label="Uploaded Audio", visible=False)  # Hidden until "Audio" is selected
            with gr.Tab("Response"):
                output_text = gr.Textbox(
                    label="Model Response",
                    placeholder="Response will appear here...",
                    lines=10,
                    interactive=False,
                )
    # Dynamically toggle preview visibility based on the selected input type
    def update_media_visibility(input_type):
        if input_type == "Image":
            return gr.update(visible=True), gr.update(visible=False)
        elif input_type == "Audio":
            return gr.update(visible=False), gr.update(visible=True)
        return gr.update(visible=False), gr.update(visible=False)

    input_type.change(
        fn=update_media_visibility,
        inputs=input_type,
        outputs=[image_preview, audio_preview],
    )
    # Connect the submit button; process_input returns (text, image, audio)
    submit_btn.click(
        fn=process_input,
        inputs=[input_type, file_input, question_input],
        outputs=[output_text, image_preview, audio_preview],
    )
    # Example section
    with gr.Accordion("Examples", open=False):
        gr.Markdown("Try these examples:")
        gr.Examples(
            examples=[
                ["Image", "https://www.ilankelman.org/stopsigns/australia.jpg", "What is shown in this image?"],
                ["Audio", "https://upload.wikimedia.org/wikipedia/commons/b/b0/Barbara_Sahakian_BBC_Radio4_The_Life_Scientific_29_May_2012_b01j5j24.flac", "Transcribe the audio to text."],
            ],
            inputs=[input_type, file_input, question_input],
            outputs=[output_text, image_preview, audio_preview],
            fn=process_input,
            cache_examples=False,
        )
# Launch the demo
demo.launch()
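# launch() also accepts optional flags — e.g. share=True for a temporary public
# URL, or server_name="0.0.0.0" to listen on all interfaces (a sketch; neither
# is needed when running on Hugging Face Spaces):
# demo.launch(share=True)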