import gradio as gr
from PIL import Image
import torch
import soundfile as sf
from transformers import AutoModelForCausalLM, AutoProcessor
import spaces

# Define model path
model_path = "microsoft/Phi-4-multimodal-instruct"

# Load model and processor
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True,
    _attn_implementation="eager",
)
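# Note: eager attention is the safe default here; on supported GPUs, flash_attention_2
# can be a drop-in speed-up (an optional tweak, not required for this demo).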

# Define prompt structure
user_prompt = '<|user|>'
assistant_prompt = '<|assistant|>'
prompt_suffix = '<|end|>'
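# Phi-4 chat format: "<|user|>{media placeholder + question}<|end|><|assistant|>".
# <|image_1|> / <|audio_1|> mark where the attached media is bound inside the user turn.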

# Define inference function
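# @spaces.GPU requests a GPU for the duration of each call when the app runs on Hugging Face Spaces (ZeroGPU).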
@spaces.GPU
def process_input(input_type, file, question):
    """Run Phi-4 multimodal inference on an uploaded image or audio file."""
    if not file or not question:
        return "Please upload a file and provide a question.", None, None

    # Prepare the prompt
    if input_type == "Image":
        prompt = f'{user_prompt}<|image_1|>{question}{prompt_suffix}{assistant_prompt}'
        # Open image from uploaded file
        image = Image.open(file)
        inputs = processor(text=prompt, images=image, return_tensors='pt').to(model.device)
        image_preview, audio_preview = image, None  # Show the image in the Preview tab
    elif input_type == "Audio":
        prompt = f'{user_prompt}<|audio_1|>{question}{prompt_suffix}{assistant_prompt}'
        # Read audio from uploaded file
        audio, samplerate = sf.read(file)
        inputs = processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to(model.device)
        image_preview, audio_preview = None, (samplerate, audio)  # Gradio expects audio as (samplerate, data)
    else:
        return "Invalid input type selected.", None, None

    # Generate response
    with torch.no_grad():
        generate_ids = model.generate(
            **inputs,
            max_new_tokens=200,
            num_logits_to_keep=0,
        )
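    # Drop the prompt tokens so only the newly generated text is decoded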
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    response = processor.batch_decode(
        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]

    return response, image_preview, audio_preview

# Gradio interface
with gr.Blocks(
    title="Phi-4 Multimodal Demo",
    theme=gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="gray",
        radius_size="lg",
    ),
) as demo:
    gr.Markdown(
        """
        # Phi-4 Multimodal Demo
        Upload an **image** or **audio** file, ask a question, and get a response from the model!  
        Built with the `microsoft/Phi-4-multimodal-instruct` model by Microsoft.
        """
    )
    
    with gr.Row():
        with gr.Column(scale=1):
            input_type = gr.Radio(
                choices=["Image", "Audio"],
                label="Select Input Type",
                value="Image",
            )
            file_input = gr.File(
                label="Upload Your File",
                file_types=["image", "audio"],
            )
            question_input = gr.Textbox(
                label="Your Question",
                placeholder="e.g., 'What is shown in this image?' or 'Transcribe this audio.'",
                lines=2,
            )
            submit_btn = gr.Button("Submit", variant="primary")
        
        with gr.Column(scale=2):
            with gr.Tab("Preview"):
                media_output = gr.Image(label="Uploaded Image", visible=True)   # Shown for image input
                audio_output = gr.Audio(label="Uploaded Audio", visible=False)  # Hidden until "Audio" is selected
            with gr.Tab("Response"):
                output_text = gr.Textbox(
                    label="Model Response",
                    placeholder="Response will appear here...",
                    lines=10,
                    interactive=False,
                )

    # Dynamically update media visibility based on input type
    def update_media_visibility(input_type):
        if input_type == "Image":
            return gr.update(visible=True), gr.update(visible=False)
        elif input_type == "Audio":
            return gr.update(visible=False), gr.update(visible=True)
        return gr.update(visible=False), gr.update(visible=False)

    input_type.change(
        fn=update_media_visibility,
        inputs=input_type,
        outputs=[media_output, audio_output],
    )

    # Connect the submit button
    submit_btn.click(
        fn=process_input,
        inputs=[input_type, file_input, question_input],
        outputs=[output_text, media_output, audio_output],
    )

    # Example section
    with gr.Accordion("Examples", open=False):
        gr.Markdown("Try these examples:")
        gr.Examples(
            examples=[
                ["Image", "https://www.ilankelman.org/stopsigns/australia.jpg", "What is shown in this image?"],
                ["Audio", "https://upload.wikimedia.org/wikipedia/commons/b/b0/Barbara_Sahakian_BBC_Radio4_The_Life_Scientific_29_May_2012_b01j5j24.flac", "Transcribe the audio to text."],
            ],
            inputs=[input_type, file_input, question_input],
            outputs=[output_text, media_output, audio_output],
            fn=process_input,
            cache_examples=False,
        )

# Launch the demo
demo.launch()
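# Usage note: run locally with `python app.py`; a CUDA GPU is strongly recommended,
# since Phi-4-multimodal is too large for comfortable CPU inference.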