import gradio as gr
from PIL import Image
import torch
import soundfile as sf
from transformers import AutoModelForCausalLM, AutoProcessor
import spaces

# Define model path
model_path = "microsoft/Phi-4-multimodal-instruct"

# Load model and processor
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True,
    _attn_implementation="eager",
)
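# Note: eager attention is the safe default here; on supported GPUs, flash_attention_2
# can be a drop-in speed-up (an optional tweak, not required for this demo).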

# Define prompt structure
user_prompt = '<|user|>'
assistant_prompt = '<|assistant|>'
prompt_suffix = '<|end|>'
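# Phi-4 chat format: "<|user|>{media placeholder + question}<|end|><|assistant|>".
# <|image_1|> / <|audio_1|> mark where the attached media is bound inside the user turn.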

# Define inference function
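# @spaces.GPU requests a GPU for the duration of each call when the app runs on Hugging Face Spaces (ZeroGPU).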
@spaces.GPU
def process_input(input_type, file, question):
    """Run Phi-4 multimodal inference on an uploaded image or audio file."""
    if not file or not question:
        return "Please upload a file and provide a question.", None, None

    # Prepare the prompt
    if input_type == "Image":
        prompt = f'{user_prompt}<|image_1|>{question}{prompt_suffix}{assistant_prompt}'
        # Open image from uploaded file
        image = Image.open(file)
        inputs = processor(text=prompt, images=image, return_tensors='pt').to(model.device)
        image_preview, audio_preview = image, None  # Show the image in the Preview tab
    elif input_type == "Audio":
        prompt = f'{user_prompt}<|audio_1|>{question}{prompt_suffix}{assistant_prompt}'
        # Read audio from uploaded file
        audio, samplerate = sf.read(file)
        inputs = processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to(model.device)
        image_preview, audio_preview = None, (samplerate, audio)  # Gradio expects audio as (samplerate, data)
    else:
        return "Invalid input type selected.", None, None

    # Generate response
    with torch.no_grad():
        generate_ids = model.generate(
            **inputs,
            max_new_tokens=200,
            num_logits_to_keep=0,
        )
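    # Drop the prompt tokens so only the newly generated text is decoded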
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    response = processor.batch_decode(
        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]

    return response, image_preview, audio_preview

# Gradio interface
with gr.Blocks(
    title="Phi-4 Multimodal Demo",
    theme=gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="gray",
        radius_size="lg",
    ),
) as demo:
    gr.Markdown(
        """
        # Phi-4 Multimodal Demo
        Upload an **image** or **audio** file, ask a question, and get a response from the model!  
        Built with the `microsoft/Phi-4-multimodal-instruct` model by Microsoft.
        """
    )
    
    with gr.Row():
        with gr.Column(scale=1):
            input_type = gr.Radio(
                choices=["Image", "Audio"],
                label="Select Input Type",
                value="Image",
            )
            file_input = gr.File(
                label="Upload Your File",
                file_types=["image", "audio"],
            )
            question_input = gr.Textbox(
                label="Your Question",
                placeholder="e.g., 'What is shown in this image?' or 'Transcribe this audio.'",
                lines=2,
            )
            submit_btn = gr.Button("Submit", variant="primary")
        
        with gr.Column(scale=2):
            with gr.Tab("Preview"):
                media_output = gr.Image(label="Uploaded Image", visible=True)   # Shown for image input
                audio_output = gr.Audio(label="Uploaded Audio", visible=False)  # Hidden until "Audio" is selected
            with gr.Tab("Response"):
                output_text = gr.Textbox(
                    label="Model Response",
                    placeholder="Response will appear here...",
                    lines=10,
                    interactive=False,
                )

    # Dynamically update media visibility based on input type
    def update_media_visibility(input_type):
        if input_type == "Image":
            return gr.update(visible=True), gr.update(visible=False)
        elif input_type == "Audio":
            return gr.update(visible=False), gr.update(visible=True)
        return gr.update(visible=False), gr.update(visible=False)

    input_type.change(
        fn=update_media_visibility,
        inputs=input_type,
        outputs=[media_output, audio_output],
    )

    # Connect the submit button
    submit_btn.click(
        fn=process_input,
        inputs=[input_type, file_input, question_input],
        outputs=[output_text, media_output, audio_output],
    )

    # Example section
    with gr.Accordion("Examples", open=False):
        gr.Markdown("Try these examples:")
        gr.Examples(
            examples=[
                ["Image", "https://www.ilankelman.org/stopsigns/australia.jpg", "What is shown in this image?"],
                ["Audio", "https://upload.wikimedia.org/wikipedia/commons/b/b0/Barbara_Sahakian_BBC_Radio4_The_Life_Scientific_29_May_2012_b01j5j24.flac", "Transcribe the audio to text."],
            ],
            inputs=[input_type, file_input, question_input],
            outputs=[output_text, media_output, audio_output],
            fn=process_input,
            cache_examples=False,
        )

# Launch the demo
demo.launch()
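# Usage note: run locally with `python app.py`; a CUDA GPU is strongly recommended,
# since Phi-4-multimodal is too large for comfortable CPU inference.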