import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoProcessor
import spaces

# Define model path
model_path = "microsoft/Phi-4-multimodal-instruct"

# Load model and processor
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True,
    _attn_implementation="eager",
)

# Define prompt structure
user_prompt = '<|user|>'
assistant_prompt = '<|assistant|>'
prompt_suffix = '<|end|>'
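
# For illustration (a sketch, not part of the runtime path): a fully composed
# single-turn image prompt looks like
#   <|user|><|image_1|>What is shown in this image?<|end|><|assistant|>
# and the audio variant swaps in <|audio_1|>; the placeholder tokens are resolved
# by the processor against the image/audio passed alongside the text.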

# Define inference functions for each input type
@spaces.GPU
def process_image(image, question):
    if not image or not question:
        return "Please upload an image and provide a question."
    
    prompt = f'{user_prompt}<|image_1|>{question}{prompt_suffix}{assistant_prompt}'
    inputs = processor(text=prompt, images=image, return_tensors='pt').to(model.device)
    
    with torch.no_grad():
        generate_ids = model.generate(
            **inputs,
            max_new_tokens=200,
            num_logits_to_keep=0,
        )
    # Drop the prompt tokens so only the newly generated response is decoded
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    response = processor.batch_decode(
        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    
    return response

@spaces.GPU
def process_audio(audio, question):
    if not audio or not question:
        return "Please upload an audio file and provide a question."
    
    prompt = f'{user_prompt}<|audio_1|>{question}{prompt_suffix}{assistant_prompt}'
    samplerate, audio_data = audio  # gr.Audio with type="numpy" returns (samplerate, data)
    # Gradio delivers int16 PCM samples; normalize to float32 in [-1, 1], since the
    # processor's audio feature extractor expects a floating-point waveform (the model
    # card feeds it soundfile output, which is float by default).
    if audio_data.dtype.kind == "i":
        audio_data = audio_data.astype("float32") / 32768.0
    inputs = processor(text=prompt, audios=[(audio_data, samplerate)], return_tensors='pt').to(model.device)
    
    with torch.no_grad():
        generate_ids = model.generate(
            **inputs,
            max_new_tokens=200,
            num_logits_to_keep=0,
        )
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    response = processor.batch_decode(
        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    
    return response

# Gradio interface
with gr.Blocks(
    title="Phi-4 Multimodal Demo",
    theme=gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="gray",
        radius_size="lg",
    ),
) as demo:
    gr.Markdown(
        """
        # Phi-4 Multimodal Demo
        Select a tab below to upload an **image** or **audio** file, ask a question, and get a response from the model!  
        Built with the `microsoft/Phi-4-multimodal-instruct` model by Microsoft.
        """
    )
    
    with gr.Tabs():
        # Image Tab
        with gr.TabItem("Image"):
            with gr.Row():
                with gr.Column(scale=1):
                    image_input = gr.Image(label="Upload Your Image", type="pil")
                    image_question = gr.Textbox(
                        label="Your Question",
                        placeholder="e.g., 'What is shown in this image?'",
                        lines=2,
                    )
                    image_submit = gr.Button("Submit", variant="primary")
                with gr.Column(scale=2):
                    image_output = gr.Textbox(
                        label="Model Response",
                        placeholder="Response will appear here...",
                        lines=10,
                        interactive=False,
                    )
            image_submit.click(
                fn=process_image,
                inputs=[image_input, image_question],
                outputs=image_output,
            )
        
        # Audio Tab
        with gr.TabItem("Audio"):
            with gr.Row():
                with gr.Column(scale=1):
                    audio_input = gr.Audio(label="Upload Your Audio", type="numpy")
                    audio_question = gr.Textbox(
                        label="Your Question",
                        placeholder="e.g., 'Transcribe this audio.'",
                        lines=2,
                    )
                    audio_submit = gr.Button("Submit", variant="primary")
                with gr.Column(scale=2):
                    audio_output = gr.Textbox(
                        label="Model Response",
                        placeholder="Response will appear here...",
                        lines=10,
                        interactive=False,
                    )
            audio_submit.click(
                fn=process_audio,
                inputs=[audio_input, audio_question],
                outputs=audio_output,
            )

# Launch the demo
demo.launch()
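
# Optional (sketch): on a ZeroGPU Space the @spaces.GPU decorator already gates GPU
# work, but explicit request queuing can be enabled before launching if desired:
# demo.queue().launch()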