ariG23498 (HF staff) committed
Commit 39d7a6f · 1 Parent(s): 003a173
Files changed (1)
  1. app.py +80 -88
app.py CHANGED
@@ -2,7 +2,7 @@ import gradio as gr
 from PIL import Image
 import torch
 import soundfile as sf
-from transformers import AutoModelForCausalLM, AutoProcessor
+from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
 import spaces

 # Define model path
@@ -23,29 +23,15 @@ user_prompt = '<|user|>'
 assistant_prompt = '<|assistant|>'
 prompt_suffix = '<|end|>'

-# Define inference function
+# Define inference functions for each input type
 @spaces.GPU
-def process_input(input_type, file, question):
-    if not file or not question:
-        return "Please upload a file and provide a question.", None
-
-    # Prepare the prompt
-    if input_type == "Image":
-        prompt = f'{user_prompt}<|image_1|>{question}{prompt_suffix}{assistant_prompt}'
-        # Open image from uploaded file
-        image = Image.open(file)
-        inputs = processor(text=prompt, images=image, return_tensors='pt').to(model.device)
-        media_output = image  # Return the image for display
-    elif input_type == "Audio":
-        prompt = f'{user_prompt}<|audio_1|>{question}{prompt_suffix}{assistant_prompt}'
-        # Read audio from uploaded file
-        audio, samplerate = sf.read(file)
-        inputs = processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to(model.device)
-        media_output = (samplerate, audio)  # Return audio in format (samplerate, data) for Gradio
-    else:
-        return "Invalid input type selected.", None
-
-    # Generate response
+def process_image(image, question):
+    if not image or not question:
+        return "Please upload an image and provide a question."
+
+    prompt = f'{user_prompt}<|image_1|>{question}{prompt_suffix}{assistant_prompt}'
+    inputs = processor(text=prompt, images=image, return_tensors='pt').to(model.device)
+
     with torch.no_grad():
         generate_ids = model.generate(
             **inputs,
@@ -56,8 +42,30 @@ def process_input(input_type, file, question):
     response = processor.batch_decode(
         generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
     )[0]
-
-    return response, media_output
+
+    return response
+
+@spaces.GPU
+def process_audio(audio, question):
+    if not audio or not question:
+        return "Please upload an audio file and provide a question."
+
+    prompt = f'{user_prompt}<|audio_1|>{question}{prompt_suffix}{assistant_prompt}'
+    samplerate, audio_data = audio  # Gradio Audio returns (samplerate, data)
+    inputs = processor(text=prompt, audios=[(audio_data, samplerate)], return_tensors='pt').to(model.device)
+
+    with torch.no_grad():
+        generate_ids = model.generate(
+            **inputs,
+            max_new_tokens=200,
+            num_logits_to_keep=0,
+        )
+    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
+    response = processor.batch_decode(
+        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+    )[0]
+
+    return response

 # Gradio interface
 with gr.Blocks(
@@ -71,75 +79,59 @@ with gr.Blocks(
     gr.Markdown(
         """
         # Phi-4 Multimodal Demo
-        Upload an **image** or **audio** file, ask a question, and get a response from the model!
-        Built with the `microsoft/Phi-4-multimodal-instruct` model by Microsoft.
+        Select a tab below to upload an **image** or **audio** file, ask a question, and get a response from the model!
+        Built with the `microsoft/Phi-4-multimodal-instruct` model by xAI.
         """
     )

-    with gr.Row():
-        with gr.Column(scale=1):
-            input_type = gr.Radio(
-                choices=["Image", "Audio"],
-                label="Select Input Type",
-                value="Image",
+    with gr.Tabs():
+        # Image Tab
+        with gr.TabItem("Image"):
+            with gr.Row():
+                with gr.Column(scale=1):
+                    image_input = gr.Image(label="Upload Your Image", type="pil")
+                    image_question = gr.Textbox(
+                        label="Your Question",
+                        placeholder="e.g., 'What is shown in this image?'",
+                        lines=2,
+                    )
+                    image_submit = gr.Button("Submit", variant="primary")
+                with gr.Column(scale=2):
+                    image_output = gr.Textbox(
+                        label="Model Response",
+                        placeholder="Response will appear here...",
+                        lines=10,
+                        interactive=False,
+                    )
+            image_submit.click(
+                fn=process_image,
+                inputs=[image_input, image_question],
+                outputs=image_output,
            )
-            file_input = gr.File(
-                label="Upload Your File",
-                file_types=["image", "audio"],
-            )
-            question_input = gr.Textbox(
-                label="Your Question",
-                placeholder="e.g., 'What is shown in this image?' or 'Transcribe this audio.'",
-                lines=2,
-            )
-            submit_btn = gr.Button("Submit", variant="primary")

-        with gr.Column(scale=2):
-            with gr.Tab("Preview"):
-                media_output = gr.Image(label="Uploaded Image", visible=True)  # Default to image
-                gr.Audio(label="Uploaded Audio", visible=False)  # Hidden by default
-            with gr.Tab("Response"):
-                output_text = gr.Textbox(
-                    label="Model Response",
-                    placeholder="Response will appear here...",
-                    lines=10,
-                    interactive=False,
-                )
-
-    # Dynamically update media visibility based on input type
-    def update_media_visibility(input_type):
-        if input_type == "Image":
-            return gr.update(visible=True), gr.update(visible=False)
-        elif input_type == "Audio":
-            return gr.update(visible=False), gr.update(visible=True)
-        return gr.update(visible=False), gr.update(visible=False)
-
-    input_type.change(
-        fn=update_media_visibility,
-        inputs=input_type,
-        outputs=[media_output, demo.blocks["Audio"]]
-    )
-
-    # Connect the submit button
-    submit_btn.click(
-        fn=process_input,
-        inputs=[input_type, file_input, question_input],
-        outputs=[output_text, media_output],
-    )
-
-    # Example section
-    with gr.Accordion("Examples", open=False):
-        gr.Markdown("Try these examples:")
-        gr.Examples(
-            examples=[
-                ["Image", "https://www.ilankelman.org/stopsigns/australia.jpg", "What is shown in this image?"],
-                ["Audio", "https://upload.wikimedia.org/wikipedia/commons/b/b0/Barbara_Sahakian_BBC_Radio4_The_Life_Scientific_29_May_2012_b01j5j24.flac", "Transcribe the audio to text."],
-            ],
-            inputs=[input_type, file_input, question_input],
-            outputs=[output_text, media_output],
-            fn=process_input,
-            cache_examples=False,
-        )
+        # Audio Tab
+        with gr.TabItem("Audio"):
+            with gr.Row():
+                with gr.Column(scale=1):
+                    audio_input = gr.Audio(label="Upload Your Audio", type="numpy")
+                    audio_question = gr.Textbox(
+                        label="Your Question",
+                        placeholder="e.g., 'Transcribe this audio.'",
+                        lines=2,
+                    )
+                    audio_submit = gr.Button("Submit", variant="primary")
+                with gr.Column(scale=2):
+                    audio_output = gr.Textbox(
+                        label="Model Response",
+                        placeholder="Response will appear here...",
+                        lines=10,
+                        interactive=False,
+                    )
+            audio_submit.click(
+                fn=process_audio,
+                inputs=[audio_input, audio_question],
+                outputs=audio_output,
+            )

 # Launch the demo
 demo.launch()
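
Reviewer note: the `processor` and `model` used by the handlers come from the part of app.py this commit does not touch (the `# Define model path` context line), so they are not shown in the hunks. Below is a minimal sketch of what that setup typically looks like for `microsoft/Phi-4-multimodal-instruct`, plus direct calls to the two new handlers for a quick local check. The dtype, device placement, file names, and the `GenerationConfig` usage are illustrative assumptions, not part of the commit, and the snippet assumes it runs in the same module as `process_image` / `process_audio`.

# Sketch only: the real model/processor setup lives in the untouched part of
# app.py; the values below are assumptions for illustration.
import torch
import soundfile as sf
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig

model_path = "microsoft/Phi-4-multimodal-instruct"

processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,  # assumed dtype; the Space may use a different one
    trust_remote_code=True,
).to("cuda")

# The commit adds GenerationConfig to the imports; the elided setup presumably
# loads it along these lines (assumption, not shown in the diff):
generation_config = GenerationConfig.from_pretrained(model_path)

# Exercise the new handlers the way the Submit buttons would (hypothetical files):
print(process_image(Image.open("example.jpg"), "What is shown in this image?"))

# gr.Audio(type="numpy") hands the handler a (samplerate, data) tuple, the reverse
# of soundfile's (data, samplerate) return order, which is why process_audio
# unpacks `samplerate, audio_data = audio`.
data, samplerate = sf.read("example.flac")
print(process_audio((samplerate, data), "Transcribe this audio."))

Splitting the single process_input handler into per-modality tabs also removes the need for the old update_media_visibility callback and its fragile demo.blocks["Audio"] lookup, since each tab now owns its own input and output components.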