ariG23498 HF staff committed on
Commit 003a173 · 1 Parent(s): 6cec260

minor fixes

Files changed (1)
  1. app.py +39 -20
app.py CHANGED
@@ -2,8 +2,7 @@ import gradio as gr
 from PIL import Image
 import torch
 import soundfile as sf
-from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
-from urllib.request import urlopen
+from transformers import AutoModelForCausalLM, AutoProcessor
 import spaces
 
 # Define model path
@@ -28,7 +27,7 @@ prompt_suffix = '<|end|>'
 @spaces.GPU
 def process_input(input_type, file, question):
     if not file or not question:
-        return "Please upload a file and provide a question."
+        return "Please upload a file and provide a question.", None
 
     # Prepare the prompt
     if input_type == "Image":
@@ -36,13 +35,15 @@ def process_input(input_type, file, question):
         # Open image from uploaded file
         image = Image.open(file)
         inputs = processor(text=prompt, images=image, return_tensors='pt').to(model.device)
+        media_output = image  # Return the image for display
     elif input_type == "Audio":
         prompt = f'{user_prompt}<|audio_1|>{question}{prompt_suffix}{assistant_prompt}'
         # Read audio from uploaded file
         audio, samplerate = sf.read(file)
         inputs = processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to(model.device)
+        media_output = (samplerate, audio)  # Return audio in format (samplerate, data) for Gradio
     else:
-        return "Invalid input type selected."
+        return "Invalid input type selected.", None
 
     # Generate response
     with torch.no_grad():
@@ -56,7 +57,7 @@ def process_input(input_type, file, question):
         generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
     )[0]
 
-    return response
+    return response, media_output
 
 # Gradio interface
 with gr.Blocks(
@@ -71,7 +72,7 @@ with gr.Blocks(
         """
         # Phi-4 Multimodal Demo
         Upload an **image** or **audio** file, ask a question, and get a response from the model!
-        Built with the `microsoft/Phi-4-multimodal-instruct` model by xAI.
+        Built with the `microsoft/Phi-4-multimodal-instruct` model by Microsoft.
         """
     )
 
@@ -94,12 +95,37 @@ with gr.Blocks(
             submit_btn = gr.Button("Submit", variant="primary")
 
         with gr.Column(scale=2):
-            output_text = gr.Textbox(
-                label="Model Response",
-                placeholder="Response will appear here...",
-                lines=10,
-                interactive=False,
-            )
+            with gr.Tab("Preview"):
+                media_output = gr.Image(label="Uploaded Image", visible=True)  # Default to image
+                audio_output = gr.Audio(label="Uploaded Audio", visible=False)  # Hidden by default
+            with gr.Tab("Response"):
+                output_text = gr.Textbox(
+                    label="Model Response",
+                    placeholder="Response will appear here...",
+                    lines=10,
+                    interactive=False,
+                )
+
+    # Dynamically update media visibility based on input type
+    def update_media_visibility(input_type):
+        if input_type == "Image":
+            return gr.update(visible=True), gr.update(visible=False)
+        elif input_type == "Audio":
+            return gr.update(visible=False), gr.update(visible=True)
+        return gr.update(visible=False), gr.update(visible=False)
+
+    input_type.change(
+        fn=update_media_visibility,
+        inputs=input_type,
+        outputs=[media_output, audio_output],
+    )
+
+    # Connect the submit button
+    submit_btn.click(
+        fn=process_input,
+        inputs=[input_type, file_input, question_input],
+        outputs=[output_text, media_output],
+    )
 
     # Example section
     with gr.Accordion("Examples", open=False):
@@ -110,17 +136,10 @@ with gr.Blocks(
             ["Audio", "https://upload.wikimedia.org/wikipedia/commons/b/b0/Barbara_Sahakian_BBC_Radio4_The_Life_Scientific_29_May_2012_b01j5j24.flac", "Transcribe the audio to text."],
         ],
         inputs=[input_type, file_input, question_input],
-        outputs=output_text,
+        outputs=[output_text, media_output],
         fn=process_input,
         cache_examples=False,
     )
 
-    # Connect the submit button
-    submit_btn.click(
-        fn=process_input,
-        inputs=[input_type, file_input, question_input],
-        outputs=output_text,
-    )
-
 # Launch the demo
 demo.launch()
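
For reference, a minimal standalone sketch (not part of the commit) of the two Gradio patterns this change relies on: toggling component visibility with gr.update(visible=...) and returning audio to a gr.Audio component as a (samplerate, data) tuple. Component and function names here are illustrative, not taken from app.py.

import gradio as gr
import numpy as np

with gr.Blocks() as demo:
    input_type = gr.Radio(choices=["Image", "Audio"], value="Image", label="Input Type")
    image_preview = gr.Image(label="Image Preview", visible=True)
    audio_preview = gr.Audio(label="Audio Preview", visible=False)

    # Show only the preview component that matches the selected input type.
    def toggle_preview(choice):
        return (
            gr.update(visible=(choice == "Image")),
            gr.update(visible=(choice == "Audio")),
        )

    input_type.change(fn=toggle_preview, inputs=input_type,
                      outputs=[image_preview, audio_preview])

    # gr.Audio accepts a (samplerate, data) tuple, the same shape that
    # process_input returns for its audio branch; here we synthesize a
    # one-second 440 Hz test tone instead of reading an uploaded file.
    def test_tone():
        samplerate = 16000
        t = np.linspace(0, 1, samplerate, endpoint=False)
        return (samplerate, 0.5 * np.sin(2 * np.pi * 440 * t))

    gr.Button("Play test tone").click(fn=test_tone, outputs=audio_preview)

demo.launch()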
 