prithivMLmods committed
Commit 4c498cf · verified · 1 Parent(s): e4a7c3d

Update app.py

Files changed (1): app.py (+27 -15)
app.py CHANGED
@@ -74,7 +74,7 @@ model_y = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
 
-#video sampling
+# Video sampling
 def downsample_video(video_path):
     """
     Downsamples the video to evenly spaced frames.
@@ -105,6 +105,7 @@ def generate_image(model_name: str, text: str, image: Image.Image,
                    repetition_penalty: float = 1.2):
     """
     Generates responses using the selected model for image input.
+    Yields raw text and Markdown-formatted text.
     """
     if model_name == "SkyCaptioner-V1":
         processor = processor_m
@@ -122,11 +123,11 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         processor = processor_y
         model = model_y
     else:
-        yield "Invalid model selected."
+        yield "Invalid model selected.", "Invalid model selected."
         return
 
     if image is None:
-        yield "Please upload an image."
+        yield "Please upload an image.", "Please upload an image."
         return
 
     messages = [{
@@ -154,7 +155,7 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         buffer += new_text
         buffer = buffer.replace("<|im_end|>", "")
         time.sleep(0.01)
-        yield buffer
+        yield buffer, buffer
 
 @spaces.GPU
 def generate_video(model_name: str, text: str, video_path: str,
@@ -165,6 +166,7 @@ def generate_video(model_name: str, text: str, video_path: str,
                    repetition_penalty: float = 1.2):
     """
     Generates responses using the selected model for video input.
+    Yields raw text and Markdown-formatted text.
     """
     if model_name == "SkyCaptioner-V1":
         processor = processor_m
@@ -182,11 +184,11 @@ def generate_video(model_name: str, text: str, video_path: str,
         processor = processor_y
         model = model_y
     else:
-        yield "Invalid model selected."
+        yield "Invalid model selected.", "Invalid model selected."
         return
 
     if video_path is None:
-        yield "Please upload a video."
+        yield "Please upload a video.", "Please upload a video."
         return
 
     frames = downsample_video(video_path)
@@ -225,7 +227,7 @@ def generate_video(model_name: str, text: str, video_path: str,
         buffer += new_text
         buffer = buffer.replace("<|im_end|>", "")
         time.sleep(0.01)
-        yield buffer
+        yield buffer, buffer
 
 # Define examples for image and video inference
 image_examples = [
@@ -249,6 +251,11 @@ css = """
 .submit-btn:hover {
     background-color: #3498db !important;
 }
+.canvas-output {
+    border: 2px solid #4682B4;
+    border-radius: 10px;
+    padding: 20px;
+}
 """
 
 # Create the Gradio Interface
@@ -280,28 +287,33 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
                 top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
                 repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
             with gr.Column():
-                output = gr.Textbox(label="Output", interactive=False, lines=2, scale=2)
+                with gr.Column(elem_classes="canvas-output"):
+                    gr.Markdown("## Result.Md")
+                    output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=2, scale=2)
+                    with gr.Accordion("Formatted Result (Result.md)", open=False):
+                        markdown_output = gr.Markdown(label="Formatted Result")
                 model_choice = gr.Radio(
                     choices=["SkyCaptioner-V1", "Behemoth-3B-070225-post0.1", "SpaceThinker-3B", "coreOCR-7B-050325-preview", "SpaceOm-3B"],
                     label="Select Model",
                     value="SkyCaptioner-V1"
                 )
-
                 gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/VisionScope-R2/discussions)")
-                gr.Markdown("> [SkyCaptioner-V1](https://huggingface.co/Skywork/SkyCaptioner-V1): structural video captioning model designed to generate high-quality, structural descriptions for video data. It integrates specialized sub-expert models.")
+                gr.Markdown("> [SkyCaptioner-V1](https://huggingface.co/Skywork/SkyCaptioner-V1): structural video captioning model designed to generate high-quality, structural descriptions for video data. It integrates specialized sub-expert models.")
                 gr.Markdown("> [SpaceThinker-Qwen2.5VL-3B](https://huggingface.co/remyxai/SpaceThinker-Qwen2.5VL-3B): thinking/reasoning multimodal/vision-language model (VLM) trained to enhance spatial reasoning.")
-                gr.Markdown("> [coreOCR-7B-050325-preview](https://huggingface.co/prithivMLmods/coreOCR-7B-050325-preview): model is a fine-tuned version of qwen/qwen2-vl-7b, optimized for document-level optical character recognition (ocr), long-context vision-language understanding.")
-                gr.Markdown("> [SpaceOm](https://huggingface.co/remyxai/SpaceOm): SpaceOm, the reasoning traces in the spacethinker dataset average ~200 thinking tokens, so now included longer reasoning traces in the training data to help the model use more tokens in reasoning.")
-
+                gr.Markdown("> [coreOCR-7B-050325-preview](https://huggingface.co/prithivMLmods/coreOCR-7B-050325-preview): model is a fine-tuned version of qwen/qwen2-vl-7b, optimized for document-level optical character recognition (ocr), long-context vision-language understanding.")
+                gr.Markdown("> [SpaceOm](https://huggingface.co/remyxai/SpaceOm): SpaceOm, the reasoning traces in the spacethinker dataset average ~200 thinking tokens, so now included longer reasoning traces in the training data to help the model use more tokens in reasoning.")
+                gr.Markdown("> [Behemoth-3B-070225-post0.1](https://huggingface.co/prithivMLmods/Behemoth-3B-070225-post0.1): The behemoth-3b-070225-post0.1 model is a fine-tuned version of qwen2.5-vl-3b-instruct, optimized for detailed image captioning, OCR tasks, and chain-of-thought reasoning.")
+                gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
+
     image_submit.click(
         fn=generate_image,
         inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
-        outputs=output
+        outputs=[output, markdown_output]
     )
     video_submit.click(
         fn=generate_video,
         inputs=[model_choice, video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
-        outputs=output
+        outputs=[output, markdown_output]
     )
 
 if __name__ == "__main__":
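
The core of this commit is a two-output streaming pattern: each generator now yields a (raw text, Markdown text) pair on every step, and both click handlers fan that pair out to a gr.Textbox and a gr.Markdown. Below is a minimal, self-contained sketch of the same pattern; it mirrors the commit's .canvas-output CSS and accordion layout, but the handler name stream_caption, the toy token list, and the prompt wiring are illustrative assumptions, not code from app.py.

import time
import gradio as gr

# Custom CSS class mirroring the .canvas-output rule added in this commit.
css = """
.canvas-output {
    border: 2px solid #4682B4;
    border-radius: 10px;
    padding: 20px;
}
"""

def stream_caption(prompt: str):
    # A generator used as a Gradio event handler streams partial results;
    # each yield is a tuple matched positionally to outputs=[raw_out, md_out].
    buffer = ""
    for token in ["Streaming ", "the ", "same ", "text ", "to ", "two ", "views."]:
        buffer += token
        time.sleep(0.01)
        yield buffer, buffer  # raw Textbox view and rendered Markdown view

with gr.Blocks(css=css) as demo:
    prompt = gr.Textbox(label="Prompt")
    # elem_classes attaches the custom CSS class to this column.
    with gr.Column(elem_classes="canvas-output"):
        raw_out = gr.Textbox(label="Raw Output Stream")
        with gr.Accordion("Formatted Result", open=False):
            md_out = gr.Markdown()
    prompt.submit(fn=stream_caption, inputs=prompt, outputs=[raw_out, md_out])

if __name__ == "__main__":
    demo.launch()

Positional matching is what makes this work: the n-th element of each yielded tuple updates the n-th component listed in outputs, so a single generator can drive both the raw stream and the rendered view.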
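
For context, the helper whose comment this commit retitles, downsample_video, is described only by its docstring ("Downsamples the video to evenly spaced frames."). One common OpenCV implementation of evenly spaced sampling is sketched below; the body and the num_frames parameter are assumptions for illustration, not the function in app.py.

import cv2
import numpy as np

def downsample_video(video_path, num_frames=10):
    # Assumed implementation: pick num_frames indices spread evenly
    # across the clip and decode only those frames.
    vidcap = cv2.VideoCapture(video_path)
    total = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
    indices = np.linspace(0, total - 1, num_frames, dtype=int)
    frames = []
    for idx in indices:
        vidcap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
        ok, frame = vidcap.read()
        if ok:
            # OpenCV decodes BGR; convert to RGB for downstream processors.
            frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    vidcap.release()
    return frames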