prithivMLmods committed on
Commit
3d9d8c2
·
verified ·
1 Parent(s): b73b9a6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -37
app.py CHANGED
@@ -22,11 +22,17 @@ from transformers import (
22
  TextIteratorStreamer,
23
  )
24
 
25
- # Constants for text generation
26
- MAX_MAX_NEW_TOKENS = 2048
27
  DEFAULT_MAX_NEW_TOKENS = 1024
28
  MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
29
 
 
 
 
 
 
 
 
30
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
31
 
32
  # Load Behemoth-3B-070225-post0.1
@@ -74,8 +80,8 @@ model_y = Qwen2_5_VLForConditionalGeneration.from_pretrained(
74
  torch_dtype=torch.float16
75
  ).to(device).eval()
76
 
77
- # Video sampling
78
- def downsample_video(video_path):
79
  """
80
  Downsamples the video to evenly spaced frames.
81
  Each frame is returned as a PIL image along with its timestamp.
@@ -98,7 +104,7 @@ def downsample_video(video_path):
98
 
99
  @spaces.GPU
100
  def generate_image(model_name: str, text: str, image: Image.Image,
101
- max_new_tokens: int = 1024,
102
  temperature: float = 0.6,
103
  top_p: float = 0.9,
104
  top_k: int = 50,
@@ -106,21 +112,27 @@ def generate_image(model_name: str, text: str, image: Image.Image,
106
  """
107
  Generates responses using the selected model for image input.
108
  """
 
109
  if model_name == "SkyCaptioner-V1":
110
  processor = processor_m
111
  model = model_m
 
112
  elif model_name == "Behemoth-3B-070225-post0.1":
113
  processor = processor_n
114
  model = model_n
 
115
  elif model_name == "SpaceThinker-3B":
116
  processor = processor_z
117
  model = model_z
 
118
  elif model_name == "coreOCR-7B-050325-preview":
119
  processor = processor_k
120
  model = model_k
 
121
  elif model_name == "SpaceOm-3B":
122
  processor = processor_y
123
  model = model_y
 
124
  else:
125
  yield "Invalid model selected."
126
  return
@@ -129,22 +141,17 @@ def generate_image(model_name: str, text: str, image: Image.Image,
129
  yield "Please upload an image."
130
  return
131
 
132
- if model_name == "Behemoth-3B-070225-post0.1":
133
- messages = [
134
- {"role": "system", "content": [{"type": "text", "text": "detailed thinking on"}]},
135
- {"role": "user", "content": [
136
- {"type": "image", "image": image},
137
- {"type": "text", "text": text},
138
- ]}
 
 
139
  ]
140
- else:
141
- messages = [{
142
- "role": "user",
143
- "content": [
144
- {"type": "image", "image": image},
145
- {"type": "text", "text": text},
146
- ]
147
- }]
148
 
149
  prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
150
  inputs = processor(
@@ -168,7 +175,7 @@ def generate_image(model_name: str, text: str, image: Image.Image,
168
 
169
  @spaces.GPU
170
  def generate_video(model_name: str, text: str, video_path: str,
171
- max_new_tokens: int = 1024,
172
  temperature: float = 0.6,
173
  top_p: float = 0.9,
174
  top_k: int = 50,
@@ -176,21 +183,27 @@ def generate_video(model_name: str, text: str, video_path: str,
176
  """
177
  Generates responses using the selected model for video input.
178
  """
 
179
  if model_name == "SkyCaptioner-V1":
180
  processor = processor_m
181
  model = model_m
 
182
  elif model_name == "Behemoth-3B-070225-post0.1":
183
  processor = processor_n
184
  model = model_n
 
185
  elif model_name == "SpaceThinker-3B":
186
  processor = processor_z
187
  model = model_z
 
188
  elif model_name == "coreOCR-7B-050325-preview":
189
  processor = processor_k
190
  model = model_k
 
191
  elif model_name == "SpaceOm-3B":
192
  processor = processor_y
193
  model = model_y
 
194
  else:
195
  yield "Invalid model selected."
196
  return
@@ -200,21 +213,16 @@ def generate_video(model_name: str, text: str, video_path: str,
200
  return
201
 
202
  frames = downsample_video(video_path)
203
-
204
- if model_name == "Behemoth-3B-070225-post0.1":
205
- system_message = "detailed thinking on"
206
- else:
207
- system_message = "You are a helpful assistant."
208
-
209
- messages = [
210
- {"role": "system", "content": [{"type": "text", "text": system_message}]},
211
- {"role": "user", "content": [{"type": "text", "text": text}]}
212
- ]
213
- for frame in frames:
214
- image, timestamp = frame
215
- messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
216
- messages[1]["content"].append({"type": "image", "image": image})
217
-
218
  inputs = processor.apply_chat_template(
219
  messages,
220
  tokenize=True,
@@ -296,6 +304,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
296
  top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
297
  top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
298
  repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
 
299
  with gr.Column():
300
  output = gr.Textbox(label="Output", interactive=False, lines=2, scale=2)
301
  model_choice = gr.Radio(
@@ -305,7 +314,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
305
  )
306
 
307
  gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/VisionScope-R2/discussions)")
308
- gr.Markdown("> [SkyCaptioner-V1](https://huggingface.co/Skywork/SkyCaptioner-V1): structural video captioning model designed to generate high-quality, structural descriptions for video data. It integrates specialized sub-expert models.")
309
  gr.Markdown("> [SpaceThinker-Qwen2.5VL-3B](https://huggingface.co/remyxai/SpaceThinker-Qwen2.5VL-3B): thinking/reasoning multimodal/vision-language model (VLM) trained to enhance spatial reasoning.")
310
  gr.Markdown("> [coreOCR-7B-050325-preview](https://huggingface.co/prithivMLmods/coreOCR-7B-050325-preview): model is a fine-tuned version of qwen/qwen2-vl-7b, optimized for document-level optical character recognition (ocr), long-context vision-language understanding.")
311
  gr.Markdown("> [SpaceOm](https://huggingface.co/remyxai/SpaceOm): SpaceOm, the reasoning traces in the spacethinker dataset average ~200 thinking tokens, so now included longer reasoning traces in the training data to help the model use more tokens in reasoning.")
 
22
  TextIteratorStreamer,
23
  )
24
 
25
+ # Constants for text generation
+ MAX_MAX_NEW_TOKENS = 2048
 
26
  DEFAULT_MAX_NEW_TOKENS = 1024
27
  MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
28
 
29
+ # Default system prompt for Behemoth-3B-070225-post0.1
30
+ DEFAULT_SYSTEM_PROMPT_BEHEMOTH = (
31
+ "<|begin_of_text|><|start_header_id|>system<|end_header_id|> detailed thinking on<|eot_id|>"
32
+ "<|start_header_id|>user<|end_header_id|> You are a reasoning model designed to answer complex questions step-by-step, "
33
+ "Conclude with the solution <|eot_id|><|start_header_id|>assistant<|end_header_id|>"
34
+ )
35
+
36
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
37
 
38
  # Load Behemoth-3B-070225-post0.1
 
80
  torch_dtype=torch.float16
81
  ).to(device).eval()
82
 
83
+ # Video downsampling helper
84
+ def downsample_video(video_path):
85
  """
86
  Downsamples the video to evenly spaced frames.
87
  Each frame is returned as a PIL image along with its timestamp.
 
104
 
105
  @spaces.GPU
106
  def generate_image(model_name: str, text: str, image: Image.Image,
107
+ max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
108
  temperature: float = 0.6,
109
  top_p: float = 0.9,
110
  top_k: int = 50,
 
112
  """
113
  Generates responses using the selected model for image input.
114
  """
115
+ # Model selection
116
  if model_name == "SkyCaptioner-V1":
117
  processor = processor_m
118
  model = model_m
119
+ system_prompt = None
120
  elif model_name == "Behemoth-3B-070225-post0.1":
121
  processor = processor_n
122
  model = model_n
123
+ system_prompt = DEFAULT_SYSTEM_PROMPT_BEHEMOTH
124
  elif model_name == "SpaceThinker-3B":
125
  processor = processor_z
126
  model = model_z
127
+ system_prompt = None
128
  elif model_name == "coreOCR-7B-050325-preview":
129
  processor = processor_k
130
  model = model_k
131
+ system_prompt = None
132
  elif model_name == "SpaceOm-3B":
133
  processor = processor_y
134
  model = model_y
135
+ system_prompt = None
136
  else:
137
  yield "Invalid model selected."
138
  return
 
141
  yield "Please upload an image."
142
  return
143
 
144
+ # Build message list
145
+ messages = []
146
+ if system_prompt:
147
+ messages.append({"role": "system", "content": system_prompt})
148
+ messages.append({
149
+ "role": "user",
150
+ "content": [
151
+ {"type": "image", "image": image},
152
+ {"type": "text", "text": text},
153
  ]
154
+ })
 
 
 
 
 
 
 
155
 
156
  prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
157
  inputs = processor(
 
175
 
176
  @spaces.GPU
177
  def generate_video(model_name: str, text: str, video_path: str,
178
+ max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
179
  temperature: float = 0.6,
180
  top_p: float = 0.9,
181
  top_k: int = 50,
 
183
  """
184
  Generates responses using the selected model for video input.
185
  """
186
+ # Model selection
187
  if model_name == "SkyCaptioner-V1":
188
  processor = processor_m
189
  model = model_m
190
+ system_prompt = None
191
  elif model_name == "Behemoth-3B-070225-post0.1":
192
  processor = processor_n
193
  model = model_n
194
+ system_prompt = DEFAULT_SYSTEM_PROMPT_BEHEMOTH
195
  elif model_name == "SpaceThinker-3B":
196
  processor = processor_z
197
  model = model_z
198
+ system_prompt = None
199
  elif model_name == "coreOCR-7B-050325-preview":
200
  processor = processor_k
201
  model = model_k
202
+ system_prompt = None
203
  elif model_name == "SpaceOm-3B":
204
  processor = processor_y
205
  model = model_y
206
+ system_prompt = None
207
  else:
208
  yield "Invalid model selected."
209
  return
 
213
  return
214
 
215
  frames = downsample_video(video_path)
216
+ # Build message list
217
+ messages = []
218
+ if system_prompt:
219
+ messages.append({"role": "system", "content": system_prompt})
220
+ messages.append({"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]})
221
+ messages.append({"role": "user", "content": [{"type": "text", "text": text}]})
222
+ for image, timestamp in frames:
223
+ messages[-1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
224
+ messages[-1]["content"].append({"type": "image", "image": image})
225
+
 
 
 
 
 
226
  inputs = processor.apply_chat_template(
227
  messages,
228
  tokenize=True,
 
304
  top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
305
  top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
306
  repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
307
+
308
  with gr.Column():
309
  output = gr.Textbox(label="Output", interactive=False, lines=2, scale=2)
310
  model_choice = gr.Radio(
 
314
  )
315
 
316
  gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/VisionScope-R2/discussions)")
317
+ gr.Markdown("> [SkyCaptioner-V1](https://huggingface.co/Skywork/SkyCaptioner-V1): structural video captioning model designed to generate high-quality, structural descriptions for video data. It integrates specialized sub-expert models.")
318
  gr.Markdown("> [SpaceThinker-Qwen2.5VL-3B](https://huggingface.co/remyxai/SpaceThinker-Qwen2.5VL-3B): thinking/reasoning multimodal/vision-language model (VLM) trained to enhance spatial reasoning.")
319
  gr.Markdown("> [coreOCR-7B-050325-preview](https://huggingface.co/prithivMLmods/coreOCR-7B-050325-preview): model is a fine-tuned version of qwen/qwen2-vl-7b, optimized for document-level optical character recognition (ocr), long-context vision-language understanding.")
320
  gr.Markdown("> [SpaceOm](https://huggingface.co/remyxai/SpaceOm): SpaceOm, the reasoning traces in the spacethinker dataset average ~200 thinking tokens, so now included longer reasoning traces in the training data to help the model use more tokens in reasoning.")