prithivMLmods committed on
Commit
9f2ad92
·
verified ·
1 Parent(s): 79c0a55

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -42
app.py CHANGED
@@ -22,17 +22,11 @@ from transformers import (
22
  TextIteratorStreamer,
23
  )
24
 
25
- # Constants for text generation
- MAX_MAX_NEW_TOKENS = 2048
 
26
  DEFAULT_MAX_NEW_TOKENS = 1024
27
  MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
28
 
29
- # Default system prompt for Behemoth-3B-070225-post0.1
30
- DEFAULT_SYSTEM_PROMPT_BEHEMOTH = (
31
- "<|begin_of_text|><|start_header_id|>system<|end_header_id|> detailed thinking on<|eot_id|>"
32
- "<|start_header_id|>user<|end_header_id|> You are a reasoning model designed to answer complex questions step-by-step, "
33
- "Conclude with the solution <|eot_id|><|start_header_id|>assistant<|end_header_id|>"
34
- )
35
-
36
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
37
 
38
  # Load Behemoth-3B-070225-post0.1
@@ -80,7 +74,7 @@ model_y = Qwen2_5_VLForConditionalGeneration.from_pretrained(
80
  torch_dtype=torch.float16
81
  ).to(device).eval()
82
 
83
- # Video downsampling helper
84
  def downsample_video(video_path):
85
  """
86
  Downsamples the video to evenly spaced frames.
@@ -104,7 +98,7 @@ def downsample_video(video_path):
104
 
105
  @spaces.GPU
106
  def generate_image(model_name: str, text: str, image: Image.Image,
107
- max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
108
  temperature: float = 0.6,
109
  top_p: float = 0.9,
110
  top_k: int = 50,
@@ -112,27 +106,21 @@ def generate_image(model_name: str, text: str, image: Image.Image,
112
  """
113
  Generates responses using the selected model for image input.
114
  """
115
- # Model selection
116
  if model_name == "SkyCaptioner-V1":
117
  processor = processor_m
118
  model = model_m
119
- system_prompt = None
120
  elif model_name == "Behemoth-3B-070225-post0.1":
121
  processor = processor_n
122
  model = model_n
123
- system_prompt = DEFAULT_SYSTEM_PROMPT_BEHEMOTH
124
  elif model_name == "SpaceThinker-3B":
125
  processor = processor_z
126
  model = model_z
127
- system_prompt = None
128
  elif model_name == "coreOCR-7B-050325-preview":
129
  processor = processor_k
130
  model = model_k
131
- system_prompt = None
132
  elif model_name == "SpaceOm-3B":
133
  processor = processor_y
134
  model = model_y
135
- system_prompt = None
136
  else:
137
  yield "Invalid model selected."
138
  return
@@ -141,18 +129,13 @@ def generate_image(model_name: str, text: str, image: Image.Image,
141
  yield "Please upload an image."
142
  return
143
 
144
- # Build message list
145
- messages = []
146
- if system_prompt:
147
- messages.append({"role": "system", "content": system_prompt})
148
- messages.append({
149
  "role": "user",
150
  "content": [
151
  {"type": "image", "image": image},
152
  {"type": "text", "text": text},
153
  ]
154
- })
155
-
156
  prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
157
  inputs = processor(
158
  text=[prompt_full],
@@ -175,7 +158,7 @@ def generate_image(model_name: str, text: str, image: Image.Image,
175
 
176
  @spaces.GPU
177
  def generate_video(model_name: str, text: str, video_path: str,
178
- max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
179
  temperature: float = 0.6,
180
  top_p: float = 0.9,
181
  top_k: int = 50,
@@ -183,27 +166,21 @@ def generate_video(model_name: str, text: str, video_path: str,
183
  """
184
  Generates responses using the selected model for video input.
185
  """
186
- # Model selection
187
  if model_name == "SkyCaptioner-V1":
188
  processor = processor_m
189
  model = model_m
190
- system_prompt = None
191
  elif model_name == "Behemoth-3B-070225-post0.1":
192
  processor = processor_n
193
  model = model_n
194
- system_prompt = DEFAULT_SYSTEM_PROMPT_BEHEMOTH
195
  elif model_name == "SpaceThinker-3B":
196
  processor = processor_z
197
  model = model_z
198
- system_prompt = None
199
  elif model_name == "coreOCR-7B-050325-preview":
200
  processor = processor_k
201
  model = model_k
202
- system_prompt = None
203
  elif model_name == "SpaceOm-3B":
204
  processor = processor_y
205
  model = model_y
206
- system_prompt = None
207
  else:
208
  yield "Invalid model selected."
209
  return
@@ -213,16 +190,14 @@ def generate_video(model_name: str, text: str, video_path: str,
213
  return
214
 
215
  frames = downsample_video(video_path)
216
- # Build message list
217
- messages = []
218
- if system_prompt:
219
- messages.append({"role": "system", "content": system_prompt})
220
- messages.append({"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]})
221
- messages.append({"role": "user", "content": [{"type": "text", "text": text}]})
222
- for image, timestamp in frames:
223
- messages[-1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
224
- messages[-1]["content"].append({"type": "image", "image": image})
225
-
226
  inputs = processor.apply_chat_template(
227
  messages,
228
  tokenize=True,
@@ -304,7 +279,6 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
304
  top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
305
  top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
306
  repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
307
-
308
  with gr.Column():
309
  output = gr.Textbox(label="Output", interactive=False, lines=2, scale=2)
310
  model_choice = gr.Radio(
@@ -314,7 +288,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
314
  )
315
 
316
  gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/VisionScope-R2/discussions)")
317
- gr.Markdown("> [SkyCaptioner-V1](https://huggingface.co/Skywork/SkyCaptioner-V1): structural video captioning model designed to generate high-quality, structural descriptions for video data. It integrates specialized sub-expert models.")
318
  gr.Markdown("> [SpaceThinker-Qwen2.5VL-3B](https://huggingface.co/remyxai/SpaceThinker-Qwen2.5VL-3B): thinking/reasoning multimodal/vision-language model (VLM) trained to enhance spatial reasoning.")
319
  gr.Markdown("> [coreOCR-7B-050325-preview](https://huggingface.co/prithivMLmods/coreOCR-7B-050325-preview): model is a fine-tuned version of qwen/qwen2-vl-7b, optimized for document-level optical character recognition (ocr), long-context vision-language understanding.")
320
  gr.Markdown("> [SpaceOm](https://huggingface.co/remyxai/SpaceOm): SpaceOm, the reasoning traces in the spacethinker dataset average ~200 thinking tokens, so now included longer reasoning traces in the training data to help the model use more tokens in reasoning.")
 
22
  TextIteratorStreamer,
23
  )
24
 
25
+ # Constants for text generation
26
+ MAX_MAX_NEW_TOKENS = 2048
27
  DEFAULT_MAX_NEW_TOKENS = 1024
28
  MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
29
 
 
 
 
 
 
 
 
30
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
31
 
32
  # Load Behemoth-3B-070225-post0.1
 
74
  torch_dtype=torch.float16
75
  ).to(device).eval()
76
 
77
+ #video sampling
78
  def downsample_video(video_path):
79
  """
80
  Downsamples the video to evenly spaced frames.
 
98
 
99
  @spaces.GPU
100
  def generate_image(model_name: str, text: str, image: Image.Image,
101
+ max_new_tokens: int = 1024,
102
  temperature: float = 0.6,
103
  top_p: float = 0.9,
104
  top_k: int = 50,
 
106
  """
107
  Generates responses using the selected model for image input.
108
  """
 
109
  if model_name == "SkyCaptioner-V1":
110
  processor = processor_m
111
  model = model_m
 
112
  elif model_name == "Behemoth-3B-070225-post0.1":
113
  processor = processor_n
114
  model = model_n
 
115
  elif model_name == "SpaceThinker-3B":
116
  processor = processor_z
117
  model = model_z
 
118
  elif model_name == "coreOCR-7B-050325-preview":
119
  processor = processor_k
120
  model = model_k
 
121
  elif model_name == "SpaceOm-3B":
122
  processor = processor_y
123
  model = model_y
 
124
  else:
125
  yield "Invalid model selected."
126
  return
 
129
  yield "Please upload an image."
130
  return
131
 
132
+ messages = [{
 
 
 
 
133
  "role": "user",
134
  "content": [
135
  {"type": "image", "image": image},
136
  {"type": "text", "text": text},
137
  ]
138
+ }]
 
139
  prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
140
  inputs = processor(
141
  text=[prompt_full],
 
158
 
159
  @spaces.GPU
160
  def generate_video(model_name: str, text: str, video_path: str,
161
+ max_new_tokens: int = 1024,
162
  temperature: float = 0.6,
163
  top_p: float = 0.9,
164
  top_k: int = 50,
 
166
  """
167
  Generates responses using the selected model for video input.
168
  """
 
169
  if model_name == "SkyCaptioner-V1":
170
  processor = processor_m
171
  model = model_m
 
172
  elif model_name == "Behemoth-3B-070225-post0.1":
173
  processor = processor_n
174
  model = model_n
 
175
  elif model_name == "SpaceThinker-3B":
176
  processor = processor_z
177
  model = model_z
 
178
  elif model_name == "coreOCR-7B-050325-preview":
179
  processor = processor_k
180
  model = model_k
 
181
  elif model_name == "SpaceOm-3B":
182
  processor = processor_y
183
  model = model_y
 
184
  else:
185
  yield "Invalid model selected."
186
  return
 
190
  return
191
 
192
  frames = downsample_video(video_path)
193
+ messages = [
194
+ {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
195
+ {"role": "user", "content": [{"type": "text", "text": text}]}
196
+ ]
197
+ for frame in frames:
198
+ image, timestamp = frame
199
+ messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
200
+ messages[1]["content"].append({"type": "image", "image": image})
 
 
201
  inputs = processor.apply_chat_template(
202
  messages,
203
  tokenize=True,
 
279
  top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
280
  top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
281
  repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
 
282
  with gr.Column():
283
  output = gr.Textbox(label="Output", interactive=False, lines=2, scale=2)
284
  model_choice = gr.Radio(
 
288
  )
289
 
290
  gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/VisionScope-R2/discussions)")
291
+ gr.Markdown("> [SkyCaptioner-V1](https://huggingface.co/Skywork/SkyCaptioner-V1): structural video captioning model designed to generate high-quality, structural descriptions for video data. It integrates specialized sub-expert models.")
292
  gr.Markdown("> [SpaceThinker-Qwen2.5VL-3B](https://huggingface.co/remyxai/SpaceThinker-Qwen2.5VL-3B): thinking/reasoning multimodal/vision-language model (VLM) trained to enhance spatial reasoning.")
293
  gr.Markdown("> [coreOCR-7B-050325-preview](https://huggingface.co/prithivMLmods/coreOCR-7B-050325-preview): model is a fine-tuned version of qwen/qwen2-vl-7b, optimized for document-level optical character recognition (ocr), long-context vision-language understanding.")
294
  gr.Markdown("> [SpaceOm](https://huggingface.co/remyxai/SpaceOm): SpaceOm, the reasoning traces in the spacethinker dataset average ~200 thinking tokens, so now included longer reasoning traces in the training data to help the model use more tokens in reasoning.")