Commit 3fa8f27 (verified) by prithivMLmods · Parent: dab83dd

Update app.py

Files changed (1):
  1. app.py (+130 −207)
app.py CHANGED
@@ -19,8 +19,6 @@ from transformers import (
     Qwen2VLForConditionalGeneration,
     AutoProcessor,
     AutoTokenizer,
-    AutoModel,
-    AutoImageProcessor,
     TextIteratorStreamer,
 )

@@ -33,18 +31,14 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))

 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

-# Load Llama-3.1-Nemotron-Nano-VL-8B-V1
-MODEL_ID_M = "nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"
-processor_m = AutoImageProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
-tokenizer_m = AutoTokenizer.from_pretrained(MODEL_ID_M)
-tokenizer_m.pad_token = tokenizer_m.eos_token  # Set pad_token to resolve ValueError
-model_m = AutoModel.from_pretrained(
+# Load SkyCaptioner-V1
+MODEL_ID_M = "Skywork/SkyCaptioner-V1"
+processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
+model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID_M,
     trust_remote_code=True,
     torch_dtype=torch.float16
 ).to(device).eval()
-# Fix AssertionError by setting img_context_token_id
-model_m.img_context_token_id = tokenizer_m.convert_tokens_to_ids("<image>")

 # Load Space Thinker
 MODEL_ID_Z = "remyxai/SpaceThinker-Qwen2.5VL-3B"
@@ -64,7 +58,21 @@ model_k = Qwen2VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()

+# Load Imgscope-OCR-2B-0527
+MODEL_ID_Y = "prithivMLmods/Imgscope-OCR-2B-0527"
+processor_y = AutoProcessor.from_pretrained(MODEL_ID_Y, trust_remote_code=True)
+model_y = Qwen2VLForConditionalGeneration.from_pretrained(
+    MODEL_ID_Y,
+    trust_remote_code=True,
+    torch_dtype=torch.float16
+).to(device).eval()
+
+
 def downsample_video(video_path):
+    """
+    Downsamples the video to evenly spaced frames.
+    Each frame is returned as a PIL image along with its timestamp.
+    """
     vidcap = cv2.VideoCapture(video_path)
     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
     fps = vidcap.get(cv2.CAP_PROP_FPS)
@@ -83,214 +91,129 @@ def downsample_video(video_path):

 @spaces.GPU
 def generate_image(model_name: str, text: str, image: Image.Image,
-                   max_new_tokens: int = 1024,
-                   temperature: float = 0.6,
-                   top_p: float = 0.9,
-                   top_k: int = 50,
-                   repetition_penalty: float = 1.2):
-    if model_name == "Llama-3.1-Nemotron-Nano-VL-8B-V1":
+                   max_new_tokens: int = 1024,
+                   temperature: float = 0.6,
+                   top_p: float = 0.9,
+                   top_k: int = 50,
+                   repetition_penalty: float = 1.2):
+    """
+    Generates responses using the selected model for image input.
+    """
+    if model_name == "SkyCaptioner-V1":
         processor = processor_m
-        tokenizer = tokenizer_m
         model = model_m
-        if image is None:
-            yield "Please upload an image."
-            return
-        # Construct message with <image> token
-        if "<image>" not in text:
-            message = f"<image>\n{text}"
-        else:
-            message = text
-
-        # Tokenize the message
-        inputs = tokenizer(message, return_tensors="pt").to(device)
-
-        # Process image
-        image_features = processor(image, return_tensors="pt").to(device)
-
-        # Combine inputs
-        generation_inputs = {
-            "input_ids": inputs["input_ids"],
-            "attention_mask": inputs["attention_mask"],
-            **image_features,
-        }
-
-        # Create streamer
-        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-
-        # Generation kwargs
-        generation_kwargs = {
-            **generation_inputs,
-            "streamer": streamer,
-            "max_new_tokens": max_new_tokens,
-            "do_sample": True,
-            "temperature": temperature,
-            "top_p": top_p,
-            "top_k": top_k,
-            "repetition_penalty": repetition_penalty,
-        }
-
-        # Start generation in a thread
-        thread = Thread(target=model.generate, kwargs=generation_kwargs)
-        thread.start()
-
-        buffer = ""
-        for new_text in streamer:
-            buffer += new_text
-            buffer = buffer.replace("<|im_end|>", "")
-            time.sleep(0.01)
-            yield buffer
-    elif model_name in ["SpaceThinker-3B", "coreOCR-7B-050325-preview"]:
-        if model_name == "SpaceThinker-3B":
-            processor = processor_z
-            model = model_z
-        else:
-            processor = processor_k
-            model = model_k
-
-        if image is None:
-            yield "Please upload an image."
-            return
-
-        messages = [{
-            "role": "user",
-            "content": [
-                {"type": "image", "image": image},
-                {"type": "text", "text": text},
-            ]
-        }]
-        prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        inputs = processor(
-            text=[prompt_full],
-            images=[image],
-            return_tensors="pt",
-            padding=True,
-            truncation=False,
-            max_length=MAX_INPUT_TOKEN_LENGTH
-        ).to(device)
-        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-        generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
-        thread = Thread(target=model.generate, kwargs=generation_kwargs)
-        thread.start()
-        buffer = ""
-        for new_text in streamer:
-            buffer += new_text
-            buffer = buffer.replace("<|im_end|>", "")
-            time.sleep(0.01)
-            yield buffer
+    elif model_name == "SpaceThinker-3B":
+        processor = processor_z
+        model = model_z
+    elif model_name == "coreOCR-7B-050325-preview":
+        processor = processor_k
+        model = model_k
+    elif model_name == "Imgscope-OCR-2B-0527":
+        processor = processor_y
+        model = model_y
     else:
         yield "Invalid model selected."
         return

+    if image is None:
+        yield "Please upload an image."
+        return
+
+    messages = [{
+        "role": "user",
+        "content": [
+            {"type": "image", "image": image},
+            {"type": "text", "text": text},
+        ]
+    }]
+    prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    inputs = processor(
+        text=[prompt_full],
+        images=[image],
+        return_tensors="pt",
+        padding=True,
+        truncation=False,
+        max_length=MAX_INPUT_TOKEN_LENGTH
+    ).to(device)
+    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+    generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+    buffer = ""
+    for new_text in streamer:
+        buffer += new_text
+        buffer = buffer.replace("<|im_end|>", "")
+        time.sleep(0.01)
+        yield buffer
+
 @spaces.GPU
 def generate_video(model_name: str, text: str, video_path: str,
-                   max_new_tokens: int = 1024,
-                   temperature: float = 0.6,
-                   top_p: float = 0.9,
-                   top_k: int = 50,
-                   repetition_penalty: float = 1.2):
-    if model_name == "Llama-3.1-Nemotron-Nano-VL-8B-V1":
+                   max_new_tokens: int = 1024,
+                   temperature: float = 0.6,
+                   top_p: float = 0.9,
+                   top_k: int = 50,
+                   repetition_penalty: float = 1.2):
+    """
+    Generates responses using the selected model for video input.
+    """
+    if model_name == "SkyCaptioner-V1":
         processor = processor_m
-        tokenizer = tokenizer_m
         model = model_m
-        if video_path is None:
-            yield "Please upload a video."
-            return
-        frames = downsample_video(video_path)
-        # Construct message with multiple <image> tokens
-        prompt_parts = ["<image>"] * len(frames) + [text]
-        message = " ".join(prompt_parts)
-
-        # Tokenize
-        inputs = tokenizer(message, return_tensors="pt").to(device)
-
-        # Process all frames
-        image_features = processor([frame[0] for frame in frames], return_tensors="pt").to(device)
-
-        # Combine inputs
-        generation_inputs = {
-            "input_ids": inputs["input_ids"],
-            "attention_mask": inputs["attention_mask"],
-            **image_features,
-        }
-
-        # Create streamer
-        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-
-        # Generation kwargs
-        generation_kwargs = {
-            **generation_inputs,
-            "streamer": streamer,
-            "max_new_tokens": max_new_tokens,
-            "do_sample": True,
-            "temperature": temperature,
-            "top_p": top_p,
-            "top_k": top_k,
-            "repetition_penalty": repetition_penalty,
-        }
-
-        # Start generation in a thread
-        thread = Thread(target=model.generate, kwargs=generation_kwargs)
-        thread.start()
-
-        buffer = ""
-        for new_text in streamer:
-            buffer += new_text
-            buffer = buffer.replace("<|im_end|>", "")
-            time.sleep(0.01)
-            yield buffer
-    elif model_name in ["SpaceThinker-3B", "coreOCR-7B-050325-preview"]:
-        if model_name == "SpaceThinker-3B":
-            processor = processor_z
-            model = model_z
-        else:
-            processor = processor_k
-            model = model_k
-
-        if video_path is None:
-            yield "Please upload a video."
-            return
-
-        frames = downsample_video(video_path)
-        messages = [
-            {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
-            {"role": "user", "content": [{"type": "text", "text": text}]}
-        ]
-        for frame in frames:
-            image, timestamp = frame
-            messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
-            messages[1]["content"].append({"type": "image", "image": image})
-        inputs = processor.apply_chat_template(
-            messages,
-            tokenize=True,
-            add_generation_prompt=True,
-            return_dict=True,
-            return_tensors="pt",
-            truncation=False,
-            max_length=MAX_INPUT_TOKEN_LENGTH
-        ).to(device)
-        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-        generation_kwargs = {
-            **inputs,
-            "streamer": streamer,
-            "max_new_tokens": max_new_tokens,
-            "do_sample": True,
-            "temperature": temperature,
-            "top_p": top_p,
-            "top_k": top_k,
-            "repetition_penalty": repetition_penalty,
-        }
-        thread = Thread(target=model.generate, kwargs=generation_kwargs)
-        thread.start()
-        buffer = ""
-        for new_text in streamer:
-            buffer += new_text
-            buffer = buffer.replace("<|im_end|>", "")
-            time.sleep(0.01)
-            yield buffer
+    elif model_name == "SpaceThinker-3B":
+        processor = processor_z
+        model = model_z
+    elif model_name == "coreOCR-7B-050325-preview":
+        processor = processor_k
+        model = model_k
+    elif model_name == "Imgscope-OCR-2B-0527":
+        processor = processor_y
+        model = model_y
     else:
         yield "Invalid model selected."
         return

+    if video_path is None:
+        yield "Please upload a video."
+        return
+
+    frames = downsample_video(video_path)
+    messages = [
+        {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
+        {"role": "user", "content": [{"type": "text", "text": text}]}
+    ]
+    for frame in frames:
+        image, timestamp = frame
+        messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
+        messages[1]["content"].append({"type": "image", "image": image})
+    inputs = processor.apply_chat_template(
+        messages,
+        tokenize=True,
+        add_generation_prompt=True,
+        return_dict=True,
+        return_tensors="pt",
+        truncation=False,
+        max_length=MAX_INPUT_TOKEN_LENGTH
+    ).to(device)
+    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+    generation_kwargs = {
+        **inputs,
+        "streamer": streamer,
+        "max_new_tokens": max_new_tokens,
+        "do_sample": True,
+        "temperature": temperature,
+        "top_p": top_p,
+        "top_k": top_k,
+        "repetition_penalty": repetition_penalty,
+    }
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+    buffer = ""
+    for new_text in streamer:
+        buffer += new_text
+        buffer = buffer.replace("<|im_end|>", "")
+        time.sleep(0.01)
+        yield buffer
+
 # Define examples for image and video inference
 image_examples = [
     ["type out the messy hand-writing as accurately as you can.", "images/1.jpg"],
@@ -346,13 +269,13 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
         with gr.Column():
             output = gr.Textbox(label="Output", interactive=False, lines=2, scale=2)
             model_choice = gr.Radio(
-                choices=["Llama-3.1-Nemotron-Nano-VL-8B-V1", "SpaceThinker-3B", "coreOCR-7B-050325-preview"],
+                choices=["SkyCaptioner-V1", "SpaceThinker-3B", "coreOCR-7B-050325-preview", "Imgscope-OCR-2B-0527"],
                 label="Select Model",
-                value="Llama-3.1-Nemotron-Nano-VL-8B-V1"
+                value="SkyCaptioner-V1"
            )

            gr.Markdown("**Model Info**")
-            gr.Markdown("⤷ [SkyCaptioner-V1](https://huggingface.co/Skywork/SkyCaptioner-V1): structural video captioning model designed to generate high-quality, structural descriptions for video data. It integrates specialized sub-expert models.")
+            gr.Markdown("⤷ [SkyCaptioner-V1](https://huggingface.co/Skywork/SkyCaptioner-V1): structural video captioning model designed to generate high-quality, structural descriptions for video data. It integrates specialized sub-expert models.")
            gr.Markdown("⤷ [SpaceThinker-Qwen2.5VL-3B](https://huggingface.co/remyxai/SpaceThinker-Qwen2.5VL-3B): thinking/reasoning multimodal/vision-language model (VLM) trained to enhance spatial reasoning.")
            gr.Markdown("⤷ [coreOCR-7B-050325-preview](https://huggingface.co/prithivMLmods/coreOCR-7B-050325-preview): model is a fine-tuned version of qwen/qwen2-vl-7b, optimized for document-level optical character recognition (ocr), long-context vision-language understanding.")
            gr.Markdown("⤷ [Imgscope-OCR-2B-0527](https://huggingface.co/prithivMLmods/Imgscope-OCR-2B-0527): fine-tuned version of qwen2-vl-2b-instruct, specifically optimized for messy handwriting recognition, document ocr, realistic handwritten ocr, and math problem solving with latex formatting.")
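
After this commit, generate_image and generate_video share a single pattern for all four models: build a chat-format prompt with the processor, launch model.generate on a worker thread, and yield partial text from a TextIteratorStreamer. A standalone sketch of that pattern, with an illustrative model ID and no Gradio wiring:

from threading import Thread

import torch
from PIL import Image
from transformers import (
    AutoProcessor,
    Qwen2VLForConditionalGeneration,
    TextIteratorStreamer,
)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

MODEL_ID = "prithivMLmods/Imgscope-OCR-2B-0527"  # illustrative; the other models follow the same pattern
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID, trust_remote_code=True, torch_dtype=torch.float16
).to(device).eval()

def stream_response(image: Image.Image, text: str, max_new_tokens: int = 256):
    # Chat-format prompt, tokenized together with the image.
    messages = [{"role": "user", "content": [
        {"type": "image", "image": image},
        {"type": "text", "text": text},
    ]}]
    prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(text=[prompt], images=[image], return_tensors="pt").to(device)
    # generate() runs on a worker thread; the streamer yields decoded text as it arrives
    # (passing the processor to the streamer mirrors the app's own usage).
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    Thread(target=model.generate,
           kwargs={**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}).start()
    buffer = ""
    for new_text in streamer:
        buffer += new_text
        yield buffer

Because both app functions are generators, Gradio streams each yielded buffer into the Output textbox, which is what makes the UI update as tokens are produced.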