prithivMLmods committed
Commit e90fcb3 · verified · 1 Parent(s): 8dd2305

Update app.py

Files changed (1)
  1. app.py +41 -67
app.py CHANGED
@@ -131,7 +131,6 @@ def generate_image(model_name: str, text: str, image: Image.Image,
                    top_k: int = 50,
                    repetition_penalty: float = 1.2):
     """Generate responses for image input using the selected model."""
-    # Model selection
     if model_name == "Nanonets-OCR-s":
         processor = processor_m
         model = model_m
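
Annotation: the `if`/`elif` chain above (continued for the other checkpoints outside this hunk) binds `processor`/`model` to module-level globals. A table-driven equivalent is sketched below; everything beyond the `Nanonets-OCR-s` pair is a placeholder, since the other branches are not visible in this hunk.

```python
# Hypothetical sketch of the same dispatch as a lookup table. The string
# values stand in for the (processor_*, model_*) globals that app.py loads
# at startup; only the Nanonets-OCR-s pair is visible in this diff.
MODEL_REGISTRY = {
    "Nanonets-OCR-s": ("processor_m", "model_m"),
    # ... one entry per remaining model name ...
}

def select_model(model_name: str):
    if model_name not in MODEL_REGISTRY:
        raise ValueError(f"Unknown model: {model_name!r}")
    return MODEL_REGISTRY[model_name]
```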
@@ -152,17 +151,14 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         yield "Please upload an image.", "Please upload an image."
         return
 
-    # Prepare images as a list (single image for image inference)
     images = [image]
 
-    # SmolDocling-256M specific preprocessing
     if model_name == "SmolDocling-256M-preview":
         if "OTSL" in text or "code" in text:
            images = [add_random_padding(img) for img in images]
         if "OCR at text at" in text or "Identify element" in text or "formula" in text:
            text = normalize_values(text, target_max=500)
 
-    # Unified message structure for all models
     messages = [
         {
             "role": "user",
@@ -174,7 +170,6 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
     inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
 
-    # Generation with streaming
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {
         **inputs,
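
Annotation: `generation_kwargs` is also truncated by the hunk. Given the function signature and the `streamer` created above, the remaining keys are presumably the standard `transformers` sampling arguments:

```python
# Hedged completion of the truncated dict; every key shown is a real
# model.generate() argument, but do_sample=True is an assumption.
generation_kwargs = {
    **inputs,
    "streamer": streamer,
    "max_new_tokens": max_new_tokens,
    "do_sample": True,
    "temperature": temperature,
    "top_p": top_p,
    "top_k": top_k,
    "repetition_penalty": repetition_penalty,
}
```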
@@ -188,13 +183,11 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
 
-    # Stream output
     buffer = ""
     for new_text in streamer:
         buffer += new_text.replace("<|im_end|>", "")
         yield buffer, buffer
 
-    # SmolDocling-256M specific postprocessing
     if model_name == "SmolDocling-256M-preview":
         cleaned_output = buffer.replace("<end_of_utterance>", "").strip()
         if any(tag in cleaned_output for tag in ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]):
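
Annotation: the streaming pattern above (a background `model.generate` thread feeding a `TextIteratorStreamer` that the Gradio generator drains) is self-contained enough to demo with a small text-only model:

```python
# Minimal, runnable illustration of the Thread + TextIteratorStreamer
# pattern; gpt2 stands in for the vision-language models used in app.py.
from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tok = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
inputs = tok("Streaming keeps the UI responsive", return_tensors="pt")

streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)
Thread(target=model.generate,
       kwargs={**inputs, "streamer": streamer, "max_new_tokens": 20}).start()

buffer = ""
for new_text in streamer:   # blocks until the worker emits more tokens
    buffer += new_text
    print(buffer)           # the app yields (buffer, buffer) here instead
```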
@@ -216,7 +209,6 @@ def generate_video(model_name: str, text: str, video_path: str,
                    top_k: int = 50,
                    repetition_penalty: float = 1.2):
     """Generate responses for video input using the selected model."""
-    # Model selection
     if model_name == "Nanonets-OCR-s":
         processor = processor_m
         model = model_m
@@ -237,18 +229,15 @@ def generate_video(model_name: str, text: str, video_path: str,
         yield "Please upload a video.", "Please upload a video."
         return
 
-    # Extract frames from video
     frames = downsample_video(video_path)
     images = [frame for frame, _ in frames]
 
-    # SmolDocling-256M specific preprocessing
     if model_name == "SmolDocling-256M-preview":
         if "OTSL" in text or "code" in text:
            images = [add_random_padding(img) for img in images]
         if "OCR at text at" in text or "Identify element" in text or "formula" in text:
            text = normalize_values(text, target_max=500)
 
-    # Unified message structure for all models
     messages = [
         {
             "role": "user",
@@ -260,7 +249,6 @@ def generate_video(model_name: str, text: str, video_path: str,
     prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
     inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
 
-    # Generation with streaming
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {
         **inputs,
@@ -274,13 +262,11 @@ def generate_video(model_name: str, text: str, video_path: str,
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
 
-    # Stream output
     buffer = ""
     for new_text in streamer:
         buffer += new_text.replace("<|im_end|>", "")
         yield buffer, buffer
 
-    # SmolDocling-256M specific postprocessing
     if model_name == "SmolDocling-256M-preview":
         cleaned_output = buffer.replace("<end_of_utterance>", "").strip()
         if any(tag in cleaned_output for tag in ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]):
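
Annotation: when the cleaned SmolDocling output contains doctags, the formatted pane presumably renders them via docling-core, whose documented conversion path looks roughly like this (the actual postprocessing is outside this hunk, so treat the sketch as an assumption):

```python
# Speculative sketch based on docling-core's documented DocTags workflow;
# app.py's real doctag handling is not shown in this diff.
from docling_core.types.doc import DoclingDocument
from docling_core.types.doc.document import DocTagsDocument

def doctags_to_markdown(cleaned_output: str, images) -> str:
    tags_doc = DocTagsDocument.from_doctags_and_image_pairs(
        [cleaned_output], images[:1]
    )
    doc = DoclingDocument(name="Document")
    doc.load_from_doctags(tags_doc)
    return doc.export_to_markdown()
```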
@@ -311,63 +297,53 @@ video_examples = [
     ["Explain the video in detail.", "videos/2.mp4"]
 ]
 
-# Updated CSS with new button theme
+# Updated CSS with the new submit button theme
 css = """
-.button {
+.submit-btn {
+  --clr-font-main: hsla(0 0% 20% / 100);
+  --btn-bg-1: hsla(194 100% 69% / 1);
+  --btn-bg-2: hsla(217 100% 56% / 1);
+  --btn-bg-color: hsla(360 100% 100% / 1);
+  --radii: 0.5em;
   cursor: pointer;
-  padding: 1em 2em;
-  font-weight: bold;
-  font-size: 20px;
-  color: #fff;
-  position: relative;
-  overflow: hidden;
-  background: rgba(60, 73, 203, 0.35);
-  box-shadow: 0 0px 32px 0 rgba(31, 38, 135, 0.37);
-  backdrop-filter: blur(14.5px);
-  border: 1px solid rgba(255, 255, 255, 0.18);
-  -webkit-backdrop-filter: blur(14.5px);
+  padding: 0.9em 1.4em;
+  min-width: 120px;
+  min-height: 44px;
+  font-size: var(--size, 1rem);
+  font-weight: 500;
+  transition: 0.8s;
+  background-size: 280% auto;
+  background-image: linear-gradient(
+    325deg,
+    var(--btn-bg-2) 0%,
+    var(--btn-bg-1) 55%,
+    var(--btn-bg-2) 90%
+  );
+  border: none;
+  border-radius: var(--radii);
+  color: var(--btn-bg-color);
+  box-shadow:
+    0px 0px 20px rgba(71, 184, 255, 0.5),
+    0px 5px 5px -1px rgba(58, 125, 233, 0.25),
+    inset 4px 4px 8px rgba(175, 230, 255, 0.5),
+    inset -4px -4px 8px rgba(19, 95, 216, 0.35);
 }
 
-.button:hover {
-  box-shadow: 0px 0 32px 0 rgba(31, 38, 135, 0.37),
-    0px 0 32px 0 rgba(31, 38, 135, 0.37), 0 0 42px 0px rgba(31, 38, 135, 0.37),
-    0 0 52px 0 rgba(31, 38, 135, 0.37);
-  border: 1px solid rgba(255, 255, 255, 0.58);
+.submit-btn:hover {
+  background-position: right top;
 }
 
-.button,
-.button::before {
-  display: grid;
-  place-items: center;
-  border-radius: 10px;
-  box-shadow: 0 0px 32px 0 rgba(31, 38, 135, 0.37);
+.submit-btn:is(:focus, :focus-visible, :active) {
+  outline: none;
+  box-shadow:
+    0 0 0 3px var(--btn-bg-color),
+    0 0 0 6px var(--btn-bg-2);
 }
 
-.button::before {
-  content: "";
-  position: absolute;
-  background: rgba(26, 18, 241, 0.25);
-  width: 90%;
-  height: 80%;
-  backdrop-filter: blur(18.5px);
-  -webkit-backdrop-filter: blur(18.5px);
-  border: 1px solid rgba(255, 255, 255, 0.18);
-  transition: 0.4s;
-}
-
-.button:hover::before {
-  background: rgba(51, 57, 236, 0.4);
-  box-shadow: 1px 1px 2px 0 rgba(31, 38, 135, 0.37),
-    2px 2px 2px 0 rgba(31, 38, 135, 0.37), 0 0px 32px 0 rgba(31, 38, 135, 0.37),
-    0 0px 32px 1px rgba(31, 38, 135, 0.37), 0 0px 32px 0 rgba(31, 38, 135, 0.37);
-  backdrop-filter: blur(5.5px);
-  -webkit-backdrop-filter: blur(5.5px);
-  border-radius: 10px;
-  border: 1px solid rgba(255, 255, 255, 0.18);
-}
-
-.button:active::before {
-  transform: scale(0.67);
+@media (prefers-reduced-motion: reduce) {
+  .submit-btn {
+    transition: linear;
+  }
 }
 
 .canvas-output {
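
Annotation: the new `.submit-btn` rules reach the buttons through Gradio's two-part hook-up: the `css` string is injected at `gr.Blocks` level and `elem_classes` tags the rendered element. A minimal standalone example:

```python
# Minimal illustration of css + elem_classes wiring in Gradio.
import gradio as gr

css = ".submit-btn { font-weight: 500; }"  # abbreviated stand-in

with gr.Blocks(css=css) as demo:
    gr.Button("Submit", elem_classes="submit-btn")

# demo.launch()
```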
@@ -386,7 +362,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
         with gr.TabItem("Image Inference"):
             image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
             image_upload = gr.Image(type="pil", label="Image")
-            image_submit = gr.Button("Submit", elem_classes="button")
+            image_submit = gr.Button("Submit", elem_classes="submit-btn")
             gr.Examples(
                 examples=image_examples,
                 inputs=[image_query, image_upload]
@@ -394,7 +370,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
         with gr.TabItem("Video Inference"):
             video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
             video_upload = gr.Video(label="Video")
-            video_submit = gr.Button("Submit", elem_classes="button")
+            video_submit = gr.Button("Submit", elem_classes="submit-btn")
             gr.Examples(
                 examples=video_examples,
                 inputs=[video_query, video_upload]
@@ -407,7 +383,6 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
                 repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
 
         with gr.Column():
-            # Result Canvas with raw and formatted outputs
             with gr.Column(elem_classes="canvas-output"):
                 gr.Markdown("## Output")
                 raw_output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=2)
@@ -428,7 +403,6 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     gr.Markdown("> [Typhoon-OCR-7B](https://huggingface.co/scb10x/typhoon-ocr-7b): A bilingual document parsing model built specifically for real-world documents in Thai and English inspired by models like olmOCR based on Qwen2.5-VL-Instruction. Extracts and interprets embedded text (e.g., chart labels, captions) in Thai or English.")
     gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
 
-    # Connect submit buttons to generation functions with both outputs
     image_submit.click(
         fn=generate_image,
         inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
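
Annotation: the `.click()` call is truncated by the hunk. Since both generators yield `(buffer, buffer)`, the call presumably routes to the raw and formatted output components; the second component name below is a guess:

```python
# Hedged completion of the truncated wiring; formatted_output is a guessed
# name for the second component in the Output column.
image_submit.click(
    fn=generate_image,
    inputs=[model_choice, image_query, image_upload, max_new_tokens,
            temperature, top_p, top_k, repetition_penalty],
    outputs=[raw_output, formatted_output],
)
```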
 