prithivMLmods committed
Commit 9304abf (verified) · Parent: 045c0ad

Update app.py

Files changed (1): app.py (+29, −34)
app.py CHANGED
@@ -59,7 +59,7 @@ model_g = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
 
-# Load typhoon-ocr-7b
+# Load Typhoon-OCR-7B
 MODEL_ID_L = "scb10x/typhoon-ocr-7b"
 processor_l = AutoProcessor.from_pretrained(MODEL_ID_L, trust_remote_code=True)
 model_l = Qwen2_5_VLForConditionalGeneration.from_pretrained(
@@ -68,7 +68,6 @@ model_l = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
 
-#--------------------------------------------------#
 # Load SmolDocling-256M-preview
 MODEL_ID_X = "ds4sd/SmolDocling-256M-preview"
 processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
@@ -77,7 +76,6 @@ model_x = AutoModelForVision2Seq.from_pretrained(
     trust_remote_code=True,
     torch_dtype=torch.float16
 ).to(device).eval()
-#--------------------------------------------------#
 
 # Preprocessing functions for SmolDocling-256M
 def add_random_padding(image, min_percent=0.1, max_percent=0.10):
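These first three hunks only prune comments around the model-loading block, but they frame the pattern every checkpoint in this Space follows. As a reference, a minimal sketch of that loading flow — the CUDA device selection and half-precision dtype are assumptions carried over from the visible context lines, not code copied from app.py:

```python
import torch
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

device = "cuda" if torch.cuda.is_available() else "cpu"

# Same shape as the context lines above: a processor + model pair,
# loaded in float16 and switched to inference mode.
MODEL_ID = "scb10x/typhoon-ocr-7b"
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=torch.float16,
).to(device).eval()
```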
@@ -136,7 +134,7 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     # Model selection
     if model_name == "Nanonets-OCR-s":
         processor = processor_m
-        model = model_m
+        model = model_m
     elif model_name == "MonkeyOCR-Recognition":
         processor = processor_g
         model = model_g
@@ -147,11 +145,11 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         processor = processor_l
         model = model_l
     else:
-        yield "Invalid model selected."
+        yield "Invalid model selected.", "Invalid model selected."
         return
 
     if image is None:
-        yield "Please upload an image."
+        yield "Please upload an image.", "Please upload an image."
         return
 
     # Prepare images as a list (single image for image inference)
@@ -190,17 +188,15 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
 
-    # Stream output and collect full response
+    # Stream output
     buffer = ""
-    full_output = ""
     for new_text in streamer:
-        full_output += new_text
         buffer += new_text.replace("<|im_end|>", "")
-        yield buffer
+        yield buffer, buffer
 
     # SmolDocling-256M specific postprocessing
     if model_name == "SmolDocling-256M-preview":
-        cleaned_output = full_output.replace("<end_of_utterance>", "").strip()
+        cleaned_output = buffer.replace("<end_of_utterance>", "").strip()
         if any(tag in cleaned_output for tag in ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]):
             if "<chart>" in cleaned_output:
                 cleaned_output = cleaned_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
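This hunk removes the separate `full_output` accumulator: once the streamer is exhausted, `buffer` already holds the complete decoded text (minus the stripped `<|im_end|>` marker), so collecting the tokens twice was redundant, and each loop iteration now yields the same string twice — once per output component. A minimal sketch of the threaded streaming pattern in play, assuming `model`, `tokenizer`, and tokenized `inputs` are prepared as in the surrounding code:

```python
from threading import Thread
from transformers import TextIteratorStreamer

def stream_generate(model, tokenizer, inputs, max_new_tokens=1024):
    """Yield (raw, formatted) pairs while model.generate() runs in the background."""
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(**inputs, streamer=streamer, max_new_tokens=max_new_tokens)
    # model.generate() pushes decoded text chunks into the streamer from a
    # background thread; iterating the streamer here drains them as they arrive.
    Thread(target=model.generate, kwargs=generation_kwargs).start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text.replace("<|im_end|>", "")
        yield buffer, buffer  # same text to both output components while streaming
```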
@@ -208,9 +204,9 @@ def generate_image(model_name: str, text: str, image: Image.Image,
             doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([cleaned_output], images)
             doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
             markdown_output = doc.export_to_markdown()
-            yield f"**MD Output:**\n\n{markdown_output}"
+            yield buffer, markdown_output
         else:
-            yield cleaned_output
+            yield buffer, cleaned_output
 
 @spaces.GPU
 def generate_video(model_name: str, text: str, video_path: str,
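The rewritten `yield` statements are the core of this commit: every yield now produces a `(raw, formatted)` pair, one value per output component, with the untouched `buffer` serving as the raw stream and the Docling conversion as the formatted result. For reference, the DocTags-to-Markdown step the branch above performs can be reproduced standalone with docling-core — the `doctags` string below is a stand-in, not real model output:

```python
from PIL import Image
from docling_core.types.doc import DoclingDocument
from docling_core.types.doc.document import DocTagsDocument

# Stand-in DocTags markup; in the Space this is the cleaned SmolDocling output.
doctags = "<doctag><text>Hello world</text></doctag>"
image = Image.new("RGB", (64, 64))

# Pair the markup with its source image, lift it into a DoclingDocument,
# and export Markdown for the formatted result pane.
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
print(doc.export_to_markdown())
```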
@@ -234,11 +230,11 @@ def generate_video(model_name: str, text: str, video_path: str,
         processor = processor_l
         model = model_l
     else:
-        yield "Invalid model selected."
+        yield "Invalid model selected.", "Invalid model selected."
         return
 
     if video_path is None:
-        yield "Please upload a video."
+        yield "Please upload a video.", "Please upload a video."
         return
 
     # Extract frames from video
@@ -278,17 +274,15 @@ def generate_video(model_name: str, text: str, video_path: str,
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
 
-    # Stream output and collect full response
+    # Stream output
     buffer = ""
-    full_output = ""
     for new_text in streamer:
-        full_output += new_text
         buffer += new_text.replace("<|im_end|>", "")
-        yield buffer
+        yield buffer, buffer
 
     # SmolDocling-256M specific postprocessing
     if model_name == "SmolDocling-256M-preview":
-        cleaned_output = full_output.replace("<end_of_utterance>", "").strip()
+        cleaned_output = buffer.replace("<end_of_utterance>", "").strip()
         if any(tag in cleaned_output for tag in ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]):
             if "<chart>" in cleaned_output:
                 cleaned_output = cleaned_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
@@ -296,9 +290,9 @@ def generate_video(model_name: str, text: str, video_path: str,
             doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([cleaned_output], images)
             doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
             markdown_output = doc.export_to_markdown()
-            yield f"**MD Output:**\n\n{markdown_output}"
+            yield buffer, markdown_output
         else:
-            yield cleaned_output
+            yield buffer, cleaned_output
 
 # Define examples for image and video inference
 image_examples = [
@@ -316,6 +310,7 @@ video_examples = [
     ["Explain the video in detail.", "videos/2.mp4"]
 ]
 
+# Updated CSS to include styling for the Result Canvas
 css = """
 .submit-btn {
     background-color: #2980b9 !important;
@@ -328,6 +323,7 @@ css = """
     border: 2px solid #4682B4;
     border-radius: 10px;
     padding: 20px;
+}
 """
 
 # Create the Gradio Interface
@@ -358,37 +354,36 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
                 top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
                 top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
                 repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
-
         with gr.Column():
+            # Result Canvas with raw and formatted outputs
             with gr.Column(elem_classes="canvas-output"):
-                gr.Markdown("## Result.Md")
-                output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=2)
-                markdown_output = gr.Markdown(label="Formatted Result (Result.Md)")
+                gr.Markdown("## Result Canvas")
+                raw_output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=2)
+                formatted_output = gr.Markdown(label="Formatted Result (Result.Md)")
 
             model_choice = gr.Radio(
-                choices=["Nanonets-OCR-s", "MonkeyOCR-Recognition", "SmolDocling-256M-preview", "Typhoon-OCR-7B"],
+                choices=["SmolDocling-256M-preview", "Nanonets-OCR-s", "MonkeyOCR-Recognition", "Typhoon-OCR-7B"],
                 label="Select Model",
                 value="Nanonets-OCR-s"
             )
 
-            gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR2/discussions)")
-
+            gr.Markdown("**Model Info 💻**")
             gr.Markdown("> [SmolDocling-256M](https://huggingface.co/ds4sd/SmolDocling-256M-preview): SmolDocling is a multimodal Image-Text-to-Text model designed for efficient document conversion. It retains Docling's most popular features while ensuring full compatibility with Docling through seamless support for DoclingDocuments.")
             gr.Markdown("> [Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s): nanonets-ocr-s is a powerful, state-of-the-art image-to-markdown ocr model that goes far beyond traditional text extraction. it transforms documents into structured markdown with intelligent content recognition and semantic tagging.")
             gr.Markdown("> [MonkeyOCR-Recognition](https://huggingface.co/echo840/MonkeyOCR): MonkeyOCR adopts a Structure-Recognition-Relation (SRR) triplet paradigm, which simplifies the multi-tool pipeline of modular approaches while avoiding the inefficiency of using large multimodal models for full-page document processing.")
             gr.Markdown("> [Typhoon-OCR-7B](https://huggingface.co/scb10x/typhoon-ocr-7b): A bilingual document parsing model built specifically for real-world documents in Thai and English inspired by models like olmOCR based on Qwen2.5-VL-Instruction. Extracts and interprets embedded text (e.g., chart labels, captions) in Thai or English.")
-            gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
-
+
+    # Connect submit buttons to generation functions with both outputs
     image_submit.click(
         fn=generate_image,
         inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
-        outputs=[output, markdown_output]
+        outputs=[raw_output, formatted_output]
     )
     video_submit.click(
         fn=generate_video,
         inputs=[model_choice, video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
-        outputs=[output, markdown_output]
+        outputs=[raw_output, formatted_output]
     )
 
 if __name__ == "__main__":
-    demo.queue(max_size=30).launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)
+    demo.queue(max_size=40).launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)
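Taken together, the UI and handler changes let one generator drive both result views: when a click handler lists two components in `outputs`, Gradio unpacks each yielded tuple positionally, so every streamed chunk updates the raw textbox and the Markdown pane in lockstep. A self-contained sketch of that wiring under the same (raw, formatted) convention — the component names and the toy generator are illustrative only:

```python
import time
import gradio as gr

def fake_stream(prompt: str):
    # Each yielded tuple maps positionally onto the components in `outputs`.
    buffer = ""
    for word in ["streaming", "a", "reply", "to:", prompt]:
        buffer += word + " "
        time.sleep(0.2)
        yield buffer, buffer

with gr.Blocks() as demo:
    query = gr.Textbox(label="Query")
    raw_output = gr.Textbox(label="Raw Output Stream")
    formatted_output = gr.Markdown(label="Formatted Result")
    gr.Button("Submit").click(
        fn=fake_stream,
        inputs=[query],
        outputs=[raw_output, formatted_output],
    )

if __name__ == "__main__":
    demo.launch()
```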
 