prithivMLmods committed on
Commit
ec56362
·
verified ·
1 Parent(s): f4ca73a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -6
app.py CHANGED
@@ -81,6 +81,18 @@ model_a = AutoModelForImageTextToText.from_pretrained(
81
  torch_dtype=torch.float16
82
  ).to(device).eval()
83
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  def downsample_video(video_path):
85
  """
86
  Downsamples the video to evenly spaced frames.
@@ -113,7 +125,7 @@ def generate_image(model_name: str, text: str, image: Image.Image,
113
  Generates responses using the selected model for image input.
114
  Yields raw text and Markdown-formatted text.
115
  """
116
- if model_name == "RolmOCR":
117
  processor = processor_m
118
  model = model_m
119
  elif model_name == "Qwen2-VL-OCR":
@@ -122,9 +134,12 @@ def generate_image(model_name: str, text: str, image: Image.Image,
122
  elif model_name == "Nanonets-OCR-s":
123
  processor = processor_v
124
  model = model_v
125
- elif model_name == "Aya-Vision":
126
  processor = processor_a
127
  model = model_a
 
 
 
128
  else:
129
  yield "Invalid model selected.", "Invalid model selected."
130
  return
@@ -171,7 +186,7 @@ def generate_video(model_name: str, text: str, video_path: str,
171
  Generates responses using the selected model for video input.
172
  Yields raw text and Markdown-formatted text.
173
  """
174
- if model_name == "RolmOCR":
175
  processor = processor_m
176
  model = model_m
177
  elif model_name == "Qwen2-VL-OCR":
@@ -180,9 +195,12 @@ def generate_video(model_name: str, text: str, video_path: str,
180
  elif model_name == "Nanonets-OCR-s":
181
  processor = processor_v
182
  model = model_v
183
- elif model_name == "Aya-Vision":
184
  processor = processor_a
185
  model = model_a
 
 
 
186
  else:
187
  yield "Invalid model selected.", "Invalid model selected."
188
  return
@@ -284,6 +302,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
284
  top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
285
  top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
286
  repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
 
287
  with gr.Column():
288
  with gr.Column(elem_classes="canvas-output"):
289
  gr.Markdown("## Result.Md")
@@ -292,15 +311,17 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
292
  with gr.Accordion("Formatted Result (Result.md)", open=False):
293
  markdown_output = gr.Markdown(label="Formatted Result (Result.Md)")
294
  model_choice = gr.Radio(
295
- choices=["Nanonets-OCR-s", "Qwen2-VL-OCR", "RolmOCR", "Aya-Vision"],
296
  label="Select Model",
297
  value="Nanonets-OCR-s"
298
  )
299
  gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR/discussions)")
300
- gr.Markdown("> [Qwen2-VL-OCR-2B-Instruct](https://huggingface.co/prithivMLmods/Qwen2-VL-OCR-2B-Instruct): qwen2-vl-ocr-2b-instruct model is a fine-tuned version of qwen2-vl-2b-instruct, tailored for tasks that involve [messy] optical character recognition (ocr), image-to-text conversion, and math problem solving with latex formatting.")
301
  gr.Markdown("> [Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s): nanonets-ocr-s is a powerful, state-of-the-art image-to-markdown ocr model that goes far beyond traditional text extraction. it transforms documents into structured markdown with intelligent content recognition and semantic tagging.")
 
 
302
  gr.Markdown("> [RolmOCR](https://huggingface.co/reducto/RolmOCR): rolmocr, high-quality, openly available approach to parsing pdfs and other complex documents optical character recognition. it is designed to handle a wide range of document types, including scanned documents, handwritten text, and complex layouts.")
303
  gr.Markdown("> [Aya-Vision](https://huggingface.co/CohereLabs/aya-vision-8b): cohere labs aya vision 8b is an open weights research release of an 8-billion parameter model with advanced capabilities optimized for a variety of vision-language use cases, including ocr, captioning, visual reasoning, summarization, question answering, code, and more.")
 
304
  gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
305
 
306
  image_submit.click(
 
81
  torch_dtype=torch.float16
82
  ).to(device).eval()
83
 
84
+ #-----------------------------subfolder-----------------------------#
85
+ # Load MonkeyOCR-1.2B-0709
86
+ MODEL_ID_W = "echo840/MonkeyOCR-1.2B-0709"
87
+ SUBFOLDER = "Recognition"
88
+ processor_w = AutoProcessor.from_pretrained(MODEL_ID_W, trust_remote_code=True, subfolder=SUBFOLDER)
89
+ model_w = Qwen2_5_VLForConditionalGeneration.from_pretrained(
90
+ MODEL_ID_W, trust_remote_code=True,
91
+ subfolder=SUBFOLDER,
92
+ torch_dtype=torch.float16).to(device).eval()
93
+ #-----------------------------subfolder-----------------------------#
94
+
95
+
96
  def downsample_video(video_path):
97
  """
98
  Downsamples the video to evenly spaced frames.
 
125
  Generates responses using the selected model for image input.
126
  Yields raw text and Markdown-formatted text.
127
  """
128
+ if model_name == "RolmOCR-7B":
129
  processor = processor_m
130
  model = model_m
131
  elif model_name == "Qwen2-VL-OCR":
 
134
  elif model_name == "Nanonets-OCR-s":
135
  processor = processor_v
136
  model = model_v
137
+ elif model_name == "Aya-Vision-8B":
138
  processor = processor_a
139
  model = model_a
140
+ elif model_name == "MonkeyOCR-1.2B-0709":
141
+ processor = processor_w
142
+ model = model_w
143
  else:
144
  yield "Invalid model selected.", "Invalid model selected."
145
  return
 
186
  Generates responses using the selected model for video input.
187
  Yields raw text and Markdown-formatted text.
188
  """
189
+ if model_name == "RolmOCR-7B":
190
  processor = processor_m
191
  model = model_m
192
  elif model_name == "Qwen2-VL-OCR":
 
195
  elif model_name == "Nanonets-OCR-s":
196
  processor = processor_v
197
  model = model_v
198
+ elif model_name == "Aya-Vision-8B":
199
  processor = processor_a
200
  model = model_a
201
+ elif model_name == "MonkeyOCR-1.2B-0709":
202
+ processor = processor_w
203
+ model = model_w
204
  else:
205
  yield "Invalid model selected.", "Invalid model selected."
206
  return
 
302
  top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
303
  top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
304
  repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
305
+
306
  with gr.Column():
307
  with gr.Column(elem_classes="canvas-output"):
308
  gr.Markdown("## Result.Md")
 
311
  with gr.Accordion("Formatted Result (Result.md)", open=False):
312
  markdown_output = gr.Markdown(label="Formatted Result (Result.Md)")
313
  model_choice = gr.Radio(
314
+ choices=["Nanonets-OCR-s", "MonkeyOCR-1.2B-0709", "Qwen2-VL-OCR", "RolmOCR-7B", "Aya-Vision-8B"],
315
  label="Select Model",
316
  value="Nanonets-OCR-s"
317
  )
318
  gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR/discussions)")
 
319
  gr.Markdown("> [Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s): nanonets-ocr-s is a powerful, state-of-the-art image-to-markdown ocr model that goes far beyond traditional text extraction. it transforms documents into structured markdown with intelligent content recognition and semantic tagging.")
320
+ gr.Markdown("> [MonkeyOCR-1.2B-0709](https://huggingface.co/echo840/MonkeyOCR-1.2B-0709): MonkeyOCR adopts a structure-recognition-relation (SRR) triplet paradigm, which simplifies the multi-tool pipeline of modular approaches while avoiding the inefficiency of using large multimodal models for full-page document processing.")
321
+ gr.Markdown("> [Qwen2-VL-OCR-2B-Instruct](https://huggingface.co/prithivMLmods/Qwen2-VL-OCR-2B-Instruct): qwen2-vl-ocr-2b-instruct model is a fine-tuned version of qwen2-vl-2b-instruct, tailored for tasks that involve [messy] optical character recognition (ocr), image-to-text conversion, and math problem solving with latex formatting.")
322
  gr.Markdown("> [RolmOCR](https://huggingface.co/reducto/RolmOCR): rolmocr, high-quality, openly available approach to parsing pdfs and other complex documents optical character recognition. it is designed to handle a wide range of document types, including scanned documents, handwritten text, and complex layouts.")
323
  gr.Markdown("> [Aya-Vision](https://huggingface.co/CohereLabs/aya-vision-8b): cohere labs aya vision 8b is an open weights research release of an 8-billion parameter model with advanced capabilities optimized for a variety of vision-language use cases, including ocr, captioning, visual reasoning, summarization, question answering, code, and more.")
324
+
325
  gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
326
 
327
  image_submit.click(