prithivMLmods committed · verified
Commit 741bf21 · 1 Parent(s): d3d24cd

Update app.py

Files changed (1): app.py +18 -2
app.py CHANGED
@@ -65,6 +65,15 @@ model_o = Qwen2VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
 
+# Load Megalodon-OCR-Sync-0713
+MODEL_ID_J = "prithivMLmods/Megalodon-OCR-Sync-0713"
+processor_j = AutoProcessor.from_pretrained(MODEL_ID_J, trust_remote_code=True)
+model_j = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    MODEL_ID_J,
+    trust_remote_code=True,
+    torch_dtype=torch.float16
+).to(device).eval()
+
 def downsample_video(video_path):
     """
     Downsamples the video to evenly spaced frames.
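The hunk above only loads the new processor/model pair; the generation path lives in generate_image() further down app.py and is not part of this hunk. As a rough, hypothetical sketch (the file name and prompt text below are illustrative, not taken from the commit), a Qwen2.5-VL pair loaded this way is typically invoked like this:

from PIL import Image

image = Image.open("sample_page.png")  # placeholder document image
messages = [{
    "role": "user",
    "content": [
        {"type": "image"},
        {"type": "text", "text": "OCR this document."},
    ],
}]
# Build the chat-formatted prompt and tensorize text + image together.
prompt = processor_j.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor_j(text=[prompt], images=[image], return_tensors="pt").to(device)
# Generate, then strip the echoed prompt tokens before decoding.
output_ids = model_j.generate(**inputs, max_new_tokens=1024)
trimmed = [out[len(inp):] for inp, out in zip(inputs.input_ids, output_ids)]
print(processor_j.batch_decode(trimmed, skip_special_tokens=True)[0])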
@@ -108,6 +117,9 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     elif model_name == "Typhoon-OCR-3B":
         processor = processor_t
         model = model_t
+    elif model_name == "Megalodon-OCR":
+        processor = processor_j
+        model = model_j
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
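The commit extends the existing if/elif dispatch in generate_image (and, in the next hunk, generate_video). An equivalent, purely illustrative pattern is a lookup table; only the two pairs visible in this diff are shown, the remaining entries are implied by the rest of app.py:

MODEL_REGISTRY = {
    "Typhoon-OCR-3B": (processor_t, model_t),
    "Megalodon-OCR": (processor_j, model_j),
    # ...entries for DREX, VIREX and olmOCR would follow the same pattern
}

def resolve_model(model_name: str):
    # Returns a (processor, model) pair, or None for an unknown selection.
    return MODEL_REGISTRY.get(model_name)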
@@ -164,6 +176,9 @@ def generate_video(model_name: str, text: str, video_path: str,
     elif model_name == "Typhoon-OCR-3B":
         processor = processor_t
         model = model_t
+    elif model_name == "Megalodon-OCR":
+        processor = processor_j
+        model = model_j
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
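downsample_video() is only referenced in this diff; its body is outside the changed hunks. A plausible sketch of evenly spaced frame sampling, assuming the Space uses OpenCV and PIL (not confirmed by this commit), would look roughly like:

import cv2
import numpy as np
from PIL import Image

def sample_frames(video_path, num_frames=10):
    # Pick num_frames indices spread evenly across the clip and return PIL images.
    cap = cv2.VideoCapture(video_path)
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    indices = np.linspace(0, max(total - 1, 0), num_frames, dtype=int)
    frames = []
    for idx in indices:
        cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
        ok, frame = cap.read()
        if ok:
            frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
    cap.release()
    return frames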
@@ -281,20 +296,21 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
         with gr.Column():
             with gr.Column(elem_classes="canvas-output"):
                 gr.Markdown("## Output")
-                output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=2)
+                output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=2, show_copy_button=True)
 
                 with gr.Accordion("(Result.md)", open=False):
                     markdown_output = gr.Markdown(label="(Result.Md)")
                     #download_btn = gr.Button("Download Result.md"
 
             model_choice = gr.Radio(
-                choices=["DREX-062225-7B-exp", "olmOCR-7B-0225-preview", "VIREX-062225-7B-exp", "Typhoon-OCR-3B"],
+                choices=["DREX-062225-7B-exp", "Megalodon-OCR", "olmOCR-7B-0225-preview", "VIREX-062225-7B-exp", "Typhoon-OCR-3B"],
                 label="Select Model",
                 value="DREX-062225-7B-exp"
             )
 
             gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Doc-VLMs/discussions)")
             gr.Markdown("> [DREX-062225-7B-exp](https://huggingface.co/prithivMLmods/DREX-062225-exp): the drex-062225-exp (document retrieval and extraction expert) model is a specialized fine-tuned version of docscopeocr-7b-050425-exp, optimized for document retrieval, content extraction, and analysis recognition. built on top of the qwen2.5-vl architecture.")
+            gr.Markdown("> [Megalodon-OCR-Sync-0713](https://huggingface.co/prithivMLmods/Megalodon-OCR-Sync-0713): megalodon-ocr-sync-0713 model is a fine-tuned version of qwen2.5-vl-3b-instruct, optimized for document retrieval, content extraction, and analysis recognition. built on top of the qwen2.5-vl architecture, this model enhances document comprehension capabilities.")
             gr.Markdown("> [VIREX-062225-7B-exp](https://huggingface.co/prithivMLmods/VIREX-062225-exp): the virex-062225-exp (video information retrieval and extraction expert - experimental) model is a fine-tuned version of qwen2.5-vl-7b-instruct, specifically optimized for advanced video understanding, image comprehension, sense of reasoning, and natural language decision-making through cot reasoning.")
             gr.Markdown("> [Typhoon-OCR-3B](https://huggingface.co/scb10x/typhoon-ocr-3b): a bilingual document parsing model built specifically for real-world documents in thai and english, inspired by models like olmocr, based on qwen2.5-vl-instruction. this model is intended to be used with a specific prompt only.")
             gr.Markdown("> [olmOCR-7B-0225](https://huggingface.co/allenai/olmOCR-7B-0225-preview): the olmocr-7b-0225-preview model is based on qwen2-vl-7b, optimized for document-level optical character recognition (ocr), long-context vision-language understanding, and accurate image-to-text conversion with mathematical latex formatting. designed with a focus on high-fidelity visual-textual comprehension.")
 
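The event wiring between model_choice, the input components, and the streaming output is not part of this diff. A minimal, hypothetical sketch of how such a Radio selection typically feeds the generator function inside the same gr.Blocks context (all component names other than model_choice, output and markdown_output are assumptions):

image_query = gr.Textbox(label="Query")            # assumed input components
image_upload = gr.Image(type="pil", label="Image")
image_submit = gr.Button("Run")

# generate_image is a generator (it yields), so Gradio streams partial
# results into the two output components as they arrive.
image_submit.click(
    fn=generate_image,
    inputs=[model_choice, image_query, image_upload],
    outputs=[output, markdown_output],
)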