prithivMLmods committed on
Commit
4eaf777
·
verified ·
1 Parent(s): 128e479

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +5 -22
app.py CHANGED
@@ -28,7 +28,7 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
28
 
29
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
30
 
31
- # Load Cosmos-Reason1-7B
32
  MODEL_ID_M = "reducto/RolmOCR"
33
  processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
34
  model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
@@ -37,7 +37,7 @@ model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
37
  torch_dtype=torch.float16
38
  ).to(device).eval()
39
 
40
- # Load DocScope
41
  MODEL_ID_X = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
42
  processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
43
  model_x = Qwen2VLForConditionalGeneration.from_pretrained(
@@ -46,16 +46,7 @@ model_x = Qwen2VLForConditionalGeneration.from_pretrained(
46
  torch_dtype=torch.float16
47
  ).to(device).eval()
48
 
49
- # Load Relaxed
50
- MODEL_ID_Z = "lingshu-medical-mllm/Lingshu-7B"
51
- processor_z = AutoProcessor.from_pretrained(MODEL_ID_Z, trust_remote_code=True)
52
- model_z = Qwen2_5_VLForConditionalGeneration.from_pretrained(
53
- MODEL_ID_Z,
54
- trust_remote_code=True,
55
- torch_dtype=torch.float16
56
- ).to(device).eval()
57
-
58
- # Load visionOCR
59
  MODEL_ID_V = "nanonets/Nanonets-OCR-s"
60
  processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
61
  model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
@@ -101,9 +92,6 @@ def generate_image(model_name: str, text: str, image: Image.Image,
101
  elif model_name == "Qwen2-VL-OCR-2B-Instruct":
102
  processor = processor_x
103
  model = model_x
104
- elif model_name == "Lingshu-7B":
105
- processor = processor_z
106
- model = model_z
107
  elif model_name == "Nanonets-OCR-s":
108
  processor = processor_v
109
  model = model_v
@@ -157,9 +145,6 @@ def generate_video(model_name: str, text: str, video_path: str,
157
  elif model_name == "Qwen2-VL-OCR-2B-Instruct":
158
  processor = processor_x
159
  model = model_x
160
- elif model_name == "Lingshu-7B":
161
- processor = processor_z
162
- model = model_z
163
  elif model_name == "Nanonets-OCR-s":
164
  processor = processor_v
165
  model = model_v
@@ -215,7 +200,6 @@ image_examples = [
215
  ]
216
 
217
  video_examples = [
218
- ["Explain the watch ad in detail.", "videos/1.mp4"],
219
  ["Identify the main actions in the cartoon video", "videos/2.mp4"]
220
  ]
221
 
@@ -260,16 +244,15 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
260
  with gr.Column():
261
  output = gr.Textbox(label="Output", interactive=False, lines=2, scale=2)
262
  model_choice = gr.Radio(
263
- choices=["Nanonets-OCR-s", "Qwen2-VL-OCR-2B-Instruct", "RolmOCR", "Lingshu-7B"],
264
  label="Select Model",
265
- value="RolmOCR"
266
  )
267
 
268
  gr.Markdown("**Model Info**")
269
  gr.Markdown("⤷ [Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s): nanonets-ocr-s is a powerful, state-of-the-art image-to-markdown ocr model that goes far beyond traditional text extraction. it transforms documents into structured markdown with intelligent content recognition and semantic tagging.")
270
  gr.Markdown("⤷ [Qwen2-VL-OCR-2B-Instruct](https://huggingface.co/prithivMLmods/Qwen2-VL-OCR-2B-Instruct): qwen2-vl-ocr-2b-instruct model is a fine-tuned version of qwen2-vl-2b-instruct, tailored for tasks that involve <messy> optical character recognition (ocr), image-to-text conversion, and math problem solving with latex formatting.")
271
  gr.Markdown("⤷ [RolmOCR](https://huggingface.co/reducto/RolmOCR): rolmocr, high-quality, openly available approach to parsing pdfs and other complex documents with optical character recognition. it is designed to handle a wide range of document types, including scanned documents, handwritten text, and complex layouts.")
272
- gr.Markdown("⤷ [Lingshu-7B](https://huggingface.co/lingshu-medical-mllm/Lingshu-7B): lingshu-7b is a generalist foundation model for unified multimodal medical understanding and reasoning, virtual assistants, and content generation.")
273
 
274
  image_submit.click(
275
  fn=generate_image,
 
28
 
29
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
30
 
31
+ # Load RolmOCR
32
  MODEL_ID_M = "reducto/RolmOCR"
33
  processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
34
  model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
 
37
  torch_dtype=torch.float16
38
  ).to(device).eval()
39
 
40
+ # Load Qwen2-VL-OCR-2B-Instruct
41
  MODEL_ID_X = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
42
  processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
43
  model_x = Qwen2VLForConditionalGeneration.from_pretrained(
 
46
  torch_dtype=torch.float16
47
  ).to(device).eval()
48
 
49
+ # Load Nanonets-OCR-s
 
 
 
 
 
 
 
 
 
50
  MODEL_ID_V = "nanonets/Nanonets-OCR-s"
51
  processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
52
  model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
 
92
  elif model_name == "Qwen2-VL-OCR-2B-Instruct":
93
  processor = processor_x
94
  model = model_x
 
 
 
95
  elif model_name == "Nanonets-OCR-s":
96
  processor = processor_v
97
  model = model_v
 
145
  elif model_name == "Qwen2-VL-OCR-2B-Instruct":
146
  processor = processor_x
147
  model = model_x
 
 
 
148
  elif model_name == "Nanonets-OCR-s":
149
  processor = processor_v
150
  model = model_v
 
200
  ]
201
 
202
  video_examples = [
 
203
  ["Identify the main actions in the cartoon video", "videos/2.mp4"]
204
  ]
205
 
 
244
  with gr.Column():
245
  output = gr.Textbox(label="Output", interactive=False, lines=2, scale=2)
246
  model_choice = gr.Radio(
247
+ choices=["Nanonets-OCR-s", "Qwen2-VL-OCR-2B-Instruct", "RolmOCR"],
248
  label="Select Model",
249
+ value="Nanonets-OCR-s"
250
  )
251
 
252
  gr.Markdown("**Model Info**")
253
  gr.Markdown("⤷ [Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s): nanonets-ocr-s is a powerful, state-of-the-art image-to-markdown ocr model that goes far beyond traditional text extraction. it transforms documents into structured markdown with intelligent content recognition and semantic tagging.")
254
  gr.Markdown("⤷ [Qwen2-VL-OCR-2B-Instruct](https://huggingface.co/prithivMLmods/Qwen2-VL-OCR-2B-Instruct): qwen2-vl-ocr-2b-instruct model is a fine-tuned version of qwen2-vl-2b-instruct, tailored for tasks that involve <messy> optical character recognition (ocr), image-to-text conversion, and math problem solving with latex formatting.")
255
  gr.Markdown("⤷ [RolmOCR](https://huggingface.co/reducto/RolmOCR): rolmocr, high-quality, openly available approach to parsing pdfs and other complex documents with optical character recognition. it is designed to handle a wide range of document types, including scanned documents, handwritten text, and complex layouts.")
 
256
 
257
  image_submit.click(
258
  fn=generate_image,