Update app.py
app.py
CHANGED
@@ -3,11 +3,11 @@ from transformers.image_utils import load_image
 from threading import Thread
 import time
 import torch
-import spaces
 import cv2
 import numpy as np
 from PIL import Image
 import re
+import os
 from transformers import (
     Qwen2VLForConditionalGeneration,
     AutoProcessor,
@@ -105,28 +105,57 @@ def extract_medicine_names(text):
 
     return unique_medicines
 
-#
-
-
-qwen_processor = AutoProcessor.from_pretrained(QV_MODEL_ID, trust_remote_code=True)
-qwen_model = Qwen2VLForConditionalGeneration.from_pretrained(
-    QV_MODEL_ID,
-    trust_remote_code=True,
-    torch_dtype=torch.float16
-).to("cuda").eval()
+# Check for CUDA availability
+device = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"Using device: {device}")
 
-#
-
-
-
-
-
-
-
+# Adjust model loading based on device
+dtype = torch.float16 if device == "cuda" else torch.float32
+bfdtype = torch.bfloat16 if device == "cuda" else torch.float32
+
+# Set lower precision for CPU if available
+if device == "cpu":
+    try:
+        # Check if Intel MKL is available for better CPU performance
+        import intel_extension_for_pytorch as ipex
+        dtype = torch.bfloat16
+        print("Using Intel optimizations for PyTorch")
+    except ImportError:
+        print("Intel optimizations not available, using standard CPU mode")
+
+# Model and Processor Setup with proper error handling
+try:
+    # Qwen2VL OCR (default branch)
+    QV_MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"  # [or] prithivMLmods/Qwen2-VL-OCR2-2B-Instruct
+    qwen_processor = AutoProcessor.from_pretrained(QV_MODEL_ID, trust_remote_code=True)
+    qwen_model = Qwen2VLForConditionalGeneration.from_pretrained(
+        QV_MODEL_ID,
+        trust_remote_code=True,
+        torch_dtype=dtype,
+        low_cpu_mem_usage=True,
+    ).to(device).eval()
+
+    # RolmOCR branch (@RolmOCR)
+    ROLMOCR_MODEL_ID = "reducto/RolmOCR"
+    rolmocr_processor = AutoProcessor.from_pretrained(ROLMOCR_MODEL_ID, trust_remote_code=True)
+    rolmocr_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+        ROLMOCR_MODEL_ID,
+        trust_remote_code=True,
+        torch_dtype=bfdtype,
+        low_cpu_mem_usage=True,
+    ).to(device).eval()
+
+    models_loaded = True
+except Exception as e:
+    print(f"Error loading models: {str(e)}")
+    models_loaded = False
 
 # Main Inference Function
-@spaces.GPU
 def model_inference(input_dict, history):
+    if not models_loaded:
+        yield "Error: Models could not be loaded. Please check system requirements."
+        return
+
     text = input_dict["text"].strip()
     files = input_dict.get("files", [])
 
@@ -154,7 +183,7 @@ def model_inference(input_dict, history):
             images=images,
             return_tensors="pt",
             padding=True,
-        ).to("cuda")
+        ).to(device)
 
         # First, get the complete OCR text
         streamer = TextIteratorStreamer(rolmocr_processor, skip_prompt=True, skip_special_tokens=True)
@@ -210,7 +239,7 @@ def model_inference(input_dict, history):
                 images=video_images,
                 return_tensors="pt",
                 padding=True,
-            ).to("cuda")
+            ).to(device)
         else:
             # Assume image(s) or text query.
             if len(files) > 1:
@@ -235,7 +264,7 @@ def model_inference(input_dict, history):
                 images=images if images else None,
                 return_tensors="pt",
                 padding=True,
-            ).to("cuda")
+            ).to(device)
             streamer = TextIteratorStreamer(rolmocr_processor, skip_prompt=True, skip_special_tokens=True)
             generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
             thread = Thread(target=rolmocr_model.generate, kwargs=generation_kwargs)
@@ -279,7 +308,7 @@ def model_inference(input_dict, history):
             images=images if images else None,
             return_tensors="pt",
             padding=True,
-        ).to("cuda")
+        ).to(device)
         streamer = TextIteratorStreamer(qwen_processor, skip_prompt=True, skip_special_tokens=True)
         generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
         thread = Thread(target=qwen_model.generate, kwargs=generation_kwargs)
@@ -296,7 +325,6 @@ def model_inference(input_dict, history):
 examples = [
     [{"text": "@Prescription Extract medicines from this prescription", "files": ["examples/prescription1.jpg"]}],
     [{"text": "@RolmOCR OCR the Text in the Image", "files": ["rolm/1.jpeg"]}],
-    [{"text": "@RolmOCR Explain the Ad in Detail", "files": ["examples/videoplayback.mp4"]}],
    [{"text": "@RolmOCR OCR the Image", "files": ["rolm/3.jpeg"]}],
     [{"text": "Extract as JSON table from the table", "files": ["examples/4.jpg"]}],
 ]
@@ -325,6 +353,10 @@ description = """
 Upload your medical prescription images and get the medicine names extracted automatically!
 """
 
+# Memory optimization for Hugging Face Spaces
+import gc
+max_memory = {i: f"{15}GiB" for i in range(torch.cuda.device_count())}
+
 demo = gr.ChatInterface(
     fn=model_inference,
     description=description,
@@ -341,4 +373,7 @@ demo = gr.ChatInterface(
     css=css
 )
 
-
+if __name__ == "__main__":
+    # Add queue to prevent timeouts
+    demo.queue(concurrency_count=1)
+    demo.launch(debug=True, share=False)
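Note on the CPU branch in the model-setup hunk: importing intel_extension_for_pytorch only confirms the package is installed; the speed-up usually comes from wrapping the loaded models with ipex.optimize(). The sketch below shows that typical wiring, reusing the commit's own names (device, qwen_model, rolmocr_model); it illustrates the library's usual usage and is not something this commit does.

# Illustrative only: applying intel_extension_for_pytorch after the models
# are loaded on CPU (the commit itself only imports the package).
import torch

if device == "cpu":
    try:
        import intel_extension_for_pytorch as ipex
        # ipex.optimize returns an optimized copy of the module; bfloat16
        # matches the dtype the commit selects for this case.
        qwen_model = ipex.optimize(qwen_model.eval(), dtype=torch.bfloat16)
        rolmocr_model = ipex.optimize(rolmocr_model.eval(), dtype=torch.bfloat16)
    except ImportError:
        pass  # fall back to plain PyTorch CPU inference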
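Note on the "Memory optimization for Hugging Face Spaces" hunk: max_memory is defined (and gc imported) but neither is referenced anywhere else in this diff, and on a CPU-only Space torch.cuda.device_count() is 0, so the dict is empty. If the intent is to cap GPU memory during loading, such a dict is normally passed to from_pretrained together with device_map="auto" (both are standard transformers/accelerate loading arguments). A minimal sketch, not what the commit does:

# Hypothetical use of the max_memory cap defined in the diff.
import torch
from transformers import Qwen2VLForConditionalGeneration

QV_MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"

if torch.cuda.is_available():
    max_memory = {i: "15GiB" for i in range(torch.cuda.device_count())}
    qwen_model = Qwen2VLForConditionalGeneration.from_pretrained(
        QV_MODEL_ID,
        torch_dtype=torch.float16,
        device_map="auto",      # let accelerate place weights under the cap
        max_memory=max_memory,
    ).eval()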
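Note on the new __main__ block: demo.queue(concurrency_count=1) uses the Gradio 3.x signature. If the Space runs Gradio 4.x, queue() no longer accepts concurrency_count; the rough equivalent there is default_concurrency_limit. A version-dependent sketch, assuming Gradio 4.x rather than the version this Space pins:

# Gradio 4.x variant of the queue call added in this commit.
if __name__ == "__main__":
    demo.queue(default_concurrency_limit=1)
    demo.launch(debug=True, share=False)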