shukdevdatta123 committed on
Commit 37acc53 · verified · 1 Parent(s): e1accc9

Update app.py

Files changed (1)
  1. app.py +106 -361
app.py CHANGED
@@ -1,379 +1,124 @@
  import gradio as gr
- from transformers.image_utils import load_image
- from threading import Thread
- import time
- import torch
- import cv2
- import numpy as np
- from PIL import Image
  import re
- import os
- from transformers import (
-     Qwen2VLForConditionalGeneration,
-     AutoProcessor,
-     TextIteratorStreamer,
- )
- from transformers import Qwen2_5_VLForConditionalGeneration

- # ---------------------------
- # Helper Functions
- # ---------------------------
- def progress_bar_html(label: str, primary_color: str = "#4B0082", secondary_color: str = "#9370DB") -> str:
-     """
-     Returns an HTML snippet for a thin animated progress bar with a label.
-     Colors can be customized; default colors are used for Qwen2VL/Aya‑Vision.
-     """
-     return f'''
-     <div style="display: flex; align-items: center;">
-         <span style="margin-right: 10px; font-size: 14px;">{label}</span>
-         <div style="width: 110px; height: 5px; background-color: {secondary_color}; border-radius: 2px; overflow: hidden;">
-             <div style="width: 100%; height: 100%; background-color: {primary_color}; animation: loading 1.5s linear infinite;"></div>
-         </div>
-     </div>
-     <style>
-     @keyframes loading {{
-         0% {{ transform: translateX(-100%); }}
-         100% {{ transform: translateX(100%); }}
-     }}
-     </style>
-     '''
-
- def downsample_video(video_path):
-     """
-     Downsamples a video file by extracting 10 evenly spaced frames.
-     Returns a list of tuples (PIL.Image, timestamp).
-     """
-     vidcap = cv2.VideoCapture(video_path)
-     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
-     fps = vidcap.get(cv2.CAP_PROP_FPS)
-     frames = []
-     if total_frames <= 0 or fps <= 0:
-         vidcap.release()
-         return frames
-     # Determine 10 evenly spaced frame indices.
-     frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
-     for i in frame_indices:
-         vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
-         success, image = vidcap.read()
-         if success:
-             image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-             pil_image = Image.fromarray(image)
-             timestamp = round(i / fps, 2)
-             frames.append((pil_image, timestamp))
-     vidcap.release()
-     return frames
-
- def extract_medicine_names(text):
-     """
-     Extracts medicine names from OCR text output.
-     Uses a combination of pattern matching and formatting to identify medications.
-     Returns a formatted list of medicines found.
-     """
-     # Common medicine patterns (extended to catch more formats)
-     lines = text.split('\n')
-     medicines = []
-
-     # Look for patterns typical in prescriptions
-     for line in lines:
-         # Clean and standardize the line
-         clean_line = line.strip()
-
-         # Skip very short lines, headers, or non-relevant text
-         if len(clean_line) < 3 or re.search(r'(prescription|rx|patient|name|date|doctor|hospital|clinic|address)', clean_line.lower()):
-             continue
-
-         # Medicine names often appear at the beginning of lines, with dosage info following
-         # Look for tablet/capsule/mg indicators - strong indicators of medication
-         if re.search(r'(tab|tablet|cap|capsule|mg|ml|injection|syrup|solution|suspension|ointment|cream|gel|patch|suppository|inhaler|drops)', clean_line.lower()):
-             # Extract the likely medicine name - the part before the dosage/form or the entire line if it's short
-             medicine_match = re.split(r'(\d+\s*mg|\d+\s*ml|\d+\s*tab|\d+\s*cap)', clean_line, 1)[0].strip()
-             if medicine_match and len(medicine_match) > 2:
-                 medicines.append(medicine_match)
-
-         # Check for brand names or generic medication patterns
-         elif re.match(r'^[A-Z][a-z]+\s*[A-Z0-9]', clean_line) or re.match(r'^[A-Z][a-z]+', clean_line):
-             # Likely a medicine name starting with a capital letter
-             medicine_parts = re.split(r'(\d+|\s+\d+\s*times|\s+\d+\s*times\s+daily)', clean_line, 1)
-             if medicine_parts and len(medicine_parts[0]) > 2:
-                 medicines.append(medicine_parts[0].strip())
-
-     # Remove duplicates while preserving order
-     unique_medicines = []
-     for med in medicines:
-         if med not in unique_medicines:
-             unique_medicines.append(med)

-     return unique_medicines
-
- # Check for CUDA availability
- device = "cuda" if torch.cuda.is_available() else "cpu"
- print(f"Using device: {device}")
-
- # Adjust model loading based on device
- dtype = torch.float16 if device == "cuda" else torch.float32
- bfdtype = torch.bfloat16 if device == "cuda" else torch.float32
-
- # Set lower precision for CPU if available
- if device == "cpu":
      try:
-         # Check if Intel MKL is available for better CPU performance
-         import intel_extension_for_pytorch as ipex
-         dtype = torch.bfloat16
-         print("Using Intel optimizations for PyTorch")
-     except ImportError:
-         print("Intel optimizations not available, using standard CPU mode")
-
- # Model and Processor Setup with proper error handling
- try:
-     # Qwen2VL OCR (default branch)
-     QV_MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct" # [or] prithivMLmods/Qwen2-VL-OCR2-2B-Instruct
-     qwen_processor = AutoProcessor.from_pretrained(QV_MODEL_ID, trust_remote_code=True)
-     qwen_model = Qwen2VLForConditionalGeneration.from_pretrained(
-         QV_MODEL_ID,
-         trust_remote_code=True,
-         torch_dtype=dtype,
-         low_cpu_mem_usage=True,
-     ).to(device).eval()
-
-     # RolmOCR branch (@RolmOCR)
-     ROLMOCR_MODEL_ID = "reducto/RolmOCR"
-     rolmocr_processor = AutoProcessor.from_pretrained(ROLMOCR_MODEL_ID, trust_remote_code=True)
-     rolmocr_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-         ROLMOCR_MODEL_ID,
-         trust_remote_code=True,
-         torch_dtype=bfdtype,
-         low_cpu_mem_usage=True,
-     ).to(device).eval()

-     models_loaded = True
- except Exception as e:
-     print(f"Error loading models: {str(e)}")
-     models_loaded = False
-
- # Main Inference Function
- def model_inference(input_dict, history):
-     if not models_loaded:
-         yield "Error: Models could not be loaded. Please check system requirements."
-         return
-
-     text = input_dict["text"].strip()
-     files = input_dict.get("files", [])

-     # Check for prescription-specific command
-     if text.lower().startswith("@prescription") or text.lower().startswith("@med"):
-         # Specific mode for medicine extraction
-         if not files:
-             yield "Error: Please upload a prescription image to extract medicine names."
-             return
-
-         # Use RolmOCR for better text extraction from prescriptions
-         images = [load_image(image) for image in files[:1]] # Taking just the first image for processing
-
-         messages = [{
-             "role": "user",
-             "content": [
-                 {"type": "image", "image": images[0]},
-                 {"type": "text", "text": "Extract all text from this medical prescription image, focus on medicine names, dosages, and instructions."},
              ],
-         }]

-         prompt_full = rolmocr_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-         inputs = rolmocr_processor(
-             text=[prompt_full],
-             images=images,
-             return_tensors="pt",
-             padding=True,
-         ).to(device)

-         # First, get the complete OCR text
-         streamer = TextIteratorStreamer(rolmocr_processor, skip_prompt=True, skip_special_tokens=True)
-         generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
-         thread = Thread(target=rolmocr_model.generate, kwargs=generation_kwargs)
-         thread.start()

-         ocr_text = ""
-         yield progress_bar_html("Processing Prescription with Medicine Extractor")

-         for new_text in streamer:
-             ocr_text += new_text
-             ocr_text = ocr_text.replace("<|im_end|>", "")
-             time.sleep(0.01)

-         # After getting full OCR text, extract medicine names
-         medicines = extract_medicine_names(ocr_text)

-         # Format the results nicely
-         result = "## Extracted Medicine Names\n\n"
-         if medicines:
-             for i, med in enumerate(medicines, 1):
-                 result += f"{i}. {med}\n"
-         else:
-             result += "No medicine names detected in the prescription.\n\n"
-
-         result += "\n\n## Full OCR Text\n\n```\n" + ocr_text + "\n```"
-         yield result
-         return

-     # RolmOCR Inference (@RolmOCR)
-     if text.lower().startswith("@rolmocr"):
-         # Remove the tag from the query.
-         text_prompt = text[len("@rolmocr"):].strip()
-         # Check if a video is provided for inference.
-         if files and isinstance(files[0], str) and files[0].lower().endswith((".mp4", ".avi", ".mov")):
-             video_path = files[0]
-             frames = downsample_video(video_path)
-             if not frames:
-                 yield "Error: Could not extract frames from the video."
-                 return
-             # Build the message: prompt followed by each frame with its timestamp.
-             content_list = [{"type": "text", "text": text_prompt}]
-             for image, timestamp in frames:
-                 content_list.append({"type": "text", "text": f"Frame {timestamp}:"})
-                 content_list.append({"type": "image", "image": image})
-             messages = [{"role": "user", "content": content_list}]
-             # For video, extract images only.
-             video_images = [image for image, _ in frames]
-             prompt_full = rolmocr_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-             inputs = rolmocr_processor(
-                 text=[prompt_full],
-                 images=video_images,
-                 return_tensors="pt",
-                 padding=True,
-             ).to(device)
-         else:
-             # Assume image(s) or text query.
-             if len(files) > 1:
-                 images = [load_image(image) for image in files]
-             elif len(files) == 1:
-                 images = [load_image(files[0])]
-             else:
-                 images = []
-             if text_prompt == "" and not images:
-                 yield "Error: Please input a text query and/or provide an image for the @RolmOCR feature."
-                 return
-             messages = [{
-                 "role": "user",
-                 "content": [
-                     *[{"type": "image", "image": image} for image in images],
-                     {"type": "text", "text": text_prompt},
-                 ],
-             }]
-             prompt_full = rolmocr_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-             inputs = rolmocr_processor(
-                 text=[prompt_full],
-                 images=images if images else None,
-                 return_tensors="pt",
-                 padding=True,
-             ).to(device)
-         streamer = TextIteratorStreamer(rolmocr_processor, skip_prompt=True, skip_special_tokens=True)
-         generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
-         thread = Thread(target=rolmocr_model.generate, kwargs=generation_kwargs)
-         thread.start()
-         buffer = ""
-         # Use a different color scheme for RolmOCR (purple-themed).
-         yield progress_bar_html("Processing with Qwen2.5VL (RolmOCR)")
-         for new_text in streamer:
-             buffer += new_text
-             buffer = buffer.replace("<|im_end|>", "")
-             time.sleep(0.01)
-             yield buffer
-         return
-
-     # Default Inference: Qwen2VL OCR
-     # Process files: support multiple images.
-     if len(files) > 1:
-         images = [load_image(image) for image in files]
-     elif len(files) == 1:
-         images = [load_image(files[0])]
-     else:
-         images = []

-     if text == "" and not images:
-         yield "Error: Please input a text query and optionally image(s)."
-         return
-     if text == "" and images:
-         yield "Error: Please input a text query along with the image(s)."
-         return
-
-     messages = [{
-         "role": "user",
-         "content": [
-             *[{"type": "image", "image": image} for image in images],
-             {"type": "text", "text": text},
-         ],
-     }]
-     prompt_full = qwen_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-     inputs = qwen_processor(
-         text=[prompt_full],
-         images=images if images else None,
-         return_tensors="pt",
-         padding=True,
-     ).to(device)
-     streamer = TextIteratorStreamer(qwen_processor, skip_prompt=True, skip_special_tokens=True)
-     generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
-     thread = Thread(target=qwen_model.generate, kwargs=generation_kwargs)
-     thread.start()
-     buffer = ""
-     yield progress_bar_html("Processing with Qwen2VL OCR")
-     for new_text in streamer:
-         buffer += new_text
-         buffer = buffer.replace("<|im_end|>", "")
-         time.sleep(0.01)
-         yield buffer
-
- # Gradio Interface
- examples = [
-     [{"text": "@Prescription Extract medicines from this prescription", "files": ["examples/prescription1.jpg"]}],
-     [{"text": "@RolmOCR OCR the Text in the Image", "files": ["rolm/1.jpeg"]}],
-     [{"text": "@RolmOCR OCR the Image", "files": ["rolm/3.jpeg"]}],
-     [{"text": "Extract as JSON table from the table", "files": ["examples/4.jpg"]}],
- ]
-
- css = """
- .gradio-container {
-     font-family: 'Roboto', sans-serif;
- }
- .prescription-header {
-     background-color: #4B0082;
-     color: white;
-     padding: 10px;
-     border-radius: 5px;
-     margin-bottom: 10px;
- }
- """
-
- description = """
- # **Multimodal OCR with Medicine Extraction**
-
- ## Modes:
- - **@Prescription** - Upload a prescription image to extract medicine names
- - **@RolmOCR** - Use RolmOCR for general text extraction
- - **Default** - Use Qwen2VL OCR for general purposes
-
- Upload your medical prescription images and get the medicine names extracted automatically!
- """
-
- # Memory optimization for Hugging Face Spaces
- import gc
- max_memory = {i: f"{15}GiB" for i in range(torch.cuda.device_count())}
-
- demo = gr.ChatInterface(
-     fn=model_inference,
-     description=description,
-     examples=examples,
-     textbox=gr.MultimodalTextbox(
-         label="Query Input",
-         file_types=["image", "video"],
-         file_count="multiple",
-         placeholder="Use @Prescription to extract medicines, @RolmOCR for RolmOCR, or leave blank for default Qwen2VL OCR"
-     ),
-     stop_btn="Stop Generation",
-     multimodal=True,
-     cache_examples=False,
-     css=css
- )

  if __name__ == "__main__":
-     # Add queue to prevent timeouts
-     demo.queue(concurrency_count=1)
-     demo.launch(debug=True, share=False)
  import gradio as gr
+ from openai import OpenAI
  import re

+ def get_openrouter_client(api_key):
+     """Initialize OpenRouter client with user-provided API key"""
+     if not api_key or api_key.strip() == "":
+         return None, "Please enter your OpenRouter API key"

      try:
+         client = OpenAI(
+             base_url="https://openrouter.ai/api/v1",
+             api_key=api_key
+         )
+         return client, None
+     except Exception as e:
+         return None, f"Error initializing client: {str(e)}"
+
+ def extract_medicine_names(image, api_key):
+     """Extract medicine names from a prescription image using Gemini via OpenRouter"""
+     if not image:
+         return "Please upload a prescription image."

+     # Get client with user-provided API key
+     client, error = get_openrouter_client(api_key)
+     if error:
+         return error

+     try:
+         response = client.chat.completions.create(
+             extra_headers={
+                 "HTTP-Referer": "https://medicine-extractor-app.com",
+                 "X-Title": "Medicine Name Extractor",
+             },
+             model="google/gemini-2.5-pro-exp-03-25:free",
+             messages=[
+                 {
+                     "role": "system",
+                     "content": "You are an AI specialized in extracting medication names from prescription images. Only list the medication names, nothing else."
+                 },
+                 {
+                     "role": "user",
+                     "content": [
+                         {
+                             "type": "text",
+                             "text": "Extract ONLY the names of medications from this prescription image. Provide them as a numbered list. If this isn't a medical prescription, respond with 'No prescription detected'."
+                         },
+                         {
+                             "type": "image_url",
+                             "image_url": {
+                                 "url": image
+                             }
+                         }
+                     ]
+                 }
              ],
+             max_tokens=300
+         )

+         result = response.choices[0].message.content.strip()

+         # Check if no prescription was detected
+         if "No prescription detected" in result:
+             return "No prescription detected in the image."

+         # Clean up the response to just include the medication names
+         # Remove any explanatory text that might appear before or after the list
+         medicines = []
+         for line in result.split('\n'):
+             # Look for numbered lines or lines starting with medication names
+             if re.match(r'^\d+\.', line.strip()):
+                 # Extract text after the number and period
+                 med_name = re.sub(r'^\d+\.\s*', '', line.strip())
+                 medicines.append(med_name)

+         if not medicines:
+             # If numbered list processing didn't work, return the raw output
+             return result

+         return "\n".join([f"{i+1}. {med}" for i, med in enumerate(medicines)])

+     except Exception as e:
+         return f"Error: {str(e)}"

+ # Create the Gradio interface
+ with gr.Blocks(title="Prescription Medicine Extractor") as app:
+     gr.Markdown("# Prescription Medicine Name Extractor")
+     gr.Markdown("Upload a prescription image to extract medication names.")

+     api_key = gr.Textbox(
+         label="OpenRouter API Key",
+         placeholder="Enter your OpenRouter API key here",
+         type="password"
+     )
+
+     with gr.Row():
+         with gr.Column():
+             image_input = gr.Image(type="filepath", label="Upload Prescription Image")
+             submit_btn = gr.Button("Extract Medicine Names", variant="primary")
+
+         with gr.Column():
+             output = gr.Textbox(label="Extracted Medicine Names", lines=10)
+
+     submit_btn.click(
+         fn=extract_medicine_names,
+         inputs=[image_input, api_key],
+         outputs=[output]
+     )
+
+     gr.Markdown("""
+     ## Usage Instructions
+     1. Enter your OpenRouter API key (get one from https://openrouter.ai)
+     2. Upload a clear image of a medical prescription
+     3. Click the "Extract Medicine Names" button
+     4. The names of medications will be displayed in the output box
+
+     **Note:** For best results, ensure the image is clear and the text is readable.
+     **Privacy Notice:** Your API key and images are processed only during the active session and are not stored.
+     """)

+ # Launch the app
  if __name__ == "__main__":
+     print("Starting Prescription Medicine Name Extractor application...")
+     app.launch()
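The updated `extract_medicine_names` forwards the `gr.Image(type="filepath")` value straight into the `image_url` field of the chat-completions request. OpenAI-compatible endpoints such as OpenRouter generally expect an HTTP(S) URL or a base64 `data:` URL there rather than a local file path, so a minimal sketch of that conversion is shown below; `image_path_to_data_url` is an illustrative helper under that assumption, not part of this commit.

```python
import base64
import mimetypes

def image_path_to_data_url(image_path: str) -> str:
    """Read a local image file and return it as a base64-encoded data URL."""
    mime_type, _ = mimetypes.guess_type(image_path)
    mime_type = mime_type or "image/jpeg"  # fall back when the type cannot be guessed
    with open(image_path, "rb") as f:
        encoded = base64.b64encode(f.read()).decode("utf-8")
    return f"data:{mime_type};base64,{encoded}"

# Hypothetical use inside the request body shown in the diff:
#   {"type": "image_url", "image_url": {"url": image_path_to_data_url(image)}}
```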