Spaces:

prithivMLmods
/

Multimodal-OCR

Running on Zero

File size: 8,332 Bytes

ed275c9
3f6a788
 
 
 
 
a520e3c
3f6a788
5d63d59
ed275c9
5d63d59
 
fc95e60
3f6a788
 
 
ed275c9
7342b9f
 
 
 
 
36ebfe1
 
7342b9f
 
 
c8cd2f3
 
 
7342b9f
 
 
 
a520e3c
 
 
 
3f6a788
 
 
91cda81
 
 
ed275c9
a520e3c
3f6a788
 
 
 
 
 
a520e3c
 
 
 
 
 
 
ed275c9
9522057
3f6a788
 
 
a520e3c
3f6a788
 
 
 
 
a520e3c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239e8eb
 
 
a520e3c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239e8eb
a520e3c
fc95e60
 
 
 
 
 
3f6a788
fc95e60
3f6a788
5d63d59
fc95e60
3f6a788
5d63d59
 
3f6a788
 
 
 
 
 
 
 
 
 
 
 
5d63d59
fc95e60
5633a75
fe53594
ed275c9
3f6a788
 
ed275c9
3f6a788
 
ed275c9
3f6a788
ed275c9
7342b9f
ed275c9
 
0de5083
5d63d59
ed275c9
 
a520e3c
5d63d59
a520e3c
 
 
df7c39c
 
 
a520e3c
78742f4
b50fe8f
8b3f5c3
9522057
91cda81
a520e3c
91cda81
9522057
a520e3c
 
 
 
 
 
 
91cda81
7342b9f
 
 
 
a520e3c
7342b9f
91cda81
 
 
 
 
fc95e60

import gradio as gr
from transformers import (
    Qwen2VLForConditionalGeneration,
    AutoProcessor,
    TextIteratorStreamer,
    AutoModelForImageTextToText,
    Gemma3ForConditionalGeneration  # new Gemma3 model import
)
from transformers.image_utils import load_image
from threading import Thread
import time
import torch
import spaces
from PIL import Image
import requests
from io import BytesIO

# Helper function to return a progress bar HTML snippet.
def progress_bar_html(label: str) -> str:
    """Build the HTML/CSS snippet for a small animated loading bar.

    Args:
        label: Caption rendered to the left of the bar. It is interpolated
            into the markup as-is; no HTML escaping is performed.

    Returns:
        A string holding the bar markup followed by a ``<style>`` block that
        defines the ``loading`` keyframes animation referenced by the bar.
    """
    caption = f'    <span style="margin-right: 10px; font-size: 14px;">{label}</span>'
    fragments = (
        "",  # the snippet opens with a newline
        '<div style="display: flex; align-items: center;">',
        caption,
        '    <div style="width: 110px; height: 5px; background-color: #FFB6C1; border-radius: 2px; overflow: hidden;">',
        '        <div style="width: 100%; height: 100%; background-color: #FF69B4 ; animation: loading 1.5s linear infinite;"></div>',
        "    </div>",
        "</div>",
        "<style>",
        "@keyframes loading {",
        "    0% { transform: translateX(-100%); }",
        "    100% { transform: translateX(100%); }",
        "}",
        "</style>",
        "    ",  # the snippet ends with a newline followed by four spaces
    )
    return "\n".join(fragments)

### Load Models & Processors ###
# Everything below runs at import time: each checkpoint and its processor are
# instantiated once as module-level globals and reused by model_inference().

# Qwen2VL OCR model (default)
# Used when the query carries no "@..." trigger prefix.
QV_MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"  # or alternate version
qwen_processor = AutoProcessor.from_pretrained(QV_MODEL_ID, trust_remote_code=True)
# Loaded in float16, moved explicitly to "cuda", and switched to eval mode.
qwen_model = Qwen2VLForConditionalGeneration.from_pretrained(
    QV_MODEL_ID,
    trust_remote_code=True,
    torch_dtype=torch.float16
).to("cuda").eval()

# Aya-Vision model (trigger with @aya-vision)
AYA_MODEL_ID = "CohereForAI/aya-vision-8b"
aya_processor = AutoProcessor.from_pretrained(AYA_MODEL_ID)
# Loaded in float16; device placement is delegated to device_map="auto"
# rather than an explicit .to("cuda"). No explicit .eval() call is made here.
aya_model = AutoModelForImageTextToText.from_pretrained(
    AYA_MODEL_ID, device_map="auto", torch_dtype=torch.float16
)

# Gemma3-4b model (trigger with @gemma3-4b)
GEMMA3_MODEL_ID = "google/gemma-3-4b-it"
# NOTE(review): no torch_dtype is passed here, while model_inference() casts
# the Gemma3 inputs to torch.bfloat16 -- confirm the loaded weights' dtype is
# compatible with that cast.
gemma3_model = Gemma3ForConditionalGeneration.from_pretrained(
    GEMMA3_MODEL_ID, device_map="auto"
).eval()
gemma3_processor = AutoProcessor.from_pretrained(GEMMA3_MODEL_ID)

def _strip_trigger(text, trigger):
    """Return the prompt following ``trigger``, or None if ``text`` lacks it.

    The match is case-insensitive and anchored at the start of ``text``;
    surrounding whitespace is stripped from the returned prompt.
    """
    if text.lower().startswith(trigger):
        return text[len(trigger):].strip()
    return None


def _stream_generation(model, processor, inputs, **generate_kwargs):
    """Run ``model.generate`` on a worker thread and yield the text so far.

    Args:
        model: Object exposing ``generate(**kwargs)``.
        processor: Processor/tokenizer handed to ``TextIteratorStreamer`` for
            decoding.
        inputs: Mapping of prepared model inputs, forwarded to ``generate``.
        **generate_kwargs: Extra keyword arguments for ``generate`` (for
            example ``max_new_tokens``).

    Yields:
        The cumulative decoded output after each streamed chunk, with any
        literal ``<|im_end|>`` marker removed.
    """
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    worker = Thread(
        target=model.generate,
        kwargs=dict(inputs, streamer=streamer, **generate_kwargs),
    )
    worker.start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        buffer = buffer.replace("<|im_end|>", "")
        time.sleep(0.01)  # brief pause between UI updates
        yield buffer

    # Only reached when the stream was fully consumed; at that point generate()
    # has signalled completion, so the join returns promptly. It is deliberately
    # not placed in a ``finally`` so an early stop is not blocked on generation.
    worker.join()


@spaces.GPU
def model_inference(input_dict, history):
    """Answer a multimodal chat message, streaming the reply as it is generated.

    The model is selected by a prefix on the text query:

    * ``@aya-vision`` -> Aya-Vision-8b (requires an image; the first is used)
    * ``@gemma3-4b``  -> Gemma3-4b (requires an image; the first is used)
    * no prefix       -> Qwen2-VL OCR (text query with zero or more images)

    Args:
        input_dict: Message from the multimodal textbox. ``"text"`` holds the
            query and ``"files"`` the attached images; a missing or ``None``
            value for either is treated as empty.
        history: Prior chat turns supplied by the chat interface. Unused.

    Yields:
        First either an error message (then the generator ends) or a
        progress-bar HTML snippet, followed by the cumulative generated text
        after each streamed chunk.
    """
    text = (input_dict.get("text") or "").strip()
    # ``files`` may be absent or None; normalise so len()/iteration are safe.
    files = input_dict.get("files") or []

    # Branch: Aya-Vision (trigger with @aya-vision)
    aya_prompt = _strip_trigger(text, "@aya-vision")
    if aya_prompt is not None:
        if not files:
            yield "Error: Please provide an image for the @aya-vision feature."
            return
        image = load_image(files[0])
        yield progress_bar_html("Processing with Aya-Vision-8b")
        messages = [{
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": aya_prompt},
            ],
        }]
        inputs = aya_processor.apply_chat_template(
            messages,
            padding=True,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        ).to(aya_model.device)
        yield from _stream_generation(
            aya_model,
            aya_processor,
            inputs,
            max_new_tokens=1024,
            do_sample=True,
            temperature=0.3,
        )
        return

    # Branch: Gemma3-4b (trigger with @gemma3-4b)
    gemma_prompt = _strip_trigger(text, "@gemma3-4b")
    if gemma_prompt is not None:
        if not files:
            yield "Error: Please provide an image for the @gemma3-4b feature."
            return
        image = load_image(files[0])
        yield progress_bar_html("Processing with Gemma3-4b")
        messages = [
            {
                "role": "system",
                "content": [{"type": "text", "text": "You are a helpful assistant."}],
            },
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": gemma_prompt},
                ],
            },
        ]
        inputs = gemma3_processor.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        ).to(gemma3_model.device, dtype=torch.bfloat16)
        yield from _stream_generation(
            gemma3_model,
            gemma3_processor,
            inputs,
            max_new_tokens=512,
            do_sample=False,
        )
        return

    # Default Branch: Qwen2-VL OCR (for text query with optional images)
    # Validate the query before decoding any image so bad requests fail fast.
    if not text:
        if files:
            yield "Error: Please input a text query along with the image(s)."
        else:
            yield "Error: Please input a query and optionally image(s)."
        return

    images = [load_image(path) for path in files]

    # Show the progress indicator before preprocessing, as the other branches do.
    yield progress_bar_html("Processing with Qwen2VL OCR")

    messages = [{
        "role": "user",
        "content": [
            *[{"type": "image", "image": image} for image in images],
            {"type": "text", "text": text},
        ],
    }]
    prompt = qwen_processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = qwen_processor(
        text=[prompt],
        images=images or None,  # the processor receives None for text-only queries
        return_tensors="pt",
        padding=True,
    ).to("cuda")

    yield from _stream_generation(qwen_model, qwen_processor, inputs, max_new_tokens=1024)

# Examples for quick testing.
# (query text, image path) pairs; each pair becomes one single-message example row.
_EXAMPLE_QUERIES = (
    ("@gemma3-4b Summarize the letter", "examples/1.png"),
    ("@gemma3-4b Extract JSON from the image", "example_images/document.jpg"),
    ("@gemma3-4b Describe the photo", "examples/3.png"),
    ("@aya-vision Summarize the full image in detail", "examples/2.jpg"),
    ("@aya-vision Describe this image.", "example_images/campeones.jpg"),
    ("@aya-vision What is this UI about?", "example_images/s2w_example.png"),
    ("Extract as JSON table from the table", "examples/4.jpg"),
    ("Can you describe this image?", "example_images/newyork.jpg"),
    ("Can you describe this image?", "example_images/dogs.jpg"),
    (
        "@aya-vision Where do the severe droughts happen according to this diagram?",
        "example_images/examples_weather_events.png",
    ),
)

# Each row holds one multimodal message dict with the query and a one-image file list.
examples = [
    [{"text": query, "files": [image_path]}]
    for query, image_path in _EXAMPLE_QUERIES
]

# Gradio ChatInterface with a multimodal textbox.
# Markdown shown above the chat; documents the model-selection prefixes.
_DESCRIPTION = (
    "# **Multimodal OCR & Vision Features**\n\n"
    "Use the following commands to select a model:\n"
    "- `@aya-vision` for Aya-Vision-8b\n"
    "- `@gemma3-4b` for Gemma3-4b\n\n"
    "Default processing is done with Qwen2VL OCR."
)

# Input widget: free text plus any number of image attachments.
_query_box = gr.MultimodalTextbox(
    label="Query Input",
    file_types=["image"],
    file_count="multiple",
    placeholder="Enter your text query and attach images if needed. Use @aya-vision or @gemma3-4b to choose a feature.",
)

# Chat UI wired to the streaming inference generator.
demo = gr.ChatInterface(
    fn=model_inference,
    description=_DESCRIPTION,
    examples=examples,
    textbox=_query_box,
    stop_btn="Stop Generation",
    multimodal=True,
    cache_examples=False,
)

demo.launch(debug=True)