Spaces:

prithivMLmods
/

Doc-VLMs-OCR

Running on Zero

File size: 8,148 Bytes

"""
app.py

This demo builds a Multimodal OCR Granite Vision interface using:
  - @rag: retrieval-augmented generation for PDF and image documents (via LightRAG)
  - @granite: image understanding with Granite Vision
  - @video-infer: video understanding by downsampling frames and processing each with Granite Vision

Make sure the required Granite models and dependencies (Gradio, Transformers, etc.) are installed.
"""

import os
import random
import uuid
import time
import cv2
import numpy as np
import torch
from PIL import Image
import gradio as gr

from transformers import AutoProcessor, AutoModelForVision2Seq, AutoTokenizer, AutoModelForCausalLM
from transformers.image_utils import load_image

# Import the LightRAG class (which internally uses Granite embedding and generation models)
from sandbox.light_rag.light_rag import LightRAG

# ------------------------------
# Utility and device setup
# ------------------------------
def get_device():
    """Return the name of the torch device to use.

    Backends are probed in priority order: Apple's Metal backend ("mps")
    first, then CUDA. "cpu" is returned when neither is available.
    """
    probes = (
        ("mps", torch.backends.mps.is_available),  # macOS GPU
        ("cuda", torch.cuda.is_available),
    )
    for name, is_available in probes:
        if is_available():
            return name
    return "cpu"

device = get_device()  # resolved once at import time; the vision model below is moved to this device

# ------------------------------
# Generation parameter constants
# ------------------------------
# Sampling defaults: process_granite passes them to `vision_model.generate`, and
# the Gradio sliders use them as their initial values.
MAX_NEW_TOKENS = 1024  # passed as `max_new_tokens`
TEMPERATURE = 0.7  # passed as `temperature`
TOP_P = 0.85  # passed as `top_p`
TOP_K = 50  # passed as `top_k`
REPETITION_PENALTY = 1.05  # passed as `repetition_penalty`

# ------------------------------
# Load Granite Vision model for image processing (@granite and video)
# ------------------------------
VISION_MODEL_ID = "ibm-granite/granite-vision-3.2-2b"
vision_processor = AutoProcessor.from_pretrained(VISION_MODEL_ID)
# Load the weights normally and move the whole model to the single device picked
# by get_device(). `device_map="auto"` is deliberately not used here: it lets
# Accelerate dispatch (and possibly offload) the model, after which an explicit
# `.to(...)` either warns or fails. Placement should be decided in one place only.
vision_model = AutoModelForVision2Seq.from_pretrained(VISION_MODEL_ID).to(device)

# ------------------------------
# Initialize the LightRAG pipeline for text-only or document (PDF/image) RAG (@rag)
# ------------------------------
# Settings handed to LightRAG unchanged. The meaning of each key is defined by
# sandbox.light_rag.light_rag.LightRAG, which is not visible in this file; the
# notes below are inferred from the key names only.
rag_config = {
    "embedding_model_id": "ibm-granite/granite-embedding-125m-english",  # presumably the retrieval embedding model; confirm in LightRAG
    "generation_model_id": "ibm-granite/granite-3.1-8b-instruct",  # presumably the answer-generation model; confirm in LightRAG
    "milvus_collection_name": "granite_vision_text_milvus",
    "milvus_db_path": "milvus.db",  # adjust this path as needed
}
# Built once at import time and shared by process_rag and the default text-only path.
light_rag = LightRAG(rag_config)

# ------------------------------
# Video downsampling helper
# ------------------------------
def downsample_video(video_path, num_frames=10):
    """
    Downsample a video to at most ``num_frames`` evenly spaced frames.

    Args:
        video_path: Path of the video file to open with OpenCV.
        num_frames: Maximum number of frames to sample (defaults to 10).
            Clips with fewer frames than this yield each frame once instead
            of repeating frames.

    Returns:
        A list of tuples ``(PIL image, timestamp in seconds)`` in playback
        order. The list is empty when the video cannot be opened, reports no
        frames, or ``num_frames`` is not positive. Frames that fail to decode
        are skipped. When the container reports no usable FPS, timestamps
        fall back to 0.0 because the real time offset is unknown.
    """
    frames = []
    vidcap = cv2.VideoCapture(video_path)
    try:
        if not vidcap.isOpened() or num_frames <= 0:
            return frames

        total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
        if total_frames <= 0:
            return frames
        fps = vidcap.get(cv2.CAP_PROP_FPS)

        # linspace repeats indices when the clip has fewer frames than requested;
        # np.unique removes the repeats and keeps the indices in ascending order.
        frame_indices = np.unique(np.linspace(0, total_frames - 1, num_frames, dtype=int))
        for raw_index in frame_indices:
            index = int(raw_index)
            vidcap.set(cv2.CAP_PROP_POS_FRAMES, index)
            success, frame = vidcap.read()
            if not success:
                continue
            # OpenCV decodes to BGR; PIL expects RGB.
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            # Guard against an FPS of 0 (or NaN), which would give inf/nan timestamps.
            timestamp = round(index / fps, 2) if fps > 0 else 0.0
            frames.append((Image.fromarray(rgb_frame), timestamp))
    finally:
        # Always free the capture handle, even if decoding raised.
        vidcap.release()
    return frames

# ------------------------------
# Command processing functions
# ------------------------------
def process_rag(query, file_path=None):
    """
    Handle the @rag command with the LightRAG pipeline.

    The query is sent to ``light_rag.search`` (``top_n=5``) and the search
    result is passed as context to ``light_rag.generate``. ``file_path`` is
    accepted for an uploaded PDF or image but is not read here: only the
    query text drives retrieval and generation.

    Returns the generated answer; the prompt returned alongside it is discarded.
    """
    retrieved = light_rag.search(query, top_n=5)
    generated = light_rag.generate(query, retrieved)
    answer, _prompt = generated
    return answer

def process_granite(
    query,
    image: "Image.Image",
    *,
    max_new_tokens=None,
    temperature=None,
    top_p=None,
    top_k=None,
    repetition_penalty=None,
):
    """
    Process @granite command:
      Build a single-turn prompt from the image and the query, then run the Granite Vision model.

    Args:
        query: The user's question about the image.
        image: The PIL image to analyse.
        max_new_tokens, temperature, top_p, top_k, repetition_penalty:
            Optional sampling overrides (keyword-only). Any value left as
            ``None`` falls back to the matching module-level constant.

    Returns:
        The model's answer as stripped text, without the echoed prompt.
    """
    # The image must be part of the user turn; a text-only turn would make the
    # model answer without ever seeing the picture.
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": query},
            ],
        }
    ]
    # Send the inputs to wherever the model actually lives.
    inputs = vision_processor.apply_chat_template(
        conversation, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
    ).to(vision_model.device)
    generate_kwargs = {
        # Sliders may deliver floats; generate() expects integers for these two.
        "max_new_tokens": int(MAX_NEW_TOKENS if max_new_tokens is None else max_new_tokens),
        "do_sample": True,
        "top_p": TOP_P if top_p is None else top_p,
        "top_k": int(TOP_K if top_k is None else top_k),
        "temperature": TEMPERATURE if temperature is None else temperature,
        "repetition_penalty": REPETITION_PENALTY if repetition_penalty is None else repetition_penalty,
    }
    output = vision_model.generate(**inputs, **generate_kwargs)
    # generate() returns prompt tokens followed by the completion; decode only
    # the completion so the reply does not repeat the user's prompt.
    prompt_length = inputs["input_ids"].shape[1]
    result = vision_processor.decode(output[0][prompt_length:], skip_special_tokens=True)
    return result.strip()

def process_video(query, video_path, **generation_kwargs):
    """
    Process @video-infer command:
      Downsample the video, run every sampled frame through process_granite, and join the results.

    Args:
        query: The question asked about each sampled frame.
        video_path: Path of the video file.
        **generation_kwargs: Optional sampling overrides forwarded unchanged
            to process_granite.

    Returns:
        One line per frame in the form ``At <timestamp>s: <description>``, or
        an explanatory message when no frame could be extracted.
    """
    frames = downsample_video(video_path)
    if not frames:
        # An unreadable or empty video would otherwise produce a blank reply.
        return "No frames could be extracted from the video."
    descriptions = [
        f"At {timestamp}s: {process_granite(query, image, **generation_kwargs)}"
        for image, timestamp in frames
    ]
    return "\n".join(descriptions)

# ------------------------------
# Main function to handle input and dispatch based on command
# ------------------------------
def generate_response(input_dict, chat_history, max_new_tokens, temperature, top_p, top_k, repetition_penalty):
    """
    Based on the query prefix, this function calls:
      - process_rag for @rag
      - process_granite for @granite
      - process_video for @video-infer
    If no special command is provided, it defaults to text-only generation via LightRAG.

    Args:
        input_dict: Multimodal message with keys "text" and "files". A plain
            string is also accepted and treated as text without files.
        chat_history: Previous turns supplied by the chat UI (unused).
        max_new_tokens, temperature, top_p, top_k, repetition_penalty:
            Slider values from the UI. They are forwarded to the Granite
            Vision paths (@granite, @video-infer). The LightRAG paths do not
            receive them.

    Returns:
        The reply text for the chat UI.
    """
    if isinstance(input_dict, str):
        text, files = input_dict, []
    else:
        text = input_dict.get("text") or ""
        # "files" can be missing or None when nothing was uploaded.
        files = input_dict.get("files") or []

    # Detect the command and slice the query on the same stripped string, so
    # leading whitespace cannot shift the slice into the command name.
    stripped_text = text.strip()
    lower_text = stripped_text.lower()

    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
        "repetition_penalty": repetition_penalty,
    }

    if lower_text.startswith("@rag"):
        query = stripped_text[len("@rag"):].strip()
        file_path = files[0] if files else None  # Optionally process the provided file
        return process_rag(query, file_path)

    if lower_text.startswith("@granite"):
        if not files:
            return "No image file provided for @granite command."
        query = stripped_text[len("@granite"):].strip()
        # Assume first file is an image
        image = load_image(files[0])
        return process_granite(query, image, **generation_kwargs)

    if lower_text.startswith("@video-infer"):
        if not files:
            return "No video file provided for @video-infer command."
        query = stripped_text[len("@video-infer"):].strip()
        video_path = files[0]  # Assume first file is a video
        return process_video(query, video_path, **generation_kwargs)

    # Default: text-only generation using LightRAG
    answer, _prompt = light_rag.generate(text, context=[])
    return answer

# ------------------------------
# Build the Gradio interface using a multimodal textbox
# ------------------------------
# Chat UI: a multimodal textbox (text plus uploaded files) and five sliders whose
# values are passed to generate_response after the message and the chat history.
demo = gr.ChatInterface(
    fn=generate_response,
    # generate_response reads input_dict["text"] / ["files"], i.e. it expects the
    # dict payload of a multimodal chat, so say so explicitly.
    multimodal=True,
    additional_inputs=[
        # Order must match generate_response's trailing parameters.
        gr.Slider(label="Max new tokens", minimum=1, maximum=2048, step=1, value=MAX_NEW_TOKENS),
        gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=TEMPERATURE),
        gr.Slider(label="Top-p", minimum=0.05, maximum=1.0, step=0.05, value=TOP_P),
        gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=TOP_K),
        gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=REPETITION_PENALTY),
    ],
    textbox=gr.MultimodalTextbox(
        label="Query Input",
        file_types=["image", "pdf", "video"],
        file_count="multiple",
        placeholder="Enter your query starting with @rag, @granite, or @video-infer",
    ),
    # The example files are referenced by relative path and must exist next to the app.
    examples=[
        [{"text": "@rag What was the revenue growth in 2020?"}],
        [{"text": "@granite Describe the content of this image", "files": ["example_image.png"]}],
        [{"text": "@video-infer Summarize the event shown in the video", "files": ["example_video.mp4"]}],
    ],
    cache_examples=False,
    type="messages",
    description=(
        "### Multimodal OCR Granite Vision\n"
        "Use **@rag** for PDF/image RAG, **@granite** for image questions, and **@video-infer** for video understanding."
    ),
    fill_height=True,
    stop_btn="Stop Generation",
    theme="default",
)

if __name__ == "__main__":
    # Enable the request queue with max_size=20, then start the Gradio server.
    queued_demo = demo.queue(max_size=20)
    queued_demo.launch()