prithivMLmods committed · Commit c7906eb · verified · 1 Parent(s): 1366989

Update app.py

Files changed (1):
  1. app.py  +192 -315
app.py CHANGED
@@ -1,343 +1,220 @@
-#!/usr/bin/env python

-import os
-import re
-import tempfile
-from collections.abc import Iterator
-from threading import Thread

 import cv2
-import gradio as gr
-import spaces
 import torch
-from loguru import logger
 from PIL import Image
-from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer
-
-model_id = os.getenv("MODEL_ID", "google/gemma-3-12b-it")
-processor = AutoProcessor.from_pretrained(model_id, padding_side="left")
-model = Gemma3ForConditionalGeneration.from_pretrained(
-    model_id, device_map="auto", torch_dtype=torch.bfloat16, attn_implementation="eager"
-)
-
-MAX_NUM_IMAGES = int(os.getenv("MAX_NUM_IMAGES", "5"))
-
-
-def count_files_in_new_message(paths: list[str]) -> tuple[int, int]:
-    image_count = 0
-    video_count = 0
-    for path in paths:
-        if path.endswith(".mp4"):
-            video_count += 1
-        else:
-            image_count += 1
-    return image_count, video_count

-def count_files_in_history(history: list[dict]) -> tuple[int, int]:
-    image_count = 0
-    video_count = 0
-    for item in history:
-        if item["role"] != "user" or isinstance(item["content"], str):
-            continue
-        if item["content"][0].endswith(".mp4"):
-            video_count += 1
-        else:
-            image_count += 1
-    return image_count, video_count

-def validate_media_constraints(message: dict, history: list[dict]) -> bool:
-    new_image_count, new_video_count = count_files_in_new_message(message["files"])
-    history_image_count, history_video_count = count_files_in_history(history)
-    image_count = history_image_count + new_image_count
-    video_count = history_video_count + new_video_count
-    if video_count > 1:
-        gr.Warning("Only one video is supported.")
-        return False
-    if video_count == 1:
-        if image_count > 0:
-            gr.Warning("Mixing images and videos is not allowed.")
-            return False
-        if "<image>" in message["text"]:
-            gr.Warning("Using <image> tags with video files is not supported.")
-            return False
-    if video_count == 0 and image_count > MAX_NUM_IMAGES:
-        gr.Warning(f"You can upload up to {MAX_NUM_IMAGES} images.")
-        return False
-    if "<image>" in message["text"] and message["text"].count("<image>") != new_image_count:
-        gr.Warning("The number of <image> tags in the text does not match the number of images.")
-        return False
-    return True

-def downsample_video(video_path: str) -> list[tuple[Image.Image, float]]:
     vidcap = cv2.VideoCapture(video_path)
-    fps = vidcap.get(cv2.CAP_PROP_FPS)
     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
-
-    max_frames = 5  # Limit to 5 frames to prevent memory overload
-    if total_frames <= max_frames:
-        indices = list(range(total_frames))
-    else:
-        indices = [int(i * (total_frames - 1) / (max_frames - 1)) for i in range(max_frames)]
-
     frames = []
-    for i in indices:
         vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
-        success, image = vidcap.read()
         if success:
-            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-            pil_image = Image.fromarray(image)
             timestamp = round(i / fps, 2)
             frames.append((pil_image, timestamp))
-
     vidcap.release()
     return frames

-def process_video(video_path: str) -> list[dict]:
-    content = []
     frames = downsample_video(video_path)
-    for frame in frames:
-        pil_image, timestamp = frame
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_file:
-            pil_image.save(temp_file.name)
-            content.append({"type": "text", "text": f"Frame {timestamp}:"})
-            content.append({"type": "image", "url": temp_file.name})
-    logger.debug(f"{content=}")
-    return content
-
-def process_interleaved_images(message: dict) -> list[dict]:
-    logger.debug(f"{message['files']=}")
-    parts = re.split(r"(<image>)", message["text"])
-    logger.debug(f"{parts=}")
-
-    content = []
-    image_index = 0
-    for part in parts:
-        logger.debug(f"{part=}")
-        if part == "<image>":
-            content.append({"type": "image", "url": message["files"][image_index]})
-            logger.debug(f"file: {message['files'][image_index]}")
-            image_index += 1
-        elif part.strip():
-            content.append({"type": "text", "text": part.strip()})
-        elif isinstance(part, str) and part != "<image>":
-            content.append({"type": "text", "text": part})
-    logger.debug(f"{content=}")
-    return content
-
-def process_new_user_message(message: dict) -> list[dict]:
-    if not message["files"]:
-        return [{"type": "text", "text": message["text"]}]
-
-    if message["files"][0].endswith(".mp4"):
-        return [{"type": "text", "text": message["text"]}, *process_video(message["files"][0])]
-
-    if "<image>" in message["text"]:
-        return process_interleaved_images(message)
-
-    return [
-        {"type": "text", "text": message["text"]},
-        *[{"type": "image", "url": path} for path in message["files"]],
-    ]
-
-def process_history(history: list[dict]) -> list[dict]:
-    messages = []
-    current_user_content: list[dict] = []
-    for item in history:
-        if item["role"] == "assistant":
-            if current_user_content:
-                messages.append({"role": "user", "content": current_user_content})
-                current_user_content = []
-            messages.append({"role": "assistant", "content": [{"type": "text", "text": item["content"]}]})
         else:
-            content = item["content"]
-            if isinstance(content, str):
-                current_user_content.append({"type": "text", "text": content})
-            else:
-                current_user_content.append({"type": "image", "url": content[0]})
-    return messages
-
-@spaces.GPU(duration=90)
-def run(message: dict, history: list[dict], system_prompt: str = "", max_new_tokens: int = 512) -> Iterator[str]:
-    if not validate_media_constraints(message, history):
-        yield ""
-        return
-
-    messages = []
-    if system_prompt:
-        messages.append({"role": "system", "content": [{"type": "text", "text": system_prompt}]})
-    messages.extend(process_history(history))
-    messages.append({"role": "user", "content": process_new_user_message(message)})
-
-    inputs = processor.apply_chat_template(
-        messages,
-        add_generation_prompt=True,
-        tokenize=True,
-        return_dict=True,
-        return_tensors="pt",
-    ).to(device=model.device, dtype=torch.bfloat16)
-
-    streamer = TextIteratorStreamer(processor, timeout=30.0, skip_prompt=True, skip_special_tokens=True)
-    generate_kwargs = dict(
-        inputs,
-        streamer=streamer,
-        max_new_tokens=max_new_tokens,
-    )
-    t = Thread(target=model.generate, kwargs=generate_kwargs)
-    t.start()
-
-    output = ""
-    for delta in streamer:
-        output += delta
-        yield output
-
-examples = [
-    [
-        {
-            "text": "I need to be in Japan for 10 days, going to Tokyo, Kyoto and Osaka. Think about number of attractions in each of them and allocate number of days to each city. Make public transport recommendations.",
-            "files": [],
-        }
-    ],
-    [
-        {
-            "text": "Write the matplotlib code to generate the same bar chart.",
-            "files": ["assets/additional-examples/barchart.png"],
-        }
-    ],
-    [
-        {
-            "text": "What is odd about this video?",
-            "files": ["assets/additional-examples/tmp.mp4"],
-        }
-    ],
-    [
-        {
-            "text": "I already have this supplement <image> and I want to buy this one <image>. Any warnings I should know about?",
-            "files": ["assets/additional-examples/pill1.png", "assets/additional-examples/pill2.png"],
-        }
-    ],
-    [
-        {
-            "text": "Write a poem inspired by the visual elements of the images.",
-            "files": ["assets/sample-images/06-1.png", "assets/sample-images/06-2.png"],
-        }
-    ],
-    [
-        {
-            "text": "Compose a short musical piece inspired by the visual elements of the images.",
-            "files": [
-                "assets/sample-images/07-1.png",
-                "assets/sample-images/07-2.png",
-                "assets/sample-images/07-3.png",
-                "assets/sample-images/07-4.png",
-            ],
-        }
-    ],
-    [
-        {
-            "text": "Write a short story about what might have happened in this house.",
-            "files": ["assets/sample-images/08.png"],
-        }
-    ],
-    [
-        {
-            "text": "Create a short story based on the sequence of images.",
-            "files": [
-                "examples/09-1.png",
-                "examples/09-2.png",
-                "examples/09-3.png",
-                "examples/09-4.png",
-                "examples/09-5.png",
-            ],
-        }
-    ],
-    [
-        {
-            "text": "Describe the creatures that would live in this world.",
-            "files": ["assets/sample-images/10.png"],
-        }
-    ],
-    [
-        {
-            "text": "Read text in the image.",
-            "files": ["assets/additional-examples/1.png"],
-        }
-    ],
-    [
-        {
-            "text": "When is this ticket dated and how much did it cost?",
-            "files": ["assets/additional-examples/2.png"],
-        }
-    ],
-    [
-        {
-            "text": "Read the text in the image into markdown.",
-            "files": ["assets/additional-examples/3.png"],
-        }
-    ],
-    [
-        {
-            "text": "Evaluate this integral.",
-            "files": ["assets/additional-examples/4.png"],
-        }
-    ],
-    [
-        {
-            "text": "caption this image",
-            "files": ["assets/sample-images/01.png"],
-        }
-    ],
-    [
-        {
-            "text": "What's the sign says?",
-            "files": ["assets/sample-images/02.png"],
-        }
-    ],
-    [
-        {
-            "text": "Compare and contrast the two images.",
-            "files": ["assets/sample-images/03.png"],
-        }
-    ],
-    [
-        {
-            "text": "List all the objects in the image and their colors.",
-            "files": ["assets/sample-images/04.png"],
-        }
-    ],
-    [
-        {
-            "text": "Describe the atmosphere of the scene.",
-            "files": ["assets/sample-images/05.png"],
-        }
-    ],
-]
-
-DESCRIPTION = """\
-<img src='https://huggingface.co/spaces/huggingface-projects/gemma-3-12b-it/resolve/main/assets/logo.png' id='logo' />
-
-This is a demo of Gemma 3 12B it, a vision language model with outstanding performance on a wide range of tasks.
-You can upload images, interleaved images and videos. Note that video input only supports single-turn conversation and mp4 input. For videos, up to 5 frames will be extracted and processed.
-"""

 demo = gr.ChatInterface(
-    fn=run,
-    type="messages",
-    chatbot=gr.Chatbot(type="messages", scale=1, allow_tags=["image"]),
-    textbox=gr.MultimodalTextbox(file_types=["image", ".mp4"], file_count="multiple", autofocus=True),
-    multimodal=True,
     additional_inputs=[
-        gr.Textbox(label="System Prompt", value="You are a helpful assistant."),
-        gr.Slider(label="Max New Tokens", minimum=100, maximum=2000, step=10, value=700),
     ],
-    stop_btn=False,
-    title="Gemma 3 12B IT",
-    description=DESCRIPTION,
-    examples=examples,
-    run_examples_on_click=False,
     cache_examples=False,
-    css_paths="style.css",
-    delete_cache=(1800, 1800),
 )

 if __name__ == "__main__":
-    demo.launch(debug=True)
+"""
+app.py

+This demo builds a Multimodal OCR Granite Vision interface using:
+- @rag: retrieval-augmented generation for PDF and image documents (via LightRAG)
+- @granite: image understanding with Granite Vision
+- @video-infer: video understanding by downsampling frames and processing each with Granite Vision
+
+Make sure the required Granite models and dependencies (Gradio, Transformers, etc.) are installed.
+"""

+import os
+import random
+import uuid
+import time
 import cv2
+import numpy as np
 import torch
 from PIL import Image
+import gradio as gr

+from transformers import AutoProcessor, AutoModelForVision2Seq, AutoTokenizer, AutoModelForCausalLM
+from transformers.image_utils import load_image

+# Import the LightRAG class (which internally uses Granite embedding and generation models)
+from sandbox.light_rag.light_rag import LightRAG

+# ------------------------------
+# Utility and device setup
+# ------------------------------
+def get_device():
+    if torch.backends.mps.is_available():
+        return "mps"  # macOS GPU
+    elif torch.cuda.is_available():
+        return "cuda"
+    else:
+        return "cpu"
+
+device = get_device()
+
+# ------------------------------
+# Generation parameter constants
+# ------------------------------
+MAX_NEW_TOKENS = 1024
+TEMPERATURE = 0.7
+TOP_P = 0.85
+TOP_K = 50
+REPETITION_PENALTY = 1.05
+
+# ------------------------------
+# Load Granite Vision model for image processing (@granite and video)
+# ------------------------------
+VISION_MODEL_ID = "ibm-granite/granite-vision-3.2-2b"
+vision_processor = AutoProcessor.from_pretrained(VISION_MODEL_ID)
+vision_model = AutoModelForVision2Seq.from_pretrained(VISION_MODEL_ID, device_map="auto").to(device)
+
+# ------------------------------
+# Initialize the LightRAG pipeline for text-only or document (PDF/image) RAG (@rag)
+# ------------------------------
+rag_config = {
+    "embedding_model_id": "ibm-granite/granite-embedding-125m-english",
+    "generation_model_id": "ibm-granite/granite-3.1-8b-instruct",
+    "milvus_collection_name": "granite_vision_text_milvus",
+    "milvus_db_path": "milvus.db",  # adjust this path as needed
+}
+light_rag = LightRAG(rag_config)
+
+# ------------------------------
+# Video downsampling helper
+# ------------------------------
+def downsample_video(video_path):
+    """
+    Downsamples the video to 10 evenly spaced frames.
+    Returns a list of tuples: (PIL image, timestamp in seconds)
+    """
     vidcap = cv2.VideoCapture(video_path)
     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
+    fps = vidcap.get(cv2.CAP_PROP_FPS)
     frames = []
+    # Sample 10 evenly spaced frame indices
+    frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
+    for i in frame_indices:
         vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
+        success, frame = vidcap.read()
         if success:
+            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+            pil_image = Image.fromarray(frame)
             timestamp = round(i / fps, 2)
             frames.append((pil_image, timestamp))
     vidcap.release()
     return frames

+# ------------------------------
+# Command processing functions
+# ------------------------------
+def process_rag(query, file_path=None):
+    """
+    Process @rag command using the LightRAG pipeline.
+    Optionally, if a file is provided (e.g. PDF or image), one might extract text from it.
+    Here we simply use the query for retrieval-augmented generation.
+    """
+    context = light_rag.search(query, top_n=5)
+    answer, prompt = light_rag.generate(query, context)
+    return answer
+
+def process_granite(query, image: Image.Image):
+    """
+    Process @granite command:
+    Build a single-turn prompt from the image and the query, then run the Granite Vision model.
+    """
+    # Build a single user turn that pairs the image with the text query,
+    # so the vision model actually receives the image alongside the prompt.
+    conversation = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "image": image},
+                {"type": "text", "text": query},
+            ],
+        }
+    ]
+    inputs = vision_processor.apply_chat_template(
+        conversation, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
+    ).to(device)
+    generate_kwargs = {
+        "max_new_tokens": MAX_NEW_TOKENS,
+        "do_sample": True,
+        "top_p": TOP_P,
+        "top_k": TOP_K,
+        "temperature": TEMPERATURE,
+        "repetition_penalty": REPETITION_PENALTY,
+    }
+    output = vision_model.generate(**inputs, **generate_kwargs)
+    result = vision_processor.decode(output[0], skip_special_tokens=True)
+    return result.strip()
+
+def process_video(query, video_path):
+    """
+    Process @video-infer command:
+    Downsample the video, process each frame with the Granite Vision model, and combine the results.
+    """
     frames = downsample_video(video_path)
+    descriptions = []
+    for image, timestamp in frames:
+        desc = process_granite(query, image)
+        descriptions.append(f"At {timestamp}s: {desc}")
+    return "\n".join(descriptions)
+
+# ------------------------------
+# Main function to handle input and dispatch based on command
+# ------------------------------
+def generate_response(input_dict, chat_history, max_new_tokens, temperature, top_p, top_k, repetition_penalty):
+    """
+    Based on the query prefix, this function calls:
+      - process_rag for @rag
+      - process_granite for @granite
+      - process_video for @video-infer
+    If no special command is provided, it defaults to text-only generation via LightRAG.
+    """
+    text = input_dict["text"]
+    files = input_dict.get("files", [])
+    lower_text = text.strip().lower()
+
+    if lower_text.startswith("@rag"):
+        query = text[len("@rag"):].strip()
+        file_path = files[0] if files else None  # Optionally process the provided file
+        answer = process_rag(query, file_path)
+        return answer
+
+    elif lower_text.startswith("@granite"):
+        query = text[len("@granite"):].strip()
+        if files:
+            # Assume first file is an image
+            image = load_image(files[0])
+            result = process_granite(query, image)
+            return result
         else:
+            return "No image file provided for @granite command."
+
+    elif lower_text.startswith("@video-infer"):
+        query = text[len("@video-infer"):].strip()
+        if files:
+            video_path = files[0]  # Assume first file is a video
+            result = process_video(query, video_path)
+            return result
+        else:
+            return "No video file provided for @video-infer command."
+
+    else:
+        # Default: text-only generation using LightRAG
+        answer, prompt = light_rag.generate(text, context=[])
+        return answer

+# ------------------------------
+# Build the Gradio interface using a multimodal textbox
+# ------------------------------
 demo = gr.ChatInterface(
+    fn=generate_response,
+    multimodal=True,  # deliver textbox input to generate_response as a {"text", "files"} dict
     additional_inputs=[
+        gr.Slider(label="Max new tokens", minimum=1, maximum=2048, step=1, value=MAX_NEW_TOKENS),
+        gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=TEMPERATURE),
+        gr.Slider(label="Top-p", minimum=0.05, maximum=1.0, step=0.05, value=TOP_P),
+        gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=TOP_K),
+        gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=REPETITION_PENALTY),
+    ],
+    textbox=gr.MultimodalTextbox(
+        label="Query Input",
+        file_types=["image", "pdf", "video"],
+        file_count="multiple",
+        placeholder="Enter your query starting with @rag, @granite, or @video-infer",
+    ),
+    examples=[
+        [{"text": "@rag What was the revenue growth in 2020?"}],
+        [{"text": "@granite Describe the content of this image", "files": ["example_image.png"]}],
+        [{"text": "@video-infer Summarize the event shown in the video", "files": ["example_video.mp4"]}],
     ],
     cache_examples=False,
+    type="messages",
+    description=(
+        "### Multimodal OCR Granite Vision\n"
+        "Use **@rag** for PDF/image RAG, **@granite** for image questions, and **@video-infer** for video understanding."
+    ),
+    fill_height=True,
+    stop_btn="Stop Generation",
+    theme="default",
 )

 if __name__ == "__main__":
+    demo.queue(max_size=20).launch()
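
The new downsample_video picks 10 evenly spaced frame indices with np.linspace and pairs each frame with a timestamp derived from the clip's FPS. A minimal, self-contained sketch of what that sampling yields, assuming a hypothetical 300-frame clip at 30 fps (values below are illustrative only, not from the commit):

# Sketch of the frame sampling used by downsample_video for a hypothetical clip.
import numpy as np

total_frames, fps = 300, 30.0  # hypothetical clip properties
frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
timestamps = [round(int(i) / fps, 2) for i in frame_indices]
print(frame_indices.tolist())  # [0, 33, 66, 99, 132, 166, 199, 232, 265, 299]
print(timestamps)              # [0.0, 1.1, 2.2, 3.3, 4.4, 5.53, 6.63, 7.73, 8.83, 9.97]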
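
process_granite pairs the uploaded image with the text query in a single chat turn. If you would rather reference images by local file path (the approach the removed Gemma code took with temporary PNG files, and the "url" form shown in the granite-vision usage examples), here is a sketch of a hypothetical helper, build_vision_turn, which is not part of app.py:

# Hypothetical helper (not in the commit): write a PIL frame to a temporary PNG and
# reference it by local path in the chat turn, mirroring the removed Gemma code's approach.
import tempfile
from PIL import Image

def build_vision_turn(query: str, image: Image.Image) -> list[dict]:
    with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
        image.save(tmp.name)
    return [
        {
            "role": "user",
            "content": [
                {"type": "image", "url": tmp.name},  # local file path, loaded by the processor
                {"type": "text", "text": query},
            ],
        }
    ]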
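
The @-prefix dispatch in generate_response can be exercised without loading any models. A standalone sketch of the same routing logic, using a hypothetical route_command function that does not exist in app.py:

# Standalone sketch of the prefix routing generate_response performs (illustrative only).
def route_command(text: str) -> tuple[str, str]:
    stripped = text.strip()
    lowered = stripped.lower()
    for prefix in ("@rag", "@granite", "@video-infer"):
        if lowered.startswith(prefix):
            return prefix, stripped[len(prefix):].strip()
    return "default", stripped  # falls back to text-only LightRAG generation

assert route_command("@granite Describe the content of this image") == ("@granite", "Describe the content of this image")
assert route_command("@video-infer Summarize the event shown in the video") == ("@video-infer", "Summarize the event shown in the video")
assert route_command("What was the revenue growth in 2020?") == ("default", "What was the revenue growth in 2020?")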