File size: 8,148 Bytes
c7906eb
 
91d2c01
c7906eb
 
 
 
 
 
 
91d2c01
c7906eb
 
 
 
a9be97c
c7906eb
91d2c01
 
c7906eb
91d2c01
c7906eb
 
280b089
c7906eb
 
91d2c01
c7906eb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91d2c01
6ed0791
c7906eb
a9be97c
c7906eb
 
 
91d2c01
c7906eb
91d2c01
c7906eb
 
fe76282
91d2c01
 
a9be97c
 
c7906eb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91d2c01
c7906eb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91d2c01
c7906eb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91d2c01
c7906eb
 
 
86a82e4
c7906eb
9a23baa
c7906eb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91d2c01
 
c7906eb
 
 
 
 
 
 
 
86a82e4
b8a0d2d
91d2c01
c7906eb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
"""
app.py

This demo builds a Multimodal OCR Granite Vision interface using:
  - @rag: retrieval-augmented generation for PDF and image documents (via LightRAG)
  - @granite: image understanding with Granite Vision
  - @video-infer: video understanding by downsampling frames and processing each with Granite Vision

Make sure the required Granite models and dependencies (Gradio, Transformers, etc.) are installed.
"""

import os
import random
import uuid
import time
import cv2
import numpy as np
import torch
from PIL import Image
import gradio as gr

from transformers import AutoProcessor, AutoModelForVision2Seq, AutoTokenizer, AutoModelForCausalLM
from transformers.image_utils import load_image

# Import the LightRAG class (which internally uses Granite embedding and generation models)
from sandbox.light_rag.light_rag import LightRAG

# ------------------------------
# Utility and device setup
# ------------------------------
def get_device():
    """
    Pick the torch device string to run on.

    Checks the MPS backend first, then CUDA, and falls back to the CPU.

    Returns:
        "mps", "cuda" or "cpu".
    """
    if torch.backends.mps.is_available():
        return "mps"  # macOS GPU
    return "cuda" if torch.cuda.is_available() else "cpu"

# Device string shared by the vision model and the tensors sent to it.
device = get_device()

# ------------------------------
# Generation parameter constants
# ------------------------------
# Sampling defaults for Granite Vision; also the initial values of the UI sliders.
MAX_NEW_TOKENS = 1024
TEMPERATURE = 0.7
TOP_P = 0.85
TOP_K = 50
REPETITION_PENALTY = 1.05

# ------------------------------
# Load Granite Vision model for image processing (@granite and video)
# ------------------------------
VISION_MODEL_ID = "ibm-granite/granite-vision-3.2-2b"
vision_processor = AutoProcessor.from_pretrained(VISION_MODEL_ID)
# Load without device_map="auto" and move the model explicitly: inputs are sent to
# `device`, so the whole model must live there too. Combining Accelerate's automatic
# dispatch with a manual .to() is redundant on one device and conflicts with the
# dispatch hooks when the model gets spread over several devices.
vision_model = AutoModelForVision2Seq.from_pretrained(VISION_MODEL_ID).to(device)

# ------------------------------
# Initialize the LightRAG pipeline for text-only or document (PDF/image) RAG (@rag)
# ------------------------------
rag_config = {
    "embedding_model_id": "ibm-granite/granite-embedding-125m-english",
    "generation_model_id": "ibm-granite/granite-3.1-8b-instruct",
    "milvus_collection_name": "granite_vision_text_milvus",
    "milvus_db_path": "milvus.db",  # adjust this path as needed
}
light_rag = LightRAG(rag_config)

# ------------------------------
# Video downsampling helper
# ------------------------------
def downsample_video(video_path, num_frames=10):
    """
    Downsamples the video to at most `num_frames` evenly spaced frames (10 by default).

    Args:
        video_path: Path of the video file, handed to cv2.VideoCapture.
        num_frames: Maximum number of frames to sample.

    Returns:
        A list of tuples: (PIL image in RGB, timestamp in seconds).
        The list is empty when the video cannot be opened or reports no frames;
        frames that fail to decode are skipped.
    """
    vidcap = cv2.VideoCapture(video_path)
    frames = []
    try:
        total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = vidcap.get(cv2.CAP_PROP_FPS)
        if not vidcap.isOpened() or total_frames <= 0 or num_frames <= 0:
            return frames
        # np.unique removes the repeated indices that linspace yields when the video
        # has fewer frames than requested, so no frame is returned twice.
        frame_indices = np.unique(np.linspace(0, total_frames - 1, num_frames, dtype=int))
        for i in frame_indices:
            vidcap.set(cv2.CAP_PROP_POS_FRAMES, int(i))
            success, frame = vidcap.read()
            if not success:
                continue
            # OpenCV decodes to BGR; PIL expects RGB.
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            # A reported frame rate of 0 (or NaN) would make the division meaningless.
            timestamp = round(float(i) / fps, 2) if fps > 0 else 0.0
            frames.append((Image.fromarray(frame), timestamp))
    finally:
        # Release the capture handle even if decoding raises.
        vidcap.release()
    return frames

# ------------------------------
# Command processing functions
# ------------------------------
def process_rag(query, file_path=None):
    """
    Handle the @rag command through the LightRAG pipeline.

    The query is used to search LightRAG (top_n=5) and the search result is passed,
    together with the query, to LightRAG's generate step.
    `file_path` is accepted but not used here: no text is extracted from the file.

    Returns:
        The generated answer.
    """
    retrieved = light_rag.search(query, top_n=5)
    answer, _prompt = light_rag.generate(query, retrieved)
    return answer

def process_granite(query, image: Image.Image):
    """
    Process @granite command:
      Build a single-turn prompt from the image and the query, then run the Granite Vision model.

    Args:
        query: The user's question about the image.
        image: The image to show to the model.

    Returns:
        The newly generated text only (the echoed prompt is removed), stripped of
        surrounding whitespace.
    """
    # A single user turn holding both the picture and the question. The image has to be
    # part of the message content, otherwise the processor never encodes it and the model
    # answers without seeing it.
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": query},
            ],
        }
    ]
    inputs = vision_processor.apply_chat_template(
        conversation, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
    ).to(device)
    generate_kwargs = {
        "max_new_tokens": MAX_NEW_TOKENS,
        "do_sample": True,
        "top_p": TOP_P,
        "top_k": TOP_K,
        "temperature": TEMPERATURE,
        "repetition_penalty": REPETITION_PENALTY,
    }
    output = vision_model.generate(**inputs, **generate_kwargs)
    # generate() returns the prompt tokens followed by the new ones; drop the prompt so
    # the reply does not repeat the user's question and the chat-template scaffolding.
    prompt_length = inputs["input_ids"].shape[1]
    result = vision_processor.decode(output[0][prompt_length:], skip_special_tokens=True)
    return result.strip()

def process_video(query, video_path):
    """
    Handle the @video-infer command.

    The video is reduced to sampled frames, the query is asked of every frame with the
    Granite Vision model, and the per-frame answers are joined one per line, each
    prefixed with the frame's timestamp.
    """
    return "\n".join(
        f"At {timestamp}s: {process_granite(query, frame)}"
        for frame, timestamp in downsample_video(video_path)
    )

# ------------------------------
# Main function to handle input and dispatch based on command
# ------------------------------
def generate_response(input_dict, chat_history, max_new_tokens, temperature, top_p, top_k, repetition_penalty):
    """
    Based on the query prefix, this function calls:
      - process_rag for @rag
      - process_granite for @granite
      - process_video for @video-infer
    If no special command is provided, it defaults to text-only generation via LightRAG.

    The prefix match is case-insensitive and ignores whitespace around the message.

    Args:
        input_dict: Message dict with a "text" entry and an optional "files" list.
        chat_history: Earlier turns supplied by the chat UI (not used).
        max_new_tokens, temperature, top_p, top_k, repetition_penalty:
            Values of the UI sliders. They are accepted so the signature matches the
            interface's additional inputs, but are not forwarded to any model here.

    Returns:
        The answer text, or an explanatory message when a required file is missing.
    """
    text = input_dict["text"]
    files = input_dict.get("files") or []
    # Detect and remove the prefix on the same stripped string. Slicing the raw text
    # cut into the command itself whenever the message began with whitespace.
    stripped = text.strip()
    command = stripped.lower()

    if command.startswith("@rag"):
        query = stripped[len("@rag"):].strip()
        file_path = files[0] if files else None  # Optionally process the provided file
        return process_rag(query, file_path)

    if command.startswith("@granite"):
        if not files:
            return "No image file provided for @granite command."
        query = stripped[len("@granite"):].strip()
        # Assume first file is an image
        image = load_image(files[0])
        return process_granite(query, image)

    if command.startswith("@video-infer"):
        if not files:
            return "No video file provided for @video-infer command."
        query = stripped[len("@video-infer"):].strip()
        video_path = files[0]  # Assume first file is a video
        return process_video(query, video_path)

    # Default: text-only generation using LightRAG
    answer, _prompt = light_rag.generate(text, context=[])
    return answer

# ------------------------------
# Build the Gradio interface using a multimodal textbox
# ------------------------------
demo = gr.ChatInterface(
    fn=generate_response,
    # Slider values are passed to fn positionally after the message and the history.
    additional_inputs=[
        gr.Slider(label="Max new tokens", minimum=1, maximum=2048, step=1, value=MAX_NEW_TOKENS),
        gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=TEMPERATURE),
        gr.Slider(label="Top-p", minimum=0.05, maximum=1.0, step=0.05, value=TOP_P),
        gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=TOP_K),
        gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=REPETITION_PENALTY),
    ],
    textbox=gr.MultimodalTextbox(
        label="Query Input",
        # Categories ("image", "video") are bare words, but a specific extension needs
        # its leading dot; a bare "pdf" is not a recognised file type.
        file_types=["image", ".pdf", "video"],
        file_count="multiple",
        placeholder="Enter your query starting with @rag, @granite, or @video-infer",
    ),
    # fn reads input_dict["text"] / input_dict["files"] and the examples are dicts, so
    # the interface must run in multimodal mode.
    multimodal=True,
    examples=[
        [{"text": "@rag What was the revenue growth in 2020?"}],
        [{"text": "@granite Describe the content of this image", "files": ["example_image.png"]}],
        [{"text": "@video-infer Summarize the event shown in the video", "files": ["example_video.mp4"]}],
    ],
    cache_examples=False,
    type="messages",
    description=(
        "### Multimodal OCR Granite Vision\n"
        "Use **@rag** for PDF/image RAG, **@granite** for image questions, and **@video-infer** for video understanding."
    ),
    fill_height=True,
    stop_btn="Stop Generation",
    theme="default",
)

if __name__ == "__main__":
    # Queue requests (at most 20 waiting) before starting the server.
    demo.queue(max_size=20).launch()