File size: 9,288 Bytes
ed275c9
5d63d59
ed275c9
5d63d59
 
fc95e60
3f6a788
 
 
6401487
 
 
 
 
 
 
 
ed275c9
7342b9f
 
 
 
 
36ebfe1
6401487
7342b9f
 
 
c8cd2f3
 
 
7342b9f
 
 
 
6401487
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3f6a788
 
 
91cda81
 
 
ed275c9
6401487
3f6a788
 
 
 
 
 
6401487
 
 
ed275c9
9522057
3f6a788
 
 
6401487
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3f6a788
 
 
 
 
64f9a07
6401487
64f9a07
 
 
239e8eb
 
 
64f9a07
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239e8eb
6401487
fc95e60
 
 
 
 
 
3f6a788
fc95e60
3f6a788
5d63d59
fc95e60
3f6a788
5d63d59
 
3f6a788
 
 
 
 
 
 
 
 
 
 
 
5d63d59
fc95e60
5633a75
fe53594
ed275c9
3f6a788
 
ed275c9
3f6a788
 
ed275c9
3f6a788
ed275c9
7342b9f
ed275c9
 
0de5083
5d63d59
ed275c9
 
6401487
 
 
5d63d59
64f9a07
 
2e3cd2c
64f9a07
 
df7c39c
 
 
78742f4
b50fe8f
8b3f5c3
9522057
91cda81
 
9522057
86bb6ec
91cda81
7342b9f
 
6401487
7342b9f
6401487
7342b9f
91cda81
 
 
 
 
fc95e60
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
import gradio as gr
from transformers.image_utils import load_image
from threading import Thread
import time
import torch
import spaces
from PIL import Image
import requests
from io import BytesIO
import cv2
import numpy as np
from transformers import (
    Qwen2VLForConditionalGeneration,
    AutoProcessor,
    TextIteratorStreamer,
    AutoModelForImageTextToText,
)

# Helper that renders an animated "loading" bar next to a caption as an HTML snippet.
def progress_bar_html(label: str) -> str:
    """Return an HTML/CSS fragment showing *label* beside a looping progress bar.

    The fragment starts with a newline and ends with a newline followed by
    four spaces. *label* is interpolated as-is (no HTML escaping).
    """
    caption = f'    <span style="margin-right: 10px; font-size: 14px;">{label}</span>'
    # Plain (non-f) strings below, so CSS braces are written literally.
    markup_lines = [
        "",
        '<div style="display: flex; align-items: center;">',
        caption,
        '    <div style="width: 110px; height: 5px; background-color: #FFB6C1; border-radius: 2px; overflow: hidden;">',
        '        <div style="width: 100%; height: 100%; background-color: #FF69B4; animation: loading 1.5s linear infinite;"></div>',
        "    </div>",
        "</div>",
        "<style>",
        "@keyframes loading {",
        "    0% { transform: translateX(-100%); }",
        "    100% { transform: translateX(100%); }",
        "}",
        "</style>",
        "    ",
    ]
    return "\n".join(markup_lines)

# Helper function to downsample a video into up to `num_frames` evenly spaced frames.
def downsample_video(video_path, num_frames=10):
    """Extract evenly spaced frames from a video file.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        num_frames: Maximum number of frames to sample (default 10).

    Returns:
        A list of ``(PIL.Image, timestamp)`` tuples in frame order. The image
        is converted from BGR to RGB. ``timestamp`` is the frame position in
        seconds rounded to two decimals, or the raw frame index when the
        capture reports no usable FPS. The list is empty when the video cannot
        be opened or reports no frames; frames that fail to decode are skipped.
    """
    vidcap = cv2.VideoCapture(video_path)
    try:
        if not vidcap.isOpened():
            return []
        total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = vidcap.get(cv2.CAP_PROP_FPS)
        if total_frames <= 0 or num_frames <= 0:
            # Nothing to sample; also avoids linspace producing negative indices.
            return []

        # For clips shorter than `num_frames`, the integer-rounded linspace
        # repeats indices; np.unique keeps each frame once (sorted ascending).
        frame_indices = np.unique(
            np.linspace(0, total_frames - 1, num_frames, dtype=int)
        )

        frames = []
        for i in frame_indices:
            index = int(i)
            vidcap.set(cv2.CAP_PROP_POS_FRAMES, index)
            success, image = vidcap.read()
            if not success:
                continue
            rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            # A zero/NaN FPS would otherwise yield inf/nan timestamps.
            timestamp = round(index / fps, 2) if fps > 0 else float(index)
            frames.append((Image.fromarray(rgb), timestamp))
        return frames
    finally:
        # Release the capture handle even if decoding raised.
        vidcap.release()

# Model and processor setups

# Qwen2VL OCR: serves the default (untagged) branch of model_inference.
QV_MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"  # or use "prithivMLmods/Qwen2-VL-OCR2-2B-Instruct"
qwen_processor = AutoProcessor.from_pretrained(QV_MODEL_ID, trust_remote_code=True)
qwen_model = Qwen2VLForConditionalGeneration.from_pretrained(
    QV_MODEL_ID, trust_remote_code=True, torch_dtype=torch.float16
)
# Move the half-precision weights to the GPU, then switch to inference mode.
qwen_model = qwen_model.to("cuda")
qwen_model = qwen_model.eval()

# Aya-Vision: serves the @aya-vision and @video-infer branches.
AYA_MODEL_ID = "CohereForAI/aya-vision-8b"
aya_processor = AutoProcessor.from_pretrained(AYA_MODEL_ID)
aya_model = AutoModelForImageTextToText.from_pretrained(
    AYA_MODEL_ID,
    device_map="auto",
    torch_dtype=torch.float16,
)

# Command prefixes (matched case-insensitively) that route a query to Aya-Vision.
_VIDEO_TAG = "@video-infer"
_IMAGE_TAG = "@aya-vision"

# Sampling settings shared by both Aya-Vision branches.
_AYA_GENERATE_KWARGS = dict(max_new_tokens=1024, do_sample=True, temperature=0.3)


# Helper that turns chat messages into Aya-Vision model inputs on the model's device.
def _build_aya_inputs(messages):
    return aya_processor.apply_chat_template(
        messages,
        padding=True,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(aya_model.device)


# Helper that runs `model.generate` in a background thread and streams the text.
def _stream_generation(model, processor, inputs, **generate_kwargs):
    """Yield the cumulative decoded output while generation runs in a thread.

    Args:
        model: Object whose ``generate`` method accepts ``inputs`` as keywords.
        processor: Decoder handed to ``TextIteratorStreamer``.
        inputs: Mapping of prepared model inputs.
        **generate_kwargs: Extra keyword arguments forwarded to ``generate``.

    Yields:
        The text generated so far, with any ``<|im_end|>`` marker removed.
    """
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    call_kwargs = dict(inputs, streamer=streamer, **generate_kwargs)

    def _generate():
        try:
            model.generate(**call_kwargs)
        except Exception:
            # Without an end-of-stream signal the consumer loop below would
            # block forever when generation fails part-way.
            streamer.end()
            raise

    Thread(target=_generate).start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        buffer = buffer.replace("<|im_end|>", "")
        time.sleep(0.01)
        yield buffer


# ---------------------------
# Main Inference Function
# ---------------------------
@spaces.GPU
def model_inference(input_dict, history):
    """Route a multimodal chat message to the right model and stream the reply.

    Routing is by prefix of the message text:
      * ``@video-infer`` - sample frames from the first file and ask Aya-Vision.
      * ``@aya-vision``  - ask Aya-Vision about the first file as an image.
      * anything else    - Qwen2VL OCR on the text plus all attached images.

    Args:
        input_dict: Mapping with a ``"text"`` string and an optional
            ``"files"`` list of file paths.
        history: Prior chat turns passed by ``gr.ChatInterface``; unused.

    Yields:
        Either a single ``"Error: ..."`` string, or a progress-bar HTML snippet
        followed by the cumulative generated text.
    """
    text = (input_dict.get("text") or "").strip()
    # Normalise a missing/None file list so the checks below never hit None.
    files = input_dict.get("files") or []
    command = text.lower()

    # Branch for video inference with Aya-Vision using @video-infer.
    if command.startswith(_VIDEO_TAG):
        prompt = text[len(_VIDEO_TAG):].strip()
        if not files:
            yield "Error: Please provide a video for the @video-infer feature."
            return
        frames = downsample_video(files[0])
        if not frames:
            yield "Error: Could not extract frames from the video."
            return
        yield progress_bar_html("Processing video with Aya-Vision-8b")
        # Build messages: start with the prompt then add each frame with its timestamp.
        content_list = [{"type": "text", "text": prompt}]
        for frame, timestamp in frames:
            content_list.append({"type": "text", "text": f"Frame {timestamp}:"})
            content_list.append({"type": "image", "image": frame})
        inputs = _build_aya_inputs([{"role": "user", "content": content_list}])
        yield from _stream_generation(
            aya_model, aya_processor, inputs, **_AYA_GENERATE_KWARGS
        )
        return

    # Branch for single image inference with Aya-Vision using @aya-vision.
    if command.startswith(_IMAGE_TAG):
        text_prompt = text[len(_IMAGE_TAG):].strip()
        if not files:
            yield "Error: Please provide an image for the @aya-vision feature."
            return
        # Use the first provided image.
        image = load_image(files[0])
        yield progress_bar_html("Processing with Aya-Vision-8b")
        messages = [{
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": text_prompt},
            ],
        }]
        inputs = _build_aya_inputs(messages)
        yield from _stream_generation(
            aya_model, aya_processor, inputs, **_AYA_GENERATE_KWARGS
        )
        return

    # Default branch: Use Qwen2VL OCR for text (with optional images).
    # Validate before loading any image so an empty query fails fast.
    if text == "":
        if files:
            yield "Error: Please input a text query along with the image(s)."
        else:
            yield "Error: Please input a query and optionally image(s)."
        return

    yield progress_bar_html("Processing with Qwen2VL OCR")
    images = [load_image(path) for path in files]
    messages = [{
        "role": "user",
        "content": [
            *[{"type": "image", "image": image} for image in images],
            {"type": "text", "text": text},
        ],
    }]
    prompt = qwen_processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = qwen_processor(
        text=[prompt],
        images=images if images else None,
        return_tensors="pt",
        padding=True,
    ).to("cuda")
    yield from _stream_generation(
        qwen_model, qwen_processor, inputs, max_new_tokens=1024
    )


# Gradio Interface Setup

# (prompt, file path) pairs; each becomes one single-file example row below.
_EXAMPLE_PROMPTS = (
    ("@aya-vision Summarize the letter", "examples/1.png"),
    ("@aya-vision Extract JSON from the image", "example_images/document.jpg"),
    ("@video-infer Explain what is happening in this video ?", "examples/oreo.mp4"),
    ("Extract as JSON table from the table", "examples/4.jpg"),
    ("@aya-vision Describe the photo", "examples/3.png"),
    ("@aya-vision Summarize the full image in detail", "examples/2.jpg"),
    ("@aya-vision Describe this image.", "example_images/campeones.jpg"),
    ("@aya-vision What is this UI about?", "example_images/s2w_example.png"),
    ("Can you describe this image?", "example_images/newyork.jpg"),
    ("Can you describe this image?", "example_images/dogs.jpg"),
    (
        "@aya-vision Where do the severe droughts happen according to this diagram?",
        "example_images/examples_weather_events.png",
    ),
)

# Each row is a one-element list holding the multimodal message dict.
examples = [
    [{"text": example_text, "files": [example_path]}]
    for example_text, example_path in _EXAMPLE_PROMPTS
]

# Chat UI: a multimodal textbox (images and videos, multiple files) feeding
# the streaming `model_inference` generator.
demo = gr.ChatInterface(
    fn=model_inference,
    description="# **Multimodal OCR `@aya-vision for image, @video-infer for video`**",
    examples=examples,
    textbox=gr.MultimodalTextbox(
        label="Query Input", 
        file_types=["image", "video"], 
        file_count="multiple", 
        placeholder="Tag @aya-vision for Aya-Vision image infer, @video-infer for Aya-Vision video infer, default runs Qwen2VL OCR"
    ),
    stop_btn="Stop Generation",
    multimodal=True,
    cache_examples=False,
)

# Start the server only when run as a script, so importing this module
# (e.g. to reuse `demo`) does not launch it as a side effect.
if __name__ == "__main__":
    demo.launch(debug=True)