prithivMLmods committed on
Commit fe8a556 · verified · 1 Parent(s): e62fc3f

Update app.py

Files changed (1)
  1. app.py +319 -301
app.py CHANGED
@@ -1,302 +1,320 @@
- import os
- import random
- import uuid
- import json
- import time
- import asyncio
- from threading import Thread
-
- import gradio as gr
- import spaces
- import torch
- import numpy as np
- from PIL import Image
- import cv2
-
- from transformers import (
-     Qwen2VLForConditionalGeneration,
-     Qwen2_5_VLForConditionalGeneration,
-     AutoModelForImageTextToText,
-     AutoProcessor,
-     TextIteratorStreamer,
- )
- from transformers.image_utils import load_image
-
- # Constants for text generation
- MAX_MAX_NEW_TOKENS = 2048
- DEFAULT_MAX_NEW_TOKENS = 1024
- MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
-
- device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-
- # Load VIREX-062225-exp
- MODEL_ID_M = "prithivMLmods/VIREX-062225-exp"
- processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
- model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-     MODEL_ID_M,
-     trust_remote_code=True,
-     torch_dtype=torch.float16
- ).to(device).eval()
-
- # Load DREX-062225-exp
- MODEL_ID_X = "prithivMLmods/DREX-062225-exp"
- processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
- model_x = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-     MODEL_ID_X,
-     trust_remote_code=True,
-     torch_dtype=torch.float16
- ).to(device).eval()
-
- # Load typhoon-ocr-3b
- MODEL_ID_T = "scb10x/typhoon-ocr-3b"
- processor_t = AutoProcessor.from_pretrained(MODEL_ID_T, trust_remote_code=True)
- model_t = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-     MODEL_ID_T,
-     trust_remote_code=True,
-     torch_dtype=torch.float16
- ).to(device).eval()
-
- # Load olmOCR-7B-0225-preview
- MODEL_ID_O = "allenai/olmOCR-7B-0225-preview"
- processor_o = AutoProcessor.from_pretrained(MODEL_ID_O, trust_remote_code=True)
- model_o = Qwen2VLForConditionalGeneration.from_pretrained(
-     MODEL_ID_O,
-     trust_remote_code=True,
-     torch_dtype=torch.float16
- ).to(device).eval()
-
- def downsample_video(video_path):
-     """
-     Downsamples the video to evenly spaced frames.
-     Each frame is returned as a PIL image along with its timestamp.
-     """
-     vidcap = cv2.VideoCapture(video_path)
-     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
-     fps = vidcap.get(cv2.CAP_PROP_FPS)
-     frames = []
-     frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
-     for i in frame_indices:
-         vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
-         success, image = vidcap.read()
-         if success:
-             image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-             pil_image = Image.fromarray(image)
-             timestamp = round(i / fps, 2)
-             frames.append((pil_image, timestamp))
-     vidcap.release()
-     return frames
-
- @spaces.GPU
- def generate_image(model_name: str, text: str, image: Image.Image,
-                    max_new_tokens: int = 1024,
-                    temperature: float = 0.6,
-                    top_p: float = 0.9,
-                    top_k: int = 50,
-                    repetition_penalty: float = 1.2):
-     """
-     Generates responses using the selected model for image input.
-     """
-     if model_name == "VIREX-062225-7B-exp":
-         processor = processor_m
-         model = model_m
-     elif model_name == "DREX-062225-7B-exp":
-         processor = processor_x
-         model = model_x
-     elif model_name == "olmOCR-7B-0225-preview":
-         processor = processor_o
-         model = model_o
-     elif model_name == "Typhoon-OCR-3B":
-         processor = processor_t
-         model = model_t
-     else:
-         yield "Invalid model selected.", "Invalid model selected."
-         return
-
-     if image is None:
-         yield "Please upload an image.", "Please upload an image."
-         return
-
-     messages = [{
-         "role": "user",
-         "content": [
-             {"type": "image", "image": image},
-             {"type": "text", "text": text},
-         ]
-     }]
-     prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-     inputs = processor(
-         text=[prompt_full],
-         images=[image],
-         return_tensors="pt",
-         padding=True,
-         truncation=False,
-         max_length=MAX_INPUT_TOKEN_LENGTH
-     ).to(device)
-     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-     generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
-     thread = Thread(target=model.generate, kwargs=generation_kwargs)
-     thread.start()
-     buffer = ""
-     for new_text in streamer:
-         buffer += new_text
-         time.sleep(0.01)
-         yield buffer, buffer
-
- @spaces.GPU
146
- def generate_video(model_name: str, text: str, video_path: str,
147
- max_new_tokens: int = 1024,
148
- temperature: float = 0.6,
149
- top_p: float = 0.9,
150
- top_k: int = 50,
151
- repetition_penalty: float = 1.2):
152
- """
153
- Generates responses using the selected model for video input.
154
- """
155
- if model_name == "VIREX-062225-7B-exp":
156
- processor = processor_m
157
- model = model_m
158
- elif model_name == "DREX-062225-7B-exp":
159
- processor = processor_x
160
- model = model_x
161
- elif model_name == "olmOCR-7B-0225-preview":
162
- processor = processor_o
163
- model = model_o
164
- elif model_name == "Typhoon-OCR-3B":
165
- processor = processor_t
166
- model = model_t
167
- else:
168
- yield "Invalid model selected.", "Invalid model selected."
169
- return
170
-
171
- if video_path is None:
172
- yield "Please upload a video.", "Please upload a video."
173
- return
174
-
175
- frames = downsample_video(video_path)
176
- messages = [
177
- {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
178
- {"role": "user", "content": [{"type": "text", "text": text}]}
179
- ]
180
- for frame in frames:
181
- image, timestamp = frame
182
- messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
183
- messages[1]["content"].append({"type": "image", "image": image})
184
- inputs = processor.apply_chat_template(
185
- messages,
186
- tokenize=True,
187
- add_generation_prompt=True,
188
- return_dict=True,
189
- return_tensors="pt",
190
- truncation=False,
191
- max_length=MAX_INPUT_TOKEN_LENGTH
192
- ).to(device)
193
- streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
194
- generation_kwargs = {
195
- **inputs,
196
- "streamer": streamer,
197
- "max_new_tokens": max_new_tokens,
198
- "do_sample": True,
199
- "temperature": temperature,
200
- "top_p": top_p,
201
- "top_k": top_k,
202
- "repetition_penalty": repetition_penalty,
203
- }
204
- thread = Thread(target=model.generate, kwargs=generation_kwargs)
205
- thread.start()
206
- buffer = ""
207
- for new_text in streamer:
208
- buffer += new_text
209
- buffer = buffer.replace("<|im_end|>", "")
210
- time.sleep(0.01)
211
- yield buffer, buffer
212
-
213
- # Define examples for image and video inference
214
- image_examples = [
215
- ["Convert this page to doc [text] precisely.", "images/3.png"],
216
- ["Convert this page to doc [text] precisely.", "images/4.png"],
217
- ["Convert this page to doc [text] precisely.", "images/1.png"],
218
- ["Convert chart to OTSL.", "images/2.png"]
219
- ]
220
-
221
- video_examples = [
222
- ["Explain the video in detail.", "videos/2.mp4"],
223
- ["Explain the ad in detail.", "videos/1.mp4"]
224
- ]
225
-
226
- # Added CSS to style the output area as a "Canvas"
227
- css = """
228
- .submit-btn {
229
- background-color: #2980b9 !important;
230
- color: white !important;
231
- }
232
- .submit-btn:hover {
233
- background-color: #3498db !important;
234
- }
235
- .canvas-output {
236
- border: 2px solid #4682B4;
237
- border-radius: 10px;
238
- padding: 20px;
239
- }
240
- """
241
-
242
- # Create the Gradio Interface
243
- with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
244
- gr.Markdown("# **[Doc VLMs OCR](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
245
- with gr.Row():
246
- with gr.Column():
247
- with gr.Tabs():
248
- with gr.TabItem("Image Inference"):
249
- image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
250
- image_upload = gr.Image(type="pil", label="Image")
251
- image_submit = gr.Button("Submit", elem_classes="submit-btn")
252
- gr.Examples(
253
- examples=image_examples,
254
- inputs=[image_query, image_upload]
255
- )
256
- with gr.TabItem("Video Inference"):
257
- video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
258
- video_upload = gr.Video(label="Video")
259
- video_submit = gr.Button("Submit", elem_classes="submit-btn")
260
- gr.Examples(
261
- examples=video_examples,
262
- inputs=[video_query, video_upload]
263
- )
264
- with gr.Accordion("Advanced options", open=False):
265
- max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
266
- temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
267
- top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
268
- top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
269
- repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
270
-
271
- with gr.Column():
272
- with gr.Column(elem_classes="canvas-output"):
273
- gr.Markdown("## Result Canvas")
274
- output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=2)
275
- markdown_output = gr.Markdown(label="Formatted Result (Result.Md)")
276
-
277
- model_choice = gr.Radio(
278
- choices=["DREX-062225-7B-exp", "olmOCR-7B-0225-preview", "VIREX-062225-7B-exp", "Typhoon-OCR-3B"],
279
- label="Select Model",
280
- value="DREX-062225-7B-exp"
281
- )
282
-
283
- gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Doc-VLMs/discussions)")
284
- gr.Markdown("> [DREX-062225-7B-exp](https://huggingface.co/prithivMLmods/DREX-062225-exp): the drex-062225-exp (document retrieval and extraction expert) model is a specialized fine-tuned version of docscopeocr-7b-050425-exp, optimized for document retrieval, content extraction, and analysis recognition. built on top of the qwen2.5-vl architecture.")
285
- gr.Markdown("> [VIREX-062225-7B-exp](https://huggingface.co/prithivMLmods/VIREX-062225-exp): the virex-062225-exp (video information retrieval and extraction expert - experimental) model is a fine-tuned version of qwen2.5-vl-7b-instruct, specifically optimized for advanced video understanding, image comprehension, sense of reasoning, and natural language decision-making through cot reasoning.")
286
- gr.Markdown("> [Typhoon-OCR-3B](https://huggingface.co/scb10x/typhoon-ocr-3b): a bilingual document parsing model built specifically for real-world documents in thai and english, inspired by models like olmocr, based on qwen2.5-vl-instruction. this model is intended to be used with a specific prompt only.")
287
- gr.Markdown("> [olmOCR-7B-0225](https://huggingface.co/allenai/olmOCR-7B-0225-preview): the olmocr-7b-0225-preview model is based on qwen2-vl-7b, optimized for document-level optical character recognition (ocr), long-context vision-language understanding, and accurate image-to-text conversion with mathematical latex formatting. designed with a focus on high-fidelity visual-textual comprehension.")
288
- gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
289
-
290
- image_submit.click(
291
- fn=generate_image,
292
- inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
293
- outputs=[output, markdown_output]
294
- )
295
- video_submit.click(
296
- fn=generate_video,
297
- inputs=[model_choice, video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
298
- outputs=[output, markdown_output]
299
- )
300
-
301
- if __name__ == "__main__":
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
302
  demo.queue(max_size=30).launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)
 
+ import os
+ import random
+ import uuid
+ import json
+ import time
+ import asyncio
+ from threading import Thread
+
+ import gradio as gr
+ import spaces
+ import torch
+ import numpy as np
+ from PIL import Image
+ import cv2
+
+ from transformers import (
+     Qwen2VLForConditionalGeneration,
+     Qwen2_5_VLForConditionalGeneration,
+     AutoModelForImageTextToText,
+     AutoProcessor,
+     TextIteratorStreamer,
+ )
+ from transformers.image_utils import load_image
+
+ # Constants for text generation
+ MAX_MAX_NEW_TOKENS = 2048
+ DEFAULT_MAX_NEW_TOKENS = 1024
+ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
+
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
+ # Load VIREX-062225-exp
+ MODEL_ID_M = "prithivMLmods/VIREX-062225-exp"
+ processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
+ model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+     MODEL_ID_M,
+     trust_remote_code=True,
+     torch_dtype=torch.float16
+ ).to(device).eval()
+
+ # Load DREX-062225-exp
+ MODEL_ID_X = "prithivMLmods/DREX-062225-exp"
+ processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
+ model_x = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+     MODEL_ID_X,
+     trust_remote_code=True,
+     torch_dtype=torch.float16
+ ).to(device).eval()
+
+ # Load typhoon-ocr-3b
+ MODEL_ID_T = "scb10x/typhoon-ocr-3b"
+ processor_t = AutoProcessor.from_pretrained(MODEL_ID_T, trust_remote_code=True)
+ model_t = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+     MODEL_ID_T,
+     trust_remote_code=True,
+     torch_dtype=torch.float16
+ ).to(device).eval()
+
+ # Load olmOCR-7B-0225-preview
+ MODEL_ID_O = "allenai/olmOCR-7B-0225-preview"
+ processor_o = AutoProcessor.from_pretrained(MODEL_ID_O, trust_remote_code=True)
+ model_o = Qwen2VLForConditionalGeneration.from_pretrained(
+     MODEL_ID_O,
+     trust_remote_code=True,
+     torch_dtype=torch.float16
+ ).to(device).eval()
+
+ def downsample_video(video_path):
+     """
+     Downsamples the video to evenly spaced frames.
+     Each frame is returned as a PIL image along with its timestamp.
+     """
+     vidcap = cv2.VideoCapture(video_path)
+     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
+     fps = vidcap.get(cv2.CAP_PROP_FPS)
+     frames = []
+     frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
+     for i in frame_indices:
+         vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
+         success, image = vidcap.read()
+         if success:
+             image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+             pil_image = Image.fromarray(image)
+             timestamp = round(i / fps, 2)
+             frames.append((pil_image, timestamp))
+     vidcap.release()
+     return frames
+
+ @spaces.GPU
+ def generate_image(model_name: str, text: str, image: Image.Image,
+                    max_new_tokens: int = 1024,
+                    temperature: float = 0.6,
+                    top_p: float = 0.9,
+                    top_k: int = 50,
+                    repetition_penalty: float = 1.2):
+     """
+     Generates responses using the selected model for image input.
+     """
+     if model_name == "VIREX-062225-7B-exp":
+         processor = processor_m
+         model = model_m
+     elif model_name == "DREX-062225-7B-exp":
+         processor = processor_x
+         model = model_x
+     elif model_name == "olmOCR-7B-0225-preview":
+         processor = processor_o
+         model = model_o
+     elif model_name == "Typhoon-OCR-3B":
+         processor = processor_t
+         model = model_t
+     else:
+         yield "Invalid model selected.", "Invalid model selected."
+         return
+
+     if image is None:
+         yield "Please upload an image.", "Please upload an image."
+         return
+
+     messages = [{
+         "role": "user",
+         "content": [
+             {"type": "image", "image": image},
+             {"type": "text", "text": text},
+         ]
+     }]
+     prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+     inputs = processor(
+         text=[prompt_full],
+         images=[image],
+         return_tensors="pt",
+         padding=True,
+         truncation=False,
+         max_length=MAX_INPUT_TOKEN_LENGTH
+     ).to(device)
+     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+     generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
+     thread = Thread(target=model.generate, kwargs=generation_kwargs)
+     thread.start()
+     buffer = ""
+     for new_text in streamer:
+         buffer += new_text
+         time.sleep(0.01)
+         yield buffer, buffer
+
+ @spaces.GPU
+ def generate_video(model_name: str, text: str, video_path: str,
+                    max_new_tokens: int = 1024,
+                    temperature: float = 0.6,
+                    top_p: float = 0.9,
+                    top_k: int = 50,
+                    repetition_penalty: float = 1.2):
+     """
+     Generates responses using the selected model for video input.
+     """
+     if model_name == "VIREX-062225-7B-exp":
+         processor = processor_m
+         model = model_m
+     elif model_name == "DREX-062225-7B-exp":
+         processor = processor_x
+         model = model_x
+     elif model_name == "olmOCR-7B-0225-preview":
+         processor = processor_o
+         model = model_o
+     elif model_name == "Typhoon-OCR-3B":
+         processor = processor_t
+         model = model_t
+     else:
+         yield "Invalid model selected.", "Invalid model selected."
+         return
+
+     if video_path is None:
+         yield "Please upload a video.", "Please upload a video."
+         return
+
+     frames = downsample_video(video_path)
+     messages = [
+         {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
+         {"role": "user", "content": [{"type": "text", "text": text}]}
+     ]
+     for frame in frames:
+         image, timestamp = frame
+         messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
+         messages[1]["content"].append({"type": "image", "image": image})
+     inputs = processor.apply_chat_template(
+         messages,
+         tokenize=True,
+         add_generation_prompt=True,
+         return_dict=True,
+         return_tensors="pt",
+         truncation=False,
+         max_length=MAX_INPUT_TOKEN_LENGTH
+     ).to(device)
+     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+     generation_kwargs = {
+         **inputs,
+         "streamer": streamer,
+         "max_new_tokens": max_new_tokens,
+         "do_sample": True,
+         "temperature": temperature,
+         "top_p": top_p,
+         "top_k": top_k,
+         "repetition_penalty": repetition_penalty,
+     }
+     thread = Thread(target=model.generate, kwargs=generation_kwargs)
+     thread.start()
+     buffer = ""
+     for new_text in streamer:
+         buffer += new_text
+         buffer = buffer.replace("<|im_end|>", "")
+         time.sleep(0.01)
+         yield buffer, buffer
+
+ def save_to_md(output_text):
+     """
+     Saves the output text to a Markdown file and returns the file path for download.
+     """
+     file_path = f"result_{uuid.uuid4()}.md"
+     with open(file_path, "w") as f:
+         f.write(output_text)
+     return file_path
+
+ # Define examples for image and video inference
+ image_examples = [
+     ["Convert this page to doc [text] precisely.", "images/3.png"],
+     ["Convert this page to doc [text] precisely.", "images/4.png"],
+     ["Convert this page to doc [text] precisely.", "images/1.png"],
+     ["Convert chart to OTSL.", "images/2.png"]
+ ]
+
+ video_examples = [
+     ["Explain the video in detail.", "videos/2.mp4"],
+     ["Explain the ad in detail.", "videos/1.mp4"]
+ ]
+
+ # Added CSS to style the output area as a "Canvas"
+ css = """
+ .submit-btn {
+     background-color: #2980b9 !important;
+     color: white !important;
+ }
+ .submit-btn:hover {
+     background-color: #3498db !important;
+ }
+ .canvas-output {
+     border: 2px solid #4682B4;
+     border-radius: 10px;
+     padding: 20px;
+ }
+ """
+
+ # Create the Gradio Interface
+ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
+     gr.Markdown("# **[Doc VLMs OCR](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
+     with gr.Row():
+         with gr.Column():
+             with gr.Tabs():
+                 with gr.TabItem("Image Inference"):
+                     image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
+                     image_upload = gr.Image(type="pil", label="Image")
+                     image_submit = gr.Button("Submit", elem_classes="submit-btn")
+                     gr.Examples(
+                         examples=image_examples,
+                         inputs=[image_query, image_upload]
+                     )
+                 with gr.TabItem("Video Inference"):
+                     video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
+                     video_upload = gr.Video(label="Video")
+                     video_submit = gr.Button("Submit", elem_classes="submit-btn")
+                     gr.Examples(
+                         examples=video_examples,
+                         inputs=[video_query, video_upload]
+                     )
+
+             with gr.Accordion("Advanced options", open=False):
+                 max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
+                 temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
+                 top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
+                 top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
+                 repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
+
+         with gr.Column():
+             with gr.Column(elem_classes="canvas-output"):
+                 gr.Markdown("## Result.Md")
+                 output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=2)
+
+                 with gr.Accordion("Formatted Result (Result.md)", open=False):
+                     markdown_output = gr.Markdown(label="Formatted Result (Result.md)")
+                     download_btn = gr.Button("Download Result.md")
+
+             model_choice = gr.Radio(
+                 choices=["DREX-062225-7B-exp", "olmOCR-7B-0225-preview", "VIREX-062225-7B-exp", "Typhoon-OCR-3B"],
+                 label="Select Model",
+                 value="DREX-062225-7B-exp"
+             )
+
+             gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Doc-VLMs/discussions)")
+             gr.Markdown("> [DREX-062225-7B-exp](https://huggingface.co/prithivMLmods/DREX-062225-exp): The DREX-062225-exp (Document Retrieval and Extraction Expert) model is a specialized fine-tuned version of docscopeOCR-7B-050425-exp, optimized for document retrieval, content extraction, and analysis recognition. It is built on top of the Qwen2.5-VL architecture.")
+             gr.Markdown("> [VIREX-062225-7B-exp](https://huggingface.co/prithivMLmods/VIREX-062225-exp): The VIREX-062225-exp (Video Information Retrieval and Extraction Expert, experimental) model is a fine-tuned version of Qwen2.5-VL-7B-Instruct, specifically optimized for advanced video understanding, image comprehension, reasoning, and natural-language decision-making through CoT (chain-of-thought) reasoning.")
+             gr.Markdown("> [Typhoon-OCR-3B](https://huggingface.co/scb10x/typhoon-ocr-3b): A bilingual document parsing model built specifically for real-world documents in Thai and English, inspired by models like olmOCR and based on Qwen2.5-VL-Instruct. This model is intended to be used with a specific prompt only.")
+             gr.Markdown("> [olmOCR-7B-0225](https://huggingface.co/allenai/olmOCR-7B-0225-preview): The olmOCR-7B-0225-preview model is based on Qwen2-VL-7B, optimized for document-level optical character recognition (OCR), long-context vision-language understanding, and accurate image-to-text conversion with mathematical LaTeX formatting. It is designed with a focus on high-fidelity visual-textual comprehension.")
+             gr.Markdown("> ⚠️ Note: the models in this Space are not guaranteed to perform well in video inference use cases.")
+
+     image_submit.click(
+         fn=generate_image,
+         inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
+         outputs=[output, markdown_output]
+     )
+     video_submit.click(
+         fn=generate_video,
+         inputs=[model_choice, video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
+         outputs=[output, markdown_output]
+     )
+     download_btn.click(
+         fn=save_to_md,
+         inputs=output,
+         outputs=None
+     )
+
+ if __name__ == "__main__":
      demo.queue(max_size=30).launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)