prithivMLmods committed
Commit a68aebf · verified · 1 Parent(s): f48789b

Update app.py
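
Refactors app.py: model/processor pairs get descriptive handles (_virex, _drex, _typhoon, _olm instead of _m, _x, _t, _o); the arguments to model.generate are centralized in a _make_generation_kwargs helper that also pins pad/eos token ids, so generate_image now honors the sampling sliders it previously ignored; downsample_video skips unreadable frames and guards against a zero FPS report; model selection switches to prefix matching; the sleep-throttled streaming loops are simplified; and the results column of the Gradio layout is flattened into a single canvas-output column.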

Files changed (1):
  1. app.py +153 -134
app.py CHANGED
@@ -23,6 +23,9 @@ from transformers import (
 )
 from transformers.image_utils import load_image
 
+# Optionally enable synchronous CUDA errors for debugging:
+os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
+
 # Constants for text generation
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
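
A note on the new flag: CUDA_LAUNCH_BLOCKING=1 makes kernel launches synchronous, so a CUDA error surfaces at the call that raised it rather than at a later synchronization point. It only takes effect when set before CUDA is initialized (here, right after the imports) and costs some throughput; despite the "Optionally" in the comment, the committed code sets it unconditionally.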
@@ -30,158 +33,183 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
-# Load VIREX-062225-exp
-MODEL_ID_M = "prithivMLmods/VIREX-062225-exp"
-processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
-model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    MODEL_ID_M,
+# -------------------------------------------------------------------
+# Load models and processors
+# -------------------------------------------------------------------
+
+# VIREX (Video Information Retrieval & Extraction)
+MODEL_ID_VIREX = "prithivMLmods/VIREX-062225-exp"
+processor_virex = AutoProcessor.from_pretrained(MODEL_ID_VIREX, trust_remote_code=True)
+model_virex = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    MODEL_ID_VIREX,
     trust_remote_code=True,
     torch_dtype=torch.float16
 ).to(device).eval()
 
-# Load DREX-062225-exp
-MODEL_ID_X = "prithivMLmods/DREX-062225-exp"
-processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
-model_x = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    MODEL_ID_X,
+# DREX (Document Retrieval & Extraction Expert)
+MODEL_ID_DREX = "prithivMLmods/DREX-062225-exp"
+processor_drex = AutoProcessor.from_pretrained(MODEL_ID_DREX, trust_remote_code=True)
+model_drex = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    MODEL_ID_DREX,
     trust_remote_code=True,
     torch_dtype=torch.float16
 ).to(device).eval()
 
-# Load typhoon-ocr-3b
-MODEL_ID_T = "sarvamai/sarvam-translate"
-processor_t = AutoProcessor.from_pretrained(MODEL_ID_T, trust_remote_code=True)
-model_t = Gemma3ForConditionalGeneration.from_pretrained(
-    MODEL_ID_T,
+# Typhoon-OCR-3B slot (note: the ID below points to sarvamai/sarvam-translate,
+# a Gemma3-based model, not scb10x/typhoon-ocr-3b as the UI label suggests)
+MODEL_ID_TYPHOON = "sarvamai/sarvam-translate"
+processor_typhoon = AutoProcessor.from_pretrained(MODEL_ID_TYPHOON, trust_remote_code=True)
+model_typhoon = Gemma3ForConditionalGeneration.from_pretrained(
+    MODEL_ID_TYPHOON,
     trust_remote_code=True,
     torch_dtype=torch.float16
 ).to(device).eval()
 
-# Load olmOCR-7B-0225-preview
-MODEL_ID_O = "allenai/olmOCR-7B-0225-preview"
-processor_o = AutoProcessor.from_pretrained(MODEL_ID_O, trust_remote_code=True)
-model_o = Qwen2VLForConditionalGeneration.from_pretrained(
-    MODEL_ID_O,
+# olmOCR-7B-0225-preview (document OCR + LaTeX)
+MODEL_ID_OLM = "allenai/olmOCR-7B-0225-preview"
+processor_olm = AutoProcessor.from_pretrained(MODEL_ID_OLM, trust_remote_code=True)
+model_olm = Qwen2VLForConditionalGeneration.from_pretrained(
+    MODEL_ID_OLM,
     trust_remote_code=True,
     torch_dtype=torch.float16
 ).to(device).eval()
 
+# -------------------------------------------------------------------
+# Video downsampling helper
+# -------------------------------------------------------------------
 def downsample_video(video_path):
     """
-    Downsamples the video to evenly spaced frames.
-    Each frame is returned as a PIL image along with its timestamp.
+    Downsamples the video to 10 evenly spaced frames.
+    Returns a list of (PIL.Image, timestamp) tuples.
     """
     vidcap = cv2.VideoCapture(video_path)
     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
-    fps = vidcap.get(cv2.CAP_PROP_FPS)
+    fps = vidcap.get(cv2.CAP_PROP_FPS) or 30.0  # fall back if OpenCV reports 0 FPS
     frames = []
     frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
-    for i in frame_indices:
-        vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
-        success, image = vidcap.read()
-        if success:
-            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-            pil_image = Image.fromarray(image)
-            timestamp = round(i / fps, 2)
-            frames.append((pil_image, timestamp))
+    for idx in frame_indices:
+        vidcap.set(cv2.CAP_PROP_POS_FRAMES, idx)
+        success, img = vidcap.read()
+        if not success:
+            continue
+        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+        frames.append((Image.fromarray(img), round(idx / fps, 2)))
     vidcap.release()
     return frames
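
As reworked above, downsample_video now returns at most 10 (frame, timestamp) pairs and silently skips frames OpenCV cannot decode. A quick smoke test, assuming the imports already present in app.py (the sample.mp4 path is hypothetical):

    frames = downsample_video("sample.mp4")
    for frame, ts in frames:
        print(f"{ts:>8.2f}s  {frame.size}")  # PIL reports size as (width, height)
    assert len(frames) <= 10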
 
+# -------------------------------------------------------------------
+# Generation loops
+# -------------------------------------------------------------------
+def _make_generation_kwargs(processor, inputs, streamer, max_new_tokens, do_sample=False, temperature=1.0, top_p=1.0, top_k=0, repetition_penalty=1.0):
+    # Collect everything model.generate needs, and pin pad/eos token ids
+    # so generation terminates cleanly.
+    tok = processor.tokenizer
+    return {
+        **inputs,
+        "streamer": streamer,
+        "max_new_tokens": max_new_tokens,
+        "do_sample": do_sample,
+        "temperature": temperature,
+        "top_p": top_p,
+        "top_k": top_k,
+        "repetition_penalty": repetition_penalty,
+        "pad_token_id": tok.eos_token_id,
+        "eos_token_id": tok.eos_token_id,
+    }
+
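Both generation loops below consume this helper identically; condensed, the streaming pattern is as follows (a sketch using the diff's names, with processor, model, and a prepared inputs batch assumed to already exist):

    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    gen_kwargs = _make_generation_kwargs(
        processor, inputs, streamer, max_new_tokens=256,
        do_sample=True, temperature=0.6, top_p=0.9, top_k=50, repetition_penalty=1.2,
    )
    Thread(target=model.generate, kwargs=gen_kwargs).start()  # generate off the main thread
    for chunk in streamer:  # blocks until each newly decoded piece arrives
        print(chunk, end="", flush=True)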
 @spaces.GPU
 def generate_image(model_name: str, text: str, image: Image.Image,
-                   max_new_tokens: int = 1024,
+                   max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
                    temperature: float = 0.6,
                    top_p: float = 0.9,
                    top_k: int = 50,
                    repetition_penalty: float = 1.2):
-    """
-    Generates responses using the selected model for image input.
-    """
-    if model_name == "VIREX-062225-7B-exp":
-        processor = processor_m
-        model = model_m
-    elif model_name == "DREX-062225-7B-exp":
-        processor = processor_x
-        model = model_x
-    elif model_name == "olmOCR-7B-0225-preview":
-        processor = processor_o
-        model = model_o
-    elif model_name == "Typhoon-OCR-3B":
-        processor = processor_t
-        model = model_t
+    # select the processor/model pair by radio-button prefix
+    if model_name.startswith("VIREX"):
+        processor, model = processor_virex, model_virex
+    elif model_name.startswith("DREX"):
+        processor, model = processor_drex, model_drex
+    elif model_name.startswith("olmOCR"):
+        processor, model = processor_olm, model_olm
+    elif model_name.startswith("Typhoon"):
+        processor, model = processor_typhoon, model_typhoon
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
 
     if image is None:
-        yield "Please upload an image.", "Please upload an image."
+        yield "Please upload an image.", ""
         return
 
+    # build the chat-style prompt
     messages = [{
         "role": "user",
         "content": [
             {"type": "image", "image": image},
             {"type": "text", "text": text},
         ]
     }]
-    prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = processor(
-        text=[prompt_full],
+        text=[prompt],
         images=[image],
         return_tensors="pt",
         padding=True,
         truncation=False,
         max_length=MAX_INPUT_TOKEN_LENGTH
     ).to(device)
+
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-    generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
-    thread.start()
+    gen_kwargs = _make_generation_kwargs(
+        processor, inputs, streamer, max_new_tokens,
+        do_sample=True,
+        temperature=temperature,
+        top_p=top_p,
+        top_k=top_k,
+        repetition_penalty=repetition_penalty
+    )
+
+    # launch generation on a background thread and stream the buffer out
+    Thread(target=model.generate, kwargs=gen_kwargs).start()
     buffer = ""
-    for new_text in streamer:
-        buffer += new_text
-        time.sleep(0.01)
+    for chunk in streamer:
+        buffer += chunk
         yield buffer, buffer
 
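Both generate_image and generate_video are generators: each yield re-emits the full accumulated buffer to the two bound output components, which is what produces the incremental streaming effect in the UI. Dropping the old time.sleep(0.01) throttle simply lets the buffer update as fast as the streamer delivers chunks.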
 @spaces.GPU
 def generate_video(model_name: str, text: str, video_path: str,
-                   max_new_tokens: int = 1024,
+                   max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
                    temperature: float = 0.6,
                    top_p: float = 0.9,
                    top_k: int = 50,
                    repetition_penalty: float = 1.2):
-    """
-    Generates responses using the selected model for video input.
-    """
-    if model_name == "VIREX-062225-7B-exp":
-        processor = processor_m
-        model = model_m
-    elif model_name == "DREX-062225-7B-exp":
-        processor = processor_x
-        model = model_x
-    elif model_name == "olmOCR-7B-0225-preview":
-        processor = processor_o
-        model = model_o
-    elif model_name == "Typhoon-OCR-3B":
-        processor = processor_t
-        model = model_t
+    # select the processor/model pair by radio-button prefix
+    if model_name.startswith("VIREX"):
+        processor, model = processor_virex, model_virex
+    elif model_name.startswith("DREX"):
+        processor, model = processor_drex, model_drex
+    elif model_name.startswith("olmOCR"):
+        processor, model = processor_olm, model_olm
+    elif model_name.startswith("Typhoon"):
+        processor, model = processor_typhoon, model_typhoon
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
 
     if video_path is None:
-        yield "Please upload a video.", "Please upload a video."
+        yield "Please upload a video.", ""
         return
 
+    # downsample the video, then interleave frames into the user turn
     frames = downsample_video(video_path)
+
     messages = [
         {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
         {"role": "user", "content": [{"type": "text", "text": text}]}
     ]
-    for frame in frames:
-        image, timestamp = frame
-        messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
-        messages[1]["content"].append({"type": "image", "image": image})
+    for img, ts in frames:
+        messages[1]["content"].append({"type": "text", "text": f"Frame {ts}s:"})
+        messages[1]["content"].append({"type": "image", "image": img})
+
     inputs = processor.apply_chat_template(
         messages,
         tokenize=True,
@@ -191,27 +219,26 @@ def generate_video(model_name: str, text: str, video_path: str,
         truncation=False,
         max_length=MAX_INPUT_TOKEN_LENGTH
     ).to(device)
+
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-    generation_kwargs = {
-        **inputs,
-        "streamer": streamer,
-        "max_new_tokens": max_new_tokens,
-        "do_sample": True,
-        "temperature": temperature,
-        "top_p": top_p,
-        "top_k": top_k,
-        "repetition_penalty": repetition_penalty,
-    }
-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
-    thread.start()
+    gen_kwargs = _make_generation_kwargs(
+        processor, inputs, streamer, max_new_tokens,
+        do_sample=True,
+        temperature=temperature,
+        top_p=top_p,
+        top_k=top_k,
+        repetition_penalty=repetition_penalty
+    )
+
+    Thread(target=model.generate, kwargs=gen_kwargs).start()
     buffer = ""
-    for new_text in streamer:
-        buffer += new_text
-        buffer = buffer.replace("<|im_end|>", "")
-        time.sleep(0.01)
+    for chunk in streamer:
+        buffer += chunk.replace("<|im_end|>", "")
         yield buffer, buffer
 
-# Define examples for image and video inference
+# -------------------------------------------------------------------
+# Examples, CSS, and launch
+# -------------------------------------------------------------------
 image_examples = [
     ["Convert this page to doc [text] precisely.", "images/3.png"],
     ["Convert this page to doc [text] precisely.", "images/4.png"],
@@ -224,7 +251,6 @@ video_examples = [
     ["Explain the ad in detail.", "videos/1.mp4"]
 ]
 
-# Added CSS to style the output area as a "Canvas"
 css = """
 .submit-btn {
     background-color: #2980b9 !important;
@@ -240,54 +266,47 @@ css = """
 }
 """
 
-# Create the Gradio Interface
 with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     gr.Markdown("# **[Doc VLMs OCR](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
     with gr.Row():
         with gr.Column():
             with gr.Tabs():
                 with gr.TabItem("Image Inference"):
                     image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
                     image_upload = gr.Image(type="pil", label="Image")
                     image_submit = gr.Button("Submit", elem_classes="submit-btn")
-                    gr.Examples(
-                        examples=image_examples,
-                        inputs=[image_query, image_upload]
-                    )
+                    gr.Examples(examples=image_examples, inputs=[image_query, image_upload])
                 with gr.TabItem("Video Inference"):
                     video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
                     video_upload = gr.Video(label="Video")
                     video_submit = gr.Button("Submit", elem_classes="submit-btn")
-                    gr.Examples(
-                        examples=video_examples,
-                        inputs=[video_query, video_upload]
-                    )
+                    gr.Examples(examples=video_examples, inputs=[video_query, video_upload])
+
             with gr.Accordion("Advanced options", open=False):
                 max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
                 temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
                 top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
                 top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
                 repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
 
-        with gr.Column():
-            with gr.Column(elem_classes="canvas-output"):
-                gr.Markdown("## Result Canvas")
-                output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=2)
-                markdown_output = gr.Markdown(label="Formatted Result (Result.Md)")
+        with gr.Column(elem_classes="canvas-output"):
+            gr.Markdown("## Result Canvas")
+            output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=2)
+            markdown_output = gr.Markdown(label="Formatted Result (Result.Md)")
 
             model_choice = gr.Radio(
                 choices=["DREX-062225-7B-exp", "olmOCR-7B-0225-preview", "VIREX-062225-7B-exp", "Typhoon-OCR-3B"],
                 label="Select Model",
                 value="DREX-062225-7B-exp"
             )
 
             gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Doc-VLMs/discussions)")
             gr.Markdown("> [DREX-062225-7B-exp](https://huggingface.co/prithivMLmods/DREX-062225-exp): the drex-062225-exp (document retrieval and extraction expert) model is a specialized fine-tuned version of docscopeocr-7b-050425-exp, optimized for document retrieval, content extraction, and analysis recognition. built on top of the qwen2.5-vl architecture.")
             gr.Markdown("> [VIREX-062225-7B-exp](https://huggingface.co/prithivMLmods/VIREX-062225-exp): the virex-062225-exp (video information retrieval and extraction expert - experimental) model is a fine-tuned version of qwen2.5-vl-7b-instruct, specifically optimized for advanced video understanding, image comprehension, sense of reasoning, and natural language decision-making through cot reasoning.")
             gr.Markdown("> [Typhoon-OCR-3B](https://huggingface.co/scb10x/typhoon-ocr-3b): a bilingual document parsing model built specifically for real-world documents in thai and english, inspired by models like olmocr, based on qwen2.5-vl-instruction. this model is intended to be used with a specific prompt only.")
             gr.Markdown("> [olmOCR-7B-0225](https://huggingface.co/allenai/olmOCR-7B-0225-preview): the olmocr-7b-0225-preview model is based on qwen2-vl-7b, optimized for document-level optical character recognition (ocr), long-context vision-language understanding, and accurate image-to-text conversion with mathematical latex formatting. designed with a focus on high-fidelity visual-textual comprehension.")
-            gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
+            gr.Markdown("> ⚠️ note: video inference may be less reliable.")
 
     image_submit.click(
         fn=generate_image,
         inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
 