prithivMLmods committed on
Commit
cf972b9
·
verified ·
1 Parent(s): 481fc63

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -11
app.py CHANGED
@@ -97,11 +97,11 @@ def generate_image(model_name: str, text: str, image: Image.Image,
97
  processor = processor_o
98
  model = model_o
99
  else:
100
- yield "Invalid model selected."
101
  return
102
 
103
  if image is None:
104
- yield "Please upload an image."
105
  return
106
 
107
  messages = [{
@@ -127,9 +127,8 @@ def generate_image(model_name: str, text: str, image: Image.Image,
127
  buffer = ""
128
  for new_text in streamer:
129
  buffer += new_text
130
- #buffer = buffer.replace("<|im_end|>", "")
131
  time.sleep(0.01)
132
- yield buffer
133
 
134
  @spaces.GPU
135
  def generate_video(model_name: str, text: str, video_path: str,
@@ -151,11 +150,11 @@ def generate_video(model_name: str, text: str, video_path: str,
151
  processor = processor_o
152
  model = model_o
153
  else:
154
- yield "Invalid model selected."
155
  return
156
 
157
  if video_path is None:
158
- yield "Please upload a video."
159
  return
160
 
161
  frames = downsample_video(video_path)
@@ -194,7 +193,7 @@ def generate_video(model_name: str, text: str, video_path: str,
194
  buffer += new_text
195
  buffer = buffer.replace("<|im_end|>", "")
196
  time.sleep(0.01)
197
- yield buffer
198
 
199
  # Define examples for image and video inference
200
  image_examples = [
@@ -202,7 +201,6 @@ image_examples = [
202
  ["Convert this page to doc [text] precisely.", "images/4.png"],
203
  ["Convert this page to doc [text] precisely.", "images/1.png"],
204
  ["Convert chart to OTSL.", "images/2.png"]
205
-
206
  ]
207
 
208
  video_examples = [
@@ -250,6 +248,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
250
  repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
251
  with gr.Column():
252
  output = gr.Textbox(label="Output", interactive=False, lines=2, scale=2)
 
253
  model_choice = gr.Radio(
254
  choices=["DREX-062225-exp", "VIREX-062225-exp", "olmOCR-7B-0225"],
255
  label="Select Model",
@@ -259,17 +258,17 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
259
  gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Doc-VLMs/discussions)")
260
  gr.Markdown("> [DREX-062225-exp](https://huggingface.co/prithivMLmods/DREX-062225-exp): the drex-062225-exp (document retrieval and extraction expert) model is a specialized fine-tuned version of docscopeocr-7b-050425-exp, optimized for document retrieval, content extraction, and analysis recognition. built on top of the qwen2.5-vl architecture.")
261
  gr.Markdown("> [VIREX-062225-exp](https://huggingface.co/prithivMLmods/VIREX-062225-exp): the virex-062225-exp (video information retrieval and extraction expert - experimental) model is a fine-tuned version of qwen2.5-vl-7b-instruct, specifically optimized for advanced video understanding, image comprehension, sense of reasoning, and natural language decision-making through cot reasoning.")
262
- gr.Markdown("> [olmOCR-7B-0225](https://huggingface.co/allenai/olmOCR-7B-0225-preview): the olmocr-7b-0225-preview model is based on qwen2-vl-7b, optimized for document-level optical character recognition (ocr), long-context vision-language understanding, and accurate image-to-text conversion with mathematical latex formatting. designed with a focus on high-fidelity visual-textual comprehension.")
263
 
264
  image_submit.click(
265
  fn=generate_image,
266
  inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
267
- outputs=output
268
  )
269
  video_submit.click(
270
  fn=generate_video,
271
  inputs=[model_choice, video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
272
- outputs=output
273
  )
274
 
275
  if __name__ == "__main__":
 
97
  processor = processor_o
98
  model = model_o
99
  else:
100
+ yield "Invalid model selected.", "Invalid model selected."
101
  return
102
 
103
  if image is None:
104
+ yield "Please upload an image.", "Please upload an image."
105
  return
106
 
107
  messages = [{
 
127
  buffer = ""
128
  for new_text in streamer:
129
  buffer += new_text
 
130
  time.sleep(0.01)
131
+ yield buffer, buffer
132
 
133
  @spaces.GPU
134
  def generate_video(model_name: str, text: str, video_path: str,
 
150
  processor = processor_o
151
  model = model_o
152
  else:
153
+ yield "Invalid model selected.", "Invalid model selected."
154
  return
155
 
156
  if video_path is None:
157
+ yield "Please upload a video.", "Please upload a video."
158
  return
159
 
160
  frames = downsample_video(video_path)
 
193
  buffer += new_text
194
  buffer = buffer.replace("<|im_end|>", "")
195
  time.sleep(0.01)
196
+ yield buffer, buffer
197
 
198
  # Define examples for image and video inference
199
  image_examples = [
 
201
  ["Convert this page to doc [text] precisely.", "images/4.png"],
202
  ["Convert this page to doc [text] precisely.", "images/1.png"],
203
  ["Convert chart to OTSL.", "images/2.png"]
 
204
  ]
205
 
206
  video_examples = [
 
248
  repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
249
  with gr.Column():
250
  output = gr.Textbox(label="Output", interactive=False, lines=2, scale=2)
251
+ markdown_output = gr.Markdown(label="Result.Md", scale=2)
252
  model_choice = gr.Radio(
253
  choices=["DREX-062225-exp", "VIREX-062225-exp", "olmOCR-7B-0225"],
254
  label="Select Model",
 
258
  gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Doc-VLMs/discussions)")
259
  gr.Markdown("> [DREX-062225-exp](https://huggingface.co/prithivMLmods/DREX-062225-exp): the drex-062225-exp (document retrieval and extraction expert) model is a specialized fine-tuned version of docscopeocr-7b-050425-exp, optimized for document retrieval, content extraction, and analysis recognition. built on top of the qwen2.5-vl architecture.")
260
  gr.Markdown("> [VIREX-062225-exp](https://huggingface.co/prithivMLmods/VIREX-062225-exp): the virex-062225-exp (video information retrieval and extraction expert - experimental) model is a fine-tuned version of qwen2.5-vl-7b-instruct, specifically optimized for advanced video understanding, image comprehension, sense of reasoning, and natural language decision-making through cot reasoning.")
261
+ gr.Markdown("> [olmOCR-7B-0225](https://huggingface.co/allenai/olmOCR-7B-0225-preview): the olmocr-7b-0225-preview model is based on qwen2-vl-7b, optimized for document-level optical character recognition (ocr), long-context vision-language understanding, and accurate image-to-text conversion with mathematical latex formatting. designed with a focus on high-fidelity visual-textual comprehension.")
262
 
263
  image_submit.click(
264
  fn=generate_image,
265
  inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
266
+ outputs=[output, markdown_output]
267
  )
268
  video_submit.click(
269
  fn=generate_video,
270
  inputs=[model_choice, video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
271
+ outputs=[output, markdown_output]
272
  )
273
 
274
  if __name__ == "__main__":