Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -97,11 +97,11 @@ def generate_image(model_name: str, text: str, image: Image.Image,
|
|
97 |
processor = processor_o
|
98 |
model = model_o
|
99 |
else:
|
100 |
-
yield "Invalid model selected."
|
101 |
return
|
102 |
|
103 |
if image is None:
|
104 |
-
yield "Please upload an image."
|
105 |
return
|
106 |
|
107 |
messages = [{
|
@@ -127,9 +127,8 @@ def generate_image(model_name: str, text: str, image: Image.Image,
|
|
127 |
buffer = ""
|
128 |
for new_text in streamer:
|
129 |
buffer += new_text
|
130 |
-
#buffer = buffer.replace("<|im_end|>", "")
|
131 |
time.sleep(0.01)
|
132 |
-
yield buffer
|
133 |
|
134 |
@spaces.GPU
|
135 |
def generate_video(model_name: str, text: str, video_path: str,
|
@@ -151,11 +150,11 @@ def generate_video(model_name: str, text: str, video_path: str,
|
|
151 |
processor = processor_o
|
152 |
model = model_o
|
153 |
else:
|
154 |
-
yield "Invalid model selected."
|
155 |
return
|
156 |
|
157 |
if video_path is None:
|
158 |
-
yield "Please upload a video."
|
159 |
return
|
160 |
|
161 |
frames = downsample_video(video_path)
|
@@ -194,7 +193,7 @@ def generate_video(model_name: str, text: str, video_path: str,
|
|
194 |
buffer += new_text
|
195 |
buffer = buffer.replace("<|im_end|>", "")
|
196 |
time.sleep(0.01)
|
197 |
-
yield buffer
|
198 |
|
199 |
# Define examples for image and video inference
|
200 |
image_examples = [
|
@@ -202,7 +201,6 @@ image_examples = [
|
|
202 |
["Convert this page to doc [text] precisely.", "images/4.png"],
|
203 |
["Convert this page to doc [text] precisely.", "images/1.png"],
|
204 |
["Convert chart to OTSL.", "images/2.png"]
|
205 |
-
|
206 |
]
|
207 |
|
208 |
video_examples = [
|
@@ -250,6 +248,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
|
|
250 |
repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
|
251 |
with gr.Column():
|
252 |
output = gr.Textbox(label="Output", interactive=False, lines=2, scale=2)
|
|
|
253 |
model_choice = gr.Radio(
|
254 |
choices=["DREX-062225-exp", "VIREX-062225-exp", "olmOCR-7B-0225"],
|
255 |
label="Select Model",
|
@@ -259,17 +258,17 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
|
|
259 |
gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Doc-VLMs/discussions)")
|
260 |
gr.Markdown("> [DREX-062225-exp](https://huggingface.co/prithivMLmods/DREX-062225-exp): the drex-062225-exp (document retrieval and extraction expert) model is a specialized fine-tuned version of docscopeocr-7b-050425-exp, optimized for document retrieval, content extraction, and analysis recognition. built on top of the qwen2.5-vl architecture.")
|
261 |
gr.Markdown("> [VIREX-062225-exp](https://huggingface.co/prithivMLmods/VIREX-062225-exp): the virex-062225-exp (video information retrieval and extraction expert - experimental) model is a fine-tuned version of qwen2.5-vl-7b-instruct, specifically optimized for advanced video understanding, image comprehension, sense of reasoning, and natural language decision-making through cot reasoning.")
|
262 |
-
gr.Markdown("> [olmOCR-7B-0225](https://huggingface.co/allenai/olmOCR-7B-0225-preview): the olmocr-7b-0225-preview model is based on qwen2-vl-7b, optimized for document-level optical character recognition (ocr), long-context vision-language understanding, and accurate image-to-text conversion with mathematical latex formatting. designed with a focus on high-fidelity visual-textual comprehension.")
|
263 |
|
264 |
image_submit.click(
|
265 |
fn=generate_image,
|
266 |
inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
|
267 |
-
outputs=output
|
268 |
)
|
269 |
video_submit.click(
|
270 |
fn=generate_video,
|
271 |
inputs=[model_choice, video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
|
272 |
-
outputs=output
|
273 |
)
|
274 |
|
275 |
if __name__ == "__main__":
|
|
|
97 |
processor = processor_o
|
98 |
model = model_o
|
99 |
else:
|
100 |
+
yield "Invalid model selected.", "Invalid model selected."
|
101 |
return
|
102 |
|
103 |
if image is None:
|
104 |
+
yield "Please upload an image.", "Please upload an image."
|
105 |
return
|
106 |
|
107 |
messages = [{
|
|
|
127 |
buffer = ""
|
128 |
for new_text in streamer:
|
129 |
buffer += new_text
|
|
|
130 |
time.sleep(0.01)
|
131 |
+
yield buffer, buffer
|
132 |
|
133 |
@spaces.GPU
|
134 |
def generate_video(model_name: str, text: str, video_path: str,
|
|
|
150 |
processor = processor_o
|
151 |
model = model_o
|
152 |
else:
|
153 |
+
yield "Invalid model selected.", "Invalid model selected."
|
154 |
return
|
155 |
|
156 |
if video_path is None:
|
157 |
+
yield "Please upload a video.", "Please upload a video."
|
158 |
return
|
159 |
|
160 |
frames = downsample_video(video_path)
|
|
|
193 |
buffer += new_text
|
194 |
buffer = buffer.replace("<|im_end|>", "")
|
195 |
time.sleep(0.01)
|
196 |
+
yield buffer, buffer
|
197 |
|
198 |
# Define examples for image and video inference
|
199 |
image_examples = [
|
|
|
201 |
["Convert this page to doc [text] precisely.", "images/4.png"],
|
202 |
["Convert this page to doc [text] precisely.", "images/1.png"],
|
203 |
["Convert chart to OTSL.", "images/2.png"]
|
|
|
204 |
]
|
205 |
|
206 |
video_examples = [
|
|
|
248 |
repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
|
249 |
with gr.Column():
|
250 |
output = gr.Textbox(label="Output", interactive=False, lines=2, scale=2)
|
251 |
+
markdown_output = gr.Markdown(label="Result.Md", scale=2)
|
252 |
model_choice = gr.Radio(
|
253 |
choices=["DREX-062225-exp", "VIREX-062225-exp", "olmOCR-7B-0225"],
|
254 |
label="Select Model",
|
|
|
258 |
gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Doc-VLMs/discussions)")
|
259 |
gr.Markdown("> [DREX-062225-exp](https://huggingface.co/prithivMLmods/DREX-062225-exp): the drex-062225-exp (document retrieval and extraction expert) model is a specialized fine-tuned version of docscopeocr-7b-050425-exp, optimized for document retrieval, content extraction, and analysis recognition. built on top of the qwen2.5-vl architecture.")
|
260 |
gr.Markdown("> [VIREX-062225-exp](https://huggingface.co/prithivMLmods/VIREX-062225-exp): the virex-062225-exp (video information retrieval and extraction expert - experimental) model is a fine-tuned version of qwen2.5-vl-7b-instruct, specifically optimized for advanced video understanding, image comprehension, sense of reasoning, and natural language decision-making through cot reasoning.")
|
261 |
+
gr.Markdown("> [olmOCR-7B-0225](https://huggingface.co/allenai/olmOCR-7B-0225-preview): the olmocr-7b-0225-preview model is based on qwen2-vl-7b, optimized for document-level optical character recognition (ocr), long-context vision-language understanding, and accurate image-to-text conversion with mathematical latex formatting. designed with a focus on high-fidelity visual-textual comprehension.")
|
262 |
|
263 |
image_submit.click(
|
264 |
fn=generate_image,
|
265 |
inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
|
266 |
+
outputs=[output, markdown_output]
|
267 |
)
|
268 |
video_submit.click(
|
269 |
fn=generate_video,
|
270 |
inputs=[model_choice, video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
|
271 |
+
outputs=[output, markdown_output]
|
272 |
)
|
273 |
|
274 |
if __name__ == "__main__":
|