Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -5,7 +5,6 @@ from threading import Thread
|
|
5 |
import time
|
6 |
import torch
|
7 |
import spaces
|
8 |
-
from qwen_vl_utils import process_vision_info
|
9 |
|
10 |
# Fine-tuned for OCR-based tasks from Qwen's [ Qwen/Qwen2-VL-2B-Instruct ]
|
11 |
MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
|
@@ -56,18 +55,14 @@ def model_inference(input_dict, history):
|
|
56 |
}
|
57 |
]
|
58 |
|
59 |
-
# Process vision info (images and videos)
|
60 |
-
image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True)
|
61 |
-
|
62 |
# Apply chat template and process inputs
|
63 |
prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
64 |
inputs = processor(
|
65 |
text=[prompt],
|
66 |
-
images=
|
67 |
-
videos=
|
68 |
-
padding=True,
|
69 |
return_tensors="pt",
|
70 |
-
|
71 |
).to("cuda")
|
72 |
|
73 |
# Set up streamer for real-time output
|
@@ -90,7 +85,6 @@ def model_inference(input_dict, history):
|
|
90 |
|
91 |
# Example inputs
|
92 |
examples = [
|
93 |
-
[{"text": "Describe the video.", "files": ["examples/demo.mp4"]}],
|
94 |
[{"text": "Extract JSON from the image", "files": ["example_images/document.jpg"]}],
|
95 |
[{"text": "summarize the letter", "files": ["examples/1.png"]}],
|
96 |
[{"text": "Describe the photo", "files": ["examples/3.png"]}],
|
@@ -101,7 +95,7 @@ examples = [
|
|
101 |
[{"text": "Can you describe this image?", "files": ["example_images/newyork.jpg"]}],
|
102 |
[{"text": "Can you describe this image?", "files": ["example_images/dogs.jpg"]}],
|
103 |
[{"text": "Where do the severe droughts happen according to this diagram?", "files": ["example_images/examples_weather_events.png"]}],
|
104 |
-
|
105 |
]
|
106 |
|
107 |
demo = gr.ChatInterface(
|
@@ -114,4 +108,4 @@ demo = gr.ChatInterface(
|
|
114 |
cache_examples=False,
|
115 |
)
|
116 |
|
117 |
-
demo.launch(debug=True)
|
|
|
5 |
import time
|
6 |
import torch
|
7 |
import spaces
|
|
|
8 |
|
9 |
# Fine-tuned for OCR-based tasks from Qwen's [ Qwen/Qwen2-VL-2B-Instruct ]
|
10 |
MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
|
|
|
55 |
}
|
56 |
]
|
57 |
|
|
|
|
|
|
|
58 |
# Apply chat template and process inputs
|
59 |
prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
60 |
inputs = processor(
|
61 |
text=[prompt],
|
62 |
+
images=images if images else None,
|
63 |
+
videos=videos if videos else None,
|
|
|
64 |
return_tensors="pt",
|
65 |
+
padding=True,
|
66 |
).to("cuda")
|
67 |
|
68 |
# Set up streamer for real-time output
|
|
|
85 |
|
86 |
# Example inputs
|
87 |
examples = [
|
|
|
88 |
[{"text": "Extract JSON from the image", "files": ["example_images/document.jpg"]}],
|
89 |
[{"text": "summarize the letter", "files": ["examples/1.png"]}],
|
90 |
[{"text": "Describe the photo", "files": ["examples/3.png"]}],
|
|
|
95 |
[{"text": "Can you describe this image?", "files": ["example_images/newyork.jpg"]}],
|
96 |
[{"text": "Can you describe this image?", "files": ["example_images/dogs.jpg"]}],
|
97 |
[{"text": "Where do the severe droughts happen according to this diagram?", "files": ["example_images/examples_weather_events.png"]}],
|
98 |
+
[{"text": "Describe the video.", "files": ["example_videos/sample.mp4"]}],
|
99 |
]
|
100 |
|
101 |
demo = gr.ChatInterface(
|
|
|
108 |
cache_examples=False,
|
109 |
)
|
110 |
|
111 |
+
demo.launch(debug=True, share=True)
|