Update app.py
app.py
CHANGED
@@ -1,31 +1,32 @@
+import gradio as gr
 import torch
 from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
 from qwen_vl_utils import process_vision_info
 from PIL import Image
 import cv2
 import numpy as np
-import gradio as gr
 import spaces
 
 # Load the model and processor
+@spaces.GPU
 def load_model():
     model = Qwen2VLForConditionalGeneration.from_pretrained(
         "Qwen/Qwen2-VL-2B-Instruct",
         torch_dtype=torch.float16
-    )
+    ).to("cuda")
     processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
     return model, processor
 
 model, processor = load_model()
 
 @spaces.GPU
-def process_image(image):
+def process_image(image, prompt):
     messages = [
         {
             "role": "user",
             "content": [
                 {"type": "image", "image": image},
-                {"type": "text", "text": …
+                {"type": "text", "text": prompt},
             ],
         }
     ]
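Note on the loading change: `@spaces.GPU` on `load_model` plus the eager `.to("cuda")` both run once at import time. On ZeroGPU Spaces the more common convention is to move the model to CUDA in the global scope (recent versions of the `spaces` package intercept that call) and reserve `@spaces.GPU` for the request-time inference functions. A minimal sketch of that convention, under those assumptions:

```python
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor

# Sketch of the usual ZeroGPU loading pattern: load once at module import.
# No @spaces.GPU needed here; only inference functions take the decorator.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    torch_dtype=torch.float16,
).to("cuda")
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
```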
@@ -40,7 +41,7 @@ def process_image(image):
         padding=True,
         return_tensors="pt",
     ).to("cuda")
-
+
     with torch.no_grad():
         generated_ids = model.generate(**inputs, max_new_tokens=256)
         generated_ids_trimmed = [
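The diff elides the middle of `process_image` (old lines 32-39 and 47-52): the preprocessing between the `messages` list and `padding=True,`, and the decoding after `generated_ids_trimmed = [`. The visible fragments line up with the standard Qwen2-VL inference recipe, so the hidden lines are presumably something like the following (a presumption from the recipe, not text shown in this commit):

```python
# Build the chat-formatted prompt and pack vision inputs the standard way.
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
).to("cuda")

with torch.no_grad():
    generated_ids = model.generate(**inputs, max_new_tokens=256)
# Strip the prompt tokens from each sequence, then decode only the reply.
generated_ids_trimmed = [
    out_ids[len(in_ids):]
    for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)
```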
@@ -53,7 +54,7 @@ def process_image(image):
     return output_text[0]
 
 @spaces.GPU
-def process_video(video_path, max_frames=16, frame_interval=30, max_resolution=224):
+def process_video(video_path, prompt, max_frames=16, frame_interval=30, max_resolution=224):
     cap = cv2.VideoCapture(video_path)
     frames = []
     frame_count = 0
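The sampling loop itself (old lines 60-82) is not shown in the diff, but the three parameters describe it well enough to sketch its plausible shape: keep every `frame_interval`-th frame, downscale so the long side is at most `max_resolution`, and stop after `max_frames`. The details below (resize math, BGR-to-RGB conversion, PIL frames for the processor) are inferred, not taken from the commit:

```python
# Plausible shape of the elided sampling loop.
while cap.isOpened() and len(frames) < max_frames:
    ret, frame = cap.read()
    if not ret:
        break
    if frame_count % frame_interval == 0:
        h, w = frame.shape[:2]
        scale = max_resolution / max(h, w)
        if scale < 1:
            frame = cv2.resize(frame, (int(w * scale), int(h * scale)))
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # OpenCV reads BGR
        frames.append(Image.fromarray(frame))
    frame_count += 1
cap.release()
```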
@@ -83,7 +84,7 @@ def process_video(video_path, max_frames=16, frame_interval=30, max_resolution=224):
             "role": "user",
             "content": [
                 {"type": "video", "video": frames},
-                {"type": "text", "text": …
+                {"type": "text", "text": prompt},
             ],
         }
     ]
@@ -99,6 +100,9 @@ def process_video(video_path, max_frames=16, frame_interval=30, max_resolution=224):
         return_tensors="pt",
     ).to("cuda")
 
+    del frames, image_inputs, video_inputs
+    torch.cuda.empty_cache()
+
     with torch.no_grad():
         generated_ids = model.generate(**inputs, max_new_tokens=256)
         generated_ids_trimmed = [
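The cleanup pair is the substantive change in this hunk. By this point `.to("cuda")` has already copied the batch to the GPU, so the CPU-side `frames`, `image_inputs`, and `video_inputs` are dead weight: `del` drops those references, and `torch.cuda.empty_cache()` returns the allocator's cached but unused GPU blocks to the driver. On the small, time-sliced GPUs that ZeroGPU hands out, reclaiming that headroom before `generate` is presumably meant to head off out-of-memory failures on longer videos.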
@@ -111,25 +115,26 @@ def process_video(video_path, max_frames=16, frame_interval=30, max_resolution=224):
     return output_text[0]
 
 @spaces.GPU
-def process_content(content):
+def process_content(content, prompt):
     if content is None:
         return "Please upload an image or video file."
 
-    if content …
-        return process_image(…
+    if isinstance(content, Image.Image):
+        return process_image(content, prompt)
     elif content.name.lower().endswith(('.mp4', '.avi', '.mov')):
-        return process_video(content.name)
+        return process_video(content.name, prompt)
     else:
         return "Unsupported file type. Please provide an image or video file."
 
-# Gradio interface
 iface = gr.Interface(
     fn=process_content,
-    inputs=…
+    inputs=[
+        gr.File(label="Upload Image or Video", type="filepath"),
+        gr.Textbox(label="Enter your prompt or task description")
+    ],
     outputs="text",
-    title="Image and Video Description",
-    description="Upload an image or video to get a …
+    title="Image and Video Description with Custom Prompt",
+    description="Upload an image or video and specify a task to get a response.",
 )
 
-
-iface.launch()
+iface.launch()
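One wrinkle worth flagging in the new `process_content`: with `gr.File(..., type="filepath")`, recent Gradio versions pass the callback a plain `str` path rather than a file wrapper or a `PIL.Image`. If that holds here, the `isinstance(content, Image.Image)` branch can never fire, and `content.name` raises `AttributeError` on a string. A defensive sketch that dispatches purely on the file extension (decorators omitted; `process_image` and `process_video` are the functions from this app; the image extension list is illustrative, the video list is the one from the diff):

```python
import os
from PIL import Image

IMAGE_EXTS = (".png", ".jpg", ".jpeg", ".bmp", ".webp")  # illustrative
VIDEO_EXTS = (".mp4", ".avi", ".mov")                    # from the diff

def process_content(content, prompt):
    # gr.File(type="filepath") hands the callback a plain path string.
    if content is None:
        return "Please upload an image or video file."
    path = content if isinstance(content, str) else content.name
    ext = os.path.splitext(path)[1].lower()
    if ext in IMAGE_EXTS:
        return process_image(Image.open(path), prompt)
    elif ext in VIDEO_EXTS:
        return process_video(path, prompt)
    return "Unsupported file type. Please provide an image or video file."
```

Passing `Image.open(path)` keeps `process_image` receiving the `PIL.Image` its `messages` payload expects.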
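For anyone reproducing the Space: the imports imply a `requirements.txt` along the lines of torch, transformers, qwen-vl-utils (the PyPI name behind `qwen_vl_utils`), Pillow, opencv-python, numpy, and gradio, with the `spaces` package provided by the ZeroGPU runtime; the commit does not show pinned versions. A hypothetical local smoke test, bypassing the Gradio UI (assumes the extension-based `process_content` sketched above and an illustrative local clip at `sample.mp4`):

```python
if __name__ == "__main__":
    # Exercise the full video path end to end with a custom prompt.
    print(process_content("sample.mp4", "Summarize what happens in this clip."))
```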