Update app.py
Browse files
app.py
CHANGED
@@ -1,19 +1,18 @@
|
|
1 |
-
import gradio as gr
|
2 |
import torch
|
3 |
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
|
4 |
from qwen_vl_utils import process_vision_info
|
5 |
from PIL import Image
|
6 |
import cv2
|
7 |
import numpy as np
|
|
|
8 |
import spaces
|
9 |
|
10 |
# Load the model and processor
|
11 |
-
@spaces.GPU
|
12 |
def load_model():
|
13 |
model = Qwen2VLForConditionalGeneration.from_pretrained(
|
14 |
"Qwen/Qwen2-VL-2B-Instruct",
|
15 |
torch_dtype=torch.float16
|
16 |
-
)
|
17 |
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
|
18 |
return model, processor
|
19 |
|
@@ -41,7 +40,8 @@ def process_image(image, prompt):
|
|
41 |
padding=True,
|
42 |
return_tensors="pt",
|
43 |
).to("cuda")
|
44 |
-
|
|
|
45 |
with torch.no_grad():
|
46 |
generated_ids = model.generate(**inputs, max_new_tokens=256)
|
47 |
generated_ids_trimmed = [
|
@@ -54,8 +54,8 @@ def process_image(image, prompt):
|
|
54 |
return output_text[0]
|
55 |
|
56 |
@spaces.GPU
|
57 |
-
def process_video(
|
58 |
-
cap = cv2.VideoCapture(
|
59 |
frames = []
|
60 |
frame_count = 0
|
61 |
|
@@ -100,9 +100,7 @@ def process_video(video_path, prompt, max_frames=16, frame_interval=30, max_reso
|
|
100 |
return_tensors="pt",
|
101 |
).to("cuda")
|
102 |
|
103 |
-
|
104 |
-
torch.cuda.empty_cache()
|
105 |
-
|
106 |
with torch.no_grad():
|
107 |
generated_ids = model.generate(**inputs, max_new_tokens=256)
|
108 |
generated_ids_trimmed = [
|
@@ -119,22 +117,24 @@ def process_content(content, prompt):
|
|
119 |
if content is None:
|
120 |
return "Please upload an image or video file."
|
121 |
|
122 |
-
if
|
123 |
-
return process_image(content, prompt)
|
124 |
elif content.name.lower().endswith(('.mp4', '.avi', '.mov')):
|
125 |
-
return process_video(content
|
126 |
else:
|
127 |
return "Unsupported file type. Please provide an image or video file."
|
128 |
|
|
|
129 |
iface = gr.Interface(
|
130 |
fn=process_content,
|
131 |
inputs=[
|
132 |
-
gr.File(label="Upload Image or Video"
|
133 |
-
gr.Textbox(label="Enter your prompt
|
134 |
],
|
135 |
outputs="text",
|
136 |
-
title="Image and Video Description
|
137 |
-
description="Upload an image or video and
|
138 |
)
|
139 |
|
140 |
-
|
|
|
|
|
|
1 |
import torch
|
2 |
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
|
3 |
from qwen_vl_utils import process_vision_info
|
4 |
from PIL import Image
|
5 |
import cv2
|
6 |
import numpy as np
|
7 |
+
import gradio as gr
|
8 |
import spaces
|
9 |
|
10 |
# Load the model and processor
|
|
|
11 |
def load_model():
    """Load the Qwen2-VL-2B-Instruct model together with its processor.

    The model weights are requested in half precision (``torch.float16``).
    This function does not move the model to any device itself.

    Returns:
        A ``(model, processor)`` tuple.
    """
    # Both artifacts come from the same checkpoint, so name it once.
    checkpoint = "Qwen/Qwen2-VL-2B-Instruct"

    vl_model = Qwen2VLForConditionalGeneration.from_pretrained(
        checkpoint,
        torch_dtype=torch.float16,
    )
    vl_processor = AutoProcessor.from_pretrained(checkpoint)
    return vl_model, vl_processor
|
18 |
|
|
|
40 |
padding=True,
|
41 |
return_tensors="pt",
|
42 |
).to("cuda")
|
43 |
+
|
44 |
+
model.to("cuda")
|
45 |
with torch.no_grad():
|
46 |
generated_ids = model.generate(**inputs, max_new_tokens=256)
|
47 |
generated_ids_trimmed = [
|
|
|
54 |
return output_text[0]
|
55 |
|
56 |
@spaces.GPU
|
57 |
+
def process_video(video, prompt, max_frames=16, frame_interval=30, max_resolution=224):
|
58 |
+
cap = cv2.VideoCapture(video.name)
|
59 |
frames = []
|
60 |
frame_count = 0
|
61 |
|
|
|
100 |
return_tensors="pt",
|
101 |
).to("cuda")
|
102 |
|
103 |
+
model.to("cuda")
|
|
|
|
|
104 |
with torch.no_grad():
|
105 |
generated_ids = model.generate(**inputs, max_new_tokens=256)
|
106 |
generated_ids_trimmed = [
|
|
|
117 |
if content is None:
|
118 |
return "Please upload an image or video file."
|
119 |
|
120 |
+
if content.name.lower().endswith(('.png', '.jpg', '.jpeg')):
|
121 |
+
return process_image(Image.open(content.name), prompt)
|
122 |
elif content.name.lower().endswith(('.mp4', '.avi', '.mov')):
|
123 |
+
return process_video(content, prompt)
|
124 |
else:
|
125 |
return "Unsupported file type. Please provide an image or video file."
|
126 |
|
127 |
+
# Gradio interface
|
128 |
# Build the Gradio UI: a file upload and a free-text prompt are passed to
# process_content, and its return value is shown as plain text.
iface = gr.Interface(
    title="Image and Video Description",
    description="Upload an image or video and enter a prompt to get a description or analysis.",
    fn=process_content,
    inputs=[
        gr.File(label="Upload Image or Video"),
        gr.Textbox(label="Enter your prompt"),
    ],
    outputs="text",
)


# Launch the app only when this file is run as a script, not on import.
if __name__ == "__main__":
    iface.launch()
|