Update app.py
app.py
CHANGED
@@ -1,31 +1,32 @@
+import gradio as gr
 import torch
 from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
 from qwen_vl_utils import process_vision_info
 from PIL import Image
 import cv2
 import numpy as np
-import gradio as gr
 import spaces
 
 # Load the model and processor
+@spaces.GPU
 def load_model():
     model = Qwen2VLForConditionalGeneration.from_pretrained(
         "Qwen/Qwen2-VL-2B-Instruct",
         torch_dtype=torch.float16
-    )
+    ).to("cuda")
     processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
     return model, processor
 
 model, processor = load_model()
 
 @spaces.GPU
-def process_image(image):
+def process_image(image, prompt):
     messages = [
         {
             "role": "user",
             "content": [
                 {"type": "image", "image": image},
-                {"type": "text", "text": …
+                {"type": "text", "text": prompt},
             ],
         }
     ]
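Note on the loading change: `@spaces.GPU` on `load_model` plus the eager `.to("cuda")` both run once at import time. On ZeroGPU Spaces the more common convention is to move the model to CUDA in the global scope (recent versions of the `spaces` package intercept that call) and reserve `@spaces.GPU` for the request-time inference functions. A minimal sketch of that convention, under those assumptions:

```python
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor

# Sketch of the usual ZeroGPU loading pattern: load once at module import.
# No @spaces.GPU needed here; only inference functions take the decorator.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    torch_dtype=torch.float16,
).to("cuda")
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
```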
@@ -40,7 +41,7 @@ def process_image(image):
         padding=True,
         return_tensors="pt",
     ).to("cuda")
-
+
     with torch.no_grad():
         generated_ids = model.generate(**inputs, max_new_tokens=256)
         generated_ids_trimmed = [
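The diff elides the middle of `process_image` (old lines 32-39 and 47-52): the preprocessing between the `messages` list and `padding=True,`, and the decoding after `generated_ids_trimmed = [`. The visible fragments line up with the standard Qwen2-VL inference recipe, so the hidden lines are presumably something like the following (a presumption from the recipe, not text shown in this commit):

```python
# Build the chat-formatted prompt and pack vision inputs the standard way.
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
).to("cuda")

with torch.no_grad():
    generated_ids = model.generate(**inputs, max_new_tokens=256)
# Strip the prompt tokens from each sequence, then decode only the reply.
generated_ids_trimmed = [
    out_ids[len(in_ids):]
    for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)
```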
@@ -53,7 +54,7 @@ def process_image(image):
     return output_text[0]
 
 @spaces.GPU
-def process_video(video_path, max_frames=16, frame_interval=30, max_resolution=224):
+def process_video(video_path, prompt, max_frames=16, frame_interval=30, max_resolution=224):
     cap = cv2.VideoCapture(video_path)
     frames = []
     frame_count = 0
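The sampling loop itself (old lines 60-82) is not shown in the diff, but the three parameters describe it well enough to sketch its plausible shape: keep every `frame_interval`-th frame, downscale so the long side is at most `max_resolution`, and stop after `max_frames`. The details below (resize math, BGR-to-RGB conversion, PIL frames for the processor) are inferred, not taken from the commit:

```python
# Plausible shape of the elided sampling loop.
while cap.isOpened() and len(frames) < max_frames:
    ret, frame = cap.read()
    if not ret:
        break
    if frame_count % frame_interval == 0:
        h, w = frame.shape[:2]
        scale = max_resolution / max(h, w)
        if scale < 1:
            frame = cv2.resize(frame, (int(w * scale), int(h * scale)))
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # OpenCV reads BGR
        frames.append(Image.fromarray(frame))
    frame_count += 1
cap.release()
```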
@@ -83,7 +84,7 @@ def process_video(video_path, max_frames=16, frame_interval=30, max_resolution=224):
             "role": "user",
             "content": [
                 {"type": "video", "video": frames},
-                {"type": "text", "text": …
+                {"type": "text", "text": prompt},
             ],
         }
     ]
@@ -99,6 +100,9 @@ def process_video(video_path, max_frames=16, frame_interval=30, max_resolution=224):
         return_tensors="pt",
     ).to("cuda")
 
+    del frames, image_inputs, video_inputs
+    torch.cuda.empty_cache()
+
     with torch.no_grad():
         generated_ids = model.generate(**inputs, max_new_tokens=256)
         generated_ids_trimmed = [
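The cleanup pair is the substantive change in this hunk. By this point `.to("cuda")` has already copied the batch to the GPU, so the CPU-side `frames`, `image_inputs`, and `video_inputs` are dead weight: `del` drops those references, and `torch.cuda.empty_cache()` returns the allocator's cached but unused GPU blocks to the driver. On the small, time-sliced GPUs that ZeroGPU hands out, reclaiming that headroom before `generate` is presumably meant to head off out-of-memory failures on longer videos.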
@@ -111,25 +115,26 @@ def process_video(video_path, max_frames=16, frame_interval=30, max_resolution=224):
     return output_text[0]
 
 @spaces.GPU
-def process_content(content):
+def process_content(content, prompt):
     if content is None:
         return "Please upload an image or video file."
 
-    if content …
-        return process_image(…
+    if isinstance(content, Image.Image):
+        return process_image(content, prompt)
     elif content.name.lower().endswith(('.mp4', '.avi', '.mov')):
-        return process_video(content.name)
+        return process_video(content.name, prompt)
     else:
         return "Unsupported file type. Please provide an image or video file."
 
-# Gradio interface
 iface = gr.Interface(
     fn=process_content,
-    inputs=…
+    inputs=[
+        gr.File(label="Upload Image or Video", type="filepath"),
+        gr.Textbox(label="Enter your prompt or task description")
+    ],
     outputs="text",
-    title="Image and Video Description",
-    description="Upload an image or video to get a …
+    title="Image and Video Description with Custom Prompt",
+    description="Upload an image or video and specify a task to get a response.",
 )
 
-
-iface.launch()
+iface.launch()
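One wrinkle worth flagging in the new `process_content`: with `gr.File(..., type="filepath")`, recent Gradio versions pass the callback a plain `str` path rather than a file wrapper or a `PIL.Image`. If that holds here, the `isinstance(content, Image.Image)` branch can never fire, and `content.name` raises `AttributeError` on a string. A defensive sketch that dispatches purely on the file extension (decorators omitted; `process_image` and `process_video` are the functions from this app; the image extension list is illustrative, the video list is the one from the diff):

```python
import os
from PIL import Image

IMAGE_EXTS = (".png", ".jpg", ".jpeg", ".bmp", ".webp")  # illustrative
VIDEO_EXTS = (".mp4", ".avi", ".mov")                    # from the diff

def process_content(content, prompt):
    # gr.File(type="filepath") hands the callback a plain path string.
    if content is None:
        return "Please upload an image or video file."
    path = content if isinstance(content, str) else content.name
    ext = os.path.splitext(path)[1].lower()
    if ext in IMAGE_EXTS:
        return process_image(Image.open(path), prompt)
    elif ext in VIDEO_EXTS:
        return process_video(path, prompt)
    return "Unsupported file type. Please provide an image or video file."
```

Passing `Image.open(path)` keeps `process_image` receiving the `PIL.Image` its `messages` payload expects.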
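For anyone reproducing the Space: the imports imply a `requirements.txt` along the lines of torch, transformers, qwen-vl-utils (the PyPI name behind `qwen_vl_utils`), Pillow, opencv-python, numpy, and gradio, with the `spaces` package provided by the ZeroGPU runtime; the commit does not show pinned versions. A hypothetical local smoke test, bypassing the Gradio UI (assumes the extension-based `process_content` sketched above and an illustrative local clip at `sample.mp4`):

```python
if __name__ == "__main__":
    # Exercise the full video path end to end with a custom prompt.
    print(process_content("sample.mp4", "Summarize what happens in this clip."))
```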