Spaces:

whyumesh
/

vision_v1

Sleeping

App Files Files Community

whyumesh committed on Oct 2, 2024

Commit

c28fb09

verified ·

1 Parent(s): 07de076

Update app.py

Browse files

Files changed (1) hide show

app.py +17 -17

app.py CHANGED Viewed

@@ -1,19 +1,18 @@
-import gradio as gr
 import torch
 from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
 from qwen_vl_utils import process_vision_info
 from PIL import Image
 import cv2
 import numpy as np
 import spaces
 # Load the model and processor
-@spaces.GPU
 def load_model():
     model = Qwen2VLForConditionalGeneration.from_pretrained(
         "Qwen/Qwen2-VL-2B-Instruct",
         torch_dtype=torch.float16
-    ).to("cuda")
     processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
     return model, processor
@@ -41,7 +40,8 @@ def process_image(image, prompt):
         padding=True,
         return_tensors="pt",
     ).to("cuda")
     with torch.no_grad():
         generated_ids = model.generate(**inputs, max_new_tokens=256)
     generated_ids_trimmed = [
@@ -54,8 +54,8 @@ def process_image(image, prompt):
     return output_text[0]
 @spaces.GPU
-def process_video(video_path, prompt, max_frames=16, frame_interval=30, max_resolution=224):
-    cap = cv2.VideoCapture(video_path)
     frames = []
     frame_count = 0
@@ -100,9 +100,7 @@ def process_video(video_path, prompt, max_frames=16, frame_interval=30, max_reso
         return_tensors="pt",
     ).to("cuda")
-    del frames, image_inputs, video_inputs
-    torch.cuda.empty_cache()
     with torch.no_grad():
         generated_ids = model.generate(**inputs, max_new_tokens=256)
     generated_ids_trimmed = [
@@ -119,22 +117,24 @@ def process_content(content, prompt):
     if content is None:
         return "Please upload an image or video file."
-    if isinstance(content, Image.Image):
-        return process_image(content, prompt)
     elif content.name.lower().endswith(('.mp4', '.avi', '.mov')):
-        return process_video(content.name, prompt)
     else:
         return "Unsupported file type. Please provide an image or video file."
 iface = gr.Interface(
     fn=process_content,
     inputs=[
-        gr.File(label="Upload Image or Video", type="filepath"),
-        gr.Textbox(label="Enter your prompt or task description")
     ],
     outputs="text",
-    title="Image and Video Description with Custom Prompt",
-    description="Upload an image or video and specify a task to get a response.",
 )
-iface.launch()

 import torch
 from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
 from qwen_vl_utils import process_vision_info
 from PIL import Image
 import cv2
 import numpy as np
+import gradio as gr
 import spaces
 # Load the model and processor
 def load_model():
     model = Qwen2VLForConditionalGeneration.from_pretrained(
         "Qwen/Qwen2-VL-2B-Instruct",
         torch_dtype=torch.float16
+    )
     processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
     return model, processor
         padding=True,
         return_tensors="pt",
     ).to("cuda")
+    model.to("cuda")
     with torch.no_grad():
         generated_ids = model.generate(**inputs, max_new_tokens=256)
     generated_ids_trimmed = [
     return output_text[0]
 @spaces.GPU
+def process_video(video, prompt, max_frames=16, frame_interval=30, max_resolution=224):
+    cap = cv2.VideoCapture(video.name)
     frames = []
     frame_count = 0
         return_tensors="pt",
     ).to("cuda")
+    model.to("cuda")
     with torch.no_grad():
         generated_ids = model.generate(**inputs, max_new_tokens=256)
     generated_ids_trimmed = [
     if content is None:
         return "Please upload an image or video file."
+    if content.name.lower().endswith(('.png', '.jpg', '.jpeg')):
+        return process_image(Image.open(content.name), prompt)
     elif content.name.lower().endswith(('.mp4', '.avi', '.mov')):
+        return process_video(content, prompt)
     else:
         return "Unsupported file type. Please provide an image or video file."
+# Gradio interface
 iface = gr.Interface(
     fn=process_content,
     inputs=[
+        gr.File(label="Upload Image or Video"),
+        gr.Textbox(label="Enter your prompt")
     ],
     outputs="text",
+    title="Image and Video Description",
+    description="Upload an image or video and enter a prompt to get a description or analysis.",
 )
+if __name__ == "__main__":
+    iface.launch()