whyumesh committed
Commit 48da597 · verified · 1 Parent(s): cb893dd

Update app.py

Files changed (1):
  1. app.py +108 -125
app.py CHANGED
@@ -5,147 +5,130 @@ from PIL import Image
 import cv2
 import numpy as np
 import gradio as gr
-
-# Check GPU availability
-if not torch.cuda.is_available():
-    raise RuntimeError("This application requires a GPU to run. No GPU detected.")
+import spaces
 
 # Load the model and processor
 def load_model():
-    try:
-        model = Qwen2VLForConditionalGeneration.from_pretrained(
-            "Qwen/Qwen2-VL-2B-Instruct",
-            torch_dtype=torch.float16  # Use float16 for GPU
-        ).to("cuda")  # Explicitly use CUDA
-        processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
-        return model, processor
-    except RuntimeError as e:
-        print(f"Error loading model: {e}")
-        raise
-
-try:
-    model, processor = load_model()
-except Exception as e:
-    print(f"Failed to load model: {e}")
-    raise
+    model = Qwen2VLForConditionalGeneration.from_pretrained(
+        "Qwen/Qwen2-VL-2B-Instruct",
+        torch_dtype=torch.float16
+    )
+    processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
+    return model, processor
+
+model, processor = load_model()
 
-def process_image(image):
-    try:
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "image", "image": image},
-                    {"type": "text", "text": "Describe this image."},
-                ],
-            }
-        ]
-
-        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        image_inputs, video_inputs = process_vision_info(messages)
-
-        inputs = processor(
-            text=[text],
-            images=image_inputs,
-            videos=video_inputs,
-            padding=True,
-            return_tensors="pt",
-        ).to("cuda")  # Explicitly use CUDA
-
-        with torch.no_grad():
-            generated_ids = model.generate(**inputs, max_new_tokens=256)
-            generated_ids_trimmed = [
-                out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-            ]
-            output_text = processor.batch_decode(
-                generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-            )
-
-        return output_text[0]
-    except Exception as e:
-        return f"An error occurred while processing the image: {str(e)}"
+@spaces.GPU
+def process_image(image):
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "image": image},
+                {"type": "text", "text": "Describe this image."},
+            ],
+        }
+    ]
+
+    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    image_inputs, video_inputs = process_vision_info(messages)
+
+    inputs = processor(
+        text=[text],
+        images=image_inputs,
+        videos=video_inputs,
+        padding=True,
+        return_tensors="pt",
+    ).to("cuda")
+
+    with torch.no_grad():
+        generated_ids = model.generate(**inputs, max_new_tokens=256)
+        generated_ids_trimmed = [
+            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        output_text = processor.batch_decode(
+            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )
+
+    return output_text[0]
 
+@spaces.GPU
 def process_video(video_path, max_frames=16, frame_interval=30, max_resolution=224):
-    try:
-        cap = cv2.VideoCapture(video_path)
-        frames = []
-        frame_count = 0
-
-        while len(frames) < max_frames:
-            ret, frame = cap.read()
-            if not ret:
-                break
-
-            if frame_count % frame_interval == 0:
-                h, w = frame.shape[:2]
-                if h > w:
-                    new_h, new_w = max_resolution, int(w * max_resolution / h)
-                else:
-                    new_h, new_w = int(h * max_resolution / w), max_resolution
-                frame = cv2.resize(frame, (new_w, new_h))
-                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-                frame = Image.fromarray(frame)
-                frames.append(frame)
-
-            frame_count += 1
-
-        cap.release()
-
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "video", "video": frames},
-                    {"type": "text", "text": "Describe this video."},
-                ],
-            }
-        ]
-
-        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        image_inputs, video_inputs = process_vision_info(messages)
-
-        inputs = processor(
-            text=[text],
-            images=image_inputs,
-            videos=video_inputs,
-            padding=True,
-            return_tensors="pt",
-        ).to("cuda")  # Explicitly use CUDA
-
-        with torch.no_grad():
-            generated_ids = model.generate(**inputs, max_new_tokens=256)
-            generated_ids_trimmed = [
-                out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-            ]
-            output_text = processor.batch_decode(
-                generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-            )
-
-        return output_text[0]
-    except Exception as e:
-        return f"An error occurred while processing the video: {str(e)}"
-
+    cap = cv2.VideoCapture(video_path)
+    frames = []
+    frame_count = 0
+
+    while len(frames) < max_frames:
+        ret, frame = cap.read()
+        if not ret:
+            break
+
+        if frame_count % frame_interval == 0:
+            h, w = frame.shape[:2]
+            if h > w:
+                new_h, new_w = max_resolution, int(w * max_resolution / h)
+            else:
+                new_h, new_w = int(h * max_resolution / w), max_resolution
+            frame = cv2.resize(frame, (new_w, new_h))
+            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+            frame = Image.fromarray(frame)
+            frames.append(frame)
+
+        frame_count += 1
+
+    cap.release()
+
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "video", "video": frames},
+                {"type": "text", "text": "Describe this video."},
+            ],
+        }
+    ]
+
+    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    image_inputs, video_inputs = process_vision_info(messages)
+
+    inputs = processor(
+        text=[text],
+        images=image_inputs,
+        videos=video_inputs,
+        padding=True,
+        return_tensors="pt",
+    ).to("cuda")
+
+    with torch.no_grad():
+        generated_ids = model.generate(**inputs, max_new_tokens=256)
+        generated_ids_trimmed = [
+            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        output_text = processor.batch_decode(
+            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )
+
+    return output_text[0]
+
+@spaces.GPU
 def process_content(content):
     if content is None:
         return "Please upload an image or video file."
 
-    try:
-        if content.name.lower().endswith(('.png', '.jpg', '.jpeg')):
-            return process_image(Image.open(content.name))
-        elif content.name.lower().endswith(('.mp4', '.avi', '.mov')):
-            return process_video(content.name)
-        else:
-            return "Unsupported file type. Please provide an image or video file."
-    except Exception as e:
-        return f"An error occurred while processing the content: {str(e)}"
+    if content.name.lower().endswith(('.png', '.jpg', '.jpeg')):
+        return process_image(Image.open(content.name))
+    elif content.name.lower().endswith(('.mp4', '.avi', '.mov')):
+        return process_video(content.name)
+    else:
+        return "Unsupported file type. Please provide an image or video file."
 
 # Gradio interface
 iface = gr.Interface(
     fn=process_content,
     inputs=gr.File(label="Upload Image or Video"),
     outputs="text",
-    title="Image and Video Description (GPU Version)",
-    description="Upload an image or video to get a description. This application requires GPU computation.",
+    title="Image and Video Description",
+    description="Upload an image or video to get a description.",
 )
 
 if __name__ == "__main__":
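
Context on the pattern this commit adopts: on Hugging Face ZeroGPU Spaces there is no GPU at process startup, so the old torch.cuda.is_available() guard would always fail. Instead, GPU-dependent functions are decorated with @spaces.GPU, and a device is attached only while a decorated call runs. Below is a minimal sketch of that pattern, not this app's exact code; the describe function is a condensed stand-in for process_image. One deliberate difference is flagged in the comments: the sketch moves the model to "cuda" at load time, following the usual ZeroGPU idiom, since model.generate expects the weights on the same device as the inputs, whereas the committed code leaves the model where from_pretrained put it.

import gradio as gr
import spaces
import torch
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

# Startup runs on CPU; under ZeroGPU the .to("cuda") call is intercepted and
# the weights are placed on a real GPU only while a @spaces.GPU-decorated
# function is executing. (The committed code omits this .to("cuda").)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct", torch_dtype=torch.float16
).to("cuda")
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

@spaces.GPU  # a GPU is held only for the duration of this call
def describe(image):
    # Same chat-template flow as the app's process_image, condensed.
    messages = [{"role": "user", "content": [
        {"type": "image", "image": image},
        {"type": "text", "text": "Describe this image."},
    ]}]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(text=[text], images=image_inputs, videos=video_inputs,
                       padding=True, return_tensors="pt").to("cuda")
    with torch.no_grad():
        generated = model.generate(**inputs, max_new_tokens=128)
    trimmed = [out[len(inp):] for inp, out in zip(inputs.input_ids, generated)]
    return processor.batch_decode(trimmed, skip_special_tokens=True)[0]

gr.Interface(fn=describe, inputs=gr.Image(type="pil"), outputs="text").launch()

For calls that need more than the default GPU window, the decorator also accepts a duration argument, e.g. @spaces.GPU(duration=120).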