Spaces:

whyumesh
/

vision_v1

Sleeping

App Files Community

whyumesh committed on Oct 2, 2024

Commit

a8a3b01

verified ·

1 Parent(s): 478042e

Update app.py

Browse files

Files changed (1) hide show

app.py +124 -103

app.py CHANGED Viewed

@@ -6,126 +6,147 @@ import cv2
 import numpy as np
 import gradio as gr
 # Load the model and processor
 def load_model():
-    model = Qwen2VLForConditionalGeneration.from_pretrained(
-        "Qwen/Qwen2-VL-2B-Instruct",
-        torch_dtype=torch.float16
-    ).to("cuda" if torch.cuda.is_available() else "cpu")
-    processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
-    return model, processor
-model, processor = load_model()
 def process_image(image):
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {"type": "image", "image": image},
-                {"type": "text", "text": "Describe this image."},
-            ],
-        }
-    ]
-    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    image_inputs, video_inputs = process_vision_info(messages)
-    inputs = processor(
-        text=[text],
-        images=image_inputs,
-        videos=video_inputs,
-        padding=True,
-        return_tensors="pt",
-    ).to(model.device)
-    with torch.no_grad():
-        generated_ids = model.generate(**inputs, max_new_tokens=256)
-    generated_ids_trimmed = [
-        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-    ]
-    output_text = processor.batch_decode(
-        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-    )
-    return output_text[0]
 def process_video(video_path, max_frames=16, frame_interval=30, max_resolution=224):
-    cap = cv2.VideoCapture(video_path)
-    frames = []
-    frame_count = 0
-    while len(frames) < max_frames:
-        ret, frame = cap.read()
-        if not ret:
-            break
-        if frame_count % frame_interval == 0:
-            h, w = frame.shape[:2]
-            if h > w:
-                new_h, new_w = max_resolution, int(w * max_resolution / h)
-            else:
-                new_h, new_w = int(h * max_resolution / w), max_resolution
-            frame = cv2.resize(frame, (new_w, new_h))
-            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-            frame = Image.fromarray(frame)
-            frames.append(frame)
-        frame_count += 1
-    cap.release()
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {"type": "video", "video": frames},
-                {"type": "text", "text": "Describe this video."},
-            ],
-        }
-    ]
-    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    image_inputs, video_inputs = process_vision_info(messages)
-    inputs = processor(
-        text=[text],
-        images=image_inputs,
-        videos=video_inputs,
-        padding=True,
-        return_tensors="pt",
-    ).to(model.device)
-    with torch.no_grad():
-        generated_ids = model.generate(**inputs, max_new_tokens=256)
-    generated_ids_trimmed = [
-        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-    ]
-    output_text = processor.batch_decode(
-        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-    )
-    return output_text[0]
 def process_content(content):
     if content is None:
         return "Please upload an image or video file."
-    if content.name.lower().endswith(('.png', '.jpg', '.jpeg')):
-        return process_image(Image.open(content.name))
-    elif content.name.lower().endswith(('.mp4', '.avi', '.mov')):
-        return process_video(content.name)
-    else:
-        return "Unsupported file type. Please provide an image or video file."
 # Gradio interface
 iface = gr.Interface(
     fn=process_content,
     inputs=gr.File(label="Upload Image or Video"),
     outputs="text",
-    title="Image and Video Description",
-    description="Upload an image or video to get a description.",
 )
 if __name__ == "__main__":
-    iface.launch(share=True)

 import numpy as np
 import gradio as gr
+# Check GPU availability
+if not torch.cuda.is_available():
+    raise RuntimeError("This application requires a GPU to run. No GPU detected.")
 # Load the model and processor
 def load_model():
+    try:
+        model = Qwen2VLForConditionalGeneration.from_pretrained(
+            "Qwen/Qwen2-VL-2B-Instruct",
+            torch_dtype=torch.float16  # Use float16 for GPU
+        ).to("cuda")  # Explicitly use CUDA
+        processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
+        return model, processor
+    except RuntimeError as e:
+        print(f"Error loading model: {e}")
+        raise
+try:
+    model, processor = load_model()
+except Exception as e:
+    print(f"Failed to load model: {e}")
+    raise
 def process_image(image):
+    try:
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image", "image": image},
+                    {"type": "text", "text": "Describe this image."},
+                ],
+            }
+        ]
+        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        image_inputs, video_inputs = process_vision_info(messages)
+        inputs = processor(
+            text=[text],
+            images=image_inputs,
+            videos=video_inputs,
+            padding=True,
+            return_tensors="pt",
+        ).to("cuda")  # Explicitly use CUDA
+        with torch.no_grad():
+            generated_ids = model.generate(**inputs, max_new_tokens=256)
+        generated_ids_trimmed = [
+            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        output_text = processor.batch_decode(
+            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )
+        return output_text[0]
+    except Exception as e:
+        return f"An error occurred while processing the image: {str(e)}"
 def process_video(video_path, max_frames=16, frame_interval=30, max_resolution=224):
+    try:
+        cap = cv2.VideoCapture(video_path)
+        frames = []
+        frame_count = 0
+        while len(frames) < max_frames:
+            ret, frame = cap.read()
+            if not ret:
+                break
+            if frame_count % frame_interval == 0:
+                h, w = frame.shape[:2]
+                if h > w:
+                    new_h, new_w = max_resolution, int(w * max_resolution / h)
+                else:
+                    new_h, new_w = int(h * max_resolution / w), max_resolution
+                frame = cv2.resize(frame, (new_w, new_h))
+                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                frame = Image.fromarray(frame)
+                frames.append(frame)
+            frame_count += 1
+        cap.release()
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "video", "video": frames},
+                    {"type": "text", "text": "Describe this video."},
+                ],
+            }
+        ]
+        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        image_inputs, video_inputs = process_vision_info(messages)
+        inputs = processor(
+            text=[text],
+            images=image_inputs,
+            videos=video_inputs,
+            padding=True,
+            return_tensors="pt",
+        ).to("cuda")  # Explicitly use CUDA
+        with torch.no_grad():
+            generated_ids = model.generate(**inputs, max_new_tokens=256)
+        generated_ids_trimmed = [
+            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        output_text = processor.batch_decode(
+            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )
+        return output_text[0]
+    except Exception as e:
+        return f"An error occurred while processing the video: {str(e)}"
 def process_content(content):
     if content is None:
         return "Please upload an image or video file."
+    try:
+        if content.name.lower().endswith(('.png', '.jpg', '.jpeg')):
+            return process_image(Image.open(content.name))
+        elif content.name.lower().endswith(('.mp4', '.avi', '.mov')):
+            return process_video(content.name)
+        else:
+            return "Unsupported file type. Please provide an image or video file."
+    except Exception as e:
+        return f"An error occurred while processing the content: {str(e)}"
 # Gradio interface
 iface = gr.Interface(
     fn=process_content,
     inputs=gr.File(label="Upload Image or Video"),
     outputs="text",
+    title="Image and Video Description (GPU Version)",
+    description="Upload an image or video to get a description. This application requires GPU computation.",
 )
 if __name__ == "__main__":
+    iface.launch()