Spaces:

whyumesh
/

vision_v1

Sleeping

App Files Files Community

whyumesh committed on Oct 2, 2024

Commit

114c949

verified ·

1 Parent(s): b7f3d17

Create app.py

Browse files

Files changed (1) hide show

app.py +131 -0

app.py ADDED Viewed

	@@ -0,0 +1,131 @@

+import torch
+from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
+from qwen_vl_utils import process_vision_info
+from PIL import Image
+import cv2
+import numpy as np
+import gradio as gr
# Load the model and processor
def load_model():
    """Load the Qwen2-VL-2B-Instruct model and its matching processor.

    The model is moved to CUDA when ``torch.cuda.is_available()`` is true and
    to CPU otherwise. Half precision is only requested on GPU; on CPU the
    weights are loaded as float32, since float16 inference on CPU is slow and
    not supported by every operator.

    Returns:
        tuple: ``(model, processor)`` as returned by the two
        ``from_pretrained`` calls.
    """
    model_id = "Qwen/Qwen2-VL-2B-Instruct"
    use_cuda = torch.cuda.is_available()
    device = "cuda" if use_cuda else "cpu"
    # Pick the dtype from the target device instead of hard-coding float16.
    dtype = torch.float16 if use_cuda else torch.float32

    model = Qwen2VLForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=dtype,
    ).to(device)
    processor = AutoProcessor.from_pretrained(model_id)
    return model, processor


# Loaded once at import time; the processing functions below read these
# module-level names.
model, processor = load_model()
def process_image(image):
    """Ask the loaded model to describe a single image.

    Args:
        image: The image to describe. It is placed in the chat message under
            the ``"image"`` key and resolved by ``process_vision_info``.
            The caller in this file passes a PIL image; other accepted forms
            depend on ``qwen_vl_utils`` (TODO confirm).

    Returns:
        The first (and only) decoded reply from the model.
    """
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Describe this image."},
        ],
    }
    conversation = [user_turn]

    prompt = processor.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )
    vision_images, vision_videos = process_vision_info(conversation)

    batch = processor(
        text=[prompt],
        images=vision_images,
        videos=vision_videos,
        padding=True,
        return_tensors="pt",
    )
    batch = batch.to(model.device)

    with torch.no_grad():
        sequences = model.generate(**batch, max_new_tokens=256)

    # Drop the first len(prompt ids) entries of every output row so that only
    # the tokens after the prompt are decoded.
    completions = []
    for prompt_ids, sequence in zip(batch.input_ids, sequences):
        completions.append(sequence[len(prompt_ids):])

    decoded = processor.batch_decode(
        completions,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    return decoded[0]
def process_video(video_path, max_frames=16, frame_interval=30, max_resolution=224):
    """Describe a video by sampling frames and passing them to the model.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        max_frames: Upper bound on the number of sampled frames.
        frame_interval: Keep one frame out of every ``frame_interval`` frames
            read, starting with the first.
        max_resolution: Length in pixels that the longer side of every kept
            frame is resized to; the shorter side is scaled proportionally.

    Returns:
        The model's description as a string, or a short message when no frame
        could be read from ``video_path``.
    """
    frames = _sample_video_frames(
        video_path, max_frames, frame_interval, max_resolution
    )
    if not frames:
        # An unreadable or empty video would otherwise reach the processor
        # with an empty frame list and fail there with an opaque error.
        return "Could not read any frames from the video."

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "video", "video": frames},
                {"type": "text", "text": "Describe this video."},
            ],
        }
    ]
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to(model.device)

    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=256)

    # Keep only the tokens that follow the prompt in each output row.
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    return output_text[0]


def _sample_video_frames(video_path, max_frames, frame_interval, max_resolution):
    """Return up to ``max_frames`` resized RGB PIL frames read from ``video_path``."""
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        frame_count = 0
        while len(frames) < max_frames:
            ret, frame = cap.read()
            if not ret:
                # End of stream, or the capture could not be read at all.
                break
            if frame_count % frame_interval == 0:
                h, w = frame.shape[:2]
                # Scale the longer side to max_resolution; clamp the shorter
                # side to at least 1 px so extreme aspect ratios cannot ask
                # cv2.resize for a zero-sized output.
                if h > w:
                    new_h = max_resolution
                    new_w = max(1, int(w * max_resolution / h))
                else:
                    new_h = max(1, int(h * max_resolution / w))
                    new_w = max_resolution
                frame = cv2.resize(frame, (new_w, new_h))
                # Frames are converted from BGR to RGB before becoming PIL images.
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames.append(Image.fromarray(frame))
            frame_count += 1
    finally:
        # Release the capture even if reading or resizing raised.
        cap.release()
    return frames
def process_content(content):
    """Route an uploaded file to the image or video pipeline by extension.

    Args:
        content: The upload handed over by the Gradio ``File`` input. May be
            ``None`` (nothing uploaded), a plain path string, or an object
            exposing the path as ``.name``. Which form Gradio delivers
            depends on its version and the component's ``type`` setting
            (TODO confirm against the deployed version).

    Returns:
        The generated description, or a user-facing message when nothing was
        uploaded or the file extension is not supported.
    """
    if content is None:
        return "Please upload an image or video file."

    # Accept both a bare path string and a file-like wrapper with ``.name``;
    # reading ``.name`` unconditionally fails on plain strings.
    path = content if isinstance(content, str) else getattr(content, "name", None)
    if not isinstance(path, str):
        return "Unsupported file type. Please provide an image or video file."

    lowered = path.lower()
    if lowered.endswith(('.png', '.jpg', '.jpeg')):
        # The context manager closes the underlying file once the description
        # has been produced.
        with Image.open(path) as image:
            return process_image(image)
    if lowered.endswith(('.mp4', '.avi', '.mov')):
        return process_video(path)
    return "Unsupported file type. Please provide an image or video file."
# Gradio interface: one file-upload input wired to process_content, with the
# returned string shown as plain text.
iface = gr.Interface(
    title="Image and Video Description",
    description="Upload an image or video to get a description.",
    fn=process_content,
    inputs=gr.File(label="Upload Image or Video"),
    outputs="text",
)

# Start the web UI only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()