Spaces:

ustc-community
/

d-fine-object-detection

Running on Zero

App Files Files Community

qubvel-hf HF Staff committed on May 2

Commit

fe66000

1 Parent(s): caa691a

Batched video processing + supervision

Browse files

Files changed (7) hide show

app.py +180 -203
image.jpg → examples/images/crossroad.jpg +0 -0
video.mp4 → examples/videos/break_dance.mp4 +0 -0
examples/videos/fast_and_furious.mp4 +3 -0
examples/videos/traffic.mp4 +3 -0
packages.txt +1 -0
requirements.txt +5 -2

app.py CHANGED Viewed

@@ -1,19 +1,27 @@
-import logging
 import os
-from typing import Tuple, List, Optional
-from pathlib import Path
 import shutil
 import tempfile
-import numpy as np
-import cv2
 import gradio as gr
 from PIL import Image
-from transformers import pipeline
 from transformers.image_utils import load_image
-import tqdm
 # Configuration constants
 CHECKPOINTS = [
     "ustc-community/dfine_m_obj365",
     "ustc-community/dfine_n_coco",
     "ustc-community/dfine_s_coco",
@@ -24,15 +32,17 @@ CHECKPOINTS = [
     "ustc-community/dfine_l_obj365",
     "ustc-community/dfine_x_obj365",
     "ustc-community/dfine_s_obj2coco",
-    "ustc-community/dfine_m_obj2coco",
     "ustc-community/dfine_l_obj2coco_e25",
     "ustc-community/dfine_x_obj2coco",
 ]
-MAX_NUM_FRAMES = 300
 DEFAULT_CHECKPOINT = CHECKPOINTS[0]
 DEFAULT_CONFIDENCE_THRESHOLD = 0.3
 IMAGE_EXAMPLES = [
-    {"path": "./image.jpg", "use_url": False, "url": "", "label": "Local Image"},
     {
         "path": None,
         "use_url": True,
@@ -40,216 +50,195 @@ IMAGE_EXAMPLES = [
         "label": "Flickr Image",
     },
 ]
 VIDEO_EXAMPLES = [
-    {"path": "./video.mp4", "label": "Local Video"},
 ]
-ALLOWED_VIDEO_EXTENSIONS = {".mp4", ".avi", ".mov"}
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
 )
 logger = logging.getLogger(__name__)
-VIDEO_OUTPUT_DIR = Path("static/videos")
-VIDEO_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
 def detect_objects(
-    image: Optional[Image.Image],
     checkpoint: str,
     confidence_threshold: float = DEFAULT_CONFIDENCE_THRESHOLD,
-    use_url: bool = False,
-    url: str = "",
-) -> Tuple[
-    Optional[Tuple[Image.Image, List[Tuple[Tuple[int, int, int, int], str]]]],
-    gr.Markdown,
-]:
-    if use_url and url:
-        try:
-            input_image = load_image(url)
-        except Exception as e:
-            logger.error(f"Failed to load image from URL {url}: {str(e)}")
-            return None, gr.Markdown(
-                f"**Error**: Failed to load image from URL: {str(e)}", visible=True
-            )
-    elif image is not None:
-        if not isinstance(image, Image.Image):
-            logger.error("Input image is not a PIL Image")
-            return None, gr.Markdown("**Error**: Invalid image format.", visible=True)
-        input_image = image
-    else:
-        return None, gr.Markdown(
-            "**Error**: Please provide an image or URL.", visible=True
-        )
-    try:
-        pipe = pipeline(
-            "object-detection",
-            model=checkpoint,
-            image_processor=checkpoint,
-            device="cpu",
-        )
-    except Exception as e:
-        logger.error(f"Failed to initialize model pipeline for {checkpoint}: {str(e)}")
-        return None, gr.Markdown(
-            f"**Error**: Failed to load model: {str(e)}", visible=True
-        )
-    results = pipe(input_image, threshold=confidence_threshold)
-    img_width, img_height = input_image.size
-    annotations = []
-    for result in results:
-        score = result["score"]
-        if score < confidence_threshold:
-            continue
-        label = f"{result['label']} ({score:.2f})"
-        box = result["box"]
-        # Validate and convert box to (xmin, ymin, xmax, ymax)
-        bbox_xmin = max(0, int(box["xmin"]))
-        bbox_ymin = max(0, int(box["ymin"]))
-        bbox_xmax = min(img_width, int(box["xmax"]))
-        bbox_ymax = min(img_height, int(box["ymax"]))
-        if bbox_xmax <= bbox_xmin or bbox_ymax <= bbox_ymin:
-            continue
-        bounding_box = (bbox_xmin, bbox_ymin, bbox_xmax, bbox_ymax)
-        annotations.append((bounding_box, label))
-    if not annotations:
-        return (input_image, []), gr.Markdown(
-            "**Warning**: No objects detected above the confidence threshold. Try lowering the threshold.",
-            visible=True,
-        )
-    return (input_image, annotations), gr.Markdown(visible=False)
-def annotate_frame(
-    image: Image.Image, annotations: List[Tuple[Tuple[int, int, int, int], str]]
-) -> np.ndarray:
-    image_np = np.array(image)
-    image_bgr = image_np[:, :, ::-1].copy()  # RGB to BGR
-    for (xmin, ymin, xmax, ymax), label in annotations:
-        cv2.rectangle(image_bgr, (xmin, ymin), (xmax, ymax), (255, 255, 255), 2)
-        text_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)[0]
-        cv2.rectangle(
-            image_bgr,
-            (xmin, ymin - text_size[1] - 4),
-            (xmin + text_size[0], ymin),
-            (255, 255, 255),
-            -1,
-        )
-        cv2.putText(
-            image_bgr,
-            label,
-            (xmin, ymin - 4),
-            cv2.FONT_HERSHEY_SIMPLEX,
-            0.5,
-            (0, 0, 0),
-            1,
-        )
-    return image_bgr
 def process_video(
     video_path: str,
     checkpoint: str,
     confidence_threshold: float = DEFAULT_CONFIDENCE_THRESHOLD,
     progress: gr.Progress = gr.Progress(track_tqdm=True),
-) -> Tuple[Optional[str], gr.Markdown]:
     if not video_path or not os.path.isfile(video_path):
-        logger.error(f"Invalid video path: {video_path}")
-        return None, gr.Markdown(
-            "**Error**: Please provide a valid video file.", visible=True
-        )
     ext = os.path.splitext(video_path)[1].lower()
     if ext not in ALLOWED_VIDEO_EXTENSIONS:
-        logger.error(f"Unsupported video format: {ext}")
-        return None, gr.Markdown(
-            f"**Error**: Unsupported video format. Use MP4, AVI, or MOV.", visible=True
-        )
-    try:
-        cap = cv2.VideoCapture(video_path)
-        if not cap.isOpened():
-            logger.error(f"Failed to open video: {video_path}")
-            return None, gr.Markdown(
-                "**Error**: Failed to open video file.", visible=True
-            )
-        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-        fps = cap.get(cv2.CAP_PROP_FPS)
-        num_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-        # Use H.264 codec for browser compatibility
-        # fourcc = cv2.VideoWriter_fourcc(*"H264")
-        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
-        temp_file = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
-        writer = cv2.VideoWriter(temp_file.name, fourcc, fps, (width, height))
-        if not writer.isOpened():
-            logger.error("Failed to initialize video writer")
-            cap.release()
-            temp_file.close()
-            os.unlink(temp_file.name)
-            return None, gr.Markdown(
-                "**Error**: Failed to initialize video writer.", visible=True
-            )
-        frame_count = 0
-        for _ in tqdm.tqdm(
-            range(min(MAX_NUM_FRAMES, num_frames)), desc="Processing video"
-        ):
-            ok, frame = cap.read()
-            if not ok:
-                break
-            rgb_frame = frame[:, :, ::-1]  # BGR to RGB
-            pil_image = Image.fromarray(rgb_frame)
-            (annotated_image, annotations), _ = detect_objects(
-                pil_image, checkpoint, confidence_threshold, use_url=False, url=""
-            )
-            if annotated_image is None:
-                continue
-            annotated_frame = annotate_frame(annotated_image, annotations)
-            writer.write(annotated_frame)
-            frame_count += 1
-        writer.release()
-        cap.release()
-        if frame_count == 0:
-            logger.warning("No valid frames processed in video")
-            temp_file.close()
-            os.unlink(temp_file.name)
-            return None, gr.Markdown(
-                "**Warning**: No valid frames processed. Try a different video or threshold.",
-                visible=True,
-            )
-        temp_file.close()
-        # Copy to persistent directory for Gradio access
-        output_filename = f"output_{os.path.basename(temp_file.name)}"
-        output_path = VIDEO_OUTPUT_DIR / output_filename
-        shutil.copy(temp_file.name, output_path)
-        os.unlink(temp_file.name)  # Remove temporary file
-        logger.info(f"Video saved to {output_path}")
-        return str(output_path), gr.Markdown(visible=False)
-    except Exception as e:
-        logger.error(f"Video processing failed: {str(e)}")
-        if "temp_file" in locals():
-            temp_file.close()
-            if os.path.exists(temp_file.name):
-                os.unlink(temp_file.name)
-        return None, gr.Markdown(
-            f"**Error**: Video processing failed: {str(e)}", visible=True
         )
 def create_image_inputs() -> List[gr.components.Component]:
     return [
@@ -340,7 +329,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
                             image_input,
                             use_url,
                             url_input,
-                            image_checkpoint,
                             image_confidence_threshold,
                         ) = create_image_inputs()
                         image_detect_button, image_clear_button = create_button_row(
@@ -353,10 +342,6 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
                         color_map=None,
                         elem_classes="output-component",
                     )
-                    image_error_message = gr.Markdown(
-                        visible=False, elem_classes="error-text"
-                    )
             gr.Examples(
                 examples=[
                     [
@@ -372,18 +357,18 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
                     image_input,
                     use_url,
                     url_input,
-                    image_checkpoint,
                     image_confidence_threshold,
                 ],
-                outputs=[image_output, image_error_message],
-                fn=detect_objects,
                 cache_examples=False,
                 label="Select an image example to populate inputs",
             )
         with gr.Tab("Video"):
             gr.Markdown(
-                f"The input video will be truncated to {MAX_NUM_FRAMES} frames."
             )
             with gr.Row():
                 with gr.Column(scale=1, min_width=300):
@@ -400,9 +385,6 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
                         format="mp4",  # Explicit MP4 format
                         elem_classes="output-component",
                     )
-                    video_error_message = gr.Markdown(
-                        visible=False, elem_classes="error-text"
-                    )
             gr.Examples(
                 examples=[
@@ -410,7 +392,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
                     for example in VIDEO_EXAMPLES
                 ],
                 inputs=[video_input, video_checkpoint, video_confidence_threshold],
-                outputs=[video_output, video_error_message],
                 fn=process_video,
                 cache_examples=False,
                 label="Select a video example to populate inputs",
@@ -432,16 +414,14 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
             DEFAULT_CHECKPOINT,
             DEFAULT_CONFIDENCE_THRESHOLD,
             None,
-            gr.Markdown(visible=False),
         ),
         outputs=[
             image_input,
             use_url,
             url_input,
-            image_checkpoint,
             image_confidence_threshold,
             image_output,
-            image_error_message,
         ],
     )
@@ -452,35 +432,32 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
             DEFAULT_CHECKPOINT,
             DEFAULT_CONFIDENCE_THRESHOLD,
             None,
-            gr.Markdown(visible=False),
         ),
         outputs=[
             video_input,
             video_checkpoint,
             video_confidence_threshold,
             video_output,
-            video_error_message,
         ],
     )
     # Image detect button
     image_detect_button.click(
-        fn=detect_objects,
         inputs=[
             image_input,
-            image_checkpoint,
-            image_confidence_threshold,
-            use_url,
             url_input,
         ],
-        outputs=[image_output, image_error_message],
     )
     # Video detect button
     video_detect_button.click(
         fn=process_video,
         inputs=[video_input, video_checkpoint, video_confidence_threshold],
-        outputs=[video_output, video_error_message],
     )
 if __name__ == "__main__":

 import os
+import cv2
+import tqdm
 import shutil
 import tempfile
+import logging
+import supervision as sv
+import torch
+import spaces
 import gradio as gr
+from pathlib import Path
+from functools import lru_cache
+from typing import List, Optional, Tuple
 from PIL import Image
+from transformers import AutoModelForObjectDetection, AutoImageProcessor
 from transformers.image_utils import load_image
 # Configuration constants
 CHECKPOINTS = [
+    "ustc-community/dfine_m_obj2coco",
     "ustc-community/dfine_m_obj365",
     "ustc-community/dfine_n_coco",
     "ustc-community/dfine_s_coco",
     "ustc-community/dfine_l_obj365",
     "ustc-community/dfine_x_obj365",
     "ustc-community/dfine_s_obj2coco",
     "ustc-community/dfine_l_obj2coco_e25",
     "ustc-community/dfine_x_obj2coco",
 ]
 DEFAULT_CHECKPOINT = CHECKPOINTS[0]
 DEFAULT_CONFIDENCE_THRESHOLD = 0.3
+TORCH_DTYPE = torch.float32
+# Image
 IMAGE_EXAMPLES = [
+    {"path": "./examples/images/crossroad.jpg", "use_url": False, "url": "", "label": "Local Image"},
     {
         "path": None,
         "use_url": True,
         "label": "Flickr Image",
     },
 ]
+# Video
+MAX_NUM_FRAMES = 500
+BATCH_SIZE = 4
+ALLOWED_VIDEO_EXTENSIONS = {".mp4", ".avi", ".mov"}
+VIDEO_OUTPUT_DIR = Path("static/videos")
+VIDEO_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
 VIDEO_EXAMPLES = [
+    {"path": "./examples/videos/traffic.mp4", "label": "Local Video"},
+    {"path": "./examples/videos/fast_and_furious.mp4", "label": "Local Video"},
+    {"path": "./examples/videos/break_dance.mp4", "label": "Local Video"},
 ]
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
 )
 logger = logging.getLogger(__name__)
@lru_cache(maxsize=3)
def get_model_and_image_processor(checkpoint: str, device: str = "cpu"):
    """Load the detection model and its image processor for a checkpoint.

    The result is memoized with ``lru_cache`` (at most three entries), so
    repeated calls with the same arguments reuse the already loaded objects.

    Args:
        checkpoint: Model identifier passed to ``from_pretrained``.
        device: Device the model is moved to after loading.

    Returns:
        A ``(model, image_processor)`` tuple.
    """
    detector = AutoModelForObjectDetection.from_pretrained(
        checkpoint, torch_dtype=TORCH_DTYPE
    )
    detector = detector.to(device)
    processor = AutoImageProcessor.from_pretrained(checkpoint)
    return detector, processor
@spaces.GPU(duration=20)
def detect_objects(
    checkpoint: str,
    images: Optional[List[Image.Image]] = None,
    confidence_threshold: float = DEFAULT_CONFIDENCE_THRESHOLD,
    target_sizes: Optional[List[Tuple[int, int]]] = None,
):
    """Run object detection on a batch of images.

    Args:
        checkpoint: Model identifier used to fetch the cached model/processor.
        images: Batch of PIL images to run through the model.
        confidence_threshold: Score threshold forwarded to post-processing.
        target_sizes: ``(height, width)`` per image that predicted boxes are
            scaled to. When empty or ``None`` each image's own size is used.

    Returns:
        A ``(results, id2label)`` tuple: the per-image post-processed
        detections and the model's class-id-to-name mapping.
    """
    run_device = "cuda" if torch.cuda.is_available() else "cpu"
    model, image_processor = get_model_and_image_processor(checkpoint, device=run_device)

    # Preprocess: build tensors, then move them to the model's device and dtype.
    batch_inputs = image_processor(images=images, return_tensors="pt")
    batch_inputs = batch_inputs.to(run_device).to(TORCH_DTYPE)

    # Forward pass without gradient tracking (inference only).
    with torch.no_grad():
        raw_outputs = model(**batch_inputs)

    # Postprocess: fall back to the input image sizes when none were requested.
    sizes = target_sizes if target_sizes else [(img.height, img.width) for img in images]
    detections = image_processor.post_process_object_detection(
        raw_outputs, target_sizes=sizes, threshold=confidence_threshold
    )
    return detections, model.config.id2label
def process_image(
    checkpoint: str = DEFAULT_CHECKPOINT,
    image: Optional[Image.Image] = None,
    url: Optional[str] = None,
    confidence_threshold: float = DEFAULT_CONFIDENCE_THRESHOLD,
):
    """Detect objects in one image supplied either directly or by URL.

    Args:
        checkpoint: Model identifier to run.
        image: Image to analyse; mutually exclusive with ``url``.
        url: Location to load the image from; mutually exclusive with ``image``.
        confidence_threshold: Score threshold forwarded to detection.

    Returns:
        ``(image, annotations)`` where each annotation is
        ``((x_min, y_min, x_max, y_max), "<label> (<score>)")`` with the box
        clamped to the image bounds.

    Raises:
        ValueError: If neither or both of ``image`` and ``url`` are given.
    """
    has_image = image is not None
    has_url = bool(url)
    # Exactly one source is allowed: reject "none given" and "both given".
    if has_image == has_url:
        raise ValueError("Either image or url must be provided, but not both.")

    if has_url:
        image = load_image(url)

    results, id2label = detect_objects(
        checkpoint=checkpoint,
        images=[image],
        confidence_threshold=confidence_threshold,
    )
    detections = results[0]  # the batch holds a single image

    right_edge = image.width - 1
    bottom_edge = image.height - 1
    annotations = []
    for class_id, confidence, xyxy in zip(
        detections["labels"], detections["scores"], detections["boxes"]
    ):
        caption = f"{id2label[class_id.item()]} ({confidence:.2f})"
        left, top, right, bottom = xyxy.cpu().numpy().round().astype(int)
        clipped_box = (
            max(0, left),
            max(0, top),
            min(right_edge, right),
            min(bottom_edge, bottom),
        )
        annotations.append((clipped_box, caption))

    return (image, annotations)
+def get_target_size(image_height, image_width, max_size: int):
+    if image_height < max_size and image_width < max_size:
+        return image_width, image_height
+    if image_height > image_width:
+        new_height = max_size
+        new_width = int(image_width * max_size / image_height)
+    else:
+        new_width = max_size
+        new_height = int(image_height * max_size / image_width)
+    return new_width, new_height
 def process_video(
     video_path: str,
     checkpoint: str,
     confidence_threshold: float = DEFAULT_CONFIDENCE_THRESHOLD,
     progress: gr.Progress = gr.Progress(track_tqdm=True),
+) -> str:
     if not video_path or not os.path.isfile(video_path):
+        raise ValueError(f"Invalid video path: {video_path}")
     ext = os.path.splitext(video_path)[1].lower()
     if ext not in ALLOWED_VIDEO_EXTENSIONS:
+        raise ValueError(f"Unsupported video format: {ext}, supported formats: {ALLOWED_VIDEO_EXTENSIONS}")
+    cap = cv2.VideoCapture(video_path)
+    if not cap.isOpened():
+        raise ValueError(f"Failed to open video: {video_path}")
+    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+    fps = cap.get(cv2.CAP_PROP_FPS)
+    num_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    process_each_frame = fps // 25
+    target_fps = fps / process_each_frame
+    target_width, target_height = get_target_size(height, width, 1080)
+    # Use H.264 codec for browser compatibility
+    fourcc = cv2.VideoWriter_fourcc(*"MJPG")
+    temp_file = tempfile.NamedTemporaryFile(suffix=".avi", delete=False)
+    writer = cv2.VideoWriter(temp_file.name, fourcc, target_fps, (target_width, target_height))
+    box_annotator = sv.BoxAnnotator(thickness=1)
+    label_annotator = sv.LabelAnnotator(text_scale=0.5)
+    if not writer.isOpened():
+        cap.release()
+        temp_file.close()
+        os.unlink(temp_file.name)
+        raise ValueError("Failed to initialize video writer")
+    frames_to_process = int(min(MAX_NUM_FRAMES * process_each_frame, num_frames))
+    batch = []
+    for i in tqdm.tqdm(range(frames_to_process), desc="Processing video"):
+        ok, frame = cap.read()
+        if not ok:
+            break
+        if not i % process_each_frame == 0:
+            continue
+        if len(batch) < BATCH_SIZE:
+            frame = frame[:, :, ::-1].copy()  # BGR to RGB
+            batch.append(frame)
+            continue
+        results, id2label = detect_objects(
+            images=[Image.fromarray(frame) for frame in batch],
+            checkpoint=checkpoint,
+            confidence_threshold=confidence_threshold,
+            target_sizes=[(target_height, target_width)] * len(batch),
         )
+        for frame, result in zip(batch, results):
+            frame = cv2.resize(frame, (target_width, target_height), interpolation=cv2.INTER_AREA)
+            detections = sv.Detections.from_transformers(result, id2label=id2label)
+            detections = detections.with_nms(threshold=0.95, class_agnostic=True)
+            annotated_frame = box_annotator.annotate(scene=frame, detections=detections)
+            annotated_frame = label_annotator.annotate(scene=annotated_frame, detections=detections)
+            writer.write(cv2.cvtColor(annotated_frame, cv2.COLOR_RGB2BGR))
+        batch = []
+    writer.release()
+    cap.release()
+    temp_file.close()
+    # Copy to persistent directory for Gradio access
+    output_filename = f"output_{os.path.basename(temp_file.name)}"
+    output_path = VIDEO_OUTPUT_DIR / output_filename
+    shutil.copy(temp_file.name, output_path)
+    os.unlink(temp_file.name)  # Remove temporary file
+    logger.info(f"Video saved to {output_path}")
+    return str(output_path)
 def create_image_inputs() -> List[gr.components.Component]:
     return [
                             image_input,
                             use_url,
                             url_input,
+                            image_model_checkpoint,
                             image_confidence_threshold,
                         ) = create_image_inputs()
                         image_detect_button, image_clear_button = create_button_row(
                         color_map=None,
                         elem_classes="output-component",
                     )
             gr.Examples(
                 examples=[
                     [
                     image_input,
                     use_url,
                     url_input,
+                    image_model_checkpoint,
                     image_confidence_threshold,
                 ],
+                outputs=[image_output],
+                fn=process_image,
                 cache_examples=False,
                 label="Select an image example to populate inputs",
             )
         with gr.Tab("Video"):
             gr.Markdown(
+                f"The input video will be processed in ~25 FPS (up to {MAX_NUM_FRAMES} frames in result)."
             )
             with gr.Row():
                 with gr.Column(scale=1, min_width=300):
                         format="mp4",  # Explicit MP4 format
                         elem_classes="output-component",
                     )
             gr.Examples(
                 examples=[
                     for example in VIDEO_EXAMPLES
                 ],
                 inputs=[video_input, video_checkpoint, video_confidence_threshold],
+                outputs=[video_output],
                 fn=process_video,
                 cache_examples=False,
                 label="Select a video example to populate inputs",
             DEFAULT_CHECKPOINT,
             DEFAULT_CONFIDENCE_THRESHOLD,
             None,
         ),
         outputs=[
             image_input,
             use_url,
             url_input,
+            image_model_checkpoint,
             image_confidence_threshold,
             image_output,
         ],
     )
             DEFAULT_CHECKPOINT,
             DEFAULT_CONFIDENCE_THRESHOLD,
             None,
         ),
         outputs=[
             video_input,
             video_checkpoint,
             video_confidence_threshold,
             video_output,
         ],
     )
     # Image detect button
     image_detect_button.click(
+        fn=process_image,
         inputs=[
+            image_model_checkpoint,
             image_input,
             url_input,
+            image_confidence_threshold,
         ],
+        outputs=[image_output],
     )
     # Video detect button
     video_detect_button.click(
         fn=process_video,
         inputs=[video_input, video_checkpoint, video_confidence_threshold],
+        outputs=[video_output],
     )
 if __name__ == "__main__":

image.jpg → examples/images/crossroad.jpg RENAMED Viewed

File without changes

video.mp4 → examples/videos/break_dance.mp4 RENAMED Viewed

File without changes

examples/videos/fast_and_furious.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5980eada9d80c65b4da5b536427ccf8ff8ea2707ee3e4aa52fb2c4e1b1979dae
+size 16872922

examples/videos/traffic.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:71908c136bba6b50b9071fb2015553f651c91a7ee857924f33616c046011aaed
+size 8591523

packages.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ ffmpeg

requirements.txt CHANGED Viewed

@@ -2,6 +2,9 @@ gradio
 transformers @ git+https://github.com/huggingface/transformers
 torch
 torchvision
-opencv-python
 tqdm
-pillow

 transformers @ git+https://github.com/huggingface/transformers
 torch
 torchvision
+opencv-python-headless
+ffmpeg-python
 tqdm
+pillow
+supervision
+spaces