import spaces
import gradio as gr
import cv2
from PIL import Image
import torch
import time
import numpy as np
import uuid

from transformers import RTDetrForObjectDetection, RTDetrImageProcessor  # type: ignore

from draw_boxes import draw_bounding_boxes

# Load the RT-DETR checkpoint and its image processor once at startup and
# move the model to the GPU.
image_processor = RTDetrImageProcessor.from_pretrained("PekingU/rtdetr_r50vd")
model = RTDetrForObjectDetection.from_pretrained("PekingU/rtdetr_r50vd").to("cuda")


SUBSAMPLE = 2  # keep every SUBSAMPLE-th frame, halving the effective frame rate


@spaces.GPU  # allocate a GPU for the duration of each call (ZeroGPU Spaces)
def stream_object_detection(video, conf_threshold):
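    """Run RT-DETR over the uploaded video, yielding annotated ~2 second MP4 segments."""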
    cap = cv2.VideoCapture(video)

    video_codec = cv2.VideoWriter_fourcc(*"mp4v")  # type: ignore
    fps = int(cap.get(cv2.CAP_PROP_FPS))

    # Halve the frame rate and the resolution to speed up inference.
    desired_fps = fps // SUBSAMPLE
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) // 2
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) // 2

    iterating, frame = cap.read()

    n_frames = 0

    name = f"output_{uuid.uuid4()}.mp4"
    segment_file = cv2.VideoWriter(name, video_codec, desired_fps, (width, height))  # type: ignore
    batch = []

    while iterating:
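        # Halve the resolution to match the writer size, and convert BGR -> RGB
        # because OpenCV decodes frames as BGR while the model expects RGB.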
        frame = cv2.resize(frame, (0, 0), fx=0.5, fy=0.5)
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        if n_frames % SUBSAMPLE == 0:
            batch.append(frame)
        if len(batch) == 2 * desired_fps:  # roughly two seconds of video per segment
            inputs = image_processor(images=batch, return_tensors="pt").to("cuda")

            print(f"starting batch of size {len(batch)}")
            start = time.time()
            with torch.no_grad():
                outputs = model(**inputs)
            end = time.time()
            print("time taken for inference", end - start)

            start = time.time()
            # Convert raw model outputs to per-image (scores, labels, boxes) in
            # pixel coordinates, filtered by the confidence threshold.
            boxes = image_processor.post_process_object_detection(
                outputs,
                target_sizes=torch.tensor([(height, width)] * len(batch)),
                threshold=conf_threshold,
            )

            # Draw the detections on each frame and append it to the current segment.
            for array, box in zip(batch, boxes):
                pil_image = draw_bounding_boxes(
                    Image.fromarray(array), box, model, conf_threshold
                )
                frame = np.array(pil_image)
                # Convert RGB to BGR
                frame = frame[:, :, ::-1].copy()
                segment_file.write(frame)

            batch = []
            segment_file.release()
            # Yield the finished segment path so playback can start immediately.
            yield name
            end = time.time()
            print("time taken for processing boxes", end - start)
            name = f"output_{uuid.uuid4()}.mp4"
            segment_file = cv2.VideoWriter(
                name, video_codec, desired_fps, (width, height)
            )  # type: ignore

        iterating, frame = cap.read()
        n_frames += 1

    # Release the capture and the trailing (possibly empty) segment writer;
    # any frames still buffered in `batch` at end-of-video are dropped.
    cap.release()
    segment_file.release()


with gr.Blocks() as demo:
    gr.HTML(
        """
    <h1 style='text-align: center'>
    Video Object Detection with <a href='https://huggingface.co/PekingU/rtdetr_r50vd' target='_blank'>RT-DETR</a>
    </h1>
    """
    )
    with gr.Row():
        with gr.Column():
            video = gr.Video(label="Video Source")
            conf_threshold = gr.Slider(
                label="Confidence Threshold",
                minimum=0.0,
                maximum=1.0,
                step=0.05,
                value=0.30,
            )
        with gr.Column():
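            # streaming=True lets the component play each yielded MP4 segment
            # as it arrives, rather than waiting for the whole video.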
            output_video = gr.Video(
                label="Processed Video", streaming=True, autoplay=True
            )

    # Run streaming detection whenever a new video is uploaded.
    video.upload(
        fn=stream_object_detection,
        inputs=[video, conf_threshold],
        outputs=[output_video],
    )

    gr.Examples(
        examples=["3285790-hd_1920_1080_30fps.mp4"],
        inputs=[video],
    )

if __name__ == "__main__":
    demo.launch()