File size: 3,609 Bytes
780389c
8b2cbe6
1e8e71b
6a95f1f
61732db
 
 
8b2cbe6
6a95f1f
 
 
 
 
 
ccc35d4
1e8e71b
6a95f1f
 
 
b7278d2
6a95f1f
61732db
6a95f1f
 
 
 
 
 
 
61732db
b7278d2
 
6a95f1f
 
 
b7278d2
6a95f1f
61732db
6a95f1f
61732db
6a95f1f
 
61732db
 
6a95f1f
 
61732db
 
6a95f1f
 
 
61732db
6a95f1f
 
 
61732db
 
6a95f1f
 
 
 
 
 
 
 
b7278d2
6a95f1f
 
 
 
 
 
 
385e56e
8b2cbe6
9740995
ccc35d4
8b2cbe6
9740995
ccc35d4
8b2cbe6
 
 
6a95f1f
8b2cbe6
9740995
8b2cbe6
 
 
6a95f1f
8b2cbe6
9740995
790227b
 
b7278d2
66947f7
 
 
 
 
1ded52c
66947f7
6a95f1f
 
 
 
9740995
 
 
ccc35d4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import spaces
import gradio as gr
import cv2
from PIL import Image
import torch
import time
import numpy as np

from transformers import RTDetrForObjectDetection, RTDetrImageProcessor

from draw_boxes import draw_bounding_boxes

# Hub checkpoint id shared by the preprocessor and the detector.
_CHECKPOINT = "PekingU/rtdetr_r50vd"

# Both objects are created once at import time and referenced as module
# globals by the detection function below.
image_processor = RTDetrImageProcessor.from_pretrained(_CHECKPOINT)
model = RTDetrForObjectDetection.from_pretrained(_CHECKPOINT)

def _annotate_segment(frames, path, video_codec, fps, size, conf_threshold):
    """Detect objects in ``frames`` and write the annotated clip to ``path``.

    Args:
        frames: List of RGB frames (numpy arrays), all of dimensions ``size``.
        path: Output file name for the clip.
        video_codec: FourCC code handed to ``cv2.VideoWriter``.
        fps: Playback rate of the written clip.
        size: ``(width, height)`` of every frame in ``frames``.
        conf_threshold: Minimum detection score to keep.

    Returns:
        ``path``, once the writer has been released.
    """
    # Keep the inputs on whatever device the model currently lives on.
    inputs = image_processor(images=frames, return_tensors="pt").to(model.device)

    print(f"starting batch of size {len(frames)}")
    start = time.time()
    with torch.no_grad():
        outputs = model(**inputs)
    end = time.time()
    print("time taken ", end - start)

    # Post-processing expects (height, width) per image; ``size`` is
    # (width, height), hence the reversal.
    boxes = image_processor.post_process_object_detection(
        outputs,
        target_sizes=torch.tensor([size[::-1]] * len(frames)),
        threshold=conf_threshold,
    )

    # The writer is opened with the same size as the frames it receives.
    writer = cv2.VideoWriter(path, video_codec, fps, size)  # type: ignore
    try:
        for array, box in zip(frames, boxes):
            pil_image = draw_bounding_boxes(
                Image.fromarray(array), box, model, conf_threshold
            )
            # Convert RGB back to BGR for OpenCV.
            writer.write(np.array(pil_image)[:, :, ::-1].copy())
    finally:
        writer.release()
    return path


@spaces.GPU
def stream_object_detection(video, conf_threshold):
    """Stream annotated clips of ``video`` as successive ``output_<n>.mp4`` files.

    Every fifth frame is kept, downscaled to half the source width and height,
    and collected into batches of ``2 * (fps // 5)`` frames. Each full batch
    is run through the detector, written to its own file, and the file name
    is yielded. Frames left over at the end of the video are flushed as a
    final, shorter clip.

    Args:
        video: Source passed to ``cv2.VideoCapture`` (presumably a file path
            supplied by the Gradio video component).
        conf_threshold: Minimum detection score for a box to be drawn.

    Yields:
        The file name of each finished clip.
    """
    cap = cv2.VideoCapture(video)
    try:
        video_codec = cv2.VideoWriter_fourcc(*"mp4v")  # type: ignore
        fps = int(cap.get(cv2.CAP_PROP_FPS))
        # Clamp to 1: with a source under 5 FPS (or an unreadable FPS of 0)
        # the batch size would otherwise be 0 and no clip would ever be cut.
        desired_fps = max(1, fps // 5)
        # Explicit output size so resized frames always match the writer,
        # including for odd source dimensions.
        out_size = (
            int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) // 2,
            int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) // 2,
        )
        batch_len = 2 * desired_fps

        batch = []
        n_frames = 0
        n_chunks = 0

        iterating, frame = cap.read()
        while iterating:
            # Only the sampled frames are resized and colour-converted.
            if n_frames % 5 == 0:
                frame = cv2.resize(frame, out_size)
                batch.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

            if len(batch) == batch_len:
                yield _annotate_segment(
                    batch,
                    f"output_{n_chunks}.mp4",
                    video_codec,
                    desired_fps,
                    out_size,
                    conf_threshold,
                )
                # Start a fresh batch; otherwise the same frames would be
                # re-inferred and re-written on every following frame.
                batch = []
                n_chunks += 1

            iterating, frame = cap.read()
            n_frames += 1

        # Flush the frames that did not fill a whole batch.
        if batch:
            yield _annotate_segment(
                batch,
                f"output_{n_chunks}.mp4",
                video_codec,
                desired_fps,
                out_size,
                conf_threshold,
            )
    finally:
        # Also runs when the consumer stops iterating early.
        cap.release()


# Custom CSS passed to gr.Blocks: caps the size of the ".my-group" container
# and centres the contents of ".my-column". Every length carries a unit and
# each rule ends with ";}" so both declarations parse as valid CSS.
css = """.my-group {max-width: 600px !important; max-height: 600px !important;}
                      .my-column {display: flex !important; justify-content: center !important; align-items: center !important;}"""


# Page layout: a title, a row of reference links, then a centred column
# holding the video widget and the confidence slider.
with gr.Blocks(css=css) as app:
    title_markup = """
    <h1 style='text-align: center'>
    Video Object Detection with RT-DETR
    </h1>
    """
    links_markup = """
        <h3 style='text-align: center'>
        <a href='https://arxiv.org/abs/2304.08069' target='_blank'>arXiv</a> | <a href='https://huggingface.co/PekingU/rtdetr_r101vd_coco_o365' target='_blank'>github</a>
        </h3>
        """
    gr.HTML(title_markup)
    gr.HTML(links_markup)

    with gr.Column(elem_classes=["my-column"]):
        with gr.Group(elem_classes=["my-group"]):
            # The same component receives the upload and the streamed output.
            video = gr.Video(label="Video Source", streaming=True)
            conf_threshold = gr.Slider(
                minimum=0.0,
                maximum=1.0,
                value=0.30,
                step=0.05,
                label="Confidence Threshold",
            )

        # Uploading a video starts detection on it.
        video.upload(
            fn=stream_object_detection,
            inputs=[video, conf_threshold],
            outputs=[video],
        )

# Start the server only when the file is run as a script.
if __name__ == "__main__":
    app.launch()