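"""Video object detection with RT-DETR, streamed from a ZeroGPU Space.

Uploaded videos are processed in one-second chunks: frames are subsampled,
run through RT-DETR in batches, annotated with bounding boxes, and written
to .ts segments that are yielded back to the browser as they finish.
"""
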
import spaces
import gradio as gr
import cv2
import torch
import numpy as np
from pathlib import Path
from PIL import Image
from transformers import RTDetrForObjectDetection, RTDetrImageProcessor

from draw_boxes import draw_bounding_boxes

# Directory where the temporary .ts video segments are written.
current_dir = Path(__file__).parent

image_processor = RTDetrImageProcessor.from_pretrained("PekingU/rtdetr_r50vd")
# On ZeroGPU the model can be moved to CUDA at startup; the GPU itself is
# attached for the duration of each @spaces.GPU call.
model = RTDetrForObjectDetection.from_pretrained("PekingU/rtdetr_r50vd").to("cuda")
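
# @spaces.GPU requests a ZeroGPU worker for the duration of each call.
# stream_object_detection is a generator: it yields the path of each finished
# .ts segment so Gradio can stream partially processed video to the client.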
@spaces.GPU
def stream_object_detection(video, conf_threshold):
    cap = cv2.VideoCapture(video)

    # Write H.264 into MPEG-TS segments so each chunk is playable on its own.
    video_codec = cv2.VideoWriter_fourcc(*"x264")  # type: ignore
    fps = int(cap.get(cv2.CAP_PROP_FPS))

    # Only every third frame is kept, so segments are written at fps // 3
    # to preserve real-time playback speed.
    desired_fps = fps // 3
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    iterating, frame = cap.read()

    n_frames = 0
    n_chunks = 0

    name = str(current_dir / f"output_{n_chunks}.ts")
    segment_file = cv2.VideoWriter(name, video_codec, desired_fps, (width, height))  # type: ignore
    batch = []

    while iterating:
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        if n_frames % 3 == 0:
            batch.append(frame)

        # Once a full second of subsampled frames has accumulated, run the
        # batch through RT-DETR and write the annotated frames to the segment.
        if len(batch) == desired_fps:
            inputs = image_processor(images=batch, return_tensors="pt").to("cuda")

            with torch.no_grad():
                outputs = model(**inputs)
            boxes = image_processor.post_process_object_detection(
                outputs,
                # (height, width) for every image in the batch.
                target_sizes=torch.tensor([batch[0].shape[:2]] * len(batch)),
                threshold=conf_threshold,
            )

            for array, box in zip(batch, boxes):
                pil_image = draw_bounding_boxes(Image.fromarray(array), box, model, conf_threshold)
                frame = np.array(pil_image)
                # Convert RGB back to BGR for OpenCV.
                frame = frame[:, :, ::-1].copy()
                segment_file.write(frame)

            batch = []
            segment_file.release()
            n_frames = 0
            n_chunks += 1
            # Hand the finished segment to Gradio, then start the next one.
            yield name
            name = str(current_dir / f"output_{n_chunks}.ts")
            segment_file = cv2.VideoWriter(name, video_codec, desired_fps, (width, height))  # type: ignore

        iterating, frame = cap.read()
        n_frames += 1

    segment_file.release()
    yield name
css=""".my-group {max-width: 600px !important; max-height: 600 !important;}
.my-column {display: flex !important; justify-content: center !important; align-items: center !important};"""
with gr.Blocks(css=css) as app:
    gr.HTML(
        """
        <h1 style='text-align: center'>
        Video Object Detection with RT-DETR
        </h1>
        """
    )
    gr.HTML(
        """
        <h3 style='text-align: center'>
        <a href='https://arxiv.org/abs/2304.08069' target='_blank'>arXiv</a> | <a href='https://huggingface.co/PekingU/rtdetr_r101vd_coco_o365' target='_blank'>model</a>
        </h3>
        """
    )
    with gr.Column(elem_classes=["my-column"]):
        with gr.Group(elem_classes=["my-group"]):
            # streaming=True lets the component play the yielded .ts segments
            # as they arrive; autoplay starts playback immediately.
            video = gr.Video(label="Video Source", streaming=True, autoplay=True)
            conf_threshold = gr.Slider(
                label="Confidence Threshold",
                minimum=0.0,
                maximum=1.0,
                step=0.05,
                value=0.30,
            )

    video.upload(
        fn=stream_object_detection,
        inputs=[video, conf_threshold],
        outputs=[video],
    )
if __name__ == '__main__':
    app.launch()
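
# For reference: draw_boxes.py is a separate helper file in this Space and is
# not shown on this page. A minimal sketch of what draw_bounding_boxes is
# assumed to do with the dicts returned by post_process_object_detection
# (names and styling here are illustrative, not the Space's actual code):
#
#     from PIL import ImageDraw
#
#     def draw_bounding_boxes(image, results, model, threshold=0.3):
#         draw = ImageDraw.Draw(image)
#         for score, label_id, box in zip(
#             results["scores"], results["labels"], results["boxes"]
#         ):
#             if score.item() < threshold:
#                 continue
#             x_min, y_min, x_max, y_max = box.tolist()
#             label = model.config.id2label[label_id.item()]
#             draw.rectangle((x_min, y_min, x_max, y_max), outline="red", width=3)
#             draw.text((x_min, y_min - 10), f"{label}: {score.item():.2f}", fill="red")
#         return image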