|
import os
|
|
import time
|
|
|
|
import gradio as gr
|
|
import numpy as np
|
|
import requests
|
|
import spaces
|
|
import supervision as sv
|
|
import torch
|
|
from PIL import Image
|
|
from tqdm import tqdm
|
|
|
|
from transformers import AutoModelForObjectDetection, AutoProcessor
|
|
|
|
# Use CUDA when torch reports it as available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hub checkpoint shared by the pre-processor and the detector.
_CHECKPOINT = "PekingU/rtdetr_r50vd_coco_o365"

processor = AutoProcessor.from_pretrained(_CHECKPOINT)

# The weights are loaded in half precision, so inputs fed to the model are
# cast to float16 as well before inference.
model = AutoModelForObjectDetection.from_pretrained(
    _CHECKPOINT,
    disable_custom_kernels=False,
    torch_dtype=torch.float16,
)
model = model.to(device)

# Compiled wrapper used for inference; the plain `model` is kept around for
# configuration lookups such as `model.config.id2label`.
model_compiled = torch.compile(model, mode="reduce-overhead")
|
|
|
|
|
|
@spaces.GPU
def init_compiled_model():
    """Warm up ``model_compiled`` with ten forward passes on the sample batch.

    Reads the module-level ``inputs`` batch, so it may only be called after
    that global has been created. Prints the total wall-clock time taken.
    """
    print("Compiling model...")
    began = time.time()
    with torch.no_grad():
        for _ in range(10):
            warmup_outputs = model_compiled(**inputs)
            # Copy the first output to the host on every pass (presumably to
            # make sure the forward pass has actually finished).
            _ = warmup_outputs[0].cpu()
    elapsed = time.time() - began
    print(f"Model compiled in {elapsed:.2f} seconds.")
|
|
|
|
|
|
# Sample COCO image, used only to build a warm-up batch for the compiled model.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"

# The timeout keeps start-up from hanging on a stalled connection, and
# raise_for_status() reports HTTP failures directly instead of letting PIL
# fail on an error page. The response is closed once the image is decoded.
with requests.get(url, stream=True, timeout=30) as _response:
    _response.raise_for_status()
    image = Image.open(_response.raw)
    image.load()  # decode fully while the connection is still open

# Same device/dtype cast that `query` applies to every video frame.
inputs = processor(images=image, return_tensors="pt").to(device, torch.float16)
init_compiled_model()

# Shared annotators and tracker, created once and reused for every frame.
BOUNDING_BOX_ANNOTATOR = sv.BoundingBoxAnnotator()
MASK_ANNOTATOR = sv.MaskAnnotator()
LABEL_ANNOTATOR = sv.LabelAnnotator()
TRACKER = sv.ByteTrack()
|
|
|
|
|
|
def calculate_end_frame_index(source_video_path, max_seconds=5):
    """Return how many leading frames of a video should be processed.

    Args:
        source_video_path: Path of the video file to inspect.
        max_seconds: Longest clip duration to process, in seconds. Defaults
            to 5, the cap previously hard-coded here.

    Returns:
        The video's total frame count, capped at ``fps * max_seconds``.
    """
    video_info = sv.VideoInfo.from_video_path(source_video_path)
    # int() keeps the result usable as a frame index even when a fractional
    # number of seconds is requested.
    frame_cap = int(video_info.fps * max_seconds)
    return min(video_info.total_frames, frame_cap)
|
|
|
|
|
|
def annotate_image(input_image, detections, labels) -> np.ndarray:
    """Draw masks, bounding boxes and labels for ``detections`` on an image.

    The module-level annotators are applied in sequence (mask, then box,
    then label), each one receiving the previous annotator's output.

    Args:
        input_image: Image to draw on.
        detections: Detections passed to each annotator.
        labels: Label texts forwarded to the label annotator.

    Returns:
        The image returned by the final (label) annotator.
    """
    annotated = MASK_ANNOTATOR.annotate(input_image, detections)
    annotated = BOUNDING_BOX_ANNOTATOR.annotate(annotated, detections)
    return LABEL_ANNOTATOR.annotate(annotated, detections, labels=labels)
|
|
|
|
|
|
@spaces.GPU
def process_video(
    input_video,
    confidence_threshold,
    progress=gr.Progress(track_tqdm=True),
):
    """Detect, track and annotate objects in the leading frames of a video.

    Args:
        input_video: Path of the uploaded video file.
        confidence_threshold: Minimum detection score, forwarded to ``query``.
        progress: Gradio progress tracker. It is not used directly; it is
            declared with ``track_tqdm=True`` for the tqdm loop below.

    Returns:
        A tuple ``(result_file_path, markdown)``: the path of the annotated
        video and a visible ``gr.Markdown`` reporting the mean model FPS.

    Raises:
        gr.Error: If no video was provided.
    """
    if not input_video:
        # Without this guard a missing upload fails deep inside the video
        # reader with an error that means nothing to the user.
        raise gr.Error("Please upload a video first.")

    video_info = sv.VideoInfo.from_video_path(input_video)
    total = calculate_end_frame_index(input_video)
    frame_generator = sv.get_video_frames_generator(source_path=input_video, end=total)

    # Fixed file name in the working directory: each run overwrites the
    # result of the previous one.
    result_file_name = "output.mp4"
    result_file_path = os.path.join(os.getcwd(), result_file_name)

    # NOTE(review): TRACKER is a module-level object shared by all calls, so
    # its state carries over from one video to the next.
    all_fps = []
    with sv.VideoSink(result_file_path, video_info=video_info) as sink:
        # `total` only sizes the progress bar; the loop ends when the
        # generator runs out of frames.
        for frame in tqdm(frame_generator, total=total, desc="Processing video.."):
            results, fps = query(frame, confidence_threshold)
            all_fps.append(fps)

            detections = sv.Detections.from_transformers(results[0])
            detections = TRACKER.update_with_detections(detections)
            final_labels = [
                model.config.id2label[class_id]
                for class_id in detections.class_id.tolist()
            ]

            frame = annotate_image(
                input_image=frame,
                detections=detections,
                labels=final_labels,
            )
            sink.write_frame(frame)

    # np.mean([]) would yield NaN (with a warning) when no frame was decoded.
    avg_fps = float(np.mean(all_fps)) if all_fps else 0.0
    return result_file_path, gr.Markdown(
        f'<h3 style="text-align: center;">Model inference FPS: {avg_fps:.2f}</h3>',
        visible=True,
    )
|
|
|
|
|
|
def query(frame, confidence_threshold):
    """Run the compiled detector on a single video frame.

    Args:
        frame: Image array whose first two dimensions are (height, width).
            Assumed to be a 3-channel frame in OpenCV's BGR order, which is
            what the supervision frame generator yields.
        confidence_threshold: Minimum score a detection needs to be kept.

    Returns:
        A tuple ``(results, fps)``: the post-processed detections for the
        frame (scaled to the frame size) and the model-only throughput in
        frames per second, measured around the forward pass.
    """
    # supervision decodes video through OpenCV, which yields BGR frames,
    # while PIL and the image processor expect RGB; reverse the channel axis.
    rgb_frame = np.ascontiguousarray(frame[:, :, ::-1])
    image = Image.fromarray(rgb_frame)
    inputs = processor(images=image, return_tensors="pt").to(device, torch.float16)

    with torch.no_grad():
        # perf_counter() is monotonic and high resolution, unlike time.time().
        start = time.perf_counter()
        outputs = model_compiled(**inputs)
        # Copying an output to the host makes the timing include the full
        # forward pass rather than just the kernel launch.
        outputs[0].cpu()
        elapsed = time.perf_counter() - start

    # Guard against a zero interval so the division can never fail.
    fps = 1 / max(elapsed, 1e-9)

    target_sizes = torch.tensor([frame.shape[:2]], device=device)
    results = processor.post_process_object_detection(
        outputs=outputs,
        threshold=confidence_threshold,
        target_sizes=target_sizes,
    )
    return results, fps
|
|
|
|
|
|
# Gradio UI: input/output videos side by side, a confidence slider, a submit
# button and a few example clips.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("## Real Time Object Detection with compiled RT-DETR")
    gr.Markdown(
        """
        This is a demo for real-time object detection using RT-DETR compiled.<br>
        It runs on ZeroGPU which captures GPU every first time you infer.<br>
        This combined with video processing time means that the demo inference time is slower than the model's actual inference time.<br>
        The actual model average inference FPS is displayed under the processed video after inference.
        """
    )
    gr.Markdown(
        "Simply upload a video! You can also play with confidence threshold or try the examples below. 👇"
    )

    with gr.Row():
        with gr.Column():
            input_video = gr.Video(label="Input Video")
        with gr.Column():
            output_video = gr.Video(label="Output Video (5s max)")
            # Hidden until process_video returns a visible replacement.
            actual_fps = gr.Markdown("", visible=False)
    with gr.Row():
        conf = gr.Slider(
            label="Confidence Threshold",
            minimum=0.1,
            maximum=1.0,
            value=0.3,
            step=0.05,
        )
    with gr.Row():
        submit = gr.Button(variant="primary")

    # Each example row has exactly one value per input component
    # (video path, confidence threshold).
    example = gr.Examples(
        examples=[
            ["./football.mp4", 0.3],
            ["./cat.mp4", 0.3],
            ["./safari2.mp4", 0.3],
        ],
        inputs=[input_video, conf],
        outputs=output_video,
    )

    submit.click(
        fn=process_video,
        inputs=[input_video, conf],
        outputs=[output_video, actual_fps],
    )

if __name__ == "__main__":
    # show_error=True surfaces exceptions raised by the handlers in the UI.
    demo.launch(show_error=True)
|
|
|