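"""Gradio demo: run a custom YOLO model on an uploaded video, track detections with
ByteTrack, overlay per-class counts and labelled boxes, and return the annotated video."""
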
import os

# ByteTrack is not published on PyPI, so install it at runtime before importing yolox.
os.system("pip install git+https://github.com/ifzhang/ByteTrack")

from dataclasses import dataclass
from typing import List, Optional

import numpy as np
import gradio as gr
from tqdm import tqdm

from ultralytics import YOLO
from onemetric.cv.utils.iou import box_iou_batch
from supervision import (
    BoxAnnotator,
    Color,
    Detections,
    Point,
    VideoInfo,
    VideoSink,
    draw_text,
    get_video_frames_generator,
)
from yolox.tracker.byte_tracker import BYTETracker, STrack


MODEL = "./best.pt"               # trained YOLO weights
SOURCE_VIDEO_PATH = "./examples"  # folder of example videos shown in the Gradio demo
TARGET_VIDEO_PATH = "test.mp4"    # where the annotated output video is written
CLASS_ID = [0, 1, 2, 3]           # class ids to keep

model = YOLO(MODEL)
model.fuse()

# map class ids to human-readable names for the on-screen labels
classes = model.model.names

@dataclass(frozen=True)
class BYTETrackerArgs:
    """Configuration passed to BYTETracker: detection and matching thresholds,
    plus how many frames a lost track is kept alive."""
    track_thresh: float = 0.25
    track_buffer: int = 30
    match_thresh: float = 0.8
    aspect_ratio_thresh: float = 3.0
    min_box_area: float = 1.0
    mot20: bool = False


# converts Detections into the (x1, y1, x2, y2, confidence) array expected by BYTETracker.update
def detections2boxes(detections: Detections) -> np.ndarray:
    return np.hstack((
        detections.xyxy,
        detections.confidence[:, np.newaxis]
    ))


# converts List[STrack] into format that can be consumed by match_detections_with_tracks function
def tracks2boxes(tracks: List[STrack]) -> np.ndarray:
    return np.array([
        track.tlbr
        for track
        in tracks
    ], dtype=float)


# matches detections with ByteTrack tracks by IoU and returns one tracker id (or None) per detection
def match_detections_with_tracks(
    detections: Detections,
    tracks: List[STrack],
) -> List[Optional[int]]:
    tracker_ids: List[Optional[int]] = [None] * len(detections)
    if not np.any(detections.xyxy) or len(tracks) == 0:
        return tracker_ids

    tracks_boxes = tracks2boxes(tracks=tracks)
    iou = box_iou_batch(tracks_boxes, detections.xyxy)
    track2detection = np.argmax(iou, axis=1)

    for tracker_index, detection_index in enumerate(track2detection):
        if iou[tracker_index, detection_index] != 0:
            tracker_ids[detection_index] = tracks[tracker_index].track_id

    return tracker_ids

def ObjectDetection(video_path):
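    """Run YOLO detection and ByteTrack tracking on video_path, write an annotated
    copy to TARGET_VIDEO_PATH, and return that path so Gradio can display it."""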
    byte_tracker = BYTETracker(BYTETrackerArgs())
    video_info = VideoInfo.from_video_path(video_path)
    generator = get_video_frames_generator(video_path)
    box_annotator = BoxAnnotator(thickness=5, text_thickness=5, text_scale=1)

    with VideoSink(TARGET_VIDEO_PATH, video_info) as sink:
        # loop over video frames
        for frame in tqdm(generator, total=video_info.total_frames):
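            # run the detector on the current frame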
            results = model(frame)
            detections = Detections(
                xyxy=results[0].boxes.xyxy.cpu().numpy(),
                confidence=results[0].boxes.conf.cpu().numpy(),
                class_id=results[0].boxes.cls.cpu().numpy().astype(int)
            )
            # filtering out detections with unwanted classes
            detections = detections[np.isin(detections.class_id, CLASS_ID)]
            # tracking detections
            tracks = byte_tracker.update(
                output_results=detections2boxes(detections=detections),
                img_info=frame.shape,
                img_size=frame.shape
            )
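            # attach ByteTrack ids to the matching detections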
            tracker_id = match_detections_with_tracks(detections=detections, tracks=tracks)
            detections.tracker_id = np.array(tracker_id)
            # filtering out detections without trackers
            detections = detections[np.not_equal(detections.tracker_id, None)]
            # format custom labels
            labels = [
                f"#{tracker_id} {classes[class_id]} {confidence:0.2f}"
                for _, _, confidence, class_id, tracker_id
                in detections
            ]
            # overlay the per-class detection count for the current frame on the left side
            class_ids, counts = np.unique(detections.class_id, return_counts=True)
            for class_id, count in zip(class_ids, counts):
                frame = draw_text(
                    scene=frame,
                    text=f"{classes[class_id]} : {count}",
                    text_anchor=Point(x=50, y=300 + (50 * class_id)),
                    text_scale=2,
                    text_thickness=4,
                    background_color=Color.white(),
                )
            # annotate and display frame
            frame = box_annotator.annotate(scene=frame, detections=detections, labels=labels)
            sink.write_frame(frame)

    return TARGET_VIDEO_PATH

# Gradio UI: upload a video (or pick one from the examples folder) and get the annotated result back.
demo = gr.Interface(
    fn=ObjectDetection,
    inputs=gr.Video(),
    outputs=gr.Video(),
    examples=SOURCE_VIDEO_PATH,
    cache_examples=False,
)
demo.launch()