import os

# Runtime installs (a common Hugging Face Spaces pattern): pull in ByteTrack
# and its build dependencies before any yolox import below.
os.system("pip install git+https://github.com/ifzhang/ByteTrack")
os.system("pip3 install cython_bbox gdown 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI'")
os.system("pip3 install -v -e .")

from dataclasses import dataclass
from typing import List, Optional

import gradio as gr
import numpy as np
import supervision
from onemetric.cv.utils.iou import box_iou_batch
from supervision import (
    BoxAnnotator,
    Color,
    Detections,
    Point,
    VideoInfo,
    VideoSink,
    draw_text,
    get_video_frames_generator,
)
from tqdm import tqdm
from ultralytics import YOLO
from yolox.tracker.byte_tracker import BYTETracker, STrack
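# Note: this script targets an early supervision API (Color.white(),
# BoxAnnotator text parameters, VideoSink, tuple-style Detections iteration);
# later supervision releases changed these calls, so a pinned supervision
# version in requirements.txt is assumed here.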

# path to the fine-tuned YOLO weights
MODEL = "./best.pt"

# file the annotated output video is written to
TARGET_VIDEO_PATH = "test.mp4"

# class ids to keep; no id-to-name mapping is provided, so the raw ids double as labels
CLASS_ID = [0, 1, 2, 3, 4, 5, 6]

video_examples = [["example.mp4"]]

model = YOLO(MODEL)
model.fuse()  # fuse Conv and BatchNorm layers for faster inference

classes = CLASS_ID

@dataclass(frozen=True)
class BYTETrackerArgs:
    track_thresh: float = 0.25        # confidence threshold for starting new tracks
    track_buffer: int = 30            # frames a lost track is kept before removal
    match_thresh: float = 0.8         # matching threshold for associating detections with tracks
    aspect_ratio_thresh: float = 3.0  # boxes with a larger aspect ratio are treated as invalid
    min_box_area: float = 1.0         # boxes smaller than this area are ignored
    mot20: bool = False               # enable MOT20-specific matching behavior


# converts Detections into the (N, 5) [x1, y1, x2, y2, confidence] array consumed by BYTETracker.update
def detections2boxes(detections: Detections) -> np.ndarray:
    return np.hstack((
        detections.xyxy,
        detections.confidence[:, np.newaxis]
    ))
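# For example (hypothetical values), two detections with
#   xyxy = [[10, 10, 50, 50], [60, 20, 90, 80]] and confidence = [0.9, 0.6]
# become
#   [[10, 10, 50, 50, 0.9],
#    [60, 20, 90, 80, 0.6]]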


# converts List[STrack] into an (M, 4) array of [x1, y1, x2, y2] boxes for IoU matching
def tracks2boxes(tracks: List[STrack]) -> np.ndarray:
    return np.array([track.tlbr for track in tracks], dtype=float)


# matches detections with ByteTrack tracks by greedy IoU and returns one tracker id (or None) per detection
def match_detections_with_tracks(
    detections: Detections,
    tracks: List[STrack],
) -> List[Optional[int]]:
    if not np.any(detections.xyxy) or len(tracks) == 0:
        return [None] * len(detections)

    tracks_boxes = tracks2boxes(tracks=tracks)
    iou = box_iou_batch(tracks_boxes, detections.xyxy)
    # for each track, pick the detection it overlaps most
    track2detection = np.argmax(iou, axis=1)

    tracker_ids = [None] * len(detections)

    for tracker_index, detection_index in enumerate(track2detection):
        if iou[tracker_index, detection_index] != 0:
            tracker_ids[detection_index] = tracks[tracker_index].track_id

    return tracker_ids
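# For example (hypothetical values), with an IoU matrix of
#   tracks x detections = [[0.8, 0.0],
#                          [0.0, 0.0]]
# track 0 claims detection 0, while detection 1 keeps tracker id None
# and is filtered out by the caller.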

def ObjectDetection(video_path):
    byte_tracker = BYTETracker(BYTETrackerArgs())
    video_info = VideoInfo.from_video_path(video_path)
    generator = get_video_frames_generator(video_path)
    box_annotator = BoxAnnotator(thickness=5, text_thickness=5, text_scale=1)
    # counting zone: a fixed polygon in frame coordinates
    polygon = np.array([[200, 300], [200, 1420], [880, 1420], [880, 300]])
    zone = supervision.PolygonZone(polygon=polygon, frame_resolution_wh=video_info.resolution_wh)
    zone_annotator = supervision.PolygonZoneAnnotator(zone=zone, color=Color.white(), thickness=4)
    # open target video file
    with VideoSink(TARGET_VIDEO_PATH, video_info) as sink:
        # loop over video frames
        for frame in tqdm(generator, total=video_info.total_frames):
            results = model(frame)
            detections = Detections(
                xyxy=results[0].boxes.xyxy.cpu().numpy(),
                confidence=results[0].boxes.conf.cpu().numpy(),
                class_id=results[0].boxes.cls.cpu().numpy().astype(int)
            )
            # filtering out detections with unwanted classes
            detections = detections[np.isin(detections.class_id, CLASS_ID)]
            # tracking detections
            tracks = byte_tracker.update(
                output_results=detections2boxes(detections = detections),
                img_info=frame.shape,
                img_size=frame.shape
            )
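            # BYTETracker.update returns the currently active STrack objects;
            # they still have to be mapped back onto our Detections by IoU.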
            tracker_id = match_detections_with_tracks(detections=detections, tracks=tracks)
            detections.tracker_id = np.array(tracker_id)
            # filtering out detections without trackers
            detections = detections[np.not_equal(detections.tracker_id, None)]
            # keep only detections inside the polygon zone
            mask = zone.trigger(detections=detections)
            detections_filtered = detections[mask]
            # format custom labels for the detections that are actually drawn
            labels = [
                f"#{tracker_id} {classes[class_id]} {confidence:0.2f}"
                for _, _, confidence, class_id, tracker_id
                in detections_filtered
            ]
            # draw per-class counts for detections inside the zone
            class_ids, counts = np.unique(detections_filtered.class_id, return_counts=True)
            for class_id, count in zip(class_ids, counts):
                frame = draw_text(
                    scene=frame,
                    text=f"{classes[class_id]} : {count}",
                    text_anchor=Point(x=500, y=1550 + (50 * class_id)),
                    text_scale=2,
                    text_thickness=4,
                    background_color=Color.white(),
                )
            frame = box_annotator.annotate(scene=frame, detections=detections_filtered, labels=labels)
            frame = zone_annotator.annotate(scene=frame)
            sink.write_frame(frame)

    return TARGET_VIDEO_PATH

demo = gr.Interface(
    fn=ObjectDetection,
    inputs=gr.Video(),
    outputs=gr.Video(),
    examples=video_examples,
    cache_examples=False,
)
demo.queue().launch()
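# .queue() routes requests through Gradio's queue, so a long-running video job
# does not block other users while it processes.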