# from dataclasses import dataclass, replace
# from functools import reduce
from io import BytesIO
import math
import os
from pprint import pprint
import tempfile

from PIL import Image, ImageDraw, ImageFont
import numpy as np
import cv2

# import seaborn as sns
# import matplotlib.pyplot as plt
# %matplotlib inline

import torch
from torch.utils.data import Dataset
import torchvision
from torchvision import transforms

import roboflow
from roboflow import Roboflow
import supervision as sv
import albumentations as A

import gradio as gr
import requests

# from torchmetrics.detection.mean_ap import MeanAveragePrecision
# from torchmetrics.detection.iou import IntersectionOverUnion
# import evaluate
# from datasets import load_metric

from transformers import pipeline
from transformers import (
    AutoProcessor,
    AutoImageProcessor,
    AutoModel,
    AutoModelForObjectDetection,
    RTDetrForObjectDetection,
    RTDetrImageProcessor,
    TrainingArguments,
    Trainer
)
from huggingface_hub import hf_hub_download

from safetensors.torch import load_file

#@title Utilities
PALETTE = {0: {"color": (255, 0, 0),
               "name": "Ambulance"},
           1: {"color": (0, 191, 0),
               "name": "Firetruck"},
           2: {"color": (0, 0, 255),
               "name": "Police"},
           3: {"color": (255, 0, 255),
               "name": "Non-EV"}}
label2id = {val["name"]: id for (id, val) in PALETTE.items()}
id2label = {id: name for (name, id) in label2id.items()}

print(label2id)
print(id2label)

def unnormalize_bbox(img_h, img_w, bbox):
  """Convert a normalized (cx, cy, w, h) box to pixel (x_min, y_min, x_max, y_max)."""
  x_min = bbox[0] - bbox[2]/2
  y_min = bbox[1] - bbox[3]/2
  x_max = bbox[0] + bbox[2]/2
  y_max = bbox[1] + bbox[3]/2

  x_min *= img_w
  y_min *= img_h
  x_max *= img_w
  y_max *= img_h
  x_min, y_min, x_max, y_max = list(map(int, [x_min, y_min, x_max, y_max]))

  return (x_min, y_min, x_max, y_max)
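
# Quick sanity check on the conversion above (hypothetical numbers): a box
# centered in a 100x200 (H x W) image, covering half of each dimension.
assert unnormalize_bbox(100, 200, (0.5, 0.5, 0.5, 0.5)) == (50, 25, 150, 75)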

def paint_bbox(
    image,
    annotations,
    normalize_labels=True,
    normalize_bbox=True,
  ):
  """Draw boxes, labels, and confidences from a detection dict onto an image array."""
  bboxes = annotations["boxes"].tolist()
  class_id = annotations["labels"].tolist()
  confidences = annotations["scores"].tolist()

  painted_img = image.copy()  # Draw on a copy; leave the input untouched
  for (bbox, label, confidence) in zip(bboxes, class_id, confidences):
    label = (label - 1) if normalize_labels else label
    if normalize_bbox:
      img_h, img_w = image.shape[0], image.shape[1]  # H, W, C
      x_min, y_min, x_max, y_max = unnormalize_bbox(img_h, img_w, bbox)
    else:
      x_min, y_min, x_max, y_max = list(map(int, bbox))

    box_color = PALETTE[label]["color"]
    label_name = PALETTE[label]["name"]

    if confidence != -1:
      label_name = f"{label_name} ({confidence:.2f})"

    # Box outline in the class color
    cv2.rectangle(painted_img,
                  (x_min, y_min),
                  (x_max, y_max),
                  color=box_color,
                  thickness=2)
    # Filled background strip for the label text (width scales with text length)
    cv2.rectangle(painted_img,
                  (x_min, y_min),
                  (x_min + 5 + len(label_name)*10, y_min + 17),
                  color=box_color,
                  thickness=-1)
    cv2.putText(painted_img,
                label_name,
                (x_min + 2, y_min + 12),
                fontFace=cv2.FONT_HERSHEY_SIMPLEX,
                fontScale=0.5,
                color=(255, 255, 255),
                thickness=1)
  return painted_img
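
# Example usage (hypothetical tensors, in the normalized cx/cy/w/h format
# this function expects when normalize_bbox=True):
# ann = {"boxes": torch.tensor([[0.5, 0.5, 0.2, 0.3]]),
#        "labels": torch.tensor([1]),  # shifted down to 0 ("Ambulance")
#        "scores": torch.tensor([0.93])}
# painted = paint_bbox(np.array(some_pil_image), ann)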

# Function to calculate Intersection over Union (IoU)
def calculate_iou(truth_bbx, pred_bbx):
    # Coordinates of the boxes: [xmin, ymin, xmax, ymax]
    x1, y1, x2, y2 = truth_bbx
    x1_p, y1_p, x2_p, y2_p = pred_bbx

    # Calculate intersection
    ixmin = max(x1, x1_p)
    iymin = max(y1, y1_p)
    ixmax = min(x2, x2_p)
    iymax = min(y2, y2_p)

    iw = max(0, ixmax - ixmin)
    ih = max(0, iymax - iymin)

    intersection = iw * ih
    area1 = (x2 - x1) * (y2 - y1)
    area2 = (x2_p - x1_p) * (y2_p - y1_p)
    union = area1 + area2 - intersection
    iou = intersection / union if union != 0 else 0
    return iou
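
# Sanity check: two unit squares overlapping by half.
# intersection = 0.5, union = 1.0 + 1.0 - 0.5 = 1.5, so IoU = 1/3.
assert math.isclose(calculate_iou([0, 0, 1, 1], [0.5, 0, 1.5, 1]), 1/3)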

# Example: emotion_classifier = pipeline("image-classification", model="itsindrabudhik/emotion_classification")
# Load the fine-tuned detector (uploaded to the Hugging Face Hub) only once
DETECTOR = pipeline("object-detection", model="itsindrabudhik/finalProjectCV2425")
tensor_file = hf_hub_download(repo_id="itsindrabudhik/finalProjectCV2425",
                              filename="model.safetensors")

# Manually assign the classification-head weights, since the pipeline did not
# seem to load them correctly (kept for reference):
# weights = load_file(tensor_file)
# DETECTOR.model.class_labels_classifier.weight.data = weights["class_labels_classifier.weight"]
# DETECTOR.model.class_labels_classifier.bias.data = weights["class_labels_classifier.bias"]
# del weights
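
# Optional sanity check (a minimal sketch): inspect the checkpoint's parameter
# names before re-enabling the manual weight assignment above. The key names
# are the ones referenced above, not verified against the file.
# weights = load_file(tensor_file)
# print([k for k in weights.keys() if "class_labels_classifier" in k])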

def detect_ev_nev(image, confidence_threshold=0.5, iou_threshold=0.5):
    # Run the detector pipeline on the image (accepts a path, URL, or PIL image)
    results = DETECTOR(image)

    # Open the image
    if isinstance(image, str):  # If the image is a URL or file path
        if image.startswith("http"):
            response = requests.get(image)
            img = Image.open(BytesIO(response.content))
        else:
            img = Image.open(image)
    else:
        img = image

    # Draw bounding boxes and labels on the image
    # (DejaVuSans ships with opencv-python's Qt plugin; convenient but fragile path)
    font_path = os.path.join(cv2.__path__[0], 'qt', 'fonts', 'DejaVuSans.ttf')
    font = ImageFont.truetype(font_path, size=32)
    draw = ImageDraw.Draw(img)

    details = []  # Collect details for text output
    for result in results:
        score = result['score']
        label = result['label']
        box = result['box']

        # Apply confidence threshold
        if score < confidence_threshold:
            continue

        # Greedy suppression: drop this detection if it overlaps a
        # higher-scoring one beyond the IoU threshold
        keep = True
        for other in results:
            if other['score'] > score:
                other_box = other['box']
                iou = calculate_iou([box['xmin'], box['ymin'], box['xmax'], box['ymax']],
                                    [other_box['xmin'], other_box['ymin'], other_box['xmax'], other_box['ymax']])
                if iou > iou_threshold:
                    keep = False
                    break

        label_color = PALETTE[label2id[label]]["color"]
        if keep:
            # Draw the bounding box in the class color
            xmin, ymin, xmax, ymax = box['xmin'], box['ymin'], box['xmax'], box['ymax']
            draw.rectangle([xmin, ymin, xmax, ymax], outline=label_color, width=3)

            text = f"{label} ({score:.2f})"

            # Measure the text so the label can sit just above the box
            text_bbox = draw.textbbox((xmin, ymin - 10), text, font=font)  # (xmin, ymin, xmax, ymax)
            text_height = text_bbox[3] - text_bbox[1]

            draw.text((xmin, ymin - text_height - 5), text, fill=label_color, font=font)

            # Add details to the list
            details.append({
                "Label": label,
                "Confidence": f"{score:.2f}",
                "Bounding Box": f"({xmin}, {ymin}, {xmax}, {ymax})"
            })
    details_text = "\n".join([f"Label: {d['Label']}, Confidence: {d['Confidence']}, Box: {d['Bounding Box']}" for d in details])
    return img, details_text
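
# Example usage (hypothetical file name; any local image path or URL works):
# annotated, report = detect_ev_nev("sample_traffic.jpg", confidence_threshold=0.6)
# annotated.save("annotated.jpg")
# print(report)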

def detect_video(video, confidence_threshold=0.5, iou_threshold=0.5):
    video_capture = cv2.VideoCapture(video)
    fps = video_capture.get(cv2.CAP_PROP_FPS)
    frame_width = int(video_capture.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(video_capture.get(cv2.CAP_PROP_FRAME_HEIGHT))

    temp_output = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(temp_output.name, fourcc, fps, (frame_width, frame_height))

    details = []
    total_frames = 0
    detected_frames = 0

    while True:
        ret, frame = video_capture.read()
        if not ret:
            break

        total_frames += 1
        image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        annotated_image, frame_details = detect_ev_nev(image, confidence_threshold, iou_threshold)

        # Count frames with detections
        if frame_details.strip():  # Non-empty details indicate detections
            detected_frames += 1

        details.append(frame_details)
        annotated_frame = cv2.cvtColor(np.array(annotated_image), cv2.COLOR_RGB2BGR)
        out.write(annotated_frame)

    video_capture.release()
    out.release()

    details_text = "\n".join(details)
    summary = f"Total Frames: {total_frames}, Frames with Detections: {detected_frames}\n" + details_text
    return temp_output.name, summary
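
# Note: every frame goes through the full pipeline, so long clips are slow.
# A common speed-up (sketch only, not wired in above) is to run detection on
# every Nth frame and write the raw frame in between, e.g. inside the loop:
# if total_frames % 5 != 0:
#     out.write(frame)
#     continue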

def detect(file, confidence_threshold=0.5, iou_threshold=0.5):
    # gr.File may hand us a tempfile wrapper (with .name) or a plain path string
    file_path = file.name if hasattr(file, "name") else file

    # Determine if the input is an image or a video from its extension
    file_ext = file_path.split(".")[-1].lower()
    if file_ext in ["png", "jpg", "jpeg"]:
        # Image processing
        annotated_image, details = detect_ev_nev(file_path, confidence_threshold, iou_threshold)
        return annotated_image, None, details
    elif file_ext in ["mp4", "avi", "mov"]:
        # Video processing
        processed_video, details = detect_video(file_path, confidence_threshold, iou_threshold)
        return None, processed_video, details
    else:
        raise ValueError("Unsupported file format. Please upload an image or video.")


interface = gr.Interface(
    fn=detect,
    inputs=[
        gr.File(label="Upload Image or Video", file_types=[".png", ".jpg", ".jpeg", ".mp4", ".avi", ".mov"]),
        gr.Slider(0, 1, value=0.5, label="Confidence Threshold"),
        gr.Slider(0, 1, value=0.5, label="IoU Threshold"),
    ],
    outputs=[
        gr.Image(label="Processed Image"),
        gr.Video(label="Generated Video"),
        gr.Text(label="Detection Details"),
    ],
    title="RT-DETR Object Detection for Images and Videos",
    description="Upload an image or video to detect objects using the fine-tuned RT-DETR model. Results include the annotated image/video and detection details."
)
interface.launch(debug=True)