# from dataclasses import dataclass, replace
# from functools import reduce
from io import BytesIO
import math
import os
from pprint import pprint
import tempfile

from PIL import Image, ImageDraw, ImageFont
import numpy as np
import cv2
# import seaborn as sns
# import matplotlib.pyplot as plt
# %matplotlib inline

import torch
from torch.utils.data import Dataset
import torchvision
from torchvision import transforms

import roboflow
from roboflow import Roboflow
import supervision as sv
import albumentations as A
import gradio as gr
import requests

# from torchmetrics.detection.mean_ap import MeanAveragePrecision
# from torchmetrics.detection.iou import IntersectionOverUnion
# import evaluate
# from datasets import load_metric

from transformers import pipeline
from transformers import (
    AutoProcessor,
    AutoImageProcessor,
    AutoModel,
    AutoModelForObjectDetection,
    RTDetrForObjectDetection,
    RTDetrImageProcessor,
    TrainingArguments,
    Trainer
)
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file

#@title Utilities
# Per-class display colour and name, keyed by class id.
PALETTE = {0: {"color": (255, 0, 0), "name": "Ambulance"},
           1: {"color": (0, 191, 0), "name": "Firetruck"},
           2: {"color": (0, 0, 255), "name": "Police"},
           3: {"color": (255, 0, 255), "name": "Non-EV"}}
label2id = {val["name"]: id for (id, val) in PALETTE.items()}
id2label = {id: name for (name, id) in label2id.items()}
print(label2id)
print(id2label)


def unnormalize_bbox(img_h, img_w, bbox):
    """Convert a normalized (cx, cy, w, h) box to absolute (x_min, y_min, x_max, y_max) pixels."""
    x_min = bbox[0] - bbox[2] / 2
    y_min = bbox[1] - bbox[3] / 2
    x_max = bbox[0] + bbox[2] / 2
    y_max = bbox[1] + bbox[3] / 2

    x_min *= img_w
    y_min *= img_h
    x_max *= img_w
    y_max *= img_h

    x_min, y_min, x_max, y_max = list(map(int, [x_min, y_min, x_max, y_max]))
    return (x_min, y_min, x_max, y_max)


def paint_bbox(
    image,
    annotations,
    normalize_labels=True,
    normalize_bbox=True,
):
    """Draw class-coloured boxes and labels from an annotation dict onto a NumPy image."""
    bboxes = annotations["boxes"].tolist()
    class_id = annotations["labels"].tolist()
    confidences = annotations["scores"].tolist()

    painted_img = image.copy()  # Work on a copy so the input image is left untouched
    for (bbox, label, confidence) in zip(bboxes, class_id, confidences):
        label = (label - 1) if normalize_labels else label
        if normalize_bbox:
            img_h, img_w = image.shape[0], image.shape[1]  # H, W, C
            x_min, y_min, x_max, y_max = unnormalize_bbox(img_h, img_w, bbox)
            print([x_min, y_min, x_max, y_max])  # Debug output
        else:
            x_min, y_min, x_max, y_max = list(map(int, bbox))

        box_color = PALETTE[label]["color"]
        label_name = PALETTE[label]["name"]
        if confidence != -1:
            label_name = f"{label_name} ({confidence:.2f})"

        # Box outline, filled label background, then the label text
        cv2.rectangle(painted_img, (x_min, y_min), (x_max, y_max),
                      color=box_color, thickness=2)
        cv2.rectangle(painted_img, (x_min, y_min),
                      (x_min + 5 + len(label_name) * 10, y_min + 17),
                      color=box_color, thickness=-1)
        cv2.putText(painted_img, label_name, (x_min + 2, y_min + 12),
                    fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=0.5,
                    color=(255, 255, 255), thickness=1)

    return painted_img


# Function to calculate Intersection over Union (IoU)
def calculate_iou(truth_bbx, pred_bbx):
    # Coordinates of the boxes: [xmin, ymin, xmax, ymax]
    x1, y1, x2, y2 = truth_bbx
    x1_p, y1_p, x2_p, y2_p = pred_bbx

    # Calculate intersection
    ixmin = max(x1, x1_p)
    iymin = max(y1, y1_p)
    ixmax = min(x2, x2_p)
    iymax = min(y2, y2_p)

    iw = max(0, ixmax - ixmin)
    ih = max(0, iymax - iymin)
    intersection = iw * ih

    area1 = (x2 - x1) * (y2 - y1)
    area2 = (x2_p - x1_p) * (y2_p - y1_p)
    union = area1 + area2 - intersection

    iou = intersection / union if union != 0 else 0
    return iou
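
# Quick sanity check for calculate_iou (illustrative only; the boxes below are made-up values):
# identical boxes should give an IoU of 1.0, fully disjoint boxes should give 0.
assert calculate_iou([0, 0, 10, 10], [0, 0, 10, 10]) == 1.0
assert calculate_iou([0, 0, 10, 10], [20, 20, 30, 30]) == 0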

# Example: emotion_classifier = pipeline("image-classification", model="itsindrabudhik/emotion_classification")
# (Load only once)
# TODO: swap in our own trained model later (the trained model should be uploaded to Hugging Face)
DETECTOR = pipeline("object-detection", model="itsindrabudhik/finalProjectCV2425")
tensor_file = hf_hub_download(repo_id="itsindrabudhik/finalProjectCV2425", filename="model.safetensors")

# Assign the classification-head weights manually, since the pipeline does not seem to handle them
# weights = load_file(tensor_file)
# DETECTOR.model.class_labels_classifier.weight.data = weights["class_labels_classifier.weight"]
# DETECTOR.model.class_labels_classifier.bias.data = weights["class_labels_classifier.bias"]
# del weights


def detect_ev_nev(image, confidence_threshold=0.5, iou_threshold=0.5):
    # Run the detector pipeline on the image
    results = DETECTOR(image)

    # Open the image
    if isinstance(image, str):
        # If the image is a URL or a file path
        if image.startswith("http"):
            response = requests.get(image)
            img = Image.open(BytesIO(response.content))
        else:
            img = Image.open(image)
    else:
        img = image

    # Draw bounding boxes and labels on the image
    font_path = os.path.join(cv2.__path__[0], 'qt', 'fonts', 'DejaVuSans.ttf')
    font = ImageFont.truetype(font_path, size=32)
    draw = ImageDraw.Draw(img)
    details = []  # Collect details for the text output

    for result in results:
        score = result['score']
        label = result['label']
        box = result['box']

        # Apply confidence threshold
        if score < confidence_threshold:
            continue

        # Suppress detections that overlap a higher-scoring detection above the IoU threshold (simple NMS)
        keep = True
        for previous_result in results:
            if previous_result != result and previous_result['score'] > score:
                prev_box = previous_result['box']
                iou = calculate_iou(
                    [box['xmin'], box['ymin'], box['xmax'], box['ymax']],
                    [prev_box['xmin'], prev_box['ymin'], prev_box['xmax'], prev_box['ymax']])
                if iou > iou_threshold:
                    keep = False
                    break

        label_color = PALETTE[label2id[label]]["color"]

        if keep:
            # Draw the bounding box in the class colour
            xmin, ymin, xmax, ymax = box['xmin'], box['ymin'], box['xmax'], box['ymax']
            draw.rectangle([xmin, ymin, xmax, ymax], outline=label_color, width=3)

            # Label text with the confidence score
            text = f"{label} ({score:.2f})"

            # Calculate the text bounding box: (xmin, ymin, xmax, ymax)
            text_bbox = draw.textbbox((xmin, ymin - 10), text, font=font)
            text_width = text_bbox[2] - text_bbox[0]    # width of the text box
            text_height = text_bbox[3] - text_bbox[1]   # height of the text box

            # Draw the text just above the box
            draw.text((xmin, ymin - text_height - 5), text, fill=label_color, font=font)

            # Add details to the list
            details.append({
                "Label": label,
                "Confidence": f"{score:.2f}",
                "Bounding Box": f"({xmin}, {ymin}, {xmax}, {ymax})"
            })

    details_text = "\n".join([f"Label: {d['Label']}, Confidence: {d['Confidence']}, Box: {d['Bounding Box']}"
                              for d in details])
    return img, details_text
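
# A minimal usage sketch for detect_ev_nev on its own, outside the Gradio app.
# "test_image.jpg" is a hypothetical local file used only for illustration.
# annotated, report = detect_ev_nev("test_image.jpg", confidence_threshold=0.6, iou_threshold=0.5)
# annotated.save("test_image_annotated.jpg")
# print(report)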

def detect_video(video, confidence_threshold=0.5, iou_threshold=0.5):
    video_capture = cv2.VideoCapture(video)
    fps = video_capture.get(cv2.CAP_PROP_FPS)
    frame_width = int(video_capture.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(video_capture.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # Write annotated frames to a temporary MP4 file
    temp_output = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(temp_output.name, fourcc, fps, (frame_width, frame_height))

    details = []
    total_frames = 0
    detected_frames = 0

    while True:
        ret, frame = video_capture.read()
        if not ret:
            break

        total_frames += 1
        image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        annotated_image, frame_details = detect_ev_nev(image, confidence_threshold, iou_threshold)

        # Count frames with detections
        if frame_details.strip():  # Non-empty details indicate detections
            detected_frames += 1
            details.append(frame_details)

        annotated_frame = cv2.cvtColor(np.array(annotated_image), cv2.COLOR_RGB2BGR)
        out.write(annotated_frame)

    video_capture.release()
    out.release()

    details_text = "\n".join(details)
    summary = f"Total Frames: {total_frames}, Frames with Detections: {detected_frames}\n" + details_text
    return temp_output.name, summary


def detect(file, confidence_threshold=0.5, iou_threshold=0.5):
    # gr.File may hand back either a filepath string or a tempfile-like object with a
    # .name attribute depending on the Gradio version, so handle both here.
    file_path = file if isinstance(file, str) else file.name

    # Determine whether the input is an image or a video from its extension
    file_ext = file_path.split(".")[-1].lower()

    if file_ext in ["png", "jpg", "jpeg"]:
        # Image processing
        annotated_image, details = detect_ev_nev(file_path, confidence_threshold, iou_threshold)
        return annotated_image, None, details
    elif file_ext in ["mp4", "avi", "mov"]:
        # Video processing
        processed_video, details = detect_video(file_path, confidence_threshold, iou_threshold)
        return None, processed_video, details
    else:
        raise ValueError("Unsupported file format. Please upload an image or video.")


interface = gr.Interface(
    fn=detect,
    inputs=[
        gr.File(label="Upload Image or Video", file_types=[".png", ".jpg", ".jpeg", ".mp4", ".avi", ".mov"]),
        gr.Slider(0, 1, value=0.5, label="Confidence Threshold"),
        gr.Slider(0, 1, value=0.5, label="IoU Threshold"),
    ],
    outputs=[
        gr.Image(label="Processed Image"),
        gr.Video(label="Generated Video"),
        gr.Text(label="Detection Details")
    ],
    title="RT-DETR Object Detection for Images and Videos",
    description="Upload an image or video to detect objects using the fine-tuned RT-DETR model. Results include the annotated image/video and detection details."
)

interface.launch(debug=True)
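
# To expose the demo through a temporary public URL (useful when running in Colab),
# launch() also accepts share=True:
# interface.launch(debug=True, share=True)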