Spaces:

itsindrabudhik
/

cvfinalproject24251

Sleeping

App Files Files Community

cvfinalproject24251 / app.py

itsindrabudhik

Update app.py

e7e7f2a verified 7 months ago

raw

history blame contribute delete

10.4 kB

	# from dataclasses import dataclass, replace
	# from functools import reduce
	from io import BytesIO
	import math
	import os
	from pprint import pprint
	import tempfile

	from PIL import Image, ImageDraw, ImageFont
	import numpy as np
	import cv2

	# import seaborn as sns
	# import matplotlib.pyplot as plt
	# %matplotlib inline

	import torch
	from torch.utils.data import Dataset
	import torchvision
	from torchvision import transforms

	import roboflow
	from roboflow import Roboflow
	import supervision as sv
	import albumentations as A

	import gradio as gr
	import requests

	# from torchmetrics.detection.mean_ap import MeanAveragePrecision
	# from torchmetrics.detection.iou import IntersectionOverUnion
	# import evaluate
	#from datasets import load_metric

	from transformers import pipeline
	from transformers import (
	AutoProcessor,
	AutoImageProcessor,
	AutoModel,
	AutoModelForObjectDetection,
	RTDetrForObjectDetection,
	RTDetrImageProcessor,
	TrainingArguments,
	Trainer
	)
	from huggingface_hub import hf_hub_download

	from safetensors.torch import load_file

	#@title Utilities
	# Class table: numeric class id -> drawing colour tuple and display name.
	PALETTE = {
	    0: {"color": (255, 0, 0), "name": "Ambulance"},
	    1: {"color": (0, 191, 0), "name": "Firetruck"},
	    2: {"color": (0, 0, 255), "name": "Police"},
	    3: {"color": (255, 0, 255), "name": "Non-EV"},
	}

	# Name -> id and id -> name lookups, both derived from PALETTE.
	label2id = {entry["name"]: class_id for class_id, entry in PALETTE.items()}
	id2label = {class_id: class_name for class_name, class_id in label2id.items()}

	# Echo both mappings when the module is loaded.
	print(label2id)
	print(id2label)

	def unnormalize_bbox(img_h, img_w, bbox):
	    """Scale a centre-format box by the image size and return integer corners.

	    Args:
	        img_h: Image height; multiplies the vertical coordinates.
	        img_w: Image width; multiplies the horizontal coordinates.
	        bbox: Indexable whose first four items are read as
	            (centre x, centre y, width, height).

	    Returns:
	        Tuple ``(x_min, y_min, x_max, y_max)``; every value is passed
	        through ``int()`` after scaling.
	    """
	    center_x, center_y = bbox[0], bbox[1]
	    half_w, half_h = bbox[2]/2, bbox[3]/2

	    scaled_corners = (
	        (center_x - half_w) * img_w,
	        (center_y - half_h) * img_h,
	        (center_x + half_w) * img_w,
	        (center_y + half_h) * img_h,
	    )
	    return tuple(int(value) for value in scaled_corners)

	def paint_bbox(
	    image,
	    annotations,
	    normalize_labels=True,
	    normalize_bbox=True,
	):
	    """Draw labelled bounding boxes on a copy of ``image`` using OpenCV.

	    Args:
	        image: Image object exposing ``.copy()`` and ``.shape`` (read as H, W, ...).
	        annotations: Mapping with "boxes", "labels" and "scores" entries, each
	            providing ``.tolist()``.
	        normalize_labels: When true, 1 is subtracted from each label before it
	            is looked up in ``PALETTE``.
	        normalize_bbox: When true, each box is converted with
	            ``unnormalize_bbox`` (and the resulting corners are printed);
	            otherwise the four box values are cast to ``int`` and used as
	            corner coordinates directly.

	    Returns:
	        The copy of ``image`` that the boxes and captions were drawn on.
	    """
	    boxes = annotations["boxes"].tolist()
	    labels = annotations["labels"].tolist()
	    scores = annotations["scores"].tolist()

	    canvas = image.copy()
	    for raw_box, raw_label, score in zip(boxes, labels, scores):
	        class_idx = raw_label - 1 if normalize_labels else raw_label

	        if not normalize_bbox:
	            left, top, right, bottom = [int(coord) for coord in raw_box]
	        else:
	            height, width = image.shape[0], image.shape[1]  # H, W, C
	            left, top, right, bottom = unnormalize_bbox(height, width, raw_box)
	            print([left, top, right, bottom])

	        palette_entry = PALETTE[class_idx]
	        colour = palette_entry["color"]
	        caption = palette_entry["name"]

	        # A score of -1 means "no confidence to show".
	        if score != -1:
	            caption = f"{caption} ({score:.2f})"

	        top_left = (left, top)

	        # Box outline.
	        cv2.rectangle(canvas,
	                      top_left,
	                      (right, bottom),
	                      color=colour,
	                      thickness=2)

	        # Filled tag behind the caption; its width grows with the caption length.
	        tag_corner = (left + 5 + len(caption)*10, top + 17)
	        cv2.rectangle(canvas,
	                      top_left,
	                      tag_corner,
	                      color=colour,
	                      thickness=-1)

	        # White caption text on top of the tag.
	        cv2.putText(canvas,
	                    caption,
	                    (left + 2, top + 12),
	                    fontFace=cv2.FONT_HERSHEY_SIMPLEX,
	                    fontScale=0.5,
	                    color=(255, 255, 255),
	                    thickness=1)
	    return canvas

	# Function to calculate Intersection over Union (IoU)
	def calculate_iou(truth_bbx, pred_bbx):
	    """Return the Intersection over Union of two corner-format boxes.

	    Args:
	        truth_bbx: First box as ``[xmin, ymin, xmax, ymax]``.
	        pred_bbx: Second box as ``[xmin, ymin, xmax, ymax]``.

	    Returns:
	        Intersection area divided by union area, or ``0`` when the union
	        area is zero.
	    """
	    t_left, t_top, t_right, t_bottom = truth_bbx
	    p_left, p_top, p_right, p_bottom = pred_bbx

	    # Overlap extents are clamped at zero so disjoint boxes give no area.
	    overlap_w = max(0, min(t_right, p_right) - max(t_left, p_left))
	    overlap_h = max(0, min(t_bottom, p_bottom) - max(t_top, p_top))
	    overlap_area = overlap_w * overlap_h

	    truth_area = (t_right - t_left) * (t_bottom - t_top)
	    pred_area = (p_right - p_left) * (p_bottom - p_top)
	    combined_area = truth_area + pred_area - overlap_area

	    if combined_area == 0:
	        return 0
	    return overlap_area / combined_area

	# Example: emotion_classifier = pipeline("image-classification", model="itsindrabudhik/emotion_classification")
	# (Load only once)
	# Hub repository that provides both the detection pipeline and the raw checkpoint.
	# TODO: swap in our own trained model once it has been uploaded to the Hugging Face Hub.
	_MODEL_REPO_ID = "itsindrabudhik/finalProjectCV2425"

	# Built once at import time and reused for every request.
	DETECTOR = pipeline("object-detection", model=_MODEL_REPO_ID)

	# Local path of the downloaded checkpoint file.
	tensor_file = hf_hub_download(
	    repo_id=_MODEL_REPO_ID,
	    filename="model.safetensors",
	)

	# Assign the classification head weights manually, since the pipeline does not seem to handle them
	# weights = load_file(tensor_file)
	# DETECTOR.model.class_labels_classifier.weight.data = weights["class_labels_classifier.weight"]
	# DETECTOR.model.class_labels_classifier.bias.data = weights["class_labels_classifier.bias"]
	# del weights

	def detect_ev_nev(image, confidence_threshold=0.5, iou_threshold=0.5):
	    """Detect objects in one image and draw the surviving detections on it.

	    Detections below ``confidence_threshold`` are dropped. The rest are
	    visited from highest to lowest score, and a detection is suppressed when
	    its IoU with an already-kept detection exceeds ``iou_threshold``. Of two
	    overlapping detections the higher-scoring one therefore survives.

	    Args:
	        image: An http(s) URL, a local file path, or a PIL image.
	        confidence_threshold: Minimum score a detection needs to be kept.
	        iou_threshold: Maximum IoU allowed between two kept detections.

	    Returns:
	        Tuple ``(annotated_image, details_text)``. ``details_text`` has one
	        line per kept detection, ordered by descending score, and is an
	        empty string when nothing is kept.

	    Raises:
	        requests.HTTPError: If a URL input answers with an error status.
	    """
	    # Load the image first so the detector and the drawing code work on the
	    # same pixels and a URL is downloaded only once.
	    if isinstance(image, str):
	        if image.startswith("http"):
	            response = requests.get(image, timeout=30)
	            response.raise_for_status()
	            img = Image.open(BytesIO(response.content))
	        else:
	            img = Image.open(image)
	    else:
	        # Draw on a copy so the caller's image object is left untouched.
	        img = image.copy()

	    results = DETECTOR(img)

	    # The font is looked up inside the installed cv2 package and may be
	    # missing there; fall back to Pillow's built-in font instead of failing.
	    font_path = os.path.join(cv2.__path__[0], 'qt', 'fonts', 'DejaVuSans.ttf')
	    try:
	        font = ImageFont.truetype(font_path, size=32)
	    except OSError:
	        font = ImageFont.load_default()

	    # Greedy suppression: best score first, drop anything overlapping a keeper.
	    confident = sorted(
	        (res for res in results if res['score'] >= confidence_threshold),
	        key=lambda res: res['score'],
	        reverse=True,
	    )
	    kept = []  # (result, [xmin, ymin, xmax, ymax]) pairs
	    for candidate in confident:
	        box = candidate['box']
	        corners = [box['xmin'], box['ymin'], box['xmax'], box['ymax']]
	        overlaps_keeper = any(
	            calculate_iou(corners, kept_corners) > iou_threshold
	            for _, kept_corners in kept
	        )
	        if not overlaps_keeper:
	            kept.append((candidate, corners))

	    draw = ImageDraw.Draw(img)
	    details = []  # Collect details for text output
	    for result, (xmin, ymin, xmax, ymax) in kept:
	        label = result['label']
	        score = result['score']

	        draw.rectangle([xmin, ymin, xmax, ymax], outline="red", width=3)

	        text = f"{label} ({score:.2f})"
	        text_bbox = draw.textbbox((xmin, ymin - 10), text, font=font)
	        text_height = text_bbox[3] - text_bbox[1]

	        # Clamp at the top edge so captions of boxes near y=0 stay visible.
	        text_y = max(0, ymin - text_height - 5)
	        draw.text((xmin, text_y), text, fill="red", font=font)

	        details.append({
	            "Label": label,
	            "Confidence": f"{score:.2f}",
	            "Bounding Box": f"({xmin}, {ymin}, {xmax}, {ymax})",
	        })

	    details_text = "\n".join(
	        f"Label: {d['Label']}, Confidence: {d['Confidence']}, Box: {d['Bounding Box']}"
	        for d in details
	    )
	    return img, details_text

	def detect_video(video, confidence_threshold=0.5, iou_threshold=0.5):
	    """Run detection on every frame of a video and write an annotated copy.

	    Args:
	        video: Source handed to ``cv2.VideoCapture`` (e.g. a file path).
	        confidence_threshold: Forwarded to ``detect_ev_nev``.
	        iou_threshold: Forwarded to ``detect_ev_nev``.

	    Returns:
	        Tuple ``(output_path, summary)``. ``output_path`` is a ``.mp4``
	        temporary file that is not deleted automatically. ``summary`` starts
	        with the total and detected frame counts, followed by the detection
	        details of each frame.

	    Raises:
	        ValueError: If the video cannot be opened.
	    """
	    video_capture = cv2.VideoCapture(video)
	    if not video_capture.isOpened():
	        video_capture.release()
	        raise ValueError("Could not open the video. Please upload a valid video file.")

	    out = None
	    details = []
	    total_frames = 0
	    detected_frames = 0

	    try:
	        fps = video_capture.get(cv2.CAP_PROP_FPS)
	        if not fps > 0:
	            # The container did not report a usable frame rate; use a
	            # conventional default rather than pass 0 to the writer.
	            fps = 30.0
	        frame_width = int(video_capture.get(cv2.CAP_PROP_FRAME_WIDTH))
	        frame_height = int(video_capture.get(cv2.CAP_PROP_FRAME_HEIGHT))

	        # Only the name is needed: close the handle right away so it is not
	        # leaked and the writer can open the file itself.
	        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as temp_output:
	            output_path = temp_output.name

	        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
	        out = cv2.VideoWriter(output_path, fourcc, fps, (frame_width, frame_height))

	        while True:
	            ret, frame = video_capture.read()
	            if not ret:
	                break

	            total_frames += 1
	            # OpenCV frames are BGR; convert to RGB for PIL and back for writing.
	            image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
	            annotated_image, frame_details = detect_ev_nev(image, confidence_threshold, iou_threshold)

	            # Count frames with detections
	            if frame_details.strip():  # Non-empty details indicate detections
	                detected_frames += 1

	            details.append(frame_details)
	            annotated_frame = cv2.cvtColor(np.array(annotated_image), cv2.COLOR_RGB2BGR)
	            out.write(annotated_frame)
	    finally:
	        # Release both handles even when a frame fails mid-way.
	        video_capture.release()
	        if out is not None:
	            out.release()

	    details_text = "\n".join(details)
	    summary = f"Total Frames: {total_frames}, Frames with Detections: {detected_frames}\n" + details_text
	    return output_path, summary

	def detect(file, confidence_threshold=0.5, iou_threshold=0.5):
	    """Route an uploaded file to image or video detection by its extension.

	    Args:
	        file: The upload, given as a path string, an ``os.PathLike``, or an
	            object whose ``.name`` attribute holds the file path.
	        confidence_threshold: Forwarded to the image/video detector.
	        iou_threshold: Forwarded to the image/video detector.

	    Returns:
	        ``(annotated_image, None, details)`` for images, or
	        ``(None, processed_video_path, details)`` for videos.

	    Raises:
	        ValueError: If no file is given or its extension is not supported.
	    """
	    if file is None:
	        raise ValueError("No file provided. Please upload an image or video.")

	    # The detectors need a path, not the upload wrapper, so resolve it here.
	    # PathLike is handled explicitly because ``Path.name`` is only the basename.
	    if isinstance(file, (str, os.PathLike)):
	        file_path = os.fspath(file)
	    else:
	        file_path = file.name

	    # Determine if input is an image or video
	    file_ext = file_path.rsplit(".", 1)[-1].lower()
	    if file_ext in ("png", "jpg", "jpeg"):
	        # Image processing
	        annotated_image, details = detect_ev_nev(file_path, confidence_threshold, iou_threshold)
	        return annotated_image, None, details
	    if file_ext in ("mp4", "avi", "mov"):
	        # Video processing
	        processed_video, details = detect_video(file_path, confidence_threshold, iou_threshold)
	        return None, processed_video, details
	    raise ValueError("Unsupported file format. Please upload an image or video.")


	# Input widgets: one upload plus the two thresholds passed to ``detect``.
	_upload_input = gr.File(
	    label="Upload Image or Video",
	    file_types=[".png", ".jpg", ".jpeg", ".mp4", ".avi", ".mov"],
	)
	_confidence_slider = gr.Slider(0, 1, value=0.5, label="Confidence Threshold")
	_iou_slider = gr.Slider(0, 1, value=0.5, label="IoU Threshold")

	# Output widgets, in the order of the 3-tuple returned by ``detect``.
	_image_output = gr.Image(label="Processed Image")
	_video_output = gr.Video(label="Generated Video")
	_details_output = gr.Text(label="Detection Details")

	interface = gr.Interface(
	    fn=detect,
	    inputs=[_upload_input, _confidence_slider, _iou_slider],
	    outputs=[_image_output, _video_output, _details_output],
	    title="RT-DETR Object Detection for Images and Videos",
	    description="Upload an image or video to detect objects using the fine-tuned RT-DETR model. Results include the annotated image/video and detection details.",
	)

	# Start the web app as soon as the module runs.
	interface.launch(debug=True)