import gradio as gr
import cv2
import numpy as np
import os
from PIL import Image

# Build the YOLOv3 detector from its weights file and network config
net = cv2.dnn.readNet('yolov3.weights', 'yolov3.cfg')

# Run inference with OpenCV's own backend, targeting the CPU
net.setPreferableBackend(cv2.dnn.DNN_BACKEND_OPENCV)
net.setPreferableTarget(cv2.dnn.DNN_TARGET_CPU)

# Read the class labels, one per line; a label's position in the list is
# the class id used to index it during detection
with open('coco.names', 'r') as f:
    classes = [label.strip() for label in f]

# Names of the network's unconnected output layers; these are the layers
# requested from net.forward() when detecting
output_layers_names = net.getUnconnectedOutLayersNames()

def count_people_in_frame(frame, conf_threshold=0.5, nms_threshold=0.4):
    """
    Detects people in a given frame (image) and returns count.

    Args:
        frame: Image as a NumPy array. 3-channel input is handed to the
            network with red and blue swapped (``swapRB=True``), so it is
            treated as BGR, the order ``cv2.VideoCapture`` produces.
            Grayscale (2-D or single-channel) and 4-channel arrays are first
            converted to 3 channels.
        conf_threshold: Minimum class score a 'person' detection must exceed
            to be kept; also passed to NMS as its score threshold.
        nms_threshold: Overlap threshold passed to ``cv2.dnn.NMSBoxes``.

    Returns:
        int: Number of 'person' boxes left after non-maximum suppression.
    """
    # Normalise the channel layout: the network input must be 3-channel, and
    # grayscale / alpha images would otherwise fail further down.
    if frame.ndim == 2 or frame.shape[2] == 1:
        frame = cv2.cvtColor(frame, cv2.COLOR_GRAY2BGR)
    elif frame.shape[2] == 4:
        frame = cv2.cvtColor(frame, cv2.COLOR_BGRA2BGR)

    height, width = frame.shape[:2]

    # Without a 'person' label no detection can ever match, so the count is 0.
    if 'person' not in classes:
        return 0
    person_id = classes.index('person')

    # Convert frame to YOLO format
    blob = cv2.dnn.blobFromImage(frame, 1/255.0, (416, 416), swapRB=True, crop=False)
    net.setInput(blob)

    # Forward pass
    layer_outputs = net.forward(output_layers_names)

    # Process detections. Each row holds box centre x/y and width/height
    # (scaled below by the frame size) in columns 0-3 and the per-class
    # scores from column 5 onwards.
    boxes, confidences = [], []
    for output in layer_outputs:
        scores = output[:, 5:]
        class_ids = np.argmax(scores, axis=1)
        best_scores = scores[np.arange(scores.shape[0]), class_ids]

        # Filter all rows at once; only the few surviving rows are visited
        # in Python instead of every candidate the network emits.
        keep = (class_ids == person_id) & (best_scores > conf_threshold)
        for detection, confidence in zip(output[keep], best_scores[keep]):
            center_x, center_y = int(detection[0] * width), int(detection[1] * height)
            w, h = int(detection[2] * width), int(detection[3] * height)
            # NMSBoxes takes boxes as [top-left x, top-left y, width, height]
            x, y = int(center_x - w / 2), int(center_y - h / 2)

            boxes.append([x, y, w, h])
            confidences.append(float(confidence))

    if not boxes:
        return 0

    # Apply Non-Maximum Suppression (NMS) so overlapping boxes for the same
    # person are counted once
    indexes = cv2.dnn.NMSBoxes(boxes, confidences, conf_threshold, nms_threshold)

    return len(indexes)

def analyze_image(image):
    """
    Processes an image and detects people.

    Args:
        image: A PIL image, a NumPy array, or anything ``np.array`` accepts.
            PIL images of any mode (palette, grayscale, RGBA, ...) are
            converted to 3-channel colour first. NumPy arrays are passed on
            unchanged -- NOTE(review): they are assumed to already be in the
            BGR order the detector expects; confirm against callers.

    Returns:
        tuple: ``(image, message)`` where ``image`` is the object that was
        passed in and ``message`` is the formatted people count.
    """
    if isinstance(image, Image.Image):
        # Pillow yields RGB, while the detector swaps red/blue on the
        # assumption of BGR input; convert so colours reach the network
        # in the right order.
        rgb = np.array(image.convert("RGB"))
        image_cv = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)
    elif isinstance(image, np.ndarray):
        image_cv = image  # Already a NumPy array
    else:
        image_cv = np.array(image)  # Best-effort conversion of other array-likes

    people_count = count_people_in_frame(image_cv)
    return image, f"People in Image: {people_count}"

def analyze_video(video_file):
    """
    Processes a video and detects people in each frame.

    Args:
        video_file: Either a filesystem path (``str`` / ``os.PathLike``) or an
            upload object exposing the path as its ``.name`` attribute.

    Returns:
        str: The maximum number of people found in any single frame, or an
        error message if the file is missing or cannot be opened.
    """
    if video_file is None:
        return "Error: Video file could not be loaded."

    # Accept a bare path as well as a wrapper object carrying the path in .name
    if isinstance(video_file, (str, os.PathLike)):
        video_path = os.fspath(video_file)
    else:
        video_path = video_file.name

    if not os.path.exists(video_path):
        return "Error: Video file could not be loaded."

    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            return "Error: Unable to open video file."

        # Only the peak is reported, so track a running maximum rather than
        # keeping one count per frame.
        max_people = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # Count people in the frame
            max_people = max(max_people, count_people_in_frame(frame))
    finally:
        # Release the capture even if detection raises part-way through
        cap.release()

    return f"Max People Detected in Video: {max_people}"

def process_input(input_file):
    """
    Determines if the input is an image or a video and calls the appropriate function.

    Args:
        input_file: The uploaded file, given either as a filesystem path
            (``str`` / ``os.PathLike``) or as an object exposing the path as
            its ``.name`` attribute. ``None`` means nothing was uploaded.

    Returns:
        str: The people-count message for the image or video, or an error
        message when no file was given or the extension is not supported.
    """
    from types import SimpleNamespace

    if input_file is None:
        return "Error: No file was uploaded."

    is_plain_path = isinstance(input_file, (str, os.PathLike))
    file_path = os.fspath(input_file) if is_plain_path else input_file.name
    file_extension = os.path.splitext(file_path)[1].lower()

    if file_extension in (".jpg", ".jpeg", ".png", ".bmp"):
        # Close the image file once analysis is done instead of leaking it
        with Image.open(file_path) as image:
            # analyze_image returns (image, message); the interface has a
            # single text output, so only the message is passed on.
            _, message = analyze_image(image)
        return message
    if file_extension in (".mp4", ".avi", ".mov", ".mkv"):
        # analyze_video reads the path from a .name attribute, so wrap bare
        # paths; upload objects are forwarded untouched.
        video_arg = SimpleNamespace(name=file_path) if is_plain_path else input_file
        return analyze_video(video_arg)
    return "Error: Unsupported file format."

# Gradio UI: one upload widget feeding process_input, one text box for the result
_upload_input = gr.File(label="Upload Image or Video")  # File handles both images and videos
_result_output = gr.Textbox(label="People Counting Results")

app = gr.Interface(
    fn=process_input,
    inputs=_upload_input,
    outputs=[_result_output],
    title="YOLO People Counter (Image & Video)",
    description="Upload an image or video to detect and count people using YOLOv3."
)

# Start the web app only when this file is run as a script
if __name__ == "__main__":
    app.launch()