import gradio as gr
from PIL import Image, ImageDraw, ImageFont
from transformers import pipeline
import cv2
import numpy as np
import tempfile
import os

# Build the DETR (ResNet-50) detection pipeline once at import time; the
# frame-processing code below reuses this single module-level instance
object_detector = pipeline(
    task="object-detection",
    model="facebook/detr-resnet-50",
)

def draw_bounding_boxes(frame, detections):
    """
    Overlay detection results on a single video frame.

    Args:
        frame: Frame as a numpy array in OpenCV's BGR channel order.
        detections: Iterable of dicts, each with a 'box' mapping
            ('xmin', 'ymin', 'xmax', 'ymax'), a 'label' and a 'score'.

    Returns:
        A new BGR numpy array with a red outline and a
        "<label> <score>" caption drawn for every detection.
    """
    # PIL does the drawing, and it works on RGB images
    canvas = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    pen = ImageDraw.Draw(canvas)

    # PIL's built-in default font is used for all captions
    caption_font = ImageFont.load_default()

    for item in detections:
        corners = item['box']
        left = int(corners['xmin'])
        top = int(corners['ymin'])
        right = int(corners['xmax'])
        bottom = int(corners['ymax'])

        # Outline of the detected object
        pen.rectangle([(left, top), (right, bottom)], outline="red", width=3)

        # Caption is the class label followed by the score to two decimals
        caption = "{} {:.2f}".format(item['label'], item['score'])

        # Fill the caption's extent first so the white text stays readable
        bg_left, bg_top, bg_right, bg_bottom = pen.textbbox(
            (left, top), caption, font=caption_font
        )
        pen.rectangle([(bg_left, bg_top), (bg_right, bg_bottom)], fill="red")
        pen.text((left, top), caption, fill="white", font=caption_font)

    # Hand the result back in OpenCV's BGR layout
    return cv2.cvtColor(np.array(canvas), cv2.COLOR_RGB2BGR)

def process_video(video_path, progress=gr.Progress()):
    """
    Run object detection on the frames of a video and write an annotated copy.

    Args:
        video_path: Path of the input video; opened with cv2.VideoCapture.
        progress: Gradio progress tracker. It is called once before the
            first frame and once after every written frame.

    Returns:
        Path of the annotated MP4 file. Each call creates its own uniquely
        named file in the system temp directory.

    Raises:
        gr.Error: If the input cannot be opened, the writer cannot be
            created, the output ends up missing or empty, or any other
            step fails. The original exception is attached as the cause.
    """
    cap = None
    out = None
    output_path = None
    try:
        try:
            # Open the video file
            cap = cv2.VideoCapture(video_path)
            if not cap.isOpened():
                raise ValueError("Could not open video file")

            # Keep the frame rate as a float: truncating e.g. 29.97 to 29
            # changes the playback speed of the output. `not fps > 0` also
            # catches NaN, which some containers report.
            fps = cap.get(cv2.CAP_PROP_FPS)
            if not fps > 0:
                fps = 30.0
            frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

            # A unique file per call, so concurrent requests do not
            # overwrite each other's output
            fd, output_path = tempfile.mkstemp(prefix='output_video_', suffix='.mp4')
            os.close(fd)

            # Initialize video writer with H264 codec
            fourcc = cv2.VideoWriter_fourcc(*'avc1')
            out = cv2.VideoWriter(output_path, fourcc, fps, (frame_width, frame_height))
            if not out.isOpened():
                raise ValueError("Could not create output video file")

            frame_count = 0
            process_every_n_frames = 1  # Process every frame

            progress(0, desc="Processing video...")

            while True:
                ret, frame = cap.read()
                if not ret:
                    break

                frame_count += 1

                if frame_count % process_every_n_frames == 0:
                    # The pipeline expects a PIL image (or path/URL), not a
                    # raw numpy array, so convert the BGR frame accordingly
                    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    detections = object_detector(Image.fromarray(frame_rgb))

                    # Draw bounding boxes
                    frame = draw_bounding_boxes(frame, detections)

                # Write the frame
                out.write(frame)

                if total_frames > 0:
                    # The reported frame count can be an underestimate, so
                    # clamp the fraction to 1.0
                    progress(
                        min(frame_count / total_frames, 1.0),
                        desc=f"Processing frame {frame_count}/{total_frames}",
                    )
                else:
                    # Total unknown: report steps completed without a total
                    progress((frame_count, None), desc=f"Processing frame {frame_count}")
        finally:
            # Release the handles on every path; releasing the writer is
            # also what finalizes the output file
            if out is not None:
                out.release()
            if cap is not None:
                cap.release()

        # Verify the output file exists and has size
        if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
            raise ValueError("Output video file is empty or was not created")

        return output_path

    except Exception as e:
        # Best-effort removal of the partial output; a failure here must
        # not mask the original error
        if output_path is not None:
            try:
                os.remove(output_path)
            except OSError:
                pass
        print(f"Error processing video: {str(e)}")
        raise gr.Error(f"Error processing video: {str(e)}") from e

def detect_objects_in_video(video, progress=gr.Progress()):
    """
    Gradio interface function for video object detection.

    Args:
        video: Path of the uploaded video, or None when nothing was uploaded.
        progress: Gradio progress tracker. Declaring it on the event handler
            is how Gradio supplies a tracker for the current request; it is
            forwarded to process_video.

    Returns:
        Path of the processed video, as returned by process_video.

    Raises:
        gr.Error: If no video was provided or processing failed.
    """
    if video is None:
        raise gr.Error("Please upload a video file")

    try:
        # Process the video
        return process_video(video, progress)
    except gr.Error:
        # Already a user-facing message; wrapping it again would only
        # stack a second prefix in front of it
        raise
    except Exception as e:
        raise gr.Error(f"Error during video processing: {str(e)}") from e

# Assemble the web UI: one uploaded video in, one annotated video out
demo = gr.Interface(
    title="@GenAILearniverse Project: Video Object Detection",
    description="""
    Upload a video to detect and track objects within it. 
    The application will process the video and draw bounding boxes around detected objects 
    with their labels and confidence scores.
    Note: Processing may take some time depending on the video length.
    """,
    fn=detect_objects_in_video,
    inputs=[gr.Video(label="Upload Video", format="mp4")],
    outputs=[gr.Video(label="Processed Video", format="mp4")],
    # No bundled example videos, so example caching is switched off
    examples=[],
    cache_examples=False,
)

# Launch the app only when this file is run as a script, not on import
if __name__ == "__main__":
    demo.launch()