import cv2
import numpy as np
import time
import os
import matplotlib.pyplot as plt
import gradio as gr

# Optional audio feedback (disabled for the Gradio deployment).
# try:
#     from pygame import mixer
#     mixer_init = True
# except ModuleNotFoundError:
#     mixer = None
#     mixer_init = False

# ------------------------------------------------------------------------------
# 1. Initializations.
# ------------------------------------------------------------------------------

# Initialize counter for the number of blinks detected.
BLINK = 0

# Model file paths.
MODEL_PATH = "./model/res10_300x300_ssd_iter_140000.caffemodel"
CONFIG_PATH = "./model/deploy.prototxt"
LBF_MODEL = "./model/lbfmodel.yaml"

# Create a face detector network instance.
net = cv2.dnn.readNetFromCaffe(CONFIG_PATH, MODEL_PATH)

# Create the landmark detector instance.
landmarkDetector = cv2.face.createFacemarkLBF()
landmarkDetector.loadModel(LBF_MODEL)

# ------------------------------------------------------------------------------
# 2. Function definitions.
# ------------------------------------------------------------------------------


def detect_faces(image, detection_threshold=0.70):
    """Run the SSD face detector and return detections as (x, y, w, h) boxes."""
    blob = cv2.dnn.blobFromImage(image, 1.0, (300, 300), [104, 117, 123])
    net.setInput(blob)
    detections = net.forward()

    faces = []
    img_h = image.shape[0]
    img_w = image.shape[1]

    for detection in detections[0][0]:
        if detection[2] >= detection_threshold:
            # Detections are normalized; scale them to pixel coordinates.
            left = detection[3] * img_w
            top = detection[4] * img_h
            right = detection[5] * img_w
            bottom = detection[6] * img_h

            face_w = right - left
            face_h = bottom - top

            face_roi = (left, top, face_w, face_h)
            faces.append(face_roi)

    return np.array(faces).astype(int)


def get_primary_face(faces, frame_h, frame_w):
    """Return the tallest face box that lies fully inside the frame, or None."""
    primary_face_index = None
    face_height_max = 0

    for idx in range(len(faces)):
        face = faces[idx]
        x1 = face[0]
        y1 = face[1]
        x2 = x1 + face[2]
        y2 = y1 + face[3]

        # Discard boxes that fall outside the frame.
        if x1 > frame_w or y1 > frame_h or x2 > frame_w or y2 > frame_h:
            continue
        if x1 < 0 or y1 < 0 or x2 < 0 or y2 < 0:
            continue

        # Prioritize the face with the maximum height.
        if face[3] > face_height_max:
            primary_face_index = idx
            face_height_max = face[3]

    if primary_face_index is not None:
        primary_face = faces[primary_face_index]
    else:
        primary_face = None

    return primary_face


def visualize_eyes(landmarks, frame):
    """Draw the eye landmarks (indices 36-47 of the 68-point model)."""
    for i in range(36, 48):
        cv2.circle(frame, tuple(landmarks[i].astype("int")), 2, (0, 255, 0), -1)


def get_eye_aspect_ratio(landmarks):
    """Compute the eye aspect ratio (EAR), averaged over both eyes."""
    vert_dist_1right = calculate_distance(landmarks[37], landmarks[41])
    vert_dist_2right = calculate_distance(landmarks[38], landmarks[40])
    vert_dist_1left = calculate_distance(landmarks[43], landmarks[47])
    vert_dist_2left = calculate_distance(landmarks[44], landmarks[46])

    horz_dist_right = calculate_distance(landmarks[36], landmarks[39])
    horz_dist_left = calculate_distance(landmarks[42], landmarks[45])

    EAR_left = (vert_dist_1left + vert_dist_2left) / (2.0 * horz_dist_left)
    EAR_right = (vert_dist_1right + vert_dist_2right) / (2.0 * horz_dist_right)

    ear = (EAR_left + EAR_right) / 2
    return ear


def calculate_distance(A, B):
    """Euclidean distance between two 2D points."""
    distance = ((A[0] - B[0]) ** 2 + (A[1] - B[1]) ** 2) ** 0.5
    return distance


# def play(file):
#     if mixer_init:
#         mixer.init()
#         sound = mixer.Sound(file)
#         sound.play()
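
# ------------------------------------------------------------------------------
# Illustrative sketch (not part of the original pipeline): run the face detector
# and landmark model on a single still image and return its EAR. The helper name
# and the image path argument are assumptions; nothing here is called automatically.
# ------------------------------------------------------------------------------
def ear_for_image(image_path):
    image = cv2.imread(image_path)
    if image is None:
        return None
    faces = detect_faces(image, detection_threshold=0.90)
    if len(faces) == 0:
        return None
    primary_face = get_primary_face(faces, image.shape[0], image.shape[1])
    if primary_face is None:
        return None
    retval, landmarksList = landmarkDetector.fit(image, np.expand_dims(primary_face, 0))
    if not retval:
        return None
    return get_eye_aspect_ratio(landmarksList[0][0])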
# ------------------------------------------------------------------------------
# 3. Processing function (to be used in Gradio).
# ------------------------------------------------------------------------------


def process_video(input_video):
    # Filenames for the outputs.
    out_video_filename = "processed_video.mp4"
    out_plot_filename = "ear_plot.png"

    cap = cv2.VideoCapture(input_video)

    # Read the first frame only to determine the frame size.
    ret, frame = cap.read()
    if not ret:
        print("Cannot read the input video.")
        return

    frame_h = frame.shape[0]
    frame_w = frame.shape[1]

    # Initialize writer for the processed video.
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    fps = cap.get(cv2.CAP_PROP_FPS) if cap.get(cv2.CAP_PROP_FPS) > 0 else 30
    out_writer = cv2.VideoWriter(out_video_filename, fourcc, fps, (frame_w, frame_h))

    # Calibration state.
    frame_count = 0
    frame_calib = 30  # Number of frames to use for threshold calibration.
    sum_ear = 0
    BLINK = 0
    state_prev = state_curr = "open"
    ear_values = []

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Detect faces.
        faces = detect_faces(frame, detection_threshold=0.90)

        if len(faces) > 0:
            # Use the primary (tallest, in-frame) face.
            primary_face = get_primary_face(faces, frame_h, frame_w)

            if primary_face is not None:
                cv2.rectangle(
                    frame,
                    (primary_face[0], primary_face[1]),
                    (primary_face[0] + primary_face[2], primary_face[1] + primary_face[3]),
                    (0, 255, 0),
                    3,
                )

                # Detect landmarks.
                retval, landmarksList = landmarkDetector.fit(frame, np.expand_dims(primary_face, 0))

                if retval:
                    landmarks = landmarksList[0][0]

                    # Display detections.
                    visualize_eyes(landmarks, frame)

                    # Get EAR.
                    ear = get_eye_aspect_ratio(landmarks)
                    ear_values.append(ear)

                    if frame_count < frame_calib:
                        # Accumulate EAR values during calibration.
                        frame_count += 1
                        sum_ear += ear
                    elif frame_count == frame_calib:
                        # Derive the blink thresholds from the calibration average.
                        frame_count += 1
                        avg_ear = sum_ear / frame_calib
                        HIGHER_TH = 0.90 * avg_ear
                        LOWER_TH = 0.80 * HIGHER_TH
                        print("SET EAR HIGH: ", HIGHER_TH)
                        print("SET EAR LOW: ", LOWER_TH)
                    else:
                        if ear < LOWER_TH:
                            state_curr = "closed"
                        elif ear > HIGHER_TH:
                            state_curr = "open"

                        # Count a blink on the closed -> open transition.
                        if state_prev == "closed" and state_curr == "open":
                            BLINK += 1
                            # if mixer_init:
                            #     play("./click.wav")

                        state_prev = state_curr

                    cv2.putText(
                        frame,
                        f"Blink Counter: {BLINK}",
                        (10, 80),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        1.5,
                        (0, 0, 255),
                        4,
                        cv2.LINE_AA,
                    )
            else:
                # No valid face detected.
                pass
        else:
            # No faces.
            pass

        frame_out_final = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        out_writer.write(frame)
        yield frame_out_final, None, None

    cap.release()
    out_writer.release()

    # Plot the EAR values if any were collected.
    if ear_values:
        plt.figure(figsize=(10, 5.625))
        plt.plot(ear_values, label="EAR")
        plt.title("Eye Aspect Ratio (EAR) over time")
        plt.xlabel("Frame Index")
        plt.ylabel("EAR")
        plt.legend()
        plt.grid(True)
        plt.savefig(out_plot_filename)
        plt.close()
    else:
        out_plot_filename = None

    # Final yield: the finished video file and the EAR plot.
    yield None, out_video_filename, out_plot_filename
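
# ------------------------------------------------------------------------------
# Illustrative sketch (assumption, not in the original script): process_video is
# a generator, so it can also be driven without Gradio. "./input-video.mp4" is
# the example clip referenced by the UI below; adjust the path as needed.
# ------------------------------------------------------------------------------
def run_offline(video_path="./input-video.mp4"):
    processed_path, plot_path = None, None
    # Drain the generator; only the final yield carries the output file paths.
    for _frame_rgb, video_out, plot_out in process_video(video_path):
        if video_out is not None:
            processed_path, plot_path = video_out, plot_out
    print("Processed video:", processed_path)
    print("EAR plot:", plot_path)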
# ------------------------------------------------------------------------------
# 4. Gradio UI.
# ------------------------------------------------------------------------------


def process_gradio(video_file):
    if video_file is None:
        # Nothing to process; clear all outputs.
        yield None, None, None
        return

    video_path = video_file
    output_frames = None
    processed_video = None
    plot_img = None

    # Process the video using the generator.
    for frame_out, processed_video_path, plot_path in process_video(video_path):
        if frame_out is not None:
            output_frames = frame_out  # Update frames dynamically.
            yield output_frames, None, None  # Gradio updates frames step-by-step.
        else:
            processed_video = processed_video_path
            plot_img = plot_path

    # Final yield with the processed video and EAR plot.
    yield None, processed_video, plot_img


with gr.Blocks() as demo:
    gr.Markdown("# Blink Detection with OpenCV")
    gr.Markdown("Upload a video to detect blinks and view the EAR plot after processing.")

    with gr.Row():
        video_input = gr.Video(label="Input Video")
        output_frames = gr.Image(label="Output Frames")

    process_btn = gr.Button("Process")

    with gr.Row():
        processed_video = gr.Video(label="Processed Video")
        ear_plot = gr.Image(label="EAR Plot")

    process_btn.click(process_gradio, inputs=video_input, outputs=[output_frames, processed_video, ear_plot])

    examples = [
        ["./input-video.mp4"],
    ]

    with gr.Row():
        gr.Examples(
            examples=examples,
            inputs=[video_input],
            label="Load Example Video",
        )

if __name__ == "__main__":
    demo.launch()
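
# Note (assumption, not part of the original script): on older Gradio releases,
# streaming updates from generator callbacks such as process_gradio only appear
# when the request queue is enabled before launching, e.g.:
#
#     demo.queue().launch()
#
# Recent Gradio versions enable the queue by default, so demo.launch() suffices.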