import gradio as gr
import numpy as np
from PIL import Image
import cv2
import spaces

from inference.seg import process_image_or_video as process_seg
from inference.pose import process_image_or_video as process_pose
from inference.depth import process_image_or_video as process_depth
from inference.normal import process_image_or_video as process_normal
from config import SAPIENS_LITE_MODELS_PATH

# Single dispatch table shared by the image and video paths, so adding a
# task only requires a new entry here.
TASK_PROCESSORS = {
    "seg": process_seg,
    "pose": process_pose,
    "depth": process_depth,
    "normal": process_normal,
}


def update_model_choices(task):
    """Repopulate the model dropdown whenever the selected task changes."""
    model_choices = list(SAPIENS_LITE_MODELS_PATH[task.lower()].keys())
    return gr.Dropdown(choices=model_choices, value=model_choices[0] if model_choices else None)


@spaces.GPU(duration=75)
def process_image(input_image, task, version):
    # Gradio may hand over a NumPy array; the inference code expects a PIL image.
    if isinstance(input_image, np.ndarray):
        input_image = Image.fromarray(input_image)

    processor = TASK_PROCESSORS.get(task.lower())
    if processor is None:
        print(f"Unsupported task: {task}")
        return None
    return processor(input_image, task=task.lower(), version=version)


@spaces.GPU(duration=75)
def process_video(input_video, task, version):
    cap = cv2.VideoCapture(input_video)
    fps = cap.get(cv2.CAP_PROP_FPS)
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    output_video = cv2.VideoWriter(
        "output_video.mp4", cv2.VideoWriter_fourcc(*"mp4v"), fps, (width, height)
    )

    processor = TASK_PROCESSORS.get(task.lower())
    if processor is None:
        print(f"Unsupported task: {task}")
    else:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            # OpenCV decodes frames as BGR; the inference code works in RGB.
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            processed_frame = processor(frame_rgb, task=task.lower(), version=version)
            if processed_frame is not None:
                processed_frame_bgr = cv2.cvtColor(np.array(processed_frame), cv2.COLOR_RGB2BGR)
                output_video.write(processed_frame_bgr)

    cap.release()
    output_video.release()
    return "output_video.mp4"


with gr.Blocks() as demo:
    gr.Markdown("""
# Sapiens Hugging Face Space 🤗
## Foundation for Human Vision Models
🤗 Sapiens Models | 🌐 GitHub | 📜 arXiv | 🔗 Personal Blog

Sapiens is a family of models for four fundamental human-centric vision tasks: 2D pose estimation, body-part segmentation, depth estimation, and surface normal prediction.
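Upload an image or a video, choose a task and a model version, then press **Run**; the rendered result appears in the panel on the right.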
""") with gr.Tabs(): with gr.TabItem('Image'): with gr.Row(): with gr.Column(): input_image = gr.Image(label="Input Image", type="pil") select_task_image = gr.Radio( ["seg", "pose", "depth", "normal"], label="Task", info="Choose the task to perform", value="seg" ) model_name_image = gr.Dropdown( label="Model Version", choices=list(SAPIENS_LITE_MODELS_PATH["seg"].keys()), value="sapiens_0.3b", ) with gr.Column(): result_image = gr.Image(label="Result") run_button_image = gr.Button("Run") with gr.TabItem('Video'): with gr.Row(): with gr.Column(): input_video = gr.Video(label="Input Video") select_task_video = gr.Radio( ["seg", "pose", "depth", "normal"], label="Task", info="Choose the task to perform", value="seg" ) model_name_video = gr.Dropdown( label="Model Version", choices=list(SAPIENS_LITE_MODELS_PATH["seg"].keys()), value="sapiens_0.3b", ) with gr.Column(): result_video = gr.Video(label="Result") run_button_video = gr.Button("Run") select_task_image.change(fn=update_model_choices, inputs=select_task_image, outputs=model_name_image) select_task_video.change(fn=update_model_choices, inputs=select_task_video, outputs=model_name_video) run_button_image.click( fn=process_image, inputs=[input_image, select_task_image, model_name_image], outputs=[result_image], ) run_button_video.click( fn=process_video, inputs=[input_video, select_task_video, model_name_video], outputs=[result_video], ) if __name__ == "__main__": demo.launch(share=False)