import os
import sys
import time

import cuid
import gradio as gr
import numpy as np
import spaces
from huggingface_hub import snapshot_download

# Add necessary paths so the bundled submodules can be imported
ProjectDir = os.path.abspath(os.path.dirname(__file__))
sys.path.insert(0, ProjectDir)
sys.path.insert(0, os.path.join(ProjectDir, "MMCM"))
sys.path.insert(0, os.path.join(ProjectDir, "diffusers/src"))
sys.path.insert(0, os.path.join(ProjectDir, "controlnet_aux/src"))

CheckpointsDir = os.path.join(ProjectDir, "checkpoints")
ignore_video2video = True
max_image_edge = 960


def download_model():
    """Download the MuseV checkpoints from the Hugging Face Hub if they are not already present."""
    if not os.path.exists(CheckpointsDir):
        print("Checkpoints not found, starting download...")
        tic = time.time()
        snapshot_download(
            repo_id="TMElyralab/MuseV",
            local_dir=CheckpointsDir,
            max_workers=8,
            local_dir_use_symlinks=True,
        )
        toc = time.time()
        print(f"Download took {toc - tic:.1f} seconds")
    else:
        print("Model already downloaded.")


# Download the model first
print("Starting model download...")
download_model()

# Import after the model download to ensure all dependencies are ready
from gradio_text2video import online_t2v_inference


@spaces.GPU(duration=180)
def hf_online_t2v_inference(
    prompt,
    image_np,
    seed,
    fps,
    w,
    h,
    video_len,
    img_edge_ratio,
):
    # image_np is None when no reference image has been uploaded
    if not isinstance(image_np, np.ndarray):
        raise gr.Error("A reference image is required.")
    return online_t2v_inference(
        prompt, image_np, seed, fps, w, h, video_len, img_edge_ratio
    )


# Create the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# MuseV Demo")
    with gr.Tab("Text to Video"):
        with gr.Row():
            with gr.Column():
                prompt = gr.Textbox(label="Prompt")
                image = gr.Image(label="VisionCondImage")
                seed = gr.Number(
                    label="Seed (-1 means a different random seed on every run)",
                    value=-1,
                )
                video_length = gr.Number(
                    label="Video Length (must be smaller than 144)",
                    value=12,
                )
                fps = gr.Number(label="Generate Video FPS", value=6)
                with gr.Row():
                    w = gr.Number(label="Width", value=-1)
                    h = gr.Number(label="Height", value=-1)
                    img_edge_ratio = gr.Number(label="img_edge_ratio", value=1.0)
                btn = gr.Button("Generate")
            video_output = gr.Video()
        btn.click(
            fn=hf_online_t2v_inference,
            inputs=[
                prompt,
                image,
                seed,
                fps,
                w,
                h,
                video_length,
                img_edge_ratio,
            ],
            outputs=video_output,
        )
    with gr.Tab("Video to Video"):
        gr.Markdown(
            "Due to GPU limits, this MuseV demo currently supports only Text2Video. "
            "If you want to try Video2Video, please run it locally."
        )

# Launch the app
demo.queue().launch(server_name="0.0.0.0", server_port=7860)