import spaces
import os
import gradio as gr
import numpy as np
import torch
from PIL import Image
import trimesh
import random
from transformers import AutoModelForImageSegmentation
from torchvision import transforms
from huggingface_hub import hf_hub_download, snapshot_download
import subprocess
import shutil

# install others
subprocess.run("pip install spandrel==0.4.1 --no-deps", shell=True, check=True)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.float16

print("DEVICE: ", DEVICE)

DEFAULT_FACE_NUMBER = 100000
MAX_SEED = np.iinfo(np.int32).max
TRIPOSG_REPO_URL = "https://github.com/VAST-AI-Research/TripoSG.git"
MV_ADAPTER_REPO_URL = "https://github.com/huanngzh/MV-Adapter.git"

RMBG_PRETRAINED_MODEL = "checkpoints/RMBG-1.4"
TRIPOSG_PRETRAINED_MODEL = "checkpoints/TripoSG"

TMP_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "tmp")
os.makedirs(TMP_DIR, exist_ok=True)

TRIPOSG_CODE_DIR = "./triposg"
if not os.path.exists(TRIPOSG_CODE_DIR):
    os.system(f"git clone {TRIPOSG_REPO_URL} {TRIPOSG_CODE_DIR}")

MV_ADAPTER_CODE_DIR = "./mv_adapter"
if not os.path.exists(MV_ADAPTER_CODE_DIR):
    os.system(f"git clone {MV_ADAPTER_REPO_URL} {MV_ADAPTER_CODE_DIR} && cd {MV_ADAPTER_CODE_DIR} && git checkout 7d37a97e9bc223cdb8fd26a76bd8dd46504c7c3d")

import sys
sys.path.append(TRIPOSG_CODE_DIR)
sys.path.append(os.path.join(TRIPOSG_CODE_DIR, "scripts"))
sys.path.append(MV_ADAPTER_CODE_DIR)
sys.path.append(os.path.join(MV_ADAPTER_CODE_DIR, "scripts"))

HEADER = """

# 🔮 Image to 3D with [TripoSG](https://github.com/VAST-AI-Research/TripoSG)

## State-of-the-art Open Source 3D Generation Using Large-Scale Rectified Flow Transformers

<p style="font-size: 1.1em;">By <a href="https://www.tripo3d.ai/" style="color: #1E90FF; text-decoration: none; font-weight: bold;">Tripo</a></p>

## 📋 Quick Start Guide:
1. **Upload an image** (single object works best)
2. Click **Generate Shape** to create the 3D mesh
3. Click **Apply Texture** to add textures
4. Use **Download GLB** to save your 3D model
5. Adjust parameters under **Generation Settings** for fine-tuning

Best results come from clean, well-lit images with clear subject isolation. Try it now!

<p style="font-size: 0.9em; margin-top: 10px;">Texture generation powered by <a href="https://github.com/huanngzh/MV-Adapter" style="color: #1E90FF; text-decoration: none;">MV-Adapter</a> - a versatile multi-view adapter for consistent texture generation. Try the <a href="https://huggingface.co/spaces/VAST-AI/MV-Adapter-I2MV-SDXL" style="color: #1E90FF; text-decoration: none;">MV-Adapter demo</a> for multi-view image generation.</p>

"""

# # triposg
from image_process import prepare_image
from briarmbg import BriaRMBG
snapshot_download("briaai/RMBG-1.4", local_dir=RMBG_PRETRAINED_MODEL)
rmbg_net = BriaRMBG.from_pretrained(RMBG_PRETRAINED_MODEL).to(DEVICE)
rmbg_net.eval()
from triposg.pipelines.pipeline_triposg import TripoSGPipeline
snapshot_download("VAST-AI/TripoSG", local_dir=TRIPOSG_PRETRAINED_MODEL)
triposg_pipe = TripoSGPipeline.from_pretrained(TRIPOSG_PRETRAINED_MODEL).to(DEVICE, DTYPE)

# mv adapter
NUM_VIEWS = 6
from inference_ig2mv_sdxl import prepare_pipeline, preprocess_image, remove_bg
from mvadapter.utils import get_orthogonal_camera, tensor_to_image, make_image_grid
from mvadapter.utils.render import NVDiffRastContextWrapper, load_mesh, render
mv_adapter_pipe = prepare_pipeline(
    base_model="stabilityai/stable-diffusion-xl-base-1.0",
    vae_model="madebyollin/sdxl-vae-fp16-fix",
    unet_model=None,
    lora_model=None,
    adapter_path="huanngzh/mv-adapter",
    scheduler=None,
    num_views=NUM_VIEWS,
    device=DEVICE,
    dtype=torch.float16,
)
birefnet = AutoModelForImageSegmentation.from_pretrained(
        "ZhengPeng7/BiRefNet", trust_remote_code=True
    )
birefnet.to(DEVICE)
transform_image = transforms.Compose(
    [
        transforms.Resize((1024, 1024)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ]
)
remove_bg_fn = lambda x: remove_bg(x, birefnet, transform_image, DEVICE)

if not os.path.exists("checkpoints/RealESRGAN_x2plus.pth"):
    hf_hub_download("dtarnow/UPscaler", filename="RealESRGAN_x2plus.pth", local_dir="checkpoints")
if not os.path.exists("checkpoints/big-lama.pt"):
    subprocess.run("wget -P checkpoints/ https://github.com/Sanster/models/releases/download/add_big_lama/big-lama.pt", shell=True, check=True)

def start_session(req: gr.Request):
    save_dir = os.path.join(TMP_DIR, str(req.session_hash))
    os.makedirs(save_dir, exist_ok=True)
    print("start session, mkdir", save_dir)

def end_session(req: gr.Request):
    save_dir = os.path.join(TMP_DIR, str(req.session_hash))
    shutil.rmtree(save_dir)

def get_random_hex():
    random_bytes = os.urandom(8)
    random_hex = random_bytes.hex()
    return random_hex

def get_random_seed(randomize_seed, seed):
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)
    return seed

@spaces.GPU(duration=180)
def run_full(image: str, req: gr.Request):
    seed = 0
    num_inference_steps = 50
    guidance_scale = 7.5
    simplify = True
    target_face_num = DEFAULT_FACE_NUMBER
    
    image_seg = prepare_image(image, bg_color=np.array([1.0, 1.0, 1.0]), rmbg_net=rmbg_net)

    outputs = triposg_pipe(
        image=image_seg,
        generator=torch.Generator(device=triposg_pipe.device).manual_seed(seed),
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale
    ).samples[0]
    print("mesh extraction done")
    mesh = trimesh.Trimesh(outputs[0].astype(np.float32), np.ascontiguousarray(outputs[1]))

    if simplify:
        print("start simplify")
        from utils import simplify_mesh
        mesh = simplify_mesh(mesh, target_face_num)
    
    save_dir = os.path.join(TMP_DIR, "examples")
    os.makedirs(save_dir, exist_ok=True)
    mesh_path = os.path.join(save_dir, f"triposg_{get_random_hex()}.glb")
    mesh.export(mesh_path)
    print("save to ", mesh_path)

    torch.cuda.empty_cache()

    height, width = 768, 768
    # Prepare cameras
    cameras = get_orthogonal_camera(
        elevation_deg=[0, 0, 0, 0, 89.99, -89.99],
        distance=[1.8] * NUM_VIEWS,
        left=-0.55,
        right=0.55,
        bottom=-0.55,
        top=0.55,
        azimuth_deg=[x - 90 for x in [0, 90, 180, 270, 180, 180]],
        device=DEVICE,
    )
    ctx = NVDiffRastContextWrapper(device=DEVICE, context_type="cuda")

    mesh = load_mesh(mesh_path, rescale=True, device=DEVICE)
    render_out = render(
        ctx,
        mesh,
        cameras,
        height=height,
        width=width,
        render_attr=False,
        normal_background=0.0,
    )
    control_images = (
        torch.cat(
            [
                (render_out.pos + 0.5).clamp(0, 1),
                (render_out.normal / 2 + 0.5).clamp(0, 1),
            ],
            dim=-1,
        )
        .permute(0, 3, 1, 2)
        .to(DEVICE)
    )

    image = Image.open(image)
    image = remove_bg_fn(image)
    image = preprocess_image(image, height, width)

    pipe_kwargs = {}
    if seed != -1 and isinstance(seed, int):
        pipe_kwargs["generator"] = torch.Generator(device=DEVICE).manual_seed(seed)

    images = mv_adapter_pipe(
        "high quality",
        height=height,
        width=width,
        num_inference_steps=15,
        guidance_scale=3.0,
        num_images_per_prompt=NUM_VIEWS,
        control_image=control_images,
        control_conditioning_scale=1.0,
        reference_image=image,
        reference_conditioning_scale=1.0,
        negative_prompt="watermark, ugly, deformed, noisy, blurry, low contrast",
        cross_attention_kwargs={"scale": 1.0},
        **pipe_kwargs,
    ).images

    torch.cuda.empty_cache()

    mv_image_path = os.path.join(save_dir, f"mv_adapter_{get_random_hex()}.png")
    make_image_grid(images, rows=1).save(mv_image_path)

    from texture import TexturePipeline, ModProcessConfig
    texture_pipe = TexturePipeline(
        upscaler_ckpt_path="checkpoints/RealESRGAN_x2plus.pth",
        inpaint_ckpt_path="checkpoints/big-lama.pt",
        device=DEVICE,
    )

    textured_glb_path = texture_pipe(
        mesh_path=mesh_path,
        save_dir=save_dir,
        save_name=f"texture_mesh_{get_random_hex()}.glb",
        uv_unwarp=True,
        uv_size=4096,
        rgb_path=mv_image_path,
        rgb_process_config=ModProcessConfig(view_upscale=True, inpaint_mode="view"),
        camera_azimuth_deg=[x - 90 for x in [0, 90, 180, 270, 180, 180]],
    )

    return image_seg, mesh_path, textured_glb_path
    

@spaces.GPU()
@torch.no_grad()
def run_segmentation(image: str):
    image = prepare_image(image, bg_color=np.array([1.0, 1.0, 1.0]), rmbg_net=rmbg_net)
    return image

@spaces.GPU(duration=90)
@torch.no_grad()
def image_to_3d(
    image: Image.Image,
    seed: int,
    num_inference_steps: int,
    guidance_scale: float,
    simplify: bool,
    target_face_num: int,
    req: gr.Request
):
    outputs = triposg_pipe(
        image=image,
        generator=torch.Generator(device=triposg_pipe.device).manual_seed(seed),
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale
    ).samples[0]
    print("mesh extraction done")
    mesh = trimesh.Trimesh(outputs[0].astype(np.float32), np.ascontiguousarray(outputs[1]))

    if simplify:
        print("start simplify")
        from utils import simplify_mesh
        mesh = simplify_mesh(mesh, target_face_num)
    
    save_dir = os.path.join(TMP_DIR, str(req.session_hash))
    mesh_path = os.path.join(save_dir, f"triposg_{get_random_hex()}.glb")
    mesh.export(mesh_path)
    print("save to ", mesh_path)

    torch.cuda.empty_cache()

    return mesh_path

@spaces.GPU(duration=120)
@torch.no_grad()
def run_texture(image: Image, mesh_path: str, seed: int, req: gr.Request):
    height, width = 768, 768
    # Prepare cameras
    cameras = get_orthogonal_camera(
        elevation_deg=[0, 0, 0, 0, 89.99, -89.99],
        distance=[1.8] * NUM_VIEWS,
        left=-0.55,
        right=0.55,
        bottom=-0.55,
        top=0.55,
        azimuth_deg=[x - 90 for x in [0, 90, 180, 270, 180, 180]],
        device=DEVICE,
    )
    ctx = NVDiffRastContextWrapper(device=DEVICE, context_type="cuda")

    mesh = load_mesh(mesh_path, rescale=True, device=DEVICE)
    render_out = render(
        ctx,
        mesh,
        cameras,
        height=height,
        width=width,
        render_attr=False,
        normal_background=0.0,
    )
    control_images = (
        torch.cat(
            [
                (render_out.pos + 0.5).clamp(0, 1),
                (render_out.normal / 2 + 0.5).clamp(0, 1),
            ],
            dim=-1,
        )
        .permute(0, 3, 1, 2)
        .to(DEVICE)
    )

    image = Image.open(image)
    image = remove_bg_fn(image)
    image = preprocess_image(image, height, width)

    pipe_kwargs = {}
    if seed != -1 and isinstance(seed, int):
        pipe_kwargs["generator"] = torch.Generator(device=DEVICE).manual_seed(seed)

    images = mv_adapter_pipe(
        "high quality",
        height=height,
        width=width,
        num_inference_steps=15,
        guidance_scale=3.0,
        num_images_per_prompt=NUM_VIEWS,
        control_image=control_images,
        control_conditioning_scale=1.0,
        reference_image=image,
        reference_conditioning_scale=1.0,
        negative_prompt="watermark, ugly, deformed, noisy, blurry, low contrast",
        cross_attention_kwargs={"scale": 1.0},
        **pipe_kwargs,
    ).images

    torch.cuda.empty_cache()

    save_dir = os.path.join(TMP_DIR, str(req.session_hash))
    mv_image_path = os.path.join(save_dir, f"mv_adapter_{get_random_hex()}.png")
    make_image_grid(images, rows=1).save(mv_image_path)

    from texture import TexturePipeline, ModProcessConfig
    texture_pipe = TexturePipeline(
        upscaler_ckpt_path="checkpoints/RealESRGAN_x2plus.pth",
        inpaint_ckpt_path="checkpoints/big-lama.pt",
        device=DEVICE,
    )

    textured_glb_path = texture_pipe(
        mesh_path=mesh_path,
        save_dir=save_dir,
        save_name=f"texture_mesh_{get_random_hex()}.glb",
        uv_unwarp=True,
        uv_size=4096,
        rgb_path=mv_image_path,
        rgb_process_config=ModProcessConfig(view_upscale=True, inpaint_mode="view"),
        camera_azimuth_deg=[x - 90 for x in [0, 90, 180, 270, 180, 180]],
    )

    return textured_glb_path


with gr.Blocks(title="TripoSG") as demo:
    gr.Markdown(HEADER)

    with gr.Row():
        with gr.Column():
            with gr.Row():
                image_prompts = gr.Image(label="Input Image", type="filepath")
                seg_image = gr.Image(
                    label="Segmentation Result", type="pil", format="png", interactive=False
                )

            with gr.Accordion("Generation Settings", open=True):
                seed = gr.Slider(
                    label="Seed",
                    minimum=0,
                    maximum=MAX_SEED,
                    step=0,
                    value=0
                )
                randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
                num_inference_steps = gr.Slider(
                    label="Number of inference steps",
                    minimum=8,
                    maximum=50,
                    step=1,
                    value=50,
                )
                guidance_scale = gr.Slider(
                    label="CFG scale",
                    minimum=0.0,
                    maximum=20.0,
                    step=0.1,
                    value=7.0,
                )

                with gr.Row():
                    reduce_face = gr.Checkbox(label="Simplify Mesh", value=True)
                    target_face_num = gr.Slider(maximum=1000000, minimum=10000, value=DEFAULT_FACE_NUMBER, label="Target Face Number")

                gen_button = gr.Button("Generate Shape", variant="primary")
                gen_texture_button = gr.Button("Apply Texture", interactive=False)

        with gr.Column():
            model_output = gr.Model3D(label="Generated GLB", interactive=False)
            textured_model_output = gr.Model3D(label="Textured GLB", interactive=False)

    with gr.Row():
        examples = gr.Examples(
            examples=[
                f"{TRIPOSG_CODE_DIR}/assets/example_data/{image}"
                for image in os.listdir(f"{TRIPOSG_CODE_DIR}/assets/example_data")
            ],
            fn=run_full,
            inputs=[image_prompts],
            outputs=[seg_image, model_output, textured_model_output],
            cache_examples=True,
        )

    gen_button.click(
        run_segmentation,
        inputs=[image_prompts],
        outputs=[seg_image]
    ).then(
        get_random_seed,
        inputs=[randomize_seed, seed],
        outputs=[seed],
    ).then(
        image_to_3d,
        inputs=[
            seg_image,
            seed,
            num_inference_steps,
            guidance_scale,
            reduce_face,
            target_face_num
        ],
        outputs=[model_output]
    ).then(lambda: gr.Button(interactive=True), outputs=[gen_texture_button])

    gen_texture_button.click(
        run_texture,
        inputs=[image_prompts, model_output, seed],
        outputs=[textured_model_output]
    )

    demo.load(start_session)
    demo.unload(end_session)

demo.launch()