#!/usr/bin/env python3
# Copyright (C) 2024-present Naver Corporation. All rights reserved.
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
#
# --------------------------------------------------------
# Script to pre-process the CO3D dataset.
# Usage:
#   python3 datasets_preprocess/preprocess_co3d.py --co3d_dir /path/to/co3d
# --------------------------------------------------------
import argparse
import gzip
import json
import os
import os.path as osp
import random

import cv2
import matplotlib.pyplot as plt
import numpy as np
import PIL.Image
import torch
from tqdm.auto import tqdm

import path_to_root  # noqa (puts the repo root on sys.path so the dust3r import below resolves)
import dust3r.datasets.utils.cropping as cropping  # noqa

CATEGORIES = [
    "apple", "backpack", "ball", "banana", "baseballbat", "baseballglove",
    "bench", "bicycle", "book", "bottle", "bowl", "broccoli", "cake", "car",
    "carrot", "cellphone", "chair", "couch", "cup", "donut", "frisbee",
    "hairdryer", "handbag", "hotdog", "hydrant", "keyboard", "kite", "laptop",
    "microwave", "motorcycle", "mouse", "orange", "parkingmeter", "pizza",
    "plant", "remote", "sandwich", "skateboard", "stopsign", "suitcase",
    "teddybear", "toaster", "toilet", "toybus", "toyplane", "toytrain",
    "toytruck", "tv", "umbrella", "vase", "wineglass",
]
CATEGORIES_IDX = {cat: i for i, cat in enumerate(CATEGORIES)}  # used to derive a per-category random seed

# all categories except the three that lack a single-sequence (manyview) subset
SINGLE_SEQUENCE_CATEGORIES = sorted(
    set(CATEGORIES) - set(["microwave", "stopsign", "tv"])
)


def get_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument("--category", type=str, default=None)
    parser.add_argument(
        "--single_sequence_subset",
        default=False,
        action="store_true",
        help="prepare the single_sequence_subset instead.",
    )
    parser.add_argument("--output_dir", type=str, default="data/co3d_processed")
    parser.add_argument("--co3d_dir", type=str, required=True)
    parser.add_argument("--num_sequences_per_object", type=int, default=50)
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument(
        "--min_quality",
        type=float,
        default=0.5,
        help="Minimum viewpoint quality score.",
    )
    parser.add_argument(
        "--img_size",
        type=int,
        default=512,
        help=(
            "the lower dimension will be >= img_size * 3/4, and the max dimension will be >= img_size"
        ),
    )
    return parser


def convert_ndc_to_pinhole(focal_length, principal_point, image_size):
    # convert PyTorch3D NDC camera parameters to a pinhole intrinsics matrix K
    focal_length = np.array(focal_length)
    principal_point = np.array(principal_point)
    image_size_wh = np.array([image_size[1], image_size[0]])
    half_image_size = image_size_wh / 2
    rescale = half_image_size.min()
    principal_point_px = half_image_size - principal_point * rescale
    focal_length_px = focal_length * rescale
    fx, fy = focal_length_px[0], focal_length_px[1]
    cx, cy = principal_point_px[0], principal_point_px[1]
    K = np.array([[fx, 0.0, cx], [0.0, fy, cy], [0.0, 0.0, 1.0]], dtype=np.float32)
    return K


def opencv_from_cameras_projection(R, T, focal, p0, image_size):
    # convert a PyTorch3D camera (R, T, NDC focal / principal point) to the
    # OpenCV convention: rotation, translation vector and intrinsics matrix
    R = torch.from_numpy(R)[None, :, :]
    T = torch.from_numpy(T)[None, :]
    focal = torch.from_numpy(focal)[None, :]
    p0 = torch.from_numpy(p0)[None, :]
    image_size = torch.from_numpy(image_size)[None, :]

    R_pytorch3d = R.clone()
    T_pytorch3d = T.clone()
    focal_pytorch3d = focal
    p0_pytorch3d = p0
    T_pytorch3d[:, :2] *= -1
    R_pytorch3d[:, :, :2] *= -1
    tvec = T_pytorch3d
    R = R_pytorch3d.permute(0, 2, 1)

    # Retype the image_size correctly and flip to width, height.
    image_size_wh = image_size.to(R).flip(dims=(1,))

    # NDC to screen conversion.
    scale = image_size_wh.to(R).min(dim=1, keepdim=True)[0] / 2.0
    scale = scale.expand(-1, 2)
    c0 = image_size_wh / 2.0

    principal_point = -p0_pytorch3d * scale + c0
    focal_length = focal_pytorch3d * scale

    camera_matrix = torch.zeros_like(R)
    camera_matrix[:, :2, 2] = principal_point
    camera_matrix[:, 2, 2] = 1.0
    camera_matrix[:, 0, 0] = focal_length[:, 0]
    camera_matrix[:, 1, 1] = focal_length[:, 1]
    return R[0], tvec[0], camera_matrix[0]
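
# Worked mini-example for the two converters above (a sketch, not executed by
# the pipeline): for an image of size (H, W) = (100, 200) with NDC focal length
# (2.0, 2.0) and NDC principal point (0.0, 0.0), the rescale factor is
# min(W, H) / 2 = 50, so
#   convert_ndc_to_pinhole([2.0, 2.0], [0.0, 0.0], [100, 200])
# yields fx = fy = 2.0 * 50 = 100 px and (cx, cy) = (100, 50), i.e. the image
# center; opencv_from_cameras_projection produces the same intrinsics.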
def get_set_list(category_dir, split, is_single_sequence_subset=False):
    listfiles = os.listdir(osp.join(category_dir, "set_lists"))
    if is_single_sequence_subset:
        # not all objects have a manyview_dev subset
        subset_list_files = [f for f in listfiles if "manyview_dev" in f]
    else:
        subset_list_files = [f for f in listfiles if "fewview_train" in f]

    sequences_all = []
    for subset_list_file in subset_list_files:
        with open(osp.join(category_dir, "set_lists", subset_list_file)) as f:
            subset_lists_data = json.load(f)
            sequences_all.extend(subset_lists_data[split])
    return sequences_all
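
# Each set-list entry unpacked in prepare_sequences() below is a
# (sequence_name, frame_number, filepath) triple, with filepath of the form
# "<category>/<sequence_name>/images/frameXXXXXX.jpg" (from which the frame
# index is parsed); the exact names depend on the CO3D download.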
def prepare_sequences(
    category,
    co3d_dir,
    output_dir,
    img_size,
    split,
    min_quality,
    max_num_sequences_per_object,
    seed,
    is_single_sequence_subset=False,
):
    random.seed(seed)
    category_dir = osp.join(co3d_dir, category)
    category_output_dir = osp.join(output_dir, category)
    sequences_all = get_set_list(category_dir, split, is_single_sequence_subset)
    sequences_numbers = sorted(set(seq_name for seq_name, _, _ in sequences_all))

    frame_file = osp.join(category_dir, "frame_annotations.jgz")
    sequence_file = osp.join(category_dir, "sequence_annotations.jgz")

    with gzip.open(frame_file, "r") as fin:
        frame_data = json.loads(fin.read())
    with gzip.open(sequence_file, "r") as fin:
        sequence_data = json.loads(fin.read())

    # index the frame annotations by sequence name and frame number
    frame_data_processed = {}
    for f_data in frame_data:
        sequence_name = f_data["sequence_name"]
        frame_data_processed.setdefault(sequence_name, {})[
            f_data["frame_number"]
        ] = f_data

    # keep only sequences whose viewpoint quality exceeds the threshold
    good_quality_sequences = set()
    for seq_data in sequence_data:
        if seq_data["viewpoint_quality_score"] > min_quality:
            good_quality_sequences.add(seq_data["sequence_name"])

    sequences_numbers = [
        seq_name for seq_name in sequences_numbers if seq_name in good_quality_sequences
    ]
    # randomly select at most max_num_sequences_per_object sequences
    if len(sequences_numbers) < max_num_sequences_per_object:
        selected_sequences_numbers = sequences_numbers
    else:
        selected_sequences_numbers = random.sample(
            sequences_numbers, max_num_sequences_per_object
        )

    selected_sequences_numbers_dict = {
        seq_name: [] for seq_name in selected_sequences_numbers
    }
    sequences_all = [
        (seq_name, frame_number, filepath)
        for seq_name, frame_number, filepath in sequences_all
        if seq_name in selected_sequences_numbers_dict
    ]

    for seq_name, frame_number, filepath in tqdm(sequences_all):
        frame_idx = int(filepath.split("/")[-1][5:-4])
        selected_sequences_numbers_dict[seq_name].append(frame_idx)
        mask_path = filepath.replace("images", "masks").replace(".jpg", ".png")
        frame_data = frame_data_processed[seq_name][frame_number]
        focal_length = frame_data["viewpoint"]["focal_length"]
        principal_point = frame_data["viewpoint"]["principal_point"]
        image_size = frame_data["image"]["size"]
        K = convert_ndc_to_pinhole(focal_length, principal_point, image_size)
        R, tvec, camera_intrinsics = opencv_from_cameras_projection(
            np.array(frame_data["viewpoint"]["R"]),
            np.array(frame_data["viewpoint"]["T"]),
            np.array(focal_length),
            np.array(principal_point),
            np.array(image_size),
        )

        depth_path = os.path.join(co3d_dir, frame_data["depth"]["path"])
        assert frame_data["depth"]["scale_adjustment"] == 1.0
        image_path = os.path.join(co3d_dir, filepath)
        mask_path_full = os.path.join(co3d_dir, mask_path)

        input_rgb_image = PIL.Image.open(image_path).convert("RGB")
        input_mask = plt.imread(mask_path_full)

        with PIL.Image.open(depth_path) as depth_pil:
            # the image is stored with 16-bit depth but PIL reads it as I (32 bit).
            # we cast it to uint16, then reinterpret as float16, then cast to float32
            input_depthmap = (
                np.frombuffer(np.array(depth_pil, dtype=np.uint16), dtype=np.float16)
                .astype(np.float32)
                .reshape((depth_pil.size[1], depth_pil.size[0]))
            )
        depth_mask = np.stack((input_depthmap, input_mask), axis=-1)
        H, W = input_depthmap.shape

        camera_intrinsics = camera_intrinsics.numpy()
        cx, cy = camera_intrinsics[:2, 2].round().astype(int)
        min_margin_x = min(cx, W - cx)
        min_margin_y = min(cy, H - cy)

        # the new window will be a rectangle of size (2*min_margin_x, 2*min_margin_y)
        # centered on (cx, cy), so that the principal point stays at the center
        l, t = cx - min_margin_x, cy - min_margin_y
        r, b = cx + min_margin_x, cy + min_margin_y
        crop_bbox = (l, t, r, b)
        (
            input_rgb_image,
            depth_mask,
            input_camera_intrinsics,
        ) = cropping.crop_image_depthmap(
            input_rgb_image, depth_mask, camera_intrinsics, crop_bbox
        )

        # try to set the lower dimension to img_size * 3/4 -> img_size=512 => 384
        scale_final = ((img_size * 3 // 4) / min(H, W)) + 1e-8
        output_resolution = np.floor(np.array([W, H]) * scale_final).astype(int)
        if max(output_resolution) < img_size:
            # fall back to setting the max dimension to img_size
            scale_final = (img_size / max(H, W)) + 1e-8
            output_resolution = np.floor(np.array([W, H]) * scale_final).astype(int)

        (
            input_rgb_image,
            depth_mask,
            input_camera_intrinsics,
        ) = cropping.rescale_image_depthmap(
            input_rgb_image, depth_mask, input_camera_intrinsics, output_resolution
        )
        input_depthmap = depth_mask[:, :, 0]
        input_mask = depth_mask[:, :, 1]

        # build the camera pose and invert it (world-to-camera -> camera-to-world)
        camera_pose = np.eye(4, dtype=np.float32)
        camera_pose[:3, :3] = R
        camera_pose[:3, 3] = tvec
        camera_pose = np.linalg.inv(camera_pose)

        # save the cropped image, depth, mask and metadata
        save_img_path = os.path.join(output_dir, filepath)
        save_depth_path = os.path.join(output_dir, frame_data["depth"]["path"])
        save_mask_path = os.path.join(output_dir, mask_path)
        os.makedirs(os.path.split(save_img_path)[0], exist_ok=True)
        os.makedirs(os.path.split(save_depth_path)[0], exist_ok=True)
        os.makedirs(os.path.split(save_mask_path)[0], exist_ok=True)

        input_rgb_image.save(save_img_path)
        # quantize depth to uint16; the scale factor (maximum_depth) goes in the metadata
        scaled_depth_map = (input_depthmap / np.max(input_depthmap) * 65535).astype(
            np.uint16
        )
        cv2.imwrite(save_depth_path, scaled_depth_map)
        cv2.imwrite(save_mask_path, (input_mask * 255).astype(np.uint8))

        save_meta_path = save_img_path.replace("jpg", "npz")
        np.savez(
            save_meta_path,
            camera_intrinsics=input_camera_intrinsics,
            camera_pose=camera_pose,
            maximum_depth=np.max(input_depthmap),
        )

    return selected_sequences_numbers_dict
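
# A minimal read-back sketch (not part of this script) showing how a consumer
# could undo the uint16 depth quantization performed in prepare_sequences(),
# assuming hypothetical paths img_path / depth_path pointing at one processed frame:
#   meta = np.load(img_path.replace("jpg", "npz"))
#   depth_u16 = cv2.imread(depth_path, cv2.IMREAD_UNCHANGED)
#   depthmap = depth_u16.astype(np.float32) / 65535 * meta["maximum_depth"]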
if __name__ == "__main__":
    parser = get_parser()
    args = parser.parse_args()
    assert args.co3d_dir != args.output_dir

    if args.category is None:
        if args.single_sequence_subset:
            categories = SINGLE_SEQUENCE_CATEGORIES
        else:
            categories = CATEGORIES
    else:
        categories = [args.category]
    os.makedirs(args.output_dir, exist_ok=True)

    for split in ["train", "test"]:
        selected_sequences_path = os.path.join(
            args.output_dir, f"selected_seqs_{split}.json"
        )
        if os.path.isfile(selected_sequences_path):
            continue

        all_selected_sequences = {}
        for category in categories:
            category_output_dir = osp.join(args.output_dir, category)
            os.makedirs(category_output_dir, exist_ok=True)
            category_selected_sequences_path = os.path.join(
                category_output_dir, f"selected_seqs_{split}.json"
            )
            if os.path.isfile(category_selected_sequences_path):
                with open(category_selected_sequences_path, "r") as fid:
                    category_selected_sequences = json.load(fid)
            else:
                print(f"Processing {split} - category = {category}")
                category_selected_sequences = prepare_sequences(
                    category=category,
                    co3d_dir=args.co3d_dir,
                    output_dir=args.output_dir,
                    img_size=args.img_size,
                    split=split,
                    min_quality=args.min_quality,
                    max_num_sequences_per_object=args.num_sequences_per_object,
                    seed=args.seed + CATEGORIES_IDX[category],
                    is_single_sequence_subset=args.single_sequence_subset,
                )
                with open(category_selected_sequences_path, "w") as file:
                    json.dump(category_selected_sequences, file)

            all_selected_sequences[category] = category_selected_sequences

        with open(selected_sequences_path, "w") as file:
            json.dump(all_selected_sequences, file)
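
# Expected output layout (a sketch, per the save paths above; exact file names
# depend on the CO3D download):
#   <output_dir>/selected_seqs_{train,test}.json
#   <output_dir>/<category>/selected_seqs_{train,test}.json
#   <output_dir>/<category>/<sequence>/images/frameXXXXXX.jpg (+ frameXXXXXX.npz metadata)
#   <output_dir>/<category>/<sequence>/masks/frameXXXXXX.png  (uint8 mask)
# plus the depth maps re-encoded as uint16 PNGs under the original CO3D depth paths.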