# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from itertools import product

import torch
from fvcore.common.benchmark import benchmark
from pytorch3d.ops.interp_face_attrs import (
    interpolate_face_attributes,
    interpolate_face_attributes_python,
)


def _generate_data(N, S, K, F, D, device, requires_grad=False):
    """Create random inputs for one interpolation benchmark case.

    Args:
        N, S, K: sizes of the per-pixel tensors, which have leading shape
            (N, S, S, K).
        F: first dimension of the face attribute tensor and exclusive upper
            bound of the generated face indices.
        D: last dimension of the face attributes and of the output gradient.
        device: device on which every tensor is allocated.
        requires_grad: whether the barycentric coordinates and the face
            attributes should track gradients.

    Returns:
        A 4-tuple ``(pix_to_face, barycentric_coords, face_attrs,
        grad_pix_attrs)`` with shapes (N, S, S, K), (N, S, S, K, 3),
        (F, 3, D) and (N, S, S, K, D) respectively.
    """
    pixel_dims = (N, S, S, K)

    # NOTE: the order of the random draws below is deliberate. Callers seed
    # the RNG beforehand so that every implementation is timed on the same
    # data; reordering the draws would change that data.

    # Indices are drawn from [-10, F), so some of them are negative.
    # Presumably negative entries stand for pixels not covered by any face;
    # confirm against interpolate_face_attributes.
    face_ids = torch.randint(-10, F, pixel_dims, device=device)
    bary = torch.randn(
        pixel_dims + (3,), device=device, requires_grad=requires_grad
    )
    attrs = torch.randn((F, 3, D), device=device, requires_grad=requires_grad)
    # Upstream gradient used as the seed for backward(); never needs grad.
    upstream_grad = torch.randn(pixel_dims + (D,), device=device)
    return face_ids, bary, attrs, upstream_grad


def _bm_forward(N, S, F, K, D, impl):
    """Build the closure that is timed for the forward pass.

    The input data is created here, outside the returned closure, so that
    only the interpolation call itself is measured.

    Note that the parameter order here (N, S, F, K, D) differs from that of
    `_generate_data` (N, S, K, F, D); the benchmark driver in this file
    supplies the values as a keyword dict.

    Args:
        N, S, F, K, D: size parameters forwarded to `_generate_data`.
        impl: "cuda" to time `interpolate_face_attributes` or "python" to
            time `interpolate_face_attributes_python`.

    Returns:
        A zero-argument callable that runs one forward pass, waits for the
        GPU to finish and returns the interpolated output.

    Raises:
        ValueError: if `impl` is neither "cuda" nor "python".
    """
    # Fail fast, before allocating any GPU memory. Previously an unknown
    # impl left `fun` unbound and only failed later inside the timed closure.
    if impl not in ("cuda", "python"):
        raise ValueError(
            "Unknown impl {!r}; expected 'cuda' or 'python'".format(impl)
        )

    # The runtime depends on the values of pix_to_face. So for proper
    # benchmarking we should probably take the average over multiple
    # values of pix_to_face. But this doesn't easily fit into fvcore
    # benchmarking, so instead we just set a manual seed to make sure
    # that different impls use the same data.
    torch.manual_seed(0)
    device = torch.device("cuda")
    data = _generate_data(N, S, K, F, D, device, requires_grad=False)
    # Drop the trailing output gradient; the forward pass does not need it.
    args = data[:3]
    # Make sure data generation has finished before timing starts.
    torch.cuda.synchronize()
    if impl == "cuda":
        fun = interpolate_face_attributes
    else:
        fun = interpolate_face_attributes_python

    def run():
        out = fun(*args)
        # CUDA kernels are launched asynchronously; without waiting here the
        # timer could stop before the work has actually completed.
        torch.cuda.synchronize()
        return out

    return run


def _bm_forward_backward(N, S, F, K, D, impl):
    """Build the closure that is timed for a forward plus backward pass.

    The input data is created here, outside the returned closure, so that
    only the interpolation call and its backward pass are measured.

    Note that the parameter order here (N, S, F, K, D) differs from that of
    `_generate_data` (N, S, K, F, D); the benchmark driver in this file
    supplies the values as a keyword dict.

    Args:
        N, S, F, K, D: size parameters forwarded to `_generate_data`.
        impl: "cuda" to time `interpolate_face_attributes` or "python" to
            time `interpolate_face_attributes_python`.

    Returns:
        A zero-argument callable that runs one forward and one backward pass
        and then waits for the GPU to finish. It returns None.

    Raises:
        ValueError: if `impl` is neither "cuda" nor "python".
    """
    # Fail fast, before allocating any GPU memory. Previously an unknown
    # impl left `fun` unbound and only failed later inside the timed closure.
    if impl not in ("cuda", "python"):
        raise ValueError(
            "Unknown impl {!r}; expected 'cuda' or 'python'".format(impl)
        )

    # Fixed seed so that every implementation is timed on the same data.
    torch.manual_seed(0)
    device = torch.device("cuda")
    data = _generate_data(N, S, K, F, D, device, requires_grad=True)
    # The first three tensors are the function inputs; the last one is the
    # gradient fed into backward().
    args, grad = data[:3], data[3]
    # Make sure data generation has finished before timing starts.
    torch.cuda.synchronize()
    if impl == "cuda":
        fun = interpolate_face_attributes
    else:
        fun = interpolate_face_attributes_python

    def run():
        out = fun(*args)
        # The inputs' .grad buffers are never reset, so gradients accumulate
        # across repeated calls of this closure.
        out.backward(gradient=grad)
        # CUDA kernels are launched asynchronously; without waiting here the
        # timer could stop before the work has actually completed.
        torch.cuda.synchronize()

    return run


def bm_interpolate_face_attribues() -> None:
    """Benchmark both interpolation implementations over a grid of sizes.

    Every combination of the size values below is timed once for the forward
    pass and once for the forward plus backward pass, for each of the
    "python" and "cuda" implementations. Returns immediately, without
    benchmarking anything, when CUDA is not available.
    """
    # For now only benchmark on GPU
    if not torch.cuda.is_available():
        return

    # Keyword names, in the same order as the factors passed to product().
    names = ("N", "S", "K", "F", "D", "impl")
    grid = product(
        [1, 4],  # N
        [128],  # S
        [1, 10, 40],  # K
        [5000],  # F
        [1, 3, 16],  # D
        ["python", "cuda"],  # impl
    )
    kwargs_list = [dict(zip(names, combo)) for combo in grid]

    suites = (
        ("FORWARD", _bm_forward),
        ("FORWARD+BACKWARD", _bm_forward_backward),
    )
    for label, factory in suites:
        benchmark(factory, label, kwargs_list, warmup_iters=3)


# Script entry point: run the benchmark suite when this file is executed
# directly rather than imported.
if __name__ == "__main__":
    bm_interpolate_face_attribues()