# Copyright Generate Biomedicines, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Layers for perturbing protein structure with noise.

This module contains PyTorch layers for perturbing protein structure with
noise, which can be useful for data augmentation, benchmarking, or
denoising-based training. The diffusion models defined here reverse a
correlated noise process chosen to match the distance statistics of natural
proteins, whose scaling laws are well understood from biophysics.
"""

from typing import Callable, Dict, List, Optional, Tuple, Union

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import grad
from tqdm.auto import tqdm

from chroma.constants import AA20
from chroma.data.xcs import validate_XC
from chroma.layers import basic, sde
from chroma.layers.structure import backbone, hbonds, mvn, rmsd


## Gaussian noise
class GaussianNoiseSchedule:
    """
    A general noise schedule for the General Gaussian Forward Path, where
    noise is added to the input signal. The noised sample is modeled as a
    Gaussian with mean `alpha_t * x_0` and variance `sigma_t^2`, with
    `x_0 ~ p(x_0)`.

    The time range of the noise schedule is parameterized with a
    user-specified logarithmic signal-to-noise ratio (SNR) range, where
    `snr_t = alpha_t^2 / sigma_t^2` is the SNR at time `t`.

    In addition, the object defines a quantity called the scaled
    signal-to-noise ratio (`ssnr_t`), which is given by
    `ssnr_t = alpha_t^2 / (alpha_t^2 + sigma_t^2)` and is a helpful quantity
    for analyzing the performance of signal processing algorithms under
    different noise conditions.

    This object implements a few standard noise schedules:
        'log_snr': variance-preserving process with a linear log SNR schedule
            (https://arxiv.org/abs/2107.00630)
        'ot_linear': OT schedule (https://arxiv.org/abs/2210.02747)
        've_log_snr': variance-exploding process with a linear log SNR
            schedule (https://arxiv.org/abs/2011.13456 with a log SNR noise
            schedule)

    Users can also implement their own schedules by specifying `alpha_func`,
    `sigma_func`, and `compute_t_range`.
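    Example:
        A minimal usage sketch (values in the comments are illustrative, not
        exact outputs):

            schedule = GaussianNoiseSchedule(log_snr_range=(-7.0, 13.5), kind="log_snr")
            t = torch.linspace(0.0, 1.0, 5)
            alpha_t = schedule.alpha(t)    # decays from ~1 toward ~0
            sigma_t = schedule.sigma(t)    # grows from ~0 toward ~1
            log_snr = schedule.log_SNR(t)  # linear in t for kind="log_snr"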
""" def __init__( self, log_snr_range: Tuple[float, float] = (-7.0, 13.5), kind: str = "log_snr", ) -> None: super().__init__() if kind not in ["log_snr", "ot_linear", "ve_log_snr"]: raise NotImplementedError( f"noise type {kind} is not implemented, only" " log_snr and ot_linear are supported " ) self.kind = kind self.log_snr_range = log_snr_range l_min, l_max = self.log_snr_range # map t \in [0, 1] to match the prescribed log_snr range self.t_max = self.compute_t_range(l_min) self.t_min = self.compute_t_range(l_max) self._eps = 1e-5 def t_map(self, t: Union[float, torch.Tensor]) -> torch.Tensor: """map t in [0, 1] to [t_min, t_max] Args: t (Union[float, torch.Tensor]): time Returns: torch.Tensor: mapped time """ if not isinstance(t, torch.Tensor): t = torch.Tensor([t]).float() t_max = self.t_max.to(t.device) t_min = self.t_min.to(t.device) t_tilde = t_min + (t_max - t_min) * t return t_tilde def derivative(self, t: torch.Tensor, func: Callable) -> torch.Tensor: """compute derivative of a function, it supports bached single variable inputs Args: t (torch.Tensor): time variable at which derivatives are taken func (Callable): function for derivative calculation Returns: torch.Tensor: derivative that is detached from the computational graph """ with torch.enable_grad(): t.requires_grad_(True) derivative = grad(func(t).sum(), t, create_graph=False)[0].detach() t.requires_grad_(False) return derivative def tensor_check(self, t: Union[float, torch.Tensor]) -> torch.Tensor: """convert input to torch.Tensor if it is a float Args: t ( Union[float, torch.Tensor]): input Returns: torch.Tensor: converted torch.Tensor """ if not isinstance(t, torch.Tensor): t = torch.Tensor([t]).float() return t def alpha_func(self, t: Union[float, torch.Tensor]) -> torch.Tensor: """alpha function that scales the mean, usually goes from 1. to 0. Args: t (Union[float, torch.Tensor]): time in [0, 1] Returns: torch.Tensor: alpha value """ t = self.tensor_check(t) if self.kind == "log_snr": l_min, l_max = self.log_snr_range t_min, t_max = self.t_min, self.t_max log_snr = (1 - t) * l_max + t * l_min log_alpha = 0.5 * (log_snr - F.softplus(log_snr)) alpha = log_alpha.exp() return alpha elif self.kind == "ve_log_snr": return 1 - torch.relu(-t) # make this differentiable elif self.kind == "ot_linear": return 1 - t def sigma_func(self, t: Union[float, torch.Tensor]) -> torch.Tensor: """sigma function that scales the standard deviation, usually goes from 0. to 1. 
Args: t (Union[float, torch.Tensor]): time in [0, 1] Returns: torch.Tensor: sigma value """ t = self.tensor_check(t) l_min, l_max = self.log_snr_range if self.kind == "log_snr": alpha = self.alpha(t) return (1 - alpha.pow(2)).sqrt() elif self.kind == "ve_log_snr": # compute sigma value given snr range l_min, l_max = self.log_snr_range t_min, t_max = self.t_min, self.t_max log_snr = (1 - t) * l_max + t * l_min return torch.exp(-log_snr / 2) elif self.kind == "ot_linear": return t def alpha(self, t: Union[float, torch.Tensor]) -> torch.Tensor: """compute alpha value for the mapped time in [t_min, t_max] Args: t (Union[float, torch.Tensor]): time in [0, 1] Returns: torch.Tensor: alpha value """ return self.alpha_func(self.t_map(t)) def sigma(self, t: Union[float, torch.Tensor]) -> torch.Tensor: """compute sigma value for mapped time in [t_min, t_max] Args: t (Union[float, torch.Tensor]): time in [0, 1] Returns: torch.Tensor: sigma value """ return self.sigma_func(self.t_map(t)) def alpha_deriv(self, t: Union[float, torch.Tensor]) -> torch.Tensor: """compute alpha derivative for mapped time in [t_min, t_max] Args: t (Union[float, torch.Tensor]): time in [0, 1] Returns: torch.Tensor: time derivative of alpha_func """ t_tilde = self.t_map(t) alpha_deriv_t = self.derivative(t_tilde, self.alpha_func).detach() return alpha_deriv_t def sigma_deriv(self, t: Union[float, torch.Tensor]) -> torch.Tensor: """compute sigma derivative for the mapped time in [t_min, t_max] Args: t (Union[float, torch.Tensor]): time in [0, 1] Returns: torch.Tensor: sigma derivative """ t_tilde = self.t_map(t) sigma_deriv_t = self.derivative(t_tilde, self.sigma_func).detach() return sigma_deriv_t def beta(self, t: Union[float, torch.Tensor]) -> torch.Tensor: """compute the drift coefficient for the OU process of the form $dx = -\frac{1}{2} \beta(t) x dt + g(t) dw_t$ Args: t (Union[float, torch.Tensor]): t in [0, 1] Returns: torch.Tensor: beta(t) """ # t = self.t_map(t) alpha = self.alpha(t).detach() t_map = self.t_map(t) alpha_deriv_t = self.alpha_deriv(t) beta = -2.0 * alpha_deriv_t / alpha return beta def g(self, t: Union[float, torch.Tensor]) -> torch.Tensor: """compute drift coefficient for the OU process: $dx = -\frac{1}{2} \beta(t) x dt + g(t) dw_t$ Args: t (Union[float, torch.Tensor]): t in [0, 1] Returns: torch.Tensor: g(t) """ if self.kind == "log_snr": t = self.t_map(t) g = self.beta(t).sqrt() else: alpha_deriv = self.alpha_deriv(t) alpha_prime_div_alpha = alpha_deriv / self.alpha(t) sigma_deriv = self.sigma_deriv(t) sigma_prime_div_sigma = sigma_deriv / self.sigma(t) g_sq = ( 2 * (sigma_deriv - alpha_prime_div_alpha * self.sigma(t)) * self.sigma(t) ) g = g_sq.sqrt() return g def SNR(self, t: Union[float, torch.Tensor]) -> torch.Tensor: """Signal-to-Noise(SNR) ratio mapped in the allowed log_SNR range Args: t (Union[float, torch.Tensor]): time in [0, 1] Returns: torch.Tensor: SNR value """ t = self.tensor_check(t) if self.kind == "log_snr": SNR = self.log_SNR(t).exp() else: SNR = self.alpha(t).pow(2) / (self.sigma(t).pow(2)) return SNR def log_SNR(self, t: Union[float, torch.Tensor]) -> torch.Tensor: """log SNR value Args: t (Union[float, torch.Tensor]): time in [0, 1] Returns: torch.Tensor: log SNR value """ t = self.tensor_check(t) if self.kind == "log_snr": l_min, l_max = self.log_snr_range log_snr = (1 - t) * l_max + t * l_min elif self.kind == "ot_linear": log_snr = self.SNR(t).log() return log_snr def compute_t_range(self, log_snr: Union[float, torch.Tensor]) -> torch.Tensor: """Given log(SNR) range : l_max, 
l_min to compute the time range. Hand-derivation is required for specific noise schedules. This function is essentially the inverse of logSNR(t) Args: log_snr (Union[float, torch.Tensor]): logSNR value Returns: torch.Tensor: the inverse logSNR """ log_snr = self.tensor_check(log_snr) l_min, l_max = self.log_snr_range if self.kind == "log_snr": t = (1 / (l_min - l_max)) * (log_snr - l_max) elif self.kind == "ot_linear": t = ((0.5 * log_snr).exp() + 1).reciprocal() elif self.kind == "ve_log_snr": t = (1 / (l_min - l_max)) * (log_snr - l_max) return t def SNR_derivative(self, t: Union[float, torch.Tensor]) -> torch.Tensor: """the derivative of SNR(t) Args: t (Union[float, torch.Tensor]): t in [0, 1] Returns: torch.Tensor: SNR derivative """ t = self.tensor_check(t) if self.kind == "log_snr": snr_deriv = self.SNR(t) * (self.log_snr_range[0] - self.log_snr_range[1]) elif self.kind == "ot_linear": snr_deriv = self.derivative(t, self.SNR) return snr_deriv def SSNR(self, t: Union[float, torch.Tensor]) -> torch.Tensor: """Signal to Signal+Noise Ratio (SSNR) = alpha^2 / (alpha^2 + sigma^2) SSNR monotonically goes from 1 to 0 as t going from 0 to 1. Args: t (Union[float, torch.Tensor]): time in [0, 1] Returns: torch.Tensor: SSNR value """ t = self.tensor_check(t) return self.SNR(t) / (self.SNR(t) + 1) def SSNR_inv(self, ssnr: torch.Tensor) -> torch.Tensor: """the inverse of SSNR Args: ssnr (torch.Tensor): ssnr in [0, 1] Returns: torch.Tensor: time in [0, 1] """ l_min, l_max = self.log_snr_range if self.kind == "log_snr": return ((ssnr / (1 - ssnr)).log() - l_max) / (l_min - l_max) elif self.kind == "ot_linear": # the value of SNNR_inv(t=0.5) need to be determined with L'Hôpital rule # the inver SNNR_function is solved anyltically: # see woflram alpha result: https://tinyurl.com/bdh4es5a singularity_check = (ssnr - 0.5).abs() < self._eps ssnr_mask = singularity_check.float() ssnr = ssnr_mask * (0.5 + self._eps) + (1.0 - ssnr_mask) * ssnr return (ssnr + (-ssnr * (ssnr - 1)).sqrt() - 1) / (2 * ssnr - 1) def SSNR_inv_deriv(self, ssnr: Union[float, torch.Tensor]) -> torch.Tensor: """SSNR_inv derivative. SSNR_inv is a CDF like quantity, so its derivative is a PDF-like quantity Args: ssnr (Union[float, torch.Tensor]): SSNR in [0, 1] Returns: torch.Tensor: derivative of SSNR """ ssnr = self.tensor_check(ssnr) deriv = self.derivative(ssnr, self.SSNR_inv) return deriv def prob_SSNR(self, ssnr: Union[float, torch.Tensor]) -> torch.Tensor: """compute prob (SSNR(t)), the minus sign is accounted for the inversion of integration range Args: ssnr (Union[float, torch.Tensor]): SSNR value Returns: torch.Tensor: Prob(SSNR) """ return -self.SSNR_inv_deriv(ssnr) def linear_logsnr_grid(self, N: int, tspan: Tuple[float, float]) -> torch.Tensor: """Map uniform time grid to respect logSNR schedule Args: N (int): number of steps tspan (Tuple[float, float]): time span (t_start, t_end) Returns: torch.Tensor: time grid as torch.Tensor """ logsnr_noise = GaussianNoiseSchedule( kind="log_snr", log_snr_range=self.log_snr_range ) ts = torch.linspace(tspan[0], tspan[1], N + 1) SSNR_vp = logsnr_noise.SSNR(ts) grid = self.SSNR_inv(SSNR_vp) # map from t_tilde back to t grid = (grid - self.t_min) / (self.t_max - self.t_min) return grid ## 噪声嵌入层 class NoiseTimeEmbedding(nn.Module): """ A class that implements a noise time embedding layer. Args: dim_embedding (int): The dimension of the output embedding vector. noise_schedule (GaussianNoiseSchedule): A GaussianNoiseSchedule object that defines the noise schedule function. 
rff_scale (float, optional): The scaling factor for the random Fourier features. Default is 0.8. feature_type (str, optional): The type of feature to use for the time embedding. Either "t" or "log_snr". Default is "log_snr". Inputs: t (float): time in (1.0, 0.0). log_alpha (torch.Tensor, optional): A tensor of log alpha values with shape `(batch_size,)`. Outputs: time_h (torch.Tensor): A tensor of noise time embeddings with shape `(batch_size, dim_embedding)`. """ def __init__( self, dim_embedding: int, noise_schedule: GaussianNoiseSchedule, rff_scale: float = 0.8, feature_type: str = "log_snr", ) -> None: super(NoiseTimeEmbedding, self).__init__() self.noise_schedule = noise_schedule self.feature_type = feature_type self.fourier_features = basic.FourierFeaturization( d_input=1, d_model=dim_embedding, trainable=False, scale=rff_scale ) def forward( self, t: torch.Tensor, log_alpha: Optional[torch.Tensor] = None ) -> torch.Tensor: if not isinstance(t, torch.Tensor): t = torch.Tensor([t]).float().to(self.fourier_features.B.device) if t.dim() == 0: t = t[None] h = {"t": lambda: t, "log_snr": lambda: self.noise_schedule.log_SNR(t)}[ self.feature_type ]() time_h = self.fourier_features(h[:, None, None]) return time_h ## Diffusion class DiffusionChainCov(nn.Module): def __init__( self, log_snr_range: Tuple[float, float] = (-7.0, 13.5), noise_schedule: str = "log_snr", sigma_translation: float = 1.0, covariance_model: str = "brownian", complex_scaling: bool = False, **kwargs, ) -> None: """Diffusion backbone noise, with chain-structured covariance. This class implements a diffusion backbone noise model. The model uses a chain-structured covariance matrix capturing the spatial correlations between residues along the backbone. The model also supports different noise schedules and integration schemes for the stochastic differential equation (SDE) that defines the diffusion process. This class also implemented various inference algorithm by reversing the forward diffusion with user-specified conditioner program. Args: log_snr_range (tuple, optional): log SNR range. Defaults to (-7.0, 13.5). noise_schedule (str, optional): noise schedule type. Defaults to "log_snr". sigma_translation (float, optional): Scaling factor for the translation component of the covariance matrix. Defaults to 1.0. covariance_model (str, optional): covariance mode,. Defaults to "brownian". complex_scaling (bool, optional): Whether to scale the complex component of the covariance matrix by the translation component. Defaults to False. **kwargs: Additional arguments for the base Gaussian distribution and the SDE integration. 
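        Example:
            A minimal sketch of applying forward noise to a backbone. The
            shapes below are illustrative assumptions: `X` is
            `(batch, num_residues, 4, 3)` and `C` is a chain map of positive
            integers:

                diffusion = DiffusionChainCov(covariance_model="brownian")
                X = torch.randn(1, 128, 4, 3)
                C = torch.ones(1, 128, dtype=torch.long)
                X_t, t = diffusion(X, C)         # noised sample at a random t
                X_half = diffusion(X, C, t=0.5)  # noised sample at t = 0.5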
""" super().__init__() self.noise_schedule = GaussianNoiseSchedule( log_snr_range=log_snr_range, kind=noise_schedule, ) if covariance_model in ["brownian", "globular"]: self.base_gaussian = mvn.BackboneMVNGlobular( sigma_translation=sigma_translation, covariance_model=covariance_model, complex_scaling=complex_scaling, ) elif covariance_model == "residue_gas": self.base_gaussian = mvn.BackboneMVNResidueGas() self.loss_rmsd = rmsd.BackboneRMSD() self._eps = 1e-5 self.sde_funcs = { "langevin": self.langevin, "reverse_sde": self.reverse_sde, "ode": self.ode, } self.integrate_funcs = { "euler_maruyama": sde.sde_integrate, "heun": sde.sde_integrate_heun, } def sample_t( self, C: torch.LongTensor, t: Optional[torch.Tensor] = None, inverse_CDF: Optional[Callable] = None, ) -> torch.Tensor: """Sample a random time index for each batch element Inputs: C (torch.LongTensor): Chain tensor with shape `(batch_size, num_residues)`. t (torch.Tensor, optional): Time index with shape `(batch_size,)`. If not given, a random time index will be sampled. Defaults to None. Outputs: t (float): Time index with shape `(batch_size,)`. """ if t is not None: if not isinstance(t, torch.Tensor): t = torch.Tensor([t]).float() return t num_batch = C.size(0) if self.training: # Sample correlated but marginally uniform t # for variance reduction (Kingma et al 2021) u = torch.rand([]) ix = torch.arange(num_batch) / num_batch t = torch.remainder(u + ix, 1) else: t = torch.rand([num_batch]) if inverse_CDF is not None: t = inverse_CDF(t) t = t.to(C.device) return t def sde_forward(self, X, C, t, Z=None): """Sample an Euler-Maruyama step on forwards SDE. That is to say, Euler-Maruyama integration would correspond to the update. `X_new = X + dt * f + sqrt(dt) * gZ` Args: Returns: f (Tensor): Drift term with shape `()`. gZ (Tensor): Diffusion term with shape `()`. """ # Sample random perturbation if Z is None: Z = torch.randn_like(X) Z = Z.reshape(X.shape[0], -1, 3) R_Z = self.base_gaussian._multiply_R(Z, C).reshape(X.shape) X = backbone.center_X(X, C) beta = self.noise_schedule.beta(t) f = -beta * X / 2.0 gZ = self.noise_schedule.g(t)[:, None, None] * R_Z return f, gZ def _schedule_coefficients( self, t: torch.Tensor, inverse_temperature: float = 1.0, langevin_isothermal: bool = True, ) -> Tuple[ torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, ]: """ A method that computes the schedule coefficients for sampling in the reverse time Args: t (float): time in (1.0, 0.0). inverse_temperature (float, optional): The inverse temperature parameter for he Langevin dynamics. Default is 1.0. langevin_isothermal (bool, optional): A flag that indicates whether to use isothermal or non-isothermal Langevin dynamics. Default is True. Returns: alpha (torch.Tensor): A tensor of alpha values with shape `(batch_size, 1, 1)`. sigma (torch.Tensor): A tensor of sigma values with shape `(batch_size, 1, 1)`. beta (torch.Tensor): A tensor of beta values with shape `(batch_size, 1, 1)`. g (torch.Tensor): A tensor of g values with shape `(batch_size, 1, 1)`. lambda_t (float): A tensor of lambda_t values with shape `(batch_size, 1, 1)`. lambda_langevin (torch.Tensor): A tensor of lambda_langevin values with shape `(batch_size, 1, 1)`. 
""" # Schedule coeffiecients alpha = self.noise_schedule.alpha(t)[:, None, None].to(t.device) sigma = self.noise_schedule.sigma(t)[:, None, None].to(t.device) beta = self.noise_schedule.beta(t)[:, None, None].to(t.device) g = self.noise_schedule.g(t)[:, None, None].to(t.device) # Temperature coefficients lambda_t = ( inverse_temperature * (sigma.pow(2) + alpha.pow(2)) / (inverse_temperature * sigma.pow(2) + alpha.pow(2)) ) lambda_langevin = inverse_temperature if langevin_isothermal else lambda_t return alpha, sigma, beta, g, lambda_t, lambda_langevin @validate_XC() def langevin( self, X: torch.Tensor, X0_func: Callable, C: torch.LongTensor, t: Union[torch.Tensor, float], conditioner: Callable = None, Z: Union[torch.Tensor, None] = None, inverse_temperature: float = 1.0, langevin_factor: float = 0.0, langevin_isothermal: bool = True, align_X0: bool = True, ): """Return the drift and diffusion components of the Langevin dynamics for the reverse process Args: X (torch.Tensor): A tensor of protein backbone structure with shape `(batch_size, num_residues, 4, 3)`. X0_func (Callable): A function a denoising function for protein backbon e geometry. C (torch.LongTensor): A chain map tensor with shape `(batch_size, num_residues)`. t (float): time in (1.0, 0.0). conditioner (Callable, optional): A conditioner the performs constrained transformation (see examples in chroma.layers.structure.conditioners). Z (torch.Tensor, optional): A tensor of random noise with shape `(batch_size, num_residues, 4, 3)`. Default is None. inverse_temperature (float, optional): The inverse temperature parameter for the Langevin dynamics. Default is 1.0. langevin_factor (float, optional): The scaling factor for the Langevin noise. Default is 1.0. langevin_isothermal (bool, optional): A flag that indicates whether to use isothermal or non-isothermal Langevin dynamics. Default is True. align_X0 (bool, optional): A flag that indicates whether to align the noised X and denoised X for score function calculation. Returns: f (torch.Tensor): A tensor of drift terms with shape `(batch_size, num_residues, 4, 3)`. gZ (torch.Tensor): A tensor of diffusion terms with shape `(batch_size, num_residues, 4, 3)`. """ alpha, sigma, beta, g, lambda_t, lambda_langevin = self._schedule_coefficients( t, inverse_temperature=inverse_temperature, langevin_isothermal=langevin_isothermal, ) Z = torch.randn_like(X) if Z is None else Z score = self.score(X, X0_func, C, t, conditioner, align_X0=align_X0) score_transformed = self.base_gaussian.multiply_covariance(score, C) f = -g.pow(2) * lambda_langevin * langevin_factor / 2.0 * score_transformed gZ = g * np.sqrt(langevin_factor) * self.base_gaussian._multiply_R(Z, C) return f, gZ @validate_XC() def reverse_sde( self, X: torch.Tensor, X0_func: Callable, C: torch.LongTensor, t: Union[torch.Tensor, float], conditioner: Callable = None, Z: Union[torch.Tensor, None] = None, inverse_temperature: float = 1.0, langevin_factor: float = 0.0, langevin_isothermal: bool = True, align_X0: bool = True, ): """Return the drift and diffusion components of the reverse SDE. Args: X (torch.Tensor): A tensor of protein backbone structure with shape `(batch_size, num_residues, 4, 3)`. X0_func (Callable): A function a denoising function for the protein backbone geometry. C (torch.LongTensor): A tensor of condition features with shape `(batch_size, num_residues)`. t (float): time in (1.0, 0.0). 
conditioner (Callable, optional): A conditioner the performs constrained transformation (see examples in chroma.layers.structure.conditioners). Z (torch.Tensor, optional): A tensor of random noise with shape `(batch_size, num_residues, 4, 3)`. Default is None. inverse_temperature (float, optional): The inverse temperature parameter for the Langevin dynamics. Default is 1.0. langevin_factor (float, optional): The scaling factor for the Langevin noise. Default is 0.0. langevin_isothermal (bool, optional): A flag that indicates whether to use isothermal or non-isothermal Langevin dynamics. Default is True. align_X0 (bool, optional): A flag that indicates whether to align the noised X and denoised X for score function calculation. Returns: f (torch.Tensor): A tensor of drift terms with shape `(batch_size, num_residues, 4, 3)`. gZ (torch.Tensor): A tensor of diffusion terms with shape `(batch_size, num_residues, 4, 3)`. """ # Schedule management alpha, sigma, beta, g, lambda_t, lambda_langevin = self._schedule_coefficients( t, inverse_temperature=inverse_temperature, langevin_isothermal=langevin_isothermal, ) score_scale_t = lambda_t + lambda_langevin * langevin_factor / 2.0 # Impute missing data Z = torch.randn_like(X) if Z is None else Z # X = backbone.center_X(X, C) score = self.score(X, X0_func, C, t, conditioner, align_X0=align_X0) score_transformed = self.base_gaussian.multiply_covariance(score, C) f = ( beta * (-1 / 2) * backbone.center_X(X, C) - g.pow(2) * score_scale_t * score_transformed ) gZ = g * np.sqrt(1.0 + langevin_factor) * self.base_gaussian._multiply_R(Z, C) return f, gZ @validate_XC() def ode( self, X: torch.Tensor, X0_func: Callable, C: torch.LongTensor, t: Union[torch.Tensor, float], conditioner: Callable = None, Z: Union[torch.Tensor, None] = None, inverse_temperature: float = 1.0, langevin_factor: float = 0.0, langevin_isothermal: bool = True, align_X0: bool = True, detach_X0: bool = True, ): """Return the drift and diffusion components of the probability flow ODE. Args: X (torch.Tensor): A tensor of protein backbone structure with shape `(batch_size, num_residues, 4, 3)`. X0_func (Callable): A denoising function that returns a protein backbone geometry `(batch_size, num_residues, 4, 3)`. C (torch.LongTensor): A tensor of condition features with shape `(batch_size, num_residues)`. t (float): time in (1.0, 0.0). conditioner (Callable, optional): A conditioner the performs constrained transformation (see examples in chroma.layers.structure.conditioners). Z (torch.Tensor, optional): A tensor of random noise with shape `(batch_size, num_residues, 4, 3)`. Default is None. inverse_temperature (float, optional): The inverse temperature parameter for the Langevin dynamics. Default is 1.0. langevin_factor (float, optional): The scaling factor for the Langevin noise. Default is 0.0. langevin_isothermal (bool, optional): A flag that indicates whether to use isothermal or non-isothermal Langevin dynamics. Default is True. align_X0 (bool, optional): A flag that indicates whether to align the noised X and denoised X for score function calculation. Returns: f (torch.Tensor): A tensor of drift terms with shape `(batch_size, num_residues, 4, 3)`. gZ (torch.Tensor): A tensor of diffusion terms with shape `(batch_size, num_residues, 4, 3)`. 
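        Note:
            For the probability flow ODE the diffusion term is identically
            zero, so `gZ` is returned as a zero tensor and the drift is
            `f = -1/2 * beta * X - 1/2 * lambda_langevin * g^2 * Sigma @ score`,
            where `Sigma` is the chain-structured covariance.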
""" # Schedule management alpha, sigma, beta, g, lambda_t, lambda_langevin = self._schedule_coefficients( t, inverse_temperature=inverse_temperature, langevin_isothermal=langevin_isothermal, ) # Impute missing data X = backbone.center_X(X, C) score = self.score( X, X0_func, C, t, conditioner, align_X0=align_X0, detach_X0=detach_X0 ) score_transformed = self.base_gaussian.multiply_covariance(score, C) f = (-1 / 2) * beta * X - 0.5 * lambda_langevin * g.pow(2) * score_transformed gZ = torch.zeros_like(f) return f, gZ @validate_XC() def energy( self, X: torch.Tensor, X0_func: Callable, C: torch.Tensor, t: torch.Tensor, detach_X0: bool = True, align_X0: bool = True, ) -> torch.Tensor: """Compute the diffusion energy as a function of denoised X Args: X (torch.Tensor): A tensor of protein backbone coordinates with shape `(batch_size, num_residues, 4, 3)`. X0_func (Callable): A function a denoising function for protein backbone geometry. C (torch.LongTensor): A tensor of condition features with shape `(batch_size, num_residues)`. t (float): time in (1.0, 0.0). detach_X0 (bool, optional): A flag that indicates whether to detach the denoise X for score function evaluation align_X0 (bool, optional): A flag that indicates whether to align the noised X and denoised X for score function calculation. Returns: U_diffusion (torch.Tensor): A tensor of diffusion energy values with shape `(batch_size,)`. """ X = backbone.impute_masked_X(X, C) alpha = self.noise_schedule.alpha(t).to(X.device) sigma = self.noise_schedule.sigma(t).to(X.device) if detach_X0: with torch.no_grad(): X0 = X0_func(X, C, t=t) else: X0 = X0_func(X, C, t=t) if align_X0: X0, _ = self.loss_rmsd.align(X0, X, C, align_unmasked=True) if detach_X0: X0 = X0.detach() Z = self._X_to_Z(X, X0, C, alpha, sigma) U_diffusion = (0.5 * (Z ** 2)).sum([1, 2, 3]) return U_diffusion @validate_XC() def score( self, X: torch.Tensor, X0_func: Callable, C: torch.Tensor, t: Union[torch.Tensor, float], conditioner: Callable = None, detach_X0: bool = True, align_X0: bool = True, U_traj: List = [], ) -> torch.Tensor: """Compute the score function Args: X (torch.Tensor): A tensor of protein back geometry with shape `(batch_size, num_residues, 4, 3)`. X0_func (Callable): A function a denoising function for protein backbone geometry. C (torch.LongTensor): A tensor of chain map with shape `(batch_size, num_residues)`. t (Union[torch.Tensor, float]): time in (1.0, 0.0). conditioner (Callable, optional): A conditioner the performs constrained transformation (see examples in chroma.layers.structure.conditioners). detach_X0 (bool, optional): A flag that indicates whether to detach the denoised X for score function evaluation align_X0 (bool, optional): A flag that indicates whether to align the noised X and denoised X for score function calculation. U_traj (List, optional): Record diffusion energy as a list. Returns: score (torch.Tensor): A tensor of score values with shape `(batch_size, num_residues, 4, 3)`. 
""" X = backbone.impute_masked_X(X, C) with torch.enable_grad(): X = X.detach().clone() X.requires_grad = True # Apply optional conditioner transformations to state and energy Xt, Ct, U_conditioner = X, C, 0.0 St = torch.zeros(Ct.shape, device=Xt.device).long() Ot = F.one_hot(St, len(AA20)).float() if conditioner is not None: Xt, Ct, _, U_conditioner, _ = conditioner(X, C, Ot, U_conditioner, t) U_conditioner = torch.as_tensor(U_conditioner) # Compute system energy U_diffusion = self.energy( Xt, X0_func, Ct, t, detach_X0=detach_X0, align_X0=align_X0 ) U_traj.append(U_diffusion.detach().cpu()) # Compute score function as negative energy gradient U_total = U_diffusion.sum() + U_conditioner.sum() U_total.backward() score = -X.grad score = score.masked_fill((C <= 0)[..., None, None], 0.0) return score def elbo(self, X0_pred, X0, C, t): """ITD ELBO as a weighted average of denoising error, inspired by https://arxiv.org/abs/2302.03792""" if not isinstance(t, torch.Tensor): t = torch.Tensor([t]).float().to(X0.device) # Interpolate missing data with Brownian Bridge posterior X0 = backbone.impute_masked_X(X0, C) X0_pred = backbone.impute_masked_X(X0_pred, C) # Compute whitened residual dX = (X0 - X0_pred).reshape([X0.shape[0], -1, 3]) R_inv_dX = self.base_gaussian._multiply_R_inverse(dX, C) # Average per atom, including over "missing" positions that we filled in weight = 0.5 * self.noise_schedule.SNR_derivative(t)[:, None, None, None] snr = self.noise_schedule.SNR(t)[:, None, None, None] loss_itd = ( weight * (R_inv_dX.pow(2) - 1 / (1 + snr)) - 0.5 * np.log(np.pi * 2.0 * np.e) ).reshape(X0.shape) # Compute average per-atom loss (including over missing regions) mask = (C != 0).float() mask_atoms = mask.reshape(mask.shape + (1, 1)).expand([-1, -1, 4, 1]) # Per-complex elbo_gap = (mask_atoms * loss_itd).sum([1, 2, 3]) logdet = self.base_gaussian.log_determinant(C) elbo_unnormalized = elbo_gap - logdet # Normalize per atom elbo = elbo_unnormalized / (mask_atoms.sum([1, 2, 3]) + self._eps) # Compute batch average weights = mask_atoms.sum([1, 2, 3]) elbo_batch = (weights * elbo).sum() / (weights.sum() + self._eps) return elbo, elbo_batch def pseudoelbo(self, loss_per_residue, C, t): """Compute pseudo-ELBOs as weighted averages of other errors.""" if not isinstance(t, torch.Tensor): t = torch.Tensor([t]).float().to(C.device) # Average per atom, including over x"missing" positions that we filled in weight = 0.5 * self.noise_schedule.SNR_derivative(t)[:, None] loss = weight * loss_per_residue # Compute average loss mask = (C > 0).float() pseudoelbo = (mask * loss).sum(-1) / (mask.sum(-1) + self._eps) pseudoelbo_batch = (mask * loss).sum() / (mask.sum() + self._eps) return pseudoelbo, pseudoelbo_batch def _baoab_sample_step( self, _x, p, C, t, dt, score_func, gamma=2.0, kT=1.0, n_equil=1, ode_boost=True, langevin_isothermal=False, ): gamma = torch.Tensor([gamma]).to(_x.device) ( alpha, sigma, beta, g, lambda_t, lambda_langevin, ) = self._schedule_coefficients( t, inverse_temperature=1 / kT, langevin_isothermal=langevin_isothermal, ) def baoab_step(_x, p, t): Z = torch.randn_like(_x) c1 = torch.exp(-gamma * dt) c3 = torch.sqrt((1 / lambda_t) * (1 - c1 ** 2)) # BAOAB scheme p_half = p + score_func(t, C, _x) * dt / 2 # B _x_half = ( _x + g.pow(2) * self.base_gaussian.multiply_covariance(p_half, C) * dt / 2 ) # A p_half2 = c1 * p_half + c3 * ( 1 / g ) * self.base_gaussian._multiply_R_inverse_transpose( Z, C ) # O _x = ( _x_half + g.pow(2) * self.base_gaussian.multiply_covariance(p_half2, C) * dt / 2 ) # A p = 
p_half2 + score_func(t, C, _x) * dt / 2 # B return _x, p def ode_step(t, _x): score = score_func(t, C, _x) score_transformed = self.base_gaussian.multiply_covariance(score, C) _x = _x + 0.5 * (_x + score_transformed) * g.pow(2) * dt return _x for i in range(n_equil): _x, p = baoab_step(_x, p, t) if ode_boost: _x = ode_step(t, _x) return _x, p @torch.no_grad() def sample_sde( self, X0_func: Callable, C: torch.LongTensor, X_init: Optional[torch.Tensor] = None, conditioner: Optional[Callable] = None, N: int = 100, tspan: Tuple[float, float] = (1.0, 0.001), inverse_temperature: float = 1.0, langevin_factor: float = 0.0, langevin_isothermal: bool = True, sde_func: str = "reverse_sde", integrate_func: str = "euler_maruyama", initialize_noise: bool = True, remap_time: bool = False, remove_drift_translate: bool = False, remove_noise_translate: bool = False, align_X0: bool = True, ) -> Dict[str, torch.Tensor]: """Sample from the SDE using a numerical integration scheme. This function samples from the stochastic differential equation (SDE) defined by the model using a numerical integration scheme such as Euler-Maruyama or huen. The SDE can be either in the forward or reverse direction. The function also supports optional conditioning on external variables and adding Langevin noise to the SDE dynamics. Args: X0_func (Callable): A denoising function that maps `(X, C, t)` to `X0`. C (torch.LongTensor): Conditioner tensor with shape `(num_batch, num_residues)`. X_init (torch.Tensor, optional): Initial state tensor with shape `(num_batch , num_residues, 4 ,3)` or None. If None, a zero tensor will be used as the initial state. conditioner (Callable, optional): A function that transforms X, C, U, t. If None, no conditioning will be applied. N (int): Number of integration steps. tspan (Tuple[float,float]): Time span for integration. inverse_temperature (float): Inverse temperature parameter for SDE. langevin_factor (float): Langevin factor for adding noise to SDE. langevin_isothermal (bool): Whether to use isothermal or adiabatic Langevin dynamics. sde_func (str): Which SDE function to use ('reverse_sde', 'langevin' or 'ode'). integrate_func (str): Which integration function to use ('euler_maruyama' or 'heun'). initialize_noise (bool): Whether to initialize the state with noise. remap_time (bool): Whether to remap the time grid according to the noise schedule. remove_drift_translate (bool): Whether to remove the net translational component from the drift term. remove_noise_translate (bool): Whether to remove the net translational component from the noise term. align_X0 (bool): Whether to Kabsch align X0 with X before computing SDE terms. Returns: outputs (Dict[str, torch.Tensor]): A dictionary of output tensors with the following keys: - 'C': The conditioned tensor with shape `(num_batch,num_residues)`. - 'X_sample': The final sampled state tensor with shape `(num_batch, num_residues ,4 ,3)`. - 'X_trajectory': A list of state tensors along the trajectory with shape `(num_batch,num_residues ,4 ,3)` each. - 'Xhat_trajectory': A list of transformed state tensors along the trajectory with shape `(num_batch,num_residues ,4 ,3)` each. - 'Xunc_trajectory': A list of unconstrained state tensors along the trajectory with shape `(num_batch,num_residues ,4 ,3)` each. 
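        Example:
            A minimal sampling sketch, assuming `diffusion` is a
            `DiffusionChainCov` instance and `denoiser` is any callable
            (a hypothetical stand-in for a trained network) mapping
            `(X, C, t)` to a denoised backbone `X0`:

                C = torch.ones(1, 64, dtype=torch.long)
                out = diffusion.sample_sde(denoiser, C, N=200)
                X_sample = out["X_sample"]  # shape (1, 64, 4, 3)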
""" # Setup SDE integration integrate_func = self.integrate_funcs[integrate_func] sde_func = self.sde_funcs[sde_func] T_grid = ( self.noise_schedule.linear_logsnr_grid(N=N, tspan=tspan).to(C.device) if remap_time else torch.linspace(tspan[0], tspan[1], N + 1).to(C.device) ) # Intercept the X0 function for tracking Xt and Xhat Xhat_trajectory = [] Xt_trajectory = [] U_trajectory = [] def _X0_func(_X, _C, t): _X0 = X0_func(_X, _C, t) Xt_trajectory.append(_X.detach()) Xhat_trajectory.append(_X0.detach()) return _X0 def sdefun(_t, _X): f, gZ = sde_func( _X, _X0_func, C, _t, conditioner=conditioner, inverse_temperature=inverse_temperature, langevin_factor=langevin_factor, langevin_isothermal=langevin_isothermal, align_X0=align_X0, ) # Remove net translational component if remove_drift_translate: f = backbone.center_X(f, C) if remove_noise_translate: gZ = backbone.center_X(gZ, C) return f, gZ # Initialization if initialize_noise and X_init is not None: X_init = self.forward(X_init, C, t=tspan[0]).detach() elif X_init is None: X_init = torch.zeros(list(C.shape) + [4, 3], device=C.device) X_init = self.forward(X_init, C, t=tspan[0]).detach() # Determine output shape via a test forward pass if conditioner: with torch.enable_grad(): X_init_test = X_init.clone() X_init_test.requires_grad = True S_test = torch.zeros(C.shape, device=X_init.device).long() O_test = F.one_hot(S_test, len(AA20)).float() U_test = 0.0 t_test = torch.tensor([0.0], device=X_init.device) _, Ct, _, _, _ = conditioner(X_init_test, C, O_test, U_test, t_test) else: Ct = C # Integrate X_trajectory = integrate_func(sdefun, X_init, tspan, N=N, T_grid=T_grid) # Return constrained coordinates outputs = { "C": Ct, "X_sample": Xt_trajectory[-1], "X_trajectory": [Xt_trajectory[-1]] + Xt_trajectory, "Xhat_trajectory": Xhat_trajectory, "Xunc_trajectory": X_trajectory, } return outputs @torch.no_grad() def estimate_pseudoelbo_X( self, X0_func, X, C, num_samples=50, deterministic_seed=0, return_elbo_t=False, noise=True, ): with torch.random.fork_rng(): torch.random.manual_seed(deterministic_seed) mask = (C > 0).float() mask_atoms = mask.reshape(list(mask.shape) + [1, 1]).expand([-1, -1, 4, 1]) elbo = [] T = np.linspace(1e-4, 1.0, num_samples) for t in tqdm(T.tolist()): X_noise = self.forward(X, C, t=t) if noise else X X_denoise = X0_func(X_noise, C, t) elbo_t = -self.noise_schedule.SNR_derivative(t).to(X.device) * ( ((mask_atoms * (X_denoise - X) / 10.0) ** 2).sum([1, 2, 3]) / mask_atoms.sum([1, 2, 3]) ) elbo.append(elbo_t) elbo = torch.stack(elbo, 0) if not return_elbo_t: elbo = elbo.mean(0) return elbo def _score_direct( self, Xt, X0_func, C, t, align_X0=True, ): X0 = X0_func(Xt, C, t) """Compute the score function directly. 
(Sometimes numerically unstable)""" alpha = self.noise_schedule.alpha(t).to(Xt.device) sigma = self.noise_schedule.sigma(t).to(Xt.device) # Impute sensibly behaved values in masked regions for numerical stability # X0 = backbone.impute_masked_X(X0, C) Xt = backbone.impute_masked_X(Xt, C) if align_X0: X0, _ = self.loss_rmsd.align(X0, Xt, C, align_unmasked=True) # Compute mean X_mu = self._mean(X0, C, alpha) X_mu = backbone.impute_masked_X(X_mu, C) dX = Xt - X_mu Ci_dX = self.base_gaussian.multiply_inverse_covariance(dX, C) score = -Ci_dX / sigma.pow(2)[:, None, None, None] # Mask score = score.masked_fill((C <= 0)[..., None, None], 0.0) return score def estimate_logp( self, X0_func: Callable, X_sample: torch.Tensor, C: torch.LongTensor, N: int, return_trace_t: bool = False, ): """Estimate the model logP for given protein backboones (num_batch, num_residues, 4, 3) by the Continuous Normalizing Flow formalism Reference: https://arxiv.org/abs/1810.01367 https://arxiv.org/abs/1806.07366 Args: X0_func (Callable): A function that returns the initial protein backboone (num) features given a condition. X_sample (torch.Tensor): A tensor of protein backboone (num) features with shape `(batch_size, num_residues, 4, 3)`. C (torch.Tensor): A tensor of condition features with shape `(batch_size, num_residues)`. N (int, optional): number of ode integration steps return_trace_t (bool, optional): A flag that indicates whether to return the log |df / dx| for each time step for the integrated log Jacobian trance. Default is False. Returns: elbo (torch.Tensor): A tensor of logP value if return_elbo_t is False, or `(N)` if return_elbo_t is True. """ def divergence(fn, x, t): """Calculate Divergance with Stochastic Trace Estimator""" vec_eps = torch.randn_like(x) fn_out, eps_J_prod = torch.autograd.functional.vjp( fn, (t, x), vec_eps, create_graph=False ) eps_J_eps = ( (eps_J_prod[1] * vec_eps).reshape(x.shape[0], -1).sum(-1).unsqueeze(-1) ) return fn_out, eps_J_eps def flow_gradient( X, X0_func, C, t, ): """Compute the time gradient from the probability flow ODE.""" _, _, beta, g, _, _ = self._schedule_coefficients(t) score = self._score_direct(X, X0_func, C, t) dXdt = (-1 / 2) * beta * X - 0.5 * g.pow(2) * score return dXdt def odefun(_t, _X): _t = _t.detach() f = flow_gradient(_X, X0_func, C, _t,) return f # foward integration to noise X_sample = backbone.center_X(X_sample, C) X_sample = backbone.impute_masked_X(X_sample, C) C = C.abs() out = self.sample_sde( X0_func=X0_func, C=C, X_init=X_sample, N=N, sde_func="ode", tspan=(0, 1.0), inverse_temperature=1.0, langevin_factor=0.0, initialize_noise=False, align_X0=False, ) X_flow = out["X_trajectory"][1:] # get ode function ddlogp = [] for i, t in enumerate(tqdm(torch.linspace(1e-2, 1.0, len(X_flow)))): with torch.enable_grad(): dlogP = divergence(odefun, X_flow[i], t[None].to(C.device))[1] ddlogp.append(dlogP.item()) logp_x1 = self.base_gaussian.log_prob(X_flow[-1], C).item() if return_trace_t: return np.array(ddlogp) / ((C > 0).float().sum().item() * 4) else: return (logp_x1 + np.array(ddlogp).mean()) / ( (C > 0).float().sum().item() * 4 ) @torch.no_grad() @validate_XC(all_atom=False) def estimate_elbo( self, X0_func: Callable, X: torch.Tensor, C: torch.LongTensor, num_samples: int = 50, deterministic_seed: int = 0, return_elbo_t: bool = False, grad_logprob_Y_func: Optional[Callable] = None, ) -> torch.Tensor: """Estimate the evidence lower bound (ELBO) for given protein backboones (num_batch, num_residues, 4, 3) and condition. 
Args: X0_func (Callable): A function that returns the initial protein backboone (num) features given a condition. X (torch.Tensor): A tensor of protein backboone (num) features with shape `(batch_size, num_residues, 4, 3)`. C (torch.Tensor): A tensor of condition features with shape `(batch_size, num_residues)`. num_samples (int, optional): The number of time steps to sample for estimating the ELBO. Default is 50. deterministic_seed (int, optional): The seed for generating random noise. Default is 0. return_elbo_t (bool, optional): A flag that indicates whether to return the ELBO for each time step or the average ELBO. Default is False. grad_logprob_Y_func (Optional[Callable], optional): A function that returns the gradient of the log probability of the observed protein backboone (num) given a time step and a noisy image. Default is None. Returns: elbo (torch.Tensor): A tensor of ELBO values with shape `(batch_size,)` if return_elbo_t is False, or `(num_samples, batch_size)` if return_elbo_t is True. """ X = backbone.impute_masked_X(X, C) with torch.random.fork_rng(): torch.random.manual_seed(deterministic_seed) mask = (C > 0).float() mask_atoms = mask.reshape(list(mask.shape) + [1, 1]).expand([-1, -1, 4, 1]) elbo = [] T = np.linspace(1e-4, 1.0, num_samples) for t in tqdm(T.tolist()): X_noise = self.forward(X, C, t=t) X_denoise = X0_func(X_noise, C, t) # Adjust X-hat estimate with aux-grad if grad_logprob_Y_func is not None: with torch.random.fork_rng(): grad = grad_logprob_Y_func(t, X_noise) sigma_square = ( self.noise_schedule.sigma(t).square().to(X.device) ) dXhat = sigma_square * self.base_gaussian.multiply_covariance( grad, C ) dXhat = backbone.center_X(dXhat, C) X_denoise = X_denoise + dXhat elbo_t, _ = self.elbo(X_denoise, X, C, t) elbo.append(elbo_t) elbo_t = torch.stack(elbo, 0) if return_elbo_t: return elbo_t else: return elbo_t.mean(0) def conditional_X0( self, X0: torch.Tensor, score: torch.Tensor, C: torch.tensor, t: torch.Tensor ) -> torch.Tensor: """Use Bayes theorem and Tweedie formula to obtain a conditional X0 given prior X0 and a conditional score \nabla_x p( y | x) X0 <- X0 + \frac{sigma_t**2}{alpha_t} \Sigma score Args: X0 (torch.Tensor): backbone coordinates of size (batch, num_residues, 4, 3) score (torch.Tensor): of size (batch, num_residues, 4, 3) C (torch.Tensor): of size (batch, num_residues) t (torch.Tensor): of size (batch,) Returns: X0 (torch.Tensor): updated conditional X0 of size (batch, num_residues, 4, 3) """ alpha, sigma, _, _, _, _ = self._schedule_coefficients(t) X_update = sigma.pow(2).div(alpha)[ ..., None ] * self.base_gaussian.multiply_covariance(score, C) return X0 + X_update def _mean(self, X, C, alpha): """Build the diffusion kernel mean given alpha""" # Compute the MVN mean X_mu = backbone.scale_around_mean(X, C, alpha) return X_mu def _X_to_Z(self, X_sample, X, C, alpha, sigma): """Convert from output space to standardized space""" # Impute missing data with conditional means X = backbone.impute_masked_X(X, C) X_sample = backbone.impute_masked_X(X_sample, C) # sigma = self.noise_schedule.sigma(t).to(X.device) # Step 4. [Inverse] Add mean X_mu = self._mean(X, C, alpha) X_mu = backbone.impute_masked_X(X_mu, C) X_noise = (X_sample - X_mu).reshape(X.shape[0], -1, 3) # Step 3. [Inverse] Scale noise by sigma X_noise = X_noise / sigma[:, None, None] # Step 1 & 2. 
Multiply Z by inverse square root of covariance Z = self.base_gaussian._multiply_R_inverse(X_noise, C) return Z def _Z_to_X(self, Z, X, C, alpha, sigma): """Convert from standardized space to output space""" # Step 1 & 2. Multiply Z by square root of covariance dX = self.base_gaussian._multiply_R(Z, C) # Step 3. Scale noise by alpha dX = sigma[:, None, None, None] * dX.reshape(X.shape) # Step 4. Add mean X_mu = self._mean(X, C, alpha) X_sample = X_mu + dX return X_sample def sample_conditional( self, X: torch.Tensor, C: torch.LongTensor, t: torch.Tensor, s: torch.Tensor ) -> torch.Tensor: """ Samples from the forward process q(x_{t} | x_{s}) for t > s. See appendix A.1 in [https://arxiv.org/pdf/2107.00630.pdf]. `forward` does this for s = 0. Args: X (torch.Tensor): Input coordinates with shape `(batch_size, num_residues, 4, 3)` at time `t0`. C (torch.Tensor): Chain tensor with shape `(batch_size, num_residues)`. t (torch.Tensor): Time index with shape `(batch_size,)`. s (torch.Tensor): Time index with shape `(batch_size,)`. Returns: X_sample (torch.Tensor): Sampled coordinates from the forward diffusion marginals with shape `(batch_size, num_residues, 4, 3)`. """ assert (t > s).all() X = backbone.impute_masked_X(X, C) # Do we need this? X = backbone.center_X(X, C) alpha_ts = self.noise_schedule.alpha(t) / self.noise_schedule.alpha(s) sigma_ts = ( self.noise_schedule.sigma(t).pow(2) - alpha_ts.pow(2) * self.noise_schedule.sigma(s).pow(2) ).sqrt() X_sample = alpha_ts * X + sigma_ts * self.base_gaussian.sample(C) # Do we need this? X_sample = backbone.center_X(X_sample - X, C) + X return X_sample @validate_XC(all_atom=False) def forward( self, X: torch.Tensor, C: torch.LongTensor, t: Optional[torch.Tensor] = None ) -> Tuple[torch.Tensor, torch.Tensor]: """Sample from the forwards diffusion marginals at time t Inputs: X (torch.Tensor): Input coordinates with shape `(batch_size, num_residues, 4, 3)`. C (torch.LongTensor): Chain tensor with shape `(batch_size, num_residues)`. t (torch.Tensor, optional): Time index with shape `(batch_size,)`. If not given, a random time index will be sampled. Defaults to None. Outputs: X_sample (torch.Tensor): Sampled coordinates from the forward diffusion marginals with shape `(batch_size, num_residues, 4, 3)`. t (torch.Tensor, optional): Time index with shape `(batch_size,)`. Only returned if t is not given as input. """ # Draw a sample from the prior X_prior = self.base_gaussian.sample(C) # Sample time if not given t_input = t t = self.sample_t(C, t) alpha = self.noise_schedule.alpha(t)[:, None, None, None].to(X.device) sigma = self.noise_schedule.sigma(t)[:, None, None, None].to(X.device) X_sample = alpha * X + sigma * X_prior X_sample = backbone.center_X(X_sample - X, C) + X if t_input is None: return X_sample, t else: return X_sample ## Loss class ReconstructionLosses(nn.Module): """Compute diffusion reconstruction losses for protein backbones. Args: diffusion (DiffusionChainCov): Diffusion object parameterizing a forwards diffusion over protein backbones. loss_scale (float): Length scale parameter used for setting loss error scaling in units of Angstroms. Default is 10, which corresponds to using units of nanometers. rmsd_method (str): Method used for computing RMSD superpositions. Can be "symeig" (default) or "power" for power iteration. Inputs: X0_pred (torch.Tensor): Denoised coordinates with shape `(num_batch, num_residues, 4, 3)`. X (torch.Tensor): Unperturbed coordinates with shape `(num_batch, num_residues, 4, 3)`. 
C (torch.LongTensor): Chain map with shape `(num_batch, num_residues)`. t (torch.Tensor): Diffusion time with shape `(batch_size,)`. Should be on [0,1]. Outputs: losses (dict): Dictionary of reconstructions computed across different metrics. Metrics prefixed with `batch_` will be batch-averaged scalars while other metrics should be per batch member with shape `(num_batch, ...)`. """ def __init__( self, diffusion: DiffusionChainCov, loss_scale: float = 10.0, rmsd_method: str = "symeig", ): super().__init__() self.noise_perturb = diffusion self.loss_scale = loss_scale self._loss_eps = 1e-5 # Auxiliary losses self.loss_rmsd = rmsd.BackboneRMSD(method=rmsd_method) self.loss_fragment = rmsd.LossFragmentRMSD(method=rmsd_method) self.loss_fragment_pair = rmsd.LossFragmentPairRMSD(method=rmsd_method) self.loss_neighborhood = rmsd.LossNeighborhoodRMSD(method=rmsd_method) self.loss_hbond = hbonds.LossBackboneHBonds() self.loss_distance = backbone.LossBackboneResidueDistance() self.loss_functions = { "elbo": self._loss_elbo, "rmsd": self._loss_rmsd, "pseudoelbo": self._loss_pseudoelbo, "fragment": self._loss_fragment, "pair": self._loss_pair, "neighborhood": self._loss_neighborhood, "distance": self._loss_distance, "hbonds": self._loss_hbonds, } def _batch_average(self, loss, C): weights = (C > 0).float().sum(-1) return (weights * loss).sum() / (weights.sum() + self._loss_eps) def _loss_elbo(self, losses, X0_pred, X, C, t, w=None, X_t_2=None): losses["elbo"], losses["batch_elbo"] = self.noise_perturb.elbo(X0_pred, X, C, t) def _loss_rmsd(self, losses, X0_pred, X, C, t, w=None, X_t_2=None): _, rmsd_denoise = self.loss_rmsd.align(X, X0_pred, C) _, rmsd_noise = self.loss_rmsd.align(X, X_t_2, C) rmsd_ratio_per_item = w * rmsd_denoise / (rmsd_noise + self._loss_eps) global_mse_normalized = ( w * self.loss_scale * rmsd_denoise.square() / (rmsd_noise.square() + self._loss_eps) ) losses["rmsd_ratio"] = self._batch_average(rmsd_ratio_per_item, C) losses["global_mse"] = global_mse_normalized losses["batch_global_mse"] = self._batch_average(global_mse_normalized, C) def _loss_pseudoelbo(self, losses, X0_pred, X, C, t, w=None, X_t_2=None): # Unaligned residual pseudoELBO unaligned_mse = ((X - X0_pred) / self.loss_scale).square().sum(-1).mean(-1) losses["elbo_X"], losses["batch_pseudoelbo_X"] = self.noise_perturb.pseudoelbo( unaligned_mse, C, t ) def _loss_fragment(self, losses, X0_pred, X, C, t, w=None, X_t_2=None): # Aligned Fragment MSE loss mask = (C > 0).float() rmsd_fragment = self.loss_fragment(X0_pred, X, C) rmsd_fragment_noise = self.loss_fragment(X_t_2, X, C) fragment_mse_normalized = ( self.loss_scale * w * ( (mask * rmsd_fragment.square()).sum(1) / ((mask * rmsd_fragment_noise.square()).sum(1) + self._loss_eps) ) ) losses["fragment_mse"] = fragment_mse_normalized losses["batch_fragment_mse"] = self._batch_average(fragment_mse_normalized, C) def _loss_pair(self, losses, X0_pred, X, C, t, w=None, X_t_2=None): # Aligned Pair MSE loss rmsd_pair, mask_ij_pair = self.loss_fragment_pair(X0_pred, X, C) rmsd_pair_noise, mask_ij_pair = self.loss_fragment_pair(X_t_2, X, C) pair_mse_normalized = ( self.loss_scale * w * ( (mask_ij_pair * rmsd_pair.square()).sum([1, 2]) / ( (mask_ij_pair * rmsd_pair_noise.square()).sum([1, 2]) + self._loss_eps ) ) ) losses["pair_mse"] = pair_mse_normalized losses["batch_pair_mse"] = self._batch_average(pair_mse_normalized, C) def _loss_neighborhood(self, losses, X0_pred, X, C, t, w=None, X_t_2=None): # Neighborhood MSE rmsd_neighborhood, mask = self.loss_neighborhood(X0_pred, X, C) 
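        # As with the fragment and pair losses above, the denoised RMSD is
        # normalized by the RMSD of an independently re-noised structure
        # X_t_2 at the same diffusion time t.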
rmsd_neighborhood_noise, mask = self.loss_neighborhood(X_t_2, X, C) neighborhood_mse_normalized = ( self.loss_scale * w * ( (mask * rmsd_neighborhood.square()).sum(1) / ((mask * rmsd_neighborhood_noise.square()).sum(1) + self._loss_eps) ) ) losses["neighborhood_mse"] = neighborhood_mse_normalized losses["batch_neighborhood_mse"] = self._batch_average( neighborhood_mse_normalized, C ) def _loss_distance(self, losses, X0_pred, X, C, t, w=None, X_t_2=None): # Distance MSE mask = (C > 0).float() distance_mse = self.loss_distance(X0_pred, X, C) distance_mse_noise = self.loss_distance(X_t_2, X, C) distance_mse_normalized = self.loss_scale * ( w * (mask * distance_mse).sum(1) / ((mask * distance_mse_noise).sum(1) + self._loss_eps) ) losses["distance_mse"] = distance_mse_normalized losses["batch_distance_mse"] = self._batch_average(distance_mse_normalized, C) def _loss_hbonds(self, losses, X0_pred, X, C, t, w=None, X_t_2=None): # HBond recovery outs = self.loss_hbond(X0_pred, X, C) hb_local, hb_nonlocal, error_co = [w * o for o in outs] losses["batch_hb_local"] = self._batch_average(hb_local, C) losses["hb_local"] = hb_local losses["batch_hb_nonlocal"] = self._batch_average(hb_nonlocal, C) losses["hb_nonlocal"] = hb_nonlocal losses["batch_hb_contact_order"] = self._batch_average(error_co, C) @torch.no_grad() @validate_XC(all_atom=False) def estimate_metrics( self, X0_func: Callable, X: torch.Tensor, C: torch.LongTensor, num_samples: int = 50, deterministic_seed: int = 0, use_noise: bool = True, return_samples: bool = False, tspan: Tuple[float] = (1e-4, 1.0), ): """Estimate time-averaged reconstruction losses of protein backbones. Args: X0_func (Callable): A denoising function that maps `(X, C, t)` to `X0`. X (torch.Tensor): A tensor of protein backboone (num) features with shape `(batch_size, num_residues, 4, 3)`. C (torch.Tensor): A tensor of condition features with shape `(batch_size, num_residues)`. num_samples (int, optional): The number of time steps to sample for estimating the ELBO. Default is 50. use_noise (bool): If True, add noise to each structure before denoising. Default is True. When False this can be used for estimating if if structures are fixed points of the denoiser across time. deterministic_seed (int, optional): The seed for generating random noise. Default is 0. return_samples (bool): If True, include intermediate sampled values for each metric. Default is false. tspan (tuple[float]): Tuple of floats indicating the diffusion times between which to integrate. Returns: metrics (dict): A dictionary of reconstruction metrics averaged over time. metrics_samples (dict, optional): A dictionary of in metrics averaged over time. 
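        Example:
            A rough sketch, assuming `losses` is a `ReconstructionLosses`
            instance and `denoiser` maps `(X, C, t)` to a denoised backbone:

                metrics = losses.estimate_metrics(denoiser, X, C, num_samples=20)
                print(metrics["global_mse"], metrics["distance_mse"])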
""" # X = backbone.impute_masked_X(X, C) with torch.random.fork_rng(): torch.random.manual_seed(deterministic_seed) T = np.linspace(1e-4, 1.0, num_samples) losses = [] for t in tqdm(T.tolist(), desc="Integrating diffusion metrics"): X_noise = self.noise_perturb(X, C, t=t) if use_noise else X X_denoise = X0_func(X_noise, C, t) losses_t = self.forward(X_denoise, X, C, t) # Discard batch estimated objects losses_t = { k: v for k, v in losses_t.items() if not k.startswith("batch_") and k != "rmsd_ratio" } losses.append(losses_t) # Transpose list of dicts to a dict of lists metrics_samples = {k: [d[k] for d in losses] for k in losses[0].keys()} # Average final metrics across time metrics = { k: torch.stack(v, 0).mean(0) for k, v in metrics_samples.items() if isinstance(v[0], torch.Tensor) } if return_samples: return metrics, metrics_samples else: return metrics @validate_XC() def forward( self, X0_pred: torch.Tensor, X: torch.Tensor, C: torch.LongTensor, t: torch.Tensor, ): # Collect all losses and tensors for metric tracking losses = {"t": t, "X": X, "X0_pred": X0_pred} X_t_2 = self.noise_perturb(X, C, t=t) # Per complex weights ssnr = self.noise_perturb.noise_schedule.SSNR(t).to(X.device) prob_ssnr = self.noise_perturb.noise_schedule.prob_SSNR(ssnr) importance_weights = 1 / prob_ssnr for _loss in self.loss_functions.values(): _loss(losses, X0_pred, X, C, t, w=importance_weights, X_t_2=X_t_2) return losses def _debug_viz_gradients( pml_file, X_list, dX_list, C, S, arrow_length=2.0, name="gradient", color="red" ): """ """ lines = [ "from pymol.cgo import *", "from pymol import cmd", f'color_1 = list(pymol.cmd.get_color_tuple("{color}"))', 'color_2 = list(pymol.cmd.get_color_tuple("blue"))', ] with open(pml_file, "w") as f: for model_ix, X in enumerate(X_list): print(model_ix) lines = lines + ["obj_1 = []"] dX = dX_list[model_ix] scale = dX.norm(dim=-1).mean().item() X_i = X X_j = X + arrow_length * dX / scale for a_ix in range(4): for i in range(X.size(1)): x_i = X_i[0, i, a_ix, :].tolist() x_j = X_j[0, i, a_ix, :].tolist() lines = lines + [ f"obj_1 = obj_1 + [CYLINDER] + {x_i} + {x_j} + [0.15]" " + color_1 + color_1" ] lines = lines + [f'cmd.load_cgo(obj_1, "{name}", {model_ix+1})'] f.write("\n" + "\n".join(lines)) lines = [] def _debug_viz_XZC(X, Z, C, rgb=True): from matplotlib import pyplot as plt if len(X.shape) > 3: X = X.reshape(X.shape[0], -1, 3) if len(Z.shape) > 3: Z = Z.reshape(Z.shape[0], -1, 3) if C.shape[1] != X.shape[1]: C_expand = C.unsqueeze(-1).expand(-1, -1, 4) C = C_expand.reshape(C.shape[0], -1) # C_mask = expand_chain_map(torch.abs(C)) # X_expand = torch.einsum('nix,nic->nicx', X, C_mask) # plt.plot(X_expand[0,:,:,0].data.numpy()) N = X.shape[1] Ymax = torch.max(X[0, :, 0]).item() plt.figure(figsize=[12, 4]) plt.subplot(2, 1, 1) plt.bar( np.arange(0, N), (C[0, :].data.numpy() < 0) * Ymax, width=1.0, edgecolor=None, color="lightgrey", ) if rgb: plt.plot(X[0, :, 0].data.numpy(), "r", linewidth=0.5) plt.plot(X[0, :, 1].data.numpy(), "g", linewidth=0.5) plt.plot(X[0, :, 2].data.numpy(), "b", linewidth=0.5) plt.xlim([0, N]) plt.grid() plt.title("X") plt.xticks([]) plt.subplot(2, 1, 2) plt.plot(Z[0, :, 0].data.numpy(), "r", linewidth=0.5) plt.plot(Z[0, :, 1].data.numpy(), "g", linewidth=0.5) plt.plot(Z[0, :, 2].data.numpy(), "b", linewidth=0.5) plt.plot(C[0, :].data.numpy(), "orange") plt.xlim([0, N]) plt.grid() plt.title("RInverse @ [X]") plt.xticks([]) plt.savefig("xzc.pdf") else: plt.plot(X[0, :, 0].data.numpy(), "k", linewidth=0.5) plt.xlim([0, N]) plt.grid() plt.title("X") 
plt.xticks([]) plt.subplot(2, 1, 2) plt.plot(Z[0, :, 0].data.numpy(), "k", linewidth=0.5) plt.plot(C[0, :].data.numpy(), "orange") plt.xlim([0, N]) plt.grid() plt.title("Inverse[X]") plt.xticks([]) plt.savefig("xzc.pdf") exit()
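

# Minimal usage sketch (not part of the library API). The "denoiser" below is
# a hypothetical identity stand-in for a trained network, so the reverse
# diffusion only illustrates how the pieces above fit together rather than
# producing realistic backbones.
def _example_diffusion_roundtrip(num_residues: int = 32) -> Tuple[torch.Tensor, torch.Tensor]:
    """Noise a random backbone, then run a short reverse-SDE integration."""
    diffusion = DiffusionChainCov(covariance_model="brownian")
    C = torch.ones(1, num_residues, dtype=torch.long)
    X0 = torch.randn(1, num_residues, 4, 3)

    # Forward process: perturb the backbone at diffusion time t = 0.7.
    X_t = diffusion(X0, C, t=0.7)

    # Reverse process: integrate the reverse SDE with the identity "denoiser".
    def identity_denoiser(X, C, t):
        return X

    X_sample = diffusion.sample_sde(identity_denoiser, C, N=50)["X_sample"]
    return X_t, X_sample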