from __future__ import annotations

import contextlib
from functools import partial

import attr
import einops
import torch
import torch.nn as nn
from attr import dataclass

from esm.layers.regression_head import RegressionHead
from esm.layers.transformer_stack import TransformerStack
from esm.models.function_decoder import FunctionTokenDecoder
from esm.models.vqvae import (
    StructureTokenDecoder,
    StructureTokenEncoder,
)
from esm.sdk.api import (
    ESM3InferenceClient,
    ESMProtein,
    ESMProteinTensor,
    ForwardAndSampleOutput,
    ForwardConfig,
    ForwardOutput,
    ForwardTrackData,
    GenerationConfig,
    ProteinType,
    ReturnLogitsConfig,
    SamplingConfig,
    SamplingTrackConfig,
)
from esm.tokenization import get_model_tokenizers
from esm.utils import encoding
from esm.utils.constants import esm3 as C
from esm.utils.constants.models import ESM3_OPEN_SMALL
from esm.utils.decoding import decode_protein_tensor
from esm.utils.generation import (
    iterative_sampling_raw,
    iterative_sampling_tokens,
)
from esm.utils.misc import rbf
from esm.utils.sampling import (
    get_default_sampling_config,
    sample_function_logits,
    sample_logits,
    sample_residue_annotation_logits,
)
from esm.utils.structure.affine3d import (
    build_affine3d_from_coordinates,
)


@dataclass
class ESMOutput:
    sequence_logits: torch.Tensor
    structure_logits: torch.Tensor
    secondary_structure_logits: torch.Tensor
    sasa_logits: torch.Tensor
    function_logits: torch.Tensor
    residue_logits: torch.Tensor
    embeddings: torch.Tensor


class EncodeInputs(nn.Module):
    """
    Module for encoding input features in the ESM-3 model.

    Args:
        d_model (int): The dimensionality of the model's hidden states.
    """

    def __init__(self, d_model: int):
        super().__init__()

        # Sequence
        self.sequence_embed = nn.Embedding(64, d_model)
        # Mandatory information
        self.plddt_projection = nn.Linear(16, d_model)
        self.structure_per_res_plddt_projection = nn.Linear(16, d_model)

        # Structure
        self.structure_tokens_embed = nn.Embedding(4096 + 5, d_model)

        # "Structural" features
        self.ss8_embed = nn.Embedding(8 + 3, d_model)
        self.sasa_embed = nn.Embedding(16 + 3, d_model)

        # "Functional" features
        self.function_embed = nn.ModuleList(
            [nn.Embedding(260, d_model // 8, padding_idx=0) for _ in range(8)]
        )

        self.residue_embed = nn.EmbeddingBag(1478, d_model, mode="sum", padding_idx=0)

    def forward(
        self,
        sequence_tokens: torch.Tensor,
        structure_tokens: torch.Tensor,
        average_plddt: torch.Tensor,
        per_res_plddt: torch.Tensor,
        ss8_tokens: torch.Tensor,
        sasa_tokens: torch.Tensor,
        function_tokens: torch.Tensor,
        residue_annotation_tokens: torch.Tensor,
    ) -> torch.Tensor:
        sequence_embed = self.sequence_embed(sequence_tokens)

        rbf_16_fn = partial(rbf, v_min=0.0, v_max=1.0, n_bins=16)
        # the `masked_fill(padding_mask.unsqueeze(2), 0)` for the two below is unnecessary
        # as pad tokens never even interact with the "real" tokens (due to sequence_id)
        plddt_embed = self.plddt_projection(rbf_16_fn(average_plddt))
        structure_per_res_plddt = self.structure_per_res_plddt_projection(
            rbf_16_fn(per_res_plddt)
        )

        # Structure + "structural features" embeds
        structure_embed = self.structure_tokens_embed(structure_tokens)
        ss8_embed = self.ss8_embed(ss8_tokens)
        sasa_embed = self.sasa_embed(sasa_tokens)

        # "Functional" features embeds
        function_embed = torch.cat(
            [
                embed_fn(funcs)
                for embed_fn, funcs in zip(
                    self.function_embed, function_tokens.unbind(-1)
                )
            ],
            -1,
        )

        # Residue embeds
        B, L, N = residue_annotation_tokens.shape
        residue_embed = self.residue_embed(
            einops.rearrange(
                residue_annotation_tokens, "B L N -> (B L) N", B=B, L=L, N=N
            )
        )
        residue_embed = einops.rearrange(residue_embed, "(B L) D -> B L D", B=B, L=L)

        return (
            sequence_embed
            + plddt_embed
            + structure_per_res_plddt
            + structure_embed
            + ss8_embed
            + sasa_embed
            + function_embed
            + residue_embed
        )
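
# Usage sketch for EncodeInputs (comment only, not executed). The shapes follow the
# embedding tables defined above; the d_model value is a placeholder, not a constant
# taken from this file.
#
#     encoder = EncodeInputs(d_model=1536)
#     x = encoder(
#         sequence_tokens,            # (B, L) int64
#         structure_tokens,           # (B, L) int64
#         average_plddt,              # (B, L) float in [0, 1]
#         per_res_plddt,              # (B, L) float in [0, 1]
#         ss8_tokens,                 # (B, L) int64
#         sasa_tokens,                # (B, L) int64
#         function_tokens,            # (B, L, 8) int64
#         residue_annotation_tokens,  # (B, L, N) int64
#     )                               # -> (B, L, d_model)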


class OutputHeads(nn.Module):
    def __init__(self, d_model: int):
        super().__init__()
        self.sequence_head = RegressionHead(d_model, 64)
        self.structure_head = RegressionHead(d_model, 4096)
        self.ss8_head = RegressionHead(d_model, 8 + 3)
        self.sasa_head = RegressionHead(d_model, 16 + 3)
        self.function_head = RegressionHead(d_model, 260 * 8)
        self.residue_head = RegressionHead(d_model, 1478)

    def forward(self, x: torch.Tensor, embed: torch.Tensor) -> ESMOutput:
        sequence_logits = self.sequence_head(x)
        structure_logits = self.structure_head(x)
        secondary_structure_logits = self.ss8_head(x)
        sasa_logits = self.sasa_head(x)
        function_logits = self.function_head(x)
        function_logits = einops.rearrange(
            function_logits,
            "... (k v) -> ... k v",
            k=8,
        )
        residue_logits = self.residue_head(x)

        return ESMOutput(
            sequence_logits=sequence_logits,
            structure_logits=structure_logits,
            secondary_structure_logits=secondary_structure_logits,
            sasa_logits=sasa_logits,
            function_logits=function_logits,
            residue_logits=residue_logits,
            embeddings=embed,
        )


class ESM3(nn.Module, ESM3InferenceClient):
    """
    ESM3 model implementation.

    Args:
        d_model (int): The dimensionality of the input and output feature vectors.
        n_heads (int): The number of attention heads in the transformer layers.
        v_heads (int): The number of attention heads in the variational transformer layers.
        n_layers (int): The number of transformer layers.
    """

    def __init__(
        self,
        d_model: int,
        n_heads: int,
        v_heads: int,
        n_layers: int,
        structure_encoder_name: str,
        structure_decoder_name: str,
        function_decoder_name: str,
    ):
        super().__init__()
        self.encoder = EncodeInputs(d_model)
        self.transformer = TransformerStack(
            d_model,
            n_heads,
            v_heads,
            n_layers,
            mask_and_zero_frameless=True,
        )
        self.output_heads = OutputHeads(d_model)

        self.structure_encoder_name = structure_encoder_name
        self.structure_decoder_name = structure_decoder_name
        self.function_decoder_name = function_decoder_name

        self.structure_encoder: StructureTokenEncoder | None = None  # type: ignore
        self.structure_decoder: StructureTokenDecoder | None = None  # type: ignore
        self.function_decoder: FunctionTokenDecoder | None = None  # type: ignore

        self.tokenizers = get_model_tokenizers(ESM3_OPEN_SMALL)

    @classmethod
    def from_pretrained(
        cls,
        model_name: str = ESM3_OPEN_SMALL,
        device: torch.device | str = "cpu",
    ) -> ESM3:
        from esm.pretrained import load_local_model

        if model_name not in [ESM3_OPEN_SMALL]:
            raise ValueError(f"Model name {model_name} is not a valid ESM3 model name.")
        model: ESM3 = load_local_model(model_name, device=device)  # type: ignore
        return model

    def get_structure_token_encoder(self) -> StructureTokenEncoder:
        if self.structure_encoder is None:
            self.structure_encoder = self.load_model(self.structure_encoder_name)  # type: ignore
        return self.structure_encoder  # type: ignore

    def get_structure_token_decoder(self) -> StructureTokenDecoder:
        if self.structure_decoder is None:
            self.structure_decoder = self.load_model(self.structure_decoder_name)  # type: ignore
        return self.structure_decoder  # type: ignore

    def get_function_token_decoder(self) -> FunctionTokenDecoder:
        if self.function_decoder is None:
            self.function_decoder = self.load_model(self.function_decoder_name)  # type: ignore
        return self.function_decoder  # type: ignore

    def load_model(self, model_name: str):
        # Lazy import from pretrained
        from esm.pretrained import load_local_model

        return load_local_model(model_name, device=next(self.parameters()).device)
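
    # Usage sketch (comment only, not executed): loading the open-weights checkpoint
    # through from_pretrained above. The CUDA device is an assumption; any torch
    # device works.
    #
    #     model = ESM3.from_pretrained(ESM3_OPEN_SMALL, device="cuda").eval()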

    def forward(
        self,
        *,
        sequence_tokens: torch.Tensor | None = None,
        structure_tokens: torch.Tensor | None = None,
        ss8_tokens: torch.Tensor | None = None,
        sasa_tokens: torch.Tensor | None = None,
        function_tokens: torch.Tensor | None = None,
        residue_annotation_tokens: torch.Tensor | None = None,
        average_plddt: torch.Tensor | None = None,
        per_res_plddt: torch.Tensor | None = None,
        structure_coords: torch.Tensor | None = None,
        chain_id: torch.Tensor | None = None,
        sequence_id: torch.Tensor | None = None,
    ) -> ESMOutput:
        """
        Performs forward pass through the ESM3 model. Check utils to see how to tokenize inputs from raw data.

        Args:
            sequence_tokens (torch.Tensor, optional): The amino acid tokens.
            structure_tokens (torch.Tensor, optional): The structure tokens.
            ss8_tokens (torch.Tensor, optional): The secondary structure tokens.
            sasa_tokens (torch.Tensor, optional): The solvent accessible surface area tokens.
            function_tokens (torch.Tensor, optional): The function tokens.
            residue_annotation_tokens (torch.Tensor, optional): The residue annotation tokens.
            average_plddt (torch.Tensor, optional): The average plddt across the entire sequence.
            per_res_plddt (torch.Tensor, optional): The per-residue plddt. Use this to specify
                exact per-residue plddts; otherwise, use average_plddt.
            structure_coords (torch.Tensor, optional): The structure coordinates, in the form of (B, L, 3, 3).
            chain_id (torch.Tensor, optional): The chain ID.
            sequence_id (torch.Tensor, optional): The sequence ID.

        Returns:
            ESMOutput: The output of the ESM3 model.

        Raises:
            ValueError: If all of the inputs are None.
        """
        # Reasonable defaults:
        try:
            L, device = next(
                (x.shape[1], x.device)
                for x in [
                    sequence_tokens,
                    structure_tokens,
                    ss8_tokens,
                    sasa_tokens,
                    structure_coords,
                    function_tokens,
                    residue_annotation_tokens,
                ]
                if x is not None
            )
        except StopIteration:
            raise ValueError("At least one of the inputs must be non-None")

        t = self.tokenizers
        defaults = lambda x, tok: (
            torch.full((1, L), tok, dtype=torch.long, device=device) if x is None else x
        )
        sequence_tokens = defaults(sequence_tokens, t.sequence.mask_token_id)
        ss8_tokens = defaults(ss8_tokens, C.SS8_UNK_TOKEN)
        sasa_tokens = defaults(sasa_tokens, C.SASA_UNK_TOKEN)
        average_plddt = defaults(average_plddt, 1).float()
        per_res_plddt = defaults(per_res_plddt, 0).float()
        chain_id = defaults(chain_id, 0)
        sequence_id = defaults(sequence_id, 0)

        if residue_annotation_tokens is None:
            residue_annotation_tokens = torch.full(
                (1, L, 16), C.RESIDUE_PAD_TOKEN, dtype=torch.long, device=device
            )
        if function_tokens is None:
            function_tokens = torch.full(
                (1, L, 8), C.INTERPRO_PAD_TOKEN, dtype=torch.long, device=device
            )
        if structure_coords is None:
            structure_coords = torch.full(
                (1, L, 3, 3), float("nan"), dtype=torch.float, device=device
            )

        structure_coords = structure_coords[
            ..., :3, :
        ]  # In case we pass in an atom14 or atom37 repr
        affine, affine_mask = build_affine3d_from_coordinates(structure_coords)

        if structure_tokens is None:
            _, structure_tokens = self.get_structure_token_encoder().encode(
                structure_coords
            )
        assert structure_tokens is not None
        structure_tokens = (
            structure_tokens.masked_fill(
                (structure_tokens == -1) | ~affine_mask, C.STRUCTURE_MASK_TOKEN
            )
            .masked_fill(sequence_tokens == C.SEQUENCE_BOS_TOKEN, C.STRUCTURE_BOS_TOKEN)
            .masked_fill(sequence_tokens == C.SEQUENCE_PAD_TOKEN, C.STRUCTURE_PAD_TOKEN)
            .masked_fill(sequence_tokens == C.SEQUENCE_EOS_TOKEN, C.STRUCTURE_EOS_TOKEN)
            .masked_fill(
                sequence_tokens == C.SEQUENCE_CHAINBREAK_TOKEN,
                C.STRUCTURE_CHAINBREAK_TOKEN,
            )
        )

        x = self.encoder(
            sequence_tokens,
            structure_tokens,
            average_plddt,
            per_res_plddt,
            ss8_tokens,
            sasa_tokens,
            function_tokens,
            residue_annotation_tokens,
        )
        x, embedding = self.transformer(x, sequence_id, affine, affine_mask, chain_id)
        return self.output_heads(x, embedding)
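
    # Usage sketch (comment only, not executed): a minimal forward pass. Any single
    # track suffices; the remaining tracks fall back to the defaults built above. The
    # example sequence is arbitrary and the tokenizer call mirrors encode() below.
    #
    #     seq_tokens = encoding.tokenize_sequence(
    #         "MKTAYIAKQR", model.tokenizers.sequence, add_special_tokens=True
    #     ).unsqueeze(0)                # (1, L)
    #     out = model.forward(sequence_tokens=seq_tokens)
    #     out.sequence_logits.shape     # (1, L, 64)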

    # The following methods are for the ESM3InferenceClient interface
    def generate(self, input: ProteinType, config: GenerationConfig) -> ProteinType:
        if isinstance(input, ESMProtein):
            return iterative_sampling_raw(self, input, config)
        elif isinstance(input, ESMProteinTensor):
            return iterative_sampling_tokens(self, input, config, self.tokenizers)
        else:
            raise ValueError("Input must be an ESMProtein or ESMProteinTensor")

    def encode(self, input: ESMProtein) -> ESMProteinTensor:
        input = attr.evolve(input)  # Make a copy

        sequence_tokens = None
        structure_tokens = None
        secondary_structure_tokens = None
        sasa_tokens = None
        function_tokens = None
        residue_annotation_tokens = None
        coordinates = None

        if input.sequence is not None:
            sequence_tokens = encoding.tokenize_sequence(
                input.sequence, self.tokenizers.sequence, add_special_tokens=True
            )
        if input.secondary_structure is not None:
            secondary_structure_tokens = encoding.tokenize_secondary_structure(
                input.secondary_structure,
                self.tokenizers.secondary_structure,
                add_special_tokens=True,
            )
        if input.sasa is not None:
            sasa_tokens = encoding.tokenize_sasa(
                input.sasa, self.tokenizers.sasa, add_special_tokens=True
            )

        # Infer input length
        sequence_length = -1
        if sequence_tokens is not None:
            sequence_length = len(sequence_tokens)
        elif secondary_structure_tokens is not None:
            sequence_length = len(secondary_structure_tokens)
        elif sasa_tokens is not None:
            sequence_length = len(sasa_tokens)

        # Try to infer input length from structure data
        if input.coordinates is not None:
            coordinates, _, structure_tokens = encoding.tokenize_structure(
                input.coordinates,
                self.get_structure_token_encoder(),
                structure_tokenizer=self.tokenizers.structure,
                reference_sequence=input.sequence or "",
                add_special_tokens=True,
            )
            if sequence_length == -1:
                sequence_length = len(structure_tokens)

        if sequence_length == -1:
            raise ValueError(
                "Cannot infer input length from input data. Please provide one of: sequence, structure, secondary_structure, sasa.\n"
                "To condition on sequence length only, use ESM3LocalInferenceClient.get_default_sequence(sequence_length) to generate a default sequence input."
            )

        # Function and Residue annotations
        if input.function_annotations is not None:
            if input.sequence is None:
                reference_sequence = encoding.get_default_sequence(sequence_length - 2)
            else:
                reference_sequence = input.sequence
            (
                function_tokens,
                residue_annotation_tokens,
            ) = encoding.tokenize_function_annotations(
                input.function_annotations,
                reference_sequence=reference_sequence,
                function_tokenizer=self.tokenizers.function,
                residue_annotation_tokenizer=self.tokenizers.residue_annotations,
                add_special_tokens=True,
            )

        return ESMProteinTensor(
            sequence=sequence_tokens,
            structure=structure_tokens,
            secondary_structure=secondary_structure_tokens,
            sasa=sasa_tokens,
            function=function_tokens,
            residue_annotations=residue_annotation_tokens,
            coordinates=coordinates,
        ).to(next(self.parameters()).device)
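
    # Usage sketch (comment only, not executed): the encode -> generate -> decode round
    # trip behind the ESM3InferenceClient interface. The "_" mask convention and the
    # GenerationConfig fields shown here are assumptions about esm.sdk.api, not values
    # defined in this file.
    #
    #     protein = ESMProtein(sequence="__________MKTAYIAKQR__________")
    #     generated = model.generate(protein, GenerationConfig(track="sequence", num_steps=8))
    #     protein_tensor = model.encode(protein)
    #     decoded = model.decode(protein_tensor)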

    def decode(
        self,
        input: ESMProteinTensor,
    ) -> ESMProtein:
        return decode_protein_tensor(
            input=input,
            tokenizers=self.tokenizers,
            structure_token_decoder=self.get_structure_token_decoder(),
            function_token_decoder=self.get_function_token_decoder(),
        )

    def _forward(
        self, input: ESMProteinTensor, config: ForwardConfig = ForwardConfig()
    ) -> ForwardOutput:
        # Default plddt conditioning for inference. 1s where coordinates are provided.
        if input.coordinates is None:
            per_res_plddt = None
        else:
            # 1.0 if all coordinates at specific indices have valid non-nan values.
            per_res_plddt = input.coordinates.isfinite().all(dim=-1).any(dim=-1).float()

        # Disable gradients only outside of training. Note `self.eval` is a bound
        # method (always truthy), so the check must use `self.training`.
        with torch.no_grad() if not self.training else contextlib.nullcontext():
            output = self.forward(
                sequence_tokens=input.sequence,
                structure_tokens=input.structure,
                ss8_tokens=input.secondary_structure,
                sasa_tokens=input.sasa,
                function_tokens=input.function,
                residue_annotation_tokens=input.residue_annotations,
                average_plddt=torch.tensor(1.0, device=input.device),
                per_res_plddt=per_res_plddt,
                structure_coords=input.coordinates,
                chain_id=None,
                sequence_id=None,
            )

        if config.return_logits:
            logits = ForwardTrackData(
                sequence=output.sequence_logits,
                structure=output.structure_logits,
                secondary_structure=output.secondary_structure_logits,
                sasa=output.sasa_logits,
                function=output.function_logits,
            )
        else:
            logits = None

        return ForwardOutput(
            logits=logits,
            residue_annotation_logits=output.residue_logits,
            embeddings=output.embeddings if config.return_embeddings else None,
        )
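
    # Usage sketch (comment only, not executed): _forward is normally reached through
    # forward_and_sample below, but it can be called directly on a tokenized protein
    # to get raw logits and embeddings. The ForwardConfig/ReturnLogitsConfig usage
    # mirrors the call in forward_and_sample.
    #
    #     fwd = model._forward(
    #         protein_tensor,
    #         ForwardConfig(ReturnLogitsConfig(sequence=True), return_embeddings=True),
    #     )
    #     fwd.logits.sequence           # (1, L, 64)
    #     fwd.embeddings                # (1, L, d_model)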

    def forward_and_sample(
        self, input: ESMProteinTensor, sampling_configuration: SamplingConfig
    ) -> ForwardAndSampleOutput:
        protein_tensor = attr.evolve(input)  # Make a copy

        def maybe_clone(x: torch.Tensor | None) -> torch.Tensor | None:
            return x.clone() if x is not None else None

        device = next(self.parameters()).device

        sampling_config = sampling_configuration
        if sampling_config is None:
            sampling_config = get_default_sampling_config(self.tokenizers)

        # Initialize default values for missing tracks
        default_protein_tensor = ESMProteinTensor.empty(
            len(input) - 2, tokenizers=self.tokenizers, device=input.device
        )
        for track in attr.fields(ESMProteinTensor):
            if getattr(protein_tensor, track.name, None) is None:
                setattr(
                    protein_tensor,
                    track.name,
                    getattr(default_protein_tensor, track.name, None),
                )

        # Preprocessing
        sequence_length: int = -1
        for track in [
            "sequence",
            "structure",
            "secondary_structure",
            "sasa",
            "function",
            "residue_annotations",
        ]:
            input_tensor: torch.Tensor | None = getattr(protein_tensor, track, None)
            if input_tensor is not None:
                # Add batch dimension if necessary
                if track in ["sequence", "structure", "secondary_structure", "sasa"]:
                    if len(input_tensor.size()) == 1:
                        input_tensor = input_tensor.unsqueeze(0)  # (L,) -> (1, L)
                elif track in ["function", "residue_annotations"]:
                    if len(input_tensor.size()) == 2:
                        input_tensor = input_tensor.unsqueeze(0)  # (L, O) -> (1, L, O)
                # Check length consistency
                if sequence_length == -1:
                    sequence_length = input_tensor.size(1)
                else:
                    if input_tensor.size(1) != sequence_length:
                        raise ValueError(
                            f"Length mismatch for track {track}. Expected {sequence_length}, got {input_tensor.size(1)}"
                        )
                # Move input tensor to model device
                input_tensor = input_tensor.to(device)
                setattr(protein_tensor, track, input_tensor)

        if protein_tensor.coordinates is not None:
            coordinates = protein_tensor.coordinates
            if len(coordinates.size()) == 3:
                coordinates = coordinates.unsqueeze(0)
            protein_tensor.coordinates = coordinates.to(device)
            sequence_length = coordinates.size(1)

        if sequence_length == -1:
            raise ValueError("No input data provided")

        # Forward pass
        forward_output = self._forward(
            protein_tensor,
            ForwardConfig(
                ReturnLogitsConfig(
                    sequence=True,
                    structure=True,
                    secondary_structure=True,
                    sasa=True,
                    function=True,
                    residue_annotations=True,
                ),
                return_embeddings=True,
            ),
        )

        # Sampling
        tokens_dir = {}
        track_sampling_metadata_dir: dict[str, dict | None] = {}
        for track in ["sequence", "structure", "secondary_structure", "sasa"]:
            config = getattr(sampling_config, track)
            if config is None:
                tokens_dir[track] = maybe_clone(getattr(input, track))
                continue
            sampling_metadata = self._sample_track(
                logits=getattr(forward_output.logits, track)[0, ...],
                tokens=getattr(protein_tensor, track)[0, ...],
                sampling_track_config=config,
                mask_idx=getattr(self.tokenizers, track).mask_token_id,
            )
            tokens_dir[track] = sampling_metadata.pop("sampled_tokens")  # (L,)
            track_sampling_metadata_dir[track] = sampling_metadata

        # Sample function and residue annotations separately
        config = getattr(sampling_config, "function")
        if config is None:
            tokens_dir["function"] = maybe_clone(getattr(input, "function"))
            tokens_dir["residue_annotations"] = maybe_clone(
                getattr(input, "residue_annotations")
            )
        else:
            sampling_metadata = self._sample_function_track(
                tokens=getattr(protein_tensor, "function")[0, ...],
                logits=getattr(forward_output.logits, "function")[0, ...],
                sampling_track_config=config,
            )
            tokens_dir["function"] = sampling_metadata.pop("sampled_tokens")  # (L, D)
            track_sampling_metadata_dir["function"] = sampling_metadata

            sampled_tokens, _ = sample_residue_annotation_logits(
                logits=forward_output.residue_annotation_logits[0, ...]  # type: ignore
            )
            tokens_dir["residue_annotations"] = sampled_tokens  # (L, MAX_R)

        # Format output
        forward_and_sample_output_dir = {}
        forward_and_sample_output_dir["protein_tensor"] = ESMProteinTensor(**tokens_dir)
        for property in [
            "entropy",
            "prob",
            "logprob",
            "top_prob",
            "topk_logprob",
            "topk_tokens",
        ]:
            is_all_none = True
            forward_track_data_dir = {}
            for track in track_sampling_metadata_dir.keys():
                values = track_sampling_metadata_dir[track]
                if values is not None and values.get(property, None) is not None:
                    forward_track_data_dir[track] = values.get(property, None)
                    is_all_none = False
            if not is_all_none:
                forward_and_sample_output_dir[property] = ForwardTrackData(
                    **forward_track_data_dir
                )
            else:
                forward_and_sample_output_dir[property] = None

        perres_embed = (
            forward_output.embeddings[0]  # type: ignore
            if sampling_configuration.return_per_residue_embeddings
            else None
        )
        mean_embedding = (
            forward_output.embeddings[0].mean(1)  # type: ignore
            if sampling_configuration.return_per_residue_embeddings
            else None
        )

        return ForwardAndSampleOutput(
            per_residue_embedding=perres_embed,
            mean_embedding=mean_embedding,
            **forward_and_sample_output_dir,
        )
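
    # Usage sketch (comment only, not executed): one forward-and-sample step over the
    # sequence track. SamplingTrackConfig fields other than those referenced in
    # _sample_track below are assumptions about esm.sdk.api defaults.
    #
    #     out = model.forward_and_sample(
    #         protein_tensor,
    #         SamplingConfig(sequence=SamplingTrackConfig(temperature=0.7, topk_logprobs=5)),
    #     )
    #     out.protein_tensor.sequence   # sampled sequence tokens
    #     out.entropy.sequence          # per-position entropy for the sequence track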

    def _sample_track(
        self,
        logits: torch.Tensor,
        tokens: torch.Tensor,
        sampling_track_config: SamplingTrackConfig,
        mask_idx: int,
    ) -> dict[str, torch.Tensor]:
        # Sample in all positions
        temperature = sampling_track_config.temperature
        sampled_tokens = sample_logits(
            logits, temperature=temperature, top_p=sampling_track_config.top_p
        )
        log_probs = logits.log_softmax(-1)

        # Do not sample at BOS and EOS tokens
        sampling_mask = torch.ones_like(tokens, dtype=torch.bool)  # (L, )
        sampling_mask[0] = False
        sampling_mask[-1] = False

        # Do not sample at special token positions but allow sampling at mask token
        special_minus_mask = list(set(sampling_track_config.invalid_ids) - {mask_idx})
        if len(special_minus_mask) > 0:
            special_tokens = torch.tensor(special_minus_mask, device=tokens.device)
            assert special_tokens.numel() > 0
            sampling_mask = sampling_mask & (
                tokens[..., None] != special_tokens[None, :]
            ).all(-1)

        # Keep only samples from masked positions (if specified)
        if sampling_track_config.only_sample_masked_tokens:
            masked_tokens = tokens == mask_idx
            sampling_mask = sampling_mask & masked_tokens

        sampled_tokens = torch.where(sampling_mask, sampled_tokens, tokens)

        return self._compute_track_metadata(
            sampled_tokens,
            log_probs,
            sampling_mask,
            top_k=sampling_track_config.topk_logprobs,
        )

    def _sample_function_track(
        self,
        tokens: torch.Tensor,
        logits: torch.Tensor,
        sampling_track_config: SamplingTrackConfig,
    ) -> dict[str, torch.Tensor]:
        # Do not sample at BOS and EOS tokens
        sampling_mask = torch.ones_like(tokens, dtype=torch.bool)
        sampling_mask[0] = False
        sampling_mask[-1] = False

        sampled_tokens, probs = sample_function_logits(
            logits,
            self.tokenizers.function,
            top_p=sampling_track_config.top_p,
            temperature=sampling_track_config.temperature,
        )
        if sampling_track_config.only_sample_masked_tokens:
            raise ValueError(
                "Sampling only masked tokens is undefined for function tokens."
            )

        sampled_tokens = torch.where(sampling_mask, sampled_tokens, tokens)  # (L, D)

        return self._compute_track_metadata(
            sampled_tokens,
            probs,
            sampling_mask,
            top_k=sampling_track_config.topk_logprobs,
        )
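
    # Both samplers above return the per-track metadata dictionary produced by
    # _compute_track_metadata below; forward_and_sample regroups those dictionaries
    # into ForwardTrackData fields (entropy, prob, logprob, top_prob, topk_logprob,
    # topk_tokens).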

    @staticmethod
    def _compute_track_metadata(
        sampled_tokens: torch.Tensor,
        log_probs: torch.Tensor,
        sampling_mask: torch.Tensor,
        top_k: int,
    ) -> dict:
        probs = torch.exp(log_probs)  # (B, L)
        entropy = torch.distributions.Categorical(probs=probs).entropy()  # (B, L)

        # Only compute probabilities for sampled tokens
        sampled_logprob = torch.zeros_like(
            sampled_tokens, dtype=torch.float32
        )  # (B, L)
        sampled_tokens_valid = sampled_tokens[sampling_mask]
        sampled_log_probs_valid = log_probs[sampling_mask, sampled_tokens_valid]
        sampled_logprob[sampling_mask] = sampled_log_probs_valid

        # Calculate extra metadata
        sampled_prob = torch.exp(sampled_logprob)
        top_prob = torch.max(probs, dim=-1).values
        topk_logprobs, topk_tokens = torch.topk(log_probs, top_k, dim=-1)
        topk_logprobs = None if top_k == 0 else topk_logprobs
        topk_tokens = None if top_k == 0 else topk_tokens

        return {
            "entropy": entropy,
            "sampled_tokens": sampled_tokens,
            "prob": sampled_prob,
            "logprob": sampled_logprob,
            "top_prob": top_prob,
            "topk_logprob": topk_logprobs,
            "topk_tokens": topk_tokens,
        }