Sombit committed on
Commit cdba976 · verified
1 Parent(s): 36e08c0

Upload TrajectoryVLA

config.json CHANGED
@@ -1,222 +1,37 @@
1
  {
2
  "auto_map": {
3
- "AutoConfig": "prismatic_config.TrajectoryVLAConfig"
4
  },
5
- "cheat": false,
6
- "model_type": "trajectoryvla",
7
- "num_timesteps": 6,
8
- "prismatic_config": {
9
- "_name_or_path": "",
10
- "add_cross_attention": false,
11
- "arch_specifier": "no-align+gelu-mlp",
12
- "architectures": [
13
- "TrajectoryVLA"
14
- ],
15
- "auto_map": {
16
- "AutoModelForVision2Seq": "prismatic_model.TrajectoryVLA"
17
- },
18
- "bad_words_ids": null,
19
- "begin_suppress_tokens": null,
20
- "bos_token_id": null,
21
- "chunk_size_feed_forward": 0,
22
- "cross_attention_hidden_size": null,
23
- "decoder_start_token_id": null,
24
- "diversity_penalty": 0.0,
25
- "do_sample": false,
26
- "early_stopping": false,
27
- "encoder_no_repeat_ngram_size": 0,
28
- "eos_token_id": null,
29
- "exponential_decay_length_penalty": null,
30
- "finetuning_task": null,
31
- "forced_bos_token_id": null,
32
- "forced_eos_token_id": null,
33
- "hf_llm_id": "meta-llama/Llama-2-7b-hf",
34
- "id2label": {
35
- "0": "LABEL_0",
36
- "1": "LABEL_1"
37
- },
38
- "image_resize_strategy": "letterbox",
39
- "image_sizes": [
40
- 224,
41
- 224
42
- ],
43
- "is_decoder": false,
44
- "is_encoder_decoder": false,
45
- "label2id": {
46
- "LABEL_0": 0,
47
- "LABEL_1": 1
48
- },
49
- "length_penalty": 1.0,
50
- "llm_backbone_id": "llama2-7b-pure",
51
- "llm_max_length": 2048,
52
- "max_length": 20,
53
- "min_length": 0,
54
- "model_type": "prismatic",
55
- "no_repeat_ngram_size": 0,
56
- "num_beam_groups": 1,
57
- "num_beams": 1,
58
- "num_return_sequences": 1,
59
- "output_attentions": false,
60
- "output_hidden_states": false,
61
- "output_projector_states": false,
62
- "output_scores": false,
63
- "pad_to_multiple_of": 64,
64
- "pad_token_id": 32000,
65
- "prefix": null,
66
- "problem_type": null,
67
- "pruned_heads": {},
68
- "remove_invalid_values": false,
69
- "repetition_penalty": 1.0,
70
- "return_dict": false,
71
- "return_dict_in_generate": false,
72
- "sep_token_id": null,
73
- "suppress_tokens": null,
74
- "task_specific_params": null,
75
- "temperature": 1.0,
76
- "text_config": {
77
- "_name_or_path": "",
78
- "add_cross_attention": false,
79
- "architectures": null,
80
- "attention_bias": false,
81
- "attention_dropout": 0.0,
82
- "bad_words_ids": null,
83
- "begin_suppress_tokens": null,
84
- "bos_token_id": 1,
85
- "chunk_size_feed_forward": 0,
86
- "cross_attention_hidden_size": null,
87
- "decoder_start_token_id": null,
88
- "diversity_penalty": 0.0,
89
- "do_sample": false,
90
- "early_stopping": false,
91
- "encoder_no_repeat_ngram_size": 0,
92
- "eos_token_id": 2,
93
- "exponential_decay_length_penalty": null,
94
- "finetuning_task": null,
95
- "forced_bos_token_id": null,
96
- "forced_eos_token_id": null,
97
- "hidden_act": "silu",
98
- "hidden_size": 4096,
99
- "id2label": {
100
- "0": "LABEL_0",
101
- "1": "LABEL_1"
102
- },
103
- "initializer_range": 0.02,
104
- "intermediate_size": 11008,
105
- "is_decoder": false,
106
- "is_encoder_decoder": false,
107
- "label2id": {
108
- "LABEL_0": 0,
109
- "LABEL_1": 1
110
- },
111
- "length_penalty": 1.0,
112
- "max_length": 20,
113
- "max_position_embeddings": 2048,
114
- "min_length": 0,
115
- "mlp_bias": false,
116
- "model_type": "llama",
117
- "no_repeat_ngram_size": 0,
118
- "num_attention_heads": 32,
119
- "num_beam_groups": 1,
120
- "num_beams": 1,
121
- "num_hidden_layers": 32,
122
- "num_key_value_heads": 32,
123
- "num_return_sequences": 1,
124
- "output_attentions": false,
125
- "output_hidden_states": false,
126
- "output_scores": false,
127
- "pad_token_id": null,
128
- "prefix": null,
129
- "pretraining_tp": 1,
130
- "problem_type": null,
131
- "pruned_heads": {},
132
- "remove_invalid_values": false,
133
- "repetition_penalty": 1.0,
134
- "return_dict": true,
135
- "return_dict_in_generate": false,
136
- "rms_norm_eps": 1e-06,
137
- "rope_scaling": null,
138
- "rope_theta": 10000.0,
139
- "sep_token_id": null,
140
- "suppress_tokens": null,
141
- "task_specific_params": null,
142
- "temperature": 1.0,
143
- "tf_legacy_loss": false,
144
- "tie_encoder_decoder": false,
145
- "tie_word_embeddings": false,
146
- "tokenizer_class": null,
147
- "top_k": 50,
148
- "top_p": 1.0,
149
- "torch_dtype": null,
150
- "torchscript": false,
151
- "typical_p": 1.0,
152
- "use_bfloat16": false,
153
- "use_cache": true,
154
- "vocab_size": 32000
155
- },
156
- "tf_legacy_loss": false,
157
- "tie_encoder_decoder": false,
158
- "tie_word_embeddings": true,
159
- "timm_model_ids": [
160
- "vit_large_patch14_reg4_dinov2.lvd142m",
161
- "vit_so400m_patch14_siglip_224"
162
- ],
163
- "timm_override_act_layers": [
164
- null,
165
- null
166
- ],
167
- "tokenizer_class": null,
168
- "top_k": 50,
169
- "top_p": 1.0,
170
- "torch_dtype": "bfloat16",
171
- "torchscript": false,
172
- "typical_p": 1.0,
173
- "use_bfloat16": false,
174
- "use_fused_vision_backbone": true,
175
- "vision_backbone_id": "dinosiglip-vit-so-224px"
176
  },
177
- "rotation_components": 9,
178
- "seperate_control_proj": true,
179
- "timestep_proj_config": {
180
- "num_tokens": 3,
181
- "pos_embed_scale": 8,
182
- "proj_layers": [
183
- 128,
184
- 512,
185
- 1024
186
- ],
187
- "time_delta_sec": 0.1
188
- },
189
- "token_proj_config": {
190
- "control_tokens_layers": [
191
- 4096,
192
- 2048,
193
- 1024
194
- ],
195
- "image_tokens_mode": "vit",
196
- "llm_image_tokens_layers": [],
197
- "vit_tokens_layers": [
198
- 2176,
199
- 1024
200
- ]
201
- },
202
- "token_size": 1024,
203
- "transformer_config": {
204
- "decoder_block_config": {
205
- "dropout": 0.0,
206
- "feature_size": 1024,
207
- "head_dim": 64,
208
- "num_heads": 16
209
- },
210
- "encoder_block_config": {
211
- "feature_size": 1024,
212
- "head_dim": 64,
213
- "num_heads": 16
214
- },
215
- "num_blocks": 2,
216
- "pos_embed_config": {
217
- "embedding_dim": 1024,
218
- "num_embeddings": 300
219
- }
220
- },
221
- "transformers_version": "4.44.2"
222
  }
 
1
  {
2
+ "arch_specifier": "no-align+gelu-mlp",
3
+ "architectures": [
4
+ "TrajectoryVLA"
5
+ ],
6
  "auto_map": {
7
+ "AutoModelForVision2Seq": "prismatic_model.TrajectoryVLA"
8
  },
9
+ "hf_llm_id": "meta-llama/Llama-2-7b-hf",
10
+ "image_resize_strategy": "letterbox",
11
+ "image_sizes": [
12
+ 224,
13
+ 224
14
+ ],
15
+ "llm_backbone_id": "llama2-7b-pure",
16
+ "llm_max_length": 2048,
17
+ "model_type": "prismatic",
18
+ "output_projector_states": false,
19
+ "pad_to_multiple_of": 64,
20
+ "pad_token_id": 32000,
21
+ "return_dict": false,
22
+ "text_config": {
23
+ "model_type": "llama"
24
  },
25
+ "timm_model_ids": [
26
+ "vit_large_patch14_reg4_dinov2.lvd142m",
27
+ "vit_so400m_patch14_siglip_224"
28
+ ],
29
+ "timm_override_act_layers": [
30
+ null,
31
+ null
32
+ ],
33
+ "torch_dtype": "bfloat16",
34
+ "transformers_version": "4.44.2",
35
+ "use_fused_vision_backbone": true,
36
+ "vision_backbone_id": "dinosiglip-vit-so-224px"
37
  }
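The trimmed config keeps only the fields the custom code reads and registers the model class through `auto_map` ("AutoModelForVision2Seq" -> "prismatic_model.TrajectoryVLA"), so it is loaded via the Transformers auto classes with `trust_remote_code=True`. A minimal loading sketch; the repo id below is a placeholder (not given in this commit) and assumes `prismatic_config.py`/`prismatic_model.py` ship alongside `config.json`:

import torch
from transformers import AutoModelForVision2Seq

# Placeholder repo id -- substitute the actual namespace/name of this repository.
model = AutoModelForVision2Seq.from_pretrained(
    "your-namespace/TrajectoryVLA",
    torch_dtype=torch.bfloat16,   # matches "torch_dtype": "bfloat16" in config.json
    trust_remote_code=True,       # required because auto_map points at prismatic_model.TrajectoryVLA
)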
generation_config.json ADDED
@@ -0,0 +1,7 @@
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 32000,
6
+ "transformers_version": "4.44.2"
7
+ }
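These ids line up with the Llama-2 tokenizer used by the model code below (BOS = 1, EOS = 2) plus the extra `<PAD>` token the model adds at id 32000. A quick sanity check, reusing the placeholder repo id from the sketch above:

from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained("your-namespace/TrajectoryVLA")  # placeholder repo id
print(gen_cfg.bos_token_id, gen_cfg.eos_token_id, gen_cfg.pad_token_id)     # expected: 1 2 32000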
model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5cab95ea8a69faf885ec29dce3dba829617f86bf9fc8fdd730dbf28804ad7bf1
3
+ size 6948963952
model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fbb646e9b5155db78dfeb12260d2e3171f0ae53bed32d9a5a7c488e08c3372ee
3
+ size 6971232352
model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf01b558f01cfa15b7d7112150d134d1b633d6ab56db901759885f59503371ff
3
+ size 1266349562
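The three entries above are Git LFS pointer files: the `oid sha256:...` and `size` lines describe the real shard contents fetched by LFS, not the weights themselves. A short sketch for verifying a downloaded shard against its pointer (local file name assumed to match the pointer's):

import hashlib

def sha256_of(path: str) -> str:
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest()

# Should print the oid from the pointer, e.g. cf01b558... for the third shard.
print(sha256_of("model-00003-of-00003.safetensors"))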
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
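Although the index diff is not rendered, `model.safetensors.index.json` follows the standard Transformers sharded-checkpoint layout: a `metadata.total_size` field plus a `weight_map` from parameter names to the three shards above. A small inspection sketch (placeholder repo id again):

import json
from huggingface_hub import hf_hub_download

index_path = hf_hub_download("your-namespace/TrajectoryVLA", "model.safetensors.index.json")
with open(index_path) as f:
    index = json.load(f)

print(index["metadata"]["total_size"])            # total bytes across the three shards
print(sorted(set(index["weight_map"].values())))  # which shard file each tensor lives in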
 
prismatic_model.py ADDED
@@ -0,0 +1,1129 @@
1
+ """
2
+ prismatic_model.py
3
+
4
+ Core HuggingFace-style PrismaticPreTrainedModel and PrismaticForConditionalGeneration class definitions, inheriting
5
+ from the default `transformers.PretrainedModel`. Meant to be standalone and self-contained, but exactly replicate the
6
+ logic in `prismatic.models.vlms.prismatic.py`.
7
+
8
+ Note =>> for the time being, not adding the custom HF "docstring" formatting.
9
+
10
+ References [LLaVa, IDEFICS-2]:
11
+ => https://github.com/huggingface/transformers/blob/main/src/transformers/models/llava/modeling_llava.py
12
+ => https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics2/modeling_idefics2.py
13
+ """
14
+
15
+ import logging
16
+ from dataclasses import dataclass
17
+ from functools import partial
18
+ from typing import Any, Callable, ClassVar, Dict, List, Optional, Tuple, Union
19
+ from functools import cached_property
20
+ # from barrel.components.nn.layers.nerf_pos_embed import NeRFPositionalEmbedding
21
+
22
+ import numpy as np
23
+ import timm
24
+ import tokenizers
25
+ import torch
26
+ import torch.nn as nn
27
+ import transformers
28
+ from timm.models.vision_transformer import LayerScale
29
+ from transformers import AutoModelForCausalLM, PretrainedConfig, PreTrainedModel
30
+ from transformers.modeling_outputs import ModelOutput
31
+ import collections
32
+ import math
33
+ from barrel.pipes.vlams.extern.prismatic_config import OpenVLAConfig, PrismaticConfig , TrajectoryVLAConfig, WaypointTokenizer
34
+ # from barrel.pipes.vlams.models.control.token_proj import TokenProjector
35
+ from barrel.pipes.vlams.extern.datatypes import *
36
+ from barrel.pipes.vlams.extern.detr import *
37
+ from IPython import embed
38
+ import os
39
+ from PIL import Image
40
+ from pathlib import Path
41
+ from torch.amp.autocast_mode import autocast # Corrected import for latest PyTorch
42
+ from scipy.spatial.transform import Rotation as R
43
+ ht_token_path = Path(".hf_token")
44
+ HF_TOKEN = ht_token_path.read_text().strip() if ht_token_path.exists() else os.environ.get("HF_TOKEN", "")  # read .hf_token if present, else fall back to the env var (original else-branch referenced an undefined name)
45
+
46
+ # Get Logger
47
+ logger = logging.getLogger(__name__)
48
+ torch.backends.cudnn.benchmark = False
49
+ torch.backends.cudnn.deterministic = True
50
+
51
+ # === PyTorch/HuggingFace Default IGNORE_INDEX (for CrossEntropyLoss labels)
52
+ IGNORE_INDEX = -100
53
+
54
+
55
+ # === Utility Functions for Monkey-Patching ===
56
+ def unpack_tuple(fn: Callable[[Any], Tuple[Any]]) -> Callable[[Any], Any]:
57
+ def wrapper(*args: Any, **kwargs: Any) -> Any:
58
+ result = fn(*args, **kwargs)
59
+ return result[0] if isinstance(result, tuple) else result
60
+
61
+ return wrapper
62
+
63
+
64
+ # HF Transformers overwrites parameters with names containing `gamma`; we're going to patch VisionBackbone.LayerScale.
65
+ # =>> TIMM :: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L109
66
+ # =>> Transformers :: https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_utils.py#L3960
67
+ def _ls_new_forward(self, x: torch.Tensor) -> torch.Tensor:
68
+ return x.mul_(self.scale_factor) if self.inplace else x * self.scale_factor
69
+
70
+
71
+ def ls_apply_patch(ls_module: LayerScale):
72
+ ls_module.scale_factor = nn.Parameter(ls_module.gamma.clone())
73
+ ls_module.forward = _ls_new_forward.__get__(ls_module, LayerScale)
74
+ del ls_module.gamma
75
+
76
+
77
+ # === Prismatic Vision Backbone (nn.Module) Definitions (w/ Fused Backbone Support) ===
78
+ class PrismaticVisionBackbone(nn.Module):
79
+ def __init__(
80
+ self,
81
+ use_fused_vision_backbone: bool,
82
+ image_sizes: List[int],
83
+ timm_model_ids: List[str],
84
+ timm_override_act_layers: List[Optional[str]],
85
+ ) -> None:
86
+ super().__init__()
87
+ self.use_fused_vision_backbone = use_fused_vision_backbone
88
+
89
+ # [Contract] Validate number of (fused) vision backbones, create "alpha" featurizer and Instantiate
90
+ # =>> Note :: Monkey-Patch the `forward()` function of the backbone to ensure FSDP-compatibility
91
+ # Hardcodes `get_intermediate_layers` to return the **SECOND-TO-LAST** layer patches!
92
+ assert len(timm_model_ids) <= 2, "Prismatic models only support up to 2 (fused) vision backbones!"
93
+
94
+ self.dino_featurizer = timm.create_model(
95
+ timm_model_ids[0],
96
+ pretrained=True,
97
+ num_classes=0,
98
+ img_size=image_sizes[0],
99
+ act_layer=timm_override_act_layers[0],
100
+ )
101
+ self.dino_featurizer.eval()
102
+
103
+ self.embed_dim = self.dino_featurizer.embed_dim
104
+
105
+ # If `use_fused_vision_backbone` =>> create "beta" featurizer
106
+ # if self.use_fused_vision_backbone:
107
+ self.siglip_featurizer = timm.create_model(
108
+ timm_model_ids[1],
109
+ pretrained=True,
110
+ num_classes=0,
111
+ img_size=image_sizes[1],
112
+ act_layer=timm_override_act_layers[1],)
113
+
114
+ self.siglip_featurizer.eval()
115
+
116
+ self.dino_featurizer.forward = partial(
117
+ self.dino_featurizer.forward_intermediates,
118
+ indices=[len(self.dino_featurizer.blocks) - 2],
119
+ return_prefix_tokens=False,
120
+ norm=False,
121
+ stop_early=True,
122
+ output_fmt='NLC',
123
+ intermediates_only=True,
124
+ )
125
+ self.siglip_featurizer.forward = partial(
126
+ self.siglip_featurizer.forward_intermediates,
127
+ indices=[len(self.siglip_featurizer.blocks) - 2],
128
+ return_prefix_tokens=False,
129
+ norm=False,
130
+ stop_early=True,
131
+ output_fmt='NLC',
132
+ intermediates_only=True,
133
+ )
134
+ self.embed_dim += self.siglip_featurizer.embed_dim
135
+
136
+ def forward(self, pixel_values) -> torch.Tensor:
137
+ """Run image (`pixel_values`) through featurizer; if channel-stacked, then dispatch and sequence stack."""
138
+ if not self.use_fused_vision_backbone:
139
+ return self.featurizer(pixel_values)  # non-fused path: assumes a single `featurizer` attribute, which this fused-only class never constructs
140
+
141
+ # Split `pixel_values :: [bsz, 2 * 3, resolution, resolution]` =>> featurize =>> channel stack
142
+ # img, img_fused = torch.split(pixel_values, [3, 3], dim=1)
143
+ img = pixel_values['dino']
144
+ img_fused = pixel_values['siglip']
145
+ patches, patches_fused = self.dino_featurizer(img)[0], self.siglip_featurizer(img_fused)[0]
146
+
147
+ return torch.cat([patches, patches_fused], dim=2)
148
+
149
+
150
+
151
+ class PrismaticProjector(nn.Module):
152
+ def __init__(self, use_fused_vision_backbone, vision_dim: int, llm_dim: int) -> None:
153
+ super().__init__()
154
+ self.initial_projection_dim = vision_dim * 4
155
+ self.projector = torch.nn.Sequential(
156
+ torch.nn.Linear(vision_dim, self.initial_projection_dim, bias=True),
157
+ torch.nn.GELU(),
158
+ torch.nn.Linear(self.initial_projection_dim, llm_dim, bias=True),
159
+ torch.nn.GELU(),
160
+ torch.nn.Linear(llm_dim, llm_dim, bias=True),
161
+ )
162
+
163
+ def forward(self, fused_img_patches: torch.Tensor) -> torch.Tensor:
164
+ return self.projector(fused_img_patches)
165
+
166
+ # === Main HF Class Definitions ===
167
+ @dataclass
168
+ class PrismaticCausalLMOutputWithPast(ModelOutput):
169
+ """Base class for Prismatic causal (visually-conditioned) language model outputs; also exposes visual features."""
170
+
171
+ loss: Optional[torch.FloatTensor] = None
172
+ logits: torch.FloatTensor = None
173
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
174
+ hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
175
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
176
+
177
+ # Additions for VLMs
178
+ projector_features: Optional[torch.FloatTensor] = None
179
+
180
+
181
+ class PrismaticPreTrainedModel(PreTrainedModel):
182
+ config_class: PrismaticConfig
183
+ base_model_prefix: str = "model"
184
+ supports_gradient_checkpointing: bool = True
185
+
186
+ _no_split_modules: ClassVar[List[str]] = ["PrismaticProjector"]
187
+ _skip_keys_device_placement: str = "past_key_values"
188
+ _supports_flash_attn_2: bool = True
189
+
190
+ def _init_weights(self, module: nn.Module) -> None:
191
+ # Important :: this HF ported version is *not* meant for training from scratch; only inference and fine-tuning!
192
+ # => As such, this init_weights code is not correct; if training VLMs from scratch, use the main codebase at
193
+ # https://github.com/TRI-ML/prismatic-vlms
194
+ std = (
195
+ self.config.initializer_range
196
+ if hasattr(self.config, "initializer_range")
197
+ else self.config.text_config.initializer_range
198
+ )
199
+
200
+ if hasattr(module, "class_embedding"):
201
+ module.class_embedding.data.normal_(mean=0.0, std=std)
202
+
203
+ if isinstance(module, (nn.Linear, nn.Conv2d)):
204
+ module.weight.data.normal_(mean=0.0, std=std)
205
+ if module.bias is not None:
206
+ module.bias.data.zero_()
207
+ elif isinstance(module, nn.Embedding):
208
+ module.weight.data.normal_(mean=0.0, std=std)
209
+ if module.padding_idx is not None:
210
+ module.weight.data[module.padding_idx].zero_()
211
+
212
+ @property
213
+ def _supports_sdpa(self) -> bool:
214
+ """Check LLM supports SDPA Attention"""
215
+ return self.language_model._supports_sdpa
216
+
217
+ class LLMBackbone(nn.Module):
218
+ def __init__(self, config):
219
+ super().__init__()
220
+ self.config = config
221
+ self.llm : AutoModelForCausalLM
222
+ self.tokenizer = self._create_tokenizer()
223
+
224
+ def _create_tokenizer(self) -> transformers.PreTrainedTokenizerBase:
225
+ # Load (Fast) Tokenizer
226
+ print(f"Loading (Fast) Tokenizer via the AutoTokenizer API")
227
+ tokenizer = transformers.AutoTokenizer.from_pretrained(
228
+ self.config['hf_model_id'],
229
+ model_max_length=self.config['llm_max_length'],
230
+ token=HF_TOKEN,
231
+ padding_side="right",
232
+ )
233
+
234
+ # Validation =>> Our VLM logic currently operates under the assumption that the tokenization of a new input
235
+ # starts with a <BOS> token unless `add_special_tokens = False`; for these models, we empirically
236
+ # find that adding image patches *after* the BOS leads to much better performance.
237
+ #
238
+ # As a result we explicitly validate that a tokenizer conforms to the expected behavior; if you're reading this
239
+ # line, it's probably because you're adding a new LLM with a different tokenizer behavior. If so, feel free to
240
+ # override the `SPECIAL_CASES` set below, but make sure to make the appropriate changes in the `datasets.py`
241
+ # and VLM `forward()` logic!
242
+ SPECIAL_CASES = {
243
+ # Phi-2 Tokenizer doesn't add any BOS tokens by default, and sets BOS == EOS == "<|endoftext|>"
244
+ # =>> We'll prepend BOS to first input (to play nicely with image token insertion logic; verified that
245
+ # this works well with base LLM generation.
246
+ # =>> Like Llama-2 Tokenizers -- we'll add a special PAD token for training purposes.
247
+ "microsoft/phi-2",
248
+ }
249
+ if self.config['hf_model_id'] not in SPECIAL_CASES:
250
+ # Note =>> this assert should hold for all Llama-derived tokenizers (`LlamaTokenizerFast`) =>> includes Mistral!
251
+ assert (
252
+ tokenizer("Test 123", add_special_tokens=True).input_ids[0] == tokenizer.bos_token_id
253
+ ) and (
254
+ tokenizer("Test 123", add_special_tokens=False).input_ids[0] != tokenizer.bos_token_id
255
+ ), f"Default Tokenizer of type `{type(tokenizer)}` does not automatically prefix inputs with BOS token!\n"
256
+
257
+ return tokenizer
258
+
259
+ class PrismaticForConditionalGeneration(PrismaticPreTrainedModel):
260
+ def __init__(self, config: PrismaticConfig) -> None:
261
+ super().__init__(config)
262
+ # [Validation] Lightweight Validate on `config` Fields + Dependency Versions
263
+ if config.use_fused_vision_backbone is None:
264
+ raise ValueError("Missing config field `use_fused_vision_backbone`")
265
+
266
+ # if timm.__version__ not in {"0.9.10", "0.9.11", "0.9.12", "0.9.16"}:
267
+ # raise NotImplementedError(
268
+ # "TIMM Version must be >= 0.9.10 and < 1.0.0 (breaking); please raise a GitHub Issue "
269
+ # "if you urgently need support for latest TIMM versions."
270
+ # )
271
+
272
+ # if (transformers.__version__ != "4.40.1") or (tokenizers.__version__ != "0.19.1"):
273
+ # logger.warning(
274
+ # f"Expected `transformers==4.40.1` and `tokenizers==0.19.1` but got "
275
+ # f"`transformers=={transformers.__version__}` and `tokenizers=={tokenizers.__version__}`; "
276
+ # f"there might be inference-time regressions due to dependency changes. If in doubt, please"
277
+ # f"use the above versions."
278
+ # )
279
+
280
+ # Instantiate PrismaticVisionBackbone (w/ Potential Fused Backbone)
281
+ self.vision_backbone = PrismaticVisionBackbone(
282
+ config.use_fused_vision_backbone, config.image_sizes, config.timm_model_ids, config.timm_override_act_layers
283
+ )
284
+
285
+ # Create Multimodal Projector
286
+ self.projector = PrismaticProjector(
287
+ config.use_fused_vision_backbone,
288
+ vision_dim=self.vision_backbone.embed_dim,
289
+ llm_dim=config.text_config.hidden_size,
290
+ )
291
+
292
+ # Instantiate LLM Backbone
293
+ self.llm_backbone = LLMBackbone({'hf_model_id': config.hf_llm_id, 'llm_max_length': config.llm_max_length, "pad_token_id" :32000,
294
+ "pad_to_multiple_of" : 64,})
295
+
296
+ # self.llm_backbone.llm = AutoModelForCausalLM.from_config(
297
+ # config.text_config, attn_implementation="flash_attention_2"
298
+ # )
299
+ self.llm_backbone.llm = AutoModelForCausalLM.from_pretrained(
300
+ 'meta-llama/Llama-2-7b-hf',
301
+ token=HF_TOKEN,
302
+ attn_implementation='flash_attention_2',
303
+ # The following parameters are set to prevent `UserWarnings` from HF; we want greedy decoding!
304
+ do_sample=False,
305
+ temperature=1.0,
306
+ use_cache=False,
307
+ top_p=1.0, )
308
+
309
+ self.llm_backbone.tokenizer.add_special_tokens({"pad_token": "<PAD>"})
310
+ self.llm_backbone.llm.config.pad_token_id = self.llm_backbone.tokenizer.pad_token_id
311
+ self.llm_backbone.llm.resize_token_embeddings(len(self.llm_backbone.tokenizer), pad_to_multiple_of=64)
312
+
313
+
314
+
315
+ # self.llm_backbone.llm.config.pad_token_id = self.llm_backbone.tokenizer.pad_token_id
316
+ # self.llm_backbone.llm.resize_token_embeddings(len(self.llm_backbone.tokenizer), pad_to_multiple_of=64)
317
+ # self.resize_token_embeddings(32001,64)
318
+
319
+ self.vocab_size = config.text_config.vocab_size
320
+ self.pad_token_id = config.pad_token_id
321
+
322
+ # HF Boilerplate =>> initializes weights via `_init_weights()` and sets gradient checkpointing
323
+ self.post_init()
324
+
325
+ # === `PreTrainedModel` Boilerplate ===
326
+ def get_input_embeddings(self) -> nn.Module:
327
+ return self.llm_backbone.llm.get_input_embeddings()
328
+
329
+ def set_input_embeddings(self, value: nn.Module) -> None:
330
+ self.llm_backbone.llm.set_input_embeddings(value)
331
+
332
+ def get_output_embeddings(self) -> nn.Module:
333
+ return self.llm_backbone.llm.get_output_embeddings()
334
+
335
+ def set_output_embeddings(self, new_embeddings: nn.Module) -> None:
336
+ self.llm_backbone.llm.set_output_embeddings(new_embeddings)
337
+
338
+ def get_decoder(self) -> nn.Module:
339
+ return self.llm_backbone.llm.get_decoder()
340
+
341
+ def set_decoder(self, decoder: nn.Module) -> None:
342
+ self.llm_backbone.llm.set_decoder(decoder)
343
+
344
+ def tie_weights(self) -> None:
345
+ self.llm_backbone.llm.tie_weights() # Note: `Llama-2` and `Mistral` don't tie weights (no-op)
346
+
347
+ # def resize_token_embeddings(
348
+ # self, new_num_tokens: Optional[int] = None, pad_to_multiple_of: Optional[int] = None
349
+ # ) -> nn.Embedding:
350
+ # updated_embeddings = self.llm_backbone.llm.resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
351
+
352
+ # # Update config/instance variables
353
+ # self.config.text_config.vocab_size = updated_embeddings.num_embeddings
354
+ # self.vocab_size = updated_embeddings.num_embeddings
355
+
356
+ # return updated_embeddings
357
+
358
+ # === Core Prismatic VLM `forward()` Logic ===
359
+ def forward(
360
+ self,
361
+ input_ids: Optional[torch.LongTensor] ,
362
+ attention_mask: Optional[torch.Tensor],
363
+ # pixel_values: Optional[torch.FloatTensor] = None,
364
+ pixel_values: Dict[str, torch.Tensor] = {},
365
+ labels: Optional[torch.LongTensor] = None,
366
+ inputs_embeds: Optional[torch.FloatTensor] = None,
367
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
368
+ use_cache: Optional[bool] = None,
369
+ output_attentions: Optional[bool] = None,
370
+ output_hidden_states: Optional[bool] = None,
371
+ output_projector_features: Optional[bool] = None,
372
+ return_dict: Optional[bool] = None,
373
+ **kwargs: Any,
374
+ ) -> Union[Tuple, PrismaticCausalLMOutputWithPast]:
375
+ """Run a forward pass through the VLM, returning a PrismaticCausalLMOutputWithPast instance."""
376
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
377
+ output_hidden_states = (
378
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
379
+ )
380
+ output_projector_features = output_projector_features if output_projector_features is not None else False
381
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
382
+
383
+ # Respect `use_cache` only if not training (even if `gradient_checkpointing` is off)
384
+ use_cache = use_cache and not self.training
385
+
386
+ # Instantiate Placeholder for Projector Features
387
+ projected_patch_embeddings = None
388
+
389
+ # Note :: We only support forward passes with the following cases:
390
+ # => Cached Generation :: (input_ids.shape[1] == 1) and (past_key_values is not None)
391
+ # => Unimodal Forward :: (pixel_values is None)
392
+ # => Multimodal Forward :: (pixel_values is not None) and (input_ids/embeds.shape[0] == pixel_values.shape[0])
393
+
394
+ # === Handle Generation with Cache (`input_ids.shape[1] == 1`) =>> requires `past_keys_values` ===
395
+ if input_ids.shape[1] == 1:
396
+ assert input_ids.shape[0] == 1, "Generation is only currently supported for batch size of 1!"
397
+ assert past_key_values is not None, "You must provide `past_key_values` during cached generation!"
398
+ assert labels is None, "Unexpected key `labels` provided during cached generation!"
399
+
400
+ language_model_output = self.llm_backbone.llm(
401
+ input_ids=input_ids,
402
+ attention_mask=None,
403
+ position_ids=None,
404
+ past_key_values=past_key_values,
405
+ inputs_embeds=None,
406
+ labels=None,
407
+ use_cache=use_cache,
408
+ output_attentions=output_attentions,
409
+ output_hidden_states=output_hidden_states,
410
+ return_dict=return_dict,
411
+ )
412
+
413
+ # === Handle Unimodal Forward ===
414
+ elif pixel_values is None:
415
+ assert (input_ids is not None) and (inputs_embeds is None), "Missing `input_ids` in language-only forward!"
416
+ assert past_key_values is None, "Unexpected key `past_key_values` provided during language-only forward!"
417
+
418
+ language_model_output = self.llm_backbone.llm(
419
+ input_ids=input_ids,
420
+ attention_mask=attention_mask,
421
+ position_ids=None,
422
+ past_key_values=None,
423
+ inputs_embeds=None,
424
+ labels=labels,
425
+ use_cache=use_cache,
426
+ output_attentions=output_attentions,
427
+ output_hidden_states=output_hidden_states,
428
+ return_dict=return_dict,
429
+ )
430
+
431
+ # === Handle Multimodal Forward ===
432
+
433
+ elif (input_ids.shape[0] == pixel_values['dino'].shape[0]) or (inputs_embeds.shape[0] == pixel_values['dino'].shape[0]):
434
+ assert past_key_values is None, "Unexpected key `past_key_values` provided during language-only forward!"
435
+
436
+ # Visual Feature Extraction
437
+ patch_features = self.vision_backbone(pixel_values)
438
+
439
+ projected_patch_embeddings = self.projector(patch_features) ## matches
440
+ projected_patch_attention_mask = None
441
+ if attention_mask is not None:
442
+ projected_patch_attention_mask = torch.full(
443
+ (projected_patch_embeddings.shape[0], projected_patch_embeddings.shape[1]),
444
+ fill_value=True,
445
+ dtype=attention_mask.dtype,
446
+ device=attention_mask.device,
447
+ )
448
+
449
+ # Get Input Embeddings (from Language Model Embeddings)
450
+ input_embeddings = self.get_input_embeddings()(input_ids)
451
+
452
+ # Build Multimodal Embeddings & Attention Mask =>> Prismatic defaults to inserting after <BOS> token (1:)
453
+ multimodal_embeddings = torch.cat(
454
+ [input_embeddings[:, :1, :], projected_patch_embeddings, input_embeddings[:, 1:, :]], dim=1
455
+ )
456
+ multimodal_attention_mask = None
457
+ if attention_mask is not None:
458
+ multimodal_attention_mask = torch.cat(
459
+ [attention_mask[:, :1], projected_patch_attention_mask, attention_mask[:, 1:]], dim=1
460
+ )
461
+
462
+ # Build Labels (if specified) =>> Ignore Labels for Patch Embeddings
463
+ multimodal_labels = None
464
+ if labels is not None:
465
+ projected_patch_labels = torch.full(
466
+ (projected_patch_embeddings.shape[0], projected_patch_embeddings.shape[1]),
467
+ fill_value=IGNORE_INDEX,
468
+ dtype=labels.dtype,
469
+ device=labels.device,
470
+ )
471
+ multimodal_labels = torch.cat([labels[:, :1], projected_patch_labels, labels[:, 1:]], dim=1)
472
+
473
+ # Dispatch to Language Model
474
+ language_model_output = self.llm_backbone.llm(
475
+ input_ids=None,
476
+ attention_mask=multimodal_attention_mask,
477
+ position_ids=None,
478
+ past_key_values=None,
479
+ inputs_embeds=multimodal_embeddings,
480
+ labels=multimodal_labels,
481
+ use_cache=use_cache,
482
+ output_attentions=output_attentions,
483
+ output_hidden_states=output_hidden_states,
484
+ return_dict=return_dict,
485
+ )
486
+
487
+ # === Otherwise =>> Assume Invalid! ===
488
+ elif (input_ids.shape[0] != pixel_values.shape[0]) or (inputs_embeds.shape[0] != pixel_values.shape[0]):
489
+ raise ValueError("Non-homogenous batch of (text, image) input -- forward() does not support mixed batches!")
490
+
491
+ else:
492
+ raise ValueError(
493
+ "Invalid PrismaticForConditionalGeneration `forward()` call with provided arguments:\n"
494
+ f"=> `input_ids` = {input_ids is not None}\n"
495
+ f"=> `attention_mask` = {attention_mask is not None}\n"
496
+ f"=> `pixel_values` = {pixel_values is not None}\n"
497
+ f"=> `labels` = {labels is not None}\n"
498
+ f"=> `input_embeds` = {inputs_embeds is not None}\n"
499
+ f"=> `past_key_values` = {past_key_values is not None}\n"
500
+ f"=> `use_cache` = {use_cache}"
501
+ )
502
+
503
+ # Unpack `language_model_output` and return PrismaticCausalLMOutputWithPast (or tuple if not `return_dict`)
504
+ if not return_dict:
505
+ if output_projector_features and (projected_patch_embeddings is not None):
506
+ return *language_model_output, projected_patch_embeddings
507
+
508
+ return language_model_output
509
+
510
+
511
+ return (PrismaticCausalLMOutputWithPast(
512
+ loss=language_model_output.loss,
513
+ logits=language_model_output.logits,
514
+ past_key_values=language_model_output.past_key_values,
515
+ hidden_states=language_model_output.hidden_states,
516
+ attentions=language_model_output.attentions,
517
+ projector_features=projected_patch_embeddings,
518
+ ),patch_features,multimodal_attention_mask)
519
+
520
+ # === GenerationMixin Methods ===
521
+ def prepare_inputs_for_generation(
522
+ self,
523
+ input_ids: Optional[torch.Tensor] = None,
524
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
525
+ inputs_embeds: Optional[torch.FloatTensor] = None,
526
+ pixel_values: Optional[torch.FloatTensor] = None,
527
+ attention_mask: Optional[torch.Tensor] = None,
528
+ **kwargs: str,
529
+ ) -> Dict[str, torch.Tensor]:
530
+ """Borrowed from `LlamaForCausalLM` and simplified for batch size = 1; mirrors original PrismaticVLM logic."""
531
+ if ((input_ids is not None) and (input_ids.shape[0] > 1)) or (
532
+ (inputs_embeds is not None) and (inputs_embeds.shape[0] > 1)
533
+ ):
534
+ raise ValueError("Generation with batch size > 1 is not currently supported!")
535
+
536
+ # Handle `past_key_values` (cache) =>> assume `input_ids` just has unprocessed tokens
537
+ if past_key_values is not None:
538
+ input_ids = input_ids[:, -1:]
539
+
540
+ # If `input_embeds` are passed, we only want to use them in the 1st generation step
541
+ if inputs_embeds is not None and past_key_values is None:
542
+ model_inputs = {"input_embeds": inputs_embeds}
543
+ else:
544
+ model_inputs = {"input_ids": input_ids}
545
+
546
+ # Make sure `pixel_values` are preserved in `model_inputs`
547
+ model_inputs.update(
548
+ {
549
+ "attention_mask": attention_mask,
550
+ "pixel_values": pixel_values,
551
+ "past_key_values": past_key_values,
552
+ "use_cache": kwargs.get("use_cache"),
553
+ }
554
+ )
555
+
556
+ return model_inputs
557
+
558
+ # Defer to Language Model (all handle this differently, with different return types)
559
+ def _reorder_cache(self, *args, **kwargs) -> Any:
560
+ return self.language_model._reorder_cache(*args, **kwargs)
561
+
562
+
563
+ class TokenProjectorConfig(PretrainedConfig):
564
+ vit_tokens_layers: List[int] = [] # If empty, torch.nn.Identity
565
+ llm_image_tokens_layers: List[int] = [] # If empty, torch.nn.Identity
566
+ control_tokens_layers: List[int] = [] # If empty, torch.nn.Identity
567
+
568
+ # image_tokens_mode:
569
+ # vit: use ViT tokens only
570
+ # llm: use LLM tokens only
571
+ # skip: skip connection between projector(ViT) and LLM with addition
572
+ # none: don't feed to TokenProjector
573
+ image_tokens_mode: str
574
+
575
+ def __post_init__(self):
576
+ super().__post_init__()
577
+
578
+ if self.image_tokens_mode == 'vit':
579
+ assert len(self.vit_tokens_layers) > 0 or len(self.control_tokens_layers) > 0
580
+ elif self.image_tokens_mode == 'llm':
581
+ assert len(self.vit_tokens_layers) > 0 or len(self.control_tokens_layers) > 0
582
+ elif self.image_tokens_mode == 'skip':
583
+ assert len(self.vit_tokens_layers) > 0 or len(self.llm_image_tokens_layers) > 0
584
+ elif self.image_tokens_mode == 'none':
585
+ assert len(self.vit_tokens_layers) == 0
586
+ assert len(self.llm_image_tokens_layers) == 0
587
+ else:
588
+ raise NotImplementedError(f"Unknown image tokens mode {self.image_tokens_mode}")
589
+
590
+ class TokenProjector(nn.Module):
591
+ """Project and pack VLM output tokens"""
592
+
593
+ def __init__(self, config):
594
+ super().__init__()
595
+ self.config = TokenProjectorConfig()
596
+ self.config.vit_tokens_layers = config['vit_tokens_layers']
597
+ self.config.llm_image_tokens_layers = config['llm_image_tokens_layers']
598
+ self.config.control_tokens_layers = config['control_tokens_layers']
599
+ self.config.image_tokens_mode = config['image_tokens_mode']
600
+
601
+ self.vit_tokens_proj = self._make_token_proj_module(self.config.vit_tokens_layers)
602
+ self.llm_image_tokens_proj = self._make_token_proj_module(self.config.llm_image_tokens_layers)
603
+ self.control_tokens_proj = self._make_token_proj_module(self.config.control_tokens_layers)
604
+
605
+ def forward(self, inputs: WaypointerInput) -> torch.Tensor:
606
+ """
607
+ Args:
608
+ inputs: Contains VLM outputs
609
+ Returns:
610
+ torch.Tensor of shape [B, num_tokens, token_size] that always contains the control tokens
611
+ and possibly the image tokens (prepended), depending on the configuration
612
+ """
613
+
614
+ vit_tokens = self.vit_tokens_proj(inputs.vit_tokens)
615
+ control_tokens = self.control_tokens_proj(inputs.control_tokens)
616
+ llm_image_tokens = self.llm_image_tokens_proj(inputs.llm_image_tokens)
617
+
618
+ if self.config.image_tokens_mode == 'vit':
619
+ output = torch.cat([vit_tokens, control_tokens], dim=1) # [B, img + control, token_size]
620
+ elif self.config.image_tokens_mode == 'llm':
621
+ output = torch.cat([llm_image_tokens, control_tokens], dim=1) # [B, img + control, token_size]
622
+ elif self.config.image_tokens_mode == 'skip':
623
+ image_tokens = llm_image_tokens + vit_tokens
624
+ output = torch.cat([image_tokens, control_tokens], dim=1) # [B, img + control, token_size]
625
+ elif self.config.image_tokens_mode == 'none':
626
+ output = control_tokens
627
+ else:
628
+ raise NotImplementedError(f"Unknown image tokens mode {self.config.image_tokens_mode}")
629
+
630
+ return output
631
+
632
+ def _make_token_proj_module(self, layer_sizes: List[int]) -> torch.nn.Module:
633
+ if len(layer_sizes) == 0:
634
+ return torch.nn.Identity()
635
+
636
+ assert len(layer_sizes) > 1, "Need to provide input and output layer sizes at least"
637
+
638
+ module = torch.nn.Sequential(
639
+ *[
640
+ torch.nn.Sequential(
641
+ collections.OrderedDict(
642
+ {
643
+ 'linear': torch.nn.Linear(layer_in_features, layer_out_features),
644
+ 'act': torch.nn.ReLU(),
645
+ 'norm': torch.nn.LayerNorm(layer_out_features),
646
+ }
647
+ )
648
+ )
649
+ for layer_in_features, layer_out_features in zip(layer_sizes[:-1], layer_sizes[1:])
650
+ ]
651
+ )
652
+ return module
653
+
654
+ class NeRFPositionalEmbedding(torch.nn.Module):
655
+ def __init__(self, proj_scale: int):
656
+ """
657
+ Args:
658
+ proj_scale: Dimension size, same as L parameter in the NeRF paper
659
+ """
660
+ super().__init__()
661
+ self.proj_scale = proj_scale
662
+
663
+ freq = 2 ** torch.arange(self.proj_scale, dtype=torch.float32) * math.pi # size: [L]
664
+
665
+ self.register_buffer('freq', freq)
666
+
667
+ def forward(self, inputs: torch.Tensor) -> torch.Tensor:
668
+ """
669
+ Maps values from R^N to a higher dimensional space R^(N2L)
670
+ Args:
671
+ inputs: torch.Tensor of shape [B, ..., N]; input values to be transformed
672
+ Returns: torch.Tensor of shape [B, ..., N2L]; encoded input values
673
+ """
674
+
675
+ spectrum = self.freq.view(*[1] * inputs.ndim, -1) * inputs.unsqueeze(-1) # [B, ..., N, L]
676
+ encoding = torch.stack([torch.sin(spectrum), torch.cos(spectrum)], dim=-2) # [B, ..., N, 2, L]
677
+ encoding = encoding.view(inputs.shape[-1], -1) # [B, ..., N2L]
678
+
679
+ return encoding
680
+
681
+ class TimestepProjModuleConfig(PretrainedConfig):
682
+ pos_embed_scale: int # How much to scale timestep values when doing position embedding
683
+ proj_layers: List[int]
684
+ time_delta_sec: float = 0.25 # Time delta between two predictions
685
+ num_tokens: int = 3 # Number of tokens per timestep; Currently 3 - translation, rotation, gripper
686
+
687
+
688
+ class TimestepProjModule(nn.Module):
689
+
690
+ def __init__(self, config: TimestepProjModuleConfig, num_timesteps: int, token_size: int):
691
+ """
692
+ Args:
693
+ num_timesteps: Number of control timesteps
694
+ token_size: Single token size
695
+ """
696
+ super().__init__()
697
+ self.config = TimestepProjModuleConfig()
698
+ self.config.pos_embed_scale = config['pos_embed_scale']
699
+ self.config.proj_layers = config['proj_layers']
700
+ self.config.time_delta_sec = config['time_delta_sec']
701
+ self.config.num_tokens = config['num_tokens']
702
+
703
+ self.num_timesteps = num_timesteps
704
+ self.token_size = token_size
705
+
706
+ input_size = 2 * self.config.pos_embed_scale
707
+
708
+ self.pos_embed = NeRFPositionalEmbedding(self.config.pos_embed_scale)
709
+
710
+ # We output one token for translation, one for rotation and one for gripper state
711
+ feature_size = self.config.num_tokens * self.token_size
712
+
713
+ # Make MLP projection
714
+
715
+ self.timestep_proj = self._make_timestep_proj(in_features=int(input_size), out_features=int(feature_size))
716
+
717
+ def _make_timestep_proj(self, in_features: int, out_features: int) -> torch.nn.Module:
718
+ layer_sizes = [in_features] + list(self.config.proj_layers) + [out_features]
719
+ module = torch.nn.Sequential(
720
+ *[
721
+ torch.nn.Sequential(
722
+ collections.OrderedDict(
723
+ {
724
+ 'linear': torch.nn.Linear(layer_in_features, layer_out_features),
725
+ 'act': torch.nn.ReLU(),
726
+ 'norm': torch.nn.LayerNorm(layer_out_features),
727
+ }
728
+ )
729
+ )
730
+ for layer_in_features, layer_out_features in zip(layer_sizes[:-1], layer_sizes[1:])
731
+ ]
732
+ )
733
+ return module
734
+
735
+ def forward(self) -> torch.Tensor:
736
+ """
737
+ Returns:
738
+ torch.Tensor of sequence of timestep tokens, shape [1, num_timesteps * num_tokens, token_size]
739
+ """
740
+ device = self.timestep_proj[0].linear.weight.device # type: ignore[index]
741
+
742
+ # Position encode timesteps
743
+ time_deltas_norm = self.time_deltas_norm.view(1, self.num_timesteps) # [1, num_timesteps]
744
+ time_deltas_norm = time_deltas_norm.to(device=device)
745
+
746
+ # Embed timesteps to intermediate dimension
747
+ timesteps_embed = self.pos_embed(time_deltas_norm) # [1, num_timesteps * 2 * L]
748
+ timesteps_embed = timesteps_embed.view(self.num_timesteps, -1) # [num_timesteps, 2 * L]
749
+
750
+ # Project the timesteps via MLP to tokens
751
+ timesteps_tokens = self.timestep_proj(timesteps_embed) # [num_timesteps, token_size * 3]
752
+
753
+ # Reshape MLP outputs into tokens
754
+ timesteps_tokens = timesteps_tokens.view( # [1, num_timesteps * 3, token_size]
755
+ 1, self.num_timesteps * self.config.num_tokens, self.token_size
756
+ )
757
+
758
+ return timesteps_tokens
759
+
760
+ @cached_property
761
+ def time_deltas_sec(self) -> torch.Tensor:
762
+ return torch.arange(0, self.num_timesteps, 1, dtype=torch.float32) * self.config.time_delta_sec
763
+
764
+ @cached_property
765
+ def time_deltas_norm(self) -> torch.Tensor:
766
+ # Normalize time deltas between [0, 1]. We are saving [-1, 0] interval for possible past supervision
767
+ if self.time_deltas_sec.shape[0] == 1:
768
+ # Can't divide by 0
769
+ time_deltas_norm = self.time_deltas_sec
770
+ else:
771
+ time_deltas_norm = self.time_deltas_sec / self.time_deltas_sec.max() # [num_timesteps]
772
+ return time_deltas_norm.detach()
773
+
774
+
775
+ # class Waypointer(nn.Module):
776
+
777
+ class TrajectoryVLA(PrismaticForConditionalGeneration):
778
+
779
+
780
+ config_class: PretrainedConfig = TrajectoryVLAConfig
781
+
782
+ def __init__(self, config: TrajectoryVLAConfig) -> None:
783
+ super().__init__(config.prismatic_config)
784
+ self.control_tokenizer = WaypointTokenizer(self.llm_backbone.tokenizer)
785
+ self.timestep_proj = TimestepProjModule(
786
+ config.timestep_proj_config,
787
+ num_timesteps=config.num_timesteps,
788
+ token_size=config.token_size, )
789
+ self.num_timesteps = config.num_timesteps
790
+ self.token_proj = TokenProjector(config.token_proj_config)
791
+ self.transformer = DETR(config.transformer_config)
792
+ self.token_size = config.token_size
793
+ self.rotation_components = config.rotation_components
794
+ # if self.config.separate_control_proj:
795
+ # Project translation, rotation and gripper separately. Each timestep is projected separately
796
+ self.translation_proj = torch.nn.Sequential(
797
+ torch.nn.Linear(in_features=config.token_size, out_features=config.token_size // 2),
798
+ torch.nn.ReLU(),
799
+ torch.nn.Linear(in_features=config.token_size // 2, out_features=3),
800
+ )
801
+ self.rotation_proj = torch.nn.Sequential(
802
+ torch.nn.Linear(in_features=config.token_size, out_features=config.token_size // 2),
803
+ torch.nn.ReLU(),
804
+ torch.nn.Linear(
805
+ in_features=config.token_size // 2, out_features=config.rotation_components
806
+ ),
807
+ )
808
+
809
+ self.gripper_proj = torch.nn.Sequential(
810
+ torch.nn.Linear(in_features=config.token_size, out_features=config.token_size // 2),
811
+ torch.nn.ReLU(),
812
+ torch.nn.Linear(in_features=config.token_size // 2, out_features=1),
813
+ )
814
+
815
+ def _pack_waypointer_input(self, input_ids: torch.Tensor, vlm_output: PrismaticCausalLMOutputWithPast,vit_tokens,fused_attention_mask) -> WaypointerInput:
816
+ # Get the LLM output
817
+ # assert vlm_output.llm_output.hidden_states is not None
818
+ projected_tokens = vlm_output.hidden_states[-1]
819
+
820
+ control_tokens = self._extract_control_tokens(input_ids, projected_tokens) # type: ignore
821
+
822
+ num_image_tokens = vit_tokens.shape[1] # type: ignore[union-attr]
823
+ # TODO: This assumes a specific position of image tokens in the sequence. Make general
824
+ llm_image_tokens = projected_tokens[..., 1 : 1 + num_image_tokens, :]
825
+
826
+
827
+ return WaypointerInput(
828
+ vit_tokens=vit_tokens,
829
+ llm_image_tokens=llm_image_tokens,
830
+ control_tokens=control_tokens,
831
+ llm_tokens=projected_tokens,
832
+ attn_mask=fused_attention_mask,
833
+ )
834
+
835
+ def predict_tracks(self,inputs):
836
+
837
+ vlm_output,vit_tokens,fused_attention_mask = super().forward(**inputs,output_hidden_states=True,output_attentions=True,return_dict=True)
838
+ waypointer_input = self._pack_waypointer_input(inputs['input_ids'], vlm_output,vit_tokens,fused_attention_mask)
839
+ waypoint_output = self._waypointer_forward(waypointer_input)
840
+ translation, rotation, gripper = torch.split(
841
+ waypoint_output, [3, self.rotation_components, 1], dim=-1 )
842
+ translation, rotation, gripper = self.process_output(translation, rotation, gripper)
843
+ return translation, rotation, gripper
844
+ def process_output(self,translation,rotation,gripper):
845
+ ## convert rotation from matrix to euler angles
846
+ euler_angles = []
847
+ for matrix in rotation[0]:
848
+ # Convert each rotation matrix to a Rotation object
849
+ rotation_obj = R.from_matrix(matrix.view(3, 3).detach().cpu().float().numpy().squeeze())
850
+ # Convert to Euler angles in radians with chosen convention, e.g., 'xyz'
851
+ euler_angle = rotation_obj.as_euler('xyz', degrees=False)
852
+ euler_angles.append(euler_angle)
853
+
854
+ translation = translation.detach().cpu().float().numpy().squeeze()
855
+ ## sigmoid and clip from 0-1
856
+ gripper = np.round(torch.sigmoid(gripper).detach().cpu().float().numpy().squeeze())
857
+ return translation,euler_angles,gripper
858
+
859
+ def _extract_control_tokens(self, input_ids: torch.Tensor, output_tokens: torch.Tensor) -> torch.Tensor:
860
+ """
861
+ Extract the action tokens from the LLM output sequence. Assumes the following order
862
+ [image_tokens, language_tokens, action_tokens, padding]
863
+
864
+ Args:
865
+ input_ids: IDs of the tokens in text input sequence; shape [B, S]
866
+ output_tokens: Token sequence output from LLM; shape [B, L, token_size]. Note the length is
867
+ different from input_ids as it also contains image tokens
868
+ Returns:
869
+ torch.Tensor of shape [B, 7, token_size] containing only action tokens
870
+ """
871
+
872
+ assert input_ids.ndim == 2
873
+ assert output_tokens.ndim == 3
874
+ batch, in_seq_len, out_seq_len = *input_ids.shape, output_tokens.shape[1]
875
+
876
+ device = input_ids.device
877
+
878
+ num_control_tokens = self.control_tokenizer.num_control_tokens # type: ignore[attr-defined]
879
+
880
+ control_token_ids = torch.from_numpy( # type: ignore[attr-defined]
881
+ self.control_tokenizer.control_token_ids # type: ignore[attr-defined]
882
+ )
883
+ control_token_ids = control_token_ids.to(dtype=input_ids.dtype, device=input_ids.device)
884
+ is_control_token = torch.any( # shape: [B, S]
885
+ input_ids.unsqueeze(-1) == control_token_ids.view(1, 1, -1),
886
+ dim=-1,
887
+ )
888
+ if not torch.all(mask := is_control_token.sum(dim=-1) == num_control_tokens):
889
+ raise RuntimeError(
890
+ f"Can't properly detect control tokens with ids {control_token_ids} of len="
891
+ f"{len(control_token_ids)} in input_ids {input_ids}. Rows mask: {mask}"
892
+ )
893
+
894
+ # Pad is_control_tokens mask to the LLM output sequence size
895
+ tokens_mask = torch.cat( # shape: [B, L]
896
+ [
897
+ torch.zeros(batch, out_seq_len - in_seq_len, dtype=torch.bool, device=device),
898
+ is_control_token.to(torch.bool),
899
+ ],
900
+ dim=1,
901
+ )
902
+
903
+ control_tokens = output_tokens[tokens_mask] # shape: 1D tensor
904
+ control_tokens = control_tokens.view( # [B, num_control_tokens, token_size]
905
+ batch, num_control_tokens, output_tokens.shape[-1]
906
+ )
907
+
908
+ return control_tokens
909
+
910
+ def _waypointer_forward(self, inputs:WaypointerInput):
911
+
912
+ timesteps_tokens = self.timestep_proj() # [1, num_timesteps * 3, token_size]
913
+
914
+ # Project and pack LLM tokens
915
+ llm_tokens = self.token_proj(inputs) # [B, num_tokens, token_size]
916
+
917
+ # TODO: Pass inputs.attn_mask if you start using the LLM tokens
918
+ output_tokens = self.transformer( # [B, num_timesteps * 3, token_size]
919
+ feature_tokens=llm_tokens, query_tokens=timesteps_tokens, attn_mask=None
920
+ )
921
+
922
+ output_tokens = output_tokens.view( # [B, num_timesteps, 3 * token_size]
923
+ -1, self.num_timesteps, 3 * self.token_size
924
+ )
925
+
926
+ # if self.config.separate_control_proj:
927
+ # [B, num_timesteps, token_size] each
928
+ translation_tokens, rotation_tokens, gripper_tokens = torch.split(
929
+ output_tokens, [self.token_size] * 3, dim=-1
930
+ )
931
+
932
+ translation = self.translation_proj(translation_tokens) # [B, num_timesteps, 3]
933
+ rotation = self.rotation_proj(rotation_tokens) # [B, num_timesteps, rotation_components]
934
+ gripper = self.gripper_proj(gripper_tokens) # [B, num_timesteps, 1]
935
+
936
+ output = torch.cat( # [B, num_timesteps, control_components]
937
+ [translation, rotation, gripper], dim=-1
938
+ )
939
+
940
+ return output
941
+ # def predict_waypoints(self,input_ids: Optional[torch.LongTensor] = None, **kwargs: str) -> np.ndarray:
942
+ # vlm_output = super().forward(
943
+ # inputs=input_ids,
944
+ # use_cache=use_cache,
945
+ # output_attentions=output_attentions,
946
+ # output_hidden_states=True,
947
+ # return_dict=return_dict,
948
+ # )
949
+
950
+
951
+ @staticmethod
952
+ def _check_unnorm_key(norm_stats: Dict[str, Dict[str, Any]], unnorm_key: Optional[str]) -> str:
953
+ if unnorm_key is None and len(norm_stats) != 1:
954
+ raise ValueError(
955
+ f"Your model was trained on more than one dataset. "
956
+ f"Please pass a `unnorm_key` from the following options to choose the statistics used for "
957
+ f"de-normalizing actions: {norm_stats.keys()}"
958
+ )
959
+
960
+ # If None, grab the (singular) dataset in `norm_stats` to use as `unnorm_key`
961
+ unnorm_key = unnorm_key if unnorm_key is not None else next(iter(norm_stats.keys()))
962
+ if unnorm_key not in norm_stats:
963
+ raise ValueError(
964
+ f"The `unnorm_key` you chose ({unnorm_key = }) is not in the available statistics. "
965
+ f"Please choose from: {norm_stats.keys()}"
966
+ )
967
+
968
+ return unnorm_key
969
+
970
+ def get_action_dim(self, unnorm_key: Optional[str] = None) -> int:
971
+ """Get the dimensionality of the policy's action space."""
972
+ unnorm_key = self._check_unnorm_key(self.norm_stats, unnorm_key)
973
+ return len(self.norm_stats[unnorm_key]["action"]["q01"])
974
+
975
+ def get_action_stats(self, unnorm_key: Optional[str] = None) -> Dict[str, Any]:
976
+ """Get all the logged statistics for the given dataset."""
977
+ unnorm_key = self._check_unnorm_key(self.norm_stats, unnorm_key)
978
+ return self.norm_stats[unnorm_key]["action"]
979
+
980
+ def remove_waypointer_prefix(ckpt):
981
+ new_state_dict = {}
982
+ for key, value in ckpt.items():
983
+ # Remove the 'waypointer.' prefix if it exists
984
+ if key.startswith('waypointer.'):
985
+ new_key = key[len('waypointer.'):]
986
+ else:
987
+ new_key = key
988
+ new_state_dict[new_key] = value
989
+ return new_state_dict
990
+
991
+ def image_processor(image):
992
+ image_resolution = (3,224,224)
993
+ image = image.resize(image_resolution[1:], resample=Image.Resampling.LANCZOS)
+ return image
994
+
995
+ def read_pt(pt_path):
996
+ data = torch.load(pt_path)
997
+ return data
998
+
999
+ # model_input = read_pt('/work/nikolay_nikolov/debug/inference/model_input.pt')
1000
+ # vit_output = read_pt('/work/nikolay_nikolov/debug/inference/vit_output.pt')['vit_output']
1001
+ # llm_output = read_pt('/work/nikolay_nikolov/debug/inference/llm_output.pt')['llm_output']
1002
+ # projector_output = read_pt('/work/nikolay_nikolov/debug/inference/projector_output.pt')['projector_output']
1003
+ # transformer_input = read_pt('/work/nikolay_nikolov/debug/inference/transformer_input.pt')
1004
+ # feature_tokens = transformer_input['feature_tokens']
1005
+ # timestep_tokens = transformer_input['timestep_tokens']
1006
+ # # waypointer_input_nikolay = read_pt('/work/nikolay_nikolov/debug/inference/waypointer_input.pt')
1007
+ # transformer_input = read_pt('/work/nikolay_nikolov/debug/inference/transformer_input.pt')
1008
+ # control_target = read_pt('/work/nikolay_nikolov/debug/inference/control_target.pt')
1009
+
1010
+ if __name__ == "__main__":
1011
+
1012
+ prismatic_config_dict = {
1013
+ "vision_backbone_id":"dinosiglip-vit-so-224px",
1014
+ "llm_backbone_id":"llama2-7b-pure",
1015
+ "arch_specifier": "no-align+gelu-mlp", ## TODO: check
1016
+ "use_fused_vision_backbone" :True, ## TODO: check
1017
+ "image_resize_strategy" : "letterbox",
1018
+ "text_config" : None,
1019
+ "llm_max_length" : 2048,
1020
+ "pad_token_id" :32000,
1021
+ "pad_to_multiple_of" : 64,
1022
+ "output_projector_states" : False,
1023
+ "return_dict": False,
1024
+ }
1025
+
1026
+ token_proj_config = {
1027
+ "vit_tokens_layers": [2176, 1024],
1028
+ "control_tokens_layers": [4096, 2048, 1024],
1029
+ "image_tokens_mode": 'vit',
1030
+ 'llm_image_tokens_layers': []
1031
+ }
1032
+ timestep_proj_config = {
1033
+ "pos_embed_scale": 8,
1034
+ "proj_layers": [128,512,1024],
1035
+ "time_delta_sec": 0.1,
1036
+ "num_tokens":3
1037
+ }
1038
+ pos_embed_config = {
1039
+ "num_embeddings": 300,
1040
+ "embedding_dim": 1024
1041
+ }
1042
+ encoder_block_config = {
1043
+ "feature_size": 1024,
1044
+ "head_dim": 64,
1045
+ "num_heads": 16
1046
+ }
1047
+ decoder_block_config = {
1048
+ "feature_size": 1024,
1049
+ "head_dim": 64,
1050
+ "num_heads": 16,
1051
+ "dropout": 0.0
1052
+ }
1053
+ transformer_config = {
1054
+ "pos_embed_config": pos_embed_config,
1055
+ "encoder_block_config": encoder_block_config,
1056
+ "decoder_block_config": decoder_block_config,
1057
+ "num_blocks": 2
1058
+ }
1059
+
1060
+ # transformer_config:
1061
+ # autoclass: barrel.components.nn.layers.detr.DETR
1062
+ # pos_embed_config:
1063
+ # autoclass: barrel.components.nn.layers.positional_encodings.LearnedPosEmbed1D
1064
+ # num_embeddings: 300 # Max number of input tokens
1065
+ # embedding_dim: *token_size # token_size
1066
+ # # num_embeddings: 256 # Number of image tokens
1067
+ # # embedding_dim: 512 # token_size / 2
1068
+ # encoder_block_config:
1069
+ # autoclass: barrel.components.nn.layers.detr.TransformerEncoderBlock
1070
+ # feature_size: *token_size
1071
+ # # head_dim: 128
1072
+ # # num_heads: 8
1073
+ # head_dim: 64
1074
+ # num_heads: 16
1075
+ # decoder_block_config:
1076
+ # autoclass: barrel.components.nn.layers.detr.TransformerDecoderBlock
1077
+ # feature_size: *token_size
1078
+ # # head_dim: 128
1079
+ # # num_heads: 8
1080
+ # head_dim: 64
1081
+ # num_heads: 16
1082
+
1083
+ TrajectoryVlaConfig_config = {
1084
+ "prismatic_config":prismatic_config_dict,
1085
+ "token_size": 1024,
1086
+ "cheat": False,
1087
+ "num_timesteps": 6,
1088
+ "rotation_components": 9,
1089
+ "seperate_control_proj": True,
1090
+ "timestep_proj_config": timestep_proj_config,
1091
+ "token_proj_config": token_proj_config,
1092
+ "transformer_config": transformer_config,
1093
+ "num_timestep_tokens": 3,
1094
+ }
1095
+
1096
+ # ckpt_path = '/work/nikolay_nikolov/debug/inference/model.ckpt'
1097
+ # ckpt_params = torch.load(ckpt_path, map_location='cpu', mmap= True)
1098
+ # ckpt_params = remove_waypointer_prefix(ckpt_params)
1099
+
1100
+ ## Testing for prismatic
1101
+ model_config = TrajectoryVLAConfig( **TrajectoryVlaConfig_config)
1102
+ # model.load_state_dict(ckpt_params, strict=True)
1103
+
1104
+ model = TrajectoryVLA(model_config)
1105
+ model = model.to(dtype=torch.bfloat16)
1106
+ model = model.to('cuda')
1107
+ model.eval()
1108
+
1109
+ # with autocast('cuda',dtype=torch.bfloat16):
1110
+ # with torch.no_grad():
1111
+ # output = model.predict_tracks(model_input)
1112
+
1113
+
1114
+ # Get matched keys by finding keys that exist in both the model and checkpoint
1115
+ # TrajectoryVLA.load_state_dict(ckpt_params, strict=False)
1116
+
1117
+ # model_keys = set(TrajectoryVLA.state_dict().keys())
1118
+ # checkpoint_keys = set(ckpt_params.keys())
1119
+ # matched_keys = model_keys.intersection(checkpoint_keys)
1120
+ # print('Matched Keys:')
1121
+ # for key in matched_keys:
1122
+ # print(key)
1123
+ # embed()
1124
+
1125
+ # hf_image_processor.push_to_hub(cfg.output_hf_model_hub_path)
1126
+ # hf_processor.push_to_hub(cfg.output_hf_model_hub_path)
1127
+
1128
+ # import code; code.interact(local=vars())
1129
+
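Based on the signatures above, `predict_tracks` expects a dict with `input_ids`, `attention_mask`, and a `pixel_values` dict holding separate `dino` and `siglip` image tensors; the prompt must already contain the control tokens produced by `WaypointTokenizer` (defined in `prismatic_config.py`, which is not part of this commit). A hedged inference sketch continuing from the `__main__` block; the prompt string and image tensors are placeholders, not the repository's actual preprocessing:

import torch

tokenizer = model.llm_backbone.tokenizer
# Placeholder prompt -- in practice it must end with the WaypointTokenizer control tokens,
# otherwise _extract_control_tokens() raises a RuntimeError.
prompt = "In: What action should the robot take to pick up the cup?\nOut:"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")
attention_mask = torch.ones_like(input_ids)

pixel_values = {  # letterboxed 224x224 crops for each backbone (random tensors as stand-ins)
    "dino": torch.randn(1, 3, 224, 224, dtype=torch.bfloat16, device="cuda"),
    "siglip": torch.randn(1, 3, 224, 224, dtype=torch.bfloat16, device="cuda"),
}

with torch.no_grad():
    translation, euler_angles, gripper = model.predict_tracks(
        {"input_ids": input_ids, "attention_mask": attention_mask, "pixel_values": pixel_values}
    )
# translation: [num_timesteps, 3]; euler_angles: xyz angles per timestep; gripper: 0/1 per timestep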