|
""" |
|
modeling_prismatic.py |
|
|
|
Core HuggingFace-style PrismaticPreTrainedModel and PrismaticForConditionalGeneration class definitions, inheriting

from the default `transformers.PreTrainedModel`. Meant to be standalone and self-contained, but exactly

replicates the logic in `prismatic.models.vlms.prismatic.py`.
|
|
|
Note =>> for the time being, not adding the custom HF "docstring" formatting. |
|
|
|
References [LLaVa, IDEFICS-2]: |
|
=> https://github.com/huggingface/transformers/blob/main/src/transformers/models/llava/modeling_llava.py |
|
=> https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics2/modeling_idefics2.py |
|
""" |
|
|
|
import collections
import logging
import math
import os
from dataclasses import dataclass
from functools import cached_property, partial
from pathlib import Path
from typing import Any, Callable, ClassVar, Dict, List, Optional, Tuple, Union

import numpy as np
import timm
import tokenizers
import torch
import torch.nn as nn
import transformers
from IPython import embed
from PIL import Image
from scipy.spatial.transform import Rotation as R
from timm.models.vision_transformer import LayerScale
from torch.amp.autocast_mode import autocast
from transformers import AutoModelForCausalLM, PretrainedConfig, PreTrainedModel
from transformers.modeling_outputs import ModelOutput

from barrel.pipes.vlams.extern.datatypes import *
from barrel.pipes.vlams.extern.detr import *
from barrel.pipes.vlams.extern.prismatic_config import (
    OpenVLAConfig,
    PrismaticConfig,
    TrajectoryVLAConfig,
    WaypointTokenizer,
)
|
# Read the HuggingFace access token from `.hf_token` if it exists; otherwise fall back to the environment.
hf_token_path = Path(".hf_token")
HF_TOKEN = hf_token_path.read_text().strip() if hf_token_path.exists() else os.environ.get("HF_TOKEN")
|
|
|
|
|
logger = logging.getLogger(__name__) |
|
torch.backends.cudnn.benchmark = False |
|
torch.backends.cudnn.deterministic = True |
|
|
|
|
|
IGNORE_INDEX = -100 |
|
|
|
|
|
|
|
def unpack_tuple(fn: Callable[[Any], Tuple[Any]]) -> Callable[[Any], Any]: |
|
def wrapper(*args: Any, **kwargs: Any) -> Any: |
|
result = fn(*args, **kwargs) |
|
return result[0] if isinstance(result, tuple) else result |
|
|
|
return wrapper |
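# Usage sketch (illustrative; `unpack_tuple` is not called directly in this file): it is convenient for wrapping
# timm's `forward_intermediates`, which can return a tuple, so that a patched `forward` yields just the tensor.
#
#   featurizer.forward = unpack_tuple(partial(featurizer.forward_intermediates, indices=[-2]))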
|
|
|
|
|
|
|
|
|
|
|
# HF Transformers rewrites parameter names containing `gamma`/`beta` when loading state dicts, which clobbers
# timm's `LayerScale.gamma`; patch LayerScale to store its scale under `scale_factor` instead.
def _ls_new_forward(self, x: torch.Tensor) -> torch.Tensor:
    return x.mul_(self.scale_factor) if self.inplace else x * self.scale_factor


def ls_apply_patch(ls_module: LayerScale):
    ls_module.scale_factor = nn.Parameter(ls_module.gamma.clone())
    ls_module.forward = _ls_new_forward.__get__(ls_module, LayerScale)
    del ls_module.gamma
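# Usage sketch (assumes `featurizer` is any timm ViT created via `timm.create_model(...)`):
#
#   for module in featurizer.modules():
#       if isinstance(module, LayerScale):
#           ls_apply_patch(module)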
|
|
|
|
|
|
|
class PrismaticVisionBackbone(nn.Module): |
|
def __init__( |
|
self, |
|
use_fused_vision_backbone: bool, |
|
image_sizes: List[int], |
|
timm_model_ids: List[str], |
|
timm_override_act_layers: List[Optional[str]], |
|
) -> None: |
|
super().__init__() |
|
self.use_fused_vision_backbone = use_fused_vision_backbone |
|
|
|
|
|
|
|
|
|
assert len(timm_model_ids) <= 2, "Prismatic models only support up to 2 (fused) vision backbones!" |
|
|
|
self.dino_featurizer = timm.create_model( |
|
timm_model_ids[0], |
|
pretrained=True, |
|
num_classes=0, |
|
img_size=image_sizes[0], |
|
act_layer=timm_override_act_layers[0], |
|
) |
|
self.dino_featurizer.eval() |
|
|
|
self.embed_dim = self.dino_featurizer.embed_dim |
|
|
|
|
|
|
|
self.siglip_featurizer = timm.create_model( |
|
timm_model_ids[1], |
|
pretrained=True, |
|
num_classes=0, |
|
img_size=image_sizes[1], |
|
            act_layer=timm_override_act_layers[1],
        )
|
|
|
self.siglip_featurizer.eval() |
|
|
|
self.dino_featurizer.forward = partial( |
|
self.dino_featurizer.forward_intermediates, |
|
indices=[len(self.dino_featurizer.blocks) - 2], |
|
return_prefix_tokens=False, |
|
norm=False, |
|
stop_early=True, |
|
output_fmt='NLC', |
|
intermediates_only=True, |
|
) |
|
self.siglip_featurizer.forward = partial( |
|
self.siglip_featurizer.forward_intermediates, |
|
indices=[len(self.siglip_featurizer.blocks) - 2], |
|
return_prefix_tokens=False, |
|
norm=False, |
|
stop_early=True, |
|
output_fmt='NLC', |
|
intermediates_only=True, |
|
) |
|
self.embed_dim += self.siglip_featurizer.embed_dim |
|
|
|
    def forward(self, pixel_values) -> torch.Tensor:
        """Run image (`pixel_values`) through the featurizer(s); if fused, dispatch to both and concatenate."""
        if not self.use_fused_vision_backbone:
            # Non-fused case: only the first (DINOv2) featurizer is used.
            return self.dino_featurizer(pixel_values)[0]

        # Fused case: run each view through its own featurizer, then concatenate along the embedding dimension.
        img, img_fused = pixel_values['dino'], pixel_values['siglip']
        patches, patches_fused = self.dino_featurizer(img)[0], self.siglip_featurizer(img_fused)[0]

        return torch.cat([patches, patches_fused], dim=2)
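# Usage sketch for the fused backbone (timm IDs and shapes are illustrative of the `dinosiglip-vit-so-224px` setup):
#
#   backbone = PrismaticVisionBackbone(
#       use_fused_vision_backbone=True,
#       image_sizes=[224, 224],
#       timm_model_ids=["vit_large_patch14_reg4_dinov2.lvd142m", "vit_so400m_patch14_siglip_224"],
#       timm_override_act_layers=[None, None],
#   )
#   pixel_values = {"dino": torch.randn(1, 3, 224, 224), "siglip": torch.randn(1, 3, 224, 224)}
#   patches = backbone(pixel_values)   # roughly [1, 256, backbone.embed_dim] with 14x14 patches at 224px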
|
|
|
|
|
|
|
class PrismaticProjector(nn.Module): |
|
    def __init__(self, use_fused_vision_backbone: bool, vision_dim: int, llm_dim: int) -> None:
|
super().__init__() |
|
self.initial_projection_dim = vision_dim * 4 |
|
self.projector = torch.nn.Sequential( |
|
torch.nn.Linear(vision_dim, self.initial_projection_dim, bias=True), |
|
torch.nn.GELU(), |
|
torch.nn.Linear(self.initial_projection_dim, llm_dim, bias=True), |
|
torch.nn.GELU(), |
|
torch.nn.Linear(llm_dim, llm_dim, bias=True), |
|
) |
|
|
|
def forward(self, fused_img_patches: torch.Tensor) -> torch.Tensor: |
|
return self.projector(fused_img_patches) |
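# Shape sketch (sizes are illustrative): with fused DINOv2 + SigLIP features (vision_dim = 1024 + 1152 = 2176) and a
# Llama-2-7B LLM (llm_dim = 4096), the projector maps patch features [B, 256, 2176] -> [B, 256, 4096] via a 3-layer
# GELU MLP whose first hidden width is `4 * vision_dim`.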
|
|
|
|
|
@dataclass |
|
class PrismaticCausalLMOutputWithPast(ModelOutput): |
|
"""Base class for Prismatic casual (visually-conditioned) language model outputs; also exposes visual features.""" |
|
|
|
loss: Optional[torch.FloatTensor] = None |
|
logits: torch.FloatTensor = None |
|
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None |
|
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None |
|
attentions: Optional[Tuple[torch.FloatTensor]] = None |
|
|
|
|
|
projector_features: Optional[torch.FloatTensor] = None |
|
|
|
|
|
class PrismaticPreTrainedModel(PreTrainedModel): |
|
    config_class = PrismaticConfig
|
base_model_prefix: str = "model" |
|
supports_gradient_checkpointing: bool = True |
|
|
|
_no_split_modules: ClassVar[List[str]] = ["PrismaticProjector"] |
|
_skip_keys_device_placement: str = "past_key_values" |
|
_supports_flash_attn_2: bool = True |
|
|
|
def _init_weights(self, module: nn.Module) -> None: |
|
|
|
|
|
|
|
std = ( |
|
self.config.initializer_range |
|
if hasattr(self.config, "initializer_range") |
|
else self.config.text_config.initializer_range |
|
) |
|
|
|
if hasattr(module, "class_embedding"): |
|
module.class_embedding.data.normal_(mean=0.0, std=std) |
|
|
|
if isinstance(module, (nn.Linear, nn.Conv2d)): |
|
module.weight.data.normal_(mean=0.0, std=std) |
|
if module.bias is not None: |
|
module.bias.data.zero_() |
|
elif isinstance(module, nn.Embedding): |
|
module.weight.data.normal_(mean=0.0, std=std) |
|
if module.padding_idx is not None: |
|
module.weight.data[module.padding_idx].zero_() |
|
|
|
@property |
|
def _supports_sdpa(self) -> bool: |
|
"""Check LLM supports SDPA Attention""" |
|
return self.language_model._supports_sdpa |
|
|
|
class LLMBackbone(nn.Module): |
|
def __init__(self, config): |
|
super().__init__() |
|
self.config = config |
|
        self.llm: AutoModelForCausalLM  # Populated by the owning module (see `PrismaticForConditionalGeneration.__init__`)
|
self.tokenizer = self._create_tokenizer() |
|
|
|
def _create_tokenizer(self) -> transformers.PreTrainedTokenizerBase: |
|
|
|
print(f"Loading (Fast) Tokenizer via the AutoTokenizer API") |
|
tokenizer = transformers.AutoTokenizer.from_pretrained( |
|
self.config['hf_model_id'], |
|
model_max_length=self.config['llm_max_length'], |
|
token=HF_TOKEN, |
|
padding_side="right", |
|
) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
        # Some LLM backbones ship tokenizers that do not automatically prepend a BOS token (e.g., Phi-2's
        # GPT-2-style tokenizer); skip the BOS sanity check for those.
        SPECIAL_CASES = {
            "microsoft/phi-2",
        }
        if self.config['hf_model_id'] not in SPECIAL_CASES:
            # Sanity check: the tokenizer should add BOS when `add_special_tokens=True`, and only then.
            assert (
                tokenizer("Test 123", add_special_tokens=True).input_ids[0] == tokenizer.bos_token_id
            ) and (
                tokenizer("Test 123", add_special_tokens=False).input_ids[0] != tokenizer.bos_token_id
            ), f"Default Tokenizer of type `{type(tokenizer)}` does not automatically prefix inputs with BOS token!\n"
|
|
|
return tokenizer |
|
|
|
class PrismaticForConditionalGeneration(PrismaticPreTrainedModel): |
|
def __init__(self, config: PrismaticConfig) -> None: |
|
super().__init__(config) |
|
|
|
if config.use_fused_vision_backbone is None: |
|
raise ValueError("Missing config field `use_fused_vision_backbone`") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
self.vision_backbone = PrismaticVisionBackbone( |
|
config.use_fused_vision_backbone, config.image_sizes, config.timm_model_ids, config.timm_override_act_layers |
|
) |
|
|
|
|
|
self.projector = PrismaticProjector( |
|
config.use_fused_vision_backbone, |
|
vision_dim=self.vision_backbone.embed_dim, |
|
llm_dim=config.text_config.hidden_size, |
|
) |
|
|
|
|
|
        self.llm_backbone = LLMBackbone(
            {
                'hf_model_id': config.hf_llm_id,
                'llm_max_length': config.llm_max_length,
                'pad_token_id': 32000,
                'pad_to_multiple_of': 64,
            }
        )
|
|
|
|
|
|
|
|
|
        # Load the LLM weights from the same HF checkpoint the tokenizer above was built from.
        self.llm_backbone.llm = AutoModelForCausalLM.from_pretrained(
            config.hf_llm_id,
            token=HF_TOKEN,
            attn_implementation='flash_attention_2',
            do_sample=False,
            temperature=1.0,
            use_cache=False,
            top_p=1.0,
        )
|
|
|
self.llm_backbone.tokenizer.add_special_tokens({"pad_token": "<PAD>"}) |
|
self.llm_backbone.llm.config.pad_token_id = self.llm_backbone.tokenizer.pad_token_id |
|
self.llm_backbone.llm.resize_token_embeddings(len(self.llm_backbone.tokenizer), pad_to_multiple_of=64) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
self.vocab_size = config.text_config.vocab_size |
|
self.pad_token_id = config.pad_token_id |
|
|
|
|
|
self.post_init() |
|
|
|
|
|
def get_input_embeddings(self) -> nn.Module: |
|
return self.llm_backbone.llm.get_input_embeddings() |
|
|
|
def set_input_embeddings(self, value: nn.Module) -> None: |
|
self.llm_backbone.llm.set_input_embeddings(value) |
|
|
|
def get_output_embeddings(self) -> nn.Module: |
|
return self.llm_backbone.llm.get_output_embeddings() |
|
|
|
def set_output_embeddings(self, new_embeddings: nn.Module) -> None: |
|
self.llm_backbone.llm.set_output_embeddings(new_embeddings) |
|
|
|
def get_decoder(self) -> nn.Module: |
|
return self.llm_backbone.llm.get_decoder() |
|
|
|
def set_decoder(self, decoder: nn.Module) -> None: |
|
self.llm_backbone.llm.set_decoder(decoder) |
|
|
|
def tie_weights(self) -> None: |
|
self.llm_backbone.llm.tie_weights() |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def forward( |
|
self, |
|
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        pixel_values: Optional[Dict[str, torch.Tensor]] = None,
|
labels: Optional[torch.LongTensor] = None, |
|
inputs_embeds: Optional[torch.FloatTensor] = None, |
|
past_key_values: Optional[List[torch.FloatTensor]] = None, |
|
use_cache: Optional[bool] = None, |
|
output_attentions: Optional[bool] = None, |
|
output_hidden_states: Optional[bool] = None, |
|
output_projector_features: Optional[bool] = None, |
|
return_dict: Optional[bool] = None, |
|
**kwargs: Any, |
|
) -> Union[Tuple, PrismaticCausalLMOutputWithPast]: |
|
"""Run a forward pass through the VLM, returning a PrismaticCausalLMOutputWithPast instance.""" |
|
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions |
|
output_hidden_states = ( |
|
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states |
|
) |
|
output_projector_features = output_projector_features if output_projector_features is not None else False |
|
return_dict = return_dict if return_dict is not None else self.config.use_return_dict |
|
|
|
|
|
use_cache = use_cache and not self.training |
|
|
|
|
|
        projected_patch_embeddings = None
        patch_features, multimodal_attention_mask = None, None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if input_ids.shape[1] == 1: |
|
assert input_ids.shape[0] == 1, "Generation is only currently supported for batch size of 1!" |
|
assert past_key_values is not None, "You must provide `past_key_values` during cached generation!" |
|
assert labels is None, "Unexpected key `labels` provided during cached generation!" |
|
|
|
language_model_output = self.llm_backbone.llm( |
|
input_ids=input_ids, |
|
attention_mask=None, |
|
position_ids=None, |
|
past_key_values=past_key_values, |
|
inputs_embeds=None, |
|
labels=None, |
|
use_cache=use_cache, |
|
output_attentions=output_attentions, |
|
output_hidden_states=output_hidden_states, |
|
return_dict=return_dict, |
|
) |
|
|
|
|
|
elif pixel_values is None: |
|
assert (input_ids is not None) and (inputs_embeds is None), "Missing `input_ids` in language-only forward!" |
|
assert past_key_values is None, "Unexpected key `past_key_values` provided during language-only forward!" |
|
|
|
language_model_output = self.llm_backbone.llm( |
|
input_ids=input_ids, |
|
attention_mask=attention_mask, |
|
position_ids=None, |
|
past_key_values=None, |
|
inputs_embeds=None, |
|
labels=labels, |
|
use_cache=use_cache, |
|
output_attentions=output_attentions, |
|
output_hidden_states=output_hidden_states, |
|
return_dict=return_dict, |
|
) |
|
|
|
|
|
|
|
elif (input_ids.shape[0] == pixel_values['dino'].shape[0]) or (inputs_embeds.shape[0] == pixel_values['dino'].shape[0]): |
|
assert past_key_values is None, "Unexpected key `past_key_values` provided during language-only forward!" |
|
|
|
|
|
patch_features = self.vision_backbone(pixel_values) |
|
|
|
projected_patch_embeddings = self.projector(patch_features) |
|
projected_patch_attention_mask = None |
|
if attention_mask is not None: |
|
projected_patch_attention_mask = torch.full( |
|
(projected_patch_embeddings.shape[0], projected_patch_embeddings.shape[1]), |
|
fill_value=True, |
|
dtype=attention_mask.dtype, |
|
device=attention_mask.device, |
|
) |
|
|
|
|
|
input_embeddings = self.get_input_embeddings()(input_ids) |
|
|
|
|
|
multimodal_embeddings = torch.cat( |
|
[input_embeddings[:, :1, :], projected_patch_embeddings, input_embeddings[:, 1:, :]], dim=1 |
|
) |
|
multimodal_attention_mask = None |
|
if attention_mask is not None: |
|
multimodal_attention_mask = torch.cat( |
|
[attention_mask[:, :1], projected_patch_attention_mask, attention_mask[:, 1:]], dim=1 |
|
) |
|
|
|
|
|
multimodal_labels = None |
|
if labels is not None: |
|
projected_patch_labels = torch.full( |
|
(projected_patch_embeddings.shape[0], projected_patch_embeddings.shape[1]), |
|
fill_value=IGNORE_INDEX, |
|
dtype=labels.dtype, |
|
device=labels.device, |
|
) |
|
multimodal_labels = torch.cat([labels[:, :1], projected_patch_labels, labels[:, 1:]], dim=1) |
|
|
|
|
|
language_model_output = self.llm_backbone.llm( |
|
input_ids=None, |
|
attention_mask=multimodal_attention_mask, |
|
position_ids=None, |
|
past_key_values=None, |
|
inputs_embeds=multimodal_embeddings, |
|
labels=multimodal_labels, |
|
use_cache=use_cache, |
|
output_attentions=output_attentions, |
|
output_hidden_states=output_hidden_states, |
|
return_dict=return_dict, |
|
) |
|
|
|
|
|
        elif (input_ids.shape[0] != pixel_values['dino'].shape[0]) or (inputs_embeds.shape[0] != pixel_values['dino'].shape[0]):
|
raise ValueError("Non-homogenous batch of (text, image) input -- forward() does not support mixed batches!") |
|
|
|
else: |
|
raise ValueError( |
|
"Invalid PrismaticForConditionalGeneration `forward()` call with provided arguments:\n" |
|
f"=> `input_ids` = {input_ids is not None}\n" |
|
f"=> `attention_mask` = {attention_mask is not None}\n" |
|
f"=> `pixel_values` = {pixel_values is not None}\n" |
|
f"=> `labels` = {labels is not None}\n" |
|
f"=> `input_embeds` = {inputs_embeds is not None}\n" |
|
f"=> `past_key_values` = {past_key_values is not None}\n" |
|
f"=> `use_cache` = {use_cache}" |
|
) |
|
|
|
|
|
if not return_dict: |
|
if output_projector_features and (projected_patch_embeddings is not None): |
|
return *language_model_output, projected_patch_embeddings |
|
|
|
return language_model_output |
|
|
|
|
|
        # NOTE: in addition to the standard HF output, also return the raw ViT patch features and the fused
        # attention mask so downstream modules (e.g., `TrajectoryVLA`) can assemble their waypointer inputs.
        return (PrismaticCausalLMOutputWithPast(
|
loss=language_model_output.loss, |
|
logits=language_model_output.logits, |
|
past_key_values=language_model_output.past_key_values, |
|
hidden_states=language_model_output.hidden_states, |
|
attentions=language_model_output.attentions, |
|
projector_features=projected_patch_embeddings, |
|
),patch_features,multimodal_attention_mask) |
|
|
|
|
|
def prepare_inputs_for_generation( |
|
self, |
|
input_ids: Optional[torch.Tensor] = None, |
|
past_key_values: Optional[List[torch.FloatTensor]] = None, |
|
inputs_embeds: Optional[torch.FloatTensor] = None, |
|
pixel_values: Optional[torch.FloatTensor] = None, |
|
attention_mask: Optional[torch.Tensor] = None, |
|
**kwargs: str, |
|
) -> Dict[str, torch.Tensor]: |
|
"""Borrowed from `LlamaForCausalLM` and simplified for batch size = 1; mirrors original PrismaticVLM logic.""" |
|
if ((input_ids is not None) and (input_ids.shape[0] > 1)) or ( |
|
(inputs_embeds is not None) and (inputs_embeds.shape[0] > 1) |
|
): |
|
raise ValueError("Generation with batch size > 1 is not currently supported!") |
|
|
|
|
|
if past_key_values is not None: |
|
input_ids = input_ids[:, -1:] |
|
|
|
|
|
if inputs_embeds is not None and past_key_values is None: |
|
model_inputs = {"input_embeds": inputs_embeds} |
|
else: |
|
model_inputs = {"input_ids": input_ids} |
|
|
|
|
|
model_inputs.update( |
|
{ |
|
"attention_mask": attention_mask, |
|
"pixel_values": pixel_values, |
|
"past_key_values": past_key_values, |
|
"use_cache": kwargs.get("use_cache"), |
|
} |
|
) |
|
|
|
return model_inputs |
|
|
|
|
|
    def _reorder_cache(self, *args, **kwargs) -> Any:
        return self.llm_backbone.llm._reorder_cache(*args, **kwargs)
|
|
|
|
|
class TokenProjectorConfig(PretrainedConfig): |
|
vit_tokens_layers: List[int] = [] |
|
llm_image_tokens_layers: List[int] = [] |
|
control_tokens_layers: List[int] = [] |
|
|
|
|
|
|
|
|
|
|
|
|
|
image_tokens_mode: str |
|
|
|
    def __post_init__(self):
        # NOTE: `PretrainedConfig` is not a dataclass, so this validation hook must be invoked explicitly.
        if self.image_tokens_mode == 'vit':
            assert len(self.vit_tokens_layers) > 0 or len(self.control_tokens_layers) > 0
        elif self.image_tokens_mode == 'llm':
            assert len(self.llm_image_tokens_layers) > 0 or len(self.control_tokens_layers) > 0
        elif self.image_tokens_mode == 'skip':
            assert len(self.vit_tokens_layers) > 0 or len(self.llm_image_tokens_layers) > 0
|
elif self.image_tokens_mode == 'none': |
|
assert len(self.vit_tokens_layers) == 0 |
|
assert len(self.llm_image_tokens_layers) == 0 |
|
else: |
|
raise NotImplementedError(f"Unknown image tokens mode {self.image_tokens_mode}") |
|
|
|
class TokenProjector(nn.Module): |
|
"""Project and pack VLM output tokens""" |
|
|
|
def __init__(self, config): |
|
super().__init__() |
|
self.config = TokenProjectorConfig() |
|
self.config.vit_tokens_layers = config['vit_tokens_layers'] |
|
self.config.llm_image_tokens_layers = config['llm_image_tokens_layers'] |
|
self.config.control_tokens_layers = config['control_tokens_layers'] |
|
self.config.image_tokens_mode = config['image_tokens_mode'] |
|
|
|
self.vit_tokens_proj = self._make_token_proj_module(self.config.vit_tokens_layers) |
|
self.llm_image_tokens_proj = self._make_token_proj_module(self.config.llm_image_tokens_layers) |
|
self.control_tokens_proj = self._make_token_proj_module(self.config.control_tokens_layers) |
|
|
|
def forward(self, inputs: WaypointerInput) -> torch.Tensor: |
|
""" |
|
Args: |
|
inputs: Contains VLM outputs |
|
Returns: |
|
torch.Tensor of shape [B, num_tokens, token_size] that always contains the control tokens |
|
and possibly the image tokens (prepended), depending on the configuration |
|
""" |
|
|
|
vit_tokens = self.vit_tokens_proj(inputs.vit_tokens) |
|
control_tokens = self.control_tokens_proj(inputs.control_tokens) |
|
llm_image_tokens = self.llm_image_tokens_proj(inputs.llm_image_tokens) |
|
|
|
if self.config.image_tokens_mode == 'vit': |
|
output = torch.cat([vit_tokens, control_tokens], dim=1) |
|
elif self.config.image_tokens_mode == 'llm': |
|
output = torch.cat([llm_image_tokens, control_tokens], dim=1) |
|
elif self.config.image_tokens_mode == 'skip': |
|
image_tokens = llm_image_tokens + vit_tokens |
|
output = torch.cat([image_tokens, control_tokens], dim=1) |
|
elif self.config.image_tokens_mode == 'none': |
|
output = control_tokens |
|
else: |
|
raise NotImplementedError(f"Unknown image tokens mode {self.config.image_tokens_mode}") |
|
|
|
return output |
|
|
|
def _make_token_proj_module(self, layer_sizes: List[int]) -> torch.nn.Module: |
|
if len(layer_sizes) == 0: |
|
return torch.nn.Identity() |
|
|
|
assert len(layer_sizes) > 1, "Need to provide input and output layer sizes at least" |
|
|
|
module = torch.nn.Sequential( |
|
*[ |
|
torch.nn.Sequential( |
|
collections.OrderedDict( |
|
{ |
|
'linear': torch.nn.Linear(layer_in_features, layer_out_features), |
|
'act': torch.nn.ReLU(), |
|
'norm': torch.nn.LayerNorm(layer_out_features), |
|
} |
|
) |
|
) |
|
for layer_in_features, layer_out_features in zip(layer_sizes[:-1], layer_sizes[1:]) |
|
] |
|
) |
|
return module |
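# Shape sketch for the 'vit' mode wired up in `__main__` below (sizes are illustrative):
#   vit_tokens     [B, 256, 2176] --(2176 -> 1024)----------> [B, 256, 1024]
#   control_tokens [B,   7, 4096] --(4096 -> 2048 -> 1024)--> [B,   7, 1024]
#   output = cat(image_tokens, control_tokens)              -> [B, 263, 1024]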
|
|
|
class NeRFPositionalEmbedding(torch.nn.Module): |
|
def __init__(self, proj_scale: int): |
|
""" |
|
Args: |
|
proj_scale: Dimension size, same as L parameter in the NeRF paper |
|
""" |
|
super().__init__() |
|
self.proj_scale = proj_scale |
|
|
|
freq = 2 ** torch.arange(self.proj_scale, dtype=torch.float32) * math.pi |
|
|
|
self.register_buffer('freq', freq) |
|
|
|
def forward(self, inputs: torch.Tensor) -> torch.Tensor: |
|
""" |
|
        Maps values from R^N to the higher-dimensional space R^(N * 2L) using L sin/cos frequency bands.

        Args:
            inputs: torch.Tensor of shape [B, ..., N]; input values to be transformed
        Returns: torch.Tensor of shape [B, ..., N * 2L]; encoded input values
        """

        spectrum = self.freq.view(*[1] * inputs.ndim, -1) * inputs.unsqueeze(-1)
        encoding = torch.stack([torch.sin(spectrum), torch.cos(spectrum)], dim=-2)
        encoding = encoding.view(*inputs.shape[:-1], -1)
|
|
|
return encoding |
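# Example sketch: encoding 6 normalized timestamps with L = 8 frequency bands.
#
#   pe = NeRFPositionalEmbedding(proj_scale=8)
#   t = torch.linspace(0.0, 1.0, steps=6).view(1, 6)   # [B=1, N=6]
#   enc = pe(t)                                        # [1, 6 * 2 * 8] = [1, 96]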
|
|
|
class TimestepProjModuleConfig(PretrainedConfig): |
|
pos_embed_scale: int |
|
proj_layers: List[int] |
|
time_delta_sec: float = 0.25 |
|
num_tokens: int = 3 |
|
|
|
|
|
class TimestepProjModule(nn.Module): |
|
|
|
def __init__(self, config: TimestepProjModuleConfig, num_timesteps: int, token_size: int): |
|
""" |
|
Args: |
|
num_timesteps: Number of control timesteps |
|
token_size: Single token size |
|
""" |
|
super().__init__() |
|
self.config = TimestepProjModuleConfig() |
|
self.config.pos_embed_scale = config['pos_embed_scale'] |
|
self.config.proj_layers = config['proj_layers'] |
|
self.config.time_delta_sec = config['time_delta_sec'] |
|
self.config.num_tokens = config['num_tokens'] |
|
|
|
self.num_timesteps = num_timesteps |
|
self.token_size = token_size |
|
|
|
input_size = 2 * self.config.pos_embed_scale |
|
|
|
self.pos_embed = NeRFPositionalEmbedding(self.config.pos_embed_scale) |
|
|
|
|
|
feature_size = self.config.num_tokens * self.token_size |
|
|
|
|
|
|
|
self.timestep_proj = self._make_timestep_proj(in_features=int(input_size), out_features=int(feature_size)) |
|
|
|
def _make_timestep_proj(self, in_features: int, out_features: int) -> torch.nn.Module: |
|
layer_sizes = [in_features] + list(self.config.proj_layers) + [out_features] |
|
module = torch.nn.Sequential( |
|
*[ |
|
torch.nn.Sequential( |
|
collections.OrderedDict( |
|
{ |
|
'linear': torch.nn.Linear(layer_in_features, layer_out_features), |
|
'act': torch.nn.ReLU(), |
|
'norm': torch.nn.LayerNorm(layer_out_features), |
|
} |
|
) |
|
) |
|
for layer_in_features, layer_out_features in zip(layer_sizes[:-1], layer_sizes[1:]) |
|
] |
|
) |
|
return module |
|
|
|
def forward(self) -> torch.Tensor: |
|
""" |
|
Returns: |
|
torch.Tensor of sequence of timestep tokens, shape [1, num_timesteps * num_tokens, token_size] |
|
""" |
|
device = self.timestep_proj[0].linear.weight.device |
|
|
|
|
|
time_deltas_norm = self.time_deltas_norm.view(1, self.num_timesteps) |
|
time_deltas_norm = time_deltas_norm.to(device=device) |
|
|
|
|
|
timesteps_embed = self.pos_embed(time_deltas_norm) |
|
timesteps_embed = timesteps_embed.view(self.num_timesteps, -1) |
|
|
|
|
|
timesteps_tokens = self.timestep_proj(timesteps_embed) |
|
|
|
|
|
timesteps_tokens = timesteps_tokens.view( |
|
1, self.num_timesteps * self.config.num_tokens, self.token_size |
|
) |
|
|
|
return timesteps_tokens |
|
|
|
@cached_property |
|
def time_deltas_sec(self) -> torch.Tensor: |
|
return torch.arange(0, self.num_timesteps, 1, dtype=torch.float32) * self.config.time_delta_sec |
|
|
|
@cached_property |
|
def time_deltas_norm(self) -> torch.Tensor: |
|
|
|
if self.time_deltas_sec.shape[0] == 1: |
|
|
|
time_deltas_norm = self.time_deltas_sec |
|
else: |
|
time_deltas_norm = self.time_deltas_sec / self.time_deltas_sec.max() |
|
return time_deltas_norm.detach() |
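# Example sketch: with `num_timesteps=6` and `time_delta_sec=0.1` (matching the `__main__` config below),
# `time_deltas_sec` is [0.0, 0.1, 0.2, 0.3, 0.4, 0.5] and `time_deltas_norm` is [0.0, 0.2, 0.4, 0.6, 0.8, 1.0];
# the module then emits `num_timesteps * num_tokens` query tokens of size `token_size`.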
|
|
|
|
|
|
|
|
|
class TrajectoryVLA(PrismaticForConditionalGeneration): |
|
|
|
config_class: PretrainedConfig = TrajectoryVLAConfig |
|
|
|
def __init__(self, config: TrajectoryVLAConfig) -> None: |
|
super().__init__(config.prismatic_config) |
|
self.control_tokenizer = WaypointTokenizer(self.llm_backbone.tokenizer) |
|
        self.timestep_proj = TimestepProjModule(
            config.timestep_proj_config,
            num_timesteps=config.num_timesteps,
            token_size=config.token_size,
        )
|
self.num_timesteps = config.num_timesteps |
|
self.token_proj = TokenProjector(config.token_proj_config) |
|
self.transformer = DETR(config.transformer_config) |
|
self.token_size = config.token_size |
|
self.rotation_components = config.rotation_components |
|
|
|
|
|
self.translation_proj = torch.nn.Sequential( |
|
torch.nn.Linear(in_features=config.token_size, out_features=config.token_size // 2), |
|
torch.nn.ReLU(), |
|
torch.nn.Linear(in_features=config.token_size // 2, out_features=3), |
|
) |
|
self.rotation_proj = torch.nn.Sequential( |
|
torch.nn.Linear(in_features=config.token_size, out_features=config.token_size // 2), |
|
torch.nn.ReLU(), |
|
torch.nn.Linear( |
|
in_features=config.token_size // 2, out_features=config.rotation_components |
|
), |
|
) |
|
|
|
self.gripper_proj = torch.nn.Sequential( |
|
torch.nn.Linear(in_features=config.token_size, out_features=config.token_size // 2), |
|
torch.nn.ReLU(), |
|
torch.nn.Linear(in_features=config.token_size // 2, out_features=1), |
|
) |
|
|
|
    def _pack_waypointer_input(
        self, input_ids: torch.Tensor, vlm_output: PrismaticCausalLMOutputWithPast, vit_tokens, fused_attention_mask
    ) -> WaypointerInput:
|
|
|
|
|
projected_tokens = vlm_output.hidden_states[-1] |
|
|
|
control_tokens = self._extract_control_tokens(input_ids, projected_tokens) |
|
|
|
num_image_tokens = vit_tokens.shape[1] |
|
|
|
llm_image_tokens = projected_tokens[..., 1 : 1 + num_image_tokens, :] |
|
|
|
|
|
return WaypointerInput( |
|
vit_tokens=vit_tokens, |
|
llm_image_tokens=llm_image_tokens, |
|
control_tokens=control_tokens, |
|
llm_tokens=projected_tokens, |
|
attn_mask=fused_attention_mask, |
|
) |
|
|
|
    def predict_tracks(self, inputs):
        """Run the VLM and the waypointer head; returns (translation, euler_angles, gripper) as numpy values."""
        vlm_output, vit_tokens, fused_attention_mask = super().forward(
            **inputs, output_hidden_states=True, output_attentions=True, return_dict=True
        )
|
waypointer_input = self._pack_waypointer_input(inputs['input_ids'], vlm_output,vit_tokens,fused_attention_mask) |
|
waypoint_output = self._waypointer_forward(waypointer_input) |
|
translation, rotation, gripper = torch.split( |
|
waypoint_output, [3, self.rotation_components, 1], dim=-1 ) |
|
translation, rotation, gripper = self.process_output(translation, rotation, gripper) |
|
        return translation, rotation, gripper

    def process_output(self, translation, rotation, gripper):
        """Convert raw predictions to numpy: 3x3 rotation matrices -> XYZ Euler angles; gripper logits -> rounded {0, 1}."""
|
|
|
euler_angles = [] |
|
for matrix in rotation[0]: |
|
|
|
rotation_obj = R.from_matrix(matrix.view(3, 3).detach().cpu().float().numpy().squeeze()) |
|
|
|
euler_angle = rotation_obj.as_euler('xyz', degrees=False) |
|
euler_angles.append(euler_angle) |
|
|
|
translation = translation.detach().cpu().float().numpy().squeeze() |
|
|
|
gripper = np.round(torch.sigmoid(gripper).detach().cpu().float().numpy().squeeze()) |
|
return translation,euler_angles,gripper |
|
|
|
def _extract_control_tokens(self, input_ids: torch.Tensor, output_tokens: torch.Tensor) -> torch.Tensor: |
|
""" |
|
Extract the action tokens from the LLM output sequence. Assumes the following order |
|
[image_tokens, language_tokens, action_tokens, padding] |
|
|
|
Args: |
|
input_ids: IDs of the tokens in text input sequence; shape [B, S] |
|
output_tokens: Token sequence output from LLM; shape [B, L, token_size]. Note the length is |
|
different from input_ids as it also contains image tokens |
|
Returns: |
|
torch.Tensor of shape [B, 7, token_size] containing only action tokens |
|
""" |
|
|
|
assert input_ids.ndim == 2 |
|
assert output_tokens.ndim == 3 |
|
batch, in_seq_len, out_seq_len = *input_ids.shape, output_tokens.shape[1] |
|
|
|
device = input_ids.device |
|
|
|
num_control_tokens = self.control_tokenizer.num_control_tokens |
|
|
|
control_token_ids = torch.from_numpy( |
|
self.control_tokenizer.control_token_ids |
|
) |
|
control_token_ids = control_token_ids.to(dtype=input_ids.dtype, device=input_ids.device) |
|
is_control_token = torch.any( |
|
input_ids.unsqueeze(-1) == control_token_ids.view(1, 1, -1), |
|
dim=-1, |
|
) |
|
if not torch.all(mask := is_control_token.sum(dim=-1) == num_control_tokens): |
|
raise RuntimeError( |
|
f"Can't properly detect control tokens with ids {control_token_ids} of len=" |
|
f"{len(control_token_ids)} in input_ids {input_ids}. Rows mask: {mask}" |
|
) |
|
|
|
|
|
tokens_mask = torch.cat( |
|
[ |
|
torch.zeros(batch, out_seq_len - in_seq_len, dtype=torch.bool, device=device), |
|
is_control_token.to(torch.bool), |
|
], |
|
dim=1, |
|
) |
|
|
|
control_tokens = output_tokens[tokens_mask] |
|
control_tokens = control_tokens.view( |
|
batch, num_control_tokens, output_tokens.shape[-1] |
|
) |
|
|
|
return control_tokens |
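    # Masking sketch for a single example (token layout is illustrative): if the text sequence is
    #   input_ids = [BOS, lang..., ctrl_0..ctrl_6, PAD...]                        (length S)
    # then `output_tokens` additionally contains the image patch tokens, so the control-token mask is left-padded
    # with `out_seq_len - in_seq_len` zeros before indexing into `output_tokens`.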
|
|
|
def _waypointer_forward(self, inputs:WaypointerInput): |
|
|
|
timesteps_tokens = self.timestep_proj() |
|
|
|
|
|
llm_tokens = self.token_proj(inputs) |
|
|
|
|
|
output_tokens = self.transformer( |
|
feature_tokens=llm_tokens, query_tokens=timesteps_tokens, attn_mask=None |
|
) |
|
|
|
output_tokens = output_tokens.view( |
|
-1, self.num_timesteps, 3 * self.token_size |
|
) |
|
|
|
|
|
|
|
translation_tokens, rotation_tokens, gripper_tokens = torch.split( |
|
output_tokens, [self.token_size] * 3, dim=-1 |
|
) |
|
|
|
translation = self.translation_proj(translation_tokens) |
|
rotation = self.rotation_proj(rotation_tokens) |
|
gripper = self.gripper_proj(gripper_tokens) |
|
|
|
output = torch.cat( |
|
[translation, rotation, gripper], dim=-1 |
|
) |
|
|
|
return output |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@staticmethod |
|
def _check_unnorm_key(norm_stats: Dict[str, Dict[str, Any]], unnorm_key: Optional[str]) -> str: |
|
if unnorm_key is None and len(norm_stats) != 1: |
|
raise ValueError( |
|
f"Your model was trained on more than one dataset. " |
|
f"Please pass a `unnorm_key` from the following options to choose the statistics used for " |
|
f"de-normalizing actions: {norm_stats.keys()}" |
|
) |
|
|
|
|
|
unnorm_key = unnorm_key if unnorm_key is not None else next(iter(norm_stats.keys())) |
|
if unnorm_key not in norm_stats: |
|
raise ValueError( |
|
f"The `unnorm_key` you chose ({unnorm_key = }) is not in the available statistics. " |
|
f"Please choose from: {norm_stats.keys()}" |
|
) |
|
|
|
return unnorm_key |
|
|
|
def get_action_dim(self, unnorm_key: Optional[str] = None) -> int: |
|
"""Get the dimensionality of the policy's action space.""" |
|
unnorm_key = self._check_unnorm_key(self.norm_stats, unnorm_key) |
|
return len(self.norm_stats[unnorm_key]["action"]["q01"]) |
|
|
|
def get_action_stats(self, unnorm_key: Optional[str] = None) -> Dict[str, Any]: |
|
"""Get all the logged statistics for the given dataset.""" |
|
unnorm_key = self._check_unnorm_key(self.norm_stats, unnorm_key) |
|
return self.norm_stats[unnorm_key]["action"] |
|
|
|
def remove_waypointer_prefix(ckpt): |
|
new_state_dict = {} |
|
for key, value in ckpt.items(): |
|
|
|
if key.startswith('waypointer.'): |
|
new_key = key[len('waypointer.'):] |
|
else: |
|
new_key = key |
|
new_state_dict[new_key] = value |
|
return new_state_dict |
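# Usage sketch (the checkpoint path and layout are hypothetical):
#
#   ckpt = torch.load("waypointer_checkpoint.pt", map_location="cpu")
#   state_dict = remove_waypointer_prefix(ckpt.get("model", ckpt))
#   model.load_state_dict(state_dict, strict=False)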
|
|
|
def image_processor(image):
    """Resize a PIL image to the expected (224, 224) input resolution and return it."""
    image_resolution = (3, 224, 224)
    image = image.resize(image_resolution[1:], resample=Image.Resampling.LANCZOS)
    return image
|
|
|
def read_pt(pt_path): |
|
data = torch.load(pt_path) |
|
return data |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
|
|
    prismatic_config_dict = {
        "vision_backbone_id": "dinosiglip-vit-so-224px",
        "llm_backbone_id": "llama2-7b-pure",
        "arch_specifier": "no-align+gelu-mlp",
        "use_fused_vision_backbone": True,
        "image_resize_strategy": "letterbox",
        "text_config": None,
        "llm_max_length": 2048,
        "pad_token_id": 32000,
        "pad_to_multiple_of": 64,
        "output_projector_states": False,
        "return_dict": False,
    }
|
|
|
token_proj_config = { |
|
"vit_tokens_layers": [2176, 1024], |
|
"control_tokens_layers": [4096, 2048, 1024], |
|
"image_tokens_mode": 'vit', |
|
'llm_image_tokens_layers': [] |
|
} |
|
timestep_proj_config = { |
|
"pos_embed_scale": 8, |
|
"proj_layers": [128,512,1024], |
|
"time_delta_sec": 0.1, |
|
"num_tokens":3 |
|
} |
|
pos_embed_config = { |
|
"num_embeddings": 300, |
|
"embedding_dim": 1024 |
|
} |
|
encoder_block_config = { |
|
"feature_size": 1024, |
|
"head_dim": 64, |
|
"num_heads": 16 |
|
} |
|
decoder_block_config = { |
|
"feature_size": 1024, |
|
"head_dim": 64, |
|
"num_heads": 16, |
|
"dropout": 0.0 |
|
} |
|
transformer_config = { |
|
"pos_embed_config": pos_embed_config, |
|
"encoder_block_config": encoder_block_config, |
|
"decoder_block_config": decoder_block_config, |
|
"num_blocks": 2 |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
TrajectoryVlaConfig_config = { |
|
"prismatic_config":prismatic_config_dict, |
|
"token_size": 1024, |
|
"cheat": False, |
|
"num_timesteps": 6, |
|
"rotation_components": 9, |
|
"seperate_control_proj": True, |
|
"timestep_proj_config": timestep_proj_config, |
|
"token_proj_config": token_proj_config, |
|
"transformer_config": transformer_config, |
|
"num_timestep_tokens": 3, |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
    model_config = TrajectoryVLAConfig(**TrajectoryVlaConfig_config)
|
|
|
|
|
model = TrajectoryVLA(model_config) |
|
model = model.to(dtype=torch.bfloat16) |
|
model = model.to('cuda') |
|
model.eval() |
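    # Inference sketch (prompt, preprocessing, and control-token handling are illustrative; a real run also needs
    # the normalization / tensor conversion that `image_processor` above does not perform, plus the waypoint
    # placeholder tokens expected by `WaypointTokenizer` appended to the prompt):
    #
    #   prompt = "In: What action should the robot take to pick up the cup?\nOut:"
    #   input_ids = model.llm_backbone.tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")
    #   pixel_values = {"dino": dino_tensor.to("cuda", torch.bfloat16),
    #                   "siglip": siglip_tensor.to("cuda", torch.bfloat16)}
    #   with torch.no_grad():
    #       translation, euler_angles, gripper = model.predict_tracks(
    #           {"input_ids": input_ids, "attention_mask": torch.ones_like(input_ids), "pixel_values": pixel_values}
    #       )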
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|