{
  "attn_layers": [
    -1
  ],
  "attn_loss_weights": [
    10.0
  ],
  "attn_sigma": 25,
  "chunk_size": 50,
  "dim_feedforward": 3200,
  "dim_gaze_decoder": 512,
  "dim_gaze_decoder_feedforward": 3200,
  "dim_model": 512,
  "dropout": 0.1,
  "eyes": {
    "observation.left_eye": "observation.images.left_eye_cam"
  },
  "feedforward_activation": "relu",
  "freeze_backbone": true,
  "gaze_loss_weight": 1.0,
  "gaze_sigma": 50.0,
  "image_size": [
    336,
    448
  ],
  "input_normalization_modes": {
    "observation.images.left_eye_cam": "mean_std",
    "observation.images.right_eye_cam": "mean_std",
    "observation.state": "mean_std"
  },
  "input_shapes": {
    "observation.images.left_eye_cam": [
      3,
      480,
      640
    ],
    "observation.images.right_eye_cam": [
      3,
      480,
      640
    ],
    "observation.state": [
      21
    ]
  },
  "kl_weight": 10.0,
  "latent_dim": 32,
  "n_action_steps": 50,
  "n_decoder_layers": 1,
  "n_encoder_layers": 4,
  "n_gaze_decoder_layers": 1,
  "n_heads": 8,
  "n_obs_steps": 1,
  "n_vae_encoder_layers": 4,
  "output_normalization_modes": {
    "action": "mean_std"
  },
  "output_shapes": {
    "action": [
      21
    ]
  },
  "pre_norm": false,
  "pretrained_backbone_weights": "dinov2_vits14_reg",
  "replace_final_stride_with_dilation": false,
  "temporal_ensemble_coeff": null,
  "use_attn": true,
  "use_gaze": false,
  "use_vae": true,
  "vision_backbone": "dinov2"
}