Sombit committed (verified)
Commit 3b501b7 · 1 parent: 303fe03

Upload config

Files changed (2):
  1. config.json +59 -32
  2. prismatic_config.py +3 -30
config.json CHANGED
@@ -1,37 +1,64 @@
 {
-  "arch_specifier": "no-align+gelu-mlp",
-  "architectures": [
-    "TrajectoryVLA"
-  ],
   "auto_map": {
-    "AutoModelForVision2Seq": "prismatic_model.TrajectoryVLA"
+    "AutoConfig": "prismatic_config.TrajectoryVLAConfig"
   },
-  "hf_llm_id": "meta-llama/Llama-2-7b-hf",
-  "image_resize_strategy": "letterbox",
-  "image_sizes": [
-    224,
-    224
-  ],
-  "llm_backbone_id": "llama2-7b-pure",
-  "llm_max_length": 2048,
-  "model_type": "prismatic",
-  "output_projector_states": false,
-  "pad_to_multiple_of": 64,
-  "pad_token_id": 32000,
-  "return_dict": false,
-  "text_config": {
-    "model_type": "llama"
+  "cheat": false,
+  "model_type": "trajectoryvla",
+  "num_timesteps": 6,
+  "prismatic_config": {
+    "architectures": [
+      "TrajectoryVLA"
+    ],
+    "auto_map": {
+      "AutoModelForVision2Seq": "prismatic_model.TrajectoryVLA"
+    },
+    "model_type": "prismatic",
+    "return_dict": false,
+    "torch_dtype": "bfloat16"
   },
-  "timm_model_ids": [
-    "vit_large_patch14_reg4_dinov2.lvd142m",
-    "vit_so400m_patch14_siglip_224"
-  ],
-  "timm_override_act_layers": [
-    null,
-    null
-  ],
-  "torch_dtype": "bfloat16",
-  "transformers_version": "4.44.2",
-  "use_fused_vision_backbone": true,
-  "vision_backbone_id": "dinosiglip-vit-so-224px"
+  "rotation_components": 9,
+  "seperate_control_proj": true,
+  "timestep_proj_config": {
+    "num_tokens": 3,
+    "pos_embed_scale": 8,
+    "proj_layers": [
+      128,
+      512,
+      1024
+    ],
+    "time_delta_sec": 0.1
+  },
+  "token_proj_config": {
+    "control_tokens_layers": [
+      4096,
+      2048,
+      1024
+    ],
+    "image_tokens_mode": "vit",
+    "llm_image_tokens_layers": [],
+    "vit_tokens_layers": [
+      2176,
+      1024
+    ]
+  },
+  "token_size": 1024,
+  "transformer_config": {
+    "decoder_block_config": {
+      "dropout": 0.0,
+      "feature_size": 1024,
+      "head_dim": 64,
+      "num_heads": 16
+    },
+    "encoder_block_config": {
+      "feature_size": 1024,
+      "head_dim": 64,
+      "num_heads": 16
+    },
+    "num_blocks": 2,
+    "pos_embed_config": {
+      "embedding_dim": 1024,
+      "num_embeddings": 300
+    }
+  },
+  "transformers_version": "4.44.2"
 }
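For context: the flat Prismatic fields now live under the nested "prismatic_config" key, and "auto_map" points AutoConfig at prismatic_config.TrajectoryVLAConfig, so transformers can build the custom config straight from this repo. The attention shapes are self-consistent (num_heads 16 × head_dim 64 = feature_size 1024 = token_size). A minimal loading sketch, assuming TrajectoryVLAConfig forwards extra kwargs to PretrainedConfig.__init__ as standard configs do; the repo id here is a placeholder, not the actual Hub repo name:

from transformers import AutoConfig

# Placeholder repo id -- substitute the actual Hub repo this commit belongs to.
REPO_ID = "Sombit/trajectory-vla"

# trust_remote_code lets transformers import prismatic_config.TrajectoryVLAConfig
# from the repo, as declared under "auto_map" -> "AutoConfig" in config.json.
config = AutoConfig.from_pretrained(REPO_ID, trust_remote_code=True)

print(config.model_type)        # "trajectoryvla"
print(config.num_timesteps)     # 6, if extra kwargs are stored as attributes
print(config.prismatic_config)  # nested sub-config; dict vs. PrismaticConfig
                                # depends on how __init__ stores it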
prismatic_config.py CHANGED
@@ -7,7 +7,7 @@ Default configuration specifies `siglip-224px+7b`.
 
 from typing import Any, Dict, List, Optional
 import transformers
-from transformers import PretrainedConfig
+from transformers import PretrainedConfig,AutoModel, AutoConfig
 from transformers.models.auto import CONFIG_MAPPING
 import numpy as np
 
@@ -155,7 +155,6 @@ class PrismaticConfig(PretrainedConfig):
 
 # Here we need trajectory_vla config, with
 # prismatic_config fields and then the waypointer fields
-
 class TrajectoryVLAConfig(PretrainedConfig):
     model_type: str = "trajectoryvla"
 
@@ -217,7 +216,8 @@ if __name__ == "__main__" :
 
     # prismatic_config = PrismaticConfig()
     # print(prismatic_config)
-
+    AutoConfig.register("prismatic",PrismaticConfig)
+    AutoConfig.register("trajectoryvla",TrajectoryVLAConfig)
     prismatic_config_dict = {
         "vision_backbone_id":"dinosiglip-vit-so-224px",
         "llm_backbone_id":"llama2-7b-pure",
@@ -280,30 +280,3 @@ if __name__ == "__main__" :
 
     TrajectoryVLAConfig = TrajectoryVLAConfig( **TrajectoryVlaConfig_config)
     print(TrajectoryVLAConfig)
-
-    # class WaypointTokenizer:
-    #     """
-    #     Wraps base LLM/VLM tokenizer and overloads least used token as a control token
-
-    #     NOTE: By default, assumes a BPE-style tokenizer akin to the LlamaTokenizer,
-    #     where *the least used tokens* appear at the end of the vocabulary!
-
-    #     TODO: Adding new token vs overloading? When I call `tokenizer.add_token()` vocab stays the same
-    #     """
-
-    #     def __init__(self, tokenizer: transformers.PreTrainedTokenizerBase, num_tokens: int = 10) -> None:
-    #         self.tokenizer = tokenizer
-    #         self.num_tokens = num_tokens
-
-    #     def __call__(self, *_) -> str:
-    #         """Get the text token for control"""
-    #         return self.tokenizer.decode(self.control_token_ids)
-
-    #     @property
-    #     def control_token_ids(self) -> np.ndarray:
-    #         # Assumes we're overwriting the final tokens of the vocabulary (least used tokens)
-    #         return np.arange(self.num_tokens) + int(self.tokenizer.vocab_size - self.num_tokens)
-
-    #     @property
-    #     def num_control_tokens(self) -> int:
-    #         return self.num_tokens
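The two new AutoConfig.register calls bind the model_type strings to their config classes, which is what lets both the top-level "trajectoryvla" config and the nested "prismatic" sub-config resolve through the auto classes without an auto_map lookup. A minimal sketch of the effect, assuming prismatic_config.py is importable on the Python path:

from transformers import AutoConfig
from prismatic_config import PrismaticConfig, TrajectoryVLAConfig

# Same registrations as in the __main__ block above.
AutoConfig.register("prismatic", PrismaticConfig)
AutoConfig.register("trajectoryvla", TrajectoryVLAConfig)

# for_model instantiates a default config from a registered model_type string.
cfg = AutoConfig.for_model("trajectoryvla")
assert isinstance(cfg, TrajectoryVLAConfig)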