Sombit committed on
Commit c7f4f9e · verified · 1 Parent(s): f73fcce

Upload config

Files changed (2)
  1. config.json +59 -32
  2. prismatic_config.py +60 -60
config.json CHANGED
@@ -1,37 +1,64 @@
 {
-  "arch_specifier": "no-align+gelu-mlp",
-  "architectures": [
-    "TrajectoryVLA"
-  ],
   "auto_map": {
-    "AutoModelForVision2Seq": "prismatic_model.TrajectoryVLA"
+    "AutoConfig": "prismatic_config.TrajectoryVLAConfig"
   },
-  "hf_llm_id": "meta-llama/Llama-2-7b-hf",
-  "image_resize_strategy": "letterbox",
-  "image_sizes": [
-    224,
-    224
-  ],
-  "llm_backbone_id": "llama2-7b-pure",
-  "llm_max_length": 2048,
-  "model_type": "prismatic",
-  "output_projector_states": false,
-  "pad_to_multiple_of": 64,
-  "pad_token_id": 32000,
-  "return_dict": false,
-  "text_config": {
-    "model_type": "llama"
+  "cheat": false,
+  "model_type": "trajectoryvla",
+  "num_timesteps": 6,
+  "prismatic_config": {
+    "architectures": [
+      "TrajectoryVLA"
+    ],
+    "auto_map": {
+      "AutoModelForVision2Seq": "prismatic_model.TrajectoryVLA"
+    },
+    "model_type": "prismatic",
+    "return_dict": false,
+    "torch_dtype": "bfloat16"
   },
-  "timm_model_ids": [
-    "vit_large_patch14_reg4_dinov2.lvd142m",
-    "vit_so400m_patch14_siglip_224"
-  ],
-  "timm_override_act_layers": [
-    null,
-    null
-  ],
-  "torch_dtype": "bfloat16",
-  "transformers_version": "4.44.2",
-  "use_fused_vision_backbone": true,
-  "vision_backbone_id": "dinosiglip-vit-so-224px"
+  "rotation_components": 9,
+  "seperate_control_proj": true,
+  "timestep_proj_config": {
+    "num_tokens": 3,
+    "pos_embed_scale": 8,
+    "proj_layers": [
+      128,
+      512,
+      1024
+    ],
+    "time_delta_sec": 0.1
+  },
+  "token_proj_config": {
+    "control_tokens_layers": [
+      4096,
+      2048,
+      1024
+    ],
+    "image_tokens_mode": "vit",
+    "llm_image_tokens_layers": [],
+    "vit_tokens_layers": [
+      2176,
+      1024
+    ]
+  },
+  "token_size": 1024,
+  "transformer_config": {
+    "decoder_block_config": {
+      "dropout": 0.0,
+      "feature_size": 1024,
+      "head_dim": 64,
+      "num_heads": 16
+    },
+    "encoder_block_config": {
+      "feature_size": 1024,
+      "head_dim": 64,
+      "num_heads": 16
+    },
+    "num_blocks": 2,
+    "pos_embed_config": {
+      "embedding_dim": 1024,
+      "num_embeddings": 300
+    }
+  },
+  "transformers_version": "4.44.2"
 }
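Note: the new "auto_map" entry registers the custom config class for transformers' remote-code loading. A minimal loading sketch (the repository id below is a placeholder, not taken from this commit), assuming the standard AutoConfig + trust_remote_code mechanism:

from transformers import AutoConfig

# trust_remote_code=True lets transformers import prismatic_config.TrajectoryVLAConfig
# from the repository, as registered under "AutoConfig" in auto_map above.
config = AutoConfig.from_pretrained("<namespace>/<repo-id>", trust_remote_code=True)
print(type(config).__name__)        # expected: TrajectoryVLAConfig
print(config.num_timesteps)         # 6, per the updated config.json
print(config.rotation_components)   # 9, i.e. the rotation-matrix parameterization ("rotmat -> 9")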
prismatic_config.py CHANGED
@@ -176,7 +176,8 @@ class TrajectoryVLAConfig(PretrainedConfig):
         # **kwargs: str,
     ):
 
-        super().__init__(**prismatic_config)
+        # super().__init__(**prismatic_config)
+        super().__init__()
         self.prismatic_config = PrismaticConfig(**prismatic_config)
 
         self.token_size = token_size
@@ -197,29 +198,6 @@ class TrajectoryVLAConfig(PretrainedConfig):
     @property
     def num_timestep_tokens(self) -> int:
         return self.timestep_proj_config['num_tokens']
-# class WaypointerConfig(ConfigurableModuleConfig):
-#     token_size: int = 1024  # Timestep token size
-
-#     cheat: bool  # If True, cheat and use action tokens; Works only with OpenVLA checkpoint
-
-#     timestep_proj_config: AutoConfig  # Timestep tokens
-#     token_proj_config: TokenProjectorConfig  # LLM output tokens projection and packing
-#     transformer_config: AutoConfig  # Transformer config
-
-#     # Output configurations
-#     num_timesteps: int = 20  # Number of prediction time steps
-#     rotation_components: int = 3  # Number of rotation componens: euler -> 3, quaternion -> 4, rotmat -> 9
-#     separate_control_proj: bool = True  # If True, project control components separately
-
-#     @property
-#     def control_components(self) -> int:
-#         # Number of control dimensions: 3 translation, N rotation, 1 gripper
-#         return 3 + self.rotation_components + 1
-
-#     @property
-#     def num_timestep_tokens(self) -> int:
-#         return self.timestep_proj_config.num_tokens
-
 
 class OpenVLAConfig(PrismaticConfig):
     model_type: str = "openvla"
@@ -237,73 +215,95 @@ class OpenVLAConfig(PrismaticConfig):
 if __name__ == "__main__" :
     # yaml_file = 'barrel/pipes/vlams/configs/waypoints/waypointer_multistep_fractal.yaml'
 
-    prismatic_config = PrismaticConfig()
-    print(prismatic_config)
+    # prismatic_config = PrismaticConfig()
+    # print(prismatic_config)
 
     prismatic_config_dict = {
         "vision_backbone_id":"dinosiglip-vit-so-224px",
-        # "llm_backbone_id":"llama2-7b-pure",meta-llama/Llama-2-7b-hf
-        "llm_backbone_id": "meta-llama/Llama-2-7b-hf",
-
+        "llm_backbone_id":"llama2-7b-pure",
         "arch_specifier": "no-align+gelu-mlp", ## TODO: check
-        "use_fused_vision_backbone" :None, ## TODO: check
+        "use_fused_vision_backbone" :True, ## TODO: check
         "image_resize_strategy" : "letterbox",
         "text_config" : None,
         "llm_max_length" : 2048,
         "pad_token_id" :32000,
         "pad_to_multiple_of" : 64,
         "output_projector_states" : False,
+        "return_dict": False,
     }
+
     token_proj_config = {
         "vit_tokens_layers": [2176, 1024],
         "control_tokens_layers": [4096, 2048, 1024],
         "image_tokens_mode": 'vit',
+        'llm_image_tokens_layers': []
     }
     timestep_proj_config = {
-        "pos_embed_scale": 1.0,
-        "proj_layers": [1024],
+        "pos_embed_scale": 8,
+        "proj_layers": [128,512,1024],
         "time_delta_sec": 0.1,
         "num_tokens":3
     }
-
-    TrajectoryVlaConfig = {
+    pos_embed_config = {
+        "num_embeddings": 300,
+        "embedding_dim": 1024
+    }
+    encoder_block_config = {
+        "feature_size": 1024,
+        "head_dim": 64,
+        "num_heads": 16
+    }
+    decoder_block_config = {
+        "feature_size": 1024,
+        "head_dim": 64,
+        "num_heads": 16,
+        "dropout": 0.0
+    }
+    transformer_config = {
+        "pos_embed_config": pos_embed_config,
+        "encoder_block_config": encoder_block_config,
+        "decoder_block_config": decoder_block_config,
+        "num_blocks": 2
+    }
+    TrajectoryVlaConfig_config = {
         "prismatic_config":prismatic_config_dict,
         "token_size": 1024,
         "cheat": False,
-        "num_timesteps": 20,
-        "rotation_components": 3,
+        "num_timesteps": 6,
+        "rotation_components": 9,
        "seperate_control_proj": True,
-        "timestep_proj_config": {},
-        "token_proj_config": {},
-        "transformer_config": {},
+        "timestep_proj_config": timestep_proj_config,
+        "token_proj_config": token_proj_config,
+        "transformer_config": transformer_config,
+        "num_timestep_tokens": 3,
     }
 
-    TrajectoryVLAConfig = TrajectoryVLAConfig( **TrajectoryVlaConfig)
+    TrajectoryVLAConfig = TrajectoryVLAConfig( **TrajectoryVlaConfig_config)
     print(TrajectoryVLAConfig)
 
-class WaypointTokenizer:
-    """
-    Wraps base LLM/VLM tokenizer and overloads least used token as a control token
+# class WaypointTokenizer:
+#     """
+#     Wraps base LLM/VLM tokenizer and overloads least used token as a control token
 
-    NOTE: By default, assumes a BPE-style tokenizer akin to the LlamaTokenizer,
-    where *the least used tokens* appear at the end of the vocabulary!
+#     NOTE: By default, assumes a BPE-style tokenizer akin to the LlamaTokenizer,
+#     where *the least used tokens* appear at the end of the vocabulary!
 
-    TODO: Adding new token vs overloading? When I call `tokenizer.add_token()` vocab stays the same
-    """
+#     TODO: Adding new token vs overloading? When I call `tokenizer.add_token()` vocab stays the same
+#     """
 
-    def __init__(self, tokenizer: transformers.PreTrainedTokenizerBase, num_tokens: int = 10) -> None:
-        self.tokenizer = tokenizer
-        self.num_tokens = num_tokens
+#     def __init__(self, tokenizer: transformers.PreTrainedTokenizerBase, num_tokens: int = 10) -> None:
+#         self.tokenizer = tokenizer
+#         self.num_tokens = num_tokens
 
-    def __call__(self, *_) -> str:
-        """Get the text token for control"""
-        return self.tokenizer.decode(self.control_token_ids)
+#     def __call__(self, *_) -> str:
+#         """Get the text token for control"""
+#         return self.tokenizer.decode(self.control_token_ids)
 
-    @property
-    def control_token_ids(self) -> np.ndarray:
-        # Assumes we're overwriting the final tokens of the vocabulary (least used tokens)
-        return np.arange(self.num_tokens) + int(self.tokenizer.vocab_size - self.num_tokens)
+#     @property
+#     def control_token_ids(self) -> np.ndarray:
+#         # Assumes we're overwriting the final tokens of the vocabulary (least used tokens)
+#         return np.arange(self.num_tokens) + int(self.tokenizer.vocab_size - self.num_tokens)
 
-    @property
-    def num_control_tokens(self) -> int:
-        return self.num_tokens
+#     @property
+#     def num_control_tokens(self) -> int:
+#         return self.num_tokens
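Note: the nested "prismatic_config" block in the new config.json is what a nested PretrainedConfig attribute serializes to, which is why __init__ now calls super().__init__() and keeps self.prismatic_config = PrismaticConfig(**prismatic_config). A minimal sketch of that pattern (simplified class bodies, not the repository's actual definitions), assuming transformers' default handling of nested configs in to_dict():

from transformers import PretrainedConfig

class PrismaticConfig(PretrainedConfig):
    model_type = "prismatic"

class TrajectoryVLAConfig(PretrainedConfig):
    model_type = "trajectoryvla"

    def __init__(self, prismatic_config: dict = None, token_size: int = 1024, **kwargs):
        super().__init__(**kwargs)  # the commit drops **prismatic_config from this call
        # A PretrainedConfig-valued attribute is serialized as a nested sub-dict in config.json.
        self.prismatic_config = PrismaticConfig(**(prismatic_config or {}))
        self.token_size = token_size

# Prints the full config, including the nested "prismatic_config" sub-dict.
print(TrajectoryVLAConfig(prismatic_config={}).to_json_string(use_diff=False))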