Upload config
Browse files
- config.json +59 -32
- prismatic_config.py +60 -60
config.json
CHANGED
@@ -1,37 +1,64 @@
 {
-  "arch_specifier": "no-align+gelu-mlp",
-  "architectures": [
-    "TrajectoryVLA"
-  ],
   "auto_map": {
-    … (removed line truncated in the diff view)
+    "AutoConfig": "prismatic_config.TrajectoryVLAConfig"
   },
-  … (removed lines truncated in the diff view)
-  "text_config": {
-    "model_type": "llama"
+  "cheat": false,
+  "model_type": "trajectoryvla",
+  "num_timesteps": 6,
+  "prismatic_config": {
+    "architectures": [
+      "TrajectoryVLA"
+    ],
+    "auto_map": {
+      "AutoModelForVision2Seq": "prismatic_model.TrajectoryVLA"
+    },
+    "model_type": "prismatic",
+    "return_dict": false,
+    "torch_dtype": "bfloat16"
   },
-  … (removed lines truncated in the diff view)
+  "rotation_components": 9,
+  "seperate_control_proj": true,
+  "timestep_proj_config": {
+    "num_tokens": 3,
+    "pos_embed_scale": 8,
+    "proj_layers": [
+      128,
+      512,
+      1024
+    ],
+    "time_delta_sec": 0.1
+  },
+  "token_proj_config": {
+    "control_tokens_layers": [
+      4096,
+      2048,
+      1024
+    ],
+    "image_tokens_mode": "vit",
+    "llm_image_tokens_layers": [],
+    "vit_tokens_layers": [
+      2176,
+      1024
+    ]
+  },
+  "token_size": 1024,
+  "transformer_config": {
+    "decoder_block_config": {
+      "dropout": 0.0,
+      "feature_size": 1024,
+      "head_dim": 64,
+      "num_heads": 16
+    },
+    "encoder_block_config": {
+      "feature_size": 1024,
+      "head_dim": 64,
+      "num_heads": 16
+    },
+    "num_blocks": 2,
+    "pos_embed_config": {
+      "embedding_dim": 1024,
+      "num_embeddings": 300
+    }
+  },
+  "transformers_version": "4.44.2"
 }
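The auto_map entry above points AutoConfig at the TrajectoryVLAConfig class shipped in prismatic_config.py, so the checkpoint has to be loaded with trust_remote_code. A minimal sketch, assuming this config.json and prismatic_config.py sit together in a checkpoint directory (the path is a placeholder):

from transformers import AutoConfig

# Resolves "AutoConfig": "prismatic_config.TrajectoryVLAConfig" via the auto_map above;
# trust_remote_code=True is required because the config class lives in the repo, not in transformers.
config = AutoConfig.from_pretrained("path/to/trajectory_vla_checkpoint", trust_remote_code=True)
print(config.model_type)     # "trajectoryvla"
print(config.num_timesteps)  # 6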
prismatic_config.py
CHANGED
@@ -176,7 +176,8 @@ class TrajectoryVLAConfig(PretrainedConfig):
         # **kwargs: str,
     ):

-        super().__init__(**prismatic_config)
+        # super().__init__(**prismatic_config)
+        super().__init__()
         self.prismatic_config = PrismaticConfig(**prismatic_config)

         self.token_size = token_size
@@ -197,29 +198,6 @@ class TrajectoryVLAConfig(PretrainedConfig):
     @property
     def num_timestep_tokens(self) -> int:
         return self.timestep_proj_config['num_tokens']
-# class WaypointerConfig(ConfigurableModuleConfig):
-#     token_size: int = 1024  # Timestep token size
-
-#     cheat: bool  # If True, cheat and use action tokens; Works only with OpenVLA checkpoint
-
-#     timestep_proj_config: AutoConfig  # Timestep tokens
-#     token_proj_config: TokenProjectorConfig  # LLM output tokens projection and packing
-#     transformer_config: AutoConfig  # Transformer config
-
-#     # Output configurations
-#     num_timesteps: int = 20  # Number of prediction time steps
-#     rotation_components: int = 3  # Number of rotation componens: euler -> 3, quaternion -> 4, rotmat -> 9
-#     separate_control_proj: bool = True  # If True, project control components separately
-
-#     @property
-#     def control_components(self) -> int:
-#         # Number of control dimensions: 3 translation, N rotation, 1 gripper
-#         return 3 + self.rotation_components + 1
-
-#     @property
-#     def num_timestep_tokens(self) -> int:
-#         return self.timestep_proj_config.num_tokens
-

 class OpenVLAConfig(PrismaticConfig):
     model_type: str = "openvla"
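The first hunk above stops forwarding the nested prismatic dict to PretrainedConfig.__init__ and only wraps it in a PrismaticConfig attribute. A minimal sketch of that nested-config pattern, with placeholder classes (InnerConfig and OuterConfig are illustrative names, not the repository's types):

from typing import Optional
from transformers import PretrainedConfig

class InnerConfig(PretrainedConfig):
    model_type = "inner"

    def __init__(self, hidden_size: int = 1024, **kwargs):
        super().__init__(**kwargs)
        self.hidden_size = hidden_size

class OuterConfig(PretrainedConfig):
    model_type = "outer"

    def __init__(self, inner_config: Optional[dict] = None, token_size: int = 1024, **kwargs):
        # The nested dict is not handed to the base class; it only builds the sub-config.
        super().__init__(**kwargs)
        self.inner_config = InnerConfig(**(inner_config or {}))
        self.token_size = token_size

cfg = OuterConfig(inner_config={"hidden_size": 512}, token_size=1024)
assert cfg.inner_config.hidden_size == 512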
@@ -237,73 +215,95 @@ class OpenVLAConfig(PrismaticConfig):
 if __name__ == "__main__" :
     # yaml_file = 'barrel/pipes/vlams/configs/waypoints/waypointer_multistep_fractal.yaml'

-    prismatic_config = PrismaticConfig()
-    print(prismatic_config)
+    # prismatic_config = PrismaticConfig()
+    # print(prismatic_config)

     prismatic_config_dict = {
         "vision_backbone_id":"dinosiglip-vit-so-224px",
-
-        "llm_backbone_id": "meta-llama/Llama-2-7b-hf",
-
+        "llm_backbone_id":"llama2-7b-pure",
         "arch_specifier": "no-align+gelu-mlp", ## TODO: check
-        "use_fused_vision_backbone" : …
+        "use_fused_vision_backbone" :True, ## TODO: check
         "image_resize_strategy" : "letterbox",
         "text_config" : None,
         "llm_max_length" : 2048,
         "pad_token_id" :32000,
         "pad_to_multiple_of" : 64,
         "output_projector_states" : False,
+        "return_dict": False,
     }
+
     token_proj_config = {
         "vit_tokens_layers": [2176, 1024],
         "control_tokens_layers": [4096, 2048, 1024],
         "image_tokens_mode": 'vit',
+        'llm_image_tokens_layers': []
     }
     timestep_proj_config = {
-        "pos_embed_scale": …
-        "proj_layers": [1024],
+        "pos_embed_scale": 8,
+        "proj_layers": [128,512,1024],
         "time_delta_sec": 0.1,
         "num_tokens":3
     }
-
-    …
+    pos_embed_config = {
+        "num_embeddings": 300,
+        "embedding_dim": 1024
+    }
+    encoder_block_config = {
+        "feature_size": 1024,
+        "head_dim": 64,
+        "num_heads": 16
+    }
+    decoder_block_config = {
+        "feature_size": 1024,
+        "head_dim": 64,
+        "num_heads": 16,
+        "dropout": 0.0
+    }
+    transformer_config = {
+        "pos_embed_config": pos_embed_config,
+        "encoder_block_config": encoder_block_config,
+        "decoder_block_config": decoder_block_config,
+        "num_blocks": 2
+    }
+    TrajectoryVlaConfig_config = {
         "prismatic_config":prismatic_config_dict,
         "token_size": 1024,
         "cheat": False,
-        "num_timesteps": …
-        "rotation_components": …
+        "num_timesteps": 6,
+        "rotation_components": 9,
         "seperate_control_proj": True,
-        "timestep_proj_config": …
-        "token_proj_config": …
-        "transformer_config": …
+        "timestep_proj_config": timestep_proj_config,
+        "token_proj_config": token_proj_config,
+        "transformer_config": transformer_config,
+        "num_timestep_tokens": 3,
     }

-    TrajectoryVLAConfig = TrajectoryVLAConfig( ** …
+    TrajectoryVLAConfig = TrajectoryVLAConfig( **TrajectoryVlaConfig_config)
     print(TrajectoryVLAConfig)

-class WaypointTokenizer:
-… (the remaining removed lines of the old, uncommented WaypointTokenizer class are truncated in the diff view)
+# class WaypointTokenizer:
+#     """
+#     Wraps base LLM/VLM tokenizer and overloads least used token as a control token

+#     NOTE: By default, assumes a BPE-style tokenizer akin to the LlamaTokenizer,
+#     where *the least used tokens* appear at the end of the vocabulary!

+#     TODO: Adding new token vs overloading? When I call `tokenizer.add_token()` vocab stays the same
+#     """

+#     def __init__(self, tokenizer: transformers.PreTrainedTokenizerBase, num_tokens: int = 10) -> None:
+#         self.tokenizer = tokenizer
+#         self.num_tokens = num_tokens

+#     def __call__(self, *_) -> str:
+#         """Get the text token for control"""
+#         return self.tokenizer.decode(self.control_token_ids)

+#     @property
+#     def control_token_ids(self) -> np.ndarray:
+#         # Assumes we're overwriting the final tokens of the vocabulary (least used tokens)
+#         return np.arange(self.num_tokens) + int(self.tokenizer.vocab_size - self.num_tokens)

+#     @property
+#     def num_control_tokens(self) -> int:
+#         return self.num_tokens
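If the goal of this __main__ block is to produce the config.json in this commit (including its auto_map entry), the freshly built config would typically be registered for AutoConfig and then saved. A sketch only, reusing the TrajectoryVlaConfig_config dict from the block above; the output directory is a placeholder and the instance gets its own name to avoid shadowing the class:

# With register_for_auto_class, save_pretrained records the auto_map entry seen in config.json above.
TrajectoryVLAConfig.register_for_auto_class("AutoConfig")
vla_config = TrajectoryVLAConfig(**TrajectoryVlaConfig_config)
vla_config.save_pretrained("trajectory_vla_checkpoint")  # writes config.json to the placeholder directory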