Upload config
- config.json +59 -32
- prismatic_config.py +3 -30
config.json
CHANGED
@@ -1,37 +1,64 @@
 {
-  "arch_specifier": "no-align+gelu-mlp",
-  "architectures": [
-    "TrajectoryVLA"
-  ],
   "auto_map": {
-    [1 removed line illegible in source]
+    "AutoConfig": "prismatic_config.TrajectoryVLAConfig"
   },
-  [13 removed lines illegible in source]
-  "text_config": {
-    "model_type": "llama"
+  "cheat": false,
+  "model_type": "trajectoryvla",
+  "num_timesteps": 6,
+  "prismatic_config": {
+    "architectures": [
+      "TrajectoryVLA"
+    ],
+    "auto_map": {
+      "AutoModelForVision2Seq": "prismatic_model.TrajectoryVLA"
+    },
+    "model_type": "prismatic",
+    "return_dict": false,
+    "torch_dtype": "bfloat16"
   },
-  [12 removed lines illegible in source]
+  "rotation_components": 9,
+  "seperate_control_proj": true,
+  "timestep_proj_config": {
+    "num_tokens": 3,
+    "pos_embed_scale": 8,
+    "proj_layers": [
+      128,
+      512,
+      1024
+    ],
+    "time_delta_sec": 0.1
+  },
+  "token_proj_config": {
+    "control_tokens_layers": [
+      4096,
+      2048,
+      1024
+    ],
+    "image_tokens_mode": "vit",
+    "llm_image_tokens_layers": [],
+    "vit_tokens_layers": [
+      2176,
+      1024
+    ]
+  },
+  "token_size": 1024,
+  "transformer_config": {
+    "decoder_block_config": {
+      "dropout": 0.0,
+      "feature_size": 1024,
+      "head_dim": 64,
+      "num_heads": 16
+    },
+    "encoder_block_config": {
+      "feature_size": 1024,
+      "head_dim": 64,
+      "num_heads": 16
+    },
+    "num_blocks": 2,
+    "pos_embed_config": {
+      "embedding_dim": 1024,
+      "num_embeddings": 300
+    }
+  },
+  "transformers_version": "4.44.2"
 }
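For reference, a minimal sketch of how the top-level "auto_map" entry above is consumed. This assumes prismatic_config.py is uploaded next to config.json in the repository; the repo id below is a placeholder, not the actual model repo.

# Hypothetical repo id; assumes prismatic_config.py ships alongside config.json.
from transformers import AutoConfig

# The "AutoConfig" entry in auto_map points transformers at prismatic_config.py
# inside the repo and instantiates TrajectoryVLAConfig instead of a built-in
# config class. trust_remote_code=True is required because repo code is executed.
config = AutoConfig.from_pretrained("some-user/trajectory-vla", trust_remote_code=True)
print(config.num_timesteps)  # 6, per the config.json above

Note that the "AutoModelForVision2Seq" mapping sits inside the nested "prismatic_config" rather than in the top-level "auto_map", so whether the auto model classes resolve prismatic_model.TrajectoryVLA directly depends on how TrajectoryVLAConfig surfaces that nested entry.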
prismatic_config.py
CHANGED
@@ -7,7 +7,7 @@ Default configuration specifies `siglip-224px+7b`.
 
 from typing import Any, Dict, List, Optional
 import transformers
-from transformers import PretrainedConfig
+from transformers import PretrainedConfig,AutoModel, AutoConfig
 from transformers.models.auto import CONFIG_MAPPING
 import numpy as np
 
@@ -155,7 +155,6 @@ class PrismaticConfig(PretrainedConfig):
 
 # Here we need trajectory_vla config, with
 # prismatic_config fields and then the waypointer fields
-
 class TrajectoryVLAConfig(PretrainedConfig):
     model_type: str = "trajectoryvla"
 
@@ -217,7 +216,8 @@ if __name__ == "__main__" :
 
     # prismatic_config = PrismaticConfig()
     # print(prismatic_config)
-
+    AutoConfig.register("prismatic",PrismaticConfig)
+    AutoConfig.register("trajectoryvla",TrajectoryVLAConfig)
     prismatic_config_dict = {
         "vision_backbone_id":"dinosiglip-vit-so-224px",
         "llm_backbone_id":"llama2-7b-pure",
@@ -280,30 +280,3 @@ if __name__ == "__main__" :
 
     TrajectoryVLAConfig = TrajectoryVLAConfig( **TrajectoryVlaConfig_config)
     print(TrajectoryVLAConfig)
-
-# class WaypointTokenizer:
-#     """
-#     Wraps base LLM/VLM tokenizer and overloads least used token as a control token
-
-#     NOTE: By default, assumes a BPE-style tokenizer akin to the LlamaTokenizer,
-#     where *the least used tokens* appear at the end of the vocabulary!
-
-#     TODO: Adding new token vs overloading? When I call `tokenizer.add_token()` vocab stays the same
-#     """
-
-#     def __init__(self, tokenizer: transformers.PreTrainedTokenizerBase, num_tokens: int = 10) -> None:
-#         self.tokenizer = tokenizer
-#         self.num_tokens = num_tokens
-
-#     def __call__(self, *_) -> str:
-#         """Get the text token for control"""
-#         return self.tokenizer.decode(self.control_token_ids)
-
-#     @property
-#     def control_token_ids(self) -> np.ndarray:
-#         # Assumes we're overwriting the final tokens of the vocabulary (least used tokens)
-#         return np.arange(self.num_tokens) + int(self.tokenizer.vocab_size - self.num_tokens)
-
-#     @property
-#     def num_control_tokens(self) -> int:
-#         return self.num_tokens
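The two AutoConfig.register calls added in the __main__ block map the custom model_type strings to their config classes, so a saved config.json round-trips through AutoConfig without remote code. Below is a minimal sketch of that flow; the stand-in TrajectoryVLAConfig here is simplified, with illustrative kwargs only, not the full field set from the script.

# Sketch of register-then-round-trip, with a simplified stand-in config class.
from transformers import AutoConfig, PretrainedConfig

class TrajectoryVLAConfig(PretrainedConfig):
    model_type = "trajectoryvla"

    def __init__(self, num_timesteps=6, token_size=1024, **kwargs):
        # Illustrative fields only; the real class carries many more.
        self.num_timesteps = num_timesteps
        self.token_size = token_size
        super().__init__(**kwargs)

# Registration maps the "trajectoryvla" model_type to the class, so a saved
# config.json can be re-loaded through AutoConfig without trust_remote_code.
AutoConfig.register("trajectoryvla", TrajectoryVLAConfig)

cfg = TrajectoryVLAConfig(num_timesteps=6)
cfg.save_pretrained("trajectory_vla_ckpt")            # writes config.json
reloaded = AutoConfig.from_pretrained("trajectory_vla_ckpt")
assert isinstance(reloaded, TrajectoryVLAConfig)      # resolved via the registry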