{
  "_name_or_path": "",
  "architectures": [
    "RQVAESIGLIPTransformer"
  ],
  "hidden_size": 1024,
  "model_type": "rqvaesigliptransformer_model",
  "rqtransformer": {
    "architectures": [
      "RQTransformer"
    ],
    "block_size": [
      16,
      16,
      4
    ],
    "embed_dim": 2560,
    "head": {
      "block": {
        "n_head": 40
      },
      "n_layer": 6
    },
    "input_embed_dim_1": 1024,
    "input_embed_dim_2": 4096,
    "model_type": "rqtransformer_model",
    "torch_dtype": "float32",
    "transformers_version": "4.36.2",
    "vocab_size": 16384
  },
  "rqvaesiglip": {
    "architectures": [
      "RQVAESiglip"
    ],
    "bottleneck_type": "rq",
    "checkpointing": true,
    "ckpt_path": null,
    "code_shape": [
      16,
      16,
      4
    ],
    "ddconfig": {
      "attn_resolutions": [
        16
      ],
      "ch": 128,
      "ch_mult": [
        1,
        1,
        2,
        2,
        4
      ],
      "double_z": false,
      "dropout": 0.0,
      "in_channels": 3,
      "num_res_blocks": 2,
      "out_ch": 3,
      "resolution": 256,
      "z_channels": 256
    },
    "decay": 0.99,
    "embed_dim": 1024,
    "hidden_size": 1024,
    "ignore_keys": null,
    "latent_loss_weight": 0.25,
    "latent_shape": [
      16,
      16,
      1024
    ],
    "loss_type": "mse",
    "model_type": "rqvaesiglip_model",
    "n_embed": 16384,
    "pretrained_model": "google/siglip-large-patch16-256",
    "restart_unused_codes": true,
    "shared_codebook": true,
    "torch_dtype": "float32",
    "transformers_version": "4.36.2"
  },
  "torch_dtype": "bfloat16",
  "transformers_version": "4.36.2"
}
|