|
{ |
|
"module": "keras_hub.src.models.stable_diffusion_3.stable_diffusion_3_backbone", |
|
"class_name": "StableDiffusion3Backbone", |
|
"config": { |
|
"name": "stable_diffusion_3.5_backbone", |
|
"trainable": true, |
|
"mmdit_patch_size": 2, |
|
"mmdit_hidden_dim": 2432, |
|
"mmdit_num_layers": 38, |
|
"mmdit_num_heads": 38, |
|
"mmdit_position_size": 192, |
|
"mmdit_qk_norm": "rms_norm", |
|
"vae": { |
|
"module": "keras_hub.src.models.vae.vae_backbone", |
|
"class_name": "VAEBackbone", |
|
"config": { |
|
"name": "vae", |
|
"trainable": true, |
|
"encoder_num_filters": [ |
|
128, |
|
256, |
|
512, |
|
512 |
|
], |
|
"encoder_num_blocks": [ |
|
2, |
|
2, |
|
2, |
|
2 |
|
], |
|
"decoder_num_filters": [ |
|
512, |
|
512, |
|
256, |
|
128 |
|
], |
|
"decoder_num_blocks": [ |
|
3, |
|
3, |
|
3, |
|
3 |
|
], |
|
"sampler_method": "sample", |
|
"input_channels": 3, |
|
"sample_channels": 32, |
|
"output_channels": 3, |
|
"scale": 1.5305, |
|
"shift": 0.0609 |
|
}, |
|
"registered_name": "VAEBackbone" |
|
}, |
|
"clip_l": { |
|
"module": "keras_hub.src.models.clip.clip_text_encoder", |
|
"class_name": "CLIPTextEncoder", |
|
"config": { |
|
"name": "clip_l", |
|
"trainable": true, |
|
"vocabulary_size": 49408, |
|
"embedding_dim": 768, |
|
"hidden_dim": 768, |
|
"num_layers": 12, |
|
"num_heads": 12, |
|
"intermediate_dim": 3072, |
|
"intermediate_activation": "quick_gelu", |
|
"intermediate_output_index": 10, |
|
"max_sequence_length": 77 |
|
}, |
|
"registered_name": "keras_hub>CLIPTextEncoder" |
|
}, |
|
"clip_g": { |
|
"module": "keras_hub.src.models.clip.clip_text_encoder", |
|
"class_name": "CLIPTextEncoder", |
|
"config": { |
|
"name": "clip_g", |
|
"trainable": true, |
|
"vocabulary_size": 49408, |
|
"embedding_dim": 1280, |
|
"hidden_dim": 1280, |
|
"num_layers": 32, |
|
"num_heads": 20, |
|
"intermediate_dim": 5120, |
|
"intermediate_activation": "gelu", |
|
"intermediate_output_index": 30, |
|
"max_sequence_length": 77 |
|
}, |
|
"registered_name": "keras_hub>CLIPTextEncoder" |
|
}, |
|
"t5": null, |
|
"latent_channels": 16, |
|
"output_channels": 3, |
|
"num_train_timesteps": 1000, |
|
"shift": 3.0, |
|
"image_shape": [ |
|
1024, |
|
1024, |
|
3 |
|
] |
|
}, |
|
"registered_name": "keras_hub>StableDiffusion3Backbone" |
|
} |