Spaces:
Runtime error
Runtime error
data: | |
data_dir: [data/data_public/controlnet_data] | |
image_size: 1024 | |
caption_proportion: | |
prompt: 1 | |
external_caption_suffixes: [] | |
external_clipscore_suffixes: [] | |
clip_thr_temperature: 0.1 | |
clip_thr: 25.0 | |
load_text_feat: false | |
load_vae_feat: false | |
transform: default_train | |
type: SanaWebDatasetMSControl | |
sort_dataset: false | |
# model config | |
model: | |
model: SanaMSControlNet_600M_P1_D28 | |
image_size: 1024 | |
mixed_precision: fp16 | |
fp32_attention: true | |
load_from: hf://Efficient-Large-Model/Sana_600M_1024px/checkpoint/Sana_600M_1024px.pth | |
resume_from: | |
aspect_ratio_type: ASPECT_RATIO_1024 | |
multi_scale: true | |
attn_type: linear | |
ffn_type: glumbconv | |
mlp_acts: | |
- silu | |
- silu | |
- | |
mlp_ratio: 2.5 | |
use_pe: false | |
qk_norm: false | |
class_dropout_prob: 0.1 | |
# VAE setting | |
vae: | |
vae_type: AutoencoderDC | |
vae_pretrained: mit-han-lab/dc-ae-f32c32-sana-1.0-diffusers | |
scale_factor: 0.41407 | |
vae_latent_dim: 32 | |
vae_downsample_rate: 32 | |
sample_posterior: true | |
# text encoder | |
text_encoder: | |
text_encoder_name: gemma-2-2b-it | |
y_norm: true | |
y_norm_scale_factor: 0.01 | |
model_max_length: 300 | |
# CHI (complex human instruction): instruction text prepended to the user prompt | |
chi_prompt: | |
- 'Given a user prompt, generate an "Enhanced prompt" that provides detailed visual descriptions suitable for image generation. Evaluate the level of detail in the user prompt:' | |
- '- If the prompt is simple, focus on adding specifics about colors, shapes, sizes, textures, and spatial relationships to create vivid and concrete scenes.' | |
- '- If the prompt is already detailed, refine and enhance the existing details slightly without overcomplicating.' | |
- 'Here are examples of how to transform or refine prompts:' | |
- '- User Prompt: A cat sleeping -> Enhanced: A small, fluffy white cat curled up in a round shape, sleeping peacefully on a warm sunny windowsill, surrounded by pots of blooming red flowers.' | |
- '- User Prompt: A busy city street -> Enhanced: A bustling city street scene at dusk, featuring glowing street lamps, a diverse crowd of people in colorful clothing, and a double-decker bus passing by towering glass skyscrapers.' | |
- 'Please generate only the enhanced description for the prompt below and avoid including any additional commentary or evaluations:' | |
- 'User Prompt: ' | |
# Sana flow schedule | |
scheduler: | |
predict_v: true | |
noise_schedule: linear_flow | |
pred_sigma: false | |
flow_shift: 4.0 | |
# logit-normal timestep | |
weighting_scheme: logit_normal | |
logit_mean: 0.0 | |
logit_std: 1.0 | |
vis_sampler: flow_dpm-solver | |
# training setting | |
train: | |
num_workers: 10 | |
seed: 1 | |
train_batch_size: 16 | |
num_epochs: 100 | |
gradient_accumulation_steps: 1 | |
grad_checkpointing: true | |
gradient_clip: 0.1 | |
optimizer: | |
betas: | |
- 0.9 | |
- 0.999 | |
- 0.9999 | |
eps: | |
- 1.0e-30 | |
- 1.0e-16 | |
lr: 0.0001 | |
type: CAMEWrapper | |
weight_decay: 0.0 | |
lr_schedule: constant | |
lr_schedule_args: | |
num_warmup_steps: 30 | |
local_save_vis: true # whether to save logged images locally | |
visualize: true | |
eval_sampling_steps: 500 | |
log_interval: 20 | |
save_model_epochs: 5 | |
save_model_steps: 500 | |
work_dir: output/debug | |
online_metric: false | |
eval_metric_step: 2000 | |
online_metric_dir: metric_helper | |
controlnet: | |
control_signal_type: "scribble" | |