File size: 3,308 Bytes
f1f9265
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
# data settings
data:
  data_dir: []  # dataset root directories; empty here — expected to be filled per run
  caption_proportion:
    prompt: 1  # sampling weight for the plain-prompt caption source
  external_caption_suffixes: []  # extra caption-file suffixes to load alongside images
  external_clipscore_suffixes: []  # suffixes for precomputed CLIP-score files
  clip_thr_temperature: 1.0  # presumably a softmax temperature for CLIP-score-based caption sampling — TODO confirm
  clip_thr: 0.0  # CLIP-score filtering threshold; 0.0 disables filtering — verify against loader
  sort_dataset: false
  load_text_feat: false  # false: encode text on the fly rather than loading cached features
  load_vae_feat: false  # false: encode images with the VAE on the fly
  transform: default_train  # name of the training image transform pipeline
  type: SanaWebDatasetMS  # dataset class; "MS" matches multi_scale in the model section
  image_size: 512
  hq_only: false
  valid_num: 0
# model settings
model:
  model: SanaMS_600M_P1_D28
  image_size: 512
  mixed_precision: fp16 # ['fp16', 'fp32', 'bf16']
  fp32_attention: true  # keep attention in fp32 even under fp16 autocast
  # Explicit `null` instead of bare empty values: parsers load both as null,
  # so the parsed config is unchanged, but the intent is no longer ambiguous
  # (yamllint `empty-values`).
  load_from: null  # optional pretrained weights path
  resume_from:
    checkpoint: null  # optional checkpoint path to resume training from
    load_ema: false
    resume_lr_scheduler: true
    resume_optimizer: true
  aspect_ratio_type: ASPECT_RATIO_1024
  multi_scale: true
  pe_interpolation: 1.0
  micro_condition: false
  attn_type: linear # 'flash', 'linear', 'vanilla', 'triton_linear'
  cross_norm: false
  autocast_linear_attn: false
  ffn_type: glumbconv
  mlp_acts:
    - silu
    - silu
    - null  # third activation intentionally absent; explicit null over a bare `-`
  mlp_ratio: 2.5
  use_pe: false
  qk_norm: false
  class_dropout_prob: 0.0  # 0.0: no label/caption dropout during training
  linear_head_dim: 32
  # CFG & PAG settings
  cfg_scale: 4
  guidance_type: classifier-free
  pag_applied_layers: [14]
# text encoder settings
text_encoder:
  text_encoder_name: gemma-2-2b-it  # Gemma-2 2B instruction-tuned model as the caption encoder
  caption_channels: 2304  # embedding width fed to the diffusion model; must match the encoder's hidden size
  y_norm: false  # whether to normalize caption embeddings — semantics per consumer; verify in trainer
  y_norm_scale_factor: 1.0  # only relevant when y_norm is enabled
  model_max_length: 300  # tokenizer truncation length for prompts
  chi_prompt: []  # optional system/instruction prompt prepended to captions; empty disables it
# VAE settings
vae:
  vae_type: dc-ae  # deep-compression autoencoder
  vae_pretrained: mit-han-lab/dc-ae-f32c32-sana-1.0  # HF hub id; f32c32 = 32x downsample, 32 latent channels
  scale_factor: 0.41407  # latent scaling applied after encoding — value must match the pretrained VAE
  vae_latent_dim: 32
  vae_downsample_rate: 32  # spatial downsample; 512px image -> 16x16 latent
  sample_posterior: true  # sample from the encoder posterior rather than taking its mean
# Scheduler settings
scheduler:
  train_sampling_steps: 1000
  # canonical lowercase boolean (was `True`): `True` is only a bool under
  # YAML 1.1 implicit typing and is inconsistent with every other boolean
  # in this file (yamllint `truthy`)
  predict_v: true
  noise_schedule: linear_flow
  pred_sigma: false
  flow_shift: 1.0
  weighting_scheme: logit_normal  # timestep weighting; mean/std below parameterize it
  logit_mean: 0.0
  logit_std: 1.0
  vis_sampler: flow_dpm-solver  # sampler used for visualization/eval generations
# training settings
train:
  num_workers: 4  # dataloader workers per process
  seed: 43
  train_batch_size: 32  # per-device batch size — TODO confirm (vs. global)
  num_epochs: 100
  gradient_accumulation_steps: 1
  grad_checkpointing: false
  gradient_clip: 1.0
  gc_step: 1
  # optimizer settings
  optimizer:
    eps: 1.0e-10
    lr: 0.0001
    type: AdamW
    weight_decay: 0.03
  lr_schedule: constant
  lr_schedule_args:
    num_warmup_steps: 500
  auto_lr:
    rule: sqrt  # sqrt LR scaling with effective batch size — verify against trainer
  ema_rate: 0.9999  # EMA decay for the weight shadow copy
  eval_batch_size: 16
  use_fsdp: false
  use_flash_attn: false
  eval_sampling_steps: 250  # run eval sampling every N steps — TODO confirm (steps vs. count)
  lora_rank: 4
  log_interval: 50
  mask_type: 'null'  # intentionally the *string* "null" (a sentinel), hence the quotes — do not unquote
  mask_loss_coef: 0.0
  load_mask_index: false
  snr_loss: false
  real_prompt_ratio: 1.0
  debug_nan: false
  # checkpoint settings
  save_image_epochs: 1
  save_model_epochs: 1
  save_model_steps: 1000000  # effectively disables step-based saving; epoch-based saving governs
  # visualization settings
  visualize: false
  null_embed_root: output/pretrained_models/  # where the precomputed null-prompt embedding lives
  valid_prompt_embed_root: output/tmp_embed/
  validation_prompts:
    - dog
    - portrait photo of a girl, photograph, highly detailed face, depth of field
    - Self-portrait oil painting, a beautiful cyborg with golden hair, 8k
    - Astronaut in a jungle, cold color palette, muted colors, detailed, 8k
    - A photo of beautiful mountain with realistic sunset and blue lake, highly detailed, masterpiece
  local_save_vis: false
  deterministic_validation: true
  online_metric: false
  eval_metric_step: 5000
  online_metric_dir: metric_helper
  # work dir settings
  work_dir: /cache/exps/
  skip_step: 0
  # LCM settings
  loss_type: huber
  huber_c: 0.001
  num_ddim_timesteps: 50
  w_max: 15.0  # presumably max guidance scale sampled during LCM distillation — TODO confirm
  w_min: 3.0  # presumably min guidance scale for LCM distillation — TODO confirm
  ema_decay: 0.95  # LCM target-network EMA; distinct from ema_rate above