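# Training configuration for a MUSE-style masked generative image transformer:
# a UVit denoiser over VQGAN codes with CLIP text conditioning. The key layout
# appears to follow the open-muse training configs; the section comments below
# are inferred from those conventions.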
wandb:
  entity: r-ap
  run_id: tx78bpg1
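# Run bookkeeping. The save/eval/generate/log intervals are in optimizer
# steps; log_grad_norm_every is set far beyond max_train_steps (20000), which
# effectively disables grad-norm logging.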
experiment:
  name: muse-multi
  project: muse-prod
  output_dir: output/
  max_train_examples: 28500
  max_eval_examples: 1000
  save_every: 1000
  eval_every: 1000
  generate_every: 400
  log_every: 50
  log_grad_norm_every: 100000000
  resume_from_checkpoint: latest
  resume_lr_scheduler: true
  checkpoints_total_limit: 4
  logging_dir: output/logs
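# Model stack: a VQGAN image tokenizer and a CLIP text encoder (presumably
# loaded pretrained; only their types are named here) feed the transformer.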
model:
  vq_model:
    type: vqgan
  text_encoder:
    type: clip
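  # UVit transformer over the VQ token grid. num_vq_tokens 256 at resolution
  # 256 implies a 16x16 latent grid (a 16x-downsampling VQGAN); hidden_size
  # 1024 across 16 heads gives head_dim 64. vocab_size 8256 = codebook_size
  # 8192 plus 64 extra ids (mask and special tokens, presumably).
  # micro_cond_embed_dim 1280 = 5 x micro_cond_encode_dim (256): likely five
  # micro-conditioning scalars (original size, crop coordinates, aesthetic
  # score, as in amused), each sinusoidally encoded.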
  transformer:
    vocab_size: 8256
    hidden_size: 1024
    intermediate_size: 2816
    num_hidden_layers: 22
    num_attention_heads: 16
    in_channels: 768
    block_out_channels:
    - 768
    block_has_attention:
    - true
    block_num_heads: 12
    num_res_blocks: 3
    res_ffn_factor: 4
    patch_size: 1
    encoder_hidden_size: 768
    add_cross_attention: true
    project_encoder_hidden_states: true
    codebook_size: 8192
    num_vq_tokens: 256
    initializer_range: 0.02
    norm_type: rmsnorm
    layer_norm_eps: 1.0e-06
    ln_elementwise_affine: true
    use_encoder_layernorm: false
    use_bias: false
    hidden_dropout: 0.0
    attention_dropout: 0.0
    use_codebook_size_for_output: true
    use_empty_embeds_for_uncond: true
    add_cond_embeds: true
    cond_embed_dim: 768
    add_micro_cond_embeds: true
    micro_cond_encode_dim: 256
    micro_cond_embed_dim: 1280
    force_down_up_sample: true
  architecture: uvit
  enable_xformers_memory_efficient_attention: true
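# Inputs: 256px images; captions truncated to CLIP's 77-token context.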
dataset:
  preprocessing:
    max_seq_length: 77
    resolution: 256
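# AdamW with standard betas/epsilon; scale_lr: false takes the learning rate
# as-is rather than scaling it by the effective batch size.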
optimizer:
  name: adamw
  params:
    learning_rate: 0.0001
    scale_lr: false
    beta1: 0.9
    beta2: 0.999
    weight_decay: 0.01
    epsilon: 1.0e-08
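# Constant LR after a 100-step warmup. ${optimizer.params.learning_rate} is
# OmegaConf-style interpolation, resolved when the config is read.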
lr_scheduler:
  scheduler: constant_with_warmup
  params:
    learning_rate: ${optimizer.params.learning_rate}
    warmup_steps: 100
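# A minimal sketch of reading this file and resolving the interpolation above
# (assumes OmegaConf; the path comes from the `config` key at the bottom):
#
#     from omegaconf import OmegaConf
#     cfg = OmegaConf.load("configs/segmentation.yaml")
#     # interpolation resolves on access:
#     assert cfg.lr_scheduler.params.learning_rate == 1e-4
#
# Training loop. cond_dropout_prob 0.1 drops the text condition on ~10% of
# examples so classifier-free guidance is usable at sampling time
# (guidance_scale 8 over generation_timesteps 16 parallel-decoding steps).
# mask_schedule: cosine is the MaskGIT curve (mask ratio cos(pi*r/2) for
# progress r in [0, 1)); max_grad_norm: null disables gradient clipping.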
training:
  gradient_accumulation_steps: 1
  batch_size: 20
  mixed_precision: 'no'
  enable_tf32: true
  use_ema: true
  ema_decay: 0.9999
  ema_update_after_step: 0
  ema_update_every: 1
  seed: 13399
  max_train_steps: 20000
  overfit_one_batch: false
  cond_dropout_prob: 0.1
  min_masking_rate: 0.0
  label_smoothing: 0.1
  max_grad_norm: null
  guidance_scale: 8
  generation_timesteps: 16
  use_soft_code_target: false
  use_stochastic_code: false
  soft_code_temp: 1.0
  mask_schedule: cosine
  mask_contiguous_region_prob: 0.15
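# Source config path recorded at launch (presumably).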
config: configs/segmentation.yaml