# mu3000-ens / config.yaml
# Uploader: reza-alipour
# Commit message: Upload folder using huggingface_hub
# Commit: 658187f
# wandb settings: the account/team entity and the id of the tracked run.
wandb:
  entity: r-ap
  run_id: 082oe47w
# Experiment bookkeeping: naming, output locations, dataset size caps, and
# the cadence of saving / evaluating / generating / logging.
# NOTE(review): the `*_every` values are presumably counted in training
# steps — confirm against the training script.
experiment:
  name: muse-multi
  project: muse-prod
  output_dir: output/
  max_train_examples: 28500
  max_eval_examples: 1000
  save_every: 1000
  eval_every: 700
  generate_every: 200
  log_every: 50
  # Larger than training.max_train_steps (20000); presumably this
  # effectively disables grad-norm logging — confirm.
  log_grad_norm_every: 100000000
  resume_from_checkpoint: latest
  resume_lr_scheduler: true
  checkpoints_total_limit: 4
  logging_dir: output/logs
# Model definition: VQ tokenizer, text encoder, and transformer backbone.
# NOTE(review): the nesting below was reconstructed from a copy whose
# indentation had been flattened. The two `type` keys can only coexist as
# children of `vq_model` and `text_encoder`; the remaining placement
# (which keys belong to `transformer` vs. `model`) should be confirmed
# against the code that reads this file.
model:
  vq_model:
    type: vqgan
  text_encoder:
    type: clip
    pretrained: openMUSE/clip-vit-large-patch14-text-enc
  transformer:
    vocab_size: 8256
    hidden_size: 1024
    intermediate_size: 2816
    num_hidden_layers: 22
    num_attention_heads: 16
    in_channels: 768
    block_out_channels:
      - 768
    block_has_attention:
      - true
    block_num_heads: 12
    num_res_blocks: 3
    res_ffn_factor: 4
    patch_size: 1
    encoder_hidden_size: 768
    add_cross_attention: true
    project_encoder_hidden_states: true
    codebook_size: 8192
    num_vq_tokens: 256
    initializer_range: 0.02
    norm_type: rmsnorm
    layer_norm_eps: 1.0e-06
    ln_elementwise_affine: true
    use_encoder_layernorm: false
    use_bias: false
    hidden_dropout: 0.0
    attention_dropout: 0.0
    use_codebook_size_for_output: true
    use_empty_embeds_for_uncond: true
    add_cond_embeds: true
    cond_embed_dim: 768
    add_micro_cond_embeds: true
    micro_cond_encode_dim: 256
    micro_cond_embed_dim: 1280
    force_down_up_sample: true
  # NOTE(review): assumed to sit at the `model` level rather than under
  # `transformer` — confirm.
  architecture: uvit
  enable_xformers_memory_efficient_attention: true
# Dataset preprocessing parameters.
# NOTE(review): `max_seq_length` presumably bounds the tokenized text length
# and `resolution` the image side in pixels — confirm against the loader.
dataset:
  preprocessing:
    max_seq_length: 77
    resolution: 256
# Optimizer selection and its hyper-parameters.
# `params` must be nested under `optimizer`: the scheduler section refers to
# this learning rate by the path `optimizer.params.learning_rate`.
optimizer:
  name: adamw
  params:
    learning_rate: 0.0001
    scale_lr: false
    beta1: 0.9
    beta2: 0.999
    weight_decay: 0.01
    epsilon: 1.0e-08
# Learning-rate schedule.
lr_scheduler:
  scheduler: constant_with_warmup
  params:
    # Interpolation: takes its value from optimizer.params.learning_rate,
    # so the rate is defined in exactly one place.
    learning_rate: ${optimizer.params.learning_rate}
    warmup_steps: 100
# Training-loop settings: batching, precision, EMA, masking, and the
# sampling parameters used when generating images during training.
training:
  gradient_accumulation_steps: 1
  batch_size: 20
  # Quoted on purpose: a bare `no` is read as boolean false by YAML 1.1
  # parsers, while this value is the string "no".
  mixed_precision: 'no'
  enable_tf32: true
  use_ema: true
  ema_decay: 0.9999
  ema_update_after_step: 0
  ema_update_every: 1
  seed: 13399
  max_train_steps: 20000
  overfit_one_batch: false
  cond_dropout_prob: 0.1
  min_masking_rate: 0.0
  label_smoothing: 0.1
  # NOTE(review): null presumably means no gradient clipping — confirm.
  max_grad_norm: null
  guidance_scale: 8
  generation_timesteps: 16
  use_soft_code_target: false
  use_stochastic_code: false
  soft_code_temp: 1.0
  mask_schedule: cosine
  mask_contiguous_region_prob: 0.15
# NOTE(review): presumably the path of the base config file this run was
# launched with, recorded here when the resolved config was dumped — confirm
# against the training entry point.
config: configs/segmentation.yaml