# Titok-Video-Stage2 / config.yaml
# Uploaded by NilanE with huggingface_hub (commit c52d378, verified)
logging:
  project: titok_video
  run_name: BaseAll-CB16k-TL128-256x33-init-BS64-clipGrad1.0
  # Log metrics every N training steps.
  logging_interval: 50
  # Directory where checkpoints are written.
  save_path: out_tiny
  # Write a checkpoint every N training steps.
  save_step_interval: 5000
  keep_prior_checkpoints: -1  # -1 to keep all
  # Explicit null (a bare `key:` is ambiguous); set to a .ckpt path to resume.
  resume_from_checkpoint: null
  # Weights used to initialize this stage-2 run.
  init_from_checkpoint: base-interp-256x33-TL128.ckpt
model:
  titok:
    temporal_patch_size: 2
    spatial_patch_size: 4
    # FSQ quantizer levels; codebook size is their product
    # (8*8*8*6*5 = 15360, the ~16k referenced by the run name).
    fsq_levels: [8, 8, 8, 6, 5]  # [7, 5, 5, 5, 5]
    num_latent_tokens: 128
    encoder_size: base
    decoder_size: base
    # Canonical lowercase boolean (was `False`; yamllint `truthy`).
    exp_residual: false
  vae:
    type: wfvae  # cogvideox, vidtok, wfvae
    path: preprocess_dataset/wf-16
    latent_channels: 16
    temporal_compression: 4
    spatial_compression: 8
  disc: # experimental
    # Discriminator disabled; the settings below are inert while false.
    use_disc: false
    model_layers: 1
    model_heads: 1
    model_dim: 128
    temporal_patch_size: 4
    spatial_patch_size: 4
    # Step at which the discriminator loss would kick in.
    disc_start: 45000
    disc_factor: 1.0
    disc_weight: 0.1
    lecam_weight: 0.0 # disabled
    base_gamma: 1 # higher gamma smooths more earlier in training.
    final_gamma: 0.1
dataset:
  # Glob patterns for pre-encoded latent tensors (.pt files) — quoted so the
  # leading `/` and `*` are treated as literal string characters by YAML.
  train_dataset: "/workspace/out_enc_256_33/**/*.pt"
  eval_dataset: "/workspace/out_enc_256_33_eval/*.pt"
  # Frame resolution and clip length; matches the 256x33 in the run name.
  resolution: 256
  num_frames: 33
  frames_per_second: 8
  # Dataloader worker processes.
  workers: 8
optimizer:
  titok:
    # Exponents written as 1.0e-4 (not 1e-4): YAML 1.1 resolvers such as
    # PyYAML require a decimal point and otherwise parse `1e-4` as a string.
    learning_rate: 1.0e-4
    beta1: 0.9
    beta2: 0.99
    weight_decay: 1.0e-4
    warmup_steps: 5000 # 10000
    # LR the schedule decays to.
    end_lr: 1.0e-5
  disc: # not used
    learning_rate: 1.0e-4
    beta1: 0.9
    beta2: 0.99
    weight_decay: 1.0e-4
    warmup_steps: 1000
    end_lr: 1.0e-5
training:
  # Canonical lowercase booleans throughout (were `True`; yamllint `truthy`).
  torch_compile: true
  # RNG seed for reproducibility.
  seed: 42
  max_grad_norm: 1.0 # not needed?
  batch_size: 64
  # strategy: # ddp
  enable_tf32: true
  precision: bf16-mixed
  train_devices: 1
  accelerator: 'gpu'
  max_steps: 500000
  # Run validation every N training steps.
  val_step_interval: 2000
  # Number of reconstructions logged per validation pass.
  eval_recon_log_num: 4
  eval_sample_size: 32
  eval_batch_size: 1
  eval_clear_cache: true
  eval_shuffle: true
  log_codebook: true