---
# Experiment tracking and checkpoint I/O.
# NOTE(review): source indentation was flattened; these keys are assumed to
# nest under `logging:` — confirm against the config loader.
logging:
  project: titok_video
  run_name: BaseAll-CB16k-TL128-256x33-init-BS64-clipGrad1.0
  logging_interval: 50  # steps between metric logs
  save_path: out_tiny
  save_step_interval: 5000  # steps between checkpoint saves
  keep_prior_checkpoints: -1  # -1 to keep all
  resume_from_checkpoint: null  # explicit null (was a bare value): no resume path set
  init_from_checkpoint: base-interp-256x33-TL128.ckpt
# Model architecture: TiTok tokenizer, frozen VAE backbone, optional discriminator.
model:
  titok:
    temporal_patch_size: 2
    spatial_patch_size: 4
    fsq_levels: [8, 8, 8, 6, 5]  # [7, 5, 5, 5, 5]
    num_latent_tokens: 128
    encoder_size: base
    decoder_size: base
    exp_residual: false
  vae:
    type: wfvae  # cogvideox, vidtok, wfvae
    path: preprocess_dataset/wf-16
    latent_channels: 16
    temporal_compression: 4
    spatial_compression: 8
  disc:  # experimental
    use_disc: false
    model_layers: 1
    model_heads: 1
    model_dim: 128
    temporal_patch_size: 4
    spatial_patch_size: 4
    disc_start: 45000  # step at which the discriminator loss kicks in
    disc_factor: 1.0
    disc_weight: 0.1
    lecam_weight: 0.0  # disabled
    base_gamma: 1  # higher gamma smooths more earlier in training.
    final_gamma: 0.1
# Pre-encoded latent dataset (glob patterns over .pt files).
dataset:
  train_dataset: "/workspace/out_enc_256_33/**/*.pt"
  eval_dataset: "/workspace/out_enc_256_33_eval/*.pt"
  resolution: 256
  num_frames: 33
  frames_per_second: 8
  workers: 8  # dataloader worker processes
# Per-module optimizer settings (AdamW-style hyperparameters assumed — confirm).
# Exponents written with a mantissa (1.0e-4) so YAML 1.1 parsers such as PyYAML
# resolve them as floats rather than strings.
optimizer:
  titok:
    learning_rate: 1.0e-4
    beta1: 0.9
    beta2: 0.99
    weight_decay: 1.0e-4
    warmup_steps: 5000  # 10000
    end_lr: 1.0e-5
  disc:  # not used
    learning_rate: 1.0e-4
    beta1: 0.9
    beta2: 0.99
    weight_decay: 1.0e-4
    warmup_steps: 1000
    end_lr: 1.0e-5
# Trainer / runtime settings (Lightning-style keys: precision, accelerator, devices).
training:
  torch_compile: true
  seed: 42
  max_grad_norm: 1.0  # not needed?
  batch_size: 64
  # strategy: # ddp
  enable_tf32: true
  precision: bf16-mixed
  train_devices: 1
  accelerator: gpu
  max_steps: 500000
  val_step_interval: 2000  # steps between validation passes
  eval_recon_log_num: 4  # reconstructions logged per eval
  eval_sample_size: 32
  eval_batch_size: 1
  eval_clear_cache: true
  eval_shuffle: true
  log_codebook: true