logging:
  project: titok_video
  run_name: BaseAll-CB16k-TL64-128x17-BS256
  logging_interval: 50
  save_path: out_tiny
  save_step_interval: 5000
  keep_prior_checkpoints: -1  # -1 to keep all
  resume_from_checkpoint:
  init_from_checkpoint:

model:
  titok:
    temporal_patch_size: 2
    spatial_patch_size: 4
    fsq_levels: [8, 8, 8, 6, 5]  # [7, 5, 5, 5, 5]
    num_latent_tokens: 64
    encoder_size: base
    decoder_size: base
    exp_residual: False
  vae:
    type: wfvae  # cogvideox, vidtok, wfvae
    path: preprocess_dataset/wf-16
    latent_channels: 16
    temporal_compression: 4
    spatial_compression: 8
  disc:  # experimental
    use_disc: False
    model_layers: 1
    model_heads: 1
    model_dim: 128
    temporal_patch_size: 4
    spatial_patch_size: 4
    disc_start: 45000
    disc_factor: 1.0
    disc_weight: 0.1
    lecam_weight: 0.0  # disabled
    base_gamma: 1  # a higher gamma applies stronger smoothing early in training
    final_gamma: 0.1

dataset:
  train_dataset: "/workspace/out_enc_128_17/**/*.pt"
  eval_dataset: "/workspace/out_enc_128_17_eval/*.pt"
  resolution: 128
  num_frames: 17
  frames_per_second: 8
  workers: 8

optimizer:
  titok:
    learning_rate: 1e-4
    beta1: 0.9
    beta2: 0.99
    weight_decay: 1e-4
    warmup_steps: 10000
    end_lr: 1e-5
  disc:  # not used
    learning_rate: 1e-4
    beta1: 0.9
    beta2: 0.99
    weight_decay: 1e-4
    warmup_steps: 1000
    end_lr: 1e-5

training:
  torch_compile: True
  seed: 42
  max_grad_norm:  # 1.0
  batch_size: 256
  # strategy: # ddp
  enable_tf32: True
  precision: bf16-mixed
  train_devices: 1
  accelerator: 'gpu'
  max_steps: 500000
  val_step_interval: 2000
  eval_recon_log_num: 4
  eval_sample_size: 32
  eval_batch_size: 1
  eval_clear_cache: True
  eval_shuffle: True
  log_codebook: True
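
# Note: the run_name appears to encode the key settings above. "CB16k" is the
# FSQ codebook size implied by fsq_levels, prod([8, 8, 8, 6, 5]) = 15360,
# roughly 16k entries (the commented alternative [7, 5, 5, 5, 5] would give
# 7 * 5^4 = 4375). "TL64" is num_latent_tokens, "128x17" is
# resolution x num_frames, and "BS256" is batch_size.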