Sony
/

Text-to-Audio
soundctm
koichisaito committed on
Commit
be035eb
1 Parent(s): 8d63414

Upload 5 files

Browse files
soundctm_ckpt/.DS_Store ADDED
Binary file (6.15 kB). View file
 
soundctm_ckpt/030000/.DS_Store ADDED
Binary file (6.15 kB). View file
 
soundctm_ckpt/030000/ema_0.999_030000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:07049a7b4c171b8607c5adc77913a23fd1afe5b54b80e2f39ff1511d33d39cb3
3
+ size 3472226228
soundctm_ckpt/030000/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:722bdb278175f86df6598a3d68716242f11d026bfbf356dca9fd0130fa1fb22d
3
+ size 4837274834
soundctm_ckpt/030000/summary.jsonl ADDED
@@ -0,0 +1 @@
 
 
1
+ {"seed": 5031, "tango": true, "train_file": "/data/audiocaps/train.csv", "validation_file": "/data/audiocaps/val.csv", "num_examples": -1, "text_encoder_name": "google/flan-t5-large", "unet_model_config": "configs/diffusion_model_config.json", "ctm_unet_model_config": "configs/diffusion_model_config.json", "freeze_text_encoder": true, "text_column": "caption", "audio_column": "file_name", "tango_data_augment": true, "augment_num": 2, "uncond_prob": 0.1, "prefix": null, "per_device_train_batch_size": 6, "per_device_eval_batch_size": 2, "num_train_epochs": 40, "gradient_accumulation_steps": 1, "lr_scheduler_type": "linear", "d_lr_scheduler_type": "linear", "num_warmup_steps": 0, "d_num_warmup_steps": 0, "adam_beta1": 0.9, "adam_beta2": 0.999, "adam_epsilon": 1e-08, "output_dir": "/output/", "duration": 10.0, "checkpointing_steps": "best", "model_grad_clip_value": 1000.0, "disc_grad_clip_value": 1000.0, "sigma_data": 0.25, "resume_from_checkpoint": null, "generated_path": null, "valid_data_path": null, "mixed_precision": "bf16", "allow_tf32": false, "gradient_checkpointing": false, "enable_xformers_memory_efficient_attention": false, "with_tracking": true, "report_to": "wandb", "teacher_model_path": "ckpt/teacher/pytorch_model_2_sigma_025.bin", "stage1_path": "ckpt/audioldm-s-full.ckpt", "schedule_sampler": "uniform", "lr": 8e-05, "weight_decay": 0.0, "lr_anneal_steps": 0, "ema_rate": "0.999", "total_training_steps": 600000, "save_interval": 3000, "unet_mode": "full", "distill_steps_per_iter": 50000, "out_res": -1, "clip_denoised": false, "clip_output": false, "beta_min": 0.1, "beta_max": 20.0, "multiplier": 1.0, "load_optimizer": true, "num_channels": 128, "num_res_blocks": 2, "num_heads": 4, "num_heads_upsample": -1, "num_head_channels": -1, "attention_resolutions": "32,16,8", "channel_mult": "", "dropout": 0.0, "class_cond": false, "use_checkpoint": false, "use_scale_shift_norm": true, "resblock_updown": false, "use_new_attention_order": false, "learn_sigma": 
false, "out_channels": 8, "in_channels": 8, "deterministic": false, "time_continuous": false, "consistency_weight": 1.0, "loss_norm": "feature_space", "loss_distance": "l2", "loss_domain": "latent", "weight_schedule": "uniform", "parametrization": "euler", "inner_parametrization": "edm", "num_heun_step": 39, "num_heun_step_random": true, "teacher_dropout": 0.1, "training_mode": "ctm", "match_point": "zs", "target_ema_mode": "fixed", "scale_mode": "fixed", "start_ema": 0.999, "start_scales": 40, "end_scales": 40, "sigma_min": 0.002, "sigma_max": 80.0, "rho": 7, "latent_channels": 8, "latent_f_size": 16, "latent_t_size": 256, "cfg_distill": false, "target_cfg": 3.0, "unform_sampled_cfg_distill": true, "w_min": 2.0, "w_max": 5.0, "diffusion_training": true, "denoising_weight": 1.0, "diffusion_mult": 0.7, "diffusion_schedule_sampler": "halflognormal", "apply_adaptive_weight": true, "dsm_loss_target": "z_0", "diffusion_weight_schedule": "karras_weight", "cm_ratio": 0.0, "augment": false, "intermediate_samples": false, "compute_ema_fads": true, "sampling_steps": 18, "ref_path": "", "large_log": false, "discriminator_training": true, "discriminator_input": "latent", "gan_target": "z_target", "sample_s_strategy": "uniform", "heun_step_strategy": "weighted", "heun_step_multiplier": 1.0, "auxiliary_type": "stop_grad", "gan_estimate_type": "same", "discriminator_fix": false, "discriminator_free_target": false, "d_apply_adaptive_weight": true, "discriminator_start_itr": 39000, "discriminator_weight": 1.0, "d_lr": 8e-05, "r1_reg_enable": false, "reg_gamma": 2.0, "d_architecture": "CMBDisc", "dac_dis_rates": [], "dac_dis_periods": [2, 3, 5, 7, 11], "dac_dis_fft_sizes": [1024, 512, 256, 128], "dac_dis_sample_rate": 16000, "dac_dis_bands": [[0.0, 0.25], [0.25, 0.5], [0.5, 0.75], [0.75, 1.0]], "d_cond_type": "text_encoder", "c_dim": 1024, "cmap_dim": 128, "vqgan_ndf": 64, "vqgan_n_layers": 1, "vqgan_use_spectral_norm": false, "mbdisc_ndf": 32, "n_bins": 64, "increase_ch": false, 
"fm_apply_adaptive_weight": true, "fm_weight": 2.0}