Initial upload of GPT_XTTS_V2 model files

Browse files

Files changed (9) hide show

best_model_14510.pth +3 -0
config.json +194 -0
dvae.pth +3 -0
events.out.tfevents.1734889439.9cc9878929c5.1257.0 +3 -0
mel_stats.pth +3 -0
model.pth +3 -0
train_gpt_xtts.py +238 -0
trainer_0_log.txt +0 -0
vocab.json +0 -0

best_model_14510.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b6a9a504f1d009969a2576f95f13dc51e5a97b4dbc4224163b2014d8c7ef3c89
+size 5780141753

config.json ADDED Viewed

	@@ -0,0 +1,194 @@

+{
+    "output_path": "/checkpoints/",
+    "logger_uri": null,
+    "run_name": "GPT_XTTS_FT",
+    "project_name": "XTTS_trainer",
+    "run_description": [
+        "\n        GPT XTTS training\n        "
+    ],
+    "print_step": 50,
+    "plot_step": 100,
+    "model_param_stats": false,
+    "wandb_entity": null,
+    "dashboard_logger": "tensorboard",
+    "save_on_interrupt": true,
+    "log_model_step": 100,
+    "save_step": 20000,
+    "save_n_checkpoints": 1,
+    "save_checkpoints": true,
+    "save_all_best": false,
+    "save_best_after": 10000,
+    "target_loss": null,
+    "print_eval": false,
+    "test_delay_epochs": 0,
+    "run_eval": true,
+    "run_eval_steps": null,
+    "distributed_backend": "nccl",
+    "distributed_url": "tcp://localhost:54321",
+    "mixed_precision": false,
+    "precision": "fp16",
+    "epochs": 5,
+    "batch_size": 16,
+    "eval_batch_size": 16,
+    "grad_clip": 0.0,
+    "scheduler_after_epoch": true,
+    "lr": 5e-06,
+    "optimizer": "AdamW",
+    "optimizer_params": {
+        "betas": [
+            0.9,
+            0.96
+        ],
+        "eps": 1e-08,
+        "weight_decay": 0.01
+    },
+    "lr_scheduler": "MultiStepLR",
+    "lr_scheduler_params": {
+        "milestones": [
+            60000,
+            120000,
+            180000
+        ],
+        "gamma": 0.5,
+        "last_epoch": -1
+    },
+    "use_grad_scaler": false,
+    "allow_tf32": false,
+    "cudnn_enable": true,
+    "cudnn_deterministic": false,
+    "cudnn_benchmark": false,
+    "training_seed": 54321,
+    "model": "xtts",
+    "num_loader_workers": 4,
+    "num_eval_loader_workers": 0,
+    "use_noise_augment": false,
+    "audio": {
+        "sample_rate": 22050,
+        "output_sample_rate": 24000,
+        "dvae_sample_rate": 22050
+    },
+    "use_phonemes": false,
+    "phonemizer": null,
+    "phoneme_language": null,
+    "compute_input_seq_cache": false,
+    "text_cleaner": null,
+    "enable_eos_bos_chars": false,
+    "test_sentences_file": "",
+    "phoneme_cache_path": null,
+    "characters": null,
+    "add_blank": false,
+    "batch_group_size": 0,
+    "loss_masking": null,
+    "min_audio_len": 1,
+    "max_audio_len": Infinity,
+    "min_text_len": 1,
+    "max_text_len": Infinity,
+    "compute_f0": false,
+    "compute_energy": false,
+    "compute_linear_spec": false,
+    "precompute_num_workers": 0,
+    "start_by_longest": false,
+    "shuffle": false,
+    "drop_last": false,
+    "datasets": [
+        {
+            "formatter": "",
+            "dataset_name": "",
+            "path": "",
+            "meta_file_train": "",
+            "ignored_speakers": null,
+            "language": "",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        }
+    ],
+    "test_sentences": [],
+    "eval_split_max_size": 256,
+    "eval_split_size": 0.01,
+    "use_speaker_weighted_sampler": false,
+    "speaker_weighted_sampler_alpha": 1.0,
+    "use_language_weighted_sampler": false,
+    "language_weighted_sampler_alpha": 1.0,
+    "use_length_weighted_sampler": false,
+    "length_weighted_sampler_alpha": 1.0,
+    "model_args": {
+        "gpt_batch_size": 1,
+        "enable_redaction": false,
+        "kv_cache": true,
+        "gpt_checkpoint": "",
+        "clvp_checkpoint": null,
+        "decoder_checkpoint": null,
+        "num_chars": 255,
+        "tokenizer_file": "/checkpoints/XTTS_v2.0_original_model_files/vocab.json",
+        "gpt_max_audio_tokens": 605,
+        "gpt_max_text_tokens": 402,
+        "gpt_max_prompt_tokens": 70,
+        "gpt_layers": 30,
+        "gpt_n_model_channels": 1024,
+        "gpt_n_heads": 16,
+        "gpt_number_text_tokens": 13685,
+        "gpt_start_text_token": 261,
+        "gpt_stop_text_token": 0,
+        "gpt_num_audio_tokens": 1026,
+        "gpt_start_audio_token": 1024,
+        "gpt_stop_audio_token": 1025,
+        "gpt_code_stride_len": 1024,
+        "gpt_use_masking_gt_prompt_approach": true,
+        "gpt_use_perceiver_resampler": true,
+        "input_sample_rate": 22050,
+        "output_sample_rate": 24000,
+        "output_hop_length": 256,
+        "decoder_input_dim": 1024,
+        "d_vector_dim": 512,
+        "cond_d_vector_in_each_upsampling_layer": true,
+        "duration_const": 102400,
+        "min_conditioning_length": 88200,
+        "max_conditioning_length": 264600,
+        "gpt_loss_text_ce_weight": 0.01,
+        "gpt_loss_mel_ce_weight": 1.0,
+        "debug_loading_failures": false,
+        "max_wav_length": 5000750,
+        "max_text_length": 500,
+        "mel_norm_file": "/checkpoints/XTTS_v2.0_original_model_files/mel_stats.pth",
+        "dvae_checkpoint": "/checkpoints/XTTS_v2.0_original_model_files/dvae.pth",
+        "xtts_checkpoint": "/checkpoints/XTTS_v2.0_original_model_files/model.pth",
+        "vocoder": ""
+    },
+    "model_dir": null,
+    "languages": [
+        "en",
+        "es",
+        "fr",
+        "de",
+        "it",
+        "pt",
+        "pl",
+        "tr",
+        "ru",
+        "nl",
+        "cs",
+        "ar",
+        "zh-cn",
+        "hu",
+        "ko",
+        "ja",
+        "hi",
+        "be",
+        "be"
+    ],
+    "temperature": 0.75,
+    "length_penalty": 1.0,
+    "repetition_penalty": 5.0,
+    "top_k": 50,
+    "top_p": 0.85,
+    "num_gpt_outputs": 1,
+    "gpt_cond_len": 30,
+    "gpt_cond_chunk_len": 4,
+    "max_ref_len": 30,
+    "sound_norm_refs": false,
+    "optimizer_wd_only_on_weights": true,
+    "weighted_loss_attrs": null,
+    "weighted_loss_multipliers": null,
+    "github_branch": "inside_docker"
+}

dvae.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b29bc227d410d4991e0a8c09b858f77415013eeb9fba9650258e96095557d97a
+size 210514388

events.out.tfevents.1734889439.9cc9878929c5.1257.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:19e2f392eb35eb10849cf470ded95ce91c4cd9251ac48434ec49d50752148073
+size 74527

mel_stats.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1f69422a8a8f344c4fca2f0c6b8d41d2151d6615b7321e48e6bb15ae949b119c
+size 1067

model.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b6a9a504f1d009969a2576f95f13dc51e5a97b4dbc4224163b2014d8c7ef3c89
+size 5780141753

train_gpt_xtts.py ADDED Viewed

	@@ -0,0 +1,238 @@

+import os
+import gc
+from trainer import Trainer, TrainerArgs
+from TTS.config.shared_configs import BaseDatasetConfig
+from TTS.tts.datasets import load_tts_samples
+from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig, XttsAudioConfig
+from TTS.utils.manage import ModelManager
+from dataclasses import dataclass, field
+from typing import Optional
+from transformers import HfArgumentParser
+import argparse
+def create_xtts_trainer_parser():
+    parser = argparse.ArgumentParser(description="Arguments for XTTS Trainer")
+    parser.add_argument("--output_path", type=str, required=True,
+                        help="Path to pretrained + checkpoint model")
+    parser.add_argument("--metadatas", nargs='+', type=str, required=True,
+                        help="train_csv_path,eval_csv_path,language")
+    parser.add_argument("--num_epochs", type=int, default=1,
+                        help="Number of epochs")
+    parser.add_argument("--batch_size", type=int, default=1,
+                        help="Mini batch size")
+    parser.add_argument("--grad_acumm", type=int, default=1,
+                        help="Grad accumulation steps")
+    parser.add_argument("--max_audio_length", type=int, default=255995,
+                        help="Max audio length")
+    parser.add_argument("--max_text_length", type=int, default=200,
+                        help="Max text length")
+    parser.add_argument("--weight_decay", type=float, default=1e-2,
+                        help="Weight decay")
+    parser.add_argument("--lr", type=float, default=5e-6,
+                        help="Learning rate")
+    parser.add_argument("--save_step", type=int, default=5000,
+                        help="Save step")
+    return parser
+def train_gpt(metadatas, num_epochs, batch_size, grad_acumm, output_path, max_audio_length, max_text_length, lr, weight_decay, save_step):
+    #  Logging parameters
+    RUN_NAME = "GPT_XTTS_FT"
+    PROJECT_NAME = "XTTS_trainer"
+    DASHBOARD_LOGGER = "tensorboard"
+    LOGGER_URI = None
+    # Set here the path that the checkpoints will be saved. Default: ./run/training/
+    # OUT_PATH = os.path.join(output_path, "run", "training")
+    OUT_PATH = output_path
+    # Training Parameters
+    OPTIMIZER_WD_ONLY_ON_WEIGHTS = True  # for multi-gpu training please make it False
+    START_WITH_EVAL = False  # if True it will star with evaluation
+    BATCH_SIZE = batch_size  # set here the batch size
+    GRAD_ACUMM_STEPS = grad_acumm  # set here the grad accumulation steps
+    # Define here the dataset that you want to use for the fine-tuning on.
+    DATASETS_CONFIG_LIST = []
+    for metadata in metadatas:
+        train_csv, eval_csv, language = metadata.split(",")
+        print(train_csv, eval_csv, language)
+        config_dataset = BaseDatasetConfig(
+            formatter="coqui",
+            dataset_name="ft_dataset",
+            path=os.path.dirname(train_csv),
+            meta_file_train=os.path.basename(train_csv),
+            meta_file_val=os.path.basename(eval_csv),
+            language=language,
+        )
+        DATASETS_CONFIG_LIST.append(config_dataset)
+    # Define the path where XTTS v2.0.1 files will be downloaded
+    CHECKPOINTS_OUT_PATH = os.path.join(OUT_PATH, "XTTS_v2.0_original_model_files/")
+    os.makedirs(CHECKPOINTS_OUT_PATH, exist_ok=True)
+    # DVAE files
+    DVAE_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/dvae.pth"
+    MEL_NORM_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/mel_stats.pth"
+    # Set the path to the downloaded files
+    DVAE_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(DVAE_CHECKPOINT_LINK))
+    MEL_NORM_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(MEL_NORM_LINK))
+    # download DVAE files if needed
+    if not os.path.isfile(DVAE_CHECKPOINT) or not os.path.isfile(MEL_NORM_FILE):
+        print(" > Downloading DVAE files!")
+        ModelManager._download_model_files([MEL_NORM_LINK, DVAE_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True)
+    # Download XTTS v2.0 checkpoint if needed
+    TOKENIZER_FILE_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json"
+    XTTS_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth"
+    XTTS_CONFIG_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/config.json"
+    # XTTS transfer learning parameters: You we need to provide the paths of XTTS model checkpoint that you want to do the fine tuning.
+    TOKENIZER_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(TOKENIZER_FILE_LINK))  # vocab.json file
+    XTTS_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(XTTS_CHECKPOINT_LINK))  # model.pth file
+    XTTS_CONFIG_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(XTTS_CONFIG_LINK))  # config.json file
+    # download XTTS v2.0 files if needed
+    if not os.path.isfile(TOKENIZER_FILE):
+        print(" > Downloading XTTS v2.0 tokenizer!")
+        ModelManager._download_model_files(
+            [TOKENIZER_FILE_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True
+        )
+    if not os.path.isfile(XTTS_CHECKPOINT):
+        print(" > Downloading XTTS v2.0 checkpoint!")
+        ModelManager._download_model_files(
+            [XTTS_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True
+        )
+    if not os.path.isfile(XTTS_CONFIG_FILE):
+        print(" > Downloading XTTS v2.0 config!")
+        ModelManager._download_model_files(
+            [XTTS_CONFIG_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True
+        )
+    # init args and config
+    model_args = GPTArgs(
+        max_conditioning_length=264600,  # 12 secs
+        min_conditioning_length=88200,  # 4 secs
+        debug_loading_failures=False,
+        max_wav_length=max_audio_length,  # ~11.6 seconds
+        max_text_length=max_text_length,
+        mel_norm_file=MEL_NORM_FILE,
+        dvae_checkpoint=DVAE_CHECKPOINT,
+        xtts_checkpoint=XTTS_CHECKPOINT,  # checkpoint path of the model that you want to fine-tune
+        tokenizer_file=TOKENIZER_FILE,
+        gpt_num_audio_tokens=1026,
+        gpt_start_audio_token=1024,
+        gpt_stop_audio_token=1025,
+        gpt_use_masking_gt_prompt_approach=True,
+        gpt_use_perceiver_resampler=True,
+    )
+    # define audio config
+    audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000)
+    # training parameters config
+    config = GPTTrainerConfig()
+    config.load_json(XTTS_CONFIG_FILE)
+    config.epochs = num_epochs
+    config.output_path = OUT_PATH
+    config.model_args = model_args
+    config.run_name = RUN_NAME
+    config.project_name = PROJECT_NAME
+    config.run_description = """
+        GPT XTTS training
+        """,
+    config.dashboard_logger = DASHBOARD_LOGGER
+    config.logger_uri = LOGGER_URI
+    config.audio = audio_config
+    config.batch_size = BATCH_SIZE
+    config.num_loader_workers = 4
+    config.eval_split_max_size = 256
+    config.print_step = 50
+    config.plot_step = 100
+    config.log_model_step = 100
+    config.save_step = save_step
+    config.save_n_checkpoints = 1
+    config.save_checkpoints = True
+    config.print_eval = False
+    config.optimizer = "AdamW"
+    config.optimizer_wd_only_on_weights = OPTIMIZER_WD_ONLY_ON_WEIGHTS
+    config.optimizer_params = {"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": weight_decay}
+    config.lr = lr
+    config.lr_scheduler = "MultiStepLR"
+    config.lr_scheduler_params = {"milestones": [
+        save_step * 3, save_step * 3 * 2, save_step * 3 * 3], "gamma": 0.5, "last_epoch": -1}
+    config.test_sentences = []
+    # init the model from config
+    model = GPTTrainer.init_from_config(config)
+    # load training samples
+    train_samples, eval_samples = load_tts_samples(
+        DATASETS_CONFIG_LIST,
+        eval_split=True,
+        eval_split_max_size=config.eval_split_max_size,
+        eval_split_size=config.eval_split_size,
+    )
+    # init the trainer and 🚀
+    trainer = Trainer(
+        TrainerArgs(
+            restore_path=None,  # xtts checkpoint is restored via xtts_checkpoint key so no need of restore it using Trainer restore_path parameter
+            skip_train_epoch=False,
+            start_with_eval=START_WITH_EVAL,
+            grad_accum_steps=GRAD_ACUMM_STEPS
+        ),
+        config,
+        #output_path=os.path.join(output_path, "run", "training"),
+        output_path=os.path.join(output_path),
+        model=model,
+        train_samples=train_samples,
+        eval_samples=eval_samples,
+    )
+    trainer.fit()
+    # get the longest text audio file to use as speaker reference
+    samples_len = [len(item["text"].split(" ")) for item in train_samples]
+    longest_text_idx =  samples_len.index(max(samples_len))
+    speaker_ref = train_samples[longest_text_idx]["audio_file"]
+    trainer_out_path = trainer.output_path
+    # deallocate VRAM and RAM
+    del model, trainer, train_samples, eval_samples
+    gc.collect()
+    return trainer_out_path
+if __name__ == "__main__":
+    parser = create_xtts_trainer_parser()
+    args = parser.parse_args()
+    trainer_out_path = train_gpt(
+        metadatas=args.metadatas,
+        output_path=args.output_path,
+        num_epochs=args.num_epochs,
+        batch_size=args.batch_size,
+        grad_acumm=args.grad_acumm,
+        weight_decay=args.weight_decay,
+        lr=args.lr,
+        max_text_length=args.max_text_length,
+        max_audio_length=args.max_audio_length,
+        save_step=args.save_step
+    )
+    print(f"Checkpoint saved in dir: {trainer_out_path}")

trainer_0_log.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff