Spaces:
Running
Running
from typing import List | |
from lightning.pytorch.core import LightningModule | |
import torch | |
from torch import Tensor | |
from torch.optim import AdamW | |
from torch.optim.lr_scheduler import ExponentialLR | |
from torch.utils.data import DataLoader | |
from models.config import ( | |
AcousticFinetuningConfig, | |
AcousticModelConfigType, | |
AcousticMultilingualModelConfig, | |
AcousticPretrainingConfig, | |
AcousticTrainingConfig, | |
PreprocessingConfig, | |
get_lang_map, | |
lang2id, | |
) | |
from models.helpers.tools import get_mask_from_lengths | |
from training.datasets.hifi_libri_dataset import ( | |
speakers_hifi_ids, | |
speakers_libri_ids, | |
train_dataloader, | |
) | |
from training.loss import FastSpeech2LossGen | |
from training.preprocess.normalize_text import NormalizeText | |
# Updated version of the tokenizer | |
from training.preprocess.tokenizer_ipa_espeak import TokenizerIpaEspeak as TokenizerIPA | |
from .acoustic_model import AcousticModel | |
# Logging cadences (in optimizer steps) for spectrogram/audio artifacts.
# NOTE(review): neither constant is referenced in this file — presumably
# consumed by logging callbacks elsewhere; confirm before removing.
MEL_SPEC_EVERY_N_STEPS = 1000
AUDIO_EVERY_N_STEPS = 100
class DelightfulTTS(LightningModule):
    r"""Lightning trainer for the acoustic model.

    Args:
        preprocess_config (PreprocessingConfig): The preprocessing configuration.
        model_config (AcousticModelConfigType): The model configuration.
        fine_tuning (bool, optional): Whether to use fine-tuning mode or not. Defaults to False.
        bin_warmup (bool, optional): Whether to use binarization warmup for the loss or not. Defaults to True.
        lang (str): Language of the dataset.
        n_speakers (int): Number of speakers in the dataset.
        batch_size (int): The batch size.
    """

    def __init__(
        self,
        preprocess_config: PreprocessingConfig,
        model_config: AcousticModelConfigType = AcousticMultilingualModelConfig(),
        fine_tuning: bool = False,
        bin_warmup: bool = True,
        lang: str = "en",
        n_speakers: int = 5392,
        batch_size: int = 19,
    ):
        super().__init__()

        self.lang = lang
        self.lang_id = lang2id[self.lang]
        self.fine_tuning = fine_tuning
        self.batch_size = batch_size

        lang_map = get_lang_map(lang)
        normilize_text_lang = lang_map.nemo

        self.tokenizer = TokenizerIPA(lang)
        # NOTE(review): attribute name keeps the historical misspelling
        # ("normilize") since code outside this file may reference it.
        self.normilize_text = NormalizeText(normilize_text_lang)

        # Training config depends on the mode: fine-tuning vs. pretraining.
        self.train_config_acoustic: AcousticTrainingConfig
        if self.fine_tuning:
            self.train_config_acoustic = AcousticFinetuningConfig()
        else:
            self.train_config_acoustic = AcousticPretrainingConfig()

        self.preprocess_config = preprocess_config

        # TODO: fix the arguments!
        self.acoustic_model = AcousticModel(
            preprocess_config=self.preprocess_config,
            model_config=model_config,
            # NOTE: this parameter may be a hyperparameter that you can define
            # based on the demands
            n_speakers=n_speakers,
        )

        # NOTE: in case of training from 0 bin_warmup should be True!
        self.loss_acoustic = FastSpeech2LossGen(
            bin_warmup=bin_warmup,
        )

    def forward(
        self,
        text: str,
        speaker_idx: Tensor,
    ) -> Tensor:
        r"""Performs an inference forward pass through the AcousticModel.

        This code must be run only with the loaded weights from the checkpoint!

        Args:
            text (str): The input text.
            speaker_idx (Tensor): The index of the speaker.

        Returns:
            Tensor: The predicted mel spectrogram from the acoustic model
            (a vocoder such as HiFi-GAN is still needed to produce audio).
        """
        normalized_text = self.normilize_text(text)
        _, phones = self.tokenizer(normalized_text)

        # Phoneme ids as a (1, seq_len) int tensor on the same device as
        # the speaker index.
        x = torch.tensor(
            phones,
            dtype=torch.int,
            device=speaker_idx.device,
        ).unsqueeze(0)

        # Broadcast the single speaker / language id across the sequence.
        speakers = speaker_idx.repeat(x.shape[1]).unsqueeze(0)
        langs = (
            torch.tensor(
                [self.lang_id],
                dtype=torch.int,
                device=speaker_idx.device,
            )
            .repeat(x.shape[1])
            .unsqueeze(0)
        )

        mel_pred = self.acoustic_model.forward(
            x=x,
            speakers=speakers,
            langs=langs,
        )
        return mel_pred

    def training_step(self, batch: List, _: int):
        r"""Performs a training step for the acoustic model.

        Args:
            batch (List): The batch of data for training. The batch should contain:
                - ids: List of indexes.
                - raw_texts: Raw text inputs.
                - speakers: Speaker identities.
                - texts: Text inputs.
                - src_lens: Lengths of the source sequences.
                - mels: Mel spectrogram targets.
                - pitches: Pitch targets.
                - pitches_stat: Statistics of the pitches.
                - mel_lens: Lengths of the mel spectrograms.
                - langs: Language identities.
                - attn_priors: Prior attention weights.
                - wavs: Waveform targets.
                - energies: Energy targets.
            _ (int): Index of the batch (unused).

        Returns:
            Tensor: The total loss for the training step.
        """
        (
            _,
            _,
            speakers,
            texts,
            src_lens,
            mels,
            pitches,
            _,
            mel_lens,
            langs,
            attn_priors,
            _,
            energies,
        ) = batch

        outputs = self.acoustic_model.forward_train(
            x=texts,
            speakers=speakers,
            src_lens=src_lens,
            mels=mels,
            mel_lens=mel_lens,
            pitches=pitches,
            langs=langs,
            attn_priors=attn_priors,
            energies=energies,
        )

        y_pred = outputs["y_pred"]
        log_duration_prediction = outputs["log_duration_prediction"]
        p_prosody_ref = outputs["p_prosody_ref"]
        p_prosody_pred = outputs["p_prosody_pred"]
        pitch_prediction = outputs["pitch_prediction"]
        energy_pred = outputs["energy_pred"]
        energy_target = outputs["energy_target"]

        src_mask = get_mask_from_lengths(src_lens)
        mel_mask = get_mask_from_lengths(mel_lens)

        (
            total_loss,
            mel_loss,
            ssim_loss,
            duration_loss,
            u_prosody_loss,
            p_prosody_loss,
            pitch_loss,
            ctc_loss,
            bin_loss,
            energy_loss,
        ) = self.loss_acoustic.forward(
            src_masks=src_mask,
            mel_masks=mel_mask,
            mel_targets=mels,
            mel_predictions=y_pred,
            log_duration_predictions=log_duration_prediction,
            u_prosody_ref=outputs["u_prosody_ref"],
            u_prosody_pred=outputs["u_prosody_pred"],
            p_prosody_ref=p_prosody_ref,
            p_prosody_pred=p_prosody_pred,
            pitch_predictions=pitch_prediction,
            p_targets=outputs["pitch_target"],
            durations=outputs["attn_hard_dur"],
            attn_logprob=outputs["attn_logprob"],
            attn_soft=outputs["attn_soft"],
            attn_hard=outputs["attn_hard"],
            src_lens=src_lens,
            mel_lens=mel_lens,
            energy_pred=energy_pred,
            energy_target=energy_target,
            step=self.trainer.global_step,
        )

        # Log every loss component under the same settings; a single loop
        # replaces ten near-identical self.log(...) calls.
        named_losses = {
            "train_total_loss": total_loss,
            "train_mel_loss": mel_loss,
            "train_ssim_loss": ssim_loss,
            "train_duration_loss": duration_loss,
            "train_u_prosody_loss": u_prosody_loss,
            "train_p_prosody_loss": p_prosody_loss,
            "train_pitch_loss": pitch_loss,
            "train_ctc_loss": ctc_loss,
            "train_bin_loss": bin_loss,
            "train_energy_loss": energy_loss,
        }
        for name, value in named_losses.items():
            self.log(name, value, sync_dist=True, batch_size=self.batch_size)

        return total_loss

    def configure_optimizers(self):
        r"""Configures the optimizer used for training.

        Returns:
            dict: A dictionary with the AdamW optimizer and its exponential
            learning-rate scheduler for the acoustic model.
        """
        lr_decay = self.train_config_acoustic.optimizer_config.lr_decay
        default_lr = self.train_config_acoustic.optimizer_config.learning_rate

        # When resuming mid-run, start from the decayed LR the scheduler
        # would have reached at the current global step.
        init_lr = (
            default_lr
            if self.trainer.global_step == 0
            else default_lr * (lr_decay**self.trainer.global_step)
        )

        optimizer_acoustic = AdamW(
            self.acoustic_model.parameters(),
            lr=init_lr,
            betas=self.train_config_acoustic.optimizer_config.betas,
            eps=self.train_config_acoustic.optimizer_config.eps,
            weight_decay=self.train_config_acoustic.optimizer_config.weight_decay,
        )

        scheduler_acoustic = ExponentialLR(optimizer_acoustic, gamma=lr_decay)

        return {
            "optimizer": optimizer_acoustic,
            "lr_scheduler": scheduler_acoustic,
        }

    def train_dataloader(
        self,
        root: str = "datasets_cache",
        cache: bool = True,
        cache_dir: str = "/dev/shm",
        include_libri: bool = False,
        libri_speakers: List[str] = speakers_libri_ids,
        hifi_speakers: List[str] = speakers_hifi_ids,
    ) -> DataLoader:
        r"""Returns the training dataloader built on the HiFi/LibriTTS dataset.

        Args:
            root (str): The root directory of the dataset.
            cache (bool): Whether to cache the preprocessed data.
            cache_dir (str): The directory for the cache. Defaults to "/dev/shm".
            include_libri (bool): Whether to include the LibriTTS dataset or not.
            libri_speakers (List[str]): The list of LibriTTS speakers to include.
            hifi_speakers (List[str]): The list of HiFi-TTS speakers to include.

        Returns:
            DataLoader: The training dataloader.
        """
        return train_dataloader(
            batch_size=self.batch_size,
            num_workers=self.preprocess_config.workers,
            sampling_rate=self.preprocess_config.sampling_rate,
            root=root,
            cache=cache,
            cache_dir=cache_dir,
            lang=self.lang,
            include_libri=include_libri,
            libri_speakers=libri_speakers,
            hifi_speakers=hifi_speakers,
        )