Spaces:

wasmdashai
/

RunTasking

Running on Zero

App Files Files Community

RunTasking / VitsModelSplit /vits_models_only_decoder.py

wasmdashai

Upload vits_models_only_decoder.py

7654db0 verified 9 months ago

raw

history blame contribute delete

11.4 kB


	import numpy as np
	import torch
	from torch import nn
	import math
	from typing import Any, Callable, Optional, Tuple, Union
	from torch.cuda.amp import autocast, GradScaler

	from .vits_config import VitsConfig,VitsPreTrainedModel
	from .flow import VitsResidualCouplingBlock
	from .duration_predictor import VitsDurationPredictor, VitsStochasticDurationPredictor
	from .encoder import VitsTextEncoder
	from .decoder import VitsHifiGan
	from .posterior_encoder import VitsPosteriorEncoder
	from .discriminator import VitsDiscriminator
	from .vits_output import VitsModelOutput, VitsTrainingOutput


	class Vits_models_only_decoder(VitsPreTrainedModel):
	    """VITS model variant whose inference path stops after the reversed flow.

	    The constructor builds the text encoder, the flow, the HiFi-GAN decoder, a
	    duration predictor and the training-only posterior encoder and
	    discriminator. `_inference_forward` returns the masked flow output (named
	    `spectrogram` in the code); it does not call `self.decoder`.
	    """

	    def __init__(self, config: VitsConfig):
	        """Build every sub-module from `config` and run `post_init()`."""
	        super().__init__(config)

	        self.config = config
	        self.text_encoder = VitsTextEncoder(config)
	        self.flow = VitsResidualCouplingBlock(config)
	        self.decoder = VitsHifiGan(config)

	        if config.use_stochastic_duration_prediction:
	            self.duration_predictor = VitsStochasticDurationPredictor(config)
	        else:
	            self.duration_predictor = VitsDurationPredictor(config)

	        # `embed_speaker` only exists for multi-speaker configurations.
	        if config.num_speakers > 1:
	            self.embed_speaker = nn.Embedding(config.num_speakers, config.speaker_embedding_size)

	        # This is used only for training.
	        self.posterior_encoder = VitsPosteriorEncoder(config)
	        self.discriminator = VitsDiscriminator(config)

	        # These parameters control the synthesised speech properties
	        self.speaking_rate = config.speaking_rate
	        self.noise_scale = config.noise_scale
	        self.noise_scale_duration = config.noise_scale_duration
	        # `config.segment_size` divided by the hop length
	        # (presumably converts samples to frames -- confirm against VitsConfig).
	        self.segment_size = self.config.segment_size // self.config.hop_length

	        # Initialize weights and apply final processing
	        self.post_init()

	    # ....................................

	    def monotonic_align_max_path(self, log_likelihoods, mask):
	        """Return a 0/1 alignment path found by dynamic programming.

	        Args:
	            log_likelihoods: 3-D tensor indexed as `[batch, y, x]`.
	                NOTE: it is modified in place -- the forward pass accumulates
	                the best previous score into each visited cell.
	            mask: 3-D tensor; `mask.sum(1)[:, 0]` and `mask.sum(2)[:, 0]` give
	                the per-sample loop bounds used below.

	        Returns:
	            Tensor shaped like `log_likelihoods`, with ones on the selected
	            path and zeros elsewhere.
	        """
	        # used for training - awfully slow
	        # an alternative is proposed in examples/pytorch/text-to-speech/run_vits_finetuning.py
	        path = torch.zeros_like(log_likelihoods)

	        text_length_maxs = mask.sum(1)[:, 0]
	        latent_length_maxs = mask.sum(2)[:, 0]

	        # Backtracking starts from the last valid `x` position of each sample.
	        indexes = latent_length_maxs - 1

	        # Stand-in for "minus infinity" on transitions that are not allowed.
	        max_neg_val = -1e9

	        for batch_id in range(len(path)):
	            index = int(indexes[batch_id].item())
	            text_length_max = int(text_length_maxs[batch_id].item())
	            latent_length_max = int(latent_length_maxs[batch_id].item())

	            # Forward pass: accumulate the best score reaching every (y, x).
	            for y in range(text_length_max):
	                for x in range(max(0, latent_length_max + y - text_length_max), min(latent_length_max, y + 1)):
	                    if x == y:
	                        v_cur = max_neg_val
	                    else:
	                        v_cur = log_likelihoods[batch_id, y - 1, x]
	                    if x == 0:
	                        if y == 0:
	                            v_prev = 0.0
	                        else:
	                            v_prev = max_neg_val
	                    else:
	                        v_prev = log_likelihoods[batch_id, y - 1, x - 1]
	                    log_likelihoods[batch_id, y, x] += max(v_prev, v_cur)

	            # Backward pass: walk from the last row to the first, marking the path.
	            for y in range(text_length_max - 1, -1, -1):
	                path[batch_id, y, index] = 1
	                if index != 0 and (
	                    index == y or log_likelihoods[batch_id, y - 1, index] < log_likelihoods[batch_id, y - 1, index - 1]
	                ):
	                    index = index - 1
	        return path

	    # ....................................

	    def slice_segments(self, hidden_states, ids_str, segment_size=4):
	        """Gather a window of `segment_size` steps per sample along the last axis.

	        Args:
	            hidden_states: 3-D tensor `(batch, channels, length)`.
	            ids_str: per-sample start offsets (one value per batch element).
	            segment_size: number of consecutive positions to keep.

	        Returns:
	            Tensor of shape `(batch, channels, segment_size)`.
	        """
	        batch_size, channels, _ = hidden_states.shape
	        # 1d tensor containing the indices to keep
	        indices = torch.arange(segment_size).to(ids_str.device)
	        # extend the indices to match the shape of hidden_states
	        indices = indices.view(1, 1, -1).expand(batch_size, channels, -1)
	        # offset indices with ids_str
	        indices = indices + ids_str.view(-1, 1, 1)
	        # gather indices
	        output = torch.gather(hidden_states, dim=2, index=indices)

	        return output

	    # ....................................

	    def rand_slice_segments(self, hidden_states, sample_lengths=None, segment_size=4):
	        """Slice one randomly positioned segment per sample.

	        Args:
	            hidden_states: 3-D tensor `(batch, channels, length)`.
	            sample_lengths: valid length per sample; defaults to the full
	                length of `hidden_states` when `None`.
	            segment_size: number of consecutive positions to keep.

	        Returns:
	            Tuple `(segments, ids_str)` where `ids_str` holds the start offsets
	            that were drawn.
	        """
	        batch_size, _, seq_len = hidden_states.size()
	        if sample_lengths is None:
	            sample_lengths = seq_len
	        # Number of admissible start positions.
	        ids_str_max = sample_lengths - segment_size + 1
	        ids_str = (torch.rand([batch_size]).to(device=hidden_states.device) * ids_str_max).to(dtype=torch.long)
	        ret = self.slice_segments(hidden_states, ids_str, segment_size)

	        return ret, ids_str

	    # ....................................

	    def resize_speaker_embeddings(
	        self,
	        new_num_speakers: int,
	        speaker_embedding_size: Optional[int] = None,
	        pad_to_multiple_of: Optional[int] = 2,
	    ):
	        """Resize (or create) the speaker embedding and the conditioning layers.

	        Args:
	            new_num_speakers: requested number of speakers; rounded up to a
	                multiple of `pad_to_multiple_of` when that is not `None`.
	            speaker_embedding_size: width of the speaker embedding. Required
	                when the model has no speaker embedding yet
	                (`config.num_speakers <= 1`); otherwise defaults to the
	                current `config.speaker_embedding_size`.
	            pad_to_multiple_of: rounding granularity, or `None` to disable.

	        Raises:
	            ValueError: if the model has no previous speaker embedding and
	                `speaker_embedding_size` is not given.
	        """
	        if pad_to_multiple_of is not None:
	            new_num_speakers = ((new_num_speakers + pad_to_multiple_of - 1) // pad_to_multiple_of) * pad_to_multiple_of

	        # first, take care of embed_speaker
	        if self.config.num_speakers <= 1:
	            if speaker_embedding_size is None:
	                raise ValueError(
	                    "The current model had no previous speaker embedding, but `speaker_embedding_size` is not specified. Pass `speaker_embedding_size` to this method."
	                )
	            # create new embedding layer
	            new_embeddings = nn.Embedding(
	                new_num_speakers,
	                speaker_embedding_size,
	                device=self.device,
	            )
	            # initialize all new embeddings
	            self._init_weights(new_embeddings)
	        else:
	            if speaker_embedding_size is None:
	                # Keep the existing width: otherwise `None` would be handed to
	                # every sub-module below and written back into the config.
	                speaker_embedding_size = self.config.speaker_embedding_size
	            new_embeddings = self._get_resized_embeddings(self.embed_speaker, new_num_speakers)

	        self.embed_speaker = new_embeddings

	        # then take care of sub-models
	        self.flow.resize_speaker_embeddings(speaker_embedding_size)
	        for flow in self.flow.flows:
	            self._init_weights(flow.wavenet.cond_layer)

	        self.decoder.resize_speaker_embedding(speaker_embedding_size)
	        self._init_weights(self.decoder.cond)

	        self.duration_predictor.resize_speaker_embeddings(speaker_embedding_size)
	        self._init_weights(self.duration_predictor.cond)

	        self.posterior_encoder.resize_speaker_embeddings(speaker_embedding_size)
	        self._init_weights(self.posterior_encoder.wavenet.cond_layer)

	        self.config.num_speakers = new_num_speakers
	        self.config.speaker_embedding_size = speaker_embedding_size

	    # ....................................

	    def get_input_embeddings(self):
	        """Return the text encoder's input embeddings."""
	        return self.text_encoder.get_input_embeddings()

	    # ....................................

	    def set_input_embeddings(self, value):
	        """Replace the text encoder's input embeddings with `value`."""
	        self.text_encoder.set_input_embeddings(value)

	    # ....................................

	    def apply_weight_norm(self):
	        """Apply weight norm on the decoder, the flow and the posterior encoder."""
	        self.decoder.apply_weight_norm()
	        self.flow.apply_weight_norm()
	        self.posterior_encoder.apply_weight_norm()

	    # ....................................

	    def remove_weight_norm(self):
	        """Remove weight norm from the decoder, the flow and the posterior encoder."""
	        self.decoder.remove_weight_norm()
	        self.flow.remove_weight_norm()
	        self.posterior_encoder.remove_weight_norm()

	    # ....................................

	    def discriminate(self, hidden_states):
	        """Run the discriminator on `hidden_states` and return its output."""
	        return self.discriminator(hidden_states)

	    # ....................................

	    def get_encoder(self):
	        """Return the text encoder module."""
	        return self.text_encoder

	    # ....................................

	    def _inference_forward(
	        self,
	        input_ids: Optional[torch.Tensor] = None,
	        attention_mask: Optional[torch.Tensor] = None,
	        speaker_embeddings: Optional[torch.Tensor] = None,
	        output_attentions: Optional[bool] = None,
	        output_hidden_states: Optional[bool] = None,
	        return_dict: Optional[bool] = None,
	        padding_mask: Optional[torch.Tensor] = None,
	    ):
	        """Run text encoding, duration prediction and the reversed flow.

	        Args:
	            input_ids: token ids forwarded to the text encoder.
	            attention_mask: attention mask forwarded to the text encoder.
	            speaker_embeddings: optional conditioning passed to the duration
	                predictor and the flow.
	            output_attentions / output_hidden_states / return_dict: forwarded
	                to the text encoder; `return_dict` also selects tuple vs.
	                attribute access on its output.
	            padding_mask: mask that is transposed with `transpose(1, 2)` below,
	                so it must be 3-D (assumed `(batch, seq, 1)` -- confirm against
	                the text encoder). When omitted it is derived from
	                `attention_mask`, or from `input_ids` as an all-ones mask.

	        Returns:
	            The reversed-flow latents multiplied by the output padding mask.
	            The HiFi-GAN decoder is not applied here.

	        Raises:
	            ValueError: if `padding_mask`, `attention_mask` and `input_ids`
	                are all `None`.
	        """
	        if padding_mask is None:
	            # The signature defaults `padding_mask` to None, yet it is
	            # dereferenced unconditionally below; build it instead of crashing.
	            # NOTE(review): assumes `attention_mask` / `input_ids` are
	            # (batch, seq) and that a float mask is acceptable -- confirm.
	            if attention_mask is not None:
	                padding_mask = attention_mask.unsqueeze(-1).float()
	            elif input_ids is not None:
	                padding_mask = torch.ones_like(input_ids).unsqueeze(-1).float()
	            else:
	                raise ValueError("One of `padding_mask`, `attention_mask` or `input_ids` must be provided.")

	        text_encoder_output = self.text_encoder(
	            input_ids=input_ids,
	            padding_mask=padding_mask,
	            attention_mask=attention_mask,
	            output_attentions=output_attentions,
	            output_hidden_states=output_hidden_states,
	            return_dict=return_dict,
	        )
	        hidden_states = text_encoder_output[0] if not return_dict else text_encoder_output.last_hidden_state
	        hidden_states = hidden_states.transpose(1, 2)
	        input_padding_mask = padding_mask.transpose(1, 2)

	        prior_means = text_encoder_output[1] if not return_dict else text_encoder_output.prior_means
	        prior_log_variances = text_encoder_output[2] if not return_dict else text_encoder_output.prior_log_variances

	        if self.config.use_stochastic_duration_prediction:
	            log_duration = self.duration_predictor(
	                hidden_states,
	                input_padding_mask,
	                speaker_embeddings,
	                reverse=True,
	                noise_scale=self.noise_scale_duration,
	            )
	        else:
	            log_duration = self.duration_predictor(hidden_states, input_padding_mask, speaker_embeddings)

	        # A higher speaking rate shortens every predicted duration.
	        length_scale = 1.0 / self.speaking_rate
	        duration = torch.ceil(torch.exp(log_duration) * input_padding_mask * length_scale)
	        # Clamp so that every sample produces at least one output step.
	        predicted_lengths = torch.clamp_min(torch.sum(duration, [1, 2]), 1).long()

	        # Create a padding mask for the output lengths of shape (batch, 1, max_output_length)
	        indices = torch.arange(predicted_lengths.max(), dtype=predicted_lengths.dtype, device=predicted_lengths.device)
	        output_padding_mask = indices.unsqueeze(0) < predicted_lengths.unsqueeze(1)
	        output_padding_mask = output_padding_mask.unsqueeze(1).to(input_padding_mask.dtype)

	        # Reconstruct an attention tensor of shape (batch, 1, out_length, in_length)
	        attn_mask = torch.unsqueeze(input_padding_mask, 2) * torch.unsqueeze(output_padding_mask, -1)
	        batch_size, _, output_length, input_length = attn_mask.shape
	        cum_duration = torch.cumsum(duration, -1).view(batch_size * input_length, 1)
	        indices = torch.arange(output_length, dtype=duration.dtype, device=duration.device)
	        valid_indices = indices.unsqueeze(0) < cum_duration
	        valid_indices = valid_indices.to(attn_mask.dtype).view(batch_size, input_length, output_length)
	        # Subtract the row shifted by one input position, so each input keeps
	        # only the output steps that belong to its own duration.
	        padded_indices = valid_indices - nn.functional.pad(valid_indices, [0, 0, 1, 0, 0, 0])[:, :-1]
	        attn = padded_indices.unsqueeze(1).transpose(2, 3) * attn_mask

	        # Expand prior distribution
	        prior_means = torch.matmul(attn.squeeze(1), prior_means).transpose(1, 2)
	        prior_log_variances = torch.matmul(attn.squeeze(1), prior_log_variances).transpose(1, 2)

	        prior_latents = prior_means + torch.randn_like(prior_means) * torch.exp(prior_log_variances) * self.noise_scale
	        latents = self.flow(prior_latents, output_padding_mask, speaker_embeddings, reverse=True)

	        spectrogram = latents * output_padding_mask
	        return spectrogram