from librosa.filters import mel as librosa_mel_fn
import torch


class AudioProcessor:
    r"""A class used to process audio signals and convert them into different representations.

    Attributes:
        hann_window (dict): A dictionary to store the Hann window for different configurations.
        mel_basis (dict): A dictionary to store the Mel basis for different configurations.

    Methods:
        name_mel_basis(spec, n_fft, fmax): Generate a name for the Mel basis based on the FFT size, maximum frequency, data type, and device.
        amp_to_db(magnitudes, C=1, clip_val=1e-5): Convert amplitude to decibels (dB).
        db_to_amp(magnitudes, C=1): Convert decibels (dB) to amplitude.
        wav_to_spec(y, n_fft, hop_length, win_length, center=False): Convert a waveform to a spectrogram and compute the magnitude.
        wav_to_energy(y, n_fft, hop_length, win_length, center=False): Convert a waveform to a spectrogram and compute the energy.
        spec_to_mel(spec, n_fft, num_mels, sample_rate, fmin, fmax): Convert a spectrogram to a Mel spectrogram.
        wav_to_mel(y, n_fft, num_mels, sample_rate, hop_length, win_length, fmin, fmax, center=False): Convert a waveform to a Mel spectrogram.
    """

    def __init__(self):
        self.hann_window = {}
        self.mel_basis = {}

    @staticmethod
    def name_mel_basis(spec: torch.Tensor, n_fft: int, fmax: int) -> str:
        """Generate a name for the Mel basis based on the FFT size, maximum frequency, data type, and device.

        Args:
            spec (torch.Tensor): The spectrogram tensor.
            n_fft (int): The FFT size.
            fmax (int): The maximum frequency.

        Returns:
            str: The generated name for the Mel basis.
        """
        n_fft_len = f"{n_fft}_{fmax}_{spec.dtype}_{spec.device}"
        return n_fft_len
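
    # Example key (illustrative values, not prescribed by this class): for
    # n_fft=1024 and fmax=8000 on a float32 CPU tensor, name_mel_basis
    # returns "1024_8000_torch.float32_cpu".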

    @staticmethod
    def amp_to_db(magnitudes: torch.Tensor, C: int = 1, clip_val: float = 1e-5) -> torch.Tensor:
        r"""Convert amplitude to decibels (dB).

        Args:
            magnitudes (Tensor): The amplitude magnitudes to convert.
            C (int, optional): A constant value used in the conversion. Defaults to 1.
            clip_val (float, optional): A value to clamp the magnitudes to avoid taking the log of zero. Defaults to 1e-5.

        Returns:
            Tensor: The converted magnitudes in dB.
        """
        return torch.log(torch.clamp(magnitudes, min=clip_val) * C)
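
    # Note: despite the "dB" naming, amp_to_db/db_to_amp use the natural log
    # (a dynamic-range-compression convention common in TTS code) rather than
    # the conventional 20 * log10 definition. The two are exact inverses:
    #   db_to_amp(amp_to_db(x)) == exp(log(clamp(x, min=clip_val) * C)) / C
    #                           == clamp(x, min=clip_val)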

    @staticmethod
    def db_to_amp(magnitudes: torch.Tensor, C: int = 1) -> torch.Tensor:
        r"""Convert decibels (dB) to amplitude.

        Args:
            magnitudes (Tensor): The dB magnitudes to convert.
            C (int, optional): A constant value used in the conversion. Defaults to 1.

        Returns:
            Tensor: The converted magnitudes in amplitude.
        """
        return torch.exp(magnitudes) / C

    def wav_to_spec(
        self,
        y: torch.Tensor,
        n_fft: int,
        hop_length: int,
        win_length: int,
        center: bool = False,
    ) -> torch.Tensor:
        r"""Convert a waveform to a spectrogram and compute the magnitude.

        Args:
            y (Tensor): The input waveform.
            n_fft (int): The FFT size.
            hop_length (int): The hop (stride) size.
            win_length (int): The window size.
            center (bool, optional): Whether to pad `y` such that frames are centered. Defaults to False.

        Returns:
            Tensor: The magnitude of the computed spectrogram.
        """
        y = y.squeeze(1)

        # Cache the Hann window per (win_length, dtype, device) configuration.
        dtype_device = str(y.dtype) + "_" + str(y.device)
        wnsize_dtype_device = str(win_length) + "_" + dtype_device
        if wnsize_dtype_device not in self.hann_window:
            self.hann_window[wnsize_dtype_device] = torch.hann_window(win_length).to(dtype=y.dtype, device=y.device)

        # Reflect-pad the signal so that, with center=False, the frames still
        # cover the whole waveform.
        y = torch.nn.functional.pad(
            y.unsqueeze(1),
            (int((n_fft - hop_length) / 2), int((n_fft - hop_length) / 2)),
            mode="reflect",
        )
        y = y.squeeze(1)

        spec = torch.stft(
            y,
            n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window=self.hann_window[wnsize_dtype_device],
            center=center,
            pad_mode="reflect",
            normalized=False,
            onesided=True,
            return_complex=True,
        )
        spec = torch.view_as_real(spec)

        # Compute the magnitude, with a small epsilon for numerical stability.
        spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
        return spec
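
    # Shape sketch (illustrative values, not prescribed by this class): with
    # center=False, the manual reflect padding of (n_fft - hop_length) / 2
    # samples per side makes the frame count come out to T // hop_length for
    # an input of length T. For example, a [1, 1, 22050] waveform with
    # n_fft=1024 and hop_length=256 yields a [1, 513, 86] magnitude
    # spectrogram (n_fft // 2 + 1 = 513 frequency bins, 22050 // 256 = 86
    # frames).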

    def wav_to_energy(
        self,
        y: torch.Tensor,
        n_fft: int,
        hop_length: int,
        win_length: int,
        center: bool = False,
    ) -> torch.Tensor:
        r"""Convert a waveform to a spectrogram and compute the energy.

        Args:
            y (Tensor): The input waveform.
            n_fft (int): The FFT size.
            hop_length (int): The hop (stride) size.
            win_length (int): The window size.
            center (bool, optional): Whether to pad `y` such that frames are centered. Defaults to False.

        Returns:
            Tensor: The energy of the computed spectrogram.
        """
        spec = self.wav_to_spec(y, n_fft, hop_length, win_length, center=center)

        # Per-frame energy: the L2 norm across frequency bins.
        spec = torch.norm(spec, dim=1, keepdim=True).squeeze(0)

        # Normalize the energy to zero mean and unit variance.
        return (spec - spec.mean()) / spec.std()
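
    # Shape sketch (illustrative): for a [1, 1, 22050] waveform with
    # n_fft=1024 and hop_length=256, wav_to_energy returns a z-scored
    # [1, 86] energy sequence; note that squeeze(0) drops the batch
    # dimension only when B == 1.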

    def spec_to_mel(
        self,
        spec: torch.Tensor,
        n_fft: int,
        num_mels: int,
        sample_rate: int,
        fmin: int,
        fmax: int,
    ) -> torch.Tensor:
        r"""Convert a spectrogram to a Mel spectrogram.

        Args:
            spec (torch.Tensor): The input spectrogram of shape [B, C, T].
            n_fft (int): The FFT size.
            num_mels (int): The number of Mel bands.
            sample_rate (int): The sample rate of the audio.
            fmin (int): The minimum frequency.
            fmax (int): The maximum frequency.

        Returns:
            torch.Tensor: The computed Mel spectrogram of shape [B, num_mels, T].
        """
        # Cache the Mel filter bank per (n_fft, fmax, dtype, device) configuration.
        mel_basis_key = self.name_mel_basis(spec, n_fft, fmax)
        if mel_basis_key not in self.mel_basis:
            mel = librosa_mel_fn(sr=sample_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
            self.mel_basis[mel_basis_key] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device)

        # Project the linear-frequency bins onto the Mel scale and compress.
        mel = torch.matmul(self.mel_basis[mel_basis_key], spec)
        mel = self.amp_to_db(mel)
        return mel
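
    # Note: the cache key from name_mel_basis covers n_fft, fmax, dtype and
    # device, but not num_mels, fmin, or sample_rate, so calls that differ
    # only in those parameters would reuse the first cached filter bank.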

    def wav_to_mel(
        self,
        y: torch.Tensor,
        n_fft: int,
        num_mels: int,
        sample_rate: int,
        hop_length: int,
        win_length: int,
        fmin: int,
        fmax: int,
        center: bool = False,
    ) -> torch.Tensor:
        r"""Convert a waveform to a Mel spectrogram.

        Args:
            y (torch.Tensor): The input waveform.
            n_fft (int): The FFT size.
            num_mels (int): The number of Mel bands.
            sample_rate (int): The sample rate of the audio.
            hop_length (int): The hop (stride) size.
            win_length (int): The window size.
            fmin (int): The minimum frequency.
            fmax (int): The maximum frequency.
            center (bool, optional): Whether to pad `y` such that frames are centered. Defaults to False.

        Returns:
            torch.Tensor: The computed Mel spectrogram.
        """
        # Convert the waveform to a spectrogram
        spec = self.wav_to_spec(y, n_fft, hop_length, win_length, center=center)
        # Convert the spectrogram to a Mel spectrogram
        mel = self.spec_to_mel(spec, n_fft, num_mels, sample_rate, fmin, fmax)
        return mel
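

# A minimal usage sketch, not part of the original module: it builds one
# processor and runs a one-second random signal through wav_to_mel. The
# parameter values (22050 Hz, n_fft=1024, 80 Mel bands, etc.) are
# illustrative assumptions, not values prescribed by this class.
if __name__ == "__main__":
    processor = AudioProcessor()
    wav = torch.randn(1, 1, 22050)  # [B, 1, T]: one second of noise at 22.05 kHz
    mel = processor.wav_to_mel(
        wav,
        n_fft=1024,
        num_mels=80,
        sample_rate=22050,
        hop_length=256,
        win_length=1024,
        fmin=0,
        fmax=8000,
    )
    print(mel.shape)  # expected: torch.Size([1, 80, 86])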