|
import librosa |
|
from text_to_speech.utils.audio import librosa_wav2spec |
|
from text_to_speech.utils.commons.hparams import hparams |
|
import numpy as np |
|
|
|
REGISTERED_VOCODERS = {} |
|
|
|
|
|
def register_vocoder(name): |
|
def _f(cls): |
|
REGISTERED_VOCODERS[name] = cls |
|
return cls |
|
|
|
return _f |
|
|
|
|
|
def get_vocoder_cls(vocoder_name): |
|
return REGISTERED_VOCODERS.get(vocoder_name) |
|
|
|
|
|
class BaseVocoder: |
|
def spec2wav(self, mel): |
|
""" |
|
|
|
:param mel: [T, 80] |
|
:return: wav: [T'] |
|
""" |
|
|
|
raise NotImplementedError |
|
|
|
@staticmethod |
|
def wav2spec(wav_fn): |
|
""" |
|
|
|
:param wav_fn: str |
|
:return: wav, mel: [T, 80] |
|
""" |
|
wav_spec_dict = librosa_wav2spec(wav_fn, fft_size=hparams['fft_size'], |
|
hop_size=hparams['hop_size'], |
|
win_length=hparams['win_size'], |
|
num_mels=hparams['audio_num_mel_bins'], |
|
fmin=hparams['fmin'], |
|
fmax=hparams['fmax'], |
|
sample_rate=hparams['audio_sample_rate'], |
|
loud_norm=hparams['loud_norm']) |
|
wav = wav_spec_dict['wav'] |
|
mel = wav_spec_dict['mel'] |
|
return wav, mel |
|
|
|
@staticmethod |
|
def wav2mfcc(wav_fn): |
|
fft_size = hparams['fft_size'] |
|
hop_size = hparams['hop_size'] |
|
win_length = hparams['win_size'] |
|
sample_rate = hparams['audio_sample_rate'] |
|
wav, _ = librosa.core.load(wav_fn, sr=sample_rate) |
|
mfcc = librosa.feature.mfcc(y=wav, sr=sample_rate, n_mfcc=13, |
|
n_fft=fft_size, hop_length=hop_size, |
|
win_length=win_length, pad_mode="constant", power=1.0) |
|
mfcc_delta = librosa.feature.delta(mfcc, order=1) |
|
mfcc_delta_delta = librosa.feature.delta(mfcc, order=2) |
|
mfcc = np.concatenate([mfcc, mfcc_delta, mfcc_delta_delta]).T |
|
return mfcc |
|
|