# alibabasglab's picture
# Update scores/mcd.py
# 804519a verified
from basis import ScoreBasis
import librosa
import math
import numpy as np
import pyworld
import pysptk
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean
# Refer to: https://github.com/chenqi008/pymcd/blob/main/pymcd/mcd.py
class MCD(ScoreBasis):
    """Score object registered under the name 'MCD'.

    Wraps a ``Calculate_MCD`` toolbox configured for the "plain" mode and
    exposes it through ``windowed_scoring``.
    """

    def __init__(self):
        """Register the score name and build the plain-mode MCD toolbox."""
        super().__init__(name='MCD')
        # Calculate_MCD understands "plain", "dtw" and "dtw_sl"; this score
        # is wired to "plain".
        self.mcd_toolbox = Calculate_MCD(MCD_mode="plain")
        # Flag read by the surrounding scoring framework; its exact meaning
        # is defined by ScoreBasis, which is not visible here.
        self.intrusive = False

    def windowed_scoring(self, audios, score_rate):
        """Return the MCD for a pair of signals, or None without exactly two.

        :param audios: sequence of two waveforms; ``audios[1]`` is passed on
            as the reference and ``audios[0]`` as the synthesized signal
        :param score_rate: sampling rate forwarded to the MCD toolbox
        :return: the value produced by ``Calculate_MCD.calculate_mcd``, or
            ``None`` when ``audios`` does not hold exactly two items
        """
        if len(audios) == 2:
            synthesized, reference = audios[0], audios[1]
            return self.mcd_toolbox.calculate_mcd(reference, synthesized, score_rate)
        return None
# ================================================= #
# calculate the Mel-Cepstral Distortion (MCD) value #
# ================================================= #
# Refer to: https://github.com/chenqi008/pymcd/blob/main/pymcd/mcd.py
class Calculate_MCD(object):
    """Compute the Mel-Cepstral Distortion (MCD) between two waveforms.

    The alignment between reference and synthesized frames is selected by
    ``MCD_mode``:

    * ``"plain"``  -- the shorter waveform is zero-padded to the length of
      the longer one and frames are compared index by index;
    * ``"dtw"``    -- frames are aligned with FastDTW before comparison;
    * ``"dtw_sl"`` -- as ``"dtw"``, and the result is additionally multiplied
      by the ratio (longer / shorter) of the two frame counts.
    """

    # Alignment modes that average_mcd knows how to handle.
    SUPPORTED_MODES = ("plain", "dtw", "dtw_sl")

    def __init__(self, MCD_mode):
        """Store the alignment mode and the fixed analysis constants.

        :param MCD_mode: one of "plain", "dtw" or "dtw_sl"; it is checked
            when a score is computed, not here
        """
        super(Calculate_MCD, self).__init__()
        self.MCD_mode = MCD_mode
        # Passed to pyworld as ``frame_period`` (milliseconds per pyworld's
        # documentation).
        self.FRAME_PERIOD = 5.0
        # Scaling constant of the MCD formula: 10 / ln(10) * sqrt(2).
        self.log_spec_dB_const = 10.0 / math.log(10.0) * math.sqrt(2.0)  # 6.141851463713754

    def load_wav(self, wav_file, sample_rate):
        """Load a wav file as a mono signal with librosa.

        :param wav_file: path to the wav file
        :param sample_rate: sampling rate handed to ``librosa.load`` as ``sr``
        :return: audio time series as a numpy array
        """
        wav, _ = librosa.load(wav_file, sr=sample_rate, mono=True)
        return wav

    def log_spec_dB_dist(self, x, y):
        """Return the scaled Euclidean distance between two feature vectors.

        :param x: first feature vector
        :param y: second feature vector
        :return: ``log_spec_dB_const * ||x - y||``
        """
        diff = x - y
        return self.log_spec_dB_const * math.sqrt(np.inner(diff, diff))

    def calculate_mcd_distance(self, x, y, path):
        """Sum the per-frame Euclidean distances along an alignment path.

        :param x: 2-D array of reference features, one row per frame
        :param y: 2-D array of synthesized features, one row per frame
        :param path: sequence of ``(x_index, y_index)`` frame pairs
        :return: tuple ``(frames_tot, min_cost_tot)`` -- the number of pairs
            and the unscaled sum of their Euclidean distances
        """
        x_rows = [pair[0] for pair in path]
        y_rows = [pair[1] for pair in path]
        aligned_x, aligned_y = x[x_rows], y[y_rows]
        frames_tot = aligned_x.shape[0]  # number of aligned pairs
        # NOTE(review): the difference covers every column, including
        # coefficient 0, whereas the DTW alignment in average_mcd skips
        # column 0 -- confirm this asymmetry is intended.
        delta = aligned_x - aligned_y
        min_cost_tot = np.sqrt((delta * delta).sum(-1)).sum()
        return frames_tot, min_cost_tot

    def wav2mcep_numpy(self, loaded_wav, score_rate=22050, alpha=0.65, fft_size=512):
        """Extract mel-cepstral coefficients from a waveform.

        :param loaded_wav: waveform as a numpy array (cast to double here)
        :param score_rate: sampling rate of ``loaded_wav``
        :param alpha: frequency-warping factor passed to pysptk; the default
            0.65 is the value commonly used at 22050 Hz
        :param fft_size: FFT size passed to the WORLD analysis
        :return: array of mel-cepstral features, one row per frame
        """
        # WORLD analysis returns three outputs; only the second one (the
        # spectral envelope) is used.
        _, sp, _ = pyworld.wav2world(loaded_wav.astype(np.double), fs=score_rate,
                                     frame_period=self.FRAME_PERIOD, fft_size=fft_size)
        # Convert the spectral envelope to mel-cepstral coefficients.
        mcep = pysptk.sptk.mcep(sp, order=13, alpha=alpha, maxiter=0,
                                etype=1, eps=1.0E-8, min_det=0.0, itype=3)
        return mcep

    def average_mcd(self, loaded_ref_wav, loaded_syn_wav, cost_function, MCD_mode, score_rate):
        """Calculate the average MCD between two already-loaded waveforms.

        :param loaded_ref_wav: reference waveform (numpy array)
        :param loaded_syn_wav: synthesized waveform (numpy array)
        :param cost_function: accepted for interface compatibility; the
            current implementation does not use it
        :param MCD_mode: "plain", "dtw" or "dtw_sl" (see the class docstring)
        :param score_rate: sampling rate of both waveforms
        :return: the mean MCD over all aligned frame pairs
        :raises ValueError: if ``MCD_mode`` is not a supported mode
        """
        # Reject unknown modes before the expensive feature extraction; they
        # previously surfaced as an UnboundLocalError on ``path``.
        if MCD_mode not in self.SUPPORTED_MODES:
            raise ValueError(
                "Unsupported MCD_mode {!r}; expected one of {}".format(
                    MCD_mode, ", ".join(self.SUPPORTED_MODES)))

        if MCD_mode == "plain":
            # Zero-pad the shorter waveform so both yield the same number of
            # frames and can be compared index by index.
            if len(loaded_ref_wav) < len(loaded_syn_wav):
                loaded_ref_wav = np.pad(loaded_ref_wav, (0, len(loaded_syn_wav) - len(loaded_ref_wav)))
            else:
                loaded_syn_wav = np.pad(loaded_syn_wav, (0, len(loaded_ref_wav) - len(loaded_syn_wav)))

        # Extract MCEP features: 2-D matrices (num_frames x mcep_size).
        # NOTE(review): wav2mcep_numpy keeps its default alpha=0.65 whatever
        # score_rate is -- confirm that is appropriate for rates other than
        # 22050 Hz.
        ref_mcep_vec = self.wav2mcep_numpy(loaded_ref_wav, score_rate)
        syn_mcep_vec = self.wav2mcep_numpy(loaded_syn_wav, score_rate)

        length_coef = 1.0  # only "dtw_sl" rescales the result
        if MCD_mode == "plain":
            path = [(i, i) for i in range(len(ref_mcep_vec))]
        else:
            if MCD_mode == "dtw_sl":
                # Ratio of the longer to the shorter frame count (>= 1).
                n_ref, n_syn = len(ref_mcep_vec), len(syn_mcep_vec)
                length_coef = max(n_ref, n_syn) / min(n_ref, n_syn)
            # Align on every coefficient except column 0.
            _, path = fastdtw(ref_mcep_vec[:, 1:], syn_mcep_vec[:, 1:], dist=euclidean)

        frames_tot, min_cost_tot = self.calculate_mcd_distance(ref_mcep_vec, syn_mcep_vec, path)

        mean_mcd = length_coef * self.log_spec_dB_const * min_cost_tot / frames_tot
        return mean_mcd

    def calculate_mcd(self, reference_audio, synthesized_audio, score_rate):
        """Return the mean MCD for two waveforms using this instance's mode.

        :param reference_audio: reference waveform (numpy array)
        :param synthesized_audio: synthesized waveform (numpy array)
        :param score_rate: sampling rate of both waveforms
        :return: the mean MCD as computed by ``average_mcd``
        """
        mean_mcd = self.average_mcd(reference_audio, synthesized_audio,
                                    self.log_spec_dB_dist, self.MCD_mode, score_rate)
        return mean_mcd