|
import io |
|
|
|
import h5py |
|
import kaldiio |
|
import numpy as np |
|
|
|
|
|
class CMVN(object):
    """Mean/variance normalization (CMVN) driven by precomputed statistics.

    Every statistics entry is indexed as a two-row matrix:

    * row 0 holds the per-dimension sums of the features, with the number of
      accumulated samples stored in its last column;
    * row 1 holds the per-dimension sums of squares (its last column is not
      read).

    From these, ``bias = -mean`` and ``scale = 1 / std`` are precomputed for
    each key and applied in ``__call__``.

    Args:
        stats: Either a dict mapping a key to a statistics matrix, or a path
            that is loaded according to ``filetype``.
        norm_means: If True, remove the mean (or add it back when
            ``reverse`` is set).
        norm_vars: If True, divide by the standard deviation (or multiply by
            it when ``reverse`` is set).
        filetype: How to load ``stats`` when it is not a dict. ``"mat"``
            (``kaldiio.load_mat``) and ``"npy"`` (``np.load``) yield a single
            matrix stored under the key ``None``; ``"ark"``
            (``kaldiio.load_ark``) and ``"hdf5"`` (``h5py``) yield one matrix
            per key found in the file.
        utt2spk: Optional path to a UTF-8 text file whose lines are
            ``<utt> <spk>``. When given, ``uttid`` is translated to a speaker
            key before the statistics lookup.
        spk2utt: Optional path to a UTF-8 text file whose lines are
            ``<spk> <utt1> <utt2> ...``. Only consulted when ``utt2spk`` is
            None.
        reverse: If True, ``__call__`` undoes the normalization instead of
            applying it.
        std_floor: Lower bound applied to the standard deviation before it is
            inverted.

    Raises:
        ValueError: If ``stats`` is not a dict and ``filetype`` is not one of
            the supported values.
    """

    def __init__(
        self,
        stats,
        norm_means=True,
        norm_vars=False,
        filetype="mat",
        utt2spk=None,
        spk2utt=None,
        reverse=False,
        std_floor=1.0e-20,
    ):
        self.stats_file = stats
        self.norm_means = norm_means
        self.norm_vars = norm_vars
        self.reverse = reverse
        # Define the flag unconditionally so that reading it never raises
        # AttributeError; only the per-key file formats switch it on.
        self.accept_uttid = False

        if isinstance(stats, dict):
            stats_dict = dict(stats)
        elif filetype == "mat":
            # Single matrix: stored under the key None.
            stats_dict = {None: kaldiio.load_mat(stats)}
        elif filetype == "npy":
            # Single matrix: stored under the key None.
            stats_dict = {None: np.load(stats)}
        elif filetype == "ark":
            # One matrix per key in the archive.
            self.accept_uttid = True
            stats_dict = dict(kaldiio.load_ark(stats))
        elif filetype == "hdf5":
            # One matrix per key in the file. Every entry is consumed below,
            # so read them into memory now and close the file instead of
            # leaking an open handle. Open read-only: nothing is written.
            self.accept_uttid = True
            with h5py.File(stats, "r") as h5_file:
                stats_dict = {key: data[()] for key, data in h5_file.items()}
        else:
            raise ValueError("Not supporting filetype={}".format(filetype))

        if utt2spk is not None:
            self.utt2spk = {}
            with io.open(utt2spk, "r", encoding="utf-8") as f:
                for line in f:
                    utt, spk = line.rstrip().split(None, 1)
                    self.utt2spk[utt] = spk
        elif spk2utt is not None:
            # Invert the speaker -> utterances table into utterance -> speaker.
            self.utt2spk = {}
            with io.open(spk2utt, "r", encoding="utf-8") as f:
                for line in f:
                    spk, utts = line.rstrip().split(None, 1)
                    for utt in utts.split():
                        self.utt2spk[utt] = spk
        else:
            self.utt2spk = None

        self.bias = {}
        self.scale = {}
        for spk, spk_stats in stats_dict.items():
            assert len(spk_stats) == 2, spk_stats.shape

            # The sample count lives in the last column of the first row.
            count = spk_stats[0, -1]

            # With more than two dimensions the count is itself an array;
            # only its first element is used.
            if not (np.isscalar(count) or isinstance(count, (int, float))):
                count = count.flatten()[0]

            mean = spk_stats[0, :-1] / count
            # V(x) = E(x^2) - (E(x))^2
            var = spk_stats[1, :-1] / count - mean * mean
            # Round-off can push the variance slightly below zero; sqrt would
            # then return NaN, which np.maximum propagates, silently
            # bypassing std_floor. Clamp at zero first.
            std = np.maximum(np.sqrt(np.maximum(var, 0.0)), std_floor)
            self.bias[spk] = -mean
            self.scale[spk] = 1 / std

    def __repr__(self):
        """Return a summary of the configuration (stats source and flags)."""
        return (
            "{name}(stats_file={stats_file}, "
            "norm_means={norm_means}, norm_vars={norm_vars}, "
            "reverse={reverse})".format(
                name=self.__class__.__name__,
                stats_file=self.stats_file,
                norm_means=self.norm_means,
                norm_vars=self.norm_vars,
                reverse=self.reverse,
            )
        )

    def __call__(self, x, uttid=None):
        """Normalize ``x`` (or undo the normalization when ``reverse`` is set).

        Args:
            x: Array whose trailing dimension(s) broadcast against the stored
                bias/scale vectors.
            uttid: Key used to select the statistics. When an utt2spk table
                was loaded it is first mapped to its speaker; otherwise it is
                used as the key directly (``None`` selects the single entry
                loaded from a "mat"/"npy" file).

        Returns:
            A new array; ``x`` itself is not modified.

        Raises:
            KeyError: If ``uttid`` is missing from the utt2spk table or no
                statistics exist for the resulting key.
        """
        if self.utt2spk is not None:
            spk = self.utt2spk[uttid]
        else:
            spk = uttid

        if not self.reverse:
            # Forward: (x - mean) / std
            if self.norm_means:
                x = np.add(x, self.bias[spk])
            if self.norm_vars:
                x = np.multiply(x, self.scale[spk])
        else:
            # Inverse, applied in the opposite order: x * std + mean
            if self.norm_vars:
                x = np.divide(x, self.scale[spk])
            if self.norm_means:
                x = np.subtract(x, self.bias[spk])

        return x
|
|
|
|
|
class UtteranceCMVN(object):
    """Normalize an array using mean/variance computed from the array itself.

    Statistics are reduced over axis 0 of the input (presumably the
    time/frame axis -- confirm against callers).

    Args:
        norm_means: If True, subtract the per-column mean.
        norm_vars: If True, divide by the per-column standard deviation.
        std_floor: Lower bound applied to the standard deviation before
            dividing.
    """

    def __init__(self, norm_means=True, norm_vars=False, std_floor=1.0e-20):
        self.norm_means = norm_means
        self.norm_vars = norm_vars
        self.std_floor = std_floor

    def __repr__(self):
        """Return a summary of the normalization flags."""
        return "{name}(norm_means={norm_means}, norm_vars={norm_vars})".format(
            name=self.__class__.__name__,
            norm_means=self.norm_means,
            norm_vars=self.norm_vars,
        )

    def __call__(self, x, uttid=None):
        """Return ``x`` normalized by its own axis-0 statistics.

        Args:
            x: NumPy array; mean and variance are taken over axis 0.
            uttid: Unused by this method.

        Returns:
            A new array when any normalization is enabled; ``x`` unchanged
            otherwise.
        """
        mean = x.mean(axis=0)

        std = None
        if self.norm_vars:
            # Must use the un-centred input: V(x) = E(x^2) - (E(x))^2.
            var = (x ** 2).sum(axis=0) / x.shape[0] - mean ** 2
            # Round-off can make the variance slightly negative; sqrt would
            # then yield NaN, which np.maximum propagates, defeating
            # std_floor. Clamp at zero first.
            std = np.maximum(np.sqrt(np.maximum(var, 0.0)), self.std_floor)

        if self.norm_means:
            x = np.subtract(x, mean)

        if std is not None:
            x = np.divide(x, std)

        return x
|
|