# speaker_identify/utils/preprocessing.py
import librosa
import numpy as np
import python_speech_features as psf
def get_fbanks(audio_file):
    """Return mean/variance-normalized 64-bin log filter bank features
    of shape (num_frames, 64, 1), or None if the clip is too short."""

    def normalize_frames(signal, epsilon=1e-12):
        # per-frame mean/variance normalization; epsilon guards against division by zero
        return np.array([(v - np.mean(v)) / max(np.std(v), epsilon) for v in signal])

    y, sr = librosa.load(audio_file, sr=16000)
    assert sr == 16000

    trim_len = int(0.25 * sr)
    if y.shape[0] < 1 * sr:
        # if less than 1 second, don't use that audio
        return None
    # drop 0.25 s from each end of the clip
    y = y[trim_len:-trim_len]

    # frame width of 25 ms with a stride of 10 ms, i.e. a 15 ms overlap between frames
    filter_banks, energies = psf.fbank(y, samplerate=sr, nfilt=64, winlen=0.025, winstep=0.01)
    filter_banks = normalize_frames(signal=filter_banks)
    filter_banks = filter_banks.reshape((filter_banks.shape[0], 64, 1))
    return filter_banks
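# Worked example (a sketch, assuming a hypothetical 3 s clip at 16 kHz):
# trimming 0.25 s from each end leaves 2.5 s of audio; 25 ms windows
# advancing in 10 ms steps give roughly one frame per 10 ms, so
# get_fbanks returns an array of shape (~250, 64, 1).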
def extract_fbanks(path):
    """Split the filter bank features into non-overlapping 64-frame windows
    and stack them into an array of shape (num_samples, 1, 64, 64)."""
    fbanks = get_fbanks(path)
    if fbanks is None:
        return None
    num_frames = fbanks.shape[0]

    # sample sets of 64 frames each; a trailing partial window is dropped
    numpy_arrays = []
    start = 0
    while start + 64 <= num_frames:
        slice_ = fbanks[start:start + 64]        # (64, 64, 1)
        slice_ = np.moveaxis(slice_, 2, 0)       # (1, 64, 64)
        slice_ = slice_.reshape((1, 1, 64, 64))  # add a batch axis
        numpy_arrays.append(slice_)
        start = start + 64

    print('num samples extracted: {}'.format(len(numpy_arrays)))
    if not numpy_arrays:
        # not enough frames for even one 64-frame window
        return None
    return np.concatenate(numpy_arrays, axis=0)
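# Minimal usage sketch, not part of the original module: 'sample.wav' is a
# hypothetical path. Each row of the result is one 64x64 filter bank window
# suitable as input to a CNN speaker encoder.
if __name__ == '__main__':
    windows = extract_fbanks('sample.wav')  # hypothetical file path
    if windows is not None:
        print('output shape: {}'.format(windows.shape))  # (num_samples, 1, 64, 64)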