Spaces:

badayvedat
/

AudioSep

Paused

App Files Files Community

AudioSep / data /waveform_mixers.py

badayvedat

Initial commit

ae29df4 about 1 year ago

raw

history blame

3.63 kB

	import random
	import sre_compile
	import numpy as np
	import torch
	import torch.nn as nn
	import pyloudnorm as pyln


	class SegmentMixer(nn.Module):
	def __init__(self, max_mix_num, lower_db, higher_db):
	super(SegmentMixer, self).__init__()

	self.max_mix_num = max_mix_num
	self.loudness_param = {
	'lower_db': lower_db,
	'higher_db': higher_db,
	}

	def __call__(self, waveforms):

	batch_size = waveforms.shape[0]

	data_dict = {
	'segment': [],
	'mixture': [],
	}

	for n in range(0, batch_size):

	segment = waveforms[n].clone()

	# create zero tensors as the background template
	noise = torch.zeros_like(segment)

	mix_num = random.randint(2, self.max_mix_num)
	assert mix_num >= 2

	for i in range(1, mix_num):
	next_segment = waveforms[(n + i) % batch_size]
	rescaled_next_segment = dynamic_loudnorm(audio=next_segment, reference=segment, **self.loudness_param)
	noise += rescaled_next_segment

	# randomly normalize background noise
	noise = dynamic_loudnorm(audio=noise, reference=segment, **self.loudness_param)

	# create audio mixyure
	mixture = segment + noise

	# declipping if need be
	max_value = torch.max(torch.abs(mixture))
	if max_value > 1:
	segment *= 0.9 / max_value
	mixture *= 0.9 / max_value

	data_dict['segment'].append(segment)
	data_dict['mixture'].append(mixture)

	for key in data_dict.keys():
	data_dict[key] = torch.stack(data_dict[key], dim=0)

	# return data_dict
	return data_dict['mixture'], data_dict['segment']


	def rescale_to_match_energy(segment1, segment2):

	ratio = get_energy_ratio(segment1, segment2)
	rescaled_segment1 = segment1 / ratio
	return rescaled_segment1


	def get_energy(x):
	return torch.mean(x ** 2)


	def get_energy_ratio(segment1, segment2):

	energy1 = get_energy(segment1)
	energy2 = max(get_energy(segment2), 1e-10)
	ratio = (energy1 / energy2) ** 0.5
	ratio = torch.clamp(ratio, 0.02, 50)
	return ratio


	def dynamic_loudnorm(audio, reference, lower_db=-10, higher_db=10):
	rescaled_audio = rescale_to_match_energy(audio, reference)

	delta_loudness = random.randint(lower_db, higher_db)

	gain = np.power(10.0, delta_loudness / 20.0)

	return gain * rescaled_audio


	def torch_to_numpy(tensor):
	"""Convert a PyTorch tensor to a NumPy array."""
	if isinstance(tensor, torch.Tensor):
	return tensor.detach().cpu().numpy()
	else:
	raise ValueError("Input must be a PyTorch tensor.")


	def numpy_to_torch(array):
	"""Convert a NumPy array to a PyTorch tensor."""
	if isinstance(array, np.ndarray):
	return torch.from_numpy(array)
	else:
	raise ValueError("Input must be a NumPy array.")


	# decayed
	def random_loudness_norm(audio, lower_db=-35, higher_db=-15, sr=32000):
	device = audio.device
	audio = torch_to_numpy(audio.squeeze(0))
	# randomly select a norm volume
	norm_vol = random.randint(lower_db, higher_db)

	# measure the loudness first
	meter = pyln.Meter(sr) # create BS.1770 meter
	loudness = meter.integrated_loudness(audio)
	# loudness normalize audio
	normalized_audio = pyln.normalize.loudness(audio, loudness, norm_vol)

	normalized_audio = numpy_to_torch(normalized_audio).unsqueeze(0)

	return normalized_audio.to(device)