Spaces:
Paused
Paused
File size: 3,628 Bytes
ae29df4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 |
import random
import sre_compile
import numpy as np
import torch
import torch.nn as nn
import pyloudnorm as pyln
class SegmentMixer(nn.Module):
    """Create (mixture, target) training pairs from a batch of waveforms.

    For each batch item (the target segment), one or more of the following
    batch items (indices wrap around the batch) are loudness-adjusted
    relative to the target via ``dynamic_loudnorm``, summed into a background
    signal, and added to the target.

    Args:
        max_mix_num: Inclusive upper bound on the number of sources in one
            mixture, counting the target itself. Must be at least 2.
        lower_db: Inclusive lower bound of the random gain offset in dB,
            forwarded to ``dynamic_loudnorm``.
        higher_db: Inclusive upper bound of the random gain offset in dB,
            forwarded to ``dynamic_loudnorm``.

    Raises:
        ValueError: If ``max_mix_num`` is below 2 or ``lower_db`` exceeds
            ``higher_db``.

    NOTE(review): this class overrides ``__call__`` instead of defining
    ``forward``, so hooks registered on the module are not run. Kept as-is
    to preserve the existing interface.
    """

    def __init__(self, max_mix_num, lower_db, higher_db):
        super(SegmentMixer, self).__init__()

        # Fail fast: both conditions would otherwise only surface later as an
        # "empty range" error from random.randint in the middle of a batch.
        if max_mix_num < 2:
            raise ValueError(
                f"max_mix_num must be >= 2, got {max_mix_num}"
            )
        if lower_db > higher_db:
            raise ValueError(
                f"lower_db ({lower_db}) must not exceed higher_db ({higher_db})"
            )

        self.max_mix_num = max_mix_num
        self.loudness_param = {
            'lower_db': lower_db,
            'higher_db': higher_db,
        }

    def __call__(self, waveforms):
        """Mix every batch item with background built from its successors.

        Args:
            waveforms: Tensor whose first dimension indexes the batch.

        Returns:
            A tuple ``(mixture, segment)``; both are stacked along dim 0 in
            the same batch order as ``waveforms``. ``segment`` is a copy of
            the input item, scaled by the same factor as the mixture when
            the mixture had to be attenuated to avoid clipping.
        """
        batch_size = waveforms.shape[0]

        if batch_size == 0:
            # torch.stack rejects an empty list, so an empty batch is mapped
            # directly to empty outputs of the same shape.
            return waveforms.clone(), waveforms.clone()

        segments = []
        mixtures = []

        for n in range(batch_size):
            # Clone so the in-place declipping below never touches the input.
            segment = waveforms[n].clone()

            # Zero tensor used as the background accumulator.
            noise = torch.zeros_like(segment)

            # Total number of sources, target included.
            mix_num = random.randint(2, self.max_mix_num)

            for i in range(1, mix_num):
                # Wraps around the batch; when mix_num - 1 >= batch_size the
                # target itself is picked up as an interferer.
                next_segment = waveforms[(n + i) % batch_size]
                rescaled_next_segment = dynamic_loudnorm(
                    audio=next_segment,
                    reference=segment,
                    **self.loudness_param,
                )
                noise += rescaled_next_segment

            # Randomly re-level the summed background against the target.
            noise = dynamic_loudnorm(
                audio=noise,
                reference=segment,
                **self.loudness_param,
            )

            # Create the audio mixture.
            mixture = segment + noise

            # Declip: scale target and mixture together so their relation
            # is preserved while the mixture peak is brought down to 0.9.
            max_value = torch.max(torch.abs(mixture))
            if max_value > 1:
                scale = 0.9 / max_value
                segment *= scale
                mixture *= scale

            segments.append(segment)
            mixtures.append(mixture)

        return torch.stack(mixtures, dim=0), torch.stack(segments, dim=0)
def rescale_to_match_energy(segment1, segment2):
    """Scale ``segment1`` so that its energy matches that of ``segment2``.

    The scale factor is the reciprocal of ``get_energy_ratio(segment1,
    segment2)``, so the match is exact only while that ratio is not clamped.

    Args:
        segment1: Tensor to be rescaled.
        segment2: Tensor providing the target energy.

    Returns:
        ``segment1`` divided by the energy ratio (a new tensor).
    """
    return segment1 / get_energy_ratio(segment1, segment2)
def get_energy(x):
    """Return the mean of the squared elements of ``x`` as a 0-dim tensor."""
    squared = torch.square(x)
    return squared.mean()
def get_energy_ratio(segment1, segment2):
    """Return the amplitude ratio between ``segment1`` and ``segment2``.

    The ratio is the square root of the quotient of the two energies (as
    computed by ``get_energy``), limited to the range [0.02, 50].

    Args:
        segment1: Tensor whose energy forms the numerator.
        segment2: Tensor whose energy forms the denominator.

    Returns:
        The clamped ratio as a tensor.
    """
    numerator = get_energy(segment1)
    # Floor the denominator so a silent segment2 cannot cause a division
    # by zero.
    denominator = max(get_energy(segment2), 1e-10)
    amplitude_ratio = (numerator / denominator) ** 0.5
    return torch.clamp(amplitude_ratio, 0.02, 50)
def dynamic_loudnorm(audio, reference, lower_db=-10, higher_db=10):
    """Level ``audio`` against ``reference`` and apply a random dB offset.

    ``audio`` is first rescaled with ``rescale_to_match_energy`` and then
    multiplied by a gain corresponding to an integer number of decibels
    drawn uniformly from ``[lower_db, higher_db]`` (both ends inclusive).

    Args:
        audio: Tensor to be adjusted.
        reference: Tensor whose energy ``audio`` is matched to.
        lower_db: Smallest offset in dB that may be drawn.
        higher_db: Largest offset in dB that may be drawn.

    Returns:
        The rescaled ``audio`` multiplied by the random gain.
    """
    matched = rescale_to_match_energy(audio, reference)
    offset_db = random.randint(lower_db, higher_db)
    # Decibels to linear amplitude factor: 10 ** (dB / 20).
    linear_gain = np.power(10.0, offset_db / 20.0)
    return linear_gain * matched
def torch_to_numpy(tensor):
    """Return the contents of a PyTorch tensor as a NumPy array.

    The tensor is detached and moved to the CPU before conversion.

    Raises:
        ValueError: If ``tensor`` is not a ``torch.Tensor``.
    """
    if not isinstance(tensor, torch.Tensor):
        raise ValueError("Input must be a PyTorch tensor.")
    host_tensor = tensor.detach().cpu()
    return host_tensor.numpy()
def numpy_to_torch(array):
    """Return a PyTorch tensor built from a NumPy array.

    The conversion uses ``torch.from_numpy``, so the returned tensor shares
    memory with ``array``.

    Raises:
        ValueError: If ``array`` is not a ``numpy.ndarray``.
    """
    if not isinstance(array, np.ndarray):
        raise ValueError("Input must be a NumPy array.")
    return torch.from_numpy(array)
# decayed
# NOTE(review): "decayed" presumably means deprecated; nothing visible in this
# file calls this function. Confirm before removing it.
def random_loudness_norm(audio, lower_db=-35, higher_db=-15, sr=32000):
    """Normalize ``audio`` to a randomly chosen loudness level.

    The target level is an integer drawn uniformly from
    ``[lower_db, higher_db]`` (both ends inclusive). Loudness is measured
    with a pyloudnorm BS.1770 meter created for sample rate ``sr``.

    Args:
        audio: Tensor to normalize; its leading dimension is squeezed before
            measurement and restored afterwards.
            (assumes a layout of (1, samples) - TODO confirm with callers)
        lower_db: Smallest target loudness that may be drawn.
        higher_db: Largest target loudness that may be drawn.
        sr: Sample rate passed to the loudness meter.

    Returns:
        The normalized audio as a tensor on the same device as ``audio``.
    """
    original_device = audio.device
    samples = torch_to_numpy(audio.squeeze(0))

    # Pick the loudness level to normalize to.
    target_loudness = random.randint(lower_db, higher_db)

    # Measure the current loudness with a BS.1770 meter.
    bs1770_meter = pyln.Meter(sr)
    measured_loudness = bs1770_meter.integrated_loudness(samples)

    # Apply the gain that moves the measured loudness to the target.
    leveled = pyln.normalize.loudness(samples, measured_loudness, target_loudness)
    return numpy_to_torch(leveled).unsqueeze(0).to(original_device)
|