# File size: 1,253 Bytes
# 35c1cfd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import torch
import torchaudio
import random

def EAT_preprocess(
    source: torch.Tensor,
    norm_mean: float = -4.268,
    norm_std: float = 4.569,
    target_length: int = 1024,
    fixed_length: bool = False,
    random_crop: bool = False,
) -> torch.Tensor:
    """Turn a waveform into a normalized, length-adjusted log-mel filterbank.

    The waveform is mean-centred, converted to a 128-bin Kaldi-style
    filterbank (10 ms frame shift, Hanning window, no dither), padded or
    cropped along the frame axis, and finally normalized as
    ``(x - norm_mean) / (2 * norm_std)``.

    Args:
        source: Input waveform. Presumably a 1-D mono signal sampled at
            16 kHz (the sample rate is hard-coded below) -- confirm at the
            call site.
        norm_mean: Value subtracted from every filterbank entry.
        norm_std: Scale of the normalization; entries are divided by twice
            this value.
        target_length: Number of frames in the output when ``fixed_length``
            is true. Ignored (overwritten) otherwise.
        fixed_length: If true, pad/crop to exactly ``target_length`` frames.
            If false, only zero-pad up to the next multiple of 16 frames.
        random_crop: When cropping is needed, pick the window start with
            Python's global ``random`` module instead of starting at frame 0.

    Returns:
        A ``(frames, 128)`` tensor of normalized filterbank features.
    """
    # Remove the DC offset, then add the leading axis passed to kaldi.fbank.
    waveform = (source - source.mean()).unsqueeze(dim=0)

    fbank = torchaudio.compliance.kaldi.fbank(
        waveform,
        htk_compat=True,
        sample_frequency=16000,
        use_energy=False,
        window_type='hanning',
        num_mel_bins=128,
        dither=0.0,
        frame_shift=10,
    ).unsqueeze(dim=0)  # leading singleton axis: (1, frames, mel_bins)

    n_frames = fbank.shape[1]
    if not fixed_length:
        # Variable-length mode: round the frame count up to a multiple of 16.
        remainder = n_frames % 16
        target_length = n_frames if remainder == 0 else n_frames + (16 - remainder)

    diff = target_length - n_frames
    if diff > 0:
        # Zero-pad at the end of the frame axis only. Padding happens before
        # normalization, so padded frames end up at -norm_mean / (2 * norm_std).
        fbank = torch.nn.functional.pad(fbank, (0, 0, 0, diff))
    elif diff < 0:
        # Too long: keep a window of target_length frames.
        start_index = random.randint(0, n_frames - target_length) if random_crop else 0
        fbank = fbank[:, start_index:start_index + target_length, :]

    # Normalize the mel spectrogram
    fbank = (fbank - norm_mean) / (norm_std * 2)

    # Drop only the leading axis added above. A bare squeeze() would also
    # collapse the frame axis whenever the result has a single frame
    # (e.g. fixed_length=True with target_length=1).
    return fbank.squeeze(dim=0)