File size: 4,496 Bytes
0a97d6c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import os
import torch
import librosa
import numpy as np
import soundfile as sf
import pandas as pd
# from feats.hubert_model import get_soft_model, get_hubert_soft_content
from feats.contentvec_hf import get_content_model, get_content
# from modules.speaker_encoder.encoder import inference as spk_encoder
# from pathlib import Path
from tqdm import tqdm
from multiprocessing import Process
import pyworld as pw


def _fit_to_length(audio, length):
    # Trim to `length` samples, then zero-pad the tail so the result has
    # exactly `length` samples regardless of the input's duration.
    audio = audio[:length]
    padded = np.zeros(length, dtype=audio.dtype)
    padded[:audio.shape[-1]] = audio
    return padded


def resample_save(infolder, audio_path, model,

                  audio_sr=24000, content_sr=16000, min_length=1.92,

                  content_resolution=50,

                  save_path='features'):
    """Extract content features and save aligned 16 kHz / 24 kHz copies of one file.

    Loads ``audio_path`` from ``infolder`` twice (once at ``content_sr``, once at
    ``audio_sr``), trims/zero-pads each waveform to a length aligned to the
    content-feature hop (``content_sr // content_resolution`` samples) and at
    least ``min_length`` seconds, runs the content model on the 16 kHz audio,
    and writes three outputs under ``save_path``:

      * ``content/<audio_path>.pt``  — content features (saved on CPU)
      * ``audio_16k/<audio_path>``   — length-aligned 16 kHz audio
      * ``audio_24k/<audio_path>``   — length-aligned 24 kHz audio

    All work is skipped when the 24 kHz output already exists (it is written
    last, so its presence implies the other two were written too).

    Args:
        infolder: directory containing the input audio file.
        audio_path: path of the audio file relative to ``infolder`` (may
            contain subdirectories; they are recreated under ``save_path``).
        model: content-feature model passed through to ``get_content``.
        audio_sr: sample rate of the "full-band" saved copy.
        content_sr: sample rate fed to the content model.
        min_length: minimum output duration in seconds.
        content_resolution: content frames per second; with the defaults the
            hop is 16000 // 50 = 320 samples.
        save_path: root directory for all outputs.
    """
    out_24k = os.path.join(save_path, 'audio_24k', audio_path)
    if os.path.isfile(out_24k):
        return

    in_path = os.path.join(infolder, audio_path)
    audio, sr = librosa.load(in_path, sr=content_sr)

    # Integer frame-aligned length (the original computed this in float
    # arithmetic; the value is inherently an integer number of samples).
    hop = content_sr // content_resolution
    final_length = audio.shape[-1] // hop * hop

    length = max(round(min_length * content_sr), final_length)
    assert length % 10 == 0
    audio_save = _fit_to_length(audio, length)

    # Content features from the 16 kHz waveform (batch dim added for the model).
    content = get_content(model, torch.tensor(audio_save).unsqueeze(0)).cpu()
    content_path = os.path.join(save_path, 'content', audio_path + '.pt')
    os.makedirs(os.path.dirname(content_path), exist_ok=True)
    torch.save(content, content_path)

    out_16k = os.path.join(save_path, 'audio_16k', audio_path)
    os.makedirs(os.path.dirname(out_16k), exist_ok=True)
    sf.write(out_16k, audio_save, int(sr))

    # 24 kHz copy, with the length rescaled so both copies span the same duration.
    audio, sr = librosa.load(in_path, sr=audio_sr)
    length = max(round(min_length * audio_sr), round(final_length / content_sr * audio_sr))
    assert length % 10 == 0
    audio_save = _fit_to_length(audio, length)
    os.makedirs(os.path.dirname(out_24k), exist_ok=True)
    sf.write(out_24k, audio_save, int(sr))


def extract_f0(in_folder, audio_path, save_path):
    """Extract a frame-level F0 track with WORLD (DIO + StoneMask) and save it.

    Reads ``audio_path`` from ``in_folder`` at its native rate (asserted to be
    16 kHz), estimates F0 with a 320-sample hop (20 ms at 16 kHz), and writes
    the result to ``<save_path>/f0/<audio_path>.pt`` as a torch tensor.
    Does nothing when the output file already exists.
    """
    out_path = os.path.join(save_path, 'f0', audio_path + '.pt')
    # Check for an existing output *before* loading: the original loaded and
    # decoded the audio even when the .pt file was already on disk.
    if os.path.isfile(out_path):
        return

    audio, sr = librosa.load(os.path.join(in_folder, audio_path), sr=None)
    assert sr == 16000

    # pyworld requires float64 input; convert once instead of per call.
    wav64 = audio.astype(np.float64)
    _f0, t = pw.dio(wav64, sr, frame_period=320 / sr * 1000)
    # StoneMask refines DIO's coarse estimate. The last frame is dropped —
    # presumably to match the content-feature frame count; confirm against
    # resample_save's alignment.
    f0 = pw.stonemask(wav64, _f0, t, sr)[:-1]

    # DIO/StoneMask mark unvoiced frames as 0; also scrub any NaNs.
    f0 = np.nan_to_num(f0)
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    torch.save(torch.tensor(f0), out_path)


def chunks(arr, m):
    """Split `arr` round-robin into `m` lists: element i lands in bucket i % m."""
    buckets = [[] for _ in range(m)]
    for idx, item in enumerate(arr):
        buckets[idx % m].append(item)
    return buckets


def extract_f0_main(in_folder, audio_paths, save_path):
    """Worker entry point: run F0 extraction for each path, with a progress bar."""
    for path in tqdm(audio_paths):
        extract_f0(in_folder, path, save_path)


if __name__ == '__main__':
    # Stage 1: resampling + content-feature extraction. Runs serially in the
    # parent because it uses the GPU content model.
    df = pd.read_csv('../test_data/vc_meta.csv')
    # model = get_soft_model('../pre_ckpts/hubert_soft.pt').to('cuda')
    model = get_content_model().to('cuda')
    # # spk_encoder.load_model(Path('ckpts/spk_encoder/pretrained.pt'), device="cuda")
    for i in tqdm(range(len(df))):
        row = df.iloc[i]
        in_path = row['path']
        resample_save('../test_data/', in_path, model, save_path='../features/')

    # Stage 2: F0 extraction is CPU-bound, so fan it out across processes.
    in_folder = '../features/audio_16k/'
    audio_files = list(df['path'])
    save_path = '../features/'
    cores = 6

    subsets = chunks(audio_files, cores)

    workers = []
    for subset in subsets:
        t = Process(target=extract_f0_main, args=(in_folder, subset, save_path))
        t.start()
        workers.append(t)

    # Wait for every worker: without join() the parent could finish while
    # children are still writing f0 files, with no completion guarantee.
    for t in workers:
        t.join()