import os
import torch
import librosa
from tqdm import tqdm
from openvoice.api import ToneColorConverter
from openvoice.mel_processing import spectrogram_torch
from whisper_timestamped.transcribe import get_audio_tensor, get_vad_segments
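# Speaker (tone color) embedding extraction for OpenVoice: run Silero VAD to
# find speech, encode each segment with the converter's reference encoder, and
# average the results into one per-file embedding saved as a .pt tensor.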


@torch.no_grad()
def se_extractor(audio_path, vc):
    # Run Silero VAD to locate speech segments (returned as sample indices at 16 kHz).
    SAMPLE_RATE = 16000
    audio_vad = get_audio_tensor(audio_path)
    segments = get_vad_segments(
        audio_vad,
        output_sample=True,
        min_speech_duration=0.1,
        min_silence_duration=1,
        method="silero",
    )
    segments = [(seg["start"], seg["end"]) for seg in segments]
    # Convert sample indices to seconds.
    segments = [(float(s) / SAMPLE_RATE, float(e) / SAMPLE_RATE) for s, e in segments]

    if len(segments) == 0:
        # Fall back to the whole clip when VAD finds no speech.
        segments = [(0, len(audio_vad) / SAMPLE_RATE)]
        print(segments)

    # Extract a speaker (tone color) embedding from each speech segment.
    hps = vc.hps
    device = vc.device
    model = vc.model
    gs = []

    audio, sr = librosa.load(audio_path, sr=hps.data.sampling_rate)
    audio = torch.tensor(audio).float().to(device)

    for s, e in segments:
        # Slice the segment, compute its linear spectrogram, and encode it with
        # the reference encoder to get a per-segment speaker embedding.
        y = audio[int(hps.data.sampling_rate * s):int(hps.data.sampling_rate * e)]
        y = y.unsqueeze(0)
        y = spectrogram_torch(y, hps.data.filter_length,
                              hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
                              center=False).to(device)
        g = model.ref_enc(y.transpose(1, 2)).unsqueeze(-1)
        gs.append(g.detach())

    # Average the per-segment embeddings into a single speaker embedding.
    gs = torch.stack(gs).mean(0)
    return gs.cpu()


def process_audio_folder(input_folder, output_folder, model, device):
    """
    Process all audio files in a folder and its subfolders, 
    save the extracted features as .pt files in the output folder with the same structure.

    Args:
        input_folder (str): Path to the input folder containing audio files.
        output_folder (str): Path to the output folder to save .pt files.
        model: Pre-trained model for feature extraction.
        device: Torch device (e.g., 'cpu' or 'cuda').
    """
    # Collect all audio file paths
    audio_files = []
    for root, _, files in os.walk(input_folder):
        for file in files:
            if file.endswith(('.wav', '.mp3', '.flac')):  # Adjust for the audio formats you want to process
                audio_files.append(os.path.join(root, file))

    # Process each audio file with tqdm for progress
    for audio_path in tqdm(audio_files, desc="Processing audio files", unit="file"):
        # Construct output path
        relative_path = os.path.relpath(os.path.dirname(audio_path), input_folder)
        output_dir = os.path.join(output_folder, relative_path)
        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(output_dir, os.path.splitext(os.path.basename(audio_path))[0] + '.pt')

        # Check if the .pt file already exists
        if os.path.exists(output_path):
            # print(f"Skipped (already exists): {output_path}")
            continue  # Skip processing this file

        # Extract features
        target_se = se_extractor(audio_path, model).to(device)
        # Save the feature as .pt
        torch.save(target_se, output_path)
        # print(f"Processed and saved: {output_path}")


if __name__ == '__main__':
    ckpt_converter = 'checkpoints_v2/converter'
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    model = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
    model.load_ckpt(f'{ckpt_converter}/checkpoint.pth')

    # audio_path = 'debug.wav'
    # target_se = se_extractor(audio_path, model).to(device)

    # source_path = 'source.wav'
    # source_se = se_extractor(source_path, model).to(device)

    # encode_message = "@MyShell"
    # model.convert(
    #     audio_src_path=source_path,
    #     src_se=source_se,
    #     tgt_se=target_se,
    #     output_path='output.wav',
    #     message=encode_message)

    # input_folder = '/home/jerry/Projects/Dataset/VCTK/24k/VCTK-Corpus/'
    # output_folder = 'spk/VCTK-Corpus/'
    # process_audio_folder(input_folder, output_folder, model, device)

    input_folder = '/home/jerry/Projects/Dataset/Speech/vctk_libritts/LibriTTS-R/train-clean-360'
    output_folder = 'spk/LibriTTS-R/train-clean-360/'
    process_audio_folder(input_folder, output_folder, model, device)
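
    # Minimal sketch (hypothetical paths): reload a saved embedding and use it as
    # the conversion target, analogous to the commented-out example above.
    # source_se = se_extractor('source.wav', model).to(device)
    # target_se = torch.load('spk/LibriTTS-R/train-clean-360/example.pt').to(device)
    # model.convert(
    #     audio_src_path='source.wav',
    #     src_se=source_se,
    #     tgt_se=target_se,
    #     output_path='converted.wav',
    #     message="@MyShell")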