import os
import torch
import librosa
from tqdm import tqdm
from openvoice.api import ToneColorConverter
from openvoice.mel_processing import spectrogram_torch
from whisper_timestamped.transcribe import get_audio_tensor, get_vad_segments
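# Speaker (tone color) embedding extraction for OpenVoice: run Silero VAD to
# find speech, encode each segment with the converter's reference encoder, and
# average the results into one per-file embedding saved as a .pt tensor.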


@torch.no_grad()
def se_extractor(audio_path, vc):
    # Run Silero VAD to locate speech segments (returned as sample indices at 16 kHz).
    SAMPLE_RATE = 16000
    audio_vad = get_audio_tensor(audio_path)
    segments = get_vad_segments(
        audio_vad,
        output_sample=True,
        min_speech_duration=0.1,
        min_silence_duration=1,
        method="silero",
    )
    segments = [(seg["start"], seg["end"]) for seg in segments]
    # Convert sample indices to seconds.
    segments = [(float(s) / SAMPLE_RATE, float(e) / SAMPLE_RATE) for s, e in segments]

    if len(segments) == 0:
        # Fall back to the whole clip when VAD finds no speech.
        segments = [(0, len(audio_vad) / SAMPLE_RATE)]
        print(segments)

    # Extract a speaker (tone color) embedding from each speech segment.
    hps = vc.hps
    device = vc.device
    model = vc.model
    gs = []

    audio, sr = librosa.load(audio_path, sr=hps.data.sampling_rate)
    audio = torch.tensor(audio).float().to(device)

    for s, e in segments:
        # Slice the segment, compute its linear spectrogram, and encode it with
        # the reference encoder to get a per-segment speaker embedding.
        y = audio[int(hps.data.sampling_rate * s):int(hps.data.sampling_rate * e)]
        y = y.unsqueeze(0)
        y = spectrogram_torch(y, hps.data.filter_length,
                              hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
                              center=False).to(device)
        g = model.ref_enc(y.transpose(1, 2)).unsqueeze(-1)
        gs.append(g.detach())

    # Average the per-segment embeddings into a single speaker embedding.
    gs = torch.stack(gs).mean(0)
    return gs.cpu()


def process_audio_folder(input_folder, output_folder, model, device):
    """
    Process all audio files in a folder and its subfolders, 
    save the extracted features as .pt files in the output folder with the same structure.

    Args:
        input_folder (str): Path to the input folder containing audio files.
        output_folder (str): Path to the output folder to save .pt files.
        model: Pre-trained model for feature extraction.
        device: Torch device (e.g., 'cpu' or 'cuda').
    """
    # Collect all audio file paths
    audio_files = []
    for root, _, files in os.walk(input_folder):
        for file in files:
            if file.endswith(('.wav', '.mp3', '.flac')):  # Adjust for the audio formats you want to process
                audio_files.append(os.path.join(root, file))

    # Process each audio file with tqdm for progress
    for audio_path in tqdm(audio_files, desc="Processing audio files", unit="file"):
        # Construct output path
        relative_path = os.path.relpath(os.path.dirname(audio_path), input_folder)
        output_dir = os.path.join(output_folder, relative_path)
        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(output_dir, os.path.splitext(os.path.basename(audio_path))[0] + '.pt')

        # Check if the .pt file already exists
        if os.path.exists(output_path):
            # print(f"Skipped (already exists): {output_path}")
            continue  # Skip processing this file

        # Extract features
        target_se = se_extractor(audio_path, model).to(device)
        # Save the feature as .pt
        torch.save(target_se, output_path)
        # print(f"Processed and saved: {output_path}")


if __name__ == '__main__':
    ckpt_converter = 'checkpoints_v2/converter'
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    model = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
    model.load_ckpt(f'{ckpt_converter}/checkpoint.pth')

    # audio_path = 'debug.wav'
    # target_se = se_extractor(audio_path, model).to(device)

    # source_path = 'source.wav'
    # source_se = se_extractor(source_path, model).to(device)

    # encode_message = "@MyShell"
    # model.convert(
    #     audio_src_path=source_path,
    #     src_se=source_se,
    #     tgt_se=target_se,
    #     output_path='output.wav',
    #     message=encode_message)

    # input_folder = '/home/jerry/Projects/Dataset/VCTK/24k/VCTK-Corpus/'
    # output_folder = 'spk/VCTK-Corpus/'
    # process_audio_folder(input_folder, output_folder, model, device)

    input_folder = '/home/jerry/Projects/Dataset/Speech/vctk_libritts/LibriTTS-R/train-clean-360'
    output_folder = 'spk/LibriTTS-R/train-clean-360/'
    process_audio_folder(input_folder, output_folder, model, device)
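
    # Minimal sketch (hypothetical paths): reload a saved embedding and use it as
    # the conversion target, analogous to the commented-out example above.
    # source_se = se_extractor('source.wav', model).to(device)
    # target_se = torch.load('spk/LibriTTS-R/train-clean-360/example.pt').to(device)
    # model.convert(
    #     audio_src_path='source.wav',
    #     src_se=source_se,
    #     tgt_se=target_se,
    #     output_path='converted.wav',
    #     message="@MyShell")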