import os

import torch
import librosa
from tqdm import tqdm
from openvoice.api import ToneColorConverter
from openvoice.mel_processing import spectrogram_torch
from whisper_timestamped.transcribe import get_audio_tensor, get_vad_segments


@torch.no_grad()
def se_extractor(audio_path, vc):
    # VAD: locate speech segments (Silero, via whisper_timestamped) so that
    # silence does not dilute the speaker embedding.
    SAMPLE_RATE = 16000
    audio_vad = get_audio_tensor(audio_path)
    segments = get_vad_segments(
        audio_vad,
        output_sample=True,
        min_speech_duration=0.1,
        min_silence_duration=1,
        method="silero",
    )
    segments = [(seg["start"], seg["end"]) for seg in segments]
    segments = [(float(s) / SAMPLE_RATE, float(e) / SAMPLE_RATE) for s, e in segments]
    if len(segments) == 0:
        # Fall back to the whole file if VAD finds no speech.
        segments = [(0, len(audio_vad) / SAMPLE_RATE)]
    print(segments)

    # Speaker embedding: run the converter's reference encoder on the
    # spectrogram of each speech segment and average the results.
    hps = vc.hps
    device = vc.device
    model = vc.model
    gs = []

    audio, _ = librosa.load(audio_path, sr=hps.data.sampling_rate)
    audio = torch.tensor(audio).float().to(device)

    for s, e in segments:
        y = audio[int(hps.data.sampling_rate * s):int(hps.data.sampling_rate * e)]
        y = y.unsqueeze(0)
        y = spectrogram_torch(
            y,
            hps.data.filter_length,
            hps.data.sampling_rate,
            hps.data.hop_length,
            hps.data.win_length,
            center=False,
        ).to(device)
        g = model.ref_enc(y.transpose(1, 2)).unsqueeze(-1)
        gs.append(g.detach())
    gs = torch.stack(gs).mean(0)
    return gs.cpu()


def process_audio_folder(input_folder, output_folder, model, device):
    """
    Process all audio files in a folder and its subfolders, saving the
    extracted speaker embeddings as .pt files in the output folder with
    the same directory structure.

    Args:
        input_folder (str): Path to the input folder containing audio files.
        output_folder (str): Path to the output folder for the .pt files.
        model: Pre-trained ToneColorConverter used for feature extraction.
        device: Torch device (e.g., 'cpu' or 'cuda').
    """
    # Collect all audio file paths.
    audio_files = []
    for root, _, files in os.walk(input_folder):
        for file in files:
            if file.endswith(('.wav', '.mp3', '.flac')):  # Adjust for the audio formats you want to process
                audio_files.append(os.path.join(root, file))

    # Process each audio file, with tqdm for progress.
    for audio_path in tqdm(audio_files, desc="Processing audio files", unit="file"):
        # Mirror the input directory structure under the output folder.
        relative_path = os.path.relpath(os.path.dirname(audio_path), input_folder)
        output_dir = os.path.join(output_folder, relative_path)
        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(output_dir, os.path.splitext(os.path.basename(audio_path))[0] + '.pt')

        # Skip files whose embeddings have already been extracted.
        if os.path.exists(output_path):
            # print(f"Skipped (already exists): {output_path}")
            continue

        # Extract the speaker embedding and save it as a .pt file.
        target_se = se_extractor(audio_path, model).to(device)
        torch.save(target_se, output_path)
        # print(f"Processed and saved: {output_path}")


if __name__ == '__main__':
    ckpt_converter = 'checkpoints_v2/converter'
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    model = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
    model.load_ckpt(f'{ckpt_converter}/checkpoint.pth')

    # audio_path = 'debug.wav'
    # target_se = se_extractor(audio_path, model).to(device)

    # source_path = 'source.wav'
    # source_se = se_extractor(source_path, model).to(device)

    # encode_message = "@MyShell"
    # model.convert(
    #     audio_src_path=source_path,
    #     src_se=source_se,
    #     tgt_se=target_se,
    #     output_path='output.wav',
    #     message=encode_message)

    # input_folder = '/home/jerry/Projects/Dataset/VCTK/24k/VCTK-Corpus/'
    # output_folder = 'spk/VCTK-Corpus/'
    # process_audio_folder(input_folder, output_folder, model, device)
    input_folder = '/home/jerry/Projects/Dataset/Speech/vctk_libritts/LibriTTS-R/train-clean-360'
    output_folder = 'spk/LibriTTS-R/train-clean-360/'
    process_audio_folder(input_folder, output_folder, model, device)
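
    # Usage sketch (hypothetical paths): a saved .pt embedding can later be
    # loaded and fed back into the converter, mirroring the commented-out
    # conversion example above.
    #
    # tgt_se = torch.load('spk/LibriTTS-R/train-clean-360/<spk>/<utt>.pt',
    #                     map_location=device)
    # src_se = se_extractor('source.wav', model).to(device)
    # model.convert(
    #     audio_src_path='source.wav',
    #     src_se=src_se,
    #     tgt_se=tgt_se,
    #     output_path='output.wav',
    #     message="@MyShell")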