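"""Extract OpenVoice tone-color (speaker) embeddings for a folder of audio
files and cache them as .pt tensors that mirror the input directory layout."""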
import os
import torch
import librosa
from tqdm import tqdm
from openvoice.api import ToneColorConverter
from openvoice.mel_processing import spectrogram_torch
from whisper_timestamped.transcribe import get_audio_tensor, get_vad_segments


@torch.no_grad()
def se_extractor(audio_path, vc):
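    """Extract an averaged speaker ("tone color") embedding from one audio file.

    Runs silero VAD to find speech segments, passes each segment's spectrogram
    through the converter's reference encoder, and averages the per-segment
    embeddings.
    """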
    # VAD: find speech segments (silero, sample indices at 16 kHz)
    SAMPLE_RATE = 16000
    audio_vad = get_audio_tensor(audio_path)
    segments = get_vad_segments(
        audio_vad,
        output_sample=True,
        min_speech_duration=0.1,
        min_silence_duration=1,
        method="silero",
    )
    # Convert segment boundaries from sample indices to seconds
    segments = [(seg["start"], seg["end"]) for seg in segments]
    segments = [(float(s) / SAMPLE_RATE, float(e) / SAMPLE_RATE) for s, e in segments]
    if len(segments) == 0:
        # Fall back to the whole clip if no speech was detected
        segments = [(0, len(audio_vad) / SAMPLE_RATE)]
    print(segments)
    # Speaker embedding: run each segment through the reference encoder
    hps = vc.hps
    device = vc.device
    model = vc.model
    gs = []
    audio, _ = librosa.load(audio_path, sr=hps.data.sampling_rate)
    audio = torch.tensor(audio).float().to(device)
    for s, e in segments:
        y = audio[int(hps.data.sampling_rate * s):int(hps.data.sampling_rate * e)]
        y = y.to(device)
        y = y.unsqueeze(0)
        y = spectrogram_torch(
            y,
            hps.data.filter_length,
            hps.data.sampling_rate,
            hps.data.hop_length,
            hps.data.win_length,
            center=False,
        ).to(device)
        g = model.ref_enc(y.transpose(1, 2)).unsqueeze(-1)
        gs.append(g.detach())
    # Average the per-segment embeddings into a single speaker embedding
    gs = torch.stack(gs).mean(0)
    return gs.cpu()


def process_audio_folder(input_folder, output_folder, model, device):
    """
    Process all audio files in a folder and its subfolders, saving the
    extracted features as .pt files in the output folder with the same
    directory structure.

    Args:
        input_folder (str): Path to the input folder containing audio files.
        output_folder (str): Path to the output folder for the .pt files.
        model: Pre-trained ToneColorConverter used for feature extraction.
        device: Torch device (e.g., 'cpu' or 'cuda').
    """
    # Collect all audio file paths
    audio_files = []
    for root, _, files in os.walk(input_folder):
        for file in files:
            if file.endswith(('.wav', '.mp3', '.flac')):  # Adjust for the audio formats you want to process
                audio_files.append(os.path.join(root, file))

    # Process each audio file, with tqdm for progress
    for audio_path in tqdm(audio_files, desc="Processing audio files", unit="file"):
        # Construct the output path, mirroring the input folder structure
        relative_path = os.path.relpath(os.path.dirname(audio_path), input_folder)
        output_dir = os.path.join(output_folder, relative_path)
        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(output_dir, os.path.splitext(os.path.basename(audio_path))[0] + '.pt')

        # Skip files whose features have already been extracted
        if os.path.exists(output_path):
            # print(f"Skipped (already exists): {output_path}")
            continue

        # Extract the speaker embedding and save it as a .pt file
        target_se = se_extractor(audio_path, model).to(device)
        torch.save(target_se, output_path)
        # print(f"Processed and saved: {output_path}")


if __name__ == '__main__':
    ckpt_converter = 'checkpoints_v2/converter'
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    model = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
    model.load_ckpt(f'{ckpt_converter}/checkpoint.pth')

    # Single-file example: extract embeddings for a source/target pair and
    # run a voice conversion.
    # audio_path = 'debug.wav'
    # target_se = se_extractor(audio_path, model).to(device)
    # source_path = 'source.wav'
    # source_se = se_extractor(source_path, model).to(device)
    # encode_message = "@MyShell"
    # model.convert(
    #     audio_src_path=source_path,
    #     src_se=source_se,
    #     tgt_se=target_se,
    #     output_path='output.wav',
    #     message=encode_message)

    # input_folder = '/home/jerry/Projects/Dataset/VCTK/24k/VCTK-Corpus/'
    # output_folder = 'spk/VCTK-Corpus/'
    # process_audio_folder(input_folder, output_folder, model, device)

    input_folder = '/home/jerry/Projects/Dataset/Speech/vctk_libritts/LibriTTS-R/train-clean-360'
    output_folder = 'spk/LibriTTS-R/train-clean-360/'
    process_audio_folder(input_folder, output_folder, model, device)
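
    # A minimal sketch (hypothetical paths) of reusing the cached embeddings:
    # the saved tensors load with torch.load and go straight into
    # model.convert as src_se / tgt_se, as in the commented example above.
    # source_se = torch.load('spk/LibriTTS-R/train-clean-360/spk_a.pt', map_location=device)
    # target_se = torch.load('spk/LibriTTS-R/train-clean-360/spk_b.pt', map_location=device)
    # model.convert(
    #     audio_src_path='source.wav',
    #     src_se=source_se,
    #     tgt_se=target_se,
    #     output_path='converted.wav')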