"""Extract OpenVoice speaker embeddings (tone color) for a folder of audio files."""

import os

import torch
import librosa
from tqdm import tqdm

from openvoice.api import ToneColorConverter
from openvoice.mel_processing import spectrogram_torch
from whisper_timestamped.transcribe import get_audio_tensor, get_vad_segments


@torch.no_grad()
def se_extractor(audio_path, vc):
    """Extract a speaker embedding from an audio file.

    Runs Silero VAD to locate speech segments, encodes each one with the
    converter's reference encoder, and returns the mean embedding on CPU.
    """
    SAMPLE_RATE = 16000

    # Detect speech segments with Silero VAD. With output_sample=True the
    # boundaries come back in samples, so convert them to seconds.
    audio_vad = get_audio_tensor(audio_path)
    segments = get_vad_segments(
        audio_vad,
        output_sample=True,
        min_speech_duration=0.1,
        min_silence_duration=1,
        method="silero",
    )
    segments = [(seg["start"], seg["end"]) for seg in segments]
    segments = [(float(s) / SAMPLE_RATE, float(e) / SAMPLE_RATE) for s, e in segments]

    # Fall back to the whole file if VAD found no speech.
    if len(segments) == 0:
        segments = [(0, len(audio_vad) / SAMPLE_RATE)]
    print(segments)

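    # Averaging over speech-only regions keeps long silences and noise-only
    # stretches from diluting the tone-color estimate.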
    hps = vc.hps
    device = vc.device
    model = vc.model
    gs = []

    # Load the full audio at the model's sampling rate.
    audio, sr = librosa.load(audio_path, sr=hps.data.sampling_rate)
    audio = torch.tensor(audio).float().to(device)

    # Encode each speech segment: slice the waveform, compute its linear
    # spectrogram, and pass it through the reference encoder.
    for s, e in segments:
        y = audio[int(hps.data.sampling_rate * s):int(hps.data.sampling_rate * e)]
        y = y.unsqueeze(0)
        y = spectrogram_torch(
            y,
            hps.data.filter_length,
            hps.data.sampling_rate,
            hps.data.hop_length,
            hps.data.win_length,
            center=False,
        ).to(device)
        g = model.ref_enc(y.transpose(1, 2)).unsqueeze(-1)
        gs.append(g.detach())

    # Mean over segments gives the final embedding.
    gs = torch.stack(gs).mean(0)
    return gs.cpu()


def process_audio_folder(input_folder, output_folder, model, device):
    """
    Process all audio files in a folder and its subfolders, saving the
    extracted speaker embeddings as .pt files in the output folder with the
    same directory structure.

    Args:
        input_folder (str): Path to the input folder containing audio files.
        output_folder (str): Path to the output folder for the .pt files.
        model: Pre-trained ToneColorConverter used for feature extraction.
        device: Torch device (e.g., 'cpu' or 'cuda').
    """
    # Collect all audio files recursively (extension check is case-insensitive).
    audio_files = []
    for root, _, files in os.walk(input_folder):
        for file in files:
            if file.lower().endswith(('.wav', '.mp3', '.flac')):
                audio_files.append(os.path.join(root, file))

    for audio_path in tqdm(audio_files, desc="Processing audio files", unit="file"):
        # Mirror the input directory structure under the output folder.
        relative_path = os.path.relpath(os.path.dirname(audio_path), input_folder)
        output_dir = os.path.join(output_folder, relative_path)
        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(
            output_dir, os.path.splitext(os.path.basename(audio_path))[0] + '.pt')

        # Skip files that already have an extracted embedding.
        if os.path.exists(output_path):
            continue

        target_se = se_extractor(audio_path, model).to(device)
        torch.save(target_se, output_path)


if __name__ == '__main__':
    # Load the tone color converter from the v2 checkpoint.
    ckpt_converter = 'checkpoints_v2/converter'
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    model = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
    model.load_ckpt(f'{ckpt_converter}/checkpoint.pth')

    input_folder = '/home/jerry/Projects/Dataset/Speech/vctk_libritts/LibriTTS-R/train-clean-360'
    output_folder = 'spk/LibriTTS-R/train-clean-360/'
    process_audio_folder(input_folder, output_folder, model, device)
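
# Note: input_folder/output_folder are dataset-specific; adjust them before
# running. Saved embeddings can be reloaded with torch.load(<output .pt path>).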