"""Extract OpenVoice speaker embeddings (tone color) for a folder of audio files."""

import os

import torch
import librosa
from tqdm import tqdm

from openvoice.api import ToneColorConverter
from openvoice.mel_processing import spectrogram_torch
from whisper_timestamped.transcribe import get_audio_tensor, get_vad_segments


@torch.no_grad()
def se_extractor(audio_path, vc):
    """Extract a speaker embedding from an audio file.

    Runs Silero VAD to locate speech segments, encodes each one with the
    converter's reference encoder, and returns the mean embedding on CPU.
    """
    SAMPLE_RATE = 16000

    # Detect speech segments with Silero VAD. With output_sample=True the
    # boundaries come back in samples, so convert them to seconds.
    audio_vad = get_audio_tensor(audio_path)
    segments = get_vad_segments(
        audio_vad,
        output_sample=True,
        min_speech_duration=0.1,
        min_silence_duration=1,
        method="silero",
    )
    segments = [(seg["start"], seg["end"]) for seg in segments]
    segments = [(float(s) / SAMPLE_RATE, float(e) / SAMPLE_RATE) for s, e in segments]

    # Fall back to the whole file if VAD found no speech.
    if len(segments) == 0:
        segments = [(0, len(audio_vad) / SAMPLE_RATE)]
    print(segments)

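    # Averaging over speech-only regions keeps long silences and noise-only
    # stretches from diluting the tone-color estimate.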
    hps = vc.hps
    device = vc.device
    model = vc.model
    gs = []

    # Load the full audio at the model's sampling rate.
    audio, sr = librosa.load(audio_path, sr=hps.data.sampling_rate)
    audio = torch.tensor(audio).float().to(device)

    # Encode each speech segment: slice the waveform, compute its linear
    # spectrogram, and pass it through the reference encoder.
    for s, e in segments:
        y = audio[int(hps.data.sampling_rate * s):int(hps.data.sampling_rate * e)]
        y = y.unsqueeze(0)
        y = spectrogram_torch(
            y,
            hps.data.filter_length,
            hps.data.sampling_rate,
            hps.data.hop_length,
            hps.data.win_length,
            center=False,
        ).to(device)
        g = model.ref_enc(y.transpose(1, 2)).unsqueeze(-1)
        gs.append(g.detach())

    # Mean over segments gives the final embedding.
    gs = torch.stack(gs).mean(0)
    return gs.cpu()


def process_audio_folder(input_folder, output_folder, model, device):
    """
    Process all audio files in a folder and its subfolders, saving the
    extracted speaker embeddings as .pt files in the output folder with the
    same directory structure.

    Args:
        input_folder (str): Path to the input folder containing audio files.
        output_folder (str): Path to the output folder for the .pt files.
        model: Pre-trained ToneColorConverter used for feature extraction.
        device: Torch device (e.g., 'cpu' or 'cuda').
    """
    # Collect all audio files recursively (extension check is case-insensitive).
    audio_files = []
    for root, _, files in os.walk(input_folder):
        for file in files:
            if file.lower().endswith(('.wav', '.mp3', '.flac')):
                audio_files.append(os.path.join(root, file))

    for audio_path in tqdm(audio_files, desc="Processing audio files", unit="file"):
        # Mirror the input directory structure under the output folder.
        relative_path = os.path.relpath(os.path.dirname(audio_path), input_folder)
        output_dir = os.path.join(output_folder, relative_path)
        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(
            output_dir, os.path.splitext(os.path.basename(audio_path))[0] + '.pt')

        # Skip files that already have an extracted embedding.
        if os.path.exists(output_path):
            continue

        target_se = se_extractor(audio_path, model).to(device)
        torch.save(target_se, output_path)


if __name__ == '__main__':
    # Load the tone color converter from the v2 checkpoint.
    ckpt_converter = 'checkpoints_v2/converter'
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    model = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
    model.load_ckpt(f'{ckpt_converter}/checkpoint.pth')

    input_folder = '/home/jerry/Projects/Dataset/Speech/vctk_libritts/LibriTTS-R/train-clean-360'
    output_folder = 'spk/LibriTTS-R/train-clean-360/'
    process_audio_folder(input_folder, output_folder, model, device)
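
# Note: input_folder/output_folder are dataset-specific; adjust them before
# running. Saved embeddings can be reloaded with torch.load(<output .pt path>).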