import os
import sys
import time
import tqdm
import torch
import shutil
import logging
import argparse
import warnings
import onnxruntime
import logging.handlers
import concurrent.futures

import numpy as np
import torch.multiprocessing as mp

from random import shuffle
from distutils.util import strtobool

sys.path.append(os.getcwd())

from main.configs.config import Config
from main.library.predictors.Generator import Generator
from main.library.utils import check_predictors, check_embedders, load_audio, load_embedders_model

logger = logging.getLogger(__name__)
config = Config()
translations = config.translations
logger.propagate = False

warnings.filterwarnings("ignore")
for l in ["torch", "faiss", "httpx", "httpcore", "faiss.loader", "numba.core", "urllib3", "matplotlib"]:
    logging.getLogger(l).setLevel(logging.ERROR)
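
# CLI for the feature-extraction stage of training. An illustrative invocation
# (values here are examples, and the script path depends on the repo layout):
#   python extract.py --model_name MyModel --sample_rate 48000 --cpu_cores 4 --gpu 0-1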
def parse_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name", type=str, required=True)
    parser.add_argument("--rvc_version", type=str, default="v2")
    parser.add_argument("--f0_method", type=str, default="rmvpe")
    parser.add_argument("--pitch_guidance", type=lambda x: bool(strtobool(x)), default=True)
    parser.add_argument("--hop_length", type=int, default=128)
    parser.add_argument("--cpu_cores", type=int, default=2)
    parser.add_argument("--gpu", type=str, default="-")
    parser.add_argument("--sample_rate", type=int, required=True)
    parser.add_argument("--embedder_model", type=str, default="contentvec_base")
    parser.add_argument("--f0_onnx", type=lambda x: bool(strtobool(x)), default=False)
    parser.add_argument("--embedders_mode", type=str, default="fairseq")

    return parser.parse_args()
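
# Copy the sample-rate-specific training config into the experiment folder unless one already exists.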
def generate_config(rvc_version, sample_rate, model_path):
    config_save_path = os.path.join(model_path, "config.json")
    if not os.path.exists(config_save_path): shutil.copy(os.path.join("main", "configs", rvc_version, f"{sample_rate}.json"), config_save_path)
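
# Write filelist.txt for training: one "wav|feature[|f0|f0nsf]|speaker_id" line per clip plus two mute entries.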
def generate_filelist(pitch_guidance, model_path, rvc_version, sample_rate, embedders_mode = "fairseq"):
    gt_wavs_dir, feature_dir = os.path.join(model_path, "sliced_audios"), os.path.join(model_path, f"{rvc_version}_extracted")
    f0_dir, f0nsf_dir = None, None
    if pitch_guidance: f0_dir, f0nsf_dir = os.path.join(model_path, "f0"), os.path.join(model_path, "f0_voiced")

    gt_wavs_files, feature_files = set(name.split(".")[0] for name in os.listdir(gt_wavs_dir)), set(name.split(".")[0] for name in os.listdir(feature_dir))
    names = gt_wavs_files & feature_files & set(name.split(".")[0] for name in os.listdir(f0_dir)) & set(name.split(".")[0] for name in os.listdir(f0nsf_dir)) if pitch_guidance else gt_wavs_files & feature_files

    options = []
    mute_base_path = os.path.join("assets", "logs", "mute")

    for name in names:
        options.append(f"{gt_wavs_dir}/{name}.wav|{feature_dir}/{name}.npy|{f0_dir}/{name}.wav.npy|{f0nsf_dir}/{name}.wav.npy|0" if pitch_guidance else f"{gt_wavs_dir}/{name}.wav|{feature_dir}/{name}.npy|0")

    mute_audio_path, mute_feature_path = os.path.join(mute_base_path, "sliced_audios", f"mute{sample_rate}.wav"), os.path.join(mute_base_path, f"{rvc_version}_extracted", f"mute{'_spin' if embedders_mode == 'spin' else ''}.npy")
    for _ in range(2):
        options.append(f"{mute_audio_path}|{mute_feature_path}|{os.path.join(mute_base_path, 'f0', 'mute.wav.npy')}|{os.path.join(mute_base_path, 'f0_voiced', 'mute.wav.npy')}|0" if pitch_guidance else f"{mute_audio_path}|{mute_feature_path}|0")

    shuffle(options)
    with open(os.path.join(model_path, "filelist.txt"), "w") as f:
        f.write("\n".join(options))
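
# Resolve the working directories: the feature output folder when a version is given, otherwise the two F0 folders.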
def setup_paths(exp_dir, version = None):
    wav_path = os.path.join(exp_dir, "sliced_audios_16k")
    if version:
        out_path = os.path.join(exp_dir, f"{version}_extracted")
        os.makedirs(out_path, exist_ok=True)
        return wav_path, out_path
    else:
        output_root1, output_root2 = os.path.join(exp_dir, "f0"), os.path.join(exp_dir, "f0_voiced")
        os.makedirs(output_root1, exist_ok=True); os.makedirs(output_root2, exist_ok=True)
        return wav_path, output_root1, output_root2
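
# Pick the ONNX Runtime execution provider: CUDA first, then CoreML, falling back to CPU.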
def get_providers():
    ort_providers = onnxruntime.get_available_providers()

    if "CUDAExecutionProvider" in ort_providers: providers = ["CUDAExecutionProvider"]
    elif "CoreMLExecutionProvider" in ort_providers: providers = ["CoreMLExecutionProvider"]
    else: providers = ["CPUExecutionProvider"]

    return providers
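
# Wraps the F0 generator and the mel-scale quantization used to build coarse pitch targets.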
class FeatureInput:
    def __init__(self, sample_rate=16000, hop_size=160, is_half=False, device=config.device):
        self.fs = sample_rate
        self.hop = hop_size
        self.f0_bin = 256
        self.f0_max = 1100.0
        self.f0_min = 50.0
        self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
        self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
        self.device = device
        self.is_half = is_half
        self.f0_gen = Generator(self.fs, self.hop, self.f0_min, self.f0_max, self.is_half, self.device, get_providers(), False)
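
    # Run the selected F0 predictor over a raw waveform with the requested hop length.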
    def compute_f0(self, np_arr, f0_method, hop_length, f0_onnx=False):
        self.f0_gen.hop_length, self.f0_gen.f0_onnx_mode = hop_length, f0_onnx
        return self.f0_gen.calculator(f0_method, np_arr, None, 0)
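
    # Quantize F0 (Hz) onto a 256-bin mel scale to produce the coarse pitch targets.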
    def coarse_f0(self, f0):
        return np.rint(np.clip(((1127 * np.log(1 + f0 / 700)) - self.f0_mel_min) * (self.f0_bin - 2) / (self.f0_mel_max - self.f0_mel_min) + 1, 1, self.f0_bin - 1)).astype(int)

    def process_file(self, file_info, f0_method, hop_length, f0_onnx):
        inp_path, opt_path1, opt_path2, file_inp = file_info
        if os.path.exists(opt_path1 + ".npy") and os.path.exists(opt_path2 + ".npy"): return

        try:
            feature_pit = self.compute_f0(load_audio(logger, file_inp, self.fs), f0_method, hop_length, f0_onnx)
            if isinstance(feature_pit, tuple): feature_pit = feature_pit[0]
            np.save(opt_path2, feature_pit, allow_pickle=False)
            np.save(opt_path1, self.coarse_f0(feature_pit), allow_pickle=False)
        except Exception as e:
            raise RuntimeError(f"{translations['extract_file_error']} {inp_path}: {e}")
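
    # Extract F0 for a batch of files on one device, fanning the work out over a thread pool.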
    def process_files(self, files, f0_method, hop_length, f0_onnx, device, is_half, threads):
        self.device = device
        self.is_half = is_half

        def worker(file_info):
            self.process_file(file_info, f0_method, hop_length, f0_onnx)

        with tqdm.tqdm(total=len(files), ncols=100, unit="p", leave=True) as pbar:
            with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
                for _ in concurrent.futures.as_completed([executor.submit(worker, f) for f in files]):
                    pbar.update(1)
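
# Stage 1: distribute F0 extraction over the selected devices, one process per device,
# splitting the available CPU threads evenly between them.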
def run_pitch_extraction(exp_dir, f0_method, hop_length, num_processes, devices, f0_onnx, is_half):
    input_root, *output_roots = setup_paths(exp_dir)
    output_root1, output_root2 = output_roots if len(output_roots) == 2 else (output_roots[0], None)
    paths = [(os.path.join(input_root, name), os.path.join(output_root1, name) if output_root1 else None, os.path.join(output_root2, name) if output_root2 else None, os.path.join(input_root, name)) for name in sorted(os.listdir(input_root)) if "spec" not in name]
    start_time = time.time()
    logger.info(translations["extract_f0_method"].format(num_processes=num_processes, f0_method=f0_method))

    feature_input = FeatureInput()
    with concurrent.futures.ProcessPoolExecutor(max_workers=len(devices)) as executor:
        concurrent.futures.wait([executor.submit(feature_input.process_files, paths[i::len(devices)], f0_method, hop_length, f0_onnx, devices[i], is_half, num_processes // len(devices)) for i in range(len(devices))])

    logger.info(translations["extract_f0_success"].format(elapsed_time=f"{(time.time() - start_time):.2f}"))
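
# Run an ONNX embedder and return the output matching the RVC version
# (first output for v1, second for v2) as float32 on the input tensor's device.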
def extract_features(model, feats, version):
    return torch.as_tensor(model.run([model.get_outputs()[0].name, model.get_outputs()[1].name], {"feats": feats.detach().cpu().numpy()})[0 if version == "v1" else 1], dtype=torch.float32, device=feats.device)
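
# Stage 2 worker: load the embedder once per process, then encode each 16 kHz slice
# into speech-representation features (e.g. HuBERT/ContentVec) and save them as .npy.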
def process_file_embedding(files, embedder_model, embedders_mode, device, version, is_half, threads):
    model, embed_suffix = load_embedders_model(embedder_model, embedders_mode, providers=get_providers())
    if embed_suffix != ".onnx": model = model.to(device).to(torch.float16 if is_half else torch.float32).eval()
    threads = max(1, threads)

    def worker(file_info):
        file, out_path = file_info
        out_file_path = os.path.join(out_path, os.path.basename(file.replace("wav", "npy")))
        if os.path.exists(out_file_path): return
        feats = torch.from_numpy(load_audio(logger, file, 16000)).to(device).to(torch.float16 if is_half else torch.float32).view(1, -1)

        with torch.no_grad():
            if embed_suffix == ".pt":
                logits = model.extract_features(**{"source": feats, "padding_mask": torch.BoolTensor(feats.shape).fill_(False).to(device), "output_layer": 9 if version == "v1" else 12})
                feats = model.final_proj(logits[0]) if version == "v1" else logits[0]
            elif embed_suffix == ".onnx": feats = extract_features(model, feats, version).to(device)
            elif embed_suffix == ".safetensors":
                logits = model(feats)["last_hidden_state"]
                feats = (model.final_proj(logits[0]).unsqueeze(0) if version == "v1" else logits)
            else: raise ValueError(translations["option_not_valid"])

        feats = feats.squeeze(0).float().cpu().numpy()
        if not np.isnan(feats).any(): np.save(out_file_path, feats, allow_pickle=False)
        else: logger.warning(f"{file} {translations['NaN']}")

    with tqdm.tqdm(total=len(files), ncols=100, unit="p", leave=True) as pbar:
        with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
            for _ in concurrent.futures.as_completed([executor.submit(worker, f) for f in files]):
                pbar.update(1)
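
# Stage 2: distribute embedding extraction over the selected devices, one process per device.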
def run_embedding_extraction(exp_dir, version, num_processes, devices, embedder_model, embedders_mode, is_half):
    wav_path, out_path = setup_paths(exp_dir, version)
    start_time = time.time()
    logger.info(translations["start_extract_hubert"])
    paths = sorted([(os.path.join(wav_path, file), out_path) for file in os.listdir(wav_path) if file.endswith(".wav")])

    with concurrent.futures.ProcessPoolExecutor(max_workers=len(devices)) as executor:
        concurrent.futures.wait([executor.submit(process_file_embedding, paths[i::len(devices)], embedder_model, embedders_mode, devices[i], version, is_half, num_processes // len(devices)) for i in range(len(devices))])

    logger.info(translations["extract_hubert_success"].format(elapsed_time=f"{(time.time() - start_time):.2f}"))
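
# Orchestrates the whole extraction step: parse arguments, configure logging, run pitch and
# embedding extraction, then write config.json and filelist.txt for the training stage.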
def main():
    args = parse_arguments()
    exp_dir = os.path.join("assets", "logs", args.model_name)
    f0_method, hop_length, num_processes, gpus, version, pitch_guidance, sample_rate, embedder_model, f0_onnx, embedders_mode = args.f0_method, args.hop_length, args.cpu_cores, args.gpu, args.rvc_version, args.pitch_guidance, args.sample_rate, args.embedder_model, args.f0_onnx, args.embedders_mode
    devices = ["cpu"] if gpus == "-" else [f"cuda:{idx}" for idx in gpus.split("-")]
    check_predictors(f0_method, f0_onnx); check_embedders(embedder_model, embedders_mode)

    if logger.hasHandlers(): logger.handlers.clear()
    else:
        console_handler = logging.StreamHandler()
        console_formatter = logging.Formatter(fmt="\n%(asctime)s.%(msecs)03d | %(levelname)s | %(module)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
        console_handler.setFormatter(console_formatter)
        console_handler.setLevel(logging.INFO)
        file_handler = logging.handlers.RotatingFileHandler(os.path.join(exp_dir, "extract.log"), maxBytes=5*1024*1024, backupCount=3, encoding='utf-8')
        file_formatter = logging.Formatter(fmt="\n%(asctime)s.%(msecs)03d | %(levelname)s | %(module)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
        file_handler.setFormatter(file_formatter)
        file_handler.setLevel(logging.DEBUG)
        logger.addHandler(console_handler)
        logger.addHandler(file_handler)
        logger.setLevel(logging.DEBUG)

    log_data = {translations['modelname']: args.model_name, translations['export_process']: exp_dir, translations['f0_method']: f0_method, translations['pretrain_sr']: sample_rate, translations['cpu_core']: num_processes, "Gpu": gpus, "Hop length": hop_length, translations['training_version']: version, translations['extract_f0']: pitch_guidance, translations['hubert_model']: embedder_model, translations["f0_onnx_mode"]: f0_onnx, translations["embed_mode"]: embedders_mode}
    for key, value in log_data.items():
        logger.debug(f"{key}: {value}")

    pid_path = os.path.join(exp_dir, "extract_pid.txt")
    with open(pid_path, "w") as pid_file:
        pid_file.write(str(os.getpid()))

    try:
        run_pitch_extraction(exp_dir, f0_method, hop_length, num_processes, devices, f0_onnx, config.is_half)
        run_embedding_extraction(exp_dir, version, num_processes, devices, embedder_model, embedders_mode, config.is_half)
        generate_config(version, sample_rate, exp_dir)
        generate_filelist(pitch_guidance, exp_dir, version, sample_rate, embedders_mode)
    except Exception as e:
        logger.error(f"{translations['extract_error']}: {e}")
        import traceback
        logger.debug(traceback.format_exc())

    if os.path.exists(pid_path): os.remove(pid_path)
    logger.info(f"{translations['extract_success']} {args.model_name}.")

if __name__ == "__main__":
    mp.set_start_method("spawn", force=True)
    main()