|
import os
|
|
import gc
|
|
import sys
|
|
import time
|
|
import tqdm
|
|
import torch
|
|
import shutil
|
|
import codecs
|
|
import pyworld
|
|
import librosa
|
|
import logging
|
|
import argparse
|
|
import warnings
|
|
import subprocess
|
|
import torchcrepe
|
|
import parselmouth
|
|
import logging.handlers
|
|
|
|
import numpy as np
|
|
import soundfile as sf
|
|
import torch.nn.functional as F
|
|
|
|
from random import shuffle
|
|
from functools import partial
|
|
from multiprocessing import Pool
|
|
from distutils.util import strtobool
|
|
from fairseq import checkpoint_utils
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
|
|
# Project root: the script is expected to run from the repository root so that
# "main.*" packages and the "assets" folder resolve correctly.
now_dir = os.getcwd()

sys.path.append(now_dir)

# Project-local imports must come after the sys.path tweak above.
from main.configs.config import Config

from main.library.predictors.FCPE import FCPE

from main.library.predictors.RMVPE import RMVPE

# Silence wget's own logger; downloads report progress via the subprocess call.
logging.getLogger("wget").setLevel(logging.ERROR)

warnings.filterwarnings("ignore", category=FutureWarning)

warnings.filterwarnings("ignore", category=UserWarning)

# Module logger; propagation is disabled because handlers are attached
# explicitly in the __main__ block below.
logger = logging.getLogger(__name__)

logger.propagate = False

config = Config()

# Localized UI/log strings keyed by message id (language set in Config).
translations = config.translations
|
|
|
|
def _str_to_bool(value: str) -> bool:
    """Parse a truthy/falsy string, mirroring distutils.util.strtobool.

    distutils was removed in Python 3.12 (PEP 632), so the mapping is
    inlined here. Raises ValueError for unrecognized tokens, which argparse
    turns into a usage error.
    """
    value = value.lower()
    if value in ("y", "yes", "t", "true", "on", "1"):
        return True
    if value in ("n", "no", "f", "false", "off", "0"):
        return False
    raise ValueError(f"invalid truth value {value!r}")


def parse_arguments() -> argparse.Namespace:
    """Parse command-line options for the extraction pipeline.

    Returns:
        argparse.Namespace with model_name, rvc_version, f0_method,
        pitch_guidance, hop_length, cpu_cores, gpu, sample_rate and
        embedder_model attributes. (The original annotated `tuple`, but a
        Namespace was always returned.)
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name", type=str, required=True)
    parser.add_argument("--rvc_version", type=str, default="v2")
    parser.add_argument("--f0_method", type=str, default="rmvpe")
    # Accepts "true"/"false"-style strings on the command line.
    parser.add_argument("--pitch_guidance", type=_str_to_bool, default=True)
    parser.add_argument("--hop_length", type=int, default=128)
    parser.add_argument("--cpu_cores", type=int, default=2)
    # "-" means CPU only; otherwise GPU indices joined by "-", e.g. "0-1".
    parser.add_argument("--gpu", type=str, default="-")
    parser.add_argument("--sample_rate", type=int, required=True)
    parser.add_argument("--embedder_model", type=str, default="contentvec_base")

    return parser.parse_args()
|
|
|
|
def load_audio(file, sample_rate):
    """Load an audio file as a flat mono float array at `sample_rate`.

    Args:
        file: Path to the audio file; stray spaces, quotes and newlines
            around the path (e.g. from drag-and-drop input) are stripped.
        sample_rate: Target sample rate; resampled when the file differs.

    Returns:
        1-D numpy array of samples.

    Raises:
        RuntimeError: if the file cannot be read or converted.
    """
    try:
        # One char-set strip replaces the original chain of five strip()
        # calls and also handles arbitrary interleavings of the characters.
        file = file.strip(' "\n')
        audio, sr = sf.read(file)

        # Down-mix multi-channel audio to mono before resampling.
        if len(audio.shape) > 1: audio = librosa.to_mono(audio.T)
        if sr != sample_rate: audio = librosa.resample(audio, orig_sr=sr, target_sr=sample_rate)
    except Exception as e:
        raise RuntimeError(f"{translations['errors_loading_audio']}: {e}") from e

    return audio.flatten()
|
|
|
|
def check_rmvpe_fcpe(method):
    """Download the predictor weights for the chosen f0 method if missing.

    Only "rmvpe" and "fcpe" have downloadable weights; any other method
    is a no-op. The download URL is stored rot13-obfuscated.
    """
    if method not in ("rmvpe", "fcpe"):
        return

    predictors_dir = os.path.join("assets", "model", "predictors")
    weight_name = f"{method}.pt"

    if not os.path.exists(os.path.join(predictors_dir, weight_name)):
        base_url = codecs.decode("uggcf://uhttvatsnpr.pb/NauC/Pbyno_EIP_Cebwrpg_2/erfbyir/znva/", "rot13")
        subprocess.run(["wget", "-q", "--show-progress", "--no-check-certificate", base_url + weight_name, "-P", predictors_dir], check=True)
|
|
|
|
def check_hubert(hubert):
    """Download the requested HuBERT/ContentVec embedder weights if missing.

    Names outside the known set are silently ignored (no download attempted).
    The download URL is stored rot13-obfuscated.
    """
    # Membership test replaces the original five-way "or" chain.
    if hubert in ("contentvec_base", "hubert_base", "japanese_hubert_base", "korean_hubert_base", "chinese_hubert_base"):
        model_path = os.path.join(now_dir, "assets", "model", "embedders", hubert + '.pt')

        if not os.path.exists(model_path):
            subprocess.run(["wget", "-q", "--show-progress", "--no-check-certificate", codecs.decode("uggcf://uhttvatsnpr.pb/NauC/Pbyno_EIP_Cebwrpg_2/erfbyir/znva/", "rot13") + f"{hubert}.pt", "-P", os.path.join("assets", "model", "embedders")], check=True)
|
|
|
|
def generate_config(rvc_version, sample_rate, model_path):
    """Copy the template training config into the model folder (once).

    The template lives at main/configs/<version>/<sample_rate>.json; an
    existing config.json in the model folder is never overwritten.
    """
    destination = os.path.join(model_path, "config.json")

    # Never clobber a config the user may already have in place.
    if os.path.exists(destination):
        return

    template = os.path.join("main", "configs", rvc_version, f"{sample_rate}.json")
    shutil.copy(template, destination)
|
|
|
|
|
|
def generate_filelist(pitch_guidance, model_path, rvc_version, sample_rate):
    """Write the shuffled training filelist (filelist.txt) for the model.

    Each line joins the sliced wav, its extracted feature and — when pitch
    guidance is enabled — the coarse/voiced f0 arrays, followed by speaker
    id 0. Only names present in every required folder are listed. Two "mute"
    entries from the shared assets are always appended.
    """
    gt_wavs_dir = os.path.join(model_path, "sliced_audios")
    feature_dir = os.path.join(model_path, f"{rvc_version}_extracted")

    f0_dir = os.path.join(model_path, "f0") if pitch_guidance else None
    f0nsf_dir = os.path.join(model_path, "f0_voiced") if pitch_guidance else None

    def stems(directory):
        # File names truncated at the first dot (e.g. "x.wav.npy" -> "x").
        return set(entry.split(".")[0] for entry in os.listdir(directory))

    # Keep only samples that have every required artifact.
    names = stems(gt_wavs_dir) & stems(feature_dir)
    if pitch_guidance:
        names &= stems(f0_dir) & stems(f0nsf_dir)

    options = []
    for name in names:
        if pitch_guidance:
            options.append(f"{gt_wavs_dir}/{name}.wav|{feature_dir}/{name}.npy|{f0_dir}/{name}.wav.npy|{f0nsf_dir}/{name}.wav.npy|0")
        else:
            options.append(f"{gt_wavs_dir}/{name}.wav|{feature_dir}/{name}.npy|0")

    mute_base_path = os.path.join(now_dir, "assets", "logs", "mute")
    mute_audio_path = os.path.join(mute_base_path, "sliced_audios", f"mute{sample_rate}.wav")
    mute_feature_path = os.path.join(mute_base_path, f"{rvc_version}_extracted", "mute.npy")

    # Two silence entries are always included in the training list.
    for _ in range(2):
        if pitch_guidance:
            mute_f0_path = os.path.join(mute_base_path, "f0", "mute.wav.npy")
            mute_f0nsf_path = os.path.join(mute_base_path, "f0_voiced", "mute.wav.npy")
            options.append(f"{mute_audio_path}|{mute_feature_path}|{mute_f0_path}|{mute_f0nsf_path}|0")
        else:
            options.append(f"{mute_audio_path}|{mute_feature_path}|0")

    shuffle(options)

    with open(os.path.join(model_path, "filelist.txt"), "w") as f:
        f.write("\n".join(options))
|
|
|
|
def setup_paths(exp_dir, version = None):
    """Resolve (and create) the folders used by the extraction stages.

    With `version` given, returns (wav_path, feature_out_path) for embedding
    extraction; otherwise returns (wav_path, f0_dir, f0_voiced_dir) for
    pitch extraction. Output folders are created when absent; the input
    wav folder is only referenced, never created.
    """
    wav_path = os.path.join(exp_dir, "sliced_audios_16k")

    if version:
        suffix = "v1_extracted" if version == "v1" else "v2_extracted"
        out_path = os.path.join(exp_dir, suffix)
        os.makedirs(out_path, exist_ok=True)
        return wav_path, out_path

    coarse_dir = os.path.join(exp_dir, "f0")
    voiced_dir = os.path.join(exp_dir, "f0_voiced")

    for directory in (coarse_dir, voiced_dir):
        os.makedirs(directory, exist_ok=True)

    return wav_path, coarse_dir, voiced_dir
|
|
|
|
def read_wave(wav_path, normalize = False):
    """Read a 16 kHz wav file into a (1, samples) torch tensor.

    Stereo input is averaged to mono. Half precision is used when the
    global config requests it, and the optional layer-norm mirrors the
    fairseq task's "normalize" setting. Asserts the file is 16 kHz.
    """
    samples, sr = sf.read(wav_path)
    assert sr == 16000, translations["sr_not_16000"]

    tensor = torch.from_numpy(samples).float()

    if config.is_half:
        tensor = tensor.half()
    if tensor.dim() == 2:
        # Average the channel axis down to mono.
        tensor = tensor.mean(-1)

    tensor = tensor.view(1, -1)

    if normalize:
        tensor = F.layer_norm(tensor, tensor.shape)

    return tensor
|
|
|
|
def get_device(gpu_index):
    """Map a GPU index string to a torch device string, defaulting to CPU.

    Accepts "cpu" or a numeric CUDA index; a non-numeric or out-of-range
    index logs a warning and falls back to "cpu".
    """
    if gpu_index == "cpu":
        return "cpu"

    try:
        parsed = int(gpu_index)
    except ValueError:
        logger.warning(translations["gpu_not_valid"])
        return "cpu"

    if parsed < torch.cuda.device_count():
        return f"cuda:{parsed}"

    logger.warning(translations["gpu_not_valid"])
    return "cpu"
|
|
|
|
class FeatureInput:
    """Per-worker f0 (pitch) extractor supporting several estimation methods.

    One instance is bound to a device ("cpu" or "cuda:N") and processes wav
    arrays sampled at `sample_rate` with frame hop `hop_size`. Continuous f0
    is saved alongside a 1..255 mel-quantized "coarse" version.
    """

    def __init__(self, sample_rate=16000, hop_size=160, device="cpu"):
        self.fs = sample_rate
        self.hop = hop_size
        # Quantization: 256 bins; bin 0 is reserved, voiced f0 maps to 1..255.
        self.f0_bin = 256
        self.f0_max = 1100.0
        self.f0_min = 50.0
        # Mel-scale endpoints for coarse_f0 (1127 * ln(1 + f/700)).
        self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
        self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
        self.device = device

    def compute_f0(self, np_arr, f0_method, hop_length):
        """Dispatch to the extractor named by `f0_method`.

        `hop_length` is only honored by the crepe/fcpe methods; pm/dio/
        harvest/rmvpe use the instance's fixed hop. Raises ValueError for
        an unknown method name.
        """
        if f0_method == "pm": return self.get_pm(np_arr)
        elif f0_method == 'dio': return self.get_dio(np_arr)
        elif f0_method == "crepe": return self.get_crepe(np_arr, int(hop_length))
        elif f0_method == "crepe-tiny": return self.get_crepe(np_arr, int(hop_length), "tiny")
        elif f0_method == "fcpe": return self.get_fcpe(np_arr, int(hop_length))
        elif f0_method == "rmvpe": return self.get_rmvpe(np_arr)
        elif f0_method == "harvest": return self.get_harvest(np_arr)
        else: raise ValueError(translations["method_not_valid"])

    def get_pm(self, x):
        """Praat (parselmouth) autocorrelation pitch; padded to x.size // hop frames."""
        # 10 ms analysis step (160 samples at 16 kHz), expressed in ms then /1000 below.
        time_step = 160 / 16000 * 1000
        f0 = (parselmouth.Sound(x, self.fs).to_pitch_ac(time_step=time_step / 1000, voicing_threshold=0.6, pitch_floor=50, pitch_ceiling=1100).selected_array["frequency"])
        # Praat trims edge frames; zero-pad both ends to the expected frame count.
        pad_size = ((x.size // self.hop) - len(f0) + 1) // 2
        if pad_size > 0 or (x.size // self.hop) - len(f0) - pad_size > 0: f0 = np.pad(f0, [[pad_size, (x.size // self.hop) - len(f0) - pad_size]], mode="constant")

        return f0

    def get_dio(self, x):
        """pyworld DIO pitch estimate refined by stonemask."""
        f0, t = pyworld.dio(x.astype(np.double), fs=self.fs, f0_ceil=self.f0_max, f0_floor=self.f0_min, frame_period=1000 * self.hop / self.fs)
        f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.fs)

        return f0

    def get_crepe(self, x, hop_length, model="full"):
        """torchcrepe pitch, resampled to x.size // hop frames."""
        audio = torch.from_numpy(x.astype(np.float32)).to(self.device)
        # Normalize by the 99.9th percentile of |x| to tame outliers.
        audio /= torch.quantile(torch.abs(audio), 0.999)
        audio = audio.unsqueeze(0)

        pitch = torchcrepe.predict(audio, self.fs, hop_length, self.f0_min, self.f0_max, model=model, batch_size=hop_length * 2, device=self.device, pad=True)

        source = pitch.squeeze(0).cpu().float().numpy()
        # Mark near-zero (unvoiced) frames as NaN so interpolation skips them.
        source[source < 0.001] = np.nan
        # Linear resample from crepe's frame grid onto the expected grid.
        target = np.interp(np.arange(0, len(source) * (x.size // self.hop), len(source)) / (x.size // self.hop), np.arange(0, len(source)), source)

        return np.nan_to_num(target)

    def get_fcpe(self, x, hop_length):
        """FCPE pitch estimate.

        NOTE(review): the FCPE model is loaded from disk on every call and
        freed immediately after — presumably to bound memory per worker;
        confirm before caching it.
        """
        self.model_fcpe = FCPE(os.path.join("assets", "model", "predictors", "fcpe.pt"), hop_length=int(hop_length), f0_min=self.f0_min, f0_max=self.f0_max, dtype=torch.float32, device=self.device, sample_rate=self.fs, threshold=0.03)
        f0 = self.model_fcpe.compute_f0(x, p_len=(x.size // self.hop))

        del self.model_fcpe
        gc.collect()
        return f0

    def get_rmvpe(self, x):
        """RMVPE pitch estimate (model reloaded on every call, like get_fcpe)."""
        self.model_rmvpe = RMVPE(os.path.join("assets", "model", "predictors", "rmvpe.pt"), is_half=False, device=self.device)
        return self.model_rmvpe.infer_from_audio(x, thred=0.03)

    def get_harvest(self, x):
        """pyworld Harvest pitch estimate refined by stonemask."""
        f0, t = pyworld.harvest(x.astype(np.double), fs=self.fs, f0_ceil=self.f0_max, f0_floor=self.f0_min, frame_period=1000 * self.hop / self.fs)
        f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.fs)
        return f0

    def coarse_f0(self, f0):
        """Quantize continuous f0 (Hz) to integer mel bins in [1, f0_bin - 1]."""
        f0_mel = 1127 * np.log(1 + f0 / 700)
        # Affine map of the mel range onto bins 1..255, clipped at both ends.
        f0_mel = np.clip((f0_mel - self.f0_mel_min) * (self.f0_bin - 2) / (self.f0_mel_max - self.f0_mel_min) + 1, 1, self.f0_bin - 1)
        return np.rint(f0_mel).astype(int)

    def process_file(self, file_info, f0_method, hop_length):
        """Extract and save both f0 variants for one pre-loaded audio entry.

        `file_info` is (input_path, coarse_out_path, voiced_out_path, audio
        array). Skips work when both outputs already exist (np.save appends
        ".npy"). Wraps any failure in RuntimeError naming the input file.
        """
        inp_path, opt_path1, opt_path2, np_arr = file_info

        if os.path.exists(opt_path1 + ".npy") and os.path.exists(opt_path2 + ".npy"): return

        try:
            feature_pit = self.compute_f0(np_arr, f0_method, hop_length)
            # Continuous (voiced) f0 first, then the quantized coarse version.
            np.save(opt_path2, feature_pit, allow_pickle=False)
            coarse_pit = self.coarse_f0(feature_pit)
            np.save(opt_path1, coarse_pit, allow_pickle=False)
        except Exception as e:
            raise RuntimeError(f"{translations['extract_file_error']} {inp_path}: {e}")

    def process_files(self, files, f0_method, hop_length, pbar):
        """Process a shard of files sequentially, advancing `pbar` per file."""
        for file_info in files:
            self.process_file(file_info, f0_method, hop_length)
            pbar.update()
|
|
|
|
def run_pitch_extraction(exp_dir, f0_method, hop_length, num_processes, gpus):
    """Extract f0 (coarse + voiced) for every sliced 16 kHz wav in `exp_dir`.

    When `gpus` is e.g. "0-1" the files are round-robin sharded across one
    FeatureInput per GPU and run on threads; otherwise a CPU multiprocessing
    pool of `num_processes` workers is used.
    """
    input_root, *output_roots = setup_paths(exp_dir)

    if len(output_roots) == 2:
        output_root1, output_root2 = output_roots
    else:
        output_root1 = output_roots[0]
        output_root2 = None

    # NOTE(review): every wav is decoded up front, so peak memory scales with
    # the dataset size — confirm acceptable for large datasets.
    paths = [
        (
            os.path.join(input_root, name),
            os.path.join(output_root1, name) if output_root1 else None,
            os.path.join(output_root2, name) if output_root2 else None,
            load_audio(os.path.join(input_root, name), 16000),
        )
        for name in sorted(os.listdir(input_root))
        if "spec" not in name
    ]

    logger.info(translations["extract_f0_method"].format(num_processes=num_processes, f0_method=f0_method))
    start_time = time.time()

    if gpus != "-":
        gpus = gpus.split("-")
        num_gpus = len(gpus)

        process_partials = []
        pbar = tqdm.tqdm(total=len(paths), desc=translations["extract_f0"])

        for idx, gpu in enumerate(gpus):
            device = get_device(gpu)
            feature_input = FeatureInput(device=device)

            # Round-robin shard: GPU idx handles every num_gpus-th file.
            part_paths = paths[idx::num_gpus]
            process_partials.append((feature_input, part_paths))

        with ThreadPoolExecutor() as executor:
            futures = [executor.submit(FeatureInput.process_files, feature_input, part_paths, f0_method, hop_length, pbar) for feature_input, part_paths in process_partials]

            # BUG FIX: process_files already advances the bar once per file;
            # the original also called pbar.update(1) per completed future,
            # overcounting the total by len(gpus). Here we only wait and
            # surface worker exceptions.
            for future in as_completed(futures):
                future.result()

        pbar.close()
    else:
        feature_input = FeatureInput(device="cpu")

        with tqdm.tqdm(total=len(paths), desc=translations["extract_f0"]) as pbar:
            with Pool(processes=num_processes) as pool:
                process_file_partial = partial(feature_input.process_file, f0_method=f0_method, hop_length=hop_length)

                for _ in pool.imap_unordered(process_file_partial, paths):
                    pbar.update(1)

    elapsed_time = time.time() - start_time
    logger.info(translations["extract_f0_success"].format(elapsed_time=f"{elapsed_time:.2f}"))
|
|
|
|
def process_file_embedding(file, wav_path, out_path, model, device, version, saved_cfg):
    """Run the HuBERT/ContentVec model over one wav and save features as .npy.

    Skips files whose output already exists. v1 models use layer 9 plus the
    final projection; v2 uses layer 12 directly. Outputs containing NaN are
    discarded with a warning instead of being written.
    """
    source_path = os.path.join(wav_path, file)
    target_path = os.path.join(out_path, file.replace("wav", "npy"))

    if os.path.exists(target_path):
        return

    audio = read_wave(source_path, normalize=saved_cfg.task.normalize)

    # fp16 on CUDA, fp32 on CPU.
    dtype = torch.float16 if device.startswith("cuda") else torch.float32
    audio = audio.to(dtype).to(device)

    # All-False mask (no padding), cast to the model's dtype as before.
    mask = torch.BoolTensor(audio.shape).fill_(False).to(dtype).to(device)
    layer = 9 if version == "v1" else 12

    with torch.no_grad():
        model = model.to(device).to(dtype)
        logits = model.extract_features(source=audio, padding_mask=mask, output_layer=layer)
        hidden = model.final_proj(logits[0]) if version == "v1" else logits[0]

    result = hidden.squeeze(0).float().cpu().numpy()

    if np.isnan(result).any():
        logger.warning(f"{file} {translations['NaN']}")
    else:
        np.save(target_path, result, allow_pickle=False)
|
|
|
|
def run_embedding_extraction(exp_dir, version, gpus, embedder_model):
    """Extract HuBERT/ContentVec embeddings for every sliced 16 kHz wav.

    Loads the embedder checkpoint via fairseq, then processes each file
    sequentially. Exits with code 1 when no wav files are found; wraps
    checkpoint-load failures in ImportError and per-file failures in
    RuntimeError.
    """
    wav_path, out_path = setup_paths(exp_dir, version)

    logger.info(translations["start_extract_hubert"])
    start_time = time.time()

    try:
        models, saved_cfg, _ = checkpoint_utils.load_model_ensemble_and_task([os.path.join(now_dir, "assets", "model", "embedders", embedder_model + '.pt')], suffix="")
    except Exception as e:
        raise ImportError(translations["read_model_error"].format(e=e))

    model = models[0]
    # "-"-joined GPU indices, or a single CPU device when gpus == "-".
    devices = [get_device(gpu) for gpu in (gpus.split("-") if gpus != "-" else ["cpu"])]

    paths = sorted([file for file in os.listdir(wav_path) if file.endswith(".wav")])

    if not paths:
        logger.warning(translations["not_found_audio_file"])
        sys.exit(1)

    pbar = tqdm.tqdm(total=len(paths) * len(devices), desc=translations["extract_hubert"])

    # NOTE(review): this is a full file x device cross product, so with
    # multiple GPUs every file is queued once per device; later duplicates
    # appear to be skipped only because the output file already exists —
    # confirm this is intentional rather than a sharding bug.
    tasks = [(file, wav_path, out_path, model, device, version, saved_cfg) for file in paths for device in devices]

    for task in tasks:
        try:
            process_file_embedding(*task)
        except Exception as e:
            raise RuntimeError(f"{translations['process_error']} {task[0]}: {e}")

        pbar.update(1)

    pbar.close()

    elapsed_time = time.time() - start_time
    logger.info(translations["extract_hubert_success"].format(elapsed_time=f"{elapsed_time:.2f}"))
|
|
|
|
if __name__ == "__main__":
    args = parse_arguments()

    exp_dir = os.path.join("assets", "logs", args.model_name)
    f0_method = args.f0_method
    hop_length = args.hop_length
    num_processes = args.cpu_cores
    gpus = args.gpu
    version = args.rvc_version
    pitch_guidance = args.pitch_guidance
    sample_rate = args.sample_rate
    embedder_model = args.embedder_model

    # Download predictor/embedder weights if they are not present yet.
    check_rmvpe_fcpe(f0_method)
    check_hubert(embedder_model)

    # Both the full-rate and 16 kHz sliced audio folders must contain files
    # (checked in the same order as the original one-line condition).
    for folder in ("sliced_audios", "sliced_audios_16k"):
        folder_path = os.path.join(exp_dir, folder)
        if not any(os.path.isfile(os.path.join(folder_path, f)) for f in os.listdir(folder_path)):
            raise FileNotFoundError("Không tìm thấy dữ liệu được xử lý, vui lòng xử lý lại âm thanh")

    log_file = os.path.join(exp_dir, "extract.log")

    # BUG FIX: the original attached handlers only when none existed, so a
    # logger that already had handlers was cleared and then left with none —
    # with propagate=False all subsequent output was silently dropped.
    # Now stale handlers are cleared and a fresh pair is always attached.
    if logger.hasHandlers():
        logger.handlers.clear()

    console_handler = logging.StreamHandler()
    console_handler.setFormatter(logging.Formatter(fmt="\n%(asctime)s.%(msecs)03d | %(levelname)s | %(module)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S"))
    console_handler.setLevel(logging.INFO)

    # Rotating file log keeps up to 3 backups of 5 MB each.
    file_handler = logging.handlers.RotatingFileHandler(log_file, maxBytes=5*1024*1024, backupCount=3, encoding='utf-8')
    file_handler.setFormatter(logging.Formatter(fmt="\n%(asctime)s.%(msecs)03d | %(levelname)s | %(module)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S"))
    file_handler.setLevel(logging.DEBUG)

    logger.addHandler(console_handler)
    logger.addHandler(file_handler)
    logger.setLevel(logging.DEBUG)

    logger.debug(f"{translations['modelname']}: {args.model_name}")
    logger.debug(f"{translations['export_process']}: {exp_dir}")
    logger.debug(f"{translations['f0_method']}: {f0_method}")
    logger.debug(f"{translations['pretrain_sr']}: {sample_rate}")
    logger.debug(f"{translations['cpu_core']}: {num_processes}")
    logger.debug(f"Gpu: {gpus}")
    # Hop length only affects the crepe/fcpe predictors.
    if f0_method in ("crepe", "crepe-tiny", "fcpe"):
        logger.debug(f"Hop length: {hop_length}")
    logger.debug(f"{translations['training_version']}: {version}")
    logger.debug(f"{translations['extract_f0']}: {pitch_guidance}")
    logger.debug(f"{translations['hubert_model']}: {embedder_model}")

    try:
        run_pitch_extraction(exp_dir, f0_method, hop_length, num_processes, gpus)
        run_embedding_extraction(exp_dir, version, gpus, embedder_model)

        generate_config(version, sample_rate, exp_dir)
        generate_filelist(pitch_guidance, exp_dir, version, sample_rate)
    except Exception as e:
        logger.error(f"{translations['extract_error']}: {e}")

    logger.info(f"{translations['extract_success']} {args.model_name}.")