AnhP's picture
Upload 95 files
5c91488 verified
raw
history blame
9.32 kB
import re
import os
import torch
import numpy as np
import scipy.signal as signal
class Generator:
    """Route f0 (pitch track) estimation requests to one of several predictors.

    Predictor libraries are imported lazily inside the method that needs them.
    The FCPE, RMVPE and WORLD predictor objects are cached on the instance
    after first use; in ONNX mode FCPE and RMVPE are released after each call.
    """

    def __init__(self, sample_rate=16000, hop_length=160, f0_min=50, f0_max=1100, is_half=False, device="cpu", providers=None, f0_onnx_mode=False):
        """Store the settings that are forwarded to the predictors.

        Args:
            sample_rate: Audio sample rate handed to the predictors that take one.
            hop_length: Hop size handed to mangio-CREPE, FCPE and the librosa predictors.
            f0_min: Lower pitch bound handed to the predictors that take one.
            f0_max: Upper pitch bound handed to the predictors that take one.
            is_half: Forwarded to RMVPE.
            device: Device used for torch tensors and forwarded to the predictors.
            providers: Forwarded unchanged as ``providers``
                (presumably ONNX Runtime execution providers - confirm).
            f0_onnx_mode: When true, ``.onnx`` model files are loaded and the
                FCPE / RMVPE objects are dropped after every call.
        """
        self.sample_rate = sample_rate
        self.hop_length = hop_length
        self.f0_min = f0_min
        self.f0_max = f0_max
        self.is_half = is_half
        self.device = device
        self.providers = providers
        self.f0_onnx_mode = f0_onnx_mode
        # Frame step in samples; used for the default p_len and as the frame
        # period of the parselmouth / WORLD / SWIPE / CREPE paths.
        self.window = 160

    def calculator(self, f0_method, x, p_len=None, filter_radius=3):
        """Estimate f0 for ``x`` with ``f0_method``.

        Args:
            f0_method: A key understood by ``compute_f0`` or a
                ``hybrid[a+b+...]`` expression.
            x: 1-D audio array.
            p_len: Number of output frames; defaults to ``len(x) // self.window``.
            filter_radius: Median filter size; even values are bumped to the
                next odd number because ``scipy.signal.medfilt`` needs an odd kernel.

        Returns:
            The f0 track produced by the selected predictor.
        """
        if p_len is None:
            p_len = x.shape[0] // self.window
        if filter_radius % 2 == 0:
            filter_radius += 1
        estimator = self.get_f0_hybrid if "hybrid" in f0_method else self.compute_f0
        return estimator(f0_method, x, p_len, filter_radius)

    def _normalize(self, x):
        """Return a float32 copy of ``x`` scaled by the 0.999 quantile of ``|x|``.

        Silent input (quantile of zero) is returned unscaled instead of being
        divided by zero, which would fill the signal with NaN.
        """
        x = x.astype(np.float32)
        peak = np.quantile(np.abs(x), 0.999)
        if peak > 0:
            x /= peak
        return x

    def _interpolate_f0(self, f0):
        """Fill non-positive frames of an f0 track from neighbouring positive frames.

        Leading gaps take the next positive value, trailing gaps repeat the
        last positive value, and inner gaps are ramped between their
        neighbours. The input array is not modified.

        Args:
            f0: 1-D array of per-frame values; values <= 0 are treated as gaps.

        Returns:
            Tuple ``(filled, vuv)``: the filled track and a float32 vector
            that is 1.0 where the input was > 0 and 0.0 elsewhere.
        """
        # Private flat copy: np.reshape would return a view and the fill
        # below would otherwise write into the caller's array.
        ip_data = np.array(f0).reshape(-1)
        frame_number = ip_data.size
        vuv_vector = (ip_data > 0.0).astype(np.float32)

        last_value = 0.0
        i = 0
        while i < frame_number:
            if not ip_data[i] <= 0.0:
                last_value = ip_data[i]
                i += 1
                continue

            # Locate the next positive frame after the gap that starts at i.
            j = i + 1
            while j < frame_number and not ip_data[j] > 0.0:
                j += 1

            if j >= frame_number:
                # No positive frame ahead: hold the last one (0.0 if none was seen).
                ip_data[i:] = last_value
                break

            if last_value > 0.0:
                # NOTE: the ramp divides by the gap length (j - i), so the last
                # gap frame already equals ip_data[j]; kept to preserve the
                # contour shape the rest of the pipeline was built against.
                step = (ip_data[j] - last_value) / float(j - i)
                ip_data[i:j] = last_value + step * np.arange(1, j - i + 1, dtype=ip_data.dtype)
            else:
                ip_data[i:j] = ip_data[j]
            i = j

        return ip_data, vuv_vector

    def _resize_f0(self, x, target_len):
        """Linearly resample an f0 track to ``target_len`` frames.

        Values below 0.001 are turned into NaN before interpolation so they do
        not bleed into neighbouring frames; any NaN in the result is then
        mapped back to 0.
        """
        # Float copy so that integer or list input can hold NaN.
        source = np.array(x, dtype=np.float64)
        source[source < 0.001] = np.nan
        positions = np.arange(0, len(source) * target_len, len(source)) / target_len
        return np.nan_to_num(np.interp(positions, np.arange(0, len(source)), source))

    def compute_f0(self, f0_method, x, p_len, filter_radius):
        """Run the single predictor named by ``f0_method``.

        Raises:
            KeyError: If ``f0_method`` is not a known method name.
        """
        crepe_sizes = ("tiny", "small", "medium", "large", "full")
        dispatch = {
            "pm": lambda: self.get_f0_pm(x, p_len),
            "dio": lambda: self.get_f0_pyworld(x, p_len, filter_radius, "dio"),
            "harvest": lambda: self.get_f0_pyworld(x, p_len, filter_radius, "harvest"),
            "fcpe": lambda: self.get_f0_fcpe(x, p_len),
            "fcpe-legacy": lambda: self.get_f0_fcpe(x, p_len, legacy=True),
            "rmvpe": lambda: self.get_f0_rmvpe(x, p_len),
            "rmvpe-legacy": lambda: self.get_f0_rmvpe(x, p_len, legacy=True),
            "yin": lambda: self.get_f0_yin(x, p_len, mode="yin"),
            "pyin": lambda: self.get_f0_yin(x, p_len, mode="pyin"),
            "swipe": lambda: self.get_f0_swipe(x, p_len),
        }
        # size=size binds the loop value now; a plain closure would see only the last one.
        for size in crepe_sizes:
            dispatch["mangio-crepe-" + size] = lambda size=size: self.get_f0_mangio_crepe(x, p_len, size)
            dispatch["crepe-" + size] = lambda size=size: self.get_f0_crepe(x, p_len, size)

        try:
            handler = dispatch[f0_method]
        except KeyError:
            raise KeyError(f"Unknown f0 method: {f0_method!r}") from None
        return handler()

    def get_f0_hybrid(self, methods_str, x, p_len, filter_radius):
        """Combine several predictors given as ``hybrid[a+b+...]``.

        Each listed method is run on the normalised audio, its track is
        resampled to ``p_len`` frames, and the per-frame NaN-ignoring median
        over all tracks is returned (the single track itself if only one
        method is listed).

        Raises:
            ValueError: If ``methods_str`` has no ``hybrid[...]`` part.
            KeyError: If a listed method is unknown.
        """
        match = re.search(r"hybrid\[(.+)\]", methods_str)
        if match is None:
            raise ValueError(f"Expected a method of the form 'hybrid[a+b]', got {methods_str!r}")
        methods = [name.strip() for name in match.group(1).split("+")]

        x = self._normalize(x)

        resampled_stack = []
        for method in methods:
            f0 = self.compute_f0(method, x, p_len, filter_radius)
            # Sample over the valid index range [0, len - 1] so that a track
            # that already has p_len frames passes through unchanged.
            positions = np.linspace(0, len(f0) - 1, p_len)
            resampled_stack.append(np.interp(positions, np.arange(len(f0)), f0))

        if len(resampled_stack) == 1:
            return resampled_stack[0]
        return np.nanmedian(np.vstack(resampled_stack), axis=0)

    def get_f0_pm(self, x, p_len):
        """Estimate f0 with parselmouth's autocorrelation pitch tracker.

        A track shorter than ``p_len`` is zero-padded on both sides up to
        ``p_len`` frames before the gaps are filled.
        """
        import parselmouth

        sound = parselmouth.Sound(x, self.sample_rate)
        pitch = sound.to_pitch_ac(time_step=self.window / self.sample_rate, voicing_threshold=0.6, pitch_floor=self.f0_min, pitch_ceiling=self.f0_max)
        f0 = pitch.selected_array["frequency"]

        missing = p_len - len(f0)
        pad_size = (missing + 1) // 2
        if pad_size > 0 or missing - pad_size > 0:
            f0 = np.pad(f0, [[pad_size, missing - pad_size]], mode="constant")
        return self._interpolate_f0(f0)[0]

    def get_f0_mangio_crepe(self, x, p_len, model="full"):
        """Estimate f0 with the project CREPE predictor on peak-normalised audio."""
        from main.library.predictors.CREPE import predict

        x = self._normalize(x)
        audio = torch.unsqueeze(torch.from_numpy(x).to(self.device, copy=True), dim=0)
        if audio.ndim == 2 and audio.shape[0] > 1:
            audio = torch.mean(audio, dim=0, keepdim=True).detach()

        f0 = predict(audio.detach(), self.sample_rate, self.hop_length, self.f0_min, self.f0_max, model, batch_size=self.hop_length * 2, device=self.device, pad=True, providers=self.providers, onnx=self.f0_onnx_mode)
        f0 = f0.squeeze(0).cpu().float().numpy()
        return self._interpolate_f0(self._resize_f0(f0, p_len))[0]

    def get_f0_crepe(self, x, p_len, model="full"):
        """Estimate f0 with the project CREPE predictor and periodicity gating.

        The pitch is smoothed with ``mean(..., 3)``, the periodicity with
        ``median(..., 3)``, and frames whose periodicity is below 0.1 are zeroed.
        """
        from main.library.predictors.CREPE import predict, mean, median

        audio = torch.tensor(np.copy(x))[None].float()
        f0, pd = predict(audio, self.sample_rate, self.window, self.f0_min, self.f0_max, model, batch_size=512, device=self.device, return_periodicity=True, providers=self.providers, onnx=self.f0_onnx_mode)
        f0, pd = mean(f0, 3), median(pd, 3)
        f0[pd < 0.1] = 0
        return self._interpolate_f0(self._resize_f0(f0[0].cpu().numpy(), p_len))[0]

    def get_f0_fcpe(self, x, p_len, legacy=False):
        """Estimate f0 with FCPE (or its legacy variant), caching the predictor."""
        # The cached predictor is specific to one variant; drop it when the
        # other variant is requested so the wrong model is never reused.
        if hasattr(self, "fcpe") and getattr(self, "_fcpe_legacy", legacy) != legacy:
            del self.fcpe

        if not hasattr(self, "fcpe"):
            from main.library.predictors.FCPE import FCPE

            model_file = ("fcpe_legacy" if legacy else "fcpe") + (".onnx" if self.f0_onnx_mode else ".pt")
            self.fcpe = FCPE(
                os.path.join("assets", "models", "predictors", model_file),
                hop_length=self.hop_length,
                f0_min=self.f0_min,
                f0_max=self.f0_max,
                dtype=torch.float32,
                device=self.device,
                sample_rate=self.sample_rate,
                threshold=0.03 if legacy else 0.006,
                providers=self.providers,
                onnx=self.f0_onnx_mode,
                legacy=legacy,
            )
            self._fcpe_legacy = legacy

        f0 = self.fcpe.compute_f0(x, p_len)
        if self.f0_onnx_mode:
            del self.fcpe
        return f0

    def get_f0_rmvpe(self, x, p_len, legacy=False):
        """Estimate f0 with RMVPE, caching the predictor between calls."""
        if not hasattr(self, "rmvpe"):
            from main.library.predictors.RMVPE import RMVPE

            model_file = "rmvpe" + (".onnx" if self.f0_onnx_mode else ".pt")
            self.rmvpe = RMVPE(os.path.join("assets", "models", "predictors", model_file), is_half=self.is_half, device=self.device, onnx=self.f0_onnx_mode, providers=self.providers)

        if legacy:
            f0 = self.rmvpe.infer_from_audio_with_pitch(x, thred=0.03, f0_min=self.f0_min, f0_max=self.f0_max)
        else:
            f0 = self.rmvpe.infer_from_audio(x, thred=0.03)

        if self.f0_onnx_mode:
            # Remove the inner model before the wrapper: once self.rmvpe is
            # gone, self.rmvpe.model can no longer be looked up.
            if hasattr(self.rmvpe, "model"):
                del self.rmvpe.model
            del self.rmvpe
        return self._interpolate_f0(self._resize_f0(f0, p_len))[0]

    def get_f0_pyworld(self, x, p_len, filter_radius, model="harvest"):
        """Estimate f0 with WORLD (``harvest`` or ``dio``) refined by stonemask.

        Harvest output is median-filtered when ``filter_radius > 2``; dio
        output is rounded to one decimal.
        """
        if not hasattr(self, "pw"):
            from main.library.predictors.WORLD import PYWORLD

            self.pw = PYWORLD()

        x = x.astype(np.double)
        estimate = self.pw.harvest if model == "harvest" else self.pw.dio
        f0, t = estimate(x, fs=self.sample_rate, f0_ceil=self.f0_max, f0_floor=self.f0_min, frame_period=1000 * self.window / self.sample_rate)
        f0 = self.pw.stonemask(x, self.sample_rate, t, f0)

        if filter_radius > 2 and model == "harvest":
            f0 = signal.medfilt(f0, filter_radius)
        elif model == "dio":
            for index, pitch in enumerate(f0):
                f0[index] = round(pitch, 1)
        return self._interpolate_f0(self._resize_f0(f0, p_len))[0]

    def get_f0_swipe(self, x, p_len):
        """Estimate f0 with the project SWIPE predictor refined by stonemask."""
        from main.library.predictors.SWIPE import swipe, stonemask

        f0, t = swipe(x.astype(np.float32), self.sample_rate, f0_floor=self.f0_min, f0_ceil=self.f0_max, frame_period=1000 * self.window / self.sample_rate)
        return self._interpolate_f0(self._resize_f0(stonemask(x, self.sample_rate, t, f0), p_len))[0]

    def get_f0_yin(self, x, p_len, mode="yin"):
        """Estimate f0 with librosa ``yin`` (``mode="yin"``) or ``pyin`` (any other mode)."""
        from librosa import yin, pyin

        audio = x.astype(np.float32)
        if mode == "yin":
            f0 = yin(audio, sr=self.sample_rate, fmin=self.f0_min, fmax=self.f0_max, hop_length=self.hop_length)
        else:
            # pyin returns a tuple; only its first element (the f0 track) is used.
            f0 = pyin(audio, fmin=self.f0_min, fmax=self.f0_max, sr=self.sample_rate, hop_length=self.hop_length)[0]
        return self._interpolate_f0(self._resize_f0(f0, p_len))[0]