# File size: 9,320 Bytes
# 5c91488
import re
import os
import torch
import numpy as np
import scipy.signal as signal
class Generator:
    """Dispatch fundamental-frequency (f0) estimation to one of several predictors.

    Each ``get_f0_*`` method wraps one predictor and returns a 1-D f0 curve.
    Most of them resample the curve to ``p_len`` frames and fill unvoiced
    frames through ``_interpolate_f0``.
    """

    def __init__(self, sample_rate=16000, hop_length=160, f0_min=50, f0_max=1100, is_half=False, device="cpu", providers=None, f0_onnx_mode=False):
        """Store the analysis settings; predictors are loaded lazily on first use.

        Args:
            sample_rate: Sample rate handed to every predictor.
            hop_length: Hop size handed to the CREPE (mangio), FCPE and YIN predictors.
            f0_min: Lower pitch bound handed to the predictors.
            f0_max: Upper pitch bound handed to the predictors.
            is_half: Forwarded to the RMVPE constructor.
            device: Torch device used by the neural predictors.
            providers: Forwarded to the ONNX-capable predictors.
            f0_onnx_mode: Select the ``.onnx`` model files instead of ``.pt`` and
                drop FCPE/RMVPE models after every call.
        """
        self.sample_rate = sample_rate
        self.hop_length = hop_length
        self.f0_min = f0_min
        self.f0_max = f0_max
        self.is_half = is_half
        self.device = device
        self.providers = providers
        self.f0_onnx_mode = f0_onnx_mode
        # Frame step in samples used for the default p_len and the frame period.
        self.window = 160

    def calculator(self, f0_method, x, p_len=None, filter_radius=3):
        """Estimate f0 for ``x`` with ``f0_method``.

        Args:
            f0_method: A key accepted by ``compute_f0`` or a ``hybrid[a+b+...]`` spec.
            x: Audio samples as a NumPy array.
            p_len: Number of output frames; defaults to ``x.shape[0] // self.window``.
            filter_radius: Median filter size; even values are bumped to the
                next odd number before being passed on.

        Returns:
            The f0 curve produced by the selected predictor(s).
        """
        if p_len is None:
            p_len = x.shape[0] // self.window

        if filter_radius % 2 == 0:
            filter_radius += 1

        estimator = self.get_f0_hybrid if "hybrid" in f0_method else self.compute_f0
        return estimator(f0_method, x, p_len, filter_radius)

    @staticmethod
    def _normalize(x):
        """Return a float32 copy of ``x`` scaled by the 0.999 quantile of ``|x|``.

        Empty or all-zero input is returned unscaled so no NaN/inf is produced.
        """
        x = x.astype(np.float32)
        if x.size == 0:
            return x

        scale = np.quantile(np.abs(x), 0.999)
        if scale > 0:
            x /= scale
        return x

    def _interpolate_f0(self, f0):
        """Fill unvoiced (``<= 0``) frames of ``f0``.

        Leading gaps take the next voiced value, interior gaps are ramped from
        the previous voiced value to the next one, and trailing gaps repeat the
        last voiced value. The input array is left untouched.

        Returns:
            Tuple ``(filled_f0, vuv)`` of 1-D arrays; ``vuv`` is float32 with
            1.0 where the input frame was ``> 0`` and 0.0 elsewhere.
        """
        # Work on a private column-shaped copy so the caller's array is never
        # modified. The column layout keeps single-row arithmetic in the
        # array's own dtype.
        data = np.array(f0).reshape(-1, 1)
        vuv_vector = (data > 0.0).astype(np.float32)

        frame_number = data.shape[0]
        last_value = 0.0

        for i in range(frame_number):
            if data[i] <= 0.0:
                # Find the next voiced frame; j stays on the last index when
                # there is none.
                j = i + 1
                for j in range(i + 1, frame_number):
                    if data[j] > 0.0:
                        break

                # NOTE(review): a voiced frame found exactly at the last index
                # is treated like "no voiced frame ahead" by this test. Kept
                # as-is to preserve existing pitch curves.
                if j < frame_number - 1:
                    if last_value > 0.0:
                        step = (data[j] - data[i - 1]) / float(j - i)
                        for k in range(i, j):
                            data[k] = data[i - 1] + step * (k - i + 1)
                    else:
                        for k in range(i, j):
                            data[k] = data[j]
                else:
                    for k in range(i, frame_number):
                        data[k] = last_value
            else:
                last_value = float(data[i, 0])

        return data[:, 0], vuv_vector[:, 0]

    def _resize_f0(self, x, target_len):
        """Linearly resample ``x`` to ``target_len`` frames.

        Values below 0.001 are treated as missing (NaN) during interpolation
        and come back as 0 in the result.
        """
        source = np.array(x)
        source[source < 0.001] = np.nan

        positions = np.arange(0, len(source) * target_len, len(source)) / target_len
        return np.nan_to_num(np.interp(positions, np.arange(0, len(source)), source))

    def compute_f0(self, f0_method, x, p_len, filter_radius):
        """Run the single predictor named by ``f0_method``.

        Raises:
            KeyError: If ``f0_method`` is not a known predictor name.
        """
        dispatch = {
            "pm": lambda: self.get_f0_pm(x, p_len),
            "dio": lambda: self.get_f0_pyworld(x, p_len, filter_radius, "dio"),
            "mangio-crepe-tiny": lambda: self.get_f0_mangio_crepe(x, p_len, "tiny"),
            "mangio-crepe-small": lambda: self.get_f0_mangio_crepe(x, p_len, "small"),
            "mangio-crepe-medium": lambda: self.get_f0_mangio_crepe(x, p_len, "medium"),
            "mangio-crepe-large": lambda: self.get_f0_mangio_crepe(x, p_len, "large"),
            "mangio-crepe-full": lambda: self.get_f0_mangio_crepe(x, p_len, "full"),
            "crepe-tiny": lambda: self.get_f0_crepe(x, p_len, "tiny"),
            "crepe-small": lambda: self.get_f0_crepe(x, p_len, "small"),
            "crepe-medium": lambda: self.get_f0_crepe(x, p_len, "medium"),
            "crepe-large": lambda: self.get_f0_crepe(x, p_len, "large"),
            "crepe-full": lambda: self.get_f0_crepe(x, p_len, "full"),
            "fcpe": lambda: self.get_f0_fcpe(x, p_len),
            "fcpe-legacy": lambda: self.get_f0_fcpe(x, p_len, legacy=True),
            "rmvpe": lambda: self.get_f0_rmvpe(x, p_len),
            "rmvpe-legacy": lambda: self.get_f0_rmvpe(x, p_len, legacy=True),
            "harvest": lambda: self.get_f0_pyworld(x, p_len, filter_radius, "harvest"),
            "yin": lambda: self.get_f0_yin(x, p_len, mode="yin"),
            "pyin": lambda: self.get_f0_yin(x, p_len, mode="pyin"),
            "swipe": lambda: self.get_f0_swipe(x, p_len),
        }

        runner = dispatch.get(f0_method)
        if runner is None:
            # Same exception type as a plain dict lookup, with a usable message.
            raise KeyError(f"Unknown f0 method {f0_method!r}; expected one of {sorted(dispatch)}")
        return runner()

    def get_f0_hybrid(self, methods_str, x, p_len, filter_radius):
        """Combine several predictors by taking the per-frame median.

        Args:
            methods_str: Spec of the form ``hybrid[method1+method2+...]``.
            x: Audio samples; normalised on a copy before use.
            p_len: Number of output frames.
            filter_radius: Forwarded to ``compute_f0``.

        Raises:
            ValueError: If ``methods_str`` does not contain ``hybrid[...]``.
        """
        match = re.search(r"hybrid\[(.+)\]", methods_str)
        if match is None:
            raise ValueError(f"Invalid hybrid f0 spec {methods_str!r}; expected 'hybrid[method1+method2]'")

        methods = [method.strip() for method in match.group(1).split("+")]
        x = self._normalize(x)

        resampled_stack = []
        for method in methods:
            f0 = self.compute_f0(method, x, p_len, filter_radius)
            # Bring every curve onto the same p_len grid before stacking.
            resampled_stack.append(np.interp(np.linspace(0, len(f0), p_len), np.arange(len(f0)), f0))

        if len(resampled_stack) == 1:
            return resampled_stack[0]
        return np.nanmedian(np.vstack(resampled_stack), axis=0)

    def get_f0_pm(self, x, p_len):
        """Estimate f0 with parselmouth's ``to_pitch_ac`` and pad it towards ``p_len``."""
        import parselmouth

        pitch = parselmouth.Sound(x, self.sample_rate).to_pitch_ac(
            time_step=self.window / self.sample_rate,
            voicing_threshold=0.6,
            pitch_floor=self.f0_min,
            pitch_ceiling=self.f0_max,
        )
        f0 = pitch.selected_array["frequency"]

        # Zero-pad both ends when the curve is shorter than p_len; a longer
        # curve is passed on unchanged.
        pad_size = (p_len - len(f0) + 1) // 2
        pad_tail = p_len - len(f0) - pad_size
        if pad_size > 0 or pad_tail > 0:
            f0 = np.pad(f0, [[pad_size, pad_tail]], mode="constant")

        return self._interpolate_f0(f0)[0]

    def get_f0_mangio_crepe(self, x, p_len, model="full"):
        """Estimate f0 with the project's CREPE ``predict`` on normalised audio, using ``self.hop_length``."""
        from main.library.predictors.CREPE import predict

        x = self._normalize(x)
        audio = torch.unsqueeze(torch.from_numpy(x).to(self.device, copy=True), dim=0)
        if audio.ndim == 2 and audio.shape[0] > 1:
            audio = torch.mean(audio, dim=0, keepdim=True).detach()

        f0 = predict(
            audio.detach(),
            self.sample_rate,
            self.hop_length,
            self.f0_min,
            self.f0_max,
            model,
            batch_size=self.hop_length * 2,
            device=self.device,
            pad=True,
            providers=self.providers,
            onnx=self.f0_onnx_mode,
        )
        f0 = f0.squeeze(0).cpu().float().numpy()
        return self._interpolate_f0(self._resize_f0(f0, p_len))[0]

    def get_f0_crepe(self, x, p_len, model="full"):
        """Estimate f0 with the project's CREPE ``predict``, zeroing frames whose periodicity is below 0.1."""
        from main.library.predictors.CREPE import predict, mean, median

        f0, pd = predict(
            torch.tensor(np.copy(x))[None].float(),
            self.sample_rate,
            self.window,
            self.f0_min,
            self.f0_max,
            model,
            batch_size=512,
            device=self.device,
            return_periodicity=True,
            providers=self.providers,
            onnx=self.f0_onnx_mode,
        )
        f0, pd = mean(f0, 3), median(pd, 3)
        f0[pd < 0.1] = 0
        return self._interpolate_f0(self._resize_f0(f0[0].cpu().numpy(), p_len))[0]

    def get_f0_fcpe(self, x, p_len, legacy=False):
        """Estimate f0 with FCPE, loading the legacy or current model as requested.

        The loaded model is cached on the instance and reloaded when ``legacy``
        differs from the variant that was cached. In ONNX mode the model is
        dropped after every call.
        """
        cached_legacy = getattr(self, "_fcpe_legacy", legacy)
        if not hasattr(self, "fcpe") or cached_legacy != legacy:
            from main.library.predictors.FCPE import FCPE

            model_name = ("fcpe_legacy" if legacy else "fcpe") + (".onnx" if self.f0_onnx_mode else ".pt")
            self.fcpe = FCPE(
                os.path.join("assets", "models", "predictors", model_name),
                hop_length=self.hop_length,
                f0_min=self.f0_min,
                f0_max=self.f0_max,
                dtype=torch.float32,
                device=self.device,
                sample_rate=self.sample_rate,
                threshold=0.03 if legacy else 0.006,
                providers=self.providers,
                onnx=self.f0_onnx_mode,
                legacy=legacy,
            )
            self._fcpe_legacy = legacy

        f0 = self.fcpe.compute_f0(x, p_len)

        if self.f0_onnx_mode:
            del self.fcpe
        return f0

    def get_f0_rmvpe(self, x, p_len, legacy=False):
        """Estimate f0 with RMVPE; ``legacy`` selects ``infer_from_audio_with_pitch``.

        In ONNX mode the model is dropped after every call.
        """
        if not hasattr(self, "rmvpe"):
            from main.library.predictors.RMVPE import RMVPE

            model_name = "rmvpe" + (".onnx" if self.f0_onnx_mode else ".pt")
            self.rmvpe = RMVPE(
                os.path.join("assets", "models", "predictors", model_name),
                is_half=self.is_half,
                device=self.device,
                onnx=self.f0_onnx_mode,
                providers=self.providers,
            )

        if legacy:
            f0 = self.rmvpe.infer_from_audio_with_pitch(x, thred=0.03, f0_min=self.f0_min, f0_max=self.f0_max)
        else:
            f0 = self.rmvpe.infer_from_audio(x, thred=0.03)

        if self.f0_onnx_mode:
            # Removing the wrapper is enough; touching self.rmvpe.model after
            # the attribute is gone would raise AttributeError.
            del self.rmvpe

        return self._interpolate_f0(self._resize_f0(f0, p_len))[0]

    def get_f0_pyworld(self, x, p_len, filter_radius, model="harvest"):
        """Estimate f0 with the project's PYWORLD wrapper (harvest or dio) refined by stonemask.

        Harvest output is median-filtered when ``filter_radius > 2``; dio
        output is rounded to one decimal.
        """
        if not hasattr(self, "pw"):
            from main.library.predictors.WORLD import PYWORLD

            self.pw = PYWORLD()

        x = x.astype(np.double)
        frame_period = 1000 * self.window / self.sample_rate
        extract = self.pw.harvest if model == "harvest" else self.pw.dio

        f0, t = extract(x, fs=self.sample_rate, f0_ceil=self.f0_max, f0_floor=self.f0_min, frame_period=frame_period)
        f0 = self.pw.stonemask(x, self.sample_rate, t, f0)

        if filter_radius > 2 and model == "harvest":
            f0 = signal.medfilt(f0, filter_radius)
        elif model == "dio":
            f0 = np.round(f0, 1)

        return self._interpolate_f0(self._resize_f0(f0, p_len))[0]

    def get_f0_swipe(self, x, p_len):
        """Estimate f0 with the project's SWIPE implementation refined by its stonemask."""
        from main.library.predictors.SWIPE import swipe, stonemask

        f0, t = swipe(
            x.astype(np.float32),
            self.sample_rate,
            f0_floor=self.f0_min,
            f0_ceil=self.f0_max,
            frame_period=1000 * self.window / self.sample_rate,
        )
        return self._interpolate_f0(self._resize_f0(stonemask(x, self.sample_rate, t, f0), p_len))[0]

    def get_f0_yin(self, x, p_len, mode="yin"):
        """Estimate f0 with librosa ``yin`` (``mode="yin"``) or ``pyin`` (any other mode)."""
        from librosa import yin, pyin

        audio = x.astype(np.float32)
        if mode == "yin":
            f0 = yin(audio, sr=self.sample_rate, fmin=self.f0_min, fmax=self.f0_max, hop_length=self.hop_length)
        else:
            # pyin returns a tuple; only its first element (f0) is used.
            f0 = pyin(audio, fmin=self.f0_min, fmax=self.f0_max, sr=self.sample_rate, hop_length=self.hop_length)[0]

        return self._interpolate_f0(self._resize_f0(f0, p_len))[0]