# Hugging Face Spaces page header ("Spaces: Running") - web-scrape residue, not part of the module.
import hashlib
import json
import logging
import os
from os import getenv
from typing import List, Optional, Tuple, Union

import librosa
import numpy as np
import torch
from omegaconf import OmegaConf
from pydantic import Field

import util
from rvc.infer_pack.models import (
    SynthesizerTrnMs768NSFsid,
    SynthesizerTrnMs768NSFsid_nono
)
from rvc.vc_infer_pipeline import VC
from speakers.common.registry import registry
from speakers.common.utils import get_abs_path
from speakers.processors import BaseProcessor, ProcessorData
# Module-wide logger; hosts can swap it via set_rvc_speakers_logger().
logger = logging.getLogger('speaker_runner')


def set_rvc_speakers_logger(l: logging.Logger) -> None:
    """Replace the module-level ``logger`` used by this module.

    :param l: logger object that subsequent log calls in this module go to.
    """
    global logger
    logger = l
class RvcProcessorData(ProcessorData):
    """Request payload for a single RVC voice-conversion call.

    The field meanings follow the upstream WebUI inference entry point:
    https://github.com/fumiama/Retrieval-based-Voice-Conversion-WebUI/blob/main/infer-web.py#L118  # noqa

    :param sample_rate: sample rate of ``audio_samples``.
    :param audio_samples: raw input audio samples.
    :param model_index: index of the loaded model to run.
    :param f0_up_key: pitch shift (integer number of semitones; +12 raises
        one octave, -12 lowers one octave).
    :param f0_method: pitch-extraction method name, forwarded to the pipeline.
    :param index_rate: proportion of retrieved (index) features.
    :param filter_radius: if >= 3, median-filter the harvest pitch result;
        the value is the filter radius. Can reduce muffled sound.
    :param rms_mix_rate: blend ratio for replacing the output volume envelope
        with the input source's envelope; the closer to 1, the more the
        output envelope is used.
    :param resample_sr: post-processing resample to this final sample rate;
        0 means no resampling.
    :param protect: protects voiceless consonants and breath sounds against
        artifacts such as electronic tearing; 0.5 (maximum) disables it,
        lower values protect more but may reduce the index effect.
    :param f0_file: optional F0 curve file, one pitch per line; replaces the
        default F0 and pitch shift.
    """

    # Sample rate of ``audio_samples``; the processor resamples to 16000.
    sample_rate: int = Field(
        default=0
    )
    # Raw samples; default_factory avoids sharing one list object as default.
    audio_samples: List[float] = Field(
        default_factory=list
    )
    # Index into the processor's list of loaded models.
    model_index: int
    # Pitch shift in semitones (+12 one octave up, -12 one octave down).
    f0_up_key: int
    # Pitch-extraction method name, passed through to the pipeline.
    f0_method: str
    # Proportion of retrieved (index) features.
    index_rate: float
    # >= 3 applies median filtering (this radius) to the harvest pitch result.
    filter_radius: int
    # Envelope blend ratio; closer to 1 uses more of the output envelope.
    rms_mix_rate: float
    # Final post-processing sample rate; 0 disables resampling.
    resample_sr: int
    # Consonant/breath protection; 0.5 disables, lower protects more.
    protect: float = Field(
        default=0.33
    )
    # Optional F0 curve file (one pitch per line); None means not supplied.
    f0_file: Optional[str] = Field(
        default=None
    )

    @property
    def type(self) -> str:
        """Type of the Message, used for serialization.

        Exposed as a property because the processor's ``match`` reads it as
        an attribute (``"RVC" in data.type``), not as a call.
        """
        return "RVC"
class RVCSpeakers(BaseProcessor):
    """RVC voice-conversion processor.

    Audio processors share an abstract Processor base; each one is configured
    separately and pre-loads its models. Different processors carry the
    speaking style and parameters of specific voices.
    """

    def __init__(self, hubert_model_path: str, rvc_config_file: str):
        """Load the hubert model and every RVC model listed in the config.

        :param hubert_model_path: path of the hubert checkpoint.
        :param rvc_config_file: path of the multi-model RVC config file.
        """
        # NOTE(review): the base-class initialiser is not called here;
        # confirm whether BaseProcessor needs ``super().__init__()``.
        # Reference: https://huggingface.co/spaces/zomehwh/rvc-models/blob/main/app.py#L21  # noqa
        self.in_hf_space = getenv('SYSTEM') == 'spaces'
        self._loaded_models = []
        self._load_hubert(hubert_model_path=hubert_model_path)
        self._load_rvc_mode(rvc_config_file=rvc_config_file)

    def __call__(
            self,
            data: RvcProcessorData
    ) -> Tuple[int, np.ndarray]:
        """Run voice conversion for one request.

        :param data: request payload; its fields map 1:1 onto ``vc_func``.
        :return: ``(sample_rate, audio)`` as returned by ``vc_func``.
        """
        # Convert the Python list into a float32 NumPy array.
        audio_samples_np = np.array(data.audio_samples, dtype=np.float32)
        input_audio = (data.sample_rate, audio_samples_np)

        return self.vc_func(input_audio=input_audio,
                            model_index=data.model_index,
                            f0_up_key=data.f0_up_key,
                            f0_method=data.f0_method,
                            index_rate=data.index_rate,
                            filter_radius=data.filter_radius,
                            rms_mix_rate=data.rms_mix_rate,
                            resample_sr=data.resample_sr,
                            protect=data.protect,
                            f0_file=data.f0_file)

    @classmethod
    def from_config(cls, cfg=None):
        """Build a processor from a config mapping.

        Both configured paths are resolved relative to the registry's
        ``rvc_library_root``.

        :param cfg: mapping providing ``hubert_model_path`` and
            ``rvc_config_file``.
        :raises RuntimeError: if ``cfg`` is None.
        """
        if cfg is None:
            raise RuntimeError("from_config cfg is None.")

        hubert_model_path = cfg.get("hubert_model_path", "")
        rvc_config_file = cfg.get("rvc_config_file", "")
        library_root = registry.get_path("rvc_library_root")

        return cls(hubert_model_path=os.path.join(library_root,
                                                  hubert_model_path),
                   rvc_config_file=os.path.join(library_root,
                                                rvc_config_file))

    def match(self, data: ProcessorData):
        """Return True when ``data`` is an RVC payload.

        ``type`` may be exposed as a property or as a plain method depending
        on the payload class, so a callable is invoked before the test.
        """
        data_type = data.type
        if callable(data_type):
            data_type = data_type()
        return "RVC" in data_type

    def loaded_models(self):
        """Return the list of loaded model entries (dicts)."""
        return self._loaded_models

    def _load_hubert(self, hubert_model_path: str):
        """Load the hubert model onto the registry device, in eval mode."""
        # Load hubert model
        logger.info(f'Load hubert model{hubert_model_path}')
        self.hubert_model = util.load_hubert_model(registry.get("device"), model_path=hubert_model_path)
        self.hubert_model.eval()
        logger.info('Loaded hubert model')

    def _load_rvc_mode(self, rvc_config_file: str):
        """Load every RVC model described in the multi-model config.

        For each model this reads its ``config.json``, loads the checkpoint,
        builds the synthesizer and a ``VC`` pipeline, and appends an entry to
        ``self._loaded_models``.

        :param rvc_config_file: path of the multi-model config file.
        """
        # Load models
        logger.info('Models Load:rvc_speakers')
        library_root = registry.get_path("rvc_library_root")
        device = registry.get("device")

        multi_cfg = OmegaConf.load(get_abs_path(rvc_config_file))
        rmvpe_path = os.path.join(library_root, multi_cfg.get("rmvpe_path"))
        logger.info(f'rmvpe_path:{rmvpe_path}')

        for item in multi_cfg.get('models'):
            for key, model_info in item.items():  # iterate key/value pairs
                logger.info(f'Loading model: {key}')
                model_name = model_info.get("model_name")
                model_dir = model_info.get("path")

                # Load model info
                model_info_config_file = os.path.join(library_root,
                                                      model_dir,
                                                      'config.json')
                logger.info(f'Loading model model_info_config_file: {model_info_config_file}')
                # Context manager so the file handle is always closed.
                with open(model_info_config_file, 'r', encoding='utf-8') as config_fp:
                    model_info_config = json.load(config_fp)

                # Load RVC checkpoint
                torch_file = os.path.join(library_root,
                                          model_dir,
                                          model_info_config['model'])
                # NOTE(review): torch.load unpickles the checkpoint; only
                # load checkpoint files from trusted sources.
                cpt = torch.load(
                    torch_file,
                    map_location='cpu'
                )
                tgt_sr = cpt['config'][-1]
                cpt['config'][-3] = cpt['weight']['emb_g.weight'].shape[0]  # n_spk
                if_f0 = cpt.get('f0', 1)

                net_g: Union[SynthesizerTrnMs768NSFsid, SynthesizerTrnMs768NSFsid_nono]
                if if_f0 == 1:
                    net_g = SynthesizerTrnMs768NSFsid(
                        *cpt['config'],
                        is_half=util.is_half(device)
                    )
                else:
                    net_g = SynthesizerTrnMs768NSFsid_nono(*cpt['config'])

                del net_g.enc_q

                # According to original code, this thing seems necessary.
                logger.info(net_g.load_state_dict(cpt['weight'], strict=False))

                net_g.eval().to(device)
                net_g = net_g.half() if util.is_half(device) else net_g.float()

                vc = VC(tgt_sr,
                        registry.get("x_pad"),
                        registry.get("x_query"),
                        registry.get("x_center"),
                        registry.get("x_max"),
                        registry.get("is_half"),
                        device,
                        rmvpe_path=rmvpe_path
                        )

                self._loaded_models.append(dict(
                    name=model_name,
                    metadata=model_info_config,
                    vc=vc,
                    net_g=net_g,
                    if_f0=if_f0,
                    target_sr=tgt_sr
                ))

        logger.info(f'Models loaded:rvc_speakers, len:{len(self._loaded_models)}')

    def vc_func(
            self,
            input_audio: Tuple[int, np.ndarray], model_index, f0_up_key, f0_method: str, index_rate,
            filter_radius, rms_mix_rate, resample_sr, protect: float = 0.33, f0_file: Optional[str] = None
    ) -> Tuple[int, np.ndarray]:
        """Convert ``input_audio`` with one of the loaded RVC models.

        # https://github.com/fumiama/Retrieval-based-Voice-Conversion-WebUI/blob/main/infer-web.py#L118  # noqa

        :param input_audio: ``(sample_rate, samples)`` tuple.
        :param model_index: index into the loaded models.
        :param f0_up_key: pitch shift (integer number of semitones; +12
            raises one octave, -12 lowers one octave).
        :param f0_method: pitch-extraction method, forwarded to the pipeline.
        :param index_rate: proportion of retrieved (index) features; 0
            disables the feature index.
        :param filter_radius: if >= 3, median-filter the harvest pitch result;
            the value is the filter radius. Can reduce muffled sound.
        :param rms_mix_rate: blend ratio for replacing the output volume
            envelope with the input source's envelope; the closer to 1, the
            more the output envelope is used.
        :param resample_sr: post-processing resample to this final sample
            rate; 0 means no resampling.
        :param protect: protects voiceless consonants and breath sounds
            against artifacts such as electronic tearing; 0.5 (maximum)
            disables it, lower values protect more but may reduce the index
            effect.
        :param f0_file: optional F0 curve file, one pitch per line; replaces
            the default F0 and pitch shift.
        :return: ``(output_sample_rate, output_audio)``.
        :raises RuntimeError: if the audio or model is missing, the sample
            rate is not positive, the audio is empty, or (inside a Hugging
            Face Space) the audio is longer than 600 seconds.
        """
        if input_audio is None:
            raise RuntimeError("Please provide input audio.")
        if model_index is None:
            raise RuntimeError("Please select a model.")

        model = self._loaded_models[model_index]

        # Reference: so-vits
        (audio_samp, audio_npy) = input_audio

        # A zero/negative rate would otherwise surface as ZeroDivisionError
        # in the duration check below.
        if audio_samp is None or audio_samp <= 0:
            raise RuntimeError("Input audio sample rate must be a positive integer.")
        audio_npy = np.asarray(audio_npy)
        if audio_npy.size == 0:
            raise RuntimeError("Please provide input audio.")

        # https://huggingface.co/spaces/zomehwh/rvc-models/blob/main/app.py#L49
        # Can be change well, we will see
        if (audio_npy.shape[0] / audio_samp) > 600 and self.in_hf_space:
            raise RuntimeError("Input audio is longer than 600 secs.")

        # Bloody hell: https://stackoverflow.com/questions/26921836/
        if np.issubdtype(audio_npy.dtype, np.integer):
            # Integer PCM: scale by the dtype's maximum value.
            audio_npy = (
                audio_npy / np.iinfo(audio_npy.dtype).max
            ).astype(np.float32)
        elif audio_npy.dtype != np.float32:
            # Other dtypes (e.g. float64) have no iinfo; just cast.
            audio_npy = audio_npy.astype(np.float32)

        if len(audio_npy.shape) > 1:
            audio_npy = librosa.to_mono(audio_npy.transpose(1, 0))

        if audio_samp != 16000:
            audio_npy = librosa.resample(
                audio_npy,
                orig_sr=audio_samp,
                target_sr=16000
            )

        f0_up_key = int(f0_up_key)

        # Passed to the pipeline and logged below as npy / f0 / infer times.
        times = [0, 0, 0]

        checksum = hashlib.sha512()
        checksum.update(audio_npy.tobytes())

        # Use the feature index only if it is configured, retrieval is
        # enabled and the index file exists.
        feat_file_index = ''
        configured_index = model['metadata'].get('feat_index') or ''
        if (
                configured_index
                and index_rate != 0
                and os.path.exists(configured_index)
        ):
            feat_file_index = configured_index

        target_sr = model['target_sr']
        output_audio = model['vc'].pipeline(
            self.hubert_model,
            model['net_g'],
            model['metadata'].get('speaker_id', 0),
            audio_npy,
            checksum.hexdigest(),
            times,
            f0_up_key,
            f0_method,
            feat_file_index,
            index_rate,
            model['if_f0'],
            filter_radius,
            target_sr,
            resample_sr,
            rms_mix_rate,
            'v2',
            protect,
            f0_file=f0_file
        )

        # Report resample_sr only when it is at least 16000 and differs from
        # the model's rate; otherwise report the model's rate.
        if resample_sr >= 16000 and resample_sr != target_sr:
            out_sr = resample_sr
        else:
            out_sr = target_sr

        logger.info(f'npy: {times[0]}s, f0: {times[1]}s, infer: {times[2]}s')
        return out_sr, output_audio