# RVC-Speakers — speakers/processors/rvc_speakers_processor.py
import util
import numpy as np
import librosa
import hashlib
import json
import os
import torch
import logging
from rvc.infer_pack.models import (
SynthesizerTrnMs768NSFsid,
SynthesizerTrnMs768NSFsid_nono
)
from os import getenv
from typing import Union, Tuple, List
from rvc.vc_infer_pipeline import VC
from speakers.processors import BaseProcessor, ProcessorData
from speakers.common.utils import get_abs_path
from omegaconf import OmegaConf
from speakers.common.registry import registry
from pydantic import Field
# Module-level logger; can be swapped at runtime via set_rvc_speakers_logger().
logger = logging.getLogger('speaker_runner')

def set_rvc_speakers_logger(l: logging.Logger) -> None:
    """Replace this module's logger with an externally configured one."""
    global logger
    logger = l
class RvcProcessorData(ProcessorData):
    """
    Payload for RVC voice conversion.

    Parameter semantics follow the upstream WebUI:
    https://github.com/fumiama/Retrieval-based-Voice-Conversion-WebUI/blob/main/infer-web.py#L118 # noqa
    """
    # Sample rate of the input audio in Hz.
    sample_rate: int = Field(
        default=0
    )
    # Raw audio samples as plain floats (converted to float32 by the processor).
    # default_factory avoids sharing one mutable [] literal across models.
    audio_samples: List[float] = Field(
        default_factory=list
    )
    # Index into the processor's list of loaded RVC models.
    model_index: int
    # Pitch shift in semitones (+12 = up one octave, -12 = down one octave).
    f0_up_key: int
    # Name of the pitch (F0) extraction method.
    f0_method: str
    # Blend ratio of retrieved (indexed) features into the output.
    index_rate: float
    # >= 3 applies a median filter of this radius to harvest F0 results,
    # which can soften hoarse/muted artifacts.
    filter_radius: int
    # Mix ratio between the source volume envelope and the output envelope;
    # closer to 1 keeps the output envelope.
    rms_mix_rate: float
    # Post-processing resample rate for the final output; 0 disables resampling.
    resample_sr: int
    # Protects voiceless consonants and breath sounds from electro-artifacts;
    # 0.5 disables protection, lower values protect harder but may reduce
    # the effect of the feature index.
    protect: float = Field(
        default=0.33
    )
    # Optional F0 curve file (one pitch value per line) overriding the default
    # F0 extraction and pitch shift.  None when not supplied.
    f0_file: Union[str, None] = Field(
        default=None
    )

    @property
    def type(self) -> str:
        """Type of the Message, used for serialization."""
        return "RVC"
@registry.register_processor("rvc_speakers")
class RVCSpeakers(BaseProcessor):
    """
    RVC (Retrieval-based Voice Conversion) audio processor.

    Pre-loads a hubert feature-extraction model plus every RVC voice model
    listed in a per-processor config; each loaded entry carries the speaking
    style and inference metadata of a specific character voice.
    """

    def __init__(self, hubert_model_path: str, rvc_config_file: str):
        """
        :param hubert_model_path: path of the hubert checkpoint to load.
        :param rvc_config_file: OmegaConf config listing the RVC models.
        """
        # Reference: https://huggingface.co/spaces/zomehwh/rvc-models/blob/main/app.py#L21 # noqa
        # Hugging Face Spaces sets SYSTEM=spaces; vc_func enforces an input
        # length limit only when running there.
        self.in_hf_space = getenv('SYSTEM') == 'spaces'
        self._loaded_models = []
        self._load_hubert(hubert_model_path=hubert_model_path)
        self._load_rvc_mode(rvc_config_file=rvc_config_file)

    def __call__(
            self,
            data: RvcProcessorData
    ):
        """Run voice conversion on the audio carried by *data*.

        :return: ``(sample_rate, audio)`` tuple from :meth:`vc_func`.
        """
        # Convert the plain Python list into a float32 NumPy array.
        audio_samples_np = np.array(data.audio_samples, dtype=np.float32)
        input_audio = (data.sample_rate, audio_samples_np)
        return self.vc_func(input_audio=input_audio,
                            model_index=data.model_index,
                            f0_up_key=data.f0_up_key,
                            f0_method=data.f0_method,
                            index_rate=data.index_rate,
                            filter_radius=data.filter_radius,
                            rms_mix_rate=data.rms_mix_rate,
                            resample_sr=data.resample_sr,
                            protect=data.protect,
                            f0_file=data.f0_file)

    @classmethod
    def from_config(cls, cfg=None):
        """Build an instance from a config mapping.

        Both configured paths are resolved relative to the registered
        ``rvc_library_root`` directory.

        :raises RuntimeError: if *cfg* is None.
        """
        if cfg is None:
            raise RuntimeError("from_config cfg is None.")
        hubert_model_path = cfg.get("hubert_model_path", "")
        rvc_config_file = cfg.get("rvc_config_file", "")
        return cls(hubert_model_path=os.path.join(registry.get_path("rvc_library_root"),
                                                  hubert_model_path),
                   rvc_config_file=os.path.join(registry.get_path("rvc_library_root"),
                                                rvc_config_file))

    def match(self, data: ProcessorData):
        """Return True when *data* is addressed to this processor type."""
        return "RVC" in data.type

    @property
    def loaded_models(self):
        """Loaded model entries (dicts with name/metadata/vc/net_g/if_f0/target_sr)."""
        return self._loaded_models

    def _load_hubert(self, hubert_model_path: str):
        """Load the hubert feature extractor onto the registered device."""
        logger.info(f'Load hubert model{hubert_model_path}')
        self.hubert_model = util.load_hubert_model(registry.get("device"), model_path=hubert_model_path)
        self.hubert_model.eval()
        logger.info('Loaded hubert model')

    def _load_rvc_mode(self, rvc_config_file: str):
        """
        Load every RVC model listed in the config file.

        :param rvc_config_file: OmegaConf file exposing ``rmvpe_path`` and a
            ``models`` list of ``{key: {model_name, path}}`` entries.
        :return: None; populates ``self._loaded_models``.
        """
        logger.info('Models Load:rvc_speakers')
        multi_cfg = OmegaConf.load(get_abs_path(rvc_config_file))
        rmvpe_path = os.path.join(registry.get_path("rvc_library_root"), multi_cfg.get("rmvpe_path"))
        logger.info(f'rmvpe_path:{rmvpe_path}')
        for item in multi_cfg.get('models'):
            for key, model_info in item.items():
                logger.info(f'Loading model: {key}')
                model_name = model_info.get("model_name")
                # Per-model metadata lives in a config.json next to the weights.
                model_info_config_file = os.path.join(registry.get_path("rvc_library_root"),
                                                      model_info.get("path"),
                                                      'config.json')
                logger.info(f'Loading model model_info_config_file: {model_info_config_file}')
                # Context manager so the handle is always closed (the previous
                # json.load(open(...)) leaked the file descriptor).
                with open(model_info_config_file, 'r') as config_fp:
                    model_info_config = json.load(config_fp)
                # Load the RVC checkpoint.
                torch_file = os.path.join(registry.get_path("rvc_library_root"),
                                          model_info.get("path"),
                                          model_info_config['model'])
                # NOTE(review): torch.load unpickles arbitrary objects — only
                # load checkpoints from trusted sources.
                cpt = torch.load(
                    torch_file,
                    map_location='cpu'
                )
                # Last config entry is the model's native sample rate.
                tgt_sr = cpt['config'][-1]
                cpt['config'][-3] = cpt['weight']['emb_g.weight'].shape[0]  # n_spk
                # f0 flag selects the pitch-conditioned synthesizer variant.
                if_f0 = cpt.get('f0', 1)
                net_g: Union[SynthesizerTrnMs768NSFsid, SynthesizerTrnMs768NSFsid_nono]
                if if_f0 == 1:
                    net_g = SynthesizerTrnMs768NSFsid(
                        *cpt['config'],
                        is_half=util.is_half(registry.get("device"))
                    )
                else:
                    net_g = SynthesizerTrnMs768NSFsid_nono(*cpt['config'])
                del net_g.enc_q
                # According to original code, this thing seems necessary.
                logger.info(net_g.load_state_dict(cpt['weight'], strict=False))
                net_g.eval().to(registry.get("device"))
                net_g = net_g.half() if util.is_half(registry.get("device")) else net_g.float()
                vc = VC(tgt_sr,
                        registry.get("x_pad"),
                        registry.get("x_query"),
                        registry.get("x_center"),
                        registry.get("x_max"),
                        registry.get("is_half"),
                        registry.get("device"),
                        rmvpe_path=rmvpe_path
                        )
                self._loaded_models.append(dict(
                    name=model_name,
                    metadata=model_info_config,
                    vc=vc,
                    net_g=net_g,
                    if_f0=if_f0,
                    target_sr=tgt_sr
                ))
        logger.info(f'Models loaded:rvc_speakers, len:{len(self._loaded_models)}')

    def vc_func(
            self,
            input_audio: Tuple[int, np.ndarray], model_index, f0_up_key, f0_method: str, index_rate,
            filter_radius, rms_mix_rate, resample_sr, protect: float = 0.33, f0_file: str = None
    ) -> Tuple[int, np.ndarray]:
        """
        Run the RVC inference pipeline on *input_audio*.

        Parameter semantics follow the upstream WebUI:
        https://github.com/fumiama/Retrieval-based-Voice-Conversion-WebUI/blob/main/infer-web.py#L118 # noqa

        :param input_audio: ``(sample_rate, samples)`` tuple.
        :param model_index: index into :attr:`loaded_models`.
        :param f0_up_key: pitch shift in semitones (+12/-12 = one octave).
        :param f0_method: pitch (F0) extraction method name.
        :param index_rate: blend ratio of retrieved (indexed) features.
        :param filter_radius: >= 3 median-filters harvest F0 output with this
            radius, which can soften hoarse artifacts.
        :param rms_mix_rate: source/output volume-envelope mix; closer to 1
            keeps the output envelope.
        :param resample_sr: post-processing resample rate; 0 disables it.
        :param protect: protects voiceless consonants and breaths; 0.5
            disables, lower protects harder but may weaken the index effect.
        :param f0_file: optional F0 curve file (one pitch per line) replacing
            the default F0 extraction and pitch shift.
        :return: ``(sample_rate, audio)`` of the converted audio.
        :raises RuntimeError: on missing input/model, or over-long input when
            running on Hugging Face Spaces.
        """
        if input_audio is None:
            raise RuntimeError("Please provide input audio.")
        if model_index is None:
            raise RuntimeError("Please select a model.")
        model = self._loaded_models[model_index]
        # Reference: so-vits
        (audio_samp, audio_npy) = input_audio
        # https://huggingface.co/spaces/zomehwh/rvc-models/blob/main/app.py#L49
        # Length limit only applies on Hugging Face Spaces.
        if (audio_npy.shape[0] / audio_samp) > 600 and self.in_hf_space:
            raise RuntimeError("Input audio is longer than 600 secs.")
        # Normalize integer PCM to float32 in [-1, 1].
        # Bloody hell: https://stackoverflow.com/questions/26921836/
        if audio_npy.dtype != np.float32:  # :thonk:
            audio_npy = (
                audio_npy / np.iinfo(audio_npy.dtype).max
            ).astype(np.float32)
        # Downmix multi-channel input to mono.
        if len(audio_npy.shape) > 1:
            audio_npy = librosa.to_mono(audio_npy.transpose(1, 0))
        # hubert expects 16 kHz input.
        if audio_samp != 16000:
            audio_npy = librosa.resample(
                audio_npy,
                orig_sr=audio_samp,
                target_sr=16000
            )
        f0_up_key = int(f0_up_key)
        times = [0, 0, 0]  # [npy, f0, infer] timings filled in by the pipeline
        # Content hash of the input audio, used by the pipeline as a cache key.
        checksum = hashlib.sha512()
        checksum.update(audio_npy.tobytes())
        feat_file_index = ''
        # Use the feature index only when it is configured, exists on disk,
        # and retrieval is actually requested (index_rate != 0).
        if (
                model['metadata']['feat_index'] != ""
                and os.path.exists(model['metadata']['feat_index'])
                and index_rate != 0
        ):
            feat_file_index = model['metadata']['feat_index']
        output_audio = model['vc'].pipeline(
            self.hubert_model,
            model['net_g'],
            model['metadata'].get('speaker_id', 0),
            audio_npy,
            checksum.hexdigest(),
            times,
            f0_up_key,
            f0_method,
            feat_file_index,
            index_rate,
            model['if_f0'],
            filter_radius,
            model['target_sr'],
            resample_sr,
            rms_mix_rate,
            'v2',
            protect,
            f0_file=f0_file
        )
        # Report the requested resample rate only when it is valid (>= 16 kHz)
        # and differs from the model's native rate; otherwise the native rate.
        if 16000 <= resample_sr and resample_sr != model['target_sr']:
            out_sr = resample_sr
        else:
            out_sr = model['target_sr']
        logger.info(f'npy: {times[0]}s, f0: {times[1]}s, infer: {times[2]}s')
        return out_sr, output_audio