# NOTE(review): removed non-code artifact lines captured from a web blame view
# (file-size header, git commit hashes, and a line-number gutter).
import gradio as gr
import os
import subprocess
import sys
import logging

import numpy as np

# Build the monotonic_align Cython extension in place before importing the
# project modules that depend on it.  A list-form subprocess call with an
# explicit cwd replaces the original shell string passed to os.system
# (no shell quoting/injection concerns, and the current interpreter is used
# instead of whatever "python" is on PATH).  The return code is ignored,
# matching the original os.system call's behavior.
subprocess.run(
    [sys.executable, "setup.py", "build_ext", "--inplace"],
    cwd="monotonic_align",
    check=False,
)

# Silence numba's chatty INFO-level logging (pulled in via librosa).
numba_logger = logging.getLogger('numba')
numba_logger.setLevel(logging.WARNING)

import librosa
import torch

# Project-local modules (require the extension built above).
import commons
import utils
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence
def resize2d(source, target_len):
    """Resample a 1-D f0 contour to ``target_len`` points by linear interpolation.

    Values below 0.001 (unvoiced frames) are masked to NaN before
    interpolation so they are not smeared into neighboring voiced frames,
    then mapped back to 0 in the output.

    Parameters
    ----------
    source : array-like
        1-D array of f0 values (Hz); near-zero entries mark unvoiced frames.
    target_len : int
        Desired output length.

    Returns
    -------
    np.ndarray
        1-D float array of length ``target_len`` with NaNs replaced by 0.

    Notes
    -----
    Fix over the original: works on a copy, so the caller's array is no
    longer mutated in place.
    """
    src = np.asarray(source, dtype=float).copy()
    # Mask unvoiced frames; np.interp propagates NaN through spans that
    # touch them, and nan_to_num turns those spans back into zeros.
    src[src < 0.001] = np.nan
    xs = np.arange(0, len(src) * target_len, len(src)) / target_len
    resampled = np.interp(xs, np.arange(0, len(src)), src)
    return np.nan_to_num(resampled)
def convert_wav_22050_to_f0(audio):
    """Estimate a frame-wise f0 track for 22050 Hz audio using pYIN.

    Unvoiced frames (reported as NaN by pyin) and any non-positive
    estimates come out as 0.
    """
    pitch_track = librosa.pyin(
        audio,
        fmin=librosa.note_to_hz('C0'),
        fmax=librosa.note_to_hz('C7'),
        frame_length=1780,
    )[0]
    # NaN > 0 evaluates to False, so unvoiced frames fall through to zero.
    return np.where(pitch_track > 0, pitch_track, np.zeros_like(pitch_track))
def get_text(text, hps):
    """Convert ``text`` to a LongTensor of symbol ids per the hparams.

    Applies the text cleaners named in ``hps.data.text_cleaners`` and,
    when ``hps.data.add_blank`` is set, intersperses a blank symbol
    (id 0) between tokens.

    Parameters
    ----------
    text : str
        Input text to normalize.
    hps : hparams object
        Must expose ``data.text_cleaners`` and ``data.add_blank``.

    Returns
    -------
    torch.LongTensor
        1-D tensor of symbol ids.
    """
    text_norm = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
    # Fix: removed a leftover debug print of the tensor shape.
    return torch.LongTensor(text_norm)
# Hyper-parameters for two configurations: ljs_base (single speaker; its
# sampling rate is what vc_fn reports for the output audio) and vctk_base
# (multi-speaker; drives the synthesizer construction below).
hps = utils.get_hparams_from_file("configs/ljs_base.json")
hps_ms = utils.get_hparams_from_file("configs/vctk_base.json")
# NOTE(review): segment_size comes from hps_ms but hop_length from hps —
# presumably both configs share the same hop_length; confirm, otherwise
# this should read hps_ms.data.hop_length.
net_g_ms = SynthesizerTrn(
    len(symbols),
    hps_ms.data.filter_length // 2 + 1,
    hps_ms.train.segment_size // hps.data.hop_length,
    n_speakers=hps_ms.data.n_speakers,
    **hps_ms.model)
# NOTE(review): numpy is first used by resize2d at call time, so this late
# import happens to work; it belongs at the top of the file.
import numpy as np
# Soft-HuBERT content encoder (downloaded via torch.hub) and the trained
# generator checkpoint loaded into net_g_ms (optimizer arg None: inference only).
hubert = torch.hub.load("bshall/hubert:main", "hubert_soft")
_ = utils.load_checkpoint("G_312000.pth", net_g_ms, None)
def vc_fn(input_audio, vc_transform):
    """Gradio callback: convert the uploaded voice through the loaded model.

    Parameters
    ----------
    input_audio : tuple[int, np.ndarray] | None
        (sampling_rate, samples) pair as produced by gr.Audio, or None
        when nothing was uploaded.
    vc_transform : float
        Multiplier applied to the extracted f0 contour (pitch shift).

    Returns
    -------
    tuple[str, tuple[int, np.ndarray] | None]
        A status message and, on success, (sampling_rate, audio) for the
        output gr.Audio component.
    """
    if input_audio is None:
        return "You need to upload an audio", None
    sampling_rate, audio = input_audio
    duration = audio.shape[0] / sampling_rate
    if duration > 30:
        return "Error: Audio is too long", None
    # Normalize to float32 in [-1, 1].  gr.Audio usually yields int16, but
    # guard against an already-float array: np.iinfo raises ValueError on
    # float dtypes (fix over the original, which assumed integer samples).
    if np.issubdtype(audio.dtype, np.integer):
        audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
    else:
        audio = audio.astype(np.float32)
    if len(audio.shape) > 1:
        audio = librosa.to_mono(audio.transpose(1, 0))
    # HuBERT expects 16 kHz input; f0 is extracted at 22050 Hz.
    if sampling_rate != 16000:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
    audio22050 = librosa.resample(audio, orig_sr=16000, target_sr=22050)
    f0 = convert_wav_22050_to_f0(audio22050)
    source = torch.FloatTensor(audio).unsqueeze(0).unsqueeze(0)
    with torch.inference_mode():
        units = hubert.units(source)
    soft = units.squeeze(0).numpy()
    # Resize the f0 track to the unit-sequence length and apply the pitch
    # transform; channel 0 of the soft units carries f0/10.
    # NOTE(review): the f0/10 convention is assumed from this code alone —
    # confirm against the model's training pipeline.
    f0 = resize2d(f0, len(soft[:, 0])) * vc_transform
    soft[:, 0] = f0 / 10
    sid = torch.LongTensor([0])  # fixed speaker id 0
    stn_tst = torch.FloatTensor(soft)
    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
        audio = net_g_ms.infer(
            x_tst, x_tst_lengths, sid=sid,
            noise_scale=0.1, noise_scale_w=0.1, length_scale=1,
        )[0][0, 0].data.float().numpy()
    # Fix: removed leftover debug prints (tensor shape, sampling rate).
    return "Success", (hps.data.sampling_rate, audio)
# Minimal single-tab Gradio UI: one audio input, one numeric pitch
# transform, and a convert button wired to vc_fn.
app = gr.Blocks()
with app:
    with gr.Tabs():
        with gr.TabItem("Basic"):
            gr.Markdown(value="""目前模型已更新2.0,新模型模型的 [在线Demo](https://huggingface.co/spaces/innnky/nyaru-svc2.0)
自己制作数据集并训练模型一键脚本 [b站专栏](https://www.bilibili.com/read/cv18548051)
""")
            audio_in = gr.Audio(label="Input Audio (30s limitation)")
            transform_in = gr.Number(label="transform", value=1.0)
            convert_btn = gr.Button("Convert", variant="primary")
            msg_out = gr.Textbox(label="Output Message")
            audio_out = gr.Audio(label="Output Audio")
            convert_btn.click(vc_fn, [audio_in, transform_in], [msg_out, audio_out])
app.launch()