Spaces:

innnky
/

nanami

App Files Files Community

nanami / app.py

innnky

init

9075be8 about 2 years ago

raw

history blame

3.65 kB

	import gradio as gr

	import torch,pdb
	import numpy as np
	import soundfile as sf
	from models import SynthesizerTrn256
	from scipy.io import wavfile
	from fairseq import checkpoint_utils
	import pyworld,librosa
	import torch.nn.functional as F


	# Run inference on the GPU when CUDA is usable, otherwise stay on the CPU.
	device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

	# Content-feature model: loaded through fairseq's checkpoint utilities.
	# Only the first model of the returned ensemble is used below.
	model_path = "checkpoint_best_legacy_500.pt"
	print(f"load model(s) from {model_path}")
	models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
	    [model_path], suffix=""
	)
	model = models[0].to(device)
	model.eval()

	# Synthesizer: built with fixed hyper-parameters, then restored from
	# "qihai.pt". The positional arguments are passed exactly as SynthesizerTrn256
	# expects them (see models.py for their meaning).
	net_g = SynthesizerTrn256(
	    513, 40, 192, 192, 768, 2, 6, 3, 0.1, "1",
	    [3, 7, 11],
	    [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
	    [10, 4, 2, 2, 2],
	    512,
	    [16, 16, 4, 4, 4],
	    0,
	)
	# NOTE(review): torch.load unpickles the file; only use trusted weight files.
	# The weights are mapped onto the CPU first and moved to `device` afterwards.
	weights = torch.load("qihai.pt", map_location=torch.device("cpu"))
	net_g.load_state_dict(weights, strict=True)
	net_g.eval()
	net_g.to(device)


	def get_f0(x, f0_up_key=0):
	    """Estimate a per-frame pitch track and quantize it into coarse bins.

	    The waveform is analysed with ``pyworld.dio`` and refined with
	    ``pyworld.stonemask`` at a fixed 16 kHz rate and a 10 ms frame period.
	    The resulting F0 curve is transposed by ``f0_up_key`` semitones, mapped
	    onto the mel scale and rounded into integer bins in ``[1, 255]``.
	    Frames whose F0 is 0 (presumably how WORLD reports unvoiced frames) end
	    up in bin 1.

	    Args:
	        x: Waveform samples as a NumPy array; cast to float64 before
	            analysis and treated as sampled at 16 kHz.
	        f0_up_key: Pitch shift in semitones (12 doubles the frequency).

	    Returns:
	        ``numpy.ndarray`` of ``int64`` bin indices, one per analysis frame.
	    """
	    # Frequency range that is spread linearly (in mel) over bins 1..255.
	    f0_min = 50.0
	    f0_max = 1100.0
	    f0_mel_min = 1127 * np.log(1 + f0_min / 700)
	    f0_mel_max = 1127 * np.log(1 + f0_max / 700)

	    # pyworld works on float64; convert once and reuse for both passes.
	    signal = x.astype(np.double)
	    # NOTE(review): the search ceiling (800 Hz) is lower than f0_max
	    # (1100 Hz), so the top bins are only reachable after transposition.
	    # Kept as in the original; confirm it matches the training setup.
	    f0, t = pyworld.dio(signal, fs=16000, f0_ceil=800, frame_period=10)
	    f0 = pyworld.stonemask(signal, f0, t, 16000)

	    # Transpose by the requested number of semitones.
	    f0 *= pow(2, f0_up_key / 12)

	    f0_mel = 1127 * np.log(1 + f0 / 700)
	    # Only frames with a positive pitch are rescaled; zeros stay at 0 and
	    # are lifted to bin 1 by the clip below.
	    voiced = f0_mel > 0
	    f0_mel[voiced] = (f0_mel[voiced] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1
	    f0_mel = np.clip(f0_mel, 1, 255)

	    # np.int was removed in NumPy 1.24; use an explicit 64-bit integer type.
	    return np.rint(f0_mel).astype(np.int64)



	def _prepare_audio(audio, sampling_rate):
	    """Return `audio` as a mono float32 waveform resampled to 16 kHz."""
	    if np.issubdtype(audio.dtype, np.integer):
	        # Integer PCM: scale by the dtype's full-scale value.
	        audio = audio / np.iinfo(audio.dtype).max
	    # Floating-point input is passed through unchanged (np.iinfo would raise
	    # on it); assumes it is already normalised to [-1, 1] -- TODO confirm.
	    audio = audio.astype(np.float32)
	    if audio.ndim > 1:
	        # Multi-channel input is transposed before down-mixing; presumably
	        # it arrives as (samples, channels) -- verify against Gradio.
	        audio = librosa.to_mono(audio.transpose(1, 0))
	    if sampling_rate != 16000:
	        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
	    return audio


	def vc_fn(input_audio, f0_up_key):
	    """Convert an uploaded clip with the loaded models (Gradio callback).

	    Args:
	        input_audio: ``(sampling_rate, samples)`` tuple, or ``None`` when
	            nothing was uploaded. ``samples`` is a NumPy array whose first
	            axis is the sample index.
	        f0_up_key: Pitch shift in semitones; ``None`` (an empty number
	            field) is treated as 0.

	    Returns:
	        ``(message, audio)``. On success ``message`` is ``"Success"`` and
	        ``audio`` is ``(32000, waveform)``; otherwise ``message`` explains
	        the problem and ``audio`` is ``None``.
	    """
	    if input_audio is None:
	        return "You need to upload an audio", None
	    sampling_rate, audio = input_audio
	    if audio.shape[0] == 0:
	        # Nothing to convert; avoid failing deep inside the pitch tracker.
	        return "You need to upload an audio", None
	    duration = audio.shape[0] / sampling_rate
	    if duration > 45:
	        return "请上传小于45s的音频，需要转换长音频请使用colab", None
	    if f0_up_key is None:
	        f0_up_key = 0

	    audio = _prepare_audio(audio, sampling_rate)
	    pitch = get_f0(audio, f0_up_key)

	    # The feature model takes a (1, samples) batch and an all-False padding mask.
	    feats = torch.from_numpy(audio).float().view(1, -1)
	    padding_mask = torch.zeros(feats.shape, dtype=torch.bool)
	    inputs = {
	        "source": feats.to(device),
	        "padding_mask": padding_mask.to(device),
	        "output_layer": 9,  # layer 9
	    }
	    with torch.no_grad():
	        logits = model.extract_features(**inputs)
	        feats = model.final_proj(logits[0])
	        # Double the number of frames along the time axis.
	        feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)

	        # Trim features and pitch to a common length, capped at 10000 frames
	        # because larger inputs run out of GPU memory.
	        p_len = min(feats.shape[1], 10000, pitch.shape[0])
	        feats = feats[:, :p_len, :]
	        # as_tensor accepts any integer dtype coming back from get_f0.
	        pitch = torch.as_tensor(pitch[:p_len], dtype=torch.long).unsqueeze(0).to(device)
	        lengths = torch.LongTensor([p_len]).to(device)

	        audio = net_g.infer(feats, lengths, pitch)[0][0, 0].detach().cpu().float().numpy()

	    return "Success", (32000, audio)


	# Gradio UI: a single "Basic" tab with an audio upload, a pitch-shift field
	# and a button that runs vc_fn, showing its status message and audio result.
	app = gr.Blocks()
	with app:
	    with gr.Tabs():
	        with gr.TabItem("Basic"):
	            gr.Markdown(value="""""")
	            vc_input3 = gr.Audio(label="上传音频（长度小于45秒）")
	            # Start at 0 semitones so an untouched field does not hand
	            # None to vc_fn, which divides this value by 12.
	            f0_up_key = gr.Number(label="变调", value=0)
	            vc_submit = gr.Button("转换", variant="primary")
	            vc_output1 = gr.Textbox(label="Output Message")
	            vc_output2 = gr.Audio(label="Output Audio")
	        # Inputs map to vc_fn(input_audio, f0_up_key); outputs to (message, audio).
	        vc_submit.click(vc_fn, [vc_input3, f0_up_key], [vc_output1, vc_output2])

	app.launch()