Spaces:

ccds
/

vits_onnx

Runtime error

vits_onnx / export /vits /inference_onnx.py

chocolatedesue

init

223aff6 over 2 years ago

5.49 kB

	# Copyright (c) 2022, Yongqiang Li ([email protected])
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	import argparse
	from text import text_to_sequence
	import numpy as np
	from scipy.io import wavfile
	import torch
	import json
	import commons
	import utils
	import sys
	import pathlib

	# onnxruntime is the only inference backend this script uses; exit with a
	# readable hint (status 1) instead of a traceback when it is not installed.
	try:
	    import onnxruntime as ort
	except ImportError:
	    print('Please install onnxruntime!')
	    sys.exit(1)


	def to_numpy(tensor: torch.Tensor):
	    """Convert a torch tensor to a NumPy array.

	    The tensor is always detached from the autograd graph and moved to host
	    memory before conversion. Calling ``.cpu()`` only when ``requires_grad``
	    is set would let a CUDA tensor without gradients reach ``.numpy()``
	    directly, which raises.

	    Args:
	        tensor: Tensor on any device, with or without gradient tracking.

	    Returns:
	        A ``numpy.ndarray`` holding the tensor's values.
	    """
	    # ``.cpu()`` is a no-op for tensors that already live on the CPU.
	    return tensor.detach().cpu().numpy()


	def get_args(argv=None):
	    """Parse the command-line options of the ONNX inference script.

	    Args:
	        argv: Optional list of argument strings to parse. ``None`` (the
	            default) makes argparse read ``sys.argv[1:]``.

	    Returns:
	        An ``argparse.Namespace`` with the attributes ``onnx_model``, ``cfg``,
	        ``outdir`` (defaults to ``"onnx_output"``) and ``test_file``.
	    """
	    parser = argparse.ArgumentParser(description='inference')
	    parser.add_argument('--onnx_model', required=True, help='onnx model')
	    parser.add_argument('--cfg', required=True, help='config file')
	    parser.add_argument('--outdir', default="onnx_output",
	                        help='output directory')
	    parser.add_argument('--test_file', required=True, help='test file')
	    return parser.parse_args(argv)


	def get_symbols_from_json(path):
	    """Load the ``symbols`` entry from a JSON config file.

	    Args:
	        path: Path of the JSON file to read.

	    Returns:
	        The value stored under the top-level ``"symbols"`` key.

	    Raises:
	        FileNotFoundError: If ``path`` is not an existing regular file.
	        KeyError: If the JSON object has no ``"symbols"`` key.
	    """
	    # Explicit check rather than ``assert``: asserts are stripped under
	    # ``python -O`` and would let a bad path through silently.
	    if not pathlib.Path(path).is_file():
	        raise FileNotFoundError('config file not found: {}'.format(path))
	    # Read as UTF-8 so non-ASCII symbols do not depend on the platform's
	    # default locale encoding.
	    with open(path, 'r', encoding='utf-8') as f:
	        data = json.load(f)
	    return data['symbols']


	def main():
	    """Synthesise one wav file per test-file entry with an ONNX model.

	    Reads the command-line options via ``get_args``, loads the hyper-parameters
	    from ``--cfg`` and creates an onnxruntime session for ``--onnx_model``.
	    Every non-empty line of ``--test_file`` is split on ``|``: field 0 is a
	    path whose last ``/``-separated component becomes the output file name
	    inside ``--outdir``, field 1 is the text to synthesise. The model output is
	    peak-normalised to 60% of the int16 range, clipped, and written as 16-bit
	    PCM at ``hps.data.sampling_rate``.

	    Raises:
	        ValueError: If a non-empty line has fewer than two ``|``-separated
	            fields.
	    """
	    import time

	    args = get_args()
	    print(args)
	    out_dir = pathlib.Path(args.outdir)
	    out_dir.mkdir(parents=True, exist_ok=True)

	    hps = utils.get_hparams_from_file(args.cfg)
	    ort_sess = ort.InferenceSession(args.onnx_model)

	    # The inputs below do not depend on the line being processed, so they are
	    # built once instead of on every iteration.
	    # TODO: make the speaker id configurable.
	    sid = np.array([8], dtype=np.int64)
	    # Per the upstream note: noise (how strongly e.g. emotion varies),
	    # length (overall speaking rate), noisew (how much phoneme durations
	    # vary). See https://github.com/gbxh/genshinTTS
	    # Built directly as a (1, 3) array rather than resized in place.
	    scales = np.array([[0.667, 0.8, 1.0]], dtype=np.float32)

	    # The text is run through Japanese cleaners, so read it as UTF-8 instead
	    # of whatever the platform's locale encoding happens to be.
	    with open(args.test_file, encoding='utf-8') as fin:
	        for line_no, line in enumerate(fin, start=1):
	            line = line.strip()
	            if not line:
	                # Tolerate blank lines (e.g. a trailing newline at EOF).
	                continue
	            arr = line.split("|")
	            if len(arr) < 2:
	                raise ValueError(
	                    '{}:{}: expected "<audio_path>|<text>", got {!r}'.format(
	                        args.test_file, line_no, line))
	            audio_path, text = arr[0], arr[1]

	            seq = text_to_sequence(text, symbols=hps.symbols,
	                                   cleaner_names=["japanese_cleaners2"])
	            if hps.data.add_blank:
	                seq = commons.intersperse(seq, 0)

	            x = np.array([seq], dtype=np.int64)
	            x_len = np.array([x.shape[1]], dtype=np.int64)
	            ort_inputs = {
	                'input': x,
	                'input_lengths': x_len,
	                'scales': scales,
	                'sid': sid,
	            }

	            start_time = time.perf_counter()
	            audio = np.squeeze(ort_sess.run(None, ort_inputs))
	            # Scale the peak to 60% of the int16 range; the 0.01 floor keeps
	            # near-silent output from being amplified without bound.
	            audio *= 32767.0 / max(0.01, np.max(np.abs(audio))) * 0.6
	            audio = np.clip(audio, -32767.0, 32767.0)
	            end_time = time.perf_counter()
	            print("infer time cost: ", end_time - start_time, "s")

	            wavfile.write(str(out_dir / audio_path.split("/")[-1]),
	                          hps.data.sampling_rate, audio.astype(np.int16))


	# Run the inference CLI only when executed as a script, not on import.
	if __name__ == '__main__':
	    main()