# Hugging Face Space: ccds/vits_onnx (status: Runtime error)
# File size: 5,086 Bytes
# Revision: 223aff6

# Copyright (c) 2022, Yongqiang Li ([email protected])
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import json
import os
import sys

import torch

from models import SynthesizerTrn
import utils

# onnxruntime is used to run the exported model after the export step; exit
# early with a hint when it is not installed.
try:
    import onnxruntime as ort
except ImportError:
    print('Please install onnxruntime!')
    sys.exit(1)


def to_numpy(tensor):
    """Convert a torch tensor to a NumPy array.

    The tensor is always detached from the autograd graph and moved to host
    memory before conversion, so the call works whether or not the tensor
    requires grad and whether or not it lives on the CPU.

    Args:
        tensor: The ``torch.Tensor`` to convert.

    Returns:
        A ``numpy.ndarray`` holding the tensor's values.
    """
    # ``detach()`` is valid for tensors with and without grad, and ``cpu()``
    # returns the tensor itself when it is already on the CPU, so no
    # branching on ``requires_grad`` is needed.
    return tensor.detach().cpu().numpy()


def get_args(argv=None):
    """Parse the command-line options of the ONNX export script.

    Args:
        argv: Optional list of argument strings to parse. The default
            ``None`` makes argparse read ``sys.argv[1:]``, which is the
            original command-line behaviour.

    Returns:
        An ``argparse.Namespace`` with the attributes ``checkpoint``,
        ``cfg``, ``onnx_model`` and ``providers``.
    """
    parser = argparse.ArgumentParser(description='export onnx model')
    parser.add_argument('--checkpoint', required=True, help='checkpoint')
    parser.add_argument('--cfg', required=True, help='config file')
    parser.add_argument('--onnx_model', required=True, help='onnx model name')
    parser.add_argument(
        '--providers',
        required=False,
        default='CPUExecutionProvider',
        choices=['CUDAExecutionProvider', 'CPUExecutionProvider'],
        help='onnxruntime execution provider used to run the exported model')
    return parser.parse_args(argv)


def get_data_from_cfg(cfg_path: str):
    """Read the symbol count and the speaker count from a JSON config file.

    Args:
        cfg_path: Path to a JSON file that has a top-level ``"symbols"``
            entry and a ``"data"`` object containing ``"n_speakers"``.

    Returns:
        A tuple ``(len(symbols), n_speakers)``.

    Raises:
        FileNotFoundError: If ``cfg_path`` is not an existing regular file.
        KeyError: If one of the expected keys is missing from the config.
    """
    # Raise explicitly instead of asserting: asserts are stripped under
    # ``python -O`` and would let a bad path slip through.
    if not os.path.isfile(cfg_path):
        raise FileNotFoundError('config file not found: {}'.format(cfg_path))
    # Decode as UTF-8 explicitly so a symbol table with non-ASCII entries
    # does not depend on the platform's default encoding.
    with open(cfg_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return len(data["symbols"]), data["data"]["n_speakers"]


def main():
    """Export a ``SynthesizerTrn`` checkpoint to ONNX and smoke-test it.

    Steps:
      1. Parse the command-line arguments.
      2. Build ``SynthesizerTrn`` from the config file and load the
         checkpoint into it.
      3. Replace ``forward`` with ``export_forward`` and export the model
         with ``torch.onnx.export`` using a random dummy input.
      4. Run the same dummy input through PyTorch and through onnxruntime,
         and check that the onnxruntime output is non-empty and finite.

    Raises:
        RuntimeError: If the exported model yields an empty or non-finite
            first output on the dummy input.
    """
    args = get_args()
    # Default to the first GPU, but keep a device selection that the caller
    # already exported instead of overwriting it.
    os.environ.setdefault('CUDA_VISIBLE_DEVICES', '0')

    hps = utils.get_hparams_from_file(args.cfg)
    phone_num, num_speakers = get_data_from_cfg(args.cfg)
    net_g = SynthesizerTrn(phone_num,
                           hps.data.filter_length // 2 + 1,
                           hps.train.segment_size // hps.data.hop_length,
                           n_speakers=num_speakers,
                           **hps.model)
    utils.load_checkpoint(args.checkpoint, net_g, None)
    # The exporter calls ``forward``, so point it at the export variant.
    net_g.forward = net_g.export_forward
    net_g.eval()

    # Dummy batch: one sequence of 10 random symbol ids plus its length.
    seq = torch.randint(low=0, high=phone_num, size=(1, 10), dtype=torch.long)
    seq_len = torch.IntTensor([seq.size(1)]).long()

    # scales = [noise, length, noisew]:
    #   noise  - can be used to control how much e.g. the emotion varies
    #   length - can be used to control the overall speaking rate
    #   noisew - controls how much the phoneme durations vary
    # See https://github.com/gbxh/genshinTTS
    scales = torch.FloatTensor([0.667, 1.0, 0.8])
    # make triton dynamic shape happy
    scales = scales.unsqueeze(0)
    sid = torch.IntTensor([0]).long()

    dummy_input = (seq, seq_len, scales, sid)
    torch.onnx.export(model=net_g,
                      args=dummy_input,
                      f=args.onnx_model,
                      input_names=['input', 'input_lengths', 'scales', 'sid'],
                      output_names=['output'],
                      dynamic_axes={
                          'input': {
                              0: 'batch',
                              1: 'phonemes'
                          },
                          'input_lengths': {
                              0: 'batch'
                          },
                          'scales': {
                              0: 'batch'
                          },
                          'sid': {
                              0: 'batch'
                          },
                          'output': {
                              0: 'batch',
                              1: 'audio',
                              2: 'audio_length'
                          }
                      },
                      opset_version=13,
                      verbose=False)

    # Smoke-test both back ends on the dummy input.
    # NOTE(review): the outputs are deliberately not compared element-wise.
    # The noise scales above suggest the model samples random noise, so the
    # values (and possibly the lengths) would differ between runs - confirm
    # before adding an allclose-style check.
    with torch.no_grad():  # no autograd graph is needed for a reference run
        net_g(seq, seq_len, scales, sid)
    ort_sess = ort.InferenceSession(args.onnx_model,
                                    providers=[args.providers])
    ort_inputs = {
        'input': to_numpy(seq),
        'input_lengths': to_numpy(seq_len),
        'scales': to_numpy(scales),
        'sid': to_numpy(sid),
    }
    onnx_output = ort_sess.run(None, ort_inputs)

    # Minimal sanity check so a broken export does not pass silently.
    onnx_audio = torch.from_numpy(onnx_output[0])
    if onnx_audio.numel() == 0 or not bool(torch.isfinite(onnx_audio).all()):
        raise RuntimeError(
            'exported onnx model produced empty or non-finite output')


# Run the export only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()