File size: 3,653 Bytes
223aff6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 |
# Copyright (c) 2022, Yongqiang Li ([email protected])
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os

import numpy as np
from scipy.io import wavfile
import torch

import commons
from models import SynthesizerTrn
import utils
def get_args(argv=None):
    """Parse the command-line options of the inference script.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process command line. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is what a plain ``get_args()``
            call does.

    Returns:
        argparse.Namespace with the attributes ``checkpoint``, ``cfg``,
        ``outdir``, ``phone_table``, ``speaker_table`` (``None`` when the
        option is omitted) and ``test_file``.
    """
    parser = argparse.ArgumentParser(description='inference')
    parser.add_argument('--checkpoint', required=True, help='checkpoint')
    parser.add_argument('--cfg', required=True, help='config file')
    parser.add_argument('--outdir', required=True, help='output directory')
    parser.add_argument('--phone_table',
                        required=True,
                        help='input phone dict')
    parser.add_argument('--speaker_table', default=None, help='speaker table')
    parser.add_argument('--test_file', required=True, help='test file')
    return parser.parse_args(argv)
def _load_table(path, exact_columns=False):
    """Read a whitespace-separated ``<symbol> <integer id>`` table.

    Args:
        path: Path of the table file (read as UTF-8).
        exact_columns: When True every non-blank line must have exactly two
            fields; when False extra trailing fields are ignored.

    Returns:
        dict mapping the first field of each line to ``int(second field)``.

    Raises:
        ValueError: If a non-blank line has too few (or, with
            ``exact_columns``, not exactly two) fields.
    """
    table = {}
    with open(path, encoding='utf-8') as fin:
        for lineno, line in enumerate(fin, 1):
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            malformed = (len(fields) != 2) if exact_columns else (len(fields) < 2)
            if malformed:
                raise ValueError('{}:{}: expected "<symbol> <id>", got {!r}'.format(
                    path, lineno, line.rstrip('\n')))
            table[fields[0]] = int(fields[1])
    return table


def main():
    """Synthesize a wav file for every entry of the test file.

    Loads the phone table (and the optional speaker table), builds the
    ``SynthesizerTrn`` generator from the config, restores the checkpoint and
    then processes the test file line by line. Each non-blank line is either
    ``audio_path|text`` (speaker id 0 is used) or ``audio_path|speaker|text``
    (the speaker name is looked up in the speaker table). ``text`` is a
    whitespace-separated sequence of symbols from the phone table. The result
    is written as 16-bit PCM to ``<outdir>/<basename of audio_path>``.

    Raises:
        ValueError: If a table line or a test-file line is malformed.
        KeyError: If a symbol or speaker name is missing from its table.
    """
    args = get_args()
    print(args)

    phone_dict = _load_table(args.phone_table)
    speaker_dict = {}
    if args.speaker_table is not None:
        speaker_dict = _load_table(args.speaker_table, exact_columns=True)

    hps = utils.get_hparams_from_file(args.cfg)

    # Use the GPU when one is present, otherwise fall back to the CPU so the
    # script does not crash on CUDA-less machines.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    net_g = SynthesizerTrn(
        len(phone_dict) + 1,
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        n_speakers=len(speaker_dict) + 1,  # 0 is kept for unknown speaker
        **hps.model).to(device)
    net_g.eval()
    utils.load_checkpoint(args.checkpoint, net_g, None)

    if args.outdir:
        os.makedirs(args.outdir, exist_ok=True)

    with open(args.test_file, encoding='utf-8') as fin:
        for lineno, line in enumerate(fin, 1):
            line = line.strip()
            if not line:
                continue
            arr = line.split("|")
            if len(arr) < 2:
                raise ValueError(
                    '{}:{}: expected "path|text" or "path|speaker|text", '
                    'got {!r}'.format(args.test_file, lineno, line))
            audio_path = arr[0]
            if len(arr) == 2:
                sid = 0
                text = arr[1]
            else:
                sid = speaker_dict[arr[1]]
                text = arr[2]

            seq = [phone_dict[symbol] for symbol in text.split()]
            if hps.data.add_blank:
                seq = commons.intersperse(seq, 0)
            seq = torch.LongTensor(seq)

            with torch.no_grad():
                x = seq.to(device).unsqueeze(0)
                x_length = torch.LongTensor([seq.size(0)]).to(device)
                sid_tensor = torch.LongTensor([sid]).to(device)
                audio = net_g.infer(
                    x,
                    x_length,
                    sid=sid_tensor,
                    noise_scale=.667,
                    noise_scale_w=0.8,
                    length_scale=1)[0][0, 0].data.cpu().float().numpy()

            # Scale so the peak sits at 60% of int16 full scale; the 0.01
            # floor on the peak avoids dividing by (near) zero on silence.
            audio *= 32767 / max(0.01, np.max(np.abs(audio))) * 0.6
            audio = np.clip(audio, -32767.0, 32767.0)
            out_path = os.path.join(args.outdir, os.path.basename(audio_path))
            wavfile.write(out_path, hps.data.sampling_rate,
                          audio.astype(np.int16))
# Run the inference pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|