File size: 2,700 Bytes
f42d33e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
from functools import lru_cache
import torch,json,os
import yaml
from scipy.io import wavfile
from mtts.text import TextProcessor
from mtts.models.fs2_model import FastSpeech2
import numpy as np
# Lookup table loaded once at import time; get_pretrained_model() indexes it
# with each character of the input line and concatenates the looked-up values
# into the pinyin field of the request.
with open("dict_han_pinyin.json", mode="r", encoding="utf-8") as f:
    data_dict = json.loads(f.read())
def normalize(wav):
    """Peak-normalize a float32 waveform and convert it to int16 samples.

    The signal is scaled so that its largest absolute sample maps to 32767
    and is then cast (truncated toward zero) to ``int16``.

    Args:
        wav: NumPy array with dtype ``float32``.

    Returns:
        A new ``int16`` array with the same shape as ``wav``.

    Raises:
        AssertionError: if ``wav`` is not ``float32`` (kept as an ``assert``
            so callers see the same exception type as before).
    """
    assert wav.dtype == np.float32
    if wav.size == 0:
        # np.max() raises ValueError on an empty array; nothing to scale.
        return wav.astype('int16')
    peak = np.max(np.abs(wav))
    if peak == 0:
        # Pure silence: dividing by the peak would yield NaN, whose cast to
        # int16 is undefined. Silence stays silence.
        return np.zeros(wav.shape, dtype='int16')
    return (wav / peak * 32767).astype('int16')


def to_int16(wav):
    """Convert a waveform to int16 samples without peak normalization.

    Samples are multiplied by 32767, clipped to the representable ``int16``
    range and then cast (truncated toward zero).

    Args:
        wav: NumPy array of samples; values outside [-1, 1] are saturated
            rather than wrapped around.

    Returns:
        A new ``int16`` array with the same shape as ``wav``.
    """
    scaled = wav * 32767
    # NumPy has no ``clamp``; ``np.clip`` is the equivalent. The bounds are
    # the true int16 limits (the old upper bound 32768 does not fit in int16).
    clipped = np.clip(scaled, -32768, 32767)
    return clipped.astype('int16')
def __build_vocoder(config):
    """Instantiate the vocoder named in ``config['vocoder']['type']``.

    The type string is resolved to a callable and invoked with
    ``config=config['vocoder'][<type>]``; the resulting object is returned.
    """
    vocoder_name = config['vocoder']['type']
    # NOTE(review): eval() executes whatever expression the config holds, so
    # the config file must be trusted. The name also has to be resolvable
    # from this module's scope -- no vocoder class import is visible in this
    # file, confirm where it comes from.
    return eval(vocoder_name)(config=config['vocoder'][vocoder_name])
@lru_cache(maxsize=1)
def _load_synthesizer():
    """Load config, vocoder, text processor and FastSpeech2 weights once.

    None of these depend on the text being synthesized, so they are cached
    instead of being rebuilt (and the checkpoint re-read) for every request.
    NOTE(review): assumes the returned objects can be reused across calls.

    Returns:
        Tuple ``(config, vocoder, text_processor, model)``.
    """
    # os.path.join keeps the paths portable; the previous literals used
    # backslashes (with invalid escapes such as "\e" and "\c"), which only
    # resolve on Windows.
    config_path = os.path.join(
        "text_to_speech", "examples", "biaobei", "config.yaml")
    checkpoint_path = os.path.join(
        "text_to_speech", "checkpoints", "checkpoint_140000.pth.tar")

    with open(config_path, encoding="utf-8") as f:
        config = yaml.safe_load(f)

    vocoder = __build_vocoder(config)
    text_processor = TextProcessor(config)
    model = FastSpeech2(config)

    state = torch.load(checkpoint_path, map_location="cpu")
    # Some checkpoints wrap the weights under a 'model' key.
    if 'model' in state:
        state = state['model']
    model.load_state_dict(state)
    del state  # to save mem
    # NOTE(review): model.eval() is never called, so any dropout/batch-norm
    # layers stay in training mode -- confirm whether that is intended.
    return config, vocoder, text_processor, model.to("cpu")


def _build_request_line(line):
    """Format ``line`` as the ``|``-separated record given to the text processor.

    Every character of ``line`` is looked up in ``data_dict``; the record
    holds the id ``text1``, the looked-up values and the raw characters
    (each wrapped in ``sil`` markers), and a trailing ``0`` field
    (presumably a speaker id -- confirm).

    Raises:
        KeyError: if a character of ``line`` has no entry in ``data_dict``.
    """
    unknown = [ch for ch in line if ch not in data_dict]
    if unknown:
        raise KeyError(
            f"no entry in dict_han_pinyin.json for: {''.join(unknown)!r}")
    pinyin = "".join(data_dict[ch] + " " for ch in line)
    hanzi = "".join(ch + " " for ch in line)
    return f"text1|sil {pinyin}sil|sil {hanzi}sil|0"


def get_pretrained_model(line):
    """Synthesize ``line`` to a wav file and return its path.

    The result is no longer memoized per ``line``: the request id is always
    ``text1``, so (assuming the text processor returns that id as ``name``)
    every call writes the same ``text1.wav`` and a cached return value would
    point at audio produced for a different line. Only the model loading is
    cached now.

    Args:
        line: Text to synthesize; every character must be a key of
            ``data_dict``.

    Returns:
        Tuple ``(dst_file, 2.0)`` where ``dst_file`` is ``'<name>.wav'``
        relative to the current working directory. The constant ``2.0`` is
        returned unchanged; its meaning is not evident from this file.

    Raises:
        KeyError: if ``line`` contains a character missing from ``data_dict``.
    """
    # Validate the text first so bad input fails before the expensive load.
    post_line = _build_request_line(line)
    config, vocoder, text_processor, model = _load_synthesizer()
    sr = config['fbank']['sample_rate']

    # Kept from the original: disables autograd for the whole process, not
    # just for this call.
    torch.set_grad_enabled(False)

    name, tokens = text_processor(post_line)
    tokens = tokens.to("cpu")
    seq_len = torch.tensor([tokens.shape[1]]).to("cpu")
    tokens = tokens.unsqueeze(1)
    max_src_len = torch.max(seq_len)
    output = model(tokens, seq_len, max_src_len=max_src_len, d_control=1.0)
    # Only the second element of the model output is used downstream.
    _, mel_postnet, *_ = output

    # convert to waveform using vocoder
    mel_postnet = mel_postnet[0].transpose(0, 1).detach()
    mel_postnet += config['fbank']['mel_mean']
    wav = vocoder(mel_postnet)
    if config['synthesis']['normalize']:
        wav = normalize(wav)
    else:
        wav = to_int16(wav)

    dst_file = f'{name}.wav'
    wavfile.write(dst_file, sr, wav)
    return dst_file, 2.0
# Model identifiers offered for Chinese; the meaning of the value ``1`` is not
# evident from this file.
chinese_models = {"csukuangfj/vits-piper-zh_CN-huayan-medium": 1}

# Display name of each language mapped to the list of its model identifiers.
language_to_models = {"Chinese (Mandarin, 普通话)": [*chinese_models]}