waysolong committed on
Commit
f5b630a
·
1 Parent(s): 2619420
Files changed (2) hide show
  1. app.py +11 -15
  2. model.py +14 -13
app.py CHANGED
@@ -22,11 +22,9 @@
22
  import logging
23
  import os
24
  import time
25
- import uuid
26
 
27
  import gradio as gr
28
- import soundfile as sf
29
-
30
  from model import get_pretrained_model, language_to_models
31
 
32
  title = "# Text-to-speech (TTS)"
@@ -76,9 +74,16 @@ def process(language: str, repo_id: str, text: str, sid: str, speed: float):
76
  logging.info(f"Input text: {text}. sid: {sid}, speed: {speed}")
77
  sid = int(sid)
78
 
79
-
 
 
 
 
 
 
 
80
  start = time.time()
81
- dst_file,duration = get_pretrained_model(text)
82
  end = time.time()
83
 
84
 
@@ -189,18 +194,9 @@ with demo:
189
  gr.Markdown(description)
190
 
191
 
192
- def download_espeak_ng_data():
193
- os.system(
194
- """
195
- cd /tmp
196
- wget -qq https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/espeak-ng-data.tar.bz2
197
- tar xf espeak-ng-data.tar.bz2
198
- """
199
- )
200
-
201
 
202
  if __name__ == "__main__":
203
- download_espeak_ng_data()
204
  formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
205
 
206
  logging.basicConfig(format=formatter, level=logging.INFO)
 
22
  import logging
23
  import os
24
  import time
 
25
 
26
  import gradio as gr
27
+ import yaml
 
28
  from model import get_pretrained_model, language_to_models
29
 
30
  title = "# Text-to-speech (TTS)"
 
74
  logging.info(f"Input text: {text}. sid: {sid}, speed: {speed}")
75
  sid = int(sid)
76
 
77
+ config = "examples/biaobei/config.yaml"
78
+ checkpoint = "checkpoints\checkpoint_140000.pth.tar"
79
+ if os.path.exists(config):
80
+ print("file cunzai ")
81
+ else:
82
+ print("12")
83
+ with open(config) as f:
84
+ config = yaml.safe_load(f)
85
  start = time.time()
86
+ dst_file, duration = get_pretrained_model(text,config,checkpoint)
87
  end = time.time()
88
 
89
 
 
194
  gr.Markdown(description)
195
 
196
 
 
 
 
 
 
 
 
 
 
197
 
198
  if __name__ == "__main__":
199
+
200
  formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
201
 
202
  logging.basicConfig(format=formatter, level=logging.INFO)
model.py CHANGED
@@ -1,12 +1,20 @@
1
- from functools import lru_cache
2
  import torch,json,os
3
- import yaml
4
  from scipy.io import wavfile
5
  from mtts.text import TextProcessor
6
  from mtts.models.fs2_model import FastSpeech2
7
  import numpy as np
8
  with open("dict_han_pinyin.json","r",encoding="utf-8") as f:
 
9
  data_dict = json.load(f)
 
 
 
 
 
 
 
 
10
  def normalize(wav):
11
  assert wav.dtype == np.float32
12
  eps = 1e-6
@@ -23,17 +31,15 @@ def to_int16(wav):
23
  wav = wav = wav * 32767
24
  wav = np.clamp(wav, -32767, 32768)
25
  return wav.astype('int16')
 
26
  def __build_vocoder(config):
27
  vocoder_name = config['vocoder']['type']
28
  VocoderClass = eval(vocoder_name)
29
  model = VocoderClass(config=config['vocoder'][vocoder_name])
30
  return model
31
- @lru_cache(maxsize=10)
32
- def get_pretrained_model(line):
33
- config = "examples\biaobei\config.yaml"
34
- checkpoint = "checkpoints\checkpoint_140000.pth.tar"
35
- with open(config) as f:
36
- config = yaml.safe_load(f)
37
 
38
  sr = config['fbank']['sample_rate']
39
  vocoder = __build_vocoder(config)
@@ -44,7 +50,6 @@ def get_pretrained_model(line):
44
  if 'model' in sd.keys():
45
  sd = sd['model']
46
  model.load_state_dict(sd)
47
- del sd # to save mem
48
  model = model.to("cpu")
49
  torch.set_grad_enabled(False)
50
 
@@ -75,7 +80,3 @@ def get_pretrained_model(line):
75
  #np.save(dst_file+'.npy',mel_postnet.cpu().numpy())
76
  wavfile.write(dst_file, sr, wav)
77
  return dst_file,2.0
78
- chinese_models = {
79
- "csukuangfj/vits-piper-zh_CN-huayan-medium": 1}
80
- language_to_models = {
81
- "Chinese (Mandarin, 普通话)": list(chinese_models.keys())}
 
 
1
  import torch,json,os
2
+
3
  from scipy.io import wavfile
4
  from mtts.text import TextProcessor
5
  from mtts.models.fs2_model import FastSpeech2
6
  import numpy as np
7
  with open("dict_han_pinyin.json","r",encoding="utf-8") as f:
8
+ print("loading")
9
  data_dict = json.load(f)
10
+
11
+
12
+ chinese_models = {
13
+ "csukuangfj/vits-piper-zh_CN-huayan-medium": 1}
14
+ language_to_models = {
15
+ "Chinese (Mandarin, 普通话)": list(chinese_models.keys())}
16
+
17
+
18
  def normalize(wav):
19
  assert wav.dtype == np.float32
20
  eps = 1e-6
 
31
  wav = wav = wav * 32767
32
  wav = np.clamp(wav, -32767, 32768)
33
  return wav.astype('int16')
34
+
35
  def __build_vocoder(config):
36
  vocoder_name = config['vocoder']['type']
37
  VocoderClass = eval(vocoder_name)
38
  model = VocoderClass(config=config['vocoder'][vocoder_name])
39
  return model
40
+
41
+ def get_pretrained_model(line,config,checkpoint):
42
+
 
 
 
43
 
44
  sr = config['fbank']['sample_rate']
45
  vocoder = __build_vocoder(config)
 
50
  if 'model' in sd.keys():
51
  sd = sd['model']
52
  model.load_state_dict(sd)
 
53
  model = model.to("cpu")
54
  torch.set_grad_enabled(False)
55
 
 
80
  #np.save(dst_file+'.npy',mel_postnet.cpu().numpy())
81
  wavfile.write(dst_file, sr, wav)
82
  return dst_file,2.0