Spaces:

waysolong
/

text_to_speech

Sleeping

App Files Files Community

waysolong committed on Mar 13, 2024

Commit

f5b630a

1 Parent(s): 2619420

fix bug

Browse files

Files changed (2) hide show

app.py +11 -15
model.py +14 -13

app.py CHANGED Viewed

@@ -22,11 +22,9 @@
 import logging
 import os
 import time
-import uuid
 import gradio as gr
-import soundfile as sf
 from model import get_pretrained_model, language_to_models
 title = "# Text-to-speech (TTS)"
@@ -76,9 +74,16 @@ def process(language: str, repo_id: str, text: str, sid: str, speed: float):
     logging.info(f"Input text: {text}. sid: {sid}, speed: {speed}")
     sid = int(sid)
     start = time.time()
-    dst_file,duration = get_pretrained_model(text)
     end = time.time()
@@ -189,18 +194,9 @@ with demo:
     gr.Markdown(description)
-def download_espeak_ng_data():
-    os.system(
-        """
-    cd /tmp
-    wget -qq https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/espeak-ng-data.tar.bz2
-    tar xf espeak-ng-data.tar.bz2
-    """
-    )
 if __name__ == "__main__":
-    download_espeak_ng_data()
     formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
     logging.basicConfig(format=formatter, level=logging.INFO)

 import logging
 import os
 import time
 import gradio as gr
+import yaml
 from model import get_pretrained_model, language_to_models
 title = "# Text-to-speech (TTS)"
     logging.info(f"Input text: {text}. sid: {sid}, speed: {speed}")
     sid = int(sid)
+    config = "examples/biaobei/config.yaml"
+    checkpoint = "checkpoints\checkpoint_140000.pth.tar"
+    if os.path.exists(config):
+        print("file cunzai ")
+    else:
+        print("12")
+    with open(config) as f:
+        config = yaml.safe_load(f)
     start = time.time()
+    dst_file, duration = get_pretrained_model(text,config,checkpoint)
     end = time.time()
     gr.Markdown(description)
 if __name__ == "__main__":
     formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
     logging.basicConfig(format=formatter, level=logging.INFO)

model.py CHANGED Viewed

@@ -1,12 +1,20 @@
-from functools import lru_cache
 import torch,json,os
-import yaml
 from scipy.io import wavfile
 from mtts.text import TextProcessor
 from mtts.models.fs2_model import FastSpeech2
 import numpy as np
 with open("dict_han_pinyin.json","r",encoding="utf-8") as f:
     data_dict = json.load(f)
 def normalize(wav):
     assert wav.dtype == np.float32
     eps = 1e-6
@@ -23,17 +31,15 @@ def to_int16(wav):
     wav = wav = wav * 32767
     wav = np.clamp(wav, -32767, 32768)
     return wav.astype('int16')
 def __build_vocoder(config):
     """Instantiate the vocoder named in ``config['vocoder']['type']``.

     The type string is resolved to a class with ``eval`` and the class is
     called with ``config=config['vocoder'][<type>]``, i.e. the sub-section
     of the vocoder config keyed by the same type name.

     Args:
         config: Mapping with a ``'vocoder'`` entry that holds a ``'type'``
             string and a sub-entry keyed by that same string.

     Returns:
         The object produced by calling the resolved class.
     """
     vocoder_name = config['vocoder']['type']
     # NOTE(review): eval() executes whatever expression the config holds, so
     # this is only safe with trusted config files. The name must be resolvable
     # from this module's namespace; the vocoder classes are presumably
     # imported elsewhere in the file -- TODO confirm.
     VocoderClass = eval(vocoder_name)
     # The per-vocoder settings live under the same key as the type name.
     model = VocoderClass(config=config['vocoder'][vocoder_name])
     return model
-@lru_cache(maxsize=10)
-def get_pretrained_model(line):
-    config = "examples\biaobei\config.yaml"
-    checkpoint = "checkpoints\checkpoint_140000.pth.tar"
-    with open(config) as f:
-        config = yaml.safe_load(f)
     sr = config['fbank']['sample_rate']
     vocoder = __build_vocoder(config)
@@ -44,7 +50,6 @@ def get_pretrained_model(line):
         if 'model' in sd.keys():
             sd = sd['model']
     model.load_state_dict(sd)
-    del sd  # to save mem
     model = model.to("cpu")
     torch.set_grad_enabled(False)
@@ -75,7 +80,3 @@ def get_pretrained_model(line):
     #np.save(dst_file+'.npy',mel_postnet.cpu().numpy())
     wavfile.write(dst_file, sr, wav)
     return dst_file,2.0
-chinese_models = {
-    "csukuangfj/vits-piper-zh_CN-huayan-medium": 1}
-language_to_models = {
-    "Chinese (Mandarin, 普通话)": list(chinese_models.keys())}

 import torch,json,os
 from scipy.io import wavfile
 from mtts.text import TextProcessor
 from mtts.models.fs2_model import FastSpeech2
 import numpy as np
 with open("dict_han_pinyin.json","r",encoding="utf-8") as f:
+    print("loading")
     data_dict = json.load(f)
+chinese_models = {
+    "csukuangfj/vits-piper-zh_CN-huayan-medium": 1}
+language_to_models = {
+    "Chinese (Mandarin, 普通话)": list(chinese_models.keys())}
 def normalize(wav):
     assert wav.dtype == np.float32
     eps = 1e-6
     wav = wav = wav * 32767
     wav = np.clamp(wav, -32767, 32768)
     return wav.astype('int16')
 def __build_vocoder(config):
     """Instantiate the vocoder named in ``config['vocoder']['type']``.

     The type string is resolved to a class with ``eval`` and the class is
     called with ``config=config['vocoder'][<type>]``, i.e. the sub-section
     of the vocoder config keyed by the same type name.

     Args:
         config: Mapping with a ``'vocoder'`` entry that holds a ``'type'``
             string and a sub-entry keyed by that same string.

     Returns:
         The object produced by calling the resolved class.
     """
     vocoder_name = config['vocoder']['type']
     # NOTE(review): eval() executes whatever expression the config holds, so
     # this is only safe with trusted config files. The name must be resolvable
     # from this module's namespace; the vocoder classes are presumably
     # imported elsewhere in the file -- TODO confirm.
     VocoderClass = eval(vocoder_name)
     # The per-vocoder settings live under the same key as the type name.
     model = VocoderClass(config=config['vocoder'][vocoder_name])
     return model
+def get_pretrained_model(line,config,checkpoint):
     sr = config['fbank']['sample_rate']
     vocoder = __build_vocoder(config)
         if 'model' in sd.keys():
             sd = sd['model']
     model.load_state_dict(sd)
     model = model.to("cpu")
     torch.set_grad_enabled(False)
     #np.save(dst_file+'.npy',mel_postnet.cpu().numpy())
     wavfile.write(dst_file, sr, wav)
     return dst_file,2.0