Spaces:

miya3333
/

TTSDemo

Running

miya3333 committed on Jan 3

Commit

dc4f75a

verified ·

1 Parent(s): 881ca56

Upload 2 files

Files changed (2) hide show

app.py CHANGED Viewed

@@ -1,23 +1,29 @@
 import gradio as gr
-from transformers import pipeline
-import soundfile as sf
 import torch
-# Load the model (e.g. speechbrain/tts-hifigan-ljspeech)
-synthesizer = pipeline("text-to-speech", "speechbrain/tts-hifigan-ljspeech")
 # Define the inference function
 def synthesize_speech(text):
-    with torch.no_grad():
-        output = synthesizer(text)
-    sf.write("speech.wav", output["audio"], output["sampling_rate"]) # Convert the numpy array to a wav file
-    return "speech.wav"
 # Create the Gradio interface
 iface = gr.Interface(
     fn=synthesize_speech,
     inputs=gr.Textbox(lines=5, label="Input Text"),
-    outputs=gr.Audio(label="Output Audio"),
     title="TTS Demo",
     description="Enter text to synthesize speech."
 )

 import gradio as gr
 import torch
+from speechbrain.pretrained import Tacotron2
+from speechbrain.pretrained import HIFIGAN
+# Load the models: HiFi-GAN (mel spectrogram -> audio) and Tacotron2 (text -> mel spectrogram)
+hifi_gan = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-ljspeech", savedir="tmpdir_vocoder")
+tacotron2 = Tacotron2.from_hparams(source="speechbrain/tts-tacotron2-ljspeech", savedir="tmpdir_tts")
 # 推論関数の定義
 def synthesize_speech(text):
+    # Tacotron2でmel spectrogramを生成
+    mel_output, _, _ = tacotron2.encode_text(text)
+    # HiFi-GANでmel spectrogramから音声を生成
+    waveforms = hifi_gan.decode_batch(mel_output)
+    # torch tensorをwavfileとして保存
+    torch.save(waveforms, "speech.pt")
+    return "speech.pt"
 # Create the Gradio interface
 iface = gr.Interface(
     fn=synthesize_speech,
     inputs=gr.Textbox(lines=5, label="Input Text"),
+    outputs=gr.Audio(label="Output Audio", type="filepath"),
     title="TTS Demo",
     description="Enter text to synthesize speech."
 )

requirements.txt CHANGED Viewed

@@ -1,4 +1,4 @@
 gradio
-transformers
 torch
 soundfile

 gradio
+speechbrain
 torch
 soundfile