# TTSDemo / app.py
import gradio as gr
import soundfile as sf
from speechbrain.inference.TTS import Tacotron2
from speechbrain.inference.vocoders import HIFIGAN
# Load the pretrained models: HiFi-GAN (mel spectrogram -> waveform) and Tacotron2 (text -> mel spectrogram), both trained on LJSpeech
hifi_gan = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-ljspeech", savedir="tmpdir_vocoder")
tacotron2 = Tacotron2.from_hparams(source="speechbrain/tts-tacotron2-ljspeech", savedir="tmpdir_tts")
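# Both models run on CPU by default; pass run_opts={"device": "cuda"} to from_hparams to use a GPU if one is available.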
# Inference function: takes input text and returns the path of the synthesized .wav file
def synthesize_speech(text):
    # Tacotron2 tokenizes the text internally; encode_text returns the mel
    # spectrogram, its length, and the attention alignments
    mel_output, mel_length, alignment = tacotron2.encode_text(text.lower())
    # HiFi-GAN converts the mel spectrogram into a waveform of shape [batch, 1, time]
    waveforms = hifi_gan.decode_batch(mel_output)
    # Save the audio as a .wav file; the LJSpeech models generate 22.05 kHz audio
    sf.write("speech.wav", waveforms.squeeze().cpu().numpy(), samplerate=22050)
    return "speech.wav"
# Build the Gradio interface: a text box as input, an audio player as output
iface = gr.Interface(
    fn=synthesize_speech,
    inputs=gr.Textbox(lines=5, label="Input Text"),
    outputs=gr.Audio(label="Output Audio", type="filepath"),
    title="TTS Demo",
    description="Enter text to synthesize speech.",
)
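# launch() starts a local server (http://127.0.0.1:7860 by default); on Hugging Face Spaces the app is hosted automatically.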
iface.launch()