File size: 1,262 Bytes
bb51650
b6e1372
 
 
bb51650
b6e1372
 
 
 
bb51650
b6e1372
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bb51650
b6e1372
bb51650
b6e1372
 
bb51650
b6e1372
 
bb51650
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import gradio as gr
import tensorflow as tf
import numpy as np
from tensorflow_tts.inference import TFAutoModel, AutoProcessor

# Identifiers of the pre-trained checkpoints handed to ``from_pretrained``.
_ACOUSTIC_MODEL_ID = "tensorspeech/tts-fastspeech2-ljspeech-en"
_VOCODER_MODEL_ID = "tensorspeech/tts-mb_melgan-ljspeech-en"

# The text processor and the FastSpeech2 model are loaded from the same id;
# the vocoder comes from its own checkpoint.
processor = AutoProcessor.from_pretrained(_ACOUSTIC_MODEL_ID)
fastspeech2 = TFAutoModel.from_pretrained(_ACOUSTIC_MODEL_ID)
melgan = TFAutoModel.from_pretrained(_VOCODER_MODEL_ID)

# Define inference function
def tts_inference(text):
    """Synthesize speech for ``text`` with FastSpeech2 and the MelGAN vocoder.

    Args:
        text: The sentence to be spoken.

    Returns:
        A ``(sample_rate, waveform)`` tuple, which is the order Gradio's
        audio output expects. ``waveform`` is the NumPy array obtained from
        the vocoder output.
    """
    # Convert text to a sequence of symbol ids
    input_ids = processor.text_to_sequence(text)

    # Generate mel spectrograms. FastSpeech2 inference also takes the speed,
    # f0 and energy ratios; 1.0 leaves the predicted prosody unchanged.
    # It returns several tensors; only the post-net mel is fed to the vocoder.
    _, mel_after, _, _, _ = fastspeech2.inference(
        input_ids=tf.expand_dims(tf.convert_to_tensor(input_ids, dtype=tf.int32), 0),
        speaker_ids=tf.convert_to_tensor([0], dtype=tf.int32),
        speed_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
        f0_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
        energy_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
    )

    # Convert mel spectrogram to waveform (first batch item, single channel)
    audio = melgan.inference(mel_after)[0, :, 0]
    audio = audio.numpy()

    # Gradio plays a (sample_rate, data) tuple, in that order
    sample_rate = 22050
    return sample_rate, audio

# Create Gradio interface
# The checkpoints loaded by this script are the English LJSpeech models
# ("...-ljspeech-en"), so the user-facing text asks for English input.
iface = gr.Interface(
    fn=tts_inference,
    inputs="text",
    outputs="audio",
    title="FastSpeech2 TTS (LJSpeech, English)",
    description="Enter English text and generate speech using FastSpeech2",
)

iface.launch()