import gradio as gr
import tensorflow as tf
from tensorflow_tts.inference import TFAutoModel, AutoProcessor
import soundfile as sf

# Load the acoustic model, processor, and vocoder
lightspeech = TFAutoModel.from_pretrained("bookbot/lightspeech-mfa-sw-v4")
processor = AutoProcessor.from_pretrained("bookbot/lightspeech-mfa-sw-v4")
mb_melgan = TFAutoModel.from_pretrained("bookbot/mb-melgan-hifi-postnets-sw-v4")


def tts(text, speaker_name="sw-TZ-Victoria"):
    # Convert input text to a sequence of phoneme/character IDs
    input_ids = processor.text_to_sequence(text)

    # Generate the mel-spectrogram with LightSpeech
    mel, _, _ = lightspeech.inference(
        input_ids=tf.expand_dims(tf.convert_to_tensor(input_ids, dtype=tf.int32), 0),
        speaker_ids=tf.convert_to_tensor(
            [processor.speakers_map[speaker_name]], dtype=tf.int32
        ),
        speed_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
        f0_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
        energy_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
    )

    # Synthesize the waveform from the mel-spectrogram with MB-MelGAN
    audio = mb_melgan.inference(mel)[0, :, 0]

    # Save the waveform to a file
    sf.write("output.wav", audio, 44100, "PCM_16")

    # Return the audio file path for Gradio to play
    return "output.wav"


# Create a Gradio interface
iface = gr.Interface(fn=tts, inputs="text", outputs="audio")

# Launch the interface
iface.launch()