"""Gradio web UI that streams speech synthesized by Tortoise TTS."""

import os
import time
from datetime import datetime

import gradio as gr
import torch
import torchaudio
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_audio, load_voice, load_voices
from tortoise.utils.text import split_and_recombine_text

# Voice names offered in the dropdown; each one is passed to ``load_voice``.
VOICE_OPTIONS = [
    "kasuri",
    "shanuka",
]

# First element of every tuple yielded to the audio output component.
# NOTE(review): presumably the sample rate (Hz) of the frames produced by
# the TTS model -- confirm against the Tortoise build in use.
OUTPUT_SAMPLE_RATE = 24000


def inference(text, voice):
    """Synthesize ``text`` with ``voice`` and yield audio frame by frame.

    The input is split into chunks with ``split_and_recombine_text`` and each
    chunk is synthesized with the ``"ultra_fast"`` preset. Every frame coming
    back from ``tts.tts_with_preset`` is yielded as soon as it is available.

    This function reads the module-level ``tts`` object, which is only
    created when the file is executed as a script (see the ``__main__``
    guard at the bottom of the file).

    Args:
        text: Text to speak. Must contain at least one non-whitespace
            character.
        voice: Name of the voice to load via ``load_voice``.

    Yields:
        ``(OUTPUT_SAMPLE_RATE, frame)`` tuples, where ``frame`` is the audio
        frame moved to the CPU, detached and converted with ``.numpy()``.

    Raises:
        gr.Error: If ``text`` is ``None`` or contains only whitespace.
    """
    if text is None or not text.strip():
        raise gr.Error("Please provide text.")

    chunks = split_and_recombine_text(text)
    voice_samples, conditioning_latents = load_voice(voice)

    # A dedicated loop name keeps the ``text`` parameter from being rebound.
    for chunk in chunks:
        for audio_frame in tts.tts_with_preset(
            chunk,
            voice_samples=voice_samples,
            conditioning_latents=conditioning_latents,
            preset="ultra_fast",
            k=1,
        ):
            yield (OUTPUT_SAMPLE_RATE, audio_frame.cpu().detach().numpy())


def main():
    """Build the Gradio interface around ``inference`` and launch it."""
    title = "Tortoise TTS"
    description = """ """

    text = gr.Textbox(
        lines=4,
        label="Text:",
    )
    voice = gr.Dropdown(
        VOICE_OPTIONS,
        value="kasuri",
        label="Select voice:",
        type="value",
    )
    output_audio = gr.Audio(
        label="streaming audio:",
        streaming=True,
        autoplay=True,
    )

    interface = gr.Interface(
        fn=inference,
        inputs=[text, voice],
        title=title,
        description=description,
        outputs=[output_audio],
    )
    # ``inference`` is a generator, so the queue is enabled before launching.
    interface.queue().launch()


if __name__ == "__main__":
    # Assigned at module scope on purpose: ``inference`` looks ``tts`` up as
    # a global when Gradio invokes it.
    tts = TextToSpeech(kv_cache=True, use_deepspeed=True, half=True)

    # Append a dated header to the run log before the UI starts.
    with open("Tortoise_TTS_Runs_Scripts.log", "a", encoding="utf-8") as f:
        f.write(
            f"\n\n-------------------------Tortoise TTS Scripts Logs, {datetime.now()}-------------------------\n"
        )

    main()