# Hugging Face Space (status when this page was captured: Sleeping)
import os
import time
from datetime import datetime

import gradio as gr
import torch
import torchaudio
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_audio, load_voice, load_voices
from tortoise.utils.text import split_and_recombine_text
# Voice names offered in the UI dropdowns.
VOICE_OPTIONS = [
    "angie",
    "deniro",
    "freeman",
    "random",  # special option for random voice
]
def inference(
    text,
    script,
    voice,
    voice_b,
    seed,
    split_by_newline,
):
    """Stream synthesized speech for ``text`` or for the contents of ``script``.

    Relies on a module-level ``tts`` object that is defined outside this
    block.

    Args:
        text: Text to speak. When ``None`` or blank, the text is read from
            ``script`` instead.
        script: Uploaded text file, given either as a path string or as an
            object exposing the path as ``.name``. Only consulted when
            ``text`` is blank.
        voice: Name of the primary voice, passed to ``load_voice``.
        voice_b: Name of an optional second voice. ``"disabled"`` (or an
            empty selection) means only ``voice`` is used.
        seed: Currently unused; kept so the signature stays stable.
        split_by_newline: ``"Yes"`` to synthesize every non-blank line as
            its own chunk; any other value lets ``split_and_recombine_text``
            decide the chunks.

    Yields:
        ``(24000, samples)`` tuples, one per frame produced by
        ``tts.tts_with_preset``, with the frame converted to a NumPy array.

    Raises:
        gr.Error: If neither ``text`` nor ``script`` supplies non-blank text.
    """
    if text is None or text.strip() == "":
        # No typed text: fall back to the uploaded file, which may be absent.
        if script is None:
            raise gr.Error("Please provide either text or script file with content.")
        # Accept both a plain path string and a file wrapper with `.name`.
        script_path = script if isinstance(script, str) else script.name
        with open(script_path, encoding="utf-8") as f:
            text = f.read()
        if text.strip() == "":
            raise gr.Error("Please provide either text or script file with content.")

    if split_by_newline == "Yes":
        chunks = [line for line in text.split("\n") if line.strip()]
    else:
        chunks = split_and_recombine_text(text)

    # Blend two voices only when a real second voice was chosen.
    if voice_b and voice_b != "disabled":
        voice_samples, conditioning_latents = load_voices([voice, voice_b])
    else:
        voice_samples, conditioning_latents = load_voice(voice)

    for chunk in chunks:
        for audio_frame in tts.tts_with_preset(
            chunk,
            voice_samples=voice_samples,
            conditioning_latents=conditioning_latents,
            preset="ultra_fast",
            k=1,
        ):
            yield (24000, audio_frame.cpu().detach().numpy())
def main():
    """Build the Gradio interface for the TTS demo and launch it.

    The UI exposes five inputs (text, script file, two voice selectors and
    the newline-split switch) and one streaming audio output fed by
    ``inference``.
    """
    title = "Tortoise TTS "

    # Multi-line box so that newline-separated input can actually be typed.
    text = gr.Textbox(
        lines=4,
        label="Text (Provide either text, or upload a newline separated text file below):",
    )
    script = gr.File(label="Upload a text file")

    # The preselected voice must be one of the offered choices; fall back to
    # the first listed voice when "jane_eyre" is not available.
    default_voice = "jane_eyre" if "jane_eyre" in VOICE_OPTIONS else VOICE_OPTIONS[0]
    voice = gr.Dropdown(
        VOICE_OPTIONS, value=default_voice, label="Select voice:", type="value"
    )
    # "disabled" is offered explicitly so it is a valid default and can be
    # re-selected after picking a second voice.
    voice_b = gr.Dropdown(
        [*VOICE_OPTIONS, "disabled"],
        value="disabled",
        label="(Optional) Select second voice:",
        type="value",
    )
    split_by_newline = gr.Radio(
        ["Yes", "No"],
        label="Split by newline (If [No], it will automatically try to find relevant splits):",
        type="value",
        value="No",
    )
    output_audio = gr.Audio(label="streaming audio:", streaming=True, autoplay=True)

    def run_inference(text, script, voice, voice_b, split_by_newline):
        # `inference` also takes a `seed` argument that has no UI control;
        # supply it here so the five inputs line up with its six parameters.
        yield from inference(text, script, voice, voice_b, None, split_by_newline)

    interface = gr.Interface(
        fn=run_inference,
        inputs=[
            text,
            script,
            voice,
            voice_b,
            split_by_newline,
        ],
        title=title,
        outputs=[output_audio],
    )
    interface.queue().launch()