|
import os |
|
import torch |
|
import gradio as gr |
|
import torchaudio |
|
import time |
|
from datetime import datetime |
|
from tortoise.api import TextToSpeech |
|
from tortoise.utils.text import split_and_recombine_text |
|
from tortoise.utils.audio import load_audio, load_voice, load_voices |
|
|
|
# Voice names offered as choices in the Gradio voice dropdowns built in `main`.
# Order is preserved because it is the order shown to the user.
VOICE_OPTIONS = [
    "angie", "deniro", "freeman", "halle", "lj",
    "myself", "pat2", "snakes", "tom", "daws",
    "dreams", "grace", "lescault", "weaver", "applejack",
    "daniel", "emma", "geralt", "jlaw", "mol",
    "pat", "rainbow", "tim_reynolds", "atkins", "dortice",
    "empire", "kennard", "mouse", "william", "jane_eyre",
    "random",
]
|
|
|
|
|
def inference(
    text,
    script,
    voice,
    voice_b,
    seed=None,
    split_by_newline=None,
):
    """Stream synthesized speech for typed text or an uploaded script file.

    This is a generator: it yields one ``(sample_rate, samples)`` tuple per
    audio frame produced by the module-level ``tts`` engine (created under
    the ``__main__`` guard of this file).

    Args:
        text: Text to speak. When ``None`` or blank, ``script`` is read
            instead.
        script: Uploaded text file, either a path string or an object whose
            ``.name`` attribute is the path. May be ``None`` when ``text``
            is given.
        voice: Name of the primary voice.
        voice_b: Name of an optional second voice; ``"disabled"`` or
            ``None`` means only ``voice`` is used.
        seed: Accepted for signature compatibility; not used by this
            function.
        split_by_newline: ``"Yes"`` to treat each non-blank line as one
            chunk; any other value lets ``split_and_recombine_text`` choose
            the chunks.

    Yields:
        ``(24000, numpy_array)`` for every frame returned by
        ``tts.tts_with_preset``.

    Raises:
        gr.Error: If neither ``text`` nor ``script`` provides non-blank
            content.
    """
    # Callers that pass only five positional values (no seed) put the
    # "Yes"/"No" split choice in the `seed` slot; move it where it belongs
    # instead of failing with a missing-argument TypeError.
    if split_by_newline is None and seed in ("Yes", "No"):
        split_by_newline, seed = seed, None

    no_content_msg = "Please provide either text or script file with content."

    if text is None or text.strip() == "":
        if script is None:
            # Nothing typed and nothing uploaded: report it as a user error
            # rather than crashing on `script.name`.
            raise gr.Error(no_content_msg)
        # The upload may arrive as a plain path string or as a file wrapper
        # exposing the path via `.name` — accept both.
        script_path = script if isinstance(script, str) else script.name
        with open(script_path, encoding="utf-8") as f:
            text = f.read()
        if text.strip() == "":
            raise gr.Error(no_content_msg)

    if split_by_newline == "Yes":
        chunks = [line for line in text.split("\n") if line.strip() != ""]
    else:
        chunks = split_and_recombine_text(text)

    # One voice -> load_voice; two voices -> load_voices on both names.
    if voice_b is None or voice_b == "disabled":
        voice_samples, conditioning_latents = load_voice(voice)
    else:
        voice_samples, conditioning_latents = load_voices([voice, voice_b])

    # Rate reported alongside every yielded frame.
    sample_rate = 24000

    for chunk in chunks:
        for audio_frame in tts.tts_with_preset(
            chunk,
            voice_samples=voice_samples,
            conditioning_latents=conditioning_latents,
            preset="ultra_fast",
            k=1,
        ):
            yield (sample_rate, audio_frame.cpu().detach().numpy())
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Build the Gradio interface around `inference` and launch it with queuing.

    Creates the text box, script upload, two voice dropdowns, the
    split-by-newline radio and a streaming audio output, then starts the app
    via ``interface.queue().launch()``.
    """
    title = "Tortoise TTS 🐢"
    description = """
    A text-to-speech system which powers lot of organizations in Speech synthesis domain.
    <br/>
    a model with strong multi-voice capabilities, highly realistic prosody and intonation.
    <br/>
    for faster inference, use the 'ultra_fast' preset and duplicate space if you don't want to wait in a queue.
    <br/>
    """
    text = gr.Textbox(
        lines=4,
        label="Text (Provide either text, or upload a newline separated text file below):",
    )
    script = gr.File(label="Upload a text file")

    voice = gr.Dropdown(
        VOICE_OPTIONS, value="jane_eyre", label="Select voice:", type="value"
    )

    # The second dropdown defaults to "disabled", so that value must be one
    # of its choices; otherwise the default is not a valid selection and the
    # user cannot switch back to it after picking another voice.
    voice_b_choices = list(VOICE_OPTIONS)
    if "disabled" not in voice_b_choices:
        voice_b_choices.append("disabled")
    voice_b = gr.Dropdown(
        voice_b_choices,
        value="disabled",
        label="(Optional) Select second voice:",
        type="value",
    )
    split_by_newline = gr.Radio(
        ["Yes", "No"],
        label="Split by newline (If [No], it will automatically try to find relevant splits):",
        type="value",
        value="No",
    )

    output_audio = gr.Audio(label="streaming audio:", streaming=True, autoplay=True)

    # NOTE(review): inputs are passed to `inference` positionally. `inference`
    # also declares a `seed` parameter between `voice_b` and
    # `split_by_newline`, and no seed widget is wired here — confirm the
    # positional mapping.
    interface = gr.Interface(
        fn=inference,
        inputs=[
            text,
            script,
            voice,
            voice_b,
            split_by_newline,
        ],
        title=title,
        description=description,
        outputs=[output_audio],
    )
    interface.queue().launch()
|
|
|
|
|
if __name__ == "__main__":
    # Module-level engine that `inference` reads as a global; it is created
    # once, before the UI is launched.
    tts = TextToSpeech(kv_cache=True, use_deepspeed=True, half=True)

    # Append a timestamped banner to the run log.
    with open("Tortoise_TTS_Runs_Scripts.log", "a") as log_file:
        banner = f"\n\n-------------------------Tortoise TTS Scripts Logs, {datetime.now()}-------------------------\n"
        log_file.write(banner)

    main()