import torch
import torchaudio
import gradio as gr

from tortoise.api import TextToSpeech
from tortoise.utils.text import split_and_recombine_text
from tortoise.utils.audio import load_voice, load_voices
# Load the Tortoise model once at module import; inference() reuses it.
tts = TextToSpeech()

VOICE_OPTIONS = [
"angie",
"deniro",
"freeman",
"random", # special option for random voice
]
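
# Aside from "random", these names are assumed to match the reference-clip
# voice folders that ship with Tortoise (tortoise/voices/<name>); the clips
# in each folder are used to compute that speaker's conditioning latents.
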
def inference(
    text,
    script,
    voice,
    voice_b,
    split_by_newline,
):
    # Fall back to the uploaded script file when no text was typed in.
    if text is None or text.strip() == "":
        if script is None:
            raise gr.Error("Please provide either text or a script file with content.")
        with open(script.name) as f:
            text = f.read()
        if text.strip() == "":
            raise gr.Error("Please provide either text or a script file with content.")

    if split_by_newline == "Yes":
        texts = [line for line in text.split("\n") if line.strip()]
    else:
        texts = split_and_recombine_text(text)
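
    # Note: split_and_recombine_text (used above) chunks long input into
    # sentence-sized pieces, roughly 200-300 characters each under Tortoise's
    # defaults, so every TTS call stays within a comfortable length.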
    voices = [voice]
    if voice_b != "disabled":
        voices.append(voice_b)

    if len(voices) == 1:
        voice_samples, conditioning_latents = load_voice(voice)
    else:
        voice_samples, conditioning_latents = load_voices(voices)
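
    # With a second voice selected, load_voices pools the reference clips of
    # both speakers; Tortoise then conditions on the combined set, which (to
    # the best of my understanding) blends the voices rather than alternating.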
    for text_chunk in texts:
        for audio_frame in tts.tts_with_preset(
            text_chunk,
            voice_samples=voice_samples,
            conditioning_latents=conditioning_latents,
            preset="ultra_fast",
            k=1,
        ):
            # Gradio's streaming Audio component expects (sample_rate, ndarray)
            # tuples; Tortoise produces audio at 24 kHz.
            yield (24000, audio_frame.cpu().detach().numpy())
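

# Illustrative sketch (not part of the original Space): drain the generator
# returned by inference() and write the streamed chunks to a single wav file.
# The helper name and the default output path are assumptions for this example.
def save_stream_to_wav(stream, out_path="output.wav"):
    chunks = []
    sample_rate = 24000
    for sample_rate, frame in stream:
        # Each yielded frame is a float numpy array; reshape to (1, n_samples)
        # so the concatenated result is a mono (channels, frames) tensor.
        chunks.append(torch.from_numpy(frame).reshape(1, -1))
    if chunks:
        torchaudio.save(out_path, torch.cat(chunks, dim=1), sample_rate)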
def main():
    title = "Tortoise TTS"

    text = gr.Textbox(
        label="Text (Provide either text, or upload a newline separated text file below):",
    )
    script = gr.File(label="Upload a text file")
    voice = gr.Dropdown(
        VOICE_OPTIONS, value="angie", label="Select voice:", type="value"
    )
    voice_b = gr.Dropdown(
        ["disabled"] + VOICE_OPTIONS,
        value="disabled",
        label="(Optional) Select second voice:",
        type="value",
    )
    split_by_newline = gr.Radio(
        ["Yes", "No"],
        label="Split by newline (If [No], it will automatically try to find relevant splits):",
        type="value",
        value="No",
    )
    output_audio = gr.Audio(label="streaming audio:", streaming=True, autoplay=True)

    interface = gr.Interface(
        fn=inference,
        inputs=[
            text,
            script,
            voice,
            voice_b,
            split_by_newline,
        ],
        title=title,
        outputs=[output_audio],
    )
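    # queue() is required for Gradio to stream generator output from
    # inference(); without it the yielded audio chunks would not reach
    # the client incrementally.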
    interface.queue().launch()


if __name__ == "__main__":
    main()