Spaces:
Sleeping
Sleeping
File size: 2,548 Bytes
97e4faf bd104fa 97e4faf b38366a 97e4faf ea84da0 1d9f047 97e4faf ea84da0 bd104fa 97e4faf ea84da0 97e4faf 1d9f047 97e4faf deb14ad e9c4729 bd104fa 97e4faf ea84da0 1d9f047 97e4faf ea84da0 1d9f047 97e4faf bd104fa 944b0be |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 |
import os
import torch
import gradio as gr
import torchaudio
import time
from datetime import datetime
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_voice, load_voices
# Voice names offered in the UI dropdown; the chosen name is handed to
# load_voice() by inference().
VOICE_OPTIONS = ["angie", "deniro", "freeman", "random"]
def inference(
    text,
    voice,
    emotion_option,
    preset_option,
):
    """Stream synthesized speech for ``text`` as Gradio audio chunks.

    Relies on the module-level ``tts`` object that the script creates in its
    ``__main__`` block before launching the UI.

    Args:
        text: The text to speak.
        voice: Voice name passed to ``load_voice``.
        emotion_option: One of ``"Angry"``, ``"Sad"``, ``"Happy"`` or
            ``"Scared"``. Any other value (including ``None``, i.e. nothing
            selected) leaves the text unchanged.
        preset_option: Preset name forwarded to ``tts.tts_with_preset``.

    Yields:
        ``(24000, samples)`` tuples, one per frame produced by
        ``tts.tts_with_preset``, with the frame converted to a NumPy array.
        The tuple follows Gradio's ``(sample_rate, data)`` audio convention.
    """
    emotion_prefixes = {
        "Angry": "[I am so angry]",
        "Sad": "[I am so sad]",
        "Happy": "[I am so happy]",
        "Scared": "[I am so scared]",
    }
    # The prefix must be applied to the text that is actually synthesized.
    # Previously the list of texts was captured before the prefix was added,
    # so the selected emotion was silently ignored.
    text = emotion_prefixes.get(emotion_option, "") + text

    # Only a single voice is ever selected in the UI, so load_voice suffices.
    voice_samples, conditioning_latents = load_voice(voice)

    for audio_frame in tts.tts_with_preset(
        text,
        voice_samples=voice_samples,
        conditioning_latents=conditioning_latents,
        preset=preset_option,
        k=1,
    ):
        # Move the frame off the compute device and drop autograd tracking
        # before handing it to Gradio as a NumPy array.
        yield (24000, audio_frame.cpu().detach().numpy())
def main():
    """Build the Gradio interface around ``inference`` and launch it.

    The interface takes the text, a voice, an optional emotion and a quality
    preset, and plays back the audio streamed by ``inference``.
    """
    title = "Tortoise TTS "
    text = gr.Textbox(
        lines=4,
        label="Text:",
    )
    # The default must be one of the offered choices; the previous default
    # ("jane_eyre") was not in VOICE_OPTIONS.
    voice = gr.Dropdown(
        VOICE_OPTIONS, value=VOICE_OPTIONS[0], label="Select voice:", type="value"
    )
    # No default value: leaving this unselected means no emotion prefix.
    emotion_option = gr.Radio(
        ["Angry", "Sad", "Happy", "Scared"],
        type="value",
    )
    preset_option = gr.Radio(
        ["ultra_fast", "fast", "standard", "high_quality"],
        label="ultra_fast for quick inference and high_quality for better inference",
        type="value",
        value="ultra_fast",
    )
    output_audio = gr.Audio(label="streaming audio:", streaming=True, autoplay=True)
    interface = gr.Interface(
        fn=inference,
        inputs=[
            text,
            voice,
            emotion_option,
            preset_option,
        ],
        title=title,
        outputs=[output_audio],
    )
    # queue() is enabled before launch because inference is a generator.
    interface.queue().launch()
if __name__ == "__main__":
    # Module-level model handle; inference() reads this global.
    tts = TextToSpeech(kv_cache=True, use_deepspeed=True, half=True)

    # Append a timestamped banner to the run log before starting the UI.
    run_banner = f"\n\n-------------------------Tortoise TTS Scripts Logs, {datetime.now()}-------------------------\n"
    with open("Tortoise_TTS_Runs_Scripts.log", "a") as log_file:
        log_file.write(run_banner)

    main()
|