Spaces:
Running
Running
File size: 3,848 Bytes
4408097 ba3f0c0 4408097 ba3f0c0 4408097 ba3f0c0 4408097 ba3f0c0 4408097 d2eb80b ba3f0c0 4408097 ba3f0c0 4408097 ba3f0c0 4408097 ba3f0c0 d2eb80b ba3f0c0 4408097 ba3f0c0 4408097 ba3f0c0 4408097 ba3f0c0 4408097 ba3f0c0 d2eb80b 4408097 ba3f0c0 4408097 ba3f0c0 d2eb80b 9dc043d ba3f0c0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 |
import os
import shutil
import streamlit as st
import torchaudio
import IPython
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_voice
# Initialize TextToSpeech model
def _load_tts():
    """Build the Tortoise ``TextToSpeech`` model.

    Kept in a function so Streamlit can cache the returned instance instead
    of constructing the model again on every script rerun.
    """
    return TextToSpeech()


# Streamlit re-executes this script from the top on every widget interaction,
# so an unguarded ``TextToSpeech()`` here is rebuilt on each click. Use
# Streamlit's resource cache when this release provides one
# (``cache_resource``, formerly ``experimental_singleton``); otherwise fall
# back to building the model directly, exactly as before.
# NOTE(review): a cached resource is shared by all sessions of the app.
_resource_cache = getattr(st, "cache_resource", None) or getattr(
    st, "experimental_singleton", None
)
if _resource_cache is not None:
    _load_tts = _resource_cache(_load_tts)
tts = _load_tts()
# Constants
PRESETS = [
    "ultra_fast",
    "fast",
    "standard",
    "high_quality",
    "very_fast",
]
UPLOAD_FOLDER = "./uploads"
OUTPUT_FOLDER = "./output"

# Voice data starts out empty; the "Create Voice" handler fills it in.
voice_samples = conditioning_latents = None

# Make sure both working directories exist before any widget uses them.
for _folder in (UPLOAD_FOLDER, OUTPUT_FOLDER):
    os.makedirs(_folder, exist_ok=True)
# Page title
st.title("Tortoise Text-to-Speech App")

# Sidebar: uploader for the .wav samples that make up a new voice.
# Multiple files may be selected at once; only .wav is accepted.
st.sidebar.header("Upload Audio Samples")
uploaded_files = st.sidebar.file_uploader(
    label="Upload Audio Samples for a New Voice",
    type=["wav"],
    accept_multiple_files=True,
)
# Create a new voice
# The uploaded samples are written to ./tortoise/voices/<name>/ and the voice
# is then loaded into `voice_samples` / `conditioning_latents`.
voice_name = st.sidebar.text_input("New Voice Name", help="Enter a name for your new voice.")
if st.sidebar.button("Create Voice"):
    # Directory name used on disk: trimmed, with spaces turned into underscores.
    new_voice_name = voice_name.strip().replace(" ", "_")
    if not new_voice_name:
        st.sidebar.warning("Please enter a name for the new voice.")
    elif new_voice_name in {".", ".."} or os.path.basename(new_voice_name) != new_voice_name:
        # The name is interpolated into a path that gets rmtree'd below, so it
        # must be a single path component and never point outside the voices
        # directory.
        st.sidebar.error("Voice name must not contain path separators.")
    elif not uploaded_files:
        # Without samples we would only create an empty voice directory.
        st.sidebar.warning("Please upload at least one .wav sample first.")
    else:
        voices_dir = f"./tortoise/voices/{new_voice_name}/"
        # Re-creating an existing voice replaces its previous samples.
        if os.path.exists(voices_dir):
            shutil.rmtree(voices_dir)
        os.makedirs(voices_dir)
        for index, uploaded_file in enumerate(uploaded_files):
            # getvalue() returns the full upload regardless of the current
            # stream position, unlike read().
            with open(f"{voices_dir}voice_sample{index}.wav", "wb") as wav_file:
                wav_file.write(uploaded_file.getvalue())
        st.sidebar.success(f"Voice '{new_voice_name}' created successfully!")
        # Load under the same normalised name the files were saved under; the
        # raw text-box value differs whenever it contains spaces or padding.
        voice_samples, conditioning_latents = load_voice(new_voice_name)
# Main panel: the text to synthesise plus preset and voice selection
st.header("Text-to-Speech Generation")
text = st.text_area(
    "Enter Text",
    value="Joining two modalities results in a surprising increase in generalization! What would happen if we combined them all?",
    help="Enter the text you want to convert to speech.",
)
preset = st.selectbox("Preset", PRESETS, help="Select a voice preset.")

# Every entry under tortoise/voices is offered as a voice, except the one
# named "cond_latent_example".
_voice_entries = os.listdir("tortoise/voices")
voices = [entry for entry in _voice_entries if entry != "cond_latent_example"]
voice = st.selectbox("Voice", voices, help="Select a voice to use for generation.")
# Generate speech
# Synthesises `text` with the voice chosen in the "Voice" selector and the
# chosen preset, writes the result to OUTPUT_FOLDER/generated.wav and embeds
# an audio player for it.
if st.button("Generate Speech"):
    if not voice:
        # The selector yields no value when there are no voices to choose from.
        st.warning("Please create a voice first.")
    elif not text.strip():
        st.warning("Please enter some text to convert to speech.")
    else:
        st.info("Generating speech...")
        # Load the selected voice here. The module-level `voice_samples` /
        # `conditioning_latents` are reset to None on every script rerun, so
        # they are only populated in the run that handled "Create Voice".
        voice_samples, conditioning_latents = load_voice(voice)
        # Generate speech with Tortoise
        gen = tts.tts_with_preset(
            text,
            voice_samples=voice_samples,
            conditioning_latents=conditioning_latents,
            preset=preset,
        )
        # Save the generated audio (written at a 24000 Hz sample rate)
        output_path = os.path.join(OUTPUT_FOLDER, "generated.wav")
        torchaudio.save(output_path, gen.squeeze(0).cpu(), 24000)
        # Log the path of the generated audio
        st.write(f"Generated audio saved at: {output_path}")
        # Display the generated audio. st.audio already provides playback, so
        # the former "Play Audio" checkbox is gone: it was nested under a
        # button (toggling it reruns the script with the button unpressed) and
        # only constructed an IPython Audio object that was never displayed.
        st.subheader("Generated Output")
        st.audio(output_path, format="audio/wav")
        st.success("Speech generated successfully!")
# Sidebar "Clean Up" button: empty the upload and output directories by
# deleting each one and immediately recreating it.
if st.sidebar.button("Clean Up"):
    for _stale_dir in (UPLOAD_FOLDER, OUTPUT_FOLDER):
        shutil.rmtree(_stale_dir)
        os.makedirs(_stale_dir, exist_ok=True)
    st.sidebar.success("Clean up completed!")
# Sidebar help text summarising what the app can do
st.sidebar.header("Information")
_info_text = (
    "This app allows you to create a new voice by uploading .wav files. You can then generate speech "
    "using the selected voice and preset. You can play the generated audio and clean up uploaded files and the output directory when needed."
)
st.sidebar.markdown(_info_text)
|