# tortoise5c / app.py
# djkesu's picture
# Fixed load_voice
# 496993f
# raw
# history blame
# 4.27 kB
import os
import shutil
import streamlit as st
import torchaudio
import IPython
import base64
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_voice, load_voices
# Initialize the TextToSpeech model.
# Streamlit re-executes this whole script on every widget interaction, so the
# model is built inside a cached factory: it is constructed once and the same
# instance is handed back on later reruns instead of being rebuilt each time.
# NOTE(review): a cached resource is shared by all sessions of the app —
# confirm concurrent generation on one TextToSpeech instance is acceptable.
@st.cache_resource
def _load_tts():
    """Return the Tortoise ``TextToSpeech`` model, cached across reruns."""
    return TextToSpeech()


tts = _load_tts()

# Constants
PRESETS = ["ultra_fast", "fast", "standard", "high_quality", "very_fast"]
UPLOAD_FOLDER = "./uploads"
OUTPUT_FOLDER = "./output"

# Placeholders for the loaded voice data. They start as None on every script
# run and are filled in further down once a voice has been loaded.
voice_samples = None
conditioning_latents = None

# Create upload and output directories if they don't exist
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
# Page heading shown in the main area.
st.title("Tortoise Cloning App")

# Sidebar section where the user supplies the audio samples for a new voice.
# Only .wav files are accepted and several may be selected at once.
st.sidebar.header("Upload Audio Samples")
uploaded_files = st.sidebar.file_uploader(
    label="Upload Audio Samples for a New Voice",
    type=["wav"],
    accept_multiple_files=True,
)
# Create a new voice from the uploaded samples.
# The samples are written to ./tortoise/voices/<name>/voice_sample<i>.wav and
# the voice is then loaded with load_voice().
voice_name = st.sidebar.text_input("New Voice Name", help="Enter a name for your new voice.")
if st.sidebar.button("Create Voice"):
    # Directory-safe form of the name; this is the name the voice is stored
    # under and therefore the name that must be used to load it again.
    new_voice_name = voice_name.strip().replace(" ", "_")

    if not new_voice_name:
        st.sidebar.warning("Please enter a name for the new voice.")
    elif new_voice_name in (".", "..") or os.path.basename(new_voice_name) != new_voice_name:
        # The name becomes part of a path that is passed to shutil.rmtree, so
        # anything that could point outside the voices directory is rejected.
        st.sidebar.error("Voice name must not contain path separators or be '.' or '..'.")
    elif not uploaded_files:
        # Without samples the voice directory would be empty and unusable.
        st.sidebar.warning("Please upload at least one .wav sample for the new voice.")
    else:
        voices_dir = os.path.join("./tortoise/voices", new_voice_name)

        # Re-creating an existing voice replaces all of its previous samples.
        if os.path.exists(voices_dir):
            shutil.rmtree(voices_dir)
        os.makedirs(voices_dir)

        for index, uploaded_file in enumerate(uploaded_files):
            sample_path = os.path.join(voices_dir, f"voice_sample{index}.wav")
            with open(sample_path, "wb") as wav_file:
                wav_file.write(uploaded_file.read())

        st.sidebar.success(f"Voice '{new_voice_name}' created successfully!")

        # Load by the stored (sanitized) name, not the raw text-box value:
        # a name containing spaces is saved with underscores instead.
        voice_samples, conditioning_latents = load_voice(new_voice_name)
# Input text and settings for speech generation.
st.header("Text-to-Speech Generation")
text = st.text_area(
    "Enter Text",
    help="Enter the text you want to convert to speech.",
    value="Joining two modalities results in a surprising increase in generalization! What would happen if we combined them all?",
)
preset = st.selectbox("Preset", PRESETS, help="Select a voice preset.")

# Offer every voice directory except the "cond_latent_example" entry.
# os.listdir returns entries in arbitrary order and also lists plain files,
# so the result is restricted to directories and sorted to give a stable,
# predictable dropdown.
_voices_root = "tortoise/voices"
voices = sorted(
    entry
    for entry in os.listdir(_voices_root)
    if entry != "cond_latent_example"
    and os.path.isdir(os.path.join(_voices_root, entry))
)
voice = st.selectbox("Voice", voices, help="Select a voice to use for generation.")
# Generate speech with the voice chosen in the "Voice" dropdown.
if st.button("Generate Speech"):
    if not voice:
        # The dropdown has no selection, i.e. there is no voice to use yet.
        st.warning("Please create a voice first.")
    else:
        st.info("Generating speech...")

        # Load the selected voice here, inside this run. Streamlit re-executes
        # the script when the button is clicked, so voice data loaded during an
        # earlier run (e.g. when the voice was created) is no longer available.
        voice_samples, conditioning_latents = load_voice(voice)

        # Generate speech with Tortoise
        gen = tts.tts_with_preset(
            text,
            voice_samples=voice_samples,
            conditioning_latents=conditioning_latents,
            preset=preset,
        )

        # Save the generated audio; 24000 is passed as the sample rate.
        output_path = os.path.join(OUTPUT_FOLDER, "generated.wav")
        torchaudio.save(output_path, gen.squeeze(0).cpu(), 24000)

        # Log the path of the generated audio
        st.write(f"Generated audio saved at: {output_path}")

        st.subheader("Generated Output")

        # Create a download link by embedding the file as a base64 data URI.
        # The file is read inside a context manager so the handle is closed.
        with open(output_path, "rb") as audio_file:
            audio_base64 = base64.b64encode(audio_file.read()).decode("utf-8")
        href = f'<a href="data:audio/wav;base64,{audio_base64}" download="generated.wav">Download Audio</a>'
        st.markdown(href, unsafe_allow_html=True)

        # Display the generated audio
        st.audio(output_path, format="audio/wav", start_time=0)

        # Autoplay the audio
        st.audio(output_path, format="audio/wav", start_time=0, autoplay=True)

        st.success("Speech generated successfully!")
# Sidebar "Clean Up" action: wipe the upload folder and the output folder by
# deleting each one and immediately re-creating it empty.
if st.sidebar.button("Clean Up"):
    for folder in (UPLOAD_FOLDER, OUTPUT_FOLDER):
        shutil.rmtree(folder)
        os.makedirs(folder, exist_ok=True)
    st.sidebar.success("Clean up completed!")
# Sidebar "Information" section: a short description of what the app offers.
_info_text = (
    "This app allows you to create a new voice by uploading .wav files. You can then generate speech "
    "using the selected voice and preset. You can play the generated audio and clean up uploaded files and the output directory when needed."
)
st.sidebar.header("Information")
st.sidebar.markdown(_info_text)