# Spaces: Running
import os | |
import shutil | |
import streamlit as st | |
import torchaudio | |
import IPython | |
import base64 | |
from tortoise.api import TextToSpeech | |
from tortoise.utils.audio import load_voice, load_voices | |
@st.cache_resource
def _load_tts():
    """Create the Tortoise ``TextToSpeech`` model once and reuse it.

    Streamlit re-executes this script from top to bottom on every widget
    interaction. Caching the model as a resource avoids constructing a new
    ``TextToSpeech`` instance on each of those reruns.
    """
    return TextToSpeech()


# Initialize TextToSpeech model (cached, see _load_tts)
tts = _load_tts()

# Constants
PRESETS = ["ultra_fast", "fast", "standard", "high_quality", "very_fast"]
UPLOAD_FOLDER = "./uploads"
OUTPUT_FOLDER = "./output"

# Voice conditioning data. Both start as None on every run of the script and
# are only assigned further down when a voice is loaded.
voice_samples = None
conditioning_latents = None

# Create upload and output directories if they don't exist
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
# Page heading
st.title("Tortoise Cloning App")

# Sidebar section: uploader for the .wav samples that make up a new voice
with st.sidebar:
    st.header("Upload Audio Samples")
    uploaded_files = st.file_uploader(
        label="Upload Audio Samples for a New Voice",
        type=["wav"],
        accept_multiple_files=True,
    )
# Create a new voice
voice_name = st.sidebar.text_input("New Voice Name", help="Enter a name for your new voice.")
if st.sidebar.button("Create Voice"):
    # Turn the user-supplied name into a single safe directory name: every
    # character other than a letter, digit, "-" or "_" becomes "_". Spaces are
    # still mapped to "_" as before, and path pieces such as "../" can no
    # longer steer the shutil.rmtree() call below outside the voices folder.
    new_voice_name = "".join(
        ch if ch.isalnum() or ch in "-_" else "_" for ch in voice_name.strip()
    )
    if not new_voice_name:
        st.sidebar.warning("Please enter a name for the new voice.")
    elif not uploaded_files:
        # Without samples the voice directory would be created empty.
        st.sidebar.warning("Please upload at least one .wav sample.")
    else:
        voices_dir = f"./tortoise/voices/{new_voice_name}/"
        # Start from a clean directory so samples left over from an earlier
        # voice with the same name are not mixed with the new ones.
        if os.path.exists(voices_dir):
            shutil.rmtree(voices_dir)
        os.makedirs(voices_dir)
        for index, uploaded_file in enumerate(uploaded_files):
            sample_path = os.path.join(voices_dir, f"voice_sample{index}.wav")
            with open(sample_path, "wb") as wav_file:
                # getvalue() returns the whole buffer regardless of the
                # current read position of the uploaded file object.
                wav_file.write(uploaded_file.getvalue())
        st.sidebar.success(f"Voice '{new_voice_name}' created successfully!")
        # Load by the directory name that was actually written; the raw text
        # input can differ from it (e.g. when it contains spaces).
        voice_samples, conditioning_latents = load_voice(new_voice_name)
# Main panel: the text to synthesise plus the preset and voice pickers
st.header("Text-to-Speech Generation")
text = st.text_area(
    label="Enter Text",
    value="Joining two modalities results in a surprising increase in generalization! What would happen if we combined them all?",
    help="Enter the text you want to convert to speech.",
)
preset = st.selectbox(label="Preset", options=PRESETS, help="Select a voice preset.")

# Offer every entry of the voices folder except "cond_latent_example"
voices = list(
    filter(lambda entry: entry != "cond_latent_example", os.listdir("tortoise/voices"))
)
voice = st.selectbox(label="Voice", options=voices, help="Select a voice to use for generation.")
# Generate speech
if st.button("Generate Speech"):
    if not voice:
        # No voice is selected, e.g. because the voices folder has no entries.
        st.warning("Please create a voice first.")
    elif not text.strip():
        st.warning("Please enter some text to convert to speech.")
    else:
        with st.spinner("Generating speech..."):
            # Load the conditioning data for the voice chosen in the selectbox.
            # This must happen here: Streamlit reruns the whole script on each
            # click, so whatever was loaded during the "Create Voice" run has
            # been reset to None by the time this button is pressed.
            voice_samples, conditioning_latents = load_voice(voice)

            # Generate speech with Tortoise
            gen = tts.tts_with_preset(
                text,
                voice_samples=voice_samples,
                conditioning_latents=conditioning_latents,
                preset=preset,
            )

            # Save the generated audio at a 24000 Hz sample rate
            output_path = os.path.join(OUTPUT_FOLDER, "generated.wav")
            torchaudio.save(output_path, gen.squeeze(0).cpu(), 24000)

        # Log the path of the generated audio
        st.write(f"Generated audio saved at: {output_path}")

        st.subheader("Generated Output")

        # Read the file once and close it promptly; the bytes feed both the
        # download link and the player.
        with open(output_path, "rb") as audio_file:
            audio_bytes = audio_file.read()

        # Download link as a data URI. A plain link is kept on purpose: it does
        # not trigger a script rerun, so the output below stays on screen.
        audio_base64 = base64.b64encode(audio_bytes).decode("utf-8")
        href = f'<a href="data:audio/wav;base64,{audio_base64}" download="generated.wav">Download Audio</a>'
        st.markdown(href, unsafe_allow_html=True)

        # One player that both displays and autoplays the result
        st.audio(audio_bytes, format="audio/wav", start_time=0, autoplay=True)
        st.success("Speech generated successfully!")
# Sidebar action: delete and recreate the upload and output folders
cleanup_requested = st.sidebar.button("Clean Up")
if cleanup_requested:
    for folder in (UPLOAD_FOLDER, OUTPUT_FOLDER):
        # Remove the folder with everything in it, then put back an empty one
        shutil.rmtree(folder)
        os.makedirs(folder, exist_ok=True)
    st.sidebar.success("Clean up completed!")
# Sidebar footer: short description of what the app offers
info_text = (
    "This app allows you to create a new voice by uploading .wav files. "
    "You can then generate speech using the selected voice and preset. "
    "You can play the generated audio and clean up uploaded files and the "
    "output directory when needed."
)
st.sidebar.header("Information")
st.sidebar.markdown(info_text)