Spaces:

djkesu
/

tortoise5c

Running

File size: 3,848 Bytes

4408097
 
 
ba3f0c0
 
4408097
ba3f0c0
 
4408097
ba3f0c0
 
4408097
ba3f0c0
 
 
 
4408097
d2eb80b
 
 
ba3f0c0
 
 
4408097
ba3f0c0
 
4408097
ba3f0c0
 
 
 
 
 
 
4408097
ba3f0c0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d2eb80b
ba3f0c0
 
 
 
 
 
 
 
4408097
ba3f0c0
4408097
ba3f0c0
 
4408097
ba3f0c0
 
 
 
 
 
4408097
ba3f0c0
d2eb80b
 
 
4408097
ba3f0c0
 
 
 
 
 
4408097
 
ba3f0c0
 
 
 
d2eb80b
 
9dc043d
 
 
 
ba3f0c0

import os
import shutil
import streamlit as st
import torchaudio
import IPython

from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_voice

# Streamlit re-executes this script from top to bottom on every widget
# interaction, so constructing the model inline would rebuild it on each
# rerun. Building it through a cached factory keeps a single instance alive.
# On Streamlit releases that do not provide `st.cache_resource`, the fallback
# decorator is a no-op and the model is simply constructed on every run.
_cache_resource = getattr(st, "cache_resource", lambda func: func)


@_cache_resource
def _load_tts_model() -> TextToSpeech:
    """Return the shared Tortoise `TextToSpeech` instance."""
    return TextToSpeech()


# Initialize TextToSpeech model
tts = _load_tts_model()

# Constants
# NOTE(review): these names are passed verbatim to `tts.tts_with_preset`;
# confirm the installed tortoise build accepts every entry (e.g. "very_fast").
PRESETS = ["ultra_fast", "fast", "standard", "high_quality", "very_fast"]
UPLOAD_FOLDER = "./uploads"
OUTPUT_FOLDER = "./output"

# Voice conditioning data; None until a voice has been loaded.
voice_samples = None
conditioning_latents = None

# Create upload and output directories if they don't exist
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Streamlit UI elements
st.title("Tortoise Text-to-Speech App")

# Upload .wav files
st.sidebar.header("Upload Audio Samples")
uploaded_files = st.sidebar.file_uploader(
    "Upload Audio Samples for a New Voice",
    accept_multiple_files=True,
    type=["wav"],
)

# Create a new voice
voice_name = st.sidebar.text_input("New Voice Name", help="Enter a name for your new voice.")

if st.sidebar.button("Create Voice"):
    # The folder name is the trimmed input with spaces turned into underscores.
    new_voice_name = voice_name.strip().replace(" ", "_")

    # The name becomes a path component that is handed to shutil.rmtree below,
    # so anything that could point outside the voices folder is rejected.
    is_plain_name = (
        new_voice_name not in {".", ".."}
        and os.path.basename(new_voice_name) == new_voice_name
    )

    if not new_voice_name:
        st.sidebar.warning("Please enter a name for the new voice.")
    elif not is_plain_name:
        st.sidebar.error("The voice name must not contain path separators.")
    elif not uploaded_files:
        # Checked before touching the disk so an existing voice is not wiped
        # and replaced by an empty folder.
        st.sidebar.warning("Please upload at least one .wav sample.")
    else:
        voices_dir = f"./tortoise/voices/{new_voice_name}/"
        # Start from an empty folder so samples of an earlier voice with the
        # same name are not mixed in.
        if os.path.exists(voices_dir):
            shutil.rmtree(voices_dir)
        os.makedirs(voices_dir)

        for index, uploaded_file in enumerate(uploaded_files):
            # getvalue() returns the whole buffer regardless of the current
            # read position, unlike read().
            with open(f"{voices_dir}voice_sample{index}.wav", "wb") as wav_file:
                wav_file.write(uploaded_file.getvalue())

        # Load under the same name the folder was written with; the raw input
        # may still contain spaces or surrounding whitespace.
        voice_samples, conditioning_latents = load_voice(new_voice_name)
        st.sidebar.success(f"Voice '{new_voice_name}' created successfully!")

# Input text and settings
st.header("Text-to-Speech Generation")
text = st.text_area(
    "Enter Text",
    help="Enter the text you want to convert to speech.",
    value="Joining two modalities results in a surprising increase in generalization! What would happen if we combined them all?",
)

preset = st.selectbox("Preset", PRESETS, help="Select a voice preset.")

# Each voice is stored as a sub-directory of the voices folder, so plain files
# are skipped. The list is sorted because os.listdir returns entries in
# arbitrary order, and a missing folder yields an empty list instead of an
# exception.
voices_root = "tortoise/voices"
voices = sorted(
    entry
    for entry in (os.listdir(voices_root) if os.path.isdir(voices_root) else [])
    if entry != "cond_latent_example"
    and os.path.isdir(os.path.join(voices_root, entry))
)
voice = st.selectbox("Voice", voices, help="Select a voice to use for generation.")

# Generate speech
if st.button("Generate Speech"):
    # A Streamlit button is only True during the rerun triggered by its own
    # click, so data loaded by another button's branch is gone by the time
    # this one runs. The voice chosen in the "Voice" select box is therefore
    # loaded here, on demand.
    if not voice:
        # No voice is selected, e.g. because none exists yet.
        st.warning("Please create a voice first.")
    elif not text.strip():
        st.warning("Please enter some text to convert to speech.")
    else:
        st.info("Generating speech...")

        # Load voice samples
        voice_samples, conditioning_latents = load_voice(voice)

        # Generate speech with Tortoise
        gen = tts.tts_with_preset(
            text,
            voice_samples=voice_samples,
            conditioning_latents=conditioning_latents,
            preset=preset,
        )

        # Save the generated audio; it is written with a 24000 Hz sample rate.
        output_path = os.path.join(OUTPUT_FOLDER, "generated.wav")
        torchaudio.save(output_path, gen.squeeze(0).cpu(), 24000)

        # Log the path of the generated audio
        st.write(f"Generated audio saved at: {output_path}")

        # Display the generated audio. st.audio renders an in-page player, so
        # no separate playback control is needed.
        st.subheader("Generated Output")
        st.audio(output_path, format="audio/wav")

        st.success("Speech generated successfully!")

# Clean up uploaded files and output directory
if st.sidebar.button("Clean Up"):
    # Empty each working folder by deleting it and recreating it, uploads
    # first and generated output second.
    for folder in (UPLOAD_FOLDER, OUTPUT_FOLDER):
        shutil.rmtree(folder)
        os.makedirs(folder, exist_ok=True)
    st.sidebar.success("Clean up completed!")

# Display information
# Short description of the app, rendered as markdown at the bottom of the sidebar.
info_text = (
    "This app allows you to create a new voice by uploading .wav files. "
    "You can then generate speech using the selected voice and preset. "
    "You can play the generated audio and clean up uploaded files and the output directory when needed."
)
st.sidebar.header("Information")
st.sidebar.markdown(info_text)