File size: 4,274 Bytes
4408097
 
 
ba3f0c0
 
ffdeba9
4408097
ba3f0c0
496993f
4408097
ba3f0c0
 
4408097
ba3f0c0
 
 
 
4408097
d2eb80b
 
 
ba3f0c0
 
 
4408097
ba3f0c0
ffdeba9
4408097
ba3f0c0
 
 
 
 
 
 
4408097
ba3f0c0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d2eb80b
ba3f0c0
 
 
 
 
 
 
 
4408097
ba3f0c0
4408097
ba3f0c0
 
4408097
ba3f0c0
 
 
 
 
 
4408097
ba3f0c0
d2eb80b
 
 
4408097
ba3f0c0
 
 
 
 
 
4408097
 
ba3f0c0
 
 
 
d2eb80b
 
9dc043d
 
 
 
ba3f0c0
ffdeba9
 
 
 
 
 
 
 
 
 
 
ba3f0c0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import os
import shutil
import streamlit as st
import torchaudio
import IPython
import base64

from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_voice, load_voices

@st.cache_resource
def _load_tts_model():
    """Build the Tortoise ``TextToSpeech`` model once and reuse it.

    Streamlit re-executes this script from the top on every widget
    interaction; without caching, the model would be re-created on each
    button click or text edit.

    Returns:
        The shared ``TextToSpeech`` instance.
    """
    return TextToSpeech()


# Initialize TextToSpeech model (served from Streamlit's resource cache after the first run)
tts = _load_tts_model()

# Constants
# NOTE(review): confirm every preset listed here is accepted by
# ``tts_with_preset`` in the installed Tortoise version.
PRESETS = ["ultra_fast", "fast", "standard", "high_quality", "very_fast"]
UPLOAD_FOLDER = "./uploads"
OUTPUT_FOLDER = "./output"

# Default conditioning inputs; reset to None at the start of every script run.
voice_samples = None
conditioning_latents = None

# Create upload and output directories if they don't exist
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Page title
st.title("Tortoise Cloning App")

# Sidebar uploader: collects the reference recordings for a new voice.
# Only .wav files are accepted, and several can be selected at once.
st.sidebar.header("Upload Audio Samples")
uploaded_files = st.sidebar.file_uploader(
    label="Upload Audio Samples for a New Voice",
    type=["wav"],
    accept_multiple_files=True,
)

# Create a new voice
voice_name = st.sidebar.text_input("New Voice Name", help="Enter a name for your new voice.")

if st.sidebar.button("Create Voice"):
    # Directory-safe form of the name. The voice folder is created under this
    # name, so it is also the name that must be used to load the voice.
    new_voice_name = voice_name.strip().replace(" ", "_")

    if not new_voice_name:
        st.sidebar.warning("Please enter a name for the new voice.")
    elif new_voice_name in {".", ".."} or "/" in new_voice_name or "\\" in new_voice_name:
        # The name is spliced into a path that is deleted and recreated below;
        # it must stay a single path component so nothing outside the voices
        # folder can be removed.
        st.sidebar.error("Voice name must not contain path separators.")
    elif not uploaded_files:
        # Without samples we would wipe any existing voice of that name and
        # leave an empty folder behind.
        st.sidebar.warning("Please upload at least one .wav sample before creating a voice.")
    else:
        voices_dir = f"./tortoise/voices/{new_voice_name}/"
        # Replace any previous voice stored under the same name.
        if os.path.exists(voices_dir):
            shutil.rmtree(voices_dir)
        os.makedirs(voices_dir)

        for index, uploaded_file in enumerate(uploaded_files):
            sample_path = os.path.join(voices_dir, f"voice_sample{index}.wav")
            with open(sample_path, "wb") as wav_file:
                wav_file.write(uploaded_file.read())

        # Load by the normalized name: that is the folder that was just written.
        voice_samples, conditioning_latents = load_voice(new_voice_name)
        st.sidebar.success(f"Voice '{new_voice_name}' created successfully!")

# Input text and settings
st.header("Text-to-Speech Generation")
text = st.text_area(
    "Enter Text",
    help="Enter the text you want to convert to speech.",
    value="Joining two modalities results in a surprising increase in generalization! What would happen if we combined them all?",
)

preset = st.selectbox("Preset", PRESETS, help="Select a voice preset.")

# Each voice is a sub-directory of tortoise/voices. os.listdir returns entries
# in arbitrary order and also lists plain files, so keep directories only and
# sort them to give the select box a stable order. "cond_latent_example" is
# deliberately hidden from the list.
voices = sorted(
    entry
    for entry in os.listdir("tortoise/voices")
    if entry != "cond_latent_example"
    and os.path.isdir(os.path.join("tortoise/voices", entry))
)
voice = st.selectbox("Voice", voices, help="Select a voice to use for generation.")

# Generate speech
if st.button("Generate Speech"):
    if not voice:
        # The select box yields nothing when no voice is available yet.
        st.warning("Please create a voice first.")
    elif not text.strip():
        st.warning("Please enter some text to convert to speech.")
    else:
        with st.spinner("Generating speech..."):
            # Load the voice chosen in the select box on this run. Streamlit
            # re-executes the script on every interaction, so data loaded
            # during an earlier "Create Voice" click is no longer available.
            voice_samples, conditioning_latents = load_voice(voice)

            # Generate speech with Tortoise
            gen = tts.tts_with_preset(
                text,
                voice_samples=voice_samples,
                conditioning_latents=conditioning_latents,
                preset=preset,
            )

            # Save the generated audio
            # NOTE(review): 24000 is passed as the sample rate; presumably
            # Tortoise's output rate - confirm.
            output_path = os.path.join(OUTPUT_FOLDER, "generated.wav")
            torchaudio.save(output_path, gen.squeeze(0).cpu(), 24000)

        # Log the path of the generated audio
        st.write(f"Generated audio saved at: {output_path}")

        st.subheader("Generated Output")

        # Read the file once (closing the handle) and reuse the bytes for both
        # the download link and the player.
        with open(output_path, "rb") as audio_file:
            audio_bytes = audio_file.read()

        # Create a download link for the generated audio. A data-URI link is
        # used so that downloading does not trigger a script rerun.
        audio_base64 = base64.b64encode(audio_bytes).decode("utf-8")
        href = f'<a href="data:audio/wav;base64,{audio_base64}" download="generated.wav">Download Audio</a>'
        st.markdown(href, unsafe_allow_html=True)

        # Single player that both displays and autoplays the result.
        st.audio(audio_bytes, format="audio/wav", start_time=0, autoplay=True)

        st.success("Speech generated successfully!")

# Clean up: wipe the upload and output folders, leaving each one empty
if st.sidebar.button("Clean Up"):
    for folder in (UPLOAD_FOLDER, OUTPUT_FOLDER):
        # Remove the folder with all its contents, then recreate it empty.
        shutil.rmtree(folder)
        os.makedirs(folder, exist_ok=True)
    st.sidebar.success("Clean up completed!")

# Sidebar help text describing what the app does
_info_text = (
    "This app allows you to create a new voice by uploading .wav files. "
    "You can then generate speech using the selected voice and preset. "
    "You can play the generated audio and clean up uploaded files and "
    "the output directory when needed."
)
st.sidebar.header("Information")
st.sidebar.markdown(_info_text)