import os
import shutil
import streamlit as st
import torchaudio
import IPython
import base64

from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_voice, load

@st.cache_resource
def _load_tts() -> TextToSpeech:
    """Construct the Tortoise model once and reuse it.

    Streamlit re-executes this script from the top on every widget
    interaction; without caching, the model would be rebuilt on each rerun.
    """
    return TextToSpeech()


# Initialize TextToSpeech model (cached across script reruns)
tts = _load_tts()

# Constants
# NOTE(review): confirm that every preset name below is accepted by the
# installed tortoise version's `tts_with_preset` (in particular "very_fast").
PRESETS = ["ultra_fast", "fast", "standard", "high_quality", "very_fast"]
UPLOAD_FOLDER = "./uploads"
OUTPUT_FOLDER = "./output"

# Placeholders; assigned when a voice is loaded further down the script.
voice_samples = None
conditioning_latents = None

# Create upload and output directories if they don't exist
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Streamlit UI elements
st.title("Tortoise Cloning App")

# Upload .wav files
st.sidebar.header("Upload Audio Samples")
uploaded_files = st.sidebar.file_uploader(
    "Upload Audio Samples for a New Voice",
    accept_multiple_files=True,
    type=["wav"],
)

# Create a new voice
voice_name = st.sidebar.text_input("New Voice Name", help="Enter a name for your new voice.")

if st.sidebar.button("Create Voice"):
    # Spaces are replaced so the voice maps onto a single directory name.
    new_voice_name = voice_name.strip().replace(" ", "_")

    if not new_voice_name:
        st.sidebar.warning("Please enter a name for the new voice.")
    elif new_voice_name in (".", "..") or os.path.basename(new_voice_name) != new_voice_name:
        # The name is interpolated into a path that gets deleted and
        # recreated below, so it must not be able to escape the voices folder.
        st.sidebar.error("Voice name must not contain path separators.")
    elif not uploaded_files:
        st.sidebar.warning("Please upload at least one .wav sample.")
    else:
        voices_dir = f"./tortoise/voices/{new_voice_name}/"

        # Re-creating a voice with an existing name replaces its old samples.
        if os.path.exists(voices_dir):
            shutil.rmtree(voices_dir)
        os.makedirs(voices_dir)

        for index, uploaded_file in enumerate(uploaded_files):
            # getvalue() returns the full upload regardless of the current
            # read position of the underlying buffer.
            with open(f"{voices_dir}voice_sample{index}.wav", "wb") as wav_file:
                wav_file.write(uploaded_file.getvalue())

        # Load under the sanitized name: that is the directory actually
        # written above (the raw input may still contain spaces).
        voice_samples, conditioning_latents = load_voice(new_voice_name)
        st.sidebar.success(f"Voice '{new_voice_name}' created successfully!")

# Input text and settings
st.header("Text-to-Speech Generation")
text = st.text_area(
    "Enter Text",
    help="Enter the text you want to convert to speech.",
    value="Joining two modalities results in a surprising increase in generalization! What would happen if we combined them all?",
)

preset = st.selectbox("Preset", PRESETS, help="Select a voice preset.")

# Each voice is a sub-directory of sample files, so only directories are
# offered; stray files (e.g. OS metadata) are skipped. Sorting gives a stable
# order because os.listdir() returns entries in arbitrary order.
_voices_root = "tortoise/voices"
if os.path.isdir(_voices_root):
    voices = sorted(
        entry
        for entry in os.listdir(_voices_root)
        if entry != "cond_latent_example"
        and os.path.isdir(os.path.join(_voices_root, entry))
    )
else:
    # No voices folder yet: present an empty list instead of crashing.
    voices = []
voice = st.selectbox("Voice", voices, help="Select a voice to use for generation.")

# Generate speech
if st.button("Generate Speech"):
    if not voice:
        # Nothing is selected in the "Voice" box (no voices available yet).
        st.warning("Please create a voice first.")
    elif not text.strip():
        st.warning("Please enter some text to convert to speech.")
    else:
        st.info("Generating speech...")

        # Load the voice chosen in the selectbox. This has to happen here:
        # Streamlit reruns the whole script on every interaction, so anything
        # loaded during an earlier "Create Voice" click is gone by now.
        voice_samples, conditioning_latents = load_voice(voice)

        # Generate speech with Tortoise
        gen = tts.tts_with_preset(
            text,
            voice_samples=voice_samples,
            conditioning_latents=conditioning_latents,
            preset=preset,
        )

        # Save the generated audio with a 24000 Hz sample rate
        output_path = os.path.join(OUTPUT_FOLDER, "generated.wav")
        torchaudio.save(output_path, gen.squeeze(0).cpu(), 24000)

        # Log the path of the generated audio
        st.write(f"Generated audio saved at: {output_path}")

        st.subheader("Generated Output")

        # Create a download link for the generated audio. A plain data-URI
        # link is used so that clicking it does not trigger a script rerun.
        with open(output_path, "rb") as audio_file:
            audio_base64 = base64.b64encode(audio_file.read()).decode("utf-8")
        href = f'<a href="data:audio/wav;base64,{audio_base64}" download="generated.wav">Download Audio</a>'
        st.markdown(href, unsafe_allow_html=True)

        # A single player that also autoplays the result
        st.audio(output_path, format="audio/wav", start_time=0, autoplay=True)

        st.success("Speech generated successfully!")

# Clean up uploaded files and output directory
cleanup_requested = st.sidebar.button("Clean Up")
if cleanup_requested:
    # Wipe each working folder and recreate it empty, uploads first.
    for folder in (UPLOAD_FOLDER, OUTPUT_FOLDER):
        shutil.rmtree(folder)
        os.makedirs(folder, exist_ok=True)
    st.sidebar.success("Clean up completed!")

# Display information
st.sidebar.header("Information")
about_text = (
    "This app allows you to create a new voice by uploading .wav files. You can then generate speech "
    "using the selected voice and preset. You can play the generated audio and clean up uploaded files and the output directory when needed."
)
st.sidebar.markdown(about_text)