"""Streamlit app for OuteTTS speech synthesis with optional voice cloning.

Users can upload (or record) a short reference clip plus its transcription to
build a speaker profile, then synthesize arbitrary text in that voice.
"""

import os
import tempfile
import wave

import numpy as np
import outetts
import sounddevice as sd
import streamlit as st
from pydub import AudioSegment


@st.cache_resource
def load_interface():
    """Build the OuteTTS interface once and reuse it across Streamlit reruns.

    Streamlit re-executes the whole script on every widget interaction;
    without caching, the 500M-parameter model would be reloaded each time,
    making the app unusably slow.
    """
    model_config = outetts.HFModelConfig_v1(
        model_path="OuteAI/OuteTTS-0.2-500M",
        language="en",  # Supported languages: en, zh, ja, ko
    )
    return outetts.InterfaceHF(model_version="0.2", cfg=model_config)


interface = load_interface()

# Streamlit UI
st.title("OuteTTS Speech Synthesis")
st.write("Enter text below to generate speech.")

# Sidebar for reference voice
st.sidebar.title("Voice Cloning")
reference_audio = st.sidebar.file_uploader(
    "Upload a reference audio (any format)",
    type=["wav", "mp3", "ogg", "flac", "m4a"],
)
transcript = st.sidebar.text_area("Transcription of the reference audio")


def convert_to_wav(audio_file):
    """Convert an uploaded audio file of any supported format to WAV.

    Returns the path of a temporary ``.wav`` file; the caller is
    responsible for deleting it when done.
    """
    # delete=False keeps the file on disk after the handle is closed; close
    # the handle right away so pydub can (re)open the path on all platforms,
    # including Windows where open handles block a second writer.
    temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    temp_audio.close()
    audio = AudioSegment.from_file(audio_file)
    audio.export(temp_audio.name, format="wav")
    return temp_audio.name


# Initialize up front so the cleanup at the bottom of the script never hits
# an undefined name (the original raised NameError when an upload was
# present but the transcription was still empty).
ref_audio_path = None

if reference_audio and transcript:
    ref_audio_path = convert_to_wav(reference_audio)
    # Create speaker profile
    speaker = interface.create_speaker(ref_audio_path, transcript)
    # Save the speaker profile
    interface.save_speaker(speaker, "speaker.json")
else:
    speaker = None


def record_audio(duration=5, samplerate=44100):
    """Record ``duration`` seconds of mono 16-bit audio from the default mic.

    Returns the path of a temporary WAV file holding the recording; the
    caller is responsible for deleting it.
    """
    st.sidebar.write("Recording...")
    recording = sd.rec(
        int(duration * samplerate),
        samplerate=samplerate,
        channels=1,
        dtype=np.int16,
    )
    sd.wait()  # Block until the recording finishes
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    temp_file.close()  # Close before wave reopens the path (Windows-safe)
    with wave.open(temp_file.name, "wb") as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)  # 2 bytes per sample == 16-bit PCM
        wf.setframerate(samplerate)
        wf.writeframes(recording.tobytes())
    return temp_file.name


if not speaker:
    st.sidebar.write("Or record your voice below:")
    # The recorded path must survive Streamlit reruns: typing the
    # transcription triggers a rerun, which would discard a plain local
    # variable set inside the button branch. Persist it in session_state
    # and render the transcription widget OUTSIDE the button branch so it
    # does not disappear on the very next interaction.
    if st.sidebar.button("Record Voice"):
        st.session_state["recorded_audio_path"] = record_audio()
        st.sidebar.success("Recording complete!")
    if st.session_state.get("recorded_audio_path"):
        transcript = st.sidebar.text_area(
            "Transcription of the recorded audio"
        )
        if transcript:
            # Create speaker profile from recorded audio
            speaker = interface.create_speaker(
                st.session_state["recorded_audio_path"], transcript
            )
            # Save the speaker profile
            interface.save_speaker(speaker, "speaker.json")

text_input = st.text_area(
    "Text to convert to speech:", "Hello, this is an AI-generated voice."
)

if st.button("Generate Speech"):
    with st.spinner("Generating audio..."):
        # Generate speech with or without the speaker profile
        output = interface.generate(
            text=text_input,
            temperature=0.1,
            repetition_penalty=1.1,
            max_length=4096,
            speaker=speaker,
        )
        # Save the synthesized speech to a file
        output_path = "output.wav"
        output.save(output_path)
        # Play the audio in the Streamlit app
        st.audio(output_path, format="audio/wav")
        st.success("Speech generated successfully!")

# Clean up the temporary WAV produced from an upload. It is regenerated by
# convert_to_wav on every rerun, so removing it here never strands state.
# (The recorded file is intentionally kept: its path lives in session_state
# and is reused across reruns to rebuild the speaker profile.)
if ref_audio_path and os.path.exists(ref_audio_path):
    os.remove(ref_audio_path)