import os
import uuid

import torch
import whisper
import streamlit as st
import soundfile as sf
from groq import Groq
from dotenv import load_dotenv
from tempfile import NamedTemporaryFile

# Load environment variables
load_dotenv()
API_KEY = os.getenv("GROQ_API_KEY")
HF_TOKEN = os.getenv("HF_TOKEN")

# By using XTTS you agree to the CPML license
os.environ["COQUI_TOS_AGREED"] = "1"

# Import TTS components
from TTS.api import TTS
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from TTS.utils.generic_utils import get_user_data_dir
from TTS.utils.manage import ModelManager

# Download and configure the XTTS model
print("Downloading Coqui XTTS V2 if not already downloaded")
model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
ModelManager().download_model(model_name)
model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
print("XTTS downloaded")

config = XttsConfig()
config.load_json(os.path.join(model_path, "config.json"))
model = Xtts.init_from_config(config)
model.load_checkpoint(
    config,
    checkpoint_path=os.path.join(model_path, "model.pth"),
    vocab_path=os.path.join(model_path, "vocab.json"),
    eval=True,
    use_deepspeed=True,  # requires the deepspeed package; set to False if it is not installed
)
if torch.cuda.is_available():
    model.cuda()

supported_languages = config.languages


# LLM Response Function
def get_llm_response(api_key, user_input):
    if not api_key:
        return "API key not found. Please set the GROQ_API_KEY environment variable."

    client = Groq(api_key=api_key)
    prompt = (
        "IMPORTANT: You are an AI assistant that MUST provide responses in 25 words or less.\n"
        "CRITICAL RULES:\n"
        "1. NEVER exceed 25 words unless absolutely necessary.\n"
        "2. Always give a complete sentence with full context.\n"
        "3. Answer directly and precisely.\n"
        "4. Use clear, simple language.\n"
        "5. Maintain a polite, professional tone.\n"
        "6. NO lists, bullet points, or multiple paragraphs.\n"
        "7. NEVER apologize for brevity - embrace it.\n"
        "Your response will be converted to speech. Maximum 25 words."
    )
    try:
        chat_completion = client.chat.completions.create(
            messages=[
                {"role": "system", "content": prompt},
                {"role": "user", "content": user_input},
            ],
            model="llama3-8b-8192",
            temperature=0.5,
            top_p=1,
            stream=False,
        )
        return chat_completion.choices[0].message.content
    except Exception as e:
        return f"Error with LLM: {str(e)}"


# Transcribe Audio
def transcribe_audio(audio_path, model_size="base"):
    try:
        whisper_model = whisper.load_model(model_size)
        result = whisper_model.transcribe(audio_path)
        return result["text"]
    except Exception as e:
        return f"Error transcribing audio: {str(e)}"


# Generate speech using the configured XTTS model
def generate_speech(text, output_file, speaker_wav, language="en"):
    if not os.path.exists(speaker_wav):
        raise FileNotFoundError("Reference audio file not found. Please upload a valid audio.")

    if language not in supported_languages:
        st.warning(f"Language {language} is not supported. Defaulting to English.")
        language = "en"

    # Use the configured model directly
    try:
        # Extract the speaker conditioning latents from the reference audio
        gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
            audio_path=speaker_wav,
            gpt_cond_len=30,
            gpt_cond_chunk_len=4,
            max_ref_length=60,
        )
        out = model.inference(
            text,
            language,
            gpt_cond_latent,
            speaker_embedding,
            repetition_penalty=5.0,
            temperature=0.75,
        )

        # Save the generated waveform (24 kHz) to file
        sf.write(output_file, out["wav"], 24000, "PCM_24")
        return True, "Speech generated successfully"
    except Exception as e:
        return False, f"Error generating speech: {str(e)}"


# Streamlit App
def main():
    st.set_page_config(page_title="Vocal AI", layout="wide")
    st.title("Vocal AI - Voice Cloning Assistant")
    st.write("Clone your voice and interact with an AI assistant that responds in your voice!")

    st.sidebar.title("Settings")

    # Language selection
    language = st.sidebar.selectbox(
        "Output Language",
        supported_languages,
        index=supported_languages.index("en") if "en" in supported_languages else 0,
    )

    # TOS agreement
    agree_tos = st.sidebar.checkbox("I agree to the Coqui Public Model License (CPML)", value=False)

    col1, col2 = st.columns(2)

    with col1:
        st.header("Step 1: Provide Reference Voice")
        reference_audio = st.file_uploader("Upload Reference Audio", type=["wav", "mp3", "ogg"])
        ref_audio_path = None
        if reference_audio:
            with NamedTemporaryFile(delete=False, suffix=".wav") as temp_ref_audio:
                temp_ref_audio.write(reference_audio.read())
                ref_audio_path = temp_ref_audio.name
            st.audio(ref_audio_path)

    with col2:
        st.header("Step 2: Ask Something")
        # User input (text or audio)
        input_type = st.radio("Choose Input Type", ("Text", "Upload Audio"))
        user_input = None
        if input_type == "Text":
            user_input = st.text_area("Enter your question or prompt here")
        else:
            user_audio = st.file_uploader("Upload your question as audio", type=["wav", "mp3", "ogg"])
            if user_audio:
                with NamedTemporaryFile(delete=False, suffix=".wav") as temp_user_audio:
                    temp_user_audio.write(user_audio.read())
                st.audio(temp_user_audio.name)
                user_input = transcribe_audio(temp_user_audio.name)
                st.write(f"Transcribed: {user_input}")

    # Process and generate response
    if st.button("Generate AI Response in My Voice"):
        if not agree_tos:
            st.error("Please agree to the Coqui Public Model License to continue.")
            return
        if not ref_audio_path:
            st.error("Please upload reference audio.")
            return
        if not user_input:
            st.error("Please enter text or upload an audio question.")
            return

        with st.spinner("Processing..."):
            # Get AI response
            llm_response = get_llm_response(API_KEY, user_input)
            st.subheader("AI Response:")
            st.write(llm_response)

            # Generate speech in the cloned voice
            output_audio_path = f"output_speech_{uuid.uuid4()}.wav"
            success, message = generate_speech(llm_response, output_audio_path, ref_audio_path, language)

            if success:
                st.subheader("Listen to the response in your voice:")
                st.audio(output_audio_path, format="audio/wav")
            else:
                st.error(message)


if __name__ == "__main__":
    main()