import spaces
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
import soundfile as sf
from xcodec2.modeling_xcodec2 import XCodec2Model
import torchaudio
import gradio as gr
import tempfile
import os
import numpy as np

llasa_1b ='HKUSTAudio/Llasa-1B-Multilingual'

tokenizer = AutoTokenizer.from_pretrained(llasa_1b, token=os.getenv("HF_TOKEN"))

model = AutoModelForCausalLM.from_pretrained(
    llasa_1b, trust_remote_code=True, device_map="cuda", token=os.getenv("HF_TOKEN")
)

model_path = "srinivasbilla/xcodec2"
Codec_model = XCodec2Model.from_pretrained(model_path)
Codec_model.eval().cuda()

whisper_turbo_pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3-turbo",
    torch_dtype=torch.float16,
    device="cuda",
)


SPEAKERS = {
    "Male 1": {
        "path": "speakers/deep_speaker.mp3",
        "transcript": "Das große Tor von Minas Tirith brach erst, nachdem er die Ramme eingesetzt hatte.",
        "description": "Eine tiefe epische Männerstimme.",
    },
    "Male 2": {
        "path": "speakers/male_austrian_accent.mp3",
        "transcript": "Man kann sich auch leichter vorstellen, wie schwierig es ist, dass man Entscheidungen trifft, die allen passen.",
        "description": "Eine männliche Stimme mit österreicherischem Akzent.",
    },
    "Male 3": {
        "path": "speakers/male_energic.mp3",
        "transcript": "Wo keine Infrastruktur, da auch keine Ansiedlung von IT-Unternehmen und deren Beschäftigten bzw. dem geeigneten Fachkräftenachwuchs. Kann man diese Rechnung so einfach aufmachen, wie es es tatsächlich um deren regionale Verteilung beschäftigt?",
        "description": "Eine männliche energische Stimme",
    },
    "Male 4": {
        "path": "speakers/schneller_speaker.mp3",
        "transcript": "Genau, wenn wir alle Dächer voll machen, also alle Dächer von Einfamilienhäusern, alleine mit den Einfamilienhäusern können wir 20 Prozent des heutigen Strombedarfs decken.",
        "description": "Eine männliche Spreche mit schnellerem Tempo.",
    },
    "Female 1": {
        "path": "speakers/female_standard.mp3",
        "transcript": "Es wird ein Beispiel für ein barrierearmes Layout gegeben, sowie Tipps und ein Verweis auf eine Checkliste, die hilft, Barrierearmut in den eigenen Materialien zu prüfen bzw. umzusetzen.",
        "description": "Eine weibliche Stimme.",
    },
    "Female 2": {
        "path": "speakers/female_energic.mp3",
        "transcript": "Dunkel flog weiter durch das Wald. Er sah die Sterne am Phaneten an sich vorbeiziehen und fühlte sich frei und glücklich.",
        "description": "Eine weibliche Erzähler-Stimme.",
    },
    "Female 3": {
        "path": "speakers/austrian_accent.mp3",
        "transcript": "Die politische Europäische Union war geboren, verbrieft im Vertrag von Maastricht. Ab diesem Zeitpunkt bestehen zwei Vertragswerke.",
        "description": "Eine weibliche Stimme mit österreicherischem Akzent.",
    },
      "Special 1": {
        "path": "speakers/low_audio.mp3",
        "transcript": "Druckplatten und Lasersensoren, um sicherzugehen, dass er auch da drin ist und",
        "description": "Eine männliche Stimme mit schlechter Audioqualität als Effekt.",
    }, 
}


def preview_speaker(display_name):
    """Returns the audio and transcript for preview"""
    speaker_name = speaker_display_dict[display_name] 
    if speaker_name in SPEAKERS:
        waveform, sample_rate = torchaudio.load(SPEAKERS[speaker_name]["path"])
        return (sample_rate, waveform[0].numpy()), SPEAKERS[speaker_name]["transcript"]
    return None, ""


def normalize_audio(waveform: torch.Tensor, target_db: float = -20) -> torch.Tensor:
    """
    Normalize audio volume to target dB and limit gain range.

    Args:
        waveform (torch.Tensor): Input audio waveform
        target_db (float): Target dB level (default: -20)

    Returns:
        torch.Tensor: Normalized audio waveform
    """
    # Calculate current dB
    eps = 1e-10
    current_db = 20 * torch.log10(torch.max(torch.abs(waveform)) + eps)

    # Calculate required gain
    gain_db = target_db - current_db

    # Limit gain to -3 to 3 dB range
    gain_db = torch.clamp(gain_db, min=-3, max=3)

    # Apply gain
    gain_factor = 10 ** (gain_db / 20)
    normalized = waveform * gain_factor

    # Final peak normalization
    max_amplitude = torch.max(torch.abs(normalized))
    if max_amplitude > 0:
        normalized = normalized / max_amplitude

    return normalized


def ids_to_speech_tokens(speech_ids):
    speech_tokens_str = []
    for speech_id in speech_ids:
        speech_tokens_str.append(f"<|s_{speech_id}|>")
    return speech_tokens_str


def extract_speech_ids(speech_tokens_str):
    speech_ids = []
    for token_str in speech_tokens_str:
        if token_str.startswith("<|s_") and token_str.endswith("|>"):
            num_str = token_str[4:-2]

            num = int(num_str)
            speech_ids.append(num)
        else:
            print(f"Unexpected token: {token_str}")
    return speech_ids

@spaces.GPU(duration=30)
@torch.inference_mode()
def infer_with_speaker(
    display_name,
    target_text,
    temp,
    top_p_val,
    min_new_tokens,
    do_sample,
    progress=gr.Progress(),
):
    """Modified infer function that uses predefined speaker"""
    speaker_name = speaker_display_dict[display_name]  # Get actual speaker name
    if speaker_name not in SPEAKERS:
        return None, "Invalid speaker selected"


    return infer(
        SPEAKERS[speaker_name]["path"],
        target_text,
        temp,
        top_p_val,
        min_new_tokens,
        do_sample,
        SPEAKERS[speaker_name]["transcript"],  # Pass the predefined transcript
        progress,
    )

@spaces.GPU(duration=30)
@torch.inference_mode()
def gradio_infer(*args, **kwargs):
    return infer(*args, **kwargs)

def infer(
    sample_audio_path,
    target_text,
    temp,
    top_p_val,
    min_new_tokens,
    do_sample,
    transcribed_text=None,
    progress=gr.Progress(),
):
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
        progress(0, "Loading and trimming audio...")
        waveform, sample_rate = torchaudio.load(sample_audio_path)

        waveform = normalize_audio(waveform)

        if len(waveform[0]) / sample_rate > 15:
            gr.Warning("Trimming audio to first 15secs.")
            waveform = waveform[:, : sample_rate * 15]
            waveform = torch.nn.functional.pad(
                waveform, (0, int(sample_rate * 0.5)), "constant", 0
            )

        # Check if the audio is stereo (i.e., has more than one channel)
        if waveform.size(0) > 1:
            # Convert stereo to mono by averaging the channels
            waveform_mono = torch.mean(waveform, dim=0, keepdim=True)
        else:
            # If already mono, just use the original waveform
            waveform_mono = waveform

        prompt_wav = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=16000
        )(waveform_mono)

        if transcribed_text is None:
            progress(0.3, "Transcribing audio...")
            prompt_text = whisper_turbo_pipe(prompt_wav[0].numpy())["text"].strip()
        else:
            prompt_text = transcribed_text

        progress(0.5, "Transcribed! Generating speech...")

        if len(target_text) == 0:
            return None
        elif len(target_text) > 500:
            gr.Warning("Text is too long. Please keep it under 300 characters.")
            target_text = target_text[:500]

        input_text = prompt_text + " " + target_text

        # TTS start!
        with torch.no_grad():
            # Encode the prompt wav
            vq_code_prompt = Codec_model.encode_code(input_waveform=prompt_wav)

            vq_code_prompt = vq_code_prompt[0, 0, :]
            # Convert int 12345 to token <|s_12345|>
            speech_ids_prefix = ids_to_speech_tokens(vq_code_prompt)

            formatted_text = (
                f"<|TEXT_UNDERSTANDING_START|>{input_text}<|TEXT_UNDERSTANDING_END|>"
            )

            # Tokenize the text and the speech prefix
            chat = [
                {
                    "role": "user",
                    "content": "Convert the text to speech:" + formatted_text,
                },
                {
                    "role": "assistant",
                    "content": "<|SPEECH_GENERATION_START|>"
                    + "".join(speech_ids_prefix),
                },
            ]

            input_ids = tokenizer.apply_chat_template(
                chat,
                tokenize=True,
                return_tensors="pt",
                continue_final_message=True,
            )
            input_ids = input_ids.to("cuda")
            speech_end_id = tokenizer.convert_tokens_to_ids("<|SPEECH_GENERATION_END|>")

            # Generate the speech autoregressively
            outputs = model.generate(
                input_ids,
                max_length=2048,  # We trained our model with a max length of 2048
                eos_token_id=speech_end_id,
                do_sample=do_sample,
                top_p=top_p_val,
                temperature=temp,
                min_new_tokens=min_new_tokens,
            )

            # Extract the speech tokens
            generated_ids = outputs[0][input_ids.shape[1] - len(speech_ids_prefix) : -1]

            speech_tokens = tokenizer.batch_decode(
                generated_ids, skip_special_tokens=False
            )
            raw_output = " ".join(speech_tokens)  # Capture raw tokens

            speech_tokens = tokenizer.batch_decode(
                generated_ids, skip_special_tokens=True
            )

            # Convert  token <|s_23456|> to int 23456
            speech_tokens = extract_speech_ids(speech_tokens)

            speech_tokens = torch.tensor(speech_tokens).cuda().unsqueeze(0).unsqueeze(0)

            # Decode the speech tokens to speech waveform
            gen_wav = Codec_model.decode_code(speech_tokens)

            # if only need the generated part
            gen_wav = gen_wav[:, :, prompt_wav.shape[1] :]

            progress(1, "Synthesized!")

        return (
            16000,
            gen_wav[0, 0, :].cpu().numpy(),
        ), raw_output  # Return both audio and raw tokens


with gr.Blocks() as app_tts:
    gr.Markdown("# Zero Shot Voice Clone TTS")

    with gr.Accordion("Model Settings", open=False):
        temperature = gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.8,
            step=0.1,
            label="Temperature",
            info="Higher values = more random/creative output",
        )
        top_p = gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=1.0,
            step=0.1,
            label="Top P",
            info="Nucleus sampling threshold",
        )
        min_new_tokens = gr.Slider(
            minimum=0,
            maximum=128,
            value=3,
            step=1,
            label="Min Length",
            info="If the model just produces a click you can force it to create longer generations.",
        )
        do_sample = gr.Checkbox(
            label="Sample", value=True, info="Sample from the distribution"
        )

    ref_audio_input = gr.Audio(label="Reference Audio", type="filepath")
    gen_text_input = gr.Textbox(label="Text to Generate", lines=10)

    generate_btn = gr.Button("Synthesize", variant="primary")

    audio_output = gr.Audio(label="Synthesized Audio")
    raw_output_display = gr.Textbox(
        label="Raw Model Output", interactive=False
    )  # Add textbox

    generate_btn.click(
        lambda *args: gradio_infer(*args, transcribed_text=None),
        inputs=[
            ref_audio_input,
            gen_text_input,
            temperature,
            top_p,
            min_new_tokens,
            do_sample,
        ],
        outputs=[audio_output, raw_output_display],  # Include both outputs
    )


with gr.Blocks() as app_speaker:
    gr.Markdown("# Predefined Speaker TTS")

    with gr.Accordion("Model Settings", open=False):
        temperature = gr.Slider(
            minimum=0.0,
            maximum=1.0,
            value=0.7,
            step=0.1,
            label="Temperature",
            info="Higher values = more random/creative output",
        )
        top_p = gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=1.0,
            step=0.1,
            label="Top P",
            info="Nucleus sampling threshold",
        )
        min_new_tokens = gr.Slider(
            minimum=0,
            maximum=128,
            value=3,
            step=1,
            label="Min Length",
            info="If the model just produces a click you can force it to create longer generations.",
        )
        do_sample = gr.Checkbox(
            label="Sample", value=True, info="Sample from the distribution"
        )

    with gr.Row():
        speaker_display_dict = {
        f"{name} - {SPEAKERS[name]['description']}": name 
        for name in SPEAKERS.keys()
        }
        speaker_dropdown = gr.Dropdown(
            choices=list(speaker_display_dict.keys()),
            label="Select Speaker",
            value=list(speaker_display_dict.keys())[0],
        )
        preview_btn = gr.Button("Preview Voice")


    with gr.Row():
        preview_audio = gr.Audio(label="Preview")
        preview_text = gr.Textbox(label="Original Transcript", interactive=False)

    gen_text_input = gr.Textbox(label="Text to Generate", lines=10)
    generate_btn = gr.Button("Synthesize", variant="primary")

    audio_output = gr.Audio(label="Synthesized Audio")
    raw_output_display = gr.Textbox(label="Raw Model Output", interactive=False)

    # Connect the preview button
    preview_btn.click(
        preview_speaker,
        inputs=[speaker_dropdown],
        outputs=[preview_audio, preview_text],
    )

    # Connect the generate button
    generate_btn.click(
        infer_with_speaker,
        inputs=[
            speaker_dropdown,
            gen_text_input,
            temperature,
            top_p,
            min_new_tokens,
            do_sample,
        ],
        outputs=[audio_output, raw_output_display],
    )


with gr.Blocks() as app_credits:
    gr.Markdown("""
# Credits

* [zhenye234](https://github.com/zhenye234) for the original [repo](https://github.com/zhenye234/LLaSA_training)
* [mrfakename](https://huggingface.co/mrfakename) for the [gradio demo code](https://huggingface.co/spaces/mrfakename/E2-F5-TTS)        
""")

with gr.Blocks() as app:
    gr.Markdown(
        """
Official Multilingual version
"""
    )
    gr.TabbedInterface([app_speaker, app_tts], ["Speaker", "Clone"])


app.launch(ssr_mode=False)