# coding=utf-8

import base64
import io
import os
import re
import tempfile

import gradio as gr
import librosa
import numpy as np
import soundfile as sf
import spaces
import torch
import torchaudio
from funasr import AutoModel
from sv import clean_and_emoji_annotate_speech, process_audio


@spaces.GPU
def model_inference(input_wav, language, fs=16000):
    """Run ``process_audio`` (from ``sv``) on one audio clip and return its result.

    A ``(sample_rate, samples)`` tuple is converted to mono float32 at 16 kHz
    before processing. The prepared audio is written to a temporary WAV file,
    handed to ``process_audio`` by path, and the file is removed afterwards.

    Args:
        input_wav: Either a ``(sample_rate, samples)`` tuple or an array of
            samples. Non-tuple input is written to disk unchanged and labelled
            as 16 kHz.
        language: One of ``"auto"``, ``"zh"``, ``"en"``, ``"yue"``, ``"ja"``,
            ``"ko"``, ``"nospeech"``. Empty or ``None`` means ``"auto"``.
        fs: Sample rate. Replaced by the rate carried in ``input_wav`` when a
            tuple is given; not otherwise consulted.
            NOTE(review): non-tuple input is never resampled -- confirm that
            callers only pass 16 kHz data in that form.

    Returns:
        Whatever ``process_audio`` returns for the clip.

    Raises:
        KeyError: If ``language`` is not one of the supported codes.
        gr.Error: If no audio was provided.
    """
    target_fs = 16000

    language_abbr = {
        "auto": "auto",
        "zh": "zh",
        "en": "en",
        "yue": "yue",
        "ja": "ja",
        "ko": "ko",
        "nospeech": "nospeech",
    }

    # A cleared dropdown yields None/"" -- treat both as automatic detection
    # instead of failing on len(None).
    language = language or "auto"
    selected_language = language_abbr[language]

    if input_wav is None:
        # Surface a readable message in the UI rather than an opaque failure
        # from the audio writer further down.
        raise gr.Error("Please upload or record an audio clip first.")

    # Handle input_wav format
    if isinstance(input_wav, tuple):
        fs, input_wav = input_wav
        input_wav = np.asarray(input_wav)
        if np.issubdtype(input_wav.dtype, np.integer):
            # Scale by the full range of the *actual* integer type; dividing
            # int32 samples by the int16 maximum would leave them far outside
            # [-1, 1].
            full_scale = np.iinfo(input_wav.dtype).max
            input_wav = input_wav.astype(np.float32) / full_scale
        else:
            # Floating-point samples are taken as already normalised.
            input_wav = input_wav.astype(np.float32)
        if input_wav.ndim > 1:
            # Down-mix to mono; assumes channels are on the last axis.
            input_wav = input_wav.mean(-1)
        if fs != target_fs:
            resampler = torchaudio.transforms.Resample(fs, target_fs)
            input_wav_t = torch.from_numpy(input_wav).to(torch.float32)
            input_wav = resampler(input_wav_t[None, :])[0, :].numpy()

    # Reserve a temporary path and close the handle immediately so the file
    # can be reopened by name on every platform.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
        temp_audio_path = temp_audio.name

    try:
        # Writing happens inside the try so a failed write cannot leak the file.
        sf.write(temp_audio_path, input_wav, target_fs)
        # Process the audio using the function from sv.py
        result = process_audio(temp_audio_path, language=selected_language)
    finally:
        # Remove the temporary audio file
        os.remove(temp_audio_path)

    return result


# Example rows for the demo: each entry is [audio file path, language code].
audio_examples = [["example/mtr.mp3", "auto"]]


def launch():
    """Build the Gradio demo interface and start serving it.

    The page has an audio input, a language dropdown inside a
    "Configuration" accordion, a "Start" button and a "Results" textbox,
    plus an examples panel fed from ``audio_examples``. Clicking the button
    calls ``model_inference`` with the audio and language values and shows
    its return value in the textbox.
    """
    language_choices = ["auto", "zh", "en", "yue", "ja", "ko", "nospeech"]

    with gr.Blocks(theme=gr.themes.Soft()) as app:
        with gr.Row():
            with gr.Column():
                audio_box = gr.Audio(label="Upload audio or use the microphone")

                with gr.Accordion("Configuration"):
                    language_box = gr.Dropdown(
                        choices=language_choices,
                        value="auto",
                        label="Language",
                    )
                run_button = gr.Button("Start", variant="primary")
                result_box = gr.Textbox(label="Results")
            # The examples panel sits in the row, next to the input column.
            gr.Examples(
                examples=audio_examples,
                inputs=[audio_box, language_box],
                examples_per_page=20,
            )

        # Wire the button to the inference function.
        run_button.click(
            fn=model_inference,
            inputs=[audio_box, language_box],
            outputs=result_box,
        )

    app.launch()


if __name__ == "__main__":
    # Start the Gradio demo only when this file is executed as a script.
    launch()