"""Gradio demo for Vevo voice conversion, built on Amphion's VevoInferencePipeline."""

import os
import sys
import mimetypes

import torch
import gradio as gr
from pydub import AudioSegment
from huggingface_hub import snapshot_download

sys.path.append("./Amphion")

import Amphion.models.vc.vevo.vevo_utils as vevo_utils


def load_model():
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    # Content Tokenizer
    local_dir = snapshot_download(
        repo_id="amphion/Vevo",
        repo_type="model",
        cache_dir="./ckpts/Vevo",
        allow_patterns=["tokenizer/vq32/*"],
    )
    content_tokenizer_ckpt_path = os.path.join(
        local_dir, "tokenizer/vq32/hubert_large_l18_c32.pkl"
    )

    # Content-Style Tokenizer
    local_dir = snapshot_download(
        repo_id="amphion/Vevo",
        repo_type="model",
        cache_dir="./ckpts/Vevo",
        allow_patterns=["tokenizer/vq8192/*"],
    )
    content_style_tokenizer_ckpt_path = os.path.join(local_dir, "tokenizer/vq8192")

    # Autoregressive Transformer
    local_dir = snapshot_download(
        repo_id="amphion/Vevo",
        repo_type="model",
        cache_dir="./ckpts/Vevo",
        allow_patterns=["contentstyle_modeling/Vq32ToVq8192/*"],
    )
    ar_cfg_path = "./config/Vq32ToVq8192.json"
    ar_ckpt_path = os.path.join(local_dir, "contentstyle_modeling/Vq32ToVq8192")

    # Flow Matching Transformer
    local_dir = snapshot_download(
        repo_id="amphion/Vevo",
        repo_type="model",
        cache_dir="./ckpts/Vevo",
        allow_patterns=["acoustic_modeling/Vq8192ToMels/*"],
    )
    fmt_cfg_path = "./config/Vq8192ToMels.json"
    fmt_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vq8192ToMels")

    # Vocoder
    local_dir = snapshot_download(
        repo_id="amphion/Vevo",
        repo_type="model",
        cache_dir="./ckpts/Vevo",
        allow_patterns=["acoustic_modeling/Vocoder/*"],
    )
    vocoder_cfg_path = "./Amphion/models/vc/vevo/config/Vocoder.json"
    vocoder_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vocoder")

    pipeline = vevo_utils.VevoInferencePipeline(
        content_tokenizer_ckpt_path=content_tokenizer_ckpt_path,
        content_style_tokenizer_ckpt_path=content_style_tokenizer_ckpt_path,
        ar_cfg_path=ar_cfg_path,
        ar_ckpt_path=ar_ckpt_path,
        fmt_cfg_path=fmt_cfg_path,
        fmt_ckpt_path=fmt_ckpt_path,
        vocoder_cfg_path=vocoder_cfg_path,
        vocoder_ckpt_path=vocoder_ckpt_path,
        device=device,
    )
    return pipeline


def convert_to_wav(audio_path):
    if audio_path is None:
        return None
    mime, _ = mimetypes.guess_type(audio_path)
    if mime in ("audio/wav", "audio/x-wav"):
        return audio_path
    elif mime == "audio/mpeg":
        seg = AudioSegment.from_mp3(audio_path)
        wav_path = audio_path.rsplit(".", 1)[0] + ".wav"
        seg.export(wav_path, format="wav")
        return wav_path
    else:
        raise ValueError(f"Unsupported audio format: {mime}")


def process_audio(mode, content_audio, ref_style_audio, ref_timbre_audio,
                  src_text, ref_text, src_language, ref_language, steps,
                  progress=gr.Progress()):
    try:
        # Convert uploaded audio files to WAV if needed
        content_path = convert_to_wav(content_audio) if content_audio else None
        ref_style_path = convert_to_wav(ref_style_audio) if ref_style_audio else None
        ref_timbre_path = convert_to_wav(ref_timbre_audio) if ref_timbre_audio else None

        # Run inference based on mode
        if mode == "voice":
            if not all([content_path, ref_style_path, ref_timbre_path]):
                raise ValueError("Voice mode requires all audio inputs")
            gen_audio = inference_pipeline.inference_ar_and_fm(
                src_wav_path=content_path,
                src_text=None,
                style_ref_wav_path=ref_style_path,
                timbre_ref_wav_path=ref_timbre_path,
                flow_matching_steps=steps,
            )
        elif mode == "timbre":
            if not all([content_path, ref_timbre_path]):
                raise ValueError("Timbre mode requires source and timbre reference audio")
            gen_audio = inference_pipeline.inference_fm(
                src_wav_path=content_path,
                timbre_ref_wav_path=ref_timbre_path,
                flow_matching_steps=steps,
            )
        elif mode == "tts":
            if not all([ref_style_path, ref_timbre_path, src_text]):
                raise ValueError("TTS mode requires style audio, timbre audio, and source text")
            gen_audio = inference_pipeline.inference_ar_and_fm(
                src_wav_path=None,
                src_text=src_text,
                style_ref_wav_path=ref_style_path,
                timbre_ref_wav_path=ref_timbre_path,
                style_ref_wav_text=ref_text if ref_text else None,
                src_text_language=src_language,
                style_ref_wav_text_language=ref_language,
                flow_matching_steps=steps,
            )
        else:
            # Defensive: mode comes from a fixed Radio, but fail loudly if it ever changes
            raise ValueError(f"Unknown inference mode: {mode}")

        # Save and return the generated audio
        output_path = "output.wav"
        vevo_utils.save_audio(gen_audio, target_sample_rate=48000, output_path=output_path)
        return output_path
    except Exception as e:
        raise gr.Error(str(e))


# Initialize the model
print("Loading model...")
inference_pipeline = load_model()
print("Model loaded successfully!")

# Create the Gradio interface
with gr.Blocks(title="Vevo Voice Conversion") as demo:
    gr.Markdown("# Vevo Voice Conversion")

    with gr.Row():
        mode = gr.Radio(
            choices=["voice", "timbre", "tts"],
            value="timbre",
            label="Inference Mode",
        )

    with gr.Row():
        with gr.Column():
            content_audio = gr.Audio(label="Source Audio", type="filepath")
            ref_style_audio = gr.Audio(label="Reference Style Audio", type="filepath")
            ref_timbre_audio = gr.Audio(label="Reference Timbre Audio", type="filepath")
        with gr.Column():
            src_text = gr.Textbox(
                label="Source Text",
                placeholder="Enter text for TTS mode",
                visible=False,
            )
            ref_text = gr.Textbox(
                label="Reference Style Text",
                placeholder="Optional: Enter reference text",
                visible=False,
            )
            src_language = gr.Dropdown(
                choices=["en", "zh"],
                value="en",
                label="Source Language",
                visible=False,
            )
            ref_language = gr.Dropdown(
                choices=["en", "zh"],
                value="en",
                label="Reference Language",
                visible=False,
            )

    with gr.Row():
        steps = gr.Slider(
            minimum=1,
            maximum=64,
            value=32,
            step=1,
            label="Flow Matching Steps",
        )

    with gr.Row():
        submit_btn = gr.Button("Generate")

    output_audio = gr.Audio(label="Generated Audio")

    # Toggle component visibility based on the selected mode
    def update_visibility(mode):
        is_tts = mode == "tts"
        is_timbre = mode == "timbre"
        # Return visibility updates (not raw booleans) so Gradio shows/hides the components
        return {
            content_audio: gr.update(visible=not is_tts),
            ref_style_audio: gr.update(visible=not is_timbre),
            src_text: gr.update(visible=is_tts),
            ref_text: gr.update(visible=is_tts),
            src_language: gr.update(visible=is_tts),
            ref_language: gr.update(visible=is_tts),
        }

    mode.change(
        fn=update_visibility,
        inputs=[mode],
        outputs=[content_audio, ref_style_audio, src_text, ref_text,
                 src_language, ref_language],
    )

    # Handle generation
    submit_btn.click(
        fn=process_audio,
        inputs=[
            mode,
            content_audio,
            ref_style_audio,
            ref_timbre_audio,
            src_text,
            ref_text,
            src_language,
            ref_language,
            steps,
        ],
        outputs=output_audio,
    )


if __name__ == "__main__":
    demo.launch()