import numpy as np
import soundfile as sf

# import torch
from moviepy import AudioFileClip, VideoFileClip
from pydub import AudioSegment
from pytubefix import YouTube
from pytubefix.cli import on_progress

# from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from source_separation import Predictor


def token_verifier():
    visitor_data = "CgtkTUVqS2hIcUR3SSjm-ee6BjIKCgJVUxIEGgAgIA%3D%3D"
    po_token = "MnSwBAM1XYDp6bA8Z_JBCNCoNW8B0QpC-m_9mKWKsG5JWIjKIGCD2GZDzXXoz41VM9SGWki1uE1KmqxAu9rYVLoTChUn_wlHYvQ5GMmpZLtF1sbo5zeWzhSALHMSrjxIhGV5-xPF3QfwVU-TbY8MUGZKXvSlRA=="
    return (visitor_data, po_token)


def download_from_youtube(url, folder_path):
    yt = YouTube(
        url,
        on_progress_callback=on_progress,
        use_po_token=True,
        po_token_verifier=token_verifier,
    )
    print(yt.title)

    ys = yt.streams.get_highest_resolution()
    ys.download(output_path=folder_path, filename="temp.mp4")


def separate_video_and_audio(video_path, audio_path):
    # Load the video clip
    video_clip = VideoFileClip(video_path)

    # Extract the audio from the video clip
    audio_clip = video_clip.audio

    # Write the audio to a separate file
    audio_clip.write_audiofile(audio_path)


def load_audio(audio_path, sample_rate=44_100):
    audio = AudioSegment.from_file(audio_path)

    print("Entering the preprocessing of audio")

    # Convert the audio file to WAV format
    audio = audio.set_frame_rate(sample_rate)
    audio = audio.set_sample_width(2)  # Set bit depth to 16bit
    audio = audio.set_channels(1)  # Set to mono

    print("Audio file converted to WAV format")

    # Calculate the gain to be applied
    target_dBFS = -20
    gain = target_dBFS - audio.dBFS
    print(f"Calculating the gain needed for the audio: {gain} dB")

    # Normalize volume and limit gain range to between -3 and 3
    normalized_audio = audio.apply_gain(min(max(gain, -3), 3))

    waveform = np.array(normalized_audio.get_array_of_samples(), dtype=np.float32)
    max_amplitude = np.max(np.abs(waveform))
    waveform /= max_amplitude  # Normalize

    print(f"waveform shape: {waveform.shape}")
    print("waveform in np ndarray, dtype=" + str(waveform.dtype))

    return waveform, sample_rate


args = {
    "model_path": "data/models/UVR-MDX-NET-Inst_HQ_3.onnx",
    "denoise": True,
    "margin": 44100,
    "chunks": 15,
    "n_fft": 6144,
    "dim_t": 8,
    "dim_f": 3072,
}

separate_predictor = Predictor(args=args, device="cpu")


def source_separation(waveform):
    """
    Separate the audio into vocals and non-vocals using the given predictor.

    Args:
        predictor: The separation model predictor.
        audio (str or dict): The audio file path or a dictionary containing audio waveform and sample rate.

    Returns
    -------
        dict: A dictionary containing the separated vocals and updated audio waveform.
    """
    vocals, no_vocals = separate_predictor.predict(waveform)

    vocals = vocals[:, 0]  # vocals is stereo, only use one channel
    no_vocals = no_vocals[:, 0]  # no_vocals is stereo, only use one channel

    return vocals, no_vocals


def export_to_wav(vocals, no_vocals, sample_rate, folder_path):
    """Export segmented audio to WAV files."""
    sf.write(folder_path + "temp_vocals.wav", vocals, sample_rate)
    sf.write(folder_path + "temp_no_vocals.wav", no_vocals, sample_rate)


def combine_video_and_audio(video_path, no_vocals_path, output_path):
    my_clip = VideoFileClip(video_path, audio=False)
    audio_background = AudioFileClip(no_vocals_path)
    my_clip.audio = audio_background
    my_clip.write_videofile(output_path)


# https://www.youtube.com/watch?v=1jZEyU_eO1s
def get_karaoke(url):
    folder_path = "data/samples/"
    video_path = folder_path + "temp.mp4"
    audio_path = folder_path + "temp.mp3"
    no_vocals_path = folder_path + "temp_no_vocals.wav"
    output_path = folder_path + "result.mp4"

    download_from_youtube(url, folder_path)
    separate_video_and_audio(video_path, audio_path)
    waveform, sample_rate = load_audio(audio_path)
    vocals, no_vocals = source_separation(waveform)
    export_to_wav(vocals, no_vocals, sample_rate, folder_path)
    combine_video_and_audio(video_path, no_vocals_path, output_path)
    return output_path