import os
import tempfile
import time

# Keep `spaces` ahead of `torch`, as in the original file (ZeroGPU convention).
import spaces
import torch

import gradio as gr
import yt_dlp as youtube_dl
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read

# Checkpoint loaded into the ASR pipeline; also interpolated into the UI descriptions.
MODEL_NAME = "TalTechNLP/whisper-large-v3-turbo-et-subs"
# Batch size handed to the pipeline call in do_transcribe.
BATCH_SIZE = 8
# NOTE(review): not referenced anywhere in the visible file - confirm whether
# an upload size limit was meant to be enforced.
FILE_LIMIT_MB = 1000
YT_LENGTH_LIMIT_S = 3600  # limit to 1 hour YouTube files

# Pipeline device argument: CUDA device index 0 when CUDA is available, else the CPU.
device = 0 if torch.cuda.is_available() else "cpu"

# Module-level ASR pipeline, constructed once at import time and used by
# do_transcribe for every request.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)


def convert_to_vtt(whisper_output):
    """
    Convert Whisper ASR output to VTT subtitle format.

    Args:
        whisper_output (dict): Dictionary containing Whisper ASR output. Only
            the 'chunks' list is read; each chunk must provide 'timestamp'
            (a ``(start, end)`` pair in seconds, either of which may be None)
            and 'text'.

    Returns:
        str: VTT formatted subtitles as a string. Every cue is numbered from 1
            and followed by a blank line; with no chunks only the header is
            returned.
    """
    def format_timestamp(seconds):
        """Convert seconds to VTT timestamp format (HH:MM:SS.mmm)"""
        if seconds is None:
            return "99:59:59.999"  # Use max time for None values

        # Work in whole milliseconds so that rounding carries into the
        # minutes/hours fields instead of producing a seconds field of "60.000".
        total_ms = int(round(seconds * 1000))
        hours, remainder_ms = divmod(total_ms, 3_600_000)
        minutes, remainder_ms = divmod(remainder_ms, 60_000)
        whole_seconds, millis = divmod(remainder_ms, 1000)
        # WebVTT separates the fraction with a period; a comma is SRT syntax.
        return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d}.{millis:03d}"

    # Start with VTT header followed by the mandatory blank line
    lines = ["WEBVTT", ""]

    # Process each chunk: cue number, timing line, text, blank separator
    for index, chunk in enumerate(whisper_output["chunks"], 1):
        start_time, end_time = chunk["timestamp"]
        lines.append(str(index))
        lines.append(f"{format_timestamp(start_time)} --> {format_timestamp(end_time)}")
        lines.append(chunk["text"].strip())
        lines.append("")

    return "\n".join(lines) + "\n"


def dynamic_gpu_duration(func, duration, *args):
    """Call ``func(*args)`` through a ``spaces.GPU`` wrapper created per call.

    The wrapper is built on every invocation so that ``duration`` can be
    chosen at run time rather than fixed at decoration time.

    Args:
        func: Callable to execute.
        duration: Value forwarded as ``spaces.GPU(duration=...)``.
        *args: Positional arguments passed to ``func``.

    Returns:
        Whatever the wrapped call returns.
    """
    def _gpu_task():
        return func(*args)

    # Explicit application of the decorator factory to the zero-argument task.
    gpu_bound_task = spaces.GPU(duration=duration)(_gpu_task)
    return gpu_bound_task()

@spaces.GPU
def dummy_gpu():
    """No-op function carrying a static ``spaces.GPU`` decoration; returns None.

    NOTE(review): nothing in this file calls it. Presumably it exists so the
    hosting platform sees at least one statically decorated GPU function at
    startup - confirm before removing.
    """

def do_transcribe(inputs):
    """Run the module-level ASR pipeline on ``inputs`` and return VTT text.

    Args:
        inputs: Audio input forwarded unchanged to ``pipe`` (callers in this
            file pass a file path).

    Returns:
        str: The pipeline output rendered by ``convert_to_vtt``.

    Raises:
        gr.Error: If ``inputs`` is None.
    """
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

    # Fixed decoding settings: transcription task, Estonian language.
    decoding_options = {"task": "transcribe", "language": "et"}
    asr_output = pipe(
        inputs,
        batch_size=BATCH_SIZE,
        generate_kwargs=decoding_options,
        return_timestamps=True,
    )
    return convert_to_vtt(asr_output)

def transcribe(file_path):
    """Transcribe the audio file at ``file_path`` and return VTT subtitles.

    The file is decoded once up front (so unreadable audio fails before a GPU
    is requested), then ``do_transcribe`` is run through
    ``dynamic_gpu_duration`` with a fixed GPU time budget.

    Args:
        file_path: Path to the audio/video file, or None when the UI was
            submitted without any audio.

    Returns:
        str: VTT formatted subtitles.

    Raises:
        gr.Error: If ``file_path`` is None.
    """
    # Without this guard open(None) raises a bare TypeError and the
    # user-friendly check inside do_transcribe is never reached.
    if file_path is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

    sampling_rate = 16000
    with open(file_path, "rb") as f:
        audio_data = ffmpeg_read(f.read(), sampling_rate)

    # Calculate the length in seconds
    # NOTE: currently informational only. A length-based budget,
    # max(59, int(audio_length / 5.0)), was disabled in favour of the fixed
    # value below.
    audio_length = len(audio_data) / sampling_rate
    expected_transcribe_duration = 59
    gr.Info(f"Starting to transcribe, requesting a GPU for {expected_transcribe_duration} seconds")
    return dynamic_gpu_duration(do_transcribe, expected_transcribe_duration, file_path)


def _return_yt_html_embed(yt_url):
    video_id = yt_url.split("?v=")[-1]
    HTML_str = (
        f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
        " </center>"
    )
    return HTML_str

def download_yt_audio(yt_url, filename):
    """Download a YouTube video to ``filename`` after checking its length.

    Metadata is fetched first (without downloading) so that videos longer
    than ``YT_LENGTH_LIMIT_S`` are rejected before any media is transferred.

    Args:
        yt_url (str): URL of the YouTube video.
        filename (str): Output path, used as yt-dlp's ``outtmpl``.

    Raises:
        gr.Error: If metadata extraction or the download fails, or if the
            video is longer than ``YT_LENGTH_LIMIT_S`` seconds.
    """
    # Context manager so the metadata-only instance is closed as well.
    with youtube_dl.YoutubeDL() as info_loader:
        try:
            info = info_loader.extract_info(yt_url, download=False)
        except youtube_dl.utils.DownloadError as err:
            raise gr.Error(str(err)) from err

    # "duration_string" looks like "S", "M:SS" or "H:MM:SS"; left-pad the
    # missing leading fields with zeros so three fields are always available.
    fields = [int(sub_length) for sub_length in info["duration_string"].split(":")]
    padded = [0] * (3 - len(fields)) + fields
    hours, minutes, seconds = padded[:3]
    file_length_s = hours * 3600 + minutes * 60 + seconds

    if file_length_s > YT_LENGTH_LIMIT_S:
        yt_length_limit_hms = time.strftime("%HH:%MM:%SS", time.gmtime(YT_LENGTH_LIMIT_S))
        file_length_hms = time.strftime("%HH:%MM:%SS", time.gmtime(file_length_s))
        raise gr.Error(f"Maximum YouTube length is {yt_length_limit_hms}, got {file_length_hms} YouTube video.")

    ydl_opts = {"outtmpl": filename, "format": "worstvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best"}

    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        try:
            ydl.download([yt_url])
        # download() reports failures as DownloadError; ExtractorError is
        # kept so anything the original handler caught is still caught.
        except (youtube_dl.utils.DownloadError, youtube_dl.utils.ExtractorError) as err:
            raise gr.Error(str(err)) from err


def yt_transcribe(yt_url, max_filesize=75.0):
    """Download a YouTube video and return its transcription as VTT text.

    The media is stored in a temporary directory that is removed once the
    transcription has finished.

    Args:
        yt_url (str): URL of the YouTube video.
        max_filesize (float): Accepted for backward compatibility; not used
            by the current implementation.

    Returns:
        str: VTT formatted subtitles.
    """
    with tempfile.TemporaryDirectory() as tmpdirname:
        filepath = os.path.join(tmpdirname, "video.mp4")
        download_yt_audio(yt_url, filepath)
        # transcribe() takes only the path, and must run while the temporary
        # directory (and therefore the downloaded file) still exists.
        text = transcribe(filepath)

    return text


# Top-level Blocks container (Ocean theme); the tabbed interface is attached
# to it further down, just before launch.
demo = gr.Blocks(theme=gr.themes.Ocean())

# Microphone tab: the audio component hands transcribe() a file path and the
# resulting VTT text is shown in an editable textbox with a copy button.
mf_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources="microphone", type="filepath")
    ],
    # A configured Textbox is used instead of the plain "text" output shorthand.
    outputs=gr.Textbox(label="VTT subtitles", elem_id="text", show_label=True, show_copy_button=True, autoscroll=False, interactive=True),
    title="Generate Estonian subtitles",
    description=(
        "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
        f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
        " of arbitrary length."
    ),
    allow_flagging="never",
)

# Upload tab: same handler and output widget as the microphone tab, but the
# audio component accepts an uploaded file.
file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources="upload", type="filepath", label="Audio file")
    ],
    # A configured Textbox is used instead of the plain "text" output shorthand.
    outputs=gr.Textbox(label="VTT subtitles", elem_id="text", show_label=True, show_copy_button=True, autoscroll=False, interactive=True),
    title="Generate Estonian subtitles",
    description=(
        "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
        f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
        " of arbitrary length."
    ),
    allow_flagging="never",
)

# YouTube tab: takes a URL and shows the VTT text produced by the
# yt_transcribe() function defined above.
# NOTE: this assignment rebinds the module-level name `yt_transcribe` from
# that function to the Interface object. `fn=yt_transcribe` is evaluated
# before the rebinding, so the Interface still holds the function.
yt_transcribe = gr.Interface(
    fn=yt_transcribe,
    inputs=[
        gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL")
    ],
    # Only the subtitles are output; an ["html", "text"] output pair is not used.
    outputs=gr.Textbox(label="VTT subtitles", elem_id="text", show_label=True, show_copy_button=True, autoscroll=False, interactive=True),
    title="Generate Estonian subtitles",
    description=(
        "Transcribe long-form YouTube videos with the click of a button! Demo uses the checkpoint"
        f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe video files of"
        " arbitrary length. NB! YouTube seems to often block download requests from Huggingface and there is nothing we can do about it."
    ),
    allow_flagging="never",
)

# Mount the three interfaces as tabs inside the Blocks container; the tab
# labels are given in the same order as the interfaces.
with demo:
    gr.TabbedInterface([mf_transcribe, file_transcribe, yt_transcribe], ["Microphone", "Audio file", "YouTube"])

# Enable request queuing and start the app with ssr_mode turned off.
demo.queue().launch(ssr_mode=False)