import os
import tempfile

import gradio as gr
import torch
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read

# Model identifier handed to the speech-recognition pipeline below.
MODEL_NAME = "jensenlwt/whisper-small-singlish-122k"
# NOTE(review): not referenced anywhere in the visible part of this file;
# presumably an upload size cap in megabytes -- confirm before relying on it.
FILE_LIMIT_MB = 1000

# Use the first CUDA device when torch reports one, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Built once at import time; transcribe() calls this object for every request.
pipe = pipeline(
    model=MODEL_NAME,
    task="automatic-speech-recognition",
    device=device,
    chunk_length_s=30,
)


def transcribe(inputs, task=None):
    """Transcribe audio with the module-level Whisper pipeline.

    Args:
        inputs: Audio to transcribe, passed unchanged to ``pipe``. ``None``
            means nothing was uploaded or recorded.
        task: Accepted for backward compatibility and ignored. It defaults
            to ``None`` because the interfaces in this file declare a single
            input component, so Gradio calls this function with one
            argument; a required second parameter made every request fail
            with ``TypeError``.

    Returns:
        The ``"chunks"`` entry of the pipeline result, which is produced
        because ``return_timestamps=True`` is requested.

    Raises:
        gr.Error: If ``inputs`` is ``None``.
    """
    if inputs is None:
        raise gr.Error(
            "No audio file submitted! Please upload or record an audio file before submitting your request."
        )

    result = pipe(
        inputs,
        generate_kwargs={"language": "english"},
        return_timestamps=True,
    )
    return result["chunks"]


# Outer container; the tabbed layout is attached to it further down.
demo = gr.Blocks()

# Tab 1: audio captured from the microphone, passed to transcribe() as a file path.
mf_transcribe = gr.Interface(
    title="Whisper Small: Singlish Edition 🇸🇬",
    description="",
    fn=transcribe,
    inputs=[gr.inputs.Audio(source="microphone", type="filepath", optional=True)],
    outputs="text",
    layout="horizontal",
    theme="huggingface",
    allow_flagging="never",
)

# Tab 2: an uploaded audio file, passed to transcribe() as a file path.
file_transcribe = gr.Interface(
    title="Whisper Small: Singlish Edition 🇸🇬",
    description="NOTE: Current space seems to cut off the last few seconds of the recording. For exploration, I would recommend sticking to audio <10s long.",
    fn=transcribe,
    inputs=[
        gr.inputs.Audio(
            label="Audio file", source="upload", type="filepath", optional=True
        )
    ],
    outputs="text",
    layout="horizontal",
    theme="huggingface",
    allow_flagging="never",
)


# Place both interfaces side by side as named tabs inside the outer container.
with demo:
    gr.TabbedInterface(
        interface_list=[mf_transcribe, file_transcribe],
        tab_names=["Microphone", "Audio file"],
    )

# Start the app with request queueing switched on.
demo.launch(enable_queue=True)