import gradio as gr
import torch
from transformers import pipeline

# Load the emotion recognition pipeline
emotion_pipeline = pipeline(
    task="text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
)

# Load the automatic speech recognition (ASR) pipeline
asr = pipeline(
    task="automatic-speech-recognition",
    model="distil-whisper/distil-small.en",
)


# Detect emotion from speech: first transcribe the audio to text,
# then feed the transcription to the emotion classification pipeline.
def transcribe_speech(filepath):
    if not filepath:
        return "No audio found, please retry.", ""

    # Transcribe the speech
    output = asr(filepath)
    transcription = output["text"]

    # Detect the emotions in the transcribed text
    # (top_k=None returns a score for every emotion label, not just the top one)
    emotion_output = emotion_pipeline(transcription, top_k=None)

    # Format the emotion scores, one label per line, for the output textbox
    formatted_emotions = "\n".join(
        f"{emo['label']}: {emo['score']:.4f}" for emo in emotion_output
    )

    return transcription, formatted_emotions


# Gradio interfaces for microphone and file upload
mic_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources="microphone", type="filepath"),
    outputs=[
        gr.Textbox(label="Transcription", lines=3),
        gr.Textbox(label="Emotions", lines=5),
    ],
    allow_flagging="never",
)

file_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources="upload", type="filepath"),
    outputs=[
        gr.Textbox(label="Transcription", lines=3),
        gr.Textbox(label="Emotions", lines=5),
    ],
    allow_flagging="never",
)

# Create the demo with tabbed interfaces
demo = gr.Blocks()
with demo:
    gr.TabbedInterface(
        [mic_transcribe, file_transcribe],
        ["Transcribe Microphone", "Transcribe Audio File"],
    )

# Launch the Gradio demo
if __name__ == "__main__":
    demo.launch(debug=True)