import gradio as gr
import torch
from transformers import pipeline
# Load the emotion recognition pipeline
emotion_pipeline = pipeline(model="j-hartmann/emotion-english-distilroberta-base", task="text-classification")
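# Note: by default the text-classification pipeline returns only the
# top-scoring label. To display scores for every emotion, one could pass
# top_k=None (a standard transformers pipeline argument) when building the
# pipeline; the formatting loop further down handles either shape:
# emotion_pipeline = pipeline(
#     model="j-hartmann/emotion-english-distilroberta-base",
#     task="text-classification",
#     top_k=None,  # return the full label distribution
# )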
# Load the ASR (speech-to-text) pipeline
asr = pipeline(task="automatic-speech-recognition",
               model="distil-whisper/distil-small.en")
# Detect emotion from speech: first transcribe the audio to text, then feed
# the transcription to the emotion-detection pipeline.
def transcribe_speech(filepath):
    if not filepath:
        return "No audio found, please retry.", ""
    # Transcribe the speech
    output = asr(filepath)
    transcription = output["text"]
    # Detect the emotion in the transcribed text
    emotion_output = emotion_pipeline(transcription)
    # Format the emotion scores as one "label: score" pair per line, so they
    # render as plain text in the output Textbox (which expects a string,
    # not a list)
    formatted_emotions = "\n".join(
        f"{emo['label']}: {emo['score']:.4f}" for emo in emotion_output
    )
    return transcription, formatted_emotions
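# Optional local sanity check (assumes a short English WAV file named
# "sample.wav" next to this script; the filename is only an example):
# text, emotions = transcribe_speech("sample.wav")
# print(text)
# print(emotions)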
# Gradio interfaces for microphone and file upload
mic_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources="microphone", type="filepath"),
    outputs=[
        gr.Textbox(label="Transcription", lines=3),
        gr.Textbox(label="Emotions", lines=5),
    ],
    allow_flagging="never",
)
file_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources="upload", type="filepath"),
    outputs=[
        gr.Textbox(label="Transcription", lines=3),
        gr.Textbox(label="Emotions", lines=5),
    ],
    allow_flagging="never",
)
# Create the demo with tabbed interfaces
demo = gr.Blocks()
with demo:
    gr.TabbedInterface(
        [mic_transcribe, file_transcribe],
        ["Transcribe Microphone", "Transcribe Audio File"],
    )
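# Equivalent alternative: gr.TabbedInterface is itself a launchable Blocks,
# so the wrapper above could be dropped and the tabs assigned directly:
# demo = gr.TabbedInterface(
#     [mic_transcribe, file_transcribe],
#     ["Transcribe Microphone", "Transcribe Audio File"],
# )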
# Launch the Gradio demo
if __name__ == "__main__":
    demo.launch(debug=True)