import gradio as gr
import torch
from transformers import pipeline
# Load the emotion recognition pipeline
emotion_pipeline = pipeline(model="j-hartmann/emotion-english-distilroberta-base", task="text-classification")
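# Note: with its default settings this text-classification pipeline returns
# only the highest-scoring emotion; calling emotion_pipeline(text, top_k=None)
# would return scores for every emotion label instead.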
# Load the automatic speech recognition (ASR) pipeline
asr = pipeline(task="automatic-speech-recognition",
               model="distil-whisper/distil-small.en")
# Detect emotion from speech: first transcribe the audio to text, then feed
# the transcription to the emotion classification pipeline.
def transcribe_speech(filepath):
    if not filepath:
        return "No audio found, please retry.", ""
    # Transcribe the speech
    output = asr(filepath)
    transcription = output["text"]
    # Detect the emotion in the transcribed text
    emotion_output = emotion_pipeline(transcription)
    # Format each emotion as a "label: score" line
    formatted_emotions = [
        f"{emo['label']}: {emo['score']:.4f}" for emo in emotion_output
    ]
    return transcription, "\n".join(formatted_emotions)
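# Rough sketch of what a call might yield (values are illustrative, not actual
# model output): asr("clip.wav") returns a dict like {"text": "I am so happy"},
# emotion_pipeline("I am so happy") returns a list like
# [{"label": "joy", "score": 0.97}], so transcribe_speech("clip.wav") would
# return something like ("I am so happy", "joy: 0.9700").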
# Gradio interfaces for microphone and file upload
mic_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources="microphone", type="filepath"),
    outputs=[
        gr.Textbox(label="Transcription", lines=3),
        gr.Textbox(label="Emotions", lines=5)
    ],
    allow_flagging="never"
)
file_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources="upload", type="filepath"),
    outputs=[
        gr.Textbox(label="Transcription", lines=3),
        gr.Textbox(label="Emotions", lines=5)
    ],
    allow_flagging="never"
)
# Create the demo with tabbed interfaces
demo = gr.Blocks()
with demo:
    gr.TabbedInterface(
        [mic_transcribe, file_transcribe],
        ["Transcribe Microphone", "Transcribe Audio File"],
    )
# Launch the Gradio demo
if __name__ == "__main__":
    demo.launch(debug=True)