import gradio as gr
import torch
from transformers import pipeline

# Load the emotion recognition pipeline
emotion_pipeline = pipeline(
    task="text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
)

# Load the automatic speech recognition (ASR) pipeline
asr = pipeline(
    task="automatic-speech-recognition",
    model="distil-whisper/distil-small.en",
)


# Detect emotion from speech: first transcribe the audio to text,
# then feed the transcription to the emotion classification pipeline.
def transcribe_speech(filepath):
    if not filepath:
        return "No audio found, please retry.", ""

    # Transcribe the speech
    output = asr(filepath)
    transcription = output["text"]

    # Detect the emotions in the transcribed text
    # (top_k=None returns a score for every emotion label, not just the top one)
    emotion_output = emotion_pipeline(transcription, top_k=None)

    # Format the emotion scores, one label per line, for the output textbox
    formatted_emotions = "\n".join(
        f"{emo['label']}: {emo['score']:.4f}" for emo in emotion_output
    )

    return transcription, formatted_emotions


# Gradio interfaces for microphone and file upload
mic_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources="microphone", type="filepath"),
    outputs=[
        gr.Textbox(label="Transcription", lines=3),
        gr.Textbox(label="Emotions", lines=5),
    ],
    allow_flagging="never",
)

file_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources="upload", type="filepath"),
    outputs=[
        gr.Textbox(label="Transcription", lines=3),
        gr.Textbox(label="Emotions", lines=5),
    ],
    allow_flagging="never",
)

# Create the demo with tabbed interfaces
demo = gr.Blocks()
with demo:
    gr.TabbedInterface(
        [mic_transcribe, file_transcribe],
        ["Transcribe Microphone", "Transcribe Audio File"],
    )

# Launch the Gradio demo
if __name__ == "__main__":
    demo.launch(debug=True)