# Gradio demo: transcribe speech with Whisper, classify the emotion of the
# transcript with an XLM-RoBERTa emotion model, and report prosodic features.
import gradio as gr
import torch
import warnings
import librosa
import numpy as np
from transformers import pipeline
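# Assumed environment: the gradio, torch, transformers, librosa, and numpy packages
# are installed (e.g. `pip install gradio torch transformers librosa numpy`); the
# speech pipeline also relies on ffmpeg being available to decode audio files.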
warnings.filterwarnings('ignore')

# Whisper checkpoint used for transcription and the batch size for chunked inference.
MODEL_NAME = "openai/whisper-small"
BATCH_SIZE = 8

# Use the first GPU if one is available, otherwise fall back to CPU.
device = 0 if torch.cuda.is_available() else "cpu"
# Chunked Whisper pipeline: 30-second windows keep long recordings within the model's context.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device
)

# Emotion classifier over the transcript; return_all_scores=True yields a score for every
# label (newer transformers releases deprecate it in favour of top_k=None).
emotion_classifier = pipeline("text-classification", model='MilaNLProc/xlm-emo-t', return_all_scores=True)
def extract_audio_features(audio_file):
    # Load the recording at librosa's default 22.05 kHz sample rate.
    y, sr = librosa.load(audio_file)

    # Average pitch over all time-frequency bins with non-zero magnitude;
    # fall back to 0.0 for silent clips.
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
    voiced_pitches = pitches[magnitudes > 0]
    pitch = float(np.mean(voiced_pitches)) if voiced_pitches.size > 0 else 0.0

    # Root-mean-square energy as a proxy for intensity.
    rms = float(np.mean(librosa.feature.rms(y=y)))

    # A-weighted power spectrogram as a perceptual loudness estimate (in dB).
    S = np.abs(librosa.stft(y))**2
    loudness = float(np.mean(librosa.perceptual_weighting(S, librosa.fft_frequencies(sr=sr))))

    return {
        "pitch": pitch,
        "rms": rms,
        "loudness": loudness
    }
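# Standalone usage sketch (assumes a local recording named "sample.wav"):
#   features = extract_audio_features("sample.wav")
#   print(features)  # {"pitch": ..., "rms": ..., "loudness": ...}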
def translate_and_classify(audio):
    # Transcribe the audio file with the chunked Whisper pipeline.
    text_result = pipe(audio, batch_size=BATCH_SIZE)["text"]

    # Pitch, energy, and loudness extracted directly from the waveform.
    prosodic_features = extract_audio_features(audio)

    # Score every emotion label for the transcript and collect them as {label: score}.
    emotion = emotion_classifier(text_result)
    detected_emotion = {}
    for emotion_item in emotion[0]:
        detected_emotion[emotion_item["label"]] = emotion_item["score"]

    combined_result = {
        "transcription": text_result,
        "text_based_emotion": detected_emotion,
        "prosody": prosodic_features
    }

    return combined_result["transcription"], combined_result["text_based_emotion"], combined_result["prosody"]
with gr.Blocks() as demo:
    gr.Markdown(
        """# Emotion Detection from Speech

##### Detection of anger, sadness, joy, fear in speech using OpenAI Whisper, XLM-RoBERTa, and prosodic features (pitch, loudness, intensity)
""")

    with gr.Column():
        with gr.Tab("Record Audio"):
            audio_input_r = gr.Audio(label='Record Audio Input', sources=["microphone"], type="filepath")
            transcribe_audio_r = gr.Button('Transcribe')

        with gr.Tab("Upload Audio as File"):
            audio_input_u = gr.Audio(label='Upload Audio', sources=["upload"], type="filepath")
            transcribe_audio_u = gr.Button('Transcribe')

        with gr.Row():
            transcript_output = gr.Textbox(label="Transcription", lines=3)
            emotion_output = gr.Label(label="Detected Emotion from Text")
            prosody_output = gr.Label(label="Prosodic Features (Pitch, Loudness, Intensity)")

    transcribe_audio_r.click(translate_and_classify, inputs=audio_input_r, outputs=[transcript_output, emotion_output, prosody_output])
    transcribe_audio_u.click(translate_and_classify, inputs=audio_input_u, outputs=[transcript_output, emotion_output, prosody_output])

demo.launch(share=True)