import gradio as gr
import torch
import warnings
import librosa
import numpy as np
from transformers import pipeline

warnings.filterwarnings('ignore')

MODEL_NAME = "openai/whisper-small"
BATCH_SIZE = 8

# Use the first GPU if available, otherwise run on CPU
device = 0 if torch.cuda.is_available() else "cpu"

# Whisper for transcription
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device
)

# Emotion classifier for text-based classification (returns a score for every label)
emotion_classifier = pipeline(
    "text-classification",
    model="MilaNLProc/xlm-emo-t",
    return_all_scores=True,
)
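# With return_all_scores=True the pipeline yields one list of {"label", "score"}
# dicts per input, e.g. (illustrative values only):
#   [[{"label": "joy", "score": 0.91}, {"label": "anger", "score": 0.03}, ...]]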

# Function to extract prosodic features using librosa
def extract_audio_features(audio_file):
    y, sr = librosa.load(audio_file)

    # Pitch (fundamental frequency): mean of the piptrack estimates over all
    # time-frequency bins with non-zero magnitude (i.e. voiced content)
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
    voiced_pitches = pitches[magnitudes > 0]
    pitch = float(np.mean(voiced_pitches)) if voiced_pitches.size > 0 else 0.0

    # Intensity (mean RMS energy)
    rms = float(np.mean(librosa.feature.rms(y=y)))

    # Loudness: perceptually weighted (A-weighted by default) power spectrogram, in dB
    S = np.abs(librosa.stft(y)) ** 2
    loudness = float(np.mean(librosa.perceptual_weighting(S, librosa.fft_frequencies(sr=sr))))

    return {
        "pitch": pitch,
        "rms": rms,
        "loudness": loudness,
    }
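# Quick standalone check (hypothetical file path, not part of the app flow):
#   features = extract_audio_features("example_speech.wav")
#   print(features)  # e.g. {"pitch": ..., "rms": ..., "loudness": ...}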

# Function to transcribe and classify emotions (dual-pipeline)
def translate_and_classify(audio):
    # Step 1: Transcribe audio to text using Whisper
    text_result = pipe(audio, batch_size=BATCH_SIZE)["text"]
    
    # Step 2: Extract prosodic features from the audio using librosa
    prosodic_features = extract_audio_features(audio)
    
    # Step 3: Use the emotion classifier on the transcribed text
    emotion = emotion_classifier(text_result)
    detected_emotion = {}
    for emotion_item in emotion[0]:
        detected_emotion[emotion_item["label"]] = emotion_item["score"]

    # Combine transcription, text-based emotion scores, and prosodic features
    return text_result, detected_emotion, prosodic_features

# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown(
    """# Emotion Detection from Speech
    
    ##### Detection of anger, sadness, joy, fear in speech using OpenAI Whisper, XLM-RoBERTa, and prosodic features (pitch, loudness, intensity)
    """)

    with gr.Column():
        with gr.Tab("Record Audio"):
            audio_input_r = gr.Audio(label='Record Audio Input', sources=["microphone"], type="filepath")
            transcribe_audio_r = gr.Button('Transcribe')

        with gr.Tab("Upload Audio as File"):
            audio_input_u = gr.Audio(label='Upload Audio', sources=["upload"], type="filepath")
            transcribe_audio_u = gr.Button('Transcribe')

        with gr.Row():
            transcript_output = gr.Textbox(label="Transcription", lines=3)
            emotion_output = gr.Label(label="Detected Emotion from Text")
            # gr.Label renders dict values as classification confidences, so show
            # the raw feature values as JSON instead
            prosody_output = gr.JSON(label="Prosodic Features (Pitch, RMS Intensity, Loudness)")

    transcribe_audio_r.click(translate_and_classify, inputs=audio_input_r, outputs=[transcript_output, emotion_output, prosody_output])
    transcribe_audio_u.click(translate_and_classify, inputs=audio_input_u, outputs=[transcript_output, emotion_output, prosody_output])

demo.launch(share=True)
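
# Sanity check without the UI (assumes a local file "sample.wav"; not part of the app):
#   transcription, emotions, prosody = translate_and_classify("sample.wav")
#   print(transcription, max(emotions, key=emotions.get), prosody)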