import gradio as gr
import torch
import warnings
import librosa
import numpy as np
from transformers import pipeline

# Silence library deprecation/user warnings so they do not clutter the demo logs
warnings.filterwarnings('ignore')
MODEL_NAME = "openai/whisper-small"
BATCH_SIZE = 8
# Use the first GPU if available, otherwise fall back to CPU
device = 0 if torch.cuda.is_available() else "cpu"

# Whisper for transcription (via the transformers ASR pipeline)
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)

# Emotion classifier for text-based classification
emotion_classifier = pipeline(
    "text-classification",
    model="MilaNLProc/xlm-emo-t",
    return_all_scores=True,
)
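# With return_all_scores=True, a single-string call such as emotion_classifier("...") returns a
# nested list: one inner list per input, each holding {"label", "score"} dicts covering every
# emotion class the model supports (anger, fear, joy, sadness for xlm-emo-t).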
# Function to extract prosodic features using librosa
def extract_audio_features(audio_file):
    y, sr = librosa.load(audio_file)

    # Pitch (fundamental frequency): average the pitch-track bins that carry non-zero magnitude
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
    voiced = magnitudes > 0
    pitch = float(np.mean(pitches[voiced])) if np.any(voiced) else 0.0

    # Intensity (RMS energy)
    rms = float(np.mean(librosa.feature.rms(y=y)))

    # Loudness (perceptually weighted power spectrogram; A-weighting by default)
    S = np.abs(librosa.stft(y)) ** 2
    loudness = float(np.mean(librosa.perceptual_weighting(S, frequencies=librosa.fft_frequencies(sr=sr))))

    return {
        "pitch": pitch,
        "rms": rms,
        "loudness": loudness,
    }
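# Example call (hypothetical file path, for local debugging only):
#   extract_audio_features("sample.wav")
#   -> {"pitch": ..., "rms": ..., "loudness": ...}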
# Function to transcribe and classify emotions (dual pipeline: transcript text + prosody)
def translate_and_classify(audio):
    # Step 1: Transcribe audio to text using Whisper
    text_result = pipe(audio, batch_size=BATCH_SIZE)["text"]

    # Step 2: Extract prosodic features from the audio using librosa
    prosodic_features = extract_audio_features(audio)

    # Step 3: Run the emotion classifier on the transcribed text
    emotion_scores = emotion_classifier(text_result)
    detected_emotion = {item["label"]: item["score"] for item in emotion_scores[0]}

    return text_result, detected_emotion, prosodic_features
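# The three return values map, in order, onto the Gradio outputs wired up below:
# transcript_output, emotion_output, prosody_output.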
# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown(
        """# Emotion Detection from Speech
        ##### Detection of anger, sadness, joy, and fear in speech using OpenAI Whisper, XLM-RoBERTa, and prosodic features (pitch, loudness, intensity)
        """
    )
    with gr.Column():
        with gr.Tab("Record Audio"):
            audio_input_r = gr.Audio(label='Record Audio Input', sources=["microphone"], type="filepath")
            transcribe_audio_r = gr.Button('Transcribe')
        with gr.Tab("Upload Audio as File"):
            audio_input_u = gr.Audio(label='Upload Audio', sources=["upload"], type="filepath")
            transcribe_audio_u = gr.Button('Transcribe')
        with gr.Row():
            transcript_output = gr.Textbox(label="Transcription", lines=3)
            emotion_output = gr.Label(label="Detected Emotion from Text")
            # Raw prosodic values are not class confidences, so display them as JSON instead of a Label
            prosody_output = gr.JSON(label="Prosodic Features (Pitch, RMS, Loudness)")

    transcribe_audio_r.click(translate_and_classify, inputs=audio_input_r, outputs=[transcript_output, emotion_output, prosody_output])
    transcribe_audio_u.click(translate_and_classify, inputs=audio_input_u, outputs=[transcript_output, emotion_output, prosody_output])

demo.launch(share=True)
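# share=True additionally exposes the app through a temporary public *.gradio.live link;
# drop the flag to serve only on the local URL.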