import warnings

import gradio as gr
import librosa
import numpy as np
import torch
from transformers import pipeline

# Suppress library warnings in the Space logs
warnings.filterwarnings("ignore")

MODEL_NAME = "openai/whisper-small"
BATCH_SIZE = 8
device = 0 if torch.cuda.is_available() else "cpu"
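# transformers pipelines accept either a CUDA device index (e.g. 0) or "cpu".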

# Whisper for transcription
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)
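# Note: with chunk_length_s=30 the ASR pipeline transcribes long recordings in
# 30-second windows (batched BATCH_SIZE at a time) and merges the text back
# together, so clips longer than 30 s are handled transparently.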

# Emotion classifier for text-based classification
emotion_classifier = pipeline(
    "text-classification",
    model="MilaNLProc/xlm-emo-t",
    return_all_scores=True,
)
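# With return_all_scores=True the classifier returns one list of
# {"label": ..., "score": ...} dicts per input, e.g. (illustrative scores):
#   [[{"label": "joy", "score": 0.91}, {"label": "anger", "score": 0.04}, ...]]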

# Function to extract prosodic features using librosa
def extract_audio_features(audio_file):
    y, sr = librosa.load(audio_file)

    # Pitch (fundamental frequency): average over pitch-track bins with
    # non-zero magnitude; fall back to 0.0 for silent/unvoiced clips.
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
    voiced_pitches = pitches[magnitudes > 0]
    pitch = float(np.mean(voiced_pitches)) if voiced_pitches.size > 0 else 0.0

    # Intensity (mean RMS energy)
    rms = float(np.mean(librosa.feature.rms(y=y)))

    # Loudness: mean of the perceptually weighted power spectrogram
    # (A-weighting, librosa's default), in dB
    S = np.abs(librosa.stft(y)) ** 2
    loudness = float(
        np.mean(librosa.perceptual_weighting(S, librosa.fft_frequencies(sr=sr)))
    )

    return {
        "pitch": pitch,
        "rms": rms,
        "loudness": loudness,
    }
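
# Illustrative usage (values are made up):
#   extract_audio_features("speech.wav")
#   -> {"pitch": 182.4, "rms": 0.03, "loudness": -34.7}
# pitch is in Hz, rms is linear amplitude, loudness is in dB after weighting.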

# Transcribe audio and classify emotions (dual pipeline: text + prosody)
def transcribe_and_classify(audio):
    # Step 1: Transcribe the audio to text with Whisper
    text_result = pipe(audio, batch_size=BATCH_SIZE)["text"]

    # Step 2: Extract prosodic features from the audio with librosa
    prosodic_features = extract_audio_features(audio)

    # Step 3: Run the emotion classifier on the transcribed text
    emotion_scores = emotion_classifier(text_result)
    detected_emotion = {item["label"]: item["score"] for item in emotion_scores[0]}

    return text_result, detected_emotion, prosodic_features
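
# The three returned values feed the three Gradio outputs declared below:
# transcription -> Textbox, {emotion: score} -> Label, prosody dict -> JSON.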

# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown(
        """# Emotion Detection from Speech
##### Detection of anger, sadness, joy, and fear in speech using OpenAI Whisper, XLM-RoBERTa, and prosodic features (pitch, loudness, intensity)
"""
    )
    with gr.Column():
        with gr.Tab("Record Audio"):
            audio_input_r = gr.Audio(label="Record Audio Input", sources=["microphone"], type="filepath")
            transcribe_audio_r = gr.Button("Transcribe")
        with gr.Tab("Upload Audio as File"):
            audio_input_u = gr.Audio(label="Upload Audio", sources=["upload"], type="filepath")
            transcribe_audio_u = gr.Button("Transcribe")
        with gr.Row():
            transcript_output = gr.Textbox(label="Transcription", lines=3)
            emotion_output = gr.Label(label="Detected Emotion from Text")
            prosody_output = gr.JSON(label="Prosodic Features (Pitch, Loudness, Intensity)")

    transcribe_audio_r.click(
        transcribe_and_classify,
        inputs=audio_input_r,
        outputs=[transcript_output, emotion_output, prosody_output],
    )
    transcribe_audio_u.click(
        transcribe_and_classify,
        inputs=audio_input_u,
        outputs=[transcript_output, emotion_output, prosody_output],
    )
demo.launch(share=True)