import gradio as gr
import torch
import warnings
import librosa
import numpy as np
from transformers import pipeline
warnings.filterwarnings('ignore')
MODEL_NAME = "openai/whisper-small"
BATCH_SIZE = 8
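# transformers.pipeline() accepts an integer GPU index (0 = first GPU) or the string "cpu"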
device = 0 if torch.cuda.is_available() else "cpu"
# Whisper for transcription
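# chunk_length_s=30 lets the ASR pipeline split recordings longer than Whisper's
# 30-second context window into chunks and merge the partial transcripts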
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device
)
# Emotion classifier for text-based classification
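# return_all_scores=True is deprecated in newer transformers releases (top_k=None is the
# replacement), but it is kept here because the code below indexes the nested
# [[{"label": ..., "score": ...}, ...]] output it produces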
emotion_classifier = pipeline("text-classification", model='MilaNLProc/xlm-emo-t', return_all_scores=True)
# Function to extract prosodic features using librosa
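# Returned values: pitch is a mean F0 estimate in Hz, rms is the mean linear RMS amplitude,
# and loudness is the mean A-weighted power spectrogram value in dB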
def extract_audio_features(audio_file):
    y, sr = librosa.load(audio_file)

    # Pitch (fundamental frequency): average the piptrack estimates over the
    # time-frequency bins that carry non-zero magnitude (i.e. voiced energy)
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
    voiced_pitches = pitches[magnitudes > 0]
    pitch = float(np.mean(voiced_pitches)) if voiced_pitches.size > 0 else 0.0

    # Intensity: mean root-mean-square (RMS) energy of the signal
    rms = float(np.mean(librosa.feature.rms(y=y)))

    # Loudness: mean of the perceptually (A-)weighted power spectrogram, in dB
    S = np.abs(librosa.stft(y)) ** 2
    loudness = float(np.mean(librosa.perceptual_weighting(S, librosa.fft_frequencies(sr=sr))))

    return {
        "pitch": pitch,
        "rms": rms,
        "loudness": loudness
    }
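# Example usage (hypothetical file path):
#   features = extract_audio_features("clip.wav")
#   features["pitch"], features["rms"], features["loudness"]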
# Function to transcribe and classify emotions (dual-pipeline)
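# The prosodic features and the text-based emotion scores are returned side by side;
# no fusion of the acoustic and textual signals is performed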
def transcribe_and_classify(audio):
    # Step 1: Transcribe audio to text with the Whisper ASR pipeline
    text_result = pipe(audio, batch_size=BATCH_SIZE)["text"]

    # Step 2: Extract prosodic features from the raw audio with librosa
    prosodic_features = extract_audio_features(audio)

    # Step 3: Run the text-based emotion classifier on the transcription
    emotion_scores = emotion_classifier(text_result)
    detected_emotion = {item["label"]: item["score"] for item in emotion_scores[0]}

    # Return transcription, text-based emotion scores, and prosodic features
    return text_result, detected_emotion, prosodic_features
# Gradio UI
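# Two input tabs (microphone recording and file upload) share the same handler and
# write to the same three output components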
with gr.Blocks() as demo:
    gr.Markdown(
        """# Emotion Detection from Speech
##### Detection of anger, sadness, joy, and fear in speech using OpenAI Whisper, XLM-RoBERTa, and prosodic features (pitch, loudness, intensity)
"""
    )
    with gr.Column():
        with gr.Tab("Record Audio"):
            audio_input_r = gr.Audio(label='Record Audio Input', sources=["microphone"], type="filepath")
            transcribe_audio_r = gr.Button('Transcribe')
        with gr.Tab("Upload Audio as File"):
            audio_input_u = gr.Audio(label='Upload Audio', sources=["upload"], type="filepath")
            transcribe_audio_u = gr.Button('Transcribe')
        with gr.Row():
            transcript_output = gr.Textbox(label="Transcription", lines=3)
            emotion_output = gr.Label(label="Detected Emotion from Text")
            prosody_output = gr.JSON(label="Prosodic Features (Pitch, Loudness, Intensity)")

    transcribe_audio_r.click(transcribe_and_classify, inputs=audio_input_r, outputs=[transcript_output, emotion_output, prosody_output])
    transcribe_audio_u.click(transcribe_and_classify, inputs=audio_input_u, outputs=[transcript_output, emotion_output, prosody_output])
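# share=True creates a temporary public gradio.live link when running locally;
# a hosted Space is already publicly reachable without it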
demo.launch(share=True)