import gradio as gr
import torch
import warnings
import librosa
import numpy as np
from transformers import pipeline
warnings.filterwarnings('ignore')
MODEL_NAME = "openai/whisper-small"
BATCH_SIZE = 8
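# transformers.pipeline() accepts an integer GPU index (0 = first GPU) or the string "cpu"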
device = 0 if torch.cuda.is_available() else "cpu"
# Whisper for transcription
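# chunk_length_s=30 lets the ASR pipeline split recordings longer than Whisper's
# 30-second context window into chunks and merge the partial transcripts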
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device
)
# Emotion classifier for text-based classification
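# return_all_scores=True is deprecated in newer transformers releases (top_k=None is the
# replacement), but it is kept here because the code below indexes the nested
# [[{"label": ..., "score": ...}, ...]] output it produces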
emotion_classifier = pipeline("text-classification", model='MilaNLProc/xlm-emo-t', return_all_scores=True)
# Function to extract prosodic features using librosa
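# Returned values: pitch is a mean F0 estimate in Hz, rms is the mean linear RMS amplitude,
# and loudness is the mean A-weighted power spectrogram value in dB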
def extract_audio_features(audio_file):
    y, sr = librosa.load(audio_file)

    # Pitch (fundamental frequency): average the piptrack estimates over the
    # time-frequency bins that carry non-zero magnitude (i.e. voiced energy)
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
    voiced_pitches = pitches[magnitudes > 0]
    pitch = float(np.mean(voiced_pitches)) if voiced_pitches.size > 0 else 0.0

    # Intensity: mean root-mean-square (RMS) energy of the signal
    rms = float(np.mean(librosa.feature.rms(y=y)))

    # Loudness: mean of the perceptually (A-)weighted power spectrogram, in dB
    S = np.abs(librosa.stft(y)) ** 2
    loudness = float(np.mean(librosa.perceptual_weighting(S, librosa.fft_frequencies(sr=sr))))

    return {
        "pitch": pitch,
        "rms": rms,
        "loudness": loudness
    }
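# Example usage (hypothetical file path):
#   features = extract_audio_features("clip.wav")
#   features["pitch"], features["rms"], features["loudness"]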
# Function to transcribe and classify emotions (dual-pipeline)
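# The prosodic features and the text-based emotion scores are returned side by side;
# no fusion of the acoustic and textual signals is performed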
def transcribe_and_classify(audio):
    # Step 1: Transcribe audio to text with the Whisper ASR pipeline
    text_result = pipe(audio, batch_size=BATCH_SIZE)["text"]

    # Step 2: Extract prosodic features from the raw audio with librosa
    prosodic_features = extract_audio_features(audio)

    # Step 3: Run the text-based emotion classifier on the transcription
    emotion_scores = emotion_classifier(text_result)
    detected_emotion = {item["label"]: item["score"] for item in emotion_scores[0]}

    # Return transcription, text-based emotion scores, and prosodic features
    return text_result, detected_emotion, prosodic_features
# Gradio UI
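# Two input tabs (microphone recording and file upload) share the same handler and
# write to the same three output components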
with gr.Blocks() as demo:
    gr.Markdown(
        """# Emotion Detection from Speech
##### Detection of anger, sadness, joy, and fear in speech using OpenAI Whisper, XLM-RoBERTa, and prosodic features (pitch, loudness, intensity)
"""
    )
    with gr.Column():
        with gr.Tab("Record Audio"):
            audio_input_r = gr.Audio(label='Record Audio Input', sources=["microphone"], type="filepath")
            transcribe_audio_r = gr.Button('Transcribe')
        with gr.Tab("Upload Audio as File"):
            audio_input_u = gr.Audio(label='Upload Audio', sources=["upload"], type="filepath")
            transcribe_audio_u = gr.Button('Transcribe')
        with gr.Row():
            transcript_output = gr.Textbox(label="Transcription", lines=3)
            emotion_output = gr.Label(label="Detected Emotion from Text")
            prosody_output = gr.JSON(label="Prosodic Features (Pitch, Loudness, Intensity)")

    transcribe_audio_r.click(transcribe_and_classify, inputs=audio_input_r, outputs=[transcript_output, emotion_output, prosody_output])
    transcribe_audio_u.click(transcribe_and_classify, inputs=audio_input_u, outputs=[transcript_output, emotion_output, prosody_output])
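# share=True creates a temporary public gradio.live link when running locally;
# a hosted Space is already publicly reachable without it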
demo.launch(share=True)