"""Gradio app for speech emotion recognition.

Extracts audio features with librosa, classifies them with a pre-trained
Keras ANN, and visualizes the prediction with matplotlib plots.
"""

import os
import pickle
import warnings
from io import BytesIO

# TensorFlow reads these at import time, so set them before importing it.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"

import numpy as np
import librosa
import librosa.display
import matplotlib
matplotlib.use('Agg')  # non-interactive backend; select it before importing pyplot
import matplotlib.pyplot as plt
import tensorflow as tf
import gradio as gr
from PIL import Image

# Silence sklearn UserWarnings (e.g. version mismatches when unpickling the encoder)
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")

# Load the trained classifier and the label encoder fitted during training
model = tf.keras.models.load_model("ann_new_emotion_recognition_model.h5", compile=False)
with open("new_label_encoder.pkl", "rb") as f:
    label_encoder = pickle.load(f)
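

# extract_features() summarizes a clip as a fixed-length vector: 20 MFCC means,
# 12 chroma bins, 7 spectral-contrast bands, plus zero-crossing rate, spectral
# centroid, roll-off and RMS energy, padded or truncated to `max_len` values
# before being fed to the model.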
def extract_features(audio, sr, max_len=40):
    # MFCCs: timbral envelope (mean over time of 20 coefficients)
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=20)
    mfccs = np.mean(mfccs.T, axis=0)

    # Chroma: energy per pitch class (12 values)
    chroma = librosa.feature.chroma_stft(y=audio, sr=sr)
    chroma = np.mean(chroma.T, axis=0)

    # Spectral contrast: peak-to-valley energy per frequency band (7 values)
    contrast = librosa.feature.spectral_contrast(y=audio, sr=sr)
    contrast = np.mean(contrast.T, axis=0)

    # Zero-crossing rate, spectral centroid, roll-off and RMS energy (1 value each)
    zcr = librosa.feature.zero_crossing_rate(y=audio)
    zcr = np.mean(zcr.T, axis=0)

    centroid = librosa.feature.spectral_centroid(y=audio, sr=sr)
    centroid = np.mean(centroid.T, axis=0)

    rolloff = librosa.feature.spectral_rolloff(y=audio, sr=sr, roll_percent=0.85)
    rolloff = np.mean(rolloff.T, axis=0)

    rms = librosa.feature.rms(y=audio)
    rms = np.mean(rms.T, axis=0)

    features = np.concatenate([mfccs, chroma, contrast, zcr, centroid, rolloff, rms])

    # Pad or truncate to the fixed length expected by the classifier
    if len(features) < max_len:
        features = np.pad(features, (0, max_len - len(features)), mode='constant')
    else:
        features = features[:max_len]
    return features
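

# Visualization helpers: each renders a matplotlib figure to an in-memory PNG
# buffer and returns it as a PIL image for display in the Gradio UI.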
def create_mel_spectrogram(audio, sr):
    """Create mel spectrogram plot"""
    plt.figure(figsize=(10, 4))
    S = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=128, fmax=8000)
    S_dB = librosa.power_to_db(S, ref=np.max)
    librosa.display.specshow(S_dB, sr=sr, x_axis='time', y_axis='mel')
    plt.colorbar(format='%+2.0f dB')
    plt.title('Mel Spectrogram')
    plt.tight_layout()

    buf = BytesIO()
    plt.savefig(buf, format='png', dpi=150, bbox_inches='tight')
    buf.seek(0)
    img = Image.open(buf)
    plt.close()
    return img


def create_polar_plot(emotion_probabilities):
    """Create polar plot of emotion probabilities"""
    emotions = list(emotion_probabilities.keys())
    probabilities = [prob * 100 for prob in emotion_probabilities.values()]

    # Evenly space the emotions around the circle and repeat the first point
    # so the radar polygon closes on itself
    angles = np.linspace(0, 2 * np.pi, len(emotions), endpoint=False).tolist()
    angles += angles[:1]
    probabilities += probabilities[:1]

    fig, ax = plt.subplots(figsize=(4, 4), subplot_kw=dict(projection='polar'))
    ax.fill(angles, probabilities, color='skyblue', alpha=0.4)
    ax.plot(angles, probabilities, color='blue', linewidth=2, marker='o')

    ax.set_yticks([20, 40, 60, 80, 100])
    ax.set_yticklabels(["20%", "40%", "60%", "80%", "100%"], color="gray", fontsize=10)
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(emotions, fontsize=12, color="darkblue")
    ax.set_ylim(0, 100)
    ax.set_title("Emotion Probabilities", va='bottom', fontsize=14, color="darkblue", pad=20)
    plt.tight_layout()

    buf = BytesIO()
    plt.savefig(buf, format='png', dpi=150, bbox_inches='tight')
    buf.seek(0)
    img = Image.open(buf)
    plt.close()
    return img


def create_waveform_plot(audio, sr):
    """Create waveform plot"""
    plt.figure(figsize=(12, 4))
    librosa.display.waveshow(audio, sr=sr)
    plt.title('Audio Waveform')
    plt.xlabel('Time (seconds)')
    plt.ylabel('Amplitude')
    plt.tight_layout()

    buf = BytesIO()
    plt.savefig(buf, format='png', dpi=150, bbox_inches='tight')
    buf.seek(0)
    img = Image.open(buf)
    plt.close()
    return img
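

# Inference pipeline: load the uploaded/recorded audio, extract the feature
# vector, run the classifier, and build the plots shown in the interface.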
def predict_emotion(audio_file):
    try:
        # Load at the file's native sampling rate
        audio_np, sr = librosa.load(audio_file, sr=None, res_type='kaiser_fast')

        features = extract_features(audio_np, sr)
        features = np.expand_dims(features, axis=0)  # add batch dimension

        predictions = model.predict(features, verbose=0)
        predicted_class = np.argmax(predictions[0])
        predicted_emotion = label_encoder.inverse_transform([predicted_class])[0]

        # Map every class index back to its emotion label
        emotion_probabilities = {
            label_encoder.inverse_transform([i])[0]: round(float(pred), 4)
            for i, pred in enumerate(predictions[0])
        }

        mel_spec_plot = create_mel_spectrogram(audio_np, sr)
        polar_plot = create_polar_plot(emotion_probabilities)
        waveform_plot = create_waveform_plot(audio_np, sr)

        # gr.Label shows a dict of 0-1 confidences as percentages
        emotion_probabilities_display = {
            emotion: round(prob, 2)
            for emotion, prob in emotion_probabilities.items()
        }

        return (
            predicted_emotion,
            emotion_probabilities_display,
            mel_spec_plot,
            polar_plot,
            waveform_plot
        )

    except Exception as e:
        error_msg = f"Error processing audio: {str(e)}"
        return error_msg, {}, None, None, None
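

# Gradio interface: prediction runs when the button is clicked and also
# whenever the audio input changes (new upload or recording).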
with gr.Blocks(title="🎤 Emotion Recognition from Audio", theme=gr.themes.Soft()) as iface:
    gr.Markdown(
        """
        # 🎤 Emotion Recognition from Audio
        Upload or record an audio file to analyze the emotional content and view detailed visualizations.
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            audio_input = gr.Audio(
                label="Upload or Record Audio",
                type="filepath",
                sources=["upload", "microphone"]
            )

            predict_btn = gr.Button("Analyze Emotion", variant="primary", size="lg")

        with gr.Column(scale=1):
            predicted_emotion = gr.Text(label="🎯 Predicted Emotion", interactive=False)
            emotion_probs = gr.Label(label="Emotion Probabilities (%)", num_top_classes=10)

    with gr.Row():
        with gr.Column():
            waveform_plot = gr.Image(label="Audio Waveform", type="pil")
        with gr.Column():
            mel_spec_plot = gr.Image(label="🎵 Mel Spectrogram", type="pil")

    with gr.Row():
        polar_plot = gr.Image(label="🎯 Emotion Probability Radar", type="pil")

    predict_btn.click(
        fn=predict_emotion,
        inputs=[audio_input],
        outputs=[predicted_emotion, emotion_probs, mel_spec_plot, polar_plot, waveform_plot]
    )

    audio_input.change(
        fn=predict_emotion,
        inputs=[audio_input],
        outputs=[predicted_emotion, emotion_probs, mel_spec_plot, polar_plot, waveform_plot]
    )

    gr.Markdown(
        """
        ### How it works:
        1. **Upload** an audio file or **record** directly using your microphone
        2. The system extracts audio features (MFCCs, Chroma, Spectral features, etc.)
        3. A trained neural network predicts the emotion
        4. View the results with detailed visualizations:
            - **Waveform**: Shows the audio signal over time
            - **Mel Spectrogram**: Visual representation of the audio's frequency content
            - **Radar Chart**: Probability distribution across all emotion categories

        ### Supported Emotions:
        Depending on your model training, this may include emotions such as Happy, Sad, Angry, Fear, Disgust, Surprise, Neutral, and others.
        """
    )


if __name__ == "__main__":
    # Listen on all interfaces (useful in containers) and serve on port 7860
    iface.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )