# Emotion-detection / handler.py
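"""Inference Endpoints handler for audio-based emotion and depression analysis.

Decodes a base64-encoded audio clip, extracts MFCC features, and runs two
Keras models: a five-class emotion classifier and a depression-score model.
"""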
import base64
import io

import librosa
import numpy as np
import tensorflow as tf
from pydub import AudioSegment
class EndpointHandler:
    def __init__(self, path):
        self.emotion_labels = ['Angry', 'Calm', 'Fearful', 'Happy', 'Sad']
        self.emotion_model = tf.keras.models.load_model(f"{path}/models/best_model_emotion.h5")
        self.depression_model = tf.keras.models.load_model(f"{path}/models/best_model_depression.h5")

    def __call__(self, input_data):
        # Accept either {"inputs": "<base64 audio>"} or a raw base64 string.
        audio_base64 = input_data.pop("inputs", input_data)
        audio_features = self.preprocess_audio_data(audio_base64)
        emotion_prediction, depression_prediction = self.perform_emotion_analysis(audio_features)
        return {
            "emotion": emotion_prediction,
            "depression": float(depression_prediction[0]),
        }
    def get_mfcc_features(self, features, padding):
        # Pad (or truncate) the time axis so the MFCC matrix has exactly
        # `padding` frames, then add a batch dimension.
        pad_width = padding - features.shape[1]
        if pad_width > 0:
            features = np.pad(features, [(0, 0), (0, pad_width)], mode='constant')
        elif pad_width < 0:
            # Negative pad width: keep only the last `padding` frames.
            features = features[:, pad_width:]
        return np.expand_dims(features, axis=0)
    def preprocess_audio_data(self, base64_string, duration=2.5, desired_sr=22050 * 2, offset=0.5):
        try:
            # Decode the base64 string into an in-memory file object.
            audio_bytes = base64.b64decode(base64_string)
            audio_io = io.BytesIO(audio_bytes)

            # Try to load with librosa first.
            try:
                y, sr = librosa.load(audio_io, sr=desired_sr, duration=duration, offset=offset)
            except Exception:
                # If librosa fails, fall back to pydub. Note that this path
                # decodes the whole clip; `duration` and `offset` are not applied.
                audio_io.seek(0)  # Reset file pointer
                audio = AudioSegment.from_file(audio_io)
                audio = audio.set_channels(1)  # Convert to mono
                audio = audio.set_frame_rate(desired_sr)
                samples = np.array(audio.get_array_of_samples())
                y = samples.astype(np.float32) / 32768.0  # Scale to [-1, 1], assuming 16-bit PCM
                sr = desired_sr

            # Peak-normalize the waveform.
            y = librosa.util.normalize(y)

            # Extract 30 MFCCs and fix the time axis at 216 frames.
            mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=30)
            if mfcc.shape[1] < 216:
                mfcc = np.pad(mfcc, ((0, 0), (0, 216 - mfcc.shape[1])), mode='constant')
            elif mfcc.shape[1] > 216:
                mfcc = mfcc[:, :216]
            return mfcc
        except Exception as e:
            print(f"Error in preprocess_audio_data: {str(e)}")
            raise
    def perform_emotion_analysis(self, features, emotion_padding=216, depression_padding=2584):
        # Emotion model expects (batch, n_mfcc, frames, channels).
        emotion_features = features[:, :emotion_padding]
        emotion_features = np.expand_dims(emotion_features, axis=-1)  # Add channel dimension
        emotion_features = np.expand_dims(emotion_features, axis=0)  # Add batch dimension

        # Depression input is padded out to 2584 frames; since preprocessing
        # already fixes the MFCCs at 216 frames, the remainder is zero-padding.
        depression_features = self.get_mfcc_features(features, depression_padding)

        # Debug output for diagnosing shape mismatches.
        print("Emotion model input shape:", self.emotion_model.input_shape)
        print("Emotion features shape:", emotion_features.shape)

        emotion_prediction = self.emotion_model.predict(emotion_features)[0]
        emotion_prediction = self.emotion_labels[np.argmax(emotion_prediction)]
        depression_prediction = self.depression_model.predict(depression_features)[0]
        return emotion_prediction, depression_prediction
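
# --- Usage sketch (not part of the deployed handler) ---
# A minimal local smoke test, assuming a hypothetical `sample.wav` next to this
# file and the model weights under `./models/`. Inference Endpoints normally
# instantiate EndpointHandler and send {"inputs": "<base64 audio>"} themselves;
# this block just mirrors that contract for manual testing.
if __name__ == "__main__":
    with open("sample.wav", "rb") as f:  # hypothetical input file
        encoded = base64.b64encode(f.read()).decode("utf-8")
    handler = EndpointHandler(path=".")
    result = handler({"inputs": encoded})
    print(result)  # {"emotion": <label>, "depression": <float score>}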