import base64
import io

import librosa
import numpy as np
import tensorflow as tf
from pydub import AudioSegment
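

# Inference handler exposing two audio models: a categorical emotion
# classifier and a depression-score regressor. The EndpointHandler name and
# the __init__(path) / __call__(data) interface follow the Hugging Face
# Inference Endpoints custom-handler convention (an assumption based on the
# class shape; the source does not name the target runtime).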
class EndpointHandler:
    def __init__(self, path):
        self.emotion_labels = ['Angry', 'Calm', 'Fearful', 'Happy', 'Sad']
        # Both Keras models are expected under <path>/models/.
        self.emotion_model = tf.keras.models.load_model(f"{path}/models/best_model_emotion.h5")
        self.depression_model = tf.keras.models.load_model(f"{path}/models/best_model_depression.h5")

    def __call__(self, input_data):
        # Expect {"inputs": <base64-encoded audio>}; fall back to the raw
        # payload if the key is absent.
        audio_base64 = input_data.pop("inputs", input_data)
        audio_features = self.preprocess_audio_data(audio_base64)
        emotion_prediction, depression_prediction = self.perform_emotion_analysis(audio_features)
        return {
            "emotion": emotion_prediction,
            "depression": float(depression_prediction[0]),
        }

    def get_mfcc_features(self, features, padding):
        # Zero-pad (or truncate) the time axis to exactly `padding` frames,
        # then add a leading batch dimension: (30, t) -> (1, 30, padding).
        pad_width = padding - features.shape[1]
        if pad_width > 0:
            features = np.pad(features, [(0, 0), (0, pad_width)], mode='constant')
        elif pad_width < 0:
            # Keep the first `padding` frames, matching the truncation used in
            # preprocess_audio_data. (The original slice features[:, pad_width:]
            # kept the wrong number of frames.)
            features = features[:, :padding]
        return np.expand_dims(features, axis=0)

    def preprocess_audio_data(self, base64_string, duration=2.5, desired_sr=22050 * 2, offset=0.5):
        try:
            audio_bytes = base64.b64decode(base64_string)
            audio_io = io.BytesIO(audio_bytes)

            try:
                # Preferred path: librosa decodes the in-memory buffer directly.
                y, sr = librosa.load(audio_io, sr=desired_sr, duration=duration, offset=offset)
            except Exception:
                # Fallback: let pydub/ffmpeg decode formats librosa cannot
                # read. Note this path ignores `duration` and `offset`.
                audio_io.seek(0)
                audio = AudioSegment.from_file(audio_io)
                audio = audio.set_channels(1)
                audio = audio.set_frame_rate(desired_sr)
                audio = audio.set_sample_width(2)  # force 16-bit so the 32768 scaling below is valid

                samples = np.array(audio.get_array_of_samples())
                y = samples.astype(np.float32) / 32768.0
                sr = desired_sr

            y = librosa.util.normalize(y)

            # 30 MFCCs at a fixed 216 frames: 2.5 s at 44.1 kHz with librosa's
            # default hop length of 512 yields exactly 216 frames, so padding
            # and truncation only matter for shorter or unclipped inputs.
            mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=30)
            if mfcc.shape[1] < 216:
                mfcc = np.pad(mfcc, ((0, 0), (0, 216 - mfcc.shape[1])), mode='constant')
            elif mfcc.shape[1] > 216:
                mfcc = mfcc[:, :216]

            return mfcc

        except Exception as e:
            print(f"Error in preprocess_audio_data: {str(e)}")
            raise

    def perform_emotion_analysis(self, features, emotion_padding=216, depression_padding=2584):
        # Emotion model input: (batch, n_mfcc, emotion_padding, 1).
        emotion_features = features[:, :emotion_padding]
        emotion_features = np.expand_dims(emotion_features, axis=-1)
        emotion_features = np.expand_dims(emotion_features, axis=0)

        # Depression model input: (batch, n_mfcc, depression_padding).
        depression_features = self.get_mfcc_features(features, depression_padding)

        # Shape diagnostics, useful when either model's input spec changes.
        print("Emotion model input shape:", self.emotion_model.input_shape)
        print("Emotion features shape:", emotion_features.shape)

        emotion_prediction = self.emotion_model.predict(emotion_features)[0]
        emotion_prediction = self.emotion_labels[np.argmax(emotion_prediction)]

        depression_prediction = self.depression_model.predict(depression_features)[0]
        return emotion_prediction, depression_prediction
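

# ---------------------------------------------------------------------------
# Minimal local smoke test: a sketch of how the handler is typically invoked,
# not part of the deployed interface. "sample.wav" and path="." are
# hypothetical stand-ins; adjust them so <path>/models/ holds both .h5 files.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    with open("sample.wav", "rb") as f:
        payload = base64.b64encode(f.read()).decode("utf-8")

    handler = EndpointHandler(path=".")
    result = handler({"inputs": payload})
    print(result)  # {"emotion": <label>, "depression": <float>}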