import base64
import io

import librosa
import numpy as np
import tensorflow as tf
from pydub import AudioSegment


class EndpointHandler:
    def __init__(self, path):
        # Emotion classes, in the order the emotion model was trained on.
        self.emotion_labels = ['Angry', 'Calm', 'Fearful', 'Happy', 'Sad']
        self.emotion_model = tf.keras.models.load_model(f"{path}/models/best_model_emotion.h5")
        self.depression_model = tf.keras.models.load_model(f"{path}/models/best_model_depression.h5")

    def __call__(self, input_data):
        # Inference payloads arrive as {"inputs": <base64-encoded audio>}.
        audio_base64 = input_data.pop("inputs", input_data)
        audio_features = self.preprocess_audio_data(audio_base64)
        emotion_prediction, depression_prediction = self.perform_emotion_analysis(audio_features)
        return {
            "emotion": emotion_prediction,
            "depression": float(depression_prediction[0])
        }

    def get_mfcc_features(self, features, padding):
        # Pad (or truncate) the MFCC time axis to exactly `padding` frames.
        padded_features = padding - features.shape[1]
        if padded_features > 0:
            features = np.pad(features, [(0, 0), (0, padded_features)], mode='constant')
        elif padded_features < 0:
            # A negative offset keeps only the last `padding` frames.
            features = features[:, padded_features:]
        return np.expand_dims(features, axis=0)  # Add batch dimension

    def preprocess_audio_data(self, base64_string, duration=2.5, desired_sr=22050 * 2, offset=0.5):
        try:
            # Decode the base64 string into an in-memory file
            audio_bytes = base64.b64decode(base64_string)
            audio_io = io.BytesIO(audio_bytes)

            # Try to load with librosa first
            try:
                y, sr = librosa.load(audio_io, sr=desired_sr, duration=duration, offset=offset)
            except Exception:
                # If librosa fails (e.g. an unsupported container), fall back to pydub
                audio_io.seek(0)  # Reset file pointer
                audio = AudioSegment.from_file(audio_io)
                audio = audio.set_channels(1)  # Convert to mono
                audio = audio.set_frame_rate(desired_sr)
                samples = np.array(audio.get_array_of_samples())
                y = samples.astype(np.float32) / 32768.0  # Scale to [-1, 1] (assumes 16-bit samples)
                sr = desired_sr

            # Peak-normalize the audio
            y = librosa.util.normalize(y)

            # Extract MFCC features and fix the time axis at 216 frames
            mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=30)
            if mfcc.shape[1] < 216:
                mfcc = np.pad(mfcc, ((0, 0), (0, 216 - mfcc.shape[1])), mode='constant')
            elif mfcc.shape[1] > 216:
                mfcc = mfcc[:, :216]
            return mfcc
        except Exception as e:
            print(f"Error in preprocess_audio_data: {str(e)}")
            raise

    def perform_emotion_analysis(self, features, emotion_padding=216, depression_padding=2584):
        # The emotion model expects input of shape (batch, n_mfcc, frames, channels)
        emotion_features = features[:, :emotion_padding]
        emotion_features = np.expand_dims(emotion_features, axis=-1)  # Add channel dimension
        emotion_features = np.expand_dims(emotion_features, axis=0)   # Add batch dimension

        # The depression model takes a longer, zero-padded MFCC sequence
        depression_features = self.get_mfcc_features(features, depression_padding)

        print("Emotion model input shape:", self.emotion_model.input_shape)
        print("Emotion features shape:", emotion_features.shape)

        emotion_prediction = self.emotion_model.predict(emotion_features)[0]
        emotion_prediction = self.emotion_labels[np.argmax(emotion_prediction)]
        depression_prediction = self.depression_model.predict(depression_features)[0]
        return emotion_prediction, depression_prediction
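

# Minimal local smoke test: a sketch, not part of the deployed handler.
# It assumes the model files live under ./models/ relative to the working
# directory, and "sample.wav" is a hypothetical audio file on disk.
if __name__ == "__main__":
    handler = EndpointHandler(path=".")
    with open("sample.wav", "rb") as f:
        payload = {"inputs": base64.b64encode(f.read()).decode("utf-8")}
    print(handler(payload))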