File size: 3,544 Bytes
a9c0da2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cdc083e
a9c0da2
 
 
 
 
 
 
 
 
 
 
e04d5f6
 
 
 
cdc083e
e04d5f6
 
 
 
 
 
 
 
 
 
 
 
 
a9c0da2
e04d5f6
 
a9c0da2
e04d5f6
 
6b6c787
 
 
 
a9c0da2
e04d5f6
 
 
 
 
a9c0da2
 
6b6c787
 
 
 
a9c0da2
6b6c787
 
 
 
a9c0da2
 
6b6c787
a9c0da2
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import io
import numpy as np
import pydub
import scipy
from scipy.io import wavfile
from pydub import AudioSegment
import base64
import librosa
import tensorflow as tf

class EndpointHandler:
    """Callable handler that scores base64-encoded audio for emotion and depression.

    The handler decodes the audio, extracts MFCC features, and feeds them to
    two Keras models loaded at construction time.
    """

    # MFCC geometry used when extracting features in preprocess_audio_data.
    _N_MFCC = 30
    _MFCC_FRAMES = 216

    def __init__(self, path):
        """Load both models from ``{path}/models``.

        Args:
            path: Directory that contains the ``models`` sub-directory.
        """
        # The predicted label is picked by argmax index into this list, so the
        # order has to line up with the emotion model's output units.
        self.emotion_labels = ['Angry', 'Calm', 'Fearful', 'Happy', 'Sad']
        self.emotion_model = tf.keras.models.load_model(f"{path}/models/best_model_emotion.h5")
        self.depression_model = tf.keras.models.load_model(f"{path}/models/best_model_depression.h5")

    def __call__(self, input_data):
        """Run the full pipeline on one request.

        Args:
            input_data: Mapping whose ``"inputs"`` entry holds the
                base64-encoded audio. The key is popped from the mapping; if
                it is absent, ``input_data`` itself is treated as the payload.

        Returns:
            dict with ``"emotion"`` (one of ``self.emotion_labels``) and
            ``"depression"`` (first output of the depression model as a float).
        """
        audio_base64 = input_data.pop("inputs", input_data)
        audio_features = self.preprocess_audio_data(audio_base64)
        emotion_prediction, depression_prediction = self.perform_emotion_analysis(audio_features)
        return {
            "emotion": emotion_prediction,
            "depression": float(depression_prediction[0])
        }

    @staticmethod
    def _fit_width(features, width):
        """Zero-pad on the right or truncate ``features`` to ``width`` columns."""
        missing = width - features.shape[1]
        if missing > 0:
            return np.pad(features, [(0, 0), (0, missing)], mode='constant')
        # Keep the leading columns so the result is exactly ``width`` wide.
        return features[:, :width]

    def get_mfcc_features(self, features, padding):
        """Return ``features`` resized to ``padding`` columns with a batch axis.

        Args:
            features: 2-D array; axis 1 is the one being resized.
            padding: Target number of columns.

        Returns:
            Array of shape ``(1, features.shape[0], padding)``. Narrower
            inputs are zero-padded on the right; wider inputs keep their
            first ``padding`` columns.
        """
        return np.expand_dims(self._fit_width(features, padding), axis=0)

    def preprocess_audio_data(self, base64_string, duration=2.5, desired_sr=22050*2, offset=0.5):
        """Decode base64 audio and compute a fixed-size MFCC matrix.

        Args:
            base64_string: Base64-encoded audio file contents.
            duration: Seconds of audio to analyse (``None`` reads to the end).
            desired_sr: Sample rate the audio is resampled to.
            offset: Seconds to skip from the start of the audio.

        Returns:
            MFCC array with 30 coefficient rows and exactly 216 frame columns.

        Raises:
            Exception: Any decoding or feature-extraction error is printed
                and re-raised unchanged.
        """
        try:
            # Decode the base64 string
            audio_bytes = base64.b64decode(base64_string)
            audio_io = io.BytesIO(audio_bytes)

            # Try to load with librosa first
            try:
                y, sr = librosa.load(audio_io, sr=desired_sr, duration=duration, offset=offset)
            except Exception:
                # If librosa fails, try using pydub
                audio_io.seek(0)  # Reset file pointer
                audio = AudioSegment.from_file(audio_io)
                audio = audio.set_channels(1)  # Convert to mono
                audio = audio.set_frame_rate(desired_sr)

                # Apply the same offset/duration window as the librosa path so
                # both loaders analyse the same part of the clip (pydub slices
                # are expressed in milliseconds).
                start_ms = int(offset * 1000)
                end_ms = None if duration is None else start_ms + int(duration * 1000)
                audio = audio[start_ms:end_ms]

                samples = np.array(audio.get_array_of_samples())
                # Scale by the real bit depth rather than assuming 16-bit PCM.
                full_scale = float(1 << (8 * audio.sample_width - 1))
                y = samples.astype(np.float32) / full_scale
                sr = desired_sr

            # Normalize the audio
            y = librosa.util.normalize(y)

            # Extract MFCC features and force a fixed number of frames
            mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=self._N_MFCC)
            return self._fit_width(mfcc, self._MFCC_FRAMES)

        except Exception as e:
            print(f"Error in preprocess_audio_data: {str(e)}")
            raise

    def perform_emotion_analysis(self, features, emotion_padding=216, depression_padding=2584):
        """Run both models on one MFCC matrix.

        Args:
            features: 2-D MFCC array as returned by ``preprocess_audio_data``.
            emotion_padding: Maximum number of columns passed to the emotion model.
            depression_padding: Column count the depression input is padded or
                truncated to.

        Returns:
            Tuple ``(emotion_label, depression_output)`` where ``emotion_label``
            is the entry of ``self.emotion_labels`` with the highest score and
            ``depression_output`` is the depression model's output for the
            single batch item.
        """
        emotion_features = features[:, :emotion_padding]
        emotion_features = np.expand_dims(emotion_features, axis=-1)  # Add channel dimension
        emotion_features = np.expand_dims(emotion_features, axis=0)  # Add batch dimension

        depression_features = self.get_mfcc_features(features, depression_padding)

        print("Emotion model input shape:", self.emotion_model.input_shape)
        print("Emotion features shape:", emotion_features.shape)

        emotion_scores = self.emotion_model.predict(emotion_features)[0]
        emotion_prediction = self.emotion_labels[np.argmax(emotion_scores)]

        depression_prediction = self.depression_model.predict(depression_features)[0]
        return emotion_prediction, depression_prediction