Spaces:

raffaelsiregar
/

speech-emotion-recognition

Sleeping

File size: 5,452 Bytes

import gradio as gr
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
import numpy as np
from sklearn.preprocessing import LabelEncoder

class CNN1DLSTMAudioClassifier(nn.Module):
    """1D-CNN feature extractor followed by a bidirectional LSTM and a dense head.

    The network treats its input as a 1D signal with ``input_channels``
    channels, runs it through three Conv1d -> ReLU -> BatchNorm -> MaxPool
    stages, feeds the resulting sequence to a 3-layer bidirectional LSTM and
    projects the last time step through the fully connected layers.

    Args:
        num_classes: Output size of ``fc3``.
        input_channels: Number of channels of the 1D input signal.
        sample_rate, n_fft, hop_length: Only used to size the dummy input
            for the shape probe in ``_calculate_to_linear``.
    """

    def __init__(self, num_classes, input_channels=1, sample_rate=16000, n_fft=400, hop_length=160):
        super().__init__()

        # 1D CNN layers: channels grow 8 -> 16 -> 32, each pool halves the length.
        self.conv1 = nn.Conv1d(input_channels, 8, kernel_size=5, stride=1, padding=2)
        self.bn1 = nn.BatchNorm1d(8)
        self.pool1 = nn.MaxPool1d(kernel_size=2)
        self.conv2 = nn.Conv1d(8, 16, kernel_size=5, stride=1, padding=2)
        self.bn2 = nn.BatchNorm1d(16)
        self.pool2 = nn.MaxPool1d(kernel_size=2)
        self.conv3 = nn.Conv1d(16, 32, kernel_size=5, stride=1, padding=2)
        self.bn3 = nn.BatchNorm1d(32)
        self.pool3 = nn.MaxPool1d(kernel_size=2)

        # Record the channel count produced by the last CNN stage.
        self._to_linear = None
        self._calculate_to_linear(input_channels, sample_rate, n_fft, hop_length)

        # LSTM layers (input_size matches the 32 channels produced by conv3).
        self.lstm = nn.LSTM(input_size=32, hidden_size=64, num_layers=3, batch_first=True, bidirectional=True)

        # Fully connected layers (128 = 2 directions * 64 hidden units).
        self.fc1 = nn.Linear(128, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, num_classes)

        # Dropout
        self.dropout = nn.Dropout(0.2)

    def _calculate_to_linear(self, input_channels, sample_rate, n_fft, hop_length):
        """Probe the CNN stack with a dummy input and store its output channel count.

        The result is written to ``self._to_linear``.
        """
        num_frames = (sample_rate - n_fft) // hop_length + 1
        # The values are irrelevant for a shape probe; randn is kept so the
        # global RNG stream (and thus seeded initialisation of the layers
        # created afterwards) is the same as before.
        probe = torch.randn(1, input_channels, num_frames)

        # Run in eval mode without gradients so the dummy batch does not
        # leak into the BatchNorm running statistics.
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                features = self.convs(probe)
        finally:
            self.train(was_training)

        # Use the CNN *output*; the previous code discarded it and read the
        # channel count of the input instead.
        self._to_linear = features.shape[1]

    def convs(self, x):
        """Apply the three Conv1d -> ReLU -> BatchNorm -> MaxPool stages."""
        x = self.pool1(self.bn1(F.relu(self.conv1(x))))
        x = self.pool2(self.bn2(F.relu(self.conv2(x))))
        x = self.pool3(self.bn3(F.relu(self.conv3(x))))
        return x

    def forward(self, x):
        """Run the classifier on a batch.

        Args:
            x: Tensor whose first dimension is the batch; the remaining
                elements are reshaped to ``(batch, input_channels, length)``.

        Returns:
            Tensor of shape ``(batch, 32)``: the output of ``fc2``.
        """
        # Honour the channel count the first convolution was built with
        # instead of hard-coding a single channel.
        x = x.reshape(x.size(0), self.conv1.in_channels, -1)
        x = self.convs(x)

        # (batch, channels, time) -> (batch, time, channels) for batch_first LSTM.
        x = x.permute(0, 2, 1)
        x, _ = self.lstm(x)
        x = x[:, -1, :]  # keep only the last time step

        # Fully connected layers
        x = self.dropout(x)
        x = self.fc1(x)
        x = self.dropout(x)
        x = self.fc2(x)

        # NOTE(review): fc3 is defined but never applied, so the output has 32
        # features rather than num_classes. Left unchanged because existing
        # checkpoints were presumably trained against this forward pass --
        # confirm before wiring fc3 in.
        return x
    
# Number of emotion categories the classifier is built for.
num_class = 6
model = CNN1DLSTMAudioClassifier(num_class)

# map_location="cpu" lets a checkpoint that was saved from a CUDA device load
# on a CPU-only host; the model is moved to its final device separately.
# NOTE(security): weights_only=False unpickles arbitrary Python objects. Only
# load checkpoints from a trusted source, or switch to weights_only=True once
# the file is confirmed to contain nothing but tensors.
model.load_state_dict(
    torch.load(
        "best-model-emotion-recognition.bin",
        map_location="cpu",
        weights_only=False,
    )
)
# Inference mode: disables dropout and makes BatchNorm use running statistics.
model.eval()

def preprocess_single_audio(file_path, sample_rate=16000, n_mels=128, n_fft=2048, hop_length=512,
                            *, clip_seconds=2, mean=12.65, std=117.07, target_size=12288):
    """Load an audio file and turn it into a fixed-size, normalised mel feature vector.

    Steps: load -> downmix to one channel -> resample -> crop/zero-pad to
    ``clip_seconds`` -> mel spectrogram -> normalise -> flatten -> crop/zero-pad
    to ``target_size`` elements.

    Args:
        file_path: Path of the audio file to load.
        sample_rate: Sample rate the audio is resampled to.
        n_mels: Number of mel bands.
        n_fft: FFT size of the spectrogram.
        hop_length: Hop length of the spectrogram.
        clip_seconds: Duration every clip is cropped or zero-padded to.
        mean: Value subtracted from the mel spectrogram (training-set statistic).
        std: Value the centred mel spectrogram is divided by (training-set statistic).
        target_size: Length of the returned 1D tensor.

    Returns:
        A 1D ``torch.Tensor`` with exactly ``target_size`` elements.
    """
    # Load the audio file
    waveform, sr = torchaudio.load(file_path)

    # Downmix multi-channel recordings to a single channel. Without this a
    # stereo file yields one spectrogram per channel, and flattening +
    # truncating them produces a layout unlike the single-channel case.
    if waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Resample if necessary
    if sr != sample_rate:
        resampler = torchaudio.transforms.Resample(sr, sample_rate)
        waveform = resampler(waveform)

    # Ensure consistent audio length
    target_length = int(clip_seconds * sample_rate)
    if waveform.size(1) > target_length:
        waveform = waveform[:, :target_length]
    else:
        waveform = torch.nn.functional.pad(waveform, (0, target_length - waveform.size(1)))

    # Apply Mel Spectrogram transform
    mel_transform = torchaudio.transforms.MelSpectrogram(
        sample_rate=sample_rate,
        n_mels=n_mels,
        n_fft=n_fft,
        hop_length=hop_length,
    )
    mel_spectrogram = mel_transform(waveform)

    # Normalize with the statistics supplied by the caller
    normalized_mel_spectrogram = (mel_spectrogram - mean) / std

    # Flatten, then crop or zero-pad to the fixed model input size
    flattened = normalized_mel_spectrogram.flatten()
    if flattened.shape[0] < target_size:
        flattened = torch.nn.functional.pad(flattened, (0, target_size - flattened.shape[0]))
    elif flattened.shape[0] > target_size:
        flattened = flattened[:target_size]

    return flattened

def decode_emotion_prediction(prediction_tensor, label_encoder):
    """
    Decodes the prediction tensor into an emotion label.

    Only the first ``len(label_encoder.classes_)`` logits are considered, so a
    model that emits more outputs than there are labels can never select an
    index that ``inverse_transform`` does not know.

    Args:
    prediction_tensor (torch.Tensor): The model's output logits of shape
        [1, N] with N >= number of labels
    label_encoder (LabelEncoder): The LabelEncoder used during training; must
        expose ``classes_`` and ``inverse_transform``

    Returns:
    str: The predicted emotion label
    float: The confidence score (softmax probability) for the prediction
    """
    # Restrict decoding to the logits that correspond to a known label.
    num_labels = len(label_encoder.classes_)
    logits = prediction_tensor[:, :num_labels]

    # Get the index of the highest logit among the known labels
    max_index = torch.argmax(logits, dim=1).item()

    # Get the confidence score (probability) for the prediction
    confidence = torch.softmax(logits, dim=1)[0, max_index].item()

    # Decode the index to get the emotion label
    predicted_emotion = label_encoder.inverse_transform([max_index])[0]

    return predicted_emotion, confidence

# Run on the GPU when one is visible to PyTorch, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)

def predict(wave):
    """Classify the emotion in a recorded audio file.

    Args:
        wave: Path of the recorded audio file, or ``None`` when the interface
            has no recording (for example after the input is cleared).

    Returns:
        str: A message with the predicted emotion and its confidence, or a
        short notice when no audio was supplied.
    """
    # A live interface can invoke the callback without a recording.
    if wave is None:
        return "No audio received."

    features = preprocess_single_audio(wave)

    # Rebuild the label mapping; class order defines the index -> label mapping.
    le = LabelEncoder()
    le.classes_ = np.array(['Angry', 'Disgusting', 'Fear', 'Happy', 'Neutral', 'Sad'])

    # Add the batch dimension and move the input to wherever the model lives;
    # a CPU tensor fed to a CUDA model raises a device-mismatch error.
    model_device = next(model.parameters()).device
    batch = features.unsqueeze(0).to(model_device)

    with torch.no_grad():
        prediction = model(batch)

    predicted_emotion, confidence = decode_emotion_prediction(prediction, le)
    return f"Predicted emotion: {predicted_emotion} (Confidence: {confidence:.2f})"

# Gradio Interface: microphone recording in (passed to `predict` as a file
# path), plain-text prediction out. `live=True` re-runs the callback whenever
# the input changes instead of waiting for a submit button.
iface = gr.Interface(
    title="Speech Emotion Recognition",
    description="Record your voice and get the predicted emotion.",
    fn=predict,
    inputs=gr.Audio(type="filepath", sources="microphone"),
    outputs="text",
    live=True,
)

# Start the web server for the interface.
iface.launch()