import gradio as gr
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
import numpy as np
from sklearn.preprocessing import LabelEncoder
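
# CNN + BiLSTM classifier: three 1D convolution blocks extract local features from the
# flattened mel-spectrogram input, a bidirectional LSTM models the resulting feature
# sequence, and fully connected layers map to the emotion classes.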
class CNN1DLSTMAudioClassifier(nn.Module):
    def __init__(self, num_classes, input_channels=1, sample_rate=16000, n_fft=400, hop_length=160):
        super(CNN1DLSTMAudioClassifier, self).__init__()
        # 1D CNN layers
        self.conv1 = nn.Conv1d(input_channels, 8, kernel_size=5, stride=1, padding=2)
        self.bn1 = nn.BatchNorm1d(8)
        self.pool1 = nn.MaxPool1d(kernel_size=2)
        self.conv2 = nn.Conv1d(8, 16, kernel_size=5, stride=1, padding=2)
        self.bn2 = nn.BatchNorm1d(16)
        self.pool2 = nn.MaxPool1d(kernel_size=2)
        self.conv3 = nn.Conv1d(16, 32, kernel_size=5, stride=1, padding=2)
        self.bn3 = nn.BatchNorm1d(32)
        self.pool3 = nn.MaxPool1d(kernel_size=2)

        # Calculate the output size of the last CNN layer
        self._to_linear = None
        self._calculate_to_linear(input_channels, sample_rate, n_fft, hop_length)

        # LSTM layers (bidirectional, so the output feature size is 2 * hidden_size = 128)
        self.lstm = nn.LSTM(input_size=32, hidden_size=64, num_layers=3, batch_first=True, bidirectional=True)

        # Fully connected layers
        self.fc1 = nn.Linear(128, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, num_classes)

        # Dropout
        self.dropout = nn.Dropout(0.2)

    def _calculate_to_linear(self, input_channels, sample_rate, n_fft, hop_length):
        # Run a dummy input through the conv stack to determine the feature size fed to the LSTM
        num_frames = (sample_rate - n_fft) // hop_length + 1
        x = torch.randn(1, input_channels, num_frames)
        x = self.convs(x)
        self._to_linear = x.shape[1]

    def convs(self, x):
        x = self.pool1(self.bn1(F.relu(self.conv1(x))))
        x = self.pool2(self.bn2(F.relu(self.conv2(x))))
        x = self.pool3(self.bn3(F.relu(self.conv3(x))))
        return x

    def forward(self, x):
        x = x.view(x.size(0), 1, -1)
        x = self.convs(x)
        x = x.permute(0, 2, 1)
        x, _ = self.lstm(x)
        x = x[:, -1, :]  # take the LSTM output at the last time step
        # Fully connected layers
        x = self.dropout(x)
        x = self.fc1(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.fc3(x)  # project to num_classes logits
        return x

# Instantiate the model and load the trained weights (map to CPU; moved to the target device below)
num_class = 6
model = CNN1DLSTMAudioClassifier(num_class)
model.load_state_dict(torch.load("best-model-emotion-recognition.bin", map_location="cpu", weights_only=False))
model.eval()
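
# Preprocessing: load the recording, resample to 16 kHz, pad/trim to 2 seconds, compute a
# mel spectrogram, normalize with the training-set statistics, and flatten/pad to the
# fixed 12288-value input the model expects.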
def preprocess_single_audio(file_path, sample_rate=16000, n_mels=128, n_fft=2048, hop_length=512):
    # Load the audio file
    waveform, sr = torchaudio.load(file_path)

    # Resample if necessary
    if sr != sample_rate:
        resampler = torchaudio.transforms.Resample(sr, sample_rate)
        waveform = resampler(waveform)

    # Ensure consistent audio length (2 seconds)
    target_length = 2 * sample_rate
    if waveform.size(1) > target_length:
        waveform = waveform[:, :target_length]
    else:
        waveform = torch.nn.functional.pad(waveform, (0, target_length - waveform.size(1)))

    # Apply Mel spectrogram transform
    mel_transform = torchaudio.transforms.MelSpectrogram(
        sample_rate=sample_rate,
        n_mels=n_mels,
        n_fft=n_fft,
        hop_length=hop_length
    )
    mel_spectrogram = mel_transform(waveform)

    # Normalize (use the mean and std from your training data)
    mean = 12.65
    std = 117.07
    normalized_mel_spectrogram = (mel_spectrogram - mean) / std

    # Flatten the mel spectrogram and pad/truncate to the fixed input size
    flattened = normalized_mel_spectrogram.flatten()
    if flattened.shape[0] < 12288:
        flattened = torch.nn.functional.pad(flattened, (0, 12288 - flattened.shape[0]))
    elif flattened.shape[0] > 12288:
        flattened = flattened[:12288]
    return flattened

def decode_emotion_prediction(prediction_tensor, label_encoder):
    """
    Decodes the prediction tensor into an emotion label.

    Args:
        prediction_tensor (torch.Tensor): The model's output tensor of shape [1, 6]
        label_encoder (LabelEncoder): The LabelEncoder used during training

    Returns:
        str: The predicted emotion label
        float: The confidence score for the prediction
    """
    # Get the index of the highest probability
    max_index = torch.argmax(prediction_tensor, dim=1).item()
    # Get the confidence score (probability) for the prediction
    confidence = torch.softmax(prediction_tensor, dim=1)[0, max_index].item()
    # Decode the index to get the emotion label
    predicted_emotion = label_encoder.inverse_transform([max_index])[0]
    return predicted_emotion, confidence

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
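
# Inference: preprocess the recording, run it through the model, and decode the
# highest-scoring class back to an emotion label with its softmax confidence.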
def predict(wave):
    # gr.Audio with live=True can fire before a recording exists; skip empty input
    if wave is None:
        return ""
    wave = preprocess_single_audio(wave)
    le = LabelEncoder()
    le.classes_ = np.array(['Angry', 'Disgusting', 'Fear', 'Happy', 'Neutral', 'Sad'])
    wave = wave.unsqueeze(0).to(device)  # add a batch dimension and move to the model's device
    with torch.no_grad():
        prediction = model(wave)
    predicted_emotion, confidence = decode_emotion_prediction(prediction, le)
    return f"Predicted emotion: {predicted_emotion} (Confidence: {confidence:.2f})"

# Gradio Interface
iface = gr.Interface(
    fn=predict,
    inputs=gr.Audio(sources="microphone", type="filepath"),
    outputs="text",
    live=True,
    title="Speech Emotion Recognition",
    description="Record your voice and get the predicted emotion."
)

iface.launch()