import gradio as gr
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
import numpy as np
from sklearn.preprocessing import LabelEncoder
class CNN1DLSTMAudioClassifier(nn.Module):
    def __init__(self, num_classes, input_channels=1, sample_rate=16000, n_fft=400, hop_length=160):
        super(CNN1DLSTMAudioClassifier, self).__init__()
        # 1D CNN layers
        self.conv1 = nn.Conv1d(input_channels, 8, kernel_size=5, stride=1, padding=2)
        self.bn1 = nn.BatchNorm1d(8)
        self.pool1 = nn.MaxPool1d(kernel_size=2)
        self.conv2 = nn.Conv1d(8, 16, kernel_size=5, stride=1, padding=2)
        self.bn2 = nn.BatchNorm1d(16)
        self.pool2 = nn.MaxPool1d(kernel_size=2)
        self.conv3 = nn.Conv1d(16, 32, kernel_size=5, stride=1, padding=2)
        self.bn3 = nn.BatchNorm1d(32)
        self.pool3 = nn.MaxPool1d(kernel_size=2)
        # Calculate the output size of the last CNN layer
        self._to_linear = None
        self._calculate_to_linear(input_channels, sample_rate, n_fft, hop_length)
        # LSTM layers (bidirectional, so each time step yields 2 * 64 = 128 features)
        self.lstm = nn.LSTM(input_size=32, hidden_size=64, num_layers=3, batch_first=True, bidirectional=True)
        # Fully connected layers
        self.fc1 = nn.Linear(128, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, num_classes)
        # Dropout
        self.dropout = nn.Dropout(0.2)

    def _calculate_to_linear(self, input_channels, sample_rate, n_fft, hop_length):
        # Calculate the size of the input to the LSTM layer
        num_frames = (sample_rate - n_fft) // hop_length + 1
        x = torch.randn(1, input_channels, num_frames)
        x = self.convs(x)
        self._to_linear = x.shape[1]

    def convs(self, x):
        x = self.pool1(self.bn1(F.relu(self.conv1(x))))
        x = self.pool2(self.bn2(F.relu(self.conv2(x))))
        x = self.pool3(self.bn3(F.relu(self.conv3(x))))
        return x

    def forward(self, x):
        x = x.view(x.size(0), 1, -1)
        x = self.convs(x)
        x = x.permute(0, 2, 1)  # (batch, time, channels) for the LSTM
        x, _ = self.lstm(x)
        x = x[:, -1, :]  # keep the last time step
        # Fully connected layers
        x = self.dropout(x)
        x = self.fc1(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.fc3(x)  # project down to the class logits
        return x
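
# Load the trained weights for CPU-only inference. In this app the model is fed a
# flattened, normalized mel spectrogram of length 12288 (produced by
# preprocess_single_audio below), which forward() reshapes to (batch, 1, 12288).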
num_class = 6
device = torch.device('cpu')
state_dict = torch.load('best_model.pth', map_location=device)
model = CNN1DLSTMAudioClassifier(num_class)
model.load_state_dict(state_dict)
model.eval()
def preprocess_single_audio(file_path, sample_rate=16000, n_mels=128, n_fft=2048, hop_length=512):
    # Load the audio file
    waveform, sr = torchaudio.load(file_path)
    # Resample if necessary
    if sr != sample_rate:
        resampler = torchaudio.transforms.Resample(sr, sample_rate)
        waveform = resampler(waveform)
    # Ensure consistent audio length (2 seconds)
    target_length = 2 * sample_rate
    if waveform.size(1) > target_length:
        waveform = waveform[:, :target_length]
    else:
        waveform = torch.nn.functional.pad(waveform, (0, target_length - waveform.size(1)))
    # Apply Mel Spectrogram transform
    mel_transform = torchaudio.transforms.MelSpectrogram(
        sample_rate=sample_rate,
        n_mels=n_mels,
        n_fft=n_fft,
        hop_length=hop_length
    )
    mel_spectrogram = mel_transform(waveform)
    # Normalize (use the mean and std from your training data)
    mean = 12.65
    std = 117.07
    normalized_mel_spectrogram = (mel_spectrogram - mean) / std
    # Flatten the mel spectrogram and pad/truncate to the fixed input size the model expects
    flattened = normalized_mel_spectrogram.flatten()
    if flattened.shape[0] < 12288:
        flattened = torch.nn.functional.pad(flattened, (0, 12288 - flattened.shape[0]))
    elif flattened.shape[0] > 12288:
        flattened = flattened[:12288]
    return flattened
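
# Optional sanity check, kept commented out so it never runs at app start-up.
# A minimal sketch assuming a local 16 kHz WAV file named 'sample.wav' (hypothetical path):
#     features = preprocess_single_audio('sample.wav')   # 1-D tensor of length 12288
#     with torch.no_grad():
#         logits = model(features.unsqueeze(0))
#     print(logits.shape)                                 # expected: torch.Size([1, 6])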
def decode_emotion_prediction(prediction_tensor, label_encoder):
    """
    Decodes the prediction tensor into an emotion label.

    Args:
        prediction_tensor (torch.Tensor): The model's output tensor of shape [1, 6]
        label_encoder (LabelEncoder): The LabelEncoder used during training

    Returns:
        str: The predicted emotion label
        float: The confidence score for the prediction
    """
    # Get the index of the highest probability
    max_index = torch.argmax(prediction_tensor, dim=1).item()
    # Get the confidence score (probability) for the prediction
    confidence = torch.softmax(prediction_tensor, dim=1)[0, max_index].item()
    # Decode the index to get the emotion label
    predicted_emotion = label_encoder.inverse_transform([max_index])[0]
    return predicted_emotion, confidence
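
# Example with hypothetical logits (classes in the same alphabetical order used in predict below):
#     le = LabelEncoder()
#     le.classes_ = np.array(['Angry', 'Disgusting', 'Fear', 'Happy', 'Neutral', 'Sad'])
#     decode_emotion_prediction(torch.tensor([[0.1, 0.2, 0.1, 2.5, 0.3, 0.1]]), le)
#     # -> ('Happy', <softmax probability of index 3>)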
def predict(wave):
    if wave is None or wave == '':
        return "No audio input provided."
    try:
        wave = preprocess_single_audio(wave)
        le = LabelEncoder()
        le.classes_ = np.array(['Angry', 'Disgusting', 'Fear', 'Happy', 'Neutral', 'Sad'])
        wave = wave.unsqueeze(0)
        with torch.no_grad():
            prediction = model(wave)
        predicted_emotion, confidence = decode_emotion_prediction(prediction, le)
        return f"Your emotion is: {predicted_emotion} with {confidence*100:.4f}% confidence level."
    except Exception as e:
        return f'Error in processing audio: {str(e)}'
# Gradio Interface
article = """
### How It Works
- The model classifies speech into one of 6 emotions (Angry, Happy, Sad, Disgusting, Fear, Neutral).
- It returns the most likely emotion along with its confidence level.
- The model combines a 1D CNN architecture with an LSTM architecture.
- Please record your voice in English.
"""
iface = gr.Interface(
    fn=predict,
    inputs=gr.Audio(sources="microphone", type="filepath"),
    outputs="text",
    live=True,
    title="Speech Emotion Recognition",
    description="Record your voice expressing an emotion and get the predicted emotion. The model only supports English. Record about 2-3 seconds of audio.",
    article=article
)
iface.launch()