Vishal-Padia committed on
Commit c8fe9e1 · verified · 1 parent: 987fbd4

Upload speech emotion recognition model

Files changed (1)
  1. main.py +519 -0
main.py ADDED
@@ -0,0 +1,519 @@
+ import os
+ import torch
+ import wandb
+ import librosa
+ import torchaudio
+
+ import numpy as np
+ import pandas as pd
+ import seaborn as sns
+ import torch.nn as nn
+ import torch.optim as optim
+ import matplotlib.pyplot as plt
+ import torch.nn.functional as F
+
+ from sklearn.utils import class_weight
+ from torch.utils.data import Dataset, DataLoader
+ from torch.optim.lr_scheduler import ReduceLROnPlateau
+ from sklearn.preprocessing import LabelEncoder, StandardScaler
+ from sklearn.metrics import classification_report, confusion_matrix
+ from sklearn.model_selection import train_test_split, StratifiedKFold
+
+
+ # Advanced configuration with more options
+ class Config:
+     """Enhanced configuration for the emotion recognition project"""
+
+     # Data paths
+     DATA_DIR = "archive"
+
+     # Audio processing parameters
+     SAMPLE_RATE = 22050  # standard sample rate (Hz)
+     DURATION = 3  # seconds
+     N_MFCC = 20
+
+     # Model hyperparameters
+     BATCH_SIZE = 32
+     LEARNING_RATE = 0.001
+     NUM_EPOCHS = 20
+
+     # Feature extraction parameters
+     FEATURES = [
+         "mfcc",
+         "spectral_centroid",
+         "chroma",
+         "spectral_contrast",
+         "zero_crossing_rate",
+         "spectral_rolloff",
+     ]
+
+     # Augmentation parameters
+     AUGMENTATION = True
+     NOISE_FACTOR = 0.005
+     SCALE_RANGE = (0.9, 1.1)
+
+
+ def extract_advanced_features(file_path):
+     """
+     Extract multiple audio features using a comprehensive approach
+
+     Args:
+         file_path (str): Path to the audio file
+
+     Returns:
+         numpy.ndarray: Concatenated feature vector
+     """
+     # Load the audio file
+     y, sr = librosa.load(file_path, duration=Config.DURATION, sr=Config.SAMPLE_RATE)
+
+     # Feature extraction
+     features = []
+
+     # MFCC features (increased resolution)
+     if "mfcc" in Config.FEATURES:
+         mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=Config.N_MFCC)
+         mfccs_processed = np.mean(mfccs.T, axis=0)
+         features.append(mfccs_processed)
+
+     # Spectral centroid
+     if "spectral_centroid" in Config.FEATURES:
+         spectral_centroids = librosa.feature.spectral_centroid(y=y, sr=sr)
+         spectral_centroids_processed = np.mean(spectral_centroids)
+         features.append([spectral_centroids_processed])
+
+     # Chroma features
+     if "chroma" in Config.FEATURES:
+         chroma = librosa.feature.chroma_stft(y=y, sr=sr)
+         chroma_processed = np.mean(chroma.T, axis=0)
+         features.append(chroma_processed)
+
+     # Spectral contrast
+     if "spectral_contrast" in Config.FEATURES:
+         spectral_contrast = librosa.feature.spectral_contrast(y=y, sr=sr)
+         spectral_contrast_processed = np.mean(spectral_contrast.T, axis=0)
+         features.append(spectral_contrast_processed)
+
+     # Zero-crossing rate
+     if "zero_crossing_rate" in Config.FEATURES:
+         zcr = librosa.feature.zero_crossing_rate(y)
+         zcr_processed = np.mean(zcr)
+         features.append([zcr_processed])
+
+     # Spectral rolloff
+     if "spectral_rolloff" in Config.FEATURES:
+         spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
+         spectral_rolloff_processed = np.mean(spectral_rolloff)
+         features.append([spectral_rolloff_processed])
+
+     # Concatenate all features into a single 1-D vector
+     return np.concatenate(features)
+
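+ # The vector returned above has a fixed length per clip: 20 MFCCs
+ # + 1 spectral centroid + 12 chroma bins + 7 spectral-contrast bands
+ # + 1 zero-crossing rate + 1 rolloff = 42 values (the chroma and contrast
+ # counts assume librosa's default settings).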
+
+ def augment_features(
+     features, noise_factor=Config.NOISE_FACTOR, scale_range=Config.SCALE_RANGE
+ ):
+     """
+     Advanced feature augmentation technique
+
+     Args:
+         features (numpy.ndarray): Input feature array
+         noise_factor (float): Magnitude of noise to add
+         scale_range (tuple): Range for feature scaling
+
+     Returns:
+         numpy.ndarray: Augmented features
+     """
+     if not Config.AUGMENTATION:
+         return features
+
+     # Add Gaussian noise
+     noise = np.random.normal(0, noise_factor, features.shape)
+     augmented_features = features + noise
+
+     # Random scaling
+     scale_factor = np.random.uniform(scale_range[0], scale_range[1])
+     augmented_features *= scale_factor
+
+     return augmented_features
+
+
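+ # Note: augment_features above perturbs the already-extracted feature vectors,
+ # not the raw waveform, so it amounts to feature-space jitter rather than true
+ # audio augmentation (e.g. pitch shifting or time stretching the signal).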
+ def prepare_dataset(data_dir):
+     """
+     Prepare the dataset with robust feature extraction and optional augmentation
+
+     Args:
+         data_dir (str): Root directory containing actor subdirectories
+
+     Returns:
+         tuple: Features and labels
+     """
+     features = []
+     labels = []
+
+     # Emotion mapping with potential for expansion
+     emotion_map = {
+         "01": "neutral",
+         "02": "calm",
+         "03": "happy",
+         "04": "sad",
+         "05": "angry",
+         "06": "fearful",
+         "07": "disgust",
+         "08": "surprised",
+     }
+
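+     # The two-digit codes above follow the RAVDESS filename convention, in
+     # which the third hyphen-separated field encodes the emotion; adjust the
+     # split("-")[2] below if your corpus names files differently.
+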
+     # Walk through all directories and subdirectories
+     for root, dirs, files in os.walk(data_dir):
+         for filename in files:
+             if filename.endswith(".wav"):
+                 # Full file path
+                 file_path = os.path.join(root, filename)
+
+                 try:
+                     # Extract emotion from filename
+                     emotion_code = filename.split("-")[2]
+                     emotion = emotion_map.get(emotion_code, "unknown")
+
+                     # Extract original features
+                     file_features = extract_advanced_features(file_path)
+                     features.append(file_features)
+                     labels.append(emotion)
+
+                     # Optional augmentation: add a perturbed copy of the sample
+                     if Config.AUGMENTATION:
+                         augmented_features = augment_features(file_features)
+                         features.append(augmented_features)
+                         labels.append(emotion)
+
+                 except Exception as e:
+                     print(f"Error processing {filename}: {e}")
+
+     # Informative dataset summary
+     print("Dataset Summary:")
+     print(f"Total samples processed (including augmented copies): {len(features)}")
+
+     # Count of emotions
+     from collections import Counter
+
+     emotion_counts = Counter(labels)
+     for emotion, count in emotion_counts.items():
+         print(f"{emotion.capitalize()} emotion: {count} samples")
+
+     return np.array(features), np.array(labels)
+
+
+ class EmotionDataset(Dataset):
+     """Enhanced custom PyTorch Dataset for emotion recognition"""
+
+     def __init__(self, features, labels, scaler=None):
+         # Standardize features: fit a new scaler only when none is supplied
+         # (i.e. on the training set) and reuse it for validation/test data
+         # so their statistics never leak into the transform.
+         if scaler is None:
+             self.scaler = StandardScaler()
+             features = self.scaler.fit_transform(features)
+         else:
+             self.scaler = scaler
+             features = scaler.transform(features)
+
+         self.features = torch.FloatTensor(features)
+
+         # Encode labels
+         self.label_encoder = LabelEncoder()
+         self.labels = torch.LongTensor(self.label_encoder.fit_transform(labels))
+
+     def __len__(self):
+         return len(self.labels)
+
+     def __getitem__(self, idx):
+         return self.features[idx], self.labels[idx]
+
+     def get_num_classes(self):
+         return len(self.label_encoder.classes_)
+
+     def get_class_names(self):
+         return self.label_encoder.classes_
+
+
+ class HybridEmotionRecognitionModel(nn.Module):
+     """Advanced hybrid CNN-BiLSTM network for emotion recognition"""
+
+     def __init__(self, input_dim, num_classes):
+         super().__init__()
+
+         # Input projection that expands and then compresses the feature vector
+         self.input_projection = nn.Sequential(
+             nn.Linear(input_dim, 512),
+             nn.BatchNorm1d(512),
+             nn.ReLU(),
+             nn.Dropout(0.3),
+             nn.Linear(512, 256),
+             nn.ReLU(),
+         )
+
+         # Stacked 1-D convolutional blocks
+         self.conv_layers = nn.ModuleList(
+             [
+                 nn.Sequential(
+                     nn.Conv1d(1, 64, kernel_size=3, padding=1),
+                     nn.BatchNorm1d(64),
+                     nn.ReLU(),
+                     nn.MaxPool1d(2),
+                 ),
+                 nn.Sequential(
+                     nn.Conv1d(64, 128, kernel_size=3, padding=1),
+                     nn.BatchNorm1d(128),
+                     nn.ReLU(),
+                     nn.MaxPool1d(2),
+                 ),
+             ]
+         )
+
+         # Bidirectional LSTM with multiple layers
+         self.lstm_layers = nn.LSTM(
+             input_size=128,
+             hidden_size=256,
+             num_layers=3,
+             batch_first=True,
+             bidirectional=True,
+             dropout=0.4,
+         )
+
+         # Fully connected classification head
+         self.fc_layers = nn.Sequential(
+             nn.Linear(512, 256),  # 512 because the LSTM is bidirectional
+             nn.BatchNorm1d(256),
+             nn.ReLU(),
+             nn.Dropout(0.4),
+             nn.Linear(256, 128),
+             nn.BatchNorm1d(128),
+             nn.ReLU(),
+             nn.Dropout(0.3),
+         )
+
+         self.output_layer = nn.Linear(128, num_classes)
+
+     def forward(self, x):
+         # Input projection
+         x = self.input_projection(x)
+
+         # Add a channel dimension for the 1-D convolutions
+         x = x.unsqueeze(1)
+
+         # Convolutional blocks
+         for conv_layer in self.conv_layers:
+             x = conv_layer(x)
+
+         # Prepare for the LSTM: (batch, channels, steps) -> (batch, steps, channels)
+         x = x.permute(0, 2, 1)
+
+         # LSTM processing; keep only the last time step
+         lstm_out, _ = self.lstm_layers(x)
+         x = lstm_out[:, -1, :]
+
+         # Fully connected layers
+         x = self.fc_layers(x)
+
+         return self.output_layer(x)
+
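+ # Shape walk-through for the model above, for a batch of B inputs (assuming
+ # the 42-dim feature vector noted earlier): (B, 42) -> projection (B, 256)
+ # -> unsqueeze (B, 1, 256) -> conv block 1 (B, 64, 128) -> conv block 2
+ # (B, 128, 64) -> permute (B, 64, 128) -> BiLSTM (B, 64, 512) -> last step
+ # (B, 512) -> FC head (B, 128) -> logits (B, num_classes).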
+
+ def train_model(model, train_loader, val_loader, labels, num_epochs=Config.NUM_EPOCHS):
+     """
+     Advanced training function with improved techniques
+
+     Args:
+         model (nn.Module): PyTorch model
+         train_loader (DataLoader): Training data loader
+         val_loader (DataLoader): Validation data loader
+         labels (numpy.ndarray): Original labels for class weight computation
+         num_epochs (int): Number of training epochs
+     """
+     # Compute class weights to handle class imbalance
+     class_weights = class_weight.compute_class_weight(
+         "balanced", classes=np.unique(labels), y=labels
+     )
+     class_weights = torch.FloatTensor(class_weights)
+
+     # Loss with class weights
+     criterion = nn.CrossEntropyLoss(weight=class_weights)
+
+     # AdamW optimizer (Adam with decoupled weight decay)
+     optimizer = optim.AdamW(
+         model.parameters(), lr=Config.LEARNING_RATE, weight_decay=1e-5
+     )
+
+     # Learning rate scheduler
+     scheduler = ReduceLROnPlateau(
+         optimizer, mode="min", factor=0.5, patience=5, verbose=True
+     )
+
+     # Initialize wandb
+     wandb.init(
+         project="SentimentSound",
+         config={
+             "learning_rate": Config.LEARNING_RATE,
+             "batch_size": Config.BATCH_SIZE,
+             "epochs": num_epochs,
+             "augmentation": Config.AUGMENTATION,
+         },
+     )
+
+     # Training loop
+     best_val_loss = float("inf")
+     for epoch in range(num_epochs):
+         model.train()
+         train_loss = 0
+         train_correct = 0
+         train_total = 0
+
+         for features, batch_labels in train_loader:
+             optimizer.zero_grad()
+
+             # Forward and backward pass
+             outputs = model(features)
+             loss = criterion(outputs, batch_labels)
+
+             loss.backward()
+
+             # Gradient clipping
+             nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
+             optimizer.step()
+
+             train_loss += loss.item()
+             _, predicted = torch.max(outputs.data, 1)
+             train_total += batch_labels.size(0)
+             train_correct += (predicted == batch_labels).sum().item()
+
+         # Validation
+         model.eval()
+         val_loss = 0
+         val_correct = 0
+         val_total = 0
+
+         with torch.no_grad():
+             for features, batch_labels in val_loader:
+                 outputs = model(features)
+                 loss = criterion(outputs, batch_labels)
+
+                 val_loss += loss.item()
+                 _, predicted = torch.max(outputs.data, 1)
+                 val_total += batch_labels.size(0)
+                 val_correct += (predicted == batch_labels).sum().item()
+
+         # Compute metrics
+         train_accuracy = 100 * train_correct / train_total
+         val_accuracy = 100 * val_correct / val_total
+
+         # Learning rate scheduling on the validation loss
+         scheduler.step(val_loss)
+
+         # Log metrics to wandb
+         wandb.log(
+             {
+                 "train_loss": train_loss / len(train_loader),
+                 "train_accuracy": train_accuracy,
+                 "val_loss": val_loss / len(val_loader),
+                 "val_accuracy": val_accuracy,
+             }
+         )
+
+         # Print epoch summary
+         print(f"Epoch {epoch+1}/{num_epochs}")
+         print(f"Train Loss: {train_loss / len(train_loader):.4f}")
+         print(f"Train Accuracy: {train_accuracy:.2f}%")
+         print(f"Val Loss: {val_loss / len(val_loader):.4f}")
+         print(f"Val Accuracy: {val_accuracy:.2f}%")
+
+         # Save the best model by validation loss
+         if val_loss < best_val_loss:
+             best_val_loss = val_loss
+             torch.save(model.state_dict(), "best_emotion_model.pth")
+
+     # Finish wandb run
+     wandb.finish()
+
+     return model
+
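+ # Note: train_model above runs entirely on the CPU; for GPU training you would
+ # also need to move the model, the class weights, and each batch to the device
+ # (not shown here).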
+
+ def evaluate_model(model, test_loader, dataset):
+     """
+     Evaluate the model and generate detailed metrics
+
+     Args:
+         model (nn.Module): Trained PyTorch model
+         test_loader (DataLoader): Test data loader
+         dataset (EmotionDataset): Dataset for class names
+     """
+     model.eval()
+     all_preds = []
+     all_labels = []
+
+     with torch.no_grad():
+         for features, labels in test_loader:
+             outputs = model(features)
+             _, predicted = torch.max(outputs, 1)
+             all_preds.extend(predicted.numpy())
+             all_labels.extend(labels.numpy())
+
+     # Classification report
+     class_names = dataset.get_class_names()
+     print("\nClassification Report:")
+     print(classification_report(all_labels, all_preds, target_names=class_names))
+
+     # Confusion matrix visualization
+     cm = confusion_matrix(all_labels, all_preds)
+     plt.figure(figsize=(10, 8))
+     sns.heatmap(
+         cm, annot=True, fmt="d", xticklabels=class_names, yticklabels=class_names
+     )
+     plt.title("Confusion Matrix")
+     plt.xlabel("Predicted")
+     plt.ylabel("Actual")
+     plt.tight_layout()
+     plt.savefig("confusion_matrix.png")
+     plt.close()
+
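+ # Caveat: each EmotionDataset fits its own LabelEncoder, so the train and test
+ # encodings agree only because LabelEncoder sorts class names and every emotion
+ # is expected to appear in every split; sharing a single fitted encoder across
+ # splits would be the safer design.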
+
+ def main():
+     # Set random seeds for reproducibility
+     torch.manual_seed(42)
+     np.random.seed(42)
+
+     # Data preparation
+     features, labels = prepare_dataset(Config.DATA_DIR)
+
+     # Split data into train/validation/test sets
+     X_train, X_test, y_train, y_test = train_test_split(
+         features, labels, test_size=0.2, random_state=42
+     )
+     X_train, X_val, y_train, y_val = train_test_split(
+         X_train, y_train, test_size=0.2, random_state=42
+     )
+
+     # Create datasets, reusing the training scaler for validation/test
+     train_dataset = EmotionDataset(X_train, y_train)
+     val_dataset = EmotionDataset(X_val, y_val, scaler=train_dataset.scaler)
+     test_dataset = EmotionDataset(X_test, y_test, scaler=train_dataset.scaler)
+
+     # Data loaders
+     train_loader = DataLoader(train_dataset, batch_size=Config.BATCH_SIZE, shuffle=True)
+     val_loader = DataLoader(val_dataset, batch_size=Config.BATCH_SIZE)
+     test_loader = DataLoader(test_dataset, batch_size=Config.BATCH_SIZE)
+
+     # Model initialization
+     model = HybridEmotionRecognitionModel(
+         input_dim=len(X_train[0]), num_classes=train_dataset.get_num_classes()
+     )
+
+     # Train the model
+     train_model(
+         model,
+         train_loader,
+         val_loader,
+         labels,
+         num_epochs=Config.NUM_EPOCHS,
+     )
+
+     # Reload the best checkpoint saved during training before evaluating
+     model.load_state_dict(torch.load("best_emotion_model.pth"))
+
+     # Evaluate the model
+     evaluate_model(model, test_loader, train_dataset)
+
+
+ if __name__ == "__main__":
+     main()
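+
+ # Hypothetical single-clip inference sketch (commented out; the file path and
+ # the reuse of the fitted training scaler are assumptions, not part of this
+ # script):
+ #   feats = extract_advanced_features("some_clip.wav").reshape(1, -1)
+ #   feats = torch.FloatTensor(train_dataset.scaler.transform(feats))
+ #   model.eval()
+ #   with torch.no_grad():
+ #       pred_idx = model(feats).argmax(dim=1).item()
+ #       print(train_dataset.get_class_names()[pred_idx])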