Create app.py
app.py
ADDED
@@ -0,0 +1,174 @@
import gradio as gr
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
import numpy as np
from sklearn.preprocessing import LabelEncoder


class CNN1DLSTMAudioClassifier(nn.Module):
    def __init__(self, num_classes, input_channels=1, sample_rate=16000, n_fft=400, hop_length=160):
        super(CNN1DLSTMAudioClassifier, self).__init__()

        # 1D CNN layers
        self.conv1 = nn.Conv1d(input_channels, 8, kernel_size=5, stride=1, padding=2)
        self.bn1 = nn.BatchNorm1d(8)
        self.pool1 = nn.MaxPool1d(kernel_size=2)
        self.conv2 = nn.Conv1d(8, 16, kernel_size=5, stride=1, padding=2)
        self.bn2 = nn.BatchNorm1d(16)
        self.pool2 = nn.MaxPool1d(kernel_size=2)
        self.conv3 = nn.Conv1d(16, 32, kernel_size=5, stride=1, padding=2)
        self.bn3 = nn.BatchNorm1d(32)
        self.pool3 = nn.MaxPool1d(kernel_size=2)

        # Calculate the output size of the last CNN layer
        self._to_linear = None
        self._calculate_to_linear(input_channels, sample_rate, n_fft, hop_length)

        # LSTM layers (bidirectional, so the output feature size is 2 * hidden_size = 128)
        self.lstm = nn.LSTM(input_size=32, hidden_size=64, num_layers=3, batch_first=True, bidirectional=True)

        # Fully connected layers
        self.fc1 = nn.Linear(128, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, num_classes)

        # Dropout
        self.dropout = nn.Dropout(0.2)

    def _calculate_to_linear(self, input_channels, sample_rate, n_fft, hop_length):
        # Run a dummy input through the CNN stack to record the channel size fed to the LSTM
        num_frames = (sample_rate - n_fft) // hop_length + 1
        x = torch.randn(1, input_channels, num_frames)
        x = self.convs(x)
        self._to_linear = x.shape[1]

    def convs(self, x):
        x = self.pool1(self.bn1(F.relu(self.conv1(x))))
        x = self.pool2(self.bn2(F.relu(self.conv2(x))))
        x = self.pool3(self.bn3(F.relu(self.conv3(x))))
        return x

    def forward(self, x):
        # (batch, features) -> (batch, channels=1, time)
        x = x.view(x.size(0), 1, -1)
        x = self.convs(x)

        # (batch, channels, time) -> (batch, time, channels) for the LSTM
        x = x.permute(0, 2, 1)
        x, _ = self.lstm(x)
        x = x[:, -1, :]  # keep the last time step

        # Fully connected layers
        x = self.dropout(x)
        x = self.fc1(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.fc3(x)

        return x


num_class = 6
model = CNN1DLSTMAudioClassifier(num_class)

model.load_state_dict(torch.load("speech-emotion-recognition-best-model.bin", weights_only=False))
model.eval()


def preprocess_single_audio(file_path, sample_rate=16000, n_mels=128, n_fft=2048, hop_length=512):
    # Load the audio file
    waveform, sr = torchaudio.load(file_path)

    # Resample if necessary
    if sr != sample_rate:
        resampler = torchaudio.transforms.Resample(sr, sample_rate)
        waveform = resampler(waveform)

    # Ensure consistent audio length (2 seconds)
    target_length = 2 * sample_rate
    if waveform.size(1) > target_length:
        waveform = waveform[:, :target_length]
    else:
        waveform = torch.nn.functional.pad(waveform, (0, target_length - waveform.size(1)))

    # Apply Mel spectrogram transform
    mel_transform = torchaudio.transforms.MelSpectrogram(
        sample_rate=sample_rate,
        n_mels=n_mels,
        n_fft=n_fft,
        hop_length=hop_length
    )
    mel_spectrogram = mel_transform(waveform)

    # Normalize (mean and std taken from the training data)
    mean = 12.65
    std = 117.07
    normalized_mel_spectrogram = (mel_spectrogram - mean) / std

    # Flatten the mel spectrogram and pad/truncate to the fixed input size
    flattened = normalized_mel_spectrogram.flatten()

    if flattened.shape[0] < 12288:
        flattened = torch.nn.functional.pad(flattened, (0, 12288 - flattened.shape[0]))
    elif flattened.shape[0] > 12288:
        flattened = flattened[:12288]

    return flattened


def decode_emotion_prediction(prediction_tensor, label_encoder):
    """
    Decodes the prediction tensor into an emotion label.

    Args:
        prediction_tensor (torch.Tensor): The model's output tensor of shape [1, 6]
        label_encoder (LabelEncoder): The LabelEncoder used during training

    Returns:
        str: The predicted emotion label
        float: The confidence score for the prediction
    """
    # Get the index of the highest score
    max_index = torch.argmax(prediction_tensor, dim=1).item()

    # Get the confidence score (probability) for the prediction
    confidence = torch.softmax(prediction_tensor, dim=1)[0, max_index].item()

    # Decode the index to get the emotion label
    predicted_emotion = label_encoder.inverse_transform([max_index])[0]

    return predicted_emotion, confidence


def predict(wave):
    wave = preprocess_single_audio(wave)
    le = LabelEncoder()
    le.classes_ = np.array(['Angry', 'Disgusting', 'Fear', 'Happy', 'Neutral', 'Sad'])
    wave = wave.unsqueeze(0)
    with torch.no_grad():
        prediction = model(wave)
    predicted_emotion, confidence = decode_emotion_prediction(prediction, le)
    return f"Predicted emotion: {predicted_emotion} (Confidence: {confidence:.2f})"


with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Audio Prediction App")
    gr.Markdown("Upload an audio file or record directly to get a prediction")

    with gr.Row():
        audio_input = gr.Audio(source="microphone", type="filepath")
        audio_output = gr.Audio(label="Processed Audio")

    with gr.Row():
        submit_btn = gr.Button("Get Prediction", variant="primary")
        clear_btn = gr.Button("Clear")

    prediction_output = gr.Textbox(label="Prediction")

    # predict() takes the recorded file path and returns a single prediction string
    submit_btn.click(
        fn=predict,
        inputs=audio_input,
        outputs=prediction_output
    )

    clear_btn.click(
        fn=lambda: (None, None, ""),
        outputs=[audio_input, audio_output, prediction_output]
    )

demo.launch()
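
Because demo.launch() runs at import time, the snippet below is meant to be pasted into the same session where CNN1DLSTMAudioClassifier is already defined, rather than imported from app. It is a minimal shape sanity check and not part of the committed app: the 12288-value random input mirrors the length of the vector preprocess_single_audio returns, and the expected [1, 6] output matches the six emotion labels.

import torch

# Hypothetical smoke test: feed a random feature vector of the same length
# that preprocess_single_audio produces (12288 values) through a freshly
# initialised model and confirm one score per emotion class comes back.
m = CNN1DLSTMAudioClassifier(num_classes=6)
m.eval()
with torch.no_grad():
    logits = m(torch.randn(1, 12288))
print(logits.shape)  # expected: torch.Size([1, 6])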