# streamlit_app.py

import streamlit as st
import tempfile
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import load_model
from sklearn.metrics import (
    confusion_matrix,
    roc_curve,
    auc,
    precision_recall_curve,
    average_precision_score,
    ConfusionMatrixDisplay,
)

SAMPLE_RATE = 16000   # target sample rate for librosa.load
DURATION = 5          # seconds of audio loaded per clip
N_MELS = 128          # mel bands per spectrogram
MAX_TIME_STEPS = 109  # spectrogram frames kept after padding/truncation
NUM_CLASSES = 2       # bonafide (1) vs. spoof (0)
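

# The Train and Evaluate branches below repeat the same load -> mel spectrogram ->
# pad/truncate steps. A helper like this could consolidate that logic; it is shown
# as an optional sketch, and the branches below still inline the same steps.
def extract_mel_spectrogram(file_like):
    """Return a (N_MELS, MAX_TIME_STEPS) log-mel spectrogram for one audio file."""
    audio, _ = librosa.load(file_like, sr=SAMPLE_RATE, duration=DURATION)
    mel = librosa.feature.melspectrogram(y=audio, sr=SAMPLE_RATE, n_mels=N_MELS)
    mel = librosa.power_to_db(mel, ref=np.max)
    if mel.shape[1] < MAX_TIME_STEPS:
        mel = np.pad(mel, ((0, 0), (0, MAX_TIME_STEPS - mel.shape[1])), mode='constant')
    else:
        mel = mel[:, :MAX_TIME_STEPS]
    return mel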

# Streamlit App
st.title("Audio Spoofing Detection App")

st.sidebar.header("Model Options")
task = st.sidebar.selectbox("Select Task", ["Train Model", "Evaluate Model", "Visualize Spectrogram"])

if task == "Train Model":
    st.header("Train a New Model")
    
    uploaded_files = st.file_uploader("Upload FLAC Training Files", accept_multiple_files=True, type='flac')
    label_file = st.file_uploader("Upload Labels File (txt)", type="txt")

    if uploaded_files and label_file:
        # Parse the label file: file id in the second column,
        # "bonafide"/"spoof" key in the last column
        labels = {}
        for line in label_file.getvalue().decode("utf-8").splitlines():
            parts = line.strip().split()
            if len(parts) < 2:
                continue
            file_name = parts[1]
            label = 1 if parts[-1] == "bonafide" else 0
            labels[file_name] = label

        X, y = [], []
        for file in uploaded_files:
            file_name = file.name.split(".")[0]
            # Skip any uploaded file that has no entry in the label file
            if file_name not in labels:
                continue
            label = labels[file_name]

            # Load audio file
            audio, _ = librosa.load(file, sr=SAMPLE_RATE, duration=DURATION)

            # Extract Mel spectrogram
            mel_spectrogram = librosa.feature.melspectrogram(y=audio, sr=SAMPLE_RATE, n_mels=N_MELS)
            mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)

            # Padding
            if mel_spectrogram.shape[1] < MAX_TIME_STEPS:
                mel_spectrogram = np.pad(mel_spectrogram, ((0, 0), (0, MAX_TIME_STEPS - mel_spectrogram.shape[1])), mode='constant')
            else:
                mel_spectrogram = mel_spectrogram[:, :MAX_TIME_STEPS]

            X.append(mel_spectrogram)
            y.append(label)

        X = np.array(X)
        y = np.array(y)

        # Add a channel dimension so X matches the Conv2D input shape (N_MELS, time, 1)
        X = X[..., np.newaxis]

        y_encoded = to_categorical(y, NUM_CLASSES)
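
        # Shuffle before the sequential 80/20 split below, since uploaded files
        # may arrive grouped by class. This step is an addition, not in the
        # original script; the seed is an arbitrary choice.
        rng = np.random.default_rng(42)
        perm = rng.permutation(len(X))
        X, y_encoded = X[perm], y_encoded[perm]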

        # Split into train and validation sets
        split_index = int(0.8 * len(X))
        X_train, X_val = X[:split_index], X[split_index:]
        y_train, y_val = y_encoded[:split_index], y_encoded[split_index:]

        input_shape = (N_MELS, X_train.shape[2], 1)
        
        # Define the CNN: two Conv2D/MaxPooling blocks, then a dense classifier with dropout
        model_input = tf.keras.Input(shape=input_shape)
        x = tf.keras.layers.Conv2D(32, kernel_size=(3, 3), activation='relu')(model_input)
        x = tf.keras.layers.MaxPooling2D(pool_size=(2, 2))(x)
        x = tf.keras.layers.Conv2D(64, kernel_size=(3, 3), activation='relu')(x)
        x = tf.keras.layers.MaxPooling2D(pool_size=(2, 2))(x)
        x = tf.keras.layers.Flatten()(x)
        x = tf.keras.layers.Dense(128, activation='relu')(x)
        x = tf.keras.layers.Dropout(0.5)(x)
        model_output = tf.keras.layers.Dense(NUM_CLASSES, activation='softmax')(x)

        model = tf.keras.Model(inputs=model_input, outputs=model_output)
        model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
        
        # Train the model
        if st.button("Start Training"):
            st.write("Training in progress...")
            model.fit(X_train, y_train, batch_size=32, epochs=10, validation_data=(X_val, y_val))
            model.save("audio_classifier.h5")
            st.success("Training Complete. Model Saved!")
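

# Optional: a per-epoch progress readout inside the app. This callback is a
# sketch and is not wired into model.fit above; pass callbacks=[StreamlitProgress(10)]
# to the fit call to use it.
class StreamlitProgress(tf.keras.callbacks.Callback):
    def __init__(self, total_epochs):
        super().__init__()
        self.total_epochs = total_epochs
        self.status = st.empty()

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        self.status.write(
            f"Epoch {epoch + 1}/{self.total_epochs} - "
            f"loss: {logs.get('loss', float('nan')):.4f}, "
            f"val_accuracy: {logs.get('val_accuracy', float('nan')):.4f}"
        )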

if task == "Evaluate Model":
    st.header("Evaluate a Trained Model")

    model_file = st.file_uploader("Upload Model (h5)", type='h5')
    test_files = st.file_uploader("Upload Test FLAC Files", accept_multiple_files=True, type='flac')
    protocol_file = st.file_uploader("Upload Protocol File (txt)", type='txt')

    if model_file and test_files and protocol_file:
        # Load the model: load_model expects a file path, so write the uploaded
        # bytes to a temporary .h5 file first
        with tempfile.NamedTemporaryFile(suffix=".h5", delete=False) as tmp:
            tmp.write(model_file.getvalue())
        model = load_model(tmp.name)

        # Prepare test data
        X_test = []
        for file in test_files:
            audio, _ = librosa.load(file, sr=SAMPLE_RATE, duration=DURATION)
            mel_spectrogram = librosa.feature.melspectrogram(y=audio, sr=SAMPLE_RATE, n_mels=N_MELS)
            mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)
            if mel_spectrogram.shape[1] < MAX_TIME_STEPS:
                mel_spectrogram = np.pad(mel_spectrogram, ((0, 0), (0, MAX_TIME_STEPS - mel_spectrogram.shape[1])), mode='constant')
            else:
                mel_spectrogram = mel_spectrogram[:, :MAX_TIME_STEPS]
            X_test.append(mel_spectrogram)

        X_test = np.array(X_test)
        # Add the channel dimension expected by the model input
        X_test = X_test[..., np.newaxis]

        y_pred = model.predict(X_test)
        y_pred_classes = np.argmax(y_pred, axis=1)

        # Parse the true labels (file id in the first column, key in the last)
        true_labels = {}
        for line in protocol_file.getvalue().decode("utf-8").splitlines():
            parts = line.strip().split()
            if len(parts) > 1:
                file_name = parts[0]
                label = parts[-1]
                true_labels[file_name] = 1 if label == "bonafide" else 0

        # Align the labels with the order of the uploaded test files so that
        # y_true matches y_pred_classes element for element
        y_true = np.array([true_labels[file.name.split(".")[0]] for file in test_files])

        # Confusion Matrix
        cm = confusion_matrix(y_true, y_pred_classes)
        ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Spoof", "Bonafide"]).plot(cmap=plt.cm.Blues)
        st.pyplot(plt)

        # ROC Curve
        y_pred_prob = y_pred[:, 1]
        fpr, tpr, _ = roc_curve(y_true, y_pred_prob)
        roc_auc = auc(fpr, tpr)
        plt.figure()
        plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
        plt.plot([0, 1], [0, 1], color='navy', lw=1, linestyle='--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curve')
        plt.legend(loc="lower right")
        st.pyplot(plt)
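
        # Equal Error Rate (EER), a metric commonly reported for spoofing
        # detection. This is an optional addition computed from the ROC points
        # above, not part of the original script's output.
        fnr = 1 - tpr
        eer_index = np.nanargmin(np.abs(fnr - fpr))
        eer = (fpr[eer_index] + fnr[eer_index]) / 2
        st.write(f"Approximate EER: {eer:.4f}")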

        # Precision-Recall Curve
        precision, recall, _ = precision_recall_curve(y_true, y_pred_prob)
        avg_precision = average_precision_score(y_true, y_pred_prob)
        plt.figure()
        plt.plot(recall, precision, color='darkorange', lw=2, label=f'Avg. Precision = {avg_precision:.2f}')
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.title('Precision-Recall Curve')
        plt.legend(loc="lower left")
        st.pyplot(plt)

if task == "Visualize Spectrogram":
    st.header("Visualize Mel Spectrogram")

    test_files = st.file_uploader("Upload Test FLAC Files", accept_multiple_files=True, type='flac')

    if test_files:
        for file in test_files:
            audio, _ = librosa.load(file, sr=SAMPLE_RATE, duration=DURATION)
            mel_spectrogram = librosa.feature.melspectrogram(y=audio, sr=SAMPLE_RATE, n_mels=N_MELS)
            mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)

            plt.figure(figsize=(10, 6))
            librosa.display.specshow(mel_spectrogram, x_axis='time', y_axis='mel', sr=SAMPLE_RATE)
            plt.colorbar(format='%+2.0f dB')
            plt.title(f'Mel Spectrogram - {file.name}')
            st.pyplot(plt)
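
            # Optional playback next to the spectrogram (an addition, not in the
            # original script). The buffer is rewound because librosa.load above
            # has already consumed it; browser FLAC support is assumed.
            file.seek(0)
            st.audio(file.read(), format="audio/flac")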