# streamlit_app.py import streamlit as st import os import librosa import librosa.display import numpy as np import matplotlib.pyplot as plt import seaborn as sns import tensorflow as tf from tensorflow.keras.utils import to_categorical from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_recall_curve, average_precision_score, calibration_curve, ConfusionMatrixDisplay from keras.models import load_model SAMPLE_RATE = 16000 DURATION = 5 N_MELS = 128 MAX_TIME_STEPS = 109 NUM_CLASSES = 2 # Streamlit App st.title("Audio Spoofing Detection App") st.sidebar.header("Model Options") task = st.sidebar.selectbox("Select Task", ["Train Model", "Evaluate Model", "Visualize Spectrogram"]) if task == "Train Model": st.header("Train a New Model") uploaded_files = st.file_uploader("Upload FLAC Training Files", accept_multiple_files=True, type='flac') label_file = st.file_uploader("Upload Labels File (txt)", type="txt") if uploaded_files and label_file: # Parse the label file labels = {} for line in label_file.getvalue().decode("utf-8").splitlines(): parts = line.strip().split() file_name = parts[1] label = 1 if parts[-1] == "bonafide" else 0 labels[file_name] = label X, y = [], [] for file in uploaded_files: file_name = file.name.split(".")[0] label = labels[file_name] # Load audio file audio, _ = librosa.load(file, sr=SAMPLE_RATE, duration=DURATION) # Extract Mel spectrogram mel_spectrogram = librosa.feature.melspectrogram(y=audio, sr=SAMPLE_RATE, n_mels=N_MELS) mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max) # Padding if mel_spectrogram.shape[1] < MAX_TIME_STEPS: mel_spectrogram = np.pad(mel_spectrogram, ((0, 0), (0, MAX_TIME_STEPS - mel_spectrogram.shape[1])), mode='constant') else: mel_spectrogram = mel_spectrogram[:, :MAX_TIME_STEPS] X.append(mel_spectrogram) y.append(label) X = np.array(X) y = np.array(y) y_encoded = to_categorical(y, NUM_CLASSES) # Split into train and validation sets split_index = int(0.8 * len(X)) X_train, X_val = X[:split_index], X[split_index:] y_train, y_val = y_encoded[:split_index], y_encoded[split_index:] input_shape = (N_MELS, X_train.shape[2], 1) # Define CNN model model_input = tf.keras.Input(shape=input_shape) x = tf.keras.layers.Conv2D(32, kernel_size=(3, 3), activation='relu')(model_input) x = tf.keras.layers.MaxPooling2D(pool_size=(2, 2))(x) x = tf.keras.layers.Conv2D(64, kernel_size=(3, 3), activation='relu')(x) x = tf.keras.layers.MaxPooling2D(pool_size=(2, 2))(x) x = tf.keras.layers.Flatten()(x) x = tf.keras.layers.Dense(128, activation='relu')(x) x = tf.keras.layers.Dropout(0.5)(x) model_output = tf.keras.layers.Dense(NUM_CLASSES, activation='softmax')(x) model = tf.keras.Model(inputs=model_input, outputs=model_output) model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy']) # Train the model if st.button("Start Training"): st.write("Training in progress...") model.fit(X_train, y_train, batch_size=32, epochs=10, validation_data=(X_val, y_val)) model.save("audio_classifier.h5") st.success("Training Complete. Model Saved!") if task == "Evaluate Model": st.header("Evaluate a Trained Model") model_file = st.file_uploader("Upload Model (h5)", type='h5') test_files = st.file_uploader("Upload Test FLAC Files", accept_multiple_files=True, type='flac') protocol_file = st.file_uploader("Upload Protocol File (txt)", type='txt') if model_file and test_files and protocol_file: # Load Model model = load_model(model_file) # Prepare test data X_test = [] for file in test_files: audio, _ = librosa.load(file, sr=SAMPLE_RATE, duration=DURATION) mel_spectrogram = librosa.feature.melspectrogram(y=audio, sr=SAMPLE_RATE, n_mels=N_MELS) mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max) if mel_spectrogram.shape[1] < MAX_TIME_STEPS: mel_spectrogram = np.pad(mel_spectrogram, ((0, 0), (0, MAX_TIME_STEPS - mel_spectrogram.shape[1])), mode='constant') else: mel_spectrogram = mel_spectrogram[:, :MAX_TIME_STEPS] X_test.append(mel_spectrogram) X_test = np.array(X_test) y_pred = model.predict(X_test) y_pred_classes = np.argmax(y_pred, axis=1) # Parse the true labels true_labels = {} for line in protocol_file.getvalue().decode("utf-8").splitlines(): parts = line.strip().split() if len(parts) > 1: file_name = parts[0] label = parts[-1] true_labels[file_name] = 1 if label == "bonafide" else 0 y_true = np.array([label for label in true_labels.values()]) # Confusion Matrix cm = confusion_matrix(y_true, y_pred_classes) ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Spoof", "Bonafide"]).plot(cmap=plt.cm.Blues) st.pyplot(plt) # ROC Curve y_pred_prob = y_pred[:, 1] fpr, tpr, _ = roc_curve(y_true, y_pred_prob) roc_auc = auc(fpr, tpr) plt.figure() plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})') plt.legend(loc="lower right") st.pyplot(plt) # Precision-Recall Curve precision, recall, _ = precision_recall_curve(y_true, y_pred_prob) avg_precision = average_precision_score(y_true, y_pred_prob) plt.figure() plt.plot(recall, precision, color='darkorange', lw=2, label=f'Avg. Precision = {avg_precision:.2f}') st.pyplot(plt) if task == "Visualize Spectrogram": st.header("Visualize Mel Spectrogram") test_files = st.file_uploader("Upload Test FLAC Files", accept_multiple_files=True, type='flac') if test_files: for file in test_files: audio, _ = librosa.load(file, sr=SAMPLE_RATE, duration=DURATION) mel_spectrogram = librosa.feature.melspectrogram(y=audio, sr=SAMPLE_RATE, n_mels=N_MELS) mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max) plt.figure(figsize=(10, 6)) librosa.display.specshow(mel_spectrogram, x_axis='time', y_axis='mel', sr=SAMPLE_RATE) plt.colorbar(format='%+2.0f dB') plt.title(f'Mel Spectrogram - {file.name}') st.pyplot(plt)