import streamlit as st import pandas as pd import joblib import json import numpy as np from datetime import datetime from sklearn.neighbors import LocalOutlierFactor # ✅ Tabs for Application & Model Evaluation app, model_eval = st.tabs(["Application", "📊 Model Evaluation"]) # ---------------- APPLICATION TAB ---------------- # with app: # Load trained models iso_forest = joblib.load("isolation_forest_model.pkl") one_class_svm = joblib.load("one_class_svm_model.pkl") lof_model = joblib.load("local_outlier_factor_model.pkl") # LOF model trained earlier lof_threshold = joblib.load("lof_threshold.pkl") # Precomputed threshold for LOF # Load location mapping with open("location_mapping.json", "r") as f: location_mapping = json.load(f) # Manual mapping for categorical variables transaction_type_mapping = {"Debit": 0, "Credit": 1} channel_mapping = {"ATM": 0, "Online": 1, "Branch": 2} day_of_week_mapping = {"Monday": 0, "Tuesday": 1, "Wednesday": 2, "Thursday": 3, "Friday": 4, "Saturday": 5, "Sunday": 6} st.title("Anomaly Detection for Bank Transactions") # Sidebar for model selection model_choice = st.sidebar.radio("Select Anomaly Detection Model", ["Isolation Forest", "One-Class SVM", "Local Outlier Factor"]) # User inputs date = st.date_input("Select Transaction Date") time = st.time_input("Select Transaction Time") location = st.selectbox("Select Location", options=list(location_mapping.keys())) transaction_type = st.radio("Transaction Type", options=["Debit", "Credit"]) channel = st.radio("Transaction Channel", options=["ATM", "Online", "Branch"]) transaction_duration = st.slider("Transaction Duration (seconds)", min_value=0, max_value=600, value=30) login_attempts = st.number_input("Login Attempts", min_value=0) transaction_amount = st.number_input("Transaction Amount", min_value=0.0, format="%.2f") if st.button("Check for Anomaly"): # Convert date to day of the week day_of_week = day_of_week_mapping[date.strftime('%A')] # Convert time to total seconds since midnight total_seconds = time.hour * 3600 + time.minute * 60 # Convert categorical values to numeric location_encoded = location_mapping.get(location, -1) # Default to -1 if not found transaction_type_encoded = transaction_type_mapping[transaction_type] channel_encoded = channel_mapping[channel] # Ensure the order of features matches training input_data = pd.DataFrame([[ transaction_type_encoded, location_encoded, channel_encoded, total_seconds, transaction_duration, login_attempts, day_of_week, transaction_amount ]], columns=[ "TransactionType", "Location", "Channel", "Time", "TransactionDuration", "LoginAttempts", "DayOfWeek", "TransactionAmount" ]) if model_choice == "Isolation Forest": prediction = iso_forest.predict(input_data)[0] anomaly_label = "Anomalous" if prediction == -1 else "Normal" elif model_choice == "One-Class SVM": prediction = one_class_svm.predict(input_data)[0] anomaly_label = "Anomalous" if prediction == -1 else "Normal" elif model_choice == "Local Outlier Factor": # Get the distance of input_data from the neighbors distances, _ = lof_model.kneighbors(input_data) avg_distance = np.mean(distances) # Compare with the LOF threshold anomaly_label = "Anomalous" if avg_distance > lof_threshold else "Normal" # Display result st.write(f"### The transaction is: **{anomaly_label}**") # ---------------- MODEL EVALUATION TAB ---------------- # with model_eval: st.header("Model Evaluation") st.write("The Anomaly Detection model was trained to classify bank transactions as 'Anomalous' or 'Normal'. The dataset was taken from Kaggle.") st.write("Dataset by Vala Khorasani : [Kaggle Link](https://www.kaggle.com/datasets/valakhorasani/bank-transaction-dataset-for-fraud-detection)") # Sidebar to choose which model's evaluation to display eval_model_choice = st.sidebar.radio("Select Model for Evaluation", ["Isolation Forest", "One-Class SVM", "Local Outlier Factor"]) # Display evaluation metrics based on selected model if eval_model_choice == "Isolation Forest": st.image("Anomaly_IF_counts.png", caption="Anomaly Counts - Isolation Forest", use_column_width=True) st.write("### 📌 Isolation Forest Performance") st.write("- Detects anomalies based on random sub-sampling of data.") st.write("- **Lower False Positives** in structured transaction data.") elif eval_model_choice == "One-Class SVM": st.image("Anomaly_OCSVM_counts.png", caption="Anomaly Counts - One-Class SVM", use_column_width=True) st.write("### 📌 One-Class SVM Performance") st.write("- Uses a hyperplane to separate normal from anomalous data.") st.write("- **Better suited for small datasets** but may be computationally expensive.") elif eval_model_choice == "Local Outlier Factor": st.image("Anomaly_LOF_counts.png", caption="Anomaly Counts - Local Outlier Factor", use_column_width=True) st.write("### 📌 Local Outlier Factor (LOF) Performance") st.write("- Uses density-based analysis to detect anomalies.") st.write("- **Best for identifying local anomalies**, but requires careful tuning of `k-neighbors`.") st.image("silhouette_scores.png", caption="Silhouette Scores for All Models", use_column_width=True) st.header("Comparison") st.write("OCSVM (One-Class SVM) is likely overfitting or being too aggressive in marking transactions as anomalies. Its silhouette score of 0.00 suggests poor cluster structure, meaning it does not effectively separate normal vs. anomalous transactions. LOF (Local Outlier Factor) is the best-performing model because it has the highest silhouette score (0.16), indicating it maintains a clear distinction between anomalies and normal transactions. Isolation Forest (IF) is a close second, performing slightly worse than LOF but still better than OCSVM.")