Spaces:
Running
Running
import streamlit as st | |
import pandas as pd | |
import joblib | |
import json | |
import numpy as np | |
from datetime import datetime | |
from sklearn.neighbors import LocalOutlierFactor | |
# Top-level tabs: the interactive detector and the model evaluation report.
# NOTE(review): the leading "π" in the second label looks like a mis-encoded
# emoji — confirm the intended character before changing the label.
tab_labels = ["Application", "π Model Evaluation"]
app, model_eval = st.tabs(tab_labels)
# ---------------- APPLICATION TAB ---------------- #
@st.cache_resource
def _load_artifacts():
    """Load the persisted detectors, the LOF threshold, and the location mapping.

    Streamlit re-executes the whole script on every widget interaction, so the
    loads are wrapped in ``st.cache_resource`` to avoid re-reading the files
    from disk on every rerun.

    Returns:
        A tuple ``(iso_forest, one_class_svm, lof_model, lof_threshold,
        location_mapping)``, in that order.
    """
    # NOTE: joblib.load unpickles arbitrary Python objects; these files must
    # come from a trusted source.
    isolation_forest = joblib.load("isolation_forest_model.pkl")
    ocsvm = joblib.load("one_class_svm_model.pkl")
    lof = joblib.load("local_outlier_factor_model.pkl")  # LOF model trained earlier
    threshold = joblib.load("lof_threshold.pkl")  # Precomputed threshold for LOF

    # Explicit encoding so the file is read the same way on every platform.
    with open("location_mapping.json", "r", encoding="utf-8") as f:
        mapping = json.load(f)

    return isolation_forest, ocsvm, lof, threshold, mapping


with app:
    # Trained models, LOF threshold, and location encoding (cached across reruns).
    iso_forest, one_class_svm, lof_model, lof_threshold, location_mapping = _load_artifacts()

    # Manual mapping for categorical variables
    transaction_type_mapping = {"Debit": 0, "Credit": 1}
    channel_mapping = {"ATM": 0, "Online": 1, "Branch": 2}
    # Reference encoding for the DayOfWeek feature; date.weekday() below
    # produces exactly these codes without depending on the process locale.
    day_of_week_mapping = {
        "Monday": 0,
        "Tuesday": 1,
        "Wednesday": 2,
        "Thursday": 3,
        "Friday": 4,
        "Saturday": 5,
        "Sunday": 6,
    }

    st.title("Anomaly Detection for Bank Transactions")

    # Sidebar for model selection
    model_choice = st.sidebar.radio(
        "Select Anomaly Detection Model",
        ["Isolation Forest", "One-Class SVM", "Local Outlier Factor"],
    )

    # User inputs
    date = st.date_input("Select Transaction Date")
    time = st.time_input("Select Transaction Time")
    location = st.selectbox("Select Location", options=list(location_mapping.keys()))
    transaction_type = st.radio("Transaction Type", options=["Debit", "Credit"])
    channel = st.radio("Transaction Channel", options=["ATM", "Online", "Branch"])
    transaction_duration = st.slider(
        "Transaction Duration (seconds)", min_value=0, max_value=600, value=30
    )
    login_attempts = st.number_input("Login Attempts", min_value=0)
    transaction_amount = st.number_input("Transaction Amount", min_value=0.0, format="%.2f")

    if st.button("Check for Anomaly"):
        # Day of week as Monday=0 .. Sunday=6. weekday() is used instead of
        # strftime('%A') + dict lookup because '%A' is locale-dependent and
        # would raise KeyError under a non-English locale.
        day_of_week = date.weekday()

        # Total seconds since midnight (seconds included so the value matches
        # that definition even if the widget ever yields a non-zero second).
        total_seconds = time.hour * 3600 + time.minute * 60 + time.second

        # Convert categorical values to numeric
        location_encoded = location_mapping.get(location, -1)  # Default to -1 if not found
        transaction_type_encoded = transaction_type_mapping[transaction_type]
        channel_encoded = channel_mapping[channel]

        # Single-row frame; the column order is meant to match the order used
        # at training time (training code is not visible here — keep in sync).
        feature_columns = [
            "TransactionType", "Location", "Channel", "Time",
            "TransactionDuration", "LoginAttempts", "DayOfWeek", "TransactionAmount",
        ]
        feature_values = [
            transaction_type_encoded, location_encoded, channel_encoded, total_seconds,
            transaction_duration, login_attempts, day_of_week, transaction_amount,
        ]
        input_data = pd.DataFrame([feature_values], columns=feature_columns)

        if model_choice == "Local Outlier Factor":
            # Mean distance from the input to its neighbours in the fitted LOF
            # model, compared against the precomputed threshold.
            distances, _ = lof_model.kneighbors(input_data)
            avg_distance = np.mean(distances)
            is_anomalous = avg_distance > lof_threshold
        else:
            # Both remaining detectors share the same contract here: a
            # prediction of -1 is treated as anomalous.
            detector = {
                "Isolation Forest": iso_forest,
                "One-Class SVM": one_class_svm,
            }[model_choice]
            is_anomalous = detector.predict(input_data)[0] == -1

        anomaly_label = "Anomalous" if is_anomalous else "Normal"

        # Display result
        st.write(f"### The transaction is: **{anomaly_label}**")
# ---------------- MODEL EVALUATION TAB ---------------- #
with model_eval:
    st.header("Model Evaluation")
    st.write("The Anomaly Detection model was trained to classify bank transactions as 'Anomalous' or 'Normal'. The dataset was taken from Kaggle.")
    st.write("Dataset by Vala Khorasani : [Kaggle Link](https://www.kaggle.com/datasets/valakhorasani/bank-transaction-dataset-for-fraud-detection)")

    # Sidebar selector for which model's evaluation to show.
    eval_model_choice = st.sidebar.radio(
        "Select Model for Evaluation",
        ["Isolation Forest", "One-Class SVM", "Local Outlier Factor"],
    )

    # Per-model report content: (image path, image caption, heading, bullet lines).
    # NOTE(review): the "π" in each heading looks like a mis-encoded emoji —
    # confirm the intended character; the strings are kept as they are here.
    evaluation_reports = {
        "Isolation Forest": (
            "Anomaly_IF_counts.png",
            "Anomaly Counts - Isolation Forest",
            "### π Isolation Forest Performance",
            (
                "- Detects anomalies based on random sub-sampling of data.",
                "- **Lower False Positives** in structured transaction data.",
            ),
        ),
        "One-Class SVM": (
            "Anomaly_OCSVM_counts.png",
            "Anomaly Counts - One-Class SVM",
            "### π One-Class SVM Performance",
            (
                "- Uses a hyperplane to separate normal from anomalous data.",
                "- **Better suited for small datasets** but may be computationally expensive.",
            ),
        ),
        "Local Outlier Factor": (
            "Anomaly_LOF_counts.png",
            "Anomaly Counts - Local Outlier Factor",
            "### π Local Outlier Factor (LOF) Performance",
            (
                "- Uses density-based analysis to detect anomalies.",
                "- **Best for identifying local anomalies**, but requires careful tuning of `k-neighbors`.",
            ),
        ),
    }

    # Render the selected model's report; an unknown choice renders nothing.
    report = evaluation_reports.get(eval_model_choice)
    if report is not None:
        image_path, image_caption, heading, bullet_lines = report
        st.image(image_path, caption=image_caption, use_column_width=True)
        st.write(heading)
        for bullet in bullet_lines:
            st.write(bullet)

    # Cross-model comparison, shown regardless of the selected model.
    st.image("silhouette_scores.png", caption="Silhouette Scores for All Models", use_column_width=True)
    st.header("Comparison")
    st.write(
        "OCSVM (One-Class SVM) is likely overfitting or being too aggressive in marking transactions as anomalies. "
        "Its silhouette score of 0.00 suggests poor cluster structure, meaning it does not effectively separate normal vs. anomalous transactions. "
        "LOF (Local Outlier Factor) is the best-performing model because it has the highest silhouette score (0.16), indicating it maintains a clear distinction between anomalies and normal transactions. "
        "Isolation Forest (IF) is a close second, performing slightly worse than LOF but still better than OCSVM."
    )