# FraudDetection / app.py
# Hugging Face Space by CristopherWVSU (commit eceb2da, "Update app.py")
import streamlit as st
import pandas as pd
import joblib
import json
import numpy as np
from datetime import datetime
from sklearn.neighbors import LocalOutlierFactor
# ✅ Top-level layout: one tab for the interactive app, one for model evaluation.
tab_labels = ["Application", "πŸ“Š Model Evaluation"]
app, model_eval = st.tabs(tab_labels)
# ---------------- APPLICATION TAB ---------------- #
with app:
    # Load the pre-trained anomaly detectors (serialized with joblib at training time).
    iso_forest = joblib.load("isolation_forest_model.pkl")
    one_class_svm = joblib.load("one_class_svm_model.pkl")
    lof_model = joblib.load("local_outlier_factor_model.pkl")  # LOF model trained earlier
    lof_threshold = joblib.load("lof_threshold.pkl")  # Precomputed distance threshold for LOF

    # Location -> integer code mapping produced during training.
    with open("location_mapping.json", "r") as f:
        location_mapping = json.load(f)

    # Manual encodings for categorical variables.
    # NOTE(review): these must match the encodings used at training time — verify.
    transaction_type_mapping = {"Debit": 0, "Credit": 1}
    channel_mapping = {"ATM": 0, "Online": 1, "Branch": 2}
    day_of_week_mapping = {"Monday": 0, "Tuesday": 1, "Wednesday": 2, "Thursday": 3,
                           "Friday": 4, "Saturday": 5, "Sunday": 6}

    st.title("Anomaly Detection for Bank Transactions")

    # Sidebar for model selection.
    model_choice = st.sidebar.radio(
        "Select Anomaly Detection Model",
        ["Isolation Forest", "One-Class SVM", "Local Outlier Factor"],
    )

    # User inputs for a single transaction.
    txn_date = st.date_input("Select Transaction Date")
    txn_time = st.time_input("Select Transaction Time")
    location = st.selectbox("Select Location", options=list(location_mapping.keys()))
    transaction_type = st.radio("Transaction Type", options=["Debit", "Credit"])
    channel = st.radio("Transaction Channel", options=["ATM", "Online", "Branch"])
    transaction_duration = st.slider("Transaction Duration (seconds)", min_value=0, max_value=600, value=30)
    login_attempts = st.number_input("Login Attempts", min_value=0)
    transaction_amount = st.number_input("Transaction Amount", min_value=0.0, format="%.2f")

    if st.button("Check for Anomaly"):
        # Convert the date to a 0-6 weekday code (Monday = 0).
        day_of_week = day_of_week_mapping[txn_date.strftime('%A')]

        # Convert the time to total seconds since midnight.
        # FIX: the original dropped the seconds component (hour*3600 + minute*60 only),
        # so the "Time" feature under-counted by up to 59 seconds.
        total_seconds = txn_time.hour * 3600 + txn_time.minute * 60 + txn_time.second

        # Encode categorical values; unseen locations fall back to -1.
        location_encoded = location_mapping.get(location, -1)
        transaction_type_encoded = transaction_type_mapping[transaction_type]
        channel_encoded = channel_mapping[channel]

        # Feature order must match the column order the models saw at training time.
        input_data = pd.DataFrame([[
            transaction_type_encoded, location_encoded, channel_encoded, total_seconds,
            transaction_duration, login_attempts, day_of_week, transaction_amount
        ]], columns=[
            "TransactionType", "Location", "Channel", "Time",
            "TransactionDuration", "LoginAttempts", "DayOfWeek", "TransactionAmount"
        ])

        if model_choice == "Isolation Forest":
            # sklearn convention: predict() returns -1 for outliers, 1 for inliers.
            prediction = iso_forest.predict(input_data)[0]
            anomaly_label = "Anomalous" if prediction == -1 else "Normal"
        elif model_choice == "One-Class SVM":
            prediction = one_class_svm.predict(input_data)[0]
            anomaly_label = "Anomalous" if prediction == -1 else "Normal"
        else:
            # Local Outlier Factor: score by mean distance to the k nearest
            # training neighbors, compared against the precomputed threshold.
            # FIX: final branch is now `else` so anomaly_label is always bound.
            distances, _ = lof_model.kneighbors(input_data)
            avg_distance = np.mean(distances)
            anomaly_label = "Anomalous" if avg_distance > lof_threshold else "Normal"

        # Display result.
        st.write(f"### The transaction is: **{anomaly_label}**")
# ---------------- MODEL EVALUATION TAB ---------------- #
with model_eval:
    st.header("Model Evaluation")
    st.write("The Anomaly Detection model was trained to classify bank transactions as 'Anomalous' or 'Normal'. The dataset was taken from Kaggle.")
    st.write("Dataset by Vala Khorasani : [Kaggle Link](https://www.kaggle.com/datasets/valakhorasani/bank-transaction-dataset-for-fraud-detection)")

    # Sidebar control for picking which model's evaluation to show.
    eval_model_choice = st.sidebar.radio("Select Model for Evaluation", ["Isolation Forest", "One-Class SVM", "Local Outlier Factor"])

    # Per-model evaluation content: (image file, image caption, heading, bullet lines).
    evaluation_content = {
        "Isolation Forest": (
            "Anomaly_IF_counts.png",
            "Anomaly Counts - Isolation Forest",
            "### πŸ“Œ Isolation Forest Performance",
            [
                "- Detects anomalies based on random sub-sampling of data.",
                "- **Lower False Positives** in structured transaction data.",
            ],
        ),
        "One-Class SVM": (
            "Anomaly_OCSVM_counts.png",
            "Anomaly Counts - One-Class SVM",
            "### πŸ“Œ One-Class SVM Performance",
            [
                "- Uses a hyperplane to separate normal from anomalous data.",
                "- **Better suited for small datasets** but may be computationally expensive.",
            ],
        ),
        "Local Outlier Factor": (
            "Anomaly_LOF_counts.png",
            "Anomaly Counts - Local Outlier Factor",
            "### πŸ“Œ Local Outlier Factor (LOF) Performance",
            [
                "- Uses density-based analysis to detect anomalies.",
                "- **Best for identifying local anomalies**, but requires careful tuning of `k-neighbors`.",
            ],
        ),
    }

    # Render the selected model's evaluation block.
    image_file, image_caption, heading, bullet_lines = evaluation_content[eval_model_choice]
    st.image(image_file, caption=image_caption, use_column_width=True)
    st.write(heading)
    for bullet in bullet_lines:
        st.write(bullet)

    st.image("silhouette_scores.png", caption="Silhouette Scores for All Models", use_column_width=True)
    st.header("Comparison")
    st.write("OCSVM (One-Class SVM) is likely overfitting or being too aggressive in marking transactions as anomalies. Its silhouette score of 0.00 suggests poor cluster structure, meaning it does not effectively separate normal vs. anomalous transactions. LOF (Local Outlier Factor) is the best-performing model because it has the highest silhouette score (0.16), indicating it maintains a clear distinction between anomalies and normal transactions. Isolation Forest (IF) is a close second, performing slightly worse than LOF but still better than OCSVM.")