Spaces:
Running
Running
File size: 8,108 Bytes
f460ec4 818bac1 f460ec4 818bac1 f460ec4 818bac1 f460ec4 818bac1 24d08c5 f460ec4 818bac1 f460ec4 818bac1 f460ec4 1758206 818bac1 f460ec4 818bac1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 |
import streamlit as st
from datasets import load_dataset
import pandas as pd
import joblib
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# Cache the dataset and model to avoid reloading on every visit
@st.cache_data
def load_data():
    """Fetch the fraud dataset and return its ``train`` split as a DataFrame.

    The Hugging Face dataset ``Nooha/cc_fraud_detection_dataset`` is loaded,
    its ``train`` split is converted to a pandas DataFrame, and a column
    named ``Class`` (when present) is renamed to ``is_fraud``.

    Returns:
        pandas.DataFrame: the ``train`` split with the renamed label column.
    """
    splits = load_dataset("Nooha/cc_fraud_detection_dataset")
    # rename() ignores missing labels, so this is a no-op when the column
    # is already called ``is_fraud``.
    frame = pd.DataFrame(splits['train']).rename(columns={'Class': 'is_fraud'})
    return frame
@st.cache_resource
def load_model():
    """Deserialize and return the model stored in ``cc_fraud_model.pkl``.

    The path is relative to the process working directory.
    """
    model_path = "cc_fraud_model.pkl"
    loaded_model = joblib.load(model_path)
    return loaded_model
@st.cache_resource
def load_scaler():
    """Deserialize and return the scaler stored in ``cc_fraud_scaler.pkl``.

    The path is relative to the process working directory.
    """
    scaler_path = "cc_fraud_scaler.pkl"
    loaded_scaler = joblib.load(scaler_path)
    return loaded_scaler
# Human-readable explanation (with example values) for each feature,
# keyed by the dataset column name.
feature_info = dict(
    city_pop="City Population - The number of residents in the city where the transaction took place. Example: 5000, 250000, 1000000.",
    cc_num="Credit Card Number (Anonymized) - A unique identifier for the credit card used. Example: 1234567890123456, 9876543210987654.",
    unix_time="Transaction Timestamp in Unix Time - Represents the time since January 1, 1970. Example: 1625097600 (2021-07-01 00:00:00 UTC).",
    amt="Transaction Amount - The amount spent in the transaction. Example: 5.99, 100.50, 999.99.",
    acct_num="Account Number (Anonymized) - A unique identifier for the linked bank account. Example: 1122334455, 9988776655.",
    zip="Zip Code of Transaction Location - The postal code where the transaction occurred. Example: 10001 (NY), 94105 (SF).",
)
def get_random_choices(df, feature, num_choices=5):
    """Return up to ``num_choices`` distinct non-null values of a column.

    Values are sampled without replacement from the unique, non-null
    entries of ``df[feature]``.

    Args:
        df: DataFrame holding the column to sample from.
        feature: Name of the column.
        num_choices: Maximum number of values to return.

    Returns:
        list: Sampled values as plain Python objects. The list is shorter
        than ``num_choices`` when the column has fewer distinct values, and
        empty when it has none (or when ``num_choices`` is not positive).
    """
    unique_values = df[feature].dropna().unique()
    # Sampling without replacement cannot draw more items than exist, so
    # clamp the request instead of letting NumPy raise ValueError.
    sample_size = min(num_choices, len(unique_values))
    if sample_size <= 0:
        return []
    return np.random.choice(unique_values, sample_size, replace=False).tolist()
def _render_intro():
    """Render the page title, the loading notice and the three info expanders."""
    st.title("๐ณ Credit Card Fraud Detection Application")
    st.write("โณ **NOTE:** Data loading may take some time as it contains **2 million rows**. ๐")
    st.write("✅ Worry not! Once loaded, the dataset and models are **cached** for faster access next time. ๐")
    with st.expander("๐ **About This Application**", expanded=False):
        st.markdown("""
        This application is designed to help you detect fraudulent credit card transactions using machine learning. ๐
        It uses the **Nooha/cc_fraud_detection_dataset** from Hugging Face, which contains anonymized credit card transactions.
        """)
    with st.expander("โ ๏ธ **Why Fraud Detection Matters**", expanded=False):
        st.markdown("""
        ๐ฐ Credit card fraud is a significant issue in the financial industry, costing billions of dollars annually.
        Detecting fraudulent transactions in real-time is crucial to prevent financial losses and protect customers. ๐
        This app demonstrates how machine learning can be used to identify suspicious transactions.
        """)
    with st.expander("โ๏ธ **How It Works**", expanded=False):
        st.markdown("""
        ๐ **Features of this application:**
        1. ๐ **Dataset Preview**: Explore the dataset used to train the model.
        2. ๐ **Model Performance**: Evaluate the performance of the trained model using accuracy, classification reports, and a confusion matrix.
        3. ๐ **Test Prediction**: Input transaction details and get real-time predictions on whether the transaction is fraudulent or legitimate.
        ✅ Let's get started!
        """)


def _render_dataset_tab(df):
    """Render the first 20 rows, the headline metrics and the class-count chart."""
    st.header("๐ Dataset Overview")
    col1, col2 = st.columns(2)
    with col1:
        st.dataframe(df.head(20))
    with col2:
        st.metric("๐ Total Transactions", f"{len(df):,}")
        st.metric("๐จ Fraudulent Transactions", f"{df['is_fraud'].sum():,} ({df['is_fraud'].mean() * 100:.2f}%)")
        # Count the classes up front so the chart receives one row per
        # class instead of the entire transaction table.
        fraud_counts = (
            df['is_fraud']
            .value_counts()
            .rename_axis('is_fraud')
            .reset_index(name='count')
        )
        chart = alt.Chart(fraud_counts).mark_bar().encode(
            x=alt.X('is_fraud:O', title='Fraud Status'),
            y=alt.Y('count:Q', title='Count'),
            color=alt.Color('is_fraud:N', scale=alt.Scale(domain=[0, 1], range=['green', 'red'])),
        )
        # NOTE(review): drawing the chart inside the right-hand column is
        # assumed; confirm against the intended layout.
        st.altair_chart(chart, use_container_width=True)


def _render_performance_tab(model, X_test_scaled, y_test):
    """Render accuracy, the classification report and the confusion matrix."""
    st.header("๐ Model Performance")
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    st.metric("๐ฏ Model Accuracy", f"{accuracy:.4f}")
    report_dict = classification_report(y_test, y_pred, target_names=['Not Fraud', 'Fraud'], output_dict=True)
    report_df = pd.DataFrame(report_dict).T.round(3)
    st.dataframe(report_df.style.format("{:.3f}"))
    cm = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots()
    # Draw on the explicit Axes rather than pyplot's implicit "current"
    # figure, so the labels always land on the figure that is displayed.
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=['Not Fraud', 'Fraud'],
        yticklabels=['Not Fraud', 'Fraud'],
        ax=ax,
    )
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    st.pyplot(fig)
    # pyplot keeps every figure it creates alive until closed; without this
    # each rerun of the script would leave another figure in memory.
    plt.close(fig)


def _render_prediction_tab(df, feature_columns, model, scaler):
    """Render the feature pickers and, on button press, the prediction result.

    Features the user does not select keep the default value 0.
    """
    st.header("๐ Fraud Prediction")
    st.markdown("๐ก Select transaction details below.")
    # Define feature descriptions
    feature_descriptions = {
        "acct_num": "๐ **Account Number** - Unique identifier for the transaction account.",
        "amt": "๐ฐ **Transaction Amount** - The total amount involved in the transaction.",
        "unix_time": "โณ **Unix Timestamp** - The time when the transaction occurred (in Unix format).",
        "zip": "๐ฎ **ZIP Code** - Postal code for the transaction location.",
        "city_pop": "๐ **City Population** - The number of residents in the city where the transaction took place.",
        "cc_num": "๐ณ **Credit Card Number** - Anonymized credit card number used for the transaction.",
    }
    # Feature selection UI
    selected_features = st.multiselect("๐๏ธ Select Features to Use", feature_columns, default=feature_columns[:3])
    # Display descriptions of selected features
    for feature in selected_features:
        st.markdown(feature_descriptions.get(feature, "โน๏ธ No description available for this feature."))

    # Every model input column must be present; unselected ones stay at 0.
    input_data = dict.fromkeys(feature_columns, 0)

    # Streamlit reruns the whole script on every interaction. Sampling new
    # options on each rerun would replace the select boxes (and the user's
    # picks) right when the Predict button is pressed, so the sampled
    # options are kept in session state and reused.
    if "feature_choices" not in st.session_state:
        st.session_state["feature_choices"] = {}
    choices_by_feature = st.session_state["feature_choices"]

    col1, col2 = st.columns(2)
    for i, feature in enumerate(selected_features):
        if feature not in choices_by_feature:
            choices_by_feature[feature] = get_random_choices(df, feature)
        target_col = col1 if i % 2 == 0 else col2
        with target_col:
            input_data[feature] = st.selectbox(f"๐ข {feature}", choices_by_feature[feature])

    if st.button("๐ Predict Fraudulence"):
        # Pin the column order to the order the scaler/model were given.
        input_df = pd.DataFrame([input_data], columns=feature_columns)
        input_scaled = scaler.transform(input_df)
        prediction = model.predict(input_scaled)
        confidence = model.predict_proba(input_scaled)[0]
        st.subheader("๐ง Prediction Result")
        # st.toast requires ``icon`` to be a single valid emoji.
        if prediction[0] == 1:
            st.toast("๐จ Fraudulent Transaction Detected! ๐ด", icon='⚠️')
            st.error("This transaction is likely fraudulent.")
        else:
            st.toast("✅ Legitimate Transaction ๐ข", icon='✔️')
            st.success("This transaction appears legitimate.")
        st.progress(int(max(confidence) * 100))
        st.write(f"๐ฏ **Confidence:** {max(confidence) * 100:.2f}%")


def main():
    """Build the Streamlit page: intro, dataset preview, evaluation, prediction.

    Loads the dataset, model and scaler through the cached loaders, keeps
    only numeric columns, splits off 20% of the rows (``random_state=42``)
    for the performance tab, and renders the three tabs.
    """
    _render_intro()

    df = load_data()
    model = load_model()
    scaler = load_scaler()

    numeric_df = df.select_dtypes(include=['number'])
    X = numeric_df.drop(columns=['is_fraud'])
    y = numeric_df['is_fraud']
    # Only the test portion is used below, so the training portion is
    # neither kept nor scaled.
    _, X_test, _, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    X_test_scaled = scaler.transform(X_test)

    tab1, tab2, tab3 = st.tabs(["๐ Dataset Preview", "๐ Model Performance", "๐ Fraud Prediction"])
    with tab1:
        _render_dataset_tab(df)
    with tab2:
        _render_performance_tab(model, X_test_scaled, y_test)
    with tab3:
        _render_prediction_tab(df, X.columns.tolist(), model, scaler)
# Build the page only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()
|