# Hugging Face Space (status when captured: Running)
import altair as alt
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st
from datasets import load_dataset
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Cache the dataset so it is not reloaded on every Streamlit rerun.
@st.cache_data
def load_data():
    """Load the fraud dataset's ``train`` split as a DataFrame.

    The ``Class`` column is renamed to ``is_fraud``, which is the label
    name the rest of the app uses. The result is wrapped in
    ``st.cache_data`` so the download/conversion is not repeated on each
    script rerun.

    Returns:
        pandas.DataFrame built from ``dataset['train']``.
    """
    dataset = load_dataset("Nooha/cc_fraud_detection_dataset")
    df = pd.DataFrame(dataset['train'])
    df = df.rename(columns={'Class': 'is_fraud'})
    return df
@st.cache_resource
def load_model():
    """Load the trained classifier from ``cc_fraud_model.pkl``.

    Cached with ``st.cache_resource`` so the file is deserialised once
    instead of on every script rerun.

    NOTE(review): ``joblib.load`` unpickles the file; only ship model
    artifacts from a trusted source.
    """
    return joblib.load("cc_fraud_model.pkl")
@st.cache_resource
def load_scaler():
    """Load the fitted feature scaler from ``cc_fraud_scaler.pkl``.

    Cached with ``st.cache_resource`` so the file is deserialised once
    instead of on every script rerun.

    NOTE(review): ``joblib.load`` unpickles the file; only ship scaler
    artifacts from a trusted source.
    """
    return joblib.load("cc_fraud_scaler.pkl")
# Long-form explanations for the numeric input features, keyed by column name.
feature_info = {
    "city_pop": "City Population - The number of residents in the city where the transaction took place. Example: 5000, 250000, 1000000.",
    "cc_num": "Credit Card Number (Anonymized) - A unique identifier for the credit card used. Example: 1234567890123456, 9876543210987654.",
    "unix_time": "Transaction Timestamp in Unix Time - Represents the time since January 1, 1970. Example: 1625097600 (2021-07-01 00:00:00 UTC).",
    "amt": "Transaction Amount - The amount spent in the transaction. Example: 5.99, 100.50, 999.99.",
    "acct_num": "Account Number (Anonymized) - A unique identifier for the linked bank account. Example: 1122334455, 9988776655.",
    "zip": "Zip Code of Transaction Location - The postal code where the transaction occurred. Example: 10001 (NY), 94105 (SF).",
}
def get_random_choices(df, feature, num_choices=5):
    """Return up to ``num_choices`` distinct non-null values of a column.

    Values are drawn without replacement from the unique, non-null entries
    of ``df[feature]``. If the column has fewer unique values than
    requested, all of them are returned (in random order) instead of
    raising; an all-null or empty column yields an empty list.

    Args:
        df: DataFrame to sample from.
        feature: Name of the column in ``df``.
        num_choices: Maximum number of values to return.

    Returns:
        A list of sampled values converted via ``ndarray.tolist()``.
    """
    unique_values = df[feature].dropna().unique()
    # Sampling without replacement cannot draw more items than exist.
    sample_size = min(num_choices, len(unique_values))
    if sample_size <= 0:
        return []
    return np.random.choice(unique_values, sample_size, replace=False).tolist()
# NOTE(review): the emoji in the string literals below look mis-encoded
# (e.g. "๐ณ"). They are reproduced exactly as found; confirm against the
# original file, in particular the `icon=` arguments passed to st.toast.


def _render_intro():
    """Render the page title, the loading notice and the three info expanders."""
    st.title("๐ณ Credit Card Fraud Detection Application")
    st.write("โณ **NOTE:** Data loading may take some time as it contains **2 million rows**. ๐")
    st.write("โ Worry not! Once loaded, the dataset and models are **cached** for faster access next time. ๐")
    with st.expander("๐ **About This Application**", expanded=False):
        st.markdown(
            "\n"
            "This application is designed to help you detect fraudulent credit card transactions using machine learning. ๐\n"
            "It uses the **Nooha/cc_fraud_detection_dataset** from Hugging Face, which contains anonymized credit card transactions.\n"
        )
    with st.expander("โ ๏ธ **Why Fraud Detection Matters**", expanded=False):
        st.markdown(
            "\n"
            "๐ฐ Credit card fraud is a significant issue in the financial industry, costing billions of dollars annually.\n"
            "Detecting fraudulent transactions in real-time is crucial to prevent financial losses and protect customers. ๐\n"
            "This app demonstrates how machine learning can be used to identify suspicious transactions.\n"
        )
    with st.expander("โ๏ธ **How It Works**", expanded=False):
        st.markdown(
            "\n"
            "๐ **Features of this application:**\n"
            "1. ๐ **Dataset Preview**: Explore the dataset used to train the model.\n"
            "2. ๐ **Model Performance**: Evaluate the performance of the trained model using accuracy, classification reports, and a confusion matrix.\n"
            "3. ๐ **Test Prediction**: Input transaction details and get real-time predictions on whether the transaction is fraudulent or legitimate.\n"
            "โ Let's get started!\n"
        )


def _render_dataset_tab(df):
    """Render the dataset preview, headline metrics and the class-balance chart."""
    st.header("๐ Dataset Overview")
    col1, col2 = st.columns(2)
    with col1:
        st.dataframe(df.head(20))
    with col2:
        st.metric("๐ Total Transactions", f"{len(df):,}")
        st.metric("๐จ Fraudulent Transactions", f"{df['is_fraud'].sum():,} ({df['is_fraud'].mean() * 100:.2f}%)")
        # Aggregate before charting so only one row per class is sent to the
        # browser instead of the whole transaction table.
        class_counts = df.groupby('is_fraud').size().reset_index(name='count')
        chart = alt.Chart(class_counts).mark_bar().encode(
            x=alt.X('is_fraud:O', title='Fraud Status'),
            y=alt.Y('count:Q', title='Count'),
            color=alt.Color('is_fraud:N', scale=alt.Scale(domain=[0, 1], range=['green', 'red'])),
        )
        st.altair_chart(chart, use_container_width=True)


def _render_performance_tab(model, X_test_scaled, y_test):
    """Render accuracy, the classification report and the confusion matrix."""
    st.header("๐ Model Performance")
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    st.metric("๐ฏ Model Accuracy", f"{accuracy:.4f}")
    report_dict = classification_report(y_test, y_pred, target_names=['Not Fraud', 'Fraud'], output_dict=True)
    report_df = pd.DataFrame(report_dict).T.round(3)
    st.dataframe(report_df.style.format("{:.3f}"))
    cm = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots()
    # Draw on the explicit axes rather than relying on pyplot's "current" figure.
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=['Not Fraud', 'Fraud'],
        yticklabels=['Not Fraud', 'Fraud'],
        ax=ax,
    )
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    st.pyplot(fig)
    # The script reruns on every interaction; close the figure so pyplot does
    # not keep accumulating open figures.
    plt.close(fig)


def _render_prediction_tab(df, X, model, scaler):
    """Render the feature pickers and, on button press, the model's prediction."""
    st.header("๐ Fraud Prediction")
    st.markdown("๐ก Select transaction details below.")
    # Short descriptions shown under the feature multiselect.
    feature_descriptions = {
        "acct_num": "๐ **Account Number** - Unique identifier for the transaction account.",
        "amt": "๐ฐ **Transaction Amount** - The total amount involved in the transaction.",
        "unix_time": "โณ **Unix Timestamp** - The time when the transaction occurred (in Unix format).",
        "zip": "๐ฎ **ZIP Code** - Postal code for the transaction location.",
        "city_pop": "๐ **City Population** - The number of residents in the city where the transaction took place.",
        "cc_num": "๐ณ **Credit Card Number** - Anonymized credit card number used for the transaction.",
    }
    available_features = X.columns.tolist()
    # Feature selection UI
    selected_features = st.multiselect("๐๏ธ Select Features to Use", available_features, default=available_features[:3])
    # Display descriptions of selected features
    for feature in selected_features:
        st.markdown(feature_descriptions.get(feature, "โน๏ธ No description available for this feature."))
    # Start with every model column present (default 0) in X's column order;
    # selected features are overwritten below.
    input_data = {feature: 0 for feature in X.columns}
    col1, col2 = st.columns(2)
    for i, feature in enumerate(selected_features):
        # Draw the candidate values once per session. Re-sampling on every
        # rerun would change the selectbox options when the button is
        # clicked and silently discard the user's selection.
        state_key = f"choices_{feature}"
        if state_key not in st.session_state:
            st.session_state[state_key] = get_random_choices(df, feature)
        choices = st.session_state[state_key]
        with (col1 if i % 2 == 0 else col2):
            input_data[feature] = st.selectbox(f"๐ข {feature}", choices)
    if st.button("๐ Predict Fraudulence"):
        input_df = pd.DataFrame([input_data])
        input_scaled = scaler.transform(input_df)
        prediction = model.predict(input_scaled)
        confidence = model.predict_proba(input_scaled)[0]
        st.subheader("๐ง Prediction Result")
        if prediction[0] == 1:
            st.toast("๐จ Fraudulent Transaction Detected! ๐ด", icon='โ ๏ธ')
            st.error("This transaction is likely fraudulent.")
        else:
            st.toast("โ Legitimate Transaction ๐ข", icon='โ๏ธ')
            st.success("This transaction appears legitimate.")
        st.progress(int(max(confidence) * 100))
        st.write(f"๐ฏ **Confidence:** {max(confidence) * 100:.2f}%")


def main():
    """Build the Streamlit page: intro, dataset preview, model metrics, prediction form.

    Loads the dataset, model and scaler, holds out 20% of the numeric
    columns (``random_state=42``) for evaluation, and renders the three tabs.
    """
    _render_intro()
    df = load_data()
    model = load_model()
    scaler = load_scaler()
    numeric_df = df.select_dtypes(include=['number'])
    X = numeric_df.drop(columns=['is_fraud'])
    y = numeric_df['is_fraud']
    # Only the held-out part is used below, so the training part is discarded
    # instead of being scaled for nothing.
    _, X_test, _, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    X_test_scaled = scaler.transform(X_test)
    tab1, tab2, tab3 = st.tabs(["๐ Dataset Preview", "๐ Model Performance", "๐ Fraud Prediction"])
    with tab1:
        _render_dataset_tab(df)
    with tab2:
        _render_performance_tab(model, X_test_scaled, y_test)
    with tab3:
        _render_prediction_tab(df, X, model, scaler)
# Run the app when the file is executed as a script (e.g. `streamlit run`).
if __name__ == "__main__":
    main()