import streamlit as st
from datasets import load_dataset
import pandas as pd
import joblib
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
from sklearn.model_selection import train_test_split

# Cache expensive startup work: st.cache_data memoizes serializable results
# (the DataFrame), while st.cache_resource keeps one shared instance of
# heavyweight objects (the fitted model and scaler) across reruns.
@st.cache_data
def load_data():
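    # load_dataset pulls the data from the Hugging Face Hub on first run and
    # reuses its local cache afterwards; the 'train' split is all this app uses.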
    dataset = load_dataset("Nooha/cc_fraud_detection_dataset")
    df = pd.DataFrame(dataset['train'])
    df = df.rename(columns={'Class': 'is_fraud'})
    return df

@st.cache_resource
def load_model():
    return joblib.load("cc_fraud_model.pkl")

@st.cache_resource
def load_scaler():
    return joblib.load("cc_fraud_scaler.pkl")

# Reference explanations for the model's input features (not currently shown
# in the UI; the prediction tab defines its own shorter descriptions).
feature_info = {
    "city_pop": "City Population - The number of residents in the city where the transaction took place. Example: 5000, 250000, 1000000.",
    "cc_num": "Credit Card Number (Anonymized) - A unique identifier for the credit card used. Example: 1234567890123456, 9876543210987654.",
    "unix_time": "Transaction Timestamp in Unix Time - Represents the time since January 1, 1970. Example: 1625097600 (2021-07-01 00:00:00 UTC).",
    "amt": "Transaction Amount - The amount spent in the transaction. Example: 5.99, 100.50, 999.99.",
    "acct_num": "Account Number (Anonymized) - A unique identifier for the linked bank account. Example: 1122334455, 9988776655.",
    "zip": "Zip Code of Transaction Location - The postal code where the transaction occurred. Example: 10001 (NY), 94105 (SF)."
}

def get_random_choices(df, feature, num_choices=5):
    values = df[feature].dropna().unique()
    # Cap at the unique count so np.random.choice(replace=False) cannot raise.
    return np.random.choice(values, min(num_choices, len(values)), replace=False).tolist()

def main():
    st.title("💳 Credit Card Fraud Detection Application")
    st.write("⏳ **NOTE:** Data loading may take some time, as the dataset contains **2 million rows**. 📊")
    st.write("✅ Worry not! Once loaded, the dataset and models are **cached** for faster access next time. 🚀")

    with st.expander("🔍 **About This Application**", expanded=False):
        st.markdown("""
            This application is designed to help you detect fraudulent credit card transactions using machine learning. 🚀
            It uses the **Nooha/cc_fraud_detection_dataset** from Hugging Face, which contains anonymized credit card transactions.
        """)

    with st.expander("⚠️ **Why Fraud Detection Matters**", expanded=False):
        st.markdown("""
            💰 Credit card fraud is a significant problem for the financial industry, costing billions of dollars annually.
            Detecting fraudulent transactions in real time is crucial to prevent financial losses and protect customers. 🔍
            This app demonstrates how machine learning can be used to identify suspicious transactions.
        """)

    with st.expander("⚙️ **How It Works**", expanded=False):
        st.markdown("""
            🛠 **Features of this application:**
            1. 📊 **Dataset Preview**: Explore the dataset used to train the model.
            2. 📈 **Model Performance**: Evaluate the trained model using accuracy, a classification report, and a confusion matrix.
            3. 🔎 **Test Prediction**: Input transaction details and get a real-time prediction of whether the transaction is fraudulent or legitimate.

            ✅ Let's get started!
        """)

    df = load_data()
    model = load_model()
    scaler = load_scaler()

    numeric_df = df.select_dtypes(include=['number'])
    X = numeric_df.drop(columns=['is_fraud'])
    y = numeric_df['is_fraud']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
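    # Only transform() here, never fit_transform(): the scaler was fitted at
    # training time, and refitting now would leak test-set statistics. This
    # assumes the same 80/20 split (random_state=42) was used when the model
    # was trained; with classes this imbalanced, stratify=y is also worth
    # considering.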
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    tab1, tab2, tab3 = st.tabs(["📄 Dataset Preview", "📊 Model Performance", "🔍 Fraud Prediction"])
    
    with tab1:
        st.header("📄 Dataset Overview")
        col1, col2 = st.columns(2)
        with col1:
            st.dataframe(df.head(20))
        with col2:
            st.metric("🛒 Total Transactions", f"{len(df):,}")
            st.metric("🚨 Fraudulent Transactions", f"{df['is_fraud'].sum():,} ({df['is_fraud'].mean() * 100:.2f}%)")
            
        # Aggregate to per-class counts before charting: Altair embeds the data
        # in the chart spec and rejects frames above 5,000 rows by default
        # (MaxRowsError), which a multi-million-row dataset would trigger.
        fraud_counts = df.groupby('is_fraud').size().reset_index(name='num_transactions')
        chart = alt.Chart(fraud_counts).mark_bar().encode(
            x=alt.X('is_fraud:O', title='Fraud Status'),
            y=alt.Y('num_transactions:Q', title='Count'),
            color=alt.Color('is_fraud:N', scale=alt.Scale(domain=[0, 1], range=['green', 'red']))
        )
        st.altair_chart(chart, use_container_width=True)
    
    with tab2:
        st.header("📊 Model Performance")
        y_pred = model.predict(X_test_scaled)
        accuracy = accuracy_score(y_test, y_pred)
        st.metric("🎯 Model Accuracy", f"{accuracy:.4f}")
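        # With fraud being a small fraction of transactions, accuracy alone is
        # misleading (always predicting "not fraud" would still score high), so
        # the per-class precision/recall report below is the more telling view.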
        
        report_dict = classification_report(y_test, y_pred, target_names=['Not Fraud', 'Fraud'], output_dict=True)
        report_df = pd.DataFrame(report_dict).T.round(3)
        st.dataframe(report_df.style.format("{:.3f}"))
        
        cm = confusion_matrix(y_test, y_pred)
        fig, ax = plt.subplots()
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Not Fraud', 'Fraud'], yticklabels=['Not Fraud', 'Fraud'], ax=ax)
        ax.set_xlabel("Predicted")
        ax.set_ylabel("Actual")
        st.pyplot(fig)
    
    with tab3:
        st.header("🔍 Fraud Prediction")
        st.markdown("💡 Select transaction details below.")

        # Define feature descriptions
        feature_descriptions = {
            "acct_num": "๐Ÿ“Œ **Account Number** - Unique identifier for the transaction account.",
            "amt": "๐Ÿ’ฐ **Transaction Amount** - The total amount involved in the transaction.",
            "unix_time": "โณ **Unix Timestamp** - The time when the transaction occurred (in Unix format).",
            "zip": "๐Ÿ“ฎ **ZIP Code** - Postal code for the transaction location.",
            "city_pop": "๐ŸŒ† **City Population** - The number of residents in the city where the transaction took place.",
            "cc_num": "๐Ÿ’ณ **Credit Card Number** - Anonymized credit card number used for the transaction."
        }

        available_features = X.columns.tolist()
        
        # Feature selection UI
        selected_features = st.multiselect("🎛️ Select Features to Use", available_features, default=available_features[:3])
        
        # Display descriptions of selected features
        for feature in selected_features:
            st.markdown(feature_descriptions.get(feature, "ℹ️ No description available for this feature."))

        # Default every model feature to 0 so the scaler always receives the
        # full column set (0 is only a placeholder; unselected features keep it).
        # The features chosen above are overridden by the selectboxes below.
        input_data = {feature: 0 for feature in X.columns}

        col1, col2 = st.columns(2)
        for i, feature in enumerate(selected_features):
            choices = get_random_choices(df, feature)
            with (col1 if i % 2 == 0 else col2):
                input_data[feature] = st.selectbox(f"🔢 {feature}", choices)

        if st.button("🚀 Predict Fraudulence"):
            input_df = pd.DataFrame([input_data])
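            # input_data was keyed by X.columns in order, so this single-row
            # frame matches the feature order the scaler and model expect.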
            input_scaled = scaler.transform(input_df)
            prediction = model.predict(input_scaled)
            confidence = model.predict_proba(input_scaled)[0]
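            # For a scikit-learn classifier, predict_proba returns one probability
            # per class (ordered as model.classes_, typically [not fraud, fraud]);
            # max(confidence) is therefore the probability of the predicted class.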

            st.subheader("🧐 Prediction Result")
            if prediction[0] == 1:
                st.toast("🚨 Fraudulent Transaction Detected! 🔴", icon='⚠️')
                st.error("This transaction is likely fraudulent.")
            else:
                st.toast("✅ Legitimate Transaction 🟢", icon='✔️')
                st.success("This transaction appears legitimate.")

            st.progress(int(max(confidence) * 100))
            st.write(f"🎯 **Confidence:** {max(confidence) * 100:.2f}%")


if __name__ == "__main__":
    main()