import streamlit as st
from datasets import load_dataset
import pandas as pd
import joblib
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
from sklearn.model_selection import train_test_split

# Cache expensive startup work: st.cache_data memoizes serializable results
# (the DataFrame), while st.cache_resource keeps one shared instance of
# heavyweight objects (the fitted model and scaler) across reruns.
@st.cache_data
def load_data():
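    # load_dataset pulls the data from the Hugging Face Hub on first run and
    # reuses its local cache afterwards; the 'train' split is all this app uses.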
    dataset = load_dataset("Nooha/cc_fraud_detection_dataset")
    df = pd.DataFrame(dataset['train'])
    df = df.rename(columns={'Class': 'is_fraud'})
    return df

@st.cache_resource
def load_model():
    return joblib.load("cc_fraud_model.pkl")

@st.cache_resource
def load_scaler():
    return joblib.load("cc_fraud_scaler.pkl")

# Reference explanations for the model's input features (not currently shown
# in the UI; the prediction tab defines its own shorter descriptions).
feature_info = {
    "city_pop": "City Population - The number of residents in the city where the transaction took place. Example: 5000, 250000, 1000000.",
    "cc_num": "Credit Card Number (Anonymized) - A unique identifier for the credit card used. Example: 1234567890123456, 9876543210987654.",
    "unix_time": "Transaction Timestamp in Unix Time - Represents the time since January 1, 1970. Example: 1625097600 (2021-07-01 00:00:00 UTC).",
    "amt": "Transaction Amount - The amount spent in the transaction. Example: 5.99, 100.50, 999.99.",
    "acct_num": "Account Number (Anonymized) - A unique identifier for the linked bank account. Example: 1122334455, 9988776655.",
    "zip": "Zip Code of Transaction Location - The postal code where the transaction occurred. Example: 10001 (NY), 94105 (SF)."
}

def get_random_choices(df, feature, num_choices=5):
    values = df[feature].dropna().unique()
    # Cap at the unique count so np.random.choice(replace=False) cannot raise.
    return np.random.choice(values, min(num_choices, len(values)), replace=False).tolist()

def main():
    st.title("💳 Credit Card Fraud Detection Application")
    st.write("⏳ **NOTE:** Data loading may take some time, as the dataset contains **2 million rows**. 📊")
    st.write("✅ Worry not! Once loaded, the dataset and models are **cached** for faster access next time. 🚀")

    with st.expander("🔍 **About This Application**", expanded=False):
        st.markdown("""
            This application is designed to help you detect fraudulent credit card transactions using machine learning. 🚀
            It uses the **Nooha/cc_fraud_detection_dataset** from Hugging Face, which contains anonymized credit card transactions.
        """)

    with st.expander("⚠️ **Why Fraud Detection Matters**", expanded=False):
        st.markdown("""
            💰 Credit card fraud is a significant problem for the financial industry, costing billions of dollars annually.
            Detecting fraudulent transactions in real time is crucial to prevent financial losses and protect customers. 🔍
            This app demonstrates how machine learning can be used to identify suspicious transactions.
        """)

    with st.expander("⚙️ **How It Works**", expanded=False):
        st.markdown("""
            🛠 **Features of this application:**
            1. 📊 **Dataset Preview**: Explore the dataset used to train the model.
            2. 📈 **Model Performance**: Evaluate the trained model using accuracy, a classification report, and a confusion matrix.
            3. 🔎 **Test Prediction**: Input transaction details and get a real-time prediction of whether the transaction is fraudulent or legitimate.

            ✅ Let's get started!
        """)

    df = load_data()
    model = load_model()
    scaler = load_scaler()

    numeric_df = df.select_dtypes(include=['number'])
    X = numeric_df.drop(columns=['is_fraud'])
    y = numeric_df['is_fraud']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
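    # Only transform() here, never fit_transform(): the scaler was fitted at
    # training time, and refitting now would leak test-set statistics. This
    # assumes the same 80/20 split (random_state=42) was used when the model
    # was trained; with classes this imbalanced, stratify=y is also worth
    # considering.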
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    tab1, tab2, tab3 = st.tabs(["📄 Dataset Preview", "📊 Model Performance", "🔍 Fraud Prediction"])
    
    with tab1:
        st.header("📄 Dataset Overview")
        col1, col2 = st.columns(2)
        with col1:
            st.dataframe(df.head(20))
        with col2:
            st.metric("🛒 Total Transactions", f"{len(df):,}")
            st.metric("🚨 Fraudulent Transactions", f"{df['is_fraud'].sum():,} ({df['is_fraud'].mean() * 100:.2f}%)")
            
        # Aggregate to per-class counts before charting: Altair embeds the data
        # in the chart spec and rejects frames above 5,000 rows by default
        # (MaxRowsError), which a multi-million-row dataset would trigger.
        fraud_counts = df.groupby('is_fraud').size().reset_index(name='num_transactions')
        chart = alt.Chart(fraud_counts).mark_bar().encode(
            x=alt.X('is_fraud:O', title='Fraud Status'),
            y=alt.Y('num_transactions:Q', title='Count'),
            color=alt.Color('is_fraud:N', scale=alt.Scale(domain=[0, 1], range=['green', 'red']))
        )
        st.altair_chart(chart, use_container_width=True)
    
    with tab2:
        st.header("📊 Model Performance")
        y_pred = model.predict(X_test_scaled)
        accuracy = accuracy_score(y_test, y_pred)
        st.metric("🎯 Model Accuracy", f"{accuracy:.4f}")
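        # With fraud being a small fraction of transactions, accuracy alone is
        # misleading (always predicting "not fraud" would still score high), so
        # the per-class precision/recall report below is the more telling view.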
        
        report_dict = classification_report(y_test, y_pred, target_names=['Not Fraud', 'Fraud'], output_dict=True)
        report_df = pd.DataFrame(report_dict).T.round(3)
        st.dataframe(report_df.style.format("{:.3f}"))
        
        cm = confusion_matrix(y_test, y_pred)
        fig, ax = plt.subplots()
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Not Fraud', 'Fraud'], yticklabels=['Not Fraud', 'Fraud'], ax=ax)
        ax.set_xlabel("Predicted")
        ax.set_ylabel("Actual")
        st.pyplot(fig)
    
    with tab3:
        st.header("🔍 Fraud Prediction")
        st.markdown("💡 Select transaction details below.")

        # Define feature descriptions
        feature_descriptions = {
            "acct_num": "๐Ÿ“Œ **Account Number** - Unique identifier for the transaction account.",
            "amt": "๐Ÿ’ฐ **Transaction Amount** - The total amount involved in the transaction.",
            "unix_time": "โณ **Unix Timestamp** - The time when the transaction occurred (in Unix format).",
            "zip": "๐Ÿ“ฎ **ZIP Code** - Postal code for the transaction location.",
            "city_pop": "๐ŸŒ† **City Population** - The number of residents in the city where the transaction took place.",
            "cc_num": "๐Ÿ’ณ **Credit Card Number** - Anonymized credit card number used for the transaction."
        }

        available_features = X.columns.tolist()
        
        # Feature selection UI
        selected_features = st.multiselect("🎛️ Select Features to Use", available_features, default=available_features[:3])
        
        # Display descriptions of selected features
        for feature in selected_features:
            st.markdown(feature_descriptions.get(feature, "ℹ️ No description available for this feature."))

        # Default every model feature to 0 so the scaler always receives the
        # full column set (0 is only a placeholder; unselected features keep it).
        # The features chosen above are overridden by the selectboxes below.
        input_data = {feature: 0 for feature in X.columns}

        col1, col2 = st.columns(2)
        for i, feature in enumerate(selected_features):
            choices = get_random_choices(df, feature)
            with (col1 if i % 2 == 0 else col2):
                input_data[feature] = st.selectbox(f"🔢 {feature}", choices)

        if st.button("🚀 Predict Fraudulence"):
            input_df = pd.DataFrame([input_data])
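            # input_data was keyed by X.columns in order, so this single-row
            # frame matches the feature order the scaler and model expect.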
            input_scaled = scaler.transform(input_df)
            prediction = model.predict(input_scaled)
            confidence = model.predict_proba(input_scaled)[0]
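            # For a scikit-learn classifier, predict_proba returns one probability
            # per class (ordered as model.classes_, typically [not fraud, fraud]);
            # max(confidence) is therefore the probability of the predicted class.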

            st.subheader("🧐 Prediction Result")
            if prediction[0] == 1:
                st.toast("🚨 Fraudulent Transaction Detected! 🔴", icon='⚠️')
                st.error("This transaction is likely fraudulent.")
            else:
                st.toast("✅ Legitimate Transaction 🟢", icon='✔️')
                st.success("This transaction appears legitimate.")

            st.progress(int(max(confidence) * 100))
            st.write(f"🎯 **Confidence:** {max(confidence) * 100:.2f}%")


if __name__ == "__main__":
    main()