Spaces:

Markndrei
/

fraud_detection_model

Running

App Files Community

Markndrei committed on Mar 3

Commit

818bac1

verified ·

1 Parent(s): f460ec4

Update app.py

Browse files

Files changed (1) hide show

app.py +163 -76

app.py CHANGED Viewed

@@ -1,85 +1,172 @@
 import streamlit as st
 import pandas as pd
 import numpy as np
-from sklearn.ensemble import RandomForestClassifier
 from sklearn.model_selection import train_test_split
-from sklearn.metrics import accuracy_score, classification_report
-from datasets import load_dataset
-# Load dataset from Hugging Face
-dataset = load_dataset("Nooha/cc_fraud_detection_dataset", split="train")
-df = pd.DataFrame(dataset)
-# Select relevant features and target variable
-X = df[['Amount', 'Time', 'V1', 'V2', 'V3']]
-y = df['Class']
-# Split dataset into training and testing sets
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
-# Train a RandomForestClassifier model
-model = RandomForestClassifier(n_estimators=100, random_state=42)
-model.fit(X_train, y_train)
-y_pred = model.predict(X_test)
-# Model Performance Metrics
-accuracy = accuracy_score(y_test, y_pred)
-class_report_df = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).transpose()
-# Application Title
-st.title('💳 Credit Card Fraud Detection System')
-st.markdown(
-    """
-    ## 📖 Introduction
-    Welcome to the **Credit Card Fraud Detection System**! This tool analyzes credit card transactions to detect fraudulent activity using a **Random Forest model**.
-    """
-)
-# Tab Structure
-tab1, tab2, tab3 = st.tabs(['📊 Dataset Preview', '📈 Model Performance', '🔍 Fraud Prediction'])
-# Dataset Preview
-with tab1:
-    st.markdown(
-        """
-        ## 📊 Dataset Preview
-        Below is a sample of the credit card transaction dataset used for fraud detection.
-        """
-    )
-    st.dataframe(df.head())
-# Model Performance
-with tab2:
-    st.markdown(
-        """
-        ## 📈 Model Performance
-        - **Accuracy:** Measures overall model performance.
-        - **Classification Report:** Precision, recall, and F1-score breakdown.
-        """
-    )
-    st.write(f"**📌 Model Accuracy:** {accuracy:.2%}")
-    st.markdown("### 📋 Classification Report")
-    st.dataframe(class_report_df)
-# Fraud Prediction
-with tab3:
-    st.markdown("""
-        ## 🔍 Fraud Prediction
-        Enter transaction details below to predict if it's fraudulent.
         """)
-    amount_input = st.number_input("💵 Transaction Amount", min_value=0.0, value=100.0, step=1.0)
-    time_input = st.number_input("⏳ Transaction Time", min_value=0.0, value=50000.0, step=1000.0)
-    v1_input = st.number_input("🔢 Feature V1", value=0.0, step=0.1)
-    v2_input = st.number_input("🔢 Feature V2", value=0.0, step=0.1)
-    v3_input = st.number_input("🔢 Feature V3", value=0.0, step=0.1)
-    if st.button("🔎 Predict Fraud"):
-        input_data = np.array([[amount_input, time_input, v1_input, v2_input, v3_input]])
-        prediction = model.predict(input_data)[0]
-        result = "🚨 Fraudulent" if prediction == 1 else "✅ Legitimate"
-        st.success(f"### 🎯 Prediction: **{result}**")

 import streamlit as st
+from datasets import load_dataset
 import pandas as pd
+import joblib
 import numpy as np
+from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
+import matplotlib.pyplot as plt
+import seaborn as sns
+import altair as alt
+from sklearn.preprocessing import StandardScaler
 from sklearn.model_selection import train_test_split
+# Cache the dataset and model to avoid reloading on every visit
+@st.cache_data
+def load_data():
+    """Return the training split of the fraud dataset as a DataFrame.
+
+    Fetches "Nooha/cc_fraud_detection_dataset" via ``load_dataset``, converts
+    its ``train`` split to pandas, and renames a ``Class`` column to
+    ``is_fraud``. Wrapped in ``st.cache_data`` so Streamlit reuses the result
+    instead of reloading on every rerun.
+    """
+    train_split = load_dataset("Nooha/cc_fraud_detection_dataset")['train']
+    frame = pd.DataFrame(train_split)
+    # The rest of the app addresses the label column as 'is_fraud'.
+    return frame.rename(columns={'Class': 'is_fraud'})
+@st.cache_resource
+def load_model():
+    """Load the serialized classifier from ``cc_fraud_model.pkl``.
+
+    Wrapped in ``st.cache_resource`` so the object is deserialized once and
+    shared across reruns. NOTE(review): ``joblib.load`` unpickles the file,
+    so the artifact must come from a trusted source.
+    """
+    model_path = "cc_fraud_model.pkl"
+    return joblib.load(model_path)
+@st.cache_resource
+def load_scaler():
+    """Load the serialized feature scaler from ``cc_fraud_scaler.pkl``.
+
+    Wrapped in ``st.cache_resource`` so the object is deserialized once and
+    shared across reruns. NOTE(review): ``joblib.load`` unpickles the file,
+    so the artifact must come from a trusted source.
+    """
+    scaler_path = "cc_fraud_scaler.pkl"
+    return joblib.load(scaler_path)
+# Feature explanations: maps a feature name to a plain-text description with
+# example values.
+# NOTE(review): nothing in the visible code reads this dict; main() builds its
+# own `feature_descriptions` with the same keys. Confirm whether this is still
+# needed before relying on it.
+feature_info = {
+    "city_pop": "City Population - The number of residents in the city where the transaction took place. Example: 5000, 250000, 1000000.",
+    "cc_num": "Credit Card Number (Anonymized) - A unique identifier for the credit card used. Example: 1234567890123456, 9876543210987654.",
+    "unix_time": "Transaction Timestamp in Unix Time - Represents the time since January 1, 1970. Example: 1625097600 (2021-07-01 00:00:00 UTC).",
+    "amt": "Transaction Amount - The amount spent in the transaction. Example: 5.99, 100.50, 999.99.",
+    "acct_num": "Account Number (Anonymized) - A unique identifier for the linked bank account. Example: 1122334455, 9988776655.",
+    "zip": "Zip Code of Transaction Location - The postal code where the transaction occurred. Example: 10001 (NY), 94105 (SF)."
+}
+def get_random_choices(df, feature, num_choices=5):
+    """Return up to ``num_choices`` distinct non-null values from a column.
+
+    Args:
+        df: DataFrame to sample from.
+        feature: Name of the column in ``df``.
+        num_choices: Maximum number of values to return.
+
+    Returns:
+        A list of distinct values drawn without replacement. It is shorter
+        than ``num_choices`` when the column has fewer unique non-null
+        values, and empty when it has none.
+    """
+    unique_values = df[feature].dropna().unique()
+    # Sampling without replacement raises ValueError if asked for more items
+    # than exist, so clamp the sample size to what the column can supply.
+    sample_size = max(0, min(num_choices, len(unique_values)))
+    if sample_size == 0:
+        return []
+    return np.random.choice(unique_values, sample_size, replace=False).tolist()
+def main():
+    """Render the Streamlit fraud-detection application.
+
+    Loads the cached dataset, model and scaler, then draws three tabs:
+    a dataset preview, model metrics computed on a 20% split of the numeric
+    columns (``random_state=42``), and a form that predicts whether a single
+    transaction is fraudulent.
+
+    NOTE(review): whether that 20% split matches the one used when the model
+    was trained cannot be confirmed from this file.
+    """
+    st.title("💳 Credit Card Fraud Detection Application")
+    with st.expander("🔍 **About This Application**", expanded=False):
+        st.markdown("""
+            This application is designed to help you detect fraudulent credit card transactions using machine learning. 🚀
+            It uses the **Nooha/cc_fraud_detection_dataset** from Hugging Face, which contains anonymized credit card transactions.
+        """)
+    with st.expander("⚠️ **Why Fraud Detection Matters**", expanded=False):
+        st.markdown("""
+            💰 Credit card fraud is a significant issue in the financial industry, costing billions of dollars annually.
+            Detecting fraudulent transactions in real-time is crucial to prevent financial losses and protect customers. 🔐
+            This app demonstrates how machine learning can be used to identify suspicious transactions.
+        """)
+    with st.expander("⚙️ **How It Works**", expanded=False):
+        st.markdown("""
+            🛠 **Features of this application:**
+            1. 📊 **Dataset Preview**: Explore the dataset used to train the model.
+            2. 📈 **Model Performance**: Evaluate the performance of the trained model using accuracy, classification reports, and a confusion matrix.
+            3. 🔎 **Test Prediction**: Input transaction details and get real-time predictions on whether the transaction is fraudulent or legitimate.
+            ✅ Let's get started!
         """)
+    df = load_data()
+    model = load_model()
+    scaler = load_scaler()
+    numeric_df = df.select_dtypes(include=['number'])
+    X = numeric_df.drop(columns=['is_fraud'])
+    y = numeric_df['is_fraud']
+    # Only the test portion is evaluated below, so the training portion is
+    # discarded instead of being scaled on every rerun for no purpose.
+    _, X_test, _, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+    X_test_scaled = scaler.transform(X_test)
+    tab1, tab2, tab3 = st.tabs(["📄 Dataset Preview", "📊 Model Performance", "🔍 Fraud Prediction"])
+    with tab1:
+        st.header("📄 Dataset Overview")
+        col1, col2 = st.columns(2)
+        with col1:
+            st.dataframe(df.head(20))
+        with col2:
+            st.metric("🛒 Total Transactions", f"{len(df):,}")
+            st.metric("🚨 Fraudulent Transactions", f"{df['is_fraud'].sum():,} ({df['is_fraud'].mean() * 100:.2f}%)")
+        chart = alt.Chart(df).mark_bar().encode(
+            x=alt.X('is_fraud:O', title='Fraud Status'),
+            y=alt.Y('count()', title='Count'),
+            color=alt.Color('is_fraud:N', scale=alt.Scale(domain=[0, 1], range=['green', 'red']))
+        )
+        st.altair_chart(chart, use_container_width=True)
+    with tab2:
+        st.header("📊 Model Performance")
+        y_pred = model.predict(X_test_scaled)
+        accuracy = accuracy_score(y_test, y_pred)
+        st.metric("🎯 Model Accuracy", f"{accuracy:.4f}")
+        report_dict = classification_report(y_test, y_pred, target_names=['Not Fraud', 'Fraud'], output_dict=True)
+        report_df = pd.DataFrame(report_dict).T.round(3)
+        st.dataframe(report_df.style.format("{:.3f}"))
+        cm = confusion_matrix(y_test, y_pred)
+        fig, ax = plt.subplots()
+        # Draw on the explicit Axes instead of pyplot's implicit "current"
+        # axes, so the plot never depends on global pyplot state.
+        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Not Fraud', 'Fraud'], yticklabels=['Not Fraud', 'Fraud'], ax=ax)
+        ax.set_xlabel("Predicted")
+        ax.set_ylabel("Actual")
+        st.pyplot(fig)
+        # Release the figure so reruns do not accumulate open figures.
+        plt.close(fig)
+    with tab3:
+        st.header("🔍 Fraud Prediction")
+        st.markdown("💡 Select transaction details below.")
+        # Define feature descriptions
+        feature_descriptions = {
+            "acct_num": "📌 **Account Number** - Unique identifier for the transaction account.",
+            "amt": "💰 **Transaction Amount** - The total amount involved in the transaction.",
+            "unix_time": "⏳ **Unix Timestamp** - The time when the transaction occurred (in Unix format).",
+            "zip": "📮 **ZIP Code** - Postal code for the transaction location.",
+            "city_pop": "🌆 **City Population** - The number of residents in the city where the transaction took place.",
+            "cc_num": "💳 **Credit Card Number** - Anonymized credit card number used for the transaction."
+        }
+        available_features = X.columns.tolist()
+        # Feature selection UI
+        selected_features = st.multiselect("🎛️ Select Features to Use", available_features, default=available_features[:3])
+        # Display descriptions of selected features
+        for feature in selected_features:
+            st.markdown(feature_descriptions.get(feature, "ℹ️ No description available for this feature."))
+        # Every model input starts at 0, in the column order the scaler was
+        # given above; selected features are overwritten below.
+        input_data = {feature: 0 for feature in available_features}
+        col1, col2 = st.columns(2)
+        for i, feature in enumerate(selected_features):
+            # Sample the options once per session. Streamlit reruns this
+            # function on every interaction, and re-sampling would swap the
+            # options (and the user's selection) when Predict is clicked.
+            choices_key = f"choices_{feature}"
+            if choices_key not in st.session_state:
+                st.session_state[choices_key] = get_random_choices(df, feature)
+            choices = st.session_state[choices_key]
+            with (col1 if i % 2 == 0 else col2):
+                selected_value = st.selectbox(f"🔢 {feature}", choices)
+            # With no options to offer, the select box yields None; keep the
+            # default of 0 in that case.
+            if selected_value is not None:
+                input_data[feature] = selected_value
+        if st.button("🚀 Predict Fraudulence"):
+            input_df = pd.DataFrame([input_data])
+            input_scaled = scaler.transform(input_df)
+            prediction = model.predict(input_scaled)
+            confidence = model.predict_proba(input_scaled)[0]
+            st.subheader("🧐 Prediction Result")
+            if prediction[0] == 1:
+                st.toast("🚨 Fraudulent Transaction Detected! 🔴", icon='⚠️')
+                st.error("This transaction is likely fraudulent.")
+            else:
+                st.toast("✅ Legitimate Transaction 🟢", icon='✔️')
+                st.success("This transaction appears legitimate.")
+            # Confidence is the highest class probability reported by the model.
+            st.progress(int(max(confidence) * 100))
+            st.write(f"🎯 **Confidence:** {max(confidence) * 100:.2f}%")
+# Script entry point: build the UI only when this file is run as the main
+# module, not when it is imported.
+if __name__ == "__main__":
+    main()