"""Streamlit app for exploring a credit card fraud dataset and a trained model.

The app has three tabs: a preview of the dataset, evaluation metrics of the
pre-trained model on a held-out split, and an interactive form that scores a
single transaction assembled from values sampled out of the dataset.
"""

import streamlit as st
from datasets import load_dataset
import pandas as pd
import joblib
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Name of the label column after normalisation in load_data().
TARGET_COLUMN = "is_fraud"

# Display names for the two label values, in label order (0, 1).
CLASS_NAMES = ["Not Fraud", "Fraud"]

# st.session_state key under which sampled selectbox options are kept.
_CHOICES_STATE_KEY = "feature_choices"


# Cache the dataset and model to avoid reloading on every visit
@st.cache_data
def load_data():
    """Load the ``train`` split of the fraud dataset as a DataFrame.

    A ``Class`` column, if present, is renamed to ``is_fraud`` so the rest of
    the app can rely on a single label name.
    """
    dataset = load_dataset("Nooha/cc_fraud_detection_dataset")
    # to_pandas() converts the Arrow table in bulk instead of iterating rows.
    df = dataset["train"].to_pandas()
    return df.rename(columns={"Class": TARGET_COLUMN})


@st.cache_resource
def load_model():
    """Load the trained classifier from ``cc_fraud_model.pkl``."""
    # NOTE: joblib.load unpickles the file; only load trusted artifacts.
    return joblib.load("cc_fraud_model.pkl")


@st.cache_resource
def load_scaler():
    """Load the fitted feature scaler from ``cc_fraud_scaler.pkl``."""
    # NOTE: joblib.load unpickles the file; only load trusted artifacts.
    return joblib.load("cc_fraud_scaler.pkl")


# Feature explanations
feature_info = {
    "city_pop": "City Population - The number of residents in the city where the transaction took place. Example: 5000, 250000, 1000000.",
    "cc_num": "Credit Card Number (Anonymized) - A unique identifier for the credit card used. Example: 1234567890123456, 9876543210987654.",
    "unix_time": "Transaction Timestamp in Unix Time - Represents the time since January 1, 1970. Example: 1625097600 (2021-07-01 00:00:00 UTC).",
    "amt": "Transaction Amount - The amount spent in the transaction. Example: 5.99, 100.50, 999.99.",
    "acct_num": "Account Number (Anonymized) - A unique identifier for the linked bank account. Example: 1122334455, 9988776655.",
    "zip": "Zip Code of Transaction Location - The postal code where the transaction occurred. Example: 10001 (NY), 94105 (SF).",
}

# Markdown shown next to each feature selected in the prediction tab.
FEATURE_DESCRIPTIONS = {
    "acct_num": "📌 **Account Number** - Unique identifier for the transaction account.",
    "amt": "💰 **Transaction Amount** - The total amount involved in the transaction.",
    "unix_time": "⏳ **Unix Timestamp** - The time when the transaction occurred (in Unix format).",
    "zip": "📮 **ZIP Code** - Postal code for the transaction location.",
    "city_pop": "🌆 **City Population** - The number of residents in the city where the transaction took place.",
    "cc_num": "💳 **Credit Card Number** - Anonymized credit card number used for the transaction.",
}

ABOUT_MD = (
    "This application is designed to help you detect fraudulent credit card "
    "transactions using machine learning. "
    "🚀 It uses the **Nooha/cc_fraud_detection_dataset** from Hugging Face, "
    "which contains anonymized credit card transactions."
)

WHY_IT_MATTERS_MD = (
    "💰 Credit card fraud is a significant issue in the financial industry, "
    "costing billions of dollars annually. "
    "Detecting fraudulent transactions in real-time is crucial to prevent "
    "financial losses and protect customers. "
    "🔐 This app demonstrates how machine learning can be used to identify "
    "suspicious transactions."
)

# Each list item sits on its own line so Markdown renders a numbered list.
HOW_IT_WORKS_MD = (
    "🛠 **Features of this application:**\n\n"
    "1. 📊 **Dataset Preview**: Explore the dataset used to train the model.\n"
    "2. 📈 **Model Performance**: Evaluate the performance of the trained model "
    "using accuracy, classification reports, and a confusion matrix.\n"
    "3. 🔎 **Test Prediction**: Input transaction details and get real-time "
    "predictions on whether the transaction is fraudulent or legitimate.\n\n"
    "✅ Let's get started!"
)


def get_random_choices(df, feature, num_choices=5):
    """Return up to ``num_choices`` distinct non-null values of a column.

    Args:
        df: DataFrame to sample from.
        feature: Name of the column in ``df``.
        num_choices: Maximum number of values to return.

    Returns:
        A list of distinct values drawn at random without replacement. It is
        shorter than ``num_choices`` when the column has fewer distinct
        non-null values, and empty when it has none.
    """
    unique_values = df[feature].dropna().unique()
    # Sampling without replacement cannot draw more items than exist.
    sample_size = min(num_choices, len(unique_values))
    if sample_size <= 0:
        return []
    return np.random.choice(unique_values, sample_size, replace=False).tolist()


def _feature_columns(df):
    """Return the names of the numeric columns of ``df`` other than the label."""
    numeric_columns = df.select_dtypes(include=["number"]).columns
    return [column for column in numeric_columns if column != TARGET_COLUMN]


@st.cache_data
def _evaluate_model():
    """Score the model on a held-out 20% split and return its metrics.

    Cached because Streamlit reruns the script on every interaction, and the
    split/scale/predict pass over the dataset does not change between reruns.

    Returns:
        A tuple ``(accuracy, report_dict, cm)``: the accuracy score, the
        classification report as a dict, and the confusion matrix.
    """
    df = load_data()
    features = df[_feature_columns(df)]
    labels = df[TARGET_COLUMN]
    # Fixed random_state keeps the evaluation split reproducible.
    _, X_test, _, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=42
    )
    y_pred = load_model().predict(load_scaler().transform(X_test))

    accuracy = accuracy_score(y_test, y_pred)
    report_dict = classification_report(
        y_test, y_pred, target_names=CLASS_NAMES, output_dict=True
    )
    cm = confusion_matrix(y_test, y_pred)
    return accuracy, report_dict, cm


def _stable_choices(df, feature):
    """Return the selectbox options for ``feature``, sampling them only once.

    The sample is stored in ``st.session_state`` so that a rerun (for example
    the one triggered by pressing the predict button) shows the same options
    and therefore keeps the value the user picked.
    """
    if _CHOICES_STATE_KEY not in st.session_state:
        st.session_state[_CHOICES_STATE_KEY] = {}
    sampled = st.session_state[_CHOICES_STATE_KEY]
    if feature not in sampled:
        sampled[feature] = get_random_choices(df, feature)
    return sampled[feature]


def _render_intro():
    """Render the title, the loading notice and the three info expanders."""
    st.title("💳 Credit Card Fraud Detection Application")
    st.write("⏳ **NOTE:** Data loading may take some time as it contains **2 million rows**. 📊")
    st.write("✅ Worry not! Once loaded, the dataset and models are **cached** for faster access next time. 🚀")

    with st.expander("🔍 **About This Application**", expanded=False):
        st.markdown(ABOUT_MD)

    with st.expander("⚠️ **Why Fraud Detection Matters**", expanded=False):
        st.markdown(WHY_IT_MATTERS_MD)

    with st.expander("⚙️ **How It Works**", expanded=False):
        st.markdown(HOW_IT_WORKS_MD)


def _render_dataset_tab(df):
    """Render the dataset preview, headline counts and class-balance chart."""
    st.header("📄 Dataset Overview")
    col1, col2 = st.columns(2)
    with col1:
        st.dataframe(df.head(20))
    with col2:
        fraud_flags = df[TARGET_COLUMN]
        st.metric("🛒 Total Transactions", f"{len(df):,}")
        st.metric(
            "🚨 Fraudulent Transactions",
            f"{fraud_flags.sum():,} ({fraud_flags.mean() * 100:.2f}%)",
        )

    # Aggregate in pandas first so the chart receives one row per class
    # rather than the full dataset.
    fraud_counts = (
        df[TARGET_COLUMN]
        .value_counts()
        .rename_axis(TARGET_COLUMN)
        .reset_index(name="count")
    )
    chart = alt.Chart(fraud_counts).mark_bar().encode(
        x=alt.X("is_fraud:O", title="Fraud Status"),
        y=alt.Y("count:Q", title="Count"),
        color=alt.Color(
            "is_fraud:N",
            scale=alt.Scale(domain=[0, 1], range=["green", "red"]),
        ),
    )
    st.altair_chart(chart, use_container_width=True)


def _render_performance_tab():
    """Render accuracy, the classification report and the confusion matrix."""
    st.header("📊 Model Performance")
    accuracy, report_dict, cm = _evaluate_model()

    st.metric("🎯 Model Accuracy", f"{accuracy:.4f}")

    report_df = pd.DataFrame(report_dict).T.round(3)
    st.dataframe(report_df.style.format("{:.3f}"))

    fig, ax = plt.subplots()
    # Draw on the explicit axes rather than pyplot's implicit current figure.
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=CLASS_NAMES,
        yticklabels=CLASS_NAMES,
        ax=ax,
    )
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    st.pyplot(fig)
    # pyplot keeps figures alive until closed; release this one after drawing.
    plt.close(fig)


def _render_prediction_tab(df, feature_columns, model, scaler):
    """Render the feature picker and score the assembled transaction.

    Args:
        df: Dataset used as the source of selectable feature values.
        feature_columns: Ordered names of the columns fed to the scaler.
        model: Classifier exposing ``predict`` and ``predict_proba``.
        scaler: Fitted transformer exposing ``transform``.
    """
    st.header("🔍 Fraud Prediction")
    st.markdown("💡 Select transaction details below.")

    # Feature selection UI
    selected_features = st.multiselect(
        "🎛️ Select Features to Use",
        feature_columns,
        default=feature_columns[:3],
    )

    # Display descriptions of selected features
    for feature in selected_features:
        st.markdown(
            FEATURE_DESCRIPTIONS.get(
                feature, "ℹ️ No description available for this feature."
            )
        )

    # Every model input starts at 0; selected features override it below.
    input_data = dict.fromkeys(feature_columns, 0)

    col1, col2 = st.columns(2)
    for i, feature in enumerate(selected_features):
        choices = _stable_choices(df, feature)
        with (col1 if i % 2 == 0 else col2):
            choice = st.selectbox(f"🔢 {feature}", choices)
        # selectbox yields None when there is nothing to choose from; keep
        # the default in that case.
        if choice is not None:
            input_data[feature] = choice

    if st.button("🚀 Predict Fraudulence"):
        # Pin the column order to the one used for evaluation.
        input_df = pd.DataFrame([input_data], columns=feature_columns)
        input_scaled = scaler.transform(input_df)
        prediction = model.predict(input_scaled)
        confidence = model.predict_proba(input_scaled)[0]
        top_confidence = max(confidence)

        st.subheader("🧐 Prediction Result")
        if prediction[0] == 1:
            st.toast("🚨 Fraudulent Transaction Detected! 🔴", icon='⚠️')
            st.error("This transaction is likely fraudulent.")
        else:
            st.toast("✅ Legitimate Transaction 🟢", icon='✔️')
            st.success("This transaction appears legitimate.")

        st.progress(int(top_confidence * 100))
        st.write(f"🎯 **Confidence:** {top_confidence * 100:.2f}%")


def main():
    """Build the page: intro section followed by the three application tabs."""
    _render_intro()

    df = load_data()
    model = load_model()
    scaler = load_scaler()
    feature_columns = _feature_columns(df)

    tab1, tab2, tab3 = st.tabs(
        ["📄 Dataset Preview", "📊 Model Performance", "🔍 Fraud Prediction"]
    )

    with tab1:
        _render_dataset_tab(df)

    with tab2:
        _render_performance_tab()

    with tab3:
        _render_prediction_tab(df, feature_columns, model, scaler)


if __name__ == "__main__":
    main()