import streamlit as st
from datasets import load_dataset
import pandas as pd
import joblib
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# Cache the dataset and model to avoid reloading on every visit
@st.cache_data
def load_data():
    dataset = load_dataset("Nooha/cc_fraud_detection_dataset")
    df = pd.DataFrame(dataset['train'])
    df = df.rename(columns={'Class': 'is_fraud'})
    return df
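
# st.cache_data (above) stores the serializable DataFrame, while st.cache_resource (below) keeps the
# loaded model and scaler objects alive once per process instead of re-loading them on every rerun.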
@st.cache_resource
def load_model():
    return joblib.load("cc_fraud_model.pkl")

@st.cache_resource
def load_scaler():
    return joblib.load("cc_fraud_scaler.pkl")
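
# NOTE: the .pkl artifacts are assumed to sit next to app.py in the Space repository and to have
# been fitted on the same numeric feature columns that main() selects below.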
# Feature explanations
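# (Reference only: the prediction tab defines its own feature_descriptions dict for the UI.)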
feature_info = {
    "city_pop": "City Population - The number of residents in the city where the transaction took place. Example: 5000, 250000, 1000000.",
    "cc_num": "Credit Card Number (Anonymized) - A unique identifier for the credit card used. Example: 1234567890123456, 9876543210987654.",
    "unix_time": "Transaction Timestamp in Unix Time - Represents the time since January 1, 1970. Example: 1625097600 (2021-07-01 00:00:00 UTC).",
    "amt": "Transaction Amount - The amount spent in the transaction. Example: 5.99, 100.50, 999.99.",
    "acct_num": "Account Number (Anonymized) - A unique identifier for the linked bank account. Example: 1122334455, 9988776655.",
    "zip": "Zip Code of Transaction Location - The postal code where the transaction occurred. Example: 10001 (NY), 94105 (SF)."
}
def get_random_choices(df, feature, num_choices=5):
    """Sample up to `num_choices` distinct example values of a feature from the dataset."""
    values = df[feature].dropna().unique()
    return np.random.choice(values, min(num_choices, len(values)), replace=False).tolist()
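
# get_random_choices() is used in the prediction tab to offer a handful of realistic
# example values drawn from the dataset for each selected feature.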
def main():
    st.title("💳 Credit Card Fraud Detection Application")
    st.write("⏳ **NOTE:** Data loading may take some time, as the dataset contains **2 million rows**. 📊")
    st.write("✅ Worry not! Once loaded, the dataset and model artifacts are **cached** for faster access next time. 🚀")
    with st.expander("🔍 **About This Application**", expanded=False):
        st.markdown("""
This application is designed to help you detect fraudulent credit card transactions using machine learning. 🚀
It uses the **Nooha/cc_fraud_detection_dataset** from Hugging Face, which contains anonymized credit card transactions.
        """)

    with st.expander("⚠️ **Why Fraud Detection Matters**", expanded=False):
        st.markdown("""
💰 Credit card fraud is a significant issue in the financial industry, costing billions of dollars annually.
Detecting fraudulent transactions in real time is crucial to prevent financial losses and protect customers. 🔍
This app demonstrates how machine learning can be used to identify suspicious transactions.
        """)

    with st.expander("⚙️ **How It Works**", expanded=False):
        st.markdown("""
🛠 **Features of this application:**
1. 📊 **Dataset Preview**: Explore the dataset used to train the model.
2. 📈 **Model Performance**: Evaluate the trained model using accuracy, a classification report, and a confusion matrix.
3. 🔎 **Test Prediction**: Input transaction details and get a real-time prediction of whether the transaction is fraudulent or legitimate.

✅ Let's get started!
        """)
    df = load_data()
    model = load_model()
    scaler = load_scaler()

    numeric_df = df.select_dtypes(include=['number'])
    X = numeric_df.drop(columns=['is_fraud'])
    y = numeric_df['is_fraud']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
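
    # NOTE: this split only recreates a hold-out set for the metrics shown in the
    # "Model Performance" tab; the pickled model was trained offline. The metrics are
    # meaningful only if that model was trained with the same 80/20 split and seed,
    # otherwise some test rows may overlap its training data (assumption).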
    tab1, tab2, tab3 = st.tabs(["📄 Dataset Preview", "📊 Model Performance", "🔍 Fraud Prediction"])

    with tab1:
        st.header("📄 Dataset Overview")
        col1, col2 = st.columns(2)
        with col1:
            st.dataframe(df.head(20))
        with col2:
            st.metric("🛒 Total Transactions", f"{len(df):,}")
            st.metric("🚨 Fraudulent Transactions", f"{df['is_fraud'].sum():,} ({df['is_fraud'].mean() * 100:.2f}%)")

        # Altair rejects DataFrames larger than 5,000 rows by default, so aggregate the class counts before plotting
        fraud_counts = df['is_fraud'].value_counts().rename_axis('is_fraud').reset_index(name='count')
        chart = alt.Chart(fraud_counts).mark_bar().encode(
            x=alt.X('is_fraud:O', title='Fraud Status'),
            y=alt.Y('count:Q', title='Count'),
            color=alt.Color('is_fraud:N', scale=alt.Scale(domain=[0, 1], range=['green', 'red']))
        )
        st.altair_chart(chart, use_container_width=True)
    with tab2:
        st.header("📊 Model Performance")
        y_pred = model.predict(X_test_scaled)
        accuracy = accuracy_score(y_test, y_pred)
        st.metric("🎯 Model Accuracy", f"{accuracy:.4f}")

        report_dict = classification_report(y_test, y_pred, target_names=['Not Fraud', 'Fraud'], output_dict=True)
        report_df = pd.DataFrame(report_dict).T.round(3)
        st.dataframe(report_df.style.format("{:.3f}"))

        cm = confusion_matrix(y_test, y_pred)
        fig, ax = plt.subplots()
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=['Not Fraud', 'Fraud'], yticklabels=['Not Fraud', 'Fraud'], ax=ax)
        ax.set_xlabel("Predicted")
        ax.set_ylabel("Actual")
        st.pyplot(fig)
    with tab3:
        st.header("🔍 Fraud Prediction")
        st.markdown("💡 Select transaction details below.")

        # Define feature descriptions
        feature_descriptions = {
            "acct_num": "📌 **Account Number** - Unique identifier for the transaction account.",
            "amt": "💰 **Transaction Amount** - The total amount involved in the transaction.",
            "unix_time": "⏳ **Unix Timestamp** - The time when the transaction occurred (in Unix format).",
            "zip": "📮 **ZIP Code** - Postal code for the transaction location.",
            "city_pop": "🌆 **City Population** - The number of residents in the city where the transaction took place.",
            "cc_num": "💳 **Credit Card Number** - Anonymized credit card number used for the transaction."
        }

        available_features = X.columns.tolist()
        # Feature selection UI
        selected_features = st.multiselect("🎛️ Select Features to Use", available_features, default=available_features[:3])
        # Display descriptions of selected features
        for feature in selected_features:
            st.markdown(feature_descriptions.get(feature, "ℹ️ No description available for this feature."))
        # Default every model feature to 0; the widgets below override the selected ones
        input_data = {feature: 0 for feature in X.columns}

        col1, col2 = st.columns(2)
        for i, feature in enumerate(selected_features):
            choices = get_random_choices(df, feature)
            with (col1 if i % 2 == 0 else col2):
                input_data[feature] = st.selectbox(f"🔢 {feature}", choices)
        if st.button("🚀 Predict Fraudulence"):
            # Preserve the column order the scaler and model were fit with
            input_df = pd.DataFrame([input_data])[X.columns]
            input_scaled = scaler.transform(input_df)
            prediction = model.predict(input_scaled)
            # predict_proba assumes the pickled classifier exposes probability estimates
            confidence = model.predict_proba(input_scaled)[0]

            st.subheader("🧐 Prediction Result")
            if prediction[0] == 1:
                st.toast("🚨 Fraudulent Transaction Detected! 🔴", icon='⚠️')
                st.error("This transaction is likely fraudulent.")
            else:
                st.toast("✅ Legitimate Transaction 🟢", icon='✔️')
                st.success("This transaction appears legitimate.")

            st.progress(int(max(confidence) * 100))
            st.write(f"🎯 **Confidence:** {max(confidence) * 100:.2f}%")
if __name__ == "__main__":
    main()