Spaces:

Markndrei
/

fraud_detection_model

Running

App Files Files Community

fraud_detection_model / app.py

Markndrei

Update app.py

24d08c5 verified 4 months ago

raw

history blame

8.11 kB

	import streamlit as st
	from datasets import load_dataset
	import pandas as pd
	import joblib
	import numpy as np
	from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
	import matplotlib.pyplot as plt
	import seaborn as sns
	import altair as alt
	from sklearn.preprocessing import StandardScaler
	from sklearn.model_selection import train_test_split

	# Cache the dataset and model to avoid reloading on every visit
	@st.cache_data
	def load_data():
	    """Load the fraud dataset's ``train`` split as a pandas DataFrame.

	    The result is memoised by ``st.cache_data``, so the download and
	    conversion are not repeated on every script rerun.

	    Returns:
	        pandas.DataFrame: the ``train`` split of
	        ``Nooha/cc_fraud_detection_dataset``, with a ``Class`` column (if one
	        exists) renamed to ``is_fraud``.
	    """
	    dataset = load_dataset("Nooha/cc_fraud_detection_dataset")
	    # Dataset.to_pandas() converts the underlying Arrow table in bulk, which
	    # avoids assembling a very large frame from individual Python row objects
	    # the way pd.DataFrame(dataset['train']) does.
	    df = dataset["train"].to_pandas()
	    # rename() ignores missing labels, so this is a no-op when the column is
	    # already called is_fraud.
	    return df.rename(columns={"Class": "is_fraud"})

	@st.cache_resource
	def load_model():
	    """Deserialize the fitted model stored in ``cc_fraud_model.pkl``.

	    The path is relative to the process working directory. Thanks to
	    ``st.cache_resource`` the file is read once and the same object is
	    handed back on later calls.

	    Returns:
	        The object stored in the pickle file (used by ``main`` through
	        ``predict`` and ``predict_proba``).
	    """
	    fraud_model = joblib.load("cc_fraud_model.pkl")
	    return fraud_model

	@st.cache_resource
	def load_scaler():
	    """Deserialize the fitted scaler stored in ``cc_fraud_scaler.pkl``.

	    The path is relative to the process working directory. Thanks to
	    ``st.cache_resource`` the file is read once and the same object is
	    handed back on later calls.

	    Returns:
	        The object stored in the pickle file (used by ``main`` through
	        ``transform``).
	    """
	    fitted_scaler = joblib.load("cc_fraud_scaler.pkl")
	    return fitted_scaler

	# Long-form explanation plus sample values for each feature, keyed by the
	# dataset column name.
	feature_info = {
	    "city_pop": (
	        "City Population - The number of residents in the city where the "
	        "transaction took place. Example: 5000, 250000, 1000000."
	    ),
	    "cc_num": (
	        "Credit Card Number (Anonymized) - A unique identifier for the credit "
	        "card used. Example: 1234567890123456, 9876543210987654."
	    ),
	    "unix_time": (
	        "Transaction Timestamp in Unix Time - Represents the time since "
	        "January 1, 1970. Example: 1625097600 (2021-07-01 00:00:00 UTC)."
	    ),
	    "amt": (
	        "Transaction Amount - The amount spent in the transaction. "
	        "Example: 5.99, 100.50, 999.99."
	    ),
	    "acct_num": (
	        "Account Number (Anonymized) - A unique identifier for the linked bank "
	        "account. Example: 1122334455, 9988776655."
	    ),
	    "zip": (
	        "Zip Code of Transaction Location - The postal code where the "
	        "transaction occurred. Example: 10001 (NY), 94105 (SF)."
	    ),
	}

	def get_random_choices(df, feature, num_choices=5):
	    """Sample distinct, non-null values from one column of ``df``.

	    Args:
	        df: DataFrame to sample from.
	        feature: Name of the column whose values are sampled.
	        num_choices: Maximum number of values to return.

	    Returns:
	        list: up to ``num_choices`` distinct values in random order. Fewer are
	        returned when the column has fewer unique non-null values, and an
	        empty list when it has none.
	    """
	    unique_values = df[feature].dropna().unique()
	    if len(unique_values) == 0:
	        return []
	    # Sampling without replacement raises ValueError when asked for more
	    # items than exist, so clamp the request to what the column can supply.
	    sample_size = min(num_choices, len(unique_values))
	    return np.random.choice(unique_values, sample_size, replace=False).tolist()

	def _render_intro():
	    """Render the page title, the loading notes and the three info expanders."""
	    st.title("💳 Credit Card Fraud Detection Application")
	    st.write("⏳ NOTE: Data loading may take some time as it contains 2 million rows. 📊")
	    st.write("✅ Worry not! Once loaded, the dataset and models are cached for faster access next time. 🚀")

	    with st.expander("🔍 About This Application", expanded=False):
	        st.markdown("""
	        This application is designed to help you detect fraudulent credit card transactions using machine learning. 🚀
	        It uses the Nooha/cc_fraud_detection_dataset from Hugging Face, which contains anonymized credit card transactions.
	        """)

	    with st.expander("⚠️ Why Fraud Detection Matters", expanded=False):
	        st.markdown("""
	        💰 Credit card fraud is a significant issue in the financial industry, costing billions of dollars annually.
	        Detecting fraudulent transactions in real-time is crucial to prevent financial losses and protect customers. 🔐
	        This app demonstrates how machine learning can be used to identify suspicious transactions.
	        """)

	    with st.expander("⚙️ How It Works", expanded=False):
	        st.markdown("""
	        🛠 Features of this application:
	        1. 📊 Dataset Preview: Explore the dataset used to train the model.
	        2. 📈 Model Performance: Evaluate the performance of the trained model using accuracy, classification reports, and a confusion matrix.
	        3. 🔎 Test Prediction: Input transaction details and get real-time predictions on whether the transaction is fraudulent or legitimate.

	        ✅ Let's get started!
	        """)


	def _render_dataset_tab(df):
	    """Render the dataset preview: first rows, headline metrics and class counts."""
	    st.header("📄 Dataset Overview")
	    col1, col2 = st.columns(2)
	    with col1:
	        st.dataframe(df.head(20))
	    with col2:
	        st.metric("🛒 Total Transactions", f"{len(df):,}")
	        st.metric("🚨 Fraudulent Transactions", f"{df['is_fraud'].sum():,} ({df['is_fraud'].mean() * 100:.2f}%)")

	    # Count the classes in pandas and chart the small summary table, so the
	    # whole DataFrame is not handed to the chart just to be counted.
	    fraud_counts = (
	        df['is_fraud']
	        .value_counts()
	        .rename_axis('is_fraud')
	        .reset_index(name='n_transactions')
	    )
	    chart = alt.Chart(fraud_counts).mark_bar().encode(
	        x=alt.X('is_fraud:O', title='Fraud Status'),
	        y=alt.Y('n_transactions:Q', title='Count'),
	        color=alt.Color('is_fraud:N', scale=alt.Scale(domain=[0, 1], range=['green', 'red']))
	    )
	    st.altair_chart(chart, use_container_width=True)


	def _render_performance_tab(model, X_test_scaled, y_test):
	    """Render accuracy, the classification report and the confusion matrix."""
	    st.header("📊 Model Performance")
	    y_pred = model.predict(X_test_scaled)
	    accuracy = accuracy_score(y_test, y_pred)
	    st.metric("🎯 Model Accuracy", f"{accuracy:.4f}")

	    report_dict = classification_report(y_test, y_pred, target_names=['Not Fraud', 'Fraud'], output_dict=True)
	    report_df = pd.DataFrame(report_dict).T.round(3)
	    st.dataframe(report_df.style.format("{:.3f}"))

	    cm = confusion_matrix(y_test, y_pred)
	    fig, ax = plt.subplots()
	    # Draw on the explicit Axes instead of pyplot's implicit "current" one.
	    sns.heatmap(
	        cm, annot=True, fmt='d', cmap='Blues',
	        xticklabels=['Not Fraud', 'Fraud'], yticklabels=['Not Fraud', 'Fraud'],
	        ax=ax,
	    )
	    ax.set_xlabel("Predicted")
	    ax.set_ylabel("Actual")
	    st.pyplot(fig)
	    # A new figure is created on every rerun; close it so they do not pile up.
	    plt.close(fig)


	def _render_prediction_tab(df, X, model, scaler):
	    """Render the feature pickers and, on button press, the model's verdict."""
	    st.header("🔍 Fraud Prediction")
	    st.markdown("💡 Select transaction details below.")

	    # Short descriptions shown under the feature selector.
	    feature_descriptions = {
	        "acct_num": "📌 Account Number - Unique identifier for the transaction account.",
	        "amt": "💰 Transaction Amount - The total amount involved in the transaction.",
	        "unix_time": "⏳ Unix Timestamp - The time when the transaction occurred (in Unix format).",
	        "zip": "📮 ZIP Code - Postal code for the transaction location.",
	        "city_pop": "🌆 City Population - The number of residents in the city where the transaction took place.",
	        "cc_num": "💳 Credit Card Number - Anonymized credit card number used for the transaction."
	    }

	    available_features = X.columns.tolist()

	    # Feature selection UI
	    selected_features = st.multiselect("🎛️ Select Features to Use", available_features, default=available_features[:3])

	    # Display descriptions of selected features
	    for feature in selected_features:
	        st.markdown(feature_descriptions.get(feature, "ℹ️ No description available for this feature."))

	    # The row passed to the scaler has every column of X, in X's order;
	    # features the user did not select keep the default value 0.
	    input_data = dict.fromkeys(available_features, 0)

	    col1, col2 = st.columns(2)
	    for i, feature in enumerate(selected_features):
	        # The whole script reruns on every widget interaction. Drawing fresh
	        # random options each time would replace the select box and discard
	        # the user's pick, so the options are drawn once per session.
	        choices_key = f"fraud_choices_{feature}"
	        if choices_key not in st.session_state:
	            st.session_state[choices_key] = get_random_choices(df, feature)
	        with (col1 if i % 2 == 0 else col2):
	            chosen = st.selectbox(f"🔢 {feature}", st.session_state[choices_key])
	        # selectbox yields None when there is nothing to choose from.
	        if chosen is not None:
	            input_data[feature] = chosen

	    if st.button("🚀 Predict Fraudulence"):
	        input_df = pd.DataFrame([input_data])
	        input_scaled = scaler.transform(input_df)
	        prediction = model.predict(input_scaled)
	        confidence = model.predict_proba(input_scaled)[0]
	        top_confidence = max(confidence)

	        st.subheader("🧐 Prediction Result")
	        if prediction[0] == 1:
	            st.toast("🚨 Fraudulent Transaction Detected! 🔴", icon='⚠️')
	            st.error("This transaction is likely fraudulent.")
	        else:
	            st.toast("✅ Legitimate Transaction 🟢", icon='✔️')
	            st.success("This transaction appears legitimate.")

	        st.progress(int(top_confidence * 100))
	        st.write(f"🎯 Confidence: {top_confidence * 100:.2f}%")


	def main():
	    """Build the Streamlit page: intro, dataset preview, metrics and prediction.

	    Loads the cached dataset, model and scaler, keeps the numeric columns as
	    features (``is_fraud`` is the target), scales a 20% split for evaluation
	    and renders the three tabs.
	    """
	    _render_intro()

	    df = load_data()
	    model = load_model()
	    scaler = load_scaler()

	    numeric_df = df.select_dtypes(include=['number'])
	    X = numeric_df.drop(columns=['is_fraud'])
	    y = numeric_df['is_fraud']

	    # Only the 20% partition is evaluated, so the other partition is not
	    # scaled. NOTE(review): presumably this mirrors the split used when the
	    # model was trained - confirm.
	    _, X_test, _, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
	    X_test_scaled = scaler.transform(X_test)

	    tab1, tab2, tab3 = st.tabs(["📄 Dataset Preview", "📊 Model Performance", "🔍 Fraud Prediction"])

	    with tab1:
	        _render_dataset_tab(df)

	    with tab2:
	        _render_performance_tab(model, X_test_scaled, y_test)

	    with tab3:
	        _render_prediction_tab(df, X, model, scaler)


	# Build the page only when this file is executed as the entry script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    main()