Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,85 +1,172 @@
|
|
1 |
import streamlit as st
|
|
|
2 |
import pandas as pd
|
|
|
3 |
import numpy as np
|
4 |
-
from sklearn.
|
|
|
|
|
|
|
|
|
5 |
from sklearn.model_selection import train_test_split
|
6 |
-
from sklearn.metrics import accuracy_score, classification_report
|
7 |
-
from datasets import load_dataset
|
8 |
|
9 |
-
#
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
)
|
39 |
-
|
40 |
-
# Tab Structure
|
41 |
-
tab1, tab2, tab3 = st.tabs(['๐ Dataset Preview', '๐ Model Performance', '๐ Fraud Prediction'])
|
42 |
-
|
43 |
-
# Dataset Preview
|
44 |
-
with tab1:
|
45 |
-
st.markdown(
|
46 |
-
"""
|
47 |
-
## ๐ Dataset Preview
|
48 |
-
Below is a sample of the credit card transaction dataset used for fraud detection.
|
49 |
-
"""
|
50 |
-
)
|
51 |
-
st.dataframe(df.head())
|
52 |
-
|
53 |
-
# Model Performance
|
54 |
-
with tab2:
|
55 |
-
st.markdown(
|
56 |
-
"""
|
57 |
-
## ๐ Model Performance
|
58 |
-
- **Accuracy:** Measures overall model performance.
|
59 |
-
- **Classification Report:** Precision, recall, and F1-score breakdown.
|
60 |
-
"""
|
61 |
-
)
|
62 |
|
63 |
-
st.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
64 |
|
65 |
-
st.
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
""")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
|
75 |
-
|
76 |
-
|
77 |
-
v1_input = st.number_input("๐ข Feature V1", value=0.0, step=0.1)
|
78 |
-
v2_input = st.number_input("๐ข Feature V2", value=0.0, step=0.1)
|
79 |
-
v3_input = st.number_input("๐ข Feature V3", value=0.0, step=0.1)
|
80 |
-
|
81 |
-
if st.button("๐ Predict Fraud"):
|
82 |
-
input_data = np.array([[amount_input, time_input, v1_input, v2_input, v3_input]])
|
83 |
-
prediction = model.predict(input_data)[0]
|
84 |
-
result = "๐จ Fraudulent" if prediction == 1 else "โ
Legitimate"
|
85 |
-
st.success(f"### ๐ฏ Prediction: **{result}**")
|
|
|
1 |
import streamlit as st
|
2 |
+
from datasets import load_dataset
|
3 |
import pandas as pd
|
4 |
+
import joblib
|
5 |
import numpy as np
|
6 |
+
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
|
7 |
+
import matplotlib.pyplot as plt
|
8 |
+
import seaborn as sns
|
9 |
+
import altair as alt
|
10 |
+
from sklearn.preprocessing import StandardScaler
|
11 |
from sklearn.model_selection import train_test_split
|
|
|
|
|
12 |
|
13 |
+
# Cache the dataset and model to avoid reloading on every visit
|
14 |
+
@st.cache_data
def load_data():
    """Fetch the fraud dataset from the Hugging Face Hub as a DataFrame.

    The result is cached with ``st.cache_data`` so the download and
    conversion are not repeated on every Streamlit rerun.

    Returns:
        pandas.DataFrame: the ``train`` split of
        ``Nooha/cc_fraud_detection_dataset``. A column named ``Class`` is
        renamed to ``is_fraud``; ``rename`` leaves the frame untouched when
        no such column exists.
    """
    # Ask for the split directly instead of loading the DatasetDict and
    # indexing it, then use the Dataset's own DataFrame export.
    train_split = load_dataset("Nooha/cc_fraud_detection_dataset", split="train")
    df = train_split.to_pandas()
    return df.rename(columns={'Class': 'is_fraud'})
|
20 |
+
|
21 |
+
@st.cache_resource
def load_model():
    """Load the serialized model object from ``cc_fraud_model.pkl``.

    Wrapped in ``st.cache_resource`` so the file is read once and the same
    object is reused afterwards.
    """
    model_path = "cc_fraud_model.pkl"
    fraud_model = joblib.load(model_path)
    return fraud_model
|
24 |
+
|
25 |
+
@st.cache_resource
def load_scaler():
    """Load the serialized scaler object from ``cc_fraud_scaler.pkl``.

    Wrapped in ``st.cache_resource`` so the file is read once and the same
    object is reused afterwards.
    """
    scaler_path = "cc_fraud_scaler.pkl"
    fitted_scaler = joblib.load(scaler_path)
    return fitted_scaler
|
28 |
+
|
29 |
+
# Feature explanations
|
30 |
+
# Human-readable explanation (with example values) for each dataset column,
# keyed by column name.
feature_info = {
    "city_pop": (
        "City Population - The number of residents in the city where the "
        "transaction took place. Example: 5000, 250000, 1000000."
    ),
    "cc_num": (
        "Credit Card Number (Anonymized) - A unique identifier for the "
        "credit card used. Example: 1234567890123456, 9876543210987654."
    ),
    "unix_time": (
        "Transaction Timestamp in Unix Time - Represents the time since "
        "January 1, 1970. Example: 1625097600 (2021-07-01 00:00:00 UTC)."
    ),
    "amt": (
        "Transaction Amount - The amount spent in the transaction. "
        "Example: 5.99, 100.50, 999.99."
    ),
    "acct_num": (
        "Account Number (Anonymized) - A unique identifier for the linked "
        "bank account. Example: 1122334455, 9988776655."
    ),
    "zip": (
        "Zip Code of Transaction Location - The postal code where the "
        "transaction occurred. Example: 10001 (NY), 94105 (SF)."
    ),
}
|
38 |
+
|
39 |
+
def get_random_choices(df, feature, num_choices=5):
    """Pick up to ``num_choices`` distinct non-null values from a column.

    Args:
        df: DataFrame holding the column.
        feature: Name of the column to sample from.
        num_choices: Maximum number of values to return.

    Returns:
        list: Randomly chosen distinct values, without repetition. Shorter
        than ``num_choices`` when the column has fewer distinct non-null
        values; empty when it has none.
    """
    unique_values = df[feature].dropna().unique()
    # Sampling without replacement raises ValueError if asked for more items
    # than exist, so clamp the request to what the column can supply.
    sample_size = min(num_choices, len(unique_values))
    if sample_size <= 0:
        return []
    return np.random.choice(unique_values, sample_size, replace=False).tolist()
|
41 |
+
|
42 |
+
def main():
    """Build the Streamlit page: title, intro expanders and the three tabs.

    Loads the dataset, model and scaler through the cached loaders, splits the
    numeric columns 80/20 with ``random_state=42``, scales the held-out part
    with the loaded scaler, and hands each tab to a private helper.
    """
    # NOTE(review): several UI strings below look mis-encoded (emoji turned
    # into characters such as "๐"). They are reproduced unchanged, except the
    # "Legitimate Transaction" toast text whose literal was split across two
    # lines and would not parse -- confirm the intended emoji.
    st.title("๐ณ Credit Card Fraud Detection Application")
    _render_intro()

    df = load_data()
    model = load_model()
    scaler = load_scaler()

    numeric_df = df.select_dtypes(include=['number'])
    X = numeric_df.drop(columns=['is_fraud'])
    y = numeric_df['is_fraud']

    # The model is never fit in this function, so only the held-out part of
    # the split is kept.
    _, X_test, _, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    X_test_scaled = scaler.transform(X_test)

    tab1, tab2, tab3 = st.tabs(["๐ Dataset Preview", "๐ Model Performance", "๐ Fraud Prediction"])
    with tab1:
        _render_dataset_tab(df)
    with tab2:
        _render_performance_tab(model, X_test_scaled, y_test)
    with tab3:
        _render_prediction_tab(df, X.columns.tolist(), model, scaler)


def _render_intro():
    """Render the three collapsed expanders that introduce the app."""
    with st.expander("๐ **About This Application**", expanded=False):
        st.markdown("""
        This application is designed to help you detect fraudulent credit card transactions using machine learning. ๐
        It uses the **Nooha/cc_fraud_detection_dataset** from Hugging Face, which contains anonymized credit card transactions.
        """)

    with st.expander("โ ๏ธ **Why Fraud Detection Matters**", expanded=False):
        st.markdown("""
        ๐ฐ Credit card fraud is a significant issue in the financial industry, costing billions of dollars annually.
        Detecting fraudulent transactions in real-time is crucial to prevent financial losses and protect customers. ๐
        This app demonstrates how machine learning can be used to identify suspicious transactions.
        """)

    with st.expander("โ๏ธ **How It Works**", expanded=False):
        st.markdown("""
        ๐ **Features of this application:**
        1. ๐ **Dataset Preview**: Explore the dataset used to train the model.
        2. ๐ **Model Performance**: Evaluate the performance of the trained model using accuracy, classification reports, and a confusion matrix.
        3. ๐ **Test Prediction**: Input transaction details and get real-time predictions on whether the transaction is fraudulent or legitimate.

        โ
        Let's get started!
        """)


def _render_dataset_tab(df):
    """Show a 20-row sample, headline fraud counts and a class-balance chart."""
    st.header("๐ Dataset Overview")
    col1, col2 = st.columns(2)
    with col1:
        st.dataframe(df.head(20))
    with col2:
        st.metric("๐ Total Transactions", f"{len(df):,}")
        st.metric("๐จ Fraudulent Transactions", f"{df['is_fraud'].sum():,} ({df['is_fraud'].mean() * 100:.2f}%)")

    # Count per class up front so the chart receives one row per class
    # rather than the whole DataFrame.
    class_counts = (
        df['is_fraud'].value_counts().rename_axis('is_fraud').reset_index(name='count')
    )
    chart = alt.Chart(class_counts).mark_bar().encode(
        x=alt.X('is_fraud:O', title='Fraud Status'),
        y=alt.Y('count:Q', title='Count'),
        color=alt.Color('is_fraud:N', scale=alt.Scale(domain=[0, 1], range=['green', 'red'])),
    )
    st.altair_chart(chart, use_container_width=True)


def _render_performance_tab(model, X_test_scaled, y_test):
    """Show accuracy, the classification report and a confusion-matrix heatmap."""
    st.header("๐ Model Performance")
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    st.metric("๐ฏ Model Accuracy", f"{accuracy:.4f}")

    report_dict = classification_report(y_test, y_pred, target_names=['Not Fraud', 'Fraud'], output_dict=True)
    report_df = pd.DataFrame(report_dict).T.round(3)
    st.dataframe(report_df.style.format("{:.3f}"))

    cm = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots()
    # Draw on the explicit axes instead of relying on pyplot's "current" axes.
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=['Not Fraud', 'Fraud'],
        yticklabels=['Not Fraud', 'Fraud'],
        ax=ax,
    )
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    st.pyplot(fig)
    # pyplot keeps every figure it creates until closed; release this one so
    # figures do not pile up across reruns.
    plt.close(fig)


def _render_prediction_tab(df, available_features, model, scaler):
    """Collect feature values from the user and display the model's verdict.

    Args:
        df: Full dataset; select-box options are sampled from its columns.
        available_features: Feature column names, in the order passed to the
            scaler and model.
        model: Object providing ``predict`` and ``predict_proba``.
        scaler: Object providing ``transform``.
    """
    st.header("๐ Fraud Prediction")
    st.markdown("๐ก Select transaction details below.")

    feature_descriptions = {
        "acct_num": "๐ **Account Number** - Unique identifier for the transaction account.",
        "amt": "๐ฐ **Transaction Amount** - The total amount involved in the transaction.",
        "unix_time": "โณ **Unix Timestamp** - The time when the transaction occurred (in Unix format).",
        "zip": "๐ฎ **ZIP Code** - Postal code for the transaction location.",
        "city_pop": "๐ **City Population** - The number of residents in the city where the transaction took place.",
        "cc_num": "๐ณ **Credit Card Number** - Anonymized credit card number used for the transaction.",
    }

    selected_features = st.multiselect("๐๏ธ Select Features to Use", available_features, default=available_features[:3])

    for feature in selected_features:
        st.markdown(feature_descriptions.get(feature, "โน๏ธ No description available for this feature."))

    # Every feature starts at 0 so unselected ones still reach the scaler,
    # in the same column order as ``available_features``.
    input_data = dict.fromkeys(available_features, 0)

    col1, col2 = st.columns(2)
    for i, feature in enumerate(selected_features):
        # Sample the options once per session. Re-sampling on every rerun
        # (which any widget interaction triggers, including the Predict
        # button) would swap the options under the user and discard the
        # value they picked.
        choices_key = f"choices_{feature}"
        if choices_key not in st.session_state:
            st.session_state[choices_key] = get_random_choices(df, feature)
        with (col1 if i % 2 == 0 else col2):
            input_data[feature] = st.selectbox(f"๐ข {feature}", st.session_state[choices_key])

    if st.button("๐ Predict Fraudulence"):
        input_df = pd.DataFrame([input_data])
        input_scaled = scaler.transform(input_df)
        prediction = model.predict(input_scaled)
        confidence = model.predict_proba(input_scaled)[0]

        st.subheader("๐ง Prediction Result")
        if prediction[0] == 1:
            st.toast("๐จ Fraudulent Transaction Detected! ๐ด", icon='โ ๏ธ')
            st.error("This transaction is likely fraudulent.")
        else:
            st.toast("โ Legitimate Transaction ๐ข", icon='โ๏ธ')
            st.success("This transaction appears legitimate.")

        # Confidence is the larger of the predicted class probabilities.
        st.progress(int(max(confidence) * 100))
        st.write(f"๐ฏ **Confidence:** {max(confidence) * 100:.2f}%")
|
169 |
+
|
170 |
|
171 |
+
if __name__ == "__main__":
|
172 |
+
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|