Spaces:
Sleeping
Sleeping
Create Streamlit.py
Browse files- Streamlit.py +165 -0
Streamlit.py
ADDED
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import pandas as pd
|
3 |
+
import seaborn as sns
|
4 |
+
import streamlit as st
|
5 |
+
import joblib
|
6 |
+
import matplotlib.pyplot as plt
|
7 |
+
from sklearn.preprocessing import StandardScaler
|
8 |
+
from sklearn.metrics import confusion_matrix
|
9 |
+
from xgboost import XGBClassifier
|
10 |
+
|
11 |
+
# Page configuration: browser-tab title/icon and the main page heading.
# The icon literal was mojibake (the UTF-8 bytes of the credit-card emoji
# decoded as cp1252); it is restored to the actual emoji here.
st.set_page_config(page_title="Bank Account Prediction Dashboard", page_icon="💳")
st.title('Bank Account Prediction Dashboard')
|
14 |
+
|
15 |
+
# Load model and preprocessing objects
|
16 |
+
@st.cache_resource
def load_model_objects():
    """Load the trained classifier and its fitted preprocessing objects.

    Each artifact is read with ``joblib.load`` from a file in the current
    working directory. The function is wrapped in ``st.cache_resource`` so
    the files are deserialized once and reused, instead of being reloaded
    on every script rerun.

    Returns:
        tuple: ``(model_xgb, scaler, encoder_y, le_country_economy,
        le_regionwb)`` -- the objects stored in ``xgb_clf.joblib``,
        ``scaler.joblib``, ``encoder.joblib``, ``country_encoder.joblib``
        and ``regionwb_encoder.joblib`` respectively.
    """
    # NOTE: joblib.load unpickles its input; only point these paths at
    # trusted artifact files.
    model_xgb = joblib.load('xgb_clf.joblib')
    scaler = joblib.load('scaler.joblib')
    encoder_y = joblib.load('encoder.joblib')  # For target variable
    le_country_economy = joblib.load('country_encoder.joblib')
    le_regionwb = joblib.load('regionwb_encoder.joblib')
    return model_xgb, scaler, encoder_y, le_country_economy, le_regionwb
|
23 |
+
|
24 |
+
# Unpack the loaded artifacts into module-level names. The underscore-prefixed
# names mirror the parameter names of process_data, which receives them.
(
    model_xgb,
    _scaler,
    _label_encoder,
    le_country_economy,
    le_regionwb,
) = load_model_objects()
|
25 |
+
|
26 |
+
@st.cache_data
def load_data():
    """Read the survey CSV into a DataFrame.

    The file ``micro_world_139countries.csv`` is decoded as ISO-8859-1.
    The call is wrapped in ``st.cache_data``.

    Returns:
        pandas.DataFrame: the parsed contents of the CSV file.
    """
    csv_path = 'micro_world_139countries.csv'
    survey_df = pd.read_csv(csv_path, encoding='ISO-8859-1')
    return survey_df
|
33 |
+
|
34 |
+
@st.cache_data
def process_data(df, _scaler, _label_encoder, _country_encoder, _regionwb_encoder):
    """Turn raw survey rows into scaled model features and an encoded target.

    Inputs with at least 5000 rows are resampled to 5000 rows (with
    replacement, ``random_state=42``). Smaller inputs -- such as the single
    row built from the sidebar form -- are processed as-is rather than being
    inflated into 5000 duplicate rows.

    The leading underscores on the fitted-object parameters follow
    Streamlit's convention for arguments that are excluded from cache
    hashing.

    Args:
        df: DataFrame holding the feature columns listed below plus
            'account'.
        _scaler: Fitted scaler; its ``transform`` is applied to the features.
        _label_encoder: Fitted encoder applied to the 'account' column.
        _country_encoder: Fitted encoder applied to the 'economy' column.
        _regionwb_encoder: Fitted encoder applied to the 'regionwb' column.

    Returns:
        tuple: ``(X_scaled, y)`` where ``X_scaled`` is a DataFrame of the
        scaled features (same column names and order as the feature list)
        and ``y`` is the output of ``_label_encoder.transform``.
    """
    sample_size = 5000
    boolean_columns = ['mobileowner', 'internetaccess', 'pay_utilities',
                       'receive_transfers', 'receive_pension']
    # Order matters: the scaler and model receive the columns in this order.
    feature_columns = ['remittances', 'educ', 'age', 'female', 'mobileowner',
                       'internetaccess', 'pay_utilities', 'receive_transfers',
                       'receive_pension', 'economy', 'regionwb']
    required_columns = feature_columns + ['account']

    # Select relevant columns; only down-sample when there is enough data.
    selected = df[required_columns]
    if len(selected) >= sample_size:
        sample_df = selected.sample(n=sample_size, random_state=42, replace=True)
    else:
        # Copy so the column assignments below never touch the caller's frame.
        sample_df = selected.copy()

    # Drop rows with a missing value in any required column.
    sample_df = sample_df.dropna(subset=required_columns)

    # Encode 'economy' and 'regionwb' using the loaded encoders.
    sample_df['economy'] = _country_encoder.transform(sample_df['economy'])
    sample_df['regionwb'] = _regionwb_encoder.transform(sample_df['regionwb'])

    # Manual encodings; values outside the mapping become -1.
    educ_mapping = {'None': 0, 'Primary': 1, 'Secondary': 2, 'Tertiary': 3}
    sample_df['educ'] = sample_df['educ'].map(educ_mapping).fillna(-1).astype(int)

    gender_mapping = {'Male': 0, 'Female': 1}
    sample_df['female'] = sample_df['female'].map(gender_mapping).fillna(-1).astype(int)

    # Convert boolean columns to integers.
    for col in boolean_columns:
        sample_df[col] = sample_df[col].astype(int)

    # Separate features and target, then encode the target.
    X = sample_df.drop('account', axis=1)
    y = _label_encoder.transform(sample_df['account'])

    # Scale features and restore the column names lost by transform().
    X_scaled = pd.DataFrame(_scaler.transform(X), columns=X.columns)

    return X_scaled, y
|
80 |
+
|
81 |
+
# Load the survey data and discard the 'inc_q' column when it is present
# (errors='ignore' makes the drop a no-op if the column is absent).
df = load_data().drop(columns='inc_q', errors='ignore')
|
84 |
+
|
85 |
+
# Sidebar form collecting one respondent's attributes for a prediction.
EDUCATION_LEVELS = ['None', 'Primary', 'Secondary', 'Tertiary']
GENDERS = ['Male', 'Female']
YES_NO = [True, False]

with st.sidebar:
    st.title("Input User Data for Prediction")
    with st.form("user_inputs"):
        remittances = st.number_input(
            'Remittances', min_value=0, max_value=100000, step=100
        )
        educ = st.selectbox('Education Level', options=EDUCATION_LEVELS)
        age = st.number_input('Age', min_value=18, max_value=100, step=1)
        female = st.selectbox('Gender', options=GENDERS)

        # Yes/no questions, rendered as True/False radio buttons.
        mobileowner = st.radio('Owns a Mobile', options=YES_NO)
        internetaccess = st.radio('Has Internet Access', options=YES_NO)
        pay_utilities = st.radio('Pays Utilities Online', options=YES_NO)
        receive_transfers = st.radio('Receives Transfers', options=YES_NO)
        receive_pension = st.radio('Receives Pension', options=YES_NO)

        # Choices come from the classes the loaded encoders were fitted on.
        economy = st.selectbox('Country', options=list(le_country_economy.classes_))
        regionwb = st.selectbox('Region', options=list(le_regionwb.classes_))

        # Placeholder target value: process_data selects an 'account' column,
        # so the single-row frame built from this form must contain one.
        account = 1
        submit_button = st.form_submit_button("Predict")
|
102 |
+
|
103 |
+
# Once the form has been submitted, build a one-row frame from the inputs,
# run it through the shared preprocessing and report the model's answer.
if submit_button:
    form_values = {
        'remittances': remittances,
        'educ': educ,
        'age': age,
        'female': female,
        'mobileowner': mobileowner,
        'internetaccess': internetaccess,
        'pay_utilities': pay_utilities,
        'receive_transfers': receive_transfers,
        'receive_pension': receive_pension,
        'economy': economy,
        'regionwb': regionwb,
        'account': account,
    }
    user_data = pd.DataFrame(
        {column: [value] for column, value in form_values.items()}
    )

    try:
        # Only the features are needed here; the encoded target is discarded.
        processed_user_data = process_data(
            user_data, _scaler, _label_encoder, le_country_economy, le_regionwb
        )[0]

        prediction = model_xgb.predict(processed_user_data)
        if prediction[0] == 1:
            result = 'Has Bank Account'
        else:
            result = 'Does Not Have Bank Account'
        st.sidebar.write(f'Prediction: {result}')
    except Exception as e:
        # Any failure in preprocessing or prediction is shown in the sidebar.
        st.sidebar.error(f"Error in processing data: {e}")
|
130 |
+
|
131 |
+
# Run the loaded survey data through the preprocessing pipeline, keeping
# only the scaled feature frame (first element of the returned pair).
scaled_data = process_data(
    df, _scaler, _label_encoder, le_country_economy, le_regionwb
)[0]

# Render the scaled features on the main page.
if scaled_data is not None:
    st.write("Scaled Data:", scaled_data)
|
137 |
+
|
138 |
+
# Main prediction logic: score the processed sample of the main dataset and
# compare the predictions with the encoded 'account' labels.
processed_data, y_main = process_data(
    df, _scaler, _label_encoder, le_country_economy, le_regionwb
)
if processed_data is not None:
    # Prepare features and target for prediction.
    X = processed_data  # 'account' has been dropped in process_data
    y = y_main

    # Make predictions.
    predictions = model_xgb.predict(X)

    # Show predictions.
    st.write("Predictions:")
    st.write(predictions)

    # Plot a confusion matrix of actual vs. predicted labels.
    st.subheader("Confusion Matrix")
    cm = confusion_matrix(y, predictions)
    cm_fig, ax = plt.subplots()
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    # Label the axes object directly instead of relying on pyplot's notion
    # of the "current" axes.
    ax.set_ylabel('Actual')
    ax.set_xlabel('Predicted')
    st.pyplot(cm_fig)
    # pyplot keeps every figure it creates until it is closed; this script
    # is re-executed on each interaction, so close the figure after
    # rendering to stop figures accumulating in the server process.
    plt.close(cm_fig)
|
161 |
+
|
162 |
+
# Feature importance: when the button reports a click, chart the model's
# importances labelled with the processed feature column names.
if st.button('Show Feature Importances'):
    st.bar_chart(pd.Series(model_xgb.feature_importances_, index=X.columns))
|