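# NOTE: the next three lines look like status text captured from the hosting
# page together with the source; they are not part of the program.
# Spaces:
# Sleeping
# Sleeping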
import os
from pathlib import Path

import folium
import joblib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import shap
import streamlit as st
from streamlit_folium import st_folium  # Import st_folium to embed Folium map in Streamlit
from streamlit_shap import st_shap
# Load the model, scaler, one-hot encoder, and pre-processed DataFrames.
# Streamlit re-executes this script on every widget interaction, so the loader
# is wrapped in st.cache_resource to deserialize the artifacts once instead of
# on every rerun.
@st.cache_resource
def load_model_objects():
    """Load the persisted model artifacts from the ``models`` directory.

    The directory is resolved relative to the current working directory, and
    each artifact is read from ``<name>.joblib`` inside it.

    Returns:
        tuple: ``(xgb_clf, scaler, ohe, df, df_sample)``, in that order.

    Raises:
        FileNotFoundError: If one of the expected ``.joblib`` files is missing.

    Note:
        Because the result is cached, the same objects are handed back on
        every rerun; copy a DataFrame before modifying it.
    """
    # NOTE(security): joblib deserializes with pickle, which can execute
    # arbitrary code. Only load artifacts produced by a trusted pipeline.
    models_dir = Path.cwd() / 'models'  # Adjust the 'models' folder if needed
    artifact_names = ('xgb_clf', 'scaler', 'ohe', 'df', 'df_sample')
    return tuple(joblib.load(models_dir / f'{name}.joblib') for name in artifact_names)
# Unpack the artifacts returned by the loader: classifier, scaler, encoder,
# the pre-processed DataFrame, and the sampled DataFrame.
(
    xgb_clf,
    scaler,
    ohe,
    df,
    df_sample,
) = load_model_objects()

# =============================================================================================================================
# Sidebar navigation
# =============================================================================================================================
# The selected label is compared verbatim by the page dispatch below, so the
# spellings here must stay in sync with those comparisons.
page_names = ["Info", "Destribution", "Statistics", "Prediction"]
st.sidebar.title("Navigation")
page = st.sidebar.radio("Go to", page_names)
# =============================================================================================================================
# Info page
# =============================================================================================================================
if page == "Info":  # Landing page: project background and variable glossary
    st.title("Info")
    st.write("Welcome to the Streamlit Dashboard of the FINDEX dataset!")
    st.write(
        "This dashboard provides insights from the final submission in Introduction to business data science. "
        "The data in this app is from the Global Findex 2021 / World Bank survey."
    )
    st.write(
        "The dataset from Findex contains financial inclusion data from 2021. "
        "The data covers various demographics, income, and financial behaviors across multiple countries."
    )

    st.subheader("Understand Business Context - Problem Definition")
    st.write(
        "Based on the data this app will help to understand the financial inclusion of the respondents. "
        "The app will address the following questions."
    )
    # Rendered as a Markdown bullet list; one entry per question.
    research_questions = [
        "Can we predict whether an individual is likely to own a bank account based on income, "
        "education, and other socioeconomic factors?",
        "What factors influence having an account?",
        "What is the correlation between the different variables?",
    ]
    st.write("\n".join(f"- {question}" for question in research_questions))

    st.subheader("Key Variable Descriptions (df_sample)")
    # (display name, description) pairs, rendered as "- **name**: description".
    variable_descriptions = [
        ("Account", "Binary variable indicating whether the respondent has a bank account."),
        ("Income", "Income quintile of the respondent."),
        ("Remittances", "Amount of remittances received by the respondent."),
        ("Education Level", "Education level of the respondent."),
        ("Age", "Respondent's age."),
        ("Gender", "Gender of the respondent."),
        ("Mobile Owner", "Binary variable indicating whether the respondent owns a mobile phone."),
        ("Internet Access", "Binary variable indicating whether the respondent has access to the internet."),
        ("Pay Utilities", "Binary variable indicating if the respondent uses digital payment methods for paying utilities."),
        ("Receive Transfers", "Binary variable indicating if the respondent receives money transfers."),
        ("Receive Pension", "Binary variable indicating if the respondent receives a pension."),
        ("Economy", "Country of the respondent."),
        ("Regionwb", "World Bank region of the respondent."),
        ("Digital Payment Usage", "Binary variable indicating if the respondent uses digital payment methods."),
    ]
    st.write("\n".join(f"- **{name}**: {description}" for name, description in variable_descriptions))
    st.write("All these variables are used to predict the account variable in the prediction page.")
# ============================================================================================================================= | |
# Destribution page | |
# ============================================================================================================================= | |
elif page == "Destribution": # if the page is destribution then show following | |
st.title("Visulisation of the data distribution og the data") | |
st.write("Here is a preview of the Age Distribution:") | |
def plot_age_distribution(data): | |
fig, ax = plt.subplots(figsize=(8, 6)) | |
sns.histplot(data['age'], kde=True) | |
st.pyplot(fig) | |
plot_age_distribution(df) | |
st.write("Here is a preview of the percentage of the different features:") | |
# Dictionary to map numeric codes to their actual meanings | |
mapping_dict = { | |
'mobile_owner': {1: 'Owns mobile phone', 2: 'Does not own', 3: "Don't know"}, | |
'internet_access': {1: 'Has access', 2: 'No access', 3: "Don't know"}, | |
'pay_utilities': {1: 'Paid from account', 2: 'Paid in cash', 3: 'Other method', 4: 'Did not pay'}, | |
'receive_transfers': {1: 'Received via account', 2: 'Received in cash', 3: 'Other method', 4: 'Did not receive'}, | |
'receive_pension': {1: 'Received via account', 2: 'Received in cash', 3: 'Other method', 4: 'Did not receive'}, | |
'education_level': {1: 'Primary or less', 2: 'Secondary', 3: 'Tertiary or more'}, | |
'gender': {1: 'Female', 2: 'Male'}, | |
'account': {1: 'Yes', 0: 'No'}, | |
'digital_payment_usage': {1: 'Yes', 0: 'No'} | |
} | |
# List of categorical/binary features to plot | |
cat_features = [ | |
'account', 'mobile_owner', 'internet_access', | |
'pay_utilities', 'receive_transfers', 'gender', | |
'education_level', 'digital_payment_usage' | |
] | |
# Set up the figure for multiple subplots | |
fig, axes = plt.subplots(4, 2, figsize=(10, 20)) # 2 rows, 4 columns abd the firure size | |
# Flatten axes to easily iterate over them in a single loop | |
axes = axes.flatten() # | |
# Loop through features to create bar plots (instead of doing the same for each plot, we can do it once using loop) | |
for i, col in enumerate(cat_features): | |
# Create a copy of the current column and apply mapping for the plot | |
data_for_plot = df_sample[col].copy().replace(mapping_dict.get(col, {})) # Use copy() to avoid modifying the original data | |
# Calculate percentages for each category | |
percentage_data = data_for_plot.value_counts(normalize=True) * 100 | |
# Plot the bar plot showing percentage distribution | |
sns.barplot(x=percentage_data.index, y=percentage_data.values, ax=axes[i], palette="Blues_d") | |
# Set plot title and labels | |
axes[i].set_title(f'Percentage Distribution of {col}') | |
axes[i].set_ylabel('Percentage (%)') | |
axes[i].set_xlabel(col) | |
# Rotate x-axis labels if there are long categories | |
axes[i].set_xticklabels(axes[i].get_xticklabels(), rotation=45, ha='right') | |
# Adjust layout for better appearance | |
plt.tight_layout() | |
# Display the plot in Streamlit | |
st.pyplot(fig) | |
# ============================================================================================================================= | |
# Statistics page | |
# ============================================================================================================================= | |
elif page == "Statistics": | |
# sidebar filtering settings | |
# Map gender and education level codes to readable labels for the select boxes | |
gender_mapping = {1: 'Female', 2: 'Male'} | |
education_level_mapping = {1: 'Primary or less', 2: 'Secondary', 3: 'Tertiary or more'} | |
# Create new columns for the labels in the df_sample dataframe | |
df_sample['gender_label'] = df_sample['gender'].map(gender_mapping) | |
df_sample['education_level_label'] = df_sample['education_level'].map(education_level_mapping) | |
# Sidebar economy dropdown | |
selected_economy = st.sidebar.multiselect('Select Economy', df_sample['economy'].unique(), default=[]) | |
# Sidebar gender dropdown (using gender_label column) | |
selected_genders = st.sidebar.multiselect('Select Gender', df_sample['gender_label'].unique(), default=[]) | |
# Sidebar education level dropdown (using education_level_label column) | |
selected_educational_level = st.sidebar.multiselect('Select educational level', df_sample['education_level_label'].unique(), default=[]) | |
# Sidebar Age Slider | |
st.sidebar.header('Filter by Age') | |
age_range = st.sidebar.slider('Select Age Range', int(df_sample['age'].min()), int(df_sample['age'].max()), (15, 99)) | |
# Initial filter - apply all conditions cumulatively | |
filtered_data = df_sample[df_sample['age'].between(age_range[0], age_range[1])] | |
# Apply economy filter if selections are made | |
if selected_economy: | |
filtered_data = filtered_data[filtered_data['economy'].isin(selected_economy)] | |
# Apply gender filter based on the gender_label column | |
if selected_genders: | |
filtered_data = filtered_data[filtered_data['gender_label'].isin(selected_genders)] | |
# Apply educational level filter based on the education_level_label column | |
if selected_educational_level: | |
filtered_data = filtered_data[filtered_data['education_level_label'].isin(selected_educational_level)] | |
st.title("Statistics Page") | |
# Check if filtered data is not empty and calculate statistics, otherwise use "N/A" | |
if not filtered_data.empty: | |
mean_age = f"{filtered_data['age'].mean():.2f}" | |
median_age = f"{filtered_data['age'].median():.2f}" | |
max_age = f"{filtered_data['age'].max():.2f}" | |
min_age = f"{filtered_data['age'].min():.2f}" | |
else: | |
mean_age = median_age = max_age = min_age = "N/A" | |
# Display the statistics in columns | |
st.subheader('Age Statistics') | |
col1, col2, col3, col4 = st.columns(4) | |
col1.metric('Mean Age', mean_age) | |
col2.metric('Median Age', median_age) | |
col3.metric('Max Age', max_age) | |
col4.metric('Min Age', min_age) | |
# Add your subheader | |
st.subheader("Boxplot of Age") | |
# Boxplot before applying the cap and hurdle on age | |
st.write("Boxplot of Age - figure showing the distribution") | |
plt.figure(figsize=(8, 4)) # Define the size of the figure | |
sns.boxplot(x='age', data=filtered_data) # Create a boxplot based on "age" | |
plt.title("Boxplot of Age") # Title of the plot | |
st.pyplot(plt) # Display the plot in Streamlit | |
# If filtered data is not empty, continue with analysis | |
if not filtered_data.empty: | |
# Barplot: Account Ownership Distribution by Education Level | |
st.subheader('Account Ownership Distribution by Education Level') | |
# Create a crosstab to show the distribution | |
education_account_dist = pd.crosstab(filtered_data['education_level'], filtered_data['account'], normalize='index') * 100 | |
# Rename columns to be more descriptive | |
education_account_dist.columns = ['No Account (%)', 'Has Account (%)'] | |
# Bar plot for education level distribution | |
fig, ax = plt.subplots(figsize=(10, 6)) | |
education_account_dist.plot(kind='bar', stacked=True, color=['#3498db', '#2ecc71'], ax=ax) | |
ax.set_xlabel('Education Level', fontsize=12) | |
ax.set_ylabel('Percentage of Account Ownership (%)', fontsize=12) | |
ax.set_title('Account Ownership by Education Level', fontsize=14) | |
ax.legend(title='Account Ownership', loc='upper right') | |
plt.xticks(rotation=45, ha='right') | |
# Display the plot | |
st.pyplot(fig) | |
# Barplot: Income Distribution by Account Ownership | |
st.subheader('Income Distribution by Account Ownership') | |
# Create a crosstab to show the distribution | |
income_account_dist = pd.crosstab(filtered_data['income'], filtered_data['account'], normalize='index') * 100 | |
# Rename columns to be more descriptive | |
income_account_dist.columns = ['No Account (%)', 'Has Account (%)'] | |
# Bar plot for income quintile distribution | |
fig, ax = plt.subplots(figsize=(10, 6)) | |
income_account_dist.plot(kind='bar', stacked=True, color=['#3498db', '#2ecc71'], ax=ax) | |
ax.set_xlabel('Income Quintile', fontsize=12) | |
ax.set_ylabel('Percentage of Account Ownership (%)', fontsize=12) | |
ax.set_title('Account Ownership by Income Quintile', fontsize=14) | |
ax.legend(title='Account Ownership', loc='upper right') | |
plt.xticks(rotation=45, ha='right') | |
# Display the plot | |
st.pyplot(fig) | |
# Barplot: Percentage of People Having an Account by Age Group | |
st.subheader('Percentage of People Having an Account by Age Group') | |
# Calculate the proportion of people having an account in each age group | |
account_by_age = filtered_data.groupby('age_group')['account'].mean().reset_index() | |
account_by_age['account'] = (account_by_age['account'] * 100).round(2) | |
# Create the bar plot using Matplotlib and Seaborn | |
fig, ax = plt.subplots(figsize=(10, 6)) | |
sns.barplot(x='age_group', y='account', data=account_by_age, palette="Blues_d", ax=ax) | |
ax.set_xlabel('Age Group', fontsize=12) | |
ax.set_ylabel('Percentage of Account Ownership (%)', fontsize=12) | |
ax.set_title('Percentage of People with an Account by Age Group', fontsize=14) | |
# Add values on top of each bar | |
for index, value in enumerate(account_by_age['account']): | |
ax.text(index, value + 1, f'{value}%', ha='center', fontsize=10) | |
# Rotate x-axis labels for readability | |
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right') | |
# Display the plot in Streamlit | |
st.pyplot(fig) | |
else: | |
st.write("No data available for the selected filters.") | |
# Display filtered data | |
st.write("You can download the filtered data here") | |
st.dataframe(filtered_data) | |
# Prediction page | |
# ============================================================================================================================= | |
elif page == "Prediction": | |
st.title("Prediction Page") | |
# Get valid categories for economy and regionwb from the OneHotEncoder | |
economy_categories = ohe.categories_[0] | |
regionwb_categories = ohe.categories_[1] | |
# Create SHAP explainer | |
explainer = shap.TreeExplainer(xgb_clf) | |
# App description | |
with st.expander("What's this app?"): | |
st.markdown(""" | |
This app predicts whether an individual has a bank account based on their demographic and socioeconomic data. | |
Using advanced AI models trained on relevant data, we provide insights into financial inclusion. | |
Explore the SHAP explanations to understand the key factors behind the predictions! | |
""") | |
st.subheader('Input Your Data') | |
# User input section | |
col1, col2 = st.columns(2) | |
with col1: | |
inc_q = st.selectbox("Income Quintile", options=[1, 2, 3, 4, 5]) | |
remittances = st.selectbox("Receives Remittances", options=[1, 2, 3, 4, 5, 6], | |
format_func=lambda x: ['Via Account', 'Via MTO', 'Cash Only', 'Other Methods', 'None', 'Don’t Know'][x-1]) | |
educ = st.selectbox("Education Level", options=[1, 2, 3], | |
format_func=lambda x: ['Primary or Less', 'Secondary', 'Tertiary'][x-1]) | |
age = st.slider("Age", 18, 100, 30) | |
female = st.selectbox("Gender", options=[1, 2], format_func=lambda x: 'Female' if x == 1 else 'Male') | |
with col2: | |
mobileowner = st.selectbox("Owns Mobile Phone", options=[1, 2, 3, 4], | |
format_func=lambda x: ['Yes', 'No', 'Don’t Know', 'Refused'][x-1]) | |
internetaccess = st.selectbox("Has Internet Access", options=[1, 2, 3, 4], | |
format_func=lambda x: ['Yes', 'No', 'Don’t Know', 'Refused'][x-1]) | |
pay_utilities = st.selectbox("Utility Payment Method", options=[1, 2, 3, 4, 5], | |
format_func=lambda x: ['Account', 'Cash', 'Other', 'None', 'Don’t Know'][x-1]) | |
receive_transfers = st.selectbox("Government Transfer Method", options=[1, 2, 3, 4, 5], | |
format_func=lambda x: ['Account', 'Cash', 'Other', 'None', 'Don’t Know'][x-1]) | |
receive_pension = st.selectbox("Receives Pension", options=[1, 2, 3, 4, 5], | |
format_func=lambda x: ['Account', 'Cash', 'Other', 'None', 'Don’t Know'][x-1]) | |
economy = st.selectbox("Economy", options=economy_categories) # Dynamically populated | |
regionwb = st.selectbox("World Bank Region", options=regionwb_categories) # Dynamically populated | |
# Prediction button | |
if st.button('Predict Bank Account Ownership 🚀'): | |
# Prepare categorical and numerical features | |
cat_features = pd.DataFrame({'economy': [economy], 'regionwb': [regionwb]}) | |
cat_encoded = pd.DataFrame(ohe.transform(cat_features).todense(), columns=ohe.get_feature_names_out(['economy', 'regionwb'])) | |
num_features = pd.DataFrame({ | |
'inc_q': [inc_q], | |
'remittances': [remittances], | |
'educ': [educ], | |
'age': [age], | |
'female': [female], | |
'mobileowner': [mobileowner], | |
'internetaccess': [internetaccess], | |
'pay_utilities': [pay_utilities], | |
'receive_transfers': [receive_transfers], | |
'receive_pension': [receive_pension] | |
}) | |
# Scale numerical features | |
num_scaled = pd.DataFrame(scaler.transform(num_features), columns=num_features.columns) | |
# Combine categorical and numerical features | |
features = pd.concat([num_scaled, cat_encoded], axis=1) | |
# Make prediction | |
prediction = xgb_clf.predict(features)[0] | |
# Display prediction | |
st.metric(label="Bank Account Prediction", value='Has Account' if prediction == 1 else 'No Account') | |
# SHAP explanation | |
st.subheader('Factors Behind the Prediction 🤖') | |
shap_values = explainer.shap_values(features) | |
st_shap(shap.force_plot(explainer.expected_value, shap_values[0], features), height=400, width=600) | |