File size: 18,874 Bytes
2e82d86
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import streamlit as st
from pathlib import Path
import os
import joblib
import shap
from streamlit_shap import st_shap
from streamlit_folium import st_folium  # Import st_folium to embed Folium map in Streamlit
import folium


# Load the model, scaler, one-hot encoder, and pre-processed DataFrame
@st.cache_resource  # Keep the loaded objects cached so they are not re-read on every interaction
def load_model_objects():
    """Load the persisted artefacts from the ``models`` folder of the working directory.

    Each artefact is read with ``joblib.load`` from ``<cwd>/models/<name>.joblib``.

    Returns:
        tuple: ``(xgb_clf, scaler, ohe, df, df_sample)`` -- the classifier, the scaler,
        the one-hot encoder, the pre-processed DataFrame and the sampled DataFrame,
        in that order.
    """
    # Order matters: callers unpack the result positionally.
    artefact_names = ('xgb_clf', 'scaler', 'ohe', 'df', 'df_sample')
    return tuple(
        joblib.load(os.path.join(os.getcwd(), 'models', f'{artefact}.joblib'))
        for artefact in artefact_names
    )


# Module-level handles used by every page below
xgb_clf, scaler, ohe, df, df_sample = load_model_objects()



# =============================================================================================================================
# Sidebar navigation
# =============================================================================================================================

# The selected option string is compared verbatim by the page branches below.
page_options = ["Info", "Destribution", "Statistics", "Prediction"]
st.sidebar.title("Navigation")
page = st.sidebar.radio("Go to", page_options)



# =============================================================================================================================
# Info page
# =============================================================================================================================
if page == "Info":  # Landing page: static descriptive text only
    st.title("Info")

    # Introductory paragraphs, emitted one st.write call per paragraph
    for intro_paragraph in (
        "Welcome to the Streamlit Dashboard of the FINDEX dataset!",
        "This dashboard provides insights from the final submission in Introduction to business data science. The data in this app is from the Global Findex 2021 / World Bank survey.",
        "The dataset from Findex contains financial inclusion data from 2021. The data covers various demographics, income, and financial behaviors across multiple countries.",
    ):
        st.write(intro_paragraph)

    # Questions the app sets out to answer (rendered as a bullet list)
    research_questions = """

    - Can we predict whether an individual is likely to own a bank account based on income, education, and other socioeconomic factors?

    - What factors influence on having and account?

    - How is the correlation between the diffenrent varibles? 

    """

    # One bullet per variable described on this page
    variable_descriptions = """

    - **Account**: Binary variable indicating whether the respondent has a bank account.

    - **Income**: Income quintile of the respondent.

    - **Remittances**: Amount of remittances received by the respondent.

    - **Education Level**: Education level of the respondent.

    - **Age**: Respondent's age.

    - **Gender**: Gender of the respondent.

    - **Mobile Owner**: Binary variable indicating whether the respondent owns a mobile phone.

    - **Internet Access**: Binary variable indicating whether the respondent has access to the internet.

    - **Pay Utilities**: Binary variable indicating if the respondent uses digital payment methods for paying utilities.

    - **Receive Transfers**: Binary variable indicating if the respondent receives money transfers.

    - **Receive Pension**: Binary variable indicating if the respondent receives a pension.

    - **Economy**: Country of the respondent.

    - **Regionwb**: World Bank region of the respondent.

    - **Digital Payment Usage**: Binary variable indicating if the respondent uses digital payment methods.

    """

    # Section 1: business context
    st.subheader("Understand Business Context - Problem Definition")
    st.write("Based on the data this app will help to understand the financial inclusion of the respondents. The app wil adress the following questions.")
    st.write(research_questions)

    # Section 2: variable glossary
    st.subheader("Key Variable Descriptions (df_sample)")
    st.write(variable_descriptions)
    st.write("All these variables are used to predict the account variable in the prediction page.")


# =============================================================================================================================
# Destribution page 
# =============================================================================================================================
elif page == "Destribution": # if the page is destribution then show following

    st.title("Visulisation of the data distribution og the data")
    
    st.write("Here is a preview of the Age Distribution:")
    def plot_age_distribution(data):
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.histplot(data['age'], kde=True)
        st.pyplot(fig)
    plot_age_distribution(df)


    st.write("Here is a preview of the percentage of the different features:")

    # Dictionary to map numeric codes to their actual meanings
    mapping_dict = {
        'mobile_owner': {1: 'Owns mobile phone', 2: 'Does not own', 3: "Don't know"},
        'internet_access': {1: 'Has access', 2: 'No access', 3: "Don't know"},
        'pay_utilities': {1: 'Paid from account', 2: 'Paid in cash', 3: 'Other method', 4: 'Did not pay'},
        'receive_transfers': {1: 'Received via account', 2: 'Received in cash', 3: 'Other method', 4: 'Did not receive'},
        'receive_pension': {1: 'Received via account', 2: 'Received in cash', 3: 'Other method', 4: 'Did not receive'},
        'education_level': {1: 'Primary or less', 2: 'Secondary', 3: 'Tertiary or more'},
        'gender': {1: 'Female', 2: 'Male'},
        'account': {1: 'Yes', 0: 'No'},
        'digital_payment_usage': {1: 'Yes', 0: 'No'}
    }

    # List of categorical/binary features to plot
    cat_features = [
        'account', 'mobile_owner', 'internet_access', 
        'pay_utilities', 'receive_transfers', 'gender', 
        'education_level', 'digital_payment_usage'
    ]

    # Set up the figure for multiple subplots
    fig, axes = plt.subplots(4, 2, figsize=(10, 20))  # 2 rows, 4 columns abd the firure size

    # Flatten axes to easily iterate over them in a single loop
    axes = axes.flatten() #

    # Loop through features to create bar plots (instead of doing the same for each plot, we can do it once using loop)
    for i, col in enumerate(cat_features):
        # Create a copy of the current column and apply mapping for the plot
        data_for_plot = df_sample[col].copy().replace(mapping_dict.get(col, {}))  # Use copy() to avoid modifying the original data

        # Calculate percentages for each category
        percentage_data = data_for_plot.value_counts(normalize=True) * 100

        # Plot the bar plot showing percentage distribution
        sns.barplot(x=percentage_data.index, y=percentage_data.values, ax=axes[i], palette="Blues_d")

        # Set plot title and labels
        axes[i].set_title(f'Percentage Distribution of {col}')
        axes[i].set_ylabel('Percentage (%)')
        axes[i].set_xlabel(col)

        # Rotate x-axis labels if there are long categories
        axes[i].set_xticklabels(axes[i].get_xticklabels(), rotation=45, ha='right')

    # Adjust layout for better appearance
    plt.tight_layout()

    # Display the plot in Streamlit
    st.pyplot(fig)



# =============================================================================================================================
# Statistics page 
# =============================================================================================================================
elif page == "Statistics":

    # sidebar filtering settings 
    # Map gender and education level codes to readable labels for the select boxes
    gender_mapping = {1: 'Female', 2: 'Male'}
    education_level_mapping = {1: 'Primary or less', 2: 'Secondary', 3: 'Tertiary or more'}

    # Create new columns for the labels in the df_sample dataframe
    df_sample['gender_label'] = df_sample['gender'].map(gender_mapping)
    df_sample['education_level_label'] = df_sample['education_level'].map(education_level_mapping)

    # Sidebar economy dropdown
    selected_economy = st.sidebar.multiselect('Select Economy', df_sample['economy'].unique(), default=[])

    # Sidebar gender dropdown (using gender_label column)
    selected_genders = st.sidebar.multiselect('Select Gender', df_sample['gender_label'].unique(), default=[])

    # Sidebar education level dropdown (using education_level_label column)
    selected_educational_level = st.sidebar.multiselect('Select educational level', df_sample['education_level_label'].unique(), default=[])

    # Sidebar Age Slider
    st.sidebar.header('Filter by Age')
    age_range = st.sidebar.slider('Select Age Range', int(df_sample['age'].min()), int(df_sample['age'].max()), (15, 99))

    # Initial filter - apply all conditions cumulatively
    filtered_data = df_sample[df_sample['age'].between(age_range[0], age_range[1])]

    # Apply economy filter if selections are made
    if selected_economy:
        filtered_data = filtered_data[filtered_data['economy'].isin(selected_economy)]

    # Apply gender filter based on the gender_label column
    if selected_genders:
        filtered_data = filtered_data[filtered_data['gender_label'].isin(selected_genders)]

    # Apply educational level filter based on the education_level_label column
    if selected_educational_level:
        filtered_data = filtered_data[filtered_data['education_level_label'].isin(selected_educational_level)]


    st.title("Statistics Page")
    # Check if filtered data is not empty and calculate statistics, otherwise use "N/A"
    if not filtered_data.empty:
        mean_age = f"{filtered_data['age'].mean():.2f}"
        median_age = f"{filtered_data['age'].median():.2f}"
        max_age = f"{filtered_data['age'].max():.2f}"
        min_age = f"{filtered_data['age'].min():.2f}"
    else:
        mean_age = median_age = max_age = min_age = "N/A"

    # Display the statistics in columns
    st.subheader('Age Statistics')
    col1, col2, col3, col4 = st.columns(4)

    col1.metric('Mean Age', mean_age)
    col2.metric('Median Age', median_age)
    col3.metric('Max Age', max_age)
    col4.metric('Min Age', min_age)



    # Add your subheader
    st.subheader("Boxplot of Age")

    # Boxplot before applying the cap and hurdle on age
    st.write("Boxplot of Age - figure showing the distribution")
    plt.figure(figsize=(8, 4))  # Define the size of the figure
    sns.boxplot(x='age', data=filtered_data)  # Create a boxplot based on "age"
    plt.title("Boxplot of Age")  # Title of the plot
    st.pyplot(plt)  # Display the plot in Streamlit


    # If filtered data is not empty, continue with analysis
    if not filtered_data.empty:
        # Barplot: Account Ownership Distribution by Education Level    
        st.subheader('Account Ownership Distribution by Education Level')

        # Create a crosstab to show the distribution
        education_account_dist = pd.crosstab(filtered_data['education_level'], filtered_data['account'], normalize='index') * 100

        # Rename columns to be more descriptive
        education_account_dist.columns = ['No Account (%)', 'Has Account (%)']

        # Bar plot for education level distribution
        fig, ax = plt.subplots(figsize=(10, 6))
        education_account_dist.plot(kind='bar', stacked=True, color=['#3498db', '#2ecc71'], ax=ax)

        ax.set_xlabel('Education Level', fontsize=12)
        ax.set_ylabel('Percentage of Account Ownership (%)', fontsize=12)
        ax.set_title('Account Ownership by Education Level', fontsize=14)
        ax.legend(title='Account Ownership', loc='upper right')
        plt.xticks(rotation=45, ha='right')

        # Display the plot
        st.pyplot(fig)



        # Barplot: Income Distribution by Account Ownership
        st.subheader('Income Distribution by Account Ownership')

        # Create a crosstab to show the distribution
        income_account_dist = pd.crosstab(filtered_data['income'], filtered_data['account'], normalize='index') * 100

        # Rename columns to be more descriptive
        income_account_dist.columns = ['No Account (%)', 'Has Account (%)']

        # Bar plot for income quintile distribution
        fig, ax = plt.subplots(figsize=(10, 6))
        income_account_dist.plot(kind='bar', stacked=True, color=['#3498db', '#2ecc71'], ax=ax)

        ax.set_xlabel('Income Quintile', fontsize=12)
        ax.set_ylabel('Percentage of Account Ownership (%)', fontsize=12)
        ax.set_title('Account Ownership by Income Quintile', fontsize=14)
        ax.legend(title='Account Ownership', loc='upper right')
        plt.xticks(rotation=45, ha='right')

        # Display the plot
        st.pyplot(fig)



        # Barplot: Percentage of People Having an Account by Age Group
        st.subheader('Percentage of People Having an Account by Age Group')

        # Calculate the proportion of people having an account in each age group
        account_by_age = filtered_data.groupby('age_group')['account'].mean().reset_index()
        account_by_age['account'] = (account_by_age['account'] * 100).round(2)

        # Create the bar plot using Matplotlib and Seaborn
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.barplot(x='age_group', y='account', data=account_by_age, palette="Blues_d", ax=ax)
        ax.set_xlabel('Age Group', fontsize=12)
        ax.set_ylabel('Percentage of Account Ownership (%)', fontsize=12)
        ax.set_title('Percentage of People with an Account by Age Group', fontsize=14)

        # Add values on top of each bar
        for index, value in enumerate(account_by_age['account']):
            ax.text(index, value + 1, f'{value}%', ha='center', fontsize=10)

        # Rotate x-axis labels for readability
        ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')

        # Display the plot in Streamlit
        st.pyplot(fig)

    else:
        st.write("No data available for the selected filters.")


    # Display filtered data
    st.write("You can download the filtered data here")
    st.dataframe(filtered_data)



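# =============================================================================================================================
# Prediction page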
# =============================================================================================================================
elif page == "Prediction":
    st.title("Prediction Page")

    # Get valid categories for economy and regionwb from the OneHotEncoder
    economy_categories = ohe.categories_[0]
    regionwb_categories = ohe.categories_[1]

    # Create SHAP explainer
    explainer = shap.TreeExplainer(xgb_clf)

    # App description
    with st.expander("What's this app?"):
        st.markdown("""

        This app predicts whether an individual has a bank account based on their demographic and socioeconomic data.

        Using advanced AI models trained on relevant data, we provide insights into financial inclusion. 

        Explore the SHAP explanations to understand the key factors behind the predictions!

        """)

    st.subheader('Input Your Data')

    # User input section
    col1, col2 = st.columns(2)

    with col1:
        inc_q = st.selectbox("Income Quintile", options=[1, 2, 3, 4, 5])
        remittances = st.selectbox("Receives Remittances", options=[1, 2, 3, 4, 5, 6],
                                format_func=lambda x: ['Via Account', 'Via MTO', 'Cash Only', 'Other Methods', 'None', 'Don’t Know'][x-1])
        educ = st.selectbox("Education Level", options=[1, 2, 3], 
                            format_func=lambda x: ['Primary or Less', 'Secondary', 'Tertiary'][x-1])
        age = st.slider("Age", 18, 100, 30)
        female = st.selectbox("Gender", options=[1, 2], format_func=lambda x: 'Female' if x == 1 else 'Male')

    with col2:
        mobileowner = st.selectbox("Owns Mobile Phone", options=[1, 2, 3, 4],
                                format_func=lambda x: ['Yes', 'No', 'Don’t Know', 'Refused'][x-1])
        internetaccess = st.selectbox("Has Internet Access", options=[1, 2, 3, 4],
                                    format_func=lambda x: ['Yes', 'No', 'Don’t Know', 'Refused'][x-1])
        pay_utilities = st.selectbox("Utility Payment Method", options=[1, 2, 3, 4, 5],
                                    format_func=lambda x: ['Account', 'Cash', 'Other', 'None', 'Don’t Know'][x-1])
        receive_transfers = st.selectbox("Government Transfer Method", options=[1, 2, 3, 4, 5],
                                        format_func=lambda x: ['Account', 'Cash', 'Other', 'None', 'Don’t Know'][x-1])
        receive_pension = st.selectbox("Receives Pension", options=[1, 2, 3, 4, 5],
                                    format_func=lambda x: ['Account', 'Cash', 'Other', 'None', 'Don’t Know'][x-1])
        economy = st.selectbox("Economy", options=economy_categories)  # Dynamically populated
        regionwb = st.selectbox("World Bank Region", options=regionwb_categories)  # Dynamically populated

    # Prediction button
    if st.button('Predict Bank Account Ownership 🚀'):
        # Prepare categorical and numerical features
        cat_features = pd.DataFrame({'economy': [economy], 'regionwb': [regionwb]})
        cat_encoded = pd.DataFrame(ohe.transform(cat_features).todense(), columns=ohe.get_feature_names_out(['economy', 'regionwb']))
        
        num_features = pd.DataFrame({
            'inc_q': [inc_q],
            'remittances': [remittances],
            'educ': [educ],
            'age': [age],
            'female': [female],
            'mobileowner': [mobileowner],
            'internetaccess': [internetaccess],
            'pay_utilities': [pay_utilities],
            'receive_transfers': [receive_transfers],
            'receive_pension': [receive_pension]
        })
        
        # Scale numerical features
        num_scaled = pd.DataFrame(scaler.transform(num_features), columns=num_features.columns)
        
        # Combine categorical and numerical features
        features = pd.concat([num_scaled, cat_encoded], axis=1)
        
        # Make prediction
        prediction = xgb_clf.predict(features)[0]
        
        # Display prediction
        st.metric(label="Bank Account Prediction", value='Has Account' if prediction == 1 else 'No Account')
        
        # SHAP explanation
        st.subheader('Factors Behind the Prediction 🤖')
        shap_values = explainer.shap_values(features)
        st_shap(shap.force_plot(explainer.expected_value, shap_values[0], features), height=400, width=600)