File size: 8,620 Bytes
22f048c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
434dc4c
22f048c
 
 
434dc4c
22f048c
 
 
 
 
 
 
 
 
 
434dc4c
22f048c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
# Load key libraries and modules
import streamlit as st
import os
import pickle
import numpy as np
import pandas as pd
from xgboost import XGBClassifier


# Set the page configurations
# NOTE: initial_sidebar_state="auto" lets Streamlit decide whether the sidebar starts expanded.
st.set_page_config(page_title = "Insurance Prediction App", initial_sidebar_state = "auto")

# Set the page title shown at the top of the app
st.title("Insurance Claim Prediction with Machine Learning")

# ---- Importing and creating other key elements items
# Function to import the Machine Learning toolkit
@st.cache_resource()
def load_ml_toolkit(relative_path):
    """
    Load the pickled Machine Learning toolkit from disk.

    Cached with st.cache_resource so the pickle is only deserialized once
    per session instead of on every rerun.

    Args:
        relative_path (str): Path to the pickle file holding the toolkit.

    Returns:
        object: The unpickled object (in this app, a dict of ML components).
    """
    with open(relative_path, mode="rb") as pickle_file:
        toolkit = pickle.load(pickle_file)
    return toolkit


# Function to load the dataset
# Uses st.cache_data (not st.cache_resource): the return value is a DataFrame,
# and cache_data hands each rerun its own copy, so accidental in-place
# mutation by callers cannot corrupt the cached object.
@st.cache_data()
def load_data(relative_path):
    """
    Load a CSV dataset into a DataFrame, cached across reruns.

    Args:
        relative_path (str): Relative path to the CSV file to load.

    Returns:
        pd.DataFrame: The dataset read from the given path.
    """

    dataset = pd.read_csv(relative_path)
    return dataset

# ----- Loading the key components
# Loading the base dataframe (used for the preview section below)
rpath = r"data/train_data.csv"
df_loaded = load_data(rpath)

# Loading the toolkit (pickled dict of fitted preprocessing objects)
loaded_toolkit = load_ml_toolkit(r"src/Streamlit_toolkit")

# Session-state list that accumulates prediction rows across reruns
if "results" not in st.session_state:
    st.session_state["results"] = []

# Instantiating the preprocessing elements of the Machine Learning Toolkit.
# NOTE: the toolkit's pickled "model" entry is deliberately not used — the
# model is loaded from its exported JSON file below instead, so the earlier
# `model = loaded_toolkit["model"]` dead store has been removed.
scaler = loaded_toolkit["scaler"]
encoder = loaded_toolkit["encoder"]

# Import the model from its JSON export
model = XGBClassifier()
model.load_model(r"src/xgb_model.json")


# Define app sections
header = st.container()
dataset = st.container()
features_and_output = st.container()

# Instantiate a form to receive inputs (cleared after each submission)
form = st.form(key="Information", clear_on_submit = True)

# Structure the header section
with header:
    header.write("This app is built on a machine learning model to predict whether or not a building will have an insurance claim over a period based on given variables for which you will make inputs (see the input section below). The model was trained based on the DSN Insurance Prediction dataset.")
    header.write("---")

# Structure the sidebar: one bullet per dataset column
st.sidebar.header("Information on Columns")
st.sidebar.markdown("""
                    - *Customer Id*: Identification number for the Policy holder
                    - *YearOfObservation*: year of observation for the insured policy
                    - *Insured_Period*: duration of insurance policy in Olusola Insurance. (Ex: Full year insurance, Policy Duration = 1; 6 months = 0.5)
                    - *Residential*: is the building a residential building or not
                    - *Building_Painted*: is the building painted or not (N-Painted, V-Not Painted)
                    - *Building_Fenced*: is the building fenced or not (N-Fenced, V-Not Fenced)
                    - *Garden*: building has garden or not (V-has garden; O-no garden)
                    - *Settlement*: Area where the building is located. (R- rural area; U- urban area)
                    - *Building Dimension*: Size of the insured building in m2
                    - *Building_Type*: The type of building (Type 1, 2, 3, 4)
                    - *Date_of_Occupancy*: date building was first occupied
                    - *NumberOfWindows*: number of windows in the building
                    - *Geo Code*: Geographical Code of the Insured building
                    - *Claim*: target variable. (0: no claim, 1: at least one claim over insured period).
                    """)

# Structure the dataset section (opt-in preview to keep the page light)
with dataset:
    if dataset.checkbox("Preview the dataset"):
        dataset.write(df_loaded.head())
        dataset.write("Take a look at the sidebar for more information on the columns")
    dataset.write("---")

# Define a list of expected variables (the model's full input feature set).
# NOTE(review): expected_inputs is not referenced anywhere else in this
# file's visible code — presumably kept as documentation of the expected
# feature order; confirm before removing.
expected_inputs = ["YearOfObservation", "Insured_Period", "Residential", "Building_Painted", "Building_Fenced", "Garden", "Settlement", "Building Dimension", "Building_Type", "Date_of_Occupancy", "NumberOfWindows"]

# List of features to encode (categorical string columns fed to the encoder)
categoricals = ["Building_Painted", "Building_Fenced", "Garden", "Settlement"]

# List of features to scale (numeric columns fed to the scaler)
cols_to_scale = ["YearOfObservation", "Insured_Period", "Residential", "Building Dimension", "Building_Type", "Date_of_Occupancy", "NumberOfWindows"]


# Structure the features and output section
with features_and_output:
    features_and_output.subheader("Inputs")
    features_and_output.write("This section captures the inputs to be used in predictions...")

    left_col, right_col = features_and_output.columns(2)

    # Design the input section.
    # Widgets are placed in the two columns but registered on the form, so
    # their values are only delivered together when the form is submitted.
    with form:
        # Fixed unbalanced markdown bold: was "**Inputs Set 1:" which rendered
        # literal asterisks; now matches the right column's "**Inputs Set 2**".
        left_col.markdown("**Inputs Set 1**")
        YearOfObservation = left_col.number_input("Select a year:", min_value = 2012, step = 1)
        Insured_Period = left_col.selectbox("Insured Period (Full year or half-year):", options = [0.5,1])
        Residential = left_col.radio("Is the building residential (1) or not (0):", options= [0, 1], horizontal = True)
        Building_Painted = left_col.selectbox("Is the building painted (N) or not(V):", options = ["N", "V"])
        Building_Fenced =  left_col.selectbox("Is the building fenced (N) or not(V):", options = ["N", "V"])

        right_col.markdown("**Inputs Set 2**")
        Garden = right_col.radio("Does the building have a garden (V) or not (O):", options = ["V", "O"], horizontal = True)
        Settlement = right_col.radio("Is the building situated in a rural (R) or urban (U) area?:", options = ["R", "U"], horizontal = True)
        Building_Dimension = right_col.number_input("What is the size of the insured building (m2)?", min_value= 1, value= 1)
        Building_Type = right_col.selectbox("What type of building is it?", options = [1,2,3,4])
        Date_of_Occupancy = right_col.number_input("On what date was the building first occupied?", min_value= 1545, value= 1970)
        NumberOfWindows = right_col.select_slider("How many windows does the building have?", options= range(1,11))

        # Submit button — `submitted` is True only on the rerun triggered by a click
        submitted = form.form_submit_button(label= "Submit")

# Process inputs from user (runs only on the rerun where the form was submitted)
if submitted:
    with features_and_output:
        # Inputs formatting: collect the widget values into a one-row dict.
        # Keys must match the training-data column names exactly — note the
        # space in "Building Dimension" versus the underscore variable name.
        input_dict = {
            "YearOfObservation": [YearOfObservation],
            "Insured_Period": [Insured_Period],
            "Residential": [Residential],
            "Building_Painted": [Building_Painted],
            "Building_Fenced": [Building_Fenced],
            "Garden": [Garden],
            "Settlement": [Settlement],
            "Building Dimension": [Building_Dimension],
            "Building_Type": [Building_Type],
            "Date_of_Occupancy": [Date_of_Occupancy],
            "NumberOfWindows": [NumberOfWindows]
            }

        # Converting the input into a single-row dataframe
        input_data = pd.DataFrame.from_dict(input_dict)
    
        # Encode the categorical columns.
        # NOTE(review): `get_feature_names_out` suggests an sklearn encoder
        # (e.g. OneHotEncoder) — confirm its transform returns a dense array,
        # since the result is passed straight to pd.DataFrame.
        encoded_test_categoricals = encoder.transform(input_data[categoricals])
        encoded_test_categoricals = pd.DataFrame(encoded_test_categoricals, columns = encoder.get_feature_names_out().tolist())
        
        # Add the encoded categoricals to the DataFrame and dropping the original columns
        input_data = input_data.join(encoded_test_categoricals)
        input_data.drop(columns= categoricals, inplace= True)

        # Scale the numeric columns with the fitted scaler (overwrites in place)
        input_data[cols_to_scale] = scaler.transform(input_data[cols_to_scale])
        
        # Make the prediction.
        # NOTE(review): assumes the resulting column order matches what the
        # model saw at training time — confirm against the training pipeline.
        model_output = model.predict(input_data)
        input_data["Prediction"] = model_output
        
        # Map class 0/1 to a human-readable message
        if model_output[0] == 0:
            display = "The building does not have a claim over the insured period."
        else:
            display = "The building has a claim over the insured period."
        # Adding the predictions to previous predictions kept in session state,
        # then concatenating the whole history for display
        st.session_state["results"].append(input_data)
        result = pd.concat(st.session_state["results"])

    # Displaying prediction results
    st.success(f"**Prediction**: {display}")

    # Expander to display previous predictions accumulated this session
    previous_output = st.expander("**Review previous predictions**")
    previous_output.dataframe(result, use_container_width= True)