# Load key libraries and modules
import streamlit as st
import os
import pickle
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
# ---- Page configuration and title ----
st.set_page_config(page_title="Insurance Prediction App", initial_sidebar_state="auto")

# Main page title shown at the top of the app
st.title("Insurance Claim Prediction with Machine Learning")
# ---- Importing and creating other key elements items
# Function to import the Machine Learning toolkit
@st.cache_resource()
def load_ml_toolkit(relative_path):
    """Load the pickled ML toolkit from disk.

    Cached with ``st.cache_resource`` so the unpickling happens once per
    session rather than on every Streamlit rerun.

    Args:
        relative_path (str): File path to the pickled ML toolkit.

    Returns:
        The unpickled object (here, the dict of Machine Learning items).
    """
    with open(relative_path, "rb") as toolkit_file:
        return pickle.load(toolkit_file)
# Function to load the dataset
@st.cache_data()
def load_data(relative_path):
    """Load the training DataFrame used by the app.

    Uses ``st.cache_data`` rather than ``st.cache_resource``: for
    serializable data such as a DataFrame, cache_data hands each caller a
    fresh copy, so downstream mutations cannot corrupt the shared cache
    entry (cache_resource returns the single shared object).

    Args:
        relative_path (str): Relative path to the CSV file to load.

    Returns:
        pd.DataFrame: The dataset read from ``relative_path``.
    """
    dataset = pd.read_csv(relative_path)
    return dataset
# ----- Loading the key components
# Loading the base dataframe
rpath = r"data/train_data.csv"
df_loaded = load_data(rpath)

# Loading the toolkit (provides the fitted scaler and encoder)
loaded_toolkit = load_ml_toolkit(r"src/Streamlit_toolkit")

# Running history of predictions, kept across Streamlit reruns
if "results" not in st.session_state:
    st.session_state["results"] = []

# Instantiating the elements of the Machine Learning Toolkit
scaler = loaded_toolkit["scaler"]
encoder = loaded_toolkit["encoder"]

# The model is loaded from its own JSON dump. (The original code also
# assigned `model = loaded_toolkit["model"]`, which was immediately
# overwritten below — that dead assignment has been removed.)
model = XGBClassifier()
model.load_model(r"src/xgb_model.json")
# ---- Layout containers for the app's sections ----
header = st.container()
dataset = st.container()
features_and_output = st.container()

# Instantiate a form to receive user inputs; cleared after each submission
form = st.form(key="Information", clear_on_submit=True)
# Structure the header section.
# Fixed user-facing typos in the intro text ("built of ... predict the
# whether or not" -> "built off ... predict whether or not") and in the
# sidebar data dictionary (missing closing paren, "fence" -> "fenced").
with header:
    header.write("This app is built off a machine learning model to predict whether or not a building will have an insurance claim over a period based on given variables for which you will make inputs (see the input section below). The model was trained based on the DSN Insurance Prediction dataset.")
    header.write("---")

# Structure the sidebar: a data dictionary for the dataset's columns
st.sidebar.header("Information on Columns")
st.sidebar.markdown("""
- *Customer Id*: Identification number for the Policy holder
- *YearOfObservation*: year of observation for the insured policy
- *Insured_Period*: duration of insurance policy in Olusola Insurance. (Ex: Full year insurance, Policy Duration = 1; 6 months = 0.5)
- *Residential*: is the building a residential building or not
- *Building_Painted*: is the building painted or not (N-Painted, V-Not Painted)
- *Building_Fenced*: is the building fenced or not (N-Fenced, V-Not Fenced)
- *Garden*: building has garden or not (V-has garden; O-no garden)
- *Settlement*: Area where the building is located. (R- rural area; U- urban area)
- *Building Dimension*: Size of the insured building in m2
- *Building_Type*: The type of building (Type 1, 2, 3, 4)
- *Date_of_Occupancy*: date building was first occupied
- *NumberOfWindows*: number of windows in the building
- *Geo Code*: Geographical Code of the Insured building
- *Claim*: target variable. (0: no claim, 1: at least one claim over insured period).
""")
# Structure the dataset section
with dataset:
    # Optional preview of the raw training data
    if dataset.checkbox("Preview the dataset"):
        dataset.write(df_loaded.head())
        dataset.write("Take a look at the sidebar for more information on the columns")
    dataset.write("---")

# Variables the model expects, in input order
expected_inputs = ["YearOfObservation", "Insured_Period", "Residential", "Building_Painted", "Building_Fenced", "Garden", "Settlement", "Building Dimension", "Building_Type", "Date_of_Occupancy", "NumberOfWindows"]

# Categorical features routed through the fitted encoder
categoricals = ["Building_Painted", "Building_Fenced", "Garden", "Settlement"]

# Numeric features routed through the fitted scaler
cols_to_scale = ["YearOfObservation", "Insured_Period", "Residential", "Building Dimension", "Building_Type", "Date_of_Occupancy", "NumberOfWindows"]
# Structure the features and output section
with features_and_output:
    features_and_output.subheader("Inputs")
    features_and_output.write("This section captures the inputs to be used in predictions...")
    left_col, right_col = features_and_output.columns(2)

# Design the input section
with form:
    # Fixed: the original label was "**Inputs Set 1:" — an unterminated
    # bold marker that rendered the asterisks literally; now matches the
    # "**Inputs Set 2**" style below.
    left_col.markdown("**Inputs Set 1**")
    YearOfObservation = left_col.number_input("Select a year:", min_value=2012, step=1)
    Insured_Period = left_col.selectbox("Insured Period (Full year or half-year):", options=[0.5, 1])
    Residential = left_col.radio("Is the building residential (1) or not (0):", options=[0, 1], horizontal=True)
    Building_Painted = left_col.selectbox("Is the building painted (N) or not(V):", options=["N", "V"])
    Building_Fenced = left_col.selectbox("Is the building fenced (N) or not(V):", options=["N", "V"])

    right_col.markdown("**Inputs Set 2**")
    Garden = right_col.radio("Does the building have a garden (V) or not (O):", options=["V", "O"], horizontal=True)
    Settlement = right_col.radio("Is the building situated in a rural (R) or urban (U) area?:", options=["R", "U"], horizontal=True)
    Building_Dimension = right_col.number_input("What is the size of the insured building (m2)?", min_value=1, value=1)
    Building_Type = right_col.selectbox("What type of building is it?", options=[1, 2, 3, 4])
    Date_of_Occupancy = right_col.number_input("On what date was the building first occupied?", min_value=1545, value=1970)
    NumberOfWindows = right_col.select_slider("How many windows does the building have?", options=range(1, 11))

    # Submit button
    submitted = form.form_submit_button(label="Submit")
# Process inputs from user once the form is submitted
if submitted:
    with features_and_output:
        # Collect the form values into a single-row mapping
        user_inputs = {
            "YearOfObservation": [YearOfObservation],
            "Insured_Period": [Insured_Period],
            "Residential": [Residential],
            "Building_Painted": [Building_Painted],
            "Building_Fenced": [Building_Fenced],
            "Garden": [Garden],
            "Settlement": [Settlement],
            "Building Dimension": [Building_Dimension],
            "Building_Type": [Building_Type],
            "Date_of_Occupancy": [Date_of_Occupancy],
            "NumberOfWindows": [NumberOfWindows],
        }

        # One-row DataFrame matching the training layout
        input_data = pd.DataFrame.from_dict(user_inputs)

        # Encode the categorical columns with the fitted encoder
        encoded_cats = pd.DataFrame(
            encoder.transform(input_data[categoricals]),
            columns=encoder.get_feature_names_out().tolist(),
        )

        # Append the encoded columns, then drop the raw categoricals
        # (order matters: the model expects this resulting column layout)
        input_data = input_data.join(encoded_cats)
        input_data.drop(columns=categoricals, inplace=True)

        # Scale the numeric columns with the fitted scaler
        input_data[cols_to_scale] = scaler.transform(input_data[cols_to_scale])

        # Run the model and attach the prediction to the row
        model_output = model.predict(input_data)
        input_data["Prediction"] = model_output
        if model_output[0] == 0:
            display = "The building does not have a claim over the insured period."
        else:
            display = "The building has a claim over the insured period."

        # Add this prediction to the session history
        st.session_state["results"].append(input_data)
        result = pd.concat(st.session_state["results"])

        # Show the current prediction
        st.success(f"**Prediction**: {display}")

        # Expander with all predictions made so far this session
        previous_output = st.expander("**Review previous predictions**")
        previous_output.dataframe(result, use_container_width=True)