# Load key libraries and modules
import os
import pickle

import numpy as np
import pandas as pd
import streamlit as st
from xgboost import XGBClassifier

# Set the page configurations
st.set_page_config(page_title="Insurance Prediction App", initial_sidebar_state="auto")

# Set the page title
st.title("Insurance Claim Prediction with Machine Learning")


# ---- Importing and creating other key elements items

@st.cache_resource()
def load_ml_toolkit(relative_path):
    """Load the pickled ML toolkit into this file.

    Args:
        relative_path (str): File path to the ML toolkit to load.

    Returns:
        object: The unpickled object (a dict of Machine Learning items here,
        holding at least the fitted "scaler" and "encoder").
    """
    # NOTE(review): pickle.load is unsafe on untrusted data — this assumes the
    # toolkit file is shipped with (and trusted by) the app itself.
    with open(relative_path, "rb") as file:
        loaded_object = pickle.load(file)
    return loaded_object


# st.cache_data (not cache_resource) is the correct primitive for DataFrames:
# it returns a copy per caller, so later mutations cannot corrupt the cache.
@st.cache_data()
def load_data(relative_path):
    """Load the dataset (a CSV file) as a DataFrame.

    Args:
        relative_path (str): The relative path to the CSV to be loaded.

    Returns:
        pd.DataFrame: The DataFrame read from the path provided.
    """
    dataset = pd.read_csv(relative_path)
    return dataset


# ----- Loading the key components
# Loading the base dataframe
rpath = r"data/train_data.csv"
df_loaded = load_data(rpath)

# Loading the toolkit
loaded_toolkit = load_ml_toolkit(r"src/Streamlit_toolkit")

# Session-state list that accumulates one DataFrame row per prediction made.
if "results" not in st.session_state:
    st.session_state["results"] = []

# Instantiating the elements of the Machine Learning Toolkit
scaler = loaded_toolkit["scaler"]
encoder = loaded_toolkit["encoder"]

# Import the model. The JSON checkpoint is the model actually used; the
# toolkit's pickled model (if any) is intentionally not read, since it was
# previously overwritten here anyway.
model = XGBClassifier()
model.load_model(r"src/xgb_model.json")

# Define app sections
header = st.container()
dataset = st.container()
features_and_output = st.container()

# Instantiate a form to receive inputs
form = st.form(key="Information", clear_on_submit=True)

# Structure the header section
with header:
    header.write(
        "This app is built on a machine learning model to predict whether or not "
        "a building will have an insurance claim over a period based on given "
        "variables for which you will make inputs (see the input section below). "
        "The model was trained based on the DSN Insurance Prediction dataset."
    )
    header.write("---")

# Structure the sidebar
st.sidebar.header("Information on Columns")
st.sidebar.markdown("""
- *Customer Id*: Identification number for the Policy holder
- *YearOfObservation*: year of observation for the insured policy
- *Insured_Period*: duration of insurance policy in Olusola Insurance. (Ex: Full year insurance, Policy Duration = 1; 6 months = 0.5)
- *Residential*: is the building a residential building or not
- *Building_Painted*: is the building painted or not (N-Painted, V-Not Painted)
- *Building_Fenced*: is the building fence or not (N-Fenced, V-Not Fenced)
- *Garden*: building has garden or not (V-has garden; O-no garden)
- *Settlement*: Area where the building is located. (R- rural area; U- urban area)
- *Building Dimension*: Size of the insured building in m2
- *Building_Type*: The type of building (Type 1, 2, 3, 4)
- *Date_of_Occupancy*: date building was first occupied
- *NumberOfWindows*: number of windows in the building
- *Geo Code*: Geographical Code of the Insured building
- *Claim*: target variable. (0: no claim, 1: at least one claim over insured period).
""")

# Structure the dataset section
with dataset:
    if dataset.checkbox("Preview the dataset"):
        dataset.write(df_loaded.head())
        dataset.write("Take a look at the sidebar for more information on the columns")
    # NOTE(review): original indentation was lost; the divider is rendered
    # unconditionally here — confirm against the deployed app.
    dataset.write("---")

# Define a list of expected variables
expected_inputs = [
    "YearOfObservation", "Insured_Period", "Residential", "Building_Painted",
    "Building_Fenced", "Garden", "Settlement", "Building Dimension",
    "Building_Type", "Date_of_Occupancy", "NumberOfWindows",
]

# List of features to encode
categoricals = ["Building_Painted", "Building_Fenced", "Garden", "Settlement"]

# List of features to scale
cols_to_scale = [
    "YearOfObservation", "Insured_Period", "Residential", "Building Dimension",
    "Building_Type", "Date_of_Occupancy", "NumberOfWindows",
]

# Structure the features and output section
with features_and_output:
    features_and_output.subheader("Inputs")
    features_and_output.write("This section captures the inputs to be used in predictions...")
    left_col, right_col = features_and_output.columns(2)

# Design the input section
with form:
    # Fixed unbalanced bold marker (was "**Inputs Set 1:").
    left_col.markdown("**Inputs Set 1**")
    YearOfObservation = left_col.number_input("Select a year:", min_value=2012, step=1)
    Insured_Period = left_col.selectbox("Insured Period (Full year or half-year):", options=[0.5, 1])
    Residential = left_col.radio("Is the building residential (1) or not (0):", options=[0, 1], horizontal=True)
    Building_Painted = left_col.selectbox("Is the building painted (N) or not(V):", options=["N", "V"])
    Building_Fenced = left_col.selectbox("Is the building fenced (N) or not(V):", options=["N", "V"])

    right_col.markdown("**Inputs Set 2**")
    Garden = right_col.radio("Does the building have a garden (V) or not (O):", options=["V", "O"], horizontal=True)
    Settlement = right_col.radio("Is the building situated in a rural (R) or urban (U) area?:", options=["R", "U"], horizontal=True)
    Building_Dimension = right_col.number_input("What is the size of the insured building (m2)?", min_value=1, value=1)
    Building_Type = right_col.selectbox("What type of building is it?", options=[1, 2, 3, 4])
    Date_of_Occupancy = right_col.number_input("On what date was the building first occupied?", min_value=1545, value=1970)
    NumberOfWindows = right_col.select_slider("How many windows does the building have?", options=range(1, 11))

    # Submit button
    submitted = form.form_submit_button(label="Submit")

# Process inputs from user
if submitted:
    with features_and_output:
        # Inputs formatting: single-row frame keyed by the training column names
        # (note "Building Dimension" keeps its space, as in the training data).
        input_dict = {
            "YearOfObservation": [YearOfObservation],
            "Insured_Period": [Insured_Period],
            "Residential": [Residential],
            "Building_Painted": [Building_Painted],
            "Building_Fenced": [Building_Fenced],
            "Garden": [Garden],
            "Settlement": [Settlement],
            "Building Dimension": [Building_Dimension],
            "Building_Type": [Building_Type],
            "Date_of_Occupancy": [Date_of_Occupancy],
            "NumberOfWindows": [NumberOfWindows],
        }

        # Converting the input into a dataframe
        input_data = pd.DataFrame.from_dict(input_dict)

        # Encode the categorical columns with the fitted encoder from the toolkit
        encoded_test_categoricals = encoder.transform(input_data[categoricals])
        encoded_test_categoricals = pd.DataFrame(
            encoded_test_categoricals,
            columns=encoder.get_feature_names_out().tolist(),
        )

        # Add the encoded categoricals to the DataFrame and drop the originals
        input_data = input_data.join(encoded_test_categoricals)
        input_data.drop(columns=categoricals, inplace=True)

        # Scale the numeric columns with the fitted scaler from the toolkit
        input_data[cols_to_scale] = scaler.transform(input_data[cols_to_scale])

        # Make the prediction (0 = no claim, 1 = at least one claim)
        model_output = model.predict(input_data)
        input_data["Prediction"] = model_output

        if model_output[0] == 0:
            display = "The building does not have a claim over the insured period."
        else:
            display = "The building has a claim over the insured period."

        # Adding the predictions to previous predictions
        st.session_state["results"].append(input_data)
        result = pd.concat(st.session_state["results"])

        # Displaying prediction results
        st.success(f"**Prediction**: {display}")

        # Expander to display previous predictions
        previous_output = st.expander("**Review previous predictions**")
        previous_output.dataframe(result, use_container_width=True)