|
|
|
|
|
import pandas as pd
|
|
from sklearn.model_selection import train_test_split
|
|
from sklearn import tree
|
|
from sklearn.pipeline import Pipeline
|
|
from sklearn.compose import ColumnTransformer
|
|
from sklearn.preprocessing import StandardScaler, OneHotEncoder
|
|
import joblib
|
|
|
|
class InsuranceClaimModelTrainer:
    """Train a decision-tree pipeline on a CSV dataset and persist it with joblib.

    The CSV at ``data_path`` must contain the target column ``insuranceclaim``
    plus the feature columns referenced in :meth:`preprocess_data`.
    """

    def __init__(self, data_path):
        """Remember where the training CSV lives; nothing is read yet.

        Args:
            data_path: Path (or anything ``pandas.read_csv`` accepts) of the
                training dataset.
        """
        self.data_path = data_path
        # Populated by train_model() with the pipeline it builds.
        self.model = None

    def load_data(self):
        """Read the CSV and split it into features and target.

        Returns:
            A ``(X, y)`` tuple: ``X`` is the DataFrame without the
            ``insuranceclaim`` column, ``y`` is that column as a Series.

        Raises:
            KeyError: If the CSV has no ``insuranceclaim`` column.
        """
        target = 'insuranceclaim'
        df = pd.read_csv(self.data_path)
        X = df.drop(columns=[target])
        y = df[target]
        return X, y

    def preprocess_data(self, X):
        """Build the (unfitted) column preprocessor used by the pipeline.

        Args:
            X: Feature frame. Currently unused — the column lists below are
                fixed — but kept so existing callers keep working.

        Returns:
            A ``ColumnTransformer`` that standard-scales the numerical columns
            and one-hot encodes the categorical ones.
        """
        numerical_features = ['age', 'bmi', 'children', 'charges']
        numerical_transformer = StandardScaler()

        categorical_features = ['sex', 'smoker', 'region']
        categorical_transformer = OneHotEncoder(handle_unknown='ignore', drop='first')

        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numerical_transformer, numerical_features),
                ('cat', categorical_transformer, categorical_features),
            ])

        return preprocessor

    def train_model(self, model_path='model/insurance_claim_prediction_model.joblib'):
        """Fit the pipeline on an 80/20 split and save it to ``model_path``.

        Args:
            model_path: Destination file for the serialized pipeline. Missing
                parent directories are created. Defaults to the location this
                script has always written to.

        Returns:
            The fitted pipeline's score on the held-out 20% split (mean
            accuracy for a classifier pipeline).
        """
        # Local import keeps this block self-contained without touching the
        # file-level import section.
        import os

        X, y = self.load_data()
        preprocessor = self.preprocess_data(X)

        self.model = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', tree.DecisionTreeClassifier(random_state=42)),
        ])

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=10)

        self.model.fit(X_train, y_train)
        # The held-out split used to be computed and discarded; score it so
        # the caller gets a sanity check on the saved model.
        test_accuracy = self.model.score(X_test, y_test)

        # joblib.dump does not create directories; without this a fresh
        # checkout (no "model/" folder) fails after training has already run.
        output_dir = os.path.dirname(model_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        joblib.dump(self.model, model_path)

        print("Model trained and saved successfully!")
        return test_accuracy
|
|
|
|
if __name__ == "__main__":
    # Script entry point: train on the bundled dataset and save the model.
    InsuranceClaimModelTrainer(data_path='dataset/insurance2.csv').train_model()
|
|
|