# train_model.py
"""Train a decision-tree insurance-claim classifier and persist it with joblib."""
from pathlib import Path

import joblib
import pandas as pd
from sklearn import tree
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


class InsuranceClaimModelTrainer:
    """Load a CSV dataset, fit a preprocessing + decision-tree pipeline, and save it.

    Attributes:
        data_path: Path of the CSV file passed to ``pandas.read_csv``.
        model_path: Destination file for the fitted pipeline.
        model: The fitted ``Pipeline`` once ``train_model`` has run, else ``None``.
        test_accuracy: Accuracy on the held-out split from the last
            ``train_model`` call, else ``None``.
    """

    TARGET_COLUMN = 'insuranceclaim'
    NUMERICAL_FEATURES = ['age', 'bmi', 'children', 'charges']
    CATEGORICAL_FEATURES = ['sex', 'smoker', 'region']
    DEFAULT_MODEL_PATH = 'model/insurance_claim_prediction_model.joblib'
    TEST_SIZE = 0.2
    SPLIT_RANDOM_STATE = 10
    MODEL_RANDOM_STATE = 42

    def __init__(self, data_path, model_path=DEFAULT_MODEL_PATH):
        """Remember where to read the data from and where to write the model.

        Args:
            data_path: CSV file containing the feature columns and the
                ``insuranceclaim`` target column.
            model_path: File the fitted pipeline is dumped to. Defaults to the
                location the script has always used.
        """
        self.data_path = data_path
        self.model_path = model_path
        self.model = None
        self.test_accuracy = None

    def load_data(self):
        """Read the CSV and split it into features and target.

        Returns:
            A ``(X, y)`` tuple: ``X`` is the frame without the target column,
            ``y`` is the target column.

        Raises:
            KeyError: If the target column is absent from the CSV.
        """
        df = pd.read_csv(self.data_path)
        X = df.drop(columns=[self.TARGET_COLUMN])
        y = df[self.TARGET_COLUMN]
        return X, y

    def preprocess_data(self, X):
        """Build the (unfitted) column transformer used ahead of the classifier.

        Numerical columns are standard-scaled and categorical columns are
        one-hot encoded with the first level dropped.

        Args:
            X: Feature frame. Not inspected here; the column names are fixed
                by the class-level feature lists. Kept for interface
                compatibility.

        Returns:
            An unfitted ``ColumnTransformer``.
        """
        numerical_transformer = StandardScaler()
        # NOTE: combining handle_unknown='ignore' with drop= is only accepted
        # by scikit-learn >= 1.0; older releases raise ValueError at fit time.
        categorical_transformer = OneHotEncoder(handle_unknown='ignore', drop='first')

        return ColumnTransformer(
            transformers=[
                ('num', numerical_transformer, self.NUMERICAL_FEATURES),
                ('cat', categorical_transformer, self.CATEGORICAL_FEATURES),
            ]
        )

    def train_model(self):
        """Fit the pipeline on a train split, score it on the test split, save it.

        Side effects:
            Sets ``self.model`` and ``self.test_accuracy``, creates the parent
            directory of ``self.model_path`` if needed, writes the fitted
            pipeline there, and prints progress to stdout.
        """
        X, y = self.load_data()
        preprocessor = self.preprocess_data(X)

        # Preprocessing lives inside the pipeline so the saved artifact can be
        # applied directly to raw feature frames at prediction time.
        self.model = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', tree.DecisionTreeClassifier(random_state=self.MODEL_RANDOM_STATE)),
        ])

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=self.TEST_SIZE, random_state=self.SPLIT_RANDOM_STATE
        )

        self.model.fit(X_train, y_train)

        # The held-out split was previously created but never used; report it
        # so a bad training run is visible before the artifact is shipped.
        self.test_accuracy = self.model.score(X_test, y_test)
        print(f"Test accuracy: {self.test_accuracy:.4f}")

        # joblib.dump does not create missing directories, so make sure the
        # target folder exists instead of failing after training is done.
        Path(self.model_path).parent.mkdir(parents=True, exist_ok=True)
        joblib.dump(self.model, self.model_path)
        print("Model trained and saved successfully!")


if __name__ == "__main__":
    trainer = InsuranceClaimModelTrainer('dataset/insurance2.csv')
    trainer.train_model()