# train_model.py

from pathlib import Path

import joblib
import pandas as pd
from sklearn import tree
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

class InsuranceClaimModelTrainer:
    """Train a decision-tree pipeline on an insurance-claim CSV and save it.

    The pipeline scales the numerical columns, one-hot encodes the
    categorical columns and fits a ``DecisionTreeClassifier``. The fitted
    pipeline is kept on ``self.model`` and written to disk with joblib.
    """

    # Name of the label column in the CSV.
    TARGET_COLUMN = 'insuranceclaim'
    # Where train_model() writes the fitted pipeline unless told otherwise.
    DEFAULT_MODEL_PATH = 'model/insurance_claim_prediction_model.joblib'

    def __init__(self, data_path):
        """Store the dataset location; nothing is read until training.

        Args:
            data_path: Location of the training CSV, passed unchanged to
                ``pandas.read_csv``.
        """
        self.data_path = data_path
        # Set to the fitted Pipeline by train_model().
        self.model = None

    def load_data(self):
        """Read the CSV and split it into features and target.

        Returns:
            A ``(X, y)`` tuple: ``X`` is the frame without the
            ``insuranceclaim`` column, ``y`` is that column as a Series.

        Raises:
            KeyError: If the CSV has no ``insuranceclaim`` column.
        """
        df = pd.read_csv(self.data_path)
        X = df.drop(columns=[self.TARGET_COLUMN])
        y = df[self.TARGET_COLUMN]
        return X, y

    def preprocess_data(self, X):
        """Build the (unfitted) column preprocessor used by the pipeline.

        Despite the name, no data is transformed here: the returned
        ``ColumnTransformer`` is fitted later as part of the pipeline.

        Args:
            X: Feature frame. Currently unused; the column names are fixed
                below. Kept so existing callers continue to work.

        Returns:
            A ``ColumnTransformer`` that standard-scales the numerical
            columns and one-hot encodes the categorical columns. Columns
            not listed are dropped (the ``ColumnTransformer`` default).
        """
        numerical_features = ['age', 'bmi', 'children', 'charges']
        numerical_transformer = StandardScaler()

        categorical_features = ['sex', 'smoker', 'region']
        # NOTE(review): with drop='first', a category unseen during fit is
        # encoded as all zeros, i.e. identically to the dropped first
        # category. Older scikit-learn releases reject this parameter
        # combination outright -- confirm the pinned version accepts it.
        categorical_transformer = OneHotEncoder(handle_unknown='ignore', drop='first')

        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numerical_transformer, numerical_features),
                ('cat', categorical_transformer, categorical_features)
            ])

        return preprocessor

    @staticmethod
    def _ensure_parent_dir(model_path):
        """Create the directory that will contain ``model_path`` if missing."""
        Path(model_path).parent.mkdir(parents=True, exist_ok=True)

    def train_model(self, model_path=DEFAULT_MODEL_PATH):
        """Fit the pipeline, evaluate it on a hold-out split and save it.

        Args:
            model_path: Destination file for the joblib dump. Missing parent
                directories are created. Defaults to
                ``model/insurance_claim_prediction_model.joblib``.

        Returns:
            Accuracy of the fitted pipeline on the 20% hold-out split.
        """
        X, y = self.load_data()
        preprocessor = self.preprocess_data(X)

        # Preprocessing lives inside the pipeline so the saved artifact can
        # be applied directly to raw feature frames.
        self.model = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', tree.DecisionTreeClassifier(random_state=42))
        ])

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

        self.model.fit(X_train, y_train)

        # Use the hold-out split that was previously created but ignored.
        accuracy = self.model.score(X_test, y_test)
        print(f"Hold-out accuracy: {accuracy:.4f}")

        # joblib.dump does not create directories; without this a fresh
        # checkout fails with FileNotFoundError after training has finished.
        self._ensure_parent_dir(model_path)
        joblib.dump(self.model, model_path)
        print("Model trained and saved successfully!")
        return accuracy

if __name__ == "__main__":
    # Script entry point: train on the bundled dataset and save the model.
    InsuranceClaimModelTrainer('dataset/insurance2.csv').train_model()