import os
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings("ignore")

# Load and preprocess data
data = pd.read_csv("dataset/insurance_claims.csv").drop(columns="_c39")
data.replace('?', np.nan, inplace=True)

# Function to check data
def check_data(data):
    return pd.DataFrame({
        'type': data.dtypes,
        'amount_unique': data.nunique(),
        'unique_values': [data[x].unique() for x in data.columns],
        'null_values': data.isna().sum(),
        'percentage_null_values(%)': round((data.isnull().sum() / data.shape[0]) * 100, 2)
    })

print(check_data(data).sort_values("null_values", ascending=False))

# Fill missing values with each column's mode
for column in data.columns:
    data[column] = data[column].fillna(data[column].mode().iloc[0])

# Encode categorical variables, keeping one fitted encoder per column so the
# original labels can be recovered later if needed
label_encoders = {}
for col in data.columns:
    if data[col].dtype == 'O':
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col])
        label_encoders[col] = le
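# The stored encoders allow mapping encoded values back to the original
# category labels later, for example label_encoders['fraud_reported'].classes_
# lists the original labels of the target column.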

# Drop less important columns
to_drop = ['policy_number', 'policy_bind_date', 'insured_zip', 'incident_location',
           'auto_year', 'auto_make', 'auto_model']
data.drop(columns=to_drop, inplace=True)

# Correlation heatmap
plt.figure(figsize=(23, 23))
corr_matrix = data.corr()
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
sns.heatmap(round(corr_matrix, 2), mask=mask, vmin=-1, vmax=1, annot=True, cmap='magma')
plt.title('Triangle Correlation Heatmap', fontsize=18, pad=16)
plt.show()

# Drop less correlated features
to_drop = ['injury_claim', 'property_claim', 'vehicle_claim', 'incident_type', 'age',
           'incident_hour_of_the_day', 'insured_occupation']
data.drop(columns=to_drop, inplace=True)

# Feature importance
X = data.drop(columns='fraud_reported')
Y = data['fraud_reported']
model = RandomForestClassifier(n_estimators=1000)
model.fit(X, Y)
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
final_feat = feat_importances.nlargest(10).index.tolist()
final_feat.append('fraud_reported')
data_new = data[final_feat]
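# Show which ten features had the largest importances and were retained above
print(feat_importances.nlargest(10))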

# Prepare data for modeling
df_model = data_new.copy()
X = df_model.drop(columns='fraud_reported')
y = df_model['fraud_reported']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=43)

# Train the final model
final_model = RandomForestClassifier(
    criterion='gini',
    max_depth=5,
    min_samples_leaf=4,
    min_samples_split=10,
    n_estimators=100,
    random_state=42,
    class_weight='balanced'
)
final_model.fit(X_train, y_train)

# Evaluate the model
y_pred = final_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy}")

# Save the model (create the output directory if it does not already exist)
os.makedirs('model', exist_ok=True)
joblib.dump(final_model, 'model/only_model.joblib')
print("Model saved successfully.")