import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import joblib
import os
warnings.filterwarnings("ignore")
# Load and preprocess data
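# Note: `_c39` is an empty artifact column in this dataset, and '?' serves as
# its missing-value marker, hence the drop on load and the NaN replacement below.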
data = pd.read_csv("dataset/insurance_claims.csv").drop(columns="_c39")
data.replace('?', np.nan, inplace=True)
# Function to check data
def check_data(data):
    return pd.DataFrame({
        'type': data.dtypes,
        'amount_unique': data.nunique(),
        'unique_values': [data[x].unique() for x in data.columns],
        'null_values': data.isna().sum(),
        'percentage_null_values(%)': round((data.isnull().sum() / data.shape[0]) * 100, 2)
    })
print(check_data(data).sort_values("null_values", ascending=False))
# Fill missing values with mode
for column in data.columns:
    mode_value = data[column].mode().iloc[0]
    data[column] = data[column].fillna(mode_value)
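# Only the '?'-derived NaNs are actually filled here; columns with no missing
# values pass through this loop unchanged.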
# Encode categorical variables
le = LabelEncoder()
for col in data.columns:
    if data[col].dtype == 'O':
        data[col] = le.fit_transform(data[col])
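# Caveat: reusing a single LabelEncoder means `le` keeps only the last
# column's mapping once the loop finishes. If predictions ever need to be
# decoded, one option (a sketch with illustrative names, not part of the
# original script) is to fit one encoder per column before transforming:
#     encoders = {c: LabelEncoder().fit(raw_data[c]) for c in object_cols}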
# Drop less important columns
to_drop = ['policy_number', 'policy_bind_date', 'insured_zip', 'incident_location',
           'auto_year', 'auto_make', 'auto_model']
data.drop(columns=to_drop, inplace=True)
# Correlation heatmap
plt.figure(figsize=(23, 23))
corr_matrix = data.corr()
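# The correlation matrix is symmetric, so mask the redundant upper triangle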
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
sns.heatmap(round(corr_matrix, 2), mask=mask, vmin=-1, vmax=1, annot=True, cmap='magma')
plt.title('Triangle Correlation Heatmap', fontsize=18, pad=16)
plt.show()
# Drop redundant or weakly correlated features identified from the heatmap
to_drop = ['injury_claim', 'property_claim', 'vehicle_claim', 'incident_type', 'age',
           'incident_hour_of_the_day', 'insured_occupation']
data.drop(columns=to_drop, inplace=True)
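# Note: injury_claim, property_claim and vehicle_claim are components of
# total_claim_amount in this dataset, so they are nearly redundant with it;
# the other columns here appear to have been dropped for weak correlation.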
# Feature importance: rank features with a random forest (seeded so the
# selected feature subset is reproducible)
X = data.drop(columns='fraud_reported')
y = data['fraud_reported']
model = RandomForestClassifier(n_estimators=1000, random_state=42)
model.fit(X, y)
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
final_feat = feat_importances.nlargest(10).index.tolist()
final_feat.append('fraud_reported')
data_new = data[final_feat]
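# data_new now holds the 10 highest-ranked features plus the target column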
# Prepare data for modeling
df_model = data_new.copy()
X = df_model.drop(columns='fraud_reported')
y = df_model['fraud_reported']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=43)
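# Note: fraud_reported is imbalanced (hence class_weight='balanced' below);
# passing stratify=y here would keep the class ratio consistent across splits.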
# Train the final model
final_model = RandomForestClassifier(
    criterion='gini',
    max_depth=5,               # shallow trees to limit overfitting
    min_samples_leaf=4,
    min_samples_split=10,
    n_estimators=100,
    random_state=42,
    class_weight='balanced'    # compensate for the minority fraud class
)
final_model.fit(X_train, y_train)
# Evaluate the model
y_pred = final_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy}")
# Save the model (create the output directory if it does not exist yet)
os.makedirs('model', exist_ok=True)
joblib.dump(final_model, 'model/only_model.joblib')
print("Model saved successfully.")