File size: 6,299 Bytes
f7ab812 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 |
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
mean_squared_error,
mean_absolute_error,
root_mean_squared_error,
)
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from constants import HES
import pandas as pd
import matplotlib.pyplot as plt
def choose_model(model: str = "rf"):
"""
Choose the model to use for training.
Parameters:
model (str): The model to use, it can be "rf" for Random Forest or "xgb" for XGBoost.
Returns:
model: The training model.
"""
if model == "rf":
rf = RandomForestRegressor()
return rf
elif model == "xgb":
xgb = XGBRegressor()
return xgb
elif model == "lgbm":
lgbm = LGBMRegressor()
return lgbm
else:
raise ValueError("Invalid model name")
def training_testing_data(
df: pd.DataFrame,
train_start: str,
train_end: str,
test_start: str,
test_end: str,
train_start_bis: str = None,
train_end_bis: str = None,
target: str = "cpih_medical",
columns: list = ["cpih", *HES],
) -> tuple:
"""
Split the data into training and testing sets.
Parameters:
df (pd.DataFrame): The DataFrame containing the data.
target (str): The target column.
train_start (str): The start date for the training set, in the format "YYYY-MM-DD".
train_end (str): The end date for the training set, in the format "YYYY-MM-DD".
test_start (str): The start date for the testing set, in the format "YYYY-MM-DD".
test_end (str): The end date for the testing set, in the format "YYYY-MM-DD".
train_start_bis (str): The start date for the additional training set, in the format "YYYY-MM-DD".
train_end_bis (str): The end date for the additional training set, in the format "YYYY-MM-DD".
columns (list): The columns to use as features.
Returns:
tuple: A tuple containing the training and testing sets."""
if train_start_bis and train_end_bis:
train = df[
((df["date"] >= train_start) & (df["date"] <= train_end))
| ((df["date"] >= train_start_bis) & (df["date"] <= train_end_bis))
]
else:
train = df[(df["date"] >= train_start) & (df["date"] <= train_end)]
test = df[(df["date"] >= test_start) & (df["date"] <= test_end)]
X_train = train[columns]
y_train = train[target]
X_test = test[columns]
y_test = test[target]
return X_train, y_train, X_test, y_test
def train_model(model, X_train, y_train, X_test, y_test) -> tuple:
"""
Train the model on the training data and evaluate it on the testing data.
Parameters:
model: The model to train.
X_train: The features for the training set.
y_train: The target for the training set.
X_test: The features for the testing set.
y_test: The target for the testing set.
Returns:
float: The R^2 score of the model.
float: The mean absolute error of the model.
float: The mean squared error of the model.
float: The root mean squared error of the model."""
model.fit(X_train, y_train)
# display feature importance
feature_importances = model.feature_importances_
importance_df = pd.DataFrame(
{"Feature": X_train.columns, "Importance": feature_importances}
)
importance_df = importance_df.sort_values(by="Importance", ascending=False)
print(f"Feature Importance: \n{importance_df}")
y_pred = model.predict(X_test)
r2 = model.score(X_test, y_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
return r2, mae, mse, rmse
def get_best_params(model, X_train, y_train, X_test, y_test, param_grid):
"""
Find the best hyperparameters for the model using grid search.
Parameters:
model: The model to train.
X_train: The features for the training set.
y_train: The target for the training set.
X_test: The features for the testing set.
y_test: The target for the testing set.
param_grid: The hyperparameters to search over.
Returns:
dict: The best hyperparameters for the model."""
grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
return best_params
def plot(
df: pd.DataFrame,
model,
test_start: str,
test_end: str,
target: str,
features: list,
):
"""
Plot the predicted and actual values of the target variable.
Parameters:
df (pd.DataFrame): The DataFrame containing the data.
model: The model to use.
test_start (str): The start date for the testing set, in the format "YYYY-MM-DD".
test_end (str): The end date for the testing set, in the format "YYYY-MM-DD".
target (str): The target column.
features (list): The features to use.
"""
dates_all = df["date"]
actual_all = df[target]
dates_new = df[(df["date"] >= test_start) & (df["date"] <= test_end)]["date"]
actual_new = df[(df["date"] >= test_start) & (df["date"] <= test_end)][target]
predicted_new = model.predict(
df[(df["date"] >= test_start) & (df["date"] <= test_end)][features]
)
plt.figure(figsize=(12, 6))
plt.scatter(
dates_new,
predicted_new,
color="blue",
alpha=0.7,
label="Predicted CPIH (2022/2023)",
zorder=2,
)
plt.scatter(
dates_new,
actual_new,
color="orange",
alpha=0.7,
label="Actual CPIH (2022/2023)",
zorder=2,
)
plt.plot(
dates_new, predicted_new, color="blue", linestyle="--", alpha=0.7, zorder=1
)
plt.plot(
dates_all,
actual_all,
color="green",
alpha=0.8,
label="Actual CPIH (All Years)",
zorder=0,
)
plt.title(f"CPIH Medical: Test on {test_start} to {test_end}", fontsize=14)
plt.xlabel("Date", fontsize=12)
plt.ylabel("CPIH Medical", fontsize=12)
plt.xticks(rotation=45)
plt.legend(fontsize=10)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.savefig(f"quanti/data/CPIH Medical Test {test_start} to {test_end}.png")
plt.show()
|