File size: 5,305 Bytes
f7ab812 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 |
from models import *
from data_preprocessing import *
def plot_rmse_explanation(
    dates: pd.Series,
    actual: pd.Series,
    predicted: pd.Series,
    rmse: float,
    title: str = "Understanding RMSE: Actual vs Predicted",
):
    """
    Chart actual against predicted values, mark every residual, and display the RMSE.

    The figure is written as a PNG under ``quanti/data/`` (the file name is built
    from ``title`` and the first and last entries of ``dates``) and then shown.
    The y-axis label is fixed to "CPIH Medical".

    Parameters:
        dates (pd.Series): Dates corresponding to the observations (x-axis).
            Accessed positionally via ``.iloc`` for the output file name.
        actual (pd.Series): Actual target values.
        predicted (pd.Series): Values predicted by the model, in the same order
            as ``dates`` and ``actual``.
        rmse (float): The root mean squared error, printed with two decimals.
        title (str): The plot title; also used in the output file name.
    """
    plt.figure(figsize=(14, 8))

    # The two curves being compared
    plt.plot(dates, actual, label="Actual Values", color="orange", linewidth=2)
    predicted_style = dict(
        label="Predicted Values",
        color="blue",
        linestyle="--",
        linewidth=2,
    )
    plt.plot(dates, predicted, **predicted_style)

    # One translucent red vertical segment per observation, joining the actual
    # point to the predicted point: its length is that observation's residual
    residual_style = dict(color="red", alpha=0.5)
    for when, observed, estimated in zip(dates, actual, predicted):
        plt.plot([when, when], [observed, estimated], **residual_style)

    # RMSE label, positioned in axes-fraction coordinates (top-left corner)
    axes = plt.gca()
    label_box = dict(facecolor="white", alpha=0.7, edgecolor="red")
    plt.text(
        0.05,
        0.95,
        f"RMSE: {rmse:.2f}",
        transform=axes.transAxes,
        fontsize=14,
        color="red",
        bbox=label_box,
    )

    # Titles, axis labels and general cosmetics
    plt.title(title, fontsize=16)
    plt.xlabel("Date", fontsize=14)
    plt.ylabel("CPIH Medical", fontsize=14)
    plt.xticks(rotation=45)
    plt.legend(fontsize=12)
    plt.grid(alpha=0.3)
    plt.tight_layout()

    # Save first, then display
    output_path = f"quanti/data/{title},{dates.iloc[0]},{dates.iloc[-1]}.png"
    plt.savefig(output_path)
    plt.show()
def main(
    df: pd.DataFrame,
    model,
    train_start: str,
    train_end: str,
    test_start: str,
    test_end: str,
    target: str,
    features: list,
    param_grid: dict,
    train_start_bis: str = None,
    train_end_bis: str = None,
):
    """
    Tune, train and evaluate a model on date-delimited train/test windows.

    Steps: split the data, look up the best hyperparameters, apply them to the
    model, train and score it, print the metrics, then draw the prediction
    plot and the RMSE explanation plot for the test window.

    Parameters:
        df (pd.DataFrame): The DataFrame containing the data. Must have a
            "date" column, which is used to select the test rows for plotting.
        model: The estimator object to tune and train (not a string). It must
            provide ``get_params``, ``set_params`` and ``predict``.
        train_start (str): The start date for the training set, in the format "YYYY-MM-DD".
        train_end (str): The end date for the training set, in the format "YYYY-MM-DD".
        test_start (str): The start date for the testing set, in the format "YYYY-MM-DD".
        test_end (str): The end date for the testing set, in the format "YYYY-MM-DD".
        target (str): The target column.
        features (list): The features to use.
        param_grid (dict): The hyperparameter grid.
        train_start_bis (str): The start date for the second training set, in the format "YYYY-MM-DD".
        train_end_bis (str): The end date for the second training set, in the format "YYYY-MM-DD".

    Returns:
        tuple: A tuple containing the R^2, MAE, MSE, and RMSE values.
    """
    X_train, y_train, X_test, y_test = training_testing_data(
        df,
        train_start,
        train_end,
        test_start,
        test_end,
        train_start_bis,
        train_end_bis,
        target,
        features,
    )

    # NOTE(review): the test split is handed to the hyperparameter search; if
    # get_best_params scores candidates on it, the metrics reported below are
    # optimistic — confirm against its implementation.
    params = get_best_params(model, X_train, y_train, X_test, y_test, param_grid)
    print(X_train.columns)
    model = model.set_params(**params)

    # Fix the seed for reproducibility, but only on estimators that expose one:
    # scikit-learn's set_params raises ValueError for a parameter name the
    # estimator does not have, which would abort the run for seedless models.
    if "random_state" in model.get_params():
        model = model.set_params(random_state=42)

    r2, mae, mse, rmse = train_model(model, X_train, y_train, X_test, y_test)
    print(f"R^2: {r2}")
    print(f"MAE: {mae}")
    print(f"MSE: {mse}")
    print(f"RMSE: {rmse}")

    plot(df, model, test_start, test_end, target, features)

    # Select the test-window rows once (both bounds inclusive) and reuse them.
    # NOTE(review): assumes these rows line up one-to-one with X_test as built
    # by training_testing_data — confirm it uses the same inclusive bounds.
    in_test_window = (df["date"] >= test_start) & (df["date"] <= test_end)
    test_rows = df[in_test_window]
    dates_test = test_rows["date"]
    actual_test = test_rows[target]
    predicted_test = model.predict(X_test)
    plot_rmse_explanation(dates_test, actual_test, predicted_test, rmse)

    return r2, mae, mse, rmse
if __name__ == "__main__":
    # Script entry point: load the source files, build the modelling frame,
    # then tune, train and evaluate the "rf" model on fixed date windows.
    cpih_df = read_cpih("quanti/data/cpih.csv", medical=False)
    cpim_df = read_cpih("quanti/data/cpih_medical.csv", medical=True)
    hes = read_hes("quanti/data/HES_M5_OPEN_DATA.csv")
    df = get_global_df(cpih_df, cpim_df, hes)
    df = get_final_df(df)
    print(df.columns)

    model = choose_model("rf")

    # NOTE(review): the test window (2007-2014) lies before the training window
    # (2014-2025), and "2014-01-01" is a boundary of both; main() treats the
    # test bounds as inclusive. Confirm this ordering and the shared date are
    # intentional.
    train_start = "2014-01-01"
    train_end = "2025-01-01"
    # train_start_bis = "2019-01-01"
    # train_end_bis = "2024-12-01"
    test_start = "2007-01-01"
    test_end = "2014-01-01"

    target = "target"
    # Every column except the date and the target is used as a feature
    features = df.columns.drop(["date", "target"]).tolist()

    # Assumes choose_model("rf") returns a scikit-learn RandomForestRegressor
    # — TODO confirm. "squared_error" is the current name of the criterion
    # formerly spelled "mse", which was deprecated in scikit-learn 1.0 and
    # removed in 1.2. "poisson" already requires 1.0+ for random forests, so
    # no supported version is lost by the rename.
    param_grid = {
        "n_estimators": [50, 100, 200],
        "max_depth": [None, 5, 10, 20],
        "criterion": ["squared_error", "poisson"],
    }
    # Alternative grids kept for other model choices; they include
    # learning_rate, which is not part of the grid used above.
    # param_grid = {
    #     "n_estimators": [50, 100, 200],  # Number of trees
    #     "max_depth": [3, 5, 7, 10],  # Maximum depth of a tree
    #     "learning_rate": [0.01, 0.1, 0.2],  # Learning rate
    # }
    # param_grid = {
    #     "n_estimators": [100, 200, 500],
    #     "learning_rate": [0.01, 0.1, 0.2],
    #     "max_depth": [5, 10, 20],
    # }

    r2, mae, mse, rmse = main(
        df,
        model,
        train_start,
        train_end,
        test_start,
        test_end,
        target,
        features,
        param_grid,
        # train_start_bis,
        # train_end_bis,
    )
|