from typing import Optional

from models import *
from data_preprocessing import *

# NOTE: `pd` and `plt` are not imported explicitly in this file; they are
# expected to come into scope through the star imports above.


def plot_rmse_explanation(
    dates: pd.Series,
    actual: pd.Series,
    predicted: pd.Series,
    rmse: float,
    title: str = "Understanding RMSE: Actual vs Predicted",
) -> None:
    """
    Plot the actual vs. predicted values with error visualization and RMSE explanation.

    The figure is saved as a PNG under "quanti/data/" (the file name is built
    from the title and the first and last date) and then displayed.

    Parameters:
        dates (pd.Series): Dates corresponding to the observations. Must be
            non-empty, since its first and last elements are used in the
            output file name.
        actual (pd.Series): Actual target values.
        predicted (pd.Series): Predicted values by the model (any array-like
            aligned with `dates`).
        rmse (float): The root mean squared error value.
        title (str): The title of the plot.
    """
    plt.figure(figsize=(14, 8))

    # Plot actual vs. predicted values
    plt.plot(dates, actual, label="Actual Values", color="orange", linewidth=2)
    plt.plot(
        dates,
        predicted,
        label="Predicted Values",
        color="blue",
        linestyle="--",
        linewidth=2,
    )

    # Highlight errors (residuals): one vertical segment per observation,
    # drawn in a single call instead of one plot call per point.
    plt.vlines(dates, actual, predicted, color="red", alpha=0.5)

    # Annotate RMSE value (axes coordinates: top-left corner of the plot)
    plt.text(
        0.05,
        0.95,
        f"RMSE: {rmse:.2f}",
        transform=plt.gca().transAxes,
        fontsize=14,
        color="red",
        bbox=dict(facecolor="white", alpha=0.7, edgecolor="red"),
    )

    # Add plot details
    plt.title(title, fontsize=16)
    plt.xlabel("Date", fontsize=14)
    plt.ylabel("CPIH Medical", fontsize=14)
    plt.xticks(rotation=45)
    plt.legend(fontsize=12)
    plt.grid(alpha=0.3)
    plt.tight_layout()

    # NOTE(review): the default title contains ":", which is not a valid
    # file-name character on Windows - confirm the target platforms.
    plt.savefig(f"quanti/data/{title},{dates.iloc[0]},{dates.iloc[-1]}.png")
    plt.show()


def main(
    df: pd.DataFrame,
    model,
    train_start: str,
    train_end: str,
    test_start: str,
    test_end: str,
    target: str,
    features: list,
    param_grid: dict,
    train_start_bis: Optional[str] = None,
    train_end_bis: Optional[str] = None,
) -> tuple:
    """
    Train and evaluate a model on the given data.

    The steps are: split the data, search the hyperparameter grid, fit and
    score the model, print the metrics, then draw the prediction plot and
    the RMSE explanation plot for the test window.

    Parameters:
        df (pd.DataFrame): The DataFrame containing the data. It must have a
            "date" column, which is used to select the test window.
        model: The estimator to use. It must expose `set_params` and
            `predict`, and accept a `random_state` parameter.
        train_start (str): The start date for the training set, in the format "YYYY-MM-DD".
        train_end (str): The end date for the training set, in the format "YYYY-MM-DD".
        test_start (str): The start date for the testing set, in the format "YYYY-MM-DD".
        test_end (str): The end date for the testing set, in the format "YYYY-MM-DD".
        target (str): The target column.
        features (list): The features to use.
        param_grid (dict): The hyperparameter grid.
        train_start_bis (str): The start date for the second training set, in the format "YYYY-MM-DD".
        train_end_bis (str): The end date for the second training set, in the format "YYYY-MM-DD".

    Returns:
        tuple: A tuple containing the R^2, MAE, MSE, and RMSE values.
    """
    X_train, y_train, X_test, y_test = training_testing_data(
        df,
        train_start,
        train_end,
        test_start,
        test_end,
        train_start_bis,
        train_end_bis,
        target,
        features,
    )

    params = get_best_params(model, X_train, y_train, X_test, y_test, param_grid)
    print(X_train.columns)

    model = model.set_params(**params)
    # Add random state for reproducibility
    model = model.set_params(random_state=42)

    r2, mae, mse, rmse = train_model(model, X_train, y_train, X_test, y_test)
    print(f"R^2: {r2}")
    print(f"MAE: {mae}")
    print(f"MSE: {mse}")
    print(f"RMSE: {rmse}")

    plot(df, model, test_start, test_end, target, features)

    # Build the test-window mask once (both bounds inclusive) and reuse it
    # for the dates and the actual target values.
    in_test_window = (df["date"] >= test_start) & (df["date"] <= test_end)
    dates_test = df.loc[in_test_window, "date"]
    actual_test = df.loc[in_test_window, target]
    predicted_test = model.predict(X_test)

    plot_rmse_explanation(dates_test, actual_test, predicted_test, rmse)

    return r2, mae, mse, rmse


if __name__ == "__main__":
    cpih_df = read_cpih("quanti/data/cpih.csv", medical=False)
    cpim_df = read_cpih("quanti/data/cpih_medical.csv", medical=True)
    hes = read_hes("quanti/data/HES_M5_OPEN_DATA.csv")

    df = get_global_df(cpih_df, cpim_df, hes)
    df = get_final_df(df)
    print(df.columns)

    model = choose_model("rf")

    # NOTE(review): the test window precedes the training window, and both
    # include 2014-01-01 if the split bounds are inclusive - confirm this
    # is intended.
    train_start = "2014-01-01"
    train_end = "2025-01-01"
    # train_start_bis = "2019-01-01"
    # train_end_bis = "2024-12-01"
    test_start = "2007-01-01"
    test_end = "2014-01-01"

    target = "target"
    features = df.columns.drop(["date", "target"]).tolist()

    # "squared_error" replaces the "mse" alias, which was removed from
    # scikit-learn in version 1.2.
    param_grid = {
        "n_estimators": [50, 100, 200],
        "max_depth": [None, 5, 10, 20],
        "criterion": ["squared_error", "poisson"],
    }

    # Alternative grids kept for reference (they include a learning rate,
    # so they presumably target other model types):
    # param_grid = {
    #     "n_estimators": [50, 100, 200],  # Number of trees
    #     "max_depth": [3, 5, 7, 10],  # Maximum depth of a tree
    #     "learning_rate": [0.01, 0.1, 0.2],  # Learning rate
    # }
    # param_grid = {
    #     "n_estimators": [100, 200, 500],
    #     "learning_rate": [0.01, 0.1, 0.2],
    #     "max_depth": [5, 10, 20],
    # }

    r2, mae, mse, rmse = main(
        df,
        model,
        train_start,
        train_end,
        test_start,
        test_end,
        target,
        features,
        param_grid,
        # train_start_bis,
        # train_end_bis,
    )