Spaces:

macota1
/

axa

Runtime error

axa

File size: 5,305 Bytes

f7ab812

from models import *
from data_preprocessing import *


def plot_rmse_explanation(
    dates: pd.Series,
    actual: pd.Series,
    predicted: pd.Series,
    rmse: float,
    title: str = "Understanding RMSE: Actual vs Predicted",
):
    """
    Plot the actual vs. predicted values with error visualization and RMSE explanation.

    The figure is saved as a PNG under ``quanti/data/`` (the file name is built from
    the title and the first and last date) and is then displayed.

    Parameters:
    dates (pd.Series): Dates corresponding to the observations. Must be non-empty,
        because its first and last entries are used in the output file name.
    actual (pd.Series): Actual target values.
    predicted (pd.Series): Predicted values by the model. Any sequence that can be
        iterated in step with ``dates`` and ``actual`` works.
    rmse (float): The root mean squared error value.
    title (str): The title of the plot; it is also part of the output file name.

    Raises:
    IndexError: If ``dates`` is empty.
    """
    # Local import so this function does not depend on what the star imports expose.
    import os

    plt.figure(figsize=(14, 8))

    # Plot actual vs. predicted values
    plt.plot(dates, actual, label="Actual Values", color="orange", linewidth=2)
    plt.plot(
        dates,
        predicted,
        label="Predicted Values",
        color="blue",
        linestyle="--",
        linewidth=2,
    )

    # Highlight errors (residuals): one vertical segment per observation, joining
    # the actual value to the predicted value at the same date.
    for date, act, pred in zip(dates, actual, predicted):
        plt.plot([date, date], [act, pred], color="red", alpha=0.5)

    # Annotate RMSE value; transAxes places the box in axes-relative coordinates
    # (top-left corner) regardless of the data range.
    plt.text(
        0.05,
        0.95,
        f"RMSE: {rmse:.2f}",
        transform=plt.gca().transAxes,
        fontsize=14,
        color="red",
        bbox=dict(facecolor="white", alpha=0.7, edgecolor="red"),
    )

    # Add plot details
    plt.title(title, fontsize=16)
    plt.xlabel("Date", fontsize=14)
    plt.ylabel("CPIH Medical", fontsize=14)
    plt.xticks(rotation=45)
    plt.legend(fontsize=12)
    plt.grid(alpha=0.3)
    plt.tight_layout()

    output_path = f"quanti/data/{title},{dates.iloc[0]},{dates.iloc[-1]}.png"
    # savefig does not create missing folders, so make sure the target folder
    # exists instead of failing with FileNotFoundError after all the plotting work.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    plt.savefig(output_path)
    plt.show()


def main(
    df: pd.DataFrame,
    model,
    train_start: str,
    train_end: str,
    test_start: str,
    test_end: str,
    target: str,
    features: list,
    param_grid: dict,
    train_start_bis: str = None,
    train_end_bis: str = None,
):
    """
    Tune, train and evaluate a model on the given data, then plot the results.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the data; it must have a "date" column.
    model: The estimator to use. It must provide ``set_params`` and ``predict``
        (it is not a string, despite being selected by name elsewhere).
    train_start (str): The start date for the training set, in the format "YYYY-MM-DD".
    train_end (str): The end date for the training set, in the format "YYYY-MM-DD".
    test_start (str): The start date for the testing set, in the format "YYYY-MM-DD".
    test_end (str): The end date for the testing set, in the format "YYYY-MM-DD".
    target (str): The target column.
    features (list): The features to use.
    param_grid (dict): The hyperparameter grid.
    train_start_bis (str): The start date for the second training set, in the format
        "YYYY-MM-DD". Optional; defaults to None.
    train_end_bis (str): The end date for the second training set, in the format
        "YYYY-MM-DD". Optional; defaults to None.

    Returns:
    tuple: A tuple containing the R^2, MAE, MSE, and RMSE values.
    """
    X_train, y_train, X_test, y_test = training_testing_data(
        df,
        train_start,
        train_end,
        test_start,
        test_end,
        train_start_bis,
        train_end_bis,
        target,
        features,
    )
    # NOTE(review): the test split is passed to the hyperparameter search. If
    # get_best_params scores candidates on it, the metrics reported below are
    # optimistic -- confirm against its implementation.
    params = get_best_params(model, X_train, y_train, X_test, y_test, param_grid)
    print(X_train.columns)
    model = model.set_params(**params)
    # Add random state for reproducibility. scikit-learn style estimators raise
    # ValueError for a parameter they do not have; in that case carry on unseeded
    # rather than aborting the whole run.
    try:
        model = model.set_params(random_state=42)
    except ValueError:
        print("Model has no random_state parameter; continuing without a fixed seed.")
    r2, mae, mse, rmse = train_model(model, X_train, y_train, X_test, y_test)
    print(f"R^2: {r2}")
    print(f"MAE: {mae}")
    print(f"MSE: {mse}")
    print(f"RMSE: {rmse}")
    plot(df, model, test_start, test_end, target, features)
    # Build the test-window mask once and reuse it for both the dates and the target.
    in_test_window = (df["date"] >= test_start) & (df["date"] <= test_end)
    dates_test = df.loc[in_test_window, "date"]
    actual_test = df.loc[in_test_window, target]
    # NOTE(review): assumes X_test covers exactly the rows selected by
    # in_test_window, in the same order -- confirm against training_testing_data.
    predicted_test = model.predict(X_test)
    plot_rmse_explanation(dates_test, actual_test, predicted_test, rmse)
    return r2, mae, mse, rmse


# Script entry point: load the CPIH / HES data, build the modelling frame, then
# tune, train and evaluate a model and plot the results.
if __name__ == "__main__":
    cpih_df = read_cpih("quanti/data/cpih.csv", medical=False)
    cpim_df = read_cpih("quanti/data/cpih_medical.csv", medical=True)
    hes = read_hes("quanti/data/HES_M5_OPEN_DATA.csv")
    df = get_global_df(cpih_df, cpim_df, hes)
    df = get_final_df(df)
    print(df.columns)
    model = choose_model("rf")
    # NOTE(review): the test window precedes the training window and both include
    # 2014-01-01. If training_testing_data treats its bounds as inclusive, that
    # date lands in both splits -- confirm.
    train_start = "2014-01-01"
    train_end = "2025-01-01"
    # train_start_bis = "2019-01-01"
    # train_end_bis = "2024-12-01"
    test_start = "2007-01-01"
    test_end = "2014-01-01"
    target = "target"
    # Every column except the date and the target is used as a feature.
    features = df.columns.drop(["date", "target"]).tolist()
    # Grid for the "rf" model (presumably scikit-learn's RandomForestRegressor --
    # confirm in choose_model). The criterion is spelled "squared_error" because
    # scikit-learn deprecated "mse" in 1.0 and removed it in 1.2.
    param_grid = {
        "n_estimators": [50, 100, 200],
        "max_depth": [None, 5, 10, 20],
        "criterion": ["squared_error", "poisson"],
    }
    # Alternative grids (they include learning_rate, so they do not apply to the
    # grid above), kept for experiments with other models:
    # param_grid = {
    #     "n_estimators": [50, 100, 200],  # Number of trees
    #     "max_depth": [3, 5, 7, 10],  # Maximum depth of a tree
    #     "learning_rate": [0.01, 0.1, 0.2],  # Learning rate
    # }
    # param_grid = {
    #     "n_estimators": [100, 200, 500],
    #     "learning_rate": [0.01, 0.1, 0.2],
    #     "max_depth": [5, 10, 20],
    # }
    r2, mae, mse, rmse = main(
        df,
        model,
        train_start,
        train_end,
        test_start,
        test_end,
        target,
        features,
        param_grid,
        # train_start_bis,
        # train_end_bis,
    )