# axa/quanti/models.py
# Author: Mayara Ayat
# Uploaded via huggingface_hub (commit f7ab812, verified)
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
mean_squared_error,
mean_absolute_error,
root_mean_squared_error,
)
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from constants import HES
import pandas as pd
import matplotlib.pyplot as plt
def choose_model(model: str = "rf"):
    """
    Choose the model to use for training.

    Parameters:
        model (str): The model to use: "rf" for Random Forest, "xgb" for
            XGBoost, or "lgbm" for LightGBM.

    Returns:
        model: An unfitted regressor instance of the requested type.

    Raises:
        ValueError: If `model` is not one of the supported names.
    """
    if model == "rf":
        return RandomForestRegressor()
    elif model == "xgb":
        return XGBRegressor()
    elif model == "lgbm":
        return LGBMRegressor()
    # Include the offending value so the caller can see what was passed.
    raise ValueError(f"Invalid model name: {model!r}")
def training_testing_data(
    df: pd.DataFrame,
    train_start: str,
    train_end: str,
    test_start: str,
    test_end: str,
    train_start_bis: str = None,
    train_end_bis: str = None,
    target: str = "cpih_medical",
    columns: list = None,
) -> tuple:
    """
    Split the data into training and testing sets by date range (inclusive).

    Parameters:
        df (pd.DataFrame): The DataFrame containing the data; must have a
            "date" column comparable to the bound arguments.
        train_start (str): The start date for the training set, in the format "YYYY-MM-DD".
        train_end (str): The end date for the training set, in the format "YYYY-MM-DD".
        test_start (str): The start date for the testing set, in the format "YYYY-MM-DD".
        test_end (str): The end date for the testing set, in the format "YYYY-MM-DD".
        train_start_bis (str): Optional start date for an additional training range.
        train_end_bis (str): Optional end date for an additional training range.
        target (str): The target column.
        columns (list): The columns to use as features; defaults to
            ["cpih", *HES] when not provided.

    Returns:
        tuple: (X_train, y_train, X_test, y_test).
    """
    # Avoid a mutable default argument: build the default feature list
    # fresh on each call instead of sharing one list across calls.
    if columns is None:
        columns = ["cpih", *HES]
    train_mask = (df["date"] >= train_start) & (df["date"] <= train_end)
    if train_start_bis and train_end_bis:
        # Optionally extend the training window with a second date range.
        train_mask |= (df["date"] >= train_start_bis) & (df["date"] <= train_end_bis)
    train = df[train_mask]
    test = df[(df["date"] >= test_start) & (df["date"] <= test_end)]
    return train[columns], train[target], test[columns], test[target]
def train_model(model, X_train, y_train, X_test, y_test) -> tuple:
    """
    Fit the model on the training split, print its feature importances,
    and report regression metrics on the testing split.

    Parameters:
        model: The model to train (must expose `feature_importances_`).
        X_train: The features for the training set.
        y_train: The target for the training set.
        X_test: The features for the testing set.
        y_test: The target for the testing set.

    Returns:
        tuple: (r2, mae, mse, rmse) scores on the test set.
    """
    model.fit(X_train, y_train)
    # Show which features the fitted model relied on most, best first.
    ranking = pd.DataFrame(
        {"Feature": X_train.columns, "Importance": model.feature_importances_}
    ).sort_values(by="Importance", ascending=False)
    print(f"Feature Importance: \n{ranking}")
    predictions = model.predict(X_test)
    return (
        model.score(X_test, y_test),
        mean_absolute_error(y_test, predictions),
        mean_squared_error(y_test, predictions),
        root_mean_squared_error(y_test, predictions),
    )
def get_best_params(model, X_train, y_train, X_test, y_test, param_grid):
    """
    Find the best hyperparameters for the model using 5-fold grid search.

    Parameters:
        model: The estimator to tune.
        X_train: The features for the training set.
        y_train: The target for the training set.
        X_test: Unused; kept for signature compatibility with callers.
        y_test: Unused; kept for signature compatibility with callers.
        param_grid: The hyperparameters to search over.

    Returns:
        dict: The best hyperparameter combination found.
    """
    # n_jobs=-1 runs the candidate fits on all available cores.
    search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1)
    search.fit(X_train, y_train)
    return search.best_params_
def plot(
    df: pd.DataFrame,
    model,
    test_start: str,
    test_end: str,
    target: str,
    features: list,
):
    """
    Plot the predicted and actual values of the target variable over the
    test window, on top of the full actual series, then save and show the
    figure (saved under quanti/data/).

    Parameters:
        df (pd.DataFrame): The DataFrame containing the data; must have a
            "date" column.
        model: The fitted model to use for prediction.
        test_start (str): The start date for the testing set, in the format "YYYY-MM-DD".
        test_end (str): The end date for the testing set, in the format "YYYY-MM-DD".
        target (str): The target column.
        features (list): The features to use.
    """
    # Filter the test window once instead of re-filtering per column.
    test_df = df[(df["date"] >= test_start) & (df["date"] <= test_end)]
    dates_new = test_df["date"]
    actual_new = test_df[target]
    predicted_new = model.predict(test_df[features])

    # Legend labels reflect the actual test window instead of a
    # hard-coded "2022/2023" range.
    window = f"{test_start} to {test_end}"
    plt.figure(figsize=(12, 6))
    plt.scatter(
        dates_new,
        predicted_new,
        color="blue",
        alpha=0.7,
        label=f"Predicted CPIH ({window})",
        zorder=2,
    )
    plt.scatter(
        dates_new,
        actual_new,
        color="orange",
        alpha=0.7,
        label=f"Actual CPIH ({window})",
        zorder=2,
    )
    plt.plot(
        dates_new, predicted_new, color="blue", linestyle="--", alpha=0.7, zorder=1
    )
    plt.plot(
        df["date"],
        df[target],
        color="green",
        alpha=0.8,
        label="Actual CPIH (All Years)",
        zorder=0,
    )
    plt.title(f"CPIH Medical: Test on {window}", fontsize=14)
    plt.xlabel("Date", fontsize=12)
    plt.ylabel("CPIH Medical", fontsize=12)
    plt.xticks(rotation=45)
    plt.legend(fontsize=10)
    plt.grid(alpha=0.3)
    plt.tight_layout()
    plt.savefig(f"quanti/data/CPIH Medical Test {window}.png")
    plt.show()