"""Implements the VAEP framework.

Attributes
----------
xfns_default : list(callable)
    The default VAEP features.

"""
|
|
|
import math |
|
from typing import Any, Optional |
|
|
|
import numpy as np |
|
import pandas as pd |
|
from sklearn.exceptions import NotFittedError |
|
from sklearn.metrics import brier_score_loss, roc_auc_score |
|
|
|
import socceraction.spadl as spadlcfg |
|
|
|
from . import features as fs |
|
from . import formula as vaep |
|
from . import labels as lab |
|
|
|
# The gradient boosting back-ends are optional dependencies. Each name is
# bound to ``None`` when its package is missing so that the corresponding
# ``_fit_*`` method can raise a clear ImportError at fit time instead of the
# whole module failing on import.
try:
    import xgboost
except ImportError:
    xgboost = None

try:
    import catboost
except ImportError:
    catboost = None

try:
    import lightgbm
except ImportError:
    lightgbm = None
|
|
|
|
|
# Feature transformers applied by default when a VAEP model is created
# without an explicit ``xfns`` argument.
xfns_default = [
    fs.actiontype_onehot,
    fs.result_onehot,
    fs.actiontype_result_onehot,
    fs.bodypart_onehot,
    fs.time,
    fs.startlocation,
    fs.endlocation,
    fs.startpolar,
    fs.endpolar,
    fs.movement,
    fs.team,
    fs.time_delta,
    fs.space_delta,
    fs.goalscore,
]
|
|
|
|
|
class VAEP:
    """An implementation of the VAEP framework.

    VAEP (Valuing Actions by Estimating Probabilities) [1]_ defines the
    problem of valuing a soccer player's contributions within a match as
    a binary classification problem and rates actions by estimating their
    effect on the short-term probabilities that a team will both score and
    concede.

    Parameters
    ----------
    xfns : list
        List of feature transformers (see :mod:`socceraction.vaep.features`)
        used to describe the game states. Uses :attr:`~socceraction.vaep.base.xfns_default`
        if None.
    nb_prev_actions : int, default=3  # noqa: DAR103
        Number of previous actions used to describe the game state.

    References
    ----------
    .. [1] Tom Decroos, Lotte Bransen, Jan Van Haaren, and Jesse Davis.
        "Actions speak louder than goals: Valuing player actions in soccer." In
        Proceedings of the 25th ACM SIGKDD International Conference on Knowledge
        Discovery & Data Mining, pp. 1851-1861. 2019.
    """

    # Module hooks are stored as class attributes so that every method goes
    # through ``self._spadlcfg`` / ``self._fs`` / ``self._lab`` / ``self._vaep``
    # rather than the module globals directly.
    _spadlcfg = spadlcfg
    _fs = fs
    _lab = lab
    _vaep = vaep

    def __init__(
        self,
        xfns: Optional[list[fs.FeatureTransfomer]] = None,
        nb_prev_actions: int = 3,
    ) -> None:
        # One fitted classifier per label column, keyed by the column name.
        self.__models: dict[str, Any] = {}
        # Copy the defaults so that mutating ``self.xfns`` on one instance
        # cannot silently alter the module-level default list.
        self.xfns = list(xfns_default) if xfns is None else xfns
        self.yfns = [self._lab.scores, self._lab.concedes]
        self.nb_prev_actions = nb_prev_actions

    def compute_features(self, game: pd.Series, game_actions: fs.Actions) -> pd.DataFrame:
        """Transform actions to the feature-based representation of game states.

        Parameters
        ----------
        game : pd.Series
            The SPADL representation of a single game.
        game_actions : pd.DataFrame
            The actions performed during `game` in the SPADL representation.

        Returns
        -------
        features : pd.DataFrame
            Returns the feature-based representation of each game state in the game.
        """
        game_actions_with_names = self._spadlcfg.add_names(game_actions)
        gamestates = self._fs.gamestates(game_actions_with_names, self.nb_prev_actions)
        gamestates = self._fs.play_left_to_right(gamestates, game.home_team_id)
        # Each transformer yields a block of columns; stack them side by side.
        return pd.concat([fn(gamestates) for fn in self.xfns], axis=1)

    def compute_labels(
        self,
        game: pd.Series,
        game_actions: fs.Actions,
    ) -> pd.DataFrame:
        """Compute the labels for each game state in the given game.

        Parameters
        ----------
        game : pd.Series
            The SPADL representation of a single game.
        game_actions : pd.DataFrame
            The actions performed during `game` in the SPADL representation.

        Returns
        -------
        labels : pd.DataFrame
            Returns the labels of each game state in the game.
        """
        game_actions_with_names = self._spadlcfg.add_names(game_actions)
        return pd.concat([fn(game_actions_with_names) for fn in self.yfns], axis=1)

    def _feature_columns(self, X: pd.DataFrame) -> list[str]:
        """Return the model's feature column names, checking they exist in `X`.

        Raises
        ------
        ValueError
            If one of the features is missing in the provided dataframe.
        """
        cols = self._fs.feature_column_names(self.xfns, self.nb_prev_actions)
        missing = set(cols).difference(X.columns)
        if missing:
            # Sorted so that the error message is deterministic.
            missing_cols = " and ".join(sorted(missing))
            raise ValueError(f"{missing_cols} are not available in the features dataframe")
        return cols

    def fit(
        self,
        X: pd.DataFrame,
        y: pd.DataFrame,
        learner: str = "xgboost",
        val_size: float = 0.25,
        tree_params: Optional[dict[str, Any]] = None,
        fit_params: Optional[dict[str, Any]] = None,
    ) -> "VAEP":
        """Fit the model according to the given training data.

        Parameters
        ----------
        X : pd.DataFrame
            Feature representation of the game states.
        y : pd.DataFrame
            Scoring and conceding labels for each game state.
        learner : string, default='xgboost'  # noqa: DAR103
            Gradient boosting implementation which should be used to learn the
            model. The supported learners are 'xgboost', 'catboost' and 'lightgbm'.
        val_size : float, default=0.25  # noqa: DAR103
            Percentage of the dataset that will be used as the validation set
            for early stopping. When zero, no validation data will be used.
        tree_params : dict
            Parameters passed to the constructor of the learner.
        fit_params : dict
            Parameters passed to the fit method of the learner.

        Raises
        ------
        ValueError
            If one of the features is missing in the provided dataframe, or
            if `learner` is not one of the supported learners.
        ImportError
            If the package of the requested learner is not installed.

        Returns
        -------
        self
            Fitted VAEP model.
        """
        fitters = {
            "xgboost": self._fit_xgboost,
            "catboost": self._fit_catboost,
            "lightgbm": self._fit_lightgbm,
        }
        # Reject an unknown learner before any data is shuffled or split.
        if learner not in fitters:
            raise ValueError(f"A {learner} learner is not supported")
        fit_learner = fitters[learner]

        cols = self._feature_columns(X)

        # Random split into a train and a validation set. A single split point
        # is used for both slices so that every sample ends up in exactly one
        # of the two sets.
        nb_states = len(X)
        idx = np.random.permutation(nb_states)
        split = math.floor(nb_states * (1 - val_size))
        train_idx, val_idx = idx[:split], idx[split:]

        X_train, y_train = X.iloc[train_idx][cols], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx][cols], y.iloc[val_idx]

        # Train one binary classifier per label column.
        for col in list(y.columns):
            eval_set = [(X_val, y_val[col])] if val_size > 0 else None
            self.__models[col] = fit_learner(
                X_train, y_train[col], eval_set, tree_params, fit_params
            )
        return self

    def _fit_xgboost(
        self,
        X: pd.DataFrame,
        y: pd.Series,
        eval_set: Optional[list[tuple[pd.DataFrame, pd.Series]]] = None,
        tree_params: Optional[dict[str, Any]] = None,
        fit_params: Optional[dict[str, Any]] = None,
    ) -> "xgboost.XGBClassifier":
        """Fit an XGBoost classifier, filling in defaults for omitted params.

        Raises
        ------
        ImportError
            If xgboost is not installed.
        """
        if xgboost is None:
            raise ImportError("xgboost is not installed.")

        if tree_params is None:
            tree_params = {
                "n_estimators": 100,
                "max_depth": 3,
                "eval_metric": "auc",
                "enable_categorical": True,
            }
            if eval_set is not None:
                # Early stopping is only enabled when there is validation
                # data to monitor; xgboost raises an error if it is requested
                # without an evaluation set.
                tree_params["early_stopping_rounds"] = 10
        if fit_params is None:
            fit_params = {"verbose": True}
        if eval_set is not None:
            fit_params = {**fit_params, "eval_set": eval_set}

        model = xgboost.XGBClassifier(**tree_params)
        return model.fit(X, y, **fit_params)

    def _fit_catboost(
        self,
        X: pd.DataFrame,
        y: pd.Series,
        eval_set: Optional[list[tuple[pd.DataFrame, pd.Series]]] = None,
        tree_params: Optional[dict[str, Any]] = None,
        fit_params: Optional[dict[str, Any]] = None,
    ) -> "catboost.CatBoostClassifier":
        """Fit a CatBoost classifier, filling in defaults for omitted params.

        Raises
        ------
        ImportError
            If catboost is not installed.
        """
        if catboost is None:
            raise ImportError("catboost is not installed.")

        if tree_params is None:
            tree_params = {
                "eval_metric": "BrierScore",
                "loss_function": "Logloss",
                "iterations": 100,
            }
        if fit_params is None:
            # Positional indices of the columns with a pandas "category"
            # dtype. ``X.dtypes`` is used instead of ``DataFrame.iteritems``,
            # which was removed in pandas 2.0.
            cat_features = [
                pos for pos, dtype in enumerate(X.dtypes) if dtype.name == "category"
            ]
            fit_params = {
                "cat_features": cat_features,
                "verbose": True,
            }
        if eval_set is not None:
            val_params = {"early_stopping_rounds": 10, "eval_set": eval_set}
            fit_params = {**fit_params, **val_params}

        model = catboost.CatBoostClassifier(**tree_params)
        return model.fit(X, y, **fit_params)

    def _fit_lightgbm(
        self,
        X: pd.DataFrame,
        y: pd.Series,
        eval_set: Optional[list[tuple[pd.DataFrame, pd.Series]]] = None,
        tree_params: Optional[dict[str, Any]] = None,
        fit_params: Optional[dict[str, Any]] = None,
    ) -> "lightgbm.LGBMClassifier":
        """Fit a LightGBM classifier, filling in defaults for omitted params.

        Raises
        ------
        ImportError
            If lightgbm is not installed.
        """
        if lightgbm is None:
            raise ImportError("lightgbm is not installed.")

        # NOTE(review): LightGBM 4.0 appears to have dropped the `verbose` and
        # `early_stopping_rounds` arguments of fit() in favour of callbacks --
        # confirm the supported LightGBM version before relying on these
        # defaults.
        if tree_params is None:
            tree_params = {"n_estimators": 100, "max_depth": 3}
        if fit_params is None:
            fit_params = {"eval_metric": "auc", "verbose": True}
        if eval_set is not None:
            val_params = {"early_stopping_rounds": 10, "eval_set": eval_set}
            fit_params = {**fit_params, **val_params}

        model = lightgbm.LGBMClassifier(**tree_params)
        return model.fit(X, y, **fit_params)

    def _estimate_probabilities(self, X: pd.DataFrame) -> pd.DataFrame:
        """Predict the positive-class probability of each label for `X`.

        Returns a dataframe with one column per fitted model.

        Raises
        ------
        ValueError
            If one of the features is missing in the provided dataframe.
        """
        # Select the feature columns once; the same input is used by every model.
        features = X[self._feature_columns(X)]

        Y_hat = pd.DataFrame()
        for col, model in self.__models.items():
            # predict_proba yields one row per sample; index 1 is the
            # probability of the positive class.
            Y_hat[col] = [p[1] for p in model.predict_proba(features)]
        return Y_hat

    def rate(
        self,
        game: pd.Series,
        game_actions: fs.Actions,
        game_states: Optional[fs.Features] = None,
    ) -> pd.DataFrame:
        """Compute the VAEP rating for the given game states.

        Parameters
        ----------
        game : pd.Series
            The SPADL representation of a single game.
        game_actions : pd.DataFrame
            The actions performed during `game` in the SPADL representation.
        game_states : pd.DataFrame, default=None
            DataFrame with the game state representation of each action. If
            `None`, these will be computed on-the-fly.

        Raises
        ------
        NotFittedError
            If the model is not fitted yet.

        Returns
        -------
        ratings : pd.DataFrame
            Returns the VAEP rating for each given action, as well as the
            offensive and defensive value of each action.
        """
        if not self.__models:
            raise NotFittedError()

        game_actions_with_names = self._spadlcfg.add_names(game_actions)
        if game_states is None:
            game_states = self.compute_features(game, game_actions)

        y_hat = self._estimate_probabilities(game_states)
        p_scores, p_concedes = y_hat.scores, y_hat.concedes
        return self._vaep.value(game_actions_with_names, p_scores, p_concedes)

    def score(self, X: pd.DataFrame, y: pd.DataFrame) -> dict[str, dict[str, float]]:
        """Evaluate the fit of the model on the given test data and labels.

        Parameters
        ----------
        X : pd.DataFrame
            Feature representation of the game states.
        y : pd.DataFrame
            Scoring and conceding labels for each game state.

        Raises
        ------
        NotFittedError
            If the model is not fitted yet.

        Returns
        -------
        score : dict
            The Brier and AUROC scores for both binary classification problems.
        """
        if not self.__models:
            raise NotFittedError()

        y_hat = self._estimate_probabilities(X)

        scores: dict[str, dict[str, float]] = {}
        for col in self.__models:
            scores[col] = {
                "brier": brier_score_loss(y[col], y_hat[col]),
                "auroc": roc_auc_score(y[col], y_hat[col]),
            }
        return scores
|
|