# socceraction/vaep/base.py
"""Implements the VAEP framework.
Attributes
----------
xfns_default : list(callable)
The default VAEP features.
"""
import math
from typing import Any, Optional
import numpy as np
import pandas as pd
from sklearn.exceptions import NotFittedError
from sklearn.metrics import brier_score_loss, roc_auc_score
import socceraction.spadl as spadlcfg
from . import features as fs
from . import formula as vaep
from . import labels as lab
try:
import xgboost
except ImportError:
xgboost = None # type: ignore
try:
import catboost
except ImportError:
catboost = None # type: ignore
try:
import lightgbm
except ImportError:
lightgbm = None # type: ignore
# Default set of feature generators (see socceraction.vaep.features) applied
# to each game state; the list order determines the column order of the
# resulting feature dataframe.
xfns_default = [
    fs.actiontype_onehot,
    fs.result_onehot,
    fs.actiontype_result_onehot,
    fs.bodypart_onehot,
    fs.time,
    fs.startlocation,
    fs.endlocation,
    fs.startpolar,
    fs.endpolar,
    fs.movement,
    fs.team,
    fs.time_delta,
    fs.space_delta,
    fs.goalscore,
]
class VAEP:
    """
    An implementation of the VAEP framework.

    VAEP (Valuing Actions by Estimating Probabilities) [1]_ defines the
    problem of valuing a soccer player's contributions within a match as
    a binary classification problem and rates actions by estimating its effect
    on the short-term probabilities that a team will both score and concede.

    Parameters
    ----------
    xfns : list
        List of feature transformers (see :mod:`socceraction.vaep.features`)
        used to describe the game states. Uses :attr:`~socceraction.vaep.base.xfns_default`
        if None.
    nb_prev_actions : int, default=3  # noqa: DAR103
        Number of previous actions used to describe the game state.

    References
    ----------
    .. [1] Tom Decroos, Lotte Bransen, Jan Van Haaren, and Jesse Davis.
       "Actions speak louder than goals: Valuing player actions in soccer." In
       Proceedings of the 25th ACM SIGKDD International Conference on Knowledge
       Discovery & Data Mining, pp. 1851-1861. 2019.
    """

    # Hooks to the SPADL config and the feature/label/formula modules; kept as
    # class attributes so subclasses can swap in alternative implementations.
    _spadlcfg = spadlcfg
    _fs = fs
    _lab = lab
    _vaep = vaep

    def __init__(
        self,
        xfns: Optional[list[fs.FeatureTransfomer]] = None,
        nb_prev_actions: int = 3,
    ) -> None:
        # Trained classifiers, keyed by label name ('scores' / 'concedes').
        self.__models: dict[str, Any] = {}
        self.xfns = xfns_default if xfns is None else xfns
        self.yfns = [self._lab.scores, self._lab.concedes]
        self.nb_prev_actions = nb_prev_actions

    def compute_features(self, game: pd.Series, game_actions: fs.Actions) -> pd.DataFrame:
        """
        Transform actions to the feature-based representation of game states.

        Parameters
        ----------
        game : pd.Series
            The SPADL representation of a single game.
        game_actions : pd.DataFrame
            The actions performed during `game` in the SPADL representation.

        Returns
        -------
        features : pd.DataFrame
            Returns the feature-based representation of each game state in the game.
        """
        game_actions_with_names = self._spadlcfg.add_names(game_actions)  # type: ignore
        gamestates = self._fs.gamestates(game_actions_with_names, self.nb_prev_actions)
        # Normalize coordinates so both teams attack in the same direction.
        gamestates = self._fs.play_left_to_right(gamestates, game.home_team_id)
        return pd.concat([fn(gamestates) for fn in self.xfns], axis=1)

    def compute_labels(
        self,
        game: pd.Series,
        game_actions: fs.Actions,  # pylint: disable=W0613
    ) -> pd.DataFrame:
        """
        Compute the labels for each game state in the given game.

        Parameters
        ----------
        game : pd.Series
            The SPADL representation of a single game.
        game_actions : pd.DataFrame
            The actions performed during `game` in the SPADL representation.

        Returns
        -------
        labels : pd.DataFrame
            Returns the labels of each game state in the game.
        """
        game_actions_with_names = self._spadlcfg.add_names(game_actions)  # type: ignore
        return pd.concat([fn(game_actions_with_names) for fn in self.yfns], axis=1)

    def _filter_feature_columns(self, X: pd.DataFrame) -> list[str]:
        """Return the expected feature column names, validating they exist in X.

        Parameters
        ----------
        X : pd.DataFrame
            Feature representation of the game states.

        Raises
        ------
        ValueError
            If one of the features is missing in the provided dataframe.

        Returns
        -------
        list(str)
            The names of the columns describing each game state.
        """
        cols = self._fs.feature_column_names(self.xfns, self.nb_prev_actions)
        missing_cols = set(cols).difference(X.columns)
        if missing_cols:
            raise ValueError(
                f"{' and '.join(missing_cols)} are not available in the features dataframe"
            )
        return cols

    def fit(
        self,
        X: pd.DataFrame,
        y: pd.DataFrame,
        learner: str = "xgboost",
        val_size: float = 0.25,
        tree_params: Optional[dict[str, Any]] = None,
        fit_params: Optional[dict[str, Any]] = None,
    ) -> "VAEP":
        """
        Fit the model according to the given training data.

        Parameters
        ----------
        X : pd.DataFrame
            Feature representation of the game states.
        y : pd.DataFrame
            Scoring and conceding labels for each game state.
        learner : string, default='xgboost'  # noqa: DAR103
            Gradient boosting implementation which should be used to learn the
            model. The supported learners are 'xgboost', 'catboost' and 'lightgbm'.
        val_size : float, default=0.25  # noqa: DAR103
            Percentage of the dataset that will be used as the validation set
            for early stopping. When zero, no validation data will be used.
        tree_params : dict
            Parameters passed to the constructor of the learner.
        fit_params : dict
            Parameters passed to the fit method of the learner.

        Raises
        ------
        ValueError
            If one of the features is missing in the provided dataframe, or if
            the requested learner is not supported.

        Returns
        -------
        self
            Fitted VAEP model.
        """
        fitters = {
            "xgboost": self._fit_xgboost,
            "catboost": self._fit_catboost,
            "lightgbm": self._fit_lightgbm,
        }
        # Fail fast on an unsupported learner, before any model is trained.
        if learner not in fitters:
            raise ValueError(f"A {learner} learner is not supported")
        # random train/validation split for early stopping
        nb_states = len(X)
        idx = np.random.permutation(nb_states)
        # NOTE: a previous version skipped the sample at the split boundary
        # (train took [:k], validation took [k+1:]); every sample is now used.
        nb_train = math.floor(nb_states * (1 - val_size))
        train_idx = idx[:nb_train]
        val_idx = idx[nb_train:]
        # filter feature columns
        cols = self._filter_feature_columns(X)
        # split train and validation data
        X_train, y_train = X.iloc[train_idx][cols], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx][cols], y.iloc[val_idx]
        # train one binary classifier per label column: F(X) = Y
        for col in list(y.columns):
            eval_set = [(X_val, y_val[col])] if val_size > 0 else None
            self.__models[col] = fitters[learner](
                X_train, y_train[col], eval_set, tree_params, fit_params
            )
        return self

    def _fit_xgboost(
        self,
        X: pd.DataFrame,
        y: pd.Series,
        eval_set: Optional[list[tuple[pd.DataFrame, pd.Series]]] = None,
        tree_params: Optional[dict[str, Any]] = None,
        fit_params: Optional[dict[str, Any]] = None,
    ) -> "xgboost.XGBClassifier":
        """Train an XGBoost classifier on the given data.

        Raises
        ------
        ImportError
            If xgboost is not installed.
        """
        if xgboost is None:
            raise ImportError("xgboost is not installed.")
        # Default settings
        if tree_params is None:
            tree_params = {
                "n_estimators": 100,
                "max_depth": 3,
                "eval_metric": "auc",
                "early_stopping_rounds": 10,
                "enable_categorical": True,
            }
        if fit_params is None:
            fit_params = {"verbose": True}
        if eval_set is not None:
            # Provide the validation data that early stopping monitors.
            fit_params = {**fit_params, "eval_set": eval_set}
        # Train the model
        model = xgboost.XGBClassifier(**tree_params)
        return model.fit(X, y, **fit_params)

    def _fit_catboost(
        self,
        X: pd.DataFrame,
        y: pd.Series,
        eval_set: Optional[list[tuple[pd.DataFrame, pd.Series]]] = None,
        tree_params: Optional[dict[str, Any]] = None,
        fit_params: Optional[dict[str, Any]] = None,
    ) -> "catboost.CatBoostClassifier":
        """Train a CatBoost classifier on the given data.

        Raises
        ------
        ImportError
            If catboost is not installed.
        """
        if catboost is None:
            raise ImportError("catboost is not installed.")
        # Default settings
        if tree_params is None:
            tree_params = {
                "eval_metric": "BrierScore",
                "loss_function": "Logloss",
                "iterations": 100,
            }
        if fit_params is None:
            # CatBoost expects the positional indices of categorical columns.
            # (DataFrame.items replaces iteritems, removed in pandas 2.0.)
            is_cat_feature = [c.dtype.name == "category" for (_, c) in X.items()]
            fit_params = {
                "cat_features": np.nonzero(is_cat_feature)[0].tolist(),
                "verbose": True,
            }
        if eval_set is not None:
            fit_params = {**fit_params, "early_stopping_rounds": 10, "eval_set": eval_set}
        # Train the model
        model = catboost.CatBoostClassifier(**tree_params)
        return model.fit(X, y, **fit_params)

    def _fit_lightgbm(
        self,
        X: pd.DataFrame,
        y: pd.Series,
        eval_set: Optional[list[tuple[pd.DataFrame, pd.Series]]] = None,
        tree_params: Optional[dict[str, Any]] = None,
        fit_params: Optional[dict[str, Any]] = None,
    ) -> "lightgbm.LGBMClassifier":
        """Train a LightGBM classifier on the given data.

        Raises
        ------
        ImportError
            If lightgbm is not installed.
        """
        if lightgbm is None:
            raise ImportError("lightgbm is not installed.")
        # Default settings
        if tree_params is None:
            tree_params = {"n_estimators": 100, "max_depth": 3}
        if fit_params is None:
            fit_params = {"eval_metric": "auc", "verbose": True}
        if eval_set is not None:
            fit_params = {**fit_params, "early_stopping_rounds": 10, "eval_set": eval_set}
        # Train the model
        model = lightgbm.LGBMClassifier(**tree_params)
        return model.fit(X, y, **fit_params)

    def _estimate_probabilities(self, X: pd.DataFrame) -> pd.DataFrame:
        """Predict the scoring and conceding probability of each game state."""
        cols = self._filter_feature_columns(X)
        Y_hat = pd.DataFrame()
        for col, model in self.__models.items():
            # predict_proba returns an (n_samples, 2) array; column 1 holds
            # the probability of the positive class.
            Y_hat[col] = model.predict_proba(X[cols])[:, 1]
        return Y_hat

    def rate(
        self,
        game: pd.Series,
        game_actions: fs.Actions,
        game_states: Optional[fs.Features] = None,
    ) -> pd.DataFrame:
        """
        Compute the VAEP rating for the given game states.

        Parameters
        ----------
        game : pd.Series
            The SPADL representation of a single game.
        game_actions : pd.DataFrame
            The actions performed during `game` in the SPADL representation.
        game_states : pd.DataFrame, default=None
            DataFrame with the game state representation of each action. If
            `None`, these will be computed on-the-fly.

        Raises
        ------
        NotFittedError
            If the model is not fitted yet.

        Returns
        -------
        ratings : pd.DataFrame
            Returns the VAEP rating for each given action, as well as the
            offensive and defensive value of each action.
        """
        if not self.__models:
            raise NotFittedError()
        game_actions_with_names = self._spadlcfg.add_names(game_actions)  # type: ignore
        if game_states is None:
            game_states = self.compute_features(game, game_actions)
        y_hat = self._estimate_probabilities(game_states)
        p_scores, p_concedes = y_hat.scores, y_hat.concedes
        vaep_values = self._vaep.value(game_actions_with_names, p_scores, p_concedes)
        return vaep_values

    def score(self, X: pd.DataFrame, y: pd.DataFrame) -> dict[str, dict[str, float]]:
        """Evaluate the fit of the model on the given test data and labels.

        Parameters
        ----------
        X : pd.DataFrame
            Feature representation of the game states.
        y : pd.DataFrame
            Scoring and conceding labels for each game state.

        Raises
        ------
        NotFittedError
            If the model is not fitted yet.

        Returns
        -------
        score : dict
            The Brier and AUROC scores for both binary classification problems.
        """
        if not self.__models:
            raise NotFittedError()
        y_hat = self._estimate_probabilities(X)
        scores: dict[str, dict[str, float]] = {}
        for col in self.__models:
            scores[col] = {
                "brier": brier_score_loss(y[col], y_hat[col]),
                "auroc": roc_auc_score(y[col], y_hat[col]),
            }
        return scores