# socceraction/vaep/base.py
"""Implements the VAEP framework.
Attributes
----------
xfns_default : list(callable)
The default VAEP features.
"""
import math
from typing import Any, Optional
import numpy as np
import pandas as pd
from sklearn.exceptions import NotFittedError
from sklearn.metrics import brier_score_loss, roc_auc_score
import socceraction.spadl as spadlcfg
from . import features as fs
from . import formula as vaep
from . import labels as lab
try:
import xgboost
except ImportError:
xgboost = None # type: ignore
try:
import catboost
except ImportError:
catboost = None # type: ignore
try:
import lightgbm
except ImportError:
lightgbm = None # type: ignore
# Default set of feature generators (see socceraction.vaep.features) applied
# to each game state; the list order determines the column order of the
# resulting feature dataframe.
xfns_default = [
    fs.actiontype_onehot,
    fs.result_onehot,
    fs.actiontype_result_onehot,
    fs.bodypart_onehot,
    fs.time,
    fs.startlocation,
    fs.endlocation,
    fs.startpolar,
    fs.endpolar,
    fs.movement,
    fs.team,
    fs.time_delta,
    fs.space_delta,
    fs.goalscore,
]
class VAEP:
    """
    An implementation of the VAEP framework.

    VAEP (Valuing Actions by Estimating Probabilities) [1]_ defines the
    problem of valuing a soccer player's contributions within a match as
    a binary classification problem and rates actions by estimating its effect
    on the short-term probabilities that a team will both score and concede.

    Parameters
    ----------
    xfns : list
        List of feature transformers (see :mod:`socceraction.vaep.features`)
        used to describe the game states. Uses :attr:`~socceraction.vaep.base.xfns_default`
        if None.
    nb_prev_actions : int, default=3  # noqa: DAR103
        Number of previous actions used to describe the game state.

    References
    ----------
    .. [1] Tom Decroos, Lotte Bransen, Jan Van Haaren, and Jesse Davis.
       "Actions speak louder than goals: Valuing player actions in soccer." In
       Proceedings of the 25th ACM SIGKDD International Conference on Knowledge
       Discovery & Data Mining, pp. 1851-1861. 2019.
    """

    # Hooks to the SPADL config and the feature/label/formula modules; kept as
    # class attributes so subclasses can swap in alternative implementations.
    _spadlcfg = spadlcfg
    _fs = fs
    _lab = lab
    _vaep = vaep

    def __init__(
        self,
        xfns: Optional[list[fs.FeatureTransfomer]] = None,
        nb_prev_actions: int = 3,
    ) -> None:
        # Trained classifiers, keyed by label name ('scores' / 'concedes').
        self.__models: dict[str, Any] = {}
        self.xfns = xfns_default if xfns is None else xfns
        self.yfns = [self._lab.scores, self._lab.concedes]
        self.nb_prev_actions = nb_prev_actions

    def compute_features(self, game: pd.Series, game_actions: fs.Actions) -> pd.DataFrame:
        """
        Transform actions to the feature-based representation of game states.

        Parameters
        ----------
        game : pd.Series
            The SPADL representation of a single game.
        game_actions : pd.DataFrame
            The actions performed during `game` in the SPADL representation.

        Returns
        -------
        features : pd.DataFrame
            Returns the feature-based representation of each game state in the game.
        """
        game_actions_with_names = self._spadlcfg.add_names(game_actions)  # type: ignore
        gamestates = self._fs.gamestates(game_actions_with_names, self.nb_prev_actions)
        # Normalize coordinates so both teams attack in the same direction.
        gamestates = self._fs.play_left_to_right(gamestates, game.home_team_id)
        return pd.concat([fn(gamestates) for fn in self.xfns], axis=1)

    def compute_labels(
        self,
        game: pd.Series,
        game_actions: fs.Actions,  # pylint: disable=W0613
    ) -> pd.DataFrame:
        """
        Compute the labels for each game state in the given game.

        Parameters
        ----------
        game : pd.Series
            The SPADL representation of a single game.
        game_actions : pd.DataFrame
            The actions performed during `game` in the SPADL representation.

        Returns
        -------
        labels : pd.DataFrame
            Returns the labels of each game state in the game.
        """
        game_actions_with_names = self._spadlcfg.add_names(game_actions)  # type: ignore
        return pd.concat([fn(game_actions_with_names) for fn in self.yfns], axis=1)

    def _filter_feature_columns(self, X: pd.DataFrame) -> list[str]:
        """Return the expected feature column names, validating they exist in X.

        Parameters
        ----------
        X : pd.DataFrame
            Feature representation of the game states.

        Raises
        ------
        ValueError
            If one of the features is missing in the provided dataframe.

        Returns
        -------
        list(str)
            The names of the columns describing each game state.
        """
        cols = self._fs.feature_column_names(self.xfns, self.nb_prev_actions)
        missing_cols = set(cols).difference(X.columns)
        if missing_cols:
            raise ValueError(
                f"{' and '.join(missing_cols)} are not available in the features dataframe"
            )
        return cols

    def fit(
        self,
        X: pd.DataFrame,
        y: pd.DataFrame,
        learner: str = "xgboost",
        val_size: float = 0.25,
        tree_params: Optional[dict[str, Any]] = None,
        fit_params: Optional[dict[str, Any]] = None,
    ) -> "VAEP":
        """
        Fit the model according to the given training data.

        Parameters
        ----------
        X : pd.DataFrame
            Feature representation of the game states.
        y : pd.DataFrame
            Scoring and conceding labels for each game state.
        learner : string, default='xgboost'  # noqa: DAR103
            Gradient boosting implementation which should be used to learn the
            model. The supported learners are 'xgboost', 'catboost' and 'lightgbm'.
        val_size : float, default=0.25  # noqa: DAR103
            Percentage of the dataset that will be used as the validation set
            for early stopping. When zero, no validation data will be used.
        tree_params : dict
            Parameters passed to the constructor of the learner.
        fit_params : dict
            Parameters passed to the fit method of the learner.

        Raises
        ------
        ValueError
            If one of the features is missing in the provided dataframe, or if
            the requested learner is not supported.

        Returns
        -------
        self
            Fitted VAEP model.
        """
        fitters = {
            "xgboost": self._fit_xgboost,
            "catboost": self._fit_catboost,
            "lightgbm": self._fit_lightgbm,
        }
        # Fail fast on an unsupported learner, before any model is trained.
        if learner not in fitters:
            raise ValueError(f"A {learner} learner is not supported")
        # random train/validation split for early stopping
        nb_states = len(X)
        idx = np.random.permutation(nb_states)
        # NOTE: a previous version skipped the sample at the split boundary
        # (train took [:k], validation took [k+1:]); every sample is now used.
        nb_train = math.floor(nb_states * (1 - val_size))
        train_idx = idx[:nb_train]
        val_idx = idx[nb_train:]
        # filter feature columns
        cols = self._filter_feature_columns(X)
        # split train and validation data
        X_train, y_train = X.iloc[train_idx][cols], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx][cols], y.iloc[val_idx]
        # train one binary classifier per label column: F(X) = Y
        for col in list(y.columns):
            eval_set = [(X_val, y_val[col])] if val_size > 0 else None
            self.__models[col] = fitters[learner](
                X_train, y_train[col], eval_set, tree_params, fit_params
            )
        return self

    def _fit_xgboost(
        self,
        X: pd.DataFrame,
        y: pd.Series,
        eval_set: Optional[list[tuple[pd.DataFrame, pd.Series]]] = None,
        tree_params: Optional[dict[str, Any]] = None,
        fit_params: Optional[dict[str, Any]] = None,
    ) -> "xgboost.XGBClassifier":
        """Train an XGBoost classifier on the given data.

        Raises
        ------
        ImportError
            If xgboost is not installed.
        """
        if xgboost is None:
            raise ImportError("xgboost is not installed.")
        # Default settings
        if tree_params is None:
            tree_params = {
                "n_estimators": 100,
                "max_depth": 3,
                "eval_metric": "auc",
                "early_stopping_rounds": 10,
                "enable_categorical": True,
            }
        if fit_params is None:
            fit_params = {"verbose": True}
        if eval_set is not None:
            # Provide the validation data that early stopping monitors.
            fit_params = {**fit_params, "eval_set": eval_set}
        # Train the model
        model = xgboost.XGBClassifier(**tree_params)
        return model.fit(X, y, **fit_params)

    def _fit_catboost(
        self,
        X: pd.DataFrame,
        y: pd.Series,
        eval_set: Optional[list[tuple[pd.DataFrame, pd.Series]]] = None,
        tree_params: Optional[dict[str, Any]] = None,
        fit_params: Optional[dict[str, Any]] = None,
    ) -> "catboost.CatBoostClassifier":
        """Train a CatBoost classifier on the given data.

        Raises
        ------
        ImportError
            If catboost is not installed.
        """
        if catboost is None:
            raise ImportError("catboost is not installed.")
        # Default settings
        if tree_params is None:
            tree_params = {
                "eval_metric": "BrierScore",
                "loss_function": "Logloss",
                "iterations": 100,
            }
        if fit_params is None:
            # CatBoost expects the positional indices of categorical columns.
            # (DataFrame.items replaces iteritems, removed in pandas 2.0.)
            is_cat_feature = [c.dtype.name == "category" for (_, c) in X.items()]
            fit_params = {
                "cat_features": np.nonzero(is_cat_feature)[0].tolist(),
                "verbose": True,
            }
        if eval_set is not None:
            fit_params = {**fit_params, "early_stopping_rounds": 10, "eval_set": eval_set}
        # Train the model
        model = catboost.CatBoostClassifier(**tree_params)
        return model.fit(X, y, **fit_params)

    def _fit_lightgbm(
        self,
        X: pd.DataFrame,
        y: pd.Series,
        eval_set: Optional[list[tuple[pd.DataFrame, pd.Series]]] = None,
        tree_params: Optional[dict[str, Any]] = None,
        fit_params: Optional[dict[str, Any]] = None,
    ) -> "lightgbm.LGBMClassifier":
        """Train a LightGBM classifier on the given data.

        Raises
        ------
        ImportError
            If lightgbm is not installed.
        """
        if lightgbm is None:
            raise ImportError("lightgbm is not installed.")
        # Default settings
        if tree_params is None:
            tree_params = {"n_estimators": 100, "max_depth": 3}
        if fit_params is None:
            fit_params = {"eval_metric": "auc", "verbose": True}
        if eval_set is not None:
            fit_params = {**fit_params, "early_stopping_rounds": 10, "eval_set": eval_set}
        # Train the model
        model = lightgbm.LGBMClassifier(**tree_params)
        return model.fit(X, y, **fit_params)

    def _estimate_probabilities(self, X: pd.DataFrame) -> pd.DataFrame:
        """Predict the scoring and conceding probability of each game state."""
        cols = self._filter_feature_columns(X)
        Y_hat = pd.DataFrame()
        for col, model in self.__models.items():
            # predict_proba returns an (n_samples, 2) array; column 1 holds
            # the probability of the positive class.
            Y_hat[col] = model.predict_proba(X[cols])[:, 1]
        return Y_hat

    def rate(
        self,
        game: pd.Series,
        game_actions: fs.Actions,
        game_states: Optional[fs.Features] = None,
    ) -> pd.DataFrame:
        """
        Compute the VAEP rating for the given game states.

        Parameters
        ----------
        game : pd.Series
            The SPADL representation of a single game.
        game_actions : pd.DataFrame
            The actions performed during `game` in the SPADL representation.
        game_states : pd.DataFrame, default=None
            DataFrame with the game state representation of each action. If
            `None`, these will be computed on-the-fly.

        Raises
        ------
        NotFittedError
            If the model is not fitted yet.

        Returns
        -------
        ratings : pd.DataFrame
            Returns the VAEP rating for each given action, as well as the
            offensive and defensive value of each action.
        """
        if not self.__models:
            raise NotFittedError()
        game_actions_with_names = self._spadlcfg.add_names(game_actions)  # type: ignore
        if game_states is None:
            game_states = self.compute_features(game, game_actions)
        y_hat = self._estimate_probabilities(game_states)
        p_scores, p_concedes = y_hat.scores, y_hat.concedes
        vaep_values = self._vaep.value(game_actions_with_names, p_scores, p_concedes)
        return vaep_values

    def score(self, X: pd.DataFrame, y: pd.DataFrame) -> dict[str, dict[str, float]]:
        """Evaluate the fit of the model on the given test data and labels.

        Parameters
        ----------
        X : pd.DataFrame
            Feature representation of the game states.
        y : pd.DataFrame
            Scoring and conceding labels for each game state.

        Raises
        ------
        NotFittedError
            If the model is not fitted yet.

        Returns
        -------
        score : dict
            The Brier and AUROC scores for both binary classification problems.
        """
        if not self.__models:
            raise NotFittedError()
        y_hat = self._estimate_probabilities(X)
        scores: dict[str, dict[str, float]] = {}
        for col in self.__models:
            scores[col] = {
                "brier": brier_score_loss(y[col], y_hat[col]),
                "auroc": roc_auc_score(y[col], y_hat[col]),
            }
        return scores