Spaces:

scfive
/

socr

Configuration error

App Files Files Community

socr / vaep /features.py

scfive

Upload 203 files

d6ea71e verified 5 months ago

raw

history blame contribute delete

22 kB

	"""Implements the feature transformers of the VAEP framework."""

	from functools import wraps
	from typing import Any, Callable, Union, no_type_check

	import numpy as np # type: ignore
	import pandas as pd # type: ignore
	from pandera.typing import DataFrame

	import socceraction.spadl.config as spadlcfg
	from socceraction.atomic.spadl import AtomicSPADLSchema
	from socceraction.spadl.schema import SPADLSchema

	# Type aliases shared by the feature transformers in this module.
	# A dataframe of actions typed with the (non-atomic) SPADL schema.
	SPADLActions = DataFrame[SPADLSchema]
	# A dataframe of actions typed with either the SPADL or the Atomic-SPADL schema.
	Actions = Union[DataFrame[SPADLSchema], DataFrame[AtomicSPADLSchema]]
	# A list of action dataframes; see `gamestates` for how the list is built.
	GameStates = list[Actions]
	# A dataframe of generated features (no schema enforced).
	Features = DataFrame[Any]
	# A callable that turns game states into a dataframe of features.
	FeatureTransfomer = Callable[[GameStates], Features]


	def feature_column_names(fs: list[FeatureTransfomer], nb_prev_actions: int = 3) -> list[str]:
	    """Return the names of the features generated by a list of transformers.

	    The transformers are run on a dummy frame of ten all-zero actions and the
	    column names of their concatenated output are collected.

	    Parameters
	    ----------
	    fs : list(callable)
	        A list of feature transformers.
	    nb_prev_actions : int, default=3  # noqa: DAR103
	        The number of previous actions included in the game state.

	    Returns
	    -------
	    list(str)
	        The name of each generated feature.
	    """
	    spadl_cols = [
	        "game_id",
	        "original_event_id",
	        "action_id",
	        "period_id",
	        "time_seconds",
	        "team_id",
	        "player_id",
	        "start_x",
	        "start_y",
	        "end_x",
	        "end_y",
	        "result_id",
	        "result_name",
	        "bodypart_id",
	        "bodypart_name",
	        "type_id",
	        "type_name",
	    ]
	    dummy = pd.DataFrame(np.zeros((10, len(spadl_cols))), columns=spadl_cols)
	    # The "*_name" columns are cast to strings; all other columns stay numeric.
	    for name_col in (col for col in spadl_cols if "name" in col):
	        dummy[name_col] = dummy[name_col].astype(str)
	    states = gamestates(dummy, nb_prev_actions)  # type: ignore
	    feature_frames = [transform(states) for transform in fs]
	    return list(pd.concat(feature_frames, axis=1).columns.values)


	def gamestates(actions: Actions, nb_prev_actions: int = 3) -> GameStates:
	    r"""Convert a dataframe of actions to gamestates.

	    Each gamestate is represented as the <nb_prev_actions> previous actions.

	    The list of gamestates is internally represented as a list of actions
	    dataframes :math:`[a_0,a_1,\ldots]` where each row in the a_i dataframe
	    contains the previous action of the action in the same row in the
	    :math:`a_{i-1}` dataframe. Shifting is done per ("game_id", "period_id")
	    group; rows without a predecessor in their group are filled with the
	    first action of that group.

	    Parameters
	    ----------
	    actions : Actions
	        A DataFrame with the actions of a game.
	    nb_prev_actions : int, default=3  # noqa: DAR103
	        The number of previous actions included in the game state.

	    Raises
	    ------
	    ValueError
	        If the number of previous actions is smaller than 1.

	    Returns
	    -------
	    GameStates
	        The <nb_prev_actions> previous actions for each action.
	    """
	    if nb_prev_actions < 1:
	        raise ValueError("The game state should include at least one preceding action.")

	    group_keys = ["game_id", "period_id"]
	    history = [actions]
	    for lag in range(1, nb_prev_actions):

	        def _lagged(group, lag=lag):
	            # Shift the group down by `lag` rows and back-fill the gap with
	            # the group's first action.
	            return group.shift(lag, fill_value=float("nan")).fillna(group.iloc[0])

	        shifted = actions.groupby(group_keys, sort=False, as_index=False).apply(_lagged)
	        # Restore the original row labels so all states line up row by row.
	        shifted.index = actions.index.copy()
	        history.append(shifted)  # type: ignore
	    return history


	def play_left_to_right(gamestates: GameStates, home_team_id: int) -> GameStates:
	    """Perform all actions in a gamestate in the same playing direction.

	    This changes the start and end location of each action in a gamestate,
	    such that all actions are performed as if the team that performs the first
	    action in the gamestate plays from left to right. Rows are mirrored when
	    the team of the first action (``gamestates[0]``) is not the home team.

	    The dataframes in ``gamestates`` are modified in place.

	    Parameters
	    ----------
	    gamestates : GameStates
	        The game states of a game.
	    home_team_id : int
	        The ID of the home team.

	    Returns
	    -------
	    GameStates
	        The game states with all actions performed left to right.
	    """
	    is_away = gamestates[0].team_id != home_team_id
	    # Each coordinate column is mirrored around the pitch dimension it runs along.
	    mirror_extent = {
	        "start_x": spadlcfg.field_length,
	        "end_x": spadlcfg.field_length,
	        "start_y": spadlcfg.field_width,
	        "end_y": spadlcfg.field_width,
	    }
	    for state in gamestates:
	        for column, extent in mirror_extent.items():
	            state.loc[is_away, column] = extent - state.loc[is_away, column].values
	    return gamestates


	@no_type_check
	def simple(actionfn: Callable) -> FeatureTransfomer:
	    """Make a function decorator to apply actionfeatures to game states.

	    The wrapped transformer is applied to every actions dataframe of the game
	    state; the columns produced for the i-th dataframe get the suffix
	    ``_a<i>`` and all results are concatenated column-wise. The undecorated
	    function stays reachable through ``__wrapped__``.

	    Parameters
	    ----------
	    actionfn : Callable
	        A feature transformer that operates on actions.

	    Returns
	    -------
	    FeatureTransfomer
	        A feature transformer that operates on game states.
	    """

	    @wraps(actionfn)
	    def _wrapper(gamestates: list[Actions]) -> pd.DataFrame:
	        # Accept a single actions dataframe as a one-element game state.
	        states = gamestates if isinstance(gamestates, list) else [gamestates]
	        per_state = []
	        for position, state_actions in enumerate(states):
	            features = actionfn(state_actions)
	            suffix = "_a" + str(position)
	            features.columns = [column + suffix for column in features.columns]
	            per_state.append(features)
	        return pd.concat(per_state, axis=1)

	    return _wrapper


	# SIMPLE FEATURES


	@simple
	def actiontype(actions: Actions) -> Features:
	    """Get the type of each action.

	    Parameters
	    ----------
	    actions : Actions
	        The actions of a game.

	    Returns
	    -------
	    Features
	        A categorical 'actiontype' column built from the 'type_id' of each
	        action, with ``spadlcfg.actiontypes`` as categories.
	    """
	    id_to_name = spadlcfg.actiontypes_df().type_name.to_dict()
	    type_labels = actions["type_id"].replace(id_to_name)
	    features = pd.DataFrame(index=actions.index)
	    features["actiontype"] = pd.Categorical(
	        type_labels,
	        categories=spadlcfg.actiontypes,
	        ordered=False,
	    )
	    return features


	@simple
	def actiontype_onehot(actions: SPADLActions) -> Features:
	    """Get the one-hot-encoded type of each action.

	    Parameters
	    ----------
	    actions : SPADLActions
	        The actions of a game.

	    Returns
	    -------
	    Features
	        A one-hot encoding of each action's type, with one boolean column
	        'actiontype_<type_name>' per entry of ``spadlcfg.actiontypes``.
	    """
	    type_ids = actions["type_id"]
	    onehot = {
	        "actiontype_" + type_name: type_ids == type_id
	        for type_id, type_name in enumerate(spadlcfg.actiontypes)
	    }
	    return pd.DataFrame(onehot, index=actions.index)


	@simple
	def result(actions: SPADLActions) -> Features:
	    """Get the result of each action.

	    Parameters
	    ----------
	    actions : SPADLActions
	        The actions of a game.

	    Returns
	    -------
	    Features
	        A categorical 'result' column built from the 'result_id' of each
	        action, with ``spadlcfg.results`` as categories.
	    """
	    id_to_name = spadlcfg.results_df().result_name.to_dict()
	    result_labels = actions["result_id"].replace(id_to_name)
	    features = pd.DataFrame(index=actions.index)
	    features["result"] = pd.Categorical(
	        result_labels,
	        categories=spadlcfg.results,
	        ordered=False,
	    )
	    return features


	@simple
	def result_onehot(actions: SPADLActions) -> Features:
	    """Get the one-hot-encoded result of each action.

	    Parameters
	    ----------
	    actions : SPADLActions
	        The actions of a game.

	    Returns
	    -------
	    Features
	        The one-hot encoding of each action's result, with one boolean
	        column 'result_<result_name>' per entry of ``spadlcfg.results``.
	    """
	    result_ids = actions["result_id"]
	    onehot = {
	        "result_" + result_name: result_ids == result_id
	        for result_id, result_name in enumerate(spadlcfg.results)
	    }
	    return pd.DataFrame(onehot, index=actions.index)


	@simple
	def actiontype_result_onehot(actions: SPADLActions) -> Features:
	    """Get a one-hot encoding of the combination between the type and result of each action.

	    Parameters
	    ----------
	    actions : SPADLActions
	        The actions of a game.

	    Returns
	    -------
	    Features
	        The one-hot encoding of each action's type and result: one boolean
	        column '<type column>_<result column>' per (type, result) pair.
	    """
	    # Use the undecorated transformers so they run on a single actions frame
	    # and do not add the "_a<i>" suffix.
	    result_flags = result_onehot.__wrapped__(actions)  # type: ignore
	    type_flags = actiontype_onehot.__wrapped__(actions)  # type: ignore
	    combined = {
	        type_col + "_" + result_col: type_flags[type_col] & result_flags[result_col]
	        for type_col in list(type_flags.columns)
	        for result_col in list(result_flags.columns)
	    }
	    return pd.DataFrame(combined, index=actions.index)


	@simple
	def bodypart(actions: Actions) -> Features:
	    """Get the body part used to perform each action.

	    This feature generator does not distinguish between the left and right foot.

	    Parameters
	    ----------
	    actions : Actions
	        The actions of a game.

	    Returns
	    -------
	    Features
	        A categorical 'bodypart' column built from the 'bodypart_id' of each
	        action, with categories "foot", "head", "other" and "head/other".

	    See Also
	    --------
	    bodypart_detailed :
	        An alternative version that splits between the left and right foot.
	    """
	    foot_id = spadlcfg.bodyparts.index("foot")
	    left_foot_id = spadlcfg.bodyparts.index("foot_left")
	    right_foot_id = spadlcfg.bodyparts.index("foot_right")
	    # Collapse both feet onto the generic "foot" id before naming the body parts.
	    merged_ids = actions["bodypart_id"].replace([left_foot_id, right_foot_id], foot_id)
	    part_labels = merged_ids.replace(spadlcfg.bodyparts_df().bodypart_name.to_dict())
	    features = pd.DataFrame(index=actions.index)
	    features["bodypart"] = pd.Categorical(
	        part_labels,
	        categories=["foot", "head", "other", "head/other"],
	        ordered=False,
	    )
	    return features


	@simple
	def bodypart_detailed(actions: Actions) -> Features:
	    """Get the body part with split by foot used to perform each action.

	    This feature generator distinguishes between the left and right foot, if
	    supported by the dataprovider.

	    Parameters
	    ----------
	    actions : Actions
	        The actions of a game.

	    Returns
	    -------
	    Features
	        A categorical 'bodypart' column built from the 'bodypart_id' of each
	        action, with ``spadlcfg.bodyparts`` as categories.

	    See Also
	    --------
	    bodypart :
	        An alternative version that does not split between the left and right foot.
	    """
	    id_to_name = spadlcfg.bodyparts_df().bodypart_name.to_dict()
	    part_labels = actions["bodypart_id"].replace(id_to_name)
	    features = pd.DataFrame(index=actions.index)
	    features["bodypart"] = pd.Categorical(
	        part_labels,
	        categories=spadlcfg.bodyparts,
	        ordered=False,
	    )
	    return features


	@simple
	def bodypart_onehot(actions: Actions) -> Features:
	    """Get the one-hot-encoded bodypart of each action.

	    This feature generator does not distinguish between the left and right
	    foot: no 'foot_left'/'foot_right' columns are produced and both count
	    towards 'bodypart_foot'. The 'bodypart_head/other' column is also set for
	    actions labelled "head" or "other".

	    Parameters
	    ----------
	    actions : Actions
	        The actions of a game.

	    Returns
	    -------
	    Features
	        The one-hot encoding of each action's bodypart.

	    See Also
	    --------
	    bodypart_detailed_onehot :
	        An alternative version that splits between the left and right foot.
	    """
	    id_of = spadlcfg.bodyparts.index
	    part_ids = actions["bodypart_id"]
	    onehot = {}
	    for part_id, part_name in enumerate(spadlcfg.bodyparts):
	        if part_name in ("foot_left", "foot_right"):
	            continue
	        if part_name == "foot":
	            members = [id_of("foot"), id_of("foot_left"), id_of("foot_right")]
	            flags = part_ids.isin(members)
	        elif part_name == "head/other":
	            members = [id_of("head"), id_of("other"), id_of("head/other")]
	            flags = part_ids.isin(members)
	        else:
	            flags = part_ids == part_id
	        onehot["bodypart_" + part_name] = flags
	    return pd.DataFrame(onehot, index=actions.index)


	@simple
	def bodypart_detailed_onehot(actions: Actions) -> Features:
	    """Get the one-hot-encoded bodypart with split by foot of each action.

	    This feature generator distinguishes between the left and right foot, if
	    supported by the dataprovider. One column is produced per entry of
	    ``spadlcfg.bodyparts``; 'bodypart_foot' is also set for left/right foot
	    actions and 'bodypart_head/other' is also set for "head" or "other".

	    Parameters
	    ----------
	    actions : Actions
	        The actions of a game.

	    Returns
	    -------
	    Features
	        The one-hot encoding of each action's bodypart.

	    See Also
	    --------
	    bodypart_onehot :
	        An alternative version that does not split between the left and right foot.
	    """
	    id_of = spadlcfg.bodyparts.index
	    part_ids = actions["bodypart_id"]
	    onehot = {}
	    for part_id, part_name in enumerate(spadlcfg.bodyparts):
	        if part_name == "foot":
	            members = [id_of("foot"), id_of("foot_left"), id_of("foot_right")]
	            flags = part_ids.isin(members)
	        elif part_name == "head/other":
	            members = [id_of("head"), id_of("other"), id_of("head/other")]
	            flags = part_ids.isin(members)
	        else:
	            flags = part_ids == part_id
	        onehot["bodypart_" + part_name] = flags
	    return pd.DataFrame(onehot, index=actions.index)


	@simple
	def time(actions: Actions) -> Features:
	    """Get the time when each action was performed.

	    This generates the following features:
	        :period_id:
	            The ID of the period.
	        :time_seconds:
	            Seconds since the start of the period.
	        :time_seconds_overall:
	            Seconds since the start of the game. Stoppage time during previous
	            periods is ignored.

	    Parameters
	    ----------
	    actions : Actions
	        The actions of a game.

	    Returns
	    -------
	    Features
	        The 'period_id', 'time_seconds' and 'time_seconds_overall' when each
	        action was performed.
	    """
	    # Match minute at which each period starts; period ids outside this
	    # table map to NaN.
	    period_start_minute = {1: 0, 2: 45, 3: 90, 4: 105, 5: 120}
	    clock = actions[["period_id", "time_seconds"]].copy()
	    period_offset = clock.period_id.map(period_start_minute) * 60
	    clock["time_seconds_overall"] = period_offset + clock.time_seconds
	    return clock


	@simple
	def startlocation(actions: SPADLActions) -> Features:
	    """Get the location where each action started.

	    Parameters
	    ----------
	    actions : SPADLActions
	        The actions of a game.

	    Returns
	    -------
	    Features
	        The 'start_x' and 'start_y' location of each action.
	    """
	    start_columns = ["start_x", "start_y"]
	    return actions[start_columns]


	@simple
	def endlocation(actions: SPADLActions) -> Features:
	    """Get the location where each action ended.

	    Parameters
	    ----------
	    actions : SPADLActions
	        The actions of a game.

	    Returns
	    -------
	    Features
	        The 'end_x' and 'end_y' location of each action.
	    """
	    end_columns = ["end_x", "end_y"]
	    return actions[end_columns]


	# Reference point used by `startpolar` and `endpolar`: x at the full field
	# length and y at half the field width.
	_goal_x: float = spadlcfg.field_length
	_goal_y: float = spadlcfg.field_width / 2


	@simple
	def startpolar(actions: SPADLActions) -> Features:
	    """Get the polar coordinates of each action's start location.

	    The center of the opponent's goal is used as the origin.

	    Parameters
	    ----------
	    actions : SPADLActions
	        The actions of a game.

	    Returns
	    -------
	    Features
	        The 'start_dist_to_goal' and 'start_angle_to_goal' of each action.
	        The angle is ``arctan(dy / dx)`` of the absolute offsets; it is 0
	        where that quotient is undefined (dx = dy = 0).
	    """
	    polardf = pd.DataFrame(index=actions.index)
	    dx = (_goal_x - actions["start_x"]).abs().values
	    dy = (_goal_y - actions["start_y"]).abs().values
	    polardf["start_dist_to_goal"] = np.sqrt(dx**2 + dy**2)
	    # dx may be 0: silence the division warnings and turn the NaN that
	    # results from 0/0 into 0.
	    with np.errstate(divide="ignore", invalid="ignore"):
	        polardf["start_angle_to_goal"] = np.nan_to_num(np.arctan(dy / dx))
	    return polardf


	@simple
	def endpolar(actions: SPADLActions) -> Features:
	    """Get the polar coordinates of each action's end location.

	    The center of the opponent's goal is used as the origin.

	    Parameters
	    ----------
	    actions : SPADLActions
	        The actions of a game.

	    Returns
	    -------
	    Features
	        The 'end_dist_to_goal' and 'end_angle_to_goal' of each action.
	        The angle is ``arctan(dy / dx)`` of the absolute offsets; it is 0
	        where that quotient is undefined (dx = dy = 0).
	    """
	    polardf = pd.DataFrame(index=actions.index)
	    dx = (_goal_x - actions["end_x"]).abs().values
	    dy = (_goal_y - actions["end_y"]).abs().values
	    polardf["end_dist_to_goal"] = np.sqrt(dx**2 + dy**2)
	    # dx may be 0: silence the division warnings and turn the NaN that
	    # results from 0/0 into 0.
	    with np.errstate(divide="ignore", invalid="ignore"):
	        polardf["end_angle_to_goal"] = np.nan_to_num(np.arctan(dy / dx))
	    return polardf


	@simple
	def movement(actions: SPADLActions) -> Features:
	    """Get the distance covered by each action.

	    Parameters
	    ----------
	    actions : SPADLActions
	        The actions of a game.

	    Returns
	    -------
	    Features
	        The horizontal ('dx'), vertical ('dy') and total ('movement') distance
	        covered by each action. 'dx' and 'dy' are signed (end minus start);
	        'movement' is their Euclidean norm.
	    """
	    mov = pd.DataFrame(index=actions.index)
	    mov["dx"] = actions.end_x - actions.start_x
	    mov["dy"] = actions.end_y - actions.start_y
	    mov["movement"] = np.sqrt(mov.dx**2 + mov.dy**2)
	    return mov


	@simple
	def player_possession_time(actions: SPADLActions) -> Features:
	    """Get the time (sec) a player was in ball possession before attempting the action.

	    Only the action directly preceding each action is considered: if it was
	    a dribble by the same player in the same period, the possession time is
	    the time elapsed since that dribble; otherwise it is 0.

	    Parameters
	    ----------
	    actions : SPADLActions
	        The actions of a game.

	    Returns
	    -------
	    Features
	        The 'player_possession_time' of each action.
	    """
	    tracked = ["period_id", "time_seconds", "player_id", "type_id"]
	    current = actions[tracked]
	    previous = actions.shift(1)[tracked]
	    paired = current.join(previous, rsuffix="_prev")

	    dribble_id = spadlcfg.actiontypes.index("dribble")
	    after_own_dribble = (
	        (paired.period_id == paired.period_id_prev)
	        & (paired.player_id == paired.player_id_prev)
	        & (paired.type_id_prev == dribble_id)
	    )
	    elapsed = (
	        paired.loc[after_own_dribble, "time_seconds"]
	        - paired.loc[after_own_dribble, "time_seconds_prev"]
	    )
	    paired.loc[after_own_dribble, "player_possession_time"] = elapsed
	    # Rows that do not follow the player's own dribble are still NaN here.
	    return paired[["player_possession_time"]].fillna(0.0)


	# STATE FEATURES


	def team(gamestates: GameStates) -> Features:
	    """Check whether the possession changed during the game state.

	    For each previous action in the game state, True if the team that
	    performed it is the same team that performed the last action of the game
	    state (a0); otherwise False.

	    Parameters
	    ----------
	    gamestates : GameStates
	        The game states of a game.

	    Returns
	    -------
	    Features
	        A dataframe with a column 'team_<i>' for each previous action ai
	        (i >= 1), indicating whether the team that performed action a0 also
	        performed action ai.
	    """
	    latest = gamestates[0]
	    same_team = pd.DataFrame(index=latest.index)
	    for lag, previous in enumerate(gamestates[1:], start=1):
	        same_team["team_" + str(lag)] = previous.team_id == latest.team_id
	    return same_team


	def time_delta(gamestates: GameStates) -> Features:
	    """Get the number of seconds between the last and previous actions.

	    Parameters
	    ----------
	    gamestates : GameStates
	        The game states of a game.

	    Returns
	    -------
	    Features
	        A dataframe with a column 'time_delta_<i>' for each previous action
	        ai (i >= 1), containing the 'time_seconds' of action a0 minus the
	        'time_seconds' of action ai.
	    """
	    latest = gamestates[0]
	    deltas = pd.DataFrame(index=latest.index)
	    for lag, previous in enumerate(gamestates[1:], start=1):
	        deltas["time_delta_" + str(lag)] = latest.time_seconds - previous.time_seconds
	    return deltas


	def space_delta(gamestates: GameStates) -> Features:
	    """Get the distance covered between the last and previous actions.

	    The offset is measured from the start location of action a0 to the end
	    location of each previous action ai.

	    Parameters
	    ----------
	    gamestates : GameStates
	        The gamestates of a game.

	    Returns
	    -------
	    Features
	        A dataframe with a column for the horizontal ('dx_a0i'), vertical
	        ('dy_a0i') and total ('mov_a0i') distance covered between each
	        <nb_prev_actions> action ai and action a0.
	    """
	    a0 = gamestates[0]
	    spaced = pd.DataFrame(index=a0.index)
	    for i, a in enumerate(gamestates[1:], start=1):
	        suffix = str(i)
	        dx = a.end_x - a0.start_x
	        dy = a.end_y - a0.start_y
	        spaced["dx_a0" + suffix] = dx
	        spaced["dy_a0" + suffix] = dy
	        spaced["mov_a0" + suffix] = np.sqrt(dx**2 + dy**2)
	    return spaced


	def speed(gamestates: GameStates) -> Features:
	    """Get the speed at which the ball moved during the previous actions.

	    Distances are measured from the start location of action a0 to the end
	    location of each previous action ai, and divided by the time elapsed
	    between both actions. Non-positive time differences are replaced by 1e-6
	    seconds to avoid dividing by zero.

	    Parameters
	    ----------
	    gamestates : GameStates
	        The game states of a game.

	    Returns
	    -------
	    Features
	        A dataframe with columns 'speedx_a0i', 'speedy_a0i', 'speed_a0i'
	        for each <nb_prev_actions> containing the ball speed in m/s between
	        action ai and action a0.
	    """
	    a0 = gamestates[0]
	    speeddf = pd.DataFrame(index=a0.index)
	    for i, a in enumerate(gamestates[1:], start=1):
	        suffix = str(i)
	        dx = a.end_x - a0.start_x
	        dy = a.end_y - a0.start_y
	        # Cast to float so the 1e-6 floor below can be stored even when
	        # 'time_seconds' holds integers.
	        dt = (a0.time_seconds - a.time_seconds).astype(float)
	        dt[dt <= 0] = 1e-6
	        speeddf["speedx_a0" + suffix] = dx.abs() / dt
	        speeddf["speedy_a0" + suffix] = dy.abs() / dt
	        speeddf["speed_a0" + suffix] = np.sqrt(dx**2 + dy**2) / dt
	    return speeddf


	# CONTEXT FEATURES


	def goalscore(gamestates: GameStates) -> Features:
	    """Get the number of goals scored by each team before the action.

	    A goal is a "shot"-type action with result "success"; an own goal is a
	    "shot"-type action with result "owngoal" and counts for the other team.
	    The action itself is excluded from the running score, so a goal only
	    shows up in the score from the next action onwards.

	    Parameters
	    ----------
	    gamestates : GameStates
	        The gamestates of a game.

	    Returns
	    -------
	    Features
	        The number of goals scored by the team performing the last action of the
	        game state ('goalscore_team'), by the opponent ('goalscore_opponent'),
	        and the goal difference between both teams ('goalscore_diff').
	    """
	    actions = gamestates[0]
	    # "Team A" is whichever team performed the first action; every other
	    # team id is treated as "team B".
	    teamA = actions["team_id"].values[0]
	    is_shot = actions["type_name"].str.contains("shot")
	    goals = is_shot & (actions["result_id"] == spadlcfg.results.index("success"))
	    owngoals = is_shot & (actions["result_id"] == spadlcfg.results.index("owngoal"))
	    teamisA = actions["team_id"] == teamA
	    teamisB = ~teamisA
	    goalsteamA = (goals & teamisA) | (owngoals & teamisB)
	    goalsteamB = (goals & teamisB) | (owngoals & teamisA)
	    # Running totals, minus the current row so the score is the one before
	    # the action.
	    goalscoreteamA = goalsteamA.cumsum() - goalsteamA
	    goalscoreteamB = goalsteamB.cumsum() - goalsteamB

	    scoredf = pd.DataFrame(index=actions.index)
	    scoredf["goalscore_team"] = (goalscoreteamA * teamisA) + (goalscoreteamB * teamisB)
	    scoredf["goalscore_opponent"] = (goalscoreteamB * teamisA) + (goalscoreteamA * teamisB)
	    scoredf["goalscore_diff"] = scoredf["goalscore_team"] - scoredf["goalscore_opponent"]
	    return scoredf