socr / vaep /features.py
scfive's picture
Upload 203 files
d6ea71e verified
"""Implements the feature tranformers of the VAEP framework."""
from functools import wraps
from typing import Any, Callable, Union, no_type_check
import numpy as np # type: ignore
import pandas as pd # type: ignore
from pandera.typing import DataFrame
import socceraction.spadl.config as spadlcfg
from socceraction.atomic.spadl import AtomicSPADLSchema
from socceraction.spadl.schema import SPADLSchema
# Actions stored in the regular (non-atomic) SPADL format.
SPADLActions = DataFrame[SPADLSchema]
# Actions stored in either the regular or the atomic SPADL format.
Actions = Union[DataFrame[SPADLSchema], DataFrame[AtomicSPADLSchema]]
# Game states: a list of action dataframes, as returned by `gamestates`.
GameStates = list[Actions]
# A dataframe of generated features; no column schema is enforced.
Features = DataFrame[Any]
# A callable that turns game states into a dataframe of features.
FeatureTransfomer = Callable[[GameStates], Features]
def feature_column_names(fs: list[FeatureTransfomer], nb_prev_actions: int = 3) -> list[str]:
    """Return the names of the features generated by a list of transformers.

    The transformers are applied to the game states of a small dummy
    dataframe so that the resulting column names can be read off without
    needing real match data.

    Parameters
    ----------
    fs : list(callable)
        A list of feature transformers.
    nb_prev_actions : int, default=3  # noqa: DAR103
        The number of previous actions included in the game state.

    Returns
    -------
    list(str)
        The name of each generated feature. An empty list is returned when
        no transformers are given.
    """
    # ``pd.concat`` raises on an empty sequence; no transformers simply means
    # that there are no feature columns.
    if not fs:
        return []
    spadlcolumns = [
        "game_id",
        "original_event_id",
        "action_id",
        "period_id",
        "time_seconds",
        "team_id",
        "player_id",
        "start_x",
        "start_y",
        "end_x",
        "end_y",
        "result_id",
        "result_name",
        "bodypart_id",
        "bodypart_name",
        "type_id",
        "type_name",
    ]
    dummy_actions = pd.DataFrame(np.zeros((10, len(spadlcolumns))), columns=spadlcolumns)
    # The '*_name' columns must hold strings: transformers such as
    # ``goalscore`` use the ``.str`` accessor on them.
    for c in spadlcolumns:
        if "name" in c:
            dummy_actions[c] = dummy_actions[c].astype(str)
    gs = gamestates(dummy_actions, nb_prev_actions)  # type: ignore
    return list(pd.concat([f(gs) for f in fs], axis=1).columns.values)
def gamestates(actions: Actions, nb_prev_actions: int = 3) -> GameStates:
    r"""Convert a dataframe of actions to gamestates.

    Each gamestate is represented as the <nb_prev_actions> previous actions.

    The list of gamestates is internally represented as a list of actions
    dataframes :math:`[a_0,a_1,\ldots]` where each row in the a_i dataframe
    contains the previous action of the action in the same row in the
    :math:`a_{i-1}` dataframe. Previous actions are looked up within the same
    game and period only; missing values left after shifting are filled with
    the values of the first action of that period.

    Parameters
    ----------
    actions : Actions
        A DataFrame with the actions of a game.
    nb_prev_actions : int, default=3  # noqa: DAR103
        The number of previous actions included in the game state.

    Raises
    ------
    ValueError
        If ``nb_prev_actions`` is smaller than 1.

    Returns
    -------
    GameStates
        The <nb_prev_actions> previous actions for each action.
    """
    if nb_prev_actions < 1:
        raise ValueError("The game state should include at least one preceding action.")

    states = [actions]
    for lag in range(1, nb_prev_actions):

        def _lag_within_period(period_actions, steps=lag):
            # Move every action ``steps`` rows down; the holes that opens up
            # at the top are filled from the first action of the period.
            shifted = period_actions.shift(steps, fill_value=float("nan"))
            return shifted.fillna(period_actions.iloc[0])

        per_period = actions.groupby(["game_id", "period_id"], sort=False, as_index=False)
        lagged = per_period.apply(_lag_within_period)
        # Restore the original row labels so all states line up row by row.
        lagged.index = actions.index.copy()
        states.append(lagged)  # type: ignore
    return states
def play_left_to_right(gamestates: GameStates, home_team_id: int) -> GameStates:
    """Perform all actions in a gamestate in the same playing direction.

    The start and end location of every action in a gamestate are mirrored
    whenever the first action of that gamestate was not performed by the home
    team, so that the team performing the first action always appears to play
    from left to right. The dataframes are modified in place.

    Parameters
    ----------
    gamestates : GameStates
        The game states of a game.
    home_team_id : int
        The ID of the home team.

    Returns
    -------
    GameStates
        The same game states, with all actions performed left to right.
    """
    # Rows whose most recent action (a0) belongs to the non-home team.
    away_idx = gamestates[0].team_id != home_team_id
    # Each coordinate column paired with the pitch dimension it is mirrored in.
    mirrored_columns = (
        ("start_x", spadlcfg.field_length),
        ("end_x", spadlcfg.field_length),
        ("start_y", spadlcfg.field_width),
        ("end_y", spadlcfg.field_width),
    )
    for actions in gamestates:
        for col, extent in mirrored_columns:
            actions.loc[away_idx, col] = extent - actions.loc[away_idx, col].values
    return gamestates
@no_type_check
def simple(actionfn: Callable) -> FeatureTransfomer:
    """Make a function decorator to apply actionfeatures to game states.

    The wrapped function is applied to every actions dataframe of the game
    state separately. The columns it returns are suffixed with ``_a<i>``,
    where ``i`` is the position of the dataframe in the game state, and all
    results are concatenated column-wise.

    Parameters
    ----------
    actionfn : Callable
        A feature transformer that operates on actions.

    Returns
    -------
    FeatureTransfomer
        A feature transformer that operates on game states.
    """

    @wraps(actionfn)
    def _wrapper(gamestates: list[Actions]) -> pd.DataFrame:
        # A bare actions dataframe is treated as a one-element game state.
        states = gamestates if isinstance(gamestates, list) else [gamestates]
        frames = []
        for position, state in enumerate(states):
            feats = actionfn(state)
            suffix = "_a" + str(position)
            feats.columns = [name + suffix for name in feats.columns]
            frames.append(feats)
        return pd.concat(frames, axis=1)

    return _wrapper
# SIMPLE FEATURES
@simple
def actiontype(actions: Actions) -> Features:
    """Get the type of each action.

    Parameters
    ----------
    actions : Actions
        The actions of a game.

    Returns
    -------
    Features
        A categorical 'actiontype' column holding the type of each action.
    """
    # Lookup taken from the action types table (presumably keyed by type_id).
    name_lookup = spadlcfg.actiontypes_df().type_name.to_dict()
    type_names = actions["type_id"].replace(name_lookup)
    features = pd.DataFrame(index=actions.index)
    features["actiontype"] = pd.Categorical(
        type_names, categories=spadlcfg.actiontypes, ordered=False
    )
    return features
@simple
def actiontype_onehot(actions: SPADLActions) -> Features:
    """Get the one-hot-encoded type of each action.

    Parameters
    ----------
    actions : SPADLActions
        The actions of a game.

    Returns
    -------
    Features
        One boolean 'actiontype_<name>' column per known action type.
    """
    type_ids = actions["type_id"]
    # The position of a name in ``spadlcfg.actiontypes`` is used as its id.
    indicators = {
        "actiontype_" + name: type_ids == idx
        for idx, name in enumerate(spadlcfg.actiontypes)
    }
    return pd.DataFrame(indicators, index=actions.index)
@simple
def result(actions: SPADLActions) -> Features:
    """Get the result of each action.

    Parameters
    ----------
    actions : SPADLActions
        The actions of a game.

    Returns
    -------
    Features
        A categorical 'result' column holding the result of each action.
    """
    # Lookup taken from the results table (presumably keyed by result_id).
    name_lookup = spadlcfg.results_df().result_name.to_dict()
    result_names = actions["result_id"].replace(name_lookup)
    features = pd.DataFrame(index=actions.index)
    features["result"] = pd.Categorical(
        result_names, categories=spadlcfg.results, ordered=False
    )
    return features
@simple
def result_onehot(actions: SPADLActions) -> Features:
    """Get the one-hot-encoded result of each action.

    Parameters
    ----------
    actions : SPADLActions
        The actions of a game.

    Returns
    -------
    Features
        One boolean 'result_<name>' column per known result.
    """
    result_ids = actions["result_id"]
    # The position of a name in ``spadlcfg.results`` is used as its id.
    indicators = {
        "result_" + name: result_ids == idx
        for idx, name in enumerate(spadlcfg.results)
    }
    return pd.DataFrame(indicators, index=actions.index)
@simple
def actiontype_result_onehot(actions: SPADLActions) -> Features:
    """Get a one-hot encoding of the combination between the type and result of each action.

    Parameters
    ----------
    actions : SPADLActions
        The actions of a game.

    Returns
    -------
    Features
        One boolean column per (action type, result) pair, named
        '<actiontype column>_<result column>'.
    """
    # ``__wrapped__`` bypasses the ``simple`` decorator, so the undecorated
    # transformers run directly on this single actions dataframe.
    result_flags = result_onehot.__wrapped__(actions)  # type: ignore
    type_flags = actiontype_onehot.__wrapped__(actions)  # type: ignore
    combined = {
        type_col + "_" + result_col: type_flags[type_col] & result_flags[result_col]
        for type_col in type_flags.columns
        for result_col in result_flags.columns
    }
    return pd.DataFrame(combined, index=actions.index)
@simple
def bodypart(actions: Actions) -> Features:
    """Get the body part used to perform each action.

    This feature generator does not distinguish between the left and right foot.

    Parameters
    ----------
    actions : Actions
        The actions of a game.

    Returns
    -------
    Features
        A categorical 'bodypart' column with the categories 'foot', 'head',
        'other' and 'head/other'.

    See Also
    --------
    bodypart_detailed :
        An alternative version that splits between the left and right foot.
    """
    known_parts = spadlcfg.bodyparts
    foot_id = known_parts.index("foot")
    single_foot_ids = [known_parts.index("foot_left"), known_parts.index("foot_right")]
    # Fold the left/right foot ids into the generic foot id before naming them.
    merged_ids = actions["bodypart_id"].replace(single_foot_ids, foot_id)
    part_names = merged_ids.replace(spadlcfg.bodyparts_df().bodypart_name.to_dict())
    features = pd.DataFrame(index=actions.index)
    features["bodypart"] = pd.Categorical(
        part_names,
        categories=["foot", "head", "other", "head/other"],
        ordered=False,
    )
    return features
@simple
def bodypart_detailed(actions: Actions) -> Features:
    """Get the body part with split by foot used to perform each action.

    This feature generator distinguishes between the left and right foot, if
    supported by the dataprovider.

    Parameters
    ----------
    actions : Actions
        The actions of a game.

    Returns
    -------
    Features
        A categorical 'bodypart' column whose categories are all body parts
        listed in ``spadlcfg.bodyparts``.

    See Also
    --------
    bodypart :
        An alternative version that does not split between the left and right foot.
    """
    # Lookup taken from the body parts table (presumably keyed by bodypart_id).
    name_lookup = spadlcfg.bodyparts_df().bodypart_name.to_dict()
    part_names = actions["bodypart_id"].replace(name_lookup)
    features = pd.DataFrame(index=actions.index)
    features["bodypart"] = pd.Categorical(
        part_names, categories=spadlcfg.bodyparts, ordered=False
    )
    return features
@simple
def bodypart_onehot(actions: Actions) -> Features:
    """Get the one-hot-encoded bodypart of each action.

    This feature generator does not distinguish between the left and right foot.

    Parameters
    ----------
    actions : Actions
        The actions of a game.

    Returns
    -------
    Features
        One boolean 'bodypart_<name>' column per body part, except for the
        left and right foot, which are folded into 'bodypart_foot'.

    See Also
    --------
    bodypart_detailed_onehot :
        An alternative version that splits between the left and right foot.
    """
    known_parts = spadlcfg.bodyparts
    part_ids = actions["bodypart_id"]

    def _ids_of(*names):
        # Resolve body part names to their position in the config list.
        return [known_parts.index(name) for name in names]

    encoded = {}
    for part_id, part_name in enumerate(known_parts):
        if part_name in ("foot_left", "foot_right"):
            # No dedicated column: these are covered by 'bodypart_foot'.
            continue
        if part_name == "foot":
            flags = part_ids.isin(_ids_of("foot", "foot_left", "foot_right"))
        elif part_name == "head/other":
            # 'head/other' is also set for plain 'head' and plain 'other'.
            flags = part_ids.isin(_ids_of("head", "other", "head/other"))
        else:
            flags = part_ids == part_id
        encoded["bodypart_" + part_name] = flags
    return pd.DataFrame(encoded, index=actions.index)
@simple
def bodypart_detailed_onehot(actions: Actions) -> Features:
    """Get the one-hot-encoded bodypart with split by foot of each action.

    This feature generator distinguishes between the left and right foot, if
    supported by the dataprovider.

    Parameters
    ----------
    actions : Actions
        The actions of a game.

    Returns
    -------
    Features
        One boolean 'bodypart_<name>' column per body part listed in
        ``spadlcfg.bodyparts``. 'bodypart_foot' is also set for the left and
        right foot, and 'bodypart_head/other' is also set for 'head' and
        'other'.

    See Also
    --------
    bodypart_onehot :
        An alternative version that does not split between the left and right foot.
    """
    known_parts = spadlcfg.bodyparts
    part_ids = actions["bodypart_id"]

    def _ids_of(*names):
        # Resolve body part names to their position in the config list.
        return [known_parts.index(name) for name in names]

    encoded = {}
    for part_id, part_name in enumerate(known_parts):
        if part_name == "foot":
            # The generic foot column covers either foot as well.
            flags = part_ids.isin(_ids_of("foot", "foot_left", "foot_right"))
        elif part_name == "head/other":
            flags = part_ids.isin(_ids_of("head", "other", "head/other"))
        else:
            flags = part_ids == part_id
        encoded["bodypart_" + part_name] = flags
    return pd.DataFrame(encoded, index=actions.index)
@simple
def time(actions: Actions) -> Features:
    """Get the time when each action was performed.

    This generates the following features:
        :period_id:
            The ID of the period.
        :time_seconds:
            Seconds since the start of the period.
        :time_seconds_overall:
            Seconds since the start of the game. Stoppage time during previous
            periods is ignored.

    Parameters
    ----------
    actions : Actions
        The actions of a game.

    Returns
    -------
    Features
        The 'period_id', 'time_seconds' and 'time_seconds_overall' when each
        action was performed.
    """
    # Nominal match minute at which each period kicks off.
    period_start_minute = {1: 0, 2: 45, 3: 90, 4: 105, 5: 120}
    features = actions[["period_id", "time_seconds"]].copy()
    seconds_before_period = features["period_id"].map(period_start_minute) * 60
    features["time_seconds_overall"] = seconds_before_period + features["time_seconds"]
    return features
@simple
def startlocation(actions: SPADLActions) -> Features:
    """Get the location where each action started.

    Parameters
    ----------
    actions : SPADLActions
        The actions of a game.

    Returns
    -------
    Features
        The 'start_x' and 'start_y' location of each action.
    """
    start_columns = ["start_x", "start_y"]
    return actions[start_columns]
@simple
def endlocation(actions: SPADLActions) -> Features:
    """Get the location where each action ended.

    Parameters
    ----------
    actions : SPADLActions
        The actions of a game.

    Returns
    -------
    Features
        The 'end_x' and 'end_y' location of each action.
    """
    end_columns = ["end_x", "end_y"]
    return actions[end_columns]
# Coordinates of the point used as the origin by `startpolar` and `endpolar`
# (the center of the opponent's goal): the full pitch length along x and half
# the pitch width along y.
_goal_x: float = spadlcfg.field_length
_goal_y: float = spadlcfg.field_width / 2
@simple
def startpolar(actions: SPADLActions) -> Features:
    """Get the polar coordinates of each action's start location.

    The center of the opponent's goal is used as the origin.

    Parameters
    ----------
    actions : SPADLActions
        The actions of a game.

    Returns
    -------
    Features
        The 'start_dist_to_goal' and 'start_angle_to_goal' of each action.
    """
    x_gap = (_goal_x - actions["start_x"]).abs().values
    y_gap = (_goal_y - actions["start_y"]).abs().values
    features = pd.DataFrame(index=actions.index)
    features["start_dist_to_goal"] = np.sqrt(x_gap**2 + y_gap**2)
    # A zero x-gap divides by zero: inf becomes pi/2 through arctan, and the
    # NaN produced by 0/0 is turned into an angle of 0.
    with np.errstate(divide="ignore", invalid="ignore"):
        angle = np.nan_to_num(np.arctan(y_gap / x_gap))
        features["start_angle_to_goal"] = angle
    return features
@simple
def endpolar(actions: SPADLActions) -> Features:
    """Get the polar coordinates of each action's end location.

    The center of the opponent's goal is used as the origin.

    Parameters
    ----------
    actions : SPADLActions
        The actions of a game.

    Returns
    -------
    Features
        The 'end_dist_to_goal' and 'end_angle_to_goal' of each action.
    """
    x_gap = (_goal_x - actions["end_x"]).abs().values
    y_gap = (_goal_y - actions["end_y"]).abs().values
    features = pd.DataFrame(index=actions.index)
    features["end_dist_to_goal"] = np.sqrt(x_gap**2 + y_gap**2)
    # A zero x-gap divides by zero: inf becomes pi/2 through arctan, and the
    # NaN produced by 0/0 is turned into an angle of 0.
    with np.errstate(divide="ignore", invalid="ignore"):
        angle = np.nan_to_num(np.arctan(y_gap / x_gap))
        features["end_angle_to_goal"] = angle
    return features
@simple
def movement(actions: SPADLActions) -> Features:
    """Get the distance covered by each action.

    Parameters
    ----------
    actions : SPADLActions
        The actions of a game.

    Returns
    -------
    Features
        The horizontal ('dx'), vertical ('dy') and total ('movement') distance
        covered by each action.
    """
    features = pd.DataFrame(index=actions.index)
    features["dx"] = actions["end_x"] - actions["start_x"]
    features["dy"] = actions["end_y"] - actions["start_y"]
    # Euclidean length of the (dx, dy) displacement.
    squared_length = features["dx"] ** 2 + features["dy"] ** 2
    features["movement"] = np.sqrt(squared_length)
    return features
@simple
def player_possession_time(actions: SPADLActions) -> Features:
    """Get the time (sec) a player was in ball possession before attempting the action.

    Only the action directly preceding each action is considered: when that
    preceding action is a dribble by the same player in the same period, the
    possession time is the number of seconds between the two actions. In all
    other cases the possession time is 0.

    Parameters
    ----------
    actions : SPADLActions
        The actions of a game.

    Returns
    -------
    Features
        The 'player_possession_time' of each action.
    """
    columns = ["period_id", "time_seconds", "player_id", "type_id"]
    current = actions[columns]
    # ``shift`` returns a new frame carrying the same index as ``current``, so
    # both frames can be compared row by row. Joining them on the index
    # instead would multiply rows whenever index labels are not unique.
    previous = current.shift(1)

    same_player = current["player_id"] == previous["player_id"]
    same_period = current["period_id"] == previous["period_id"]
    prev_dribble = previous["type_id"] == spadlcfg.actiontypes.index("dribble")
    mask = same_period & same_player & prev_dribble

    elapsed = current["time_seconds"] - previous["time_seconds"]
    possession = pd.DataFrame(index=actions.index)
    # Rows outside the mask (and rows with a missing time) count as 0 seconds.
    possession["player_possession_time"] = elapsed.where(mask).fillna(0.0)
    return possession
# STATE FEATURES
def team(gamestates: GameStates) -> Features:
    """Check whether the possession changed during the game state.

    For each previous action in the game state, True if the team that
    performed it is the same team that performed the last action of the game
    state (a0); otherwise False.

    Parameters
    ----------
    gamestates : GameStates
        The game states of a game.

    Returns
    -------
    Features
        A dataframe with a column 'team_i' for each previous action ai,
        indicating whether it was performed by the team that performed a0.
    """
    current = gamestates[0]
    features = pd.DataFrame(index=current.index)
    for offset, previous in enumerate(gamestates[1:], start=1):
        features["team_" + str(offset)] = previous["team_id"] == current["team_id"]
    return features
def time_delta(gamestates: GameStates) -> Features:
    """Get the number of seconds between the last and previous actions.

    Parameters
    ----------
    gamestates : GameStates
        The game states of a game.

    Returns
    -------
    Features
        A dataframe with a column 'time_delta_i' for each previous action ai,
        containing the number of seconds between action ai and action a0.
    """
    current = gamestates[0]
    features = pd.DataFrame(index=current.index)
    for offset, previous in enumerate(gamestates[1:], start=1):
        elapsed = current["time_seconds"] - previous["time_seconds"]
        features["time_delta_" + str(offset)] = elapsed
    return features
def space_delta(gamestates: GameStates) -> Features:
    """Get the distance covered between the last and previous actions.

    The distance is measured from the start location of action a0 to the end
    location of each previous action ai.

    Parameters
    ----------
    gamestates : GameStates
        The gamestates of a game.

    Returns
    -------
    Features
        A dataframe with a column for the horizontal ('dx_a0i'), vertical
        ('dy_a0i') and total ('mov_a0i') distance covered between each
        previous action ai and action a0.
    """
    current = gamestates[0]
    features = pd.DataFrame(index=current.index)
    for offset, previous in enumerate(gamestates[1:], start=1):
        suffix = "_a0" + str(offset)
        x_shift = previous["end_x"] - current["start_x"]
        y_shift = previous["end_y"] - current["start_y"]
        features["dx" + suffix] = x_shift
        features["dy" + suffix] = y_shift
        features["mov" + suffix] = np.sqrt(x_shift**2 + y_shift**2)
    return features
def speed(gamestates: GameStates) -> Features:
    """Get the speed at which the ball moved during the previous actions.

    The speed is the distance between the end location of a previous action
    ai and the start location of action a0, divided by the time elapsed
    between both actions. Non-positive time differences are replaced by 1e-6
    seconds to avoid dividing by zero.

    Parameters
    ----------
    gamestates : GameStates
        The game states of a game.

    Returns
    -------
    Features
        A dataframe with columns 'speedx_a0i', 'speedy_a0i', 'speed_a0i'
        for each previous action ai, containing the ball speed (distance
        units per second) between action ai and action a0.
    """
    a0 = gamestates[0]
    speeds = pd.DataFrame(index=a0.index)
    for i, a in enumerate(gamestates[1:], start=1):
        dx = a.end_x - a0.start_x
        dy = a.end_y - a0.start_y
        dt = a0.time_seconds - a.time_seconds
        # ``mask`` returns a new (upcast if needed) Series, so this also works
        # when the timestamps are integers; assigning 1e-6 in place into an
        # integer Series is deprecated in recent pandas versions. Missing
        # time differences are left untouched.
        dt = dt.mask(dt <= 0, 1e-6)
        suffix = "_a0" + str(i)
        speeds["speedx" + suffix] = dx.abs() / dt
        speeds["speedy" + suffix] = dy.abs() / dt
        speeds["speed" + suffix] = np.sqrt(dx**2 + dy**2) / dt
    return speeds
# CONTEXT FEATURES
def goalscore(gamestates: GameStates) -> Features:
    """Get the number of goals scored by each team before the action.

    Goals are counted cumulatively over all rows of the first dataframe of
    the game state; the action itself is not included in its own score. A
    goal is a successful action whose type name contains 'shot'; an action
    whose type name contains 'shot' and whose result is an own goal counts
    for the other team.

    Parameters
    ----------
    gamestates : GameStates
        The gamestates of a game.

    Returns
    -------
    Features
        The number of goals scored by the team performing the last action of the
        game state ('goalscore_team'), by the opponent ('goalscore_opponent'),
        and the goal difference between both teams ('goalscore_diff').
    """
    actions = gamestates[0]
    # Without any action there is no reference team to read from the first
    # row; return an empty frame with the expected columns instead of failing.
    if len(actions) == 0:
        return pd.DataFrame(
            0,
            index=actions.index,
            columns=["goalscore_team", "goalscore_opponent", "goalscore_diff"],
        )
    # The team of the first action is the reference team "A"; every other
    # team id is treated as team "B".
    teamA = actions["team_id"].values[0]
    is_shot = actions["type_name"].str.contains("shot")
    goals = is_shot & (actions["result_id"] == spadlcfg.results.index("success"))
    owngoals = is_shot & (actions["result_id"] == spadlcfg.results.index("owngoal"))
    teamisA = actions["team_id"] == teamA
    teamisB = ~teamisA
    goalsteamA = (goals & teamisA) | (owngoals & teamisB)
    goalsteamB = (goals & teamisB) | (owngoals & teamisA)
    # Subtracting the current row turns the running total into the score
    # before the action was performed.
    goalscoreteamA = goalsteamA.cumsum() - goalsteamA
    goalscoreteamB = goalsteamB.cumsum() - goalsteamB
    scoredf = pd.DataFrame(index=actions.index)
    scoredf["goalscore_team"] = (goalscoreteamA * teamisA) + (goalscoreteamB * teamisB)
    scoredf["goalscore_opponent"] = (goalscoreteamB * teamisA) + (goalscoreteamA * teamisB)
    scoredf["goalscore_diff"] = scoredf["goalscore_team"] - scoredf["goalscore_opponent"]
    return scoredf