|
"""Implements serializers for StatsBomb data.""" |
|
|
|
import os |
|
from typing import Any, Optional, cast |
|
|
|
import pandas as pd |
|
from pandera.typing import DataFrame |
|
|
|
try: |
|
from statsbombpy import sb |
|
except ImportError: |
|
sb = None |
|
|
|
from socceraction.data.base import ( |
|
EventDataLoader, |
|
ParseError, |
|
_expand_minute, |
|
_localloadjson, |
|
) |
|
|
|
from .schema import ( |
|
StatsBombCompetitionSchema, |
|
StatsBombEventSchema, |
|
StatsBombGameSchema, |
|
StatsBombPlayerSchema, |
|
StatsBombTeamSchema, |
|
) |
|
|
|
|
|
class StatsBombLoader(EventDataLoader):
    """Load StatsBomb data either from a remote location or from a local folder.

    To load remote data, this loader uses the `statsbombpy
    <https://github.com/statsbomb/statsbombpy>`__ package. Data can be retrieved
    from the StatsBomb API and from the `Open Data GitHub repo
    <https://github.com/statsbomb/open-data/>`__.
    API access is for paying customers only. Authentication can be done by
    setting environment variables named ``SB_USERNAME`` and ``SB_PASSWORD`` to
    your login credentials. Alternatively, pass your login credentials using
    the ``creds`` parameter.
    StatsBomb's open data can be accessed without the need of authentication
    but its use is subject to a `user agreement
    <https://github.com/statsbomb/open-data/blob/master/LICENSE.pdf>`__.

    To load local data, point ``root`` to the root folder of the data. This folder
    should use the same directory structure as used in the Open Data GitHub repo.

    Parameters
    ----------
    getter : str
        "remote" or "local"
    root : str, optional
        Root-path of the data. Only used when getter is "local".
    creds: dict, optional
        Login credentials in the format {"user": "", "passwd": ""}. Only used
        when getter is "remote".

    Raises
    ------
    ImportError
        If getter is "remote" and the ``statsbombpy`` package is not installed.
    ValueError
        If getter is "local" and ``root`` is not given, or if getter is
        neither "remote" nor "local".
    """

    def __init__(
        self,
        getter: str = "remote",
        root: Optional[str] = None,
        creds: Optional[dict[str, str]] = None,
    ) -> None:
        if getter == "remote":
            # ``sb`` is None when the optional statsbombpy import failed.
            if sb is None:
                raise ImportError(
                    """The 'statsbombpy' package is required. Install with 'pip install statsbombpy'."""
                )
            self._creds = creds or sb.DEFAULT_CREDS
            self._local = False
        elif getter == "local":
            if root is None:
                raise ValueError("""The 'root' parameter is required when loading local data.""")
            self._local = True
            self._root = root
        else:
            raise ValueError("Invalid getter specified")

    def competitions(self) -> DataFrame[StatsBombCompetitionSchema]:
        """Return a dataframe with all available competitions and seasons.

        Raises
        ------
        ParseError
            When the raw data does not adhere to the expected format.

        Returns
        -------
        pd.DataFrame
            A dataframe containing all available competitions and seasons. See
            :class:`StatsBombCompetitionSchema` for the schema.
        """
        cols = [
            "season_id",
            "competition_id",
            "competition_name",
            "country_name",
            "competition_gender",
            "season_name",
        ]
        if self._local:
            obj = _localloadjson(str(os.path.join(self._root, "competitions.json")))
        else:
            obj = list(sb.competitions(fmt="dict", creds=self._creds).values())
        if not isinstance(obj, list):
            raise ParseError("The retrieved data should contain a list of competitions")
        if len(obj) == 0:
            # Keep the expected columns even when there is nothing to report.
            return cast(DataFrame[StatsBombCompetitionSchema], pd.DataFrame(columns=cols))
        return cast(DataFrame[StatsBombCompetitionSchema], pd.DataFrame(obj)[cols])

    def games(self, competition_id: int, season_id: int) -> DataFrame[StatsBombGameSchema]:
        """Return a dataframe with all available games in a season.

        Games without a kick-off time are assumed to start at 12:00:00.000.
        The ``venue`` and ``referee`` columns are set to ``None`` when the raw
        data does not provide them.

        Parameters
        ----------
        competition_id : int
            The ID of the competition.
        season_id : int
            The ID of the season.

        Raises
        ------
        ParseError
            When the raw data does not adhere to the expected format.

        Returns
        -------
        pd.DataFrame
            A dataframe containing all available games. See
            :class:`StatsBombGameSchema` for the schema.
        """
        cols = [
            "game_id",
            "season_id",
            "competition_id",
            "competition_stage",
            "game_day",
            "game_date",
            "home_team_id",
            "away_team_id",
            "home_score",
            "away_score",
            "venue",
            "referee",
        ]
        if self._local:
            obj = _localloadjson(
                str(os.path.join(self._root, "matches", f"{competition_id}", f"{season_id}.json"))
            )
        else:
            obj = list(
                sb.matches(competition_id, season_id, fmt="dict", creds=self._creds).values()
            )
        if not isinstance(obj, list):
            raise ParseError("The retrieved data should contain a list of games")
        if len(obj) == 0:
            return cast(DataFrame[StatsBombGameSchema], pd.DataFrame(columns=cols))

        gamesdf = pd.DataFrame(_flatten(m) for m in obj)
        # The column only exists if at least one game carries the field, so
        # handle both "missing value" and "missing column".
        if "kick_off" in gamesdf:
            gamesdf["kick_off"] = gamesdf["kick_off"].fillna("12:00:00.000")
        else:
            gamesdf["kick_off"] = "12:00:00.000"
        # Combine the date and the kick-off time into a single timestamp.
        gamesdf["match_date"] = pd.to_datetime(
            gamesdf[["match_date", "kick_off"]].agg(" ".join, axis=1)
        )
        gamesdf = gamesdf.rename(
            columns={
                "match_id": "game_id",
                "match_date": "game_date",
                "match_week": "game_day",
                "stadium_name": "venue",
                "referee_name": "referee",
                "competition_stage_name": "competition_stage",
            }
        )
        if "venue" not in gamesdf:
            gamesdf["venue"] = None
        if "referee" not in gamesdf:
            gamesdf["referee"] = None
        return cast(DataFrame[StatsBombGameSchema], gamesdf[cols])

    def _lineups(self, game_id: int) -> list[dict[str, Any]]:
        """Load the raw lineups of a game and check that they describe two teams.

        Parameters
        ----------
        game_id : int
            The ID of the game.

        Raises
        ------
        ParseError
            When the raw data is not a list with exactly two entries.

        Returns
        -------
        list of dict
            The raw lineup record of each of the two teams.
        """
        if self._local:
            obj = _localloadjson(str(os.path.join(self._root, "lineups", f"{game_id}.json")))
        else:
            obj = list(sb.lineups(game_id, fmt="dict", creds=self._creds).values())
        if not isinstance(obj, list):
            raise ParseError("The retrieved data should contain a list of teams")
        if len(obj) != 2:
            raise ParseError("The retrieved data should contain two teams")
        return obj

    def teams(self, game_id: int) -> DataFrame[StatsBombTeamSchema]:
        """Return a dataframe with both teams that participated in a game.

        Parameters
        ----------
        game_id : int
            The ID of the game.

        Raises
        ------
        ParseError  # noqa: DAR402
            When the raw data does not adhere to the expected format.

        Returns
        -------
        pd.DataFrame
            A dataframe containing both teams. See
            :class:`StatsBombTeamSchema` for the schema.
        """
        cols = ["team_id", "team_name"]
        obj = self._lineups(game_id)
        return cast(DataFrame[StatsBombTeamSchema], pd.DataFrame(obj)[cols])

    def players(self, game_id: int) -> DataFrame[StatsBombPlayerSchema]:
        """Return a dataframe with all players that participated in a game.

        The lineup data is combined with the playing time derived from the
        event stream. Because both tables are inner-joined on ``player_id``,
        only players that started the game or came on as a substitute are
        returned.

        Parameters
        ----------
        game_id : int
            The ID of the game.

        Raises
        ------
        ParseError  # noqa: DAR402
            When the raw data does not adhere to the expected format.

        Returns
        -------
        pd.DataFrame
            A dataframe containing all players. See
            :class:`StatsBombPlayerSchema` for the schema.
        """
        cols = [
            "game_id",
            "team_id",
            "player_id",
            "player_name",
            "nickname",
            "jersey_number",
            "is_starter",
            "starting_position_id",
            "starting_position_name",
            "minutes_played",
        ]

        obj = self._lineups(game_id)
        playersdf = pd.DataFrame(_flatten_id(p) for lineup in obj for p in lineup["lineup"])
        playergamesdf = extract_player_games(self.events(game_id))
        playersdf = pd.merge(
            playersdf,
            playergamesdf[
                ["player_id", "team_id", "position_id", "position_name", "minutes_played"]
            ],
            on="player_id",
        )
        playersdf["game_id"] = game_id
        # Substitutes have no starting position; extract_player_games fills
        # their position fields with 0.
        playersdf["position_name"] = playersdf["position_name"].replace(0, "Substitute")
        playersdf["position_id"] = playersdf["position_id"].fillna(0).astype(int)
        playersdf["is_starter"] = playersdf["position_id"] != 0
        playersdf = playersdf.rename(
            columns={
                "player_nickname": "nickname",
                "country_name": "country",
                "position_id": "starting_position_id",
                "position_name": "starting_position_name",
            }
        )
        return cast(DataFrame[StatsBombPlayerSchema], playersdf[cols])

    def events(self, game_id: int, load_360: bool = False) -> DataFrame[StatsBombEventSchema]:
        """Return a dataframe with the event stream of a game.

        The ``related_events``, ``under_pressure`` and ``counterpress`` fields
        default to an empty list, ``False`` and ``False`` respectively for
        events that do not carry them, also when no event of the game does.

        Parameters
        ----------
        game_id : int
            The ID of the game.
        load_360 : bool
            Whether to load the 360 data. If set, the columns
            ``visible_area_360`` and ``freeze_frame_360`` are appended.

        Raises
        ------
        ParseError
            When the raw data does not adhere to the expected format.

        Returns
        -------
        pd.DataFrame
            A dataframe containing the event stream. See
            :class:`StatsBombEventSchema` for the schema.
        """
        cols = [
            "game_id",
            "event_id",
            "period_id",
            "team_id",
            "player_id",
            "type_id",
            "type_name",
            "index",
            "timestamp",
            "minute",
            "second",
            "possession",
            "possession_team_id",
            "possession_team_name",
            "play_pattern_id",
            "play_pattern_name",
            "team_name",
            "duration",
            "extra",
            "related_events",
            "player_name",
            "position_id",
            "position_name",
            "location",
            "under_pressure",
            "counterpress",
        ]

        if self._local:
            obj = _localloadjson(str(os.path.join(self._root, "events", f"{game_id}.json")))
        else:
            obj = list(sb.events(game_id, fmt="dict", creds=self._creds).values())
        if not isinstance(obj, list):
            raise ParseError("The retrieved data should contain a list of events")
        if len(obj) == 0:
            return cast(DataFrame[StatsBombEventSchema], pd.DataFrame(columns=cols))

        eventsdf = pd.DataFrame(_flatten_id(e) for e in obj)
        eventsdf["match_id"] = game_id
        eventsdf["timestamp"] = pd.to_timedelta(eventsdf["timestamp"])
        # A column only exists if at least one event carries the field, so a
        # missing column is treated like a column of missing values.
        if "related_events" in eventsdf:
            eventsdf["related_events"] = eventsdf["related_events"].apply(
                lambda d: d if isinstance(d, list) else []
            )
        else:
            eventsdf["related_events"] = [[] for _ in range(len(eventsdf))]
        for flag in ("under_pressure", "counterpress"):
            if flag in eventsdf:
                eventsdf[flag] = eventsdf[flag].fillna(False).astype(bool)
            else:
                eventsdf[flag] = False
        eventsdf = eventsdf.rename(
            columns={"id": "event_id", "period": "period_id", "match_id": "game_id"}
        )
        if not load_360:
            return cast(DataFrame[StatsBombEventSchema], eventsdf[cols])

        # Attach the 360 frames to the events they belong to.
        cols_360 = ["visible_area_360", "freeze_frame_360"]
        if self._local:
            obj = _localloadjson(str(os.path.join(self._root, "three-sixty", f"{game_id}.json")))
        else:
            obj = sb.frames(game_id, fmt="dict", creds=self._creds)
        if not isinstance(obj, list):
            raise ParseError("The retrieved data should contain a list of frames")
        if len(obj) == 0:
            eventsdf["visible_area_360"] = None
            eventsdf["freeze_frame_360"] = None
            return cast(DataFrame[StatsBombEventSchema], eventsdf[cols + cols_360])
        framesdf = pd.DataFrame(obj).rename(
            columns={
                "event_uuid": "event_id",
                "visible_area": "visible_area_360",
                "freeze_frame": "freeze_frame_360",
            },
        )[["event_id", "visible_area_360", "freeze_frame_360"]]
        # Left join: events without a frame keep missing values in the 360 columns.
        return cast(
            DataFrame[StatsBombEventSchema],
            pd.merge(eventsdf, framesdf, on="event_id", how="left")[cols + cols_360],
        )
|
|
|
|
|
def extract_player_games(events: pd.DataFrame) -> pd.DataFrame:
    """Extract the playing time of each player from StatsBomb match events.

    Starting players are taken from the "Starting XI" events and substitutes
    from the "Substitution" events. A player's playing time ends at the game's
    end, at the moment the player is substituted off, or at the moment the
    player receives a "Second Yellow" or "Red Card", whichever applies.

    Parameters
    ----------
    events : pd.DataFrame
        DataFrame containing StatsBomb events of a single game.

    Returns
    -------
    player_games : pd.DataFrame
        A DataFrame with one row per player that took part in the game. It
        holds the fields of the player's lineup entry (as flattened by
        ``_flatten_id``) together with ``game_id``, ``team_id``, ``team_name``
        and ``minutes_played``. Fields that are unknown for a player (such as
        the starting position of a substitute) are set to 0.
    """
    # Nominal length of each period. Periods that are not listed here end up
    # as NaN after the subtraction below and are dropped.
    regular_minutes = pd.DataFrame(
        [
            {"period_id": 1, "minute": 45},
            {"period_id": 2, "minute": 45},
            {"period_id": 3, "minute": 15},
            {"period_id": 4, "minute": 15},
        ]
    ).set_index("period_id")
    # Actual length of each period: the minute of its "Half End" event minus
    # the nominal minute at which the period started.
    periods_minutes = (
        events.loc[events.type_name == "Half End", ["period_id", "minute"]]
        .drop_duplicates()
        .set_index("period_id")
        .sort_index()
        .subtract(regular_minutes.cumsum().shift(1).fillna(0))
        .minute.dropna()
        .astype(int)
        .tolist()
    )
    game_minutes = sum(periods_minutes)

    game_id = events.game_id.mode().values[0]
    players = {}
    # Expanded minute at which each player entered the pitch (0 for starters).
    # Kept outside ``players`` so the returned columns stay unchanged.
    entered_at = {}

    def _is_sent_off(event: pd.Series) -> bool:
        # A dismissal is recorded as a card on a foul or on bad behaviour.
        return any(
            e in event.extra
            and "card" in event.extra[e]
            and event.extra[e]["card"]["name"] in ["Second Yellow", "Red Card"]
            for e in ["foul_committed", "bad_behaviour"]
        )

    red_cards = events[events.apply(_is_sent_off, axis=1)]

    for startxi in events[events.type_name == "Starting XI"].itertuples():
        team_id, team_name = startxi.team_id, startxi.team_name
        for lineup_entry in startxi.extra["tactics"]["lineup"]:
            player = {
                **_flatten_id(lineup_entry),
                "game_id": game_id,
                "team_id": team_id,
                "team_name": team_name,
                "minutes_played": game_minutes,
            }
            player_red_card = red_cards[red_cards.player_id == player["player_id"]]
            if len(player_red_card) > 0:
                red_card_minute = player_red_card.iloc[0].minute
                player["minutes_played"] = _expand_minute(red_card_minute, periods_minutes)
            players[player["player_id"]] = player
            entered_at[player["player_id"]] = 0

    for substitution in events[events.type_name == "Substitution"].itertuples():
        exp_sub_minute = _expand_minute(substitution.minute, periods_minutes)
        replacement = {
            "player_id": substitution.extra["substitution"]["replacement"]["id"],
            "player_name": substitution.extra["substitution"]["replacement"]["name"],
            "minutes_played": game_minutes - exp_sub_minute,
            "team_id": substitution.team_id,
            "game_id": game_id,
            "team_name": substitution.team_name,
        }
        player_red_card = red_cards[red_cards.player_id == replacement["player_id"]]
        if len(player_red_card) > 0:
            red_card_minute = player_red_card.iloc[0].minute
            replacement["minutes_played"] = (
                _expand_minute(red_card_minute, periods_minutes) - exp_sub_minute
            )
        players[replacement["player_id"]] = replacement
        entered_at[replacement["player_id"]] = exp_sub_minute
        # The player going off only played since entering the pitch. This
        # matters for a substitute who is later substituted off again.
        players[substitution.player_id]["minutes_played"] = exp_sub_minute - entered_at.get(
            substitution.player_id, 0
        )

    pg = pd.DataFrame(players.values()).fillna(0)
    # Missing IDs were filled with 0 above, so all ID columns can be integers.
    for col in pg.columns:
        if "_id" in col:
            pg[col] = pg[col].astype(int)
    return pg
|
|
|
|
|
def _flatten_id(d: dict[str, dict[str, Any]]) -> dict[str, Any]: |
|
newd = {} |
|
extra = {} |
|
for k, v in d.items(): |
|
if isinstance(v, dict): |
|
if "id" in v and "name" in v: |
|
newd[k + "_id"] = v["id"] |
|
newd[k + "_name"] = v["name"] |
|
else: |
|
extra[k] = v |
|
else: |
|
newd[k] = v |
|
newd["extra"] = extra |
|
return newd |
|
|
|
|
|
def _flatten(d: dict[str, dict[str, Any]]) -> dict[str, Any]: |
|
newd = {} |
|
for k, v in d.items(): |
|
if isinstance(v, dict): |
|
if "id" in v and "name" in v: |
|
newd[k + "_id"] = v["id"] |
|
newd[k + "_name"] = v["name"] |
|
newd[k + "_extra"] = {l: w for (l, w) in v.items() if l in ("id", "name")} |
|
else: |
|
newd = {**newd, **_flatten(v)} |
|
else: |
|
newd[k] = v |
|
return newd |
|
|