|
"""Implements serializers for Wyscout data.""" |
|
|
|
import glob |
|
import os |
|
import re |
|
import warnings |
|
from pathlib import Path |
|
from typing import Any, Callable, Optional, Union, cast |
|
from urllib.error import HTTPError |
|
from urllib.parse import urlparse |
|
from urllib.request import urlopen, urlretrieve |
|
from zipfile import ZipFile, is_zipfile |
|
|
|
import pandas as pd |
|
from pandera.typing import DataFrame |
|
|
|
from ..base import ( |
|
EventDataLoader, |
|
JSONType, |
|
MissingDataError, |
|
ParseError, |
|
_auth_remoteloadjson, |
|
_expand_minute, |
|
_has_auth, |
|
_localloadjson, |
|
_remoteloadjson, |
|
) |
|
from .schema import ( |
|
WyscoutCompetitionSchema, |
|
WyscoutEventSchema, |
|
WyscoutGameSchema, |
|
WyscoutPlayerSchema, |
|
WyscoutTeamSchema, |
|
) |
|
|
|
|
|
class PublicWyscoutLoader(EventDataLoader):
    """
    Load the public Wyscout dataset.

    This dataset is a public release of event stream data, collected by Wyscout
    (https://wyscout.com/) containing all matches of the 2017/18 season of the
    top-5 European leagues (La Liga, Serie A, Bundesliga, Premier League, Ligue
    1), the FIFA World Cup 2018, and UEFA Euro Cup 2016. For a detailed
    description, see Pappalardo et al. [1]_.

    Parameters
    ----------
    root : str, optional
        Path where a local copy of the dataset is stored or where the
        downloaded dataset should be stored. Defaults to a ``wyscout_data``
        directory inside the current working directory. The directory is
        created when it does not exist yet.
    download : bool
        Whether to force a redownload of the data. The data is always
        downloaded when the root directory is empty.

    References
    ----------
    .. [1] Pappalardo, L., Cintia, P., Rossi, A. et al. A public data set of
        spatio-temporal match events in soccer competitions. Sci Data 6, 236
        (2019). https://doi.org/10.1038/s41597-019-0247-7
    """

    def __init__(self, root: Optional[str] = None, download: bool = False) -> None:
        if root is None:
            root = os.path.join(os.getcwd(), "wyscout_data")
        self.root = root
        # Create the directory for custom roots as well: listing a missing
        # directory below would otherwise fail before anything is downloaded.
        os.makedirs(self.root, exist_ok=True)

        self.get = _localloadjson

        if download or not os.listdir(self.root):
            self._download_repo()

        # (competition_id, season_id, season_name, suffix of the JSON files)
        seasons = [
            (524, 181248, "2017/2018", "Italy"),
            (364, 181150, "2017/2018", "England"),
            (795, 181144, "2017/2018", "Spain"),
            (412, 181189, "2017/2018", "France"),
            (426, 181137, "2017/2018", "Germany"),
            (102, 9291, "2016", "European_Championship"),
            (28, 10078, "2018", "World_Cup"),
        ]
        self._index = pd.DataFrame(
            [
                {
                    "competition_id": competition_id,
                    "season_id": season_id,
                    "season_name": season_name,
                    "db_matches": f"matches_{suffix}.json",
                    "db_events": f"events_{suffix}.json",
                }
                for competition_id, season_id, season_name, suffix in seasons
            ]
        ).set_index(["competition_id", "season_id"])
        self._match_index = self._create_match_index().set_index("match_id")
        # Holds the most recently loaded events file: {"path": ..., "events": ...}.
        self._cache: Optional[dict[str, Any]] = None

    def _download_repo(self) -> None:
        """Download all dataset files into ``self.root`` and unpack zip archives."""
        dataset_urls = {
            "competitions": "https://ndownloader.figshare.com/files/15073685",
            "teams": "https://ndownloader.figshare.com/files/15073697",
            "players": "https://ndownloader.figshare.com/files/15073721",
            "matches": "https://ndownloader.figshare.com/files/14464622",
            "events": "https://ndownloader.figshare.com/files/14464685",
        }

        for url in dataset_urls.values():
            # Resolve the final URL (after any redirects) because its path
            # provides the file name; close the probe response right away.
            with urlopen(url) as response:
                resolved_url = response.geturl()
            file_name = os.path.join(self.root, Path(urlparse(resolved_url).path).name)
            file_local, _ = urlretrieve(resolved_url, file_name)
            if is_zipfile(file_local):
                with ZipFile(file_local) as zip_file:
                    zip_file.extractall(self.root)

    def _create_match_index(self) -> pd.DataFrame:
        """Build a table that maps each match ID to its competition, season and files."""
        df_matches = pd.concat(
            [pd.DataFrame(self.get(path)) for path in glob.iglob(f"{self.root}/matches_*.json")]
        ).rename(
            columns={
                "wyId": "match_id",
                "competitionId": "competition_id",
                "seasonId": "season_id",
            }
        )
        return pd.merge(
            df_matches[["match_id", "competition_id", "season_id"]],
            self._index,
            on=["competition_id", "season_id"],
            how="left",
        )

    def _season_file(self, game_id: int, kind: str) -> str:
        """Return the path of the ``db_matches`` or ``db_events`` file covering a game."""
        competition_id, season_id = self._match_index.loc[game_id, ["competition_id", "season_id"]]
        return os.path.join(self.root, self._index.at[(competition_id, season_id), kind])

    def _load_events(self, game_id: int) -> pd.DataFrame:
        """Return all events of the season file containing a game, indexed by match ID.

        The most recently used events file is kept in ``self._cache`` so that
        consecutive calls for games from the same file do not reload it.
        """
        path = self._season_file(game_id, "db_events")
        if self._cache is None or self._cache["path"] != path:
            df_events = pd.DataFrame(self.get(path)).set_index("matchId")
            self._cache = {"path": path, "events": df_events}
        return self._cache["events"]

    def competitions(self) -> DataFrame[WyscoutCompetitionSchema]:
        """Return a dataframe with all available competitions and seasons.

        Returns
        -------
        pd.DataFrame
            A dataframe containing all available competitions and seasons. See
            :class:`~socceraction.spadl.wyscout.WyscoutCompetitionSchema` for the schema.
        """
        path = os.path.join(self.root, "competitions.json")
        df_competitions = pd.DataFrame(self.get(path)).rename(
            columns={"wyId": "competition_id", "name": "competition_name"}
        )
        # Competitions with an empty area name are labelled "International".
        df_competitions["country_name"] = [
            area["name"] if area["name"] != "" else "International"
            for area in df_competitions["area"]
        ]
        df_competitions["competition_gender"] = "male"
        df_competitions = pd.merge(
            df_competitions,
            self._index.reset_index()[["competition_id", "season_id", "season_name"]],
            on="competition_id",
            how="left",
        )
        return cast(
            DataFrame[WyscoutCompetitionSchema],
            df_competitions[
                [
                    "competition_id",
                    "season_id",
                    "country_name",
                    "competition_name",
                    "competition_gender",
                    "season_name",
                ]
            ],
        )

    def games(self, competition_id: int, season_id: int) -> DataFrame[WyscoutGameSchema]:
        """Return a dataframe with all available games in a season.

        Parameters
        ----------
        competition_id : int
            The ID of the competition.
        season_id : int
            The ID of the season.

        Returns
        -------
        pd.DataFrame
            A dataframe containing all available games. See
            :class:`~socceraction.spadl.wyscout.WyscoutGameSchema` for the schema.
        """
        path = os.path.join(self.root, self._index.at[(competition_id, season_id), "db_matches"])
        df_matches = pd.DataFrame(self.get(path))
        return cast(DataFrame[WyscoutGameSchema], _convert_games(df_matches))

    def _lineups(self, game_id: int) -> list[dict[str, Any]]:
        """Return the ``teamsData`` entries (one per team) of a game."""
        path = self._season_file(game_id, "db_matches")
        df_matches = pd.DataFrame(self.get(path)).set_index("wyId")
        return list(df_matches.at[game_id, "teamsData"].values())

    def teams(self, game_id: int) -> DataFrame[WyscoutTeamSchema]:
        """Return a dataframe with both teams that participated in a game.

        Parameters
        ----------
        game_id : int
            The ID of the game.

        Returns
        -------
        pd.DataFrame
            A dataframe containing both teams. See
            :class:`~socceraction.spadl.wyscout.WyscoutTeamSchema` for the schema.
        """
        path = os.path.join(self.root, "teams.json")
        df_teams = pd.DataFrame(self.get(path)).set_index("wyId")
        df_teams_match_id = pd.DataFrame(self._lineups(game_id))["teamId"]
        df_teams_match = df_teams.loc[df_teams_match_id].reset_index()
        return cast(DataFrame[WyscoutTeamSchema], _convert_teams(df_teams_match))

    def players(self, game_id: int) -> DataFrame[WyscoutPlayerSchema]:
        """Return a dataframe with all players that participated in a game.

        Parameters
        ----------
        game_id : int
            The ID of the game.

        Returns
        -------
        pd.DataFrame
            A dataframe containing all players. See
            :class:`~socceraction.spadl.wyscout.WyscoutPlayerSchema` for the schema.
        """
        path = os.path.join(self.root, "players.json")
        df_players = pd.DataFrame(self.get(path)).set_index("wyId")
        lineups = self._lineups(game_id)
        players_match = []
        for team in lineups:
            formation = team["formation"]
            # Work on a copy: ``lineups`` is passed on to the minutes-played
            # computation below and must keep describing the starting lineup.
            playerlist = list(formation["lineup"])
            if formation["substitutions"] != "null":
                for p in formation["substitutions"]:
                    substitute = next(
                        (
                            item
                            for item in formation["bench"]
                            if item["playerId"] == p["playerIn"]
                        ),
                        None,
                    )
                    if substitute is None:
                        warnings.warn(
                            f'A player with ID={p["playerIn"]} was substituted '
                            f'in the {p["minute"]}th minute of game {game_id}, but '
                            "could not be found on the bench.",
                            stacklevel=2,
                        )
                    else:
                        playerlist.append(substitute)
            df = pd.DataFrame(playerlist)
            df["side"] = team["side"]
            df["team_id"] = team["teamId"]
            players_match.append(df)
        df_players_match = (
            pd.concat(players_match)
            .rename(columns={"playerId": "wyId"})
            .set_index("wyId")
            .join(df_players, how="left")
            .reset_index()
        )
        # The name fields are re-decoded with the "unicode-escape" codec.
        for c in ["shortName", "lastName", "firstName"]:
            df_players_match[c] = df_players_match[c].apply(
                lambda x: x.encode().decode("unicode-escape")
            )
        df_players_match = _convert_players(df_players_match)

        # Minutes played are derived from the lineups and the game's events.
        df_events = self._load_events(game_id)
        match_events = df_events.loc[game_id].reset_index().to_dict("records")
        mp = _get_minutes_played(lineups, match_events)
        df_players_match = pd.merge(df_players_match, mp, on="player_id", how="right")
        df_players_match["minutes_played"] = df_players_match.minutes_played.fillna(0)
        df_players_match["game_id"] = game_id
        return cast(DataFrame[WyscoutPlayerSchema], df_players_match)

    def events(self, game_id: int) -> DataFrame[WyscoutEventSchema]:
        """Return a dataframe with the event stream of a game.

        Parameters
        ----------
        game_id : int
            The ID of the game.

        Returns
        -------
        pd.DataFrame
            A dataframe containing the event stream. See
            :class:`~socceraction.spadl.wyscout.WyscoutEventSchema` for the schema.
        """
        df_events = self._load_events(game_id)
        return cast(
            DataFrame[WyscoutEventSchema], _convert_events(df_events.loc[game_id].reset_index())
        )
|
|
|
|
|
class WyscoutLoader(EventDataLoader):
    """Load event data either from a remote location or from a local folder.

    Parameters
    ----------
    root : str
        Root-path of the data.
    getter : str or callable, default: "remote"
        "remote", "local" or a function that loads JSON data from a path.
    feeds : dict(str, str)
        Glob pattern for each feed that should be parsed. The default feeds for
        a "remote" getter are::

            {
                'seasons': 'competitions/{competition_id}/seasons?fetch=competition',
                'games': 'seasons/{season_id}/matches',
                'events': 'matches/{game_id}/events?fetch=teams,players,match,coaches,referees,formations,substitutions'
            }

        The default feeds for a "local" getter are::

            {
                'competitions': 'competitions.json',
                'seasons': 'seasons_{competition_id}.json',
                'games': 'matches_{season_id}.json',
                'events': 'matches/events_{game_id}.json',
            }

        Feeds must be given explicitly when a custom getter function is used.
    creds: dict, optional
        Login credentials in the format {"user": "", "passwd": ""}. Only used
        when getter is "remote". When omitted, the credentials are read from
        the ``WY_USERNAME`` and ``WY_PASSWORD`` environment variables.

    Raises
    ------
    ValueError
        When a custom getter is used without specifying feeds.
    """

    _wyscout_api: str = "https://apirest.wyscout.com/v2/"

    def __init__(
        self,
        root: str = _wyscout_api,
        getter: Union[str, Callable[[str], JSONType]] = "remote",
        feeds: Optional[dict[str, str]] = None,
        creds: Optional[dict[str, str]] = None,
    ) -> None:
        self.root = root

        # Fall back to credentials from the environment.
        if creds is None:
            creds = {
                "user": os.environ.get("WY_USERNAME", ""),
                "passwd": os.environ.get("WY_PASSWORD", ""),
            }

        # Select the function used to load a JSON document.
        if getter == "remote":
            self.get = _remoteloadjson
            if _has_auth(creds):
                _auth_remoteloadjson(creds["user"], creds["passwd"])
        elif getter == "local":
            self.get = _localloadjson
        else:
            self.get = getter

        # Select the feed patterns; only the built-in getters have defaults.
        if feeds is not None:
            self.feeds = feeds
        elif getter == "remote":
            self.feeds = {
                "seasons": "competitions/{competition_id}/seasons?fetch=competition",
                "games": "seasons/{season_id}/matches",
                "events": "matches/{game_id}/events?fetch=teams,players,match,coaches,referees,formations,substitutions",
            }
        elif getter == "local":
            self.feeds = {
                "competitions": "competitions.json",
                "seasons": "seasons_{competition_id}.json",
                "games": "matches_{season_id}.json",
                "events": "matches/events_{game_id}.json",
            }
        else:
            raise ValueError("No feeds specified.")

    def _get_file_or_url(
        self,
        feed: str,
        competition_id: Optional[int] = None,
        season_id: Optional[int] = None,
        game_id: Optional[int] = None,
    ) -> list[str]:
        """Resolve a feed pattern to file names or URLs relative to ``self.root``.

        IDs that are not given are replaced by the ``*`` wildcard. A pattern
        that contains a wildcard is expanded against the file system below
        ``self.root``; otherwise the formatted pattern itself is returned.

        Raises
        ------
        MissingDataError
            When a wildcard pattern does not match any file.
        """
        competition_id_glob = "*" if competition_id is None else competition_id
        season_id_glob = "*" if season_id is None else season_id
        game_id_glob = "*" if game_id is None else game_id
        glob_pattern = self.feeds[feed].format(
            competition_id=competition_id_glob, season_id=season_id_glob, game_id=game_id_glob
        )
        if "*" not in glob_pattern:
            return [glob_pattern]
        files = glob.glob(os.path.join(self.root, glob_pattern))
        if not files:
            raise MissingDataError
        # Callers join every returned entry with ``self.root``. Returning the
        # matches relative to the root keeps that join correct for relative
        # roots too (joining a root-prefixed relative path would duplicate the
        # root). Sorting makes the ``[0]`` picked by callers deterministic.
        return sorted(os.path.relpath(file, self.root) for file in files)

    def _load_game_feed(self, game_id: int, required: dict[str, str]) -> dict[str, Any]:
        """Load the events feed of a game and check that it has the required keys.

        Parameters
        ----------
        game_id : int
            The ID of the game.
        required : dict(str, str)
            Maps each top-level key that must be present to the description of
            its content that is used in the error message.

        Raises
        ------
        ParseError
            When the feed is not a JSON object or lacks a required key.
        """
        events_url = self._get_file_or_url("events", game_id=game_id)[0]
        path = os.path.join(self.root, events_url)
        obj = self.get(path)
        for key, description in required.items():
            if not isinstance(obj, dict) or key not in obj:
                raise ParseError(f"{path} should contain {description}")
        return cast(dict[str, Any], obj)

    def competitions(
        self, competition_id: Optional[int] = None
    ) -> DataFrame[WyscoutCompetitionSchema]:
        """Return a dataframe with all available competitions and seasons.

        Parameters
        ----------
        competition_id : int, optional
            The ID of the competition. Only used to locate the seasons feed
            when no "competitions" feed is configured.

        Raises
        ------
        ParseError
            When the raw data does not adhere to the expected format.

        Returns
        -------
        pd.DataFrame
            A dataframe containing all available competitions and seasons. See
            :class:`~socceraction.spadl.wyscout.WyscoutCompetitionSchema` for the schema.
        """
        # Get all seasons feeds, via the competitions feed when there is one.
        if "competitions" in self.feeds:
            competitions_url = self._get_file_or_url("competitions")[0]
            path = os.path.join(self.root, competitions_url)
            obj = self.get(path)
            if not isinstance(obj, dict) or "competitions" not in obj:
                raise ParseError(f"{path} should contain a list of competitions")
            seasons_urls = [
                self._get_file_or_url("seasons", competition_id=c["wyId"])[0]
                for c in obj["competitions"]
            ]
        else:
            seasons_urls = self._get_file_or_url("seasons", competition_id=competition_id)

        # Collect the competition and its seasons from each seasons feed.
        competitions = []
        seasons = []
        for seasons_url in seasons_urls:
            try:
                path = os.path.join(self.root, seasons_url)
                obj = self.get(path)
                if not isinstance(obj, dict) or "competition" not in obj or "seasons" not in obj:
                    raise ParseError(
                        f"{path} should contain a list of competition and list of seasons"
                    )
                competitions.append(obj["competition"])
                seasons.extend([s["season"] for s in obj["seasons"]])
            except FileNotFoundError:
                warnings.warn(f"File not found: {seasons_url}", stacklevel=2)
        df_competitions = _convert_competitions(pd.DataFrame(competitions))
        df_seasons = _convert_seasons(pd.DataFrame(seasons))

        return cast(
            DataFrame[WyscoutCompetitionSchema],
            pd.merge(df_competitions, df_seasons, on="competition_id"),
        )

    def games(self, competition_id: int, season_id: int) -> DataFrame[WyscoutGameSchema]:
        """Return a dataframe with all available games in a season.

        Parameters
        ----------
        competition_id : int
            The ID of the competition.
        season_id : int
            The ID of the season.

        Raises
        ------
        ParseError
            When the raw data does not adhere to the expected format.

        Returns
        -------
        pd.DataFrame
            A dataframe containing all available games. See
            :class:`~socceraction.spadl.wyscout.WyscoutGameSchema` for the schema.
        """
        # Get the events feed of every game, via the games feed when there is one.
        if "games" in self.feeds:
            games_url = self._get_file_or_url(
                "games", competition_id=competition_id, season_id=season_id
            )[0]
            path = os.path.join(self.root, games_url)
            obj = self.get(path)
            if not isinstance(obj, dict) or "matches" not in obj:
                raise ParseError(f"{path} should contain a list of matches")
            gamedetails_urls = [
                self._get_file_or_url(
                    "events",
                    competition_id=competition_id,
                    season_id=season_id,
                    game_id=g["matchId"],
                )[0]
                for g in obj["matches"]
            ]
        else:
            gamedetails_urls = self._get_file_or_url(
                "events", competition_id=competition_id, season_id=season_id
            )

        # The match details are embedded in each events feed; feeds that
        # cannot be retrieved are skipped with a warning.
        games = []
        for gamedetails_url in gamedetails_urls:
            try:
                path = os.path.join(self.root, gamedetails_url)
                obj = self.get(path)
                if not isinstance(obj, dict) or "match" not in obj:
                    raise ParseError(f"{path} should contain a match")
                games.append(obj["match"])
            except FileNotFoundError:
                warnings.warn(f"File not found: {gamedetails_url}", stacklevel=2)
            except HTTPError:
                warnings.warn(f"Resource not found: {gamedetails_url}", stacklevel=2)
        df_games = _convert_games(pd.DataFrame(games))
        return cast(DataFrame[WyscoutGameSchema], df_games)

    def teams(self, game_id: int) -> DataFrame[WyscoutTeamSchema]:
        """Return a dataframe with both teams that participated in a game.

        Parameters
        ----------
        game_id : int
            The ID of the game.

        Raises
        ------
        ParseError
            When the raw data does not adhere to the expected format.

        Returns
        -------
        pd.DataFrame
            A dataframe containing both teams. See
            :class:`~socceraction.spadl.wyscout.WyscoutTeamSchema` for the schema.
        """
        obj = self._load_game_feed(game_id, {"teams": "a list of teams"})
        teams = [t["team"] for t in obj["teams"].values() if t.get("team")]
        df_teams = _convert_teams(pd.DataFrame(teams))
        return cast(DataFrame[WyscoutTeamSchema], df_teams)

    def players(self, game_id: int) -> DataFrame[WyscoutPlayerSchema]:
        """Return a dataframe with all players that participated in a game.

        Parameters
        ----------
        game_id : int
            The ID of the game.

        Raises
        ------
        ParseError
            When the raw data does not adhere to the expected format.

        Returns
        -------
        pd.DataFrame
            A dataframe containing all players. See
            :class:`~socceraction.spadl.wyscout.WyscoutPlayerSchema` for the schema.
        """
        # The match and events sections are needed to compute minutes played.
        obj = self._load_game_feed(
            game_id,
            {
                "players": "a list of players",
                "match": "a match",
                "events": "a list of events",
            },
        )
        players = [
            player["player"]
            for team in obj["players"].values()
            for player in team
            if player.get("player")
        ]
        df_players = _convert_players(pd.DataFrame(players).drop_duplicates("wyId"))
        df_players = pd.merge(
            df_players,
            _get_minutes_played(obj["match"]["teamsData"], obj["events"]),
            on="player_id",
            how="right",
        )
        df_players["minutes_played"] = df_players.minutes_played.fillna(0)
        df_players["game_id"] = game_id
        return cast(DataFrame[WyscoutPlayerSchema], df_players)

    def events(self, game_id: int) -> DataFrame[WyscoutEventSchema]:
        """Return a dataframe with the event stream of a game.

        Parameters
        ----------
        game_id : int
            The ID of the game.

        Raises
        ------
        ParseError
            When the raw data does not adhere to the expected format.

        Returns
        -------
        pd.DataFrame
            A dataframe containing the event stream. See
            :class:`~socceraction.spadl.wyscout.WyscoutEventSchema` for the schema.
        """
        obj = self._load_game_feed(game_id, {"events": "a list of events"})
        df_events = _convert_events(pd.DataFrame(obj["events"]))
        return cast(DataFrame[WyscoutEventSchema], df_events)
|
|
|
|
|
def _convert_competitions(competitions: pd.DataFrame) -> pd.DataFrame: |
|
competitionsmapping = { |
|
"wyId": "competition_id", |
|
"name": "competition_name", |
|
"gender": "competition_gender", |
|
} |
|
cols = ["competition_id", "competition_name", "country_name", "competition_gender"] |
|
competitions["country_name"] = competitions.apply( |
|
lambda x: x.area["name"] if x.area["name"] != "" else "International", axis=1 |
|
) |
|
competitions = competitions.rename(columns=competitionsmapping)[cols] |
|
return competitions |
|
|
|
|
|
def _convert_seasons(seasons: pd.DataFrame) -> pd.DataFrame: |
|
seasonsmapping = { |
|
"wyId": "season_id", |
|
"name": "season_name", |
|
"competitionId": "competition_id", |
|
} |
|
cols = ["season_id", "season_name", "competition_id"] |
|
seasons = seasons.rename(columns=seasonsmapping)[cols] |
|
return seasons |
|
|
|
|
|
def _convert_games(matches: pd.DataFrame) -> pd.DataFrame:
    """Convert raw Wyscout matches to a games dataframe.

    The result has the columns ``game_id``, ``competition_id``, ``season_id``,
    ``game_date`` (parsed to datetimes), ``game_day``, ``home_team_id`` and
    ``away_team_id``. The team IDs are looked up in each match's ``teamsData``.
    """
    renames = {
        "wyId": "game_id",
        "dateutc": "game_date",
        "competitionId": "competition_id",
        "seasonId": "season_id",
        "gameweek": "game_day",
    }
    wanted = ["game_id", "competition_id", "season_id", "game_date", "game_day"]
    games = matches.rename(columns=renames)[wanted]
    games["game_date"] = pd.to_datetime(games["game_date"])
    # One team-ID column per side of the match.
    for column, side in (("home_team_id", "home"), ("away_team_id", "away")):
        games[column] = matches["teamsData"].apply(_get_team_id, args=(side,))
    return games
|
|
|
|
|
def _get_team_id(teamsData: dict[int, Any], side: str) -> int: |
|
for team_id, data in teamsData.items(): |
|
if data["side"] == side: |
|
return int(team_id) |
|
raise ValueError() |
|
|
|
|
|
def _convert_players(players: pd.DataFrame) -> pd.DataFrame: |
|
playermapping = { |
|
"wyId": "player_id", |
|
"shortName": "nickname", |
|
"firstName": "firstname", |
|
"lastName": "lastname", |
|
"birthDate": "birth_date", |
|
} |
|
cols = ["player_id", "nickname", "firstname", "lastname", "birth_date"] |
|
df_players = players.rename(columns=playermapping)[cols] |
|
df_players["player_name"] = df_players[["firstname", "lastname"]].agg(" ".join, axis=1) |
|
df_players["birth_date"] = pd.to_datetime(df_players["birth_date"]) |
|
return df_players |
|
|
|
|
|
def _convert_teams(teams: pd.DataFrame) -> pd.DataFrame: |
|
teammapping = { |
|
"wyId": "team_id", |
|
"name": "team_name_short", |
|
"officialName": "team_name", |
|
} |
|
cols = ["team_id", "team_name_short", "team_name"] |
|
return teams.rename(columns=teammapping)[cols] |
|
|
|
|
|
def _convert_events(raw_events: pd.DataFrame) -> pd.DataFrame:
    """Convert raw Wyscout events to an events dataframe.

    The column names are converted from camelCase to snake_case, after which
    the event columns are renamed and selected.

    Parameters
    ----------
    raw_events : pd.DataFrame
        The raw events. The dataframe is not modified.

    Returns
    -------
    pd.DataFrame
        A dataframe with the columns ``event_id``, ``game_id``, ``period_id``,
        ``milliseconds``, ``team_id``, ``player_id``, ``type_id``,
        ``type_name``, ``subtype_id``, ``subtype_name``, ``positions`` and
        ``tags``.
    """
    eventmapping = {
        "id": "event_id",
        "match_id": "game_id",
        "event_name": "type_name",
        "sub_event_name": "subtype_name",
    }
    cols = [
        "event_id",
        "game_id",
        "period_id",
        "milliseconds",
        "team_id",
        "player_id",
        "type_id",
        "type_name",
        "subtype_id",
        "subtype_name",
        "positions",
        "tags",
    ]
    events = raw_events.copy()

    # Convert camelCase column names to snake_case, e.g. "subEventId" -> "sub_event_id".
    pattern = re.compile(r"(?<!^)(?=[A-Z])")
    events.columns = [pattern.sub("_", c).lower() for c in events.columns]

    # The raw "eventId" and "subEventId" columns hold the type IDs; the event's
    # own identifier is the raw "id" column. The raw columns are removed so
    # that "id" can take over the "event_id" name in the rename below.
    # Non-numeric values become 0; so does a column that is missing entirely.
    for source, target in (("event_id", "type_id"), ("sub_event_id", "subtype_id")):
        if source in events.columns:
            raw_ids = events.pop(source)
            events[target] = pd.to_numeric(raw_ids, errors="coerce").fillna(0).astype(int)
        else:
            events[target] = 0
    events["period_id"] = events.match_period.apply(lambda x: wyscout_periods[x])
    events["milliseconds"] = events.event_sec * 1000
    return events.rename(columns=eventmapping)[cols]
|
|
|
|
|
def _get_minutes_played(
    teamsData: list[dict[str, Any]], events: list[dict[str, Any]]
) -> pd.DataFrame:
    """Compute the minutes played by each player in a game.

    Parameters
    ----------
    teamsData : list(dict)
        The match data of each team, with ``teamId`` and a ``formation``
        holding ``lineup``, ``bench`` and ``substitutions``. A dict of such
        entries is accepted as well; its values are used.
    events : list(dict)
        The raw events of the game; ``matchPeriod`` and ``eventSec`` are used
        to determine the duration of each period.

    Returns
    -------
    pd.DataFrame
        One row per player that started or was substituted in, with the
        columns ``team_id``, ``player_id``, ``jersey_number``,
        ``minutes_played`` and ``is_starter``.
    """
    # The time of the last event in a period determines the period's duration
    # in minutes. Periods without events are skipped. Period 5 ("P" in
    # ``wyscout_periods``) is not counted towards the duration.
    last_event_sec = {period_id: 0 for period_id in range(6)}
    for e in events:
        period_id = wyscout_periods[e["matchPeriod"]]
        last_event_sec[period_id] = max(last_event_sec[period_id], e["eventSec"])
    periods_duration = [
        round(last_event_sec[i] / 60) for i in range(5) if last_event_sec[i] != 0
    ]
    duration = sum(periods_duration)

    playergames: dict[int, dict[str, Any]] = {}
    if isinstance(teamsData, dict):
        teamsData = list(teamsData.values())
    for teamData in teamsData:
        formation = teamData.get("formation", {})
        substitutions = formation.get("substitutions", [])
        # NOTE(review): "redCards" appears to hold the minute of the red card
        # as a string, with "0" meaning no red card - confirm against the data.
        red_cards = {
            player["playerId"]: _expand_minute(int(player["redCards"]), periods_duration)
            for key in ["bench", "lineup"]
            for player in formation.get(key, [])
            if player["redCards"] != "0"
        }
        # Starters play until the end of the game or until their red card.
        pg = {
            player["playerId"]: {
                "team_id": teamData["teamId"],
                "player_id": player["playerId"],
                "jersey_number": player.get("shirtNumber", 0),
                "minutes_played": red_cards.get(player["playerId"], duration),
                "is_starter": True,
            }
            for player in formation.get("lineup", [])
        }

        # Minute at which each substitute entered the game; starters enter at 0.
        entered_at: dict[int, Any] = {}
        # Missing substitutions are encoded as the string "null".
        if substitutions != "null":
            for substitution in substitutions:
                player_in = substitution["playerIn"]
                player_out = substitution["playerOut"]
                expanded_minute_sub = _expand_minute(substitution["minute"], periods_duration)
                # A substitute plays from the substitution until the end of
                # the game or until their red card.
                last_minute = red_cards.get(player_in, duration)
                pg[player_in] = {
                    "team_id": teamData["teamId"],
                    "player_id": player_in,
                    "jersey_number": next(
                        (
                            p.get("shirtNumber", 0)
                            for p in formation.get("bench", [])
                            if p["playerId"] == player_in
                        ),
                        0,
                    ),
                    "minutes_played": last_minute - expanded_minute_sub,
                    "is_starter": False,
                }
                entered_at[player_in] = expanded_minute_sub
                # The replaced player played from entering the game until now.
                # Subtracting the entry minute matters for a substitute who is
                # substituted off again later in the game.
                pg[player_out]["minutes_played"] = expanded_minute_sub - entered_at.get(
                    player_out, 0
                )

        playergames.update(pg)
    return pd.DataFrame(playergames.values())
|
|
|
|
|
# Maps the "matchPeriod" code of a raw Wyscout event to a numeric period ID.
wyscout_periods = {
    "1H": 1,
    "2H": 2,
    "E1": 3,
    "E2": 4,
    "P": 5,
}
|
|