# socr/data/opta/parsers/ma3_json.py
# scfive's picture
# Upload 203 files
# d6ea71e verified
"""JSON parser for Stats Perform MA3 feeds."""
from datetime import datetime
from typing import Any, Optional
import pandas as pd
from ...base import MissingDataError
from .base import OptaJSONParser, _get_end_x, _get_end_y, assertget
class MA3JSONParser(OptaJSONParser):
    """Extract data from a Stats Perform MA3 data stream.

    Parameters
    ----------
    path : str
        Path of the data file.
    """

    # Names for the values of qualifier 44 on type-34 events, which
    # ``extract_players`` stores as ``starting_position_id``.
    _position_map = {
        1: "Goalkeeper",
        2: "Defender",
        3: "Midfielder",
        4: "Forward",
        5: "Substitute",
    }

    def _get_match_info(self) -> dict[str, Any]:
        """Return the ``matchInfo`` section of the feed.

        Raises
        ------
        MissingDataError
            If the feed has no ``matchInfo`` section.
        """
        if "matchInfo" in self.root:
            return self.root["matchInfo"]
        raise MissingDataError

    def _get_live_data(self) -> dict[str, Any]:
        """Return the ``liveData`` section of the feed.

        Raises
        ------
        MissingDataError
            If the feed has no ``liveData`` section.
        """
        if "liveData" in self.root:
            return self.root["liveData"]
        raise MissingDataError

    def extract_competitions(self) -> dict[tuple[str, str], dict[str, Any]]:
        """Return a dictionary with all available competitions.

        Returns
        -------
        dict
            A mapping between (competition ID, season ID) tuples and the
            information available about each competition in the data stream.

        Raises
        ------
        MissingDataError
            If the feed has no ``matchInfo`` section.
        """
        match_info = self._get_match_info()
        # "tournamentCalendar" holds the season record, "competition" the
        # competition record; both expose an "id" and a "name".
        calendar = assertget(match_info, "tournamentCalendar")
        competition = assertget(match_info, "competition")
        competition_id = assertget(competition, "id")
        season_id = assertget(calendar, "id")
        season = {
            # Fields required by the base schema
            "season_id": season_id,
            "season_name": assertget(calendar, "name"),
            "competition_id": competition_id,
            "competition_name": assertget(competition, "name"),
        }
        return {(competition_id, season_id): season}

    def extract_games(self) -> dict[str, dict[str, Any]]:
        """Return a dictionary with all available games.

        Returns
        -------
        dict
            A mapping between game IDs and the information available about
            each game in the data stream. The ``duration``, ``home_score``
            and ``away_score`` fields are only present when the feed's
            ``matchDetails`` section provides them.

        Raises
        ------
        MissingDataError
            If the feed has no ``matchInfo`` or ``liveData`` section, or if
            no contestant is marked as the home or away side.
        """
        match_info = self._get_match_info()
        game_id = assertget(match_info, "id")
        season = assertget(match_info, "tournamentCalendar")
        competition = assertget(match_info, "competition")
        contestant = assertget(match_info, "contestant")
        # Only the leading "YYYY-MM-DD" and "HH:MM:SS" parts are kept; any
        # trailing characters in the feed's date/time strings are dropped.
        game_date = assertget(match_info, "date")[0:10]
        game_time = assertget(match_info, "time")[0:8]
        game_datetime = f"{game_date}T{game_time}"
        venue = assertget(match_info, "venue")
        game_obj = {
            "game_id": game_id,
            "competition_id": assertget(competition, "id"),
            "season_id": assertget(season, "id"),
            "game_day": int(match_info["week"]) if "week" in match_info else None,
            "game_date": datetime.strptime(game_datetime, "%Y-%m-%dT%H:%M:%S"),
            "home_team_id": self._extract_team_id(contestant, "home"),
            "away_team_id": self._extract_team_id(contestant, "away"),
            "venue": assertget(venue, "shortName"),
        }

        live_data = self._get_live_data()
        if "matchDetails" in live_data:
            match_details = assertget(live_data, "matchDetails")
            if "matchLengthMin" in match_details:
                game_obj["duration"] = assertget(match_details, "matchLengthMin")
            if "scores" in match_details:
                total = assertget(assertget(match_details, "scores"), "total")
                game_obj["home_score"] = total["home"]
                game_obj["away_score"] = total["away"]

        return {game_id: game_obj}

    def extract_teams(self) -> dict[str, dict[str, Any]]:
        """Return a dictionary with all available teams.

        Returns
        -------
        dict
            A mapping between team IDs and the information available about
            each team in the data stream.

        Raises
        ------
        MissingDataError
            If the feed has no ``matchInfo`` section.
        """
        match_info = self._get_match_info()
        contestants = assertget(match_info, "contestant")
        teams = {}
        for contestant in contestants:
            team_id = assertget(contestant, "id")
            teams[team_id] = {
                # Fields required by the base schema
                "team_id": team_id,
                "team_name": assertget(contestant, "name"),
            }
        return teams

    def extract_players(self) -> dict[tuple[str, str], dict[str, Any]]:  # noqa: C901
        """Return a dictionary with all available players.

        Only players with more than zero minutes played are returned.

        Returns
        -------
        dict
            A mapping between (game ID, player ID) tuples and the information
            available about each player in the data stream.

        Raises
        ------
        MissingDataError
            If the feed has no ``matchInfo`` or ``liveData`` section.
        KeyError
            If a player with playing time never appears as the ``playerId``
            of an event, so that no name is known for them.
        """
        match_info = self._get_match_info()
        game_id = assertget(match_info, "id")
        live_data = self._get_live_data()
        events = assertget(live_data, "event")

        game_duration = self._extract_duration()
        playerid_to_name = {}

        players_data: dict[str, list[Any]] = {
            "starting_position_id": [],
            "player_id": [],
            "team_id": [],
            "position_in_formation": [],
            "jersey_number": [],
        }
        # Lineup qualifiers whose comma-separated values are integers,
        # mapped to the column they populate.
        int_lineup_columns = {
            44: "starting_position_id",
            131: "position_in_formation",
            59: "jersey_number",
        }
        red_cards = {}

        for event in events:
            event_type = assertget(event, "typeId")
            if event_type == 34:
                # Type-34 events list a team's players; each qualifier holds
                # one attribute for all players as a ", "-separated string.
                team_id = assertget(event, "contestantId")
                for q in assertget(event, "qualifier"):
                    qualifier_id = assertget(q, "qualifierId")
                    # Only read "value" for the qualifiers that are used, so
                    # an unrelated value-less qualifier cannot abort parsing.
                    if qualifier_id == 30:
                        player_ids = assertget(q, "value").split(", ")
                        players_data["player_id"] += player_ids
                        players_data["team_id"] += [team_id] * len(player_ids)
                    elif qualifier_id in int_lineup_columns:
                        column = int_lineup_columns[qualifier_id]
                        values = assertget(q, "value").split(", ")
                        players_data[column] += [int(v) for v in values]
            elif event_type == 17 and "playerId" in event:
                # Type-17 events with qualifier 32 or 33 end the player's
                # game at the minute of the event.
                for q in assertget(event, "qualifier"):
                    if assertget(q, "qualifierId") in (32, 33):
                        red_cards[event["playerId"]] = event["timeMin"]

            # Remember the first name seen for every player that appears in
            # an event.
            player_id = event.get("playerId")
            if player_id is None:
                continue
            player_name = assertget(event, "playerName")
            if player_id not in playerid_to_name:
                playerid_to_name[player_id] = player_name

        df_players_data = pd.DataFrame.from_dict(players_data)  # type: ignore

        substitutions = list(self.extract_substitutions().values())
        substitutions_columns = ["player_id", "team_id", "minute_start", "minute_end"]
        df_substitutions = pd.DataFrame(substitutions, columns=substitutions_columns)
        df_substitutions = df_substitutions.groupby(["player_id", "team_id"]).max().reset_index()
        # A player who only went off started at kick-off; a player who only
        # came on stayed until the end of the game.
        df_substitutions["minute_start"] = df_substitutions["minute_start"].fillna(0)
        df_substitutions["minute_end"] = df_substitutions["minute_end"].fillna(game_duration)

        if df_substitutions.empty:
            # No substitutions: leave the minutes undefined so that only
            # starters receive the kick-off/full-time defaults below. Filling
            # them in for everyone would credit unused substitutes with a
            # full game.
            df_players_data["minute_start"] = float("nan")
            df_players_data["minute_end"] = float("nan")
        else:
            df_players_data = df_players_data.merge(
                df_substitutions, on=["team_id", "player_id"], how="left"
            )
        # A red card overrides any other end minute.
        df_players_data["minute_end"] = df_players_data.apply(
            lambda row: red_cards[row["player_id"]]
            if row["player_id"] in red_cards
            else row["minute_end"],
            axis=1,
        )

        df_players_data["is_starter"] = df_players_data["position_in_formation"] > 0
        # Starters without a substitution record played from kick-off to the
        # end of the game.
        df_players_data.loc[
            df_players_data["is_starter"] & df_players_data["minute_start"].isnull(),
            "minute_start",
        ] = 0
        df_players_data.loc[
            df_players_data["is_starter"] & df_players_data["minute_end"].isnull(), "minute_end"
        ] = game_duration

        # Players whose start or end minute is still undefined never played.
        df_players_data["minutes_played"] = (
            (df_players_data["minute_end"] - df_players_data["minute_start"]).fillna(0).astype(int)
        )

        players = {}
        for _, player in df_players_data.iterrows():
            if player.minutes_played > 0:
                players[(game_id, player.player_id)] = {
                    # Fields required by the base schema
                    "game_id": game_id,
                    "team_id": player.team_id,
                    "player_id": player.player_id,
                    "player_name": playerid_to_name[player.player_id],
                    "is_starter": player.is_starter,
                    "minutes_played": player.minutes_played,
                    "jersey_number": player.jersey_number,
                    # Fields required by the opta schema
                    "starting_position": self._position_map.get(
                        player.starting_position_id, "Unknown"
                    ),
                }
        return players

    def extract_events(self) -> dict[tuple[str, int], dict[str, Any]]:
        """Return a dictionary with all available events.

        Returns
        -------
        dict
            A mapping between (game ID, event ID) tuples and the information
            available about each event in the data stream.

        Raises
        ------
        MissingDataError
            If the feed has no ``matchInfo`` or ``liveData`` section.
        """
        match_info = self._get_match_info()
        live_data = self._get_live_data()
        game_id = assertget(match_info, "id")

        events = {}
        for element in assertget(live_data, "event"):
            timestamp_string = assertget(element, "timeStamp")
            timestamp = self._convert_timestamp(timestamp_string)

            # Qualifiers without a value are kept with ``None`` as value.
            qualifiers = {
                int(q["qualifierId"]): q.get("value") for q in element.get("qualifier", [])
            }
            start_x = float(assertget(element, "x"))
            start_y = float(assertget(element, "y"))
            end_x = _get_end_x(qualifiers)
            end_y = _get_end_y(qualifiers)

            event_id = int(assertget(element, "id"))
            event = {
                # Fields required by the base schema
                "game_id": game_id,
                "event_id": event_id,
                "period_id": int(assertget(element, "periodId")),
                "team_id": assertget(element, "contestantId"),
                "player_id": element.get("playerId"),
                "type_id": int(assertget(element, "typeId")),
                # Fields required by the opta schema
                "timestamp": timestamp,
                "minute": int(assertget(element, "timeMin")),
                "second": int(assertget(element, "timeSec")),
                # A missing outcome counts as successful.
                "outcome": bool(int(element.get("outcome", 1))),
                "start_x": start_x,
                "start_y": start_y,
                # Events without an end location end where they started.
                "end_x": end_x if end_x is not None else start_x,
                "end_y": end_y if end_y is not None else start_y,
                "qualifiers": qualifiers,
                # Optional fields
                "assist": bool(int(element.get("assist", 0))),
                "keypass": bool(int(element.get("keyPass", 0))),
            }
            events[(game_id, event_id)] = event

        return events

    def extract_substitutions(self) -> dict[int, dict[str, Any]]:
        """Return a dictionary with all substitution events.

        Type-18 events set a player's ``minute_end`` and type-19 events set
        a player's ``minute_start``. Both are merged into a single record per
        player, so a player who comes on and later goes off keeps both
        minutes.

        Returns
        -------
        dict
            A mapping between player IDs and the information available about
            each substitution in the data stream. Each record has the keys
            ``player_id`` and ``team_id`` plus ``minute_start`` and/or
            ``minute_end``.

        Raises
        ------
        MissingDataError
            If the feed has no ``liveData`` section.
        """
        live_data = self._get_live_data()
        events = assertget(live_data, "event")

        subs = {}
        for e in events:
            event_type = assertget(e, "typeId")
            if event_type not in (18, 19):
                continue
            sub_id = assertget(e, "playerId")
            # Reuse the player's existing record instead of replacing it, so
            # an earlier "on" minute survives a later "off" event.
            substitution_data = subs.setdefault(
                sub_id,
                {"player_id": sub_id, "team_id": assertget(e, "contestantId")},
            )
            minute_key = "minute_end" if event_type == 18 else "minute_start"
            substitution_data[minute_key] = assertget(e, "timeMin")
        return subs

    def _extract_duration(self) -> int:
        """Return the game duration in minutes, with a minimum of 90.

        The duration is the largest ``timeMin`` among type-30 events that
        carry qualifier 209 (presumably the end-of-game marker -- confirm
        against the feed specification).

        Raises
        ------
        MissingDataError
            If the feed has no ``liveData`` section.
        """
        live_data = self._get_live_data()
        events = assertget(live_data, "event")

        game_duration = 90
        for event in events:
            if assertget(event, "typeId") != 30:
                continue
            # todo: add 1st half time
            qualifier_ids = [assertget(q, "qualifierId") for q in assertget(event, "qualifier")]
            if 209 in qualifier_ids:
                game_duration = max(game_duration, assertget(event, "timeMin"))
        return game_duration

    @staticmethod
    def _extract_team_id(teams: list[dict[str, str]], side: str) -> Optional[str]:
        """Return the ID of the team whose ``position`` equals ``side``.

        Parameters
        ----------
        teams : list of dict
            The contestant records of a game.
        side : str
            The value to look for in each record's ``position`` field
            (``extract_games`` passes "home" and "away").

        Returns
        -------
        str
            The ``id`` of the first matching team. ``None`` is never
            returned; a missing side raises instead.

        Raises
        ------
        MissingDataError
            If no team plays on the requested side.
        """
        for team in teams:
            if assertget(team, "position") == side:
                return assertget(team, "id")
        raise MissingDataError

    @staticmethod
    def _convert_timestamp(timestamp_string: str) -> datetime:
        """Parse a ``Z``-suffixed timestamp into a naive datetime.

        Timestamps both with and without fractional seconds are accepted.

        Raises
        ------
        ValueError
            If the string matches neither format.
        """
        try:
            return datetime.strptime(timestamp_string, "%Y-%m-%dT%H:%M:%S.%fZ")
        except ValueError:
            return datetime.strptime(timestamp_string, "%Y-%m-%dT%H:%M:%SZ")