|
"""JSON parser for WhoScored feeds.""" |
|
|
|
import json |
|
import re |
|
from datetime import datetime, timedelta |
|
from typing import Any, Optional |
|
|
|
from ...base import MissingDataError |
|
from .base import OptaParser, _get_end_x, _get_end_y, assertget |
|
|
|
|
|
def _position_mapping(formation: str, x: float, y: float) -> str: |
|
if x == 0 and y == 5: |
|
return "GK" |
|
return "Unknown" |
|
|
|
|
|
class WhoScoredParser(OptaParser):
    """Extract data from a JSON data stream scraped from WhoScored.

    Parameters
    ----------
    path : str
        Path of the data file.
    competition_id : int
        ID of the competition to which the provided data file belongs. If
        None, this information is extracted from a field 'competition_id' in
        the JSON.
    season_id : int
        ID of the season to which the provided data file belongs. If None,
        this information is extracted from a field 'season_id' in the JSON.
    game_id : int
        ID of the game to which the provided data file belongs. If None, this
        information is extracted from a field 'game_id' in the JSON.

    Raises
    ------
    MissingDataError
        If one of the IDs is not given and the corresponding field is not
        found in the JSON.
    """

    # strptime format of the 'startTime' field in the JSON root.
    _START_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S"
    # Event type value that `extract_substitutions` selects on.
    _SUB_ON_TYPE_ID = 19
    # Period ids whose events are always reported at 0 ms into the period.
    # NOTE(review): presumably pre-match / post-game pseudo-periods - confirm.
    _CLOCKLESS_PERIOD_IDS = frozenset({14, 16})

    def __init__(
        self,
        path: str,
        competition_id: Optional[int] = None,
        season_id: Optional[int] = None,
        game_id: Optional[int] = None,
    ) -> None:
        with open(path, encoding="utf-8") as fh:
            self.root = json.load(fh)

        # Each ID falls back to a same-named field in the JSON root when it is
        # not passed explicitly.
        if competition_id is None:
            try:
                competition_id = int(assertget(self.root, "competition_id"))
            except AssertionError as e:
                raise MissingDataError(
                    """Could not determine the competition id. Add it to the
                    file path or include a field 'competition_id' in the
                    JSON."""
                ) from e
        self.competition_id = competition_id

        if season_id is None:
            try:
                season_id = int(assertget(self.root, "season_id"))
            except AssertionError as e:
                raise MissingDataError(
                    """Could not determine the season id. Add it to the file
                    path or include a field 'season_id' in the JSON."""
                ) from e
        self.season_id = season_id

        if game_id is None:
            try:
                game_id = int(assertget(self.root, "game_id"))
            except AssertionError as e:
                raise MissingDataError(
                    """Could not determine the game id. Add it to the file
                    path or include a field 'game_id' in the JSON."""
                ) from e
        self.game_id = game_id

    @staticmethod
    def _optional_int(value: Any) -> Optional[int]:
        """Return ``int(value)``, or None when ``value`` is None.

        Covers both an absent field (``dict.get`` returned None) and a field
        that is present but holds a JSON ``null``.
        """
        return None if value is None else int(value)

    def _get_start_time(self) -> datetime:
        """Parse the 'startTime' field of the JSON root into a datetime."""
        return datetime.strptime(assertget(self.root, "startTime"), self._START_TIME_FORMAT)

    def _get_period_id(self, event: dict[str, Any]) -> int:
        """Return the integer stored under ``event["period"]["value"]``."""
        period = assertget(event, "period")
        return int(assertget(period, "value"))

    def _minute_in_period(self, period_id: int, minute: int) -> int:
        """Convert a game minute into a minute relative to the period start.

        For periods after the first, the 'periodMinuteLimits' entry of the
        preceding period is subtracted; the first period is returned as is.
        """
        if period_id <= 1:
            return minute
        period_minute_limits = assertget(self.root, "periodMinuteLimits")
        return minute - period_minute_limits[str(period_id - 1)]

    def _get_period_milliseconds(self, event: dict[str, Any]) -> int:
        """Return the time of an event in milliseconds since its period began.

        Events in one of the `_CLOCKLESS_PERIOD_IDS` periods always yield 0.
        """
        period_id = self._get_period_id(event)
        if period_id in self._CLOCKLESS_PERIOD_IDS:
            return 0
        period_minute = self._minute_in_period(period_id, int(assertget(event, "minute")))
        period_second = period_minute * 60 + int(event.get("second", 0))
        return period_second * 1000

    def _get_period_of_minute(self, minute: int) -> int:
        """Return the ID of the period that contains an expanded minute.

        This is the first period (in numeric order) whose 'periodEndMinutes'
        entry is greater than ``minute``; if there is none, the last period.

        Raises
        ------
        MissingDataError
            If 'periodEndMinutes' contains no periods.
        """
        period_end_minutes = assertget(self.root, "periodEndMinutes")
        # The keys are strings; order them numerically so that a two-digit id
        # is never visited before a one-digit id.
        period_ids = sorted(period_end_minutes, key=int)
        if not period_ids:
            raise MissingDataError(
                "Could not determine the period: 'periodEndMinutes' is empty."
            )
        for period_id in period_ids:
            if period_end_minutes[period_id] > minute:
                return int(period_id)
        return int(period_ids[-1])

    def extract_games(self) -> dict[int, dict[str, Any]]:
        """Return a dictionary with all available games.

        Returns
        -------
        dict
            A mapping between game IDs and the information available about
            each game in the data stream.
        """
        team_home = assertget(self.root, "home")
        team_away = assertget(self.root, "away")
        game_dict = {
            "game_id": self.game_id,
            "season_id": self.season_id,
            "competition_id": self.competition_id,
            "game_day": None,
            "game_date": self._get_start_time(),
            "home_team_id": int(assertget(team_home, "teamId")),
            "away_team_id": int(assertget(team_away, "teamId")),
            "home_score": int(assertget(assertget(team_home, "scores"), "running")),
            "away_score": int(assertget(assertget(team_away, "scores"), "running")),
            # Optional fields: None when absent or null in the feed.
            "duration": self._optional_int(self.root.get("expandedMaxMinute")),
            "referee": (self.root.get("referee") or {}).get("name"),
            "venue": self.root.get("venueName"),
            "attendance": self._optional_int(self.root.get("attendance")),
            "home_manager": team_home.get("managerName"),
            "away_manager": team_away.get("managerName"),
        }
        return {self.game_id: game_dict}

    def extract_teams(self) -> dict[int, dict[str, Any]]:
        """Return a dictionary with all available teams.

        Returns
        -------
        dict
            A mapping between team IDs and the information available about
            each team in the data stream.
        """
        teams = {}
        for side in (self.root["home"], self.root["away"]):
            team_id = int(assertget(side, "teamId"))
            teams[team_id] = {
                "team_id": team_id,
                "team_name": assertget(side, "name"),
            }
        return teams

    def extract_players(self) -> dict[tuple[int, int], dict[str, Any]]:
        """Return a dictionary with all available players.

        Returns
        -------
        dict
            A mapping between (game ID, player ID) tuples and the information
            available about each player in the data stream.
        """
        game_id = self.game_id
        # Minutes played, jersey number and position come from the per-game
        # player statistics.
        player_gamestats = self.extract_playergamestats()
        players = {}
        for team in (self.root["home"], self.root["away"]):
            team_id = int(assertget(team, "teamId"))
            for p in team["players"]:
                player_id = int(assertget(p, "playerId"))
                gamestats = player_gamestats[(game_id, player_id)]
                players[(game_id, player_id)] = {
                    "game_id": game_id,
                    "team_id": team_id,
                    "player_id": player_id,
                    "player_name": assertget(p, "name"),
                    "is_starter": bool(p.get("isFirstEleven", False)),
                    "minutes_played": gamestats["minutes_played"],
                    "jersey_number": gamestats["jersey_number"],
                    "starting_position": gamestats["position_code"],
                }
        return players

    def extract_events(self) -> dict[tuple[int, int], dict[str, Any]]:
        """Return a dictionary with all available events.

        Returns
        -------
        dict
            A mapping between (game ID, event ID) tuples and the information
            available about each event in the data stream.
        """
        events = {}
        time_start = self._get_start_time()
        for attr in self.root["events"]:
            event_id = int(assertget(attr, "id" if "id" in attr else "eventId"))
            eventtype = attr.get("type", {})
            start_x = float(assertget(attr, "x"))
            start_y = float(assertget(attr, "y"))
            minute = int(assertget(attr, "expandedMinute"))
            second = int(attr.get("second", 0))
            # Qualifiers without an explicit value are stored as True.
            qualifiers = {
                int(q["type"]["value"]): q.get("value", True)
                for q in attr.get("qualifiers", [])
            }
            # End coordinates: explicit field first, then whatever the
            # qualifier helpers yield, and finally the start coordinate.
            end_x = attr.get("endX", _get_end_x(qualifiers))
            end_y = attr.get("endY", _get_end_y(qualifiers))
            outcome_type = attr.get("outcomeType")
            events[(self.game_id, event_id)] = {
                "game_id": self.game_id,
                "event_id": event_id,
                "period_id": self._get_period_id(attr),
                "team_id": int(assertget(attr, "teamId")),
                "player_id": self._optional_int(attr.get("playerId")),
                "type_id": int(assertget(eventtype, "value")),
                "timestamp": time_start + timedelta(seconds=minute * 60 + second),
                "minute": minute,
                "second": second,
                "outcome": (
                    bool(outcome_type.get("value")) if outcome_type is not None else None
                ),
                "start_x": start_x,
                "start_y": start_y,
                "end_x": end_x if end_x is not None else start_x,
                "end_y": end_y if end_y is not None else start_y,
                "qualifiers": qualifiers,
                "related_player_id": self._optional_int(attr.get("relatedPlayerId")),
                "touch": bool(attr.get("isTouch", False)),
                "goal": bool(attr.get("isGoal", False)),
                "shot": bool(attr.get("isShot", False)),
            }

        return events

    def extract_substitutions(self) -> dict[tuple[int, int], dict[str, Any]]:
        """Return a dictionary with all substitution events.

        Returns
        -------
        dict
            A mapping between (game ID, player ID) tuples and the information
            available about each substitution in the data stream. The player
            ID in the key is the ID of the player coming on.
        """
        subs = {}
        for e in self.root["events"]:
            if e["type"].get("value") != self._SUB_ON_TYPE_ID:
                continue
            player_in_id = int(assertget(e, "playerId"))
            subs[(self.game_id, player_in_id)] = {
                "game_id": self.game_id,
                "team_id": int(assertget(e, "teamId")),
                "period_id": self._get_period_id(e),
                "period_milliseconds": self._get_period_milliseconds(e),
                "player_in_id": player_in_id,
                "player_out_id": int(assertget(e, "relatedPlayerId")),
            }
        return subs

    def extract_positions(self) -> dict[tuple[int, int, int], dict[str, Any]]:
        """Return a dictionary with each player's position during a game.

        Returns
        -------
        dict
            A mapping between (game ID, player ID, epoch ID) tuples and the
            information available about each player's position in the data
            stream. The epoch ID is the expanded minute at which the
            formation starts.

        Raises
        ------
        MissingDataError
            If a formation is present but 'periodEndMinutes' lists no periods.
        """
        positions = {}
        for t in (self.root["home"], self.root["away"]):
            team_id = int(assertget(t, "teamId"))
            for f in assertget(t, "formations"):
                fpositions = assertget(f, "formationPositions")
                player_ids = assertget(f, "playerIds")
                formation = assertget(f, "formationName")
                start_minute = int(assertget(f, "startMinuteExpanded"))
                end_minute = int(assertget(f, "endMinuteExpanded"))

                period_id = self._get_period_of_minute(start_minute)
                period_minute = self._minute_in_period(period_id, start_minute)

                # Slot i of the formation is occupied by the i-th listed
                # player; 'playerIds' is indexed (rather than zipped) so that
                # a too-short list still fails loudly.
                for i, p in enumerate(fpositions):
                    player_id = int(player_ids[i])
                    x = float(assertget(p, "vertical"))
                    y = float(assertget(p, "horizontal"))
                    positions[(self.game_id, player_id, start_minute)] = {
                        "game_id": self.game_id,
                        "team_id": team_id,
                        "player_id": player_id,
                        "period_id": period_id,
                        "period_milliseconds": period_minute * 60 * 1000,
                        "start_milliseconds": start_minute * 60 * 1000,
                        "end_milliseconds": end_minute * 60 * 1000,
                        "formation_scheme": formation,
                        "player_position": _position_mapping(formation, x, y),
                        "player_position_x": x,
                        "player_position_y": y,
                    }
        return positions

    def extract_teamgamestats(self) -> dict[tuple[int, int], dict[str, Any]]:
        """Return some aggregated statistics of each team in a game.

        Returns
        -------
        dict
            A mapping between (game ID, team ID) tuples and the aggregated
            statistics of each team. Only stats stored as a dict are
            aggregated (by summing their values); stats whose snake_case name
            ends in "success" are left out.
        """
        teams_gamestats = {}
        for team in (self.root["home"], self.root["away"]):
            team_id = int(assertget(team, "teamId"))
            statsdict = {
                _camel_to_snake(name): sum(stat.values())
                for name, stat in team["stats"].items()
                if isinstance(stat, dict)
            }

            scores = assertget(team, "scores")
            teams_gamestats[(self.game_id, team_id)] = dict(
                game_id=self.game_id,
                team_id=team_id,
                side=assertget(team, "field"),
                score=assertget(scores, "fulltime"),
                shootout_score=scores.get("penalty"),
                # `_camel_to_snake` lower-cases the stat names, so the suffix
                # to filter on is "success" (as in `extract_playergamestats`).
                **{k: v for k, v in statsdict.items() if not k.endswith("success")},
            )

        return teams_gamestats

    def extract_playergamestats(self) -> dict[tuple[int, int], dict[str, Any]]:
        """Return some aggregated statistics of each player in a game.

        Returns
        -------
        dict
            A mapping between (game ID, player ID) tuples and the aggregated
            statistics of each player. Stats whose snake_case name ends in
            "success" are left out. 'minute_start' and 'minute_end' are only
            present for players for whom they could be determined.
        """
        players_gamestats = {}
        for team in (self.root["home"], self.root["away"]):
            team_id = int(assertget(team, "teamId"))
            # Expanded minute of each "Red" / "SecondYellow" card, by player.
            # The key is int-cast to match the int-cast lookup below.
            red_cards = {
                int(e["playerId"]): e["expandedMinute"]
                for e in team.get("incidentEvents", [])
                if "cardType" in e
                and e["cardType"]["displayName"] in ["Red", "SecondYellow"]
                and "playerId" in e
            }
            for player in team["players"]:
                statsdict = {
                    _camel_to_snake(name): sum(stat.values())
                    for name, stat in player["stats"].items()
                }

                player_id = int(assertget(player, "playerId"))
                p = dict(
                    game_id=self.game_id,
                    team_id=team_id,
                    player_id=player_id,
                    is_starter=bool(player.get("isFirstEleven", False)),
                    position_code=player.get("position"),
                    jersey_number=int(player.get("shirtNo", 0)),
                    mvp=bool(player.get("isManOfTheMatch", False)),
                    **{k: v for k, v in statsdict.items() if not k.endswith("success")},
                )
                if "subbedInExpandedMinute" in player:
                    p["minute_start"] = player["subbedInExpandedMinute"]
                if "subbedOutExpandedMinute" in player:
                    p["minute_end"] = player["subbedOutExpandedMinute"]
                # A red card takes precedence over a substitution minute.
                if player_id in red_cards:
                    p["minute_end"] = red_cards[player_id]

                # Default for players that never entered the pitch.
                p["minutes_played"] = 0

                if p["is_starter"]:
                    # Starters are on the pitch from minute 0 until they
                    # leave it, or until the end of the game.
                    p["minute_start"] = 0
                    if "minute_end" not in p:
                        p["minute_end"] = self.root["expandedMaxMinute"]
                    p["minutes_played"] = p["minute_end"]
                elif "minute_start" in p:
                    # Substitutes play from their entry until they leave the
                    # pitch, or until the end of the game.
                    if "minute_end" not in p:
                        p["minute_end"] = self.root["expandedMaxMinute"]
                    p["minutes_played"] = p["minute_end"] - p["minute_start"]

                players_gamestats[(self.game_id, player_id)] = p
        return players_gamestats
|
|
|
|
|
def _camel_to_snake(name: str) -> str: |
|
s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name) |
|
return re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1).lower() |
|
|