socr / data /opta /parsers /whoscored.py
scfive's picture
Upload 203 files
d6ea71e verified
"""JSON parser for WhoScored feeds."""
import json # type: ignore
import re
from datetime import datetime, timedelta
from typing import Any, Optional
from ...base import MissingDataError
from .base import OptaParser, _get_end_x, _get_end_y, assertget
def _position_mapping(formation: str, x: float, y: float) -> str:
if x == 0 and y == 5:
return "GK"
return "Unknown"
class WhoScoredParser(OptaParser):
    """Extract data from a JSON data stream scraped from WhoScored.

    Parameters
    ----------
    path : str
        Path of the data file.
    competition_id : int, optional
        ID of the competition to which the provided data file belongs. If
        None, this information is extracted from a field 'competition_id' in
        the JSON.
    season_id : int, optional
        ID of the season to which the provided data file belongs. If None,
        this information is extracted from a field 'season_id' in the JSON.
    game_id : int, optional
        ID of the game to which the provided data file belongs. If None, this
        information is extracted from a field 'game_id' in the JSON.

    Raises
    ------
    MissingDataError
        If one of the IDs is neither passed as an argument nor available in
        the JSON.

    Notes
    -----
    Mandatory fields are read with ``assertget`` (imported from ``.base``).
    The constructor relies on it raising ``AssertionError`` when a field is
    missing; the extraction methods let that error propagate.
    """

    # Event type value of the events reported as substitutions. For these
    # events ``playerId`` is stored as the player coming on and
    # ``relatedPlayerId`` as the player going off.
    _SUBSTITUTION_TYPE_ID = 19
    # Period values for which the time within the period is reported as 0.
    _PRE_MATCH_PERIOD_ID = 16
    _POST_GAME_PERIOD_ID = 14
    # Format of the 'startTime' field.
    _START_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S"

    def __init__(
        self,
        path: str,
        competition_id: Optional[int] = None,
        season_id: Optional[int] = None,
        game_id: Optional[int] = None,
    ) -> None:
        with open(path, encoding="utf-8") as fh:
            self.root = json.load(fh)

        # The messages keep their original line breaks so that the text seen
        # by callers does not change.
        self.competition_id = self._resolve_id(
            competition_id,
            "competition_id",
            "Could not determine the competition id. Add it to the\n"
            "file path or include a field 'competition_id' in the\n"
            "JSON.",
        )
        self.season_id = self._resolve_id(
            season_id,
            "season_id",
            "Could not determine the season id. Add it to the file\n"
            "path or include a field 'season_id' in the JSON.",
        )
        self.game_id = self._resolve_id(
            game_id,
            "game_id",
            "Could not determine the game id. Add it to the file\n"
            "path or include a field 'game_id' in the JSON.",
        )

    def _resolve_id(self, value: Optional[int], field: str, error_message: str) -> int:
        """Return `value`, or the integer stored under `field` in the JSON if it is None.

        Raises
        ------
        MissingDataError
            With `error_message` if `value` is None and the field cannot be
            read from the JSON.
        """
        if value is not None:
            return value
        try:
            return int(assertget(self.root, field))
        except AssertionError as e:
            raise MissingDataError(error_message) from e

    @staticmethod
    def _int_or_none(value: Any) -> Optional[int]:
        """Convert an optional JSON value to int, mapping a missing or null value to None."""
        return None if value is None else int(value)

    @staticmethod
    def _period_at(expanded_minute: int, period_end_minutes: dict[Any, Any]) -> int:
        """Return the ID of the first period that ends after `expanded_minute`.

        Period IDs are compared numerically, so string keys such as "10" are
        ordered after "2". If no period ends after the given minute, the last
        period is returned.

        Raises
        ------
        MissingDataError
            If `period_end_minutes` contains no periods.
        """
        period_ids = sorted(period_end_minutes, key=int)
        if not period_ids:
            raise MissingDataError(
                "Could not determine the period of a formation: 'periodEndMinutes' is empty."
            )
        for period_id in period_ids:
            if period_end_minutes[period_id] > expanded_minute:
                return int(period_id)
        return int(period_ids[-1])

    def _get_period_id(self, event: dict[str, Any]) -> int:
        """Return the integer stored under ``event["period"]["value"]``."""
        period = assertget(event, "period")
        return int(assertget(period, "value"))

    def _get_period_milliseconds(self, event: dict[str, Any]) -> int:
        """Return the time of `event` in milliseconds since the start of its period.

        Pre-match and post-game events are reported at 0. For periods after
        the first, the minute limit of the preceding period (taken from
        'periodMinuteLimits') is subtracted from the event's minute.
        """
        period_minute_limits = assertget(self.root, "periodMinuteLimits")
        period_id = self._get_period_id(event)
        if period_id in (self._PRE_MATCH_PERIOD_ID, self._POST_GAME_PERIOD_ID):
            return 0
        period_minute = int(assertget(event, "minute"))
        if period_id > 1:
            period_minute -= period_minute_limits[str(period_id - 1)]
        period_second = period_minute * 60 + int(event.get("second", 0))
        return period_second * 1000

    def extract_games(self) -> dict[int, dict[str, Any]]:
        """Return a dictionary with all available games.

        Returns
        -------
        dict
            A mapping between game IDs and the information available about
            each game in the data stream. Optional fields that are absent or
            null in the JSON are reported as None.
        """
        team_home = assertget(self.root, "home")
        team_away = assertget(self.root, "away")
        # 'referee' may be absent or null; treat both as "no referee info".
        referee = self.root.get("referee") or {}
        game_dict = {
            # Fields required by the base schema
            "game_id": self.game_id,
            "season_id": self.season_id,
            "competition_id": self.competition_id,
            "game_day": None,  # Cannot be determined from the data stream
            "game_date": datetime.strptime(
                assertget(self.root, "startTime"), self._START_TIME_FORMAT
            ),  # Dates are UTC
            "home_team_id": int(assertget(team_home, "teamId")),
            "away_team_id": int(assertget(team_away, "teamId")),
            # Optional fields
            "home_score": int(assertget(assertget(team_home, "scores"), "running")),
            "away_score": int(assertget(assertget(team_away, "scores"), "running")),
            "duration": self._int_or_none(self.root.get("expandedMaxMinute")),
            "referee": referee.get("name"),
            "venue": self.root.get("venueName"),
            "attendance": self._int_or_none(self.root.get("attendance")),
            "home_manager": team_home.get("managerName"),
            "away_manager": team_away.get("managerName"),
        }
        return {self.game_id: game_dict}

    def extract_teams(self) -> dict[int, dict[str, Any]]:
        """Return a dictionary with all available teams.

        Returns
        -------
        dict
            A mapping between team IDs and the information available about
            each team in the data stream.
        """
        teams = {}
        for side in (self.root["home"], self.root["away"]):
            team_id = int(assertget(side, "teamId"))
            teams[team_id] = {
                # Fields required by the base schema
                "team_id": team_id,
                "team_name": assertget(side, "name"),
            }
        return teams

    def extract_players(self) -> dict[tuple[int, int], dict[str, Any]]:
        """Return a dictionary with all available players.

        Returns
        -------
        dict
            A mapping between (game ID, player ID) tuples and the information
            available about each player in the data stream.
        """
        game_id = self.game_id
        player_gamestats = self.extract_playergamestats()
        players = {}
        for team in (self.root["home"], self.root["away"]):
            team_id = int(assertget(team, "teamId"))
            for p in team["players"]:
                player_id = int(assertget(p, "playerId"))
                player_name = assertget(p, "name")
                gamestats = player_gamestats[(game_id, player_id)]
                # Age, height and weight are deliberately left out: WhoScored
                # reports player details for the current date, not for the
                # game date.
                players[(game_id, player_id)] = {
                    # Fields required by the base schema
                    "game_id": game_id,
                    "team_id": team_id,
                    "player_id": player_id,
                    "player_name": player_name,
                    "is_starter": bool(p.get("isFirstEleven", False)),
                    "minutes_played": gamestats["minutes_played"],
                    "jersey_number": gamestats["jersey_number"],
                    # Fields required by the opta schema
                    "starting_position": gamestats["position_code"],
                }
        return players

    def extract_events(self) -> dict[tuple[int, int], dict[str, Any]]:
        """Return a dictionary with all available events.

        Returns
        -------
        dict
            A mapping between (game ID, event ID) tuples and the information
            available about each event in the data stream.
        """
        events = {}
        time_start = datetime.strptime(
            assertget(self.root, "startTime"), self._START_TIME_FORMAT
        )
        for attr in self.root["events"]:
            # The event identifier is stored under 'id' or, failing that, 'eventId'.
            event_id = int(assertget(attr, "id" if "id" in attr else "eventId"))
            eventtype = attr.get("type", {})
            start_x = float(assertget(attr, "x"))
            start_y = float(assertget(attr, "y"))
            minute = int(assertget(attr, "expandedMinute"))
            second = int(attr.get("second", 0))
            # Qualifiers without an explicit value are stored as True.
            qualifiers = {
                int(q["type"]["value"]): q.get("value", True)
                for q in attr.get("qualifiers", [])
            }
            end_x = attr.get("endX", _get_end_x(qualifiers))
            end_y = attr.get("endY", _get_end_y(qualifiers))
            outcome_type = attr.get("outcomeType")
            events[(self.game_id, event_id)] = {
                # Fields required by the base schema
                "game_id": self.game_id,
                "event_id": event_id,
                "period_id": self._get_period_id(attr),
                "team_id": int(assertget(attr, "teamId")),
                "player_id": self._int_or_none(attr.get("playerId")),
                "type_id": int(assertget(eventtype, "value")),
                # Fields required by the opta schema
                # A timestamp is not available in the data stream. The
                # returned timestamp (kick-off time plus the expanded minute
                # and second) is not accurate, but sufficient for
                # compatibility with the other Opta data streams.
                "timestamp": time_start + timedelta(seconds=minute * 60 + second),
                "minute": minute,
                "second": second,
                "outcome": (
                    bool(outcome_type.get("value")) if outcome_type is not None else None
                ),
                "start_x": start_x,
                "start_y": start_y,
                # Events without an end location end where they started.
                "end_x": end_x if end_x is not None else start_x,
                "end_y": end_y if end_y is not None else start_y,
                "qualifiers": qualifiers,
                # Optional fields
                "related_player_id": self._int_or_none(attr.get("relatedPlayerId")),
                "touch": bool(attr.get("isTouch", False)),
                "goal": bool(attr.get("isGoal", False)),
                "shot": bool(attr.get("isShot", False)),
            }
        return events

    def extract_substitutions(self) -> dict[tuple[int, int], dict[str, Any]]:
        """Return a dictionary with all substitution events.

        Returns
        -------
        dict
            A mapping between (game ID, player ID) tuples and the information
            available about each substitution in the data stream. The player
            ID in the key is the ID of the player coming on.
        """
        subs = {}
        for e in self.root["events"]:
            # Events without a type are skipped rather than raising.
            if e.get("type", {}).get("value") != self._SUBSTITUTION_TYPE_ID:
                continue
            player_in_id = int(assertget(e, "playerId"))
            subs[(self.game_id, player_in_id)] = {
                "game_id": self.game_id,
                "team_id": int(assertget(e, "teamId")),
                "period_id": self._get_period_id(e),
                "period_milliseconds": self._get_period_milliseconds(e),
                "player_in_id": player_in_id,
                "player_out_id": int(assertget(e, "relatedPlayerId")),
            }
        return subs

    def extract_positions(self) -> dict[tuple[int, int, int], dict[str, Any]]:
        """Return a dictionary with each player's position during a game.

        Returns
        -------
        dict
            A mapping between (game ID, player ID, epoch ID) tuples and the
            information available about each player's position in the data
            stream. The epoch ID is the expanded minute in which the
            formation started.

        Raises
        ------
        MissingDataError
            If 'periodEndMinutes' contains no periods.
        """
        positions = {}
        for team in (self.root["home"], self.root["away"]):
            team_id = int(assertget(team, "teamId"))
            for formation in assertget(team, "formations"):
                slots = assertget(formation, "formationPositions")
                player_ids = assertget(formation, "playerIds")
                scheme = assertget(formation, "formationName")
                period_end_minutes = assertget(self.root, "periodEndMinutes")
                period_minute_limits = assertget(self.root, "periodMinuteLimits")
                start_minute = int(assertget(formation, "startMinuteExpanded"))
                end_minute = int(assertget(formation, "endMinuteExpanded"))
                period_id = self._period_at(start_minute, period_end_minutes)
                # Express the start minute relative to the start of its period.
                period_minute = start_minute
                if period_id > 1:
                    period_minute = start_minute - period_minute_limits[str(period_id - 1)]
                # The i-th formation slot belongs to the i-th listed player.
                for i, slot in enumerate(slots):
                    player_id = int(player_ids[i])
                    x = float(assertget(slot, "vertical"))
                    y = float(assertget(slot, "horizontal"))
                    positions[(self.game_id, player_id, start_minute)] = {
                        "game_id": self.game_id,
                        "team_id": team_id,
                        "player_id": player_id,
                        "period_id": period_id,
                        "period_milliseconds": period_minute * 60 * 1000,
                        "start_milliseconds": start_minute * 60 * 1000,
                        "end_milliseconds": end_minute * 60 * 1000,
                        "formation_scheme": scheme,
                        "player_position": _position_mapping(scheme, x, y),
                        "player_position_x": x,
                        "player_position_y": y,
                    }
        return positions

    def extract_teamgamestats(self) -> dict[tuple[int, int], dict[str, Any]]:
        """Return some aggregated statistics of each team in a game.

        Every statistic that is stored as a mapping is summed over its values
        and reported under its snake_case name. Statistics whose name ends in
        "success" are left out, as is done for the player statistics.

        Returns
        -------
        dict
            A mapping between (game ID, team ID) tuples and the aggregated
            statistics of each team.
        """
        teams_gamestats = {}
        for team in (self.root["home"], self.root["away"]):
            team_id = int(assertget(team, "teamId"))
            statsdict = {
                _camel_to_snake(name): sum(values.values())
                for name, values in team["stats"].items()
                if isinstance(values, dict)
            }
            scores = assertget(team, "scores")
            teams_gamestats[(self.game_id, team_id)] = dict(
                game_id=self.game_id,
                team_id=team_id,
                side=assertget(team, "field"),
                score=assertget(scores, "fulltime"),
                shootout_score=scores.get("penalty"),
                # Names are snake_case at this point, so the suffix to match
                # is lower-case.
                **{k: v for k, v in statsdict.items() if not k.endswith("success")},
            )
        return teams_gamestats

    def extract_playergamestats(self) -> dict[tuple[int, int], dict[str, Any]]:  # noqa: C901
        """Return some aggregated statistics of each player in a game.

        Besides the summed statistics, each record holds 'minutes_played'
        and, for players who took part in the game, 'minute_start' and
        'minute_end' (expanded minutes).

        Returns
        -------
        dict
            A mapping between (game ID, player ID) tuples and the aggregated
            statistics of each player.
        """
        players_gamestats = {}
        for team in (self.root["home"], self.root["away"]):
            team_id = int(assertget(team, "teamId"))
            # Expanded minute in which a player was sent off, keyed by the
            # integer player ID used for the lookup below.
            red_cards = {
                int(e["playerId"]): e["expandedMinute"]
                for e in team.get("incidentEvents", [])
                if "cardType" in e
                and e["cardType"]["displayName"] in ["Red", "SecondYellow"]
                and "playerId" in e  # not defined if a coach receives a red card
            }
            for player in team["players"]:
                statsdict = {
                    _camel_to_snake(name): sum(stat.values())
                    for name, stat in player["stats"].items()
                    if isinstance(stat, dict)
                }
                player_id = int(assertget(player, "playerId"))
                p = dict(
                    game_id=self.game_id,
                    team_id=team_id,
                    player_id=player_id,
                    is_starter=bool(player.get("isFirstEleven", False)),
                    position_code=player.get("position", None),
                    jersey_number=int(player.get("shirtNo", 0)),
                    mvp=bool(player.get("isManOfTheMatch", False)),
                    **{k: v for k, v in statsdict.items() if not k.endswith("success")},
                )
                if "subbedInExpandedMinute" in player:
                    p["minute_start"] = player["subbedInExpandedMinute"]
                if "subbedOutExpandedMinute" in player:
                    p["minute_end"] = player["subbedOutExpandedMinute"]
                # A sending-off ends the player's game, overriding any
                # recorded substitution minute.
                if player_id in red_cards:
                    p["minute_end"] = red_cards[player_id]

                # Did not play (default, overwritten below for participants)
                p["minutes_played"] = 0
                if p["is_starter"]:
                    p["minute_start"] = 0
                    # Played the full game unless substituted or sent off
                    if "minute_end" not in p:
                        p["minute_end"] = self.root["expandedMaxMinute"]
                    p["minutes_played"] = p["minute_end"]
                elif "minute_start" in p:
                    # Substituted in; played the remainder of the game unless
                    # substituted or sent off
                    if "minute_end" not in p:
                        p["minute_end"] = self.root["expandedMaxMinute"]
                    p["minutes_played"] = p["minute_end"] - p["minute_start"]
                players_gamestats[(self.game_id, player_id)] = p
        return players_gamestats
def _camel_to_snake(name: str) -> str:
s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
return re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1).lower()