|
"""XML parser for Opta F7 feeds.""" |
|
|
|
from datetime import datetime |
|
from typing import Any |
|
|
|
from lxml import objectify |
|
|
|
from .base import OptaXMLParser, assertget |
|
|
|
|
|
class F7XMLParser(OptaXMLParser):
    """Extract data from an Opta F7 data stream.

    Parameters
    ----------
    path : str
        Path of the data file.
    """

    def _get_doc(self) -> objectify.ObjectifiedElement:
        """Return the ``SoccerDocument`` element found under ``self.root``."""
        return self.root.find("SoccerDocument")

    def _get_stats(self, obj: objectify.ObjectifiedElement) -> dict[str, Any]:
        """Collect the ``Stat`` children of an element.

        Parameters
        ----------
        obj : objectify.ObjectifiedElement
            Element whose direct ``Stat`` children should be read.

        Returns
        -------
        dict
            A mapping between the ``Type`` attribute of each ``Stat`` child
            and its text. Empty if the element has no ``Stat`` children.
        """
        # ``iterchildren`` yields nothing when there is no ``Stat`` child,
        # whereas iterating over ``obj.find("Stat")`` would fail on ``None``.
        return {stat.attrib["Type"]: stat.text for stat in obj.iterchildren("Stat")}

    def _get_name(self, obj: objectify.ObjectifiedElement) -> str:
        """Return the display name stored in a name element.

        Parameters
        ----------
        obj : objectify.ObjectifiedElement
            Element with ``First`` and ``Last`` children and optionally a
            ``Known`` child.

        Returns
        -------
        str
            The text of the ``Known`` child if it is present and non-empty,
            otherwise the first and last name joined by a space.
        """
        # The ``in`` operator of an lxml element only matches child *elements*,
        # never tag names, so the child has to be looked up by attribute access.
        known = getattr(obj, "Known", None)
        if known is not None and known.text:
            return known.text
        return obj.First + " " + obj.Last

    def extract_competitions(self) -> dict[tuple[int, int], dict[str, Any]]:
        """Return a dictionary with all available competitions.

        Returns
        -------
        dict
            A mapping between (competition ID, season ID) tuples and the
            information available about each competition in the data stream.
        """
        optadocument = self._get_doc()
        competition = optadocument.Competition
        # IDs are stored with a one-character prefix that is stripped here.
        competition_id = int(competition.attrib["uID"][1:])
        stats = self._get_stats(competition)
        season_id = int(assertget(stats, "season_id"))
        competition_dict = {
            "competition_id": competition_id,
            "season_id": season_id,
            "season_name": assertget(stats, "season_name"),
            "competition_name": competition.Name.text,
        }
        return {(competition_id, season_id): competition_dict}

    def extract_games(self) -> dict[int, dict[str, Any]]:
        """Return a dictionary with all available games.

        Returns
        -------
        dict
            A mapping between game IDs and the information available about
            each game in the data stream.
        """
        optadocument = self._get_doc()
        competition = optadocument.Competition
        competition_id = int(competition.attrib["uID"][1:])
        competition_stats = self._get_stats(competition)
        match_data = optadocument.MatchData
        match_info = match_data.MatchInfo
        game_id = int(optadocument.attrib["uID"][1:])
        stats = self._get_stats(match_data)

        # One ``TeamData`` element per side, keyed by its ``Side`` attribute.
        team_data_elms = {t.attrib["Side"]: t for t in match_data.iterchildren("TeamData")}

        # Find the manager of each side. A ``Team`` element belongs to the
        # home side if its ID matches the home ``TeamRef``; any other team is
        # treated as the away side.
        team_officials = {}
        for team_elm in optadocument.iterchildren("Team"):
            home_team_id = int(team_data_elms["Home"].attrib["TeamRef"][1:])
            side = "Home" if home_team_id == int(team_elm.attrib["uID"][1:]) else "Away"
            for official in team_elm.iterchildren("TeamOfficial"):
                if official.attrib["Type"] == "Manager":
                    team_officials[side] = official

        home_attrib = assertget(team_data_elms, "Home").attrib
        away_attrib = assertget(team_data_elms, "Away").attrib

        game_dict = {
            "game_id": game_id,
            "season_id": int(assertget(competition_stats, "season_id")),
            "competition_id": competition_id,
            "game_day": (
                int(competition_stats["matchday"]) if "matchday" in competition_stats else None
            ),
            # The UTC offset is parsed and then dropped without conversion,
            # so the result is a naive datetime with the feed's wall-clock time.
            "game_date": datetime.strptime(match_info.Date.text, "%Y%m%dT%H%M%S%z").replace(
                tzinfo=None
            ),
            "home_team_id": int(assertget(home_attrib, "TeamRef")[1:]),
            "away_team_id": int(assertget(away_attrib, "TeamRef")[1:]),
            "home_score": int(assertget(home_attrib, "Score")),
            "away_score": int(assertget(away_attrib, "Score")),
            "duration": int(stats["match_time"]),
            "referee": self._get_name(match_data.MatchOfficial.OfficialName),
            "venue": optadocument.Venue.Name.text,
            "attendance": int(match_info.Attendance),
            "home_manager": (
                self._get_name(team_officials["Home"].PersonName)
                if "Home" in team_officials
                else None
            ),
            "away_manager": (
                self._get_name(team_officials["Away"].PersonName)
                if "Away" in team_officials
                else None
            ),
        }
        return {game_id: game_dict}

    def extract_teams(self) -> dict[int, dict[str, Any]]:
        """Return a dictionary with all available teams.

        Returns
        -------
        dict
            A mapping between team IDs and the information available about
            each team in the data stream.
        """
        optadocument = self._get_doc()
        teams = {}
        for team_elm in optadocument.iterchildren("Team"):
            team_id = int(assertget(team_elm.attrib, "uID")[1:])
            teams[team_id] = {
                "team_id": team_id,
                "team_name": team_elm.Name.text,
            }
        return teams

    def extract_lineups(self) -> dict[int, dict[str, Any]]:
        """Return a dictionary with the lineup of each team.

        Returns
        -------
        dict
            A mapping between team IDs and the information available about
            each team's lineup in the data stream.
        """
        optadocument = self._get_doc()
        stats = self._get_stats(optadocument.MatchData)

        lineups = {}
        for team_elm in optadocument.MatchData.iterchildren("TeamData"):
            team_id = int(team_elm.attrib["TeamRef"][1:])
            lineups[team_id] = {
                "formation": team_elm.attrib["Formation"],
                "score": int(team_elm.attrib["Score"]),
                "side": team_elm.attrib["Side"],
                "players": {},
            }

            # Attribute mappings of all substitutions of this team.
            subst = [subst_elm.attrib for subst_elm in team_elm.iterchildren("Substitution")]

            # Minute of dismissal for every player who was sent off.
            red_cards = {
                int(booking_elm.attrib["PlayerRef"][1:]): int(booking_elm.attrib["Min"])
                for booking_elm in team_elm.iterchildren("Booking")
                if booking_elm.attrib.get("CardType") in ("Red", "SecondYellow")
                and "PlayerRef" in booking_elm.attrib
            }

            for player_elm in team_elm.PlayerLineUp.iterchildren("MatchPlayer"):
                player_id = int(player_elm.attrib["PlayerRef"][1:])
                player_ref = f"p{player_id}"

                # Minute the player entered the pitch: the time of the
                # substitution that brought him on, else the end of the match
                # for an unused substitute and 0 for a starter. Substitutions
                # flagged "Retired" are skipped (presumably they have no
                # incoming player -- verify against the feed specification).
                sub_on = int(
                    next(
                        (
                            item["Time"]
                            for item in subst
                            if "Retired" not in item and item["SubOn"] == player_ref
                        ),
                        stats["match_time"] if player_elm.attrib["Status"] == "Sub" else 0,
                    )
                )

                # Minute the player left the pitch: the time he was
                # substituted off, else the minute of his red card, else the
                # end of the match.
                sub_off = int(
                    next(
                        (item["Time"] for item in subst if item["SubOff"] == player_ref),
                        (
                            stats["match_time"]
                            if player_id not in red_cards
                            else red_cards[player_id]
                        ),
                    )
                )

                # An unused substitute who is sent off from the bench would
                # otherwise end up with a negative number of minutes.
                minutes_played = max(sub_off - sub_on, 0)

                formation_place = int(player_elm.attrib["Formation_Place"])
                lineups[team_id]["players"][player_id] = {
                    "starting_position_id": formation_place,
                    "starting_position_name": player_elm.attrib["Position"],
                    "jersey_number": int(player_elm.attrib["ShirtNumber"]),
                    # A formation place of 0 marks a player who did not start.
                    "is_starter": formation_place != 0,
                    "minutes_played": minutes_played,
                }
        return lineups

    def extract_players(self) -> dict[tuple[int, int], dict[str, Any]]:
        """Return a dictionary with all available players.

        Returns
        -------
        dict
            A mapping between (game ID, player ID) tuples and the information
            available about each player in the data stream.

        Raises
        ------
        KeyError
            If a player listed under a ``Team`` element does not appear in
            that team's lineup.
        """
        optadocument = self._get_doc()
        game_id = int(optadocument.attrib["uID"][1:])
        lineups = self.extract_lineups()

        players = {}
        for team_elm in optadocument.iterchildren("Team"):
            team_id = int(team_elm.attrib["uID"][1:])
            for player_elm in team_elm.iterchildren("Player"):
                player_id = int(player_elm.attrib["uID"][1:])
                lineup_entry = lineups[team_id]["players"][player_id]
                players[(game_id, player_id)] = {
                    "game_id": game_id,
                    "team_id": team_id,
                    "player_id": player_id,
                    "player_name": self._get_name(player_elm.PersonName),
                    "is_starter": lineup_entry["is_starter"],
                    "minutes_played": lineup_entry["minutes_played"],
                    "jersey_number": lineup_entry["jersey_number"],
                    "starting_position": lineup_entry["starting_position_name"],
                }

        return players
|
|