socr / data /opta /parsers /f24_xml.py
scfive's picture
Upload 203 files
d6ea71e verified
"""XML parser for Opta F24 feeds."""
from datetime import datetime
from typing import Any
from lxml import objectify
from .base import OptaXMLParser, _get_end_x, _get_end_y, assertget
class F24XMLParser(OptaXMLParser):
"""Extract data from a Opta F24 data stream.
Parameters
----------
path : str
Path of the data file.
"""
def _get_doc(self) -> objectify.ObjectifiedElement:
return self.root
def extract_games(self) -> dict[int, dict[str, Any]]:
"""Return a dictionary with all available games.
Returns
-------
dict
A mapping between game IDs and the information available about
each game in the data stream.
"""
optadocument = self._get_doc()
game_elem = optadocument.find("Game")
attr = game_elem.attrib
game_id = int(assertget(attr, "id"))
game_dict = {
# Fields required by the base schema
"game_id": game_id,
"season_id": int(assertget(attr, "season_id")),
"competition_id": int(assertget(attr, "competition_id")),
"game_day": int(assertget(attr, "matchday")),
"game_date": datetime.strptime(assertget(attr, "game_date"), "%Y-%m-%dT%H:%M:%S"),
"home_team_id": int(assertget(attr, "home_team_id")),
"away_team_id": int(assertget(attr, "away_team_id")),
# Optional fields
"home_score": int(assertget(attr, "home_score")),
"away_score": int(assertget(attr, "away_score")),
# duration=?
# referee=?
# venue=?
# attendance=?
# home_manager=?
# away_manager=?
}
return {game_id: game_dict}
def extract_events(self) -> dict[tuple[int, int], dict[str, Any]]:
"""Return a dictionary with all available events.
Returns
-------
dict
A mapping between (game ID, event ID) tuples and the information
available about each event in the data stream.
"""
optadocument = self._get_doc()
game_elm = optadocument.find("Game")
game_id = int(assertget(game_elm.attrib, "id"))
events = {}
for event_elm in game_elm.iterchildren("Event"):
attr = dict(event_elm.attrib)
event_id = int(assertget(attr, "id"))
qualifiers = {
int(qualifier_elm.attrib["qualifier_id"]): qualifier_elm.attrib.get("value")
for qualifier_elm in event_elm.iterchildren("Q")
}
start_x = float(assertget(attr, "x"))
start_y = float(assertget(attr, "y"))
end_x = _get_end_x(qualifiers)
end_y = _get_end_y(qualifiers)
events[(game_id, event_id)] = {
# Fields required by the base schema
"game_id": game_id,
"event_id": event_id,
"period_id": int(assertget(attr, "period_id")),
"team_id": int(assertget(attr, "team_id")),
"player_id": int(attr["player_id"]) if "player_id" in attr else None,
"type_id": int(assertget(attr, "type_id")),
# type_name=?, # added in the opta loader
# Fields required by the opta schema
"timestamp": datetime.strptime(
assertget(attr, "timestamp"), "%Y-%m-%dT%H:%M:%S.%f"
),
"minute": int(assertget(attr, "min")),
"second": int(assertget(attr, "sec")),
"outcome": bool(int(attr["outcome"])) if "outcome" in attr else None,
"start_x": start_x,
"start_y": start_y,
"end_x": end_x if end_x is not None else start_x,
"end_y": end_y if end_y is not None else start_y,
"qualifiers": qualifiers,
# Optional fields
"assist": bool(int(attr.get("assist", 0))),
"keypass": bool(int(attr.get("keypass", 0))),
}
return events