File size: 4,561 Bytes
d6ea71e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
"""JSON parser for Opta F24 feeds."""

from datetime import datetime
from typing import Any

from ...base import MissingDataError
from .base import OptaJSONParser, _get_end_x, _get_end_y, assertget


class F24JSONParser(OptaJSONParser):
    """Extract data from an Opta F24 data stream.

    Parameters
    ----------
    path : str
        Path of the data file.
    """

    # Layout of the "locale" timestamp strings read from the feed. It has no
    # ``%z`` directive, so every parsed value is a timezone-naive datetime.
    _TIMESTAMP_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"

    def _get_doc(self) -> dict[str, Any]:
        """Return the first node in ``self.root`` whose data holds "Games".

        Returns
        -------
        dict
            The first node for which ``node["data"]`` contains a "Games" key.

        Raises
        ------
        MissingDataError
            If no node in ``self.root`` contains a "Games" entry.
        """
        for node in self.root:
            if "Games" in node["data"]:
                return node
        raise MissingDataError

    def _get_game(self) -> dict[str, Any]:
        """Return the "Game" node nested under ``data -> Games`` of the F24 doc.

        Returns
        -------
        dict
            The value stored at ``doc["data"]["Games"]["Game"]``, with every
            level looked up through ``assertget``.

        Raises
        ------
        MissingDataError
            If the data stream holds no node with a "Games" entry.
        """
        f24 = self._get_doc()
        data = assertget(f24, "data")
        games = assertget(data, "Games")
        return assertget(games, "Game")

    def extract_games(self) -> dict[int, dict[str, Any]]:
        """Return a dictionary with all available games.

        Returns
        -------
        dict
            A mapping between game IDs and the information available about
            each game in the data stream.

        Raises
        ------
        MissingDataError
            If the data stream holds no node with a "Games" entry.
        """
        game = self._get_game()
        attr = assertget(game, "@attributes")

        game_id = int(assertget(attr, "id"))
        game_dict = {
            game_id: {
                # Fields required by the base schema
                "game_id": game_id,
                "season_id": int(assertget(attr, "season_id")),
                "competition_id": int(assertget(attr, "competition_id")),
                "game_day": int(assertget(attr, "matchday")),
                # The parsed value is already timezone-naive (see
                # _TIMESTAMP_FORMAT), so no tzinfo stripping is required.
                "game_date": datetime.strptime(
                    assertget(assertget(attr, "game_date"), "locale"),
                    self._TIMESTAMP_FORMAT,
                ),
                "home_team_id": int(assertget(attr, "home_team_id")),
                "away_team_id": int(assertget(attr, "away_team_id")),
                # Fields required by the opta schema that are not extracted
                # here: home_score, away_score, duration, referee, venue,
                # attendance.
                # Optional fields that are not extracted here: home_manager,
                # away_manager.
            }
        }
        return game_dict

    def extract_events(self) -> dict[tuple[int, int], dict[str, Any]]:
        """Return a dictionary with all available events.

        Events without a "TimeStamp" attribute (or without a "locale" entry
        inside it) get ``None`` as their timestamp.

        Returns
        -------
        dict
            A mapping between (game ID, event ID) tuples and the information
            available about each event in the data stream.

        Raises
        ------
        MissingDataError
            If the data stream holds no node with a "Games" entry.
        """
        game = self._get_game()
        game_attr = assertget(game, "@attributes")
        game_id = int(assertget(game_attr, "id"))

        events: dict[tuple[int, int], dict[str, Any]] = {}
        for element in assertget(game, "Event"):
            attr = element["@attributes"]

            # Parse the timestamp only when one is present: strptime() raises
            # a TypeError on None, which would defeat the None fallback.
            timestamp_node = attr.get("TimeStamp")
            raw_timestamp = timestamp_node.get("locale") if timestamp_node else None
            timestamp = (
                datetime.strptime(raw_timestamp, self._TIMESTAMP_FORMAT)
                if raw_timestamp is not None
                else None
            )

            # Map each qualifier ID to its value; events without a "Q" entry
            # get an empty mapping.
            qualifiers = {
                int(q["@attributes"]["qualifier_id"]): q["@attributes"]["value"]
                for q in element.get("Q", [])
            }
            start_x = float(assertget(attr, "x"))
            start_y = float(assertget(attr, "y"))
            end_x = _get_end_x(qualifiers)
            end_y = _get_end_y(qualifiers)

            event_id = int(assertget(attr, "id"))
            events[(game_id, event_id)] = {
                # Fields required by the base schema
                "game_id": game_id,
                "event_id": event_id,
                "period_id": int(assertget(attr, "period_id")),
                "team_id": int(assertget(attr, "team_id")),
                "player_id": int(assertget(attr, "player_id")),
                "type_id": int(assertget(attr, "type_id")),
                # type_name=?, # added in the opta loader
                # Fields required by the opta schema
                "timestamp": timestamp,
                "minute": int(assertget(attr, "min")),
                "second": int(assertget(attr, "sec")),
                # A missing "outcome" attribute counts as a successful event.
                "outcome": bool(int(attr.get("outcome", 1))),
                "start_x": start_x,
                "start_y": start_y,
                # Fall back to the start location when the qualifiers yield
                # no end coordinate.
                "end_x": end_x if end_x is not None else start_x,
                "end_y": end_y if end_y is not None else start_y,
                "qualifiers": qualifiers,
                # Optional fields
                "assist": bool(int(attr.get("assist", 0))),
                "keypass": bool(int(attr.get("keypass", 0))),
            }
        return events