File size: 11,883 Bytes
d6ea71e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
"""Script for downloading test data."""

import argparse
import logging
import os
import shutil
import ssl
import warnings
from pathlib import Path
from urllib.parse import urlparse
from urllib.request import urlopen, urlretrieve
from zipfile import ZipFile, is_zipfile

import pandas as pd

try:
    from tqdm import tqdm
except ImportError:
    tqdm = None

import socceraction.atomic.spadl as atomicspadl
import socceraction.spadl as spadl
import socceraction.spadl.statsbomb as statsbomb
import socceraction.spadl.wyscout as wyscout
from socceraction.data.statsbomb import StatsBombLoader
from socceraction.data.wyscout import PublicWyscoutLoader

# Silence pandas' PerformanceWarning for the whole process.
warnings.simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
# Silence the "credentials were not supplied" warning (presumably raised by the
# StatsBomb loader when only open data is accessed -- TODO confirm the source).
warnings.filterwarnings(
    action="ignore", message="credentials were not supplied. open data access only"
)


# optional: if you get a SSL CERTIFICATE_VERIFY_FAILED exception
# NOTE(review): this replaces the default HTTPS context factory with one that
# does not verify certificates. It is applied at import time and affects every
# HTTPS request made through urllib in this process, so downloads are not
# protected against tampering. Only acceptable for fetching public test data.
ssl._create_default_https_context = ssl._create_unverified_context

# Directory containing this script; all datasets are stored relative to it.
_data_dir = os.path.dirname(__file__)


def download_statsbomb_data() -> None:
    """Download and extract the StatsBomb open data repository.

    The master-branch archive of the ``statsbomb/open-data`` GitHub repository
    is downloaded to ``<_data_dir>/statsbomb/tmp`` and extracted there. The
    ``data`` directory of the archive then replaces
    ``<_data_dir>/statsbomb/raw``; any previous content of that folder is
    deleted. The temporary folder is removed afterwards, also when the
    download or the extraction fails.
    """
    logging.info("Downloading StatsBomb data")
    dataset_url = "https://github.com/statsbomb/open-data/archive/master.zip"

    tmp_datafolder = os.path.join(_data_dir, "statsbomb", "tmp")
    raw_datafolder = os.path.join(_data_dir, "statsbomb", "raw")
    for datafolder in [tmp_datafolder, raw_datafolder]:
        os.makedirs(datafolder, exist_ok=True)
    statsbombzip = os.path.join(tmp_datafolder, "statsbomb-open-data.zip")

    try:
        # Stream the archive to disk in chunks instead of buffering the whole
        # repository snapshot in memory.
        with urlopen(dataset_url) as dl_file, open(statsbombzip, "wb") as out_file:
            shutil.copyfileobj(dl_file, out_file)

        with ZipFile(statsbombzip, "r") as zipObj:
            zipObj.extractall(tmp_datafolder)

        # The previous raw data is only dropped once the new data has been
        # extracted successfully.
        shutil.rmtree(raw_datafolder)
        Path(f"{tmp_datafolder}/open-data-master/data").rename(raw_datafolder)
    finally:
        # Never leave the archive and its extracted copy behind.
        shutil.rmtree(tmp_datafolder)
    logging.info("Done! Data was saved to %s", raw_datafolder)


def convert_statsbomb_data() -> None:
    """Convert StatsBomb data to SPADL.

    Events are fetched from the remote StatsBomb open data repository for
    every competition whose name is a key of ``leagues`` and whose season id
    is a key of ``seasons``. For each selected competition one HDF5 store
    named ``spadl-<league>-<season>.h5`` is written to
    ``<_data_dir>/statsbomb`` with the keys ``actiontypes``, ``results``,
    ``bodyparts``, ``actions/game_<game_id>``, ``games``, ``teams`` and
    ``players``.
    """
    logging.info("Converting StatsBomb data")
    seasons = {
        3: "2018",
    }
    leagues = {
        "FIFA World Cup": "WorldCup",
    }
    spadl_datafolder = os.path.join(_data_dir, "statsbomb")
    # The stores are written directly into this folder, so make sure it exists
    # even when the download step was not run beforehand.
    os.makedirs(spadl_datafolder, exist_ok=True)

    free_open_data_remote = "https://raw.githubusercontent.com/statsbomb/open-data/master/data/"

    SBL = StatsBombLoader(root=free_open_data_remote, getter="remote")

    # View all available competitions
    df_competitions = SBL.competitions()
    selected_competitions = df_competitions.competition_name.isin(leagues.keys())
    selected_seasons = df_competitions.season_id.isin(seasons.keys())
    df_selected_competitions = df_competitions.loc[selected_competitions & selected_seasons]

    for competition in df_selected_competitions.itertuples():
        # Get games from all selected competition
        games = SBL.games(competition.competition_id, competition.season_id)

        if tqdm is not None:
            # tqdm needs a sized sequence to display the total
            games_verbose = tqdm(list(games.itertuples()), desc="Loading match data")
        else:
            games_verbose = games.itertuples()
        teams, players = [], []

        # Human-readable labels used in the file name and stored with the games
        competition_id = leagues[competition.competition_name]
        season_id = seasons[competition.season_id]
        spadl_h5 = os.path.join(spadl_datafolder, f"spadl-{competition_id}-{season_id}.h5")
        with pd.HDFStore(spadl_h5) as spadlstore:
            spadlstore.put("actiontypes", spadl.actiontypes_df(), format="table")
            spadlstore.put("results", spadl.results_df(), format="table")
            spadlstore.put("bodyparts", spadl.bodyparts_df(), format="table")

            for game in games_verbose:
                # load data
                teams.append(SBL.teams(game.game_id))
                players.append(SBL.players(game.game_id))
                events = SBL.events(game.game_id)

                # convert data
                spadlstore.put(
                    f"actions/game_{game.game_id}",
                    statsbomb.convert_to_actions(events, game.home_team_id),
                    format="table",
                )

            # Bracket assignment always writes a column; attribute assignment
            # would only set a plain attribute if the column were missing.
            games["season_id"] = season_id
            games["competition_id"] = competition_id
            spadlstore.put("games", games)
            # Teams and players were collected per game; keep each one once.
            spadlstore.put(
                "teams",
                pd.concat(teams).drop_duplicates("team_id").reset_index(drop=True),
            )
            spadlstore.put(
                "players",
                pd.concat(players).drop_duplicates("player_id").reset_index(drop=True),
            )
    logging.info("Done! Data was saved to %s", spadl_datafolder)


def download_wyscout_data() -> None:
    """Download and extract the Wyscout public dataset.

    Every file listed in ``dataset_urls`` is saved to
    ``<_data_dir>/wyscout_public/raw`` under the name found in the URL that
    was actually retrieved. Files that are zip archives are additionally
    extracted into the same folder; the archive itself is kept.
    """
    logging.info("Downloading Wyscout data")
    # https://figshare.com/collections/Soccer_match_event_dataset/4415000/5
    dataset_urls = {
        "competitions": "https://ndownloader.figshare.com/files/15073685",
        "teams": "https://ndownloader.figshare.com/files/15073697",
        "players": "https://ndownloader.figshare.com/files/15073721",
        "games": "https://ndownloader.figshare.com/files/14464622",
        "events": "https://ndownloader.figshare.com/files/14464685",
    }

    raw_datafolder = os.path.join(_data_dir, "wyscout_public", "raw")
    os.makedirs(raw_datafolder, exist_ok=True)

    # download and unzip Wyscout open data
    for url in dataset_urls.values():
        # The final URL (after any redirect) carries the real file name. The
        # response is only needed for that, so close it right away instead of
        # leaking the connection.
        with urlopen(url) as response:
            url_obj = response.geturl()
        path = Path(urlparse(url_obj).path)
        file_name = os.path.join(raw_datafolder, path.name)
        file_local, _ = urlretrieve(url_obj, file_name)
        if is_zipfile(file_local):
            with ZipFile(file_local) as zip_file:
                zip_file.extractall(raw_datafolder)
    logging.info("Done! Data was saved to %s", raw_datafolder)


def convert_wyscout_data() -> None:
    """Convert Wyscout data to SPADL.

    The raw data is read from ``<_data_dir>/wyscout_public/raw``. For every
    competition whose id is a key of ``leagues`` one HDF5 store named
    ``spadl-<league>-<season>.h5`` is written to ``<_data_dir>/wyscout_public``
    with the keys ``actiontypes``, ``results``, ``bodyparts``,
    ``actions/game_<game_id>``, ``games``, ``teams`` and ``players``.

    Raises
    ------
    KeyError
        If the season id of a selected competition is not a key of
        ``seasons``.
    """
    logging.info("Converting Wyscout data")
    seasons = {
        10078: "2018",
    }
    leagues = {
        28: "WorldCup",
    }

    raw_datafolder = os.path.join(_data_dir, "wyscout_public", "raw")
    spadl_datafolder = os.path.join(_data_dir, "wyscout_public")

    WYL = PublicWyscoutLoader(root=raw_datafolder)

    # View all available competitions
    df_competitions = WYL.competitions()
    selected_competitions = df_competitions.competition_id.isin(leagues.keys())
    df_selected_competitions = df_competitions.loc[selected_competitions]

    for competition in df_selected_competitions.itertuples():
        # Get games from all selected competition
        games = WYL.games(competition.competition_id, competition.season_id)

        if tqdm is not None:
            # tqdm needs a sized sequence to display the total
            games_verbose = tqdm(list(games.itertuples()), desc="Loading match data")
        else:
            games_verbose = games.itertuples()
        teams, players = [], []

        # Human-readable labels used in the file name and stored with the games
        competition_id = leagues[competition.competition_id]
        season_id = seasons[competition.season_id]
        spadl_h5 = os.path.join(spadl_datafolder, f"spadl-{competition_id}-{season_id}.h5")
        with pd.HDFStore(spadl_h5) as spadlstore:
            spadlstore.put("actiontypes", spadl.actiontypes_df(), format="table")
            spadlstore.put("results", spadl.results_df(), format="table")
            spadlstore.put("bodyparts", spadl.bodyparts_df(), format="table")

            for game in games_verbose:
                # load data
                teams.append(WYL.teams(game.game_id))
                players.append(WYL.players(game.game_id))
                events = WYL.events(game.game_id)

                # convert data
                # NOTE: unlike the lookup tables above, the actions are stored
                # without format="table", i.e. in HDFStore's default format.
                spadlstore.put(
                    f"actions/game_{game.game_id}",
                    wyscout.convert_to_actions(events, game.home_team_id),
                )

            # Bracket assignment always writes a column; attribute assignment
            # would only set a plain attribute if the column were missing.
            games["season_id"] = season_id
            games["competition_id"] = competition_id
            spadlstore.put("games", games)
            # Teams and players were collected per game; keep each one once.
            spadlstore.put(
                "teams",
                pd.concat(teams).drop_duplicates("team_id").reset_index(drop=True),
            )
            spadlstore.put(
                "players",
                pd.concat(players).drop_duplicates("player_id").reset_index(drop=True),
            )
    logging.info("Done! Data was saved to %s", spadl_datafolder)


def create_spadl(game_id: int, home_team_id: int) -> None:
    """Write example SPADL and Atomic-SPADL data for one StatsBomb game.

    The events of ``game_id`` are loaded from the remote StatsBomb open data
    repository and converted to SPADL actions and to Atomic-SPADL actions.
    For each representation, the first 200 actions of period 1 and of
    period 2 are written as JSON records to ``<_data_dir>/spadl/spadl.json``
    and ``<_data_dir>/spadl/atomic_spadl.json`` respectively.

    Parameters
    ----------
    game_id : int
        Identifier of the game to load.
    home_team_id : int
        Identifier of the home team, passed on to the SPADL converter.
    """

    def _head_of_each_half(actions):
        # First 200 rows of period 1 followed by the first 200 of period 2.
        halves = [actions[actions.period_id == period].head(n=200) for period in (1, 2)]
        return pd.concat(halves)

    logging.info("Creating SPADL data")
    out_dir = os.path.join(_data_dir, "spadl")
    if not os.path.exists(out_dir):
        os.makedirs(out_dir, exist_ok=True)
    spadl_json = os.path.join(out_dir, "spadl.json")
    atomic_spadl_json = os.path.join(out_dir, "atomic_spadl.json")

    # Fetch the raw events of the requested game
    loader = StatsBombLoader(
        root="https://raw.githubusercontent.com/statsbomb/open-data/master/data/",
        getter="remote",
    )
    events = loader.events(game_id)

    # Regular SPADL
    df_actions = statsbomb.convert_to_actions(events, home_team_id)
    _head_of_each_half(df_actions).to_json(spadl_json, orient="records")

    # Atomic SPADL, derived from the complete set of regular actions
    df_atomic_actions = atomicspadl.convert_to_atomic(df_actions)
    _head_of_each_half(df_atomic_actions).to_json(atomic_spadl_json, orient="records")

    logging.info("Done! SPADL data was saved to %s and %s", spadl_json, atomic_spadl_json)


if __name__ == "__main__":
    # Show progress messages of the individual steps
    logging.basicConfig(level=logging.INFO)

    # Command line interface: one boolean switch per preprocessing step
    cli = argparse.ArgumentParser(
        prog="download",
        usage="%(prog)s [options]",
        formatter_class=argparse.RawTextHelpFormatter,
        description="""Download and prepare the data needed for running the tests.

        Use the options specified below to select specific preprocessing
        steps. When this script is run without any options, all preprocessing
        steps required to run the default test setup will be executed.
        """,
    )
    switches = {
        "--download-statsbomb": "Download the public StatsBomb data.",
        "--convert-statsbomb": "Convert the public StatsBomb data to SPADL.",
        "--download-wyscout": "Download the public Wyscout data.",
        "--convert-wyscout": "Convert the public Wyscout data to SPADL.",
        "--spadl": "Create a JSON file with example SPADL and Atomic-SPADL data.",
    }
    for switch, help_text in switches.items():
        cli.add_argument(switch, action="store_true", help=help_text)

    args = cli.parse_args()

    # Without any switch, only the two download steps are run by default.
    requested = (
        args.download_statsbomb,
        args.convert_statsbomb,
        args.download_wyscout,
        args.convert_wyscout,
        args.spadl,
    )
    run_defaults = not any(requested)

    # Run the requested steps, in a fixed order
    if run_defaults or args.download_statsbomb:
        download_statsbomb_data()
    if args.convert_statsbomb:
        convert_statsbomb_data()
    if run_defaults or args.download_wyscout:
        download_wyscout_data()
    if args.convert_wyscout:
        convert_wyscout_data()
    if args.spadl:
        # Fixed example game and its home team
        create_spadl(8657, 777)