"""Script for downloading test data.

The script can download the public StatsBomb and Wyscout datasets, convert
them to SPADL HDF5 stores, and write small JSON samples of SPADL and
Atomic-SPADL actions. All output is written below the directory that contains
this file.
"""

import argparse
import logging
import os
import shutil
import ssl
import warnings
from pathlib import Path
from typing import Any, Callable, Iterable, Optional, Union
from urllib.parse import urlparse
from urllib.request import urlopen, urlretrieve
from zipfile import ZipFile, is_zipfile

import pandas as pd

try:
    from tqdm import tqdm
except ImportError:
    # tqdm is optional; without it the game loops simply run without a progress bar.
    tqdm = None

import socceraction.atomic.spadl as atomicspadl
import socceraction.spadl as spadl
import socceraction.spadl.statsbomb as statsbomb
import socceraction.spadl.wyscout as wyscout
from socceraction.data.statsbomb import StatsBombLoader
from socceraction.data.wyscout import PublicWyscoutLoader

warnings.simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
warnings.filterwarnings(
    action="ignore", message="credentials were not supplied. open data access only"
)

# optional: if you get a SSL CERTIFICATE_VERIFY_FAILED exception
# NOTE(security): this replaces the default HTTPS context for the whole process
# with an unverified one, so server certificates are NOT checked for any
# download made below. Remove this line if your environment has working CA
# certificates.
ssl._create_default_https_context = ssl._create_unverified_context

# Directory containing this script; every dataset folder is created below it.
_data_dir = os.path.dirname(__file__)

# Root URL passed to StatsBombLoader when reading the open data remotely.
_STATSBOMB_OPEN_DATA_URL = "https://raw.githubusercontent.com/statsbomb/open-data/master/data/"

# Arguments passed to create_spadl() when the script is run with --spadl.
_EXAMPLE_GAME_ID = 8657
_EXAMPLE_HOME_TEAM_ID = 777


def _iter_games(games: pd.DataFrame) -> Iterable[Any]:
    """Return the rows of ``games`` as named tuples, with a progress bar if tqdm is installed."""
    rows = games.itertuples()
    if tqdm is None:
        return rows
    # tqdm needs a sized sequence to display the total number of games.
    return tqdm(list(rows), desc="Loading match data")


def _write_spadl_store(
    loader: Union[StatsBombLoader, PublicWyscoutLoader],
    convert_to_actions: Callable[..., pd.DataFrame],
    games: pd.DataFrame,
    spadl_h5: str,
    competition_id: str,
    season_id: str,
    actions_format: Optional[str] = None,
) -> None:
    """Write one competition-season to an HDF5 store.

    The store receives the SPADL lookup tables (``actiontypes``, ``results``,
    ``bodyparts``), one ``actions/game_<game_id>`` table per game, and the
    ``games``, ``teams`` and ``players`` tables.

    Parameters
    ----------
    loader
        Data loader used to fetch teams, players and events for each game.
    convert_to_actions
        Provider-specific converter, called as
        ``convert_to_actions(events, home_team_id)``.
    games
        Games of the competition-season. Its ``season_id`` and
        ``competition_id`` columns are overwritten in place with the given
        labels before the frame is stored.
    spadl_h5
        Path of the HDF5 file to write.
    competition_id, season_id
        Labels written to the ``games`` table.
    actions_format
        Value forwarded as ``format`` when storing the per-game actions;
        ``None`` leaves the choice to pandas' default.
    """
    teams, players = [], []
    with pd.HDFStore(spadl_h5) as spadlstore:
        spadlstore.put("actiontypes", spadl.actiontypes_df(), format="table")
        spadlstore.put("results", spadl.results_df(), format="table")
        spadlstore.put("bodyparts", spadl.bodyparts_df(), format="table")

        for game in _iter_games(games):
            # load data
            teams.append(loader.teams(game.game_id))
            players.append(loader.players(game.game_id))
            events = loader.events(game.game_id)
            # convert data
            spadlstore.put(
                f"actions/game_{game.game_id}",
                convert_to_actions(events, game.home_team_id),
                format=actions_format,
            )

        # Item assignment always writes a column; attribute assignment would
        # silently set a plain Python attribute if the column were missing.
        games["season_id"] = season_id
        games["competition_id"] = competition_id
        spadlstore.put("games", games)
        # A team or player appears once per game, so keep only the first occurrence.
        spadlstore.put(
            "teams",
            pd.concat(teams).drop_duplicates("team_id").reset_index(drop=True),
        )
        spadlstore.put(
            "players",
            pd.concat(players).drop_duplicates("player_id").reset_index(drop=True),
        )


def _head_per_period(df_actions: pd.DataFrame, n: int = 200) -> pd.DataFrame:
    """Return the first ``n`` rows of period 1 followed by the first ``n`` rows of period 2."""
    return pd.concat(
        [
            df_actions[df_actions.period_id == 1].head(n=n),
            df_actions[df_actions.period_id == 2].head(n=n),
        ]
    )


def download_statsbomb_data() -> None:
    """Download and extract the StatsBomb open data repository.

    The repository archive is downloaded to ``statsbomb/tmp``, extracted
    there, and its ``data`` folder is moved to ``statsbomb/raw``, replacing
    any previous content of that folder. The temporary folder is removed
    afterwards, also when a step fails.
    """
    logging.info("Downloading StatsBomb data")
    dataset_url = "https://github.com/statsbomb/open-data/archive/master.zip"

    tmp_datafolder = os.path.join(_data_dir, "statsbomb", "tmp")
    raw_datafolder = os.path.join(_data_dir, "statsbomb", "raw")
    for datafolder in (tmp_datafolder, raw_datafolder):
        os.makedirs(datafolder, exist_ok=True)

    try:
        statsbombzip = os.path.join(tmp_datafolder, "statsbomb-open-data.zip")
        # Stream the archive to disk instead of holding it in memory as one bytes object.
        with urlopen(dataset_url) as dl_file, open(statsbombzip, "wb") as out_file:
            shutil.copyfileobj(dl_file, out_file)

        with ZipFile(statsbombzip, "r") as zip_obj:
            zip_obj.extractall(tmp_datafolder)

        # Remove the existing raw folder so the extracted data can be renamed into its place.
        shutil.rmtree(raw_datafolder)
        Path(tmp_datafolder, "open-data-master", "data").rename(raw_datafolder)
    finally:
        # Best-effort cleanup so a failed run does not leave the archive behind.
        shutil.rmtree(tmp_datafolder, ignore_errors=True)

    logging.info("Done! Data was saved to %s", raw_datafolder)


def convert_statsbomb_data() -> None:
    """Convert StatsBomb data to SPADL.

    Events are read remotely through ``StatsBombLoader`` for every
    competition-season selected by the ``leagues`` and ``seasons`` mappings
    below, and written to
    ``statsbomb/spadl-<competition>-<season>.h5``.
    """
    logging.info("Converting StatsBomb data")
    # season_id -> label used in the output file name
    seasons = {
        3: "2018",
    }
    # competition_name -> label used in the output file name
    leagues = {
        "FIFA World Cup": "WorldCup",
    }

    spadl_datafolder = os.path.join(_data_dir, "statsbomb")
    # The HDF5 files are created in this folder, so it must exist even when
    # the download step was not run first.
    os.makedirs(spadl_datafolder, exist_ok=True)

    SBL = StatsBombLoader(root=_STATSBOMB_OPEN_DATA_URL, getter="remote")

    # View all available competitions
    df_competitions = SBL.competitions()
    selected_competitions = df_competitions.competition_name.isin(leagues.keys())
    selected_seasons = df_competitions.season_id.isin(seasons.keys())
    df_selected_competitions = df_competitions.loc[selected_competitions & selected_seasons]

    for competition in df_selected_competitions.itertuples():
        # Get games from all selected competition
        games = SBL.games(competition.competition_id, competition.season_id)

        competition_id = leagues[competition.competition_name]
        season_id = seasons[competition.season_id]
        spadl_h5 = os.path.join(spadl_datafolder, f"spadl-{competition_id}-{season_id}.h5")
        _write_spadl_store(
            SBL,
            statsbomb.convert_to_actions,
            games,
            spadl_h5,
            competition_id,
            season_id,
            actions_format="table",
        )

    logging.info("Done! Data was saved to %s", spadl_datafolder)


def download_wyscout_data() -> None:
    """Download and extract the Wyscout public dataset.

    Each file is saved to ``wyscout_public/raw`` under the name found in its
    resolved download URL; files that are zip archives are additionally
    extracted into the same folder.
    """
    logging.info("Downloading Wyscout data")
    # https://figshare.com/collections/Soccer_match_event_dataset/4415000/5
    dataset_urls = {
        "competitions": "https://ndownloader.figshare.com/files/15073685",
        "teams": "https://ndownloader.figshare.com/files/15073697",
        "players": "https://ndownloader.figshare.com/files/15073721",
        "games": "https://ndownloader.figshare.com/files/14464622",
        "events": "https://ndownloader.figshare.com/files/14464685",
    }

    raw_datafolder = os.path.join(_data_dir, "wyscout_public", "raw")
    os.makedirs(raw_datafolder, exist_ok=True)

    # download and unzip Wyscout open data
    for url in dataset_urls.values():
        # The response is opened only to learn the final URL after redirects
        # (its path supplies the local file name); close it right away
        # instead of leaking the connection.
        with urlopen(url) as response:
            url_obj = response.geturl()
        path = Path(urlparse(url_obj).path)
        file_name = os.path.join(raw_datafolder, path.name)
        file_local, _ = urlretrieve(url_obj, file_name)
        if is_zipfile(file_local):
            with ZipFile(file_local) as zip_file:
                zip_file.extractall(raw_datafolder)

    logging.info("Done! Data was saved to %s", raw_datafolder)


def convert_wyscout_data() -> None:
    """Convert Wyscout data to SPADL.

    Data is read from ``wyscout_public/raw`` through ``PublicWyscoutLoader``
    for every competition selected by the ``leagues`` mapping below, and
    written to ``wyscout_public/spadl-<competition>-<season>.h5``.
    """
    logging.info("Converting Wyscout data")
    # season_id -> label used in the output file name
    seasons = {
        10078: "2018",
    }
    # competition_id -> label used in the output file name
    leagues = {
        28: "WorldCup",
    }

    raw_datafolder = os.path.join(_data_dir, "wyscout_public", "raw")
    spadl_datafolder = os.path.join(_data_dir, "wyscout_public")
    # The HDF5 files are created in this folder, so make sure it exists.
    os.makedirs(spadl_datafolder, exist_ok=True)

    WYL = PublicWyscoutLoader(root=raw_datafolder)

    # View all available competitions
    df_competitions = WYL.competitions()
    selected_competitions = df_competitions.competition_id.isin(leagues.keys())
    df_selected_competitions = df_competitions.loc[selected_competitions]

    for competition in df_selected_competitions.itertuples():
        # Get games from all selected competition
        games = WYL.games(competition.competition_id, competition.season_id)

        competition_id = leagues[competition.competition_id]
        season_id = seasons[competition.season_id]
        spadl_h5 = os.path.join(spadl_datafolder, f"spadl-{competition_id}-{season_id}.h5")
        # Unlike the StatsBomb conversion, the per-game actions are stored
        # without an explicit format (the original had format="table"
        # commented out), so pandas' default storage format is used.
        _write_spadl_store(
            WYL,
            wyscout.convert_to_actions,
            games,
            spadl_h5,
            competition_id,
            season_id,
        )

    logging.info("Done! Data was saved to %s", spadl_datafolder)


def create_spadl(game_id: int, home_team_id: int) -> None:
    """Create SPADL actions from StatsBomb data for a given game.

    The game's events are loaded remotely, converted to SPADL and to
    Atomic-SPADL, and the first 200 actions of periods 1 and 2 of each
    representation are written to ``spadl/spadl.json`` and
    ``spadl/atomic_spadl.json``.

    Parameters
    ----------
    game_id
        Identifier of the game whose events are loaded.
    home_team_id
        Identifier of the home team, passed to the SPADL converter.
    """
    logging.info("Creating SPADL data")
    spadl_datafolder = os.path.join(_data_dir, "spadl")
    os.makedirs(spadl_datafolder, exist_ok=True)

    # load events
    SBL = StatsBombLoader(root=_STATSBOMB_OPEN_DATA_URL, getter="remote")
    events = SBL.events(game_id)

    # convert to spadl
    spadl_json = os.path.join(spadl_datafolder, "spadl.json")
    df_actions = statsbomb.convert_to_actions(events, home_team_id)
    _head_per_period(df_actions).to_json(spadl_json, orient="records")

    # convert to atomic spadl (from the full action frame, not the truncated sample)
    atomic_spadl_json = os.path.join(spadl_datafolder, "atomic_spadl.json")
    df_atomic_actions = atomicspadl.convert_to_atomic(df_actions)
    _head_per_period(df_atomic_actions).to_json(atomic_spadl_json, orient="records")

    logging.info("Done! SPADL data was saved to %s and %s", spadl_json, atomic_spadl_json)


def main() -> None:
    """Parse the command-line options and run the requested preprocessing steps."""
    # Setup logging
    logging.basicConfig(level=logging.INFO)

    # Create the parser
    my_parser = argparse.ArgumentParser(
        prog="download",
        usage="%(prog)s [options]",
        formatter_class=argparse.RawTextHelpFormatter,
        description=(
            "Download and prepare the data needed for running the tests.\n"
            "Use the options specified below to select specific preprocessing steps. When\n"
            "this script is run without any options, all preprocessing steps required to\n"
            "run the default test setup will be executed.\n"
        ),
    )

    # Add the arguments
    my_parser.add_argument(
        "--download-statsbomb",
        action="store_true",
        help="Download the public StatsBomb data.",
    )
    my_parser.add_argument(
        "--convert-statsbomb",
        action="store_true",
        help="Convert the public StatsBomb data to SPADL.",
    )
    my_parser.add_argument(
        "--download-wyscout",
        action="store_true",
        help="Download the public Wyscout data.",
    )
    my_parser.add_argument(
        "--convert-wyscout",
        action="store_true",
        help="Convert the public Wyscout data to SPADL.",
    )
    my_parser.add_argument(
        "--spadl",
        action="store_true",
        help="Create a JSON file with example SPADL and Atomic-SPADL data.",
    )

    # Execute the parse_args() method
    args = my_parser.parse_args()
    no_options = not any(
        (
            args.download_statsbomb,
            args.convert_statsbomb,
            args.download_wyscout,
            args.convert_wyscout,
            args.spadl,
        )
    )

    # Run the requested steps. Without any option only the two downloads run;
    # the conversions and the SPADL example must be requested explicitly.
    if args.download_statsbomb or no_options:
        download_statsbomb_data()
    if args.convert_statsbomb:
        convert_statsbomb_data()
    if args.download_wyscout or no_options:
        download_wyscout_data()
    if args.convert_wyscout:
        convert_wyscout_data()
    if args.spadl:
        create_spadl(_EXAMPLE_GAME_ID, _EXAMPLE_HOME_TEAM_ID)


if __name__ == "__main__":
    main()