Be sure to run the following notebooks first before running this notebook:
- 1-load-and-convert-statsbomb-data.ipynb
- 2-compute-features-and-labels.ipynb

In [1]:
import os
import warnings
import tqdm
import numpy as np
import pandas as pd
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [2]:
%load_ext autoreload
%autoreload 2
import socceraction.spadl as spadl
import socceraction.vaep.features as fs
import socceraction.vaep.labels as lab

## Select data

In [3]:
# Configure file and folder names
datafolder = "../data-fifa"
spadl_h5 = os.path.join(datafolder, "spadl-statsbomb.h5")
features_h5 = os.path.join(datafolder, "features.h5")
labels_h5 = os.path.join(datafolder, "labels.h5")
predictions_h5 = os.path.join(datafolder, "predictions.h5")

In [4]:
# Create a train and test set of games
games = pd.read_hdf(spadl_h5, "games")
traingames = games[:len(games)//2]
testgames = games[len(games)//2:]
print(len(traingames), len(testgames))

32 32


In [5]:
# Select shots from the data and all available info about these shots

def get_shots(games):
 shots = []
 with pd.HDFStore(spadl_h5) as spadlstore,\
 pd.HDFStore(features_h5) as featurestore:
 for game_id in tqdm.tqdm(games.game_id, desc="Selecting features"):
 ai = spadl.add_names(spadlstore[f"actions/game_{game_id}"])
 shot_idx = ai.type_name.str.contains("shot")
 Xi = featurestore[f"game_{game_id}"]
 shots.append(Xi[shot_idx])
 return pd.concat(shots)

train_shots = get_shots(traingames)
test_shots = get_shots(testgames)

Selecting features: 100%|██████████████████████████████████████████████████████████████| 32/32 [00:04<00:00, 6.51it/s]
Selecting features: 100%|██████████████████████████████████████████████████████████████| 32/32 [00:04<00:00, 7.02it/s]


In [6]:
# Decide which features to use to compute the expected goals value of the shots
from re import match

xfns = [
 fs.actiontype_onehot,
 fs.bodypart_onehot,
 fs.startlocation,
 fs.movement,
 fs.space_delta,
 fs.startpolar,
 fs.team,
]
nb_prev_actions = 2

f = fs.feature_column_names(xfns, nb_prev_actions)
f = list(filter(lambda v: not match('type_[a-z_]+_a0', v), f))
f.remove("dx_a0")
f.remove("dy_a0")
f.remove("movement_a0")
f

['actiontype_pass_a0',
 'actiontype_cross_a0',
 'actiontype_throw_in_a0',
 'actiontype_freekick_crossed_a0',
 'actiontype_freekick_short_a0',
 'actiontype_corner_crossed_a0',
 'actiontype_corner_short_a0',
 'actiontype_take_on_a0',
 'actiontype_foul_a0',
 'actiontype_tackle_a0',
 'actiontype_interception_a0',
 'actiontype_shot_a0',
 'actiontype_shot_penalty_a0',
 'actiontype_shot_freekick_a0',
 'actiontype_keeper_save_a0',
 'actiontype_keeper_claim_a0',
 'actiontype_keeper_punch_a0',
 'actiontype_keeper_pick_up_a0',
 'actiontype_clearance_a0',
 'actiontype_bad_touch_a0',
 'actiontype_non_action_a0',
 'actiontype_dribble_a0',
 'actiontype_goalkick_a0',
 'actiontype_pass_a1',
 'actiontype_cross_a1',
 'actiontype_throw_in_a1',
 'actiontype_freekick_crossed_a1',
 'actiontype_freekick_short_a1',
 'actiontype_corner_crossed_a1',
 'actiontype_corner_short_a1',
 'actiontype_take_on_a1',
 'actiontype_foul_a1',
 'actiontype_tackle_a1',
 'actiontype_interception_a1',
 'actiontype_shot_a1',
 'acti

## Train models

In [7]:
# Create features-matrix X and label-vector y.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, brier_score_loss, log_loss
from xgboost import XGBClassifier

def Xy(f,shots):
 return shots[f], shots.result_success_a0

In [8]:
# Logistic regression
X,y = Xy(f, train_shots)
model = LogisticRegression().fit(X, y)

X,y = Xy(f, test_shots)
pred = [p[1] for p in model.predict_proba(X)]

print("ROC AUC: %.3f" % roc_auc_score(y, pred))
print("Brier score: %.3f" % brier_score_loss(y, pred))
print("Log loss: %.3f" % log_loss(y, pred))

ROC AUC: 0.812
Brier score: 0.074
Log loss: 0.266


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
 https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
 https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
 n_iter_i = _check_optimize_result(


In [9]:
# XGBoost
X,y = Xy(f, train_shots)
model = XGBClassifier().fit(X, y)

X,y = Xy(f, test_shots)
pred = [p[1] for p in model.predict_proba(X)]

print("ROC AUC: %.3f" % roc_auc_score(y, pred))
print("Brier score: %.3f" % brier_score_loss(y, pred))
print("Log loss: %.3f" % log_loss(y, pred))

ROC AUC: 0.750
Brier score: 0.092
Log loss: 0.416


In [10]:
# Naive baseline, always predict class distribution
X,y = Xy(f, train_shots)
avgP = np.mean(y)

X,y = Xy(f, test_shots)
pred = [avgP for _i in y]

print("ROC AUC: %.3f" % roc_auc_score(y, pred))
print("Brier score: %.3f" % brier_score_loss(y, pred))
print("Log loss: %.3f" % log_loss(y, pred))

ROC AUC: 0.500
Brier score: 0.096
Log loss: 0.342
