{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Build an expected goals (xG) model\n",
    "\n",
    "Trains a logistic regression and an XGBoost classifier that estimate the probability that a shot succeeds, and compares both with a naive baseline.\n",
    "\n",
    "Be sure to run the following notebooks before running this notebook:\n",
    "- 1-load-and-convert-statsbomb-data.ipynb\n",
    "- 2-compute-features-and-labels.ipynb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import warnings\n",
    "from re import match\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import tqdm\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import roc_auc_score, brier_score_loss, log_loss\n",
    "from sklearn.pipeline import make_pipeline\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from xgboost import XGBClassifier\n",
    "\n",
    "warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "import socceraction.spadl as spadl\n",
    "import socceraction.vaep.features as fs\n",
    "import socceraction.vaep.labels as lab"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Select data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configure file and folder names\n",
    "datafolder = \"../data-fifa\"\n",
    "spadl_h5 = os.path.join(datafolder, \"spadl-statsbomb.h5\")\n",
    "features_h5 = os.path.join(datafolder, \"features.h5\")\n",
    "labels_h5 = os.path.join(datafolder, \"labels.h5\")\n",
    "predictions_h5 = os.path.join(datafolder, \"predictions.h5\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a train and test set of games\n",
    "games = pd.read_hdf(spadl_h5, \"games\")\n",
    "traingames = games[:len(games)//2]\n",
    "testgames = games[len(games)//2:]\n",
    "print(len(traingames), len(testgames))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select shots from the data and all available info about these shots\n",
    "\n",
    "def get_shots(games):\n",
    "    \"\"\"Collect the feature rows of every shot in the given games.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    games : pd.DataFrame\n",
    "        Games to process; only the ``game_id`` column is read.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pd.DataFrame\n",
    "        The rows of the per-game feature tables in ``features_h5`` whose\n",
    "        SPADL action type name contains \"shot\", concatenated over all games.\n",
    "    \"\"\"\n",
    "    shots = []\n",
    "    # Open both stores read-only: the default mode 'a' would silently create an\n",
    "    # empty file when a prerequisite notebook has not been run yet.\n",
    "    with pd.HDFStore(spadl_h5, mode=\"r\") as spadlstore, \\\n",
    "         pd.HDFStore(features_h5, mode=\"r\") as featurestore:\n",
    "        for game_id in tqdm.tqdm(games.game_id, desc=\"Selecting features\"):\n",
    "            actions = spadl.add_names(spadlstore[f\"actions/game_{game_id}\"])\n",
    "            # Substring match, so every action type whose name contains \"shot\" is kept.\n",
    "            is_shot = actions.type_name.str.contains(\"shot\")\n",
    "            game_features = featurestore[f\"game_{game_id}\"]\n",
    "            shots.append(game_features[is_shot])\n",
    "    return pd.concat(shots)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_shots = get_shots(traingames)\n",
    "test_shots = get_shots(testgames)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Decide which features to use to compute the expected goals value of the shots\n",
    "xfns = [\n",
    "    fs.actiontype_onehot,\n",
    "    fs.bodypart_onehot,\n",
    "    fs.startlocation,\n",
    "    fs.movement,\n",
    "    fs.space_delta,\n",
    "    fs.startpolar,\n",
    "    fs.team,\n",
    "]\n",
    "nb_prev_actions = 2\n",
    "\n",
    "f = fs.feature_column_names(xfns, nb_prev_actions)\n",
    "# Drop the action-type one-hot columns of the shot itself (a0). These columns are\n",
    "# named 'actiontype_<type>_a0'. The earlier pattern 'type_[a-z_]+_a0' never matched\n",
    "# them because re.match anchors at the start of the string, so nothing was removed.\n",
    "# NOTE(review): this also drops the shot_penalty / shot_freekick indicators of the\n",
    "# shot; delete this filter instead if they should remain model inputs.\n",
    "f = [col for col in f if not match(r'(?:action)?type_[a-z_]+_a0$', col)]\n",
    "# The movement features of the shot itself are not used either.\n",
    "for col in (\"dx_a0\", \"dy_a0\", \"movement_a0\"):\n",
    "    f.remove(col)\n",
    "f"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create features-matrix X and label-vector y.\n",
    "\n",
    "def Xy(f, shots):\n",
    "    \"\"\"Return the feature matrix (columns ``f``) and the label vector\n",
    "    (``result_success_a0``) of ``shots``.\"\"\"\n",
    "    return shots[f], shots.result_success_a0\n",
    "\n",
    "\n",
    "def print_metrics(y_true, y_prob):\n",
    "    \"\"\"Print ROC AUC, Brier score and log loss of the predicted probabilities.\"\"\"\n",
    "    print(\"ROC AUC: %.3f\" % roc_auc_score(y_true, y_prob))\n",
    "    print(\"Brier score: %.3f\" % brier_score_loss(y_true, y_prob))\n",
    "    print(\"Log loss: %.3f\" % log_loss(y_true, y_prob))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Logistic regression\n",
    "# With the default settings lbfgs stopped at its iteration limit\n",
    "# (ConvergenceWarning), so standardize the features and raise max_iter.\n",
    "X, y = Xy(f, train_shots)\n",
    "model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000)).fit(X, y)\n",
    "\n",
    "X, y = Xy(f, test_shots)\n",
    "pred = model.predict_proba(X)[:, 1]  # probability of the positive class\n",
    "\n",
    "print_metrics(y, pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# XGBoost\n",
    "X, y = Xy(f, train_shots)\n",
    "model = XGBClassifier().fit(X, y)\n",
    "\n",
    "X, y = Xy(f, test_shots)\n",
    "pred = model.predict_proba(X)[:, 1]  # probability of the positive class\n",
    "\n",
    "print_metrics(y, pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Naive baseline, always predict class distribution\n",
    "X, y = Xy(f, train_shots)\n",
    "avgP = np.mean(y)\n",
    "\n",
    "X, y = Xy(f, test_shots)\n",
    "pred = np.full(len(y), avgP)  # same training-set success rate for every test shot\n",
    "\n",
    "print_metrics(y, pred)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "socceraction",
   "language": "python",
   "name": "socceraction"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.1"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": true
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}