{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Be sure to run the following notebooks first before running this notebook:\n",
    "- 1-load-and-convert-statsbomb-data.ipynb\n",
    "- 2-compute-features-and-labels.ipynb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-12-30T16:26:08.681771Z",
     "iopub.status.busy": "2023-12-30T16:26:08.681201Z",
     "iopub.status.idle": "2023-12-30T16:26:09.070259Z",
     "shell.execute_reply": "2023-12-30T16:26:09.069694Z"
    }
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import warnings\n",
    "import tqdm\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-12-30T16:26:09.072752Z",
     "iopub.status.busy": "2023-12-30T16:26:09.072537Z",
     "iopub.status.idle": "2023-12-30T16:26:09.792163Z",
     "shell.execute_reply": "2023-12-30T16:26:09.790517Z"
    }
   },
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "import socceraction.spadl as spadl\n",
    "import socceraction.vaep.features as fs\n",
    "import socceraction.vaep.labels as lab"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Select data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-12-30T16:26:09.797914Z",
     "iopub.status.busy": "2023-12-30T16:26:09.797264Z",
     "iopub.status.idle": "2023-12-30T16:26:09.826046Z",
     "shell.execute_reply": "2023-12-30T16:26:09.825471Z"
    }
   },
   "outputs": [],
   "source": [
    "# Configure file and folder names\n",
    "datafolder = \"../data-fifa\"\n",
    "spadl_h5 = os.path.join(datafolder, \"spadl-statsbomb.h5\")\n",
    "features_h5 = os.path.join(datafolder, \"features.h5\")\n",
    "labels_h5 = os.path.join(datafolder, \"labels.h5\")\n",
    "predictions_h5 = os.path.join(datafolder, \"predictions.h5\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-12-30T16:26:09.828056Z",
     "iopub.status.busy": "2023-12-30T16:26:09.827857Z",
     "iopub.status.idle": "2023-12-30T16:26:11.088612Z",
     "shell.execute_reply": "2023-12-30T16:26:11.087987Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "32 32\n"
     ]
    }
   ],
   "source": [
    "# Create a train and test set of games\n",
    "games = pd.read_hdf(spadl_h5, \"games\")\n",
    "traingames = games[:len(games)//2]\n",
    "testgames = games[len(games)//2:]\n",
    "print(len(traingames), len(testgames))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-12-30T16:26:11.090712Z",
     "iopub.status.busy": "2023-12-30T16:26:11.090546Z",
     "iopub.status.idle": "2023-12-30T16:26:21.018426Z",
     "shell.execute_reply": "2023-12-30T16:26:21.017863Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Selecting features: 100%|██████████████████████████████████████████████████████████████| 32/32 [00:04<00:00,  6.51it/s]\n",
      "Selecting features: 100%|██████████████████████████████████████████████████████████████| 32/32 [00:04<00:00,  7.02it/s]\n"
     ]
    }
   ],
   "source": [
    "# Select shots from the data and all available info about these shots\n",
    "\n",
    "def get_shots(games):\n",
    "    shots = []\n",
    "    with pd.HDFStore(spadl_h5) as spadlstore,\\\n",
    "         pd.HDFStore(features_h5) as featurestore:\n",
    "        for game_id in tqdm.tqdm(games.game_id, desc=\"Selecting features\"):\n",
    "            ai = spadl.add_names(spadlstore[f\"actions/game_{game_id}\"])\n",
    "            shot_idx = ai.type_name.str.contains(\"shot\")\n",
    "            Xi = featurestore[f\"game_{game_id}\"]\n",
    "            shots.append(Xi[shot_idx])\n",
    "    return pd.concat(shots)\n",
    "\n",
    "train_shots = get_shots(traingames)\n",
    "test_shots = get_shots(testgames)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-12-30T16:26:21.020370Z",
     "iopub.status.busy": "2023-12-30T16:26:21.020204Z",
     "iopub.status.idle": "2023-12-30T16:26:21.100585Z",
     "shell.execute_reply": "2023-12-30T16:26:21.099889Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['actiontype_pass_a0',\n",
       " 'actiontype_cross_a0',\n",
       " 'actiontype_throw_in_a0',\n",
       " 'actiontype_freekick_crossed_a0',\n",
       " 'actiontype_freekick_short_a0',\n",
       " 'actiontype_corner_crossed_a0',\n",
       " 'actiontype_corner_short_a0',\n",
       " 'actiontype_take_on_a0',\n",
       " 'actiontype_foul_a0',\n",
       " 'actiontype_tackle_a0',\n",
       " 'actiontype_interception_a0',\n",
       " 'actiontype_shot_a0',\n",
       " 'actiontype_shot_penalty_a0',\n",
       " 'actiontype_shot_freekick_a0',\n",
       " 'actiontype_keeper_save_a0',\n",
       " 'actiontype_keeper_claim_a0',\n",
       " 'actiontype_keeper_punch_a0',\n",
       " 'actiontype_keeper_pick_up_a0',\n",
       " 'actiontype_clearance_a0',\n",
       " 'actiontype_bad_touch_a0',\n",
       " 'actiontype_non_action_a0',\n",
       " 'actiontype_dribble_a0',\n",
       " 'actiontype_goalkick_a0',\n",
       " 'actiontype_pass_a1',\n",
       " 'actiontype_cross_a1',\n",
       " 'actiontype_throw_in_a1',\n",
       " 'actiontype_freekick_crossed_a1',\n",
       " 'actiontype_freekick_short_a1',\n",
       " 'actiontype_corner_crossed_a1',\n",
       " 'actiontype_corner_short_a1',\n",
       " 'actiontype_take_on_a1',\n",
       " 'actiontype_foul_a1',\n",
       " 'actiontype_tackle_a1',\n",
       " 'actiontype_interception_a1',\n",
       " 'actiontype_shot_a1',\n",
       " 'actiontype_shot_penalty_a1',\n",
       " 'actiontype_shot_freekick_a1',\n",
       " 'actiontype_keeper_save_a1',\n",
       " 'actiontype_keeper_claim_a1',\n",
       " 'actiontype_keeper_punch_a1',\n",
       " 'actiontype_keeper_pick_up_a1',\n",
       " 'actiontype_clearance_a1',\n",
       " 'actiontype_bad_touch_a1',\n",
       " 'actiontype_non_action_a1',\n",
       " 'actiontype_dribble_a1',\n",
       " 'actiontype_goalkick_a1',\n",
       " 'bodypart_foot_a0',\n",
       " 'bodypart_head_a0',\n",
       " 'bodypart_other_a0',\n",
       " 'bodypart_head/other_a0',\n",
       " 'bodypart_foot_a1',\n",
       " 'bodypart_head_a1',\n",
       " 'bodypart_other_a1',\n",
       " 'bodypart_head/other_a1',\n",
       " 'start_x_a0',\n",
       " 'start_y_a0',\n",
       " 'start_x_a1',\n",
       " 'start_y_a1',\n",
       " 'dx_a1',\n",
       " 'dy_a1',\n",
       " 'movement_a1',\n",
       " 'dx_a01',\n",
       " 'dy_a01',\n",
       " 'mov_a01',\n",
       " 'start_dist_to_goal_a0',\n",
       " 'start_angle_to_goal_a0',\n",
       " 'start_dist_to_goal_a1',\n",
       " 'start_angle_to_goal_a1',\n",
       " 'team_1']"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Decide which features to use to compute the expected goals value of the shots\n",
    "from re import match\n",
    "\n",
    "xfns = [\n",
    "    fs.actiontype_onehot,\n",
    "    fs.bodypart_onehot,\n",
    "    fs.startlocation,\n",
    "    fs.movement,\n",
    "    fs.space_delta,\n",
    "    fs.startpolar,\n",
    "    fs.team,\n",
    "]\n",
    "nb_prev_actions = 2\n",
    "\n",
    "f = fs.feature_column_names(xfns, nb_prev_actions)\n",
    "f = list(filter(lambda v: not match('type_[a-z_]+_a0', v), f))\n",
    "f.remove(\"dx_a0\")\n",
    "f.remove(\"dy_a0\")\n",
    "f.remove(\"movement_a0\")\n",
    "f"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-12-30T16:26:21.103129Z",
     "iopub.status.busy": "2023-12-30T16:26:21.102837Z",
     "iopub.status.idle": "2023-12-30T16:26:21.196217Z",
     "shell.execute_reply": "2023-12-30T16:26:21.195662Z"
    }
   },
   "outputs": [],
   "source": [
    "# Create features-matrix X and label-vector y.\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import roc_auc_score, brier_score_loss, log_loss\n",
    "from xgboost import XGBClassifier\n",
    "\n",
    "def Xy(f,shots):\n",
    "    return shots[f], shots.result_success_a0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-12-30T16:26:21.198293Z",
     "iopub.status.busy": "2023-12-30T16:26:21.198096Z",
     "iopub.status.idle": "2023-12-30T16:26:21.271434Z",
     "shell.execute_reply": "2023-12-30T16:26:21.270454Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ROC AUC: 0.812\n",
      "Brier score: 0.074\n",
      "Log loss: 0.266\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/pieterr/.anyenv/envs/pyenv/versions/3.11.1/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py:460: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
      "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
      "\n",
      "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
      "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
      "Please also refer to the documentation for alternative solver options:\n",
      "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
      "  n_iter_i = _check_optimize_result(\n"
     ]
    }
   ],
   "source": [
    "# Logistic regression\n",
    "X,y = Xy(f, train_shots)\n",
    "model = LogisticRegression().fit(X, y)\n",
    "\n",
    "X,y = Xy(f, test_shots)\n",
    "pred = [p[1] for p in model.predict_proba(X)]\n",
    "\n",
    "print(\"ROC AUC: %.3f\" % roc_auc_score(y, pred))\n",
    "print(\"Brier score: %.3f\" % brier_score_loss(y, pred))\n",
    "print(\"Log loss: %.3f\" % log_loss(y, pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-12-30T16:26:21.275332Z",
     "iopub.status.busy": "2023-12-30T16:26:21.274553Z",
     "iopub.status.idle": "2023-12-30T16:26:21.428601Z",
     "shell.execute_reply": "2023-12-30T16:26:21.428085Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ROC AUC: 0.750\n",
      "Brier score: 0.092\n",
      "Log loss: 0.416\n"
     ]
    }
   ],
   "source": [
    "# XGBoost\n",
    "X,y = Xy(f, train_shots)\n",
    "model = XGBClassifier().fit(X, y)\n",
    "\n",
    "X,y = Xy(f, test_shots)\n",
    "pred = [p[1] for p in model.predict_proba(X)]\n",
    "\n",
    "print(\"ROC AUC: %.3f\" % roc_auc_score(y, pred))\n",
    "print(\"Brier score: %.3f\" % brier_score_loss(y, pred))\n",
    "print(\"Log loss: %.3f\" % log_loss(y, pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-12-30T16:26:21.430636Z",
     "iopub.status.busy": "2023-12-30T16:26:21.430334Z",
     "iopub.status.idle": "2023-12-30T16:26:21.459162Z",
     "shell.execute_reply": "2023-12-30T16:26:21.458511Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ROC AUC: 0.500\n",
      "Brier score: 0.096\n",
      "Log loss: 0.342\n"
     ]
    }
   ],
   "source": [
    "# Naive baseline, always predict class distribution\n",
    "X,y = Xy(f, train_shots)\n",
    "avgP = np.mean(y)\n",
    "\n",
    "X,y = Xy(f, test_shots)\n",
    "pred = [avgP for _i in y]\n",
    "\n",
    "print(\"ROC AUC: %.3f\" % roc_auc_score(y, pred))\n",
    "print(\"Brier score: %.3f\" % brier_score_loss(y, pred))\n",
    "print(\"Log loss: %.3f\" % log_loss(y, pred))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "socceraction",
   "language": "python",
   "name": "socceraction"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.1"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": true
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}