{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-12-30T16:23:23.505616Z",
     "iopub.status.busy": "2023-12-30T16:23:23.504959Z",
     "iopub.status.idle": "2023-12-30T16:23:23.929105Z",
     "shell.execute_reply": "2023-12-30T16:23:23.928396Z"
    }
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import warnings\n",
    "import tqdm\n",
    "import pandas as pd\n",
    "warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-12-30T16:23:23.932302Z",
     "iopub.status.busy": "2023-12-30T16:23:23.931947Z",
     "iopub.status.idle": "2023-12-30T16:23:24.659342Z",
     "shell.execute_reply": "2023-12-30T16:23:24.658769Z"
    }
   },
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "import socceraction.vaep.features as fs\n",
    "import socceraction.vaep.labels as lab"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Select data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-12-30T16:23:24.662351Z",
     "iopub.status.busy": "2023-12-30T16:23:24.662127Z",
     "iopub.status.idle": "2023-12-30T16:23:24.687448Z",
     "shell.execute_reply": "2023-12-30T16:23:24.686865Z"
    }
   },
   "outputs": [],
   "source": [
    "# Configure file and folder names\n",
    "datafolder = \"../data-fifa\"\n",
    "spadl_h5 = os.path.join(datafolder, \"spadl-statsbomb.h5\")\n",
    "features_h5 = os.path.join(datafolder, \"features.h5\")\n",
    "labels_h5 = os.path.join(datafolder, \"labels.h5\")\n",
    "predictions_h5 = os.path.join(datafolder, \"predictions.h5\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-12-30T16:23:24.690119Z",
     "iopub.status.busy": "2023-12-30T16:23:24.689947Z",
     "iopub.status.idle": "2023-12-30T16:23:25.973479Z",
     "shell.execute_reply": "2023-12-30T16:23:25.972930Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "nb of games: 64\n"
     ]
    }
   ],
   "source": [
    "games = pd.read_hdf(spadl_h5, \"games\")\n",
    "print(\"nb of games:\", len(games))\n",
    "\n",
    "# note: only for the purpose of this example and due to the small dataset,\n",
    "# we use the same data for training and evaluation\n",
    "traingames = games\n",
    "testgames = games"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-12-30T16:23:25.975644Z",
     "iopub.status.busy": "2023-12-30T16:23:25.975350Z",
     "iopub.status.idle": "2023-12-30T16:23:31.134113Z",
     "shell.execute_reply": "2023-12-30T16:23:31.133562Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Selecting features: 100%|██████████████████████████████████████████████████████████████| 64/64 [00:04<00:00, 13.46it/s]\n",
      "Selecting label: 100%|████████████████████████████████████████████████████████████████| 64/64 [00:00<00:00, 206.73it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "X: ['actiontype_a0', 'actiontype_pass_a0', 'actiontype_cross_a0', 'actiontype_throw_in_a0', 'actiontype_freekick_crossed_a0', 'actiontype_freekick_short_a0', 'actiontype_corner_crossed_a0', 'actiontype_corner_short_a0', 'actiontype_take_on_a0', 'actiontype_foul_a0', 'actiontype_tackle_a0', 'actiontype_interception_a0', 'actiontype_shot_a0', 'actiontype_shot_penalty_a0', 'actiontype_shot_freekick_a0', 'actiontype_keeper_save_a0', 'actiontype_keeper_claim_a0', 'actiontype_keeper_punch_a0', 'actiontype_keeper_pick_up_a0', 'actiontype_clearance_a0', 'actiontype_bad_touch_a0', 'actiontype_non_action_a0', 'actiontype_dribble_a0', 'actiontype_goalkick_a0', 'bodypart_foot_a0', 'bodypart_head_a0', 'bodypart_other_a0', 'bodypart_head/other_a0', 'result_a0', 'result_fail_a0', 'result_success_a0', 'result_offside_a0', 'result_owngoal_a0', 'result_yellow_card_a0', 'result_red_card_a0', 'goalscore_team', 'goalscore_opponent', 'goalscore_diff', 'start_x_a0', 'start_y_a0', 'end_x_a0', 'end_y_a0', 'dx_a0', 'dy_a0', 'movement_a0', 'start_dist_to_goal_a0', 'start_angle_to_goal_a0', 'end_dist_to_goal_a0', 'end_angle_to_goal_a0']\n",
      "Y: ['scores', 'concedes']\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# 1. Select feature set X\n",
    "xfns = [\n",
    "    fs.actiontype,\n",
    "    fs.actiontype_onehot,\n",
    "    #fs.bodypart,\n",
    "    fs.bodypart_onehot,\n",
    "    fs.result,\n",
    "    fs.result_onehot,\n",
    "    fs.goalscore,\n",
    "    fs.startlocation,\n",
    "    fs.endlocation,\n",
    "    fs.movement,\n",
    "    fs.space_delta,\n",
    "    fs.startpolar,\n",
    "    fs.endpolar,\n",
    "    fs.team,\n",
    "    #fs.time,\n",
    "    fs.time_delta,\n",
    "    #fs.actiontype_result_onehot\n",
    "]\n",
    "nb_prev_actions = 1\n",
    "\n",
    "Xcols = fs.feature_column_names(xfns, nb_prev_actions)\n",
    "\n",
    "def getXY(games,Xcols):\n",
    "    # generate the columns of the selected feature\n",
    "    X = []\n",
    "    for game_id in tqdm.tqdm(games.game_id, desc=\"Selecting features\"):\n",
    "        Xi = pd.read_hdf(features_h5, f\"game_{game_id}\")\n",
    "        X.append(Xi[Xcols])\n",
    "    X = pd.concat(X).reset_index(drop=True)\n",
    "\n",
    "    # 2. Select label Y\n",
    "    Ycols = [\"scores\",\"concedes\"]\n",
    "    Y = []\n",
    "    for game_id in tqdm.tqdm(games.game_id, desc=\"Selecting label\"):\n",
    "        Yi = pd.read_hdf(labels_h5, f\"game_{game_id}\")\n",
    "        Y.append(Yi[Ycols])\n",
    "    Y = pd.concat(Y).reset_index(drop=True)\n",
    "    return X, Y\n",
    "\n",
    "X, Y = getXY(traingames,Xcols)\n",
    "print(\"X:\", list(X.columns))\n",
    "print(\"Y:\", list(Y.columns))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train a model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-12-30T16:23:31.135967Z",
     "iopub.status.busy": "2023-12-30T16:23:31.135797Z",
     "iopub.status.idle": "2023-12-30T16:23:32.601027Z",
     "shell.execute_reply": "2023-12-30T16:23:32.600495Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 5.91 s, sys: 155 ms, total: 6.07 s\n",
      "Wall time: 1.44 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# 3. train classifiers F(X) = Y\n",
    "import xgboost\n",
    "\n",
    "Y_hat = pd.DataFrame()\n",
    "models = {}\n",
    "for col in list(Y.columns):\n",
    "    model = xgboost.XGBClassifier(n_estimators=50, max_depth=3, n_jobs=-3, verbosity=1, enable_categorical=True)\n",
    "    model.fit(X, Y[col])\n",
    "    models[col] = model"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluate the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-12-30T16:23:32.603183Z",
     "iopub.status.busy": "2023-12-30T16:23:32.602835Z",
     "iopub.status.idle": "2023-12-30T16:23:33.563946Z",
     "shell.execute_reply": "2023-12-30T16:23:33.563317Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "### Y: scores ###\n",
      "  Brier score: 0.00837 (0.80900)\n",
      "  log loss score: 0.04199 (0.72267)\n",
      "  ROC AUC: 0.86223\n",
      "### Y: concedes ###\n",
      "  Brier score: 0.00227 (0.84299)\n",
      "  log loss score: 0.01303 (0.69771)\n",
      "  ROC AUC: 0.89311\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import brier_score_loss, roc_auc_score, log_loss\n",
    "\n",
    "testX, testY = X, Y\n",
    "\n",
    "def evaluate(y, y_hat):\n",
    "    p = sum(y) / len(y)\n",
    "    base = [p] * len(y)\n",
    "    brier = brier_score_loss(y, y_hat)\n",
    "    print(f\"  Brier score: %.5f (%.5f)\" % (brier, brier / brier_score_loss(y, base)))\n",
    "    ll = log_loss(y, y_hat)\n",
    "    print(f\"  log loss score: %.5f (%.5f)\" % (ll, ll / log_loss(y, base)))\n",
    "    print(f\"  ROC AUC: %.5f\" % roc_auc_score(y, y_hat))\n",
    "\n",
    "for col in testY.columns:\n",
    "    Y_hat[col] = [p[1] for p in models[col].predict_proba(testX)]\n",
    "    print(f\"### Y: {col} ###\")\n",
    "    evaluate(testY[col], Y_hat[col])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Save predictions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-12-30T16:23:33.565897Z",
     "iopub.status.busy": "2023-12-30T16:23:33.565730Z",
     "iopub.status.idle": "2023-12-30T16:23:34.539659Z",
     "shell.execute_reply": "2023-12-30T16:23:34.539155Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading game ids: 100%|███████████████████████████████████████████████████████████████| 64/64 [00:00<00:00, 143.27it/s]\n",
      "Saving predictions per game: 100%|████████████████████████████████████████████████████| 64/64 [00:00<00:00, 139.84it/s]\n"
     ]
    }
   ],
   "source": [
    "# get rows with game id per action\n",
    "A = []\n",
    "for game_id in tqdm.tqdm(games.game_id, \"Loading game ids\"):\n",
    "    Ai = pd.read_hdf(spadl_h5, f\"actions/game_{game_id}\")\n",
    "    A.append(Ai[[\"game_id\"]])\n",
    "A = pd.concat(A)\n",
    "A = A.reset_index(drop=True)\n",
    "\n",
    "# concatenate action game id rows with predictions and save per game\n",
    "grouped_predictions = pd.concat([A, Y_hat], axis=1).groupby(\"game_id\")\n",
    "with pd.HDFStore(predictions_h5) as predictionstore:\n",
    "    for k, df in tqdm.tqdm(grouped_predictions, desc=\"Saving predictions per game\"):\n",
    "        df = df.reset_index(drop=True)\n",
    "        predictionstore.put(f\"game_{int(k)}\", df[Y_hat.columns])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "socceraction",
   "language": "python",
   "name": "socceraction"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.1"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": true
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}