{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-12-30T16:25:40.928261Z",
     "iopub.status.busy": "2023-12-30T16:25:40.927624Z",
     "iopub.status.idle": "2023-12-30T16:25:41.342091Z",
     "shell.execute_reply": "2023-12-30T16:25:41.341524Z"
    }
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import warnings\n",
    "import tqdm\n",
    "import pandas as pd\n",
    "warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-12-30T16:25:41.344192Z",
     "iopub.status.busy": "2023-12-30T16:25:41.343980Z",
     "iopub.status.idle": "2023-12-30T16:25:42.078383Z",
     "shell.execute_reply": "2023-12-30T16:25:42.077701Z"
    }
   },
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "import socceraction.atomic.vaep.features as fs\n",
    "import socceraction.atomic.vaep.labels as lab"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Select data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-12-30T16:25:42.080617Z",
     "iopub.status.busy": "2023-12-30T16:25:42.080397Z",
     "iopub.status.idle": "2023-12-30T16:25:42.102852Z",
     "shell.execute_reply": "2023-12-30T16:25:42.102244Z"
    }
   },
   "outputs": [],
   "source": [
    "# Configure file and folder names\n",
    "datafolder = \"../data-fifa\"\n",
    "spadl_h5 = os.path.join(datafolder, \"atomic-spadl-statsbomb.h5\")\n",
    "features_h5 = os.path.join(datafolder, \"atomic-features.h5\")\n",
    "labels_h5 = os.path.join(datafolder, \"atomic-labels.h5\")\n",
    "predictions_h5 = os.path.join(datafolder, \"atomic-predictions-one-action.h5\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-12-30T16:25:42.104995Z",
     "iopub.status.busy": "2023-12-30T16:25:42.104812Z",
     "iopub.status.idle": "2023-12-30T16:25:43.414438Z",
     "shell.execute_reply": "2023-12-30T16:25:43.413879Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "nb of games: 64\n"
     ]
    }
   ],
   "source": [
    "games = pd.read_hdf(spadl_h5, \"games\")\n",
    "print(\"nb of games:\", len(games))\n",
    "\n",
    "# note: only for the purpose of this example and due to the small dataset,\n",
    "# we use the same data for training and evaluation\n",
    "traingames = games\n",
    "testgames = games"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-12-30T16:25:43.416469Z",
     "iopub.status.busy": "2023-12-30T16:25:43.416296Z",
     "iopub.status.idle": "2023-12-30T16:25:46.201007Z",
     "shell.execute_reply": "2023-12-30T16:25:46.200353Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Selecting features: 100%|██████████████████████████████████████████████████████████████| 64/64 [00:02<00:00, 26.67it/s]\n",
      "Selecting label: 100%|████████████████████████████████████████████████████████████████| 64/64 [00:00<00:00, 210.79it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "X: ['actiontype_pass_a0', 'actiontype_cross_a0', 'actiontype_throw_in_a0', 'actiontype_freekick_crossed_a0', 'actiontype_freekick_short_a0', 'actiontype_corner_crossed_a0', 'actiontype_corner_short_a0', 'actiontype_take_on_a0', 'actiontype_foul_a0', 'actiontype_tackle_a0', 'actiontype_interception_a0', 'actiontype_shot_a0', 'actiontype_shot_penalty_a0', 'actiontype_shot_freekick_a0', 'actiontype_keeper_save_a0', 'actiontype_keeper_claim_a0', 'actiontype_keeper_punch_a0', 'actiontype_keeper_pick_up_a0', 'actiontype_clearance_a0', 'actiontype_bad_touch_a0', 'actiontype_non_action_a0', 'actiontype_dribble_a0', 'actiontype_goalkick_a0', 'actiontype_receival_a0', 'actiontype_out_a0', 'actiontype_offside_a0', 'actiontype_goal_a0', 'actiontype_owngoal_a0', 'actiontype_yellow_card_a0', 'actiontype_red_card_a0', 'actiontype_corner_a0', 'actiontype_freekick_a0', 'bodypart_foot_a0', 'bodypart_head_a0', 'bodypart_other_a0', 'bodypart_head/other_a0', 'goalscore_team', 'goalscore_opponent', 'goalscore_diff', 'x_a0', 'y_a0', 'dist_to_goal_a0', 'angle_to_goal_a0', 'dx_a0', 'dy_a0', 'period_id_a0', 'time_seconds_a0', 'time_seconds_overall_a0']\n",
      "Y: ['scores', 'concedes']\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# 1. Select feature set X\n",
    "xfns = [\n",
    "    #fs.actiontype,\n",
    "    fs.actiontype_onehot,\n",
    "    #fs.bodypart,\n",
    "    fs.bodypart_onehot,\n",
    "    fs.goalscore,\n",
    "    fs.location,\n",
    "    fs.polar,\n",
    "    fs.direction,\n",
    "    fs.team,\n",
    "    fs.time,\n",
    "    fs.time_delta\n",
    "]\n",
    "nb_prev_actions = 1\n",
    "\n",
    "Xcols = fs.feature_column_names(xfns, nb_prev_actions)\n",
    "\n",
    "def getXY(games, Xcols):\n",
    "    # generate the columns of the selected feature\n",
    "    X = []\n",
    "    for game_id in tqdm.tqdm(games.game_id, desc=\"Selecting features\"):\n",
    "        Xi = pd.read_hdf(features_h5, f\"game_{game_id}\")\n",
    "        X.append(Xi[Xcols])\n",
    "    X = pd.concat(X).reset_index(drop=True)\n",
    "\n",
    "    # 2. Select label Y\n",
    "    Ycols = [\"scores\", \"concedes\"]\n",
    "    Y = []\n",
    "    for game_id in tqdm.tqdm(games.game_id, desc=\"Selecting label\"):\n",
    "        Yi = pd.read_hdf(labels_h5, f\"game_{game_id}\")\n",
    "        Y.append(Yi[Ycols])\n",
    "    Y = pd.concat(Y).reset_index(drop=True)\n",
    "    return X, Y\n",
    "\n",
    "X,Y = getXY(traingames, Xcols)\n",
    "print(\"X:\", list(X.columns))\n",
    "print(\"Y:\", list(Y.columns))\n",
    "X = X.fillna(0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train a model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-12-30T16:25:46.203832Z",
     "iopub.status.busy": "2023-12-30T16:25:46.203670Z",
     "iopub.status.idle": "2023-12-30T16:25:47.730905Z",
     "shell.execute_reply": "2023-12-30T16:25:47.730391Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "scores\n",
      "concedes\n",
      "CPU times: user 5.38 s, sys: 123 ms, total: 5.5 s\n",
      "Wall time: 1.5 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# train classifiers F(X) = Y\n",
    "import xgboost\n",
    "\n",
    "Y_hat = pd.DataFrame()\n",
    "models = {}\n",
    "for col in list(Y.columns):\n",
    "    print(col)\n",
    "    model = xgboost.XGBClassifier(n_estimators=50, max_depth=3, n_jobs=-3, verbosity=1, enable_categorical=True)\n",
    "    model.fit(X, Y[col])\n",
    "    models[col] = model"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluate the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-12-30T16:25:47.733065Z",
     "iopub.status.busy": "2023-12-30T16:25:47.732729Z",
     "iopub.status.idle": "2023-12-30T16:25:49.088474Z",
     "shell.execute_reply": "2023-12-30T16:25:49.087889Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "### Y: scores ###\n",
      "  Brier score: 0.00583 (0.75235)\n",
      "  log loss score: 0.02777 (0.60796)\n",
      "  ROC AUC: 0.93296\n",
      "### Y: concedes ###\n",
      "  Brier score: 0.00113 (0.63887)\n",
      "  log loss score: 0.00603 (0.46528)\n",
      "  ROC AUC: 0.97023\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import brier_score_loss, roc_auc_score, log_loss\n",
    "\n",
    "testX, testY = X, Y\n",
    "\n",
    "def evaluate(y, y_hat):\n",
    "    p = sum(y) / len(y)\n",
    "    base = [p] * len(y)\n",
    "    brier = brier_score_loss(y, y_hat)\n",
    "    print(f\"  Brier score: %.5f (%.5f)\" % (brier, brier / brier_score_loss(y, base)))\n",
    "    ll = log_loss(y, y_hat)\n",
    "    print(f\"  log loss score: %.5f (%.5f)\" % (ll, ll / log_loss(y, base)))\n",
    "    print(f\"  ROC AUC: %.5f\" % roc_auc_score(y, y_hat))\n",
    "\n",
    "for col in testY.columns:\n",
    "    Y_hat[col] = [p[1] for p in models[col].predict_proba(testX)]\n",
    "    print(f\"### Y: {col} ###\")\n",
    "    evaluate(testY[col], Y_hat[col])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Save predictions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-12-30T16:25:49.090602Z",
     "iopub.status.busy": "2023-12-30T16:25:49.090409Z",
     "iopub.status.idle": "2023-12-30T16:25:56.544583Z",
     "shell.execute_reply": "2023-12-30T16:25:56.543918Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading actions of each game: 100%|███████████████████████████████████████████████████| 64/64 [00:00<00:00, 145.54it/s]\n",
      "Saving predictions per game: 100%|█████████████████████████████████████████████████████| 64/64 [00:06<00:00,  9.17it/s]\n"
     ]
    }
   ],
   "source": [
    "# get rows with game id per action\n",
    "A = []\n",
    "for game_id in tqdm.tqdm(testgames.game_id, \"Loading actions of each game\"):\n",
    "    Ai = pd.read_hdf(spadl_h5, f\"atomic_actions/game_{game_id}\")\n",
    "    A.append(Ai[[\"game_id\"]])\n",
    "A = pd.concat(A)\n",
    "A = A.reset_index(drop=True)\n",
    "\n",
    "# concatenate action game id rows with predictions and save per game\n",
    "grouped_predictions = pd.concat([A, Y_hat], axis=1).groupby(\"game_id\")\n",
    "for k,df in tqdm.tqdm(grouped_predictions, desc=\"Saving predictions per game\"):\n",
    "    df = df.reset_index(drop=True)\n",
    "    df[Y_hat.columns].to_hdf(predictions_h5, f\"game_{int(k)}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "socceraction",
   "language": "python",
   "name": "socceraction"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.1"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": true
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}