initial commit

Browse files

Files changed (11) hide show

.gitattributes +35 -0
.gitignore +2 -0
README.md +100 -0
eurovoc.py +212 -0
handler.py +74 -0
img/architecture.png +0 -0
mlb.pickle +3 -0
pytorch_model.bin +3 -0
requirements.txt +6 -0
test_handler.py +18 -0
train.ipynb +935 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,35 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ .idea
2	+ __pycache__

README.md ADDED Viewed

	@@ -0,0 +1,100 @@

+---
+license: eupl-1.1
+datasets:
+- EuropeanParliament/cellar_eurovoc
+language:
+- en
+metrics:
+  - type: f1
+    value: 0.72
+    name: micro F1
+    args:
+      threshold: 0.34
+  - type: NDCG@3
+    value: 0.84
+    name: NDCG@3
+  - type: NDCG@5
+    value: 0.80
+    name: NDCG@5
+  - type: NDCG@10
+    value: 0.83
+    name: NDCG@10
+tags:
+- eurovoc
+pipeline_tag: text-classification
+widget:
+- text: "The Union condemns the continuing grave human rights violations by the Myanmar armed forces, including torture, sexual and gender-based violence, the persecution of civil society actors, human rights defenders and journalists, and attacks on the civilian population, including ethnic and religious minorities."
+---
+# Eurovoc Multilabel Classifier
+[EuroVoc](https://op.europa.eu/fr/web/eu-vocabularies) is a large multidisciplinary multilingual hierarchical thesaurus of more than 7000 classes covering the activities of EU institutions.
+Given the number of legal documents produced every day and the huge mass of pre-existing documents to be classified, high-quality automated or semi-automated classification methods are most welcome in this domain.
+This model, based on a BERT deep neural network, was trained on more than 200,000 documents to perform that task and is used in a production environment via the Hugging Face inference endpoint.
+## Architecture
+![architecture](img/architecture.png)
+7331 Eurovoc labels
+## Usage
+```python
+from eurovoc import EurovocTagger
+model = EurovocTagger.from_pretrained("EuropeanParliament/eurovoc_en")
+```
+## Metrics
+### Eurlex57k Dataset
+| Metric     | Value    | Threshold Value |
+|------------|----------|-----------------|
+| Micro F1   | 0.7233   | 0.34            |
+| NDCG@3     | 0.8438   | -               |
+| NDCG@5     | 0.8079   | -               |
+| NDCG@10    | 0.833    | -               |
+These values are in line with the state of the art in the field, see the publication [Large Scale Legal Text Classification Using Transformer Models](https://arxiv.org/pdf/2010.12871.pdf).
+## Inference Endpoint
+Members of the [European Parliament HuggingFace Organisation](https://huggingface.co/EuropeanParliament) can access our inference endpoint.
+### Payload example
+```json
+{
+  "inputs": "The Union condemns the continuing grave human rights violations by the Myanmar armed forces, including torture, sexual and gender-based violence, the persecution of civil society actors, human rights defenders and journalists, and attacks on the civilian population, including ethnic and religious minorities. ",
+  "topk": 10,
+  "threshold": 0.16
+}
+```
+result:
+```python
+{'results': [{'label': 'international sanctions', 'score': 0.9994925260543823},
+             {'label': 'economic sanctions', 'score': 0.9991770386695862},
+             {'label': 'natural person', 'score': 0.9591936469078064},
+             {'label': 'EU restrictive measure', 'score': 0.8388392329216003},
+             {'label': 'legal person', 'score': 0.45630475878715515},
+             {'label': 'Burma/Myanmar', 'score': 0.43375277519226074}]}
+```
+Only six results are returned, because the score of the next label is less than the 0.16 threshold.
+Default values: topk = 5 and threshold = 0.16.
+## Author(s)
+Sébastien Campion <[email protected]>

eurovoc.py ADDED Viewed

	@@ -0,0 +1,212 @@

+import torch
+from torch.utils.data import Dataset, DataLoader
+import numpy as np
+import pytorch_lightning as pl
+import torch.nn as nn
+from transformers import BertTokenizerFast as BertTokenizer, AdamW, get_linear_schedule_with_warmup, AutoTokenizer, AutoModel
+from huggingface_hub import PyTorchModelHubMixin
+class EurovocDataset(Dataset):
+    def __init__(
+            self,
+            text: np.array,
+            labels: np.array,
+            tokenizer: BertTokenizer,
+            max_token_len: int = 128
+    ):
+        self.tokenizer = tokenizer
+        self.text = text
+        self.labels = labels
+        self.max_token_len = max_token_len
+    def __len__(self):
+        return len(self.labels)
+    def __getitem__(self, index: int):
+        text = self.text[index][0]
+        labels = self.labels[index]
+        encoding = self.tokenizer.encode_plus(
+            text,
+            add_special_tokens=True,
+            max_length=self.max_token_len,
+            return_token_type_ids=False,
+            padding="max_length",
+            truncation=True,
+            return_attention_mask=True,
+            return_tensors='pt',
+        )
+        return dict(
+            text=text,
+            input_ids=encoding["input_ids"].flatten(),
+            attention_mask=encoding["attention_mask"].flatten(),
+            labels=torch.FloatTensor(labels)
+        )
+class EuroVocLongTextDataset(Dataset):
+    def __splitter__(text, max_lenght):
+        l = text.split()
+        for i in range(0, len(l), max_lenght):
+            yield l[i:i + max_lenght]
+    def __init__(
+            self,
+            text: np.array,
+            labels: np.array,
+            tokenizer: BertTokenizer,
+            max_token_len: int = 128
+    ):
+        self.tokenizer = tokenizer
+        self.text = text
+        self.labels = labels
+        self.max_token_len = max_token_len
+        self.chunks_and_labels = [(c, l) for t, l in zip(self.text, self.labels) for c in self.__splitter__(t)]
+        self.encoding = self.tokenizer.batch_encode_plus(
+            [c for c, _ in self.chunks_and_labels],
+            add_special_tokens=True,
+            max_length=self.max_token_len,
+            return_token_type_ids=False,
+            padding="max_length",
+            truncation=True,
+            return_attention_mask=True,
+            return_tensors='pt',
+        )
+    def __len__(self):
+        return len(self.chunks_and_labels)
+    def __getitem__(self, index: int):
+        text, labels = self.chunks_and_labels[index]
+        return dict(
+            text=text,
+            input_ids=self.encoding[index]["input_ids"].flatten(),
+            attention_mask=self.encoding[index]["attention_mask"].flatten(),
+            labels=torch.FloatTensor(labels)
+        )
+class EurovocDataModule(pl.LightningDataModule):
+    def __init__(self, bert_model_name, x_tr, y_tr, x_test, y_test, batch_size=8, max_token_len=512):
+        super().__init__()
+        self.batch_size = batch_size
+        self.x_tr = x_tr
+        self.y_tr = y_tr
+        self.x_test = x_test
+        self.y_test = y_test
+        self.tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
+        self.max_token_len = max_token_len
+    def setup(self, stage=None):
+        self.train_dataset = EurovocDataset(
+            self.x_tr,
+            self.y_tr,
+            self.tokenizer,
+            self.max_token_len
+        )
+        self.test_dataset = EurovocDataset(
+            self.x_test,
+            self.y_test,
+            self.tokenizer,
+            self.max_token_len
+        )
+    def train_dataloader(self):
+        return DataLoader(
+            self.train_dataset,
+            batch_size=self.batch_size,
+            shuffle=True,
+            num_workers=2
+        )
+    def val_dataloader(self):
+        return DataLoader(
+            self.test_dataset,
+            batch_size=self.batch_size,
+            num_workers=2
+        )
+    def test_dataloader(self):
+        return DataLoader(
+            self.test_dataset,
+            batch_size=self.batch_size,
+            num_workers=2
+        )
+class EurovocTagger(pl.LightningModule, PyTorchModelHubMixin):
+  def __init__(self, bert_model_name, n_classes, lr=2e-5, eps=1e-8):
+    super().__init__()
+    self.bert = AutoModel.from_pretrained(bert_model_name)
+    self.dropout = nn.Dropout(p=0.2)
+    self.classifier1 = nn.Linear(self.bert.config.hidden_size, n_classes)
+    self.criterion = nn.BCELoss()
+    self.lr = lr
+    self.eps = eps
+  def forward(self, input_ids, attention_mask, labels=None):
+    output = self.bert(input_ids, attention_mask=attention_mask)
+    output = self.dropout(output.pooler_output)
+    output = self.classifier1(output)
+    output = torch.sigmoid(output)
+    loss = 0
+    if labels is not None:
+        loss = self.criterion(output, labels)
+    return loss, output
+  def training_step(self, batch, batch_idx):
+    input_ids = batch["input_ids"]
+    attention_mask = batch["attention_mask"]
+    labels = batch["labels"]
+    loss, outputs = self(input_ids, attention_mask, labels)
+    self.log("train_loss", loss, prog_bar=True, logger=True)
+    return {"loss": loss, "predictions": outputs, "labels": labels}
+  def validation_step(self, batch, batch_idx):
+    input_ids = batch["input_ids"]
+    attention_mask = batch["attention_mask"]
+    labels = batch["labels"]
+    loss, outputs = self(input_ids, attention_mask, labels)
+    self.log("val_loss", loss, prog_bar=True, logger=True)
+    return loss
+  def test_step(self, batch, batch_idx):
+    input_ids = batch["input_ids"]
+    attention_mask = batch["attention_mask"]
+    labels = batch["labels"]
+    loss, outputs = self(input_ids, attention_mask, labels)
+    self.log("test_loss", loss, prog_bar=True, logger=True)
+    return loss
+  def on_train_epoch_end(self,  *args, **kwargs):
+    return
+    #labels = []
+    #predictions = []
+    #for output in args['outputs']:
+    #  for out_labels in output["labels"].detach().cpu():
+    #    labels.append(out_labels)
+    #  for out_predictions in output["predictions"].detach().cpu():
+    #    predictions.append(out_predictions)
+    #labels = torch.stack(labels).int()
+    #predictions = torch.stack(predictions)
+    #for i, name in enumerate(mlb.classes_):
+    #  class_roc_auc = auroc(predictions[:, i], labels[:, i])
+    #  self.logger.experiment.add_scalar(f"{name}_roc_auc/Train", class_roc_auc, self.current_epoch)
+  def configure_optimizers(self):
+        return torch.optim.AdamW(self.parameters(), lr=self.lr, eps=self.eps)

handler.py ADDED Viewed

	@@ -0,0 +1,74 @@

+from typing import Dict, List, Any
+import numpy as np
+import pickle
+from sklearn.preprocessing import MultiLabelBinarizer
+from transformers import AutoTokenizer
+import torch
+from eurovoc import EurovocTagger
+# Pretrained checkpoint used for the tokenizer and passed to the model as ``bert_model_name``.
+BERT_MODEL_NAME = "nlpaueb/legal-bert-base-uncased"
+# Tokenizer ``max_length``; also the size, in characters, of the slices the input text is cut into.
+MAX_LEN = 512
+# Upper bound on the number of input characters that are scored (50 slices of MAX_LEN).
+TEXT_MAX_LEN = MAX_LEN * 50
+# Loaded once at import time and shared by every handler call.
+tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_NAME)
+class EndpointHandler:
+    mlb = MultiLabelBinarizer()
+    def __init__(self, path=""):
+        self.mlb = pickle.load(open(f"{path}/mlb.pickle", "rb"))
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.model = EurovocTagger.from_pretrained(path,
+                                                   bert_model_name=BERT_MODEL_NAME,
+                                                   n_classes=len(self.mlb.classes_),
+                                                   map_location=self.device)
+        self.model.eval()
+        self.model.freeze()
+    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
+        """
+       data args:
+            inputs (:obj: `str` | `PIL.Image` | `np.array`)
+            kwargs
+      Return:
+            A :obj:`list` | `dict`: will be serialized and returned
+        """
+        text = data.pop("inputs", data)
+        topk = data.pop("topk", 5)
+        threshold = data.pop("threshold", 0.16)
+        debug = data.pop("debug", False)
+        prediction = self.get_prediction(text)
+        results = [{"label": label, "score": float(score)} for label, score in
+                   zip(self.mlb.classes_, prediction[0].tolist())]
+        results = sorted(results, key=lambda x: x["score"], reverse=True)
+        results = [r for r in results if r["score"] > threshold]
+        results = results[:topk]
+        if debug:
+            return {"results": results, "values": prediction, "input": text}
+        else:
+            return {"results": results}
+    def get_prediction(self, text):
+        # split text into chunks of MAX_LEN and get average prediction for each chunk
+        chunks = [text[i:i + MAX_LEN] for i in range(0, min(len(text), TEXT_MAX_LEN), MAX_LEN)]
+        predictions = [self._get_prediction(chunk) for chunk in chunks]
+        predictions = np.array(predictions).mean(axis=0)
+        return predictions
+    def _get_prediction(self, text):
+        item = tokenizer.encode_plus(
+            text,
+            add_special_tokens=True,
+            max_length=MAX_LEN,
+            return_token_type_ids=False,
+            padding="max_length",
+            truncation=True,
+            return_attention_mask=True,
+            return_tensors='pt')
+        _, prediction = self.model(item["input_ids"], item["attention_mask"])
+        prediction = prediction.cpu().detach().numpy()
+        return prediction

img/architecture.png ADDED Viewed

mlb.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bb8822c2c0cee9ceeadab0afbb155106d7f55fafa58e5a16eac3280aaf9cc980
+size 128152

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:57719b9fd61bbe3141cfc0d38291404337dab436cc5be4ab257e88498e636e88
+size 458391285

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+aiohttp==3.8.5
+ipython==8.14.0
+pip-chill==1.0.3
+pytorch-lightning==2.0.5
+scikit-learn==1.3.0
+transformers==4.32.0

test_handler.py ADDED Viewed

	@@ -0,0 +1,18 @@

+from pprint import pprint
+from handler import EndpointHandler
+# init handler
+my_handler = EndpointHandler(path=".")
+# prepare sample payload
+payload = {"text": "EN Official Journal of the European Union LI 183/19 COUNCIL IMPLEMENTING REGULATION (EU) 2023/1497 of 20 July 2023 implementing Regulation (EU) No 401/2013 concerning restrictive measures in view of the situation in Myanmar/Burma THE COUNCIL OF THE EUROPEAN UNION, Having regard to the Treaty on the Functioning of the European Union, Having regard to Council Regulation (EU) No 401/2013 of 2 May 2013 concerning restrictive measures in view of the situation in Myanmar/Burma and repealing Regulation (EC) No 194/2008 (1), and in particular Article 4i thereof, Having regard to the proposal from the High Representative of the Union for Foreign Affairs and Security Policy, Whereas: (1) On 2 May 2013, the Council adopted Regulation (EU) No 401/2013. (2) On 31 January 2023, the High Representative of the Union for Foreign Affairs and Security Policy issued a declaration on behalf of the Union strongly condemning the overthrow of Myanmar’s democratically-elected government by the Myanmar armed forces in blatant violation of the will of the people as expressed in the general election of 8 November 2020. This illegitimate act reversed the country’s democratic transition and led to disastrous humanitarian, social, security, economic and human rights consequences. (3) The Union remains deeply concerned by the continuing escalation of violence and the evolution towards a protracted conflict with regional implications. The Union condemns the continuing grave human rights violations by the Myanmar armed forces, including torture, sexual and gender-based violence, the persecution of civil society actors, human rights defenders and journalists, and attacks on the civilian population, including ethnic and religious minorities. 
(4) In the absence of swift progress in the situation in Myanmar/Burma, the Union has expressed several times its readiness to adopt further restrictive measures against those responsible for undermining democracy and the rule of law and for the serious human rights violations taking place in that country. (5) In view of the continuing grave situation in Myanmar/Burma, six persons and one entity should be added to the list of natural and legal persons, entities and bodies subject to restrictive measures in Annex IV to Regulation (EU) No 401/2013. (6) Regulation (EU) No 401/2013 should therefore be amended accordingly, HAS ADOPTED THIS REGULATION: Article 1 Annex IV to Regulation (EU) No 401/2013 is amended as set out in the Annex to this Regulation. Article 2 This Regulation shall enter into force on the date of its publication in the Official Journal of the European Union. This Regulation shall be binding in its entirety and directly applicable in all Member States. Done at Brussels, 20 July 2023. For the Council The President J. BORRELL FONTELLES (1)  OJ L 121, 3. 5. 2013, p. 1. ANNEX Annex IV to Regulation (EU) No 401/2013 is amended as follows: (1) the following entries are added to the list headed ‘A. Natural persons referred to in Article 4a’:   Name Identifying information Reasons Date of listing ‘94. Aung Kyaw Min Nationality: Myanmar/Burma; Date of birth: circa 1958; Place of birth: Myanmar/Burma; Gender: male; Function: Member of State Administration Council Aung Kyaw Min has been a member of the State Administration Council (SAC) since 1 February 2023. He is also the former Chief-Minister of Rakhine State. SAC is led by Commander in Chief Min Aung Hlaing, who took over the legislative, executive and judicial powers of the State as of 1 February 2021, preventing the democratically-elected government from fulfilling its mandate. 
As member of the SAC, Aung Kyaw Min has been directly involved in and responsible for decision-making concerning state functions and is therefore responsible for undermining democracy and the rule of law in Myanmar/Burma. Additionally, the SAC has adopted decisions restricting the rights of freedom of expression, including access to information, and peaceful assembly. The military forces and authorities operating under the control of the SAC have committed serious human rights violations since 1 February 2021, killing civilian and unarmed protestors, and have restricted freedom of assembly and of expression. As a member of the SAC, Aung Kyaw Min is directly responsible for those repressive decisions and for serious human rights violations. 20. 7. 2023 95. Kyaw Swar Lin a. k. a Kyaw Swar Linn Nationality: Myanmar/Burma; Place of birth: Myanmar/Burma; Gender: male; Function: Quartermaster General of the Myanmar armed forces Lieutenant General Kyaw Swar Lin was been appointed as Quartermaster General in May 2020. It is the sixth highest position in the military of Myanmar/Burma. The Office of the Quartermaster General is a department under the jurisdiction of the Ministry of Defense and is involved in arms and military equipment procurement for the Myanmar Armed Forces. In addition, Kyaw Swar Lin runs the Myanmar Economic Corporation (MEC), which is one of the two major conglomerates and holding companies operated by the military, generating revenue for the Myanmar armed forces (Tatmadaw). As Quartermaster General, he forms part of the military regime which has seized power in a military coup and overthrown the legitimately elected leaders of Myanmar/Burma. Kyaw Swar Lin is therefore a natural person whose policies and activities undermine democracy and the rule of law in Myanmar/Burma, and who provides support for actions that threaten the peace, security and stability of Myanmar/Burma. 20. 7. 2023 96. Myint Kyaing a. k. a. 
U Myint Kyaing Nationality: Myanmar/Burma; Date of birth: 17. 4. 1957 Place of birth: Myanmar/Burma; Gender: male; Function: Union Minister of Immigration and Population Myint Kyaing has been the Union Minister for Immigration and Population since 19 August 2021. Before that, he was Union Minister of Labour following the coup of 1 February 2021. He is a member of the State Administration Council (SAC), led by Commander-in-Chief Min Aung Hlaing, which took over the legislative, executive and judicial powers of the State in a military coup on 1 February 2021. As a government Minister, he forms part of the military regime which has seized power in a military coup and overthrown the legitimately elected leaders of Myanmar/Burma. In his capacity as Union Minister, he carries out duties in support of military regime’s repressive immigration and population policy such as restrictions for citizens to travel within the country as well as the policy of the regime towards the minority of the Rohingya in violation of human rights. As Minister for Immigration and Population he also participates in preparations for the elections announced by the military in order to legitimise the illegal coup of February 2021. Myint Kyaing is therefore responsible for undermining democracy and the rule of law in Myanmar/Burma and for providing support for actions that threaten the peace, security and stability of Myanmar/Burma. 20. 7. 2023 97.",
+           "topk": 10,
+           "threshold": 0.16
+           }
+# test the handler
+payload_pred = my_handler(payload)
+pprint(payload_pred)

train.ipynb ADDED Viewed

	@@ -0,0 +1,935 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "a8b6caed",
+   "metadata": {},
+   "source": [
+    "# 🇪🇺 🏷️ Eurovoc Model Training Notebook"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "c4c73793",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pickle \n",
+    "import pandas as pd\n",
+    "from transformers import AutoTokenizer, AutoModel\n",
+    "\n",
+    "from datasets import list_datasets, load_dataset\n",
+    "\n",
+    "from sklearn.preprocessing import MultiLabelBinarizer\n",
+    "import torch\n",
+    "\n",
+    "import pytorch_lightning as pl\n",
+    "from pytorch_lightning.callbacks import ModelCheckpoint"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "dc770f0b",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "---\n",
+    "\n",
+    "## 1. Data loading\n",
+    "### Choose our dataset, extracted from ep registry or eurlex57k"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "9fdc5328",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Found cached dataset json (/home/scampion/.cache/huggingface/datasets/EuropeanParliament___json/EuropeanParliament--cellar_eurovoc-3a27a019ebbf0296/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d5bf91bf9dc2416faefe96d680217da6",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "#dataset = load_dataset('json', data_files='ep_registry.jsonl')\n",
+    "\n",
+    "#dataset = load_dataset('eurlex')\n",
+    "dataset = load_dataset('EuropeanParliament/cellar_eurovoc')\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "94967fc2",
+   "metadata": {},
+   "source": [
+    "### Merge train, test and validation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "ce5f764f",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>title</th>\n",
+       "      <th>date</th>\n",
+       "      <th>eurovoc_concepts</th>\n",
+       "      <th>url</th>\n",
+       "      <th>lang</th>\n",
+       "      <th>formats</th>\n",
+       "      <th>text</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Corrigendum to Commission Implementing Regulat...</td>\n",
+       "      <td>2023-07-20</td>\n",
+       "      <td>[China, Malaysia, anti-dumping duty, business ...</td>\n",
+       "      <td>http://publications.europa.eu/resource/cellar/...</td>\n",
+       "      <td>eng</td>\n",
+       "      <td>[fmx4, pdfa2a, xhtml]</td>\n",
+       "      <td>L_2023183EN. 01005801. xml 20. 7. 2023    EN O...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Council Decision (CFSP) 2023/1501 of 20 July 2...</td>\n",
+       "      <td>2023-07-20</td>\n",
+       "      <td>[EU restrictive measure, Russia, Ukraine, econ...</td>\n",
+       "      <td>http://publications.europa.eu/resource/cellar/...</td>\n",
+       "      <td>eng</td>\n",
+       "      <td>[fmx4, pdfa2a, xhtml]</td>\n",
+       "      <td>LI2023183EN. 01004801. xml 20. 7. 2023    EN O...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Council Decision (CFSP) 2023/1502 of 20 July 2...</td>\n",
+       "      <td>2023-07-20</td>\n",
+       "      <td>[Burma/Myanmar, EU restrictive measure, econom...</td>\n",
+       "      <td>http://publications.europa.eu/resource/cellar/...</td>\n",
+       "      <td>eng</td>\n",
+       "      <td>[fmx4, pdfa2a, xhtml]</td>\n",
+       "      <td>LI2023183EN. 01005201. xml 20. 7. 2023    EN O...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>The Committee of the Regions welcomes Croatian...</td>\n",
+       "      <td>2023-07-20</td>\n",
+       "      <td>[Croatia, EU regional policy, European Committ...</td>\n",
+       "      <td>http://publications.europa.eu/resource/cellar/...</td>\n",
+       "      <td>eng</td>\n",
+       "      <td>[pdf]</td>\n",
+       "      <td>EUROPEAN UNION Committee of the Regions The Co...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Corrigendum to Commission Implementing Regulat...</td>\n",
+       "      <td>2023-07-20</td>\n",
+       "      <td>[India, Türkiye, anti-dumping duty, building m...</td>\n",
+       "      <td>http://publications.europa.eu/resource/cellar/...</td>\n",
+       "      <td>eng</td>\n",
+       "      <td>[fmx4, pdfa2a, xhtml]</td>\n",
+       "      <td>L_2023183EN. 01005901. xml 20. 7. 2023    EN O...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                               title       date  \\\n",
+       "0  Corrigendum to Commission Implementing Regulat... 2023-07-20   \n",
+       "1  Council Decision (CFSP) 2023/1501 of 20 July 2... 2023-07-20   \n",
+       "2  Council Decision (CFSP) 2023/1502 of 20 July 2... 2023-07-20   \n",
+       "3  The Committee of the Regions welcomes Croatian... 2023-07-20   \n",
+       "4  Corrigendum to Commission Implementing Regulat... 2023-07-20   \n",
+       "\n",
+       "                                    eurovoc_concepts  \\\n",
+       "0  [China, Malaysia, anti-dumping duty, business ...   \n",
+       "1  [EU restrictive measure, Russia, Ukraine, econ...   \n",
+       "2  [Burma/Myanmar, EU restrictive measure, econom...   \n",
+       "3  [Croatia, EU regional policy, European Committ...   \n",
+       "4  [India, Türkiye, anti-dumping duty, building m...   \n",
+       "\n",
+       "                                                 url lang  \\\n",
+       "0  http://publications.europa.eu/resource/cellar/...  eng   \n",
+       "1  http://publications.europa.eu/resource/cellar/...  eng   \n",
+       "2  http://publications.europa.eu/resource/cellar/...  eng   \n",
+       "3  http://publications.europa.eu/resource/cellar/...  eng   \n",
+       "4  http://publications.europa.eu/resource/cellar/...  eng   \n",
+       "\n",
+       "                 formats                                               text  \n",
+       "0  [fmx4, pdfa2a, xhtml]  L_2023183EN. 01005801. xml 20. 7. 2023    EN O...  \n",
+       "1  [fmx4, pdfa2a, xhtml]  LI2023183EN. 01004801. xml 20. 7. 2023    EN O...  \n",
+       "2  [fmx4, pdfa2a, xhtml]  LI2023183EN. 01005201. xml 20. 7. 2023    EN O...  \n",
+       "3                  [pdf]  EUROPEAN UNION Committee of the Regions The Co...  \n",
+       "4  [fmx4, pdfa2a, xhtml]  L_2023183EN. 01005901. xml 20. 7. 2023    EN O...  "
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "train = dataset['train'].to_pandas()\n",
+    "test = dataset['test'].to_pandas() if 'test' in dataset.keys() else None\n",
+    "validation = dataset['validation'].to_pandas() if 'validation' in dataset.keys() else None\n",
+    "\n",
+    "all = pd.concat([train, test, validation])#[:1000]\n",
+    "all.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "4c141dfa",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#all['eurovoc_concepts_str'] = all['eurovoc_concepts'].apply(str)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "aeca89c2",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "### Create the MultiLabel Binarizer and save it in a file for prediction"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "d6846099",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "('Number of classes', 6835)"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "mlb = MultiLabelBinarizer().fit(all['eurovoc_concepts'])\n",
+    "\n",
+    "pickle.dump(mlb, open('mlb.pickle', 'wb'))\n",
+    "\"Number of classes\", len(mlb.classes_)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1f27b865",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "---\n",
+    "## 2. Split data using iterative train/test split"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ba290237",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "#X = np.array(all['text'].to_list())\n",
+    "#X = np.expand_dims(X, axis=1)\n",
+    "X = all['text'].to_numpy()\n",
+    "X = np.expand_dims(X, axis=1)\n",
+    "y = mlb.transform(all['eurovoc_concepts'])\n",
+    "\n",
+    "\n",
+    "from skmultilearn.model_selection import iterative_train_test_split\n",
+    "x_tr, y_tr, x_test, y_test = iterative_train_test_split(X, y, test_size = 0.1)\n",
+    "x_tr, y_tr, x_val, y_val = iterative_train_test_split(x_tr, y_tr, test_size = 0.1)\n",
+    "len(x_tr), len(x_val), len(x_test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "98371ad3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Example: first 120 characters of one training text and its decoded labels\n",
+    "i = 10\n",
+    "x_tr[i][0][0:120], mlb.inverse_transform(np.expand_dims(y_tr[i], axis=1).T)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7c959b6a",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## 3. Model definition and training"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a177f1ce",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f4061399",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Some weights of the model checkpoint at nlpaueb/legal-bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight']\n",
+      "- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+      "- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
+      "GPU available: True (cuda), used: True\n",
+      "TPU available: False, using: 0 TPU cores\n",
+      "IPU available: False, using: 0 IPUs\n",
+      "HPU available: False, using: 0 HPUs\n",
+      "You are using a CUDA device ('NVIDIA GeForce RTX 3080') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision\n",
+      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]\n",
+      "\n",
+      "  | Name        | Type      | Params\n",
+      "------------------------------------------\n",
+      "0 | bert        | BertModel | 109 M \n",
+      "1 | dropout     | Dropout   | 0     \n",
+      "2 | classifier1 | Linear    | 5.1 M \n",
+      "3 | criterion   | BCELoss   | 0     \n",
+      "------------------------------------------\n",
+      "114 M     Trainable params\n",
+      "0         Non-trainable params\n",
+      "114 M     Total params\n",
+      "458.304   Total estimated model params size (MB)\n",
+      "IOPub message rate exceeded.\n",
+      "The Jupyter server will temporarily stop sending output\n",
+      "to the client in order to avoid crashing it.\n",
+      "To change this limit, set the config variable\n",
+      "`--ServerApp.iopub_msg_rate_limit`.\n",
+      "\n",
+      "Current values:\n",
+      "ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n",
+      "ServerApp.rate_limit_window=3.0 (secs)\n",
+      "\n",
+      "IOPub message rate exceeded.\n",
+      "The Jupyter server will temporarily stop sending output\n",
+      "to the client in order to avoid crashing it.\n",
+      "To change this limit, set the config variable\n",
+      "`--ServerApp.iopub_msg_rate_limit`.\n",
+      "\n",
+      "Current values:\n",
+      "ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n",
+      "ServerApp.rate_limit_window=3.0 (secs)\n",
+      "\n",
+      "IOPub message rate exceeded.\n",
+      "The Jupyter server will temporarily stop sending output\n",
+      "to the client in order to avoid crashing it.\n",
+      "To change this limit, set the config variable\n",
+      "`--ServerApp.iopub_msg_rate_limit`.\n",
+      "\n",
+      "Current values:\n",
+      "ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n",
+      "ServerApp.rate_limit_window=3.0 (secs)\n",
+      "\n",
+      "IOPub message rate exceeded.\n",
+      "The Jupyter server will temporarily stop sending output\n",
+      "to the client in order to avoid crashing it.\n",
+      "To change this limit, set the config variable\n",
+      "`--ServerApp.iopub_msg_rate_limit`.\n",
+      "\n",
+      "Current values:\n",
+      "ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n",
+      "ServerApp.rate_limit_window=3.0 (secs)\n",
+      "\n",
+      "IOPub message rate exceeded.\n",
+      "The Jupyter server will temporarily stop sending output\n",
+      "to the client in order to avoid crashing it.\n",
+      "To change this limit, set the config variable\n",
+      "`--ServerApp.iopub_msg_rate_limit`.\n",
+      "\n",
+      "Current values:\n",
+      "ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n",
+      "ServerApp.rate_limit_window=3.0 (secs)\n",
+      "\n",
+      "IOPub message rate exceeded.\n",
+      "The Jupyter server will temporarily stop sending output\n",
+      "to the client in order to avoid crashing it.\n",
+      "To change this limit, set the config variable\n",
+      "`--ServerApp.iopub_msg_rate_limit`.\n",
+      "\n",
+      "Current values:\n",
+      "ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n",
+      "ServerApp.rate_limit_window=3.0 (secs)\n",
+      "\n",
+      "IOPub message rate exceeded.\n",
+      "The Jupyter server will temporarily stop sending output\n",
+      "to the client in order to avoid crashing it.\n",
+      "To change this limit, set the config variable\n",
+      "`--ServerApp.iopub_msg_rate_limit`.\n",
+      "\n",
+      "Current values:\n",
+      "ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n",
+      "ServerApp.rate_limit_window=3.0 (secs)\n",
+      "\n",
+      "IOPub message rate exceeded.\n",
+      "The Jupyter server will temporarily stop sending output\n",
+      "to the client in order to avoid crashing it.\n",
+      "To change this limit, set the config variable\n",
+      "`--ServerApp.iopub_msg_rate_limit`.\n",
+      "\n",
+      "Current values:\n",
+      "ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n",
+      "ServerApp.rate_limit_window=3.0 (secs)\n",
+      "\n",
+      "IOPub message rate exceeded.\n",
+      "The Jupyter server will temporarily stop sending output\n",
+      "to the client in order to avoid crashing it.\n",
+      "To change this limit, set the config variable\n",
+      "`--ServerApp.iopub_msg_rate_limit`.\n",
+      "\n",
+      "Current values:\n",
+      "ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n",
+      "ServerApp.rate_limit_window=3.0 (secs)\n",
+      "\n",
+      "IOPub message rate exceeded.\n",
+      "The Jupyter server will temporarily stop sending output\n",
+      "to the client in order to avoid crashing it.\n",
+      "To change this limit, set the config variable\n",
+      "`--ServerApp.iopub_msg_rate_limit`.\n",
+      "\n",
+      "Current values:\n",
+      "ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n",
+      "ServerApp.rate_limit_window=3.0 (secs)\n",
+      "\n",
+      "IOPub message rate exceeded.\n",
+      "The Jupyter server will temporarily stop sending output\n",
+      "to the client in order to avoid crashing it.\n",
+      "To change this limit, set the config variable\n",
+      "`--ServerApp.iopub_msg_rate_limit`.\n",
+      "\n",
+      "Current values:\n",
+      "ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n",
+      "ServerApp.rate_limit_window=3.0 (secs)\n",
+      "\n",
+      "IOPub message rate exceeded.\n",
+      "The Jupyter server will temporarily stop sending output\n",
+      "to the client in order to avoid crashing it.\n",
+      "To change this limit, set the config variable\n",
+      "`--ServerApp.iopub_msg_rate_limit`.\n",
+      "\n",
+      "Current values:\n",
+      "ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n",
+      "ServerApp.rate_limit_window=3.0 (secs)\n",
+      "\n",
+      "IOPub message rate exceeded.\n",
+      "The Jupyter server will temporarily stop sending output\n",
+      "to the client in order to avoid crashing it.\n",
+      "To change this limit, set the config variable\n",
+      "`--ServerApp.iopub_msg_rate_limit`.\n",
+      "\n",
+      "Current values:\n",
+      "ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n",
+      "ServerApp.rate_limit_window=3.0 (secs)\n",
+      "\n",
+      "IOPub message rate exceeded.\n",
+      "The Jupyter server will temporarily stop sending output\n",
+      "to the client in order to avoid crashing it.\n",
+      "To change this limit, set the config variable\n",
+      "`--ServerApp.iopub_msg_rate_limit`.\n",
+      "\n",
+      "Current values:\n",
+      "ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n",
+      "ServerApp.rate_limit_window=3.0 (secs)\n",
+      "\n",
+      "IOPub message rate exceeded.\n",
+      "The Jupyter server will temporarily stop sending output\n",
+      "to the client in order to avoid crashing it.\n",
+      "To change this limit, set the config variable\n",
+      "`--ServerApp.iopub_msg_rate_limit`.\n",
+      "\n",
+      "Current values:\n",
+      "ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n",
+      "ServerApp.rate_limit_window=3.0 (secs)\n",
+      "\n",
+      "IOPub message rate exceeded.\n",
+      "The Jupyter server will temporarily stop sending output\n",
+      "to the client in order to avoid crashing it.\n",
+      "To change this limit, set the config variable\n",
+      "`--ServerApp.iopub_msg_rate_limit`.\n",
+      "\n",
+      "Current values:\n",
+      "ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n",
+      "ServerApp.rate_limit_window=3.0 (secs)\n",
+      "\n",
+      "IOPub message rate exceeded.\n",
+      "The Jupyter server will temporarily stop sending output\n",
+      "to the client in order to avoid crashing it.\n",
+      "To change this limit, set the config variable\n",
+      "`--ServerApp.iopub_msg_rate_limit`.\n",
+      "\n",
+      "Current values:\n",
+      "ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n",
+      "ServerApp.rate_limit_window=3.0 (secs)\n",
+      "\n",
+      "IOPub message rate exceeded.\n",
+      "The Jupyter server will temporarily stop sending output\n",
+      "to the client in order to avoid crashing it.\n",
+      "To change this limit, set the config variable\n",
+      "`--ServerApp.iopub_msg_rate_limit`.\n",
+      "\n",
+      "Current values:\n",
+      "ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n",
+      "ServerApp.rate_limit_window=3.0 (secs)\n",
+      "\n",
+      "IOPub message rate exceeded.\n",
+      "The Jupyter server will temporarily stop sending output\n",
+      "to the client in order to avoid crashing it.\n",
+      "To change this limit, set the config variable\n",
+      "`--ServerApp.iopub_msg_rate_limit`.\n",
+      "\n",
+      "Current values:\n",
+      "ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n",
+      "ServerApp.rate_limit_window=3.0 (secs)\n",
+      "\n",
+      "IOPub message rate exceeded.\n",
+      "The Jupyter server will temporarily stop sending output\n",
+      "to the client in order to avoid crashing it.\n",
+      "To change this limit, set the config variable\n",
+      "`--ServerApp.iopub_msg_rate_limit`.\n",
+      "\n",
+      "Current values:\n",
+      "ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n",
+      "ServerApp.rate_limit_window=3.0 (secs)\n",
+      "\n",
+      "IOPub message rate exceeded.\n",
+      "The Jupyter server will temporarily stop sending output\n",
+      "to the client in order to avoid crashing it.\n",
+      "To change this limit, set the config variable\n",
+      "`--ServerApp.iopub_msg_rate_limit`.\n",
+      "\n",
+      "Current values:\n",
+      "ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n",
+      "ServerApp.rate_limit_window=3.0 (secs)\n",
+      "\n",
+      "IOPub message rate exceeded.\n",
+      "The Jupyter server will temporarily stop sending output\n",
+      "to the client in order to avoid crashing it.\n",
+      "To change this limit, set the config variable\n",
+      "`--ServerApp.iopub_msg_rate_limit`.\n",
+      "\n",
+      "Current values:\n",
+      "ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n",
+      "ServerApp.rate_limit_window=3.0 (secs)\n",
+      "\n",
+      "IOPub message rate exceeded.\n",
+      "The Jupyter server will temporarily stop sending output\n",
+      "to the client in order to avoid crashing it.\n",
+      "To change this limit, set the config variable\n",
+      "`--ServerApp.iopub_msg_rate_limit`.\n",
+      "\n",
+      "Current values:\n",
+      "ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n",
+      "ServerApp.rate_limit_window=3.0 (secs)\n",
+      "\n",
+      "IOPub message rate exceeded.\n",
+      "The Jupyter server will temporarily stop sending output\n",
+      "to the client in order to avoid crashing it.\n",
+      "To change this limit, set the config variable\n",
+      "`--ServerApp.iopub_msg_rate_limit`.\n",
+      "\n",
+      "Current values:\n",
+      "ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n",
+      "ServerApp.rate_limit_window=3.0 (secs)\n",
+      "\n",
+      "IOPub message rate exceeded.\n",
+      "The Jupyter server will temporarily stop sending output\n",
+      "to the client in order to avoid crashing it.\n",
+      "To change this limit, set the config variable\n",
+      "`--ServerApp.iopub_msg_rate_limit`.\n",
+      "\n",
+      "Current values:\n",
+      "ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n",
+      "ServerApp.rate_limit_window=3.0 (secs)\n",
+      "\n",
+      "IOPub message rate exceeded.\n",
+      "The Jupyter server will temporarily stop sending output\n",
+      "to the client in order to avoid crashing it.\n",
+      "To change this limit, set the config variable\n",
+      "`--ServerApp.iopub_msg_rate_limit`.\n",
+      "\n",
+      "Current values:\n",
+      "ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n",
+      "ServerApp.rate_limit_window=3.0 (secs)\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%capture output\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2\n",
+    "\n",
+    "from eurovoc import EurovocTagger, EurovocDataset, EurovocDataModule\n",
+    "\n",
+    "\n",
+    "BERT_MODEL_NAME = \"nlpaueb/legal-bert-base-uncased\"\n",
+    "N_EPOCHS = 30\n",
+    "BATCH_SIZE = 10\n",
+    "MAX_LEN = 512\n",
+    "LR = 5e-05\n",
+    "\n",
+    "\n",
+    "# Instantiate and set up the data_module\n",
+    "dataloader = EurovocDataModule(BERT_MODEL_NAME, x_tr, y_tr, x_val, y_val , BATCH_SIZE, MAX_LEN)\n",
+    "dataloader.setup()\n",
+    "\n",
+    "\n",
+    "model = EurovocTagger(BERT_MODEL_NAME, len(mlb.classes_), lr=LR)\n",
+    "\n",
+    "checkpoint_callback = ModelCheckpoint(\n",
+    "    monitor='val_loss',\n",
+    "    filename='EurovocTagger-{epoch:02d}-{val_loss:.2f}',\n",
+    "    mode='min',\n",
+    ")\n",
+    "\n",
+    "trainer = pl.Trainer(max_epochs=N_EPOCHS , accelerator=\"gpu\", devices=1, callbacks=[checkpoint_callback])#,strategy=\"ddp_notebook\")\n",
+    "trainer.fit(model, dataloader)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "19084e69",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "trainer.save_checkpoint(\"eurovoc_cellar.ckpt\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d8289db5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "output()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "7c250c40",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "np.save('x_test', x_test)\n",
+    "np.save('y_test', y_test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "418a7fd0",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/scampion/training/venv/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/checkpoint_connector.py:148: UserWarning: `.test(ckpt_path=None)` was called without a model. The best model of the previous `fit` call will be used. You can pass `.test(ckpt_path='best')` to use the best model or `.test(ckpt_path='last')` to use the last model. If you pass a value, this warning will be silenced.\n",
+      "  rank_zero_warn(\n",
+      "Restoring states from the checkpoint path at /home/scampion/training/lightning_logs/version_9/checkpoints/EurovocTagger-epoch=06-val_loss=0.00.ckpt\n",
+      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]\n",
+      "Loaded model weights from the checkpoint at /home/scampion/training/lightning_logs/version_9/checkpoints/EurovocTagger-epoch=06-val_loss=0.00.ckpt\n",
+      "/home/scampion/training/venv/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:432: PossibleUserWarning: The dataloader, test_dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 32 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.\n",
+      "  rank_zero_warn(\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "52fdd2fcc27744c4955dc449cc126100",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Testing: 0it [00:00, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓\n",
+       "┃<span style=\"font-weight: bold\">        Test metric        </span>┃<span style=\"font-weight: bold\">       DataLoader 0        </span>┃\n",
+       "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩\n",
+       "│<span style=\"color: #008080; text-decoration-color: #008080\">         test_loss         </span>│<span style=\"color: #800080; text-decoration-color: #800080\">   0.0031269278842955828   </span>│\n",
+       "└───────────────────────────┴───────────────────────────┘\n",
+       "</pre>\n"
+      ],
+      "text/plain": [
+       "┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓\n",
+       "┃\u001b[1m \u001b[0m\u001b[1m       Test metric       \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m      DataLoader 0       \u001b[0m\u001b[1m \u001b[0m┃\n",
+       "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩\n",
+       "│\u001b[36m \u001b[0m\u001b[36m        test_loss        \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m  0.0031269278842955828  \u001b[0m\u001b[35m \u001b[0m│\n",
+       "└───────────────────────────┴───────────────────────────┘\n"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "[{'test_loss': 0.0031269278842955828}]"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "trainer.test(dataloaders=dataloader)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "66b871ec",
+   "metadata": {},
+   "source": [
+    "## 4. Evaluation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "ba317c3e",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'/home/scampion/training/lightning_logs/version_9/checkpoints/EurovocTagger-epoch=06-val_loss=0.00.ckpt'"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "best_model_path = trainer.checkpoint_callback.best_model_path\n",
+    "best_model_path"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "fe9751a1",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Some weights of the model checkpoint at nlpaueb/legal-bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight']\n",
+      "- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+      "- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
+      "100%|██████████| 23243/23243 [16:20<00:00, 23.72it/s]  \n"
+     ]
+    }
+   ],
+   "source": [
+    "from tqdm import tqdm\n",
+    "from transformers import AutoTokenizer\n",
+    "\n",
+    "trained_model = EurovocTagger.load_from_checkpoint(best_model_path,\n",
+    "                                                   bert_model_name=BERT_MODEL_NAME,\n",
+    "                                                   n_classes=len(mlb.classes_))\n",
+    "trained_model.eval()\n",
+    "trained_model.freeze()\n",
+    "\n",
+    "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
+    "trained_model = trained_model.to(device)\n",
+    "\n",
+    "tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_NAME)\n",
+    "\n",
+    "val_dataset = EurovocDataset(x_test, y_test, tokenizer, max_token_len=MAX_LEN)\n",
+    "predictions = []\n",
+    "labels = []\n",
+    "\n",
+    "for item in tqdm(val_dataset):\n",
+    "  _, prediction = trained_model(\n",
+    "    item[\"input_ids\"].unsqueeze(dim=0).to(device), \n",
+    "    item[\"attention_mask\"].unsqueeze(dim=0).to(device)\n",
+    "  )\n",
+    "  predictions.append(prediction.flatten())\n",
+    "  labels.append(item[\"labels\"].int())\n",
+    "\n",
+    "predictions = torch.stack(predictions).detach().cpu()\n",
+    "labels = torch.stack(labels).detach().cpu()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "67477f7f",
+   "metadata": {},
+   "source": [
+    "### F1 Score"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "f0265f6e",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0.01 tensor(0.2188)\n",
+      "0.06 tensor(0.3929)\n",
+      "0.11 tensor(0.4353)\n",
+      "0.16 tensor(0.4462)\n",
+      "0.21 tensor(0.4437)\n",
+      "0.26 tensor(0.4364)\n",
+      "0.31 tensor(0.4249)\n",
+      "0.36 tensor(0.4106)\n",
+      "0.41 tensor(0.3947)\n",
+      "0.46 tensor(0.3780)\n",
+      "0.51 tensor(0.3597)\n",
+      "0.56 tensor(0.3404)\n",
+      "0.61 tensor(0.3209)\n",
+      "0.66 tensor(0.3007)\n"
+     ]
+    }
+   ],
+   "source": [
+    "from torchmetrics import F1Score\n",
+    "for i in range(1, 70, 5):\n",
+    "    f1 = F1Score(task=\"multilabel\", num_labels=len(mlb.classes_),  average='weighted', threshold= i / 100.0)\n",
+    "    print(i / 100.0, f1(predictions, labels))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0945ad49",
+   "metadata": {},
+   "source": [
+    "### NDCG Score"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e4e3291f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.metrics import ndcg_score\n",
+    "def calculate_average_ndcg(predictions, labels, top_k=5):\n",
+    "    # Initialize a list to store NDCG scores for each sample\n",
+    "    ndcg_scores = []\n",
+    "\n",
+    "    # Calculate NDCG for each sample\n",
+    "    for i in range(len(predictions)):\n",
+    "        # Convert tensors to numpy arrays\n",
+    "        y_true = labels[i].cpu().numpy().reshape(1, -1)\n",
+    "        y_score = predictions[i].cpu().numpy().reshape(1, -1)\n",
+    "        \n",
+    "        # Calculate NDCG for the sample\n",
+    "        ndcg = ndcg_score(y_true, y_score, k=top_k)\n",
+    "        ndcg_scores.append(ndcg)\n",
+    "\n",
+    "    # Calculate the average NDCG score\n",
+    "    average_ndcg = np.mean(ndcg_scores)\n",
+    "    \n",
+    "    return average_ndcg\n",
+    "\n",
+    "for k in [3, 5, 10]:\n",
+    "    average = calculate_average_ndcg(predictions, labels, top_k=k)\n",
+    "    print(\"NDCG@\"+str(k)+\": \"+ str(round(average, 4)))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "eurovoc-env",
+   "language": "python",
+   "name": "eurovoc-env"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}