BecomeAllan committed · Commit 8bf76cf
Parent(s): 9c0c4aa
update funs

Files changed:
- .vscode/settings.json  +7 -0
- ML_SLRC.py             +382 -44
- Util_funs.py           +305 -418
.vscode/settings.json
ADDED
@@ -0,0 +1,7 @@
+{
+    "workbench.colorCustomizations": {
+        "activityBar.background": "#093518",
+        "titleBar.activeBackground": "#0D4A21",
+        "titleBar.activeForeground": "#F3FDF6"
+    }
+}
ML_SLRC.py
CHANGED
@@ -1,33 +1,18 @@
-
-import torch.nn as nn
-import math
 import torch
 import numpy as np
-
-import time
-import transformers
-from transformers import AutoTokenizer, AutoModelForSequenceClassification
-from sklearn.manifold import TSNE
-from copy import deepcopy, copy
-import seaborn as sns
-import matplotlib.pylab as plt
-from pprint import pprint
-import shutil
-import datetime
 import re
-import json
-from pathlib import Path
-
-import torch
-import torch.nn as nn
-from torch.utils.data import Dataset, DataLoader
 import unicodedata
-import
-
 import torch
-import
-from
-
 
 
 # Pre-trained model
@@ -117,7 +102,6 @@ class SLR_Classifier(nn.Module):
 
         return [loss, [feature, logit], predict]
 
-
 # Undesirable patterns within texts
 patterns = {
     'CONCLUSIONS AND IMPLICATIONS':'',
@@ -157,27 +141,50 @@ patterns = {
     '</p>':'',
     '<<ETX>>':'',
     '+/-':'',
 }
 
 patterns = {x.lower():y for x,y in patterns.items()}
 
-
 
 class SLR_DataSet(Dataset):
-    def __init__(self, **args):
         self.tokenizer = args.get('tokenizer')
         self.data = args.get('data')
         self.max_seq_length = args.get("max_seq_length", 512)
         self.INPUT_NAME = args.get("input", 'x')
         self.LABEL_NAME = args.get("output", 'y')
 
     # Tokenizing and processing text
     def encode_text(self, example):
         comment_text = example[self.INPUT_NAME]
-
 
         try:
-            labels = LABEL_MAP[example[self.LABEL_NAME]]
         except:
             labels = -1
 
@@ -200,15 +207,6 @@ class SLR_DataSet(Dataset):
             torch.tensor([torch.tensor(labels).to(int)])
         ))
 
-    # Text processing function
-    def treat_text(self, text):
-        text = unicodedata.normalize("NFKD",str(text))
-        text = multiple_replace(patterns,text.lower())
-        text = re.sub('(\(.+\))|(\[.+\])|( \d )|(<)|(>)|(- )','', text)
-        text = re.sub('( +)',' ', text)
-        text = re.sub('(, ,)|(,,)',',', text)
-        text = re.sub('(%)|(per cent)',' percent', text)
-        return text
 
     def __len__(self):
         return len(self.data)
@@ -221,6 +219,350 @@ class SLR_DataSet(Dataset):
         return temp_data
 
 
 
 # Regex multiple replace function
 def multiple_replace(dict, text):
@@ -229,8 +571,4 @@ def multiple_replace(dict, text):
     regex = re.compile("(%s)" % "|".join(map(re.escape, dict.keys())))
 
     # Substitution
-    return regex.sub(lambda mo: dict[mo.string[mo.start():mo.end()]], text)
-
-# Undesirable patterns within texts
-
-
+from torch import nn
 import torch
 import numpy as np
+from copy import deepcopy
 import re
 import unicodedata
+from torch.utils.data import Dataset, DataLoader,TensorDataset, RandomSampler
+from sklearn.model_selection import train_test_split
+from torch.optim import Adam
+from copy import deepcopy
+import gc
 import torch
+import numpy as np
+from torchmetrics import functional as fn
+import random
 
 
 # Pre-trained model
 
         return [loss, [feature, logit], predict]
 
 # Undesirable patterns within texts
 patterns = {
     'CONCLUSIONS AND IMPLICATIONS':'',
     '</p>':'',
     '<<ETX>>':'',
     '+/-':'',
+    '\(.+\)':'',
+    '\[.+\]':'',
+    ' \d ':'',
+    '<':'',
+    '>':'',
+    '- ':'',
+    ' +':' ',
+    ', ,':',',
+    ',,':',',
+    '%':' percent',
+    'per cent':' percent'
 }
 
 patterns = {x.lower():y for x,y in patterns.items()}
 
+
+LABEL_MAP = {'negative': 0,
+             'not included':0,
+             '0':0,
+             0:0,
+             'excluded':0,
+             'positive': 1,
+             'included':1,
+             '1':1,
+             1:1,
+             }
 
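A small illustration of how the LABEL_MAP added above collapses the mixed label spellings to the two classes; the sample values below are invented for illustration only.

# Minimal sketch: LABEL_MAP normalizes label spellings to {0, 1}.
# String labels are lower-cased first, mirroring SLR_DataSet.encode_text below.
for raw in ['included', 'EXCLUDED', 1, '0']:
    key = raw.lower() if isinstance(raw, str) else raw
    print(raw, '->', LABEL_MAP[key])
# included -> 1, EXCLUDED -> 0, 1 -> 1, 0 -> 0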
 class SLR_DataSet(Dataset):
+    def __init__(self,treat_text =None, **args):
         self.tokenizer = args.get('tokenizer')
         self.data = args.get('data')
         self.max_seq_length = args.get("max_seq_length", 512)
         self.INPUT_NAME = args.get("input", 'x')
         self.LABEL_NAME = args.get("output", 'y')
+        self.treat_text = treat_text
 
     # Tokenizing and processing text
     def encode_text(self, example):
         comment_text = example[self.INPUT_NAME]
+        if self.treat_text:
+            comment_text = self.treat_text(comment_text)
 
         try:
+            labels = LABEL_MAP[example[self.LABEL_NAME].lower()]
         except:
             labels = -1
 
             torch.tensor([torch.tensor(labels).to(int)])
         ))
 
     def __len__(self):
         return len(self.data)
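A minimal sketch of how the updated SLR_DataSet might be instantiated and batched; the tokenizer checkpoint, the toy DataFrame and its contents are assumptions for illustration, not values fixed by this commit.

# Hypothetical usage sketch (checkpoint and data are placeholders).
import pandas as pd
from transformers import AutoTokenizer
from torch.utils.data import DataLoader

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")   # assumed backbone
df = pd.DataFrame({"text": ["a systematic review abstract", "an unrelated paper"],
                   "label": ["included", "excluded"]})

dataset = SLR_DataSet(data=df, input="text", output="label",
                      tokenizer=tokenizer, max_seq_length=128,
                      treat_text=None)
loader = DataLoader(dataset, batch_size=2, shuffle=True)
input_ids, attention_mask, token_type_ids, label_ids = next(iter(loader))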
         return temp_data
 
 
+class Learner(nn.Module):
+
+    def __init__(self, **args):
+        """
+        :param args:
+        """
+        super(Learner, self).__init__()
+
+        self.inner_print = args.get('inner_print')
+        self.inner_batch_size = args.get('inner_batch_size')
+        self.outer_update_lr = args.get('outer_update_lr')
+        self.inner_update_lr = args.get('inner_update_lr')
+        self.inner_update_step = args.get('inner_update_step')
+        self.inner_update_step_eval = args.get('inner_update_step_eval')
+        self.model = args.get('model')
+        self.device = args.get('device')
+
+        # Outer optimizer
+        self.outer_optimizer = Adam(self.model.parameters(), lr=self.outer_update_lr)
+        self.model.train()
+
+    def forward(self, batch_tasks, training = True, valid_train = True):
+        """
+        batch = [(support TensorDataset, query TensorDataset),
+                 (support TensorDataset, query TensorDataset),
+                 (support TensorDataset, query TensorDataset),
+                 (support TensorDataset, query TensorDataset)]
+
+        # support = TensorDataset(all_input_ids, all_attention_mask, all_segment_ids, all_label_ids)
+        """
+        task_accs = []
+        task_f1 = []
+        task_recall = []
+        sum_gradients = []
+        num_task = len(batch_tasks)
+        num_inner_update_step = self.inner_update_step if training else self.inner_update_step_eval
+
+        # Outer loop tasks
+        for task_id, task in enumerate(batch_tasks):
+            support = task[0]
+            query = task[1]
+            name = task[2]
+
+            # Copying model
+            fast_model = deepcopy(self.model)
+            fast_model.to(self.device)
+
+            # Inner trainer optimizer
+            inner_optimizer = Adam(fast_model.parameters(), lr=self.inner_update_lr)
+
+            # Creating training data loaders
+            if len(support) % self.inner_batch_size == 1 :
+                support_dataloader = DataLoader(support, sampler=RandomSampler(support),
+                                                batch_size=self.inner_batch_size,
+                                                drop_last=True)
+            else:
+                support_dataloader = DataLoader(support, sampler=RandomSampler(support),
+                                                batch_size=self.inner_batch_size,
+                                                drop_last=False)
+
+            # steps_per_epoch=len(support) // self.inner_batch_size
+            # total_training_steps = steps_per_epoch * 5
+            # warmup_steps = total_training_steps // 3
+            #
+
+            # scheduler = get_linear_schedule_with_warmup(
+            #     inner_optimizer,
+            #     num_warmup_steps=warmup_steps,
+            #     num_training_steps=total_training_steps
+            # )
+
+            fast_model.train()
+
+            # Inner loop training epoch (support set)
+            if valid_train:
+                print('----Task',task_id,":", name, '----')
+
+            for i in range(0, num_inner_update_step):
+                all_loss = []
+
+                # Inner loop training batch (support set)
+                for inner_step, batch in enumerate(support_dataloader):
+                    batch = tuple(t.to(self.device) for t in batch)
+                    input_ids, attention_mask, token_type_ids, label_id = batch
+
+                    # Feed Foward
+                    loss, _, _ = fast_model(input_ids, attention_mask, token_type_ids=token_type_ids, labels = label_id)
+
+                    # Computing gradients
+                    loss.backward()
+                    # torch.nn.utils.clip_grad_norm_(fast_model.parameters(), max_norm=1)
+
+                    # Updating inner training parameters
+                    inner_optimizer.step()
+                    inner_optimizer.zero_grad()
+
+                    # Appending losses
+                    all_loss.append(loss.item())
+
+                    del batch, input_ids, attention_mask, label_id
+                    torch.cuda.empty_cache()
+
+                if valid_train:
+                    if (i+1) % self.inner_print == 0:
+                        print("Inner Loss: ", np.mean(all_loss))
+
+            fast_model.to(torch.device('cpu'))
+
+            # Inner training phase weights
+            if training:
+                meta_weights = list(self.model.parameters())
+                fast_weights = list(fast_model.parameters())
+
+                # Appending gradients
+                gradients = []
+                for i, (meta_params, fast_params) in enumerate(zip(meta_weights, fast_weights)):
+                    gradient = meta_params - fast_params
+                    if task_id == 0:
+                        sum_gradients.append(gradient)
+                    else:
+                        sum_gradients[i] += gradient
+
+
+            # Inner test (query set)
+            fast_model.to(self.device)
+            fast_model.eval()
+
+            if valid_train:
+                # Inner test (query set)
+                fast_model.to(self.device)
+                fast_model.eval()
+
+                with torch.no_grad():
+                    # Data loader
+                    query_dataloader = DataLoader(query, sampler=None, batch_size=len(query))
+                    query_batch = iter(query_dataloader).next()
+                    query_batch = tuple(t.to(self.device) for t in query_batch)
+                    q_input_ids, q_attention_mask, q_token_type_ids, q_label_id = query_batch
+
+                    # Feedfoward
+                    _, _, pre_label_id = fast_model(q_input_ids, q_attention_mask, q_token_type_ids, labels = q_label_id)
+
+                    # Predictions
+                    pre_label_id = pre_label_id.detach().cpu().squeeze()
+                    # Labels
+                    q_label_id = q_label_id.detach().cpu()
+
+                    # Calculating metrics
+                    acc = fn.accuracy(pre_label_id, q_label_id).item()
+                    recall = fn.recall(pre_label_id, q_label_id).item(),
+                    f1 = fn.f1_score(pre_label_id, q_label_id).item()
+
+                    # appending metrics
+                    task_accs.append(acc)
+                    task_f1.append(f1)
+                    task_recall.append(recall)
+
+                fast_model.to(torch.device('cpu'))
+
+            del fast_model, inner_optimizer
+            torch.cuda.empty_cache()
+
+        print("\n")
+        print("f1:",np.mean(task_f1))
+        print("recall:",np.mean(task_recall))
+
+        # Updating outer training parameters
+        if training:
+            # Mean of gradients
+            for i in range(0,len(sum_gradients)):
+                sum_gradients[i] = sum_gradients[i] / float(num_task)
+
+            # Indexing parameters to model
+            for i, params in enumerate(self.model.parameters()):
+                params.grad = sum_gradients[i]
+
+            # Updating parameters
+            self.outer_optimizer.step()
+            self.outer_optimizer.zero_grad()
+
+            del sum_gradients
+            gc.collect()
+            torch.cuda.empty_cache()
+
+        if valid_train:
+            return np.mean(task_accs)
+        else:
+            return np.array(0)
+
+
+
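The outer update above hands Adam the per-task parameter difference (meta weights minus task-adapted weights) as a gradient, i.e. a first-order, Reptile-style meta step. A plain-SGD sketch of the same idea follows; the function name and learning rate are assumptions, not taken from this commit.

# Reptile-style view of the outer step done in Learner.forward:
# stepping against (meta - fast) pulls the meta weights toward the adapted weights.
def reptile_outer_step(meta_model, fast_model, outer_lr=1e-4):
    with torch.no_grad():
        for meta_p, fast_p in zip(meta_model.parameters(), fast_model.parameters()):
            grad = meta_p - fast_p          # averaged over tasks in the real code
            meta_p.sub_(outer_lr * grad)    # meta <- meta - lr * (meta - fast)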
+# Creating Meta Tasks
+class MetaTask(Dataset):
+    def __init__(self, examples, num_task, k_support, k_query,
+                 tokenizer, training=True, max_seq_length=512,
+                 treat_text =None, **args):
+        """
+        :param samples: list of samples
+        :param num_task: number of training tasks.
+        :param k_support: number of classes support samples per task
+        :param k_query: number of classes query sample per task
+        """
+        self.examples = examples
+
+        self.num_task = num_task
+        self.k_support = k_support
+        self.k_query = k_query
+        self.tokenizer = tokenizer
+        self.max_seq_length = max_seq_length
+        self.treat_text = treat_text
+
+        # Randomly generating tasks
+        self.create_batch(self.num_task, training)
+
+    # Creating batch
+    def create_batch(self, num_task, training):
+        self.supports = []  # support set
+        self.queries = []  # query set
+        self.task_names = []  # Name of task
+        self.supports_indexs = []  # index of supports
+        self.queries_indexs = []  # index of queries
+        self.num_task=num_task
+
+        # Available tasks
+        domains = self.examples['domain'].unique()
+
+        # If not training, create all tasks
+        if not(training):
+            self.task_names = domains
+            num_task = len(self.task_names)
+            self.num_task=num_task
+
+
+        for b in range(num_task):  # For each task,
+            total_per_class = self.k_support + self.k_query
+            task_size = 2*self.k_support + 2*self.k_query
+
+            # Select a task at random
+            if training:
+                domain = random.choice(domains)
+                self.task_names.append(domain)
+            else:
+                domain = self.task_names[b]
+
+            # Task data
+            domainExamples = self.examples[self.examples['domain'] == domain]
+
+            # Minimal label quantity
+            min_per_class = min(domainExamples['label'].value_counts())
+
+            if total_per_class > min_per_class:
+                total_per_class = min_per_class
+
+            # Select k_support + k_query task examples
+            # Sample (n) from each label(class)
+            selected_examples = domainExamples.groupby("label").sample(total_per_class, replace = False)
+
+            # Split data into support (training) and query (testing) sets
+            s, q = train_test_split(selected_examples,
+                                    stratify= selected_examples["label"],
+                                    test_size= 2*self.k_query/task_size,
+                                    shuffle=True)
+
+            # Permutating data
+            s = s.sample(frac=1)
+            q = q.sample(frac=1)
+
+            # Appending indexes
+            if not(training):
+                self.supports_indexs.append(s.index)
+                self.queries_indexs.append(q.index)
+
+            # Creating list of support (training) and query (testing) tasks
+            self.supports.append(s.to_dict('records'))
+            self.queries.append(q.to_dict('records'))
+
+    # Creating task tensors
+    def create_feature_set(self, examples):
+        all_input_ids = torch.empty(len(examples), self.max_seq_length, dtype = torch.long)
+        all_attention_mask = torch.empty(len(examples), self.max_seq_length, dtype = torch.long)
+        all_token_type_ids = torch.empty(len(examples), self.max_seq_length, dtype = torch.long)
+        all_label_ids = torch.empty(len(examples), dtype = torch.long)
+
+        for _id, e in enumerate(examples):
+            all_input_ids[_id], all_attention_mask[_id], all_token_type_ids[_id], all_label_ids[_id] = self.encode_text(e)
+
+        return TensorDataset(
+            all_input_ids,
+            all_attention_mask,
+            all_token_type_ids,
+            all_label_ids
+        )
+
+    # Data encoding
+    def encode_text(self, example):
+        comment_text = example["text"]
+
+        if self.treat_text:
+            comment_text = self.treat_text(comment_text)
+
+        labels = LABEL_MAP[example["label"]]
+
+        encoding = self.tokenizer.encode_plus(
+            (comment_text, "It is a great text."),
+            add_special_tokens=True,
+            max_length=self.max_seq_length,
+            return_token_type_ids=True,
+            padding="max_length",
+            truncation=True,
+            return_attention_mask=True,
+            return_tensors='pt',
+        )
+
+        return tuple((
+            encoding["input_ids"].flatten(),
+            encoding["attention_mask"].flatten(),
+            encoding["token_type_ids"].flatten(),
+            torch.tensor([torch.tensor(labels).to(int)])
+        ))
+
+    # Returns data upon calling
+    def __getitem__(self, index):
+        support_set = self.create_feature_set(self.supports[index])
+        query_set = self.create_feature_set(self.queries[index])
+        name = self.task_names[index]
+        return support_set, query_set, name
+
+    def __len__(self):
+        return self.num_task
+
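A minimal end-to-end sketch of how MetaTask and Learner added above might be wired together. The backbone checkpoint, the toy DataFrame, and all hyper-parameter values are illustrative assumptions, not values taken from this commit; SLR_Classifier is the classifier defined elsewhere in ML_SLRC.py.

# Hypothetical usage sketch (checkpoint, data and hyper-parameters are placeholders).
import torch
import pandas as pd
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")   # assumed
backbone  = AutoModel.from_pretrained("bert-base-uncased")       # assumed
model = SLR_Classifier(model=backbone, bert_layers=range(4), freeze_bert=False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
learner = Learner(model=model, device=device,
                  inner_print=1, inner_batch_size=4,
                  outer_update_lr=1e-4, inner_update_lr=2e-5,
                  inner_update_step=5, inner_update_step_eval=5)

# `df` is assumed to have 'domain', 'text' and 'label' columns.
df = pd.DataFrame({"domain": ["covid"] * 8,
                   "text": ["abstract %d" % i for i in range(8)],
                   "label": ["included", "excluded"] * 4})
tasks = MetaTask(df, num_task=2, k_support=2, k_query=2,
                 tokenizer=tokenizer, training=True, max_seq_length=64)

# Each task is (support TensorDataset, query TensorDataset, task name).
batch_tasks = [tasks[i] for i in range(len(tasks))]
mean_acc = learner(batch_tasks, training=True, valid_train=True)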
+
+class treat_text:
+    def __init__(self, patterns):
+        self.patterns = patterns
+
+    def __call__(self,text):
+        text = unicodedata.normalize("NFKD",str(text))
+        text = multiple_replace(self.patterns,text.lower())
+        text = re.sub('(\(.+\))|(\[.+\])|( \d )|(<)|(>)|(- )','', text)
+        text = re.sub('( +)',' ', text)
+        text = re.sub('(, ,)|(,,)',',', text)
+        text = re.sub('(%)|(per cent)',' percent', text)
+        return text
+
 
 # Regex multiple replace function
 def multiple_replace(dict, text):
     regex = re.compile("(%s)" % "|".join(map(re.escape, dict.keys())))
 
     # Substitution
+    return regex.sub(lambda mo: dict[mo.string[mo.start():mo.end()]], text)
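A small sketch of the cleaning path defined by the treat_text class and multiple_replace above; the sample abstract is invented and the printed result is only indicative of the kind of normalization applied.

# Invented example text; output shown only to indicate the cleaning applied.
cleaner = treat_text(patterns)
raw = "BACKGROUND: Mortality fell by 12% in the treated group (n = 40)."
print(cleaner(raw))
# -> roughly: " mortality fell by 12 percent in the treated group ."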
Util_funs.py
CHANGED
@@ -1,49 +1,49 @@
 import os
-import torch
 import numpy as np
 import random
-import json, pickle
 
-
-
-
 import torch
-
-import pandas as pd
 import time
-import transformers
-from transformers import AutoTokenizer, AutoModelForSequenceClassification
 from sklearn.manifold import TSNE
-from copy import deepcopy
 import seaborn as sns
 import matplotlib.pylab as plt
-from pprint import pprint
-import shutil
-import datetime
-import re
 import json
 from pathlib import Path
-
-import
-from
-
-
-
-
-
-from transformers import BertForSequenceClassification
-from copy import deepcopy
-import gc
-from sklearn.metrics import accuracy_score
-import torch
-import numpy as np
-import torchmetrics
-from torchmetrics import functional as fn
 
 
-SEED = 2222
 
-gen_seed = torch.Generator().manual_seed(SEED)
 
 
 # Random seed function
@@ -54,7 +54,7 @@ def random_seed(value):
     np.random.seed(value)
     random.seed(value)
 
-#
 def create_batch_of_tasks(taskset, is_shuffle = True, batch_size = 4):
     idxs = list(range(0,len(taskset)))
     if is_shuffle:
@@ -63,48 +63,51 @@ def create_batch_of_tasks(taskset, is_shuffle = True, batch_size = 4):
|
|
63 |
yield [taskset[idxs[i]] for i in range(i, min(i + batch_size,len(taskset)))]
|
64 |
|
65 |
|
66 |
-
|
67 |
-
def prepare_data(data, batch_size,tokenizer,max_seq_length,
|
68 |
input = 'text', output = 'label',
|
69 |
-
train_size_per_class = 5
|
|
|
70 |
data = data.reset_index().drop("index", axis=1)
|
71 |
|
72 |
-
|
73 |
-
|
74 |
-
data_train = labaled_data.groupby('label').sample(train_size_per_class)
|
75 |
|
76 |
-
|
77 |
-
|
|
|
78 |
|
79 |
-
|
|
|
80 |
|
81 |
|
82 |
-
#
|
83 |
-
##
|
84 |
dataset_train = SLR_DataSet(
|
85 |
data = data_train.sample(frac=1),
|
86 |
input = input,
|
87 |
output = output,
|
88 |
tokenizer=tokenizer,
|
89 |
-
max_seq_length =max_seq_length
|
|
|
90 |
|
91 |
-
|
92 |
-
# Dataloaders
|
93 |
-
## Transforma em dataset
|
94 |
dataset_test = SLR_DataSet(
|
95 |
data = data_test,
|
96 |
input = input,
|
97 |
output = output,
|
98 |
tokenizer=tokenizer,
|
99 |
-
max_seq_length =max_seq_length
|
|
|
100 |
|
101 |
# Dataloaders
|
102 |
-
##
|
103 |
data_train_loader = DataLoader(dataset_train,
|
104 |
shuffle=True,
|
105 |
batch_size=batch_size['train']
|
106 |
)
|
107 |
|
|
|
108 |
if len(dataset_test) % batch_size['test'] == 1 :
|
109 |
data_test_loader = DataLoader(dataset_test,
|
110 |
batch_size=batch_size['test'],
|
@@ -117,50 +120,54 @@ def prepare_data(data, batch_size,tokenizer,max_seq_length,
|
|
117 |
return data_train_loader, data_test_loader, data_train, data_test
|
118 |
|
119 |
|
|
|
|
|
|
|
|
|
|
|
120 |
|
121 |
-
|
122 |
-
|
123 |
-
from tqdm import tqdm
|
124 |
-
|
125 |
-
def meta_train(data, model, device, Info, print_epoch =True, size_layer=0, Test_resource =None):
|
126 |
-
|
127 |
learner = Learner(model = model, device = device, **Info)
|
128 |
|
129 |
# Testing tasks
|
130 |
if isinstance(Test_resource, pd.DataFrame):
|
131 |
test = MetaTask(Test_resource, num_task = 0, k_support=10, k_query=10,
|
132 |
-
training=False, **Info)
|
133 |
|
134 |
|
135 |
torch.clear_autocast_cache()
|
136 |
gc.collect()
|
137 |
torch.cuda.empty_cache()
|
138 |
|
139 |
-
# Meta
|
140 |
for epoch in tqdm(range(Info['meta_epoch']), desc= "Meta epoch ", ncols=80):
|
141 |
-
# print("Meta Epoca:", epoch)
|
142 |
|
143 |
-
#
|
144 |
train = MetaTask(data,
|
145 |
num_task = Info['num_task_train'],
|
146 |
k_support=Info['k_qry'],
|
147 |
-
k_query=Info['k_spt'],
|
|
|
148 |
|
149 |
-
#
|
150 |
db = create_batch_of_tasks(train, is_shuffle = True, batch_size = Info["outer_batch_size"])
|
151 |
|
152 |
if print_epoch:
|
153 |
# Outer loop bach training
|
154 |
for step, task_batch in enumerate(db):
|
155 |
print("\n-----------------Training Mode","Meta_epoch:", epoch ,"-----------------\n")
|
156 |
-
|
|
|
157 |
acc = learner(task_batch, valid_train= print_epoch)
|
158 |
print('Step:', step, '\ttraining Acc:', acc)
|
|
|
159 |
if isinstance(Test_resource, pd.DataFrame):
|
160 |
-
# Validating Model
|
161 |
if ((epoch+1) % 4) + step == 0:
|
162 |
random_seed(123)
|
163 |
print("\n-----------------Testing Mode-----------------\n")
|
|
|
|
|
164 |
db_test = create_batch_of_tasks(test, is_shuffle = False, batch_size = 1)
|
165 |
acc_all_test = []
|
166 |
|
@@ -174,10 +181,10 @@ def meta_train(data, model, device, Info, print_epoch =True, size_layer=0, Test_
|
|
174 |
|
175 |
# Restarting training randomly
|
176 |
random_seed(int(time.time() % 10))
|
177 |
-
|
178 |
-
|
179 |
else:
|
180 |
for step, task_batch in enumerate(db):
|
|
|
181 |
acc = learner(task_batch, print_epoch, valid_train= print_epoch)
|
182 |
|
183 |
torch.clear_autocast_cache()
|
@@ -187,14 +194,14 @@ def meta_train(data, model, device, Info, print_epoch =True, size_layer=0, Test_
|
|
187 |
|
188 |
|
189 |
def train_loop(data_train_loader, data_test_loader, model, device, epoch = 4, lr = 1, print_info = True, name = 'name'):
|
190 |
-
#
|
191 |
model_meta = deepcopy(model)
|
192 |
optimizer = Adam(model_meta.parameters(), lr=lr)
|
193 |
|
194 |
model_meta.to(device)
|
195 |
model_meta.train()
|
196 |
|
197 |
-
#
|
198 |
for i in range(0, epoch):
|
199 |
all_loss = []
|
200 |
|
@@ -203,13 +210,13 @@ def train_loop(data_train_loader, data_test_loader, model, device, epoch = 4, lr
|
|
203 |
batch = tuple(t.to(device) for t in batch)
|
204 |
input_ids, attention_mask,q_token_type_ids, label_id = batch
|
205 |
|
206 |
-
# Feedfoward
|
207 |
loss, _, _ = model_meta(input_ids, attention_mask,q_token_type_ids, labels = label_id.squeeze())
|
208 |
|
209 |
-
#
|
210 |
loss.backward()
|
211 |
|
212 |
-
#
|
213 |
optimizer.step()
|
214 |
optimizer.zero_grad()
|
215 |
|
@@ -220,39 +227,43 @@ def train_loop(data_train_loader, data_test_loader, model, device, epoch = 4, lr
|
|
220 |
print("Loss: ", np.mean(all_loss))
|
221 |
|
222 |
|
223 |
-
#
|
224 |
model_meta.eval()
|
225 |
all_loss = []
|
226 |
-
|
227 |
features = []
|
228 |
labels = []
|
229 |
predi_logit = []
|
230 |
|
231 |
with torch.no_grad():
|
|
|
232 |
for inner_step, batch in enumerate(tqdm(data_test_loader,
|
233 |
desc="Test validation | " + name,
|
234 |
ncols=80)) :
|
235 |
batch = tuple(t.to(device) for t in batch)
|
236 |
input_ids, attention_mask,q_token_type_ids, label_id = batch
|
237 |
|
238 |
-
#
|
239 |
_, feature, prediction = model_meta(input_ids, attention_mask,q_token_type_ids, labels = label_id.squeeze())
|
240 |
|
|
|
241 |
prediction = prediction.detach().cpu().squeeze()
|
242 |
label_id = label_id.detach().cpu()
|
|
|
|
|
243 |
logit = feature[1].detach().cpu()
|
244 |
-
|
245 |
|
246 |
-
|
247 |
features.append(feature_lat.numpy())
|
248 |
-
predi_logit.append(logit.numpy())
|
249 |
|
250 |
-
#
|
251 |
-
|
|
|
252 |
del input_ids, attention_mask, label_id, batch
|
253 |
|
254 |
-
|
255 |
-
|
256 |
|
257 |
model_meta.to('cpu')
|
258 |
gc.collect()
|
@@ -260,26 +271,32 @@ def train_loop(data_train_loader, data_test_loader, model, device, epoch = 4, lr
|
|
260 |
|
261 |
del model_meta, optimizer
|
262 |
|
|
|
263 |
|
|
|
|
|
|
|
264 |
features = np.concatenate(np.array(features,dtype=object))
|
265 |
-
labels = np.concatenate(np.array(labels,dtype=object))
|
266 |
-
logits = np.concatenate(np.array(predi_logit,dtype=object))
|
267 |
-
|
268 |
features = torch.tensor(features.astype(np.float32)).detach().clone()
|
|
|
|
|
269 |
labels = torch.tensor(labels.astype(int)).detach().clone()
|
|
|
|
|
270 |
logits = torch.tensor(logits.astype(np.float32)).detach().clone()
|
271 |
|
272 |
-
#
|
273 |
X_embedded = TSNE(n_components=2, learning_rate='auto',
|
274 |
init='random').fit_transform(features.detach().clone())
|
275 |
|
276 |
return logits.detach().clone(), X_embedded, labels.detach().clone(), features.detach().clone()
|
277 |
-
|
278 |
-
|
279 |
def wss_calc(logit, labels, trsh = 0.5):
|
280 |
|
281 |
-
#
|
282 |
predict_trash = torch.sigmoid(logit).squeeze() >= trsh
|
|
|
|
|
283 |
CM = confusion_matrix(labels, predict_trash.to(int) )
|
284 |
tn, fp, fne, tp = CM.ravel()
|
285 |
|
@@ -287,36 +304,22 @@ def wss_calc(logit, labels, trsh = 0.5):
|
|
287 |
N = (tn + fp)
|
288 |
recall = tp/(tp+fne)
|
289 |
|
290 |
-
#
|
291 |
-
|
292 |
|
293 |
-
#
|
294 |
-
|
295 |
|
296 |
return {
|
297 |
-
"wss": round(
|
298 |
-
"awss": round(
|
299 |
"R": round(recall,4),
|
300 |
"CM": CM
|
301 |
}
|
302 |
|
303 |
|
304 |
-
|
305 |
-
|
306 |
-
from sklearn.metrics import confusion_matrix
|
307 |
-
from torchmetrics import functional as fn
|
308 |
-
import matplotlib.pyplot as plt
|
309 |
-
from sklearn.metrics import roc_curve, auc
|
310 |
-
from sklearn.metrics import roc_auc_score
|
311 |
-
import ipywidgets as widgets
|
312 |
-
from IPython.display import HTML, display, clear_output
|
313 |
-
import matplotlib.pyplot as plt
|
314 |
-
import seaborn as sns
|
315 |
-
import warnings
|
316 |
-
|
317 |
-
warnings.simplefilter(action='ignore', category=FutureWarning)
|
318 |
-
|
319 |
-
def plot(logits, X_embedded, labels, tresh, show = True,
|
320 |
namefig = "plot", make_plot = True, print_stats = True, save = True):
|
321 |
col = pd.MultiIndex.from_tuples([
|
322 |
("Predict", "0"),
|
@@ -329,30 +332,27 @@ def plot(logits, X_embedded, labels, tresh, show = True,
|
|
329 |
|
330 |
predict = torch.sigmoid(logits).detach().clone()
|
331 |
|
332 |
-
|
333 |
-
|
334 |
fpr, tpr, thresholds = roc_curve(labels, predict.squeeze())
|
335 |
|
336 |
-
#
|
337 |
-
|
338 |
-
|
339 |
idx_wss95 = sum(tpr < 0.95)
|
|
|
340 |
thresholds95 = thresholds[idx_wss95]
|
341 |
|
|
|
342 |
wss95_info = wss_calc(logits,labels, thresholds95 )
|
343 |
acc_wss95 = fn.accuracy(predict, labels, threshold=thresholds95)
|
344 |
f1_wss95 = fn.f1_score(predict, labels, threshold=thresholds95)
|
345 |
|
346 |
|
347 |
-
#
|
348 |
-
|
349 |
-
|
350 |
-
|
351 |
-
|
352 |
-
wss_info = wss_calc(logits,labels, tresh )
|
353 |
-
# Accuraci
|
354 |
-
acc_wssR = fn.accuracy(predict, labels, threshold=tresh)
|
355 |
-
f1_wssR = fn.f1_score(predict, labels, threshold=tresh)
|
356 |
|
357 |
|
358 |
metrics= {
|
@@ -370,12 +370,11 @@ def plot(logits, X_embedded, labels, tresh, show = True,
|
|
370 |
# f1
|
371 |
"f1@95": f1_wss95.item(),
|
372 |
"f1@R": f1_wssR.item(),
|
373 |
-
#
|
374 |
-
"
|
375 |
}
|
376 |
|
377 |
-
#
|
378 |
-
|
379 |
if print_stats:
|
380 |
wss95= f"WSS@95:{wss95_info['wss']}, R: {wss95_info['R']}"
|
381 |
wss95_adj= f"ASSWSS@95:{wss95_info['awss']}"
|
@@ -383,14 +382,14 @@ def plot(logits, X_embedded, labels, tresh, show = True,
|
|
383 |
print(wss95_adj)
|
384 |
print('Acc.:', round(acc_wss95.item(), 4))
|
385 |
print('F1-score:', round(f1_wss95.item(), 4))
|
386 |
-
print(f"
|
387 |
cm = pd.DataFrame(wss95_info['CM'],
|
388 |
index=index,
|
389 |
columns=col)
|
390 |
|
391 |
print("\nConfusion matrix:")
|
392 |
print(cm)
|
393 |
-
print("\n---Metrics with threshold:",
|
394 |
wss= f"WSS@R:{wss_info['wss']}, R: {wss_info['R']}"
|
395 |
print(wss)
|
396 |
wss_adj= f"AWSS@R:{wss_info['awss']}"
|
@@ -405,51 +404,53 @@ def plot(logits, X_embedded, labels, tresh, show = True,
|
|
405 |
print(cm)
|
406 |
|
407 |
|
408 |
-
#
|
409 |
|
410 |
if make_plot:
|
411 |
|
412 |
fig, axes = plt.subplots(1, 4, figsize=(25,10))
|
413 |
alpha = torch.squeeze(predict).numpy()
|
414 |
|
415 |
-
#
|
416 |
-
|
417 |
p1 = sns.scatterplot(x=X_embedded[:, 0],
|
418 |
y=X_embedded[:, 1],
|
419 |
hue=labels,
|
420 |
-
alpha=alpha, ax = axes[0]).set_title('Predictions-TSNE')
|
421 |
|
|
|
|
|
422 |
t_wss = predict >= thresholds95
|
423 |
t_wss = t_wss.squeeze().numpy()
|
424 |
-
|
425 |
p2 = sns.scatterplot(x=X_embedded[t_wss, 0],
|
426 |
y=X_embedded[t_wss, 1],
|
427 |
hue=labels[t_wss],
|
428 |
-
alpha=alpha[t_wss], ax = axes[1]).set_title('WSS@95')
|
429 |
|
430 |
-
|
|
|
431 |
t = t.squeeze().numpy()
|
432 |
-
|
433 |
p3 = sns.scatterplot(x=X_embedded[t, 0],
|
434 |
y=X_embedded[t, 1],
|
435 |
hue=labels[t],
|
436 |
-
alpha=alpha[t], ax = axes[2]).set_title(f'Predictions-
|
437 |
-
|
438 |
|
|
|
439 |
roc_auc = auc(fpr, tpr)
|
440 |
lw = 2
|
441 |
-
|
442 |
axes[3].plot(
|
443 |
fpr,
|
444 |
tpr,
|
445 |
color="darkorange",
|
446 |
lw=lw,
|
447 |
label="ROC curve (area = %0.2f)" % roc_auc)
|
448 |
-
|
449 |
axes[3].plot([0, 1], [0, 1], color="navy", lw=lw, linestyle="--")
|
450 |
axes[3].axhline(y=0.95, color='r', linestyle='-')
|
451 |
-
axes[3].set(xlabel="False Positive Rate", ylabel="True Positive Rate"
|
452 |
axes[3].legend(loc="lower right")
|
|
|
|
|
|
|
|
|
453 |
|
454 |
if show:
|
455 |
plt.show()
|
@@ -459,6 +460,7 @@ def plot(logits, X_embedded, labels, tresh, show = True,
|
|
459 |
|
460 |
return metrics
|
461 |
|
|
|
462 |
def auc_plot(logits,labels, color = "darkorange", label = "test"):
|
463 |
predict = torch.sigmoid(logits).detach().clone()
|
464 |
fpr, tpr, thresholds = roc_curve(labels, predict.squeeze())
|
@@ -478,45 +480,40 @@ def auc_plot(logits,labels, color = "darkorange", label = "test"):
|
|
478 |
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
|
479 |
plt.axhline(y=0.95, color='r', linestyle='-')
|
480 |
|
481 |
-
|
482 |
-
from sklearn.metrics import confusion_matrix
|
483 |
-
from torchmetrics import functional as fn
|
484 |
-
import matplotlib.pyplot as plt
|
485 |
-
from sklearn.metrics import roc_curve, auc
|
486 |
-
from sklearn.metrics import roc_auc_score
|
487 |
-
import ipywidgets as widgets
|
488 |
-
from IPython.display import HTML, display, clear_output
|
489 |
-
import matplotlib.pyplot as plt
|
490 |
-
import seaborn as sns
|
491 |
-
import warnings
|
492 |
-
|
493 |
-
|
494 |
class diagnosis():
|
495 |
-
def __init__(self, names, Valid_resource, batch_size_test,
|
|
|
496 |
self.names=names
|
497 |
self.Valid_resource=Valid_resource
|
498 |
self.batch_size_test=batch_size_test
|
499 |
self.model=model
|
500 |
-
self.start=start
|
|
|
|
|
|
|
|
|
501 |
|
|
|
502 |
self.value_trash = widgets.FloatText(
|
503 |
value=0.95,
|
504 |
-
description='
|
505 |
disabled=False
|
506 |
)
|
507 |
-
|
508 |
self.valueb = widgets.IntText(
|
509 |
value=10,
|
510 |
description='size',
|
511 |
disabled=False
|
512 |
)
|
513 |
|
|
|
514 |
self.train_b = widgets.Button(description="Train")
|
515 |
self.next_b = widgets.Button(description="Next")
|
516 |
self.eval_b = widgets.Button(description="Evaluation")
|
517 |
|
518 |
self.hbox = widgets.HBox([self.train_b, self.valueb])
|
519 |
|
|
|
520 |
self.next_b.on_click(self.Next_button)
|
521 |
self.train_b.on_click(self.Train_button)
|
522 |
self.eval_b.on_click(self.Evaluation_button)
|
@@ -527,36 +524,37 @@ class diagnosis():
|
|
527 |
clear_output()
|
528 |
self.i=self.i+1
|
529 |
|
530 |
-
#
|
531 |
-
self.domain = names[self.i]
|
532 |
-
print("Name:", self.domain)
|
533 |
-
|
534 |
-
# global data
|
535 |
self.data = self.Valid_resource[self.Valid_resource['domain'] == self.domain]
|
|
|
|
|
536 |
print(self.data['label'].value_counts())
|
537 |
-
|
538 |
display(self.hbox)
|
539 |
display(self.next_b)
|
540 |
|
|
|
541 |
# Train button
|
542 |
def Train_button(self, y):
|
543 |
clear_output()
|
544 |
print(self.domain)
|
545 |
|
546 |
-
#
|
547 |
self.data_train_loader, self.data_test_loader, self.data_train, self.data_test = prepare_data(self.data,
|
548 |
train_size_per_class = self.valueb.value,
|
549 |
-
batch_size = {'train': Info['inner_batch_size'],
|
550 |
-
'test': batch_size_test},
|
551 |
-
max_seq_length = Info['max_seq_length'],
|
552 |
-
tokenizer = Info['tokenizer'],
|
553 |
input = "text",
|
554 |
-
output = "label"
|
|
|
555 |
|
|
|
556 |
self.logits, self.X_embedded, self.labels, self.features = train_loop(self.data_train_loader, self.data_test_loader,
|
557 |
-
model, device,
|
558 |
-
epoch = Info['inner_update_step'],
|
559 |
-
lr=Info['inner_update_lr'],
|
560 |
print_info=True,
|
561 |
name = self.domain)
|
562 |
|
@@ -565,6 +563,7 @@ class diagnosis():
|
|
565 |
display(tresh_box)
|
566 |
display(self.next_b)
|
567 |
|
|
|
568 |
# Evaluation button
|
569 |
def Evaluation_button(self, te):
|
570 |
clear_output()
|
@@ -573,19 +572,18 @@ class diagnosis():
|
|
573 |
print(self.domain)
|
574 |
# print("\n")
|
575 |
print("-------Train data-------")
|
576 |
-
print(
|
577 |
print("-------Test data-------")
|
578 |
-
print(
|
579 |
# print("\n")
|
580 |
|
581 |
display(self.next_b)
|
582 |
display(tresh_box)
|
583 |
display(self.hbox)
|
584 |
|
585 |
-
|
586 |
metrics = plot(self.logits, self.X_embedded, self.labels,
|
587 |
-
|
588 |
-
# namefig= "./"+base_path +"/"+"Results/size_layer/"+ name_domain+'/' +str(n_layers) + '/img/' + str(attempt) + 'plots',
|
589 |
namefig= 'test',
|
590 |
make_plot = True,
|
591 |
print_stats = True,
|
@@ -593,261 +591,150 @@ class diagnosis():
|
|
593 |
|
594 |
def __call__(self):
|
595 |
self.i= self.start-1
|
596 |
-
|
597 |
clear_output()
|
598 |
display(self.next_b)
|
599 |
|
600 |
|
601 |
|
602 |
|
|
|
|
|
|
|
|
|
|
|
|
|
603 |
|
|
|
|
|
|
|
|
|
604 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
605 |
|
606 |
|
607 |
-
|
608 |
-
|
609 |
-
import torch.nn.functional as F
|
610 |
-
import torch.nn as nn
|
611 |
-
import math
|
612 |
-
import torch
|
613 |
-
import numpy as np
|
614 |
-
import pandas as pd
|
615 |
-
import time
|
616 |
-
import transformers
|
617 |
-
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
618 |
-
from sklearn.manifold import TSNE
|
619 |
-
from copy import deepcopy, copy
|
620 |
-
import seaborn as sns
|
621 |
-
import matplotlib.pylab as plt
|
622 |
-
from pprint import pprint
|
623 |
-
import shutil
|
624 |
-
import datetime
|
625 |
-
import re
|
626 |
-
import json
|
627 |
-
from pathlib import Path
|
628 |
-
|
629 |
-
import torch
|
630 |
-
import torch.nn as nn
|
631 |
-
from torch.utils.data import Dataset, DataLoader
|
632 |
-
import unicodedata
|
633 |
-
import re
|
634 |
-
|
635 |
-
import torch
|
636 |
-
import torch.nn as nn
|
637 |
-
from torch.utils.data import Dataset, DataLoader
|
638 |
-
|
639 |
-
|
640 |
-
|
641 |
-
# Pre-trained model
|
642 |
-
class Encoder(nn.Module):
|
643 |
-
def __init__(self, layers, freeze_bert, model):
|
644 |
-
super(Encoder, self).__init__()
|
645 |
-
|
646 |
-
# Dummy Parameter
|
647 |
-
self.dummy_param = nn.Parameter(torch.empty(0))
|
648 |
-
|
649 |
-
# Pre-trained model
|
650 |
-
self.model = deepcopy(model)
|
651 |
-
|
652 |
-
# Freezing bert parameters
|
653 |
-
if freeze_bert:
|
654 |
-
for param in self.model.parameters():
|
655 |
-
param.requires_grad = freeze_bert
|
656 |
-
|
657 |
-
# Selecting hidden layers of the pre-trained model
|
658 |
-
old_model_encoder = self.model.encoder.layer
|
659 |
-
new_model_encoder = nn.ModuleList()
|
660 |
-
|
661 |
-
for i in layers:
|
662 |
-
new_model_encoder.append(old_model_encoder[i])
|
663 |
-
|
664 |
-
self.model.encoder.layer = new_model_encoder
|
665 |
|
666 |
-
# Feed forward
|
667 |
-
def forward(self, **x):
|
668 |
-
return self.model(**x)['pooler_output']
|
669 |
-
|
670 |
-
# Complete model
|
671 |
-
class SLR_Classifier(nn.Module):
|
672 |
-
def __init__(self, **data):
|
673 |
-
super(SLR_Classifier, self).__init__()
|
674 |
-
|
675 |
-
# Dummy Parameter
|
676 |
-
self.dummy_param = nn.Parameter(torch.empty(0))
|
677 |
-
|
678 |
-
# Loss function
|
679 |
-
# Binary Cross Entropy with logits reduced to mean
|
680 |
-
self.loss_fn = nn.BCEWithLogitsLoss(reduction = 'mean',
|
681 |
-
pos_weight=torch.FloatTensor([data.get("pos_weight", 2.5)]))
|
682 |
-
|
683 |
-
# Pre-trained model
|
684 |
-
self.Encoder = Encoder(layers = data.get("bert_layers", range(12)),
|
685 |
-
freeze_bert = data.get("freeze_bert", False),
|
686 |
-
model = data.get("model"),
|
687 |
-
)
|
688 |
-
|
689 |
-
# Feature Map Layer
|
690 |
-
self.feature_map = nn.Sequential(
|
691 |
-
# nn.LayerNorm(self.Encoder.model.config.hidden_size),
|
692 |
-
nn.BatchNorm1d(self.Encoder.model.config.hidden_size),
|
693 |
-
# nn.Dropout(data.get("drop", 0.5)),
|
694 |
-
nn.Linear(self.Encoder.model.config.hidden_size, 200),
|
695 |
-
nn.Dropout(data.get("drop", 0.5)),
|
696 |
-
)
|
697 |
-
|
698 |
-
# Classifier Layer
|
699 |
-
self.classifier = nn.Sequential(
|
700 |
-
# nn.LayerNorm(self.Encoder.model.config.hidden_size),
|
701 |
-
# nn.Dropout(data.get("drop", 0.5)),
|
702 |
-
# nn.BatchNorm1d(self.Encoder.model.config.hidden_size),
|
703 |
-
# nn.Dropout(data.get("drop", 0.5)),
|
704 |
-
nn.Tanh(),
|
705 |
-
nn.Linear(200, 1)
|
706 |
-
)
|
707 |
|
708 |
-
|
709 |
-
|
710 |
-
|
711 |
-
|
712 |
-
# Feed forward
|
713 |
-
def forward(self, input_ids, attention_mask, token_type_ids, labels):
|
714 |
-
|
715 |
-
predict = self.Encoder(**{"input_ids":input_ids,
|
716 |
-
"attention_mask":attention_mask,
|
717 |
-
"token_type_ids":token_type_ids})
|
718 |
-
feature = self.feature_map(predict)
|
719 |
-
logit = self.classifier(feature)
|
720 |
-
|
721 |
-
predict = torch.sigmoid(logit)
|
722 |
|
723 |
-
#
|
724 |
-
|
725 |
-
|
726 |
-
return [loss, [feature, logit], predict]
|
727 |
-
|
728 |
-
|
729 |
-
# Undesirable patterns within texts
|
730 |
-
patterns = {
|
731 |
-
'CONCLUSIONS AND IMPLICATIONS':'',
|
732 |
-
'BACKGROUND AND PURPOSE':'',
|
733 |
-
'EXPERIMENTAL APPROACH':'',
|
734 |
-
'KEY RESULTS AEA':'',
|
735 |
-
'©':'',
|
736 |
-
'®':'',
|
737 |
-
'μ':'',
|
738 |
-
'(C)':'',
|
739 |
-
'OBJECTIVE:':'',
|
740 |
-
'MATERIALS AND METHODS:':'',
|
741 |
-
'SIGNIFICANCE:':'',
|
742 |
-
'BACKGROUND:':'',
|
743 |
-
'RESULTS:':'',
|
744 |
-
'METHODS:':'',
|
745 |
-
'CONCLUSIONS:':'',
|
746 |
-
'AIM:':'',
|
747 |
-
'STUDY DESIGN:':'',
|
748 |
-
'CLINICAL RELEVANCE:':'',
|
749 |
-
'CONCLUSION:':'',
|
750 |
-
'HYPOTHESIS:':'',
|
751 |
-
'CLINICAL RELEVANCE:':'',
|
752 |
-
'Questions/Purposes:':'',
|
753 |
-
'Introduction:':'',
|
754 |
-
'PURPOSE:':'',
|
755 |
-
'PATIENTS AND METHODS:':'',
|
756 |
-
'FINDINGS:':'',
|
757 |
-
'INTERPRETATIONS:':'',
|
758 |
-
'FUNDING:':'',
|
759 |
-
'PROGRESS:':'',
|
760 |
-
'CONTEXT:':'',
|
761 |
-
'MEASURES:':'',
|
762 |
-
'DESIGN:':'',
|
763 |
-
'BACKGROUND AND OBJECTIVES:':'',
|
764 |
-
'<p>':'',
|
765 |
-
'</p>':'',
|
766 |
-
'<<ETX>>':'',
|
767 |
-
'+/-':'',
|
768 |
-
}
|
769 |
-
|
770 |
-
patterns = {x.lower():y for x,y in patterns.items()}
|
771 |
-
|
772 |
-
LABEL_MAP = {'negative': 0,
|
773 |
-
'not included':0,
|
774 |
-
'0':0,
|
775 |
-
0:0,
|
776 |
-
'excluded':0,
|
777 |
-
'positive': 1,
|
778 |
-
'included':1,
|
779 |
-
'1':1,
|
780 |
-
1:1,
|
781 |
-
}
|
782 |
-
|
783 |
-
class SLR_DataSet(Dataset):
|
784 |
-
def __init__(self, **args):
|
785 |
-
self.tokenizer = args.get('tokenizer')
|
786 |
-
self.data = args.get('data')
|
787 |
-
self.max_seq_length = args.get("max_seq_length", 512)
|
788 |
-
self.INPUT_NAME = args.get("input", 'x')
|
789 |
-
self.LABEL_NAME = args.get("output", 'y')
|
790 |
-
|
791 |
-
# Tokenizing and processing text
|
792 |
-
def encode_text(self, example):
|
793 |
-
comment_text = example[self.INPUT_NAME]
|
794 |
-
comment_text = self.treat_text(comment_text)
|
795 |
-
|
796 |
-
try:
|
797 |
-
labels = LABEL_MAP[example[self.LABEL_NAME].lower()]
|
798 |
-
except:
|
799 |
-
labels = -1
|
800 |
-
|
801 |
-
encoding = self.tokenizer.encode_plus(
|
802 |
-
(comment_text, "It is great text"),
|
803 |
-
add_special_tokens=True,
|
804 |
-
max_length=self.max_seq_length,
|
805 |
-
return_token_type_ids=True,
|
806 |
-
padding="max_length",
|
807 |
-
truncation=True,
|
808 |
-
return_attention_mask=True,
|
809 |
-
return_tensors='pt',
|
810 |
-
)
|
811 |
-
|
812 |
-
|
813 |
-
return tuple((
|
814 |
-
encoding["input_ids"].flatten(),
|
815 |
-
encoding["attention_mask"].flatten(),
|
816 |
-
encoding["token_type_ids"].flatten(),
|
817 |
-
torch.tensor([torch.tensor(labels).to(int)])
|
818 |
-
))
|
819 |
-
|
820 |
-
# Text processing function
|
821 |
-
def treat_text(self, text):
|
822 |
-
text = unicodedata.normalize("NFKD",str(text))
|
823 |
-
text = multiple_replace(patterns,text.lower())
|
824 |
-
text = re.sub('(\(.+\))|(\[.+\])|( \d )|(<)|(>)|(- )','', text)
|
825 |
-
text = re.sub('( +)',' ', text)
|
826 |
-
text = re.sub('(, ,)|(,,)',',', text)
|
827 |
-
text = re.sub('(%)|(per cent)',' percent', text)
|
828 |
-
return text
|
829 |
-
|
830 |
-
def __len__(self):
|
831 |
-
return len(self.data)
|
832 |
-
|
833 |
-
# Returning data
|
834 |
-
def __getitem__(self, index: int):
|
835 |
-
# print(index)
|
836 |
-
data_row = self.data.reset_index().iloc[index]
|
837 |
-
temp_data = self.encode_text(data_row)
|
838 |
-
return temp_data
|
839 |
-
|
840 |
|
|
|
841 |
|
842 |
-
#
|
843 |
-
|
|
|
|
|
|
|
844 |
|
845 |
-
|
846 |
-
|
|
|
|
|
|
|
|
|
|
|
847 |
|
848 |
-
|
849 |
-
|
|
|
|
|
|
|
850 |
|
851 |
-
# Undesirable patterns within texts
|
852 |
|
853 |
|
|
|
1 |
+
from ML_SLRC import *
|
2 |
+
|
3 |
import os
|
|
|
4 |
import numpy as np
|
5 |
+
import pandas as pd
|
6 |
+
|
7 |
+
|
8 |
+
from torch.utils.data import DataLoader
|
9 |
+
from torch.optim import Adam
|
10 |
+
|
11 |
+
import gc
|
12 |
+
from torchmetrics import functional as fn
|
13 |
+
|
14 |
import random
|
|
|
15 |
|
16 |
+
|
17 |
+
warnings.simplefilter(action='ignore', category=FutureWarning)
|
18 |
+
|
19 |
+
from tqdm import tqdm
|
20 |
+
|
21 |
+
from sklearn.metrics import confusion_matrix
|
22 |
+
from sklearn.metrics import roc_curve, auc
|
23 |
+
import ipywidgets as widgets
|
24 |
+
from IPython.display import display, clear_output
|
25 |
+
import matplotlib.pyplot as plt
|
26 |
+
import warnings
|
27 |
import torch
|
28 |
+
|
|
|
29 |
import time
|
|
|
|
|
30 |
from sklearn.manifold import TSNE
|
31 |
+
from copy import deepcopy
|
32 |
import seaborn as sns
|
33 |
import matplotlib.pylab as plt
|
|
|
|
|
|
|
|
|
34 |
import json
|
35 |
from pathlib import Path
|
36 |
+
|
37 |
+
import re
|
38 |
+
from collections import defaultdict
|
39 |
+
|
40 |
+
# SEED = 2222
|
41 |
+
|
42 |
+
# gen_seed = torch.Generator().manual_seed(SEED)
|
43 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
|
45 |
|
|
|
46 |
|
|
|
47 |
|
48 |
|
49 |
# Random seed function
|
|
|
54 |
np.random.seed(value)
|
55 |
random.seed(value)
|
56 |
|
57 |
+
# Tasks for meta-learner
|
58 |
def create_batch_of_tasks(taskset, is_shuffle = True, batch_size = 4):
|
59 |
idxs = list(range(0,len(taskset)))
|
60 |
if is_shuffle:
|
|
|
63 |
yield [taskset[idxs[i]] for i in range(i, min(i + batch_size,len(taskset)))]
|
64 |
|
65 |
|
66 |
+
# Prepare data to process by Domain-learner
|
67 |
+
def prepare_data(data, batch_size, tokenizer,max_seq_length,
|
68 |
input = 'text', output = 'label',
|
69 |
+
train_size_per_class = 5, global_datasets = False,
|
70 |
+
treat_text_fun =None):
|
71 |
data = data.reset_index().drop("index", axis=1)
|
72 |
|
73 |
+
if global_datasets:
|
74 |
+
global data_train, data_test
|
|
|
75 |
|
76 |
+
# Sample task for training
|
77 |
+
data_train = data.groupby('label').sample(train_size_per_class, replace=False)
|
78 |
+
idex = data.index.isin(data_train.index)
|
79 |
|
80 |
+
# The Test set to label by the model
|
81 |
+
data_test = data[~idex].reset_index()
|
82 |
|
83 |
|
84 |
+
# Transform in dataset to model
|
85 |
+
## Train
|
86 |
dataset_train = SLR_DataSet(
|
87 |
data = data_train.sample(frac=1),
|
88 |
input = input,
|
89 |
output = output,
|
90 |
tokenizer=tokenizer,
|
91 |
+
max_seq_length =max_seq_length,
|
92 |
+
treat_text =treat_text_fun)
|
93 |
|
94 |
+
## Test
|
|
|
|
|
95 |
dataset_test = SLR_DataSet(
|
96 |
data = data_test,
|
97 |
input = input,
|
98 |
output = output,
|
99 |
tokenizer=tokenizer,
|
100 |
+
max_seq_length =max_seq_length,
|
101 |
+
treat_text =treat_text_fun)
|
102 |
|
103 |
# Dataloaders
|
104 |
+
## Train
|
105 |
data_train_loader = DataLoader(dataset_train,
|
106 |
shuffle=True,
|
107 |
batch_size=batch_size['train']
|
108 |
)
|
109 |
|
110 |
+
## Test
|
111 |
if len(dataset_test) % batch_size['test'] == 1 :
|
112 |
data_test_loader = DataLoader(dataset_test,
|
113 |
batch_size=batch_size['test'],
|
|
|
120 |
return data_train_loader, data_test_loader, data_train, data_test
|
121 |
|
122 |
|
123 |
+
# Meta trainer
|
124 |
+
def meta_train(data, model, device, Info,
|
125 |
+
print_epoch =True,
|
126 |
+
Test_resource =None,
|
127 |
+
treat_text_fun =None):
|
128 |
|
129 |
+
# Meta-learner model
|
|
|
|
|
|
|
|
|
|
|
130 |
learner = Learner(model = model, device = device, **Info)
|
131 |
|
132 |
# Testing tasks
|
133 |
if isinstance(Test_resource, pd.DataFrame):
|
134 |
test = MetaTask(Test_resource, num_task = 0, k_support=10, k_query=10,
|
135 |
+
training=False,treat_text =treat_text_fun, **Info)
|
136 |
|
137 |
|
138 |
torch.clear_autocast_cache()
|
139 |
gc.collect()
|
140 |
torch.cuda.empty_cache()
|
141 |
|
142 |
+
# Meta epoch (Outer epoch)
|
143 |
for epoch in tqdm(range(Info['meta_epoch']), desc= "Meta epoch ", ncols=80):
|
|
|
144 |
|
145 |
+
# Train tasks
|
146 |
train = MetaTask(data,
|
147 |
num_task = Info['num_task_train'],
|
148 |
k_support=Info['k_qry'],
|
149 |
+
k_query=Info['k_spt'],
|
150 |
+
treat_text =treat_text_fun, **Info)
|
151 |
|
152 |
+
# Batch of train tasks
|
153 |
db = create_batch_of_tasks(train, is_shuffle = True, batch_size = Info["outer_batch_size"])
|
154 |
|
155 |
if print_epoch:
|
156 |
# Outer loop bach training
|
157 |
for step, task_batch in enumerate(db):
|
158 |
print("\n-----------------Training Mode","Meta_epoch:", epoch ,"-----------------\n")
|
159 |
+
|
160 |
+
# meta-feedfoward (outer-feedfoward)
|
161 |
acc = learner(task_batch, valid_train= print_epoch)
|
162 |
print('Step:', step, '\ttraining Acc:', acc)
|
163 |
+
|
164 |
if isinstance(Test_resource, pd.DataFrame):
|
165 |
+
# Validating Model
|
166 |
if ((epoch+1) % 4) + step == 0:
|
167 |
random_seed(123)
|
168 |
print("\n-----------------Testing Mode-----------------\n")
|
169 |
+
|
170 |
+
# Batch of test tasks
|
171 |
db_test = create_batch_of_tasks(test, is_shuffle = False, batch_size = 1)
|
172 |
acc_all_test = []
|
173 |
|
|
|
181 |
|
182 |
# Restarting training randomly
|
183 |
random_seed(int(time.time() % 10))
|
184 |
+
|
|
|
185 |
else:
|
186 |
for step, task_batch in enumerate(db):
|
187 |
+
# meta-feedfoward (outer-feedfoward)
|
188 |
acc = learner(task_batch, print_epoch, valid_train= print_epoch)
|
189 |
|
190 |
torch.clear_autocast_cache()
|
|
|
194 |
|
195 |
|
196 |
def train_loop(data_train_loader, data_test_loader, model, device, epoch = 4, lr = 1, print_info = True, name = 'name'):
|
197 |
+
# Start the model's parameters
|
198 |
model_meta = deepcopy(model)
|
199 |
optimizer = Adam(model_meta.parameters(), lr=lr)
|
200 |
|
201 |
model_meta.to(device)
|
202 |
model_meta.train()
|
203 |
|
204 |
+
# Task epoch (Inner epoch)
|
205 |
for i in range(0, epoch):
|
206 |
all_loss = []
|
207 |
|
|
|
210 |
batch = tuple(t.to(device) for t in batch)
|
211 |
input_ids, attention_mask,q_token_type_ids, label_id = batch
|
212 |
|
213 |
+
# Inner Feedfoward
|
214 |
loss, _, _ = model_meta(input_ids, attention_mask,q_token_type_ids, labels = label_id.squeeze())
|
215 |
|
216 |
+
# compute grads
|
217 |
loss.backward()
|
218 |
|
219 |
+
# update parameters
|
220 |
optimizer.step()
|
221 |
optimizer.zero_grad()
|
222 |
|
|
|
227 |
print("Loss: ", np.mean(all_loss))
|
228 |
|
229 |
|
230 |
+
# Test evaluation
|
231 |
model_meta.eval()
|
232 |
all_loss = []
|
233 |
+
all_acc = []
|
234 |
features = []
|
235 |
labels = []
|
236 |
predi_logit = []
|
237 |
|
238 |
with torch.no_grad():
|
239 |
+
# Test's Batch loop
|
240 |
for inner_step, batch in enumerate(tqdm(data_test_loader,
|
241 |
desc="Test validation | " + name,
|
242 |
ncols=80)) :
|
243 |
batch = tuple(t.to(device) for t in batch)
|
244 |
input_ids, attention_mask,q_token_type_ids, label_id = batch
|
245 |
|
246 |
+
# Predictions
|
247 |
_, feature, prediction = model_meta(input_ids, attention_mask,q_token_type_ids, labels = label_id.squeeze())
|
248 |
|
249 |
+
# Save batch's predictions
|
250 |
prediction = prediction.detach().cpu().squeeze()
|
251 |
label_id = label_id.detach().cpu()
|
252 |
+
labels.append(label_id.numpy().squeeze())
|
253 |
+
|
254 |
logit = feature[1].detach().cpu()
|
255 |
+
predi_logit.append(logit.numpy())
|
256 |
|
257 |
+
feature_lat = feature[0].detach().cpu()
|
258 |
features.append(feature_lat.numpy())
|
|
|
259 |
|
260 |
+
# Accuracy over the test's bach
|
261 |
+
acc = fn.accuracy(prediction, label_id).item()
|
262 |
+
all_acc.append(acc)
|
263 |
del input_ids, attention_mask, label_id, batch
|
264 |
|
265 |
+
if print_info:
|
266 |
+
print("acc:", np.mean(all_acc))
|
267 |
|
268 |
model_meta.to('cpu')
|
269 |
gc.collect()
|
|
|
271 |
|
272 |
del model_meta, optimizer
|
273 |
|
274 |
+
return map_feature_tsne(features, labels, predi_logit)
|
275 |
|
276 |
+
# Process predictions and map the feature_map in tsne
|
277 |
+
def map_feature_tsne(features, labels, predi_logit):
|
278 |
+
|
279 |
features = np.concatenate(np.array(features,dtype=object))
|
|
|
|
|
|
|
280 |
features = torch.tensor(features.astype(np.float32)).detach().clone()
|
281 |
+
|
282 |
+
labels = np.concatenate(np.array(labels,dtype=object))
|
283 |
labels = torch.tensor(labels.astype(int)).detach().clone()
|
284 |
+
|
285 |
+
logits = np.concatenate(np.array(predi_logit,dtype=object))
|
286 |
logits = torch.tensor(logits.astype(np.float32)).detach().clone()
|
287 |
|
288 |
+
# Dimention reduction
|
289 |
X_embedded = TSNE(n_components=2, learning_rate='auto',
|
290 |
init='random').fit_transform(features.detach().clone())
|
291 |
|
292 |
return logits.detach().clone(), X_embedded, labels.detach().clone(), features.detach().clone()
|
293 |
+
|
|
|
294 |
def wss_calc(logit, labels, trsh = 0.5):
|
295 |
|
296 |
+
# Prediction label given the threshold
|
297 |
predict_trash = torch.sigmoid(logit).squeeze() >= trsh
|
298 |
+
|
299 |
+
# Compute confusion matrix values
|
300 |
CM = confusion_matrix(labels, predict_trash.to(int) )
|
301 |
tn, fp, fne, tp = CM.ravel()
|
302 |
|
|
|
304 |
N = (tn + fp)
|
305 |
recall = tp/(tp+fne)
|
306 |
|
307 |
+
# WSS
|
308 |
+
wss = (tn + fne)/len(labels) -(1- recall)
|
309 |
|
310 |
+
# AWSS
|
311 |
+
awss = (tn/N - fne/P)
|
312 |
|
313 |
return {
|
314 |
+
"wss": round(wss,4),
|
315 |
+
"awss": round(awss,4),
|
316 |
"R": round(recall,4),
|
317 |
"CM": CM
|
318 |
}
|
319 |
|
320 |
|
321 |
+
# Compute the metrics
|
322 |
+
def plot(logits, X_embedded, labels, threshold, show = True,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
323 |
namefig = "plot", make_plot = True, print_stats = True, save = True):
|
324 |
col = pd.MultiIndex.from_tuples([
|
325 |
("Predict", "0"),
|
|
|
332 |
|
333 |
predict = torch.sigmoid(logits).detach().clone()
|
334 |
|
335 |
+
# Roc curve
|
|
|
336 |
fpr, tpr, thresholds = roc_curve(labels, predict.squeeze())
|
337 |
|
338 |
+
# Given by a Recall of 95% (threshold avaliation)
|
339 |
+
## WSS
|
340 |
+
### Index to recall
|
341 |
idx_wss95 = sum(tpr < 0.95)
|
342 |
+
### threshold
|
343 |
thresholds95 = thresholds[idx_wss95]
|
344 |
|
345 |
+
### Compute the metrics
|
346 |
wss95_info = wss_calc(logits,labels, thresholds95 )
|
347 |
acc_wss95 = fn.accuracy(predict, labels, threshold=thresholds95)
|
348 |
f1_wss95 = fn.f1_score(predict, labels, threshold=thresholds95)
|
349 |
|
350 |
|
351 |
+
# Given by a threshold (recall avaliation)
|
352 |
+
### Compute the metrics
|
353 |
+
wss_info = wss_calc(logits,labels, threshold )
|
354 |
+
acc_wssR = fn.accuracy(predict, labels, threshold=threshold)
|
355 |
+
f1_wssR = fn.f1_score(predict, labels, threshold=threshold)
|
|
|
|
|
|
|
|
|
356 |
|
357 |
|
358 |
metrics= {
|
|
|
370 |
# f1
|
371 |
"f1@95": f1_wss95.item(),
|
372 |
"f1@R": f1_wssR.item(),
|
373 |
+
# threshold 95
|
374 |
+
"threshold@95": thresholds95
|
375 |
}
|
376 |
|
377 |
+
# Print stats
|
|
|
378 |
if print_stats:
|
379 |
wss95= f"WSS@95:{wss95_info['wss']}, R: {wss95_info['R']}"
|
380 |
wss95_adj= f"ASSWSS@95:{wss95_info['awss']}"
|
|
|
382 |
print(wss95_adj)
|
383 |
print('Acc.:', round(acc_wss95.item(), 4))
|
384 |
print('F1-score:', round(f1_wss95.item(), 4))
|
385 |
+
print(f"threshold to wss95: {round(thresholds95, 4)}")
|
386 |
cm = pd.DataFrame(wss95_info['CM'],
|
387 |
index=index,
|
388 |
columns=col)
|
389 |
|
390 |
print("\nConfusion matrix:")
|
391 |
print(cm)
|
392 |
+
print("\n---Metrics with threshold:", threshold, "----\n")
|
393 |
wss= f"WSS@R:{wss_info['wss']}, R: {wss_info['R']}"
|
394 |
print(wss)
|
395 |
wss_adj= f"AWSS@R:{wss_info['awss']}"
|
|
|
404 |
print(cm)
|
405 |
|
406 |
|
407 |
+
# Plots
|
408 |
|
409 |
if make_plot:
|
410 |
|
411 |
fig, axes = plt.subplots(1, 4, figsize=(25,10))
|
412 |
alpha = torch.squeeze(predict).numpy()
|
413 |
|
414 |
+
# TSNE
|
|
|
415 |
p1 = sns.scatterplot(x=X_embedded[:, 0],
|
416 |
y=X_embedded[:, 1],
|
417 |
hue=labels,
|
418 |
+
alpha=alpha, ax = axes[0]).set_title('Predictions-TSNE', size=20)
|
419 |
|
420 |
+
|
421 |
+
# WSS@95
|
422 |
t_wss = predict >= thresholds95
|
423 |
t_wss = t_wss.squeeze().numpy()
|
|
|
424 |
p2 = sns.scatterplot(x=X_embedded[t_wss, 0],
|
425 |
y=X_embedded[t_wss, 1],
|
426 |
hue=labels[t_wss],
|
427 |
+
alpha=alpha[t_wss], ax = axes[1]).set_title('WSS@95', size=20)
|
428 |
|
429 |
+
# WSS@R
|
430 |
+
t = predict >= threshold
|
431 |
t = t.squeeze().numpy()
|
|
|
432 |
p3 = sns.scatterplot(x=X_embedded[t, 0],
|
433 |
y=X_embedded[t, 1],
|
434 |
hue=labels[t],
|
435 |
+
alpha=alpha[t], ax = axes[2]).set_title(f'Predictions-threshold {threshold}', size=20)
|
|
|
436 |
|
437 |
+
# ROC-Curve
|
438 |
roc_auc = auc(fpr, tpr)
|
439 |
lw = 2
|
|
|
440 |
axes[3].plot(
|
441 |
fpr,
|
442 |
tpr,
|
443 |
color="darkorange",
|
444 |
lw=lw,
|
445 |
label="ROC curve (area = %0.2f)" % roc_auc)
|
|
|
446 |
axes[3].plot([0, 1], [0, 1], color="navy", lw=lw, linestyle="--")
|
447 |
axes[3].axhline(y=0.95, color='r', linestyle='-')
|
448 |
+
# axes[3].set(xlabel="False Positive Rate", ylabel="True Positive Rate")
|
449 |
axes[3].legend(loc="lower right")
|
450 |
+
axes[3].set_title(label= "ROC", size = 20)
|
451 |
+
axes[3].set_ylabel("True Positive Rate", fontsize = 15)
|
452 |
+
axes[3].set_xlabel("False Positive Rate", fontsize = 15)
|
453 |
+
|
454 |
|
455 |
if show:
|
456 |
plt.show()
|
|
|
460 |
|
461 |
return metrics
|
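A minimal sketch of how these pieces are meant to be combined; `train_loader`, `test_loader`, `model` and `device` are assumed to already exist (for example from `prepare_data` and the meta-initialized model), and the threshold value is only illustrative:

logits, X_embedded, labels, features = train_loop(train_loader, test_loader,
                                                  model, device,
                                                  epoch=4, lr=2e-5,
                                                  print_info=False, name="demo")
metrics = plot(logits, X_embedded, labels, threshold=0.9,
               show=False, namefig="demo", make_plot=False,
               print_stats=True, save=False)
print(metrics["f1@95"], metrics["threshold@95"])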
462 |
|
463 |
+
|
464 |
def auc_plot(logits,labels, color = "darkorange", label = "test"):
|
465 |
predict = torch.sigmoid(logits).detach().clone()
|
466 |
fpr, tpr, thresholds = roc_curve(labels, predict.squeeze())
|
|
|
480 |
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
|
481 |
plt.axhline(y=0.95, color='r', linestyle='-')
|
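Assuming `auc_plot` draws on the current matplotlib axes, it can be called repeatedly to overlay ROC curves from different runs; `logits_base` and `logits_meta` below are placeholders for logits returned by two `train_loop` calls on the same test labels:

plt.figure(figsize=(7, 7))
auc_plot(logits_base, labels, color="navy", label="baseline")
auc_plot(logits_meta, labels, color="darkorange", label="meta-initialized")
plt.legend(loc="lower right")
plt.show()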
482 |
|
483 |
+
# Interface to evaluation
|
484 |
class diagnosis():
|
485 |
+
def __init__(self, names, Valid_resource, batch_size_test,
|
486 |
+
model, Info, device, treat_text_fun=None, start=0):
|
487 |
self.names=names
|
488 |
self.Valid_resource=Valid_resource
|
489 |
self.batch_size_test=batch_size_test
|
490 |
self.model=model
|
491 |
+
self.start=start
|
492 |
+
self.Info = Info
|
493 |
+
self.device = device
|
494 |
+
self.treat_text_fun = treat_text_fun
|
495 |
+
|
496 |
|
497 |
+
# BOX INPUT
|
498 |
self.value_trash = widgets.FloatText(
|
499 |
value=0.95,
|
500 |
+
description='threshold',
|
501 |
disabled=False
|
502 |
)
|
|
|
503 |
self.valueb = widgets.IntText(
|
504 |
value=10,
|
505 |
description='size',
|
506 |
disabled=False
|
507 |
)
|
508 |
|
509 |
+
# Buttons
|
510 |
self.train_b = widgets.Button(description="Train")
|
511 |
self.next_b = widgets.Button(description="Next")
|
512 |
self.eval_b = widgets.Button(description="Evaluation")
|
513 |
|
514 |
self.hbox = widgets.HBox([self.train_b, self.valueb])
|
515 |
|
516 |
+
# Click buttons functions
|
517 |
self.next_b.on_click(self.Next_button)
|
518 |
self.train_b.on_click(self.Train_button)
|
519 |
self.eval_b.on_click(self.Evaluation_button)
|
|
|
524 |
clear_output()
|
525 |
self.i=self.i+1
|
526 |
|
527 |
+
# Select the domain data
|
528 |
+
self.domain = self.names[self.i]
|
529 |
self.data = self.Valid_resource[self.Valid_resource['domain'] == self.domain]
|
530 |
+
|
531 |
+
print("Name:", self.domain)
|
532 |
print(self.data['label'].value_counts())
|
|
|
533 |
display(self.hbox)
|
534 |
display(self.next_b)
|
535 |
|
536 |
+
|
537 |
# Train button
|
538 |
def Train_button(self, y):
|
539 |
clear_output()
|
540 |
print(self.domain)
|
541 |
|
542 |
+
# Prepare data for training (domain-learner)
|
543 |
self.data_train_loader, self.data_test_loader, self.data_train, self.data_test = prepare_data(self.data,
|
544 |
train_size_per_class = self.valueb.value,
|
545 |
+
batch_size = {'train': self.Info['inner_batch_size'],
|
546 |
+
'test': self.batch_size_test},
|
547 |
+
max_seq_length = self.Info['max_seq_length'],
|
548 |
+
tokenizer = self.Info['tokenizer'],
|
549 |
input = "text",
|
550 |
+
output = "label",
|
551 |
+
treat_text_fun=self.treat_text_fun)
|
552 |
|
553 |
+
# Train the model and predict in the test set
|
554 |
self.logits, self.X_embedded, self.labels, self.features = train_loop(self.data_train_loader, self.data_test_loader,
|
555 |
+
self.model, self.device,
|
556 |
+
epoch = self.Info['inner_update_step'],
|
557 |
+
lr=self.Info['inner_update_lr'],
|
558 |
print_info=True,
|
559 |
name = self.domain)
|
560 |
|
|
|
563 |
display(tresh_box)
|
564 |
display(self.next_b)
|
565 |
|
566 |
+
|
567 |
# Evaluation button
|
568 |
def Evaluation_button(self, te):
|
569 |
clear_output()
|
|
|
572 |
print(self.domain)
|
573 |
# print("\n")
|
574 |
print("-------Train data-------")
|
575 |
+
print(self.data_train['label'].value_counts())
|
576 |
print("-------Test data-------")
|
577 |
+
print(self.data_test['label'].value_counts())
|
578 |
# print("\n")
|
579 |
|
580 |
display(self.next_b)
|
581 |
display(tresh_box)
|
582 |
display(self.hbox)
|
583 |
|
584 |
+
# Compute metrics
|
585 |
metrics = plot(self.logits, self.X_embedded, self.labels,
|
586 |
+
threshold=self.Info['threshold'], show = True,
|
|
|
587 |
namefig= 'test',
|
588 |
make_plot = True,
|
589 |
print_stats = True,
|
|
|
591 |
|
592 |
def __call__(self):
|
593 |
self.i= self.start-1
|
|
|
594 |
clear_output()
|
595 |
display(self.next_b)
|
596 |
|
597 |
|
598 |
|
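A sketch of how the interface is typically instantiated in a notebook; `Valid_resource` is assumed to be a DataFrame with 'text', 'label' and 'domain' columns, and `Info`, `model` and `device` are the objects used elsewhere in this module:

diag = diagnosis(names=names_to_valid, Valid_resource=Valid_resource,
                 batch_size_test=100, model=model, Info=Info,
                 device=device, treat_text_fun=None, start=0)
diag()   # shows the "Next" button; then use Train / Evaluation per domain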
599 |
|
600 |
+
# Simulate repeated attempts of the domain learner
|
601 |
+
def pipeline_simulation(Valid_resource, names_to_valid, path_save,
|
602 |
+
model, Info, device, initializer_model,
|
603 |
+
treat_text_fun=None):
|
604 |
+
n_attempt = 5
|
605 |
+
batch_test = 100
|
606 |
|
607 |
+
# Create a directory to save the information of each domain
|
608 |
+
for name in names_to_valid:
|
609 |
+
name = re.sub("\.csv", "",name)
|
610 |
+
Path(path_save + name + "/img").mkdir(parents=True, exist_ok=True)
|
611 |
|
612 |
+
# Dict to save ROC curves
|
613 |
+
roc_stats = defaultdict(lambda: defaultdict(
|
614 |
+
lambda: defaultdict(
|
615 |
+
list
|
616 |
+
)
|
617 |
+
)
|
618 |
+
)
|
619 |
|
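# The nested defaultdicts give roc_stats the shape
# {domain: {bert_layers: {"fpr": [curve per attempt, ...],
#                         "tpr": [curve per attempt, ...]}}},
# so each attempt can append its ROC curve without pre-creating keys and the
# whole object can be written out directly with json.dump below.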
620 |
|
621 |
622 |
|
623 |
+
all_metrics = []
|
624 |
+
# Loop over a list of domains
|
625 |
+
for name in names_to_valid:
|
626 |
|
627 |
+
# Select a domain dataset
|
628 |
+
data = Valid_resource[Valid_resource['domain'] == name].reset_index().drop("index", axis=1)
|
629 |
|
630 |
+
# Attempts simulation
|
631 |
+
for attempt in range(n_attempt):
|
632 |
+
print("---"*4,"attempt", attempt, "---"*4)
|
633 |
+
|
634 |
+
# Prepare data to pass to the model
|
635 |
+
data_train_loader, data_test_loader, _ , _ = prepare_data(data,
|
636 |
+
train_size_per_class = Info['k_spt'],
|
637 |
+
batch_size = {'train': Info['inner_batch_size'],
|
638 |
+
'test': batch_test},
|
639 |
+
max_seq_length = Info['max_seq_length'],
|
640 |
+
tokenizer = Info['tokenizer'],
|
641 |
+
input = "text",
|
642 |
+
output = "label",
|
643 |
+
treat_text_fun=treat_text_fun)
|
644 |
+
|
645 |
+
# Train the model and evaluate on the test set of the domain
|
646 |
+
logits, X_embedded, labels, features = train_loop(data_train_loader, data_test_loader,
|
647 |
+
model, device,
|
648 |
+
epoch = Info['inner_update_step'],
|
649 |
+
lr=Info['inner_update_lr'],
|
650 |
+
print_info=False,
|
651 |
+
name = name)
|
652 |
+
|
653 |
+
|
654 |
+
name_domain = re.sub("\.csv", "",name)
|
655 |
|
656 |
+
# Compute the metrics
|
657 |
+
metrics = plot(logits, X_embedded, labels,
|
658 |
+
threshold=Info['threshold'], show = False,
|
659 |
+
namefig= path_save + name_domain + "/img/" + str(attempt) + 'plots',
|
660 |
+
make_plot = True, print_stats = False, save = True)
|
661 |
|
662 |
+
# Compute the roc-curve
|
663 |
+
fpr, tpr, _ = roc_curve(labels, torch.sigmoid(logits).squeeze())
|
664 |
+
|
665 |
+
# Save the corresponding information for the domain
|
666 |
+
metrics['name'] = name_domain
|
667 |
+
metrics['layer_size'] = Info['bert_layers']
|
668 |
+
metrics['attempt'] = attempt
|
669 |
+
roc_stats[name_domain][str(Info['bert_layers'])]['fpr'].append(fpr.tolist())
|
670 |
+
roc_stats[name_domain][str(Info['bert_layers'])]['tpr'].append(tpr.tolist())
|
671 |
+
all_metrics.append(metrics)
|
672 |
+
|
673 |
+
# Save the metrics and the ROC curve of the attempt
|
674 |
+
pd.DataFrame(all_metrics).to_csv(path_save+ "metrics.csv")
|
675 |
+
roc_path = path_save + "roc_stats.json"
|
676 |
+
with open(roc_path, 'w') as fp:
|
677 |
+
json.dump(roc_stats, fp)
|
678 |
+
|
679 |
+
|
680 |
+
del fpr, tpr, logits, X_embedded, labels
|
681 |
+
del features, metrics, _
|
682 |
+
|
683 |
+
|
684 |
+
# Save the information used to evaluate the validation resource
|
685 |
+
save_info = Info.copy()
|
686 |
+
save_info['model'] = initializer_model.tokenizer.name_or_path
|
687 |
+
save_info.pop("tokenizer")
|
688 |
+
save_info.pop("bert_layers")
|
689 |
+
|
690 |
+
info_path = path_save+"info.json"
|
691 |
+
with open(info_path, 'w') as fp:
|
692 |
+
json.dump(save_info, fp)
|
693 |
+
|
694 |
+
|
695 |
+
# Loading dataset statistics
|
696 |
+
def load_data_statistics(paths, names):
|
697 |
+
size = []
|
698 |
+
pos = []
|
699 |
+
neg = []
|
700 |
+
for p in paths:
|
701 |
+
data = pd.read_csv(p)
|
702 |
+
data = data.dropna()
|
703 |
+
# Dataset size
|
704 |
+
size.append(len(data))
|
705 |
+
# Number of positive labels
|
706 |
+
pos.append(data['labels'].value_counts()[1])
|
707 |
+
# Number of negative labels
|
708 |
+
neg.append(data['labels'].value_counts()[0])
|
709 |
+
del data
|
710 |
+
|
711 |
+
info_load = pd.DataFrame({
|
712 |
+
"size":size,
|
713 |
+
"pos":pos,
|
714 |
+
"neg":neg,
|
715 |
+
"names":names,
|
716 |
+
"paths": paths })
|
717 |
+
return info_load
|
718 |
+
|
719 |
+
# Loading the datasets
|
720 |
+
def load_data(train_info_load):
|
721 |
+
|
722 |
+
col = ['abstract','title', 'labels', 'domain']
|
723 |
+
|
724 |
+
data_train = pd.DataFrame(columns=col)
|
725 |
+
for p in train_info_load['paths']:
|
726 |
+
data_temp = pd.read_csv(p).loc[:, ['labels', 'title', 'abstract']]
|
728 |
+
data_temp['domain'] = os.path.basename(p)
|
729 |
+
data_train = pd.concat([data_train, data_temp])
|
730 |
+
|
731 |
+
data_train['text'] = data_train['title'] + ' ' + data_train['abstract'].replace(np.nan, '')
|
732 |
|
733 |
+
return( data_train \
|
734 |
+
.replace({"labels":{0:"negative", 1:'positive'}})\
|
735 |
+
.rename({"labels":"label"} , axis=1)\
|
736 |
+
.loc[ :,("text","domain","label")]
|
737 |
+
)
|
738 |
|
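A sketch of how the two loaders fit together; the paths and names are placeholders, and each CSV is assumed to carry 'labels', 'title' and 'abstract' columns:

paths = ["data/domain_a.csv", "data/domain_b.csv"]
names = ["domain_a.csv", "domain_b.csv"]

stats = load_data_statistics(paths, names)   # size / pos / neg per dataset
Valid_resource = load_data(stats)            # columns: text, domain, label
print(Valid_resource['domain'].value_counts())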
|
|
739 |
|
740 |
|