BecomeAllan
/

ML-SLRC

Model card Files Files and versions Community

File size: 22,465 Bytes

from ML_SLRC import *

import os
import numpy as np
import pandas as pd


from torch.utils.data import  DataLoader
from torch.optim import Adam

import gc
from torchmetrics import functional as fn

import random


from tqdm import tqdm

from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
import ipywidgets as widgets
from IPython.display import  display, clear_output
import matplotlib.pyplot as plt
import warnings
import torch

import time
from sklearn.manifold import TSNE
from copy import deepcopy
import seaborn as sns
import matplotlib.pylab as plt
import json
from pathlib import Path

import re
from collections import defaultdict

# SEED = 2222

# gen_seed = torch.Generator().manual_seed(SEED)






# Random seed function
def random_seed(value):
    torch.backends.cudnn.deterministic=True
    torch.manual_seed(value)
    torch.cuda.manual_seed(value)
    np.random.seed(value)
    random.seed(value)

# Tasks for meta-learner
def create_batch_of_tasks(taskset, is_shuffle = True, batch_size = 4):
    idxs = list(range(0,len(taskset)))
    if is_shuffle:
        random.shuffle(idxs)
    for i in range(0,len(idxs), batch_size):
        yield [taskset[idxs[i]] for i in range(i, min(i + batch_size,len(taskset)))]


# Prepare data to process by Domain-learner
def prepare_data(data, batch_size, tokenizer,max_seq_length,
                 input = 'text', output = 'label',
                 train_size_per_class = 5, global_datasets = False,
                 treat_text_fun =None):
  data = data.reset_index().drop("index", axis=1)

  if global_datasets:
    global data_train, data_test

  # Sample task for training
  data_train = data.groupby('label').sample(train_size_per_class, replace=False)
  idex = data.index.isin(data_train.index)

  # The Test set to label by the model
  data_test = data


  # Transform in dataset to model
  ## Train
  dataset_train = SLR_DataSet(
    data = data_train.sample(frac=1),
    input = input,
    output = output,
    tokenizer=tokenizer,
    max_seq_length =max_seq_length,
    treat_text =treat_text_fun)

  ## Test
  dataset_test = SLR_DataSet(
    data = data_test,
    input = input,
    output = output,
    tokenizer=tokenizer,
    max_seq_length =max_seq_length,
    treat_text =treat_text_fun)
  
  # Dataloaders
  ## Train 
  data_train_loader = DataLoader(dataset_train,
                           shuffle=True,
                           batch_size=batch_size['train']
                                )
  
  ## Test
  if len(dataset_test) % batch_size['test'] == 1 :
    data_test_loader = DataLoader(dataset_test,
                                    batch_size=batch_size['test'],
                                    drop_last=True)
  else:
    data_test_loader = DataLoader(dataset_test,
                                    batch_size=batch_size['test'],
                                    drop_last=False)

  return data_train_loader, data_test_loader, data_train, data_test


# Meta trainer
def meta_train(data, model, device, Info,
               print_epoch =True,
                Test_resource =None,
                treat_text_fun =None):

  # Meta-learner model
  learner = Learner(model = model, device = device, **Info)
  
  # Testing tasks
  if isinstance(Test_resource, pd.DataFrame):
    test = MetaTask(Test_resource, num_task = 0, k_support=10, k_query=10,
                  training=False,treat_text =treat_text_fun, **Info)


  torch.clear_autocast_cache()
  gc.collect()
  torch.cuda.empty_cache()

  # Meta epoch (Outer epoch)
  for epoch in tqdm(range(Info['meta_epoch']), desc= "Meta epoch ", ncols=80):
      
      # Train tasks
      train = MetaTask(data,
                      num_task = Info['num_task_train'],
                      k_support=Info['k_qry'],
                      k_query=Info['k_spt'],
                      treat_text =treat_text_fun, **Info)

      # Batch of train tasks
      db = create_batch_of_tasks(train, is_shuffle = True, batch_size = Info["outer_batch_size"])

      if print_epoch:
      # Outer loop bach training
        for step, task_batch in enumerate(db):          
            print("\n-----------------Training Mode","Meta_epoch:", epoch ,"-----------------\n")
            
            # meta-feedfoward (outer-feedfoward)
            acc = learner(task_batch, valid_train= print_epoch)
            print('Step:', step, '\ttraining Acc:', acc)
        
        if isinstance(Test_resource, pd.DataFrame):
          # Validating Model
          if ((epoch+1) % 4) + step == 0:
              random_seed(123)
              print("\n-----------------Testing Mode-----------------\n")
              
              # Batch of test tasks
              db_test = create_batch_of_tasks(test, is_shuffle = False, batch_size = 1)
              acc_all_test = []

              # Looping testing tasks
              for test_batch in db_test:
                  acc = learner(test_batch, training = False)
                  acc_all_test.append(acc)

              print('Test acc:', np.mean(acc_all_test))
              del acc_all_test, db_test

              # Restarting training randomly
              random_seed(int(time.time() % 10))

      else:
        for step, task_batch in enumerate(db):
            # meta-feedfoward (outer-feedfoward)
            acc = learner(task_batch, print_epoch, valid_train= print_epoch)

  torch.clear_autocast_cache()
  gc.collect()
  torch.cuda.empty_cache()



def train_loop(data_train_loader, data_test_loader, model, device, epoch = 4, lr = 1, print_info = True, name = 'name', weight_decay = 1):
  # Start the model's parameters
  model_meta = deepcopy(model)
  optimizer = Adam(model_meta.parameters(), lr=lr, weight_decay = weight_decay)

  model_meta.to(device)
  model_meta.train()

  # Task epoch (Inner epoch)
  for i in range(0, epoch):
      all_loss = []

      # Inner training batch (support set)
      for inner_step, batch in enumerate(data_train_loader):
          batch = tuple(t.to(device) for t in batch)
          input_ids, attention_mask,q_token_type_ids, label_id = batch
          
          # Inner Feedfoward
          loss, _, _ = model_meta(input_ids, attention_mask,q_token_type_ids, labels = label_id.squeeze())
          
          # compute grads
          loss.backward()

          # update parameters
          optimizer.step()
          optimizer.zero_grad()
          
          all_loss.append(loss.item())
      

      if (i % 2 == 0) & print_info:
          print("Loss: ", np.mean(all_loss))


  # Test evaluation
  model_meta.eval()
  all_loss = []
  all_acc = []
  features = []
  labels = []
  predi_logit = []

  with torch.no_grad():
      # Test's Batch loop
      for inner_step, batch in enumerate(tqdm(data_test_loader,
                                              desc="Test validation | " + name,
                                              ncols=80)) :
        batch = tuple(t.to(device) for t in batch)
        input_ids, attention_mask,q_token_type_ids, label_id = batch

        # Predictions
        _, feature, _ = model_meta(input_ids, attention_mask,q_token_type_ids, labels = label_id.squeeze())

 #       prediction = prediction.detach().cpu().squeeze()
  #      label_id = label_id.detach().cpu()
        logit = feature[1].detach().cpu()
   #     feature_lat = feature[0].detach().cpu()

#        labels.append(label_id.numpy().squeeze())
#        features.append(feature_lat.numpy())
        predi_logit.append(logit.numpy())

        # Accuracy over the test's bach
#        acc = fn.accuracy(prediction, label_id).item()
 #       all_acc.append(acc)
      del input_ids, attention_mask, label_id, batch

  if print_info:
    print("acc:", np.mean(all_acc))

  model_meta.to('cpu')
  gc.collect()
  torch.cuda.empty_cache()

  del model_meta, optimizer
  
  logits = np.concatenate(np.array(predi_logit,dtype=object))
  logits = torch.tensor(logits.astype(np.float32)).detach().clone()
#  return features, labels, predi_logit

  return logits.detach().clone()
  
# Process predictions and map the feature_map in tsne
def map_feature_tsne(features, labels, predi_logit):
  
  features = np.concatenate(np.array(features,dtype=object))
  features = torch.tensor(features.astype(np.float32)).detach().clone()
  
  labels = np.concatenate(np.array(labels,dtype=object))
  labels = torch.tensor(labels.astype(int)).detach().clone()

  logits = np.concatenate(np.array(predi_logit,dtype=object))
  logits = torch.tensor(logits.astype(np.float32)).detach().clone()

  # Dimention reduction
  X_embedded = TSNE(n_components=2, learning_rate='auto',
                    init='random').fit_transform(features.detach().clone())

  return logits.detach().clone(), X_embedded, labels.detach().clone(), features.detach().clone()
  
def wss_calc(logit, labels, trsh = 0.5):
  
  # Prediction label given the threshold
  predict_trash = torch.sigmoid(logit).squeeze() >= trsh
  
  # Compute confusion matrix values
  CM = confusion_matrix(labels, predict_trash.to(int) )
  tn, fp, fne, tp = CM.ravel()

  P = (tp + fne)  
  N = (tn + fp) 
  recall = tp/(tp+fne)

  # WSS
  wss = (tn + fne)/len(labels) -(1- recall)

  # AWSS
  awss = (tn/N - fne/P)

  return {
      "wss": round(wss,4),
      "awss": round(awss,4),
      "R": round(recall,4),
      "CM": CM
      }


# Compute the metrics
def plot(logits, X_embedded, labels, threshold, show = True,
         namefig = "plot", make_plot = True, print_stats = True, save = True):
  col = pd.MultiIndex.from_tuples([
                                   ("Predict", "0"),
                                   ("Predict", "1")
                                   ])
  index = pd.MultiIndex.from_tuples([
                                   ("Real", "0"),
                                   ("Real", "1")
                                   ])

  predict = torch.sigmoid(logits).detach().clone()

  # Roc curve
  fpr, tpr, thresholds = roc_curve(labels, predict.squeeze())

  # Given by a Recall of 95% (threshold avaliation)
  ## WSS
  ### Index to recall
  idx_wss95 = sum(tpr < 0.95)
  ### threshold
  thresholds95 = thresholds[idx_wss95]

  ### Compute the metrics
  wss95_info = wss_calc(logits,labels, thresholds95 )
  acc_wss95 = fn.accuracy(predict, labels, threshold=thresholds95)
  f1_wss95 = fn.f1_score(predict, labels, threshold=thresholds95)


  # Given by a threshold (recall avaliation)
  ### Compute the metrics
  wss_info = wss_calc(logits,labels, threshold )
  acc_wssR = fn.accuracy(predict, labels, threshold=threshold)
  f1_wssR = fn.f1_score(predict, labels, threshold=threshold)


  metrics= {
      # WSS
      "WSS@95": wss95_info['wss'],
      "AWSS@95": wss95_info['awss'],
      "WSS@R": wss_info['wss'],
      "AWSS@R": wss_info['awss'],
      # Recall
      "Recall_WSS@95": wss95_info['R'],
      "Recall_WSS@R": wss_info['R'],
      # acc
      "acc@95": acc_wss95.item(),
      "acc@R": acc_wssR.item(),
      # f1
      "f1@95": f1_wss95.item(),
      "f1@R": f1_wssR.item(),
      # threshold 95
      "threshold@95": thresholds95
  }

  # Print stats
  if print_stats:
    wss95= f"WSS@95:{wss95_info['wss']}, R: {wss95_info['R']}"
    wss95_adj= f"ASSWSS@95:{wss95_info['awss']}"
    print(wss95)
    print(wss95_adj)
    print('Acc.:', round(acc_wss95.item(), 4))
    print('F1-score:', round(f1_wss95.item(), 4))
    print(f"threshold to wss95: {round(thresholds95, 4)}")
    cm = pd.DataFrame(wss95_info['CM'],
              index=index,
              columns=col)
    
    print("\nConfusion matrix:")
    print(cm)
    print("\n---Metrics with threshold:", threshold, "----\n")
    wss= f"WSS@R:{wss_info['wss']}, R: {wss_info['R']}"
    print(wss)
    wss_adj= f"AWSS@R:{wss_info['awss']}"
    print(wss_adj)
    print('Acc.:', round(acc_wssR.item(), 4))
    print('F1-score:', round(f1_wssR.item(), 4))
    cm = pd.DataFrame(wss_info['CM'],
                index=index,
                columns=col)
      
    print("\nConfusion matrix:")
    print(cm)


  # Plots

  if make_plot:

    fig, axes = plt.subplots(1, 4, figsize=(25,10))
    alpha = torch.squeeze(predict).numpy()

    # TSNE
    p1 = sns.scatterplot(x=X_embedded[:, 0],
                  y=X_embedded[:, 1],
                  hue=labels,
                  alpha=alpha, ax = axes[0]).set_title('Predictions-TSNE', size=20)
    
    
    # WSS@95
    t_wss = predict >= thresholds95
    t_wss = t_wss.squeeze().numpy()
    p2 = sns.scatterplot(x=X_embedded[t_wss, 0],
                  y=X_embedded[t_wss, 1],
                  hue=labels[t_wss],
                  alpha=alpha[t_wss], ax = axes[1]).set_title('WSS@95', size=20)

    # WSS@R
    t = predict >= threshold
    t = t.squeeze().numpy()
    p3 = sns.scatterplot(x=X_embedded[t, 0],
                  y=X_embedded[t, 1],
                  hue=labels[t],
                  alpha=alpha[t], ax = axes[2]).set_title(f'Predictions-threshold {threshold}', size=20)

    # ROC-Curve
    roc_auc = auc(fpr, tpr)
    lw = 2
    axes[3].plot(
      fpr,
      tpr,
      color="darkorange",
      lw=lw,
      label="ROC curve (area = %0.2f)" % roc_auc)
    axes[3].plot([0, 1], [0, 1], color="navy", lw=lw, linestyle="--")
    axes[3].axhline(y=0.95, color='r', linestyle='-')
    # axes[3].set(xlabel="False Positive Rate", ylabel="True Positive Rate")
    axes[3].legend(loc="lower right")
    axes[3].set_title(label= "ROC", size = 20)
    axes[3].set_ylabel("True Positive Rate", fontsize = 15)
    axes[3].set_xlabel("False Positive Rate", fontsize = 15)
    

    if show:
      plt.show()
    
    if save:
      fig.savefig(namefig, dpi=fig.dpi)

  return metrics


def auc_plot(logits,labels, color = "darkorange", label = "test"):
    predict = torch.sigmoid(logits).detach().clone()
    fpr, tpr, thresholds = roc_curve(labels, predict.squeeze())
    roc_auc = auc(fpr, tpr)
    lw = 2

    label = label + str(round(roc_auc,2))
    # print(label)

    plt.plot(
      fpr,
      tpr,
      color=color,
      lw=lw,
      label= label 
      )
    plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
    plt.axhline(y=0.95, color='r', linestyle='-')

# Interface to evaluation
class diagnosis():
  def __init__(self, names, Valid_resource, batch_size_test,
   model,Info, device,treat_text_fun=None,start = 0):
    self.names=names
    self.Valid_resource=Valid_resource
    self.batch_size_test=batch_size_test
    self.model=model
    self.start=start
    self.Info = Info
    self.device = device
    self.treat_text_fun = treat_text_fun
    

    # BOX INPUT
    self.value_trash = widgets.FloatText(
        value=0.95,
        description='threshold',
        disabled=False
    )
    self.valueb = widgets.IntText(
        value=10,
        description='size',
        disabled=False
    )

    # Buttons
    self.train_b = widgets.Button(description="Train")
    self.next_b = widgets.Button(description="Next")
    self.eval_b = widgets.Button(description="Evaluation")

    self.hbox = widgets.HBox([self.train_b, self.valueb])

    # Click buttons functions
    self.next_b.on_click(self.Next_button)
    self.train_b.on_click(self.Train_button)
    self.eval_b.on_click(self.Evaluation_button)


  # Next button
  def Next_button(self,p):
    clear_output()
    self.i=self.i+1

    # Select the domain data
    self.domain = self.names[self.i]
    self.data = self.Valid_resource[self.Valid_resource['domain'] == self.domain]
    
    print("Name:", self.domain)
    print(self.data['label'].value_counts())
    display(self.hbox)
    display(self.next_b)


  # Train button
  def Train_button(self, y):
    clear_output()
    print(self.domain)

    # Prepare data for training (domain-learner)
    self.data_train_loader, self.data_test_loader, self.data_train, self.data_test = prepare_data(self.data,
              train_size_per_class = self.valueb.value,
              batch_size = {'train': self.Info['inner_batch_size'],
                            'test': self.batch_size_test},
              max_seq_length = self.Info['max_seq_length'],
              tokenizer = self.Info['tokenizer'],
              input = "text",
              output = "label",
              treat_text_fun=self.treat_text_fun)

    # Train the model and predict in the test set
    self.logits, self.X_embedded, self.labels, self.features = train_loop(self.data_train_loader, self.data_test_loader,
                                                        self.model, self.device,
                                                        epoch = self.Info['inner_update_step'],
                                                        lr=self.Info['inner_update_lr'],
                                                        print_info=True,
                                                        name = self.domain)

    tresh_box = widgets.HBox([self.eval_b, self.value_trash])
    display(self.hbox)
    display(tresh_box)
    display(self.next_b)


  # Evaluation button
  def Evaluation_button(self, te):
    clear_output()
    tresh_box = widgets.HBox([self.eval_b, self.value_trash])

    print(self.domain)
    # print("\n")
    print("-------Train data-------")
    print(data_train['label'].value_counts())
    print("-------Test data-------")
    print(data_test['label'].value_counts())
    # print("\n")
    
    display(self.next_b)
    display(tresh_box)
    display(self.hbox)

    # Compute metrics    
    metrics = plot(self.logits, self.X_embedded, self.labels,
                    threshold=self.Info['threshold'], show = True,
                    namefig= 'test',
                  make_plot = True,
                  print_stats = True,
                  save=False)

  def __call__(self):
    self.i= self.start-1
    clear_output()
    display(self.next_b)




# Simulation attemps of domain learner
def pipeline_simulation(Valid_resource, names_to_valid, path_save,
                        model, Info, device, initializer_model,
                        treat_text_fun=None):
  n_attempt  = 5
  batch_test = 100

  # Create a directory to save informations
  for name in names_to_valid:
    name = re.sub("\.csv", "",name)
    Path(path_save  + name + "/img").mkdir(parents=True, exist_ok=True)

  # Dict to sabe roc curves
  roc_stats = defaultdict(lambda: defaultdict(
      lambda: defaultdict(
          list
          )
      )
  )


  

  all_metrics = []
  # Loop over a list of domains
  for name in names_to_valid:
    
    # Select a domain dataset
    data = Valid_resource[Valid_resource['domain'] == name].reset_index().drop("index", axis=1)

    # Attempts simulation
    for attempt in range(n_attempt):
      print("---"*4,"attempt", attempt, "---"*4)
      
      # Prepare data to pass to the model
      data_train_loader, data_test_loader,  _ , _ = prepare_data(data,
                train_size_per_class = Info['k_spt'],
                batch_size = {'train': Info['inner_batch_size'],
                              'test': batch_test},
                max_seq_length = Info['max_seq_length'],
                tokenizer = Info['tokenizer'],
                input = "text",
                output = "label",
                treat_text_fun=treat_text_fun)

      # Train the model and evaluate on the test set of the domain
      logits, X_embedded, labels, features = train_loop(data_train_loader, data_test_loader,
                                                        model, device,
                                                        epoch = Info['inner_update_step'],
                                                        lr=Info['inner_update_lr'],
                                                        print_info=False,
                                                        name = name)
      
      
      name_domain = re.sub("\.csv", "",name)

      # Compute the metrics
      metrics = plot(logits, X_embedded, labels,
                    threshold=Info['threshold'], show = False,
                    namefig= path_save  + name_domain + "/img/" + str(attempt) + 'plots',
        make_plot = True, print_stats = False, save =  True)

      # Compute the roc-curve
      fpr, tpr, _ = roc_curve(labels, torch.sigmoid(logits).squeeze())
      
      # Save the correspoud information of the domain
      metrics['name'] = name_domain
      metrics['layer_size'] = Info['bert_layers']
      metrics['attempt'] = attempt
      roc_stats[name_domain][str(Info['bert_layers'])]['fpr'].append(fpr.tolist())
      roc_stats[name_domain][str(Info['bert_layers'])]['tpr'].append(tpr.tolist())
      all_metrics.append(metrics)

      # Save the metrics and the roc curve  of the attemp
      pd.DataFrame(all_metrics).to_csv(path_save+ "metrics.csv")
      roc_path =  path_save + "roc_stats.json"
      with open(roc_path, 'w') as fp:
          json.dump(roc_stats, fp)


      del fpr, tpr, logits, X_embedded, labels
      del features, metrics,  _


  # Save the information used to evaluate the validation resource
  save_info = Info.copy()
  save_info['model'] = initializer_model.tokenizer.name_or_path
  save_info.pop("tokenizer")
  save_info.pop("bert_layers")

  info_path =  path_save+"info.json"
  with open(info_path, 'w') as fp:
      json.dump(save_info, fp)


# Loading dataset statistics
def load_data_statistics(paths, names):
  size = []
  pos = []
  neg = []
  for p in paths:
    data = pd.read_csv(p) 
    data = data.dropna()
    # Dataset size
    size.append(len(data))
    # Number of positive labels
    pos.append(data['labels'].value_counts()[1])
    # Number of negative labels
    neg.append(data['labels'].value_counts()[0])
  del data

  info_load = pd.DataFrame({
      "size":size,
      "pos":pos,
      "neg":neg,
      "names":names,
      "paths": paths })
  return info_load

# Loading the datasets
def load_data(train_info_load):

  col = ['abstract','title', 'labels', 'domain']

  data_train = pd.DataFrame(columns=col)
  for p in train_info_load['paths']:  
    data_temp = pd.read_csv(p).loc[:, ['labels', 'title', 'abstract']]
    data_temp = pd.read_csv(p).loc[:, ['labels', 'title', 'abstract']]
    data_temp['domain'] = os.path.basename(p)
    data_train = pd.concat([data_train, data_temp])
    
  data_train['text'] = data_train['title'] + data_train['abstract'].replace(np.nan, '')

  return( data_train \
            .replace({"labels":{0:"negative", 1:'positive'}})\
            .rename({"labels":"label"} , axis=1)\
            .loc[ :,("text","domain","label")]
        )