Allan Victor committed
Commit 1ba9831 · Parent(s): 34fb5ee

Upload Util_funs.py

Util_funs.py  +247 -2  CHANGED
@@ -3,7 +3,6 @@ import torch
 import numpy as np
 import random
 import json, pickle
-# from ML_SLRC import SLR_DataSet, SLR_Classifier
 
 import torch.nn.functional as F
 import torch.nn as nn
@@ -596,4 +595,250 @@ class diagnosis():
 self.i= self.start-1
 
 clear_output()
-display(self.next_b)
+display(self.next_b)
+
+import math
+import time
+import shutil
+import datetime
+import re
+import json
+import unicodedata
+from copy import deepcopy, copy
+from pathlib import Path
+from pprint import pprint
+
+import numpy as np
+import pandas as pd
+import seaborn as sns
+import matplotlib.pylab as plt
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.utils.data import Dataset, DataLoader
+
+import transformers
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+from sklearn.manifold import TSNE
+
+# Pre-trained model
+class Encoder(nn.Module):
+    def __init__(self, layers, freeze_bert, model):
+        super(Encoder, self).__init__()
+
+        # Dummy parameter, used to track the module's device/dtype
+        self.dummy_param = nn.Parameter(torch.empty(0))
+
+        # Deep-copy the pre-trained model so the original is left untouched
+        self.model = deepcopy(model)
+
+        # Freezing bert parameters
+        if freeze_bert:
+            for param in self.model.parameters():
+                param.requires_grad = False  # was `= freeze_bert`, which never actually froze anything
+
+        # Selecting hidden layers of the pre-trained model
+        old_model_encoder = self.model.encoder.layer
+        new_model_encoder = nn.ModuleList()
+
+        for i in layers:
+            new_model_encoder.append(old_model_encoder[i])
+
+        self.model.encoder.layer = new_model_encoder
+
+    # Feed forward
+    def forward(self, **x):
+        return self.model(**x)['pooler_output']
+
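
A minimal usage sketch (illustrative, not part of the commit): Encoder deep-copies a BERT-style backbone and keeps only the encoder layers whose indices appear in `layers`, so a shallower model can be carved out of a pre-trained one. The checkpoint name below is an assumption.

    from transformers import AutoModel

    backbone = AutoModel.from_pretrained("bert-base-uncased")  # assumed checkpoint
    encoder = Encoder(layers=range(4), freeze_bert=True, model=backbone)
    # encoder.model.encoder.layer now holds 4 of the original 12 layers
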
+# Complete model
+class SLR_Classifier(nn.Module):
+    def __init__(self, **data):
+        super(SLR_Classifier, self).__init__()
+
+        # Dummy parameter, used to track the module's device/dtype
+        self.dummy_param = nn.Parameter(torch.empty(0))
+
+        # Loss function: binary cross-entropy with logits, reduced to mean
+        self.loss_fn = nn.BCEWithLogitsLoss(reduction='mean',
+                                            pos_weight=torch.FloatTensor([data.get("pos_weight", 2.5)]))
+
+        # Pre-trained model
+        self.Encoder = Encoder(layers=data.get("bert_layers", range(12)),
+                               freeze_bert=data.get("freeze_bert", False),
+                               model=data.get("model"),
+                               )
+
+        # Feature map layer
+        self.feature_map = nn.Sequential(
+            # nn.LayerNorm(self.Encoder.model.config.hidden_size),
+            nn.BatchNorm1d(self.Encoder.model.config.hidden_size),
+            # nn.Dropout(data.get("drop", 0.5)),
+            nn.Linear(self.Encoder.model.config.hidden_size, 200),
+            nn.Dropout(data.get("drop", 0.5)),
+        )
+
+        # Classifier layer
+        self.classifier = nn.Sequential(
+            # nn.LayerNorm(self.Encoder.model.config.hidden_size),
+            # nn.BatchNorm1d(self.Encoder.model.config.hidden_size),
+            # nn.Dropout(data.get("drop", 0.5)),
+            nn.Tanh(),
+            nn.Linear(200, 1)
+        )
+
+        # Initializing the feature-map Linear layer (index 1 of the Sequential)
+        nn.init.normal_(self.feature_map[1].weight, mean=0, std=0.00001)
+        nn.init.zeros_(self.feature_map[1].bias)
+
+    # Feed forward
+    def forward(self, input_ids, attention_mask, token_type_ids, labels):
+        pooled = self.Encoder(**{"input_ids": input_ids,
+                                 "attention_mask": attention_mask,
+                                 "token_type_ids": token_type_ids})
+        feature = self.feature_map(pooled)
+        logit = self.classifier(feature)
+
+        predict = torch.sigmoid(logit)
+
+        # Loss on the raw logits (BCEWithLogitsLoss applies the sigmoid itself)
+        loss = self.loss_fn(logit.to(torch.float), labels.to(torch.float).unsqueeze(1))
+
+        return [loss, [feature, logit], predict]
+
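
A forward-pass sketch under the same assumptions (bert-base checkpoint, made-up text and label; illustrative, not part of the commit). The eval() call matters because nn.BatchNorm1d raises on a training-mode batch of size one:

    from transformers import AutoModel, AutoTokenizer

    backbone = AutoModel.from_pretrained("bert-base-uncased")   # assumed checkpoint
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    clf = SLR_Classifier(model=backbone).eval()

    enc = tokenizer("an abstract to screen", return_tensors="pt",
                    padding="max_length", truncation=True, max_length=64,
                    return_token_type_ids=True)
    labels = torch.tensor([1])
    loss, (feature, logit), prob = clf(enc["input_ids"], enc["attention_mask"],
                                       enc["token_type_ids"], labels)
    # prob is the sigmoid of the logit; loss is BCE-with-logits against labels
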
+# Undesirable patterns within texts
+patterns = {
+    'CONCLUSIONS AND IMPLICATIONS': '',
+    'BACKGROUND AND PURPOSE': '',
+    'EXPERIMENTAL APPROACH': '',
+    'KEY RESULTS AEA': '',
+    '©': '',
+    '®': '',
+    'μ': '',
+    '(C)': '',
+    'OBJECTIVE:': '',
+    'MATERIALS AND METHODS:': '',
+    'SIGNIFICANCE:': '',
+    'BACKGROUND:': '',
+    'RESULTS:': '',
+    'METHODS:': '',
+    'CONCLUSIONS:': '',
+    'AIM:': '',
+    'STUDY DESIGN:': '',
+    'CLINICAL RELEVANCE:': '',
+    'CONCLUSION:': '',
+    'HYPOTHESIS:': '',
+    'Questions/Purposes:': '',
+    'Introduction:': '',
+    'PURPOSE:': '',
+    'PATIENTS AND METHODS:': '',
+    'FINDINGS:': '',
+    'INTERPRETATIONS:': '',
+    'FUNDING:': '',
+    'PROGRESS:': '',
+    'CONTEXT:': '',
+    'MEASURES:': '',
+    'DESIGN:': '',
+    'BACKGROUND AND OBJECTIVES:': '',
+    '<p>': '',
+    '</p>': '',
+    '<<ETX>>': '',
+    '+/-': '',
+}
+
+# Keys are lower-cased because treat_text() lower-cases the input before replacing
+patterns = {x.lower(): y for x, y in patterns.items()}
+
+LABEL_MAP = {'negative': 0, 'positive': 1}
+
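
A quick illustration of the intended cleanup, using the multiple_replace helper defined at the bottom of this file (the sample string is made up; illustrative, not part of the commit):

    raw = "BACKGROUND: We measured a rise <p>in vitro</p>."
    print(multiple_replace(patterns, raw.lower()))
    # -> ' we measured a rise in vitro.'
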
+class SLR_DataSet(Dataset):
+    def __init__(self, **args):
+        self.tokenizer = args.get('tokenizer')
+        self.data = args.get('data')
+        self.max_seq_length = args.get("max_seq_length", 512)
+        self.INPUT_NAME = args.get("input", 'x')
+        self.LABEL_NAME = args.get("output", 'y')
+
+    # Tokenizing and processing text
+    def encode_text(self, example):
+        comment_text = example[self.INPUT_NAME]
+        comment_text = self.treat_text(comment_text)
+
+        try:
+            labels = LABEL_MAP[example[self.LABEL_NAME]]
+        except KeyError:
+            # Unlabeled examples get a sentinel label
+            labels = -1
+
+        # The text is paired with a fixed second sentence, as in the original code
+        encoding = self.tokenizer.encode_plus(
+            (comment_text, "It is great text"),
+            add_special_tokens=True,
+            max_length=self.max_seq_length,
+            return_token_type_ids=True,
+            padding="max_length",
+            truncation=True,
+            return_attention_mask=True,
+            return_tensors='pt',
+        )
+
+        return tuple((
+            encoding["input_ids"].flatten(),
+            encoding["attention_mask"].flatten(),
+            encoding["token_type_ids"].flatten(),
+            torch.tensor([labels], dtype=torch.long)
+        ))
+
+    # Text processing function
+    def treat_text(self, text):
+        text = unicodedata.normalize("NFKD", str(text))
+        text = multiple_replace(patterns, text.lower())
+        text = re.sub(r'(\(.+\))|(\[.+\])|( \d )|(<)|(>)|(- )', '', text)
+        text = re.sub(r'( +)', ' ', text)
+        text = re.sub(r'(, ,)|(,,)', ',', text)
+        text = re.sub(r'(%)|(per cent)', ' percent', text)
+        return text
+
+    def __len__(self):
+        return len(self.data)
+
+    # Returning data
+    def __getitem__(self, index: int):
+        data_row = self.data.reset_index().iloc[index]
+        temp_data = self.encode_text(data_row)
+        return temp_data
+
+# Regex multiple replace function
+def multiple_replace(mapping, text):
+    # Building one regex that matches any of the dict keys
+    regex = re.compile("(%s)" % "|".join(map(re.escape, mapping.keys())))
+
+    # Substituting each match with its mapped replacement
+    return regex.sub(lambda mo: mapping[mo.group(0)], text)
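
A usage sketch tying the pieces together (illustrative, not part of the commit), assuming a small pandas DataFrame whose column names match the class defaults 'x' and 'y':

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # assumed checkpoint
    df = pd.DataFrame({"x": ["first abstract", "second abstract"],
                       "y": ["positive", "negative"]})
    dataset = SLR_DataSet(data=df, tokenizer=tokenizer, max_seq_length=64)
    loader = DataLoader(dataset, batch_size=2, shuffle=True)

    input_ids, attention_mask, token_type_ids, labels = next(iter(loader))
    # input_ids.shape == (2, 64); labels.shape == (2, 1)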