import pandas as pd
import torch
import transformers
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from torch.nn import ConstantPad1d
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
TensorDataset)


def preprocess(data: pd.DataFrame, tokenizer: transformers.BertTokenizer):
    """
    Preprocesses the data.
    ======================
    Parameters:
        data (pd.DataFrame): The data to be preprocessed. Must contain
            `sentence` and `disease` columns.
        tokenizer (transformers.BertTokenizer): The tokenizer used to encode
            the sentences.
    ----------------------
    Returns:
        trainDataloader (DataLoader): The training dataloader.
        validationDataloader (DataLoader): The validation dataloader.
        testDataloader (DataLoader): The testing dataloader.
        labelEncoder (LabelEncoder): The fitted label encoder, mapping
            disease names to integer class IDs.
    """
sentences = data.sentence.values
labels = data.disease.values
encodedSentences = []
for sentence in sentences:
# `encode` will:
# (1) Tokenize the sentence.
# (2) Prepend the `[CLS]` token to the start.
# (3) Append the `[SEP]` token to the end.
# (4) Map tokens to their IDs.
encSentence = tokenizer.encode(
sentence, # Sentence to encode.
add_special_tokens = True, # Add '[CLS]' and '[SEP]'
            # This function also supports truncation and conversion to
            # PyTorch tensors, but we handle padding ourselves below, so
            # we don't use those options here.
            # max_length = 128,       # Truncate all sentences.
            # return_tensors = 'pt',  # Return PyTorch tensors.
)
# Add the encoded sentence to the list.
encodedSentences.append(encSentence)
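    # With bert-base-uncased, for example, every encoded sentence starts
    # with ID 101 ([CLS]) and ends with ID 102 ([SEP]); the IDs in between
    # are vocabulary indices specific to the tokenizer.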
    # Pad every sentence to a common length (the longest sentence plus a
    # little headroom) with token ID 0, which BERT treats as [PAD].
    MAX_LEN = max(len(sen) for sen in encodedSentences) + 10
    seq = [torch.tensor(sen) for sen in encodedSentences]
    padSequences = [ConstantPad1d((0, MAX_LEN - len(sen)), 0)(sen) for sen in seq]
    # Every padded sequence is already MAX_LEN long, so pad_sequence simply
    # stacks them into a single (num_sentences, MAX_LEN) tensor.
    encodedSentences = pad_sequence(padSequences, batch_first=True)
attentionMasks = []
for sentence in encodedSentences:
# Create the attention mask.
# - If a token ID is 0, then it's padding, set the mask to 0.
# - If a token ID is > 0, then it's a real token, set the mask to 1.
attMask = [int(token_id > 0) for token_id in sentence]
# Store the attention mask for this sentence.
attentionMasks.append(attMask)
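    # For instance, padded IDs [101, 1045, 2031, 102, 0, 0] produce the
    # mask [1, 1, 1, 1, 0, 0].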
    # Encode the string disease labels as integer class IDs.
    labelEncoder = preprocessing.LabelEncoder()
    labels = labelEncoder.fit_transform(labels)  # type: ignore
    # Hold out 30% of the data for testing.
    trainingSentences, testingSentences, trainingLabels, testingLabels = train_test_split(
        encodedSentences, labels, test_size=0.3, random_state=2018)
    # Use 90% of the remainder for training and 10% for validation.
    trainInputs, validationInputs, trainLabels, validationLabels = train_test_split(
        trainingSentences, trainingLabels, random_state=2018, test_size=0.1)
    # Split the masks the same way; reusing the same random_state and
    # test_size keeps each mask aligned with its sentence.
    trainingMasks, testingMasks, _, _ = train_test_split(
        attentionMasks, labels, random_state=2018, test_size=0.3)
    trainMasks, validationMasks, _, _ = train_test_split(
        trainingMasks, trainingLabels, random_state=2018, test_size=0.1)
    # Convert all inputs and labels into torch tensors, the required datatype
    # for our model. The input splits are already tensors (slices of the
    # padded batch), so clone/detach them rather than re-wrapping them with
    # torch.tensor, which warns about copy-construction.
    trainInputs = trainInputs.clone().detach()
    validationInputs = validationInputs.clone().detach()
    testInputs = testingSentences.clone().detach()
    trainLabels = torch.tensor(trainLabels)
    validationLabels = torch.tensor(validationLabels)
    testLabels = torch.tensor(testingLabels)
    trainMasks = torch.tensor(trainMasks)
    validationMasks = torch.tensor(validationMasks)
    testMasks = torch.tensor(testingMasks)
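    # The BERT authors recommend a fine-tuning batch size of 16 or 32.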
batchSize = 32
    trainData = TensorDataset(trainInputs, trainMasks, trainLabels)
trainSampler = RandomSampler(trainData)
trainDataloader = DataLoader(trainData, sampler=trainSampler, batch_size=batchSize)
validationData = TensorDataset(validationInputs, validationMasks, validationLabels)
validationSampler = SequentialSampler(validationData)
validationDataloader = DataLoader(validationData, sampler=validationSampler, batch_size=batchSize)
testData = TensorDataset(testInputs, testMasks, testLabels)
testSampler = SequentialSampler(testData)
testDataloader = DataLoader(testData, sampler=testSampler, batch_size=batchSize)
return trainDataloader, validationDataloader, testDataloader, labelEncoder
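

if __name__ == "__main__":
    # Minimal usage sketch, assuming a CSV file with `sentence` and `disease`
    # columns. The file name `diseases.csv` is a placeholder, and
    # `bert-base-uncased` is just one possible pretrained tokenizer.
    tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-uncased")
    data = pd.read_csv("diseases.csv")
    trainDl, validationDl, testDl, labelEncoder = preprocess(data, tokenizer)
    print(f"Train batches: {len(trainDl)}, classes: {len(labelEncoder.classes_)}")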