import pandas as pd
import torch
import transformers
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from torch.nn import ConstantPad1d
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)


def preprocess(data: pd.DataFrame, tokenizer: transformers.BertTokenizer):
    """
    Preprocesses the data.
    ======================
    Parameters:
    data (pd.DataFrame): The data to be preprocessed.
    tokenizer (transformers.BertTokenizer): The tokenizer used to encode
        the sentences.
    ----------------------
    Returns:
    trainDataloader (DataLoader): The training dataloader.
    validationDataloader (DataLoader): The validation dataloader.
    testDataloader (DataLoader): The testing dataloader.
    labelEncoder (LabelEncoder): The label encoder.
    """
    sentences = data.sentence.values
    labels = data.disease.values

    encodedSentences = []
    for sentence in sentences:
        # `encode` will:
        #   (1) Tokenize the sentence.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        encSentence = tokenizer.encode(
            sentence,                 # Sentence to encode.
            add_special_tokens=True,  # Add '[CLS]' and '[SEP]'.
            # This function also supports truncation and conversion to
            # PyTorch tensors, but we need custom padding, so we can't
            # use those features.
            # max_length = 128,       # Truncate all sentences.
            # return_tensors = 'pt',  # Return PyTorch tensors.
        )

        # Add the encoded sentence to the list.
        encodedSentences.append(encSentence)

    # Pad every sequence to the length of the longest one, plus a margin
    # of 10 token positions.
    MAX_LEN = max(len(sen) for sen in encodedSentences) + 10
    seq = [torch.tensor(sen) for sen in encodedSentences]
    padSequences = [ConstantPad1d((0, MAX_LEN - len(sen)), 0)(sen)
                    for sen in seq]
    # All sequences now have length MAX_LEN, so `pad_sequence` simply
    # stacks them into a single (num_sentences, MAX_LEN) tensor.
    encodedSentences = pad_sequence(padSequences, batch_first=True)

    attentionMasks = []
    for sentence in encodedSentences:
        # Create the attention mask:
        # - If a token ID is 0, then it's padding, set the mask to 0.
        # - If a token ID is > 0, then it's a real token, set the mask to 1.
        attMask = [int(token_id > 0) for token_id in sentence]

        # Store the attention mask for this sentence.
        attentionMasks.append(attMask)

    # Encode the string disease labels as integer class IDs.
    labelEncoder = preprocessing.LabelEncoder()
    labels = labelEncoder.fit_transform(labels)  # type: ignore

    # Hold out 30% of the data for testing.
    trainingSentences, testingSentences, trainingLabels, testingLabels = train_test_split(
        encodedSentences, labels, test_size=0.3, random_state=2018)

    # Of the remaining data, use 90% for training and 10% for validation.
    trainInputs, validationInputs, trainLabels, validationLabels = train_test_split(
        trainingSentences, trainingLabels, random_state=2018, test_size=0.1)

    # Do the same for the masks. Reusing the same `random_state` and
    # `test_size` keeps these splits aligned with the sentence splits.
    trainingMasks, testingMasks, _, _ = train_test_split(
        attentionMasks, labels, random_state=2018, test_size=0.3)
    trainMasks, validationMasks, _, _ = train_test_split(
        trainingMasks, trainingLabels, random_state=2018, test_size=0.1)

    # Convert all inputs, labels, and masks into torch tensors, the
    # required datatype for our model.
    # The input splits are already tensors, so copy them with
    # `clone().detach()` rather than re-wrapping them in `torch.tensor`,
    # which would raise a UserWarning.
    trainInputs = trainInputs.clone().detach()
    validationInputs = validationInputs.clone().detach()
    testInputs = testingSentences.clone().detach()

    trainLabels = torch.tensor(trainLabels)
    validationLabels = torch.tensor(validationLabels)
    testLabels = torch.tensor(testingLabels)

    trainMasks = torch.tensor(trainMasks)
    validationMasks = torch.tensor(validationMasks)
    testMasks = torch.tensor(testingMasks)

    batchSize = 32

    # Training batches are drawn in random order; validation and test
    # batches are read sequentially.
    trainData = TensorDataset(trainInputs, trainMasks, trainLabels)
    trainSampler = RandomSampler(trainData)
    trainDataloader = DataLoader(trainData, sampler=trainSampler,
                                 batch_size=batchSize)

    validationData = TensorDataset(validationInputs, validationMasks,
                                   validationLabels)
    validationSampler = SequentialSampler(validationData)
    validationDataloader = DataLoader(validationData,
                                      sampler=validationSampler,
                                      batch_size=batchSize)

    testData = TensorDataset(testInputs, testMasks, testLabels)
    testSampler = SequentialSampler(testData)
    testDataloader = DataLoader(testData, sampler=testSampler,
                                batch_size=batchSize)

    return trainDataloader, validationDataloader, testDataloader, labelEncoder
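

# A minimal usage sketch, assuming a CSV file with `sentence` and
# `disease` columns (the column names `preprocess` reads) and the
# standard pretrained `bert-base-uncased` tokenizer. The file path and
# model name below are illustrative assumptions, not part of the
# original module.
if __name__ == "__main__":
    tokenizer = transformers.BertTokenizer.from_pretrained(
        "bert-base-uncased", do_lower_case=True)
    df = pd.read_csv("data.csv")  # Hypothetical input file.

    trainDataloader, validationDataloader, testDataloader, labelEncoder = \
        preprocess(df, tokenizer)

    # Each batch is an (input IDs, attention masks, labels) triple.
    inputIds, attentionMask, batchLabels = next(iter(trainDataloader))
    print(inputIds.shape, attentionMask.shape, batchLabels.shape)

    # `labelEncoder` maps predicted class IDs back to disease names.
    print(labelEncoder.classes_)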