import pandas as pd
import torch
import transformers
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from torch.nn import ConstantPad1d
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)


def preprocess(data: pd.DataFrame, tokenizer: transformers.BertTokenizer):
    """
    Preprocesses the data.
    ======================
    Parameters:
        data (pd.DataFrame): The data to be preprocessed. Must have
            `sentence` and `disease` columns.
        tokenizer (transformers.BertTokenizer): The tokenizer used to
            encode the sentences.
    ----------------------
    Returns:
        trainDataloader (DataLoader): The training dataloader.
        validationDataloader (DataLoader): The validation dataloader.
        testDataloader (DataLoader): The testing dataloader.
        labelEncoder (LabelEncoder): The label encoder.
    """
    sentences = data.sentence.values
    labels = data.disease.values
    encodedSentences = []
    for sentence in sentences:
        # `encode` will:
        #   (1) Tokenize the sentence.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        encSentence = tokenizer.encode(
            sentence,                  # Sentence to encode.
            add_special_tokens=True,   # Add '[CLS]' and '[SEP]'.
            # `encode` also supports truncation and conversion to PyTorch
            # tensors, but we pad to a common length ourselves below, so we
            # don't use those features here.
            # max_length = 128,        # Truncate all sentences.
            # return_tensors = 'pt',   # Return PyTorch tensors.
        )
        # Add the encoded sentence to the list.
        encodedSentences.append(encSentence)
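    # For illustration: with the (assumed) 'bert-base-uncased' checkpoint,
    # each entry of `encodedSentences` now looks like [101, <token IDs...>, 102],
    # where 101 is the ID of [CLS] and 102 the ID of [SEP].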
    # Pad every sequence to a common length; the +10 leaves a little headroom
    # beyond the longest sentence.
    MAX_LEN = max(len(sen) for sen in encodedSentences) + 10
    seq = [torch.tensor(sen) for sen in encodedSentences]
    # Right-pad each sequence with ID 0 (the [PAD] token in standard BERT
    # vocabularies) up to MAX_LEN.
    padSequences = [ConstantPad1d((0, MAX_LEN - len(sen)), 0)(sen) for sen in seq]
    # Every sequence is already MAX_LEN long, so `pad_sequence` simply stacks
    # them into a single (numSentences, MAX_LEN) tensor.
    encodedSentences = pad_sequence(padSequences, batch_first=True)
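    # Note: newer versions of `transformers` can do this padding in one call,
    # e.g. `tokenizer(list(sentences), padding=True, return_tensors='pt')`,
    # which returns both `input_ids` and `attention_mask`; the manual route
    # above is kept for clarity.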
    attentionMasks = []
    for sentence in encodedSentences:
        # Create the attention mask:
        #   - If a token ID is 0, it's padding, so set the mask to 0.
        #   - If a token ID is > 0, it's a real token, so set the mask to 1.
        attMask = [int(token_id > 0) for token_id in sentence]
        # Store the attention mask for this sentence.
        attentionMasks.append(attMask)
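    # For example, a 4-token sentence padded to length 8 gets the mask
    # [1, 1, 1, 1, 0, 0, 0, 0].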
    # Encode the string disease labels as integers.
    labelEncoder = preprocessing.LabelEncoder()
    labels = labelEncoder.fit_transform(labels)  # type: ignore
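    # E.g. (with hypothetical label values) ['flu', 'cold', 'flu'] becomes
    # [1, 0, 1]; `labelEncoder.inverse_transform` recovers the strings later.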
    # Hold out 30% of the sentences for testing.
    trainingSentences, testingSentences, trainingLabels, testingLabels = train_test_split(
        encodedSentences, labels, test_size=0.3, random_state=2018)
    # Of the remaining 70%, use 90% for training and 10% for validation.
    trainInputs, validationInputs, trainLabels, validationLabels = train_test_split(
        trainingSentences, trainingLabels, random_state=2018, test_size=0.1)
    # Split the attention masks the same way. Reusing the same `random_state`
    # and `test_size` guarantees each mask stays aligned with its sentence.
    trainingMasks, testingMasks, _, _ = train_test_split(
        attentionMasks, labels, random_state=2018, test_size=0.3)
    trainMasks, validationMasks, _, _ = train_test_split(
        trainingMasks, trainingLabels, random_state=2018, test_size=0.1)
    # Convert all inputs and labels into torch tensors, the required datatype
    # for our model. The sentence splits are already tensors (slices of
    # `encodedSentences`), so clone them rather than re-wrapping them with
    # `torch.tensor`, which would trigger a copy-construct warning.
    trainInputs = trainInputs.clone().detach()
    validationInputs = validationInputs.clone().detach()
    testInputs = testingSentences.clone().detach()
    trainLabels = torch.tensor(trainLabels)
    validationLabels = torch.tensor(validationLabels)
    testLabels = torch.tensor(testingLabels)
    trainMasks = torch.tensor(trainMasks)
    validationMasks = torch.tensor(validationMasks)
    testMasks = torch.tensor(testingMasks)
    # A batch size of 32 is within the range the BERT authors recommend for
    # fine-tuning (16 or 32).
    batchSize = 32
    # Draw training samples in random order.
    trainData = TensorDataset(trainInputs, trainMasks, trainLabels)
    trainSampler = RandomSampler(trainData)
    trainDataloader = DataLoader(trainData, sampler=trainSampler, batch_size=batchSize)
    # Read the validation and test sets sequentially; order doesn't matter there.
    validationData = TensorDataset(validationInputs, validationMasks, validationLabels)
    validationSampler = SequentialSampler(validationData)
    validationDataloader = DataLoader(validationData, sampler=validationSampler,
                                      batch_size=batchSize)
    testData = TensorDataset(testInputs, testMasks, testLabels)
    testSampler = SequentialSampler(testData)
    testDataloader = DataLoader(testData, sampler=testSampler, batch_size=batchSize)
    return trainDataloader, validationDataloader, testDataloader, labelEncoder
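

# A minimal usage sketch (an assumption, not part of the original pipeline):
# the checkpoint name and CSV path below are placeholders; the DataFrame just
# needs the `sentence` and `disease` columns that `preprocess` expects.
if __name__ == "__main__":
    tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-uncased")
    data = pd.read_csv("diseases.csv")  # hypothetical input file
    trainDataloader, validationDataloader, testDataloader, labelEncoder = \
        preprocess(data, tokenizer)
    # Each batch unpacks to (input IDs, attention masks, labels).
    inputIds, masks, batchLabels = next(iter(trainDataloader))
    print(inputIds.shape, masks.shape, batchLabels.shape)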