import pandas as pd
import torch
import transformers
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from torch.nn import ConstantPad1d
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)


def preprocess(data: pd.DataFrame, tokenizer: transformers.BertTokenizer):
    """
    Preprocesses the data.
    ======================
    Parameters:
        data (pd.DataFrame): The data to be preprocessed.
    ----------------------
    Returns:
        trainDataloader (DataLoader): The training dataloader.
        validationDataloader (DataLoader): The validation dataloader.
        testDataloader (DataLoader): The testing dataloader.
        labelEncoder (LabelEncoder): The label encoder.
    """
    sentences = data.sentence.values
    labels = data.disease.values
    
    encodedSentences = []

    for sentence in sentences:
        # `encode` will:
        #   (1) Tokenize the sentence.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        encSentence = tokenizer.encode(
            sentence,                   # Sentence to encode.
            add_special_tokens=True,    # Add '[CLS]' and '[SEP]'.

            # `encode` also supports truncation and conversion to PyTorch
            # tensors, but the padding is done manually below, so those
            # options are left disabled.
            # max_length=128,           # Truncate all sentences.
            # return_tensors='pt',      # Return PyTorch tensors.
        )
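        # For a standard BERT vocabulary (e.g. bert-base-uncased) the result
        # looks like [101, ..., 102]: 101 and 102 are the `[CLS]` and `[SEP]`
        # token IDs, and 0 is reserved for `[PAD]`.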
        
        # Add the encoded sentence to the list.
        encodedSentences.append(encSentence)
    
    # Pad every sentence to a common length: the longest encoded sentence
    # plus a little headroom. BERT's `[PAD]` token has ID 0, so padding
    # with zeros is correct.
    MAX_LEN = max(len(sen) for sen in encodedSentences) + 10

    seq = [torch.tensor(sen) for sen in encodedSentences]
    padSequences = [ConstantPad1d((0, MAX_LEN - len(sen)), 0)(sen) for sen in seq]
    # All sequences are now exactly MAX_LEN long, so they can simply be
    # stacked into a single (num_sentences, MAX_LEN) tensor.
    encodedSentences = torch.stack(padSequences)
    
    attentionMasks = []

    for sentence in encodedSentences:
        # Create the attention mask.
        #   - If a token ID is 0, then it's padding, set the mask to 0.
        #   - If a token ID is > 0, then it's a real token, set the mask to 1.
        attMask = [int(token_id > 0) for token_id in sentence]
        
        # Store the attention mask for this sentence.
        attentionMasks.append(attMask)
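
    # (Equivalently, the masks could be built in one vectorized step:
    #      attentionMasks = (encodedSentences > 0).long().tolist()
    # which produces the same list of 0/1 masks.)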
    
    
    # Encode the string labels (disease names) as integers.
    labelEncoder = preprocessing.LabelEncoder()
    labels = labelEncoder.fit_transform(labels) # type: ignore

    # Split the inputs, labels, and masks together so they stay aligned,
    # rather than relying on repeated calls with the same random_state.
    # First split off 30% as the test set...
    (trainingSentences, testingSentences,
     trainingLabels, testingLabels,
     trainingMasks, testingMasks) = train_test_split(
        encodedSentences, labels, attentionMasks,
        test_size=0.3, random_state=2018)

    # ...then use 90% of the remainder for training and 10% for validation.
    (trainInputs, validationInputs,
     trainLabels, validationLabels,
     trainMasks, validationMasks) = train_test_split(
        trainingSentences, trainingLabels, trainingMasks,
        test_size=0.1, random_state=2018)
    
    # Convert the labels and masks into torch tensors, the required datatype
    # for our model. The padded inputs are already tensors, so converting
    # them again would only trigger a copy-construct warning.
    testInputs = testingSentences

    trainLabels = torch.tensor(trainLabels)
    validationLabels = torch.tensor(validationLabels)
    testLabels = torch.tensor(testingLabels)

    trainMasks = torch.tensor(trainMasks)
    validationMasks = torch.tensor(validationMasks)
    testMasks = torch.tensor(testingMasks)

    # The BERT authors recommend a fine-tuning batch size of 16 or 32.
    batchSize = 32

    # Training batches are drawn in random order; validation and test
    # batches are read sequentially, since order does not matter there.
    trainData = TensorDataset(trainInputs, trainMasks, trainLabels)
    trainSampler = RandomSampler(trainData)
    trainDataloader = DataLoader(trainData, sampler=trainSampler, batch_size=batchSize)

    validationData = TensorDataset(validationInputs, validationMasks, validationLabels)
    validationSampler = SequentialSampler(validationData)
    validationDataloader = DataLoader(validationData, sampler=validationSampler, batch_size=batchSize)
    
    testData = TensorDataset(testInputs, testMasks, testLabels)
    testSampler = SequentialSampler(testData)
    testDataloader = DataLoader(testData, sampler=testSampler, batch_size=batchSize)
    
    return trainDataloader, validationDataloader, testDataloader, labelEncoder
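

# Minimal usage sketch (the CSV path and model name below are assumptions;
# `preprocess` only needs a DataFrame with `sentence` and `disease` columns
# plus a BERT tokenizer):
if __name__ == "__main__":
    tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-uncased")
    data = pd.read_csv("data.csv")  # hypothetical path
    trainDl, valDl, testDl, labelEnc = preprocess(data, tokenizer)
    print(f"train batches: {len(trainDl)}, classes: {list(labelEnc.classes_)}")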