# Exploring the GLUE - MRPC dataset

In [12]:
from datasets import load_dataset
from pprint import pprint

raw_dataset = load_dataset(path = "glue", name = "mrpc")

In [7]:
!ls ~/.cache/huggingface/datasets/

_home_.cache_huggingface_datasets_glue_mrpc_0.0.0_bcdcba79d07bc864c1c254ccfcedcce55bcc9a8c.lock
downloads
glue


In [8]:
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [9]:
raw_dataset["train"][0]

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0}

In [10]:
raw_dataset["train"].features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [15]:
# Look at the 15th and 87th element of the train and validation datasets respectively
pprint(raw_dataset["train"][15])
pprint(raw_dataset["validation"][87])

{'idx': 16,
 'label': 0,
 'sentence1': 'Rudder was most recently senior vice president for the '
              'Developer & Platform Evangelism Business .',
 'sentence2': 'Senior Vice President Eric Rudder , formerly head of the '
              'Developer and Platform Evangelism unit , will lead the new '
              'entity .'}
{'idx': 812,
 'label': 0,
 'sentence1': 'However , EPA officials would not confirm the 20 percent figure '
              '.',
 'sentence2': 'Only in the past few weeks have officials settled on the 20 '
              'percent figure .'}


# Tokenizer for pair processing

In [16]:
from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)



In [18]:
inputs = tokenizer("This is the first sentence.", "This is the second one.")
pprint(inputs)

{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'input_ids': [101,
               2023,
               2003,
               1996,
               2034,
               6251,
               1012,
               102,
               2023,
               2003,
               1996,
               2117,
               2028,
               1012,
               102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]}


In [19]:
tokenizer.decode(inputs['input_ids'])

'[CLS] this is the first sentence. [SEP] this is the second one. [SEP]'

Here we can see that the tokenizer has appended the two sentences together and introduced `[CLS]` and `[SEP]` tokens specially because that's how bert was trained for next sentence prediction task.

In [25]:
print(tokenizer(raw_dataset["train"][15]["sentence1"]))

{'input_ids': [101, 24049, 2001, 2087, 3728, 3026, 3580, 2343, 2005, 1996, 9722, 1004, 4132, 9340, 12439, 2964, 2449, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [24]:
print(tokenizer(raw_dataset["train"][15]["sentence2"]))

{'input_ids': [101, 3026, 3580, 2343, 4388, 24049, 1010, 3839, 2132, 1997, 1996, 9722, 1998, 4132, 9340, 12439, 2964, 3131, 1010, 2097, 2599, 1996, 2047, 9178, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [26]:
print(tokenizer(raw_dataset["train"][15]["sentence1"], raw_dataset["train"][15]["sentence2"]))

{'input_ids': [101, 24049, 2001, 2087, 3728, 3026, 3580, 2343, 2005, 1996, 9722, 1004, 4132, 9340, 12439, 2964, 2449, 1012, 102, 3026, 3580, 2343, 4388, 24049, 1010, 3839, 2132, 1997, 1996, 9722, 1998, 4132, 9340, 12439, 2964, 3131, 1010, 2097, 2599, 1996, 2047, 9178, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


Here we need to observe the `token_type_ids` field. It is different if we encode the two sentences at the same time vs if we do them independently. Also the `[CLS]` and `[SEP]` tokens are added differently in the two cases.

# Dataset Map to create new datasets

In [28]:
def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

In [29]:
tokenized_datasets = raw_dataset.map(tokenize_function, batched=True)
tokenized_datasets

Map: 100%|██████████| 3668/3668 [00:00<00:00, 9953.91 examples/s] 
Map: 100%|██████████| 408/408 [00:00<00:00, 9044.46 examples/s]
Map: 100%|██████████| 1725/1725 [00:00<00:00, 9891.51 examples/s] 


DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [30]:
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

Here we see that as out tokenize functions returns new keys of `'input_ids', 'token_type_ids', 'attention_mask'`, those simply get added to the new tokenized_dataset Dataset and rest remains the same.

In [38]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

This `DataCollatorWithPadding` is meant to do dynamic collation of batches in the dataset based on the max length from among all the sequences in the batch.

In [42]:
samples = tokenized_datasets["train"][:3]
samples = {k: v for k, v in samples.items()}
[len(x) for x in samples["input_ids"]]

[50, 59, 47]

In [43]:
samples

{'sentence1': ['Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
  "Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .",
  'They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added .'],
 'sentence2': ['Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
  "Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 .",
  "On June 10 , the ship 's owners had published an advertisement on the Internet , offering the explosives for sale ."],
 'label': [1, 0, 1],
 'idx': [0, 1, 2],
 'input_ids': [[101,
   2572,
   3217,
   5831,
   5496,
   2010,
   2567,
   1010,
   3183,
   2002,
   2170,
   1000,
   1996,
   7409,
   1000,
   1010,
   1997,
   9969,
   4487,
   23809,
   3436,
   2010,
   3350,
   1012,
   102,
   7727,
   2000,
   2032,
   200

In [40]:
samples_to_collate = tokenized_datasets["train"][:3]
samples_to_collate.pop("sentence1"); samples_to_collate.pop("sentence2"); samples_to_collate.pop("idx");

In [41]:
batch = data_collator(samples_to_collate)
{k: v.shape for k, v in batch.items()}

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': torch.Size([3, 59]),
 'token_type_ids': torch.Size([3, 59]),
 'attention_mask': torch.Size([3, 59]),
 'labels': torch.Size([3])}

# Replication of the above preprocessing on GLUE-SST2 dataset

In [45]:
from datasets import load_dataset

raw_dataset = load_dataset("glue", "sst2")

Downloading data: 100%|██████████| 3.11M/3.11M [00:00<00:00, 4.89MB/s]
Downloading data: 100%|██████████| 72.8k/72.8k [00:00<00:00, 128kB/s]
Downloading data: 100%|██████████| 148k/148k [00:00<00:00, 260kB/s]
Generating train split: 100%|██████████| 67349/67349 [00:00<00:00, 467302.76 examples/s]
Generating validation split: 100%|██████████| 872/872 [00:00<00:00, 137580.24 examples/s]
Generating test split: 100%|██████████| 1821/1821 [00:00<00:00, 205588.75 examples/s]


In [50]:
raw_dataset["train"][0]

{'sentence': 'hide new secretions from the parental units ',
 'label': 0,
 'idx': 0}

In [46]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [53]:
def tokenize_function(sequence):
    return tokenizer(sequence["sentence"], padding = True, truncation = True, return_tensors="pt")

In [54]:
tokenized_dataset = raw_dataset.map(tokenize_function, batched = True)

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map: 100%|██████████| 67349/67349 [00:06<00:00, 11164.26 examples/s]
Map: 100%|██████████| 872/872 [00:00<00:00, 10952.43 examples/s]
Map: 100%|██████████| 1821/1821 [00:00<00:00, 11315.74 examples/s]


In [55]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [56]:
from transformers import DataCollatorWithPadding
dc = DataCollatorWithPadding(tokenizer = tokenizer, padding = True)

In [59]:
samples = tokenized_dataset["train"][:3]
samples = {k: v for k,v in samples.items() if k not in ["sentence", "ids"]}

In [61]:
samples.keys()

dict_keys(['label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'])

In [62]:
samples

{'label': [0, 0, 1],
 'idx': [0, 1, 2],
 'input_ids': [[101,
   5342,
   2047,
   3595,
   8496,
   2013,
   1996,
   18643,
   3197,
   102,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0],
  [101,
   3397,
   2053,
   15966,
   1010,
   2069,
   4450,
   2098,
   18201,
   2015,
   102,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0],
  [101,
   2008,
   7459,
   2049,
   3494,
   1998,
   10639,
   2015,
   2242,
   2738,
   3376,
   2055,
   2529,
   3267,
   102,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,

The datasets library is pretty intuitive in the way it is structured. We just need to make sure before collating, we have the necessary fields and drop the unnecessary fields from the dataset. And that we do dynamic padding based on a batch of data and not on the model dim or the max sequence length of the entire corpus. It will be economical in terms of computation and also help training.