import torch
import random
from typing import Literal, Optional
print("CUDA available:", torch.cuda.is_available())
print("CUDA device count:", torch.cuda.device_count())
if torch.cuda.is_available():
    print("GPU name:", torch.cuda.get_device_name(0))

from transformers import RobertaTokenizerFast, PretrainedConfig
from dataclasses import dataclass, asdict

from datasets import load_dataset
from pprintpp import pprint

@dataclass
class RobertaConfig(PretrainedConfig):
    ### Tokenizer Config
    vocab_size: int = 50265
    start_token: int = 0
    end_token: int = 2
    pad_token: int = 2
    mask_token: int = 50264

    ### Transformer Config ###
    embedding_dimension: int = 768
    num_transformer_blocks: int = 12
    num_attention_heads: int = 12
    mlp_ratio: int = 4
    layer_norm_eps: float = 1e-6
    hidden_dropout_p: float = 0.1
    attention_dropout_p: float = 0.1
    context_length: int = 512

    ### Masking Config ###
    masking_prob: float = 0.15

    ### Huggingface Config ###
    hf_model_name: str = "deepset/roberta-base-squad2"

    ### Model Config ###
    pretrained_backbone: Literal["pretrained", "pretrained_huggingface", "random"] = "pretrained"
    max_position_embeddings: int = 512
    path_to_pretrained_weights: Optional[str] = None

    ### to_dict() method added so this Config is compatible with the Huggingface Trainer ###
    def to_dict(self):
        return asdict(self)

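
# Illustrative usage sketch (not part of the original training code): instantiate the
# config and confirm to_dict() hands back a plain dictionary, which is what makes it
# usable with the Huggingface Trainer. The values printed simply echo the defaults above.
def _demo_roberta_config():
    config = RobertaConfig()
    config_dict = config.to_dict()

    # asdict() returns a plain dict of the dataclass fields
    assert isinstance(config_dict, dict)
    print("Vocab size:", config_dict["vocab_size"])              # 50265
    print("Embedding dim:", config_dict["embedding_dimension"])  # 768
    print("Backbone mode:", config_dict["pretrained_backbone"])  # "pretrained"
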

def random_masking_text(tokens,
                        special_tokens_mask,
                        vocab_size=50265,
                        special_ids=(0, 1, 2, 3, 50264),
                        mask_ratio=0.15,
                        mask_token=50264):
    """
    Function for our random masking of tokens (excluding special tokens). This follows the
    masking logic used by BERT/RoBERTa:

        - Select 15% of the tokens for masking
            - 80% of the selected tokens are replaced with the mask token
            - 10% of the selected tokens are replaced with another random token
            - 10% of the selected tokens are left alone

    This is almost identical to the masking function in our introductory Jupyter notebook
    walkthrough of masked language modeling, but minor changes are made so masking is applied
    to batches of tokens rather than one sequence at a time.
    """

    ### Create Random Uniform Sample Tensor ###
    random_masking = torch.rand(*tokens.shape)

    ### Set Value of Special Tokens to 1 so we DON'T MASK THEM ###
    random_masking[special_tokens_mask == 1] = 1

    ### Get Boolean of Words under Masking Threshold ###
    random_masking = (random_masking < mask_ratio)

    ### Create Labels ###
    labels = torch.full(tokens.shape, -100)
    labels[random_masking] = tokens[random_masking]

    ### Get Indexes of True ###
    random_selected_idx = random_masking.nonzero()

    ### 80% Of the Time Replace with Mask Token ###
    masking_flag = torch.rand(len(random_selected_idx))
    masking_flag = (masking_flag < 0.8)
    selected_idx_for_masking = random_selected_idx[masking_flag]

    ### Separate out remaining indexes to be assigned ###
    unselected_idx_for_masking = random_selected_idx[~masking_flag]

    ### 10% of the time (i.e. 50% of the remaining 20%) we fill with a random token ###
    ### The remaining times, leave the text as is ###
    masking_flag = torch.rand(len(unselected_idx_for_masking))
    masking_flag = (masking_flag < 0.5)
    selected_idx_for_random_filling = unselected_idx_for_masking[masking_flag]
    selected_idx_to_be_left_alone = unselected_idx_for_masking[~masking_flag]

    ### Fill Mask Tokens ###
    if len(selected_idx_for_masking) > 0:
        tokens[selected_idx_for_masking[:, 0], selected_idx_for_masking[:, 1]] = mask_token

    ### Fill Random Tokens ###
    if len(selected_idx_for_random_filling) > 0:
        non_special_ids = list(set(range(vocab_size)) - set(special_ids))
        randomly_selected_tokens = torch.tensor(random.sample(non_special_ids, len(selected_idx_for_random_filling)))
        tokens[selected_idx_for_random_filling[:, 0], selected_idx_for_random_filling[:, 1]] = randomly_selected_tokens

    return tokens, labels

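
# Illustrative sketch (not part of the original script): apply the masking function to a
# small tokenized batch. The sentences are made up purely for demonstration; the checkpoint
# name matches the one used elsewhere in this file.
def _demo_random_masking():
    demo_tokenizer = RobertaTokenizerFast.from_pretrained("deepset/roberta-base-squad2")

    batch = demo_tokenizer(
        ["The quick brown fox jumps over the lazy dog.",
         "Masked language modeling hides a random subset of tokens."],
        padding=True,
        return_special_tokens_mask=True,
        return_tensors="pt",
    )

    masked_tokens, labels = random_masking_text(
        tokens=batch["input_ids"].clone(),
        special_tokens_mask=batch["special_tokens_mask"],
        mask_ratio=0.15,
    )

    # Positions selected for masking keep their original id in `labels`; everything
    # else is -100 so the cross-entropy loss ignores those positions.
    print("Masked input ids:", masked_tokens)
    print("MLM labels:", labels)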


def ExtractiveQAPreProcesing():
    """
    Builds and returns a preprocessing function (a closure over the tokenizer)
    that can be passed to datasets.map() for extractive question answering.
    """

    tokenizer = RobertaTokenizerFast.from_pretrained("deepset/roberta-base-squad2")

    def char2token_mapping(examples):
        """
        Flatten each story's questions into (question, story) pairs, tokenize them,
        and convert character-level answer spans into token-level start/end positions.
        """
        # pprint(examples)

        questions = [q.strip() for sublist in examples["questions"] for q in sublist]
        # pprint(questions)
        stories = []
        for idx, sublist in enumerate(examples["questions"]):
            stories.extend([examples["story"][idx]] * len(sublist))

        # Now both questions and stories are 1D lists of the same length
        input = tokenizer(
            text=questions,
            text_pair=stories,
            max_length=512,
            truncation="only_second",
            return_offsets_mapping=True,
            padding="max_length",
        )
        offset_mapping = input.pop("offset_mapping")
        # pprint(input)
        answers = examples["answers"]
        input_text = []
        answer_start = []
        answer_end = []

        for answer in answers:
            input_text.extend(answer["input_text"])
            answer_start.extend(answer["answer_start"])
            answer_end.extend(answer["answer_end"])


        starting_token_idxs = []
        ending_token_idxs = []

        for i, offset in enumerate(offset_mapping):

            start_char = answer_start[i]
            end_char = answer_end[i]

            # if start_char == -1 or end_char == -1:
            #     starting_token_idxs.append(0)
            #     ending_token_idxs.append(0)
            #     continue

            sequence_ids = input.sequence_ids(i)

            context_start = None
            context_end = None

            for idx, seq_id in enumerate(sequence_ids):
                if context_start is None and seq_id == 1:
                    context_start = idx
                elif context_start is not None and seq_id != 1:
                    context_end = idx - 1
                    break
                elif context_start is not None and idx == len(sequence_ids) - 1:
                    context_end = idx


            context_start_char = offset[context_start][0]
            context_end_char = offset[context_end][-1]

            if (start_char >= context_start_char) and (end_char <= context_end_char):
                # print(start_char, end_char)
                start_token_idx = None
                end_token_idx = None
                for token_idx, (offsets, seq_id) in enumerate(zip(offset, sequence_ids)):
                    if seq_id == 1:
                        if start_char in range(offsets[0], offsets[1] + 1):
                            start_token_idx = token_idx
                        if end_char in range(offsets[0], offsets[1] + 1):
                            end_token_idx = token_idx

                starting_token_idxs.append(start_token_idx)
                ending_token_idxs.append(end_token_idx)
                # print("start_token_idx", start_token_idx, "end_token_idx", end_token_idx)
            else:
                starting_token_idxs.append(0)
                ending_token_idxs.append(0)
        

        input["start_positions"] = starting_token_idxs
        input["end_positions"] = ending_token_idxs
        return input

    return char2token_mapping
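

# Illustrative sketch (not part of the original script) of how the preprocessing closure
# above would typically be applied with datasets.map(). The dataset name "stanfordnlp/coqa"
# is an assumption inferred from the column names used above ("questions", "story",
# "answers"); substitute whichever dataset the project actually trains on.
def _demo_extractive_qa_preprocessing():
    preprocess_fn = ExtractiveQAPreProcesing()

    dataset = load_dataset("stanfordnlp/coqa", split="train[:16]")

    tokenized = dataset.map(
        preprocess_fn,
        batched=True,
        remove_columns=dataset.column_names,
    )

    # Each row now carries input_ids / attention_mask plus the token-level
    # start_positions and end_positions computed above.
    pprint(tokenized[0].keys())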