"""Implements data preprocessings including the T5 preprocessing."""
import itertools

import numpy as np
import torch


# TODO: here the max perhaps also needs to be the half-length.
def gpt_span_mask(
    length, pad_length, use_half_length_as_prefix_size, eval_context_size
):
    """Given the length and pad_length for an input generates a prefix (GPT-style) mask."""
    # Start of the sequence is not masked, so we consider length-1.
    # TODO: we need an assert for length not be smaller than a value.
    if not use_half_length_as_prefix_size:
        # `high` must be greater than `low`; otherwise we fall back to prefix_size=1.
        prefix_size = (
            np.random.randint(low=1, high=int((length - 1) / 2)) if length >= 5 else 1
        )
    else:
        # If eval_context_size is set, we use it; otherwise we use half of the given length as
        # context. Note that since the start token also counts as (unmasked) context, we deduct
        # one from the given context size.
        prefix_size = (
            eval_context_size - 1
            if eval_context_size is not None
            else int((length - 1) / 2)
        )
    # The start token is not masked.
    return (
        [False]
        + [False] * prefix_size
        + [True] * (length - prefix_size - 1)
        + [False] * pad_length
    )


def gpt_span_mask_batch(
    batch, use_half_length_as_prefix_size=False, eval_context_size=None
):
    """Generates a prefix (GPT-style) mask for each input in the batch, padded to the max length."""
    lengths = [len(feature["input_ids"]) for feature in batch]
    max_length = max(lengths)
    masks = [
        gpt_span_mask(
            length,
            max_length - length,
            use_half_length_as_prefix_size,
            eval_context_size,
        )
        for length in lengths
    ]
    return torch.tensor(masks)
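

# Example usage (an illustrative sketch; the toy batch below is hypothetical and not part of
# this module). Each feature only needs an "input_ids" list, and the returned mask is padded
# to the longest example in the batch:
#
#   batch = [{"input_ids": list(range(6))}, {"input_ids": list(range(4))}]
#   mask = gpt_span_mask_batch(batch)
#   # mask is a bool tensor of shape (2, 6); the start token, the sampled prefix, and the
#   # padding positions are False, while the positions to be predicted are True.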


def uncond_span_mask_batch(batch):
    """Generates an all-True (unconditional) mask covering every position in the padded batch."""
    lengths = [len(feature["input_ids"]) for feature in batch]
    max_length = max(lengths)
    return torch.ones((len(batch), max_length), dtype=torch.bool)


def t5_random_spans_mask(
    length, mask_ratio, mean_mask_span_length=3.0, rng=None, pad_length=None
):
    """Noise mask consisting of random spans of mask tokens.

    The number of mask tokens and the number of mask spans and non-mask spans
    are determined deterministically as follows:
      num_mask_tokens = round(length * mask_ratio)
      num_nonmask_spans = num_mask_spans = round(
         num_mask_tokens / mean_mask_span_length)
    Spans alternate between non-mask and mask, beginning with non-mask.
    Subject to the above restrictions, all masks are equally likely.
    Note that this function do not mask start/end of sequence.
    Args:
      length: an int32 scalar (length of the incoming token sequence)
      mask_ratio: a float - approximate ratio of output mask (between 0 and 1).
      mean_mask_span_length: Average mask length.
      rng = a np.random.default_rng() instance or None
    Returns:
      a boolean list of shape [length]
    adapted from https://github.com/google-research/text-to-text-transfer-transformer/blob/master/t5/data/preprocessors.py#L2704
    and https://github.com/allenai/contrastive_pretraining/blob/95fe35d3257402c7df362c3e0f746a40d9fba8f0/cpt/data.py#L288
    """
    # By default, we do not mask the start and end of the sequence.
    # TODO: we need to put an assert for this!
    # NOTE: this only works when the data is processed line_by_line, which is not the case here, so it is disabled:
    # length -= 2
    orig_length = length
    # Increase length to avoid degeneracy.
    length = max(length, 2)

    # Compute number of mask tokens and mask spans.
    num_mask_tokens = int(length * mask_ratio)
    # Avoid degeneracy by ensuring positive numbers of mask and nonmask tokens.
    num_mask_tokens = min(max(num_mask_tokens, 1), length - 1)
    num_mask_spans = int(num_mask_tokens / mean_mask_span_length)
    # Avoid degeneracy by ensuring positive number of mask spans.
    num_mask_spans = max(num_mask_spans, 1)
    num_nonmask_tokens = length - num_mask_tokens
    mask_span_lengths = _random_segmentation(num_mask_tokens, num_mask_spans, rng=rng)
    nonmask_span_lengths = _random_segmentation(
        num_nonmask_tokens, num_mask_spans, rng=rng
    )
    mask = list(
        itertools.chain.from_iterable(
            [
                [False] * nonmask_span_lengths[k] + [True] * mask_span_lengths[k]
                for k in range(num_mask_spans)
            ]
        )
    )[:orig_length]
    # The start and end of the sequence would be set to False here. Again, since the data is not
    # processed line_by_line, this is disabled:
    # mask = [False] + mask + [False]
    if pad_length is not None:
        mask += [False for _ in range(pad_length)]
    return mask
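

# Worked example for t5_random_spans_mask (illustrative only; the span lengths below show one
# possible random segmentation): for length=20, mask_ratio=0.3, mean_mask_span_length=3.0,
#   num_mask_tokens = int(20 * 0.3) = 6
#   num_mask_spans  = max(int(6 / 3.0), 1) = 2
# so the 6 mask tokens are split into 2 spans (e.g. lengths [2, 4]) and the 14 non-mask
# tokens into 2 spans (e.g. lengths [9, 5]), giving a mask such as
#   [False] * 9 + [True] * 2 + [False] * 5 + [True] * 4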


def t5_random_spans_mask_batch(batch, mask_ratio, mean_mask_span_length=3.0, rng=None):
    """Given not padded inputs, generates the T5 mask for each input."""
    lengths = [len(feature["input_ids"]) for feature in batch]
    max_length = max(lengths)
    masks = [
        t5_random_spans_mask(
            length, mask_ratio, mean_mask_span_length, rng, max_length - length
        )
        for length in lengths
    ]
    return torch.tensor(masks)
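

# Example usage (an illustrative sketch; the toy batch below is hypothetical):
#
#   rng = np.random.default_rng(0)
#   batch = [{"input_ids": list(range(8))}, {"input_ids": list(range(6))}]
#   masks = t5_random_spans_mask_batch(batch, mask_ratio=0.25, rng=rng)
#   # masks is a bool tensor of shape (2, 8); the shorter example contributes two trailing
#   # False (padding) positions.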


def _random_segmentation(num_items, num_segments, rng=None):
    """Partition a sequence of items randomly into non-empty segments.
    Args:
      num_items: an integer scalar > 0
      num_segments: an integer scalar in [1, num_items]
      rng: a np.random.default_rng() instance or None.
    Returns:
      a list with shape [num_segments] containing positive integers that add up to num_items.
    forked from: https://github.com/allenai/contrastive_pretraining/blob/95fe35d3257402c7df362c3e0f746a40d9fba8f0/cpt/data.py#L265
    """
    first_in_segment = np.arange(num_items - 1) < num_segments - 1
    rng = rng or np.random.default_rng()
    rng.shuffle(first_in_segment)
    # The first position always starts a segment.
    # first_in_segment is a boolean array over every position after the first; it signals whether
    # that position is the start of a new segment.
    segment_id = np.cumsum(first_in_segment)
    segment_length = [0] * num_segments
    segment_length[0] = 1  # account for the first item, which always starts the first segment
    for k in range(num_items - 1):
        segment_length[segment_id[k]] += 1
    return segment_length
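

# Worked example (illustrative; the exact output depends on the shuffle):
# _random_segmentation(10, 3) places 2 segment boundaries among the 9 positions after the
# first item and returns 3 positive integers that sum to 10, e.g. [4, 3, 3] or [1, 7, 2].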


def insert_extra_paddings(rng, token_ids, pad_token_id, padding_ratio):
    """Inserts padding tokens with the ratio of `padding_ratio` into the token_ids."""
    # TODO: we need to assert to have start/end of sequence tokens.
    # We do not add the padding in the start and end of sequence.
    length = len(token_ids) - 2
    num_padding_tokens = int(length * padding_ratio)
    if num_padding_tokens == 0:
        # In this case, the padding ratio was too small to add any extra tokens.
        return token_ids
    length = length + num_padding_tokens
    # Candidate positions are 1..length, so the start (position 0) and end (position length + 1)
    # tokens are never replaced by padding.
    all_ids = np.arange(1, length + 1)
    # This is without shuffling.
    # original_ids = np.arange(1, length+1)
    rng = rng or np.random.default_rng()
    rng.shuffle(all_ids)
    # Positions at which padding tokens are inserted.
    padding_ids = all_ids[:num_padding_tokens]
    token_ids_extended = []
    current_id = 0
    for i in range(length + 2):
        if i in padding_ids:
            token_ids_extended.append(pad_token_id)
        else:
            token_ids_extended.append(token_ids[current_id])
            current_id += 1
    return token_ids_extended
    """
    # Positions of the other tokens; we do not change the start and end of sequence tokens.
    other_tokens_ids = [0]+[x for x in original_ids if x not in padding_ids]+[length+1]
    # Considers the start and end of sequence tokens in the final length.
    token_ids_extended = np.full((length+2), pad_token_id, dtype=int)
    token_ids_extended[other_tokens_ids] = token_ids
    return token_ids_extended.tolist()
    """