"""Implements data preprocessings including the T5 preprocessing."""
import itertools
import numpy as np
import torch

# TODO: the max here perhaps also needs to be the half-length.
def gpt_span_mask(
length, pad_length, use_half_length_as_prefix_size, eval_context_size
):
"""Given the length and pad_length for an input generates a prefix (GPT-style) mask."""
# Start of the sequence is not masked, so we consider length-1.
# TODO: we need an assert for length not be smaller than a value.
if not use_half_length_as_prefix_size:
# high should be higher than low, otherwise we set prefix_size=1.
prefix_size = (
np.random.randint(low=1, high=int((length - 1) / 2)) if length >= 5 else 1
)
else:
        # If eval_context_size is set, we use it; otherwise we use half of the given length as
        # context. Since the start token also counts toward the unmasked context, we deduct one
        # from the given context size.
prefix_size = (
eval_context_size - 1
if eval_context_size is not None
else int((length - 1) / 2)
)
# The start token is not masked.
return (
[False]
+ [False] * prefix_size
+ [True] * (length - prefix_size - 1)
+ [False] * pad_length
)
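
# Example sketch (illustrative only; the sizes are made up): with length=8, pad_length=2
# and eval_context_size=4, the start token plus three context tokens stay unmasked, the
# remaining four real tokens are masked, and the two padding positions are left unmasked.
def _example_gpt_span_mask():
    mask = gpt_span_mask(
        length=8,
        pad_length=2,
        use_half_length_as_prefix_size=True,
        eval_context_size=4,
    )
    assert mask == [False] * 4 + [True] * 4 + [False] * 2
    return mask
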
def gpt_span_mask_batch(
batch, use_half_length_as_prefix_size=False, eval_context_size=None
):
    """Generates a GPT-style prefix mask for each example in a batch, padded to the batch max length."""
lengths = [len(feature["input_ids"]) for feature in batch]
max_length = max(lengths)
masks = [
gpt_span_mask(
length,
max_length - length,
use_half_length_as_prefix_size,
eval_context_size,
)
for length in lengths
]
return torch.tensor(masks)
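
# Example sketch (illustrative only): gpt_span_mask_batch expects a list of feature dicts
# with "input_ids" and returns a bool tensor of shape (batch, max_length); shorter
# examples are right-padded with unmasked (False) positions.
def _example_gpt_span_mask_batch():
    batch = [{"input_ids": list(range(10))}, {"input_ids": list(range(6))}]
    masks = gpt_span_mask_batch(batch, use_half_length_as_prefix_size=True)
    assert masks.shape == (2, 10)
    assert masks.dtype == torch.bool
    return masks
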
def uncond_span_mask_batch(batch):
    """Generates an all-True mask (every position masked) for the batch, used for unconditional generation."""
lengths = [len(feature["input_ids"]) for feature in batch]
max_length = max(lengths)
return torch.ones((len(batch), max_length), dtype=torch.bool)
def t5_random_spans_mask(
length, mask_ratio, mean_mask_span_length=3.0, rng=None, pad_length=None
):
"""Noise mask consisting of random spans of mask tokens.
The number of mask tokens and the number of mask spans and non-mask spans
are determined deterministically as follows:
num_mask_tokens = round(length * mask_ratio)
num_nonmask_spans = num_mask_spans = round(
num_mask_tokens / mean_mask_span_length)
Spans alternate between non-mask and mask, beginning with non-mask.
Subject to the above restrictions, all masks are equally likely.
    Note that this function does not mask the start/end of the sequence.
    Args:
        length: an int32 scalar (length of the incoming token sequence).
        mask_ratio: a float - approximate ratio of the output that is masked (between 0 and 1).
        mean_mask_span_length: average length of a mask span.
        rng: a np.random.default_rng() instance or None.
        pad_length: if not None, number of extra False entries appended for padding.
    Returns:
        a boolean list of length [length] (plus pad_length, if given).
    Adapted from https://github.com/google-research/text-to-text-transfer-transformer/blob/master/t5/data/preprocessors.py#L2704
    and https://github.com/allenai/contrastive_pretraining/blob/95fe35d3257402c7df362c3e0f746a40d9fba8f0/cpt/data.py#L288
    """
    # By default, we do not mask the start and end of the sequence.
    # TODO: we need an assert for this!
    # NOTE: trimming the first/last token only applies when line_by_line is used, which we do not,
    # so it is disabled here.
    # length -= 2
orig_length = length
# Increase length to avoid degeneracy.
length = max(length, 2)
# Compute number of mask tokens and mask spans.
num_mask_tokens = int(length * mask_ratio)
# Avoid degeneracy by ensuring positive numbers of mask and nonmask tokens.
num_mask_tokens = min(max(num_mask_tokens, 1), length - 1)
num_mask_spans = int(num_mask_tokens / mean_mask_span_length)
# Avoid degeneracy by ensuring positive number of mask spans.
num_mask_spans = max(num_mask_spans, 1)
num_nonmask_tokens = length - num_mask_tokens
mask_span_lengths = _random_segmentation(num_mask_tokens, num_mask_spans, rng=rng)
nonmask_span_lengths = _random_segmentation(
num_nonmask_tokens, num_mask_spans, rng=rng
)
mask = list(
itertools.chain.from_iterable(
[
[False] * nonmask_span_lengths[k] + [True] * mask_span_lengths[k]
for k in range(num_mask_spans)
]
)
)[:orig_length]
    # The start and end of the sequence would be forced to False here; again, since this is not
    # line_by_line, it is disabled.
    # mask = [False] + mask + [False]
if pad_length is not None:
mask += [False for _ in range(pad_length)]
return mask
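
# Example sketch (illustrative only): the True entries cover roughly mask_ratio of the
# real tokens, grouped into random spans, and any pad positions stay unmasked.
def _example_t5_random_spans_mask():
    rng = np.random.default_rng(0)
    mask = t5_random_spans_mask(
        length=20, mask_ratio=0.15, mean_mask_span_length=3.0, rng=rng, pad_length=4
    )
    assert len(mask) == 24
    assert sum(mask[:20]) == int(20 * 0.15)  # three masked tokens in total
    assert not any(mask[20:])  # padding stays unmasked
    return mask
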
def t5_random_spans_mask_batch(batch, mask_ratio, mean_mask_span_length=3.0, rng=None):
"""Given not padded inputs, generates the T5 mask for each input."""
lengths = [len(feature["input_ids"]) for feature in batch]
max_length = max(lengths)
masks = [
t5_random_spans_mask(
length, mask_ratio, mean_mask_span_length, rng, max_length - length
)
for length in lengths
]
return torch.tensor(masks)
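
# Example sketch (illustrative only): the batch variant pads every mask to the longest
# example in the batch and stacks them into a single bool tensor.
def _example_t5_random_spans_mask_batch():
    rng = np.random.default_rng(0)
    batch = [{"input_ids": list(range(12))}, {"input_ids": list(range(8))}]
    masks = t5_random_spans_mask_batch(batch, mask_ratio=0.15, rng=rng)
    assert masks.shape == (2, 12)
    assert masks.dtype == torch.bool
    return masks
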
def _random_segmentation(num_items, num_segments, rng=None):
"""Partition a sequence of items randomly into non-empty segments.
Args:
num_items: an integer scalar > 0
num_segments: an integer scalar in [1, num_items]
        rng: a np.random.default_rng() instance or None.
Returns:
a list with shape [num_segments] containing positive integers that add up to num_items.
forked from: https://github.com/allenai/contrastive_pretraining/blob/95fe35d3257402c7df362c3e0f746a40d9fba8f0/cpt/data.py#L265
"""
    # Boolean flags, one per position after the first, marking whether that position starts a new
    # segment; exactly num_segments - 1 of them are True.
    first_in_segment = np.arange(num_items - 1) < num_segments - 1
    rng = rng or np.random.default_rng()
    rng.shuffle(first_in_segment)
    # The first position always starts a segment, so it is not represented in first_in_segment;
    # the cumulative sum assigns a segment id to every later position.
    segment_id = np.cumsum(first_in_segment)
    segment_length = [0] * num_segments
    segment_length[0] = 1  # Accounts for the first item, which always opens segment 0.
for k in range(num_items - 1):
segment_length[segment_id[k]] += 1
return segment_length
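
# Example sketch (illustrative only): the segment lengths are all positive and always
# sum back to num_items.
def _example_random_segmentation():
    rng = np.random.default_rng(0)
    lengths = _random_segmentation(num_items=10, num_segments=3, rng=rng)
    assert len(lengths) == 3
    assert sum(lengths) == 10
    assert all(piece >= 1 for piece in lengths)
    return lengths
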
def insert_extra_paddings(rng, token_ids, pad_token_id, padding_ratio):
"""Inserts padding tokens with the ratio of `padding_ratio` into the token_ids."""
# TODO: we need to assert to have start/end of sequence tokens.
# We do not add the padding in the start and end of sequence.
length = len(token_ids) - 2
num_padding_tokens = int(length * padding_ratio)
if num_padding_tokens == 0:
# In this case, the rate of padding tokens was not enough to add extra tokens.
return token_ids
length = length + num_padding_tokens
# We do not modify the start token.
all_ids = np.arange(1, length + 1)
    # Unshuffled position ids (used by the alternative implementation kept in the string below).
    # original_ids = np.arange(1, length + 1)
rng = rng or np.random.default_rng()
rng.shuffle(all_ids)
    # Positions where padding tokens will be inserted; all_ids already excludes position 0 (the
    # start token) and position length + 1 (the end token), so those are never padded.
    padding_ids = np.array(all_ids)[:num_padding_tokens]
token_ids_extended = []
current_id = 0
for i in range(length + 2):
        if i in padding_ids:
token_ids_extended.append(pad_token_id)
else:
token_ids_extended.append(token_ids[current_id])
current_id += 1
return token_ids_extended
"""
# Other tokens positions, we do not change the start and end of sequence tokens.
other_tokens_ids = [0]+[x for x in original_ids if x not in padding_ids]+[length+1]
# Considers the start and end of sequence tokens in the final length.
token_ids_extended = np.full((length+2), pad_token_id, dtype=int)
token_ids_extended[other_tokens_ids] = token_ids
return token_ids_extended.tolist()
"""