# sdlm/data/preprocessors.py
"""Implements data preprocessings including the T5 preprocessing."""
import itertools
import numpy as np
import torch
# TODO: the max here should perhaps also be the half-length.
def gpt_span_mask(
length, pad_length, use_half_length_as_prefix_size, eval_context_size
):
"""Given the length and pad_length for an input generates a prefix (GPT-style) mask."""
# Start of the sequence is not masked, so we consider length-1.
# TODO: we need an assert for length not be smaller than a value.
if not use_half_length_as_prefix_size:
        # high must be greater than low; otherwise we fall back to prefix_size = 1.
prefix_size = (
np.random.randint(low=1, high=int((length - 1) / 2)) if length >= 5 else 1
)
else:
        # If eval_context_size is set, we use it; otherwise we use half of the given length as
        # context. Note that the start token already counts as one context position, so we deduct
        # one from the given context size.
prefix_size = (
eval_context_size - 1
if eval_context_size is not None
else int((length - 1) / 2)
)
# The start token is not masked.
return (
[False]
+ [False] * prefix_size
+ [True] * (length - prefix_size - 1)
+ [False] * pad_length
)
def gpt_span_mask_batch(
batch, use_half_length_as_prefix_size=False, eval_context_size=None
):
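    """Generates a GPT-style prefix mask for each (unpadded) feature in the batch, padded to the batch max length."""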
lengths = [len(feature["input_ids"]) for feature in batch]
max_length = max(lengths)
masks = [
gpt_span_mask(
length,
max_length - length,
use_half_length_as_prefix_size,
eval_context_size,
)
for length in lengths
]
return torch.tensor(masks)
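# Example sketch of how gpt_span_mask_batch is used (this helper and its toy token ids
# are illustrative additions, not part of the original preprocessing code).
def _example_gpt_span_mask_batch():
    batch = [
        {"input_ids": [0, 11, 12, 13, 14, 15, 2]},
        {"input_ids": [0, 21, 22, 23, 2]},
    ]
    mask = gpt_span_mask_batch(batch)
    # One boolean row per example, padded to the longest sequence in the batch.
    assert mask.shape == (2, 7)
    # The start token is never masked: it always stays in the prefix/context.
    assert not bool(mask[0, 0])
    return mask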
def uncond_span_mask_batch(batch):
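    """Generates an all-True mask of shape [batch_size, max_length], i.e. every position is generated (unconditional)."""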
lengths = [len(feature["input_ids"]) for feature in batch]
max_length = max(lengths)
return torch.ones((len(batch), max_length), dtype=torch.bool)
def t5_random_spans_mask(
length, mask_ratio, mean_mask_span_length=3.0, rng=None, pad_length=None
):
"""Noise mask consisting of random spans of mask tokens.
The number of mask tokens and the number of mask spans and non-mask spans
are determined deterministically as follows:
    num_mask_tokens = int(length * mask_ratio)
    num_nonmask_spans = num_mask_spans = int(
        num_mask_tokens / mean_mask_span_length)
Spans alternate between non-mask and mask, beginning with non-mask.
Subject to the above restrictions, all masks are equally likely.
    Note that this function no longer treats the start/end of sequence tokens
    specially (see the note in the function body).
Args:
length: an int32 scalar (length of the incoming token sequence)
mask_ratio: a float - approximate ratio of output mask (between 0 and 1).
mean_mask_span_length: Average mask length.
      rng: a np.random.default_rng() instance or None.
      pad_length: optional number of trailing False (padding) positions to append.
    Returns:
      a boolean list of length `length` (plus `pad_length` trailing False values if given)
adapted from https://github.com/google-research/text-to-text-transfer-transformer/blob/master/t5/data/preprocessors.py#L2704
and https://github.com/allenai/contrastive_pretraining/blob/95fe35d3257402c7df362c3e0f746a40d9fba8f0/cpt/data.py#L288
"""
    # By default, we would not mask the start and end of sequence tokens.
    # TODO: we need an assert for this!
    # NOTE: this only works if we use line_by_line, which we do not, so it was removed.
# length -= 2
orig_length = length
# Increase length to avoid degeneracy.
length = max(length, 2)
# Compute number of mask tokens and mask spans.
num_mask_tokens = int(length * mask_ratio)
# Avoid degeneracy by ensuring positive numbers of mask and nonmask tokens.
num_mask_tokens = min(max(num_mask_tokens, 1), length - 1)
num_mask_spans = int(num_mask_tokens / mean_mask_span_length)
# Avoid degeneracy by ensuring positive number of mask spans.
num_mask_spans = max(num_mask_spans, 1)
num_nonmask_tokens = length - num_mask_tokens
mask_span_lengths = _random_segmentation(num_mask_tokens, num_mask_spans, rng=rng)
nonmask_span_lengths = _random_segmentation(
num_nonmask_tokens, num_mask_spans, rng=rng
)
mask = list(
itertools.chain.from_iterable(
[
[False] * nonmask_span_lengths[k] + [True] * mask_span_lengths[k]
for k in range(num_mask_spans)
]
)
)[:orig_length]
    # The start and end of the sequence would be set to False here. Again, since this is not
    # line_by_line, we removed this.
# mask = [False] + mask + [False]
if pad_length is not None:
        mask += [False] * pad_length
return mask
def t5_random_spans_mask_batch(batch, mask_ratio, mean_mask_span_length=3.0, rng=None):
"""Given not padded inputs, generates the T5 mask for each input."""
lengths = [len(feature["input_ids"]) for feature in batch]
max_length = max(lengths)
masks = [
t5_random_spans_mask(
length, mask_ratio, mean_mask_span_length, rng, max_length - length
)
for length in lengths
]
return torch.tensor(masks)
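# Example sketch of t5_random_spans_mask_batch (this helper and its toy inputs are
# illustrative additions): masks are padded to the longest example in the batch, and
# roughly mask_ratio of the real tokens in each row are marked True for span corruption.
def _example_t5_random_spans_mask_batch():
    rng = np.random.default_rng(0)
    batch = [
        {"input_ids": list(range(12))},
        {"input_ids": list(range(8))},
    ]
    masks = t5_random_spans_mask_batch(batch, mask_ratio=0.25, rng=rng)
    # One padded row per example.
    assert masks.shape == (2, 12)
    # int(12 * 0.25) == 3 positions of the first (unpadded) row are masked.
    assert masks[0].sum().item() == 3
    return masks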
def _random_segmentation(num_items, num_segments, rng=None):
"""Partition a sequence of items randomly into non-empty segments.
Args:
num_items: an integer scalar > 0
num_segments: an integer scalar in [1, num_items]
      rng: a np.random.default_rng() instance or None.
Returns:
a list with shape [num_segments] containing positive integers that add up to num_items.
forked from: https://github.com/allenai/contrastive_pretraining/blob/95fe35d3257402c7df362c3e0f746a40d9fba8f0/cpt/data.py#L265
"""
first_in_segment = np.arange(num_items - 1) < num_segments - 1
rng = rng or np.random.default_rng()
rng.shuffle(first_in_segment)
# The first position always starts a segment.
    # first_in_segment is a boolean array with one entry per position after the first,
    # signaling whether that position starts a new segment.
segment_id = np.cumsum(first_in_segment)
segment_length = [0] * num_segments
    segment_length[0] = 1  # The first item always starts segment 0 but is not represented in first_in_segment.
for k in range(num_items - 1):
segment_length[segment_id[k]] += 1
return segment_length
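# Small sanity-check sketch for _random_segmentation (an illustrative addition): the
# returned lengths always form num_segments strictly positive parts summing to num_items,
# whatever the RNG draw.
def _example_random_segmentation():
    rng = np.random.default_rng(0)
    lengths = _random_segmentation(num_items=10, num_segments=4, rng=rng)
    assert len(lengths) == 4
    assert sum(lengths) == 10
    assert all(x > 0 for x in lengths)
    return lengths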
def insert_extra_paddings(rng, token_ids, pad_token_id, padding_ratio):
"""Inserts padding tokens with the ratio of `padding_ratio` into the token_ids."""
# TODO: we need to assert to have start/end of sequence tokens.
# We do not add the padding in the start and end of sequence.
length = len(token_ids) - 2
num_padding_tokens = int(length * padding_ratio)
if num_padding_tokens == 0:
# In this case, the rate of padding tokens was not enough to add extra tokens.
return token_ids
length = length + num_padding_tokens
# We do not modify the start token.
all_ids = np.arange(1, length + 1)
# This is without shuffling.
# original_ids = np.arange(1, length+1)
rng = rng or np.random.default_rng()
rng.shuffle(all_ids)
    # Positions of the inserted padding tokens. These are interior positions, so the
    # start and end of sequence tokens are never replaced.
    padding_ids = all_ids[:num_padding_tokens]
token_ids_extended = []
current_id = 0
    for i in range(length + 2):
        if i in padding_ids:
            # Insert a padding token at this position.
            token_ids_extended.append(pad_token_id)
        else:
            # Copy the next original token (including the start/end of sequence tokens).
            token_ids_extended.append(token_ids[current_id])
            current_id += 1
return token_ids_extended
"""
# Other tokens positions, we do not change the start and end of sequence tokens.
other_tokens_ids = [0]+[x for x in original_ids if x not in padding_ids]+[length+1]
# Considers the start and end of sequence tokens in the final length.
token_ids_extended = np.full((length+2), pad_token_id, dtype=int)
token_ids_extended[other_tokens_ids] = token_ids
return token_ids_extended.tolist()
"""