"""Implements data preprocessings including the T5 preprocessing."""
import itertools

import numpy as np
import torch


# TODO: here the max perhaps also needs to be the half-length.
def gpt_span_mask(
    length, pad_length, use_half_length_as_prefix_size, eval_context_size
):
    """Given the length and pad_length for an input generates a prefix (GPT-style) mask."""
    # Start of the sequence is not masked, so we consider length-1.
    # TODO: we need an assert for length not be smaller than a value.
    if not use_half_length_as_prefix_size:
        # `high` must be greater than `low`; otherwise we fall back to prefix_size=1.
        prefix_size = (
            np.random.randint(low=1, high=int((length - 1) / 2)) if length >= 5 else 1
        )
    else:
        # If eval_context_size is set, we use it; otherwise we use half of the given length as
        # context. Note that since the start token also counts as (unmasked) context, we deduct
        # one from the given context size.
        prefix_size = (
            eval_context_size - 1
            if eval_context_size is not None
            else int((length - 1) / 2)
        )
    # The start token is not masked.
    return (
        [False]
        + [False] * prefix_size
        + [True] * (length - prefix_size - 1)
        + [False] * pad_length
    )


def gpt_span_mask_batch(
    batch, use_half_length_as_prefix_size=False, eval_context_size=None
):
    """Generates a prefix (GPT-style) mask for each input in the batch, padded to the max length."""
    lengths = [len(feature["input_ids"]) for feature in batch]
    max_length = max(lengths)
    masks = [
        gpt_span_mask(
            length,
            max_length - length,
            use_half_length_as_prefix_size,
            eval_context_size,
        )
        for length in lengths
    ]
    return torch.tensor(masks)
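

# Example usage (an illustrative sketch; the toy batch below is hypothetical and not part of
# this module). Each feature only needs an "input_ids" list, and the returned mask is padded
# to the longest example in the batch:
#
#   batch = [{"input_ids": list(range(6))}, {"input_ids": list(range(4))}]
#   mask = gpt_span_mask_batch(batch)
#   # mask is a bool tensor of shape (2, 6); the start token, the sampled prefix, and the
#   # padding positions are False, while the positions to be predicted are True.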


def uncond_span_mask_batch(batch):
    """Generates an all-True (unconditional) mask covering every position in the padded batch."""
    lengths = [len(feature["input_ids"]) for feature in batch]
    max_length = max(lengths)
    return torch.ones((len(batch), max_length), dtype=torch.bool)


def t5_random_spans_mask(
    length, mask_ratio, mean_mask_span_length=3.0, rng=None, pad_length=None
):
    """Noise mask consisting of random spans of mask tokens.

    The number of mask tokens and the number of mask spans and non-mask spans
    are determined deterministically as follows:
      num_mask_tokens = round(length * mask_ratio)
      num_nonmask_spans = num_mask_spans = round(
         num_mask_tokens / mean_mask_span_length)
    Spans alternate between non-mask and mask, beginning with non-mask.
    Subject to the above restrictions, all masks are equally likely.
    Note that this function do not mask start/end of sequence.
    Args:
      length: an int32 scalar (length of the incoming token sequence)
      mask_ratio: a float - approximate ratio of output mask (between 0 and 1).
      mean_mask_span_length: Average mask length.
      rng = a np.random.default_rng() instance or None
    Returns:
      a boolean list of shape [length]
    adapted from https://github.com/google-research/text-to-text-transfer-transformer/blob/master/t5/data/preprocessors.py#L2704
    and https://github.com/allenai/contrastive_pretraining/blob/95fe35d3257402c7df362c3e0f746a40d9fba8f0/cpt/data.py#L288
    """
    # By default, we do not mask the start and end of the sequence.
    # TODO: we need to put an assert for this!
    # NOTE: this only works when the data is processed line_by_line, which is not the case here, so it is disabled:
    # length -= 2
    orig_length = length
    # Increase length to avoid degeneracy.
    length = max(length, 2)

    # Compute number of mask tokens and mask spans.
    num_mask_tokens = int(length * mask_ratio)
    # Avoid degeneracy by ensuring positive numbers of mask and nonmask tokens.
    num_mask_tokens = min(max(num_mask_tokens, 1), length - 1)
    num_mask_spans = int(num_mask_tokens / mean_mask_span_length)
    # Avoid degeneracy by ensuring positive number of mask spans.
    num_mask_spans = max(num_mask_spans, 1)
    num_nonmask_tokens = length - num_mask_tokens
    mask_span_lengths = _random_segmentation(num_mask_tokens, num_mask_spans, rng=rng)
    nonmask_span_lengths = _random_segmentation(
        num_nonmask_tokens, num_mask_spans, rng=rng
    )
    mask = list(
        itertools.chain.from_iterable(
            [
                [False] * nonmask_span_lengths[k] + [True] * mask_span_lengths[k]
                for k in range(num_mask_spans)
            ]
        )
    )[:orig_length]
    # The start and end of the sequence would be set to False here. Again, since the data is not
    # processed line_by_line, this is disabled:
    # mask = [False] + mask + [False]
    if pad_length is not None:
        mask += [False for _ in range(pad_length)]
    return mask
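

# Worked example for t5_random_spans_mask (illustrative only; the span lengths below show one
# possible random segmentation): for length=20, mask_ratio=0.3, mean_mask_span_length=3.0,
#   num_mask_tokens = int(20 * 0.3) = 6
#   num_mask_spans  = max(int(6 / 3.0), 1) = 2
# so the 6 mask tokens are split into 2 spans (e.g. lengths [2, 4]) and the 14 non-mask
# tokens into 2 spans (e.g. lengths [9, 5]), giving a mask such as
#   [False] * 9 + [True] * 2 + [False] * 5 + [True] * 4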


def t5_random_spans_mask_batch(batch, mask_ratio, mean_mask_span_length=3.0, rng=None):
    """Given not padded inputs, generates the T5 mask for each input."""
    lengths = [len(feature["input_ids"]) for feature in batch]
    max_length = max(lengths)
    masks = [
        t5_random_spans_mask(
            length, mask_ratio, mean_mask_span_length, rng, max_length - length
        )
        for length in lengths
    ]
    return torch.tensor(masks)
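

# Example usage (an illustrative sketch; the toy batch below is hypothetical):
#
#   rng = np.random.default_rng(0)
#   batch = [{"input_ids": list(range(8))}, {"input_ids": list(range(6))}]
#   masks = t5_random_spans_mask_batch(batch, mask_ratio=0.25, rng=rng)
#   # masks is a bool tensor of shape (2, 8); the shorter example contributes two trailing
#   # False (padding) positions.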


def _random_segmentation(num_items, num_segments, rng=None):
    """Partition a sequence of items randomly into non-empty segments.
    Args:
      num_items: an integer scalar > 0
      num_segments: an integer scalar in [1, num_items]
      rng: a np.random.default_rng() instance or None.
    Returns:
      a list with shape [num_segments] containing positive integers that add up to num_items.
    forked from: https://github.com/allenai/contrastive_pretraining/blob/95fe35d3257402c7df362c3e0f746a40d9fba8f0/cpt/data.py#L265
    """
    first_in_segment = np.arange(num_items - 1) < num_segments - 1
    rng = rng or np.random.default_rng()
    rng.shuffle(first_in_segment)
    # The first position always starts a segment.
    # first_in_segment is a boolean array over every position after the first; it signals whether
    # that position is the start of a new segment.
    segment_id = np.cumsum(first_in_segment)
    segment_length = [0] * num_segments
    segment_length[0] = 1  # account for the first item, which always starts the first segment
    for k in range(num_items - 1):
        segment_length[segment_id[k]] += 1
    return segment_length
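

# Worked example (illustrative; the exact output depends on the shuffle):
# _random_segmentation(10, 3) places 2 segment boundaries among the 9 positions after the
# first item and returns 3 positive integers that sum to 10, e.g. [4, 3, 3] or [1, 7, 2].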


def insert_extra_paddings(rng, token_ids, pad_token_id, padding_ratio):
    """Inserts padding tokens with the ratio of `padding_ratio` into the token_ids."""
    # TODO: we need to assert to have start/end of sequence tokens.
    # We do not add the padding in the start and end of sequence.
    length = len(token_ids) - 2
    num_padding_tokens = int(length * padding_ratio)
    if num_padding_tokens == 0:
        # In this case, the padding ratio was too small to add any extra tokens.
        return token_ids
    length = length + num_padding_tokens
    # Candidate positions are 1..length, so the start (position 0) and end (position length + 1)
    # tokens are never replaced by padding.
    all_ids = np.arange(1, length + 1)
    # This is without shuffling.
    # original_ids = np.arange(1, length+1)
    rng = rng or np.random.default_rng()
    rng.shuffle(all_ids)
    # Positions at which padding tokens are inserted.
    padding_ids = all_ids[:num_padding_tokens]
    token_ids_extended = []
    current_id = 0
    for i in range(length + 2):
        if i in padding_ids:
            token_ids_extended.append(pad_token_id)
        else:
            token_ids_extended.append(token_ids[current_id])
            current_id += 1
    return token_ids_extended
    """
    # Positions of the other tokens; we do not change the start and end of sequence tokens.
    other_tokens_ids = [0]+[x for x in original_ids if x not in padding_ids]+[length+1]
    # Considers the start and end of sequence tokens in the final length.
    token_ids_extended = np.full((length+2), pad_token_id, dtype=int)
    token_ids_extended[other_tokens_ids] = token_ids
    return token_ids_extended.tolist()
    """