# Spaces: hamishivi / tess-2-demo (Sleeping)
# File size: 28,397 Bytes

"""Arguments used in training/inference/data processing."""
import os
import sys
from dataclasses import dataclass, field
from typing import List, Optional

from transformers import MODEL_MAPPING, HfArgumentParser, SchedulerType
from transformers import TrainingArguments as HFTrainingArguments

# Config classes registered in `MODEL_MAPPING`, materialised once as a list.
MODEL_CONFIG_CLASSES = [*MODEL_MAPPING.keys()]
# The `model_type` of each config class; joined into the `--model_type` help text.
MODEL_TYPES = tuple(
    config_class.model_type for config_class in MODEL_CONFIG_CLASSES
)


def get_args(filename: str = None):
    """Parse the model, data, training and diffusion argument dataclasses.

    Args:
        filename: Path to a JSON file holding the arguments. When `None`, the
            arguments are parsed with `parse_args_into_dataclasses()` instead.

    Returns:
        A 4-tuple `(model_args, data_args, training_args, diffusion_args)`.
    """
    dataclass_types = (
        ModelArguments,
        DataTrainingArguments,
        Seq2SeqTrainingArguments,
        DiffusionArguments,
    )
    parser = HfArgumentParser(dataclass_types)

    if filename is None:
        parsed = parser.parse_args_into_dataclasses()
    else:
        # All arguments come from the given JSON file.
        parsed = parser.parse_json_file(json_file=filename)

    # Unpack explicitly so an unexpected number of results fails right here.
    model_args, data_args, training_args, diffusion_args = parsed
    return model_args, data_args, training_args, diffusion_args


@dataclass
class ModelArguments:
    """
    Options that select the model, config and tokenizer to fine-tune or to train from scratch.

    `__post_init__` rejects setting `config_overrides` together with `model_name_or_path`.
    """

    # --- Model / config selection -------------------------------------------
    model_name_or_path: Optional[str] = field(
        default=None,
        metadata={
            "help": "The model checkpoint for weights initialization. Don't set if you want to train a model from scratch."
        },
    )
    model_type: Optional[str] = field(
        default=None,
        metadata={
            # The list of valid types is interpolated from MODEL_TYPES.
            "help": f"If training from scratch, pass a model type from the list: {', '.join(MODEL_TYPES)}"
        },
    )
    use_model: str = field(
        default="",
        metadata={
            "help": "Choose whether to use a cdcd or tokenwise model. Options: cdcd, tokenwise_cdcd, confidence."
        },
    )
    config_overrides: Optional[str] = field(
        default=None,
        metadata={
            "help": "Override some existing default config settings when a model is trained from scratch. Example: "
            "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index"
        },
    )

    # --- Tokenizer -----------------------------------------------------------
    tokenizer_name: Optional[str] = field(
        default=None,
        metadata={
            "help": "Pretrained tokenizer name or path if not the same as model_name"
        },
    )
    tokenizer_padding_side: Optional[str] = field(
        default="right", metadata={"help": "Tokenizer padding side"}
    )

    # --- Download / hub options ----------------------------------------------
    cache_dir: Optional[str] = field(
        default=None,
        metadata={
            "help": "Where do you want to store the pretrained models downloaded from huggingface.co"
        },
    )
    use_fast_tokenizer: bool = field(
        default=True,
        metadata={
            "help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."
        },
    )
    model_revision: str = field(
        default="main",
        metadata={
            "help": "The specific model version to use (can be a branch name, tag name or commit id)."
        },
    )
    use_auth_token: bool = field(
        default=False,
        metadata={
            "help": "Will use the token generated when running `huggingface-cli login` (necessary to use this script "
            "with private models)."
        },
    )

    # --- Evaluation ----------------------------------------------------------
    autoregressive_eval_model: str = field(
        default="EleutherAI/gpt-neo-1.3B",
        metadata={
            "help": "The autoregressive model used to measure the evaluation perplexity."
        },
    )

    # --- Position embeddings / training from scratch -------------------------
    resize_position_embeddings: Optional[bool] = field(
        default=None,
        metadata={
            "help": "Whether to automatically resize the position embeddings if `max_source_length` exceeds "
            "the model's position embeddings."
        },
    )
    resize_position_embeddings_alternatively: Optional[bool] = field(
        default=False,
        metadata={
            "help": "If set, resizes the position embedding alternatively, and copies from the original for the uncovered part."
        },
    )
    from_scratch: Optional[bool] = field(
        default=False,
        metadata={
            "help": "Whether to train the model from scratch or not. Default to false."
        },
    )

    # --- Attention / kernels -------------------------------------------------
    use_flash_attention2: Optional[bool] = field(
        default=False, metadata={"help": "Whether to use flash attention 2."}
    )
    use_liger_kernel: Optional[bool] = field(
        default=False, metadata={"help": "Whether to use liger kernel."}
    )
    is_causal: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether to use causal attention (for Llama)."},
    )

    # --- LoRA ----------------------------------------------------------------
    use_lora: Optional[bool] = field(
        default=False, metadata={"help": "Whether to use LoRA."}
    )
    lora_rank: Optional[int] = field(default=16, metadata={"help": "LoRA rank."})
    lora_alpha: Optional[int] = field(default=32, metadata={"help": "LoRA alpha."})
    lora_dropout: Optional[float] = field(
        default=0.1, metadata={"help": "LoRA dropout."}
    )

    # --- Parameter freezing --------------------------------------------------
    freeze_embedding: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether to freeze vocab embedding (and tied LM head)."},
    )
    freeze_model: Optional[bool] = field(
        default=False, metadata={"help": "Whether to freeze the entire model."}
    )

    def __post_init__(self):
        """Raise `ValueError` if config overrides are combined with a checkpoint."""
        overrides_given = self.config_overrides is not None
        checkpoint_given = self.model_name_or_path is not None
        if overrides_given and checkpoint_given:
            raise ValueError(
                "--config_overrides can't be used in combination with --model_name_or_path"
            )


@dataclass
class TrainingArguments(HFTrainingArguments):
    """
    Project training arguments layered on top of `transformers.TrainingArguments`.

    Redeclares a few inherited fields with different defaults (`lr_scheduler_type`,
    `output_dir`, `resume_from_checkpoint`, `optim`) and adds project-specific
    switches for checkpointing, evaluation and loss computation.
    """

    lr_scheduler_type: SchedulerType = field(
        default="linear",
        metadata={
            "help": (
                "The scheduler type to use. It can be `linear`, `cosine`, "
                "`cosine_with_restarts`, `polynomial`, `constant`, and `constant_with_warmup`"
            )
        },
    )
    output_dir: Optional[str] = field(
        default=None, metadata={"help": "Where to store the final model."}
    )
    checkpointing_steps: int = field(
        default=1000, metadata={"help": "Specifies the checkpoint step."}
    )
    resume_from_checkpoint: Optional[str] = field(
        default=None,
        metadata={"help": "If the training should continue from a checkpoint folder."},
    )
    log_generated_texts: bool = field(
        default=True, metadata={"help": "If set, logs generated texts."}
    )
    checkpoint_best_model: bool = field(
        default=False,
        metadata={
            "help": "If set, for `run_glue.py` it sets the metrics name "
            "to save the best model in each checkpoint step."
        },
    )
    eval_for_all_metrics: bool = field(
        default=False,
        metadata={"help": "If set, evaluates on all metrics in run_mlm.py"},
    )
    load_states_in_eval_from_model_path: bool = field(
        default=True,
        metadata={
            "help": "In case of only using --do_eval without --do_train, use it to load the states before eval. "
            "keep this to true, it causes otherwise an issue with huggingface when doing only --do_eval. "
            "This parameter when running baselines does not have any impact and is not needed."
        },
    )
    without_compute_metrics: bool = field(
        default=False,
        metadata={
            "help": "If set, does not compute the metrics. we are observing MAUVE is very slow "
            "on multi-gpu setting and we do this to compute the metrics separately. "
            "If using this option, you can call `compute_mlm_metrics.py` to compute them on 1 GPU later on."
        },
    )
    compute_eval_loss_with_simplex: bool = field(
        default=False,
        metadata={
            "help": "If set, computes the evaluation loss from the simplex values."
        },
    )
    save_checkpoints_on_s3: bool = field(
        default=False,
        metadata={
            "help": "If set, instead of deleting the checkpoints when passing the limit of save checkpoints, it saves them on S3."
        },
    )
    # NOTE: change default to suppress deprecation warning
    optim: str = field(default="adamw_torch")
    # just for beaker training, to allow auto-resume easier.
    beaker: bool = field(default=False)
    mask_padding_in_loss: bool = field(
        default=False,
        metadata={"help": "Whether to mask padding token in loss computation."},
    )
    # Defaults to None, so the annotation must allow it.
    generation_config: Optional[str] = field(default=None)
    timestep_embed_lr: Optional[float] = field(
        default=None, metadata={"help": "LR for timestep embedding."}
    )
    disable_timestep_embed: Optional[bool] = field(
        default=False, metadata={"help": "Whether to disable timestep embedding."}
    )


@dataclass
class Seq2SeqTrainingArguments(TrainingArguments):
    """
    `TrainingArguments` plus the options used when evaluating with generation.

    Args:
        sortish_sampler (`bool`, *optional*, defaults to `False`):
            Whether to use a *sortish sampler*, which sorts the inputs by length to minimize padding,
            with a bit of randomness for the training set.
        generation_max_length (`int`, *optional*):
            The `max_length` used on each evaluation loop when `predict_with_generate=True`.
            Falls back to the `max_length` value of the model configuration.
        generation_num_beams (`int`, *optional*):
            The `num_beams` used on each evaluation loop when `predict_with_generate=True`.
            Falls back to the `num_beams` value of the model configuration.
        predict_with_generate (`bool`, *optional*, defaults to `True`):
            The flag referenced by the two generation options above.
    """

    sortish_sampler: bool = field(
        default=False,
        metadata={"help": "Whether to use SortishSampler or not."},
    )
    generation_max_length: Optional[int] = field(
        default=None,
        metadata={
            "help": "The `max_length` to use on each evaluation loop when `predict_with_generate=True`. Will default "
            "to the `max_length` value of the model configuration."
        },
    )
    generation_num_beams: Optional[int] = field(
        default=None,
        metadata={
            "help": "The `num_beams` to use on each evaluation loop when `predict_with_generate=True`. Will default "
            "to the `num_beams` value of the model configuration."
        },
    )
    # Declared without help metadata.
    predict_with_generate: Optional[bool] = field(default=True)


@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.

    `__post_init__` validates the file extensions of `train_file` / `validation_file`,
    defaults `val_max_target_length` to `max_target_length`, checks that
    `conditional_generation` is a known mode and that at most one TULU preprocessing
    flag is enabled. Every validation failure raises `ValueError`.
    """

    split_glue: bool = field(
        default=False,
        metadata={
            "help": "If set to true split the glue dev/train to make the test set "
            "otherwises uses the original splits."
        },
    )
    glue_split_seed: int = field(
        default=42, metadata={"help": "Seed to split the glue data."}
    )
    is_tulu_pair: bool = field(
        default=False,
        metadata={"help": "Whether to use pair preprocessing for TULU."},
    )
    is_tulu_multiturn: bool = field(
        default=False,
        metadata={"help": "Whether to use multiturn preprocessing for TULU."},
    )
    is_tulu_sliding_window_multiturn: bool = field(
        default=False,
        metadata={
            "help": "Whether to use sliding window multiturn preprocessing for TULU."
        },
    )
    ul2_max_mask_ratio: float = field(
        default=0.5,
        metadata={"help": "UL2 variable maximum mask ratio."},
    )
    tokenized_data_path: Optional[str] = field(
        default=None, metadata={"help": "If set, reads a tokenized train data."}
    )
    dataset_name: Optional[str] = field(
        default=None,
        metadata={"help": "The name of the dataset to use (via the datasets library)."},
    )
    dataset_config_name: Optional[str] = field(
        default=None,
        metadata={
            "help": "The configuration name of the dataset to use (via the datasets library)."
        },
    )
    dataset_folder: Optional[str] = field(
        default=None, metadata={"help": "The dataset folder containing the dataset."}
    )
    train_file: Optional[str] = field(
        default=None, metadata={"help": "The input training data file (a text file)."}
    )
    validation_file: Optional[str] = field(
        default=None,
        metadata={
            "help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."
        },
    )
    test_file: Optional[str] = field(
        default=None, metadata={"help": "A text file containing the test data."}
    )
    overwrite_cache: bool = field(
        default=False,
        metadata={"help": "Overwrite the cached training and evaluation sets"},
    )
    validation_split_ratio: Optional[float] = field(
        default=0.001,
        metadata={
            "help": "The ratio(< 1.0) of the train set used as validation set in case there's no validation split."
        },
    )
    max_seq_length: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "The maximum total input sequence length after tokenization. Sequences longer "
                "than this will be truncated."
            )
        },
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
    line_by_line: bool = field(
        default=False,
        metadata={
            "help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."
        },
    )
    pad_to_max_length: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether to pad all samples to `max_seq_length`. "
                "If False, will pad the samples dynamically when batching to the maximum length in the batch."
            )
        },
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of training examples to this "
                "value if set."
            )
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
                "value if set."
            )
        },
    )
    mask_ratio: float = field(
        default=0.15,
        metadata={
            "help": "Defines the ratio of mask tokens. A number between 0 and 1."
        },
    )
    mean_mask_span_length: int = field(
        default=3, metadata={"help": "Defines the average mask length."}
    )
    extra_padding_ratio: float = field(
        default=0.0,
        metadata={
            "help": (
                "Defines the ratio for the extra padding "
                "which are added only to the training data, in case of `span_infilling` uniformly."
            )
        },
    )
    conditional_generation: Optional[str] = field(
        default=None,
        metadata={
            "help": "It can be `span_infilling`, `prefix_lm`, `ul2`, or `ul2_with_unconditional`, `seq2seq`, `prefix_with_unconditional`. "
            "In case of `span_infilling`: It trains/evals on filling spans like T5. In `prefix_lm`: it trains/evals "
            "on completing the prefixes like GPT2. In `ul2`, it trains on a mixture of span_infilling, agressive "
            "span_infilling, or prefix_lm and evals on prefix_lm with masking half of the sequence. In case of "
            "`ul2_with_unconditional`: it uses ul2 with also including unconditional generation during training. "
            "`seq2seq` is used for translation or summarization tasks. `ul2_variable`: is ul2 for the different "
            "T5 mask_ratio till half of the sequence. `prefix_with_unconditional`: use prefix-lm with unconditional."
        },
    )
    eval_context_size: Optional[int] = field(
        default=None,
        metadata={
            "help": "By default we consider the half of sequence as prompt when evaluating for `conditional_generation` of "
            "`ul2` and `prefix_lm`. If this parameter is set, it specifies the context size during the evaluation."
        },
    )
    # TODO: later fix masking length with truncation.
    truncation_length: Optional[int] = field(
        default=0,
        metadata={
            "help": "If set, we will truncate the tokens from the end for the given length. "
            "Note we still compute masking length based on original data length!"
        },
    )
    skip_special_tokens: bool = field(
        default=True,
        metadata={
            "help": "If training line by line set this to False to generate end token and cut. Also, in case you want to consider generation till </s> and cut the rest."
        },
    )
    # Parameters used in seq2seq training for summarization.
    # (`test_file` is already declared above and is shared with these options.)
    max_source_length: Optional[int] = field(
        default=1024,
        metadata={
            "help": (
                "The maximum total input sequence length after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded."
            )
        },
    )
    max_target_length: Optional[int] = field(
        default=128,
        metadata={
            "help": (
                "The maximum total sequence length for target text after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded."
            )
        },
    )
    val_max_target_length: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "The maximum total sequence length for validation target text after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`. "
                "This argument is also used to override the ``max_length`` param of ``model.generate``, which is used "
                "during ``evaluate`` and ``predict``."
            )
        },
    )
    max_predict_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of prediction examples to this "
                "value if set."
            )
        },
    )
    num_beams: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "Number of beams to use for evaluation. This argument will be passed to ``model.generate``, "
                "which is used during ``evaluate`` and ``predict``."
            )
        },
    )
    # Translation arguments.
    source_lang: Optional[str] = field(
        default=None, metadata={"help": "Source language id for translation."}
    )
    target_lang: Optional[str] = field(
        default=None, metadata={"help": "Target language id for translation."}
    )
    add_t5_tags: bool = field(
        default=False,
        metadata={
            "help": "In case of GLUE, it adds tags to the sentences like `sentence1:` ... ."
        },
    )
    # dataset verification
    verification_mode: str = field(
        default="basic_checks",
        metadata={
            "help": "Verification mode determining the checks to run on the downloaded/processed dataset information (checksums/size/splits/...)."
        },
    )
    streaming: bool = field(
        default=False,
        metadata={
            "help": "If set, we will stream the data from the disk or over the internet. This is useful for large datasets."
        },
    )
    shuffle: bool = field(
        default=True,
        metadata={"help": "If set, we will shuffle the data before training."},
    )
    min_train_seq_length: int = field(
        default=0,
        metadata={"help": "Minimum sequence length for train samples."},
    )
    min_eval_seq_length: int = field(
        default=0,
        metadata={"help": "Minimum sequence length for eval samples."},
    )

    def __post_init__(self):
        """Validate the arguments and fill in derived defaults.

        Raises:
            ValueError: if `train_file` / `validation_file` has an unsupported
                extension, if `conditional_generation` is not a known mode, or
                if more than one TULU preprocessing flag is enabled.
        """
        # NOTE: when no data source is given at all (no tokenized path, dataset
        # name, train or validation file) nothing is checked here; per the
        # original note, pretraining picks the dataset automatically (weka/nfs).
        for arg_name in ("train_file", "validation_file"):
            path = getattr(self, arg_name)
            if path is None:
                continue
            extension = path.split(".")[-1]
            if extension not in ("csv", "json", "txt"):
                raise ValueError(
                    f"`{arg_name}` should be a csv, a json or a txt file."
                )

        if self.val_max_target_length is None:
            self.val_max_target_length = self.max_target_length

        # Explicit raises rather than `assert`: asserts vanish under `python -O`,
        # which would let invalid configurations through silently.
        allowed_modes = (
            "span_infilling",
            "ul2",
            "ul2_with_unconditional",
            "prefix_lm",
            "prefix_with_unconditional",
            "seq2seq",
            "ul2_variable",
        )
        if (
            self.conditional_generation is not None
            and self.conditional_generation not in allowed_modes
        ):
            raise ValueError(
                f"`conditional_generation` should be one of {', '.join(allowed_modes)}; "
                f"got {self.conditional_generation!r}."
            )

        tulu_flags = (
            self.is_tulu_pair,
            self.is_tulu_multiturn,
            self.is_tulu_sliding_window_multiturn,
        )
        # can only have at most 1 option toggled true
        if sum(tulu_flags) >= 2:
            raise ValueError(
                "At most one of `is_tulu_pair`, `is_tulu_multiturn` and "
                "`is_tulu_sliding_window_multiturn` can be set."
            )


@dataclass
class DiffusionArguments:
    """Defines the diffusion related parameters.

    Covers the noise schedule, sampling, self-conditioning and guidance options.
    The class performs no validation of its own.
    """

    simplex_value: float = field(
        default=5.0,
        metadata={
            "help": (
                "We map the token ids to a vector of vocabulary size, where for tokens not "
                "equal to the token id `-simplex_value` is selected, and `simplex_value` otherwise."
            )
        },
    )
    num_diffusion_steps: int = field(
        default=2500, metadata={"help": "Defines the number of diffusion steps."}
    )
    # A factory is used so every instance gets its own list.
    num_inference_diffusion_steps: List[int] = field(
        default_factory=lambda: [1, 10, 100],
        metadata={"help": "Diffusion timesteps to try during inference."},
    )
    beta_schedule: str = field(
        default="squaredcos_improved_ddpm",
        metadata={
            "help": (
                "The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. "
                "Choose from `linear`, `scaled_linear`, or `squaredcos_cap_v2`, `squaredcos_improved_ddpm`. "
                "`squaredcos_improved_ddpm` model is proposed in eqn.17 in Improved ddpm "
                "(https://arxiv.org/pdf/2102.09672.pdf)"
            )
        },
    )
    sampling_type: str = field(
        default="top_p",
        metadata={"help": "Sampling type used during the logit projection."},
    )
    top_p: Optional[float] = field(
        default=None, metadata={"help": "top_p value for nucleus (top_p) sampling."}
    )
    clip_sample: bool = field(
        default=False,
        metadata={
            "help": "Whether to clip predicted sample between -1 and 1 for numerical stability in the noise scheduler."
        },
    )
    self_condition: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "If set, adds self-conditioning. "
                "we consider the following options: `logits`: predicted logits, or `logits_with_projection`: to "
                "consider logits and apply the projection. After concatenating the inputs, we project inputs back "
                "with a projection layer to the half dimension. We also consider the cases of `logits_addition`"
                " and `logits_with_projection_addition` where we adds up the previous prediction to the logits, "
                "possibly with a projection operation. `logits_mean`: gets the average of logits and `logits_max` "
                "computes the maximum."
            )
        },
    )
    self_condition_mix_before_weights: bool = field(
        default=False,
        metadata={
            "help": "If set, mixes the softmax of simplexes and then apply the weights."
        },
    )
    self_condition_mix_logits_before_weights: bool = field(
        default=False,
        metadata={"help": "If set, mixes simplexes and then apply the weights."},
    )
    self_condition_mlp_projection: bool = field(
        default=False, metadata={"help": "If not set, uses a linear layer."}
    )
    self_condition_zeros_after_softmax: bool = field(
        default=False,
        metadata={
            "help": "If set, makes the softmax of previous_logits, "
            "in case previous_logits are zero, zero. This avoid extra bias introduced with using Linear[softmax(previous_logits), logits]"
        },
    )
    deepmind_conditional: bool = field(
        default=False,
        metadata={
            "help": "This is the way conditional is explained in the DeepMind paper "
            "https://arxiv.org/abs/2211.15089, figure 3. In this setup, we mask the self-conditioned, noisy, and original emebeddings, "
            "then we concat mask to these, and project all of them, and then add timestep embeddings."
        },
    )
    guidance_scale: float = field(
        default=1.0,
        metadata={"help": "scale for classifier (or classifier-free) guidance."},
    )
    classifier_free_uncond_input: str = field(
        default="empty_token",
        metadata={"help": "This can be one of `empty_token` or `noisy_simplex`."},
    )
    empty_token_be_mask: bool = field(
        default=False, metadata={"help": "If set, makes the empty token a mask."}
    )
    classifier_free_simplex_inputs: bool = field(
        default=False,
        metadata={
            "help": "If set to true, uses simplex representation for the unconditional input."
        },
    )
    temperature: float = field(
        default=1.0,
        metadata={"help": "Defines the softmax temperature before doing the sampling."},
    )
    guidance_softmax_combination: bool = field(
        default=True,
        metadata={"help": "If set, first applies softmax, then combines logits."},
    )
    generate_with_seed: bool = field(
        default=False, metadata={"help": "If set, generates with seed."}
    )
    multiply_factor: float = field(
        default=1.0,
        metadata={"help": "Determines the starting noise level."},
    )
    classifier_model_name_or_path: Optional[str] = field(
        default=None,
        metadata={"help": "Classifier for classifier guidance."},
    )
    use_gumbel_softmax: bool = field(
        default=False,
        metadata={"help": "Whether to use gumbel softmax for classifier guidance."},
    )
    do_hard_sample: bool = field(
        default=False,
        metadata={
            "help": "Whether to use gumbel softmax hard sampling trick for classifier guidance."
        },
    )
    softmax_temperature: float = field(
        default=1.0,
        metadata={"help": "Softmax for classifier guidance."},
    )
    num_guidance_steps: int = field(
        default=1,
        metadata={"help": "Number of guidance steps per guidance."},
    )
    eval_dataset_name: Optional[str] = field(
        default=None,
        metadata={"help": "The name of the dataset to use for evaluation."},
    )