|
|
|
|
|
"""This script defines dataclasses: ModelArguments and DatasetArguments, |
|
that contain the arguments for the model and dataset used in training. |
|
|
|
It imports several modules, including dataclasses, field from typing, Optional from typing, |
|
require_version from transformers.utils.versions, MODEL_FOR_CAUSAL_LM_MAPPING, |
|
and TrainingArguments from transformers. |
|
|
|
MODEL_CONFIG_CLASSES is assigned a list of the model config classes from |
|
MODEL_FOR_CAUSAL_LM_MAPPING. MODEL_TYPES is assigned a tuple of the model types |
|
extracted from the MODEL_CONFIG_CLASSES. |
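
A minimal usage sketch (assuming the standard `HfArgumentParser` workflow
from transformers; "finetuner" is one of the keys of
PIPELINE_ARGUMENT_MAPPING):

    from transformers import HfArgumentParser

    pipeline_args_class = AutoArguments.get_pipeline_args_class("finetuner")
    parser = HfArgumentParser((ModelArguments, DatasetArguments, pipeline_args_class))
    model_args, data_args, pipeline_args = parser.parse_args_into_dataclasses()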
|
""" |
|
|
|
from dataclasses import dataclass, field |
|
from typing import Optional, List |
|
|
|
from transformers.utils.versions import require_version |
|
|
|
from transformers import ( |
|
MODEL_FOR_CAUSAL_LM_MAPPING, |
|
TrainingArguments, |
|
) |
|
|
|
MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys()) |
|
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) |
|
|
|
|
|
@dataclass |
|
class ModelArguments: |
|
""" |
|
Define a class ModelArguments using the dataclass decorator. |
|
The class contains several optional parameters that can be used to configure a model. |
|
|
|
model_name_or_path : str |
|
a string representing the path or name of a pretrained |
|
model checkpoint for weights initialization. If None, a model will be trained from scratch. |
|
|
|
model_type : str |
|
a string representing the type of model to use if training from |
|
scratch. If not provided, a pretrained model will be used. |
|
|
|
config_overrides : str |
|
a string representing the default config settings to override |
|
when training a model from scratch. |
|
|
|
config_name : str |
|
a string representing the name or path of the pretrained config to |
|
use, if different from the model_name_or_path. |
|
|
|
tokenizer_name : str |
|
a string representing the name or path of the pretrained tokenizer |
|
to use, if different from the model_name_or_path. |
|
|
|
cache_dir : str |
|
a string representing the path to the directory where pretrained models |
|
downloaded from huggingface.co will be stored. |
|
|
|
use_fast_tokenizer : bool |
|
a boolean indicating whether to use a fast tokenizer (backed by the |
|
tokenizers library) or not. |
|
|
|
model_revision : str |
|
a string representing the specific model version to use (can be a |
|
branch name, tag name, or commit id). |
|
|
|
use_auth_token : bool |
|
a boolean indicating whether to use the token generated when running |
|
huggingface-cli login (necessary to use this script with private models). |
|
|
|
torch_dtype : str |
|
a string representing the dtype to load the model under. If auto is |
|
passed, the dtype will be automatically derived from the model's weights. |
|
|
|
use_ram_optimized_load : bool |
|
a boolean indicating whether to use disk mapping when memory is not |
|
enough. |
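
    lora_model_path : str
        a string representing the path of the incremental model diff
        introduced by LoRA finetuning; together with the original model it
        forms the whole finetuned model.

    arch_type : str
        the architecture type of the model; one of "decoder_only",
        "encoder_decoder", or "text_regression".

    use_lora : bool
        a boolean indicating whether to use LoRA for parameter-efficient
        finetuning.

    lora_r : int
        an integer representing the rank of the LoRA parameters; the smaller
        lora_r is, the fewer parameters LoRA has.

    lora_alpha : int
        an integer controlling the merging ratio between the LoRA update and
        the original weights (the alpha parameter in the LoRA paper).

    lora_target_modules : List[str]
        the names of the modules to apply LoRA to.

    lora_dropout : float
        the dropout rate in the LoRA linear layers.

    save_aggregated_lora : bool
        a boolean indicating whether to save the aggregated (merged) LoRA
        model.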
|
""" |
|
|
|
model_name_or_path: Optional[str] = field( |
|
default=None, |
|
metadata={ |
|
"help": ( |
|
"The model checkpoint for weights initialization.Don't set if you want to train a model from scratch." |
|
) |
|
}, |
|
) |
|
lora_model_path: Optional[str] = field( |
|
default=None, |
|
metadata={ |
|
"help": ( |
|
"The incremental model diff introduced by LoRA finetuning." |
|
" Along with the original non-finetuned model forms the whole" |
|
" finetuned model." |
|
) |
|
} |
|
) |
|
model_type: Optional[str] = field( |
|
default=None, |
|
metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, |
|
) |
|
|
config_overrides: Optional[str] = field( |
|
default=None, |
|
metadata={ |
|
"help": ( |
|
"Override some existing default config settings when a model is trained from scratch. Example: " |
|
"n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index" |
|
) |
|
}, |
|
) |
|
arch_type: Optional[str] = field( |
|
default="decoder_only", |
|
metadata={ |
|
"help": ( |
|
"Model architecture type, e.g. \"decoder_only\"," |
|
" \"encoder_decoder\"" |
|
), |
|
"choices": ["decoder_only", "encoder_decoder", "text_regression"], |
|
}, |
|
) |
|
config_name: Optional[str] = field( |
|
default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} |
|
) |
|
tokenizer_name: Optional[str] = field( |
|
default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} |
|
) |
|
cache_dir: Optional[str] = field( |
|
default=None, |
|
metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, |
|
) |
|
use_fast_tokenizer: bool = field( |
|
default=True, |
|
metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, |
|
) |
|
model_revision: str = field( |
|
default="main", |
|
metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, |
|
) |
|
use_auth_token: bool = field( |
|
default=False, |
|
metadata={ |
|
"help": ( |
|
"Will use the token generated when running `huggingface-cli login` (necessary to use this script " |
|
"with private models)." |
|
) |
|
}, |
|
) |
|
torch_dtype: Optional[str] = field( |
|
default=None, |
|
metadata={ |
|
"help": ( |
|
"Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the " |
|
"dtype will be automatically derived from the model's weights." |
|
), |
|
"choices": ["auto", "bfloat16", "float16", "float32"], |
|
}, |
|
) |
|
use_lora: bool = field( |
|
default=False, |
|
metadata={"help": "Whether to lora."}, |
|
) |
|
lora_r: int = field( |
|
default=8, |
|
metadata={"help": "the rank of the lora parameters. The smaller lora_r is , the fewer parameters lora has."}, |
|
) |
|
lora_alpha: int = field( |
|
default=32, |
|
metadata={"help": "Merging ratio between the fine-tuned model and the original. This is controlled by a parameter called alpha in the paper."}, |
|
) |
|
    lora_target_modules: Optional[List[str]] = field(
        default=None,
        metadata={"help": "Names of the modules to apply LoRA to."},
    )
|
lora_dropout: float = field( |
|
default=0.1, |
|
metadata={"help": "The dropout rate in lora.linear."}, |
|
) |
|
save_aggregated_lora: bool = field( |
|
default=False, |
|
metadata={"help": "Whether to save aggregated lora."}, |
|
) |
|
use_ram_optimized_load: bool = field( |
|
default=True, |
|
metadata={"help": "Whether use disk mapping when memory is not enough."} |
|
) |
|
|
|
def __post_init__(self): |
|
if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None): |
|
raise ValueError( |
|
"--config_overrides can't be used in combination with --config_name or --model_name_or_path" |
|
) |
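
# A minimal construction sketch (values are illustrative, not recommended
# defaults):
#
#     model_args = ModelArguments(
#         model_name_or_path="gpt2",
#         torch_dtype="bfloat16",
#         use_lora=True,
#         lora_r=8,
#     )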
|
|
|
|
|
@dataclass |
|
class DatasetArguments: |
|
""" |
|
Define a class DatasetArguments using the dataclass decorator. |
|
The class contains several optional parameters that can be used to configure a dataset for a language model. |
|
|
|
|
|
dataset_path : str |
|
a string representing the path of the dataset to use. |
|
|
|
dataset_name : str |
|
a string representing the name of the dataset to use. The default value is "customized". |
|
|
|
is_custom_dataset : bool |
|
a boolean indicating whether to use custom data. The default value is False. |
|
|
|
customized_cache_dir : str |
|
a string representing the path to the directory where customized dataset caches will be stored. |
|
|
|
dataset_config_name : str |
|
a string representing the configuration name of the dataset to use (via the datasets library). |
|
|
|
train_file : str |
|
a string representing the path to the input training data file (a text file). |
|
|
|
validation_file : str |
|
a string representing the path to the input evaluation data file to evaluate the perplexity on (a text file). |
|
|
|
max_train_samples : int |
|
an integer indicating the maximum number of training examples to use for debugging or quicker training. |
|
If set, the training dataset will be truncated to this number. |
|
|
|
max_eval_samples: int |
|
an integer indicating the maximum number of evaluation examples to use for debugging or quicker training. |
|
If set, the evaluation dataset will be truncated to this number. |
|
|
|
streaming : bool |
|
a boolean indicating whether to enable streaming mode. |
|
|
|
block_size: int |
|
an integer indicating the optional input sequence length after tokenization. The training dataset will be |
|
truncated in blocks of this size for training. |
|
|
|
    The class also includes some additional parameters that can be used to configure the dataset further, such as `overwrite_cache`,
    `validation_split_percentage`, `preprocessing_num_workers`, `disable_group_texts`, `keep_linebreaks`, and `test_file`.
|
|
|
The field function is used to set default values and provide help messages for each parameter. The Optional type hint is |
|
used to indicate that a parameter is optional. The metadata argument is used to provide additional information about |
|
each parameter, such as a help message. |
|
""" |
|
|
|
dataset_path: Optional[str] = field( |
|
default=None, metadata={"help": "The path of the dataset to use."} |
|
) |
|
dataset_name: Optional[str] = field( |
|
default="customized", metadata={"help": "Should be \"customized\""} |
|
) |
|
is_custom_dataset: Optional[bool] = field( |
|
default=False, metadata={"help": "whether to use custom data"} |
|
) |
|
customized_cache_dir: Optional[str] = field( |
|
default=".cache/llm-ft/datasets", |
|
metadata={"help": "Where do you want to store the customized dataset caches"}, |
|
) |
|
dataset_config_name: Optional[str] = field( |
|
default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} |
|
) |
|
train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) |
|
validation_file: Optional[str] = field( |
|
default=None, |
|
metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, |
|
) |
|
max_train_samples: Optional[int] = field( |
|
default=None, |
|
metadata={ |
|
"help": ( |
|
"For debugging purposes or quicker training, truncate the number of training examples to this " |
|
"value if set." |
|
) |
|
}, |
|
) |
|
max_eval_samples: Optional[int] = field( |
|
        default=int(1e10),
|
metadata={ |
|
"help": ( |
|
"For debugging purposes or quicker training, truncate the number of evaluation examples to this " |
|
"value if set." |
|
) |
|
}, |
|
) |
|
streaming: bool = field(default=False, metadata={"help": "Enable streaming mode"}) |
|
block_size: Optional[int] = field( |
|
default=None, |
|
metadata={ |
|
"help": ( |
|
"Optional input sequence length after tokenization. " |
|
"The training dataset will be truncated in block of this size for training. " |
|
"Default to the model max input length for single sentence inputs (take into account special tokens)." |
|
) |
|
}, |
|
) |
|
overwrite_cache: bool = field( |
|
default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} |
|
) |
|
validation_split_percentage: Optional[int] = field( |
|
default=5, |
|
metadata={ |
|
"help": "The percentage of the train set used as validation set in case there's no validation split" |
|
}, |
|
) |
|
preprocessing_num_workers: Optional[int] = field( |
|
default=None, |
|
metadata={"help": "The number of processes to use for the preprocessing."}, |
|
) |
|
disable_group_texts: bool = field( |
|
default=False, |
|
metadata={ |
|
"help": ( |
|
"Whether we group original samples together to generate sample" |
|
" sequences of length `block_size`. By default, we group every" |
|
" 1000 tokenized sequences together, divide them into " |
|
" [{total_num_tokens} / {block_size}] sequences, each with" |
|
" `block_size` tokens (the remaining tokens are ommited." |
|
" If this flag is set to True, we only group 1 tokenized" |
|
" sequence, i.e. cutting long sequence into chunks." |
|
) |
|
}, |
|
) |
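    # A sketch of the default grouping behavior described above, mirroring the
    # standard `group_texts` helper from the transformers language-modeling
    # examples (a minimal sketch; the names here are illustrative, not part of
    # this module):
    #
    #     def group_texts(examples, block_size):
    #         # Concatenate every column, then split into fixed-size blocks,
    #         # dropping the trailing remainder.
    #         concatenated = {k: sum(examples[k], []) for k in examples.keys()}
    #         total_length = len(concatenated[list(examples.keys())[0]])
    #         total_length = (total_length // block_size) * block_size
    #         return {
    #             k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
    #             for k, t in concatenated.items()
    #         }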
|
keep_linebreaks: bool = field( |
|
default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."} |
|
) |
|
test_file: Optional[str] = field( |
|
default=None, |
|
metadata={"help": "Evaluation File Path"}, |
|
) |
|
|
|
def __post_init__(self): |
|
if self.streaming: |
|
require_version("datasets>=2.0.0", "The streaming feature requires `datasets>=2.0.0`") |
|
|
|
if self.dataset_name is None and self.train_file is None and self.validation_file is None: |
|
raise ValueError("Need either a dataset name or a training/validation file.") |
|
else: |
|
if self.train_file is not None: |
|
extension = self.train_file.split(".")[-1] |
|
assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file." |
|
if self.validation_file is not None: |
|
extension = self.validation_file.split(".")[-1] |
|
assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." |
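
# A minimal construction sketch (paths are illustrative; __post_init__ checks
# that any train/validation file is a csv, json, or txt file):
#
#     data_args = DatasetArguments(
#         dataset_path="./data/my_dataset",
#         block_size=512,
#     )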
|
|
|
|
|
@dataclass |
|
class FinetunerArguments(TrainingArguments): |
|
""" |
|
Adapt transformers.TrainingArguments |
|
""" |
|
pass |
|
|
|
|
|
@dataclass |
|
class EvaluatorArguments: |
|
""" |
|
Define a class EvaluatorArguments using the dataclass decorator. The class contains several optional |
|
    parameters that can be used to configure an evaluator.
|
|
|
    local_rank : int
|
For distributed training: local_rank |
|
|
|
random_shuffle : bool |
|
|
|
use_wandb : bool |
|
|
|
random_seed : int, default = 1 |
|
|
|
output_dir : str, default = './output_dir', |
|
|
|
mixed_precision : str, choice from ["bf16","fp16"]. |
|
mixed precision mode, whether to use bf16 or fp16 |
|
|
|
deepspeed : |
|
Enable deepspeed and pass the path to deepspeed json config file (e.g. ds_config.json) or an already |
|
loaded json file as a dict |
|
""" |
|
    local_rank: int = field(
        default=-1,
        metadata={"help": "For distributed training: local_rank"},
    )
|
|
|
    random_shuffle: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether to randomly shuffle the evaluation dataset."},
    )
|
|
|
use_wandb: Optional[bool] = field( |
|
default=False, |
|
metadata={ |
|
"help": ( |
|
"When this flag is True, wandb will be enabled" |
|
) |
|
}, |
|
) |
|
random_seed: Optional[int] = field( |
|
default=1, |
|
metadata={ |
|
"help": ( |
|
"used to set random seed" |
|
) |
|
}, |
|
) |
|
output_dir: Optional[str] = field( |
|
default="./output_dir", |
|
metadata={"help": "Output path for the inferenced results"}, |
|
) |
|
mixed_precision: Optional[str] = field( |
|
default="bf16", |
|
metadata={ |
|
"help": ( |
|
"mixed precision mode, whether to use bf16 or fp16" |
|
), |
|
"choices": ["bf16","fp16"], |
|
}, |
|
) |
|
deepspeed: Optional[str] = field( |
|
default=None, |
|
metadata={ |
|
"help": ( |
|
"Enable deepspeed and pass the path to deepspeed json config file (e.g. ds_config.json) or an already" |
|
" loaded json file as a dict" |
|
) |
|
}, |
|
) |
|
answer_type: Optional[str] = field( |
|
default="text", |
|
metadata={ |
|
"help": ( |
|
'Question type for answer extraction from the decoder output.' |
|
' Supported types: \n' |
|
' 1) "multiple_choice", e.g. A, B, C, D, ...\n' |
|
' 2) "binary_choice", e.g. yes, no, maybe\n' |
|
' 3) "math", e.g. 1.0, -3.52\n' |
|
' 4) "text", e.g. "I think that it is okay"\n' |
|
' 5) Special treatment for several datasets\n' |
|
' - "gsm8k"\n' |
|
' - "svamp"\n' |
|
' - "asdiv"\n' |
|
' - "addsub"\n' |
|
' - "singleeq"\n' |
|
' - "multiarith"\n' |
|
' - "aqua"\n' |
|
' - "csqa"\n' |
|
' - "strategyqa"\n' |
|
' - "pubmedqa"\n' |
|
' - "medmcqa"\n' |
|
' - "usmle"\n' |
|
) |
|
}, |
|
) |
|
prompt_structure: Optional[str] = field( |
|
default="{input}", |
|
metadata={ |
|
"help": ( |
|
'Prompt structure to facilitate prompt engineering during' |
|
' inference. The model will receive' |
|
' `prompt_structure.format(input=input)` as its input.' |
|
) |
|
}, |
|
) |
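    # For example (illustrative): with prompt_structure="Input: {input}\nOutput:",
    # an input "hello" is presented to the model as "Input: hello\nOutput:".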
|
evaluate_block_size: Optional[int] = field( |
|
default=512, |
|
metadata={ |
|
"help": ( |
|
"the model will have at least block_size tokens for context when calculating the conditional likelihood of any one token" |
|
" (provided there are block_size preceding tokens available to condition on)" |
|
) |
|
}, |
|
) |
|
metric: Optional[str] = field( |
|
default="accuracy", |
|
metadata={ |
|
"help": "the metric the model will be evaluated on", |
|
"choices": ["ppl", "perplexity", "acc", "accuracy", "nll", "neg_log_likelihood"], |
|
}, |
|
) |
|
|
|
|
|
@dataclass |
|
class InferencerArguments: |
|
""" |
|
Define a class InferencerArguments using the dataclass decorator. The class contains several optional |
|
    parameters that can be used to configure an inferencer.
|
|
|
    local_rank : int
|
For distributed training: local_rank |
|
|
|
random_seed : int, default = 1 |
|
|
|
deepspeed : |
|
Enable deepspeed and pass the path to deepspeed json config file (e.g. ds_config.json) or an already |
|
loaded json file as a dict |
|
mixed_precision : str, choice from ["bf16","fp16"]. |
|
mixed precision mode, whether to use bf16 or fp16 |
|
|
|
""" |
|
device: str = field( |
|
default="gpu", |
|
metadata={ |
|
"help": "device of chatbot", |
|
"choices": ["gpu", "cpu"], |
|
}, |
|
) |
|
    local_rank: int = field(
        default=-1,
        metadata={"help": "For distributed training: local_rank"},
    )
|
random_seed: Optional[int] = field( |
|
default=1, |
|
metadata={ |
|
"help": ( |
|
"used to set random seed" |
|
) |
|
}, |
|
) |
|
deepspeed: Optional[str] = field( |
|
default=None, |
|
metadata={ |
|
"help": ( |
|
"Enable deepspeed and pass the path to deepspeed json config file (e.g. ds_config.json) or an already" |
|
" loaded json file as a dict" |
|
) |
|
}, |
|
) |
|
mixed_precision: Optional[str] = field( |
|
default="bf16", |
|
metadata={ |
|
"help": ( |
|
"mixed precision mode, whether to use bf16 or fp16" |
|
), |
|
"choices": ["bf16","fp16"], |
|
}, |
|
) |
|
|
|
|
|
@dataclass |
|
class RaftAlignerArguments(TrainingArguments): |
|
""" |
|
    Define a class RaftAlignerArguments to configure the RAFT aligner.
|
""" |
|
output_reward_path: Optional[str] = field( |
|
default="tmp/raft_aligner/", |
|
metadata={ |
|
"help": "The path of output rewards." |
|
} |
|
) |
|
output_min_length: Optional[int] = field( |
|
default=16, |
|
metadata={ |
|
"help": ( |
|
"minimum length of the output token sequence generated from" |
|
" model given an input." |
|
), |
|
}, |
|
) |
|
output_max_length: Optional[int] = field( |
|
default=48, |
|
metadata={ |
|
"help": ( |
|
"maximum length of the output token sequence generated from" |
|
" model given an output." |
|
), |
|
}, |
|
) |
|
num_raft_iteration: Optional[int] = field( |
|
default=20, |
|
metadata={ |
|
"help": "number of iterations of the raft aligner." |
|
}, |
|
) |
|
raft_batch_size: Optional[int] = field( |
|
default=320, |
|
metadata={ |
|
"help": ( |
|
"only select {raft_batch_size} samples each time to" |
|
" generate rewards and be ranked for STF training." |
|
) |
|
}, |
|
) |
|
    top_reward_percentage: Optional[float] = field(
|
default=0.2, |
|
metadata={ |
|
"help": ( |
|
"only top {top_reward_percentage} samples in the raft batch," |
|
" (in terms of rewards), will be used for SFT the model." |
|
), |
|
}, |
|
) |
|
inference_batch_size_per_device: Optional[int] = field( |
|
default=1, |
|
metadata={ |
|
"help": ( |
|
"every device will infer {inference_batch_size_per_device}" |
|
" samples in parallel. The inferred results will be concatenaed" |
|
" with inputs and attach a reward." |
|
), |
|
}, |
|
) |
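
# A sketch of the selection step implied by `raft_batch_size` and
# `top_reward_percentage` (illustrative only; the actual aligner logic lives
# in the pipeline implementation):
#
#     k = int(args.raft_batch_size * args.top_reward_percentage)
#     top_samples = sorted(batch, key=lambda s: s["reward"], reverse=True)[:k]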
|
|
|
|
|
PIPELINE_ARGUMENT_MAPPING = { |
|
"finetuner": FinetunerArguments, |
|
"evaluator": EvaluatorArguments, |
|
"inferencer": InferencerArguments, |
|
"raft_aligner": RaftAlignerArguments, |
|
} |
|
|
|
|
|
class AutoArguments: |
|
""" |
|
    Automatically choose the pipeline-specific argument class, e.g.
    FinetunerArguments or EvaluatorArguments, by pipeline name.
|
""" |
|
    @staticmethod
    def get_pipeline_args_class(pipeline_name: str):
|
return PIPELINE_ARGUMENT_MAPPING[pipeline_name] |
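
# Example usage (a sketch; "evaluator" is a key of PIPELINE_ARGUMENT_MAPPING):
#
#     args_class = AutoArguments.get_pipeline_args_class("evaluator")
#     evaluator_args = args_class(output_dir="./output_dir")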
|
|