# merit-deberta-v2-xxlarge-v1 / training_config.yaml
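
# Hydra/OmegaConf-style pretraining config for MERIt. Values written as ${...}
# are OmegaConf interpolations, resolved when the config is loaded. The train/
# dev files below are pickled examples, presumably entity-path data distantly
# mined from Wikipedia.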
train_file: wiki_erica_path/v7/union/train_distant.path_v7.train.0.pkl
dev_file: wiki_erica_path/v7/union/train_distant.path_v7.dev.pkl
test_file: null
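
# `_target_` follows Hydra's instantiate() convention: the dotted path is
# imported and called with the sibling keys as keyword arguments. A minimal
# sketch, assuming the training script supplies the checkpoint path itself
# (from_pretrained requires one):
#
#   from hydra.utils import instantiate
#   model = instantiate(cfg.model, cfg.model_name_or_path)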
model:
  _target_: models.deberta.DebertaV2ForMultipleChoicePreTrain.from_pretrained
  mlp_hidden_size: 3072
  fs_checkpoint: false
  fs_checkpoint_offload_to_cpu: false
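
# Converts the raw pickled path examples into tensor features. Judging by the
# names: max_neg_num caps the negative options sampled per instance, aug_num
# the number of augmented copies, and geo_p is likely the parameter of a
# geometric distribution used when sampling entity replacements (min_rep_num
# setting the floor).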
read_tensor:
  _target_: dataset.wiki_entity_path_v8_2.convert_examples_into_features
  max_neg_num: 3
  aug_num: 1
  max_seq_length: 256
  shuffle_context: true
  min_rep_num: 5
  geo_p: 0.4
  deduct_ratio: 1.0
  context_ratio: 1.0
  num_workers: 32
extended_vocab: null
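
# Batch collator: pads and stacks features and, given mlm_probability, likely
# applies BERT-style dynamic masking to 15% of tokens. Each instance carries
# up to max_option_num = 4 candidates (1 positive + max_neg_num = 3 negatives).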
collator:
  _target_: dataset.wiki_entity_path_v8.WikiPathDatasetCollatorWithContext
  max_seq_length: 256
  tokenizer: pretrained-models/deberta-v2-xxlarge
  mlm_probability: 0.15
  max_option_num: 4
  swap: true
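
# PyTorch DataLoader settings: 4 worker processes, each prefetching up to 4
# batches in advance (torch's num_workers / prefetch_factor arguments).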
num_workers: 4
prefetch_factor: 4
model_name_or_path: pretrained-models/deberta-v2-xxlarge
pretrain: null
output_dir: experiments/deberta.v2.xxlarge.path.v7_v8.2.2.1aug.ctx.A100.v1.3.w4.s${seed}.fsdp.adamw
do_train: true
evaluate_during_training: true
do_eval: false
eval_sub_path: null
do_preprocess: false
per_gpu_train_batch_size: 2
per_gpu_eval_batch_size: 2
learning_rate: 1.0e-05
gradient_accumulation_steps: 512
weight_decay: 0.01
adam_epsilon: 1.0e-06
adam_betas: (0.9, 0.999)
max_grad_norm: 1.0
num_train_epochs: 1
max_steps: 200
warmup_proportion: 0.2
warmup_steps: 0
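
# Effective batch size: per_gpu_train_batch_size (2) x gradient_accumulation_steps
# (512) x n_gpu (1) = 1024 sequences per optimizer step, so max_steps: 200
# consumes ~204,800 examples. With warmup_steps left at 0, warmup_proportion
# presumably applies instead: 0.2 x 200 = 40 warmup steps.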
optimizer: null
use_nvlamb: null
bit_training: null
multi_tensor: null
logging_steps: 1
save_steps: 50
eval_steps: 50
no_cuda: false
seed: 42
local_rank: 0
fp16: true
fp16_opt_level: O2
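
# DeepSpeed engine config; the interpolations keep it in sync with the
# top-level hyperparameters above. ZeRO stage 3 partitions optimizer states,
# gradients, and parameters across data-parallel ranks; the loss scaler starts
# at 2^12 (initial_scale_power: 12). fp16_opt_level: O2 above looks like a
# leftover Apex AMP setting (FP16 model with FP32 master weights).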
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      betas:
        - 0.9
        - 0.999
      eps: ${adam_epsilon}
      weight_decay: ${weight_decay}
  scheduler:
    type: WarmupDecayLR
    params:
      total_num_steps: null
      warmup_max_lr: ${learning_rate}
      warmup_num_steps: null
      warmup_type: linear
  gradient_clipping: ${max_grad_norm}
  fp16:
    enabled: ${fp16}
    initial_scale_power: 12
  zero_optimization:
    stage: 3
  steps_per_print: 1024
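
# These four flags match fairscale's FullyShardedDataParallel constructor
# arguments; the "fsdp" tag in output_dir suggests FairScale FSDP as an
# alternative wrapping to the DeepSpeed path above.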
reshard_after_forward: false
flatten_parameters: true
move_grads_to_cpu: false
move_params_to_cpu: false
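
# Runtime state, presumably filled in by the launcher when this config was
# dumped: a single-GPU process (rank 0 on cuda:0) with micro-batch size 2.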
n_gpu: 1
device: cuda:0
train_batch_size: 2
eval_batch_size: 2
note: null