experiment:
  seed: 42
  name: default-baseline-uncleaned
  group: baseline
  dry_run: false
  offline_run: false
  resume_checkpoint_path: null
  resume_run_id: null
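# Dataset: the BabyLM corpus from the HuggingFace Hub. "original_strict_small"
# presumably selects the unprocessed strict-small (~10M word) track, matching
# the "uncleaned" experiment name above.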
dataset:
  name: CamBabyTrainers/BabyLM
  subconfig: original_strict_small
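# Tokenizer: an 8,192-entry vocabulary (matching model.model_kwargs.vocab_size
# below). add_prefix_space makes the first word of a sequence tokenize the same
# way as a word in the middle of a sentence.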
tokenizer:
  name: CamBabyTrainers/CamBabyTokenizer-8192
  add_prefix_space: true
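# Preprocessing: keep punctuation, join sentences into contiguous chunks, and
# cap each input at 128 tokens.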
data_preprocessing:
  include_punctuation: true
  join_sentences: true
  max_input_length: 128
  callback_functions: null
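# Model: a scaled-down RoBERTa variant with pre-layer normalization (LayerNorm
# applied before each attention/feed-forward sublayer rather than after).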
model:
  name: roberta_pre_layer_norm
  model_kwargs:
    vocab_size: 8192
    num_hidden_layers: 10
    num_attention_heads: 10
    hidden_size: 500
    intermediate_size: 2000
    layer_norm_eps: 1.0e-05
    eos_token_id: 4
    bos_token_id: 3
    pad_token_id: 1
    tie_word_embeddings: false
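# Trainer: optimization and evaluation settings. Only BLiMP and perplexity are
# evaluated here; GLUE and MSGS are switched off.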
trainer:
  batch_size: 32
  lr: 0.001
  num_warmup_steps: 100000
  max_training_steps: 400000
  eval_blimp: true
  eval_glue: false
  eval_msgs: false
  eval_perplexity: true
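# Objective curriculum: the set of training objectives ("units") and the
# schedule over which each is active. This baseline uses a single masked
# language modelling (MLM) unit with the standard 15% masking rate;
# unmask_probability: 0 presumably means masked positions are never left
# unchanged (no BERT-style 80/10/10 split).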
objective_curriculum:
  units:
    mlm:
      task_head_params: {}
      optimizer_params:
        lr: 0.001
      scheduler_params: {}
      optional_kwargs:
        mask_probability: 0.15
        unmask_probability: 0
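  # steps maps each unit to the fraction of training during which it is
  # active; [0.0, 1.0] presumably keeps the MLM objective on from the first
  # step to the last.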
  steps:
    mlm:
      - 0.0
      - 1.0
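# No data or vocabulary curriculum is applied in this baseline.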
data_curriculum: null
vocabulary_curriculum: null