_attn_implementation_autoset: value: true _name_or_path: value: answerdotai/ModernBERT-base _wandb: value: cli_version: 0.19.6 m: - "1": train/epoch "5": 2 "6": - 1 - 3 "7": [] - "1": train/global_step "6": - 3 "7": [] - "1": eval/loss "5": 2 "6": - 1 - 3 "7": [] - "1": eval/runtime "5": 2 "6": - 1 - 3 "7": [] - "1": eval/steps_per_second "5": 2 "6": - 1 - 3 "7": [] - "1": eval/mse "5": 2 "6": - 1 - 3 "7": [] - "1": eval/samples_per_second "5": 2 "6": - 1 - 3 "7": [] - "1": train/loss "5": 2 "6": - 1 - 3 "7": [] - "1": train/grad_norm "5": 2 "6": - 1 - 3 "7": [] - "1": train/learning_rate "5": 2 "6": - 1 - 3 "7": [] python_version: 3.11.11 t: "1": - 1 - 5 - 11 - 49 - 51 - 53 - 55 - 71 "2": - 1 - 5 - 11 - 49 - 51 - 53 - 55 - 71 "3": - 2 - 7 - 14 - 19 - 23 - 55 - 62 - 66 "4": 3.11.11 "5": 0.19.6 "6": 4.48.3 "8": - 1 - 5 "9": "1": transformers_trainer "12": 0.19.6 "13": linux-x86_64 accelerator_config: value: dispatch_batches: null even_batches: true gradient_accumulation_kwargs: null non_blocking: false split_batches: false use_seedable_sampler: true adafactor: value: false adam_beta1: value: 0.9 adam_beta2: value: 0.999 adam_epsilon: value: 1e-08 add_cross_attention: value: false architectures: value: - ModernBertForMaskedLM attention_bias: value: false attention_dropout: value: 0 auto_find_batch_size: value: false average_tokens_across_devices: value: false bad_words_ids: value: null batch_eval_metrics: value: false batch_size: value: 8 begin_suppress_tokens: value: null bf16: value: false bf16_full_eval: value: false bos_token_id: value: 50281 chunk_size_feed_forward: value: 0 classifier_activation: value: gelu classifier_bias: value: false classifier_dropout: value: 0 classifier_pooling: value: mean cls_token_id: value: 50281 cross_attention_hidden_size: value: null data_seed: value: null dataloader_drop_last: value: false dataloader_num_workers: value: 0 dataloader_persistent_workers: value: false dataloader_pin_memory: value: true dataloader_prefetch_factor: value: null ddp_backend: value: null ddp_broadcast_buffers: value: null ddp_bucket_cap_mb: value: null ddp_find_unused_parameters: value: null ddp_timeout: value: 1800 debug: value: [] decoder_bias: value: true decoder_start_token_id: value: null deepspeed: value: null deterministic_flash_attn: value: false disable_tqdm: value: false dispatch_batches: value: null diversity_penalty: value: 0 do_eval: value: true do_predict: value: false do_sample: value: false do_train: value: false early_stopping: value: false embedding_dropout: value: 0 encoder_no_repeat_ngram_size: value: 0 eos_token_id: value: 50282 epochs: value: 4 eval_accumulation_steps: value: null eval_delay: value: 0 eval_do_concat_batches: value: true eval_on_start: value: false eval_steps: value: null eval_strategy: value: epoch eval_use_gather_object: value: false evaluation_strategy: value: epoch exponential_decay_length_penalty: value: null finetuning_task: value: null forced_bos_token_id: value: null forced_eos_token_id: value: null fp16: value: false fp16_backend: value: auto fp16_full_eval: value: false fp16_opt_level: value: O1 fsdp: value: [] fsdp_config: value: min_num_params: 0 xla: false xla_fsdp_grad_ckpt: false xla_fsdp_v2: false fsdp_min_num_params: value: 0 fsdp_transformer_layer_cls_to_wrap: value: null full_determinism: value: false global_attn_every_n_layers: value: 3 global_rope_theta: value: 160000 gradient_accumulation_steps: value: 1 gradient_checkpointing: value: false gradient_checkpointing_kwargs: value: null greater_is_better: value: false group_by_length: value: false half_precision_backend: value: auto hidden_activation: value: gelu hidden_size: value: 768 hub_always_push: value: false hub_model_id: value: null hub_private_repo: value: null hub_strategy: value: every_save hub_token: value: id2label: value: "0": LABEL_0 ignore_data_skip: value: false include_for_metrics: value: [] include_inputs_for_metrics: value: false include_num_input_tokens_seen: value: false include_tokens_per_second: value: false initializer_cutoff_factor: value: 2 initializer_range: value: 0.02 intermediate_size: value: 1152 is_decoder: value: false is_encoder_decoder: value: false jit_mode_eval: value: false label_names: value: null label_smoothing_factor: value: 0 label2id: value: LABEL_0: 0 layer_norm_eps: value: 1e-05 learning_rate: value: 8e-05 length_column_name: value: length length_penalty: value: 1 load_best_model_at_end: value: false local_attention: value: 128 local_rank: value: 0 local_rope_theta: value: 10000 log_level: value: error log_level_replica: value: warning log_on_each_node: value: true logging_dir: value: ../logs/content logging_first_step: value: false logging_nan_inf_filter: value: true logging_steps: value: 500 logging_strategy: value: steps lr_scheduler_type: value: linear max_grad_norm: value: 1 max_length: value: 20 max_position_embeddings: value: 8192 max_steps: value: -1 metric_for_best_model: value: mse min_length: value: 0 mlp_bias: value: false mlp_dropout: value: 0 model/num_parameters: value: 149605633 model_type: value: modernbert mp_parameters: value: "" neftune_noise_alpha: value: null no_cuda: value: false no_repeat_ngram_size: value: 0 norm_bias: value: false norm_eps: value: 1e-05 num_attention_heads: value: 12 num_beam_groups: value: 1 num_beams: value: 1 num_hidden_layers: value: 22 num_return_sequences: value: 1 num_train_epochs: value: 4 optim: value: adamw_torch optim_args: value: null optim_target_modules: value: null output_attentions: value: false output_dir: value: ../bin output_hidden_states: value: false output_scores: value: false overwrite_output_dir: value: false pad_token_id: value: 50283 past_index: value: -1 per_device_eval_batch_size: value: 16 per_device_train_batch_size: value: 8 per_gpu_eval_batch_size: value: null per_gpu_train_batch_size: value: null position_embedding_type: value: absolute prediction_loss_only: value: false prefix: value: null problem_type: value: null push_to_hub: value: false push_to_hub_model_id: value: null push_to_hub_organization: value: null push_to_hub_token: value: ray_scope: value: last reference_compile: value: null remove_invalid_values: value: false remove_unused_columns: value: true repad_logits_with_grad: value: false repetition_penalty: value: 1 report_to: value: - wandb restore_callback_states_from_checkpoint: value: false resume_from_checkpoint: value: null return_dict: value: true return_dict_in_generate: value: false run_name: value: ../bin save_on_each_node: value: false save_only_model: value: false save_safetensors: value: true save_steps: value: 500 save_strategy: value: "no" save_total_limit: value: null seed: value: 42 sep_token_id: value: 50282 skip_memory_metrics: value: true sparse_pred_ignore_index: value: -100 sparse_prediction: value: false split_batches: value: null suppress_tokens: value: null task_specific_params: value: null temperature: value: 1 tf_legacy_loss: value: false tf32: value: null tie_encoder_decoder: value: false tie_word_embeddings: value: true tokenizer_class: value: null top_k: value: 50 top_p: value: 1 torch_compile: value: false torch_compile_backend: value: null torch_compile_mode: value: null torch_dtype: value: float32 torch_empty_cache_steps: value: null torchdynamo: value: null torchscript: value: false tpu_metrics_debug: value: false tpu_num_cores: value: null transformers_version: value: 4.48.3 typical_p: value: 1 use_bfloat16: value: false use_cpu: value: false use_ipex: value: false use_legacy_prediction_loop: value: false use_liger_kernel: value: false use_mps_device: value: false vocab_size: value: 50368 warmup_ratio: value: 0 warmup_steps: value: 1000 weight_decay: value: 0