Wenboz committed on
Commit
06b5d63
·
verified ·
1 Parent(s): eb11778

Model save

Browse files
README.md ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: mistralai/Mistral-7B-v0.1
3
+ library_name: peft
4
+ license: apache-2.0
5
+ tags:
6
+ - trl
7
+ - dpo
8
+ - generated_from_trainer
9
+ model-index:
10
+ - name: zephyr-7b-dpo-lora
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # zephyr-7b-dpo-lora
18
+
19
+ This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on an unspecified dataset.
20
+ It achieves the following results on the evaluation set:
21
+ - Loss: 0.4932
22
+ - Rewards/chosen: -1.7916
23
+ - Rewards/rejected: -2.7322
24
+ - Rewards/accuracies: 0.7262
25
+ - Rewards/margins: 0.9407
26
+ - Logps/rejected: -535.5854
27
+ - Logps/chosen: -446.6503
28
+ - Logits/rejected: 0.9353
29
+ - Logits/chosen: 0.5257
30
+
31
+ ## Model description
32
+
33
+ More information needed
34
+
35
+ ## Intended uses & limitations
36
+
37
+ More information needed
38
+
39
+ ## Training and evaluation data
40
+
41
+ More information needed
42
+
43
+ ## Training procedure
44
+
45
+ ### Training hyperparameters
46
+
47
+ The following hyperparameters were used during training:
48
+ - learning_rate: 5e-06
49
+ - train_batch_size: 4
50
+ - eval_batch_size: 8
51
+ - seed: 42
52
+ - distributed_type: multi-GPU
53
+ - num_devices: 4
54
+ - gradient_accumulation_steps: 4
55
+ - total_train_batch_size: 64
56
+ - total_eval_batch_size: 32
57
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
58
+ - lr_scheduler_type: cosine
59
+ - lr_scheduler_warmup_ratio: 0.1
60
+ - num_epochs: 1
61
+
62
+ ### Training results
63
+
64
+ | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
65
+ |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
66
+ | 0.6087 | 0.1 | 100 | 0.6158 | -0.3136 | -0.5466 | 0.6726 | 0.2330 | -317.0252 | -298.8513 | -2.0360 | -2.1198 |
67
+ | 0.5463 | 0.21 | 200 | 0.5504 | -1.1262 | -1.6978 | 0.6925 | 0.5716 | -432.1413 | -380.1157 | -0.0431 | -0.2986 |
68
+ | 0.4949 | 0.31 | 300 | 0.5161 | -1.6535 | -2.4330 | 0.7183 | 0.7794 | -505.6621 | -432.8479 | 0.4034 | 0.1418 |
69
+ | 0.5239 | 0.42 | 400 | 0.5101 | -1.3693 | -2.0810 | 0.7302 | 0.7116 | -470.4624 | -404.4282 | 0.8585 | 0.5591 |
70
+ | 0.5272 | 0.52 | 500 | 0.5003 | -2.0358 | -2.9629 | 0.7381 | 0.9271 | -558.6534 | -471.0703 | 1.0404 | 0.7150 |
71
+ | 0.4886 | 0.63 | 600 | 0.4982 | -1.7739 | -2.6428 | 0.7262 | 0.8689 | -526.6414 | -444.8822 | 0.3752 | 0.0594 |
72
+ | 0.516 | 0.73 | 700 | 0.4933 | -2.0243 | -2.9388 | 0.7302 | 0.9144 | -556.2413 | -469.9273 | 0.8898 | 0.5312 |
73
+ | 0.495 | 0.84 | 800 | 0.4949 | -1.7382 | -2.6840 | 0.7262 | 0.9458 | -530.7620 | -441.3121 | 0.8308 | 0.4157 |
74
+ | 0.4866 | 0.94 | 900 | 0.4932 | -1.7916 | -2.7322 | 0.7262 | 0.9407 | -535.5854 | -446.6503 | 0.9353 | 0.5257 |
75
+
76
+
77
+ ### Framework versions
78
+
79
+ - PEFT 0.7.1
80
+ - Transformers 4.38.2
81
+ - Pytorch 2.1.2+cu121
82
+ - Datasets 2.14.6
83
+ - Tokenizers 0.15.2
adapter_config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "mistralai/Mistral-7B-v0.1",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "loftq_config": {},
12
+ "lora_alpha": 128,
13
+ "lora_dropout": 0.05,
14
+ "megatron_config": null,
15
+ "megatron_core": "megatron.core",
16
+ "modules_to_save": null,
17
+ "peft_type": "LORA",
18
+ "r": 64,
19
+ "rank_pattern": {},
20
+ "revision": null,
21
+ "target_modules": [
22
+ "q_proj",
23
+ "o_proj",
24
+ "gate_proj",
25
+ "up_proj",
26
+ "v_proj",
27
+ "k_proj",
28
+ "down_proj"
29
+ ],
30
+ "task_type": "CAUSAL_LM"
31
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a30a7a4badef97c6dfcd8926c13ead9c59cb66a2a3eeb2b655d70e26c1cef415
3
+ size 335605144
all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "train_loss": 0.5264237219126436,
4
+ "train_runtime": 29801.8239,
5
+ "train_samples": 61135,
6
+ "train_samples_per_second": 2.051,
7
+ "train_steps_per_second": 0.032
8
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<unk>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<s>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "</s>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ }
29
+ },
30
+ "additional_special_tokens": [],
31
+ "bos_token": "<s>",
32
+ "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}",
33
+ "clean_up_tokenization_spaces": false,
34
+ "eos_token": "</s>",
35
+ "legacy": true,
36
+ "model_max_length": 2048,
37
+ "pad_token": "</s>",
38
+ "sp_model_kwargs": {},
39
+ "spaces_between_special_tokens": false,
40
+ "tokenizer_class": "LlamaTokenizer",
41
+ "unk_token": "<unk>",
42
+ "use_default_system_prompt": false
43
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "train_loss": 0.5264237219126436,
4
+ "train_runtime": 29801.8239,
5
+ "train_samples": 61135,
6
+ "train_samples_per_second": 2.051,
7
+ "train_steps_per_second": 0.032
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,1614 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.9997382884061764,
5
+ "eval_steps": 100,
6
+ "global_step": 955,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0,
13
+ "grad_norm": 1.6872971730727813,
14
+ "learning_rate": 5.208333333333333e-08,
15
+ "logits/chosen": -2.4102981090545654,
16
+ "logits/rejected": -2.4143850803375244,
17
+ "logps/chosen": -352.07745361328125,
18
+ "logps/rejected": -290.7293395996094,
19
+ "loss": 0.6931,
20
+ "rewards/accuracies": 0.0,
21
+ "rewards/chosen": 0.0,
22
+ "rewards/margins": 0.0,
23
+ "rewards/rejected": 0.0,
24
+ "step": 1
25
+ },
26
+ {
27
+ "epoch": 0.01,
28
+ "grad_norm": 1.7117396873541209,
29
+ "learning_rate": 5.208333333333334e-07,
30
+ "logits/chosen": -2.2590177059173584,
31
+ "logits/rejected": -2.1774637699127197,
32
+ "logps/chosen": -236.37635803222656,
33
+ "logps/rejected": -228.0323944091797,
34
+ "loss": 0.6931,
35
+ "rewards/accuracies": 0.3819444477558136,
36
+ "rewards/chosen": -0.0001151897813542746,
37
+ "rewards/margins": -0.0005040301475673914,
38
+ "rewards/rejected": 0.00038884030072949827,
39
+ "step": 10
40
+ },
41
+ {
42
+ "epoch": 0.02,
43
+ "grad_norm": 1.5684536992232367,
44
+ "learning_rate": 1.0416666666666667e-06,
45
+ "logits/chosen": -2.3643908500671387,
46
+ "logits/rejected": -2.298231840133667,
47
+ "logps/chosen": -282.7557067871094,
48
+ "logps/rejected": -264.20379638671875,
49
+ "loss": 0.6927,
50
+ "rewards/accuracies": 0.550000011920929,
51
+ "rewards/chosen": 0.0016558446222916245,
52
+ "rewards/margins": 0.000754863431211561,
53
+ "rewards/rejected": 0.0009009811910800636,
54
+ "step": 20
55
+ },
56
+ {
57
+ "epoch": 0.03,
58
+ "grad_norm": 1.6320638446100388,
59
+ "learning_rate": 1.5625e-06,
60
+ "logits/chosen": -2.2148165702819824,
61
+ "logits/rejected": -2.1439080238342285,
62
+ "logps/chosen": -254.8279266357422,
63
+ "logps/rejected": -239.754150390625,
64
+ "loss": 0.6909,
65
+ "rewards/accuracies": 0.6875,
66
+ "rewards/chosen": 0.00770978769287467,
67
+ "rewards/margins": 0.005677036941051483,
68
+ "rewards/rejected": 0.002032750751823187,
69
+ "step": 30
70
+ },
71
+ {
72
+ "epoch": 0.04,
73
+ "grad_norm": 1.5505793749959536,
74
+ "learning_rate": 2.0833333333333334e-06,
75
+ "logits/chosen": -2.4151053428649902,
76
+ "logits/rejected": -2.267972946166992,
77
+ "logps/chosen": -289.40142822265625,
78
+ "logps/rejected": -265.69000244140625,
79
+ "loss": 0.6867,
80
+ "rewards/accuracies": 0.6187499761581421,
81
+ "rewards/chosen": 0.019891003146767616,
82
+ "rewards/margins": 0.008773349225521088,
83
+ "rewards/rejected": 0.011117652989923954,
84
+ "step": 40
85
+ },
86
+ {
87
+ "epoch": 0.05,
88
+ "grad_norm": 1.6726051946382139,
89
+ "learning_rate": 2.604166666666667e-06,
90
+ "logits/chosen": -2.336714029312134,
91
+ "logits/rejected": -2.256179094314575,
92
+ "logps/chosen": -245.6073455810547,
93
+ "logps/rejected": -228.17562866210938,
94
+ "loss": 0.6793,
95
+ "rewards/accuracies": 0.6499999761581421,
96
+ "rewards/chosen": 0.037440598011016846,
97
+ "rewards/margins": 0.02772732451558113,
98
+ "rewards/rejected": 0.009713277220726013,
99
+ "step": 50
100
+ },
101
+ {
102
+ "epoch": 0.06,
103
+ "grad_norm": 1.7944993610396887,
104
+ "learning_rate": 3.125e-06,
105
+ "logits/chosen": -2.289384365081787,
106
+ "logits/rejected": -2.2603793144226074,
107
+ "logps/chosen": -252.38510131835938,
108
+ "logps/rejected": -252.2633819580078,
109
+ "loss": 0.6717,
110
+ "rewards/accuracies": 0.699999988079071,
111
+ "rewards/chosen": 0.04392694681882858,
112
+ "rewards/margins": 0.0502941831946373,
113
+ "rewards/rejected": -0.006367234978824854,
114
+ "step": 60
115
+ },
116
+ {
117
+ "epoch": 0.07,
118
+ "grad_norm": 2.0272577815121133,
119
+ "learning_rate": 3.6458333333333333e-06,
120
+ "logits/chosen": -2.276283025741577,
121
+ "logits/rejected": -2.1802830696105957,
122
+ "logps/chosen": -252.1732635498047,
123
+ "logps/rejected": -269.9049987792969,
124
+ "loss": 0.6579,
125
+ "rewards/accuracies": 0.668749988079071,
126
+ "rewards/chosen": -0.014363644644618034,
127
+ "rewards/margins": 0.0627121776342392,
128
+ "rewards/rejected": -0.07707582414150238,
129
+ "step": 70
130
+ },
131
+ {
132
+ "epoch": 0.08,
133
+ "grad_norm": 5.394650048474675,
134
+ "learning_rate": 4.166666666666667e-06,
135
+ "logits/chosen": -2.100991725921631,
136
+ "logits/rejected": -1.9991142749786377,
137
+ "logps/chosen": -265.56060791015625,
138
+ "logps/rejected": -263.40496826171875,
139
+ "loss": 0.6436,
140
+ "rewards/accuracies": 0.699999988079071,
141
+ "rewards/chosen": -0.027108073234558105,
142
+ "rewards/margins": 0.1258033663034439,
143
+ "rewards/rejected": -0.15291143953800201,
144
+ "step": 80
145
+ },
146
+ {
147
+ "epoch": 0.09,
148
+ "grad_norm": 2.829581544220464,
149
+ "learning_rate": 4.6875000000000004e-06,
150
+ "logits/chosen": -2.1768875122070312,
151
+ "logits/rejected": -2.1092872619628906,
152
+ "logps/chosen": -284.537841796875,
153
+ "logps/rejected": -302.475830078125,
154
+ "loss": 0.629,
155
+ "rewards/accuracies": 0.6875,
156
+ "rewards/chosen": -0.2255394160747528,
157
+ "rewards/margins": 0.15923205018043518,
158
+ "rewards/rejected": -0.384771466255188,
159
+ "step": 90
160
+ },
161
+ {
162
+ "epoch": 0.1,
163
+ "grad_norm": 5.983992522046318,
164
+ "learning_rate": 4.9997324926814375e-06,
165
+ "logits/chosen": -2.1670639514923096,
166
+ "logits/rejected": -2.1124117374420166,
167
+ "logps/chosen": -293.4961242675781,
168
+ "logps/rejected": -312.59320068359375,
169
+ "loss": 0.6087,
170
+ "rewards/accuracies": 0.7124999761581421,
171
+ "rewards/chosen": -0.15919722616672516,
172
+ "rewards/margins": 0.22884318232536316,
173
+ "rewards/rejected": -0.3880404233932495,
174
+ "step": 100
175
+ },
176
+ {
177
+ "epoch": 0.1,
178
+ "eval_logits/chosen": -2.1197569370269775,
179
+ "eval_logits/rejected": -2.036003828048706,
180
+ "eval_logps/chosen": -298.85125732421875,
181
+ "eval_logps/rejected": -317.0252380371094,
182
+ "eval_loss": 0.6158178448677063,
183
+ "eval_rewards/accuracies": 0.6726190447807312,
184
+ "eval_rewards/chosen": -0.313564270734787,
185
+ "eval_rewards/margins": 0.23304378986358643,
186
+ "eval_rewards/rejected": -0.546608030796051,
187
+ "eval_runtime": 364.711,
188
+ "eval_samples_per_second": 5.484,
189
+ "eval_steps_per_second": 0.173,
190
+ "step": 100
191
+ },
192
+ {
193
+ "epoch": 0.12,
194
+ "grad_norm": 4.31001099339475,
195
+ "learning_rate": 4.996723692767927e-06,
196
+ "logits/chosen": -2.2227556705474854,
197
+ "logits/rejected": -2.1177265644073486,
198
+ "logps/chosen": -341.78594970703125,
199
+ "logps/rejected": -359.0604553222656,
200
+ "loss": 0.6048,
201
+ "rewards/accuracies": 0.6875,
202
+ "rewards/chosen": -0.474813312292099,
203
+ "rewards/margins": 0.33163630962371826,
204
+ "rewards/rejected": -0.8064495921134949,
205
+ "step": 110
206
+ },
207
+ {
208
+ "epoch": 0.13,
209
+ "grad_norm": 5.841735492614658,
210
+ "learning_rate": 4.9903757462135984e-06,
211
+ "logits/chosen": -2.2709505558013916,
212
+ "logits/rejected": -2.1858668327331543,
213
+ "logps/chosen": -327.3430480957031,
214
+ "logps/rejected": -359.780029296875,
215
+ "loss": 0.5976,
216
+ "rewards/accuracies": 0.706250011920929,
217
+ "rewards/chosen": -0.7242799401283264,
218
+ "rewards/margins": 0.3823621869087219,
219
+ "rewards/rejected": -1.1066421270370483,
220
+ "step": 120
221
+ },
222
+ {
223
+ "epoch": 0.14,
224
+ "grad_norm": 4.058093924122145,
225
+ "learning_rate": 4.980697142834315e-06,
226
+ "logits/chosen": -1.9997293949127197,
227
+ "logits/rejected": -1.8148103952407837,
228
+ "logps/chosen": -391.48834228515625,
229
+ "logps/rejected": -384.39288330078125,
230
+ "loss": 0.5576,
231
+ "rewards/accuracies": 0.737500011920929,
232
+ "rewards/chosen": -0.850256621837616,
233
+ "rewards/margins": 0.467766135931015,
234
+ "rewards/rejected": -1.3180228471755981,
235
+ "step": 130
236
+ },
237
+ {
238
+ "epoch": 0.15,
239
+ "grad_norm": 4.759314589639897,
240
+ "learning_rate": 4.967700826904229e-06,
241
+ "logits/chosen": -1.3535711765289307,
242
+ "logits/rejected": -1.2067102193832397,
243
+ "logps/chosen": -352.2384948730469,
244
+ "logps/rejected": -376.5408020019531,
245
+ "loss": 0.5621,
246
+ "rewards/accuracies": 0.731249988079071,
247
+ "rewards/chosen": -0.9079573750495911,
248
+ "rewards/margins": 0.47648367285728455,
249
+ "rewards/rejected": -1.3844410181045532,
250
+ "step": 140
251
+ },
252
+ {
253
+ "epoch": 0.16,
254
+ "grad_norm": 6.966946629400433,
255
+ "learning_rate": 4.951404179843963e-06,
256
+ "logits/chosen": -1.4944788217544556,
257
+ "logits/rejected": -1.175698161125183,
258
+ "logps/chosen": -385.9867858886719,
259
+ "logps/rejected": -395.92889404296875,
260
+ "loss": 0.5526,
261
+ "rewards/accuracies": 0.7437499761581421,
262
+ "rewards/chosen": -0.8049098253250122,
263
+ "rewards/margins": 0.6389679908752441,
264
+ "rewards/rejected": -1.443877935409546,
265
+ "step": 150
266
+ },
267
+ {
268
+ "epoch": 0.17,
269
+ "grad_norm": 6.933994816745419,
270
+ "learning_rate": 4.931828996974498e-06,
271
+ "logits/chosen": -1.2824828624725342,
272
+ "logits/rejected": -1.0046826601028442,
273
+ "logps/chosen": -356.4523010253906,
274
+ "logps/rejected": -377.0785217285156,
275
+ "loss": 0.577,
276
+ "rewards/accuracies": 0.7124999761581421,
277
+ "rewards/chosen": -1.0120383501052856,
278
+ "rewards/margins": 0.5240768790245056,
279
+ "rewards/rejected": -1.5361151695251465,
280
+ "step": 160
281
+ },
282
+ {
283
+ "epoch": 0.18,
284
+ "grad_norm": 4.768436190577922,
285
+ "learning_rate": 4.909001458367867e-06,
286
+ "logits/chosen": -0.5933430790901184,
287
+ "logits/rejected": -0.33633238077163696,
288
+ "logps/chosen": -465.66510009765625,
289
+ "logps/rejected": -474.8177185058594,
290
+ "loss": 0.5755,
291
+ "rewards/accuracies": 0.731249988079071,
292
+ "rewards/chosen": -1.7751672267913818,
293
+ "rewards/margins": 0.543445885181427,
294
+ "rewards/rejected": -2.318613052368164,
295
+ "step": 170
296
+ },
297
+ {
298
+ "epoch": 0.19,
299
+ "grad_norm": 4.556050170367665,
300
+ "learning_rate": 4.882952093833628e-06,
301
+ "logits/chosen": -0.9897077679634094,
302
+ "logits/rejected": -0.833814263343811,
303
+ "logps/chosen": -451.8089294433594,
304
+ "logps/rejected": -465.73565673828125,
305
+ "loss": 0.5727,
306
+ "rewards/accuracies": 0.6937500238418579,
307
+ "rewards/chosen": -1.6387646198272705,
308
+ "rewards/margins": 0.44902724027633667,
309
+ "rewards/rejected": -2.087791919708252,
310
+ "step": 180
311
+ },
312
+ {
313
+ "epoch": 0.2,
314
+ "grad_norm": 4.891304529530987,
315
+ "learning_rate": 4.853715742087947e-06,
316
+ "logits/chosen": -0.743046760559082,
317
+ "logits/rejected": -0.4523673951625824,
318
+ "logps/chosen": -378.919189453125,
319
+ "logps/rejected": -391.9801025390625,
320
+ "loss": 0.5486,
321
+ "rewards/accuracies": 0.6875,
322
+ "rewards/chosen": -1.2883106470108032,
323
+ "rewards/margins": 0.43378907442092896,
324
+ "rewards/rejected": -1.7220996618270874,
325
+ "step": 190
326
+ },
327
+ {
328
+ "epoch": 0.21,
329
+ "grad_norm": 5.38693225816445,
330
+ "learning_rate": 4.821331504159906e-06,
331
+ "logits/chosen": -0.3933953046798706,
332
+ "logits/rejected": -0.05709639936685562,
333
+ "logps/chosen": -357.71466064453125,
334
+ "logps/rejected": -383.194580078125,
335
+ "loss": 0.5463,
336
+ "rewards/accuracies": 0.731249988079071,
337
+ "rewards/chosen": -1.0744948387145996,
338
+ "rewards/margins": 0.5533859729766846,
339
+ "rewards/rejected": -1.6278808116912842,
340
+ "step": 200
341
+ },
342
+ {
343
+ "epoch": 0.21,
344
+ "eval_logits/chosen": -0.2985667288303375,
345
+ "eval_logits/rejected": -0.04311899468302727,
346
+ "eval_logps/chosen": -380.1156921386719,
347
+ "eval_logps/rejected": -432.1412658691406,
348
+ "eval_loss": 0.550410807132721,
349
+ "eval_rewards/accuracies": 0.692460298538208,
350
+ "eval_rewards/chosen": -1.1262083053588867,
351
+ "eval_rewards/margins": 0.5715598464012146,
352
+ "eval_rewards/rejected": -1.697768211364746,
353
+ "eval_runtime": 363.8367,
354
+ "eval_samples_per_second": 5.497,
355
+ "eval_steps_per_second": 0.173,
356
+ "step": 200
357
+ },
358
+ {
359
+ "epoch": 0.22,
360
+ "grad_norm": 5.527241951911523,
361
+ "learning_rate": 4.7858426910973435e-06,
362
+ "logits/chosen": -0.14632602035999298,
363
+ "logits/rejected": 0.03931695967912674,
364
+ "logps/chosen": -388.56036376953125,
365
+ "logps/rejected": -445.367919921875,
366
+ "loss": 0.5336,
367
+ "rewards/accuracies": 0.7562500238418579,
368
+ "rewards/chosen": -1.1834015846252441,
369
+ "rewards/margins": 0.731239914894104,
370
+ "rewards/rejected": -1.9146416187286377,
371
+ "step": 210
372
+ },
373
+ {
374
+ "epoch": 0.23,
375
+ "grad_norm": 6.273093626258007,
376
+ "learning_rate": 4.747296766042161e-06,
377
+ "logits/chosen": 0.4761236310005188,
378
+ "logits/rejected": 0.6708475351333618,
379
+ "logps/chosen": -378.1613464355469,
380
+ "logps/rejected": -428.78643798828125,
381
+ "loss": 0.5169,
382
+ "rewards/accuracies": 0.6875,
383
+ "rewards/chosen": -1.4337002038955688,
384
+ "rewards/margins": 0.7104736566543579,
385
+ "rewards/rejected": -2.1441738605499268,
386
+ "step": 220
387
+ },
388
+ {
389
+ "epoch": 0.24,
390
+ "grad_norm": 4.666490596901144,
391
+ "learning_rate": 4.705745280752586e-06,
392
+ "logits/chosen": -0.45128726959228516,
393
+ "logits/rejected": 0.05878635495901108,
394
+ "logps/chosen": -386.0497131347656,
395
+ "logps/rejected": -419.9007873535156,
396
+ "loss": 0.5351,
397
+ "rewards/accuracies": 0.762499988079071,
398
+ "rewards/chosen": -1.0124726295471191,
399
+ "rewards/margins": 0.6896177530288696,
400
+ "rewards/rejected": -1.7020905017852783,
401
+ "step": 230
402
+ },
403
+ {
404
+ "epoch": 0.25,
405
+ "grad_norm": 6.120721061963155,
406
+ "learning_rate": 4.661243806657256e-06,
407
+ "logits/chosen": -0.053895045071840286,
408
+ "logits/rejected": 0.2541065514087677,
409
+ "logps/chosen": -380.99114990234375,
410
+ "logps/rejected": -433.10809326171875,
411
+ "loss": 0.5416,
412
+ "rewards/accuracies": 0.7124999761581421,
413
+ "rewards/chosen": -1.1518728733062744,
414
+ "rewards/margins": 0.6217032074928284,
415
+ "rewards/rejected": -1.7735761404037476,
416
+ "step": 240
417
+ },
418
+ {
419
+ "epoch": 0.26,
420
+ "grad_norm": 5.583267704971781,
421
+ "learning_rate": 4.613851860533367e-06,
422
+ "logits/chosen": 0.5581181049346924,
423
+ "logits/rejected": 0.7085031867027283,
424
+ "logps/chosen": -440.7920837402344,
425
+ "logps/rejected": -470.03778076171875,
426
+ "loss": 0.5495,
427
+ "rewards/accuracies": 0.731249988079071,
428
+ "rewards/chosen": -1.6821489334106445,
429
+ "rewards/margins": 0.609348475933075,
430
+ "rewards/rejected": -2.291497230529785,
431
+ "step": 250
432
+ },
433
+ {
434
+ "epoch": 0.27,
435
+ "grad_norm": 5.673344697563603,
436
+ "learning_rate": 4.563632824908252e-06,
437
+ "logits/chosen": 0.7213119864463806,
438
+ "logits/rejected": 0.8342201113700867,
439
+ "logps/chosen": -413.45794677734375,
440
+ "logps/rejected": -476.8807067871094,
441
+ "loss": 0.5347,
442
+ "rewards/accuracies": 0.706250011920929,
443
+ "rewards/chosen": -1.6307703256607056,
444
+ "rewards/margins": 0.5706671476364136,
445
+ "rewards/rejected": -2.201437473297119,
446
+ "step": 260
447
+ },
448
+ {
449
+ "epoch": 0.28,
450
+ "grad_norm": 7.321550555658446,
451
+ "learning_rate": 4.510653863290871e-06,
452
+ "logits/chosen": 0.22780194878578186,
453
+ "logits/rejected": 0.8186622858047485,
454
+ "logps/chosen": -407.9789123535156,
455
+ "logps/rejected": -441.42291259765625,
456
+ "loss": 0.5285,
457
+ "rewards/accuracies": 0.7437499761581421,
458
+ "rewards/chosen": -1.2799824476242065,
459
+ "rewards/margins": 0.812950611114502,
460
+ "rewards/rejected": -2.092933177947998,
461
+ "step": 270
462
+ },
463
+ {
464
+ "epoch": 0.29,
465
+ "grad_norm": 6.59015517751076,
466
+ "learning_rate": 4.454985830346574e-06,
467
+ "logits/chosen": 0.8685849905014038,
468
+ "logits/rejected": 1.2805653810501099,
469
+ "logps/chosen": -383.39483642578125,
470
+ "logps/rejected": -461.10528564453125,
471
+ "loss": 0.4941,
472
+ "rewards/accuracies": 0.7124999761581421,
473
+ "rewards/chosen": -1.3261792659759521,
474
+ "rewards/margins": 0.7511085271835327,
475
+ "rewards/rejected": -2.0772879123687744,
476
+ "step": 280
477
+ },
478
+ {
479
+ "epoch": 0.3,
480
+ "grad_norm": 5.290937758978531,
481
+ "learning_rate": 4.396703177135262e-06,
482
+ "logits/chosen": 0.7316430807113647,
483
+ "logits/rejected": 1.184604525566101,
484
+ "logps/chosen": -394.62908935546875,
485
+ "logps/rejected": -475.59637451171875,
486
+ "loss": 0.5394,
487
+ "rewards/accuracies": 0.7124999761581421,
488
+ "rewards/chosen": -1.2810670137405396,
489
+ "rewards/margins": 0.8331844210624695,
490
+ "rewards/rejected": -2.1142513751983643,
491
+ "step": 290
492
+ },
493
+ {
494
+ "epoch": 0.31,
495
+ "grad_norm": 6.38047854311955,
496
+ "learning_rate": 4.335883851539693e-06,
497
+ "logits/chosen": -0.5117667317390442,
498
+ "logits/rejected": -0.2342231273651123,
499
+ "logps/chosen": -363.1314697265625,
500
+ "logps/rejected": -406.3230285644531,
501
+ "loss": 0.4949,
502
+ "rewards/accuracies": 0.731249988079071,
503
+ "rewards/chosen": -1.074188232421875,
504
+ "rewards/margins": 0.7227994799613953,
505
+ "rewards/rejected": -1.796987771987915,
506
+ "step": 300
507
+ },
508
+ {
509
+ "epoch": 0.31,
510
+ "eval_logits/chosen": 0.14181683957576752,
511
+ "eval_logits/rejected": 0.4033640921115875,
512
+ "eval_logps/chosen": -432.8479309082031,
513
+ "eval_logps/rejected": -505.6621398925781,
514
+ "eval_loss": 0.5161137580871582,
515
+ "eval_rewards/accuracies": 0.7182539701461792,
516
+ "eval_rewards/chosen": -1.6535308361053467,
517
+ "eval_rewards/margins": 0.7794461250305176,
518
+ "eval_rewards/rejected": -2.4329771995544434,
519
+ "eval_runtime": 363.6104,
520
+ "eval_samples_per_second": 5.5,
521
+ "eval_steps_per_second": 0.173,
522
+ "step": 300
523
+ },
524
+ {
525
+ "epoch": 0.32,
526
+ "grad_norm": 6.269453976354009,
527
+ "learning_rate": 4.2726091940171055e-06,
528
+ "logits/chosen": 0.015387272462248802,
529
+ "logits/rejected": 0.4361787736415863,
530
+ "logps/chosen": -471.3013610839844,
531
+ "logps/rejected": -514.4930419921875,
532
+ "loss": 0.5181,
533
+ "rewards/accuracies": 0.7250000238418579,
534
+ "rewards/chosen": -1.809822678565979,
535
+ "rewards/margins": 0.7652148604393005,
536
+ "rewards/rejected": -2.5750374794006348,
537
+ "step": 310
538
+ },
539
+ {
540
+ "epoch": 0.33,
541
+ "grad_norm": 7.128031303328463,
542
+ "learning_rate": 4.206963828813555e-06,
543
+ "logits/chosen": -0.014968380331993103,
544
+ "logits/rejected": 0.18288832902908325,
545
+ "logps/chosen": -377.1259765625,
546
+ "logps/rejected": -439.23504638671875,
547
+ "loss": 0.5033,
548
+ "rewards/accuracies": 0.7437499761581421,
549
+ "rewards/chosen": -1.4144322872161865,
550
+ "rewards/margins": 0.7219018936157227,
551
+ "rewards/rejected": -2.136334180831909,
552
+ "step": 320
553
+ },
554
+ {
555
+ "epoch": 0.35,
556
+ "grad_norm": 6.507915372367444,
557
+ "learning_rate": 4.139035550786495e-06,
558
+ "logits/chosen": 0.25516384840011597,
559
+ "logits/rejected": 0.32111260294914246,
560
+ "logps/chosen": -384.0611877441406,
561
+ "logps/rejected": -484.21826171875,
562
+ "loss": 0.5141,
563
+ "rewards/accuracies": 0.768750011920929,
564
+ "rewards/chosen": -1.6149778366088867,
565
+ "rewards/margins": 0.7714108824729919,
566
+ "rewards/rejected": -2.3863887786865234,
567
+ "step": 330
568
+ },
569
+ {
570
+ "epoch": 0.36,
571
+ "grad_norm": 7.117891747882256,
572
+ "learning_rate": 4.068915207986931e-06,
573
+ "logits/chosen": 0.793425977230072,
574
+ "logits/rejected": 1.1012945175170898,
575
+ "logps/chosen": -490.26702880859375,
576
+ "logps/rejected": -559.7706298828125,
577
+ "loss": 0.5288,
578
+ "rewards/accuracies": 0.7250000238418579,
579
+ "rewards/chosen": -2.188723564147949,
580
+ "rewards/margins": 0.789432168006897,
581
+ "rewards/rejected": -2.9781556129455566,
582
+ "step": 340
583
+ },
584
+ {
585
+ "epoch": 0.37,
586
+ "grad_norm": 6.767573615618988,
587
+ "learning_rate": 3.996696580158211e-06,
588
+ "logits/chosen": 0.038734257221221924,
589
+ "logits/rejected": 0.3444157540798187,
590
+ "logps/chosen": -393.08392333984375,
591
+ "logps/rejected": -440.9234924316406,
592
+ "loss": 0.543,
593
+ "rewards/accuracies": 0.7437499761581421,
594
+ "rewards/chosen": -1.2908889055252075,
595
+ "rewards/margins": 0.7114801406860352,
596
+ "rewards/rejected": -2.0023691654205322,
597
+ "step": 350
598
+ },
599
+ {
600
+ "epoch": 0.38,
601
+ "grad_norm": 5.745989852370514,
602
+ "learning_rate": 3.922476253313921e-06,
603
+ "logits/chosen": -0.3795308768749237,
604
+ "logits/rejected": -0.08608667552471161,
605
+ "logps/chosen": -352.70037841796875,
606
+ "logps/rejected": -411.849609375,
607
+ "loss": 0.5253,
608
+ "rewards/accuracies": 0.762499988079071,
609
+ "rewards/chosen": -1.2477184534072876,
610
+ "rewards/margins": 0.7360815405845642,
611
+ "rewards/rejected": -1.983799695968628,
612
+ "step": 360
613
+ },
614
+ {
615
+ "epoch": 0.39,
616
+ "grad_norm": 7.421866262330383,
617
+ "learning_rate": 3.846353490562664e-06,
618
+ "logits/chosen": 0.32861948013305664,
619
+ "logits/rejected": 0.747407078742981,
620
+ "logps/chosen": -419.4422912597656,
621
+ "logps/rejected": -482.92218017578125,
622
+ "loss": 0.5213,
623
+ "rewards/accuracies": 0.768750011920929,
624
+ "rewards/chosen": -1.5996549129486084,
625
+ "rewards/margins": 0.861799418926239,
626
+ "rewards/rejected": -2.4614546298980713,
627
+ "step": 370
628
+ },
629
+ {
630
+ "epoch": 0.4,
631
+ "grad_norm": 8.881319616744216,
632
+ "learning_rate": 3.768430099352445e-06,
633
+ "logits/chosen": 0.7479051351547241,
634
+ "logits/rejected": 0.9084765315055847,
635
+ "logps/chosen": -419.5804138183594,
636
+ "logps/rejected": -462.1929626464844,
637
+ "loss": 0.5316,
638
+ "rewards/accuracies": 0.637499988079071,
639
+ "rewards/chosen": -1.8022568225860596,
640
+ "rewards/margins": 0.5997826457023621,
641
+ "rewards/rejected": -2.4020392894744873,
642
+ "step": 380
643
+ },
644
+ {
645
+ "epoch": 0.41,
646
+ "grad_norm": 6.8157951015425144,
647
+ "learning_rate": 3.6888102953122307e-06,
648
+ "logits/chosen": 0.3040899932384491,
649
+ "logits/rejected": 0.7399716973304749,
650
+ "logps/chosen": -431.64471435546875,
651
+ "logps/rejected": -468.09228515625,
652
+ "loss": 0.5349,
653
+ "rewards/accuracies": 0.768750011920929,
654
+ "rewards/chosen": -1.6253465414047241,
655
+ "rewards/margins": 0.7721388339996338,
656
+ "rewards/rejected": -2.3974852561950684,
657
+ "step": 390
658
+ },
659
+ {
660
+ "epoch": 0.42,
661
+ "grad_norm": 7.719819400236357,
662
+ "learning_rate": 3.607600562872785e-06,
663
+ "logits/chosen": 0.025150710716843605,
664
+ "logits/rejected": 0.44670620560646057,
665
+ "logps/chosen": -389.49591064453125,
666
+ "logps/rejected": -436.03485107421875,
667
+ "loss": 0.5239,
668
+ "rewards/accuracies": 0.7250000238418579,
669
+ "rewards/chosen": -1.289335012435913,
670
+ "rewards/margins": 0.5783864259719849,
671
+ "rewards/rejected": -1.8677213191986084,
672
+ "step": 400
673
+ },
674
+ {
675
+ "epoch": 0.42,
676
+ "eval_logits/chosen": 0.5590775609016418,
677
+ "eval_logits/rejected": 0.8585302233695984,
678
+ "eval_logps/chosen": -404.4281921386719,
679
+ "eval_logps/rejected": -470.4624328613281,
680
+ "eval_loss": 0.5100502371788025,
681
+ "eval_rewards/accuracies": 0.7301587462425232,
682
+ "eval_rewards/chosen": -1.3693335056304932,
683
+ "eval_rewards/margins": 0.7116466164588928,
684
+ "eval_rewards/rejected": -2.080980062484741,
685
+ "eval_runtime": 363.5138,
686
+ "eval_samples_per_second": 5.502,
687
+ "eval_steps_per_second": 0.173,
688
+ "step": 400
689
+ },
690
+ {
691
+ "epoch": 0.43,
692
+ "grad_norm": 7.970937869843318,
693
+ "learning_rate": 3.5249095128531863e-06,
694
+ "logits/chosen": 0.4714787006378174,
695
+ "logits/rejected": 0.9316293597221375,
696
+ "logps/chosen": -396.480224609375,
697
+ "logps/rejected": -434.74969482421875,
698
+ "loss": 0.5072,
699
+ "rewards/accuracies": 0.731249988079071,
700
+ "rewards/chosen": -1.2913535833358765,
701
+ "rewards/margins": 0.7052849531173706,
702
+ "rewards/rejected": -1.996638536453247,
703
+ "step": 410
704
+ },
705
+ {
706
+ "epoch": 0.44,
707
+ "grad_norm": 6.303939907368478,
708
+ "learning_rate": 3.4408477372034743e-06,
709
+ "logits/chosen": 0.2637383043766022,
710
+ "logits/rejected": 0.8194657564163208,
711
+ "logps/chosen": -425.71044921875,
712
+ "logps/rejected": -472.1114807128906,
713
+ "loss": 0.506,
714
+ "rewards/accuracies": 0.7562500238418579,
715
+ "rewards/chosen": -1.4857378005981445,
716
+ "rewards/margins": 0.8204676508903503,
717
+ "rewards/rejected": -2.3062055110931396,
718
+ "step": 420
719
+ },
720
+ {
721
+ "epoch": 0.45,
722
+ "grad_norm": 4.180610112833693,
723
+ "learning_rate": 3.355527661097728e-06,
724
+ "logits/chosen": 0.5680927634239197,
725
+ "logits/rejected": 1.104353904724121,
726
+ "logps/chosen": -410.1541442871094,
727
+ "logps/rejected": -459.4964904785156,
728
+ "loss": 0.509,
729
+ "rewards/accuracies": 0.6625000238418579,
730
+ "rewards/chosen": -1.484910249710083,
731
+ "rewards/margins": 0.6827523112297058,
732
+ "rewards/rejected": -2.1676626205444336,
733
+ "step": 430
734
+ },
735
+ {
736
+ "epoch": 0.46,
737
+ "grad_norm": 8.034683384238669,
738
+ "learning_rate": 3.269063392575352e-06,
739
+ "logits/chosen": 0.9522289037704468,
740
+ "logits/rejected": 1.3713102340698242,
741
+ "logps/chosen": -410.77386474609375,
742
+ "logps/rejected": -469.934814453125,
743
+ "loss": 0.4919,
744
+ "rewards/accuracies": 0.731249988079071,
745
+ "rewards/chosen": -1.4788429737091064,
746
+ "rewards/margins": 0.84101402759552,
747
+ "rewards/rejected": -2.319856882095337,
748
+ "step": 440
749
+ },
750
+ {
751
+ "epoch": 0.47,
752
+ "grad_norm": 7.933204029001868,
753
+ "learning_rate": 3.181570569931697e-06,
754
+ "logits/chosen": 1.7222553491592407,
755
+ "logits/rejected": 2.531193256378174,
756
+ "logps/chosen": -468.93035888671875,
757
+ "logps/rejected": -518.9406127929688,
758
+ "loss": 0.5071,
759
+ "rewards/accuracies": 0.7250000238418579,
760
+ "rewards/chosen": -1.97989022731781,
761
+ "rewards/margins": 0.8815497159957886,
762
+ "rewards/rejected": -2.8614401817321777,
763
+ "step": 450
764
+ },
765
+ {
766
+ "epoch": 0.48,
767
+ "grad_norm": 6.136879125450544,
768
+ "learning_rate": 3.09316620706208e-06,
769
+ "logits/chosen": 1.2675981521606445,
770
+ "logits/rejected": 1.6471933126449585,
771
+ "logps/chosen": -471.82501220703125,
772
+ "logps/rejected": -539.4986572265625,
773
+ "loss": 0.4972,
774
+ "rewards/accuracies": 0.71875,
775
+ "rewards/chosen": -1.9156773090362549,
776
+ "rewards/margins": 0.8640796542167664,
777
+ "rewards/rejected": -2.779757022857666,
778
+ "step": 460
779
+ },
780
+ {
781
+ "epoch": 0.49,
782
+ "grad_norm": 6.202648384751733,
783
+ "learning_rate": 3.0039685369660785e-06,
784
+ "logits/chosen": 0.8523815274238586,
785
+ "logits/rejected": 1.4816639423370361,
786
+ "logps/chosen": -424.09759521484375,
787
+ "logps/rejected": -498.6441955566406,
788
+ "loss": 0.4932,
789
+ "rewards/accuracies": 0.737500011920929,
790
+ "rewards/chosen": -1.6259464025497437,
791
+ "rewards/margins": 0.9263278245925903,
792
+ "rewards/rejected": -2.552274227142334,
793
+ "step": 470
794
+ },
795
+ {
796
+ "epoch": 0.5,
797
+ "grad_norm": 5.934005833493391,
798
+ "learning_rate": 2.91409685362137e-06,
799
+ "logits/chosen": 0.7408124208450317,
800
+ "logits/rejected": 1.0110493898391724,
801
+ "logps/chosen": -438.82781982421875,
802
+ "logps/rejected": -494.5777282714844,
803
+ "loss": 0.5068,
804
+ "rewards/accuracies": 0.675000011920929,
805
+ "rewards/chosen": -1.702741265296936,
806
+ "rewards/margins": 0.7638914585113525,
807
+ "rewards/rejected": -2.466632843017578,
808
+ "step": 480
809
+ },
810
+ {
811
+ "epoch": 0.51,
812
+ "grad_norm": 6.134335934888188,
813
+ "learning_rate": 2.8236713524386085e-06,
814
+ "logits/chosen": 0.4550415873527527,
815
+ "logits/rejected": 0.7779414057731628,
816
+ "logps/chosen": -434.3086853027344,
817
+ "logps/rejected": -516.2806396484375,
818
+ "loss": 0.5089,
819
+ "rewards/accuracies": 0.7250000238418579,
820
+ "rewards/chosen": -1.7570041418075562,
821
+ "rewards/margins": 0.8707243204116821,
822
+ "rewards/rejected": -2.6277284622192383,
823
+ "step": 490
824
+ },
825
+ {
826
+ "epoch": 0.52,
827
+ "grad_norm": 7.785644515938578,
828
+ "learning_rate": 2.7328129695107205e-06,
829
+ "logits/chosen": 0.7499665021896362,
830
+ "logits/rejected": 1.3003952503204346,
831
+ "logps/chosen": -424.2498474121094,
832
+ "logps/rejected": -495.997802734375,
833
+ "loss": 0.5272,
834
+ "rewards/accuracies": 0.71875,
835
+ "rewards/chosen": -1.920672059059143,
836
+ "rewards/margins": 0.8888359069824219,
837
+ "rewards/rejected": -2.8095080852508545,
838
+ "step": 500
839
+ },
840
+ {
841
+ "epoch": 0.52,
842
+ "eval_logits/chosen": 0.7149888277053833,
843
+ "eval_logits/rejected": 1.0403735637664795,
844
+ "eval_logps/chosen": -471.0703430175781,
845
+ "eval_logps/rejected": -558.6534423828125,
846
+ "eval_loss": 0.5003111362457275,
847
+ "eval_rewards/accuracies": 0.738095223903656,
848
+ "eval_rewards/chosen": -2.035755157470703,
849
+ "eval_rewards/margins": 0.9271355867385864,
850
+ "eval_rewards/rejected": -2.962890863418579,
851
+ "eval_runtime": 363.4067,
852
+ "eval_samples_per_second": 5.503,
853
+ "eval_steps_per_second": 0.173,
854
+ "step": 500
855
+ },
856
+ {
857
+ "epoch": 0.53,
858
+ "grad_norm": 6.508475133675616,
859
+ "learning_rate": 2.641643219871597e-06,
860
+ "logits/chosen": 0.5101484060287476,
861
+ "logits/rejected": 0.9493732452392578,
862
+ "logps/chosen": -467.1604919433594,
863
+ "logps/rejected": -551.2037353515625,
864
+ "loss": 0.4865,
865
+ "rewards/accuracies": 0.75,
866
+ "rewards/chosen": -1.970902442932129,
867
+ "rewards/margins": 0.9815346002578735,
868
+ "rewards/rejected": -2.952437162399292,
869
+ "step": 510
870
+ },
871
+ {
872
+ "epoch": 0.54,
873
+ "grad_norm": 5.49058038197313,
874
+ "learning_rate": 2.5502840349805074e-06,
875
+ "logits/chosen": 0.1692526787519455,
876
+ "logits/rejected": 0.5913276076316833,
877
+ "logps/chosen": -398.19561767578125,
878
+ "logps/rejected": -454.1546936035156,
879
+ "loss": 0.5012,
880
+ "rewards/accuracies": 0.75,
881
+ "rewards/chosen": -1.5394331216812134,
882
+ "rewards/margins": 0.9616721868515015,
883
+ "rewards/rejected": -2.501105546951294,
884
+ "step": 520
885
+ },
886
+ {
887
+ "epoch": 0.55,
888
+ "grad_norm": 6.551673605459196,
889
+ "learning_rate": 2.4588575996495797e-06,
890
+ "logits/chosen": -0.2059365212917328,
891
+ "logits/rejected": 0.3154928684234619,
892
+ "logps/chosen": -433.8434143066406,
893
+ "logps/rejected": -497.22705078125,
894
+ "loss": 0.4805,
895
+ "rewards/accuracies": 0.824999988079071,
896
+ "rewards/chosen": -1.5971723794937134,
897
+ "rewards/margins": 1.0449589490890503,
898
+ "rewards/rejected": -2.6421313285827637,
899
+ "step": 530
900
+ },
901
+ {
902
+ "epoch": 0.57,
903
+ "grad_norm": 7.008080314698571,
904
+ "learning_rate": 2.367486188632446e-06,
905
+ "logits/chosen": 0.24021968245506287,
906
+ "logits/rejected": 0.4527043402194977,
907
+ "logps/chosen": -493.78436279296875,
908
+ "logps/rejected": -600.9166259765625,
909
+ "loss": 0.4979,
910
+ "rewards/accuracies": 0.8062499761581421,
911
+ "rewards/chosen": -2.1465725898742676,
912
+ "rewards/margins": 1.0573832988739014,
913
+ "rewards/rejected": -3.203955888748169,
914
+ "step": 540
915
+ },
916
+ {
917
+ "epoch": 0.58,
918
+ "grad_norm": 7.130638434676378,
919
+ "learning_rate": 2.276292003092593e-06,
920
+ "logits/chosen": 0.10954128205776215,
921
+ "logits/rejected": 0.5284096002578735,
922
+ "logps/chosen": -523.6838989257812,
923
+ "logps/rejected": -586.87451171875,
924
+ "loss": 0.4699,
925
+ "rewards/accuracies": 0.737500011920929,
926
+ "rewards/chosen": -2.2369279861450195,
927
+ "rewards/margins": 0.9251989126205444,
928
+ "rewards/rejected": -3.1621270179748535,
929
+ "step": 550
930
+ },
931
+ {
932
+ "epoch": 0.59,
933
+ "grad_norm": 6.527520826256094,
934
+ "learning_rate": 2.1853970071701415e-06,
935
+ "logits/chosen": 0.1212792843580246,
936
+ "logits/rejected": 0.621984601020813,
937
+ "logps/chosen": -492.78326416015625,
938
+ "logps/rejected": -556.6223754882812,
939
+ "loss": 0.4888,
940
+ "rewards/accuracies": 0.7437499761581421,
941
+ "rewards/chosen": -1.9754877090454102,
942
+ "rewards/margins": 0.8828755617141724,
943
+ "rewards/rejected": -2.858363389968872,
944
+ "step": 560
945
+ },
946
+ {
947
+ "epoch": 0.6,
948
+ "grad_norm": 7.521550053676498,
949
+ "learning_rate": 2.0949227648656194e-06,
950
+ "logits/chosen": 0.6495383977890015,
951
+ "logits/rejected": 0.7199844121932983,
952
+ "logps/chosen": -420.24951171875,
953
+ "logps/rejected": -494.35516357421875,
954
+ "loss": 0.5204,
955
+ "rewards/accuracies": 0.6875,
956
+ "rewards/chosen": -1.9334577322006226,
957
+ "rewards/margins": 0.878303050994873,
958
+ "rewards/rejected": -2.8117611408233643,
959
+ "step": 570
960
+ },
961
+ {
962
+ "epoch": 0.61,
963
+ "grad_norm": 5.995967781657259,
964
+ "learning_rate": 2.00499027745888e-06,
965
+ "logits/chosen": -0.02193205989897251,
966
+ "logits/rejected": 0.2604701519012451,
967
+ "logps/chosen": -426.2652893066406,
968
+ "logps/rejected": -514.4281005859375,
969
+ "loss": 0.4959,
970
+ "rewards/accuracies": 0.731249988079071,
971
+ "rewards/chosen": -1.6685237884521484,
972
+ "rewards/margins": 0.8125013113021851,
973
+ "rewards/rejected": -2.481024980545044,
974
+ "step": 580
975
+ },
976
+ {
977
+ "epoch": 0.62,
978
+ "grad_norm": 7.427361767129983,
979
+ "learning_rate": 1.915719821680624e-06,
980
+ "logits/chosen": -0.14169612526893616,
981
+ "logits/rejected": 0.3787733018398285,
982
+ "logps/chosen": -397.3866882324219,
983
+ "logps/rejected": -437.279541015625,
984
+ "loss": 0.5169,
985
+ "rewards/accuracies": 0.706250011920929,
986
+ "rewards/chosen": -1.5220028162002563,
987
+ "rewards/margins": 0.7406032085418701,
988
+ "rewards/rejected": -2.262606143951416,
989
+ "step": 590
990
+ },
991
+ {
992
+ "epoch": 0.63,
993
+ "grad_norm": 6.908180760937083,
994
+ "learning_rate": 1.8272307888529276e-06,
995
+ "logits/chosen": 0.0949440747499466,
996
+ "logits/rejected": 0.2515867352485657,
997
+ "logps/chosen": -374.9373474121094,
998
+ "logps/rejected": -469.41143798828125,
999
+ "loss": 0.4886,
1000
+ "rewards/accuracies": 0.762499988079071,
1001
+ "rewards/chosen": -1.6092283725738525,
1002
+ "rewards/margins": 0.7828912138938904,
1003
+ "rewards/rejected": -2.3921194076538086,
1004
+ "step": 600
1005
+ },
1006
+ {
1007
+ "epoch": 0.63,
1008
+ "eval_logits/chosen": 0.05941150337457657,
1009
+ "eval_logits/rejected": 0.37521931529045105,
1010
+ "eval_logps/chosen": -444.8821716308594,
1011
+ "eval_logps/rejected": -526.641357421875,
1012
+ "eval_loss": 0.49817371368408203,
1013
+ "eval_rewards/accuracies": 0.726190447807312,
1014
+ "eval_rewards/chosen": -1.7738730907440186,
1015
+ "eval_rewards/margins": 0.8688962459564209,
1016
+ "eval_rewards/rejected": -2.6427693367004395,
1017
+ "eval_runtime": 363.5537,
1018
+ "eval_samples_per_second": 5.501,
1019
+ "eval_steps_per_second": 0.173,
1020
+ "step": 600
1021
+ },
1022
+ {
1023
+ "epoch": 0.64,
1024
+ "grad_norm": 7.791648443136137,
1025
+ "learning_rate": 1.739641525213929e-06,
1026
+ "logits/chosen": 0.37017613649368286,
1027
+ "logits/rejected": 0.7658742666244507,
1028
+ "logps/chosen": -431.961181640625,
1029
+ "logps/rejected": -511.739501953125,
1030
+ "loss": 0.4684,
1031
+ "rewards/accuracies": 0.78125,
1032
+ "rewards/chosen": -1.889151930809021,
1033
+ "rewards/margins": 0.9400160908699036,
1034
+ "rewards/rejected": -2.8291683197021484,
1035
+ "step": 610
1036
+ },
1037
+ {
1038
+ "epoch": 0.65,
1039
+ "grad_norm": 6.033378581233917,
1040
+ "learning_rate": 1.6530691736402317e-06,
1041
+ "logits/chosen": 0.5889648795127869,
1042
+ "logits/rejected": 0.6707448959350586,
1043
+ "logps/chosen": -445.545654296875,
1044
+ "logps/rejected": -556.2078857421875,
1045
+ "loss": 0.4698,
1046
+ "rewards/accuracies": 0.737500011920929,
1047
+ "rewards/chosen": -2.1388087272644043,
1048
+ "rewards/margins": 0.829494297504425,
1049
+ "rewards/rejected": -2.9683032035827637,
1050
+ "step": 620
1051
+ },
1052
+ {
1053
+ "epoch": 0.66,
1054
+ "grad_norm": 8.258841660679291,
1055
+ "learning_rate": 1.5676295169786864e-06,
1056
+ "logits/chosen": 0.6791388392448425,
1057
+ "logits/rejected": 1.2308781147003174,
1058
+ "logps/chosen": -498.2019958496094,
1059
+ "logps/rejected": -589.25927734375,
1060
+ "loss": 0.4736,
1061
+ "rewards/accuracies": 0.75,
1062
+ "rewards/chosen": -2.2767367362976074,
1063
+ "rewards/margins": 1.188873291015625,
1064
+ "rewards/rejected": -3.4656097888946533,
1065
+ "step": 630
1066
+ },
1067
+ {
1068
+ "epoch": 0.67,
1069
+ "grad_norm": 7.552790690651825,
1070
+ "learning_rate": 1.4834368231970922e-06,
1071
+ "logits/chosen": 0.6634274125099182,
1072
+ "logits/rejected": 0.95360267162323,
1073
+ "logps/chosen": -443.93304443359375,
1074
+ "logps/rejected": -516.3823852539062,
1075
+ "loss": 0.495,
1076
+ "rewards/accuracies": 0.737500011920929,
1077
+ "rewards/chosen": -2.053490161895752,
1078
+ "rewards/margins": 0.9448803663253784,
1079
+ "rewards/rejected": -2.99837064743042,
1080
+ "step": 640
1081
+ },
1082
+ {
1083
+ "epoch": 0.68,
1084
+ "grad_norm": 6.240039976432098,
1085
+ "learning_rate": 1.4006036925609245e-06,
1086
+ "logits/chosen": 0.2837061583995819,
1087
+ "logits/rejected": 0.6308731436729431,
1088
+ "logps/chosen": -462.76824951171875,
1089
+ "logps/rejected": -545.1807250976562,
1090
+ "loss": 0.4953,
1091
+ "rewards/accuracies": 0.762499988079071,
1092
+ "rewards/chosen": -1.862541913986206,
1093
+ "rewards/margins": 0.9856799840927124,
1094
+ "rewards/rejected": -2.848222017288208,
1095
+ "step": 650
1096
+ },
1097
+ {
1098
+ "epoch": 0.69,
1099
+ "grad_norm": 6.068196245695217,
1100
+ "learning_rate": 1.3192409070404582e-06,
1101
+ "logits/chosen": -0.018947910517454147,
1102
+ "logits/rejected": 0.6848478317260742,
1103
+ "logps/chosen": -459.00946044921875,
1104
+ "logps/rejected": -509.8402404785156,
1105
+ "loss": 0.4779,
1106
+ "rewards/accuracies": 0.737500011920929,
1107
+ "rewards/chosen": -1.6749271154403687,
1108
+ "rewards/margins": 0.9239859580993652,
1109
+ "rewards/rejected": -2.5989129543304443,
1110
+ "step": 660
1111
+ },
1112
+ {
1113
+ "epoch": 0.7,
1114
+ "grad_norm": 7.317258150817362,
1115
+ "learning_rate": 1.2394572821496953e-06,
1116
+ "logits/chosen": 0.4193340837955475,
1117
+ "logits/rejected": 0.5491575598716736,
1118
+ "logps/chosen": -433.359375,
1119
+ "logps/rejected": -529.31201171875,
1120
+ "loss": 0.5121,
1121
+ "rewards/accuracies": 0.7749999761581421,
1122
+ "rewards/chosen": -2.034470796585083,
1123
+ "rewards/margins": 0.8596637845039368,
1124
+ "rewards/rejected": -2.894134521484375,
1125
+ "step": 670
1126
+ },
1127
+ {
1128
+ "epoch": 0.71,
1129
+ "grad_norm": 6.984303996772759,
1130
+ "learning_rate": 1.1613595214152713e-06,
1131
+ "logits/chosen": 0.375018447637558,
1132
+ "logits/rejected": 1.3137071132659912,
1133
+ "logps/chosen": -503.49188232421875,
1134
+ "logps/rejected": -537.77392578125,
1135
+ "loss": 0.4789,
1136
+ "rewards/accuracies": 0.737500011920929,
1137
+ "rewards/chosen": -2.075040817260742,
1138
+ "rewards/margins": 0.832379162311554,
1139
+ "rewards/rejected": -2.9074199199676514,
1140
+ "step": 680
1141
+ },
1142
+ {
1143
+ "epoch": 0.72,
1144
+ "grad_norm": 6.449594635756279,
1145
+ "learning_rate": 1.0850520736699362e-06,
1146
+ "logits/chosen": 0.4090999662876129,
1147
+ "logits/rejected": 0.8342447280883789,
1148
+ "logps/chosen": -482.46832275390625,
1149
+ "logps/rejected": -552.4658203125,
1150
+ "loss": 0.5044,
1151
+ "rewards/accuracies": 0.71875,
1152
+ "rewards/chosen": -1.9981062412261963,
1153
+ "rewards/margins": 0.9065272212028503,
1154
+ "rewards/rejected": -2.9046332836151123,
1155
+ "step": 690
1156
+ },
1157
+ {
1158
+ "epoch": 0.73,
1159
+ "grad_norm": 7.762156396899746,
1160
+ "learning_rate": 1.0106369933615043e-06,
1161
+ "logits/chosen": 0.44636210799217224,
1162
+ "logits/rejected": 0.6373854875564575,
1163
+ "logps/chosen": -459.1854553222656,
1164
+ "logps/rejected": -532.9047241210938,
1165
+ "loss": 0.516,
1166
+ "rewards/accuracies": 0.706250011920929,
1167
+ "rewards/chosen": -2.135340452194214,
1168
+ "rewards/margins": 0.8491575121879578,
1169
+ "rewards/rejected": -2.9844982624053955,
1170
+ "step": 700
1171
+ },
1172
+ {
1173
+ "epoch": 0.73,
1174
+ "eval_logits/chosen": 0.531207799911499,
1175
+ "eval_logits/rejected": 0.8897786736488342,
1176
+ "eval_logps/chosen": -469.92730712890625,
1177
+ "eval_logps/rejected": -556.2413330078125,
1178
+ "eval_loss": 0.49328893423080444,
1179
+ "eval_rewards/accuracies": 0.7301587462425232,
1180
+ "eval_rewards/chosen": -2.024324893951416,
1181
+ "eval_rewards/margins": 0.9144444465637207,
1182
+ "eval_rewards/rejected": -2.938769578933716,
1183
+ "eval_runtime": 363.5561,
1184
+ "eval_samples_per_second": 5.501,
1185
+ "eval_steps_per_second": 0.173,
1186
+ "step": 700
1187
+ },
1188
+ {
1189
+ "epoch": 0.74,
1190
+ "grad_norm": 6.854213571436283,
1191
+ "learning_rate": 9.382138040640714e-07,
1192
+ "logits/chosen": 0.5431604981422424,
1193
+ "logits/rejected": 1.0465270280838013,
1194
+ "logps/chosen": -448.469970703125,
1195
+ "logps/rejected": -540.0692749023438,
1196
+ "loss": 0.4754,
1197
+ "rewards/accuracies": 0.762499988079071,
1198
+ "rewards/chosen": -2.114922285079956,
1199
+ "rewards/margins": 0.9879555702209473,
1200
+ "rewards/rejected": -3.1028780937194824,
1201
+ "step": 710
1202
+ },
1203
+ {
1204
+ "epoch": 0.75,
1205
+ "grad_norm": 5.476671160655268,
1206
+ "learning_rate": 8.678793653740633e-07,
1207
+ "logits/chosen": 0.13008640706539154,
1208
+ "logits/rejected": 0.6161590814590454,
1209
+ "logps/chosen": -476.4979553222656,
1210
+ "logps/rejected": -574.2578125,
1211
+ "loss": 0.4781,
1212
+ "rewards/accuracies": 0.7437499761581421,
1213
+ "rewards/chosen": -1.904258131980896,
1214
+ "rewards/margins": 1.040144681930542,
1215
+ "rewards/rejected": -2.9444031715393066,
1216
+ "step": 720
1217
+ },
1218
+ {
1219
+ "epoch": 0.76,
1220
+ "grad_norm": 10.553613660329997,
1221
+ "learning_rate": 7.997277433690984e-07,
1222
+ "logits/chosen": 0.33887559175491333,
1223
+ "logits/rejected": 0.7562397718429565,
1224
+ "logps/chosen": -460.935302734375,
1225
+ "logps/rejected": -538.0240478515625,
1226
+ "loss": 0.4952,
1227
+ "rewards/accuracies": 0.7124999761581421,
1228
+ "rewards/chosen": -1.8783504962921143,
1229
+ "rewards/margins": 0.9283574819564819,
1230
+ "rewards/rejected": -2.8067078590393066,
1231
+ "step": 730
1232
+ },
1233
+ {
1234
+ "epoch": 0.77,
1235
+ "grad_norm": 6.627806159178358,
1236
+ "learning_rate": 7.338500848029603e-07,
1237
+ "logits/chosen": 0.0666571706533432,
1238
+ "logits/rejected": 0.5915927290916443,
1239
+ "logps/chosen": -481.9137268066406,
1240
+ "logps/rejected": -545.0120849609375,
1241
+ "loss": 0.4847,
1242
+ "rewards/accuracies": 0.6937500238418579,
1243
+ "rewards/chosen": -1.9084794521331787,
1244
+ "rewards/margins": 0.9120246767997742,
1245
+ "rewards/rejected": -2.8205044269561768,
1246
+ "step": 740
1247
+ },
1248
+ {
1249
+ "epoch": 0.79,
1250
+ "grad_norm": 5.781902544465049,
1251
+ "learning_rate": 6.70334495204884e-07,
1252
+ "logits/chosen": 0.2374829351902008,
1253
+ "logits/rejected": 0.428078830242157,
1254
+ "logps/chosen": -427.08099365234375,
1255
+ "logps/rejected": -511.5672302246094,
1256
+ "loss": 0.4783,
1257
+ "rewards/accuracies": 0.75,
1258
+ "rewards/chosen": -1.7517359256744385,
1259
+ "rewards/margins": 0.9133321642875671,
1260
+ "rewards/rejected": -2.6650681495666504,
1261
+ "step": 750
1262
+ },
1263
+ {
1264
+ "epoch": 0.8,
1265
+ "grad_norm": 8.985975146227219,
1266
+ "learning_rate": 6.092659210462232e-07,
1267
+ "logits/chosen": -0.05799068883061409,
1268
+ "logits/rejected": 0.24734115600585938,
1269
+ "logps/chosen": -439.37841796875,
1270
+ "logps/rejected": -537.7448120117188,
1271
+ "loss": 0.5005,
1272
+ "rewards/accuracies": 0.762499988079071,
1273
+ "rewards/chosen": -1.7082140445709229,
1274
+ "rewards/margins": 0.9354721903800964,
1275
+ "rewards/rejected": -2.643686294555664,
1276
+ "step": 760
1277
+ },
1278
+ {
1279
+ "epoch": 0.81,
1280
+ "grad_norm": 8.444556330335592,
1281
+ "learning_rate": 5.507260361320738e-07,
1282
+ "logits/chosen": -0.10365080833435059,
1283
+ "logits/rejected": 0.6751923561096191,
1284
+ "logps/chosen": -425.59130859375,
1285
+ "logps/rejected": -492.73583984375,
1286
+ "loss": 0.4983,
1287
+ "rewards/accuracies": 0.8125,
1288
+ "rewards/chosen": -1.6130247116088867,
1289
+ "rewards/margins": 1.0618236064910889,
1290
+ "rewards/rejected": -2.6748480796813965,
1291
+ "step": 770
1292
+ },
1293
+ {
1294
+ "epoch": 0.82,
1295
+ "grad_norm": 6.994205801584757,
1296
+ "learning_rate": 4.947931323697983e-07,
1297
+ "logits/chosen": -0.10879464447498322,
1298
+ "logits/rejected": 0.6311591863632202,
1299
+ "logps/chosen": -402.650634765625,
1300
+ "logps/rejected": -480.27978515625,
1301
+ "loss": 0.4858,
1302
+ "rewards/accuracies": 0.731249988079071,
1303
+ "rewards/chosen": -1.6773712635040283,
1304
+ "rewards/margins": 0.953662097454071,
1305
+ "rewards/rejected": -2.631033420562744,
1306
+ "step": 780
1307
+ },
1308
+ {
1309
+ "epoch": 0.83,
1310
+ "grad_norm": 7.034724618907778,
1311
+ "learning_rate": 4.4154201506053985e-07,
1312
+ "logits/chosen": -0.059246379882097244,
1313
+ "logits/rejected": 0.9515973925590515,
1314
+ "logps/chosen": -418.908935546875,
1315
+ "logps/rejected": -463.66497802734375,
1316
+ "loss": 0.4823,
1317
+ "rewards/accuracies": 0.737500011920929,
1318
+ "rewards/chosen": -1.5309089422225952,
1319
+ "rewards/margins": 1.080568552017212,
1320
+ "rewards/rejected": -2.6114776134490967,
1321
+ "step": 790
1322
+ },
1323
+ {
1324
+ "epoch": 0.84,
1325
+ "grad_norm": 11.254330628062345,
1326
+ "learning_rate": 3.910439028537638e-07,
1327
+ "logits/chosen": 0.3055418133735657,
1328
+ "logits/rejected": 0.5041495561599731,
1329
+ "logps/chosen": -393.4361267089844,
1330
+ "logps/rejected": -491.19537353515625,
1331
+ "loss": 0.495,
1332
+ "rewards/accuracies": 0.762499988079071,
1333
+ "rewards/chosen": -1.6317393779754639,
1334
+ "rewards/margins": 0.9304509162902832,
1335
+ "rewards/rejected": -2.562190532684326,
1336
+ "step": 800
1337
+ },
1338
+ {
1339
+ "epoch": 0.84,
1340
+ "eval_logits/chosen": 0.4157378673553467,
1341
+ "eval_logits/rejected": 0.8307757377624512,
1342
+ "eval_logps/chosen": -441.31207275390625,
1343
+ "eval_logps/rejected": -530.7620239257812,
1344
+ "eval_loss": 0.49489909410476685,
1345
+ "eval_rewards/accuracies": 0.726190447807312,
1346
+ "eval_rewards/chosen": -1.7381722927093506,
1347
+ "eval_rewards/margins": 0.9458035230636597,
1348
+ "eval_rewards/rejected": -2.6839756965637207,
1349
+ "eval_runtime": 363.554,
1350
+ "eval_samples_per_second": 5.501,
1351
+ "eval_steps_per_second": 0.173,
1352
+ "step": 800
1353
+ },
1354
+ {
1355
+ "epoch": 0.85,
1356
+ "grad_norm": 10.131129540510356,
1357
+ "learning_rate": 3.4336633249862084e-07,
1358
+ "logits/chosen": 0.2840031385421753,
1359
+ "logits/rejected": 0.8334852457046509,
1360
+ "logps/chosen": -408.244384765625,
1361
+ "logps/rejected": -496.79852294921875,
1362
+ "loss": 0.5046,
1363
+ "rewards/accuracies": 0.75,
1364
+ "rewards/chosen": -1.7685251235961914,
1365
+ "rewards/margins": 0.8878759145736694,
1366
+ "rewards/rejected": -2.6564011573791504,
1367
+ "step": 810
1368
+ },
1369
+ {
1370
+ "epoch": 0.86,
1371
+ "grad_norm": 6.762519528965419,
1372
+ "learning_rate": 2.98573068519539e-07,
1373
+ "logits/chosen": 0.10075119882822037,
1374
+ "logits/rejected": 0.3803648352622986,
1375
+ "logps/chosen": -466.81689453125,
1376
+ "logps/rejected": -508.10394287109375,
1377
+ "loss": 0.5171,
1378
+ "rewards/accuracies": 0.699999988079071,
1379
+ "rewards/chosen": -1.9121501445770264,
1380
+ "rewards/margins": 0.6225390434265137,
1381
+ "rewards/rejected": -2.534689426422119,
1382
+ "step": 820
1383
+ },
1384
+ {
1385
+ "epoch": 0.87,
1386
+ "grad_norm": 5.70932547678816,
1387
+ "learning_rate": 2.5672401793681854e-07,
1388
+ "logits/chosen": 0.38261884450912476,
1389
+ "logits/rejected": 0.6645594835281372,
1390
+ "logps/chosen": -452.214111328125,
1391
+ "logps/rejected": -503.60479736328125,
1392
+ "loss": 0.4928,
1393
+ "rewards/accuracies": 0.699999988079071,
1394
+ "rewards/chosen": -1.784679651260376,
1395
+ "rewards/margins": 0.795635461807251,
1396
+ "rewards/rejected": -2.580315113067627,
1397
+ "step": 830
1398
+ },
1399
+ {
1400
+ "epoch": 0.88,
1401
+ "grad_norm": 6.787751487161624,
1402
+ "learning_rate": 2.178751501463036e-07,
1403
+ "logits/chosen": 0.19089770317077637,
1404
+ "logits/rejected": 0.6150975227355957,
1405
+ "logps/chosen": -435.7149963378906,
1406
+ "logps/rejected": -512.2667236328125,
1407
+ "loss": 0.5099,
1408
+ "rewards/accuracies": 0.675000011920929,
1409
+ "rewards/chosen": -1.6579662561416626,
1410
+ "rewards/margins": 0.820970892906189,
1411
+ "rewards/rejected": -2.4789371490478516,
1412
+ "step": 840
1413
+ },
1414
+ {
1415
+ "epoch": 0.89,
1416
+ "grad_norm": 7.2710507714696275,
1417
+ "learning_rate": 1.820784220652766e-07,
1418
+ "logits/chosen": 0.3307928442955017,
1419
+ "logits/rejected": 0.8749529719352722,
1420
+ "logps/chosen": -447.7725524902344,
1421
+ "logps/rejected": -521.2926025390625,
1422
+ "loss": 0.4866,
1423
+ "rewards/accuracies": 0.7875000238418579,
1424
+ "rewards/chosen": -1.6773738861083984,
1425
+ "rewards/margins": 1.1296848058700562,
1426
+ "rewards/rejected": -2.807058811187744,
1427
+ "step": 850
1428
+ },
1429
+ {
1430
+ "epoch": 0.9,
1431
+ "grad_norm": 7.229633532426356,
1432
+ "learning_rate": 1.4938170864468636e-07,
1433
+ "logits/chosen": 0.37119001150131226,
1434
+ "logits/rejected": 0.9635122418403625,
1435
+ "logps/chosen": -403.9157409667969,
1436
+ "logps/rejected": -498.6942443847656,
1437
+ "loss": 0.4717,
1438
+ "rewards/accuracies": 0.7749999761581421,
1439
+ "rewards/chosen": -1.6513326168060303,
1440
+ "rewards/margins": 1.0772497653961182,
1441
+ "rewards/rejected": -2.7285823822021484,
1442
+ "step": 860
1443
+ },
1444
+ {
1445
+ "epoch": 0.91,
1446
+ "grad_norm": 8.52193537691758,
1447
+ "learning_rate": 1.1982873884064466e-07,
1448
+ "logits/chosen": 0.4872625768184662,
1449
+ "logits/rejected": 0.821652889251709,
1450
+ "logps/chosen": -409.1003723144531,
1451
+ "logps/rejected": -532.8604736328125,
1452
+ "loss": 0.5032,
1453
+ "rewards/accuracies": 0.7437499761581421,
1454
+ "rewards/chosen": -1.784326195716858,
1455
+ "rewards/margins": 1.051481008529663,
1456
+ "rewards/rejected": -2.8358073234558105,
1457
+ "step": 870
1458
+ },
1459
+ {
1460
+ "epoch": 0.92,
1461
+ "grad_norm": 6.240495703154818,
1462
+ "learning_rate": 9.345903713082305e-08,
1463
+ "logits/chosen": 0.43627986311912537,
1464
+ "logits/rejected": 0.9064377546310425,
1465
+ "logps/chosen": -447.2481994628906,
1466
+ "logps/rejected": -526.6138916015625,
1467
+ "loss": 0.4741,
1468
+ "rewards/accuracies": 0.768750011920929,
1469
+ "rewards/chosen": -1.7614307403564453,
1470
+ "rewards/margins": 1.0818777084350586,
1471
+ "rewards/rejected": -2.843308210372925,
1472
+ "step": 880
1473
+ },
1474
+ {
1475
+ "epoch": 0.93,
1476
+ "grad_norm": 7.1651137983571624,
1477
+ "learning_rate": 7.030787065396866e-08,
1478
+ "logits/chosen": 0.2825869917869568,
1479
+ "logits/rejected": 0.7335230112075806,
1480
+ "logps/chosen": -467.37469482421875,
1481
+ "logps/rejected": -523.0452880859375,
1482
+ "loss": 0.4941,
1483
+ "rewards/accuracies": 0.6625000238418579,
1484
+ "rewards/chosen": -1.9135255813598633,
1485
+ "rewards/margins": 0.7427676320075989,
1486
+ "rewards/rejected": -2.6562931537628174,
1487
+ "step": 890
1488
+ },
1489
+ {
1490
+ "epoch": 0.94,
1491
+ "grad_norm": 7.718450007612897,
1492
+ "learning_rate": 5.0406202043228604e-08,
1493
+ "logits/chosen": 0.40967661142349243,
1494
+ "logits/rejected": 0.6366448998451233,
1495
+ "logps/chosen": -437.923095703125,
1496
+ "logps/rejected": -538.6532592773438,
1497
+ "loss": 0.4866,
1498
+ "rewards/accuracies": 0.8187500238418579,
1499
+ "rewards/chosen": -1.7436615228652954,
1500
+ "rewards/margins": 0.9877888560295105,
1501
+ "rewards/rejected": -2.731450319290161,
1502
+ "step": 900
1503
+ },
1504
+ {
1505
+ "epoch": 0.94,
1506
+ "eval_logits/chosen": 0.5257142782211304,
1507
+ "eval_logits/rejected": 0.935259997844696,
1508
+ "eval_logps/chosen": -446.6502990722656,
1509
+ "eval_logps/rejected": -535.5853881835938,
1510
+ "eval_loss": 0.49319854378700256,
1511
+ "eval_rewards/accuracies": 0.726190447807312,
1512
+ "eval_rewards/chosen": -1.79155433177948,
1513
+ "eval_rewards/margins": 0.9406552314758301,
1514
+ "eval_rewards/rejected": -2.7322094440460205,
1515
+ "eval_runtime": 363.747,
1516
+ "eval_samples_per_second": 5.498,
1517
+ "eval_steps_per_second": 0.173,
1518
+ "step": 900
1519
+ },
1520
+ {
1521
+ "epoch": 0.95,
1522
+ "grad_norm": 7.916275121810713,
1523
+ "learning_rate": 3.378064801637687e-08,
1524
+ "logits/chosen": 0.09373348951339722,
1525
+ "logits/rejected": 0.7054184675216675,
1526
+ "logps/chosen": -458.953369140625,
1527
+ "logps/rejected": -513.7935791015625,
1528
+ "loss": 0.5184,
1529
+ "rewards/accuracies": 0.706250011920929,
1530
+ "rewards/chosen": -1.85234797000885,
1531
+ "rewards/margins": 0.8529074788093567,
1532
+ "rewards/rejected": -2.7052555084228516,
1533
+ "step": 910
1534
+ },
1535
+ {
1536
+ "epoch": 0.96,
1537
+ "grad_norm": 6.7231952954282646,
1538
+ "learning_rate": 2.0453443778310766e-08,
1539
+ "logits/chosen": 0.3935597240924835,
1540
+ "logits/rejected": 0.7541261315345764,
1541
+ "logps/chosen": -428.7496643066406,
1542
+ "logps/rejected": -517.533935546875,
1543
+ "loss": 0.4802,
1544
+ "rewards/accuracies": 0.78125,
1545
+ "rewards/chosen": -1.6958158016204834,
1546
+ "rewards/margins": 1.0528090000152588,
1547
+ "rewards/rejected": -2.748624801635742,
1548
+ "step": 920
1549
+ },
1550
+ {
1551
+ "epoch": 0.97,
1552
+ "grad_norm": 6.355797363488898,
1553
+ "learning_rate": 1.0442413283435759e-08,
1554
+ "logits/chosen": 0.23696298897266388,
1555
+ "logits/rejected": 0.9657135009765625,
1556
+ "logps/chosen": -446.9468688964844,
1557
+ "logps/rejected": -511.3190002441406,
1558
+ "loss": 0.4773,
1559
+ "rewards/accuracies": 0.731249988079071,
1560
+ "rewards/chosen": -1.799220323562622,
1561
+ "rewards/margins": 0.9288120269775391,
1562
+ "rewards/rejected": -2.7280325889587402,
1563
+ "step": 930
1564
+ },
1565
+ {
1566
+ "epoch": 0.98,
1567
+ "grad_norm": 8.823642044844457,
1568
+ "learning_rate": 3.760945397705828e-09,
1569
+ "logits/chosen": 0.3560529053211212,
1570
+ "logits/rejected": 0.6830099821090698,
1571
+ "logps/chosen": -457.27587890625,
1572
+ "logps/rejected": -526.8748168945312,
1573
+ "loss": 0.513,
1574
+ "rewards/accuracies": 0.7124999761581421,
1575
+ "rewards/chosen": -1.8548386096954346,
1576
+ "rewards/margins": 0.7306956052780151,
1577
+ "rewards/rejected": -2.585533857345581,
1578
+ "step": 940
1579
+ },
1580
+ {
1581
+ "epoch": 0.99,
1582
+ "grad_norm": 7.015844125023096,
1583
+ "learning_rate": 4.1797599220405605e-10,
1584
+ "logits/chosen": 0.21756580471992493,
1585
+ "logits/rejected": 0.7274689078330994,
1586
+ "logps/chosen": -462.048583984375,
1587
+ "logps/rejected": -528.304443359375,
1588
+ "loss": 0.4919,
1589
+ "rewards/accuracies": 0.737500011920929,
1590
+ "rewards/chosen": -1.763311743736267,
1591
+ "rewards/margins": 0.9584550857543945,
1592
+ "rewards/rejected": -2.721766948699951,
1593
+ "step": 950
1594
+ },
1595
+ {
1596
+ "epoch": 1.0,
1597
+ "step": 955,
1598
+ "total_flos": 0.0,
1599
+ "train_loss": 0.5264237219126436,
1600
+ "train_runtime": 29801.8239,
1601
+ "train_samples_per_second": 2.051,
1602
+ "train_steps_per_second": 0.032
1603
+ }
1604
+ ],
1605
+ "logging_steps": 10,
1606
+ "max_steps": 955,
1607
+ "num_input_tokens_seen": 0,
1608
+ "num_train_epochs": 1,
1609
+ "save_steps": 100000,
1610
+ "total_flos": 0.0,
1611
+ "train_batch_size": 4,
1612
+ "trial_name": null,
1613
+ "trial_params": null
1614
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4fe19b748ca3fddccc7ee76272be28fd47fa1e74bf00b4cac21fa1ecd0626e9d
3
+ size 6200