Hanhpt23 commited on
Commit
49a0351
1 Parent(s): 6f46fae

End of training

Browse files
Files changed (7) hide show
  1. README.md +14 -6
  2. all_results.json +6 -6
  3. config.json +28 -28
  4. model.safetensors +1 -1
  5. train_results.json +6 -6
  6. trainer_state.json +2309 -32
  7. training_args.bin +1 -1
README.md CHANGED
@@ -22,7 +22,7 @@ model-index:
22
  metrics:
23
  - name: Accuracy
24
  type: accuracy
25
- value: 0.7
26
  ---
27
 
28
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -32,8 +32,8 @@ should probably proofread and complete it, then remove this comment. -->
32
 
33
  This model is a fine-tuned version of [microsoft/swin-large-patch4-window12-384-in22k](https://huggingface.co/microsoft/swin-large-patch4-window12-384-in22k) on the NIH-Xray dataset.
34
  It achieves the following results on the evaluation set:
35
- - Loss: 2.2480
36
- - Accuracy: 0.7
37
 
38
  ## Model description
39
 
@@ -61,14 +61,22 @@ The following hyperparameters were used during training:
61
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
62
  - lr_scheduler_type: linear
63
  - lr_scheduler_warmup_ratio: 0.1
64
- - num_epochs: 2
65
 
66
  ### Training results
67
 
68
  | Training Loss | Epoch | Step | Validation Loss | Accuracy |
69
  |:-------------:|:------:|:----:|:---------------:|:--------:|
70
- | No log | 1.0 | 1 | 2.6591 | 0.0 |
71
- | No log | 1.3333 | 2 | 2.2480 | 0.7 |
 
 
 
 
 
 
 
 
72
 
73
 
74
  ### Framework versions
 
22
  metrics:
23
  - name: Accuracy
24
  type: accuracy
25
+ value: 0.49376114081996436
26
  ---
27
 
28
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
32
 
33
  This model is a fine-tuned version of [microsoft/swin-large-patch4-window12-384-in22k](https://huggingface.co/microsoft/swin-large-patch4-window12-384-in22k) on the NIH-Xray dataset.
34
  It achieves the following results on the evaluation set:
35
+ - Loss: 3.7711
36
+ - Accuracy: 0.4938
37
 
38
  ## Model description
39
 
 
61
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
62
  - lr_scheduler_type: linear
63
  - lr_scheduler_warmup_ratio: 0.1
64
+ - num_epochs: 10
65
 
66
  ### Training results
67
 
68
  | Training Loss | Epoch | Step | Validation Loss | Accuracy |
69
  |:-------------:|:------:|:----:|:---------------:|:--------:|
70
+ | 1.8318 | 0.9984 | 315 | 1.7651 | 0.5437 |
71
+ | 1.6067 | 2.0 | 631 | 1.6393 | 0.5455 |
72
+ | 1.406 | 2.9984 | 946 | 1.6472 | 0.5490 |
73
+ | 1.3983 | 4.0 | 1262 | 1.7344 | 0.5455 |
74
+ | 0.7272 | 4.9984 | 1577 | 2.1283 | 0.5258 |
75
+ | 0.3975 | 6.0 | 1893 | 2.5229 | 0.5134 |
76
+ | 0.2648 | 6.9984 | 2208 | 3.0333 | 0.5080 |
77
+ | 0.1232 | 8.0 | 2524 | 3.4626 | 0.5241 |
78
+ | 0.0873 | 8.9984 | 2839 | 3.6219 | 0.5027 |
79
+ | 0.0554 | 9.9842 | 3150 | 3.7711 | 0.4938 |
80
 
81
 
82
  ### Framework versions
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 1.3333333333333333,
3
- "total_flos": 7254221177290752.0,
4
- "train_loss": 1.3116413354873657,
5
- "train_runtime": 24.7689,
6
- "train_samples_per_second": 0.807,
7
- "train_steps_per_second": 0.081
8
  }
 
1
  {
2
+ "epoch": 9.984152139461173,
3
+ "total_flos": 2.6101205954547646e+19,
4
+ "train_loss": 0.7937991834822156,
5
+ "train_runtime": 16586.4479,
6
+ "train_samples_per_second": 3.042,
7
+ "train_steps_per_second": 0.19
8
  }
config.json CHANGED
@@ -17,40 +17,40 @@
17
  "hidden_dropout_prob": 0.0,
18
  "hidden_size": 1536,
19
  "id2label": {
20
- "0": "Pneumonia",
21
- "1": "Emphysema",
22
- "2": "Fibrosis",
23
  "3": "No Finding",
24
- "4": "Pleural_Thickening",
25
- "5": "Cardiomegaly",
26
- "6": "Consolidation",
27
- "7": "Hernia",
28
- "8": "Effusion",
29
- "9": "Nodule",
30
- "10": "Pneumothorax",
31
- "11": "Edema",
32
- "12": "Mass",
33
- "13": "Infiltration",
34
- "14": "Atelectasis"
35
  },
36
  "image_size": 384,
37
  "initializer_range": 0.02,
38
  "label2id": {
39
- "Atelectasis": 14,
40
- "Cardiomegaly": 5,
41
- "Consolidation": 6,
42
- "Edema": 11,
43
- "Effusion": 8,
44
- "Emphysema": 1,
45
- "Fibrosis": 2,
46
- "Hernia": 7,
47
- "Infiltration": 13,
48
- "Mass": 12,
49
  "No Finding": 3,
50
- "Nodule": 9,
51
- "Pleural_Thickening": 4,
52
- "Pneumonia": 0,
53
- "Pneumothorax": 10
54
  },
55
  "layer_norm_eps": 1e-05,
56
  "mlp_ratio": 4.0,
 
17
  "hidden_dropout_prob": 0.0,
18
  "hidden_size": 1536,
19
  "id2label": {
20
+ "0": "Pleural_Thickening",
21
+ "1": "Pneumothorax",
22
+ "2": "Effusion",
23
  "3": "No Finding",
24
+ "4": "Infiltration",
25
+ "5": "Mass",
26
+ "6": "Nodule",
27
+ "7": "Emphysema",
28
+ "8": "Edema",
29
+ "9": "Fibrosis",
30
+ "10": "Cardiomegaly",
31
+ "11": "Atelectasis",
32
+ "12": "Pneumonia",
33
+ "13": "Hernia",
34
+ "14": "Consolidation"
35
  },
36
  "image_size": 384,
37
  "initializer_range": 0.02,
38
  "label2id": {
39
+ "Atelectasis": 11,
40
+ "Cardiomegaly": 10,
41
+ "Consolidation": 14,
42
+ "Edema": 8,
43
+ "Effusion": 2,
44
+ "Emphysema": 7,
45
+ "Fibrosis": 9,
46
+ "Hernia": 13,
47
+ "Infiltration": 4,
48
+ "Mass": 5,
49
  "No Finding": 3,
50
+ "Nodule": 6,
51
+ "Pleural_Thickening": 0,
52
+ "Pneumonia": 12,
53
+ "Pneumothorax": 1
54
  },
55
  "layer_norm_eps": 1e-05,
56
  "mlp_ratio": 4.0,
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:01cd6b4dc77846366ae16e29d21e641d56c9ef171537227f9ecbfc83c8a1ceb9
3
  size 784924668
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1daf5f592fb81d4eaa196e6f9effdf921903e8b0e265602e423583bae3acb1b4
3
  size 784924668
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 1.3333333333333333,
3
- "total_flos": 7254221177290752.0,
4
- "train_loss": 1.3116413354873657,
5
- "train_runtime": 24.7689,
6
- "train_samples_per_second": 0.807,
7
- "train_steps_per_second": 0.081
8
  }
 
1
  {
2
+ "epoch": 9.984152139461173,
3
+ "total_flos": 2.6101205954547646e+19,
4
+ "train_loss": 0.7937991834822156,
5
+ "train_runtime": 16586.4479,
6
+ "train_samples_per_second": 3.042,
7
+ "train_steps_per_second": 0.19
8
  }
trainer_state.json CHANGED
@@ -1,45 +1,2322 @@
1
  {
2
- "best_metric": 0.7,
3
- "best_model_checkpoint": "SwinLarge/checkpoint-2",
4
- "epoch": 1.3333333333333333,
5
  "eval_steps": 500,
6
- "global_step": 2,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 1.0,
13
- "eval_accuracy": 0.0,
14
- "eval_loss": 2.659140110015869,
15
- "eval_runtime": 2.803,
16
- "eval_samples_per_second": 3.568,
17
- "eval_steps_per_second": 1.07,
18
- "step": 1
19
- },
20
- {
21
- "epoch": 1.3333333333333333,
22
- "eval_accuracy": 0.7,
23
- "eval_loss": 2.24800443649292,
24
- "eval_runtime": 2.8186,
25
- "eval_samples_per_second": 3.548,
26
- "eval_steps_per_second": 1.064,
27
- "step": 2
28
- },
29
- {
30
- "epoch": 1.3333333333333333,
31
- "step": 2,
32
- "total_flos": 7254221177290752.0,
33
- "train_loss": 1.3116413354873657,
34
- "train_runtime": 24.7689,
35
- "train_samples_per_second": 0.807,
36
- "train_steps_per_second": 0.081
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  }
38
  ],
39
  "logging_steps": 10,
40
- "max_steps": 2,
41
  "num_input_tokens_seen": 0,
42
- "num_train_epochs": 2,
43
  "save_steps": 500,
44
  "stateful_callbacks": {
45
  "TrainerControl": {
@@ -53,7 +2330,7 @@
53
  "attributes": {}
54
  }
55
  },
56
- "total_flos": 7254221177290752.0,
57
  "train_batch_size": 4,
58
  "trial_name": null,
59
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.5490196078431373,
3
+ "best_model_checkpoint": "SwinLarge/checkpoint-946",
4
+ "epoch": 9.984152139461173,
5
  "eval_steps": 500,
6
+ "global_step": 3150,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.03169572107765452,
13
+ "grad_norm": 8.620658874511719,
14
+ "learning_rate": 1.5873015873015873e-06,
15
+ "loss": 2.6801,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.06339144215530904,
20
+ "grad_norm": 10.387493133544922,
21
+ "learning_rate": 3.1746031746031746e-06,
22
+ "loss": 2.5071,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.09508716323296355,
27
+ "grad_norm": 11.921625137329102,
28
+ "learning_rate": 4.7619047619047615e-06,
29
+ "loss": 2.3099,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.12678288431061807,
34
+ "grad_norm": 7.416236877441406,
35
+ "learning_rate": 6.349206349206349e-06,
36
+ "loss": 2.0759,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.15847860538827258,
41
+ "grad_norm": 10.799355506896973,
42
+ "learning_rate": 7.936507936507936e-06,
43
+ "loss": 1.9247,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.1901743264659271,
48
+ "grad_norm": 13.448376655578613,
49
+ "learning_rate": 9.523809523809523e-06,
50
+ "loss": 1.7113,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.2218700475435816,
55
+ "grad_norm": 10.010295867919922,
56
+ "learning_rate": 1.1111111111111112e-05,
57
+ "loss": 1.8997,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 0.25356576862123614,
62
+ "grad_norm": 7.308131217956543,
63
+ "learning_rate": 1.2698412698412699e-05,
64
+ "loss": 1.8577,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.28526148969889065,
69
+ "grad_norm": 7.589808464050293,
70
+ "learning_rate": 1.4285714285714285e-05,
71
+ "loss": 1.5864,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 0.31695721077654515,
76
+ "grad_norm": 32.497920989990234,
77
+ "learning_rate": 1.5873015873015872e-05,
78
+ "loss": 1.5627,
79
+ "step": 100
80
+ },
81
+ {
82
+ "epoch": 0.3486529318541997,
83
+ "grad_norm": 8.728975296020508,
84
+ "learning_rate": 1.746031746031746e-05,
85
+ "loss": 2.0147,
86
+ "step": 110
87
+ },
88
+ {
89
+ "epoch": 0.3803486529318542,
90
+ "grad_norm": 11.442461013793945,
91
+ "learning_rate": 1.9047619047619046e-05,
92
+ "loss": 1.8042,
93
+ "step": 120
94
+ },
95
+ {
96
+ "epoch": 0.4120443740095087,
97
+ "grad_norm": 7.592460632324219,
98
+ "learning_rate": 2.0634920634920636e-05,
99
+ "loss": 1.5589,
100
+ "step": 130
101
+ },
102
+ {
103
+ "epoch": 0.4437400950871632,
104
+ "grad_norm": 8.4674711227417,
105
+ "learning_rate": 2.2222222222222223e-05,
106
+ "loss": 1.8606,
107
+ "step": 140
108
+ },
109
+ {
110
+ "epoch": 0.4754358161648177,
111
+ "grad_norm": 5.960061073303223,
112
+ "learning_rate": 2.380952380952381e-05,
113
+ "loss": 1.5667,
114
+ "step": 150
115
+ },
116
+ {
117
+ "epoch": 0.5071315372424723,
118
+ "grad_norm": 6.8911333084106445,
119
+ "learning_rate": 2.5396825396825397e-05,
120
+ "loss": 1.6166,
121
+ "step": 160
122
+ },
123
+ {
124
+ "epoch": 0.5388272583201268,
125
+ "grad_norm": 5.806614875793457,
126
+ "learning_rate": 2.6984126984126984e-05,
127
+ "loss": 1.5221,
128
+ "step": 170
129
+ },
130
+ {
131
+ "epoch": 0.5705229793977813,
132
+ "grad_norm": 13.649744033813477,
133
+ "learning_rate": 2.857142857142857e-05,
134
+ "loss": 1.6278,
135
+ "step": 180
136
+ },
137
+ {
138
+ "epoch": 0.6022187004754358,
139
+ "grad_norm": 6.668085098266602,
140
+ "learning_rate": 3.0158730158730158e-05,
141
+ "loss": 1.7091,
142
+ "step": 190
143
+ },
144
+ {
145
+ "epoch": 0.6339144215530903,
146
+ "grad_norm": 7.859160900115967,
147
+ "learning_rate": 3.1746031746031745e-05,
148
+ "loss": 1.5582,
149
+ "step": 200
150
+ },
151
+ {
152
+ "epoch": 0.6656101426307448,
153
+ "grad_norm": 4.802083492279053,
154
+ "learning_rate": 3.3333333333333335e-05,
155
+ "loss": 1.4414,
156
+ "step": 210
157
+ },
158
+ {
159
+ "epoch": 0.6973058637083994,
160
+ "grad_norm": 15.605195045471191,
161
+ "learning_rate": 3.492063492063492e-05,
162
+ "loss": 1.7171,
163
+ "step": 220
164
+ },
165
+ {
166
+ "epoch": 0.7290015847860539,
167
+ "grad_norm": 7.946794033050537,
168
+ "learning_rate": 3.650793650793651e-05,
169
+ "loss": 1.738,
170
+ "step": 230
171
+ },
172
+ {
173
+ "epoch": 0.7606973058637084,
174
+ "grad_norm": 7.9303364753723145,
175
+ "learning_rate": 3.809523809523809e-05,
176
+ "loss": 1.5497,
177
+ "step": 240
178
+ },
179
+ {
180
+ "epoch": 0.7923930269413629,
181
+ "grad_norm": 6.65696382522583,
182
+ "learning_rate": 3.968253968253968e-05,
183
+ "loss": 1.7273,
184
+ "step": 250
185
+ },
186
+ {
187
+ "epoch": 0.8240887480190174,
188
+ "grad_norm": 4.192816734313965,
189
+ "learning_rate": 4.126984126984127e-05,
190
+ "loss": 1.8298,
191
+ "step": 260
192
+ },
193
+ {
194
+ "epoch": 0.8557844690966719,
195
+ "grad_norm": 6.851734161376953,
196
+ "learning_rate": 4.2857142857142856e-05,
197
+ "loss": 1.728,
198
+ "step": 270
199
+ },
200
+ {
201
+ "epoch": 0.8874801901743264,
202
+ "grad_norm": 8.173200607299805,
203
+ "learning_rate": 4.4444444444444447e-05,
204
+ "loss": 1.5258,
205
+ "step": 280
206
+ },
207
+ {
208
+ "epoch": 0.919175911251981,
209
+ "grad_norm": 5.898636817932129,
210
+ "learning_rate": 4.603174603174603e-05,
211
+ "loss": 1.817,
212
+ "step": 290
213
+ },
214
+ {
215
+ "epoch": 0.9508716323296355,
216
+ "grad_norm": 7.8518452644348145,
217
+ "learning_rate": 4.761904761904762e-05,
218
+ "loss": 1.6367,
219
+ "step": 300
220
+ },
221
+ {
222
+ "epoch": 0.9825673534072901,
223
+ "grad_norm": 4.014355659484863,
224
+ "learning_rate": 4.9206349206349204e-05,
225
+ "loss": 1.8318,
226
+ "step": 310
227
+ },
228
+ {
229
+ "epoch": 0.9984152139461173,
230
+ "eval_accuracy": 0.5436720142602496,
231
+ "eval_loss": 1.7651361227035522,
232
+ "eval_runtime": 149.0995,
233
+ "eval_samples_per_second": 3.763,
234
+ "eval_steps_per_second": 0.946,
235
+ "step": 315
236
+ },
237
+ {
238
+ "epoch": 1.0142630744849446,
239
+ "grad_norm": 5.769894599914551,
240
+ "learning_rate": 4.991181657848325e-05,
241
+ "loss": 1.5269,
242
+ "step": 320
243
+ },
244
+ {
245
+ "epoch": 1.045958795562599,
246
+ "grad_norm": 4.770431041717529,
247
+ "learning_rate": 4.973544973544973e-05,
248
+ "loss": 1.71,
249
+ "step": 330
250
+ },
251
+ {
252
+ "epoch": 1.0776545166402536,
253
+ "grad_norm": 3.420660972595215,
254
+ "learning_rate": 4.955908289241622e-05,
255
+ "loss": 1.856,
256
+ "step": 340
257
+ },
258
+ {
259
+ "epoch": 1.109350237717908,
260
+ "grad_norm": 4.487371444702148,
261
+ "learning_rate": 4.938271604938271e-05,
262
+ "loss": 1.5407,
263
+ "step": 350
264
+ },
265
+ {
266
+ "epoch": 1.1410459587955626,
267
+ "grad_norm": 5.092081546783447,
268
+ "learning_rate": 4.9206349206349204e-05,
269
+ "loss": 1.6359,
270
+ "step": 360
271
+ },
272
+ {
273
+ "epoch": 1.172741679873217,
274
+ "grad_norm": 3.577042818069458,
275
+ "learning_rate": 4.9029982363315695e-05,
276
+ "loss": 1.6652,
277
+ "step": 370
278
+ },
279
+ {
280
+ "epoch": 1.2044374009508716,
281
+ "grad_norm": 3.7257213592529297,
282
+ "learning_rate": 4.8853615520282185e-05,
283
+ "loss": 1.5741,
284
+ "step": 380
285
+ },
286
+ {
287
+ "epoch": 1.236133122028526,
288
+ "grad_norm": 5.686126232147217,
289
+ "learning_rate": 4.8677248677248676e-05,
290
+ "loss": 1.6713,
291
+ "step": 390
292
+ },
293
+ {
294
+ "epoch": 1.2678288431061806,
295
+ "grad_norm": 4.3698577880859375,
296
+ "learning_rate": 4.850088183421517e-05,
297
+ "loss": 1.7232,
298
+ "step": 400
299
+ },
300
+ {
301
+ "epoch": 1.299524564183835,
302
+ "grad_norm": 6.864264965057373,
303
+ "learning_rate": 4.832451499118166e-05,
304
+ "loss": 1.6318,
305
+ "step": 410
306
+ },
307
+ {
308
+ "epoch": 1.3312202852614896,
309
+ "grad_norm": 3.44631290435791,
310
+ "learning_rate": 4.814814814814815e-05,
311
+ "loss": 1.5783,
312
+ "step": 420
313
+ },
314
+ {
315
+ "epoch": 1.3629160063391441,
316
+ "grad_norm": 11.141667366027832,
317
+ "learning_rate": 4.797178130511464e-05,
318
+ "loss": 1.5381,
319
+ "step": 430
320
+ },
321
+ {
322
+ "epoch": 1.3946117274167986,
323
+ "grad_norm": 4.2202959060668945,
324
+ "learning_rate": 4.779541446208113e-05,
325
+ "loss": 1.5854,
326
+ "step": 440
327
+ },
328
+ {
329
+ "epoch": 1.4263074484944531,
330
+ "grad_norm": 4.3586039543151855,
331
+ "learning_rate": 4.761904761904762e-05,
332
+ "loss": 1.7901,
333
+ "step": 450
334
+ },
335
+ {
336
+ "epoch": 1.4580031695721076,
337
+ "grad_norm": 4.2103352546691895,
338
+ "learning_rate": 4.744268077601411e-05,
339
+ "loss": 1.4986,
340
+ "step": 460
341
+ },
342
+ {
343
+ "epoch": 1.4896988906497624,
344
+ "grad_norm": 5.0563225746154785,
345
+ "learning_rate": 4.72663139329806e-05,
346
+ "loss": 1.5424,
347
+ "step": 470
348
+ },
349
+ {
350
+ "epoch": 1.5213946117274166,
351
+ "grad_norm": 4.409386157989502,
352
+ "learning_rate": 4.708994708994709e-05,
353
+ "loss": 1.4963,
354
+ "step": 480
355
+ },
356
+ {
357
+ "epoch": 1.5530903328050714,
358
+ "grad_norm": 6.303650856018066,
359
+ "learning_rate": 4.691358024691358e-05,
360
+ "loss": 1.5943,
361
+ "step": 490
362
+ },
363
+ {
364
+ "epoch": 1.5847860538827259,
365
+ "grad_norm": 4.192474842071533,
366
+ "learning_rate": 4.673721340388007e-05,
367
+ "loss": 1.6115,
368
+ "step": 500
369
+ },
370
+ {
371
+ "epoch": 1.6164817749603804,
372
+ "grad_norm": 4.521521091461182,
373
+ "learning_rate": 4.656084656084656e-05,
374
+ "loss": 1.5737,
375
+ "step": 510
376
+ },
377
+ {
378
+ "epoch": 1.6481774960380349,
379
+ "grad_norm": 8.733231544494629,
380
+ "learning_rate": 4.638447971781305e-05,
381
+ "loss": 1.5708,
382
+ "step": 520
383
+ },
384
+ {
385
+ "epoch": 1.6798732171156894,
386
+ "grad_norm": 5.1407575607299805,
387
+ "learning_rate": 4.620811287477954e-05,
388
+ "loss": 1.5751,
389
+ "step": 530
390
+ },
391
+ {
392
+ "epoch": 1.7115689381933439,
393
+ "grad_norm": 4.215906143188477,
394
+ "learning_rate": 4.603174603174603e-05,
395
+ "loss": 1.5678,
396
+ "step": 540
397
+ },
398
+ {
399
+ "epoch": 1.7432646592709984,
400
+ "grad_norm": 7.169848918914795,
401
+ "learning_rate": 4.585537918871252e-05,
402
+ "loss": 1.6263,
403
+ "step": 550
404
+ },
405
+ {
406
+ "epoch": 1.7749603803486529,
407
+ "grad_norm": 5.950140953063965,
408
+ "learning_rate": 4.567901234567901e-05,
409
+ "loss": 1.5155,
410
+ "step": 560
411
+ },
412
+ {
413
+ "epoch": 1.8066561014263076,
414
+ "grad_norm": 6.070963382720947,
415
+ "learning_rate": 4.55026455026455e-05,
416
+ "loss": 1.6662,
417
+ "step": 570
418
+ },
419
+ {
420
+ "epoch": 1.8383518225039621,
421
+ "grad_norm": 5.032849311828613,
422
+ "learning_rate": 4.532627865961199e-05,
423
+ "loss": 1.6791,
424
+ "step": 580
425
+ },
426
+ {
427
+ "epoch": 1.8700475435816166,
428
+ "grad_norm": 6.853456020355225,
429
+ "learning_rate": 4.5149911816578484e-05,
430
+ "loss": 1.6334,
431
+ "step": 590
432
+ },
433
+ {
434
+ "epoch": 1.9017432646592711,
435
+ "grad_norm": 4.782254219055176,
436
+ "learning_rate": 4.4973544973544974e-05,
437
+ "loss": 1.6209,
438
+ "step": 600
439
+ },
440
+ {
441
+ "epoch": 1.9334389857369256,
442
+ "grad_norm": 7.97931432723999,
443
+ "learning_rate": 4.4797178130511465e-05,
444
+ "loss": 1.8784,
445
+ "step": 610
446
+ },
447
+ {
448
+ "epoch": 1.9651347068145801,
449
+ "grad_norm": 3.8500170707702637,
450
+ "learning_rate": 4.4620811287477956e-05,
451
+ "loss": 1.721,
452
+ "step": 620
453
+ },
454
+ {
455
+ "epoch": 1.9968304278922346,
456
+ "grad_norm": 3.8997135162353516,
457
+ "learning_rate": 4.4444444444444447e-05,
458
+ "loss": 1.6067,
459
+ "step": 630
460
+ },
461
+ {
462
+ "epoch": 2.0,
463
+ "eval_accuracy": 0.5454545454545454,
464
+ "eval_loss": 1.6392648220062256,
465
+ "eval_runtime": 147.9577,
466
+ "eval_samples_per_second": 3.792,
467
+ "eval_steps_per_second": 0.953,
468
+ "step": 631
469
+ },
470
+ {
471
+ "epoch": 2.028526148969889,
472
+ "grad_norm": 4.185525894165039,
473
+ "learning_rate": 4.426807760141094e-05,
474
+ "loss": 1.4786,
475
+ "step": 640
476
+ },
477
+ {
478
+ "epoch": 2.0602218700475436,
479
+ "grad_norm": 5.7630391120910645,
480
+ "learning_rate": 4.409171075837743e-05,
481
+ "loss": 1.3732,
482
+ "step": 650
483
+ },
484
+ {
485
+ "epoch": 2.091917591125198,
486
+ "grad_norm": 6.025218963623047,
487
+ "learning_rate": 4.391534391534391e-05,
488
+ "loss": 1.5661,
489
+ "step": 660
490
+ },
491
+ {
492
+ "epoch": 2.1236133122028527,
493
+ "grad_norm": 8.51380443572998,
494
+ "learning_rate": 4.37389770723104e-05,
495
+ "loss": 1.5385,
496
+ "step": 670
497
+ },
498
+ {
499
+ "epoch": 2.155309033280507,
500
+ "grad_norm": 7.913331985473633,
501
+ "learning_rate": 4.3562610229276893e-05,
502
+ "loss": 1.3816,
503
+ "step": 680
504
+ },
505
+ {
506
+ "epoch": 2.1870047543581617,
507
+ "grad_norm": 10.290911674499512,
508
+ "learning_rate": 4.3386243386243384e-05,
509
+ "loss": 1.5289,
510
+ "step": 690
511
+ },
512
+ {
513
+ "epoch": 2.218700475435816,
514
+ "grad_norm": 12.427474021911621,
515
+ "learning_rate": 4.3209876543209875e-05,
516
+ "loss": 1.3822,
517
+ "step": 700
518
+ },
519
+ {
520
+ "epoch": 2.2503961965134707,
521
+ "grad_norm": 8.04323959350586,
522
+ "learning_rate": 4.3033509700176366e-05,
523
+ "loss": 1.699,
524
+ "step": 710
525
+ },
526
+ {
527
+ "epoch": 2.282091917591125,
528
+ "grad_norm": 6.580540180206299,
529
+ "learning_rate": 4.2857142857142856e-05,
530
+ "loss": 1.488,
531
+ "step": 720
532
+ },
533
+ {
534
+ "epoch": 2.3137876386687797,
535
+ "grad_norm": 6.697164058685303,
536
+ "learning_rate": 4.268077601410935e-05,
537
+ "loss": 1.6108,
538
+ "step": 730
539
+ },
540
+ {
541
+ "epoch": 2.345483359746434,
542
+ "grad_norm": 8.002148628234863,
543
+ "learning_rate": 4.250440917107584e-05,
544
+ "loss": 1.4629,
545
+ "step": 740
546
+ },
547
+ {
548
+ "epoch": 2.3771790808240887,
549
+ "grad_norm": 6.550644874572754,
550
+ "learning_rate": 4.232804232804233e-05,
551
+ "loss": 1.3738,
552
+ "step": 750
553
+ },
554
+ {
555
+ "epoch": 2.408874801901743,
556
+ "grad_norm": 10.595471382141113,
557
+ "learning_rate": 4.215167548500882e-05,
558
+ "loss": 1.4171,
559
+ "step": 760
560
+ },
561
+ {
562
+ "epoch": 2.4405705229793977,
563
+ "grad_norm": 8.15519905090332,
564
+ "learning_rate": 4.197530864197531e-05,
565
+ "loss": 1.5467,
566
+ "step": 770
567
+ },
568
+ {
569
+ "epoch": 2.472266244057052,
570
+ "grad_norm": 5.568257808685303,
571
+ "learning_rate": 4.17989417989418e-05,
572
+ "loss": 1.3498,
573
+ "step": 780
574
+ },
575
+ {
576
+ "epoch": 2.5039619651347067,
577
+ "grad_norm": 5.846574783325195,
578
+ "learning_rate": 4.162257495590829e-05,
579
+ "loss": 1.4658,
580
+ "step": 790
581
+ },
582
+ {
583
+ "epoch": 2.535657686212361,
584
+ "grad_norm": 5.970884323120117,
585
+ "learning_rate": 4.144620811287478e-05,
586
+ "loss": 1.5717,
587
+ "step": 800
588
+ },
589
+ {
590
+ "epoch": 2.5673534072900157,
591
+ "grad_norm": 10.46259593963623,
592
+ "learning_rate": 4.126984126984127e-05,
593
+ "loss": 1.5871,
594
+ "step": 810
595
+ },
596
+ {
597
+ "epoch": 2.59904912836767,
598
+ "grad_norm": 6.0144267082214355,
599
+ "learning_rate": 4.109347442680776e-05,
600
+ "loss": 1.5136,
601
+ "step": 820
602
+ },
603
+ {
604
+ "epoch": 2.6307448494453247,
605
+ "grad_norm": 7.098198890686035,
606
+ "learning_rate": 4.091710758377425e-05,
607
+ "loss": 1.5325,
608
+ "step": 830
609
+ },
610
+ {
611
+ "epoch": 2.662440570522979,
612
+ "grad_norm": 6.445565700531006,
613
+ "learning_rate": 4.074074074074074e-05,
614
+ "loss": 1.4732,
615
+ "step": 840
616
+ },
617
+ {
618
+ "epoch": 2.6941362916006337,
619
+ "grad_norm": 7.529149532318115,
620
+ "learning_rate": 4.056437389770723e-05,
621
+ "loss": 1.4529,
622
+ "step": 850
623
+ },
624
+ {
625
+ "epoch": 2.7258320126782882,
626
+ "grad_norm": 5.447256088256836,
627
+ "learning_rate": 4.038800705467372e-05,
628
+ "loss": 1.3855,
629
+ "step": 860
630
+ },
631
+ {
632
+ "epoch": 2.7575277337559427,
633
+ "grad_norm": 5.000948429107666,
634
+ "learning_rate": 4.021164021164021e-05,
635
+ "loss": 1.248,
636
+ "step": 870
637
+ },
638
+ {
639
+ "epoch": 2.7892234548335972,
640
+ "grad_norm": 5.39021110534668,
641
+ "learning_rate": 4.00352733686067e-05,
642
+ "loss": 1.7784,
643
+ "step": 880
644
+ },
645
+ {
646
+ "epoch": 2.8209191759112517,
647
+ "grad_norm": 5.756132125854492,
648
+ "learning_rate": 3.985890652557319e-05,
649
+ "loss": 1.6598,
650
+ "step": 890
651
+ },
652
+ {
653
+ "epoch": 2.8526148969889062,
654
+ "grad_norm": 5.659241199493408,
655
+ "learning_rate": 3.968253968253968e-05,
656
+ "loss": 1.4138,
657
+ "step": 900
658
+ },
659
+ {
660
+ "epoch": 2.8843106180665607,
661
+ "grad_norm": 7.678089141845703,
662
+ "learning_rate": 3.950617283950617e-05,
663
+ "loss": 1.3823,
664
+ "step": 910
665
+ },
666
+ {
667
+ "epoch": 2.9160063391442153,
668
+ "grad_norm": 7.0097479820251465,
669
+ "learning_rate": 3.9329805996472664e-05,
670
+ "loss": 1.6495,
671
+ "step": 920
672
+ },
673
+ {
674
+ "epoch": 2.94770206022187,
675
+ "grad_norm": 7.138586521148682,
676
+ "learning_rate": 3.9153439153439155e-05,
677
+ "loss": 1.4128,
678
+ "step": 930
679
+ },
680
+ {
681
+ "epoch": 2.9793977812995247,
682
+ "grad_norm": 8.242341041564941,
683
+ "learning_rate": 3.8977072310405645e-05,
684
+ "loss": 1.406,
685
+ "step": 940
686
+ },
687
+ {
688
+ "epoch": 2.9984152139461173,
689
+ "eval_accuracy": 0.5490196078431373,
690
+ "eval_loss": 1.6471669673919678,
691
+ "eval_runtime": 147.6692,
692
+ "eval_samples_per_second": 3.799,
693
+ "eval_steps_per_second": 0.955,
694
+ "step": 946
695
+ },
696
+ {
697
+ "epoch": 3.011093502377179,
698
+ "grad_norm": 8.209331512451172,
699
+ "learning_rate": 3.8800705467372136e-05,
700
+ "loss": 1.4082,
701
+ "step": 950
702
+ },
703
+ {
704
+ "epoch": 3.0427892234548337,
705
+ "grad_norm": 6.750119209289551,
706
+ "learning_rate": 3.862433862433863e-05,
707
+ "loss": 1.1927,
708
+ "step": 960
709
+ },
710
+ {
711
+ "epoch": 3.074484944532488,
712
+ "grad_norm": 18.73226547241211,
713
+ "learning_rate": 3.844797178130512e-05,
714
+ "loss": 1.2399,
715
+ "step": 970
716
+ },
717
+ {
718
+ "epoch": 3.1061806656101427,
719
+ "grad_norm": 19.268539428710938,
720
+ "learning_rate": 3.82716049382716e-05,
721
+ "loss": 1.1511,
722
+ "step": 980
723
+ },
724
+ {
725
+ "epoch": 3.1378763866877972,
726
+ "grad_norm": 13.193881034851074,
727
+ "learning_rate": 3.809523809523809e-05,
728
+ "loss": 1.3139,
729
+ "step": 990
730
+ },
731
+ {
732
+ "epoch": 3.1695721077654517,
733
+ "grad_norm": 9.010955810546875,
734
+ "learning_rate": 3.791887125220458e-05,
735
+ "loss": 1.2482,
736
+ "step": 1000
737
+ },
738
+ {
739
+ "epoch": 3.2012678288431062,
740
+ "grad_norm": 6.472105026245117,
741
+ "learning_rate": 3.7742504409171074e-05,
742
+ "loss": 1.0941,
743
+ "step": 1010
744
+ },
745
+ {
746
+ "epoch": 3.2329635499207607,
747
+ "grad_norm": 14.444457054138184,
748
+ "learning_rate": 3.7566137566137564e-05,
749
+ "loss": 1.1845,
750
+ "step": 1020
751
+ },
752
+ {
753
+ "epoch": 3.2646592709984152,
754
+ "grad_norm": 9.034687995910645,
755
+ "learning_rate": 3.7389770723104055e-05,
756
+ "loss": 1.2469,
757
+ "step": 1030
758
+ },
759
+ {
760
+ "epoch": 3.2963549920760697,
761
+ "grad_norm": 10.975589752197266,
762
+ "learning_rate": 3.7213403880070546e-05,
763
+ "loss": 1.3324,
764
+ "step": 1040
765
+ },
766
+ {
767
+ "epoch": 3.3280507131537242,
768
+ "grad_norm": 9.528573036193848,
769
+ "learning_rate": 3.7037037037037037e-05,
770
+ "loss": 1.0736,
771
+ "step": 1050
772
+ },
773
+ {
774
+ "epoch": 3.3597464342313788,
775
+ "grad_norm": 9.448025703430176,
776
+ "learning_rate": 3.686067019400353e-05,
777
+ "loss": 1.2401,
778
+ "step": 1060
779
+ },
780
+ {
781
+ "epoch": 3.3914421553090333,
782
+ "grad_norm": 7.891082286834717,
783
+ "learning_rate": 3.668430335097002e-05,
784
+ "loss": 1.3604,
785
+ "step": 1070
786
+ },
787
+ {
788
+ "epoch": 3.4231378763866878,
789
+ "grad_norm": 12.8311185836792,
790
+ "learning_rate": 3.650793650793651e-05,
791
+ "loss": 1.2031,
792
+ "step": 1080
793
+ },
794
+ {
795
+ "epoch": 3.4548335974643423,
796
+ "grad_norm": 10.008551597595215,
797
+ "learning_rate": 3.6331569664903e-05,
798
+ "loss": 1.1598,
799
+ "step": 1090
800
+ },
801
+ {
802
+ "epoch": 3.4865293185419968,
803
+ "grad_norm": 10.480392456054688,
804
+ "learning_rate": 3.615520282186949e-05,
805
+ "loss": 1.3213,
806
+ "step": 1100
807
+ },
808
+ {
809
+ "epoch": 3.5182250396196513,
810
+ "grad_norm": 10.108732223510742,
811
+ "learning_rate": 3.597883597883598e-05,
812
+ "loss": 1.3552,
813
+ "step": 1110
814
+ },
815
+ {
816
+ "epoch": 3.5499207606973058,
817
+ "grad_norm": 9.28955364227295,
818
+ "learning_rate": 3.580246913580247e-05,
819
+ "loss": 1.1861,
820
+ "step": 1120
821
+ },
822
+ {
823
+ "epoch": 3.5816164817749603,
824
+ "grad_norm": 9.186285972595215,
825
+ "learning_rate": 3.562610229276896e-05,
826
+ "loss": 1.1303,
827
+ "step": 1130
828
+ },
829
+ {
830
+ "epoch": 3.613312202852615,
831
+ "grad_norm": 9.3871488571167,
832
+ "learning_rate": 3.5449735449735446e-05,
833
+ "loss": 1.2186,
834
+ "step": 1140
835
+ },
836
+ {
837
+ "epoch": 3.6450079239302693,
838
+ "grad_norm": 14.419343948364258,
839
+ "learning_rate": 3.527336860670194e-05,
840
+ "loss": 1.3413,
841
+ "step": 1150
842
+ },
843
+ {
844
+ "epoch": 3.676703645007924,
845
+ "grad_norm": 8.462416648864746,
846
+ "learning_rate": 3.509700176366843e-05,
847
+ "loss": 1.2544,
848
+ "step": 1160
849
+ },
850
+ {
851
+ "epoch": 3.7083993660855783,
852
+ "grad_norm": 6.36189079284668,
853
+ "learning_rate": 3.492063492063492e-05,
854
+ "loss": 1.0951,
855
+ "step": 1170
856
+ },
857
+ {
858
+ "epoch": 3.740095087163233,
859
+ "grad_norm": 16.61732292175293,
860
+ "learning_rate": 3.474426807760141e-05,
861
+ "loss": 1.1044,
862
+ "step": 1180
863
+ },
864
+ {
865
+ "epoch": 3.7717908082408877,
866
+ "grad_norm": 9.842411041259766,
867
+ "learning_rate": 3.45679012345679e-05,
868
+ "loss": 1.2549,
869
+ "step": 1190
870
+ },
871
+ {
872
+ "epoch": 3.8034865293185423,
873
+ "grad_norm": 16.95412826538086,
874
+ "learning_rate": 3.439153439153439e-05,
875
+ "loss": 1.2285,
876
+ "step": 1200
877
+ },
878
+ {
879
+ "epoch": 3.8351822503961968,
880
+ "grad_norm": 16.060531616210938,
881
+ "learning_rate": 3.421516754850088e-05,
882
+ "loss": 1.3635,
883
+ "step": 1210
884
+ },
885
+ {
886
+ "epoch": 3.8668779714738513,
887
+ "grad_norm": 12.322091102600098,
888
+ "learning_rate": 3.403880070546737e-05,
889
+ "loss": 1.2121,
890
+ "step": 1220
891
+ },
892
+ {
893
+ "epoch": 3.8985736925515058,
894
+ "grad_norm": 10.384512901306152,
895
+ "learning_rate": 3.386243386243386e-05,
896
+ "loss": 1.321,
897
+ "step": 1230
898
+ },
899
+ {
900
+ "epoch": 3.9302694136291603,
901
+ "grad_norm": 10.984822273254395,
902
+ "learning_rate": 3.3686067019400353e-05,
903
+ "loss": 1.1268,
904
+ "step": 1240
905
+ },
906
+ {
907
+ "epoch": 3.9619651347068148,
908
+ "grad_norm": 12.061741828918457,
909
+ "learning_rate": 3.3509700176366844e-05,
910
+ "loss": 1.1616,
911
+ "step": 1250
912
+ },
913
+ {
914
+ "epoch": 3.9936608557844693,
915
+ "grad_norm": 13.65400505065918,
916
+ "learning_rate": 3.3333333333333335e-05,
917
+ "loss": 1.3983,
918
+ "step": 1260
919
+ },
920
+ {
921
+ "epoch": 4.0,
922
+ "eval_accuracy": 0.5454545454545454,
923
+ "eval_loss": 1.7344454526901245,
924
+ "eval_runtime": 147.3715,
925
+ "eval_samples_per_second": 3.807,
926
+ "eval_steps_per_second": 0.957,
927
+ "step": 1262
928
+ },
929
+ {
930
+ "epoch": 4.025356576862124,
931
+ "grad_norm": 14.69643497467041,
932
+ "learning_rate": 3.3156966490299826e-05,
933
+ "loss": 0.8817,
934
+ "step": 1270
935
+ },
936
+ {
937
+ "epoch": 4.057052297939778,
938
+ "grad_norm": 16.983091354370117,
939
+ "learning_rate": 3.2980599647266316e-05,
940
+ "loss": 0.9822,
941
+ "step": 1280
942
+ },
943
+ {
944
+ "epoch": 4.088748019017433,
945
+ "grad_norm": 7.896317481994629,
946
+ "learning_rate": 3.280423280423281e-05,
947
+ "loss": 0.9038,
948
+ "step": 1290
949
+ },
950
+ {
951
+ "epoch": 4.120443740095087,
952
+ "grad_norm": 19.647830963134766,
953
+ "learning_rate": 3.262786596119929e-05,
954
+ "loss": 0.9003,
955
+ "step": 1300
956
+ },
957
+ {
958
+ "epoch": 4.152139461172742,
959
+ "grad_norm": 14.798089981079102,
960
+ "learning_rate": 3.245149911816578e-05,
961
+ "loss": 0.9009,
962
+ "step": 1310
963
+ },
964
+ {
965
+ "epoch": 4.183835182250396,
966
+ "grad_norm": 21.833358764648438,
967
+ "learning_rate": 3.227513227513227e-05,
968
+ "loss": 0.832,
969
+ "step": 1320
970
+ },
971
+ {
972
+ "epoch": 4.215530903328051,
973
+ "grad_norm": 16.340986251831055,
974
+ "learning_rate": 3.209876543209876e-05,
975
+ "loss": 0.7249,
976
+ "step": 1330
977
+ },
978
+ {
979
+ "epoch": 4.247226624405705,
980
+ "grad_norm": 13.776719093322754,
981
+ "learning_rate": 3.1922398589065254e-05,
982
+ "loss": 0.782,
983
+ "step": 1340
984
+ },
985
+ {
986
+ "epoch": 4.27892234548336,
987
+ "grad_norm": 22.321552276611328,
988
+ "learning_rate": 3.1746031746031745e-05,
989
+ "loss": 0.7917,
990
+ "step": 1350
991
+ },
992
+ {
993
+ "epoch": 4.310618066561014,
994
+ "grad_norm": 13.473883628845215,
995
+ "learning_rate": 3.1569664902998235e-05,
996
+ "loss": 0.7287,
997
+ "step": 1360
998
+ },
999
+ {
1000
+ "epoch": 4.342313787638669,
1001
+ "grad_norm": 21.802818298339844,
1002
+ "learning_rate": 3.1393298059964726e-05,
1003
+ "loss": 0.7062,
1004
+ "step": 1370
1005
+ },
1006
+ {
1007
+ "epoch": 4.374009508716323,
1008
+ "grad_norm": 16.524831771850586,
1009
+ "learning_rate": 3.121693121693122e-05,
1010
+ "loss": 0.8148,
1011
+ "step": 1380
1012
+ },
1013
+ {
1014
+ "epoch": 4.405705229793978,
1015
+ "grad_norm": 16.520061492919922,
1016
+ "learning_rate": 3.104056437389771e-05,
1017
+ "loss": 0.8808,
1018
+ "step": 1390
1019
+ },
1020
+ {
1021
+ "epoch": 4.437400950871632,
1022
+ "grad_norm": 22.931140899658203,
1023
+ "learning_rate": 3.08641975308642e-05,
1024
+ "loss": 0.7967,
1025
+ "step": 1400
1026
+ },
1027
+ {
1028
+ "epoch": 4.469096671949287,
1029
+ "grad_norm": 14.37611198425293,
1030
+ "learning_rate": 3.068783068783069e-05,
1031
+ "loss": 0.7436,
1032
+ "step": 1410
1033
+ },
1034
+ {
1035
+ "epoch": 4.500792393026941,
1036
+ "grad_norm": 15.375365257263184,
1037
+ "learning_rate": 3.0511463844797176e-05,
1038
+ "loss": 0.8335,
1039
+ "step": 1420
1040
+ },
1041
+ {
1042
+ "epoch": 4.532488114104596,
1043
+ "grad_norm": 13.653996467590332,
1044
+ "learning_rate": 3.0335097001763667e-05,
1045
+ "loss": 0.8232,
1046
+ "step": 1430
1047
+ },
1048
+ {
1049
+ "epoch": 4.56418383518225,
1050
+ "grad_norm": 28.006961822509766,
1051
+ "learning_rate": 3.0158730158730158e-05,
1052
+ "loss": 0.7759,
1053
+ "step": 1440
1054
+ },
1055
+ {
1056
+ "epoch": 4.595879556259905,
1057
+ "grad_norm": 7.853826522827148,
1058
+ "learning_rate": 2.998236331569665e-05,
1059
+ "loss": 0.898,
1060
+ "step": 1450
1061
+ },
1062
+ {
1063
+ "epoch": 4.627575277337559,
1064
+ "grad_norm": 14.124467849731445,
1065
+ "learning_rate": 2.980599647266314e-05,
1066
+ "loss": 0.9063,
1067
+ "step": 1460
1068
+ },
1069
+ {
1070
+ "epoch": 4.659270998415214,
1071
+ "grad_norm": 16.662132263183594,
1072
+ "learning_rate": 2.962962962962963e-05,
1073
+ "loss": 0.6931,
1074
+ "step": 1470
1075
+ },
1076
+ {
1077
+ "epoch": 4.690966719492868,
1078
+ "grad_norm": 17.079029083251953,
1079
+ "learning_rate": 2.945326278659612e-05,
1080
+ "loss": 0.676,
1081
+ "step": 1480
1082
+ },
1083
+ {
1084
+ "epoch": 4.722662440570523,
1085
+ "grad_norm": 15.269068717956543,
1086
+ "learning_rate": 2.927689594356261e-05,
1087
+ "loss": 0.8302,
1088
+ "step": 1490
1089
+ },
1090
+ {
1091
+ "epoch": 4.754358161648177,
1092
+ "grad_norm": 11.715784072875977,
1093
+ "learning_rate": 2.91005291005291e-05,
1094
+ "loss": 0.8037,
1095
+ "step": 1500
1096
+ },
1097
+ {
1098
+ "epoch": 4.786053882725832,
1099
+ "grad_norm": 15.774462699890137,
1100
+ "learning_rate": 2.892416225749559e-05,
1101
+ "loss": 0.8735,
1102
+ "step": 1510
1103
+ },
1104
+ {
1105
+ "epoch": 4.817749603803486,
1106
+ "grad_norm": 18.308950424194336,
1107
+ "learning_rate": 2.874779541446208e-05,
1108
+ "loss": 0.6236,
1109
+ "step": 1520
1110
+ },
1111
+ {
1112
+ "epoch": 4.849445324881141,
1113
+ "grad_norm": 21.661277770996094,
1114
+ "learning_rate": 2.857142857142857e-05,
1115
+ "loss": 0.8501,
1116
+ "step": 1530
1117
+ },
1118
+ {
1119
+ "epoch": 4.881141045958795,
1120
+ "grad_norm": 10.561766624450684,
1121
+ "learning_rate": 2.839506172839506e-05,
1122
+ "loss": 0.8987,
1123
+ "step": 1540
1124
+ },
1125
+ {
1126
+ "epoch": 4.91283676703645,
1127
+ "grad_norm": 17.965679168701172,
1128
+ "learning_rate": 2.8218694885361552e-05,
1129
+ "loss": 0.8309,
1130
+ "step": 1550
1131
+ },
1132
+ {
1133
+ "epoch": 4.944532488114104,
1134
+ "grad_norm": 15.322041511535645,
1135
+ "learning_rate": 2.8042328042328043e-05,
1136
+ "loss": 0.6263,
1137
+ "step": 1560
1138
+ },
1139
+ {
1140
+ "epoch": 4.976228209191759,
1141
+ "grad_norm": 8.305255889892578,
1142
+ "learning_rate": 2.7865961199294534e-05,
1143
+ "loss": 0.7272,
1144
+ "step": 1570
1145
+ },
1146
+ {
1147
+ "epoch": 4.998415213946117,
1148
+ "eval_accuracy": 0.5258467023172906,
1149
+ "eval_loss": 2.12831974029541,
1150
+ "eval_runtime": 147.5846,
1151
+ "eval_samples_per_second": 3.801,
1152
+ "eval_steps_per_second": 0.955,
1153
+ "step": 1577
1154
+ },
1155
+ {
1156
+ "epoch": 5.007923930269413,
1157
+ "grad_norm": 12.602760314941406,
1158
+ "learning_rate": 2.768959435626102e-05,
1159
+ "loss": 0.7548,
1160
+ "step": 1580
1161
+ },
1162
+ {
1163
+ "epoch": 5.039619651347068,
1164
+ "grad_norm": 10.254925727844238,
1165
+ "learning_rate": 2.7513227513227512e-05,
1166
+ "loss": 0.4662,
1167
+ "step": 1590
1168
+ },
1169
+ {
1170
+ "epoch": 5.071315372424722,
1171
+ "grad_norm": 6.985943794250488,
1172
+ "learning_rate": 2.7336860670194003e-05,
1173
+ "loss": 0.3352,
1174
+ "step": 1600
1175
+ },
1176
+ {
1177
+ "epoch": 5.103011093502377,
1178
+ "grad_norm": 5.529629230499268,
1179
+ "learning_rate": 2.7160493827160493e-05,
1180
+ "loss": 0.4111,
1181
+ "step": 1610
1182
+ },
1183
+ {
1184
+ "epoch": 5.134706814580031,
1185
+ "grad_norm": 26.3822021484375,
1186
+ "learning_rate": 2.6984126984126984e-05,
1187
+ "loss": 0.445,
1188
+ "step": 1620
1189
+ },
1190
+ {
1191
+ "epoch": 5.166402535657686,
1192
+ "grad_norm": 15.873011589050293,
1193
+ "learning_rate": 2.6807760141093475e-05,
1194
+ "loss": 0.4571,
1195
+ "step": 1630
1196
+ },
1197
+ {
1198
+ "epoch": 5.19809825673534,
1199
+ "grad_norm": 15.90845012664795,
1200
+ "learning_rate": 2.6631393298059965e-05,
1201
+ "loss": 0.6197,
1202
+ "step": 1640
1203
+ },
1204
+ {
1205
+ "epoch": 5.229793977812995,
1206
+ "grad_norm": 21.822021484375,
1207
+ "learning_rate": 2.6455026455026456e-05,
1208
+ "loss": 0.416,
1209
+ "step": 1650
1210
+ },
1211
+ {
1212
+ "epoch": 5.261489698890649,
1213
+ "grad_norm": 16.9212646484375,
1214
+ "learning_rate": 2.6278659611992943e-05,
1215
+ "loss": 0.3351,
1216
+ "step": 1660
1217
+ },
1218
+ {
1219
+ "epoch": 5.293185419968304,
1220
+ "grad_norm": 18.473758697509766,
1221
+ "learning_rate": 2.6102292768959434e-05,
1222
+ "loss": 0.4751,
1223
+ "step": 1670
1224
+ },
1225
+ {
1226
+ "epoch": 5.324881141045958,
1227
+ "grad_norm": 19.178316116333008,
1228
+ "learning_rate": 2.5925925925925925e-05,
1229
+ "loss": 0.535,
1230
+ "step": 1680
1231
+ },
1232
+ {
1233
+ "epoch": 5.356576862123613,
1234
+ "grad_norm": 11.751172065734863,
1235
+ "learning_rate": 2.5749559082892416e-05,
1236
+ "loss": 0.6542,
1237
+ "step": 1690
1238
+ },
1239
+ {
1240
+ "epoch": 5.3882725832012675,
1241
+ "grad_norm": 16.899391174316406,
1242
+ "learning_rate": 2.5573192239858906e-05,
1243
+ "loss": 0.4733,
1244
+ "step": 1700
1245
+ },
1246
+ {
1247
+ "epoch": 5.419968304278922,
1248
+ "grad_norm": 19.778823852539062,
1249
+ "learning_rate": 2.5396825396825397e-05,
1250
+ "loss": 0.448,
1251
+ "step": 1710
1252
+ },
1253
+ {
1254
+ "epoch": 5.4516640253565765,
1255
+ "grad_norm": 30.306079864501953,
1256
+ "learning_rate": 2.5220458553791888e-05,
1257
+ "loss": 0.6095,
1258
+ "step": 1720
1259
+ },
1260
+ {
1261
+ "epoch": 5.483359746434231,
1262
+ "grad_norm": 7.676305770874023,
1263
+ "learning_rate": 2.504409171075838e-05,
1264
+ "loss": 0.3816,
1265
+ "step": 1730
1266
+ },
1267
+ {
1268
+ "epoch": 5.5150554675118855,
1269
+ "grad_norm": 12.745634078979492,
1270
+ "learning_rate": 2.4867724867724866e-05,
1271
+ "loss": 0.5836,
1272
+ "step": 1740
1273
+ },
1274
+ {
1275
+ "epoch": 5.546751188589541,
1276
+ "grad_norm": 19.767845153808594,
1277
+ "learning_rate": 2.4691358024691357e-05,
1278
+ "loss": 0.4719,
1279
+ "step": 1750
1280
+ },
1281
+ {
1282
+ "epoch": 5.5784469096671945,
1283
+ "grad_norm": 22.378528594970703,
1284
+ "learning_rate": 2.4514991181657847e-05,
1285
+ "loss": 0.4079,
1286
+ "step": 1760
1287
+ },
1288
+ {
1289
+ "epoch": 5.61014263074485,
1290
+ "grad_norm": 16.106239318847656,
1291
+ "learning_rate": 2.4338624338624338e-05,
1292
+ "loss": 0.4083,
1293
+ "step": 1770
1294
+ },
1295
+ {
1296
+ "epoch": 5.6418383518225035,
1297
+ "grad_norm": 11.803277015686035,
1298
+ "learning_rate": 2.416225749559083e-05,
1299
+ "loss": 0.5222,
1300
+ "step": 1780
1301
+ },
1302
+ {
1303
+ "epoch": 5.673534072900159,
1304
+ "grad_norm": 20.511564254760742,
1305
+ "learning_rate": 2.398589065255732e-05,
1306
+ "loss": 0.4857,
1307
+ "step": 1790
1308
+ },
1309
+ {
1310
+ "epoch": 5.705229793977813,
1311
+ "grad_norm": 30.313129425048828,
1312
+ "learning_rate": 2.380952380952381e-05,
1313
+ "loss": 0.5033,
1314
+ "step": 1800
1315
+ },
1316
+ {
1317
+ "epoch": 5.736925515055468,
1318
+ "grad_norm": 17.92380142211914,
1319
+ "learning_rate": 2.36331569664903e-05,
1320
+ "loss": 0.3853,
1321
+ "step": 1810
1322
+ },
1323
+ {
1324
+ "epoch": 5.768621236133122,
1325
+ "grad_norm": 11.325889587402344,
1326
+ "learning_rate": 2.345679012345679e-05,
1327
+ "loss": 0.4485,
1328
+ "step": 1820
1329
+ },
1330
+ {
1331
+ "epoch": 5.800316957210777,
1332
+ "grad_norm": 30.107561111450195,
1333
+ "learning_rate": 2.328042328042328e-05,
1334
+ "loss": 0.4691,
1335
+ "step": 1830
1336
+ },
1337
+ {
1338
+ "epoch": 5.832012678288431,
1339
+ "grad_norm": 7.061257362365723,
1340
+ "learning_rate": 2.310405643738977e-05,
1341
+ "loss": 0.5587,
1342
+ "step": 1840
1343
+ },
1344
+ {
1345
+ "epoch": 5.863708399366086,
1346
+ "grad_norm": 21.1574764251709,
1347
+ "learning_rate": 2.292768959435626e-05,
1348
+ "loss": 0.3672,
1349
+ "step": 1850
1350
+ },
1351
+ {
1352
+ "epoch": 5.89540412044374,
1353
+ "grad_norm": 21.53982162475586,
1354
+ "learning_rate": 2.275132275132275e-05,
1355
+ "loss": 0.4423,
1356
+ "step": 1860
1357
+ },
1358
+ {
1359
+ "epoch": 5.927099841521395,
1360
+ "grad_norm": 9.077971458435059,
1361
+ "learning_rate": 2.2574955908289242e-05,
1362
+ "loss": 0.2509,
1363
+ "step": 1870
1364
+ },
1365
+ {
1366
+ "epoch": 5.958795562599049,
1367
+ "grad_norm": 21.368032455444336,
1368
+ "learning_rate": 2.2398589065255733e-05,
1369
+ "loss": 0.4079,
1370
+ "step": 1880
1371
+ },
1372
+ {
1373
+ "epoch": 5.990491283676704,
1374
+ "grad_norm": 13.913603782653809,
1375
+ "learning_rate": 2.2222222222222223e-05,
1376
+ "loss": 0.3975,
1377
+ "step": 1890
1378
+ },
1379
+ {
1380
+ "epoch": 6.0,
1381
+ "eval_accuracy": 0.5133689839572193,
1382
+ "eval_loss": 2.522934913635254,
1383
+ "eval_runtime": 147.3195,
1384
+ "eval_samples_per_second": 3.808,
1385
+ "eval_steps_per_second": 0.957,
1386
+ "step": 1893
1387
+ },
1388
+ {
1389
+ "epoch": 6.022187004754358,
1390
+ "grad_norm": 3.1691958904266357,
1391
+ "learning_rate": 2.2045855379188714e-05,
1392
+ "loss": 0.2654,
1393
+ "step": 1900
1394
+ },
1395
+ {
1396
+ "epoch": 6.053882725832013,
1397
+ "grad_norm": 9.656085014343262,
1398
+ "learning_rate": 2.18694885361552e-05,
1399
+ "loss": 0.3378,
1400
+ "step": 1910
1401
+ },
1402
+ {
1403
+ "epoch": 6.085578446909667,
1404
+ "grad_norm": 6.1951375007629395,
1405
+ "learning_rate": 2.1693121693121692e-05,
1406
+ "loss": 0.2532,
1407
+ "step": 1920
1408
+ },
1409
+ {
1410
+ "epoch": 6.117274167987322,
1411
+ "grad_norm": 17.4242000579834,
1412
+ "learning_rate": 2.1516754850088183e-05,
1413
+ "loss": 0.3027,
1414
+ "step": 1930
1415
+ },
1416
+ {
1417
+ "epoch": 6.148969889064976,
1418
+ "grad_norm": 27.61707305908203,
1419
+ "learning_rate": 2.1340388007054674e-05,
1420
+ "loss": 0.2637,
1421
+ "step": 1940
1422
+ },
1423
+ {
1424
+ "epoch": 6.180665610142631,
1425
+ "grad_norm": 10.691701889038086,
1426
+ "learning_rate": 2.1164021164021164e-05,
1427
+ "loss": 0.2931,
1428
+ "step": 1950
1429
+ },
1430
+ {
1431
+ "epoch": 6.212361331220285,
1432
+ "grad_norm": 35.979496002197266,
1433
+ "learning_rate": 2.0987654320987655e-05,
1434
+ "loss": 0.2336,
1435
+ "step": 1960
1436
+ },
1437
+ {
1438
+ "epoch": 6.24405705229794,
1439
+ "grad_norm": 21.92683219909668,
1440
+ "learning_rate": 2.0811287477954146e-05,
1441
+ "loss": 0.3806,
1442
+ "step": 1970
1443
+ },
1444
+ {
1445
+ "epoch": 6.2757527733755945,
1446
+ "grad_norm": 8.734770774841309,
1447
+ "learning_rate": 2.0634920634920636e-05,
1448
+ "loss": 0.2853,
1449
+ "step": 1980
1450
+ },
1451
+ {
1452
+ "epoch": 6.307448494453249,
1453
+ "grad_norm": 13.567136764526367,
1454
+ "learning_rate": 2.0458553791887124e-05,
1455
+ "loss": 0.2756,
1456
+ "step": 1990
1457
+ },
1458
+ {
1459
+ "epoch": 6.3391442155309035,
1460
+ "grad_norm": 14.175586700439453,
1461
+ "learning_rate": 2.0282186948853614e-05,
1462
+ "loss": 0.3459,
1463
+ "step": 2000
1464
+ },
1465
+ {
1466
+ "epoch": 6.370839936608558,
1467
+ "grad_norm": 6.7237749099731445,
1468
+ "learning_rate": 2.0105820105820105e-05,
1469
+ "loss": 0.13,
1470
+ "step": 2010
1471
+ },
1472
+ {
1473
+ "epoch": 6.4025356576862125,
1474
+ "grad_norm": 3.6393444538116455,
1475
+ "learning_rate": 1.9929453262786596e-05,
1476
+ "loss": 0.1789,
1477
+ "step": 2020
1478
+ },
1479
+ {
1480
+ "epoch": 6.434231378763867,
1481
+ "grad_norm": 4.694659233093262,
1482
+ "learning_rate": 1.9753086419753087e-05,
1483
+ "loss": 0.2228,
1484
+ "step": 2030
1485
+ },
1486
+ {
1487
+ "epoch": 6.4659270998415215,
1488
+ "grad_norm": 9.12720775604248,
1489
+ "learning_rate": 1.9576719576719577e-05,
1490
+ "loss": 0.2706,
1491
+ "step": 2040
1492
+ },
1493
+ {
1494
+ "epoch": 6.497622820919176,
1495
+ "grad_norm": 18.74335479736328,
1496
+ "learning_rate": 1.9400352733686068e-05,
1497
+ "loss": 0.2674,
1498
+ "step": 2050
1499
+ },
1500
+ {
1501
+ "epoch": 6.5293185419968305,
1502
+ "grad_norm": 7.866713523864746,
1503
+ "learning_rate": 1.922398589065256e-05,
1504
+ "loss": 0.3087,
1505
+ "step": 2060
1506
+ },
1507
+ {
1508
+ "epoch": 6.561014263074485,
1509
+ "grad_norm": 4.733059406280518,
1510
+ "learning_rate": 1.9047619047619046e-05,
1511
+ "loss": 0.2031,
1512
+ "step": 2070
1513
+ },
1514
+ {
1515
+ "epoch": 6.5927099841521395,
1516
+ "grad_norm": 15.15184497833252,
1517
+ "learning_rate": 1.8871252204585537e-05,
1518
+ "loss": 0.18,
1519
+ "step": 2080
1520
+ },
1521
+ {
1522
+ "epoch": 6.624405705229794,
1523
+ "grad_norm": 19.79001235961914,
1524
+ "learning_rate": 1.8694885361552028e-05,
1525
+ "loss": 0.2417,
1526
+ "step": 2090
1527
+ },
1528
+ {
1529
+ "epoch": 6.6561014263074485,
1530
+ "grad_norm": 13.751955032348633,
1531
+ "learning_rate": 1.8518518518518518e-05,
1532
+ "loss": 0.2938,
1533
+ "step": 2100
1534
+ },
1535
+ {
1536
+ "epoch": 6.687797147385103,
1537
+ "grad_norm": 15.962373733520508,
1538
+ "learning_rate": 1.834215167548501e-05,
1539
+ "loss": 0.1829,
1540
+ "step": 2110
1541
+ },
1542
+ {
1543
+ "epoch": 6.7194928684627575,
1544
+ "grad_norm": 15.361172676086426,
1545
+ "learning_rate": 1.81657848324515e-05,
1546
+ "loss": 0.1573,
1547
+ "step": 2120
1548
+ },
1549
+ {
1550
+ "epoch": 6.751188589540412,
1551
+ "grad_norm": 3.690701961517334,
1552
+ "learning_rate": 1.798941798941799e-05,
1553
+ "loss": 0.3055,
1554
+ "step": 2130
1555
+ },
1556
+ {
1557
+ "epoch": 6.7828843106180665,
1558
+ "grad_norm": 10.492719650268555,
1559
+ "learning_rate": 1.781305114638448e-05,
1560
+ "loss": 0.2244,
1561
+ "step": 2140
1562
+ },
1563
+ {
1564
+ "epoch": 6.814580031695721,
1565
+ "grad_norm": 13.18823528289795,
1566
+ "learning_rate": 1.763668430335097e-05,
1567
+ "loss": 0.2039,
1568
+ "step": 2150
1569
+ },
1570
+ {
1571
+ "epoch": 6.8462757527733755,
1572
+ "grad_norm": 5.904299259185791,
1573
+ "learning_rate": 1.746031746031746e-05,
1574
+ "loss": 0.2987,
1575
+ "step": 2160
1576
+ },
1577
+ {
1578
+ "epoch": 6.87797147385103,
1579
+ "grad_norm": 25.814712524414062,
1580
+ "learning_rate": 1.728395061728395e-05,
1581
+ "loss": 0.2196,
1582
+ "step": 2170
1583
+ },
1584
+ {
1585
+ "epoch": 6.9096671949286845,
1586
+ "grad_norm": 17.685895919799805,
1587
+ "learning_rate": 1.710758377425044e-05,
1588
+ "loss": 0.2306,
1589
+ "step": 2180
1590
+ },
1591
+ {
1592
+ "epoch": 6.941362916006339,
1593
+ "grad_norm": 34.82439422607422,
1594
+ "learning_rate": 1.693121693121693e-05,
1595
+ "loss": 0.2668,
1596
+ "step": 2190
1597
+ },
1598
+ {
1599
+ "epoch": 6.9730586370839935,
1600
+ "grad_norm": 6.761794567108154,
1601
+ "learning_rate": 1.6754850088183422e-05,
1602
+ "loss": 0.2648,
1603
+ "step": 2200
1604
+ },
1605
+ {
1606
+ "epoch": 6.998415213946117,
1607
+ "eval_accuracy": 0.5080213903743316,
1608
+ "eval_loss": 3.0332891941070557,
1609
+ "eval_runtime": 147.5737,
1610
+ "eval_samples_per_second": 3.801,
1611
+ "eval_steps_per_second": 0.955,
1612
+ "step": 2208
1613
+ },
1614
+ {
1615
+ "epoch": 7.004754358161648,
1616
+ "grad_norm": 18.470195770263672,
1617
+ "learning_rate": 1.6578483245149913e-05,
1618
+ "loss": 0.1705,
1619
+ "step": 2210
1620
+ },
1621
+ {
1622
+ "epoch": 7.0364500792393025,
1623
+ "grad_norm": 13.634916305541992,
1624
+ "learning_rate": 1.6402116402116404e-05,
1625
+ "loss": 0.1345,
1626
+ "step": 2220
1627
+ },
1628
+ {
1629
+ "epoch": 7.068145800316957,
1630
+ "grad_norm": 7.778036594390869,
1631
+ "learning_rate": 1.622574955908289e-05,
1632
+ "loss": 0.0799,
1633
+ "step": 2230
1634
+ },
1635
+ {
1636
+ "epoch": 7.0998415213946116,
1637
+ "grad_norm": 15.194972038269043,
1638
+ "learning_rate": 1.604938271604938e-05,
1639
+ "loss": 0.1687,
1640
+ "step": 2240
1641
+ },
1642
+ {
1643
+ "epoch": 7.131537242472266,
1644
+ "grad_norm": 7.493090629577637,
1645
+ "learning_rate": 1.5873015873015872e-05,
1646
+ "loss": 0.0908,
1647
+ "step": 2250
1648
+ },
1649
+ {
1650
+ "epoch": 7.163232963549921,
1651
+ "grad_norm": 1.1790108680725098,
1652
+ "learning_rate": 1.5696649029982363e-05,
1653
+ "loss": 0.1564,
1654
+ "step": 2260
1655
+ },
1656
+ {
1657
+ "epoch": 7.194928684627575,
1658
+ "grad_norm": 20.04016876220703,
1659
+ "learning_rate": 1.5520282186948854e-05,
1660
+ "loss": 0.166,
1661
+ "step": 2270
1662
+ },
1663
+ {
1664
+ "epoch": 7.22662440570523,
1665
+ "grad_norm": 3.1165976524353027,
1666
+ "learning_rate": 1.5343915343915344e-05,
1667
+ "loss": 0.1283,
1668
+ "step": 2280
1669
+ },
1670
+ {
1671
+ "epoch": 7.258320126782884,
1672
+ "grad_norm": 23.948923110961914,
1673
+ "learning_rate": 1.5167548500881834e-05,
1674
+ "loss": 0.176,
1675
+ "step": 2290
1676
+ },
1677
+ {
1678
+ "epoch": 7.290015847860539,
1679
+ "grad_norm": 3.0111184120178223,
1680
+ "learning_rate": 1.4991181657848324e-05,
1681
+ "loss": 0.1272,
1682
+ "step": 2300
1683
+ },
1684
+ {
1685
+ "epoch": 7.321711568938193,
1686
+ "grad_norm": 11.723183631896973,
1687
+ "learning_rate": 1.4814814814814815e-05,
1688
+ "loss": 0.1757,
1689
+ "step": 2310
1690
+ },
1691
+ {
1692
+ "epoch": 7.353407290015848,
1693
+ "grad_norm": 35.94911193847656,
1694
+ "learning_rate": 1.4638447971781306e-05,
1695
+ "loss": 0.1217,
1696
+ "step": 2320
1697
+ },
1698
+ {
1699
+ "epoch": 7.385103011093502,
1700
+ "grad_norm": 11.777710914611816,
1701
+ "learning_rate": 1.4462081128747795e-05,
1702
+ "loss": 0.1587,
1703
+ "step": 2330
1704
+ },
1705
+ {
1706
+ "epoch": 7.416798732171157,
1707
+ "grad_norm": 31.9948673248291,
1708
+ "learning_rate": 1.4285714285714285e-05,
1709
+ "loss": 0.116,
1710
+ "step": 2340
1711
+ },
1712
+ {
1713
+ "epoch": 7.448494453248811,
1714
+ "grad_norm": 9.983025550842285,
1715
+ "learning_rate": 1.4109347442680776e-05,
1716
+ "loss": 0.1015,
1717
+ "step": 2350
1718
+ },
1719
+ {
1720
+ "epoch": 7.480190174326466,
1721
+ "grad_norm": 13.77480411529541,
1722
+ "learning_rate": 1.3932980599647267e-05,
1723
+ "loss": 0.1539,
1724
+ "step": 2360
1725
+ },
1726
+ {
1727
+ "epoch": 7.51188589540412,
1728
+ "grad_norm": 18.11697769165039,
1729
+ "learning_rate": 1.3756613756613756e-05,
1730
+ "loss": 0.1695,
1731
+ "step": 2370
1732
+ },
1733
+ {
1734
+ "epoch": 7.543581616481775,
1735
+ "grad_norm": 0.5398086905479431,
1736
+ "learning_rate": 1.3580246913580247e-05,
1737
+ "loss": 0.0802,
1738
+ "step": 2380
1739
+ },
1740
+ {
1741
+ "epoch": 7.575277337559429,
1742
+ "grad_norm": 22.803916931152344,
1743
+ "learning_rate": 1.3403880070546737e-05,
1744
+ "loss": 0.1697,
1745
+ "step": 2390
1746
+ },
1747
+ {
1748
+ "epoch": 7.606973058637084,
1749
+ "grad_norm": 24.52759552001953,
1750
+ "learning_rate": 1.3227513227513228e-05,
1751
+ "loss": 0.1614,
1752
+ "step": 2400
1753
+ },
1754
+ {
1755
+ "epoch": 7.638668779714738,
1756
+ "grad_norm": 14.2177734375,
1757
+ "learning_rate": 1.3051146384479717e-05,
1758
+ "loss": 0.1197,
1759
+ "step": 2410
1760
+ },
1761
+ {
1762
+ "epoch": 7.6703645007923935,
1763
+ "grad_norm": 3.4550652503967285,
1764
+ "learning_rate": 1.2874779541446208e-05,
1765
+ "loss": 0.0921,
1766
+ "step": 2420
1767
+ },
1768
+ {
1769
+ "epoch": 7.702060221870047,
1770
+ "grad_norm": 2.2358646392822266,
1771
+ "learning_rate": 1.2698412698412699e-05,
1772
+ "loss": 0.1467,
1773
+ "step": 2430
1774
+ },
1775
+ {
1776
+ "epoch": 7.7337559429477025,
1777
+ "grad_norm": 24.631845474243164,
1778
+ "learning_rate": 1.252204585537919e-05,
1779
+ "loss": 0.2073,
1780
+ "step": 2440
1781
+ },
1782
+ {
1783
+ "epoch": 7.765451664025356,
1784
+ "grad_norm": 1.8313311338424683,
1785
+ "learning_rate": 1.2345679012345678e-05,
1786
+ "loss": 0.0878,
1787
+ "step": 2450
1788
+ },
1789
+ {
1790
+ "epoch": 7.7971473851030115,
1791
+ "grad_norm": 5.266528606414795,
1792
+ "learning_rate": 1.2169312169312169e-05,
1793
+ "loss": 0.1029,
1794
+ "step": 2460
1795
+ },
1796
+ {
1797
+ "epoch": 7.828843106180665,
1798
+ "grad_norm": 5.7267632484436035,
1799
+ "learning_rate": 1.199294532627866e-05,
1800
+ "loss": 0.1302,
1801
+ "step": 2470
1802
+ },
1803
+ {
1804
+ "epoch": 7.8605388272583205,
1805
+ "grad_norm": 24.080692291259766,
1806
+ "learning_rate": 1.181657848324515e-05,
1807
+ "loss": 0.1879,
1808
+ "step": 2480
1809
+ },
1810
+ {
1811
+ "epoch": 7.892234548335974,
1812
+ "grad_norm": 29.952129364013672,
1813
+ "learning_rate": 1.164021164021164e-05,
1814
+ "loss": 0.1314,
1815
+ "step": 2490
1816
+ },
1817
+ {
1818
+ "epoch": 7.9239302694136295,
1819
+ "grad_norm": 10.011947631835938,
1820
+ "learning_rate": 1.146384479717813e-05,
1821
+ "loss": 0.1335,
1822
+ "step": 2500
1823
+ },
1824
+ {
1825
+ "epoch": 7.955625990491284,
1826
+ "grad_norm": 4.504798412322998,
1827
+ "learning_rate": 1.1287477954144621e-05,
1828
+ "loss": 0.1474,
1829
+ "step": 2510
1830
+ },
1831
+ {
1832
+ "epoch": 7.9873217115689386,
1833
+ "grad_norm": 13.406722068786621,
1834
+ "learning_rate": 1.1111111111111112e-05,
1835
+ "loss": 0.1232,
1836
+ "step": 2520
1837
+ },
1838
+ {
1839
+ "epoch": 8.0,
1840
+ "eval_accuracy": 0.5240641711229946,
1841
+ "eval_loss": 3.4625513553619385,
1842
+ "eval_runtime": 147.6895,
1843
+ "eval_samples_per_second": 3.799,
1844
+ "eval_steps_per_second": 0.955,
1845
+ "step": 2524
1846
+ },
1847
+ {
1848
+ "epoch": 8.019017432646592,
1849
+ "grad_norm": 2.257606267929077,
1850
+ "learning_rate": 1.09347442680776e-05,
1851
+ "loss": 0.1443,
1852
+ "step": 2530
1853
+ },
1854
+ {
1855
+ "epoch": 8.050713153724248,
1856
+ "grad_norm": 4.254097938537598,
1857
+ "learning_rate": 1.0758377425044091e-05,
1858
+ "loss": 0.0859,
1859
+ "step": 2540
1860
+ },
1861
+ {
1862
+ "epoch": 8.082408874801901,
1863
+ "grad_norm": 16.948497772216797,
1864
+ "learning_rate": 1.0582010582010582e-05,
1865
+ "loss": 0.0859,
1866
+ "step": 2550
1867
+ },
1868
+ {
1869
+ "epoch": 8.114104595879557,
1870
+ "grad_norm": 10.674163818359375,
1871
+ "learning_rate": 1.0405643738977073e-05,
1872
+ "loss": 0.0403,
1873
+ "step": 2560
1874
+ },
1875
+ {
1876
+ "epoch": 8.14580031695721,
1877
+ "grad_norm": 31.626522064208984,
1878
+ "learning_rate": 1.0229276895943562e-05,
1879
+ "loss": 0.1069,
1880
+ "step": 2570
1881
+ },
1882
+ {
1883
+ "epoch": 8.177496038034866,
1884
+ "grad_norm": 14.414571762084961,
1885
+ "learning_rate": 1.0052910052910053e-05,
1886
+ "loss": 0.0578,
1887
+ "step": 2580
1888
+ },
1889
+ {
1890
+ "epoch": 8.20919175911252,
1891
+ "grad_norm": 12.272441864013672,
1892
+ "learning_rate": 9.876543209876543e-06,
1893
+ "loss": 0.1106,
1894
+ "step": 2590
1895
+ },
1896
+ {
1897
+ "epoch": 8.240887480190175,
1898
+ "grad_norm": 39.95854187011719,
1899
+ "learning_rate": 9.700176366843034e-06,
1900
+ "loss": 0.1382,
1901
+ "step": 2600
1902
+ },
1903
+ {
1904
+ "epoch": 8.272583201267828,
1905
+ "grad_norm": 5.969349384307861,
1906
+ "learning_rate": 9.523809523809523e-06,
1907
+ "loss": 0.0664,
1908
+ "step": 2610
1909
+ },
1910
+ {
1911
+ "epoch": 8.304278922345484,
1912
+ "grad_norm": 6.0112128257751465,
1913
+ "learning_rate": 9.347442680776014e-06,
1914
+ "loss": 0.066,
1915
+ "step": 2620
1916
+ },
1917
+ {
1918
+ "epoch": 8.335974643423137,
1919
+ "grad_norm": 9.331475257873535,
1920
+ "learning_rate": 9.171075837742504e-06,
1921
+ "loss": 0.0728,
1922
+ "step": 2630
1923
+ },
1924
+ {
1925
+ "epoch": 8.367670364500793,
1926
+ "grad_norm": 1.4397279024124146,
1927
+ "learning_rate": 8.994708994708995e-06,
1928
+ "loss": 0.0607,
1929
+ "step": 2640
1930
+ },
1931
+ {
1932
+ "epoch": 8.399366085578446,
1933
+ "grad_norm": 16.279735565185547,
1934
+ "learning_rate": 8.818342151675484e-06,
1935
+ "loss": 0.1049,
1936
+ "step": 2650
1937
+ },
1938
+ {
1939
+ "epoch": 8.431061806656102,
1940
+ "grad_norm": 28.912487030029297,
1941
+ "learning_rate": 8.641975308641975e-06,
1942
+ "loss": 0.1594,
1943
+ "step": 2660
1944
+ },
1945
+ {
1946
+ "epoch": 8.462757527733755,
1947
+ "grad_norm": 7.044099807739258,
1948
+ "learning_rate": 8.465608465608466e-06,
1949
+ "loss": 0.0905,
1950
+ "step": 2670
1951
+ },
1952
+ {
1953
+ "epoch": 8.49445324881141,
1954
+ "grad_norm": 1.4151721000671387,
1955
+ "learning_rate": 8.289241622574956e-06,
1956
+ "loss": 0.1646,
1957
+ "step": 2680
1958
+ },
1959
+ {
1960
+ "epoch": 8.526148969889064,
1961
+ "grad_norm": 8.555072784423828,
1962
+ "learning_rate": 8.112874779541445e-06,
1963
+ "loss": 0.0751,
1964
+ "step": 2690
1965
+ },
1966
+ {
1967
+ "epoch": 8.55784469096672,
1968
+ "grad_norm": 10.666037559509277,
1969
+ "learning_rate": 7.936507936507936e-06,
1970
+ "loss": 0.0631,
1971
+ "step": 2700
1972
+ },
1973
+ {
1974
+ "epoch": 8.589540412044373,
1975
+ "grad_norm": 21.815340042114258,
1976
+ "learning_rate": 7.760141093474427e-06,
1977
+ "loss": 0.0646,
1978
+ "step": 2710
1979
+ },
1980
+ {
1981
+ "epoch": 8.621236133122029,
1982
+ "grad_norm": 2.177539587020874,
1983
+ "learning_rate": 7.583774250440917e-06,
1984
+ "loss": 0.1986,
1985
+ "step": 2720
1986
+ },
1987
+ {
1988
+ "epoch": 8.652931854199682,
1989
+ "grad_norm": 3.416982889175415,
1990
+ "learning_rate": 7.4074074074074075e-06,
1991
+ "loss": 0.0619,
1992
+ "step": 2730
1993
+ },
1994
+ {
1995
+ "epoch": 8.684627575277338,
1996
+ "grad_norm": 7.183048248291016,
1997
+ "learning_rate": 7.231040564373897e-06,
1998
+ "loss": 0.1483,
1999
+ "step": 2740
2000
+ },
2001
+ {
2002
+ "epoch": 8.716323296354991,
2003
+ "grad_norm": 27.301572799682617,
2004
+ "learning_rate": 7.054673721340388e-06,
2005
+ "loss": 0.0735,
2006
+ "step": 2750
2007
+ },
2008
+ {
2009
+ "epoch": 8.748019017432647,
2010
+ "grad_norm": 4.7548651695251465,
2011
+ "learning_rate": 6.878306878306878e-06,
2012
+ "loss": 0.075,
2013
+ "step": 2760
2014
+ },
2015
+ {
2016
+ "epoch": 8.7797147385103,
2017
+ "grad_norm": 19.81043815612793,
2018
+ "learning_rate": 6.701940035273369e-06,
2019
+ "loss": 0.0404,
2020
+ "step": 2770
2021
+ },
2022
+ {
2023
+ "epoch": 8.811410459587956,
2024
+ "grad_norm": 3.3048360347747803,
2025
+ "learning_rate": 6.5255731922398585e-06,
2026
+ "loss": 0.084,
2027
+ "step": 2780
2028
+ },
2029
+ {
2030
+ "epoch": 8.843106180665611,
2031
+ "grad_norm": 22.454687118530273,
2032
+ "learning_rate": 6.349206349206349e-06,
2033
+ "loss": 0.0535,
2034
+ "step": 2790
2035
+ },
2036
+ {
2037
+ "epoch": 8.874801901743265,
2038
+ "grad_norm": 10.355389595031738,
2039
+ "learning_rate": 6.172839506172839e-06,
2040
+ "loss": 0.101,
2041
+ "step": 2800
2042
+ },
2043
+ {
2044
+ "epoch": 8.906497622820918,
2045
+ "grad_norm": 1.525553822517395,
2046
+ "learning_rate": 5.99647266313933e-06,
2047
+ "loss": 0.0479,
2048
+ "step": 2810
2049
+ },
2050
+ {
2051
+ "epoch": 8.938193343898574,
2052
+ "grad_norm": 2.0415096282958984,
2053
+ "learning_rate": 5.82010582010582e-06,
2054
+ "loss": 0.0523,
2055
+ "step": 2820
2056
+ },
2057
+ {
2058
+ "epoch": 8.969889064976229,
2059
+ "grad_norm": 4.97711706161499,
2060
+ "learning_rate": 5.6437389770723105e-06,
2061
+ "loss": 0.0873,
2062
+ "step": 2830
2063
+ },
2064
+ {
2065
+ "epoch": 8.998415213946117,
2066
+ "eval_accuracy": 0.5026737967914439,
2067
+ "eval_loss": 3.6218764781951904,
2068
+ "eval_runtime": 146.8598,
2069
+ "eval_samples_per_second": 3.82,
2070
+ "eval_steps_per_second": 0.96,
2071
+ "step": 2839
2072
+ },
2073
+ {
2074
+ "epoch": 9.001584786053883,
2075
+ "grad_norm": 2.5172929763793945,
2076
+ "learning_rate": 5.4673721340388e-06,
2077
+ "loss": 0.042,
2078
+ "step": 2840
2079
+ },
2080
+ {
2081
+ "epoch": 9.033280507131538,
2082
+ "grad_norm": 19.426034927368164,
2083
+ "learning_rate": 5.291005291005291e-06,
2084
+ "loss": 0.0626,
2085
+ "step": 2850
2086
+ },
2087
+ {
2088
+ "epoch": 9.064976228209192,
2089
+ "grad_norm": 5.145648956298828,
2090
+ "learning_rate": 5.114638447971781e-06,
2091
+ "loss": 0.0465,
2092
+ "step": 2860
2093
+ },
2094
+ {
2095
+ "epoch": 9.096671949286847,
2096
+ "grad_norm": 5.119009971618652,
2097
+ "learning_rate": 4.938271604938272e-06,
2098
+ "loss": 0.0241,
2099
+ "step": 2870
2100
+ },
2101
+ {
2102
+ "epoch": 9.1283676703645,
2103
+ "grad_norm": 3.799389362335205,
2104
+ "learning_rate": 4.7619047619047615e-06,
2105
+ "loss": 0.1072,
2106
+ "step": 2880
2107
+ },
2108
+ {
2109
+ "epoch": 9.160063391442156,
2110
+ "grad_norm": 10.141913414001465,
2111
+ "learning_rate": 4.585537918871252e-06,
2112
+ "loss": 0.0517,
2113
+ "step": 2890
2114
+ },
2115
+ {
2116
+ "epoch": 9.19175911251981,
2117
+ "grad_norm": 26.335498809814453,
2118
+ "learning_rate": 4.409171075837742e-06,
2119
+ "loss": 0.0353,
2120
+ "step": 2900
2121
+ },
2122
+ {
2123
+ "epoch": 9.223454833597465,
2124
+ "grad_norm": 72.42816162109375,
2125
+ "learning_rate": 4.232804232804233e-06,
2126
+ "loss": 0.1109,
2127
+ "step": 2910
2128
+ },
2129
+ {
2130
+ "epoch": 9.255150554675119,
2131
+ "grad_norm": 6.7033538818359375,
2132
+ "learning_rate": 4.056437389770723e-06,
2133
+ "loss": 0.0249,
2134
+ "step": 2920
2135
+ },
2136
+ {
2137
+ "epoch": 9.286846275752774,
2138
+ "grad_norm": 0.21264280378818512,
2139
+ "learning_rate": 3.8800705467372134e-06,
2140
+ "loss": 0.0571,
2141
+ "step": 2930
2142
+ },
2143
+ {
2144
+ "epoch": 9.318541996830428,
2145
+ "grad_norm": 0.8612923622131348,
2146
+ "learning_rate": 3.7037037037037037e-06,
2147
+ "loss": 0.0355,
2148
+ "step": 2940
2149
+ },
2150
+ {
2151
+ "epoch": 9.350237717908083,
2152
+ "grad_norm": 0.4571894407272339,
2153
+ "learning_rate": 3.527336860670194e-06,
2154
+ "loss": 0.0297,
2155
+ "step": 2950
2156
+ },
2157
+ {
2158
+ "epoch": 9.381933438985737,
2159
+ "grad_norm": 0.8260334730148315,
2160
+ "learning_rate": 3.3509700176366843e-06,
2161
+ "loss": 0.0477,
2162
+ "step": 2960
2163
+ },
2164
+ {
2165
+ "epoch": 9.413629160063392,
2166
+ "grad_norm": 15.636239051818848,
2167
+ "learning_rate": 3.1746031746031746e-06,
2168
+ "loss": 0.0422,
2169
+ "step": 2970
2170
+ },
2171
+ {
2172
+ "epoch": 9.445324881141046,
2173
+ "grad_norm": 20.841358184814453,
2174
+ "learning_rate": 2.998236331569665e-06,
2175
+ "loss": 0.0369,
2176
+ "step": 2980
2177
+ },
2178
+ {
2179
+ "epoch": 9.477020602218701,
2180
+ "grad_norm": 1.6072678565979004,
2181
+ "learning_rate": 2.8218694885361552e-06,
2182
+ "loss": 0.0503,
2183
+ "step": 2990
2184
+ },
2185
+ {
2186
+ "epoch": 9.508716323296355,
2187
+ "grad_norm": 1.6855651140213013,
2188
+ "learning_rate": 2.6455026455026455e-06,
2189
+ "loss": 0.0472,
2190
+ "step": 3000
2191
+ },
2192
+ {
2193
+ "epoch": 9.54041204437401,
2194
+ "grad_norm": 4.713263988494873,
2195
+ "learning_rate": 2.469135802469136e-06,
2196
+ "loss": 0.1097,
2197
+ "step": 3010
2198
+ },
2199
+ {
2200
+ "epoch": 9.572107765451664,
2201
+ "grad_norm": 2.9448232650756836,
2202
+ "learning_rate": 2.292768959435626e-06,
2203
+ "loss": 0.0553,
2204
+ "step": 3020
2205
+ },
2206
+ {
2207
+ "epoch": 9.60380348652932,
2208
+ "grad_norm": 18.76097869873047,
2209
+ "learning_rate": 2.1164021164021164e-06,
2210
+ "loss": 0.0342,
2211
+ "step": 3030
2212
+ },
2213
+ {
2214
+ "epoch": 9.635499207606973,
2215
+ "grad_norm": 6.058113098144531,
2216
+ "learning_rate": 1.9400352733686067e-06,
2217
+ "loss": 0.041,
2218
+ "step": 3040
2219
+ },
2220
+ {
2221
+ "epoch": 9.667194928684628,
2222
+ "grad_norm": 32.21320343017578,
2223
+ "learning_rate": 1.763668430335097e-06,
2224
+ "loss": 0.0386,
2225
+ "step": 3050
2226
+ },
2227
+ {
2228
+ "epoch": 9.698890649762282,
2229
+ "grad_norm": 0.48055747151374817,
2230
+ "learning_rate": 1.5873015873015873e-06,
2231
+ "loss": 0.0454,
2232
+ "step": 3060
2233
+ },
2234
+ {
2235
+ "epoch": 9.730586370839937,
2236
+ "grad_norm": 7.426022529602051,
2237
+ "learning_rate": 1.4109347442680776e-06,
2238
+ "loss": 0.1654,
2239
+ "step": 3070
2240
+ },
2241
+ {
2242
+ "epoch": 9.76228209191759,
2243
+ "grad_norm": 7.643261909484863,
2244
+ "learning_rate": 1.234567901234568e-06,
2245
+ "loss": 0.0294,
2246
+ "step": 3080
2247
+ },
2248
+ {
2249
+ "epoch": 9.793977812995246,
2250
+ "grad_norm": 3.1924824714660645,
2251
+ "learning_rate": 1.0582010582010582e-06,
2252
+ "loss": 0.1084,
2253
+ "step": 3090
2254
+ },
2255
+ {
2256
+ "epoch": 9.8256735340729,
2257
+ "grad_norm": 31.949647903442383,
2258
+ "learning_rate": 8.818342151675485e-07,
2259
+ "loss": 0.0582,
2260
+ "step": 3100
2261
+ },
2262
+ {
2263
+ "epoch": 9.857369255150555,
2264
+ "grad_norm": 41.80002212524414,
2265
+ "learning_rate": 7.054673721340388e-07,
2266
+ "loss": 0.0771,
2267
+ "step": 3110
2268
+ },
2269
+ {
2270
+ "epoch": 9.889064976228209,
2271
+ "grad_norm": 1.5549315214157104,
2272
+ "learning_rate": 5.291005291005291e-07,
2273
+ "loss": 0.0501,
2274
+ "step": 3120
2275
+ },
2276
+ {
2277
+ "epoch": 9.920760697305864,
2278
+ "grad_norm": 14.312984466552734,
2279
+ "learning_rate": 3.527336860670194e-07,
2280
+ "loss": 0.0516,
2281
+ "step": 3130
2282
+ },
2283
+ {
2284
+ "epoch": 9.952456418383518,
2285
+ "grad_norm": 3.448463201522827,
2286
+ "learning_rate": 1.763668430335097e-07,
2287
+ "loss": 0.0653,
2288
+ "step": 3140
2289
+ },
2290
+ {
2291
+ "epoch": 9.984152139461173,
2292
+ "grad_norm": 5.9251389503479,
2293
+ "learning_rate": 0.0,
2294
+ "loss": 0.0554,
2295
+ "step": 3150
2296
+ },
2297
+ {
2298
+ "epoch": 9.984152139461173,
2299
+ "eval_accuracy": 0.49376114081996436,
2300
+ "eval_loss": 3.771087884902954,
2301
+ "eval_runtime": 147.6096,
2302
+ "eval_samples_per_second": 3.801,
2303
+ "eval_steps_per_second": 0.955,
2304
+ "step": 3150
2305
+ },
2306
+ {
2307
+ "epoch": 9.984152139461173,
2308
+ "step": 3150,
2309
+ "total_flos": 2.6101205954547646e+19,
2310
+ "train_loss": 0.7937991834822156,
2311
+ "train_runtime": 16586.4479,
2312
+ "train_samples_per_second": 3.042,
2313
+ "train_steps_per_second": 0.19
2314
  }
2315
  ],
2316
  "logging_steps": 10,
2317
+ "max_steps": 3150,
2318
  "num_input_tokens_seen": 0,
2319
+ "num_train_epochs": 10,
2320
  "save_steps": 500,
2321
  "stateful_callbacks": {
2322
  "TrainerControl": {
 
2330
  "attributes": {}
2331
  }
2332
  },
2333
+ "total_flos": 2.6101205954547646e+19,
2334
  "train_batch_size": 4,
2335
  "trial_name": null,
2336
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:69a161edef4abfeba776ab18049d518fbe4234534be4e111ea1e2354c2ae2204
3
  size 5112
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4b8a713530de1b135851342ac35b5d66f0423a343c53743291c63267a40664e5
3
  size 5112