TracyTank commited on
Commit
bd6ef00
·
verified ·
1 Parent(s): 410b8f3

Training in progress, step 833, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9f02bb38ebeaf6366d6eb671e156e4bf3e765fdece440cc930c9cfdcd0b64083
3
  size 2145944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:07cefc2581899e9cd81423324de5956c82dd5f731f05187440a982b0d6d05b71
3
  size 2145944
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:32e9855b316f434d57ba4467b5b4ef1f8d2f959696403f5e32def6828ad6bef5
3
  size 4310020
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4570e5c4b0ed79e4f5d3b402f6ad2f2549e244175cc54843a5d5f6d813db1532
3
  size 4310020
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:53f71efcffbad4ff9b0400739a9b2811c20050ed7852096fc24fb2eb4c3c091d
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:50f87c0ed2d75cb8a451ddff6da6632b673785da1e9fef3dc083d69f65b801bb
3
  size 15024
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:33825bed89719638e2e37f21656a413d50d0dc8cf99d86b4d7152f50e5bbcd6e
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:756858fb0f5a27c92f1116791d526a1becaf91fa7110176767537e98ea42bbd8
3
  size 15024
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5f3682dfb503773bdb3a7d4868d8abf3b6eed45d692e22e2299624c46632a667
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7d1a6c204d94ba0bb67bac856dedd2342d34022ad605da33963f9022dc066a7
3
  size 15024
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7cb07e5da45d7c5643fe3179d2cfab1712f94a8c62afddc66fee2e58ea42f700
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3408db92c33886624cd88dac64c08b23fc69f8e188c300a1a0bfe82c164e648
3
  size 15024
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f024d2f6fb6610551c472834de25d1d904c6aa9a110ea61cc065fb2a17fa713f
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:814c5335d071ae07021d1744ec9a6235158c1a31d5d0598e1cda6bddebbe2205
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.8599700149925037,
5
  "eval_steps": 500,
6
- "global_step": 717,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -5026,6 +5026,818 @@
5026
  "learning_rate": 6.053180944220627e-06,
5027
  "loss": 9.5913,
5028
  "step": 717
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5029
  }
5030
  ],
5031
  "logging_steps": 1,
@@ -5040,12 +5852,12 @@
5040
  "should_evaluate": false,
5041
  "should_log": false,
5042
  "should_save": true,
5043
- "should_training_stop": false
5044
  },
5045
  "attributes": {}
5046
  }
5047
  },
5048
- "total_flos": 595349320237056.0,
5049
  "train_batch_size": 4,
5050
  "trial_name": null,
5051
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9991004497751125,
5
  "eval_steps": 500,
6
+ "global_step": 833,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
5026
  "learning_rate": 6.053180944220627e-06,
5027
  "loss": 9.5913,
5028
  "step": 717
5029
+ },
5030
+ {
5031
+ "epoch": 0.8611694152923538,
5032
+ "grad_norm": 0.2777814567089081,
5033
+ "learning_rate": 5.951378333139118e-06,
5034
+ "loss": 9.604,
5035
+ "step": 718
5036
+ },
5037
+ {
5038
+ "epoch": 0.8623688155922039,
5039
+ "grad_norm": 0.27990755438804626,
5040
+ "learning_rate": 5.850384861654329e-06,
5041
+ "loss": 9.5929,
5042
+ "step": 719
5043
+ },
5044
+ {
5045
+ "epoch": 0.863568215892054,
5046
+ "grad_norm": 0.285398006439209,
5047
+ "learning_rate": 5.750202384939313e-06,
5048
+ "loss": 9.5953,
5049
+ "step": 720
5050
+ },
5051
+ {
5052
+ "epoch": 0.864767616191904,
5053
+ "grad_norm": 0.28564396500587463,
5054
+ "learning_rate": 5.650832743269779e-06,
5055
+ "loss": 9.6041,
5056
+ "step": 721
5057
+ },
5058
+ {
5059
+ "epoch": 0.8659670164917541,
5060
+ "grad_norm": 0.29116764664649963,
5061
+ "learning_rate": 5.552277761990294e-06,
5062
+ "loss": 9.594,
5063
+ "step": 722
5064
+ },
5065
+ {
5066
+ "epoch": 0.8671664167916042,
5067
+ "grad_norm": 0.28536704182624817,
5068
+ "learning_rate": 5.454539251480739e-06,
5069
+ "loss": 9.6037,
5070
+ "step": 723
5071
+ },
5072
+ {
5073
+ "epoch": 0.8683658170914542,
5074
+ "grad_norm": 0.2976490259170532,
5075
+ "learning_rate": 5.3576190071230106e-06,
5076
+ "loss": 9.6027,
5077
+ "step": 724
5078
+ },
5079
+ {
5080
+ "epoch": 0.8695652173913043,
5081
+ "grad_norm": 0.3443441390991211,
5082
+ "learning_rate": 5.2615188092681176e-06,
5083
+ "loss": 9.6111,
5084
+ "step": 725
5085
+ },
5086
+ {
5087
+ "epoch": 0.8707646176911544,
5088
+ "grad_norm": 0.27186739444732666,
5089
+ "learning_rate": 5.166240423203428e-06,
5090
+ "loss": 9.5908,
5091
+ "step": 726
5092
+ },
5093
+ {
5094
+ "epoch": 0.8719640179910045,
5095
+ "grad_norm": 0.2679988443851471,
5096
+ "learning_rate": 5.071785599120243e-06,
5097
+ "loss": 9.5937,
5098
+ "step": 727
5099
+ },
5100
+ {
5101
+ "epoch": 0.8731634182908545,
5102
+ "grad_norm": 0.2798680067062378,
5103
+ "learning_rate": 4.978156072081669e-06,
5104
+ "loss": 9.5911,
5105
+ "step": 728
5106
+ },
5107
+ {
5108
+ "epoch": 0.8743628185907046,
5109
+ "grad_norm": 0.27777785062789917,
5110
+ "learning_rate": 4.885353561990752e-06,
5111
+ "loss": 9.5982,
5112
+ "step": 729
5113
+ },
5114
+ {
5115
+ "epoch": 0.8755622188905547,
5116
+ "grad_norm": 0.2682747542858124,
5117
+ "learning_rate": 4.793379773558815e-06,
5118
+ "loss": 9.6045,
5119
+ "step": 730
5120
+ },
5121
+ {
5122
+ "epoch": 0.8767616191904049,
5123
+ "grad_norm": 0.2757691740989685,
5124
+ "learning_rate": 4.7022363962742514e-06,
5125
+ "loss": 9.6068,
5126
+ "step": 731
5127
+ },
5128
+ {
5129
+ "epoch": 0.8779610194902548,
5130
+ "grad_norm": 0.27363264560699463,
5131
+ "learning_rate": 4.6119251043714225e-06,
5132
+ "loss": 9.5998,
5133
+ "step": 732
5134
+ },
5135
+ {
5136
+ "epoch": 0.879160419790105,
5137
+ "grad_norm": 0.28155508637428284,
5138
+ "learning_rate": 4.522447556799875e-06,
5139
+ "loss": 9.5994,
5140
+ "step": 733
5141
+ },
5142
+ {
5143
+ "epoch": 0.8803598200899551,
5144
+ "grad_norm": 0.27953052520751953,
5145
+ "learning_rate": 4.433805397193969e-06,
5146
+ "loss": 9.6041,
5147
+ "step": 734
5148
+ },
5149
+ {
5150
+ "epoch": 0.881559220389805,
5151
+ "grad_norm": 0.28824350237846375,
5152
+ "learning_rate": 4.3460002538425805e-06,
5153
+ "loss": 9.6045,
5154
+ "step": 735
5155
+ },
5156
+ {
5157
+ "epoch": 0.8827586206896552,
5158
+ "grad_norm": 0.29731664061546326,
5159
+ "learning_rate": 4.2590337396592406e-06,
5160
+ "loss": 9.6076,
5161
+ "step": 736
5162
+ },
5163
+ {
5164
+ "epoch": 0.8839580209895053,
5165
+ "grad_norm": 0.2972903847694397,
5166
+ "learning_rate": 4.172907452152519e-06,
5167
+ "loss": 9.608,
5168
+ "step": 737
5169
+ },
5170
+ {
5171
+ "epoch": 0.8851574212893554,
5172
+ "grad_norm": 0.2724793255329132,
5173
+ "learning_rate": 4.087622973396665e-06,
5174
+ "loss": 9.5971,
5175
+ "step": 738
5176
+ },
5177
+ {
5178
+ "epoch": 0.8863568215892054,
5179
+ "grad_norm": 0.2679544985294342,
5180
+ "learning_rate": 4.0031818700025095e-06,
5181
+ "loss": 9.5936,
5182
+ "step": 739
5183
+ },
5184
+ {
5185
+ "epoch": 0.8875562218890555,
5186
+ "grad_norm": 0.2719581425189972,
5187
+ "learning_rate": 3.919585693088751e-06,
5188
+ "loss": 9.592,
5189
+ "step": 740
5190
+ },
5191
+ {
5192
+ "epoch": 0.8887556221889056,
5193
+ "grad_norm": 0.27202773094177246,
5194
+ "learning_rate": 3.836835978253433e-06,
5195
+ "loss": 9.5904,
5196
+ "step": 741
5197
+ },
5198
+ {
5199
+ "epoch": 0.8899550224887556,
5200
+ "grad_norm": 0.2740086615085602,
5201
+ "learning_rate": 3.7549342455457216e-06,
5202
+ "loss": 9.5989,
5203
+ "step": 742
5204
+ },
5205
+ {
5206
+ "epoch": 0.8911544227886057,
5207
+ "grad_norm": 0.2741098999977112,
5208
+ "learning_rate": 3.6738819994379945e-06,
5209
+ "loss": 9.6006,
5210
+ "step": 743
5211
+ },
5212
+ {
5213
+ "epoch": 0.8923538230884558,
5214
+ "grad_norm": 0.2779156565666199,
5215
+ "learning_rate": 3.593680728798238e-06,
5216
+ "loss": 9.5999,
5217
+ "step": 744
5218
+ },
5219
+ {
5220
+ "epoch": 0.8935532233883059,
5221
+ "grad_norm": 0.28172117471694946,
5222
+ "learning_rate": 3.5143319068626225e-06,
5223
+ "loss": 9.6032,
5224
+ "step": 745
5225
+ },
5226
+ {
5227
+ "epoch": 0.8947526236881559,
5228
+ "grad_norm": 0.2854410409927368,
5229
+ "learning_rate": 3.435836991208524e-06,
5230
+ "loss": 9.5996,
5231
+ "step": 746
5232
+ },
5233
+ {
5234
+ "epoch": 0.895952023988006,
5235
+ "grad_norm": 0.2835351824760437,
5236
+ "learning_rate": 3.35819742372771e-06,
5237
+ "loss": 9.6017,
5238
+ "step": 747
5239
+ },
5240
+ {
5241
+ "epoch": 0.8971514242878561,
5242
+ "grad_norm": 0.2815116047859192,
5243
+ "learning_rate": 3.2814146305998107e-06,
5244
+ "loss": 9.605,
5245
+ "step": 748
5246
+ },
5247
+ {
5248
+ "epoch": 0.8983508245877061,
5249
+ "grad_norm": 0.3012433648109436,
5250
+ "learning_rate": 3.2054900222662276e-06,
5251
+ "loss": 9.607,
5252
+ "step": 749
5253
+ },
5254
+ {
5255
+ "epoch": 0.8995502248875562,
5256
+ "grad_norm": 0.3453059792518616,
5257
+ "learning_rate": 3.1304249934041017e-06,
5258
+ "loss": 9.6038,
5259
+ "step": 750
5260
+ },
5261
+ {
5262
+ "epoch": 0.9007496251874063,
5263
+ "grad_norm": 0.2700274586677551,
5264
+ "learning_rate": 3.0562209229008042e-06,
5265
+ "loss": 9.5894,
5266
+ "step": 751
5267
+ },
5268
+ {
5269
+ "epoch": 0.9019490254872564,
5270
+ "grad_norm": 0.26212278008461,
5271
+ "learning_rate": 2.982879173828523e-06,
5272
+ "loss": 9.5959,
5273
+ "step": 752
5274
+ },
5275
+ {
5276
+ "epoch": 0.9031484257871064,
5277
+ "grad_norm": 0.2739808261394501,
5278
+ "learning_rate": 2.9104010934192794e-06,
5279
+ "loss": 9.5937,
5280
+ "step": 753
5281
+ },
5282
+ {
5283
+ "epoch": 0.9043478260869565,
5284
+ "grad_norm": 0.2720181345939636,
5285
+ "learning_rate": 2.838788013040139e-06,
5286
+ "loss": 9.5944,
5287
+ "step": 754
5288
+ },
5289
+ {
5290
+ "epoch": 0.9055472263868066,
5291
+ "grad_norm": 0.2759906053543091,
5292
+ "learning_rate": 2.768041248168801e-06,
5293
+ "loss": 9.5952,
5294
+ "step": 755
5295
+ },
5296
+ {
5297
+ "epoch": 0.9067466266866566,
5298
+ "grad_norm": 0.27012184262275696,
5299
+ "learning_rate": 2.6981620983694057e-06,
5300
+ "loss": 9.5966,
5301
+ "step": 756
5302
+ },
5303
+ {
5304
+ "epoch": 0.9079460269865067,
5305
+ "grad_norm": 0.2913030683994293,
5306
+ "learning_rate": 2.6291518472686404e-06,
5307
+ "loss": 9.5965,
5308
+ "step": 757
5309
+ },
5310
+ {
5311
+ "epoch": 0.9091454272863568,
5312
+ "grad_norm": 0.2893541753292084,
5313
+ "learning_rate": 2.5610117625322118e-06,
5314
+ "loss": 9.5964,
5315
+ "step": 758
5316
+ },
5317
+ {
5318
+ "epoch": 0.9103448275862069,
5319
+ "grad_norm": 0.2874806523323059,
5320
+ "learning_rate": 2.4937430958415278e-06,
5321
+ "loss": 9.5995,
5322
+ "step": 759
5323
+ },
5324
+ {
5325
+ "epoch": 0.9115442278860569,
5326
+ "grad_norm": 0.2814995348453522,
5327
+ "learning_rate": 2.427347082870701e-06,
5328
+ "loss": 9.6001,
5329
+ "step": 760
5330
+ },
5331
+ {
5332
+ "epoch": 0.912743628185907,
5333
+ "grad_norm": 0.28927141427993774,
5334
+ "learning_rate": 2.361824943263874e-06,
5335
+ "loss": 9.6073,
5336
+ "step": 761
5337
+ },
5338
+ {
5339
+ "epoch": 0.9139430284857571,
5340
+ "grad_norm": 0.29837363958358765,
5341
+ "learning_rate": 2.2971778806127996e-06,
5342
+ "loss": 9.6054,
5343
+ "step": 762
5344
+ },
5345
+ {
5346
+ "epoch": 0.9151424287856071,
5347
+ "grad_norm": 0.25863122940063477,
5348
+ "learning_rate": 2.233407082434724e-06,
5349
+ "loss": 9.6017,
5350
+ "step": 763
5351
+ },
5352
+ {
5353
+ "epoch": 0.9163418290854572,
5354
+ "grad_norm": 0.26800212264060974,
5355
+ "learning_rate": 2.1705137201505965e-06,
5356
+ "loss": 9.5951,
5357
+ "step": 764
5358
+ },
5359
+ {
5360
+ "epoch": 0.9175412293853074,
5361
+ "grad_norm": 0.2701495587825775,
5362
+ "learning_rate": 2.1084989490635255e-06,
5363
+ "loss": 9.5911,
5364
+ "step": 765
5365
+ },
5366
+ {
5367
+ "epoch": 0.9187406296851575,
5368
+ "grad_norm": 0.27781182527542114,
5369
+ "learning_rate": 2.0473639083375795e-06,
5370
+ "loss": 9.5949,
5371
+ "step": 766
5372
+ },
5373
+ {
5374
+ "epoch": 0.9199400299850075,
5375
+ "grad_norm": 0.26822102069854736,
5376
+ "learning_rate": 1.9871097209768375e-06,
5377
+ "loss": 9.6018,
5378
+ "step": 767
5379
+ },
5380
+ {
5381
+ "epoch": 0.9211394302848576,
5382
+ "grad_norm": 0.2740313708782196,
5383
+ "learning_rate": 1.9277374938047988e-06,
5384
+ "loss": 9.5969,
5385
+ "step": 768
5386
+ },
5387
+ {
5388
+ "epoch": 0.9223388305847077,
5389
+ "grad_norm": 0.28559255599975586,
5390
+ "learning_rate": 1.8692483174439946e-06,
5391
+ "loss": 9.5933,
5392
+ "step": 769
5393
+ },
5394
+ {
5395
+ "epoch": 0.9235382308845578,
5396
+ "grad_norm": 0.28170379996299744,
5397
+ "learning_rate": 1.8116432662960037e-06,
5398
+ "loss": 9.595,
5399
+ "step": 770
5400
+ },
5401
+ {
5402
+ "epoch": 0.9247376311844078,
5403
+ "grad_norm": 0.2856314182281494,
5404
+ "learning_rate": 1.7549233985217074e-06,
5405
+ "loss": 9.5949,
5406
+ "step": 771
5407
+ },
5408
+ {
5409
+ "epoch": 0.9259370314842579,
5410
+ "grad_norm": 0.28359171748161316,
5411
+ "learning_rate": 1.6990897560218211e-06,
5412
+ "loss": 9.5984,
5413
+ "step": 772
5414
+ },
5415
+ {
5416
+ "epoch": 0.927136431784108,
5417
+ "grad_norm": 0.28927284479141235,
5418
+ "learning_rate": 1.644143364417794e-06,
5419
+ "loss": 9.6003,
5420
+ "step": 773
5421
+ },
5422
+ {
5423
+ "epoch": 0.928335832083958,
5424
+ "grad_norm": 0.28748857975006104,
5425
+ "learning_rate": 1.5900852330329563e-06,
5426
+ "loss": 9.6163,
5427
+ "step": 774
5428
+ },
5429
+ {
5430
+ "epoch": 0.9295352323838081,
5431
+ "grad_norm": 0.32728201150894165,
5432
+ "learning_rate": 1.5369163548739462e-06,
5433
+ "loss": 9.6079,
5434
+ "step": 775
5435
+ },
5436
+ {
5437
+ "epoch": 0.9307346326836582,
5438
+ "grad_norm": 0.2640519440174103,
5439
+ "learning_rate": 1.484637706612535e-06,
5440
+ "loss": 9.5952,
5441
+ "step": 776
5442
+ },
5443
+ {
5444
+ "epoch": 0.9319340329835083,
5445
+ "grad_norm": 0.2757025361061096,
5446
+ "learning_rate": 1.4332502485676358e-06,
5447
+ "loss": 9.5968,
5448
+ "step": 777
5449
+ },
5450
+ {
5451
+ "epoch": 0.9331334332833583,
5452
+ "grad_norm": 0.2660670578479767,
5453
+ "learning_rate": 1.3827549246876625e-06,
5454
+ "loss": 9.5966,
5455
+ "step": 778
5456
+ },
5457
+ {
5458
+ "epoch": 0.9343328335832084,
5459
+ "grad_norm": 0.26621681451797485,
5460
+ "learning_rate": 1.333152662533227e-06,
5461
+ "loss": 9.6007,
5462
+ "step": 779
5463
+ },
5464
+ {
5465
+ "epoch": 0.9355322338830585,
5466
+ "grad_norm": 0.2719038426876068,
5467
+ "learning_rate": 1.2844443732600576e-06,
5468
+ "loss": 9.6027,
5469
+ "step": 780
5470
+ },
5471
+ {
5472
+ "epoch": 0.9367316341829085,
5473
+ "grad_norm": 0.278006911277771,
5474
+ "learning_rate": 1.2366309516022966e-06,
5475
+ "loss": 9.6006,
5476
+ "step": 781
5477
+ },
5478
+ {
5479
+ "epoch": 0.9379310344827586,
5480
+ "grad_norm": 0.2759842574596405,
5481
+ "learning_rate": 1.189713275856047e-06,
5482
+ "loss": 9.6019,
5483
+ "step": 782
5484
+ },
5485
+ {
5486
+ "epoch": 0.9391304347826087,
5487
+ "grad_norm": 0.2800839841365814,
5488
+ "learning_rate": 1.1436922078632394e-06,
5489
+ "loss": 9.6011,
5490
+ "step": 783
5491
+ },
5492
+ {
5493
+ "epoch": 0.9403298350824588,
5494
+ "grad_norm": 0.2854614555835724,
5495
+ "learning_rate": 1.0985685929958134e-06,
5496
+ "loss": 9.6005,
5497
+ "step": 784
5498
+ },
5499
+ {
5500
+ "epoch": 0.9415292353823088,
5501
+ "grad_norm": 0.28942301869392395,
5502
+ "learning_rate": 1.0543432601401615e-06,
5503
+ "loss": 9.5996,
5504
+ "step": 785
5505
+ },
5506
+ {
5507
+ "epoch": 0.9427286356821589,
5508
+ "grad_norm": 0.29389646649360657,
5509
+ "learning_rate": 1.0110170216819316e-06,
5510
+ "loss": 9.6074,
5511
+ "step": 786
5512
+ },
5513
+ {
5514
+ "epoch": 0.943928035982009,
5515
+ "grad_norm": 0.29730501770973206,
5516
+ "learning_rate": 9.685906734910988e-07,
5517
+ "loss": 9.6045,
5518
+ "step": 787
5519
+ },
5520
+ {
5521
+ "epoch": 0.945127436281859,
5522
+ "grad_norm": 0.27036651968955994,
5523
+ "learning_rate": 9.270649949073229e-07,
5524
+ "loss": 9.5922,
5525
+ "step": 788
5526
+ },
5527
+ {
5528
+ "epoch": 0.9463268365817091,
5529
+ "grad_norm": 0.26795369386672974,
5530
+ "learning_rate": 8.864407487256699e-07,
5531
+ "loss": 9.5981,
5532
+ "step": 789
5533
+ },
5534
+ {
5535
+ "epoch": 0.9475262368815592,
5536
+ "grad_norm": 0.27786487340927124,
5537
+ "learning_rate": 8.467186811825623e-07,
5538
+ "loss": 9.5882,
5539
+ "step": 790
5540
+ },
5541
+ {
5542
+ "epoch": 0.9487256371814093,
5543
+ "grad_norm": 0.27581360936164856,
5544
+ "learning_rate": 8.07899521942096e-07,
5545
+ "loss": 9.5986,
5546
+ "step": 791
5547
+ },
5548
+ {
5549
+ "epoch": 0.9499250374812593,
5550
+ "grad_norm": 0.27784955501556396,
5551
+ "learning_rate": 7.69983984082634e-07,
5552
+ "loss": 9.5951,
5553
+ "step": 792
5554
+ },
5555
+ {
5556
+ "epoch": 0.9511244377811094,
5557
+ "grad_norm": 0.2779393196105957,
5558
+ "learning_rate": 7.329727640837058e-07,
5559
+ "loss": 9.5992,
5560
+ "step": 793
5561
+ },
5562
+ {
5563
+ "epoch": 0.9523238380809596,
5564
+ "grad_norm": 0.2856026291847229,
5565
+ "learning_rate": 6.968665418131848e-07,
5566
+ "loss": 9.6011,
5567
+ "step": 794
5568
+ },
5569
+ {
5570
+ "epoch": 0.9535232383808095,
5571
+ "grad_norm": 0.2818286418914795,
5572
+ "learning_rate": 6.616659805148695e-07,
5573
+ "loss": 9.603,
5574
+ "step": 795
5575
+ },
5576
+ {
5577
+ "epoch": 0.9547226386806597,
5578
+ "grad_norm": 0.2779069244861603,
5579
+ "learning_rate": 6.273717267962164e-07,
5580
+ "loss": 9.6057,
5581
+ "step": 796
5582
+ },
5583
+ {
5584
+ "epoch": 0.9559220389805098,
5585
+ "grad_norm": 0.28562232851982117,
5586
+ "learning_rate": 5.93984410616527e-07,
5587
+ "loss": 9.5981,
5588
+ "step": 797
5589
+ },
5590
+ {
5591
+ "epoch": 0.9571214392803599,
5592
+ "grad_norm": 0.2814895510673523,
5593
+ "learning_rate": 5.615046452753403e-07,
5594
+ "loss": 9.6072,
5595
+ "step": 798
5596
+ },
5597
+ {
5598
+ "epoch": 0.9583208395802099,
5599
+ "grad_norm": 0.291547030210495,
5600
+ "learning_rate": 5.299330274011916e-07,
5601
+ "loss": 9.6075,
5602
+ "step": 799
5603
+ },
5604
+ {
5605
+ "epoch": 0.95952023988006,
5606
+ "grad_norm": 0.3199189603328705,
5607
+ "learning_rate": 4.992701369406161e-07,
5608
+ "loss": 9.6069,
5609
+ "step": 800
5610
+ },
5611
+ {
5612
+ "epoch": 0.9607196401799101,
5613
+ "grad_norm": 0.26192960143089294,
5614
+ "learning_rate": 4.695165371475463e-07,
5615
+ "loss": 9.5961,
5616
+ "step": 801
5617
+ },
5618
+ {
5619
+ "epoch": 0.9619190404797601,
5620
+ "grad_norm": 0.27191805839538574,
5621
+ "learning_rate": 4.4067277457292556e-07,
5622
+ "loss": 9.5925,
5623
+ "step": 802
5624
+ },
5625
+ {
5626
+ "epoch": 0.9631184407796102,
5627
+ "grad_norm": 0.2721245288848877,
5628
+ "learning_rate": 4.1273937905467185e-07,
5629
+ "loss": 9.5944,
5630
+ "step": 803
5631
+ },
5632
+ {
5633
+ "epoch": 0.9643178410794603,
5634
+ "grad_norm": 0.27019554376602173,
5635
+ "learning_rate": 3.8571686370797443e-07,
5636
+ "loss": 9.5939,
5637
+ "step": 804
5638
+ },
5639
+ {
5640
+ "epoch": 0.9655172413793104,
5641
+ "grad_norm": 0.27776581048965454,
5642
+ "learning_rate": 3.5960572491583466e-07,
5643
+ "loss": 9.5946,
5644
+ "step": 805
5645
+ },
5646
+ {
5647
+ "epoch": 0.9667166416791604,
5648
+ "grad_norm": 0.27804285287857056,
5649
+ "learning_rate": 3.3440644231995664e-07,
5650
+ "loss": 9.5989,
5651
+ "step": 806
5652
+ },
5653
+ {
5654
+ "epoch": 0.9679160419790105,
5655
+ "grad_norm": 0.27963870763778687,
5656
+ "learning_rate": 3.101194788119599e-07,
5657
+ "loss": 9.5996,
5658
+ "step": 807
5659
+ },
5660
+ {
5661
+ "epoch": 0.9691154422788606,
5662
+ "grad_norm": 0.27757352590560913,
5663
+ "learning_rate": 2.867452805248416e-07,
5664
+ "loss": 9.5986,
5665
+ "step": 808
5666
+ },
5667
+ {
5668
+ "epoch": 0.9703148425787106,
5669
+ "grad_norm": 0.27756595611572266,
5670
+ "learning_rate": 2.642842768248055e-07,
5671
+ "loss": 9.6037,
5672
+ "step": 809
5673
+ },
5674
+ {
5675
+ "epoch": 0.9715142428785607,
5676
+ "grad_norm": 0.2873465418815613,
5677
+ "learning_rate": 2.4273688030336805e-07,
5678
+ "loss": 9.6023,
5679
+ "step": 810
5680
+ },
5681
+ {
5682
+ "epoch": 0.9727136431784108,
5683
+ "grad_norm": 0.29148492217063904,
5684
+ "learning_rate": 2.2210348676977023e-07,
5685
+ "loss": 9.5991,
5686
+ "step": 811
5687
+ },
5688
+ {
5689
+ "epoch": 0.9739130434782609,
5690
+ "grad_norm": 0.30888915061950684,
5691
+ "learning_rate": 2.0238447524372205e-07,
5692
+ "loss": 9.6039,
5693
+ "step": 812
5694
+ },
5695
+ {
5696
+ "epoch": 0.9751124437781109,
5697
+ "grad_norm": 0.2737792730331421,
5698
+ "learning_rate": 1.8358020794843056e-07,
5699
+ "loss": 9.5931,
5700
+ "step": 813
5701
+ },
5702
+ {
5703
+ "epoch": 0.976311844077961,
5704
+ "grad_norm": 0.2680344581604004,
5705
+ "learning_rate": 1.6569103030394938e-07,
5706
+ "loss": 9.5942,
5707
+ "step": 814
5708
+ },
5709
+ {
5710
+ "epoch": 0.9775112443778111,
5711
+ "grad_norm": 0.272061288356781,
5712
+ "learning_rate": 1.48717270920834e-07,
5713
+ "loss": 9.5932,
5714
+ "step": 815
5715
+ },
5716
+ {
5717
+ "epoch": 0.9787106446776612,
5718
+ "grad_norm": 0.273945152759552,
5719
+ "learning_rate": 1.3265924159410192e-07,
5720
+ "loss": 9.5924,
5721
+ "step": 816
5722
+ },
5723
+ {
5724
+ "epoch": 0.9799100449775112,
5725
+ "grad_norm": 0.27954599261283875,
5726
+ "learning_rate": 1.1751723729750974e-07,
5727
+ "loss": 9.5923,
5728
+ "step": 817
5729
+ },
5730
+ {
5731
+ "epoch": 0.9811094452773613,
5732
+ "grad_norm": 0.27977848052978516,
5733
+ "learning_rate": 1.0329153617812947e-07,
5734
+ "loss": 9.5911,
5735
+ "step": 818
5736
+ },
5737
+ {
5738
+ "epoch": 0.9823088455772114,
5739
+ "grad_norm": 0.275766521692276,
5740
+ "learning_rate": 8.998239955124721e-08,
5741
+ "loss": 9.6008,
5742
+ "step": 819
5743
+ },
5744
+ {
5745
+ "epoch": 0.9835082458770614,
5746
+ "grad_norm": 0.2739316523075104,
5747
+ "learning_rate": 7.759007189555579e-08,
5748
+ "loss": 9.6012,
5749
+ "step": 820
5750
+ },
5751
+ {
5752
+ "epoch": 0.9847076461769115,
5753
+ "grad_norm": 0.2775361239910126,
5754
+ "learning_rate": 6.611478084866951e-08,
5755
+ "loss": 9.6039,
5756
+ "step": 821
5757
+ },
5758
+ {
5759
+ "epoch": 0.9859070464767616,
5760
+ "grad_norm": 0.2795866131782532,
5761
+ "learning_rate": 5.555673720292753e-08,
5762
+ "loss": 9.6031,
5763
+ "step": 822
5764
+ },
5765
+ {
5766
+ "epoch": 0.9871064467766117,
5767
+ "grad_norm": 0.29514625668525696,
5768
+ "learning_rate": 4.5916134901552443e-08,
5769
+ "loss": 9.6003,
5770
+ "step": 823
5771
+ },
5772
+ {
5773
+ "epoch": 0.9883058470764617,
5774
+ "grad_norm": 0.29738849401474,
5775
+ "learning_rate": 3.7193151035047616e-08,
5776
+ "loss": 9.6031,
5777
+ "step": 824
5778
+ },
5779
+ {
5780
+ "epoch": 0.9895052473763118,
5781
+ "grad_norm": 0.3434954285621643,
5782
+ "learning_rate": 2.93879458379831e-08,
5783
+ "loss": 9.6087,
5784
+ "step": 825
5785
+ },
5786
+ {
5787
+ "epoch": 0.990704647676162,
5788
+ "grad_norm": 0.26993241906166077,
5789
+ "learning_rate": 2.2500662686025797e-08,
5790
+ "loss": 9.5913,
5791
+ "step": 826
5792
+ },
5793
+ {
5794
+ "epoch": 0.991904047976012,
5795
+ "grad_norm": 0.27204930782318115,
5796
+ "learning_rate": 1.653142809331376e-08,
5797
+ "loss": 9.597,
5798
+ "step": 827
5799
+ },
5800
+ {
5801
+ "epoch": 0.993103448275862,
5802
+ "grad_norm": 0.2699795067310333,
5803
+ "learning_rate": 1.148035171014139e-08,
5804
+ "loss": 9.5989,
5805
+ "step": 828
5806
+ },
5807
+ {
5808
+ "epoch": 0.9943028485757122,
5809
+ "grad_norm": 0.2756272852420807,
5810
+ "learning_rate": 7.347526320927723e-09,
5811
+ "loss": 9.6,
5812
+ "step": 829
5813
+ },
5814
+ {
5815
+ "epoch": 0.9955022488755623,
5816
+ "grad_norm": 0.2937001585960388,
5817
+ "learning_rate": 4.133027842517789e-09,
5818
+ "loss": 9.5987,
5819
+ "step": 830
5820
+ },
5821
+ {
5822
+ "epoch": 0.9967016491754123,
5823
+ "grad_norm": 0.2777741551399231,
5824
+ "learning_rate": 1.8369153228114944e-09,
5825
+ "loss": 9.5984,
5826
+ "step": 831
5827
+ },
5828
+ {
5829
+ "epoch": 0.9979010494752624,
5830
+ "grad_norm": 0.277544766664505,
5831
+ "learning_rate": 4.5923093963118335e-10,
5832
+ "loss": 9.6021,
5833
+ "step": 832
5834
+ },
5835
+ {
5836
+ "epoch": 0.9991004497751125,
5837
+ "grad_norm": 0.2950143814086914,
5838
+ "learning_rate": 0.0,
5839
+ "loss": 9.6099,
5840
+ "step": 833
5841
  }
5842
  ],
5843
  "logging_steps": 1,
 
5852
  "should_evaluate": false,
5853
  "should_log": false,
5854
  "should_save": true,
5855
+ "should_training_stop": true
5856
  },
5857
  "attributes": {}
5858
  }
5859
  },
5860
+ "total_flos": 691668038713344.0,
5861
  "train_batch_size": 4,
5862
  "trial_name": null,
5863
  "trial_params": null