JCChen0331 commited on
Commit
ae9c216
·
verified ·
1 Parent(s): cbf866a

Upload 7 files

Browse files
adapter_config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "mistralai/Mistral-7B-v0.3",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 64,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "gate_proj",
24
+ "q_proj",
25
+ "v_proj",
26
+ "k_proj",
27
+ "lm_head",
28
+ "o_proj",
29
+ "up_proj",
30
+ "down_proj"
31
+ ],
32
+ "task_type": "CAUSAL_LM",
33
+ "use_dora": false,
34
+ "use_rslora": false
35
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5fee9f1f89c41ec4d83aca4192d9ba3db313ff0b663e259096739c90edcce649
3
+ size 1217458040
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b688bd125018708521c11f52b012566e969d0456a19530a2b927360f1c7a6ff2
3
+ size 341465180
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c237647a783894602e9a174e2acc3cc6efedfd402125f5a18191b15d3c632982
3
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1169ad6fa975cd7e76c73a14a643ca06f26b62ca618c1fcc1b72bb446a3420cb
3
+ size 1064
trainer_state.json ADDED
@@ -0,0 +1,897 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.6456390565002743,
5
+ "eval_steps": 1000,
6
+ "global_step": 3000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.013713658804168952,
13
+ "grad_norm": 0.9250810742378235,
14
+ "learning_rate": 2.4808205470313544e-05,
15
+ "loss": 2.5905,
16
+ "step": 25
17
+ },
18
+ {
19
+ "epoch": 0.027427317608337904,
20
+ "grad_norm": 0.7566132545471191,
21
+ "learning_rate": 2.459973315543696e-05,
22
+ "loss": 2.1754,
23
+ "step": 50
24
+ },
25
+ {
26
+ "epoch": 0.04114097641250686,
27
+ "grad_norm": 0.8571016788482666,
28
+ "learning_rate": 2.4391260840560377e-05,
29
+ "loss": 2.063,
30
+ "step": 75
31
+ },
32
+ {
33
+ "epoch": 0.05485463521667581,
34
+ "grad_norm": 0.734812319278717,
35
+ "learning_rate": 2.418278852568379e-05,
36
+ "loss": 2.0424,
37
+ "step": 100
38
+ },
39
+ {
40
+ "epoch": 0.06856829402084476,
41
+ "grad_norm": 0.706753671169281,
42
+ "learning_rate": 2.3974316210807207e-05,
43
+ "loss": 2.022,
44
+ "step": 125
45
+ },
46
+ {
47
+ "epoch": 0.08228195282501372,
48
+ "grad_norm": 0.7179118394851685,
49
+ "learning_rate": 2.3765843895930624e-05,
50
+ "loss": 2.0341,
51
+ "step": 150
52
+ },
53
+ {
54
+ "epoch": 0.09599561162918267,
55
+ "grad_norm": 0.7341741323471069,
56
+ "learning_rate": 2.3557371581054037e-05,
57
+ "loss": 2.0339,
58
+ "step": 175
59
+ },
60
+ {
61
+ "epoch": 0.10970927043335162,
62
+ "grad_norm": 0.6983020901679993,
63
+ "learning_rate": 2.334889926617745e-05,
64
+ "loss": 1.9717,
65
+ "step": 200
66
+ },
67
+ {
68
+ "epoch": 0.12342292923752057,
69
+ "grad_norm": 0.7121065855026245,
70
+ "learning_rate": 2.3140426951300867e-05,
71
+ "loss": 1.996,
72
+ "step": 225
73
+ },
74
+ {
75
+ "epoch": 0.13713658804168952,
76
+ "grad_norm": 0.7687849402427673,
77
+ "learning_rate": 2.2931954636424284e-05,
78
+ "loss": 1.9742,
79
+ "step": 250
80
+ },
81
+ {
82
+ "epoch": 0.1508502468458585,
83
+ "grad_norm": 0.7747183442115784,
84
+ "learning_rate": 2.2723482321547697e-05,
85
+ "loss": 1.9774,
86
+ "step": 275
87
+ },
88
+ {
89
+ "epoch": 0.16456390565002743,
90
+ "grad_norm": 0.8771799802780151,
91
+ "learning_rate": 2.2515010006671114e-05,
92
+ "loss": 1.9537,
93
+ "step": 300
94
+ },
95
+ {
96
+ "epoch": 0.17827756445419637,
97
+ "grad_norm": 0.7963676452636719,
98
+ "learning_rate": 2.230653769179453e-05,
99
+ "loss": 1.9408,
100
+ "step": 325
101
+ },
102
+ {
103
+ "epoch": 0.19199122325836535,
104
+ "grad_norm": 0.7438392639160156,
105
+ "learning_rate": 2.2098065376917944e-05,
106
+ "loss": 2.0012,
107
+ "step": 350
108
+ },
109
+ {
110
+ "epoch": 0.2057048820625343,
111
+ "grad_norm": 0.7908737063407898,
112
+ "learning_rate": 2.188959306204136e-05,
113
+ "loss": 1.9485,
114
+ "step": 375
115
+ },
116
+ {
117
+ "epoch": 0.21941854086670323,
118
+ "grad_norm": 0.7139158248901367,
119
+ "learning_rate": 2.1681120747164777e-05,
120
+ "loss": 1.9829,
121
+ "step": 400
122
+ },
123
+ {
124
+ "epoch": 0.23313219967087218,
125
+ "grad_norm": 0.9640923738479614,
126
+ "learning_rate": 2.147264843228819e-05,
127
+ "loss": 2.0043,
128
+ "step": 425
129
+ },
130
+ {
131
+ "epoch": 0.24684585847504115,
132
+ "grad_norm": 0.8830370903015137,
133
+ "learning_rate": 2.1264176117411607e-05,
134
+ "loss": 1.9798,
135
+ "step": 450
136
+ },
137
+ {
138
+ "epoch": 0.2605595172792101,
139
+ "grad_norm": 0.8050561547279358,
140
+ "learning_rate": 2.1055703802535024e-05,
141
+ "loss": 1.9679,
142
+ "step": 475
143
+ },
144
+ {
145
+ "epoch": 0.27427317608337903,
146
+ "grad_norm": 0.8845964074134827,
147
+ "learning_rate": 2.084723148765844e-05,
148
+ "loss": 1.9261,
149
+ "step": 500
150
+ },
151
+ {
152
+ "epoch": 0.287986834887548,
153
+ "grad_norm": 0.7687380909919739,
154
+ "learning_rate": 2.0638759172781854e-05,
155
+ "loss": 1.938,
156
+ "step": 525
157
+ },
158
+ {
159
+ "epoch": 0.301700493691717,
160
+ "grad_norm": 0.8741185665130615,
161
+ "learning_rate": 2.043028685790527e-05,
162
+ "loss": 1.941,
163
+ "step": 550
164
+ },
165
+ {
166
+ "epoch": 0.3154141524958859,
167
+ "grad_norm": 0.8414379358291626,
168
+ "learning_rate": 2.0221814543028687e-05,
169
+ "loss": 1.9442,
170
+ "step": 575
171
+ },
172
+ {
173
+ "epoch": 0.32912781130005486,
174
+ "grad_norm": 0.8874805569648743,
175
+ "learning_rate": 2.00133422281521e-05,
176
+ "loss": 1.9899,
177
+ "step": 600
178
+ },
179
+ {
180
+ "epoch": 0.34284147010422383,
181
+ "grad_norm": 0.8438735604286194,
182
+ "learning_rate": 1.9804869913275517e-05,
183
+ "loss": 1.9332,
184
+ "step": 625
185
+ },
186
+ {
187
+ "epoch": 0.35655512890839275,
188
+ "grad_norm": 0.8141827583312988,
189
+ "learning_rate": 1.9596397598398934e-05,
190
+ "loss": 1.9334,
191
+ "step": 650
192
+ },
193
+ {
194
+ "epoch": 0.3702687877125617,
195
+ "grad_norm": 0.8998775482177734,
196
+ "learning_rate": 1.9387925283522347e-05,
197
+ "loss": 1.9208,
198
+ "step": 675
199
+ },
200
+ {
201
+ "epoch": 0.3839824465167307,
202
+ "grad_norm": 0.8827155232429504,
203
+ "learning_rate": 1.9179452968645764e-05,
204
+ "loss": 1.9287,
205
+ "step": 700
206
+ },
207
+ {
208
+ "epoch": 0.3976961053208996,
209
+ "grad_norm": 0.7951996922492981,
210
+ "learning_rate": 1.897098065376918e-05,
211
+ "loss": 1.9514,
212
+ "step": 725
213
+ },
214
+ {
215
+ "epoch": 0.4114097641250686,
216
+ "grad_norm": 0.784051775932312,
217
+ "learning_rate": 1.8762508338892594e-05,
218
+ "loss": 1.9226,
219
+ "step": 750
220
+ },
221
+ {
222
+ "epoch": 0.4251234229292375,
223
+ "grad_norm": 0.9527340531349182,
224
+ "learning_rate": 1.855403602401601e-05,
225
+ "loss": 1.9459,
226
+ "step": 775
227
+ },
228
+ {
229
+ "epoch": 0.43883708173340646,
230
+ "grad_norm": 0.8100705742835999,
231
+ "learning_rate": 1.8345563709139427e-05,
232
+ "loss": 1.9218,
233
+ "step": 800
234
+ },
235
+ {
236
+ "epoch": 0.45255074053757544,
237
+ "grad_norm": 0.814917266368866,
238
+ "learning_rate": 1.8137091394262844e-05,
239
+ "loss": 1.9682,
240
+ "step": 825
241
+ },
242
+ {
243
+ "epoch": 0.46626439934174435,
244
+ "grad_norm": 0.8693529367446899,
245
+ "learning_rate": 1.7928619079386257e-05,
246
+ "loss": 1.9068,
247
+ "step": 850
248
+ },
249
+ {
250
+ "epoch": 0.4799780581459133,
251
+ "grad_norm": 0.8571690320968628,
252
+ "learning_rate": 1.7720146764509674e-05,
253
+ "loss": 1.9225,
254
+ "step": 875
255
+ },
256
+ {
257
+ "epoch": 0.4936917169500823,
258
+ "grad_norm": 0.9239374995231628,
259
+ "learning_rate": 1.751167444963309e-05,
260
+ "loss": 1.9628,
261
+ "step": 900
262
+ },
263
+ {
264
+ "epoch": 0.5074053757542513,
265
+ "grad_norm": 0.8887170553207397,
266
+ "learning_rate": 1.7303202134756504e-05,
267
+ "loss": 1.8972,
268
+ "step": 925
269
+ },
270
+ {
271
+ "epoch": 0.5211190345584202,
272
+ "grad_norm": 0.8989230394363403,
273
+ "learning_rate": 1.709472981987992e-05,
274
+ "loss": 1.9517,
275
+ "step": 950
276
+ },
277
+ {
278
+ "epoch": 0.5348326933625891,
279
+ "grad_norm": 0.8530762791633606,
280
+ "learning_rate": 1.6886257505003337e-05,
281
+ "loss": 1.9163,
282
+ "step": 975
283
+ },
284
+ {
285
+ "epoch": 0.5485463521667581,
286
+ "grad_norm": 0.9289182424545288,
287
+ "learning_rate": 1.667778519012675e-05,
288
+ "loss": 1.9399,
289
+ "step": 1000
290
+ },
291
+ {
292
+ "epoch": 0.5485463521667581,
293
+ "eval_loss": 1.8345812559127808,
294
+ "eval_runtime": 12.1751,
295
+ "eval_samples_per_second": 11.91,
296
+ "eval_steps_per_second": 1.561,
297
+ "step": 1000
298
+ },
299
+ {
300
+ "epoch": 0.562260010970927,
301
+ "grad_norm": 0.9478083848953247,
302
+ "learning_rate": 1.6469312875250167e-05,
303
+ "loss": 1.9219,
304
+ "step": 1025
305
+ },
306
+ {
307
+ "epoch": 0.575973669775096,
308
+ "grad_norm": 0.9554803371429443,
309
+ "learning_rate": 1.6260840560373584e-05,
310
+ "loss": 1.9297,
311
+ "step": 1050
312
+ },
313
+ {
314
+ "epoch": 0.589687328579265,
315
+ "grad_norm": 0.9292556643486023,
316
+ "learning_rate": 1.6052368245496997e-05,
317
+ "loss": 1.9245,
318
+ "step": 1075
319
+ },
320
+ {
321
+ "epoch": 0.603400987383434,
322
+ "grad_norm": 0.9070860743522644,
323
+ "learning_rate": 1.5843895930620414e-05,
324
+ "loss": 1.9247,
325
+ "step": 1100
326
+ },
327
+ {
328
+ "epoch": 0.6171146461876028,
329
+ "grad_norm": 0.883678674697876,
330
+ "learning_rate": 1.563542361574383e-05,
331
+ "loss": 1.9041,
332
+ "step": 1125
333
+ },
334
+ {
335
+ "epoch": 0.6308283049917718,
336
+ "grad_norm": 0.9620354175567627,
337
+ "learning_rate": 1.5426951300867247e-05,
338
+ "loss": 1.9394,
339
+ "step": 1150
340
+ },
341
+ {
342
+ "epoch": 0.6445419637959408,
343
+ "grad_norm": 0.9458388090133667,
344
+ "learning_rate": 1.521847898599066e-05,
345
+ "loss": 1.9513,
346
+ "step": 1175
347
+ },
348
+ {
349
+ "epoch": 0.6582556226001097,
350
+ "grad_norm": 0.9318162798881531,
351
+ "learning_rate": 1.5010006671114077e-05,
352
+ "loss": 1.8923,
353
+ "step": 1200
354
+ },
355
+ {
356
+ "epoch": 0.6719692814042787,
357
+ "grad_norm": 0.8988519906997681,
358
+ "learning_rate": 1.4801534356237492e-05,
359
+ "loss": 1.9337,
360
+ "step": 1225
361
+ },
362
+ {
363
+ "epoch": 0.6856829402084477,
364
+ "grad_norm": 0.9615154266357422,
365
+ "learning_rate": 1.4593062041360909e-05,
366
+ "loss": 1.9118,
367
+ "step": 1250
368
+ },
369
+ {
370
+ "epoch": 0.6993965990126165,
371
+ "grad_norm": 0.9634252190589905,
372
+ "learning_rate": 1.4384589726484324e-05,
373
+ "loss": 1.8784,
374
+ "step": 1275
375
+ },
376
+ {
377
+ "epoch": 0.7131102578167855,
378
+ "grad_norm": 0.9005519151687622,
379
+ "learning_rate": 1.4176117411607739e-05,
380
+ "loss": 1.9124,
381
+ "step": 1300
382
+ },
383
+ {
384
+ "epoch": 0.7268239166209545,
385
+ "grad_norm": 0.9393877387046814,
386
+ "learning_rate": 1.3967645096731155e-05,
387
+ "loss": 1.925,
388
+ "step": 1325
389
+ },
390
+ {
391
+ "epoch": 0.7405375754251234,
392
+ "grad_norm": 0.9061549305915833,
393
+ "learning_rate": 1.375917278185457e-05,
394
+ "loss": 1.9245,
395
+ "step": 1350
396
+ },
397
+ {
398
+ "epoch": 0.7542512342292924,
399
+ "grad_norm": 0.9310101270675659,
400
+ "learning_rate": 1.3550700466977987e-05,
401
+ "loss": 1.9177,
402
+ "step": 1375
403
+ },
404
+ {
405
+ "epoch": 0.7679648930334614,
406
+ "grad_norm": 0.892022430896759,
407
+ "learning_rate": 1.3342228152101402e-05,
408
+ "loss": 1.8819,
409
+ "step": 1400
410
+ },
411
+ {
412
+ "epoch": 0.7816785518376302,
413
+ "grad_norm": 1.0094044208526611,
414
+ "learning_rate": 1.3133755837224817e-05,
415
+ "loss": 1.9032,
416
+ "step": 1425
417
+ },
418
+ {
419
+ "epoch": 0.7953922106417992,
420
+ "grad_norm": 0.9686225652694702,
421
+ "learning_rate": 1.2925283522348234e-05,
422
+ "loss": 1.9229,
423
+ "step": 1450
424
+ },
425
+ {
426
+ "epoch": 0.8091058694459682,
427
+ "grad_norm": 0.8588367104530334,
428
+ "learning_rate": 1.2716811207471649e-05,
429
+ "loss": 1.8997,
430
+ "step": 1475
431
+ },
432
+ {
433
+ "epoch": 0.8228195282501372,
434
+ "grad_norm": 0.9800122380256653,
435
+ "learning_rate": 1.2508338892595064e-05,
436
+ "loss": 1.9087,
437
+ "step": 1500
438
+ },
439
+ {
440
+ "epoch": 0.8365331870543061,
441
+ "grad_norm": 0.9647035002708435,
442
+ "learning_rate": 1.229986657771848e-05,
443
+ "loss": 1.9239,
444
+ "step": 1525
445
+ },
446
+ {
447
+ "epoch": 0.850246845858475,
448
+ "grad_norm": 0.9459319114685059,
449
+ "learning_rate": 1.2091394262841895e-05,
450
+ "loss": 1.88,
451
+ "step": 1550
452
+ },
453
+ {
454
+ "epoch": 0.863960504662644,
455
+ "grad_norm": 0.989025354385376,
456
+ "learning_rate": 1.1882921947965312e-05,
457
+ "loss": 1.9134,
458
+ "step": 1575
459
+ },
460
+ {
461
+ "epoch": 0.8776741634668129,
462
+ "grad_norm": 0.9573891162872314,
463
+ "learning_rate": 1.1674449633088725e-05,
464
+ "loss": 1.917,
465
+ "step": 1600
466
+ },
467
+ {
468
+ "epoch": 0.8913878222709819,
469
+ "grad_norm": 0.9619189500808716,
470
+ "learning_rate": 1.1465977318212142e-05,
471
+ "loss": 1.9202,
472
+ "step": 1625
473
+ },
474
+ {
475
+ "epoch": 0.9051014810751509,
476
+ "grad_norm": 0.959064781665802,
477
+ "learning_rate": 1.1257505003335557e-05,
478
+ "loss": 1.9053,
479
+ "step": 1650
480
+ },
481
+ {
482
+ "epoch": 0.9188151398793198,
483
+ "grad_norm": 1.0735059976577759,
484
+ "learning_rate": 1.1049032688458972e-05,
485
+ "loss": 1.914,
486
+ "step": 1675
487
+ },
488
+ {
489
+ "epoch": 0.9325287986834887,
490
+ "grad_norm": 0.8986095786094666,
491
+ "learning_rate": 1.0840560373582389e-05,
492
+ "loss": 1.8883,
493
+ "step": 1700
494
+ },
495
+ {
496
+ "epoch": 0.9462424574876577,
497
+ "grad_norm": 0.9695867300033569,
498
+ "learning_rate": 1.0632088058705804e-05,
499
+ "loss": 1.892,
500
+ "step": 1725
501
+ },
502
+ {
503
+ "epoch": 0.9599561162918266,
504
+ "grad_norm": 0.893974244594574,
505
+ "learning_rate": 1.042361574382922e-05,
506
+ "loss": 1.9094,
507
+ "step": 1750
508
+ },
509
+ {
510
+ "epoch": 0.9736697750959956,
511
+ "grad_norm": 1.0204190015792847,
512
+ "learning_rate": 1.0215143428952635e-05,
513
+ "loss": 1.9121,
514
+ "step": 1775
515
+ },
516
+ {
517
+ "epoch": 0.9873834339001646,
518
+ "grad_norm": 0.9737017750740051,
519
+ "learning_rate": 1.000667111407605e-05,
520
+ "loss": 1.9003,
521
+ "step": 1800
522
+ },
523
+ {
524
+ "epoch": 1.0010970927043334,
525
+ "grad_norm": 0.9492760300636292,
526
+ "learning_rate": 9.798198799199467e-06,
527
+ "loss": 1.9097,
528
+ "step": 1825
529
+ },
530
+ {
531
+ "epoch": 1.0148107515085025,
532
+ "grad_norm": 0.9199559688568115,
533
+ "learning_rate": 9.589726484322882e-06,
534
+ "loss": 1.7813,
535
+ "step": 1850
536
+ },
537
+ {
538
+ "epoch": 1.0285244103126714,
539
+ "grad_norm": 0.8272762298583984,
540
+ "learning_rate": 9.381254169446297e-06,
541
+ "loss": 1.8073,
542
+ "step": 1875
543
+ },
544
+ {
545
+ "epoch": 1.0422380691168405,
546
+ "grad_norm": 0.9935997724533081,
547
+ "learning_rate": 9.172781854569714e-06,
548
+ "loss": 1.7889,
549
+ "step": 1900
550
+ },
551
+ {
552
+ "epoch": 1.0559517279210093,
553
+ "grad_norm": 1.0486711263656616,
554
+ "learning_rate": 8.964309539693129e-06,
555
+ "loss": 1.8072,
556
+ "step": 1925
557
+ },
558
+ {
559
+ "epoch": 1.0696653867251782,
560
+ "grad_norm": 1.0662914514541626,
561
+ "learning_rate": 8.755837224816545e-06,
562
+ "loss": 1.8171,
563
+ "step": 1950
564
+ },
565
+ {
566
+ "epoch": 1.0833790455293473,
567
+ "grad_norm": 0.9916715621948242,
568
+ "learning_rate": 8.54736490993996e-06,
569
+ "loss": 1.8002,
570
+ "step": 1975
571
+ },
572
+ {
573
+ "epoch": 1.0970927043335161,
574
+ "grad_norm": 1.0449947118759155,
575
+ "learning_rate": 8.338892595063375e-06,
576
+ "loss": 1.8416,
577
+ "step": 2000
578
+ },
579
+ {
580
+ "epoch": 1.0970927043335161,
581
+ "eval_loss": 1.7659664154052734,
582
+ "eval_runtime": 12.173,
583
+ "eval_samples_per_second": 11.912,
584
+ "eval_steps_per_second": 1.561,
585
+ "step": 2000
586
+ },
587
+ {
588
+ "epoch": 1.1108063631376852,
589
+ "grad_norm": 1.0766034126281738,
590
+ "learning_rate": 8.130420280186792e-06,
591
+ "loss": 1.8049,
592
+ "step": 2025
593
+ },
594
+ {
595
+ "epoch": 1.124520021941854,
596
+ "grad_norm": 1.1313064098358154,
597
+ "learning_rate": 7.921947965310207e-06,
598
+ "loss": 1.7966,
599
+ "step": 2050
600
+ },
601
+ {
602
+ "epoch": 1.1382336807460232,
603
+ "grad_norm": 1.219266653060913,
604
+ "learning_rate": 7.713475650433624e-06,
605
+ "loss": 1.839,
606
+ "step": 2075
607
+ },
608
+ {
609
+ "epoch": 1.151947339550192,
610
+ "grad_norm": 1.171193242073059,
611
+ "learning_rate": 7.5050033355570386e-06,
612
+ "loss": 1.8187,
613
+ "step": 2100
614
+ },
615
+ {
616
+ "epoch": 1.1656609983543609,
617
+ "grad_norm": 1.0916893482208252,
618
+ "learning_rate": 7.296531020680454e-06,
619
+ "loss": 1.8144,
620
+ "step": 2125
621
+ },
622
+ {
623
+ "epoch": 1.17937465715853,
624
+ "grad_norm": 1.0833641290664673,
625
+ "learning_rate": 7.088058705803869e-06,
626
+ "loss": 1.8367,
627
+ "step": 2150
628
+ },
629
+ {
630
+ "epoch": 1.1930883159626988,
631
+ "grad_norm": 1.1324750185012817,
632
+ "learning_rate": 6.879586390927285e-06,
633
+ "loss": 1.8323,
634
+ "step": 2175
635
+ },
636
+ {
637
+ "epoch": 1.2068019747668677,
638
+ "grad_norm": 1.1888611316680908,
639
+ "learning_rate": 6.671114076050701e-06,
640
+ "loss": 1.8014,
641
+ "step": 2200
642
+ },
643
+ {
644
+ "epoch": 1.2205156335710368,
645
+ "grad_norm": 1.2182481288909912,
646
+ "learning_rate": 6.462641761174117e-06,
647
+ "loss": 1.8353,
648
+ "step": 2225
649
+ },
650
+ {
651
+ "epoch": 1.2342292923752056,
652
+ "grad_norm": 1.2250556945800781,
653
+ "learning_rate": 6.254169446297532e-06,
654
+ "loss": 1.816,
655
+ "step": 2250
656
+ },
657
+ {
658
+ "epoch": 1.2479429511793747,
659
+ "grad_norm": 1.2347311973571777,
660
+ "learning_rate": 6.045697131420948e-06,
661
+ "loss": 1.8175,
662
+ "step": 2275
663
+ },
664
+ {
665
+ "epoch": 1.2616566099835436,
666
+ "grad_norm": 1.2559269666671753,
667
+ "learning_rate": 5.837224816544363e-06,
668
+ "loss": 1.8196,
669
+ "step": 2300
670
+ },
671
+ {
672
+ "epoch": 1.2753702687877126,
673
+ "grad_norm": 1.2170192003250122,
674
+ "learning_rate": 5.6287525016677785e-06,
675
+ "loss": 1.8137,
676
+ "step": 2325
677
+ },
678
+ {
679
+ "epoch": 1.2890839275918815,
680
+ "grad_norm": 1.2384990453720093,
681
+ "learning_rate": 5.420280186791194e-06,
682
+ "loss": 1.7757,
683
+ "step": 2350
684
+ },
685
+ {
686
+ "epoch": 1.3027975863960504,
687
+ "grad_norm": 1.2390954494476318,
688
+ "learning_rate": 5.21180787191461e-06,
689
+ "loss": 1.7952,
690
+ "step": 2375
691
+ },
692
+ {
693
+ "epoch": 1.3165112452002194,
694
+ "grad_norm": 1.3812655210494995,
695
+ "learning_rate": 5.003335557038025e-06,
696
+ "loss": 1.8311,
697
+ "step": 2400
698
+ },
699
+ {
700
+ "epoch": 1.3302249040043883,
701
+ "grad_norm": 1.2410950660705566,
702
+ "learning_rate": 4.794863242161441e-06,
703
+ "loss": 1.7996,
704
+ "step": 2425
705
+ },
706
+ {
707
+ "epoch": 1.3439385628085574,
708
+ "grad_norm": 1.1785918474197388,
709
+ "learning_rate": 4.586390927284857e-06,
710
+ "loss": 1.793,
711
+ "step": 2450
712
+ },
713
+ {
714
+ "epoch": 1.3576522216127263,
715
+ "grad_norm": 1.4189759492874146,
716
+ "learning_rate": 4.377918612408273e-06,
717
+ "loss": 1.8558,
718
+ "step": 2475
719
+ },
720
+ {
721
+ "epoch": 1.3713658804168953,
722
+ "grad_norm": 1.209304928779602,
723
+ "learning_rate": 4.169446297531688e-06,
724
+ "loss": 1.7649,
725
+ "step": 2500
726
+ },
727
+ {
728
+ "epoch": 1.3850795392210642,
729
+ "grad_norm": 1.2057141065597534,
730
+ "learning_rate": 3.9609739826551035e-06,
731
+ "loss": 1.8148,
732
+ "step": 2525
733
+ },
734
+ {
735
+ "epoch": 1.398793198025233,
736
+ "grad_norm": 1.0700486898422241,
737
+ "learning_rate": 3.7525016677785193e-06,
738
+ "loss": 1.8063,
739
+ "step": 2550
740
+ },
741
+ {
742
+ "epoch": 1.4125068568294021,
743
+ "grad_norm": 1.1492820978164673,
744
+ "learning_rate": 3.5440293529019347e-06,
745
+ "loss": 1.7777,
746
+ "step": 2575
747
+ },
748
+ {
749
+ "epoch": 1.426220515633571,
750
+ "grad_norm": 1.2483413219451904,
751
+ "learning_rate": 3.3355570380253505e-06,
752
+ "loss": 1.8595,
753
+ "step": 2600
754
+ },
755
+ {
756
+ "epoch": 1.4399341744377399,
757
+ "grad_norm": 1.384775996208191,
758
+ "learning_rate": 3.127084723148766e-06,
759
+ "loss": 1.7923,
760
+ "step": 2625
761
+ },
762
+ {
763
+ "epoch": 1.453647833241909,
764
+ "grad_norm": 1.2097506523132324,
765
+ "learning_rate": 2.9186124082721813e-06,
766
+ "loss": 1.8107,
767
+ "step": 2650
768
+ },
769
+ {
770
+ "epoch": 1.467361492046078,
771
+ "grad_norm": 1.2465295791625977,
772
+ "learning_rate": 2.710140093395597e-06,
773
+ "loss": 1.8037,
774
+ "step": 2675
775
+ },
776
+ {
777
+ "epoch": 1.4810751508502469,
778
+ "grad_norm": 1.3312894105911255,
779
+ "learning_rate": 2.5016677785190126e-06,
780
+ "loss": 1.8149,
781
+ "step": 2700
782
+ },
783
+ {
784
+ "epoch": 1.4947888096544157,
785
+ "grad_norm": 1.1086313724517822,
786
+ "learning_rate": 2.2931954636424284e-06,
787
+ "loss": 1.8261,
788
+ "step": 2725
789
+ },
790
+ {
791
+ "epoch": 1.5085024684585848,
792
+ "grad_norm": 1.3292433023452759,
793
+ "learning_rate": 2.084723148765844e-06,
794
+ "loss": 1.8145,
795
+ "step": 2750
796
+ },
797
+ {
798
+ "epoch": 1.5222161272627537,
799
+ "grad_norm": 1.2049185037612915,
800
+ "learning_rate": 1.8762508338892596e-06,
801
+ "loss": 1.8255,
802
+ "step": 2775
803
+ },
804
+ {
805
+ "epoch": 1.5359297860669225,
806
+ "grad_norm": 1.266183614730835,
807
+ "learning_rate": 1.6677785190126753e-06,
808
+ "loss": 1.7954,
809
+ "step": 2800
810
+ },
811
+ {
812
+ "epoch": 1.5496434448710916,
813
+ "grad_norm": 1.337235927581787,
814
+ "learning_rate": 1.4593062041360907e-06,
815
+ "loss": 1.7749,
816
+ "step": 2825
817
+ },
818
+ {
819
+ "epoch": 1.5633571036752607,
820
+ "grad_norm": 1.3112897872924805,
821
+ "learning_rate": 1.2508338892595063e-06,
822
+ "loss": 1.8236,
823
+ "step": 2850
824
+ },
825
+ {
826
+ "epoch": 1.5770707624794296,
827
+ "grad_norm": 1.24995756149292,
828
+ "learning_rate": 1.042361574382922e-06,
829
+ "loss": 1.8364,
830
+ "step": 2875
831
+ },
832
+ {
833
+ "epoch": 1.5907844212835984,
834
+ "grad_norm": 1.2855443954467773,
835
+ "learning_rate": 8.338892595063376e-07,
836
+ "loss": 1.8267,
837
+ "step": 2900
838
+ },
839
+ {
840
+ "epoch": 1.6044980800877675,
841
+ "grad_norm": 1.1317636966705322,
842
+ "learning_rate": 6.254169446297531e-07,
843
+ "loss": 1.8185,
844
+ "step": 2925
845
+ },
846
+ {
847
+ "epoch": 1.6182117388919364,
848
+ "grad_norm": 1.3289167881011963,
849
+ "learning_rate": 4.169446297531688e-07,
850
+ "loss": 1.8577,
851
+ "step": 2950
852
+ },
853
+ {
854
+ "epoch": 1.6319253976961052,
855
+ "grad_norm": 1.2797874212265015,
856
+ "learning_rate": 2.084723148765844e-07,
857
+ "loss": 1.811,
858
+ "step": 2975
859
+ },
860
+ {
861
+ "epoch": 1.6456390565002743,
862
+ "grad_norm": 1.3882725238800049,
863
+ "learning_rate": 0.0,
864
+ "loss": 1.8245,
865
+ "step": 3000
866
+ },
867
+ {
868
+ "epoch": 1.6456390565002743,
869
+ "eval_loss": 1.7348041534423828,
870
+ "eval_runtime": 12.1692,
871
+ "eval_samples_per_second": 11.915,
872
+ "eval_steps_per_second": 1.561,
873
+ "step": 3000
874
+ }
875
+ ],
876
+ "logging_steps": 25,
877
+ "max_steps": 3000,
878
+ "num_input_tokens_seen": 0,
879
+ "num_train_epochs": 2,
880
+ "save_steps": 1000,
881
+ "stateful_callbacks": {
882
+ "TrainerControl": {
883
+ "args": {
884
+ "should_epoch_stop": false,
885
+ "should_evaluate": false,
886
+ "should_log": false,
887
+ "should_save": true,
888
+ "should_training_stop": true
889
+ },
890
+ "attributes": {}
891
+ }
892
+ },
893
+ "total_flos": 1.0737917404957901e+18,
894
+ "train_batch_size": 16,
895
+ "trial_name": null,
896
+ "trial_params": null
897
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce91bab46a833b3b653f3689597f2de95a9586bd5a81305a79d7f8e04ef09dff
3
+ size 5112