ChiefTheLord commited on
Commit
8164967
verified
1 Parent(s): b5d6954

Upload folder using huggingface_hub

Browse files
flickr8k_checkpoints/checkpoint-1208/adapter.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8e99df632ba281149a567ab0f850dc3be5c51d9b63810eb68577d29603d14c0e
3
  size 17064856
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c2baa719ba3d00319742871c6ccf4f3f8a64aa3dbd92a3923116006562f9ba9
3
  size 17064856
flickr8k_checkpoints/checkpoint-1208/eval_state.json CHANGED
The diff for this file is too large to render. See raw diff
 
flickr8k_checkpoints/checkpoint-1208/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:067b5fde8b60af2370dac43b51a0eaaed1d55487d4847e177307de9f998befed
3
  size 8714492
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33cc226a0932968bbd20d757ed7c15ae79d5418ad0f04f6e722ba1f6f8445b7d
3
  size 8714492
flickr8k_checkpoints/checkpoint-1208/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6db6a4c086774bca871a88695029095eba8e64c70a4cdf4e980cb249f462eb44
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:229084bbb0e922d44dba4cc9206f54220a162a8229897c2f3d636ef81c5c5418
3
  size 1064
flickr8k_checkpoints/checkpoint-1208/trainer_state.json CHANGED
@@ -10,565 +10,565 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.026490066225165563,
13
- "grad_norm": 0.23711644113063812,
14
  "learning_rate": 0.0004324324324324325,
15
- "loss": 4.5092,
16
  "step": 16
17
  },
18
  {
19
  "epoch": 0.052980132450331126,
20
- "grad_norm": 0.285342812538147,
21
- "learning_rate": 0.0007297297297297297,
22
- "loss": 4.9762,
23
  "step": 32
24
  },
25
  {
26
  "epoch": 0.07947019867549669,
27
- "grad_norm": 0.09107156842947006,
28
- "learning_rate": 0.0009999352232600816,
29
- "loss": 4.3898,
30
  "step": 48
31
  },
32
  {
33
  "epoch": 0.10596026490066225,
34
- "grad_norm": 0.11629017442464828,
35
- "learning_rate": 0.0009991293467102582,
36
- "loss": 4.3831,
37
  "step": 64
38
  },
39
  {
40
  "epoch": 0.13245033112582782,
41
- "grad_norm": 0.1688951849937439,
42
- "learning_rate": 0.000997403926531526,
43
- "loss": 4.3703,
44
  "step": 80
45
  },
46
  {
47
  "epoch": 0.15894039735099338,
48
- "grad_norm": 0.11590718477964401,
49
- "learning_rate": 0.0009947621414572996,
50
- "loss": 4.4239,
51
  "step": 96
52
  },
53
  {
54
  "epoch": 0.18543046357615894,
55
- "grad_norm": 0.1131146252155304,
56
- "learning_rate": 0.0009912088584356955,
57
- "loss": 4.3535,
58
  "step": 112
59
  },
60
  {
61
  "epoch": 0.2119205298013245,
62
- "grad_norm": 0.136209174990654,
63
- "learning_rate": 0.000986750623663177,
64
- "loss": 4.3339,
65
  "step": 128
66
  },
67
  {
68
  "epoch": 0.23841059602649006,
69
- "grad_norm": 0.18205471336841583,
70
- "learning_rate": 0.000981395650524528,
71
- "loss": 4.3282,
72
  "step": 144
73
  },
74
  {
75
  "epoch": 0.26490066225165565,
76
- "grad_norm": 0.10476769506931305,
77
- "learning_rate": 0.000975153804461372,
78
- "loss": 4.3313,
79
  "step": 160
80
  },
81
  {
82
  "epoch": 0.2913907284768212,
83
- "grad_norm": 0.0960649624466896,
84
- "learning_rate": 0.0009680365847971162,
85
- "loss": 4.3201,
86
  "step": 176
87
  },
88
  {
89
  "epoch": 0.31788079470198677,
90
- "grad_norm": 0.13584400713443756,
91
- "learning_rate": 0.0009600571035518034,
92
- "loss": 4.3756,
93
  "step": 192
94
  },
95
  {
96
  "epoch": 0.3443708609271523,
97
- "grad_norm": 0.10583332926034927,
98
- "learning_rate": 0.000951230061285898,
99
- "loss": 4.3302,
100
  "step": 208
101
  },
102
  {
103
  "epoch": 0.3708609271523179,
104
- "grad_norm": 0.13830633461475372,
105
- "learning_rate": 0.0009415717200175151,
106
- "loss": 4.3383,
107
  "step": 224
108
  },
109
  {
110
  "epoch": 0.3973509933774834,
111
- "grad_norm": 0.12482637912034988,
112
- "learning_rate": 0.0009310998732629798,
113
- "loss": 4.3083,
114
  "step": 240
115
  },
116
  {
117
  "epoch": 0.423841059602649,
118
- "grad_norm": 0.11580520868301392,
119
- "learning_rate": 0.0009198338132559177,
120
- "loss": 4.3313,
121
  "step": 256
122
  },
123
  {
124
  "epoch": 0.4503311258278146,
125
- "grad_norm": 0.15633529424667358,
126
- "learning_rate": 0.0009077942954052636,
127
- "loss": 4.3099,
128
  "step": 272
129
  },
130
  {
131
  "epoch": 0.4768211920529801,
132
- "grad_norm": 0.12987282872200012,
133
- "learning_rate": 0.0008950035000576705,
134
- "loss": 4.2932,
135
  "step": 288
136
  },
137
  {
138
  "epoch": 0.5033112582781457,
139
- "grad_norm": 0.1630668193101883,
140
- "learning_rate": 0.000881484991634762,
141
- "loss": 4.3073,
142
  "step": 304
143
  },
144
  {
145
  "epoch": 0.5298013245033113,
146
- "grad_norm": 0.14686144888401031,
147
- "learning_rate": 0.0008672636752205099,
148
- "loss": 4.2848,
149
  "step": 320
150
  },
151
  {
152
  "epoch": 0.5562913907284768,
153
- "grad_norm": 0.1868494600057602,
154
- "learning_rate": 0.0008523657506787162,
155
- "loss": 4.3405,
156
  "step": 336
157
  },
158
  {
159
  "epoch": 0.5827814569536424,
160
- "grad_norm": 0.11744031310081482,
161
- "learning_rate": 0.0008368186643851284,
162
- "loss": 4.3128,
163
  "step": 352
164
  },
165
  {
166
  "epoch": 0.609271523178808,
167
- "grad_norm": 0.14506162703037262,
168
- "learning_rate": 0.0008206510586631114,
169
- "loss": 4.3208,
170
  "step": 368
171
  },
172
  {
173
  "epoch": 0.6357615894039735,
174
- "grad_norm": 0.12040343135595322,
175
- "learning_rate": 0.0008038927190160309,
176
- "loss": 4.2675,
177
  "step": 384
178
  },
179
  {
180
  "epoch": 0.6622516556291391,
181
- "grad_norm": 0.11899886280298233,
182
- "learning_rate": 0.000786574519253562,
183
- "loss": 4.2686,
184
  "step": 400
185
  },
186
  {
187
  "epoch": 0.6887417218543046,
188
- "grad_norm": 0.12363821268081665,
189
- "learning_rate": 0.0007687283646130157,
190
- "loss": 4.2989,
191
  "step": 416
192
  },
193
  {
194
  "epoch": 0.7152317880794702,
195
- "grad_norm": 0.13222382962703705,
196
- "learning_rate": 0.0007503871329804718,
197
- "loss": 4.2919,
198
  "step": 432
199
  },
200
  {
201
  "epoch": 0.7417218543046358,
202
- "grad_norm": 0.11946064233779907,
203
- "learning_rate": 0.0007315846143200053,
204
- "loss": 4.2956,
205
  "step": 448
206
  },
207
  {
208
  "epoch": 0.7682119205298014,
209
- "grad_norm": 0.1136200875043869,
210
- "learning_rate": 0.0007123554484225956,
211
- "loss": 4.251,
212
  "step": 464
213
  },
214
  {
215
  "epoch": 0.7947019867549668,
216
- "grad_norm": 0.13116462528705597,
217
- "learning_rate": 0.0006927350610894034,
218
- "loss": 4.3105,
219
  "step": 480
220
  },
221
  {
222
  "epoch": 0.8211920529801324,
223
- "grad_norm": 0.10559297353029251,
224
- "learning_rate": 0.0006727595988669864,
225
- "loss": 4.262,
226
  "step": 496
227
  },
228
  {
229
  "epoch": 0.847682119205298,
230
- "grad_norm": 0.11165904253721237,
231
- "learning_rate": 0.0006524658624546864,
232
- "loss": 4.3069,
233
  "step": 512
234
  },
235
  {
236
  "epoch": 0.8741721854304636,
237
- "grad_norm": 0.10997848957777023,
238
- "learning_rate": 0.0006318912389068766,
239
- "loss": 4.2901,
240
  "step": 528
241
  },
242
  {
243
  "epoch": 0.9006622516556292,
244
- "grad_norm": 0.11981035768985748,
245
- "learning_rate": 0.0006110736327549683,
246
- "loss": 4.2744,
247
  "step": 544
248
  },
249
  {
250
  "epoch": 0.9271523178807947,
251
- "grad_norm": 0.11829685419797897,
252
- "learning_rate": 0.0005900513961760718,
253
- "loss": 4.3009,
254
  "step": 560
255
  },
256
  {
257
  "epoch": 0.9536423841059603,
258
- "grad_norm": 0.09722407907247543,
259
- "learning_rate": 0.0005688632583369634,
260
- "loss": 4.2738,
261
  "step": 576
262
  },
263
  {
264
  "epoch": 0.9801324503311258,
265
- "grad_norm": 0.12697675824165344,
266
- "learning_rate": 0.0005475482540435239,
267
- "loss": 4.3521,
268
  "step": 592
269
  },
270
  {
271
  "epoch": 1.0,
272
- "eval_bleu": 0.10730040886167266,
273
- "eval_cap_loss": 1.229803388481898,
274
- "eval_con_loss": 1.8703024170256608,
275
- "eval_loss": 3.100105801560231,
276
  "step": 604
277
  },
278
  {
279
  "epoch": 1.0,
280
- "eval_bleu": 0.10730040886167266,
281
- "eval_cap_loss": 1.229803388481898,
282
- "eval_con_loss": 1.8703024170256608,
283
- "eval_loss": 3.100105801560231,
284
- "eval_runtime": 247.7711,
285
- "eval_samples_per_second": 19.49,
286
- "eval_steps_per_second": 2.438,
287
  "step": 604
288
  },
289
  {
290
  "epoch": 1.0066225165562914,
291
- "grad_norm": 0.13247686624526978,
292
- "learning_rate": 0.000526145651827102,
293
- "loss": 4.2509,
294
  "step": 608
295
  },
296
  {
297
  "epoch": 1.033112582781457,
298
- "grad_norm": 0.1049540564417839,
299
- "learning_rate": 0.0005046948816002839,
300
- "loss": 4.2821,
301
  "step": 624
302
  },
303
  {
304
  "epoch": 1.0596026490066226,
305
- "grad_norm": 0.1295829862356186,
306
- "learning_rate": 0.00048323546201535375,
307
- "loss": 4.2916,
308
  "step": 640
309
  },
310
  {
311
  "epoch": 1.086092715231788,
312
- "grad_norm": 0.14188766479492188,
313
- "learning_rate": 0.0004618069276592665,
314
- "loss": 4.2996,
315
  "step": 656
316
  },
317
  {
318
  "epoch": 1.1125827814569536,
319
- "grad_norm": 0.12427990883588791,
320
- "learning_rate": 0.0004404487562192665,
321
- "loss": 4.3079,
322
  "step": 672
323
  },
324
  {
325
  "epoch": 1.1390728476821192,
326
- "grad_norm": 0.10745778679847717,
327
- "learning_rate": 0.0004192002957533321,
328
- "loss": 4.284,
329
  "step": 688
330
  },
331
  {
332
  "epoch": 1.1655629139072847,
333
- "grad_norm": 0.1536702811717987,
334
- "learning_rate": 0.00039810069219943343,
335
- "loss": 4.2841,
336
  "step": 704
337
  },
338
  {
339
  "epoch": 1.1920529801324504,
340
- "grad_norm": 0.1520632952451706,
341
- "learning_rate": 0.0003771888172571579,
342
- "loss": 4.3065,
343
  "step": 720
344
  },
345
  {
346
  "epoch": 1.218543046357616,
347
- "grad_norm": 0.1317160278558731,
348
- "learning_rate": 0.0003565031967745614,
349
- "loss": 4.2853,
350
  "step": 736
351
  },
352
  {
353
  "epoch": 1.2450331125827814,
354
- "grad_norm": 0.1610012948513031,
355
- "learning_rate": 0.00033608193977218185,
356
- "loss": 4.3149,
357
  "step": 752
358
  },
359
  {
360
  "epoch": 1.271523178807947,
361
- "grad_norm": 0.13628298044204712,
362
- "learning_rate": 0.0003159626682349709,
363
- "loss": 4.3005,
364
  "step": 768
365
  },
366
  {
367
  "epoch": 1.2980132450331126,
368
- "grad_norm": 0.1552676558494568,
369
- "learning_rate": 0.00029618244780148955,
370
- "loss": 4.2843,
371
  "step": 784
372
  },
373
  {
374
  "epoch": 1.3245033112582782,
375
- "grad_norm": 0.1701873242855072,
376
- "learning_rate": 0.0002767777194780578,
377
- "loss": 4.3283,
378
  "step": 800
379
  },
380
  {
381
  "epoch": 1.3509933774834437,
382
- "grad_norm": 0.10417599231004715,
383
- "learning_rate": 0.00025778423250366167,
384
- "loss": 4.2768,
385
  "step": 816
386
  },
387
  {
388
  "epoch": 1.3774834437086092,
389
- "grad_norm": 0.14864106476306915,
390
- "learning_rate": 0.0002392369784893001,
391
- "loss": 4.2471,
392
  "step": 832
393
  },
394
  {
395
  "epoch": 1.403973509933775,
396
- "grad_norm": 0.14134187996387482,
397
- "learning_rate": 0.00022117012695310468,
398
- "loss": 4.2717,
399
  "step": 848
400
  },
401
  {
402
  "epoch": 1.4304635761589404,
403
- "grad_norm": 0.12026234716176987,
404
- "learning_rate": 0.0002036169623700001,
405
- "loss": 4.282,
406
  "step": 864
407
  },
408
  {
409
  "epoch": 1.4569536423841059,
410
- "grad_norm": 0.15808941423892975,
411
- "learning_rate": 0.000186609822851872,
412
- "loss": 4.2811,
413
  "step": 880
414
  },
415
  {
416
  "epoch": 1.4834437086092715,
417
- "grad_norm": 0.11768582463264465,
418
- "learning_rate": 0.00017018004057121894,
419
- "loss": 4.2527,
420
  "step": 896
421
  },
422
  {
423
  "epoch": 1.5099337748344372,
424
- "grad_norm": 0.10410495847463608,
425
- "learning_rate": 0.00015435788403803702,
426
- "loss": 4.3271,
427
  "step": 912
428
  },
429
  {
430
  "epoch": 1.5364238410596025,
431
- "grad_norm": 0.1393006443977356,
432
- "learning_rate": 0.00013917250233628969,
433
- "loss": 4.2384,
434
  "step": 928
435
  },
436
  {
437
  "epoch": 1.5629139072847682,
438
- "grad_norm": 0.11578142642974854,
439
- "learning_rate": 0.00012465187142268687,
440
- "loss": 4.2827,
441
  "step": 944
442
  },
443
  {
444
  "epoch": 1.589403973509934,
445
- "grad_norm": 0.12900201976299286,
446
- "learning_rate": 0.00011082274258671376,
447
- "loss": 4.2757,
448
  "step": 960
449
  },
450
  {
451
  "epoch": 1.6158940397350994,
452
- "grad_norm": 0.10685920715332031,
453
- "learning_rate": 9.771059316685665e-05,
454
- "loss": 4.2779,
455
  "step": 976
456
  },
457
  {
458
  "epoch": 1.6423841059602649,
459
- "grad_norm": 0.13312238454818726,
460
- "learning_rate": 8.533957961382238e-05,
461
- "loss": 4.2988,
462
  "step": 992
463
  },
464
  {
465
  "epoch": 1.6688741721854305,
466
- "grad_norm": 0.1473645716905594,
467
- "learning_rate": 7.373249298722506e-05,
468
- "loss": 4.2973,
469
  "step": 1008
470
  },
471
  {
472
  "epoch": 1.695364238410596,
473
- "grad_norm": 0.10669790208339691,
474
- "learning_rate": 6.29107169677236e-05,
475
- "loss": 4.2533,
476
  "step": 1024
477
  },
478
  {
479
  "epoch": 1.7218543046357615,
480
- "grad_norm": 0.13500191271305084,
481
- "learning_rate": 5.28941884619693e-05,
482
- "loss": 4.3073,
483
  "step": 1040
484
  },
485
  {
486
  "epoch": 1.7483443708609272,
487
- "grad_norm": 0.1250208169221878,
488
- "learning_rate": 4.370136087293658e-05,
489
- "loss": 4.2093,
490
  "step": 1056
491
  },
492
  {
493
  "epoch": 1.7748344370860927,
494
- "grad_norm": 0.1221160963177681,
495
- "learning_rate": 3.534917010330652e-05,
496
- "loss": 4.2625,
497
  "step": 1072
498
  },
499
  {
500
  "epoch": 1.8013245033112582,
501
- "grad_norm": 0.11934095621109009,
502
- "learning_rate": 2.7853003354533555e-05,
503
- "loss": 4.3391,
504
  "step": 1088
505
  },
506
  {
507
  "epoch": 1.8278145695364238,
508
- "grad_norm": 0.13353998959064484,
509
- "learning_rate": 2.1226670779077306e-05,
510
- "loss": 4.2848,
511
  "step": 1104
512
  },
513
  {
514
  "epoch": 1.8543046357615895,
515
- "grad_norm": 0.10006093233823776,
516
- "learning_rate": 1.5482380038023768e-05,
517
- "loss": 4.2608,
518
  "step": 1120
519
  },
520
  {
521
  "epoch": 1.8807947019867548,
522
- "grad_norm": 0.10387697070837021,
523
- "learning_rate": 1.0630713810969639e-05,
524
- "loss": 4.2986,
525
  "step": 1136
526
  },
527
  {
528
  "epoch": 1.9072847682119205,
529
- "grad_norm": 0.13752028346061707,
530
- "learning_rate": 6.680610299601708e-06,
531
- "loss": 4.2593,
532
  "step": 1152
533
  },
534
  {
535
  "epoch": 1.9337748344370862,
536
- "grad_norm": 0.1313001811504364,
537
- "learning_rate": 3.639346760890283e-06,
538
- "loss": 4.2522,
539
  "step": 1168
540
  },
541
  {
542
  "epoch": 1.9602649006622517,
543
- "grad_norm": 0.11591313779354095,
544
- "learning_rate": 1.5125261002330026e-06,
545
- "loss": 4.2476,
546
  "step": 1184
547
  },
548
  {
549
  "epoch": 1.9867549668874172,
550
- "grad_norm": 0.11690162122249603,
551
- "learning_rate": 3.040665492491379e-07,
552
- "loss": 4.3406,
553
  "step": 1200
554
  },
555
  {
556
  "epoch": 2.0,
557
- "eval_bleu": 0.11121865675263325,
558
- "eval_cap_loss": 1.2156698384032345,
559
- "eval_con_loss": 1.8595664712372204,
560
- "eval_loss": 3.0752363122062176,
561
  "step": 1208
562
  },
563
  {
564
  "epoch": 2.0,
565
- "eval_bleu": 0.11121865675263325,
566
- "eval_cap_loss": 1.2156698384032345,
567
- "eval_con_loss": 1.8595664712372204,
568
- "eval_loss": 3.0752363122062176,
569
- "eval_runtime": 250.9492,
570
- "eval_samples_per_second": 19.243,
571
- "eval_steps_per_second": 2.407,
572
  "step": 1208
573
  }
574
  ],
@@ -593,5 +593,5 @@
593
  "train_batch_size": 32,
594
  "trial_name": null,
595
  "trial_params": null,
596
- "tau_value": 3.5545
597
  }
 
10
  "log_history": [
11
  {
12
  "epoch": 0.026490066225165563,
13
+ "grad_norm": 0.12706246972084045,
14
  "learning_rate": 0.0004324324324324325,
15
+ "loss": 4.2683,
16
  "step": 16
17
  },
18
  {
19
  "epoch": 0.052980132450331126,
20
+ "grad_norm": 0.1603141874074936,
21
+ "learning_rate": 0.000864864864864865,
22
+ "loss": 4.2333,
23
  "step": 32
24
  },
25
  {
26
  "epoch": 0.07947019867549669,
27
+ "grad_norm": 0.11292944103479385,
28
+ "learning_rate": 0.0009997822892796068,
29
+ "loss": 4.2693,
30
  "step": 48
31
  },
32
  {
33
  "epoch": 0.10596026490066225,
34
+ "grad_norm": 0.126583069562912,
35
+ "learning_rate": 0.000998688816161266,
36
+ "loss": 4.2745,
37
  "step": 64
38
  },
39
  {
40
  "epoch": 0.13245033112582782,
41
+ "grad_norm": 0.1791575849056244,
42
+ "learning_rate": 0.0009966766110013582,
43
+ "loss": 4.2707,
44
  "step": 80
45
  },
46
  {
47
  "epoch": 0.15894039735099338,
48
+ "grad_norm": 0.16615551710128784,
49
+ "learning_rate": 0.0009937493808759087,
50
+ "loss": 4.3332,
51
  "step": 96
52
  },
53
  {
54
  "epoch": 0.18543046357615894,
55
+ "grad_norm": 0.17333188652992249,
56
+ "learning_rate": 0.0009899125186070988,
57
+ "loss": 4.2671,
58
  "step": 112
59
  },
60
  {
61
  "epoch": 0.2119205298013245,
62
+ "grad_norm": 0.14814811944961548,
63
+ "learning_rate": 0.0009851730928280944,
64
+ "loss": 4.2474,
65
  "step": 128
66
  },
67
  {
68
  "epoch": 0.23841059602649006,
69
+ "grad_norm": 0.16701027750968933,
70
+ "learning_rate": 0.0009795398349605373,
71
+ "loss": 4.248,
72
  "step": 144
73
  },
74
  {
75
  "epoch": 0.26490066225165565,
76
+ "grad_norm": 0.11271824687719345,
77
+ "learning_rate": 0.0009730231231286876,
78
+ "loss": 4.2506,
79
  "step": 160
80
  },
81
  {
82
  "epoch": 0.2913907284768212,
83
+ "grad_norm": 0.10354705899953842,
84
+ "learning_rate": 0.0009656349630398554,
85
+ "loss": 4.2424,
86
  "step": 176
87
  },
88
  {
89
  "epoch": 0.31788079470198677,
90
+ "grad_norm": 0.1165025606751442,
91
+ "learning_rate": 0.0009573889658663424,
92
+ "loss": 4.2973,
93
  "step": 192
94
  },
95
  {
96
  "epoch": 0.3443708609271523,
97
+ "grad_norm": 0.1250074803829193,
98
+ "learning_rate": 0.0009483003231696446,
99
+ "loss": 4.2571,
100
  "step": 208
101
  },
102
  {
103
  "epoch": 0.3708609271523179,
104
+ "grad_norm": 0.13243281841278076,
105
+ "learning_rate": 0.0009383857789131097,
106
+ "loss": 4.2659,
107
  "step": 224
108
  },
109
  {
110
  "epoch": 0.3973509933774834,
111
+ "grad_norm": 0.10707055032253265,
112
+ "learning_rate": 0.0009276635986146136,
113
+ "loss": 4.2331,
114
  "step": 240
115
  },
116
  {
117
  "epoch": 0.423841059602649,
118
+ "grad_norm": 0.12676522135734558,
119
+ "learning_rate": 0.0009161535356960828,
120
+ "loss": 4.2585,
121
  "step": 256
122
  },
123
  {
124
  "epoch": 0.4503311258278146,
125
+ "grad_norm": 0.10248049348592758,
126
+ "learning_rate": 0.0009038767950918592,
127
+ "loss": 4.2375,
128
  "step": 272
129
  },
130
  {
131
  "epoch": 0.4768211920529801,
132
+ "grad_norm": 0.16809116303920746,
133
+ "learning_rate": 0.0008908559941829497,
134
+ "loss": 4.2249,
135
  "step": 288
136
  },
137
  {
138
  "epoch": 0.5033112582781457,
139
+ "grad_norm": 0.11593299359083176,
140
+ "learning_rate": 0.0008771151211291332,
141
+ "loss": 4.2411,
142
  "step": 304
143
  },
144
  {
145
  "epoch": 0.5298013245033113,
146
+ "grad_norm": 0.11452265083789825,
147
+ "learning_rate": 0.0008626794906756866,
148
+ "loss": 4.2158,
149
  "step": 320
150
  },
151
  {
152
  "epoch": 0.5562913907284768,
153
+ "grad_norm": 0.14648571610450745,
154
+ "learning_rate": 0.0008475756975161504,
155
+ "loss": 4.2718,
156
  "step": 336
157
  },
158
  {
159
  "epoch": 0.5827814569536424,
160
+ "grad_norm": 0.11873907595872879,
161
+ "learning_rate": 0.00083183156729705,
162
+ "loss": 4.2431,
163
  "step": 352
164
  },
165
  {
166
  "epoch": 0.609271523178808,
167
+ "grad_norm": 0.13245505094528198,
168
+ "learning_rate": 0.0008154761053548404,
169
+ "loss": 4.2546,
170
  "step": 368
171
  },
172
  {
173
  "epoch": 0.6357615894039735,
174
+ "grad_norm": 0.12086515128612518,
175
+ "learning_rate": 0.000798539443279511,
176
+ "loss": 4.2042,
177
  "step": 384
178
  },
179
  {
180
  "epoch": 0.6622516556291391,
181
+ "grad_norm": 0.09804444760084152,
182
+ "learning_rate": 0.0007810527834033009,
183
+ "loss": 4.2046,
184
  "step": 400
185
  },
186
  {
187
  "epoch": 0.6887417218543046,
188
+ "grad_norm": 0.12007380276918411,
189
+ "learning_rate": 0.00076304834131679,
190
+ "loss": 4.2311,
191
  "step": 416
192
  },
193
  {
194
  "epoch": 0.7152317880794702,
195
+ "grad_norm": 0.12332040816545486,
196
+ "learning_rate": 0.0007445592865182695,
197
+ "loss": 4.2304,
198
  "step": 432
199
  },
200
  {
201
  "epoch": 0.7417218543046358,
202
+ "grad_norm": 0.12303052097558975,
203
+ "learning_rate": 0.0007256196813057318,
204
+ "loss": 4.2351,
205
  "step": 448
206
  },
207
  {
208
  "epoch": 0.7682119205298014,
209
+ "grad_norm": 0.09435882419347763,
210
+ "learning_rate": 0.0007062644180240614,
211
+ "loss": 4.1903,
212
  "step": 464
213
  },
214
  {
215
  "epoch": 0.7947019867549668,
216
+ "grad_norm": 0.12024106830358505,
217
+ "learning_rate": 0.0006865291547830324,
218
+ "loss": 4.2468,
219
  "step": 480
220
  },
221
  {
222
  "epoch": 0.8211920529801324,
223
+ "grad_norm": 0.09838169813156128,
224
+ "learning_rate": 0.000666450249764542,
225
+ "loss": 4.1978,
226
  "step": 496
227
  },
228
  {
229
  "epoch": 0.847682119205298,
230
+ "grad_norm": 0.10983674973249435,
231
+ "learning_rate": 0.0006460646942401058,
232
+ "loss": 4.2443,
233
  "step": 512
234
  },
235
  {
236
  "epoch": 0.8741721854304636,
237
+ "grad_norm": 0.09954366832971573,
238
+ "learning_rate": 0.0006254100444220115,
239
+ "loss": 4.227,
240
  "step": 528
241
  },
242
  {
243
  "epoch": 0.9006622516556292,
244
+ "grad_norm": 0.12015046179294586,
245
+ "learning_rate": 0.0006045243522736885,
246
+ "loss": 4.2154,
247
  "step": 544
248
  },
249
  {
250
  "epoch": 0.9271523178807947,
251
+ "grad_norm": 0.10851255804300308,
252
+ "learning_rate": 0.0005834460954067559,
253
+ "loss": 4.242,
254
  "step": 560
255
  },
256
  {
257
  "epoch": 0.9536423841059603,
258
+ "grad_norm": 0.0975833460688591,
259
+ "learning_rate": 0.0005622141061939006,
260
+ "loss": 4.2135,
261
  "step": 576
262
  },
263
  {
264
  "epoch": 0.9801324503311258,
265
+ "grad_norm": 0.10621072351932526,
266
+ "learning_rate": 0.0005408675002281818,
267
+ "loss": 4.2932,
268
  "step": 592
269
  },
270
  {
271
  "epoch": 1.0,
272
+ "eval_bleu": 0.1135024238637086,
273
+ "eval_cap_loss": 1.2033403850351738,
274
+ "eval_con_loss": 1.786089795906812,
275
+ "eval_loss": 2.9894301812380353,
276
  "step": 604
277
  },
278
  {
279
  "epoch": 1.0,
280
+ "eval_bleu": 0.1135024238637086,
281
+ "eval_cap_loss": 1.2033403850351738,
282
+ "eval_con_loss": 1.786089795906812,
283
+ "eval_loss": 2.9894301812380353,
284
+ "eval_runtime": 416.7065,
285
+ "eval_samples_per_second": 11.588,
286
+ "eval_steps_per_second": 1.449,
287
  "step": 604
288
  },
289
  {
290
  "epoch": 1.0066225165562914,
291
+ "grad_norm": 0.13524451851844788,
292
+ "learning_rate": 0.0005194456042605587,
293
+ "loss": 4.1897,
294
  "step": 608
295
  },
296
  {
297
  "epoch": 1.033112582781457,
298
+ "grad_norm": 0.09372173994779587,
299
+ "learning_rate": 0.0004979878837484043,
300
+ "loss": 4.2234,
301
  "step": 624
302
  },
303
  {
304
  "epoch": 1.0596026490066226,
305
+ "grad_norm": 0.10636495053768158,
306
+ "learning_rate": 0.00047653387014848014,
307
+ "loss": 4.2304,
308
  "step": 640
309
  },
310
  {
311
  "epoch": 1.086092715231788,
312
+ "grad_norm": 0.1295643150806427,
313
+ "learning_rate": 0.0004551230880883208,
314
+ "loss": 4.2388,
315
  "step": 656
316
  },
317
  {
318
  "epoch": 1.1125827814569536,
319
+ "grad_norm": 0.10932028293609619,
320
+ "learning_rate": 0.00043379498255020037,
321
+ "loss": 4.2505,
322
  "step": 672
323
  },
324
  {
325
  "epoch": 1.1390728476821192,
326
+ "grad_norm": 0.11317116022109985,
327
+ "learning_rate": 0.00041258884620182804,
328
+ "loss": 4.2256,
329
  "step": 688
330
  },
331
  {
332
  "epoch": 1.1655629139072847,
333
+ "grad_norm": 0.15960782766342163,
334
+ "learning_rate": 0.00039154374700765316,
335
+ "loss": 4.2248,
336
  "step": 704
337
  },
338
  {
339
  "epoch": 1.1920529801324504,
340
+ "grad_norm": 0.14448010921478271,
341
+ "learning_rate": 0.00037069845625413954,
342
+ "loss": 4.248,
343
  "step": 720
344
  },
345
  {
346
  "epoch": 1.218543046357616,
347
+ "grad_norm": 0.12560078501701355,
348
+ "learning_rate": 0.0003500913771216081,
349
+ "loss": 4.2264,
350
  "step": 736
351
  },
352
  {
353
  "epoch": 1.2450331125827814,
354
+ "grad_norm": 0.15087343752384186,
355
+ "learning_rate": 0.0003297604739342396,
356
+ "loss": 4.2582,
357
  "step": 752
358
  },
359
  {
360
  "epoch": 1.271523178807947,
361
+ "grad_norm": 0.14385798573493958,
362
+ "learning_rate": 0.00030974320221858066,
363
+ "loss": 4.2413,
364
  "step": 768
365
  },
366
  {
367
  "epoch": 1.2980132450331126,
368
+ "grad_norm": 0.12157344818115234,
369
+ "learning_rate": 0.0002900764396994049,
370
+ "loss": 4.2285,
371
  "step": 784
372
  },
373
  {
374
  "epoch": 1.3245033112582782,
375
+ "grad_norm": 0.14533455669879913,
376
+ "learning_rate": 0.00027079641836005473,
377
+ "loss": 4.2719,
378
  "step": 800
379
  },
380
  {
381
  "epoch": 1.3509933774834437,
382
+ "grad_norm": 0.11581210792064667,
383
+ "learning_rate": 0.0002519386576924303,
384
+ "loss": 4.2191,
385
  "step": 816
386
  },
387
  {
388
  "epoch": 1.3774834437086092,
389
+ "grad_norm": 0.11933822929859161,
390
+ "learning_rate": 0.0002335378992595995,
391
+ "loss": 4.1929,
392
  "step": 832
393
  },
394
  {
395
  "epoch": 1.403973509933775,
396
+ "grad_norm": 0.13587485253810883,
397
+ "learning_rate": 0.0002156280426915786,
398
+ "loss": 4.2147,
399
  "step": 848
400
  },
401
  {
402
  "epoch": 1.4304635761589404,
403
+ "grad_norm": 0.11555938422679901,
404
+ "learning_rate": 0.00019824208323220656,
405
+ "loss": 4.2268,
406
  "step": 864
407
  },
408
  {
409
  "epoch": 1.4569536423841059,
410
+ "grad_norm": 0.12277619540691376,
411
+ "learning_rate": 0.00018141205095216294,
412
+ "loss": 4.2261,
413
  "step": 880
414
  },
415
  {
416
  "epoch": 1.4834437086092715,
417
+ "grad_norm": 0.11538510024547577,
418
+ "learning_rate": 0.00016516895174012043,
419
+ "loss": 4.1956,
420
  "step": 896
421
  },
422
  {
423
  "epoch": 1.5099337748344372,
424
+ "grad_norm": 0.10595931112766266,
425
+ "learning_rate": 0.00014954271018074368,
426
+ "loss": 4.2703,
427
  "step": 912
428
  },
429
  {
430
  "epoch": 1.5364238410596025,
431
+ "grad_norm": 0.12333059310913086,
432
+ "learning_rate": 0.00013456211442476813,
433
+ "loss": 4.1822,
434
  "step": 928
435
  },
436
  {
437
  "epoch": 1.5629139072847682,
438
+ "grad_norm": 0.10703016817569733,
439
+ "learning_rate": 0.00012025476315272743,
440
+ "loss": 4.2289,
441
  "step": 944
442
  },
443
  {
444
  "epoch": 1.589403973509934,
445
+ "grad_norm": 0.11507616937160492,
446
+ "learning_rate": 0.00010664701473003396,
447
+ "loss": 4.2223,
448
  "step": 960
449
  },
450
  {
451
  "epoch": 1.6158940397350994,
452
+ "grad_norm": 0.09708770364522934,
453
+ "learning_rate": 9.376393864708821e-05,
454
+ "loss": 4.2223,
455
  "step": 976
456
  },
457
  {
458
  "epoch": 1.6423841059602649,
459
+ "grad_norm": 0.11372318863868713,
460
+ "learning_rate": 8.162926933387499e-05,
461
+ "loss": 4.2433,
462
  "step": 992
463
  },
464
  {
465
  "epoch": 1.6688741721854305,
466
+ "grad_norm": 0.16325490176677704,
467
+ "learning_rate": 7.026536243413539e-05,
468
+ "loss": 4.2427,
469
  "step": 1008
470
  },
471
  {
472
  "epoch": 1.695364238410596,
473
+ "grad_norm": 0.10167232900857925,
474
+ "learning_rate": 5.969315361967087e-05,
475
+ "loss": 4.1995,
476
  "step": 1024
477
  },
478
  {
479
  "epoch": 1.7218543046357615,
480
+ "grad_norm": 0.11737249046564102,
481
+ "learning_rate": 4.9932120020654116e-05,
482
+ "loss": 4.2522,
483
  "step": 1040
484
  },
485
  {
486
  "epoch": 1.7483443708609272,
487
+ "grad_norm": 0.10857795923948288,
488
+ "learning_rate": 4.100024434300437e-05,
489
+ "loss": 4.1555,
490
  "step": 1056
491
  },
492
  {
493
  "epoch": 1.7748344370860927,
494
+ "grad_norm": 0.11734642088413239,
495
+ "learning_rate": 3.2913981738933395e-05,
496
+ "loss": 4.208,
497
  "step": 1072
498
  },
499
  {
500
  "epoch": 1.8013245033112582,
501
+ "grad_norm": 0.10702993720769882,
502
+ "learning_rate": 2.5688229491697356e-05,
503
+ "loss": 4.2835,
504
  "step": 1088
505
  },
506
  {
507
  "epoch": 1.8278145695364238,
508
+ "grad_norm": 0.12669378519058228,
509
+ "learning_rate": 1.9336299570401396e-05,
510
+ "loss": 4.2288,
511
  "step": 1104
512
  },
513
  {
514
  "epoch": 1.8543046357615895,
515
+ "grad_norm": 0.0894075259566307,
516
+ "learning_rate": 1.3869894105423109e-05,
517
+ "loss": 4.2051,
518
  "step": 1120
519
  },
520
  {
521
  "epoch": 1.8807947019867548,
522
+ "grad_norm": 0.09530144929885864,
523
+ "learning_rate": 9.299083829632516e-06,
524
+ "loss": 4.2449,
525
  "step": 1136
526
  },
527
  {
528
  "epoch": 1.9072847682119205,
529
+ "grad_norm": 0.13094228506088257,
530
+ "learning_rate": 5.632289525129064e-06,
531
+ "loss": 4.2017,
532
  "step": 1152
533
  },
534
  {
535
  "epoch": 1.9337748344370862,
536
+ "grad_norm": 0.12416987866163254,
537
+ "learning_rate": 2.8762665096744854e-06,
538
+ "loss": 4.1943,
539
  "step": 1168
540
  },
541
  {
542
  "epoch": 1.9602649006622517,
543
+ "grad_norm": 0.10805880278348923,
544
+ "learning_rate": 1.036092191402882e-06,
545
+ "loss": 4.1936,
546
  "step": 1184
547
  },
548
  {
549
  "epoch": 1.9867549668874172,
550
+ "grad_norm": 0.11822624504566193,
551
+ "learning_rate": 1.1515671473599775e-07,
552
+ "loss": 4.2833,
553
  "step": 1200
554
  },
555
  {
556
  "epoch": 2.0,
557
+ "eval_bleu": 0.1159956729511227,
558
+ "eval_cap_loss": 1.1952345237037203,
559
+ "eval_con_loss": 1.7797789394066035,
560
+ "eval_loss": 2.975013460544561,
561
  "step": 1208
562
  },
563
  {
564
  "epoch": 2.0,
565
+ "eval_bleu": 0.1159956729511227,
566
+ "eval_cap_loss": 1.1952345237037203,
567
+ "eval_con_loss": 1.7797789394066035,
568
+ "eval_loss": 2.975013460544561,
569
+ "eval_runtime": 297.535,
570
+ "eval_samples_per_second": 16.23,
571
+ "eval_steps_per_second": 2.03,
572
  "step": 1208
573
  }
574
  ],
 
593
  "train_batch_size": 32,
594
  "trial_name": null,
595
  "trial_params": null,
596
+ "tau_value": 5.2037
597
  }