{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.18851918182675087, |
|
"eval_steps": 500, |
|
"global_step": 1000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.00018851918182675087, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 5e-06, |
|
"loss": 2.7659, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.00037703836365350174, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 1e-05, |
|
"loss": 2.5842, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0005655575454802526, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 1.5e-05, |
|
"loss": 2.8169, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.0007540767273070035, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 2e-05, |
|
"loss": 2.6938, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.0009425959091337543, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 2.5e-05, |
|
"loss": 2.7862, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.0011311150909605052, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 3e-05, |
|
"loss": 2.8844, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.0013196342727872562, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 3.5000000000000004e-05, |
|
"loss": 2.8254, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.001508153454614007, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 4e-05, |
|
"loss": 2.7735, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.001696672636440758, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 4.4999999999999996e-05, |
|
"loss": 2.8222, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.0018851918182675087, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 5e-05, |
|
"loss": 2.6943, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0020737110000942595, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 5.5e-05, |
|
"loss": 2.6735, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.0022622301819210104, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 6e-05, |
|
"loss": 2.6482, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.0024507493637477614, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 6.500000000000001e-05, |
|
"loss": 2.8788, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.0026392685455745124, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 7.000000000000001e-05, |
|
"loss": 2.7531, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.002827787727401263, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 7.5e-05, |
|
"loss": 2.7911, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.003016306909228014, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 8e-05, |
|
"loss": 2.7358, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.003204826091054765, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 8.5e-05, |
|
"loss": 2.7272, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.003393345272881516, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 8.999999999999999e-05, |
|
"loss": 2.7176, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.0035818644547082664, |
|
"grad_norm": 1.5, |
|
"learning_rate": 9.5e-05, |
|
"loss": 2.8573, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.0037703836365350174, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 0.0001, |
|
"loss": 2.7512, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.003958902818361768, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 0.000105, |
|
"loss": 2.7962, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.004147422000188519, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 0.00011, |
|
"loss": 2.7, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.00433594118201527, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 0.000115, |
|
"loss": 2.7128, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.004524460363842021, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 0.00012, |
|
"loss": 2.729, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.004712979545668771, |
|
"grad_norm": 1.25, |
|
"learning_rate": 0.000125, |
|
"loss": 2.698, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.004901498727495523, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 0.00013000000000000002, |
|
"loss": 2.7461, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.005090017909322273, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 0.000135, |
|
"loss": 2.7315, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.005278537091149025, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 0.00014000000000000001, |
|
"loss": 2.7089, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.005467056272975775, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 0.000145, |
|
"loss": 2.6724, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.005655575454802526, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 0.00015, |
|
"loss": 2.799, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.005844094636629277, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 0.000155, |
|
"loss": 2.7939, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.006032613818456028, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 0.00016, |
|
"loss": 2.8004, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.006221133000282778, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 0.000165, |
|
"loss": 2.6322, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.00640965218210953, |
|
"grad_norm": 1.0, |
|
"learning_rate": 0.00017, |
|
"loss": 2.7095, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.00659817136393628, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 0.000175, |
|
"loss": 2.7111, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.006786690545763032, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 0.00017999999999999998, |
|
"loss": 2.7666, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.006975209727589782, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 0.000185, |
|
"loss": 2.7779, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.007163728909416533, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 0.00019, |
|
"loss": 2.7684, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.007352248091243284, |
|
"grad_norm": 0.875, |
|
"learning_rate": 0.00019500000000000002, |
|
"loss": 2.8674, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.007540767273070035, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 0.0002, |
|
"loss": 2.7694, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.007729286454896786, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 0.000205, |
|
"loss": 2.6799, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.007917805636723537, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 0.00021, |
|
"loss": 2.6289, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.008106324818550288, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 0.000215, |
|
"loss": 2.7937, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.008294844000377038, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 0.00022, |
|
"loss": 2.78, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.00848336318220379, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 0.00022500000000000002, |
|
"loss": 2.6351, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.00867188236403054, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 0.00023, |
|
"loss": 2.8156, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.00886040154585729, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 0.000235, |
|
"loss": 2.8304, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.009048920727684042, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 0.00024, |
|
"loss": 2.7148, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.009237439909510793, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 0.000245, |
|
"loss": 2.7169, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.009425959091337543, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 0.00025, |
|
"loss": 2.8345, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.009614478273164294, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.000255, |
|
"loss": 2.8149, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.009802997454991046, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 0.00026000000000000003, |
|
"loss": 2.8182, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.009991516636817797, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 0.00026500000000000004, |
|
"loss": 2.8114, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.010180035818644547, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.00027, |
|
"loss": 2.803, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.010368555000471298, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.000275, |
|
"loss": 2.7979, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.01055707418229805, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 0.00028000000000000003, |
|
"loss": 2.8062, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.0107455933641248, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 0.000285, |
|
"loss": 2.6728, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.01093411254595155, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.00029, |
|
"loss": 2.7547, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.011122631727778302, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.000295, |
|
"loss": 2.6773, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.011311150909605052, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.0003, |
|
"loss": 2.7238, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.011499670091431803, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 0.000305, |
|
"loss": 2.6842, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.011688189273258555, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.00031, |
|
"loss": 2.8449, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.011876708455085304, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.000315, |
|
"loss": 2.6828, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.012065227636912056, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.00032, |
|
"loss": 2.7663, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.012253746818738807, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.00032500000000000004, |
|
"loss": 2.6127, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.012442266000565557, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.00033, |
|
"loss": 2.6333, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.012630785182392308, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.000335, |
|
"loss": 2.7669, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.01281930436421906, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 0.00034, |
|
"loss": 2.7363, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.013007823546045811, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.000345, |
|
"loss": 2.6626, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.01319634272787256, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.00035, |
|
"loss": 2.7896, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.013384861909699312, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.000355, |
|
"loss": 2.7407, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.013573381091526063, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.00035999999999999997, |
|
"loss": 2.804, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.013761900273352813, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.000365, |
|
"loss": 2.781, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.013950419455179565, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.00037, |
|
"loss": 2.5436, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.014138938637006316, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.000375, |
|
"loss": 2.7272, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.014327457818833066, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.00038, |
|
"loss": 2.6777, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.014515977000659817, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 0.00038500000000000003, |
|
"loss": 2.8211, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.014704496182486568, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.00039000000000000005, |
|
"loss": 2.7639, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.014893015364313318, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.000395, |
|
"loss": 2.6884, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.01508153454614007, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 0.0004, |
|
"loss": 2.6492, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.015270053727966821, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.00040500000000000003, |
|
"loss": 2.8072, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.015458572909793572, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.00041, |
|
"loss": 2.7446, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.015647092091620324, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.000415, |
|
"loss": 2.7554, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.015835611273447073, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.00042, |
|
"loss": 2.7212, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.016024130455273823, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.000425, |
|
"loss": 2.6933, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.016212649637100576, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.00043, |
|
"loss": 2.7461, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.016401168818927326, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.000435, |
|
"loss": 2.7079, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.016589688000754076, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.00044, |
|
"loss": 2.8562, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.01677820718258083, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.00044500000000000003, |
|
"loss": 2.6606, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.01696672636440758, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.00045000000000000004, |
|
"loss": 2.7817, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.017155245546234328, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 0.000455, |
|
"loss": 2.7714, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.01734376472806108, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.00046, |
|
"loss": 2.7217, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.01753228390988783, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.000465, |
|
"loss": 2.6855, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.01772080309171458, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.00047, |
|
"loss": 2.7111, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.017909322273541334, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.000475, |
|
"loss": 2.6868, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.018097841455368083, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.00048, |
|
"loss": 2.7355, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.018286360637194833, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.00048499999999999997, |
|
"loss": 2.7172, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.018474879819021586, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.00049, |
|
"loss": 2.8204, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.018663399000848336, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.000495, |
|
"loss": 2.6965, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.018851918182675086, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.0005, |
|
"loss": 2.7988, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.01904043736450184, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.000505, |
|
"loss": 2.7069, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.01922895654632859, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.00051, |
|
"loss": 2.6942, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.019417475728155338, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.000515, |
|
"loss": 2.7497, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.01960599490998209, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.0005200000000000001, |
|
"loss": 2.6381, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.01979451409180884, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.0005250000000000001, |
|
"loss": 2.6969, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.019983033273635594, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.0005300000000000001, |
|
"loss": 2.7247, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.020171552455462344, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.000535, |
|
"loss": 2.828, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.020360071637289093, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 0.00054, |
|
"loss": 2.7309, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.020548590819115847, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.000545, |
|
"loss": 2.8354, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.020737110000942596, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.00055, |
|
"loss": 2.8101, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.020925629182769346, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.000555, |
|
"loss": 2.7837, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.0211141483645961, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.0005600000000000001, |
|
"loss": 2.6813, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.02130266754642285, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 0.000565, |
|
"loss": 2.7035, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.0214911867282496, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 0.00057, |
|
"loss": 2.6901, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.02167970591007635, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.000575, |
|
"loss": 2.7001, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.0218682250919031, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.00058, |
|
"loss": 2.7508, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.02205674427372985, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.000585, |
|
"loss": 2.7348, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.022245263455556604, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.00059, |
|
"loss": 2.7434, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.022433782637383354, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.0005949999999999999, |
|
"loss": 2.6735, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.022622301819210103, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.0006, |
|
"loss": 2.6258, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.022810821001036857, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.000605, |
|
"loss": 2.7676, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.022999340182863606, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.00061, |
|
"loss": 2.7045, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.023187859364690356, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.000615, |
|
"loss": 2.6322, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.02337637854651711, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.00062, |
|
"loss": 2.6953, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.02356489772834386, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.000625, |
|
"loss": 2.6045, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.02375341691017061, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.00063, |
|
"loss": 2.6551, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.02394193609199736, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.000635, |
|
"loss": 2.656, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.02413045527382411, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 0.00064, |
|
"loss": 2.791, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.02431897445565086, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.0006450000000000001, |
|
"loss": 2.6599, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.024507493637477614, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 0.0006500000000000001, |
|
"loss": 2.633, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.024696012819304364, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 0.0006550000000000001, |
|
"loss": 2.6002, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.024884532001131113, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.00066, |
|
"loss": 2.7593, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.025073051182957867, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.000665, |
|
"loss": 2.706, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.025261570364784616, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.00067, |
|
"loss": 2.7094, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.02545008954661137, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 0.000675, |
|
"loss": 2.6961, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.02563860872843812, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 0.00068, |
|
"loss": 2.7805, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.02582712791026487, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.0006850000000000001, |
|
"loss": 2.6559, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.026015647092091622, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.00069, |
|
"loss": 2.7455, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.02620416627391837, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.000695, |
|
"loss": 2.7533, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.02639268545574512, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.0007, |
|
"loss": 2.7434, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.026581204637571874, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.000705, |
|
"loss": 2.7018, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.026769723819398624, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.00071, |
|
"loss": 2.6182, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.026958243001225374, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.000715, |
|
"loss": 2.5742, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.027146762183052127, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.0007199999999999999, |
|
"loss": 2.6547, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.027335281364878877, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 0.000725, |
|
"loss": 2.7054, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.027523800546705626, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 0.00073, |
|
"loss": 2.5809, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.02771231972853238, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.000735, |
|
"loss": 2.6474, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.02790083891035913, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.00074, |
|
"loss": 2.7606, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.02808935809218588, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 0.000745, |
|
"loss": 2.6923, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.028277877274012632, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 0.00075, |
|
"loss": 2.782, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.02846639645583938, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.000755, |
|
"loss": 2.7369, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.02865491563766613, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 0.00076, |
|
"loss": 2.6287, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.028843434819492884, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 0.0007650000000000001, |
|
"loss": 2.6649, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.029031954001319634, |
|
"grad_norm": 0.875, |
|
"learning_rate": 0.0007700000000000001, |
|
"loss": 2.7421, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.029220473183146384, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.0007750000000000001, |
|
"loss": 2.5988, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.029408992364973137, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.0007800000000000001, |
|
"loss": 2.6876, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.029597511546799887, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 0.000785, |
|
"loss": 2.6846, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.029786030728626636, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 0.00079, |
|
"loss": 2.7869, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.02997454991045339, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.000795, |
|
"loss": 2.6972, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.03016306909228014, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 0.0008, |
|
"loss": 2.7664, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.03035158827410689, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.000805, |
|
"loss": 2.6554, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.030540107455933642, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.0008100000000000001, |
|
"loss": 2.662, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.03072862663776039, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.000815, |
|
"loss": 2.622, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.030917145819587145, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 0.00082, |
|
"loss": 2.6071, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.031105665001413894, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 0.000825, |
|
"loss": 2.6724, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.03129418418324065, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.00083, |
|
"loss": 2.5888, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.031482703365067394, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 0.000835, |
|
"loss": 2.7932, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.03167122254689415, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.00084, |
|
"loss": 2.6234, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.0318597417287209, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 0.0008449999999999999, |
|
"loss": 2.6725, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.032048260910547646, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 0.00085, |
|
"loss": 2.6502, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.0322367800923744, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.000855, |
|
"loss": 2.7151, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.03242529927420115, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 0.00086, |
|
"loss": 2.8332, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.0326138184560279, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.000865, |
|
"loss": 2.8183, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.03280233763785465, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.00087, |
|
"loss": 2.6777, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.032990856819681405, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 0.000875, |
|
"loss": 2.6281, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.03317937600150815, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.00088, |
|
"loss": 2.7047, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.033367895183334904, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.000885, |
|
"loss": 2.6637, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.03355641436516166, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 0.0008900000000000001, |
|
"loss": 2.7817, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.033744933546988404, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 0.0008950000000000001, |
|
"loss": 2.6216, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.03393345272881516, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.0009000000000000001, |
|
"loss": 2.6608, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.03412197191064191, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.0009050000000000001, |
|
"loss": 2.712, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.034310491092468656, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.00091, |
|
"loss": 2.6812, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.03449901027429541, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 0.000915, |
|
"loss": 2.6181, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.03468752945612216, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 0.00092, |
|
"loss": 2.5939, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.03487604863794891, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.000925, |
|
"loss": 2.6378, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.03506456781977566, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 0.00093, |
|
"loss": 2.658, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.035253087001602415, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 0.0009350000000000001, |
|
"loss": 2.6324, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.03544160618342916, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.00094, |
|
"loss": 2.7615, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.035630125365255914, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 0.000945, |
|
"loss": 2.8334, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.03581864454708267, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 0.00095, |
|
"loss": 2.8026, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.036007163728909414, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.000955, |
|
"loss": 2.6532, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.03619568291073617, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.00096, |
|
"loss": 2.5541, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.03638420209256292, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.000965, |
|
"loss": 2.6375, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.036572721274389666, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.0009699999999999999, |
|
"loss": 2.5705, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.03676124045621642, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 0.000975, |
|
"loss": 2.6405, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.03694975963804317, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.00098, |
|
"loss": 2.7821, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.03713827881986992, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.000985, |
|
"loss": 2.6889, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.03732679800169667, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 0.00099, |
|
"loss": 2.6658, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.037515317183523425, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.000995, |
|
"loss": 2.6969, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.03770383636535017, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.001, |
|
"loss": 2.5479, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.037892355547176924, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 0.0009998040752351098, |
|
"loss": 2.7177, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.03808087472900368, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 0.0009996081504702195, |
|
"loss": 2.7224, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.038269393910830424, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.0009994122257053293, |
|
"loss": 2.6316, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.03845791309265718, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 0.0009992163009404388, |
|
"loss": 2.8178, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.03864643227448393, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.0009990203761755486, |
|
"loss": 2.7619, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.038834951456310676, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.0009988244514106584, |
|
"loss": 2.5739, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.03902347063813743, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 0.0009986285266457681, |
|
"loss": 2.7797, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.03921198981996418, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 0.0009984326018808779, |
|
"loss": 2.695, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.039400509001790936, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.0009982366771159876, |
|
"loss": 2.7551, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.03958902818361768, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 0.0009980407523510972, |
|
"loss": 2.7898, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.039777547365444435, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 0.000997844827586207, |
|
"loss": 2.6824, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.03996606654727119, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.0009976489028213167, |
|
"loss": 2.8341, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.040154585729097934, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.0009974529780564262, |
|
"loss": 2.6885, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.04034310491092469, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.000997257053291536, |
|
"loss": 2.5722, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.04053162409275144, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.0009970611285266457, |
|
"loss": 2.7023, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.04072014327457819, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.0009968652037617555, |
|
"loss": 2.6429, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.04090866245640494, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 0.0009966692789968653, |
|
"loss": 2.7053, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.04109718163823169, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.000996473354231975, |
|
"loss": 2.7841, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.04128570082005844, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.0009962774294670846, |
|
"loss": 2.6687, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.04147422000188519, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.0009960815047021943, |
|
"loss": 2.7893, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.041662739183711946, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.000995885579937304, |
|
"loss": 2.5992, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.04185125836553869, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.0009956896551724138, |
|
"loss": 2.7238, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.042039777547365445, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.0009954937304075236, |
|
"loss": 2.7477, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.0422282967291922, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 0.0009952978056426334, |
|
"loss": 2.6079, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.042416815911018944, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.000995101880877743, |
|
"loss": 2.6389, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.0426053350928457, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.0009949059561128527, |
|
"loss": 2.6014, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.04279385427467245, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.0009947100313479624, |
|
"loss": 2.6708, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.0429823734564992, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.0009945141065830722, |
|
"loss": 2.7032, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.04317089263832595, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.0009943181818181817, |
|
"loss": 2.7911, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.0433594118201527, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.0009941222570532915, |
|
"loss": 2.5071, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.04354793100197945, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.0009939263322884012, |
|
"loss": 2.695, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.0437364501838062, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.000993730407523511, |
|
"loss": 2.5969, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.043924969365632956, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.0009935344827586207, |
|
"loss": 2.6602, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.0441134885474597, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.0009933385579937305, |
|
"loss": 2.6561, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.044302007729286455, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.00099314263322884, |
|
"loss": 2.6442, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.04449052691111321, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 0.0009929467084639498, |
|
"loss": 2.7465, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.044679046092939954, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 0.0009927507836990596, |
|
"loss": 2.7102, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.04486756527476671, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.0009925548589341693, |
|
"loss": 2.7074, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.04505608445659346, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 0.000992358934169279, |
|
"loss": 2.6626, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.04524460363842021, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.0009921630094043888, |
|
"loss": 2.5579, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.04543312282024696, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 0.0009919670846394984, |
|
"loss": 2.7225, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.04562164200207371, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.0009917711598746081, |
|
"loss": 2.6952, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.04581016118390046, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 0.000991575235109718, |
|
"loss": 2.6886, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.04599868036572721, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.0009913793103448277, |
|
"loss": 2.6096, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.046187199547553966, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 0.0009911833855799374, |
|
"loss": 2.7612, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.04637571872938071, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.0009909874608150472, |
|
"loss": 2.6082, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.046564237911207465, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 0.0009907915360501567, |
|
"loss": 2.7621, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.04675275709303422, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.0009905956112852665, |
|
"loss": 2.6764, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.046941276274860964, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.0009903996865203762, |
|
"loss": 2.6527, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.04712979545668772, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.0009902037617554858, |
|
"loss": 2.5762, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.04731831463851447, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 0.0009900078369905955, |
|
"loss": 2.7241, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.04750683382034122, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.0009898119122257053, |
|
"loss": 2.6935, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.04769535300216797, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 0.000989615987460815, |
|
"loss": 2.776, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.04788387218399472, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.0009894200626959248, |
|
"loss": 2.7799, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.04807239136582147, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.0009892241379310346, |
|
"loss": 2.7589, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.04826091054764822, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.0009890282131661443, |
|
"loss": 2.646, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.048449429729474976, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.0009888322884012539, |
|
"loss": 2.7226, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.04863794891130172, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.0009886363636363636, |
|
"loss": 2.6825, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.048826468093128475, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.0009884404388714734, |
|
"loss": 2.6494, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.04901498727495523, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.0009882445141065831, |
|
"loss": 2.7586, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.049203506456781974, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.000988048589341693, |
|
"loss": 2.7986, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.04939202563860873, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.0009878526645768027, |
|
"loss": 2.624, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.04958054482043548, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.0009876567398119122, |
|
"loss": 2.4967, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.04976906400226223, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.000987460815047022, |
|
"loss": 2.5694, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.04995758318408898, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.0009872648902821317, |
|
"loss": 2.7369, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.05014610236591573, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 0.0009870689655172413, |
|
"loss": 2.641, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.05033462154774248, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.000986873040752351, |
|
"loss": 2.5988, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.05052314072956923, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 0.0009866771159874608, |
|
"loss": 2.6935, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.050711659911395986, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.0009864811912225705, |
|
"loss": 2.6573, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.05090017909322274, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.0009862852664576803, |
|
"loss": 2.5501, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.051088698275049485, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 0.00098608934169279, |
|
"loss": 2.7173, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.05127721745687624, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 0.0009858934169278996, |
|
"loss": 2.7147, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.05146573663870299, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.0009856974921630094, |
|
"loss": 2.6823, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.05165425582052974, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.0009855015673981191, |
|
"loss": 2.7399, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.05184277500235649, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 0.0009853056426332289, |
|
"loss": 2.8052, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.052031294184183244, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 0.0009851097178683386, |
|
"loss": 2.6471, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.05221981336600999, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.0009849137931034484, |
|
"loss": 2.5997, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.05240833254783674, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.000984717868338558, |
|
"loss": 2.6933, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.052596851729663496, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.0009845219435736677, |
|
"loss": 2.7849, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.05278537091149024, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 0.0009843260188087774, |
|
"loss": 2.7277, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.052973890093316996, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.0009841300940438872, |
|
"loss": 2.7328, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.05316240927514375, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.000983934169278997, |
|
"loss": 2.8041, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.053350928456970495, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.0009837382445141067, |
|
"loss": 2.6497, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.05353944763879725, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 0.0009835423197492165, |
|
"loss": 2.6852, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.053727966820624, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.000983346394984326, |
|
"loss": 2.6116, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.05391648600245075, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.0009831504702194358, |
|
"loss": 2.5864, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.0541050051842775, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.0009829545454545455, |
|
"loss": 2.6291, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.054293524366104254, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.000982758620689655, |
|
"loss": 2.672, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.054482043547931, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 0.0009825626959247648, |
|
"loss": 2.6036, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.05467056272975775, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.0009823667711598746, |
|
"loss": 2.4802, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.054859081911584506, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 0.0009821708463949844, |
|
"loss": 2.721, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.05504760109341125, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.0009819749216300941, |
|
"loss": 2.6039, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.055236120275238006, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.0009817789968652039, |
|
"loss": 2.7125, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.05542463945706476, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.0009815830721003134, |
|
"loss": 2.7176, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.055613158638891505, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 0.0009813871473354232, |
|
"loss": 2.7061, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.05580167782071826, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.000981191222570533, |
|
"loss": 2.7324, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.05599019700254501, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.0009809952978056427, |
|
"loss": 2.6318, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.05617871618437176, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.0009807993730407524, |
|
"loss": 2.637, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.05636723536619851, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.0009806034482758622, |
|
"loss": 2.5645, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.056555754548025264, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.0009804075235109717, |
|
"loss": 2.7011, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.05674427372985201, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.0009802115987460815, |
|
"loss": 2.7293, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.05693279291167876, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.0009800156739811913, |
|
"loss": 2.5779, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.057121312093505516, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.000979819749216301, |
|
"loss": 2.7574, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.05730983127533226, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.0009796238244514106, |
|
"loss": 2.7168, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.057498350457159016, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.0009794278996865203, |
|
"loss": 2.6531, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.05768686963898577, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.00097923197492163, |
|
"loss": 2.6852, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.057875388820812515, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.0009790360501567398, |
|
"loss": 2.8098, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.05806390800263927, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.0009788401253918496, |
|
"loss": 2.5938, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.05825242718446602, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.0009786442006269591, |
|
"loss": 2.6858, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.05844094636629277, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 0.000978448275862069, |
|
"loss": 2.6455, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.05862946554811952, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.0009782523510971787, |
|
"loss": 2.7194, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.058817984729946274, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.0009780564263322884, |
|
"loss": 2.5933, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.05900650391177302, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.0009778605015673982, |
|
"loss": 2.7103, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.05919502309359977, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.000977664576802508, |
|
"loss": 2.7317, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.059383542275426526, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 0.0009774686520376177, |
|
"loss": 2.6629, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.05957206145725327, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.0009772727272727272, |
|
"loss": 2.811, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.059760580639080026, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.000977076802507837, |
|
"loss": 2.679, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.05994909982090678, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.0009768808777429468, |
|
"loss": 2.7421, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.060137619002733525, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 0.0009766849529780565, |
|
"loss": 2.7717, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.06032613818456028, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 0.0009764890282131662, |
|
"loss": 2.7456, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.06051465736638703, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.0009762931034482759, |
|
"loss": 2.6342, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.06070317654821378, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 0.0009760971786833856, |
|
"loss": 2.7088, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.06089169573004053, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.0009759012539184952, |
|
"loss": 2.651, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.061080214911867284, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 0.000975705329153605, |
|
"loss": 2.7472, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.06126873409369403, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 0.0009755094043887147, |
|
"loss": 2.8417, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.06145725327552078, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.0009753134796238245, |
|
"loss": 2.7298, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.061645772457347536, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.0009751175548589341, |
|
"loss": 2.4934, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.06183429163917429, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 0.0009749216300940439, |
|
"loss": 2.6646, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.062022810821001036, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 0.0009747257053291537, |
|
"loss": 2.6718, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.06221133000282779, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.0009745297805642633, |
|
"loss": 2.5689, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.06239984918465454, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.0009743338557993731, |
|
"loss": 2.5188, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.0625883683664813, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 0.0009741379310344828, |
|
"loss": 2.7064, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.06277688754830804, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.0009739420062695925, |
|
"loss": 2.726, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.06296540673013479, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.0009737460815047022, |
|
"loss": 2.7389, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.06315392591196155, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 0.000973550156739812, |
|
"loss": 2.8134, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.0633424450937883, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.0009733542319749216, |
|
"loss": 2.7394, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.06353096427561504, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.0009731583072100314, |
|
"loss": 2.6256, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.0637194834574418, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 0.0009729623824451412, |
|
"loss": 2.7413, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.06390800263926855, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.0009727664576802508, |
|
"loss": 2.7725, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.06409652182109529, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.0009725705329153606, |
|
"loss": 2.8092, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.06428504100292205, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.0009723746081504702, |
|
"loss": 2.7276, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.0644735601847488, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.0009721786833855799, |
|
"loss": 2.5861, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.06466207936657555, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 0.0009719827586206896, |
|
"loss": 2.6467, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.0648505985484023, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.0009717868338557994, |
|
"loss": 2.7404, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.06503911773022905, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.000971590909090909, |
|
"loss": 2.6333, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.0652276369120558, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 0.0009713949843260188, |
|
"loss": 2.6079, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.06541615609388256, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.0009711990595611286, |
|
"loss": 2.5708, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.0656046752757093, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.0009710031347962382, |
|
"loss": 2.6675, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.06579319445753605, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.000970807210031348, |
|
"loss": 2.7782, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.06598171363936281, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.0009706112852664577, |
|
"loss": 2.6853, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.06617023282118956, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.0009704153605015674, |
|
"loss": 2.7684, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.0663587520030163, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.0009702194357366771, |
|
"loss": 2.5759, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.06654727118484306, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 0.0009700235109717869, |
|
"loss": 2.7151, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 0.06673579036666981, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 0.0009698275862068966, |
|
"loss": 2.6346, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.06692430954849656, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.0009696316614420063, |
|
"loss": 2.5878, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.06711282873032332, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.0009694357366771161, |
|
"loss": 2.6841, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.06730134791215006, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.0009692398119122258, |
|
"loss": 2.5688, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.06748986709397681, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.0009690438871473355, |
|
"loss": 2.5057, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.06767838627580357, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 0.0009688479623824452, |
|
"loss": 2.6444, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 0.06786690545763031, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.0009686520376175549, |
|
"loss": 2.6894, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.06805542463945706, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 0.0009684561128526645, |
|
"loss": 2.5921, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 0.06824394382128382, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 0.0009682601880877743, |
|
"loss": 2.7547, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.06843246300311057, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 0.000968064263322884, |
|
"loss": 2.7235, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.06862098218493731, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.0009678683385579937, |
|
"loss": 2.6726, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.06880950136676407, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 0.0009676724137931034, |
|
"loss": 2.7688, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.06899802054859082, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.0009674764890282132, |
|
"loss": 2.6567, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.06918653973041756, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.0009672805642633229, |
|
"loss": 2.7241, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.06937505891224433, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.0009670846394984326, |
|
"loss": 2.603, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.06956357809407107, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.0009668887147335424, |
|
"loss": 2.6863, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.06975209727589782, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.000966692789968652, |
|
"loss": 2.6655, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.06994061645772458, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.0009664968652037618, |
|
"loss": 2.5301, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.07012913563955132, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.0009663009404388715, |
|
"loss": 2.7405, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.07031765482137807, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.0009661050156739812, |
|
"loss": 2.7326, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.07050617400320483, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 0.000965909090909091, |
|
"loss": 2.6685, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.07069469318503158, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.0009657131661442007, |
|
"loss": 2.6664, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.07088321236685832, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.0009655172413793104, |
|
"loss": 2.5892, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.07107173154868508, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.0009653213166144201, |
|
"loss": 2.6351, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.07126025073051183, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 0.0009651253918495299, |
|
"loss": 2.6587, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.07144876991233857, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.0009649294670846394, |
|
"loss": 2.7744, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.07163728909416534, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.0009647335423197492, |
|
"loss": 2.7516, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.07182580827599208, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 0.0009645376175548589, |
|
"loss": 2.6607, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.07201432745781883, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.0009643416927899687, |
|
"loss": 2.7513, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.07220284663964559, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.0009641457680250783, |
|
"loss": 2.607, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.07239136582147233, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.0009639498432601881, |
|
"loss": 2.5463, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.07257988500329908, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 0.0009637539184952979, |
|
"loss": 2.6368, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.07276840418512584, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.0009635579937304075, |
|
"loss": 2.5846, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.07295692336695259, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.0009633620689655173, |
|
"loss": 2.7072, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.07314544254877933, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.000963166144200627, |
|
"loss": 2.6918, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.07333396173060609, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.0009629702194357367, |
|
"loss": 2.6682, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.07352248091243284, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.0009627742946708464, |
|
"loss": 2.6512, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.07371100009425958, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 0.0009625783699059562, |
|
"loss": 2.718, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.07389951927608635, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 0.0009623824451410658, |
|
"loss": 2.7208, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.07408803845791309, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.0009621865203761756, |
|
"loss": 2.7411, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.07427655763973984, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.0009619905956112854, |
|
"loss": 2.6763, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.0744650768215666, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.000961794670846395, |
|
"loss": 2.6919, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.07465359600339334, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 0.0009615987460815048, |
|
"loss": 2.767, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.07484211518522009, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.0009614028213166145, |
|
"loss": 2.6868, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.07503063436704685, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.0009612068965517241, |
|
"loss": 2.6393, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.0752191535488736, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 0.0009610109717868338, |
|
"loss": 2.5917, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.07540767273070034, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.0009608150470219436, |
|
"loss": 2.709, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.0755961919125271, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.0009606191222570532, |
|
"loss": 2.6591, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.07578471109435385, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.000960423197492163, |
|
"loss": 2.7638, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.0759732302761806, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.0009602272727272728, |
|
"loss": 2.58, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.07616174945800736, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.0009600313479623824, |
|
"loss": 2.5257, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.0763502686398341, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 0.0009598354231974922, |
|
"loss": 2.6512, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.07653878782166085, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.0009596394984326019, |
|
"loss": 2.6432, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.07672730700348761, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.0009594435736677116, |
|
"loss": 2.6028, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.07691582618531435, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 0.0009592476489028213, |
|
"loss": 2.707, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.0771043453671411, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 0.0009590517241379311, |
|
"loss": 2.5831, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 0.07729286454896786, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.0009588557993730408, |
|
"loss": 2.7447, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.0774813837307946, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 0.0009586598746081505, |
|
"loss": 2.6452, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 0.07766990291262135, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.0009584639498432603, |
|
"loss": 2.7138, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.07785842209444811, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 0.00095826802507837, |
|
"loss": 2.5726, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.07804694127627486, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.0009580721003134797, |
|
"loss": 2.7128, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.0782354604581016, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.0009578761755485894, |
|
"loss": 2.5482, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.07842397963992837, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 0.0009576802507836991, |
|
"loss": 2.768, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.07861249882175511, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 0.0009574843260188087, |
|
"loss": 2.774, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 0.07880101800358187, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.0009572884012539185, |
|
"loss": 2.7388, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.07898953718540862, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 0.0009570924764890282, |
|
"loss": 2.5905, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 0.07917805636723536, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.0009568965517241379, |
|
"loss": 2.7478, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.07936657554906212, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.0009567006269592476, |
|
"loss": 2.699, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 0.07955509473088887, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.0009565047021943574, |
|
"loss": 2.5614, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.07974361391271562, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.0009563087774294671, |
|
"loss": 2.6036, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 0.07993213309454238, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.0009561128526645768, |
|
"loss": 2.6515, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.08012065227636912, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.0009559169278996866, |
|
"loss": 2.7026, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.08030917145819587, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.0009557210031347962, |
|
"loss": 2.7497, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.08049769064002263, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.000955525078369906, |
|
"loss": 2.6129, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.08068620982184938, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.0009553291536050157, |
|
"loss": 2.6113, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.08087472900367612, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.0009551332288401254, |
|
"loss": 2.4547, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 0.08106324818550288, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.0009549373040752351, |
|
"loss": 2.6197, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.08125176736732963, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.0009547413793103449, |
|
"loss": 2.684, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 0.08144028654915637, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.0009545454545454546, |
|
"loss": 2.6874, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.08162880573098313, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.0009543495297805643, |
|
"loss": 2.6019, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 0.08181732491280988, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.0009541536050156741, |
|
"loss": 2.6309, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.08200584409463663, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 0.0009539576802507836, |
|
"loss": 2.6848, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.08219436327646339, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.0009537617554858934, |
|
"loss": 2.7124, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.08238288245829013, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.0009535658307210031, |
|
"loss": 2.5744, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 0.08257140164011688, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.0009533699059561129, |
|
"loss": 2.7689, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.08275992082194364, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 0.0009531739811912225, |
|
"loss": 2.7709, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 0.08294844000377039, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.0009529780564263323, |
|
"loss": 2.5495, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.08313695918559713, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.0009527821316614421, |
|
"loss": 2.696, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.08332547836742389, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.0009525862068965517, |
|
"loss": 2.6657, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.08351399754925064, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.0009523902821316615, |
|
"loss": 2.6998, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 0.08370251673107738, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.0009521943573667712, |
|
"loss": 2.7154, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.08389103591290414, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.0009519984326018809, |
|
"loss": 2.6478, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.08407955509473089, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.0009518025078369906, |
|
"loss": 2.6899, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.08426807427655764, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.0009516065830721004, |
|
"loss": 2.7137, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 0.0844565934583844, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.00095141065830721, |
|
"loss": 2.6207, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.08464511264021114, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.0009512147335423198, |
|
"loss": 2.7149, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 0.08483363182203789, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.0009510188087774296, |
|
"loss": 2.7011, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.08502215100386465, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.0009508228840125392, |
|
"loss": 2.6496, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 0.0852106701856914, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.000950626959247649, |
|
"loss": 2.6714, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.08539918936751814, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.0009504310344827587, |
|
"loss": 2.6271, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 0.0855877085493449, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.0009502351097178683, |
|
"loss": 2.6513, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.08577622773117165, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.000950039184952978, |
|
"loss": 2.6638, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.0859647469129984, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.0009498432601880878, |
|
"loss": 2.7398, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.08615326609482515, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.0009496473354231974, |
|
"loss": 2.7013, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 0.0863417852766519, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.0009494514106583072, |
|
"loss": 2.6336, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.08653030445847865, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.000949255485893417, |
|
"loss": 2.5915, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 0.0867188236403054, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.0009490595611285266, |
|
"loss": 2.6545, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.08690734282213215, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.0009488636363636364, |
|
"loss": 2.6792, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 0.0870958620039589, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.0009486677115987461, |
|
"loss": 2.6238, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.08728438118578566, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.0009484717868338558, |
|
"loss": 2.6929, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 0.0874729003676124, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.0009482758620689655, |
|
"loss": 2.7269, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.08766141954943915, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.0009480799373040753, |
|
"loss": 2.6728, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.08784993873126591, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 0.000947884012539185, |
|
"loss": 2.667, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.08803845791309266, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 0.0009476880877742947, |
|
"loss": 2.7706, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 0.0882269770949194, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 0.0009474921630094045, |
|
"loss": 2.7464, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.08841549627674616, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 0.0009472962382445142, |
|
"loss": 2.6004, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 0.08860401545857291, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.0009471003134796239, |
|
"loss": 2.6237, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.08879253464039966, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.0009469043887147336, |
|
"loss": 2.6628, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 0.08898105382222642, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.0009467084639498434, |
|
"loss": 2.7066, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.08916957300405316, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.0009465125391849529, |
|
"loss": 2.6655, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 0.08935809218587991, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 0.0009463166144200627, |
|
"loss": 2.6333, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 0.08954661136770667, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.0009461206896551724, |
|
"loss": 2.5766, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.08973513054953342, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 0.0009459247648902821, |
|
"loss": 2.7387, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.08992364973136016, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 0.0009457288401253918, |
|
"loss": 2.7342, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 0.09011216891318692, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.0009455329153605016, |
|
"loss": 2.6416, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 0.09030068809501367, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.0009453369905956113, |
|
"loss": 2.6143, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 0.09048920727684041, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.000945141065830721, |
|
"loss": 2.7185, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.09067772645866717, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 0.0009449451410658308, |
|
"loss": 2.6152, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 0.09086624564049392, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.0009447492163009404, |
|
"loss": 2.6592, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 0.09105476482232067, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.0009445532915360502, |
|
"loss": 2.5181, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 0.09124328400414743, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.0009443573667711599, |
|
"loss": 2.6332, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.09143180318597417, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 0.0009441614420062696, |
|
"loss": 2.521, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.09162032236780092, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.0009439655172413793, |
|
"loss": 2.6339, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 0.09180884154962768, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.0009437695924764891, |
|
"loss": 2.6627, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 0.09199736073145443, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.0009435736677115988, |
|
"loss": 2.6227, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.09218587991328117, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.0009433777429467085, |
|
"loss": 2.784, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 0.09237439909510793, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 0.0009431818181818183, |
|
"loss": 2.5622, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.09256291827693468, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 0.0009429858934169278, |
|
"loss": 2.6712, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 0.09275143745876142, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.0009427899686520376, |
|
"loss": 2.5781, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.09293995664058818, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 0.0009425940438871473, |
|
"loss": 2.6193, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 0.09312847582241493, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.0009423981191222571, |
|
"loss": 2.716, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 0.09331699500424168, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 0.0009422021943573667, |
|
"loss": 2.745, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.09350551418606844, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.0009420062695924765, |
|
"loss": 2.5251, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.09369403336789518, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.0009418103448275863, |
|
"loss": 2.7023, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 0.09388255254972193, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.0009416144200626959, |
|
"loss": 2.7697, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 0.09407107173154869, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.0009414184952978057, |
|
"loss": 2.6253, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 0.09425959091337544, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 0.0009412225705329154, |
|
"loss": 2.6668, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.09425959091337544, |
|
"eval_runtime": 58.5785, |
|
"eval_samples_per_second": 17.481, |
|
"eval_steps_per_second": 0.546, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.09425959091337544, |
|
"eval/hellaswag_acc": 0.37572196773551086, |
|
"eval/hellaswag_acc_norm": 0.4714200358494324, |
|
"eval_hellaswag_elapsed_time": 195.95180106163025, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.09444811009520218, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.0009410266457680251, |
|
"loss": 2.6645, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 0.09463662927702894, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 0.0009408307210031348, |
|
"loss": 2.7233, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 0.09482514845885569, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.0009406347962382446, |
|
"loss": 2.6959, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 0.09501366764068243, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.0009404388714733542, |
|
"loss": 2.747, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 0.0952021868225092, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 0.000940242946708464, |
|
"loss": 2.521, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.09539070600433594, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.0009400470219435738, |
|
"loss": 2.7368, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 0.09557922518616269, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.0009398510971786834, |
|
"loss": 2.6509, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 0.09576774436798945, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.0009396551724137932, |
|
"loss": 2.785, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 0.09595626354981619, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.0009394592476489029, |
|
"loss": 2.5647, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 0.09614478273164294, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.0009392633228840125, |
|
"loss": 2.6087, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.0963333019134697, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.0009390673981191222, |
|
"loss": 2.6032, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 0.09652182109529645, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.000938871473354232, |
|
"loss": 2.6934, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 0.09671034027712319, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.0009386755485893416, |
|
"loss": 2.7077, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 0.09689885945894995, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.0009384796238244514, |
|
"loss": 2.7372, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 0.0970873786407767, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.0009382836990595611, |
|
"loss": 2.5907, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.09727589782260344, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.0009380877742946708, |
|
"loss": 2.5623, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 0.0974644170044302, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.0009378918495297806, |
|
"loss": 2.6949, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 0.09765293618625695, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 0.0009376959247648903, |
|
"loss": 2.6505, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 0.0978414553680837, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.0009375, |
|
"loss": 2.6902, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 0.09802997454991046, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.0009373040752351097, |
|
"loss": 2.6529, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.0982184937317372, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.0009371081504702195, |
|
"loss": 2.5571, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 0.09840701291356395, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.0009369122257053292, |
|
"loss": 2.7718, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 0.09859553209539071, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.0009367163009404389, |
|
"loss": 2.7112, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 0.09878405127721746, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.0009365203761755486, |
|
"loss": 2.5318, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 0.0989725704590442, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.0009363244514106584, |
|
"loss": 2.6242, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.09916108964087096, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.0009361285266457681, |
|
"loss": 2.6603, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 0.09934960882269771, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.0009359326018808778, |
|
"loss": 2.7204, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 0.09953812800452445, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.0009357366771159876, |
|
"loss": 2.6355, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 0.09972664718635121, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.0009355407523510971, |
|
"loss": 2.6126, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 0.09991516636817796, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.0009353448275862069, |
|
"loss": 2.5042, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.1001036855500047, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.0009351489028213166, |
|
"loss": 2.639, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 0.10029220473183147, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.0009349529780564263, |
|
"loss": 2.6981, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 0.10048072391365821, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.000934757053291536, |
|
"loss": 2.6578, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 0.10066924309548496, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.0009345611285266458, |
|
"loss": 2.7651, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 0.10085776227731172, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.0009343652037617555, |
|
"loss": 2.6639, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.10104628145913847, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.0009341692789968652, |
|
"loss": 2.6911, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 0.10123480064096523, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.000933973354231975, |
|
"loss": 2.6213, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 0.10142331982279197, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.0009337774294670846, |
|
"loss": 2.6084, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 0.10161183900461872, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.0009335815047021944, |
|
"loss": 2.6893, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 0.10180035818644548, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.0009333855799373041, |
|
"loss": 2.547, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.10198887736827222, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.0009331896551724138, |
|
"loss": 2.7084, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 0.10217739655009897, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.0009329937304075235, |
|
"loss": 2.6611, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 0.10236591573192573, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.0009327978056426333, |
|
"loss": 2.658, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 0.10255443491375248, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.000932601880877743, |
|
"loss": 2.7423, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 0.10274295409557922, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 0.0009324059561128527, |
|
"loss": 2.6237, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.10293147327740598, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.0009322100313479625, |
|
"loss": 2.6846, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 0.10311999245923273, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.0009320141065830722, |
|
"loss": 2.5963, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 0.10330851164105948, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 0.0009318181818181818, |
|
"loss": 2.6334, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 0.10349703082288624, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.0009316222570532915, |
|
"loss": 2.6657, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 0.10368555000471298, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.0009314263322884013, |
|
"loss": 2.7307, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.10387406918653973, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.0009312304075235109, |
|
"loss": 2.7773, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 0.10406258836836649, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.0009310344827586207, |
|
"loss": 2.7054, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 0.10425110755019323, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.0009308385579937305, |
|
"loss": 2.6325, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 0.10443962673201998, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.0009306426332288401, |
|
"loss": 2.5011, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 0.10462814591384674, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.0009304467084639499, |
|
"loss": 2.6252, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.10481666509567349, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.0009302507836990596, |
|
"loss": 2.6176, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 0.10500518427750023, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.0009300548589341693, |
|
"loss": 2.699, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 0.10519370345932699, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.000929858934169279, |
|
"loss": 2.5058, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 0.10538222264115374, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.0009296630094043888, |
|
"loss": 2.6034, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 0.10557074182298049, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 0.0009294670846394984, |
|
"loss": 2.5785, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.10575926100480725, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 0.0009292711598746082, |
|
"loss": 2.7846, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 0.10594778018663399, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 0.000929075235109718, |
|
"loss": 2.7049, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 0.10613629936846074, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.0009288793103448276, |
|
"loss": 2.72, |
|
"step": 563 |
|
}, |
|
{ |
|
"epoch": 0.1063248185502875, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.0009286833855799374, |
|
"loss": 2.7468, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 0.10651333773211424, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.0009284874608150471, |
|
"loss": 2.706, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.10670185691394099, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.0009282915360501567, |
|
"loss": 2.6664, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 0.10689037609576775, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.0009280956112852664, |
|
"loss": 2.6685, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 0.1070788952775945, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 0.0009278996865203762, |
|
"loss": 2.6295, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 0.10726741445942124, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.0009277037617554858, |
|
"loss": 2.7556, |
|
"step": 569 |
|
}, |
|
{ |
|
"epoch": 0.107455933641248, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.0009275078369905956, |
|
"loss": 2.6027, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.10764445282307475, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 0.0009273119122257053, |
|
"loss": 2.622, |
|
"step": 571 |
|
}, |
|
{ |
|
"epoch": 0.1078329720049015, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 0.000927115987460815, |
|
"loss": 2.6136, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 0.10802149118672826, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 0.0009269200626959248, |
|
"loss": 2.6196, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 0.108210010368555, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 0.0009267241379310345, |
|
"loss": 2.6569, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 0.10839852955038175, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.0009265282131661443, |
|
"loss": 2.7018, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.10858704873220851, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.0009263322884012539, |
|
"loss": 2.5521, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 0.10877556791403525, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.0009261363636363637, |
|
"loss": 2.6091, |
|
"step": 577 |
|
}, |
|
{ |
|
"epoch": 0.108964087095862, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.0009259404388714734, |
|
"loss": 2.5969, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 0.10915260627768876, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.0009257445141065831, |
|
"loss": 2.6032, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 0.1093411254595155, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.0009255485893416928, |
|
"loss": 2.6755, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.10952964464134225, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.0009253526645768026, |
|
"loss": 2.6504, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 0.10971816382316901, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.0009251567398119123, |
|
"loss": 2.7218, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 0.10990668300499576, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.000924960815047022, |
|
"loss": 2.6733, |
|
"step": 583 |
|
}, |
|
{ |
|
"epoch": 0.1100952021868225, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.0009247648902821318, |
|
"loss": 2.7395, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 0.11028372136864927, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.0009245689655172413, |
|
"loss": 2.7391, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.11047224055047601, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.0009243730407523511, |
|
"loss": 2.5436, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 0.11066075973230276, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.0009241771159874608, |
|
"loss": 2.6671, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 0.11084927891412952, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.0009239811912225705, |
|
"loss": 2.7918, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 0.11103779809595626, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.0009237852664576802, |
|
"loss": 2.6591, |
|
"step": 589 |
|
}, |
|
{ |
|
"epoch": 0.11122631727778301, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.00092358934169279, |
|
"loss": 2.5917, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.11141483645960977, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.0009233934169278996, |
|
"loss": 2.5555, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 0.11160335564143652, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 0.0009231974921630094, |
|
"loss": 2.7138, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 0.11179187482326326, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.0009230015673981192, |
|
"loss": 2.7638, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 0.11198039400509002, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 0.0009228056426332288, |
|
"loss": 2.6662, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 0.11216891318691677, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.0009226097178683386, |
|
"loss": 2.7772, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.11235743236874352, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 0.0009224137931034483, |
|
"loss": 2.7929, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 0.11254595155057028, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.000922217868338558, |
|
"loss": 2.7971, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 0.11273447073239702, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.0009220219435736677, |
|
"loss": 2.7352, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 0.11292298991422377, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.0009218260188087775, |
|
"loss": 2.6516, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 0.11311150909605053, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.0009216300940438871, |
|
"loss": 2.7391, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.11330002827787727, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 0.0009214341692789969, |
|
"loss": 2.6186, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 0.11348854745970402, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.0009212382445141067, |
|
"loss": 2.7238, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 0.11367706664153078, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 0.0009210423197492164, |
|
"loss": 2.5381, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 0.11386558582335753, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 0.000920846394984326, |
|
"loss": 2.6816, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 0.11405410500518427, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.0009206504702194357, |
|
"loss": 2.5738, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.11424262418701103, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.0009204545454545455, |
|
"loss": 2.6701, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 0.11443114336883778, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 0.0009202586206896551, |
|
"loss": 2.668, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 0.11461966255066453, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 0.0009200626959247649, |
|
"loss": 2.6853, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 0.11480818173249129, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.0009198667711598747, |
|
"loss": 2.6926, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 0.11499670091431803, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.0009196708463949843, |
|
"loss": 2.7387, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.11518522009614478, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 0.0009194749216300941, |
|
"loss": 2.613, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 0.11537373927797154, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.0009192789968652038, |
|
"loss": 2.5144, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 0.11556225845979828, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 0.0009190830721003135, |
|
"loss": 2.7777, |
|
"step": 613 |
|
}, |
|
{ |
|
"epoch": 0.11575077764162503, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 0.0009188871473354232, |
|
"loss": 2.6103, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 0.11593929682345179, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.000918691222570533, |
|
"loss": 2.6806, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.11612781600527854, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.0009184952978056426, |
|
"loss": 2.6122, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 0.11631633518710528, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.0009182993730407524, |
|
"loss": 2.6198, |
|
"step": 617 |
|
}, |
|
{ |
|
"epoch": 0.11650485436893204, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.0009181034482758622, |
|
"loss": 2.5324, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 0.11669337355075879, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 0.0009179075235109718, |
|
"loss": 2.6424, |
|
"step": 619 |
|
}, |
|
{ |
|
"epoch": 0.11688189273258554, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.0009177115987460816, |
|
"loss": 2.6544, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.1170704119144123, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.0009175156739811913, |
|
"loss": 2.6725, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 0.11725893109623904, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.000917319749216301, |
|
"loss": 2.6113, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 0.11744745027806579, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.0009171238244514106, |
|
"loss": 2.6232, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 0.11763596945989255, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.0009169278996865204, |
|
"loss": 2.5457, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 0.1178244886417193, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 0.00091673197492163, |
|
"loss": 2.5777, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.11801300782354604, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.0009165360501567398, |
|
"loss": 2.6668, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 0.1182015270053728, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 0.0009163401253918495, |
|
"loss": 2.7571, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 0.11839004618719955, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.0009161442006269592, |
|
"loss": 2.6457, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 0.11857856536902629, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.000915948275862069, |
|
"loss": 2.5717, |
|
"step": 629 |
|
}, |
|
{ |
|
"epoch": 0.11876708455085305, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.0009157523510971787, |
|
"loss": 2.6518, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.1189556037326798, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.0009155564263322885, |
|
"loss": 2.6215, |
|
"step": 631 |
|
}, |
|
{ |
|
"epoch": 0.11914412291450655, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.0009153605015673981, |
|
"loss": 2.6829, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 0.1193326420963333, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.0009151645768025079, |
|
"loss": 2.6905, |
|
"step": 633 |
|
}, |
|
{ |
|
"epoch": 0.11952116127816005, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.0009149686520376176, |
|
"loss": 2.5993, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 0.1197096804599868, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.0009147727272727273, |
|
"loss": 2.6115, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.11989819964181356, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.000914576802507837, |
|
"loss": 2.7553, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 0.1200867188236403, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.0009143808777429468, |
|
"loss": 2.685, |
|
"step": 637 |
|
}, |
|
{ |
|
"epoch": 0.12027523800546705, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.0009141849529780565, |
|
"loss": 2.6026, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 0.12046375718729381, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.0009139890282131662, |
|
"loss": 2.7128, |
|
"step": 639 |
|
}, |
|
{ |
|
"epoch": 0.12065227636912056, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.000913793103448276, |
|
"loss": 2.5892, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.1208407955509473, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.0009135971786833855, |
|
"loss": 2.6472, |
|
"step": 641 |
|
}, |
|
{ |
|
"epoch": 0.12102931473277406, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.0009134012539184953, |
|
"loss": 2.6738, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 0.12121783391460081, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 0.000913205329153605, |
|
"loss": 2.6913, |
|
"step": 643 |
|
}, |
|
{ |
|
"epoch": 0.12140635309642756, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.0009130094043887147, |
|
"loss": 2.6355, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 0.12159487227825432, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 0.0009128134796238244, |
|
"loss": 2.5979, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.12178339146008106, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.0009126175548589342, |
|
"loss": 2.7308, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 0.12197191064190781, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.0009124216300940438, |
|
"loss": 2.6809, |
|
"step": 647 |
|
}, |
|
{ |
|
"epoch": 0.12216042982373457, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.0009122257053291536, |
|
"loss": 2.7713, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 0.12234894900556131, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 0.0009120297805642634, |
|
"loss": 2.4518, |
|
"step": 649 |
|
}, |
|
{ |
|
"epoch": 0.12253746818738806, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.000911833855799373, |
|
"loss": 2.5884, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.12272598736921482, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.0009116379310344828, |
|
"loss": 2.6641, |
|
"step": 651 |
|
}, |
|
{ |
|
"epoch": 0.12291450655104157, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.0009114420062695925, |
|
"loss": 2.5444, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 0.12310302573286831, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.0009112460815047022, |
|
"loss": 2.6773, |
|
"step": 653 |
|
}, |
|
{ |
|
"epoch": 0.12329154491469507, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.0009110501567398119, |
|
"loss": 2.6689, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 0.12348006409652182, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.0009108542319749217, |
|
"loss": 2.8371, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.12366858327834858, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.0009106583072100313, |
|
"loss": 2.6491, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 0.12385710246017533, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.0009104623824451411, |
|
"loss": 2.7192, |
|
"step": 657 |
|
}, |
|
{ |
|
"epoch": 0.12404562164200207, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.0009102664576802509, |
|
"loss": 2.7254, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 0.12423414082382883, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.0009100705329153606, |
|
"loss": 2.6403, |
|
"step": 659 |
|
}, |
|
{ |
|
"epoch": 0.12442266000565558, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.0009098746081504702, |
|
"loss": 2.6217, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.12461117918748232, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 0.0009096786833855799, |
|
"loss": 2.7402, |
|
"step": 661 |
|
}, |
|
{ |
|
"epoch": 0.12479969836930908, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.0009094827586206897, |
|
"loss": 2.7237, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 0.12498821755113583, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.0009092868338557993, |
|
"loss": 2.5321, |
|
"step": 663 |
|
}, |
|
{ |
|
"epoch": 0.1251767367329626, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.0009090909090909091, |
|
"loss": 2.4766, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 0.12536525591478934, |
|
"grad_norm": 0.625, |
|
"learning_rate": 0.0009088949843260188, |
|
"loss": 2.5655, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.12555377509661608, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.0009086990595611285, |
|
"loss": 2.4495, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 0.12574229427844283, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.0009085031347962383, |
|
"loss": 2.7035, |
|
"step": 667 |
|
}, |
|
{ |
|
"epoch": 0.12593081346026958, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.000908307210031348, |
|
"loss": 2.5528, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 0.12611933264209632, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.0009081112852664577, |
|
"loss": 2.5787, |
|
"step": 669 |
|
}, |
|
{ |
|
"epoch": 0.1263078518239231, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.0009079153605015674, |
|
"loss": 2.6167, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.12649637100574984, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.0009077194357366772, |
|
"loss": 2.7147, |
|
"step": 671 |
|
}, |
|
{ |
|
"epoch": 0.1266848901875766, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.0009075235109717868, |
|
"loss": 2.7819, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 0.12687340936940333, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.0009073275862068966, |
|
"loss": 2.5718, |
|
"step": 673 |
|
}, |
|
{ |
|
"epoch": 0.12706192855123008, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.0009071316614420063, |
|
"loss": 2.6887, |
|
"step": 674 |
|
}, |
|
{ |
|
"epoch": 0.12725044773305683, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.000906935736677116, |
|
"loss": 2.6037, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.1274389669148836, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 0.0009067398119122258, |
|
"loss": 2.6148, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 0.12762748609671035, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 0.0009065438871473355, |
|
"loss": 2.6737, |
|
"step": 677 |
|
}, |
|
{ |
|
"epoch": 0.1278160052785371, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 0.0009063479623824452, |
|
"loss": 2.7679, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 0.12800452446036384, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.0009061520376175548, |
|
"loss": 2.6948, |
|
"step": 679 |
|
}, |
|
{ |
|
"epoch": 0.12819304364219059, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.0009059561128526646, |
|
"loss": 2.6185, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.12838156282401733, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 0.0009057601880877742, |
|
"loss": 2.7351, |
|
"step": 681 |
|
}, |
|
{ |
|
"epoch": 0.1285700820058441, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.000905564263322884, |
|
"loss": 2.6394, |
|
"step": 682 |
|
}, |
|
{ |
|
"epoch": 0.12875860118767085, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.0009053683385579937, |
|
"loss": 2.7473, |
|
"step": 683 |
|
}, |
|
{ |
|
"epoch": 0.1289471203694976, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.0009051724137931034, |
|
"loss": 2.5965, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 0.12913563955132434, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.0009049764890282132, |
|
"loss": 2.8092, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.1293241587331511, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.0009047805642633229, |
|
"loss": 2.58, |
|
"step": 686 |
|
}, |
|
{ |
|
"epoch": 0.12951267791497784, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.0009045846394984327, |
|
"loss": 2.6549, |
|
"step": 687 |
|
}, |
|
{ |
|
"epoch": 0.1297011970968046, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.0009043887147335423, |
|
"loss": 2.6374, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 0.12988971627863136, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.0009041927899686521, |
|
"loss": 2.5968, |
|
"step": 689 |
|
}, |
|
{ |
|
"epoch": 0.1300782354604581, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.0009039968652037618, |
|
"loss": 2.6792, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.13026675464228485, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.0009038009404388715, |
|
"loss": 2.6631, |
|
"step": 691 |
|
}, |
|
{ |
|
"epoch": 0.1304552738241116, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 0.0009036050156739812, |
|
"loss": 2.7008, |
|
"step": 692 |
|
}, |
|
{ |
|
"epoch": 0.13064379300593834, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.000903409090909091, |
|
"loss": 2.5822, |
|
"step": 693 |
|
}, |
|
{ |
|
"epoch": 0.13083231218776512, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.0009032131661442007, |
|
"loss": 2.5259, |
|
"step": 694 |
|
}, |
|
{ |
|
"epoch": 0.13102083136959186, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.0009030172413793104, |
|
"loss": 2.5717, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.1312093505514186, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.0009028213166144202, |
|
"loss": 2.7658, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 0.13139786973324535, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 0.0009026253918495298, |
|
"loss": 2.6304, |
|
"step": 697 |
|
}, |
|
{ |
|
"epoch": 0.1315863889150721, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.0009024294670846395, |
|
"loss": 2.6773, |
|
"step": 698 |
|
}, |
|
{ |
|
"epoch": 0.13177490809689885, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 0.0009022335423197492, |
|
"loss": 2.6761, |
|
"step": 699 |
|
}, |
|
{ |
|
"epoch": 0.13196342727872562, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 0.0009020376175548589, |
|
"loss": 2.6741, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.13215194646055237, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.0009018416927899686, |
|
"loss": 2.5493, |
|
"step": 701 |
|
}, |
|
{ |
|
"epoch": 0.1323404656423791, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.0009016457680250784, |
|
"loss": 2.6143, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 0.13252898482420586, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 0.000901449843260188, |
|
"loss": 2.6671, |
|
"step": 703 |
|
}, |
|
{ |
|
"epoch": 0.1327175040060326, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.0009012539184952978, |
|
"loss": 2.6108, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 0.13290602318785935, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.0009010579937304076, |
|
"loss": 2.6649, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.13309454236968613, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.0009008620689655172, |
|
"loss": 2.6925, |
|
"step": 706 |
|
}, |
|
{ |
|
"epoch": 0.13328306155151287, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.000900666144200627, |
|
"loss": 2.6506, |
|
"step": 707 |
|
}, |
|
{ |
|
"epoch": 0.13347158073333962, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 0.0009004702194357367, |
|
"loss": 2.7378, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 0.13366009991516636, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.0009002742946708464, |
|
"loss": 2.6705, |
|
"step": 709 |
|
}, |
|
{ |
|
"epoch": 0.1338486190969931, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.0009000783699059561, |
|
"loss": 2.7019, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.13403713827881986, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.0008998824451410659, |
|
"loss": 2.6219, |
|
"step": 711 |
|
}, |
|
{ |
|
"epoch": 0.13422565746064663, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 0.0008996865203761755, |
|
"loss": 2.5641, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 0.13441417664247338, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 0.0008994905956112853, |
|
"loss": 2.6977, |
|
"step": 713 |
|
}, |
|
{ |
|
"epoch": 0.13460269582430012, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.0008992946708463951, |
|
"loss": 2.7332, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 0.13479121500612687, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.0008990987460815048, |
|
"loss": 2.5868, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.13497973418795361, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.0008989028213166145, |
|
"loss": 2.6311, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 0.1351682533697804, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.0008987068965517241, |
|
"loss": 2.6369, |
|
"step": 717 |
|
}, |
|
{ |
|
"epoch": 0.13535677255160714, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.0008985109717868339, |
|
"loss": 2.778, |
|
"step": 718 |
|
}, |
|
{ |
|
"epoch": 0.13554529173343388, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 0.0008983150470219435, |
|
"loss": 2.6355, |
|
"step": 719 |
|
}, |
|
{ |
|
"epoch": 0.13573381091526063, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.0008981191222570533, |
|
"loss": 2.6524, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.13592233009708737, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.000897923197492163, |
|
"loss": 2.7543, |
|
"step": 721 |
|
}, |
|
{ |
|
"epoch": 0.13611084927891412, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.0008977272727272727, |
|
"loss": 2.7062, |
|
"step": 722 |
|
}, |
|
{ |
|
"epoch": 0.1362993684607409, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 0.0008975313479623825, |
|
"loss": 2.7506, |
|
"step": 723 |
|
}, |
|
{ |
|
"epoch": 0.13648788764256764, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.0008973354231974922, |
|
"loss": 2.6015, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 0.1366764068243944, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 0.0008971394984326019, |
|
"loss": 2.6391, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.13686492600622113, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 0.0008969435736677116, |
|
"loss": 2.7587, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 0.13705344518804788, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.0008967476489028214, |
|
"loss": 2.7175, |
|
"step": 727 |
|
}, |
|
{ |
|
"epoch": 0.13724196436987462, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.000896551724137931, |
|
"loss": 2.6185, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 0.1374304835517014, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.0008963557993730408, |
|
"loss": 2.5831, |
|
"step": 729 |
|
}, |
|
{ |
|
"epoch": 0.13761900273352815, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 0.0008961598746081505, |
|
"loss": 2.6364, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.1378075219153549, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.0008959639498432602, |
|
"loss": 2.5928, |
|
"step": 731 |
|
}, |
|
{ |
|
"epoch": 0.13799604109718164, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.00089576802507837, |
|
"loss": 2.6144, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 0.13818456027900838, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.0008955721003134797, |
|
"loss": 2.5887, |
|
"step": 733 |
|
}, |
|
{ |
|
"epoch": 0.13837307946083513, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 0.0008953761755485894, |
|
"loss": 2.5888, |
|
"step": 734 |
|
}, |
|
{ |
|
"epoch": 0.1385615986426619, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 0.000895180250783699, |
|
"loss": 2.5938, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.13875011782448865, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 0.0008949843260188088, |
|
"loss": 2.5876, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 0.1389386370063154, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 0.0008947884012539184, |
|
"loss": 2.6677, |
|
"step": 737 |
|
}, |
|
{ |
|
"epoch": 0.13912715618814214, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 0.0008945924764890282, |
|
"loss": 2.5932, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 0.1393156753699689, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.0008943965517241379, |
|
"loss": 2.6358, |
|
"step": 739 |
|
}, |
|
{ |
|
"epoch": 0.13950419455179563, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.0008942006269592476, |
|
"loss": 2.6529, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.1396927137336224, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.0008940047021943573, |
|
"loss": 2.6557, |
|
"step": 741 |
|
}, |
|
{ |
|
"epoch": 0.13988123291544916, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.0008938087774294671, |
|
"loss": 2.6333, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 0.1400697520972759, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.0008936128526645769, |
|
"loss": 2.5974, |
|
"step": 743 |
|
}, |
|
{ |
|
"epoch": 0.14025827127910265, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.0008934169278996865, |
|
"loss": 2.6484, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 0.1404467904609294, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 0.0008932210031347963, |
|
"loss": 2.617, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.14063530964275614, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 0.000893025078369906, |
|
"loss": 2.6803, |
|
"step": 746 |
|
}, |
|
{ |
|
"epoch": 0.14082382882458291, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.0008928291536050157, |
|
"loss": 2.6882, |
|
"step": 747 |
|
}, |
|
{ |
|
"epoch": 0.14101234800640966, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 0.0008926332288401254, |
|
"loss": 2.6814, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 0.1412008671882364, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.0008924373040752352, |
|
"loss": 2.7618, |
|
"step": 749 |
|
}, |
|
{ |
|
"epoch": 0.14138938637006315, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.0008922413793103448, |
|
"loss": 2.7598, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.1415779055518899, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 0.0008920454545454546, |
|
"loss": 2.6827, |
|
"step": 751 |
|
}, |
|
{ |
|
"epoch": 0.14176642473371664, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 0.0008918495297805644, |
|
"loss": 2.7531, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 0.14195494391554342, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.000891653605015674, |
|
"loss": 2.5641, |
|
"step": 753 |
|
}, |
|
{ |
|
"epoch": 0.14214346309737017, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.0008914576802507837, |
|
"loss": 2.6333, |
|
"step": 754 |
|
}, |
|
{ |
|
"epoch": 0.1423319822791969, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.0008912617554858934, |
|
"loss": 2.7232, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.14252050146102366, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.0008910658307210031, |
|
"loss": 2.6999, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 0.1427090206428504, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.0008908699059561128, |
|
"loss": 2.6267, |
|
"step": 757 |
|
}, |
|
{ |
|
"epoch": 0.14289753982467715, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 0.0008906739811912226, |
|
"loss": 2.6947, |
|
"step": 758 |
|
}, |
|
{ |
|
"epoch": 0.14308605900650392, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 0.0008904780564263322, |
|
"loss": 2.6448, |
|
"step": 759 |
|
}, |
|
{ |
|
"epoch": 0.14327457818833067, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.000890282131661442, |
|
"loss": 2.7433, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.14346309737015742, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.0008900862068965518, |
|
"loss": 2.6372, |
|
"step": 761 |
|
}, |
|
{ |
|
"epoch": 0.14365161655198416, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.0008898902821316614, |
|
"loss": 2.7054, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 0.1438401357338109, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.0008896943573667712, |
|
"loss": 2.6433, |
|
"step": 763 |
|
}, |
|
{ |
|
"epoch": 0.14402865491563765, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.0008894984326018809, |
|
"loss": 2.6648, |
|
"step": 764 |
|
}, |
|
{ |
|
"epoch": 0.14421717409746443, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.0008893025078369906, |
|
"loss": 2.6961, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.14440569327929118, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.0008891065830721003, |
|
"loss": 2.5835, |
|
"step": 766 |
|
}, |
|
{ |
|
"epoch": 0.14459421246111792, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.0008889106583072101, |
|
"loss": 2.6922, |
|
"step": 767 |
|
}, |
|
{ |
|
"epoch": 0.14478273164294467, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.0008887147335423197, |
|
"loss": 2.5899, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 0.1449712508247714, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.0008885188087774295, |
|
"loss": 2.6164, |
|
"step": 769 |
|
}, |
|
{ |
|
"epoch": 0.14515977000659816, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.0008883228840125393, |
|
"loss": 2.708, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.14534828918842493, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.000888126959247649, |
|
"loss": 2.59, |
|
"step": 771 |
|
}, |
|
{ |
|
"epoch": 0.14553680837025168, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.0008879310344827587, |
|
"loss": 2.5905, |
|
"step": 772 |
|
}, |
|
{ |
|
"epoch": 0.14572532755207843, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.0008877351097178683, |
|
"loss": 2.6069, |
|
"step": 773 |
|
}, |
|
{ |
|
"epoch": 0.14591384673390517, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.0008875391849529781, |
|
"loss": 2.695, |
|
"step": 774 |
|
}, |
|
{ |
|
"epoch": 0.14610236591573192, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.0008873432601880877, |
|
"loss": 2.7418, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.14629088509755866, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.0008871473354231975, |
|
"loss": 2.6499, |
|
"step": 776 |
|
}, |
|
{ |
|
"epoch": 0.14647940427938544, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.0008869514106583072, |
|
"loss": 2.6959, |
|
"step": 777 |
|
}, |
|
{ |
|
"epoch": 0.14666792346121219, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.0008867554858934169, |
|
"loss": 2.6255, |
|
"step": 778 |
|
}, |
|
{ |
|
"epoch": 0.14685644264303893, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.0008865595611285267, |
|
"loss": 2.6885, |
|
"step": 779 |
|
}, |
|
{ |
|
"epoch": 0.14704496182486568, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.0008863636363636364, |
|
"loss": 2.6154, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.14723348100669242, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 0.0008861677115987461, |
|
"loss": 2.6715, |
|
"step": 781 |
|
}, |
|
{ |
|
"epoch": 0.14742200018851917, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.0008859717868338558, |
|
"loss": 2.6356, |
|
"step": 782 |
|
}, |
|
{ |
|
"epoch": 0.14761051937034594, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.0008857758620689656, |
|
"loss": 2.6181, |
|
"step": 783 |
|
}, |
|
{ |
|
"epoch": 0.1477990385521727, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.0008855799373040752, |
|
"loss": 2.6921, |
|
"step": 784 |
|
}, |
|
{ |
|
"epoch": 0.14798755773399944, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 0.000885384012539185, |
|
"loss": 2.5067, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.14817607691582618, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.0008851880877742947, |
|
"loss": 2.729, |
|
"step": 786 |
|
}, |
|
{ |
|
"epoch": 0.14836459609765293, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.0008849921630094044, |
|
"loss": 2.6272, |
|
"step": 787 |
|
}, |
|
{ |
|
"epoch": 0.14855311527947967, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 0.0008847962382445142, |
|
"loss": 2.5603, |
|
"step": 788 |
|
}, |
|
{ |
|
"epoch": 0.14874163446130645, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.0008846003134796239, |
|
"loss": 2.7072, |
|
"step": 789 |
|
}, |
|
{ |
|
"epoch": 0.1489301536431332, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.0008844043887147336, |
|
"loss": 2.6431, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.14911867282495994, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.0008842084639498433, |
|
"loss": 2.6058, |
|
"step": 791 |
|
}, |
|
{ |
|
"epoch": 0.1493071920067867, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.000884012539184953, |
|
"loss": 2.6369, |
|
"step": 792 |
|
}, |
|
{ |
|
"epoch": 0.14949571118861343, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 0.0008838166144200626, |
|
"loss": 2.5803, |
|
"step": 793 |
|
}, |
|
{ |
|
"epoch": 0.14968423037044018, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.0008836206896551724, |
|
"loss": 2.6228, |
|
"step": 794 |
|
}, |
|
{ |
|
"epoch": 0.14987274955226695, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 0.0008834247648902821, |
|
"loss": 2.5875, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.1500612687340937, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 0.0008832288401253918, |
|
"loss": 2.6705, |
|
"step": 796 |
|
}, |
|
{ |
|
"epoch": 0.15024978791592045, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 0.0008830329153605015, |
|
"loss": 2.6356, |
|
"step": 797 |
|
}, |
|
{ |
|
"epoch": 0.1504383070977472, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.0008828369905956113, |
|
"loss": 2.6354, |
|
"step": 798 |
|
}, |
|
{ |
|
"epoch": 0.15062682627957394, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.0008826410658307211, |
|
"loss": 2.7054, |
|
"step": 799 |
|
}, |
|
{ |
|
"epoch": 0.15081534546140068, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.0008824451410658307, |
|
"loss": 2.5472, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.15100386464322746, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 0.0008822492163009405, |
|
"loss": 2.5917, |
|
"step": 801 |
|
}, |
|
{ |
|
"epoch": 0.1511923838250542, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.0008820532915360502, |
|
"loss": 2.6797, |
|
"step": 802 |
|
}, |
|
{ |
|
"epoch": 0.15138090300688095, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.0008818573667711599, |
|
"loss": 2.6894, |
|
"step": 803 |
|
}, |
|
{ |
|
"epoch": 0.1515694221887077, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 0.0008816614420062696, |
|
"loss": 2.6542, |
|
"step": 804 |
|
}, |
|
{ |
|
"epoch": 0.15175794137053444, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.0008814655172413794, |
|
"loss": 2.7165, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.1519464605523612, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.000881269592476489, |
|
"loss": 2.6396, |
|
"step": 806 |
|
}, |
|
{ |
|
"epoch": 0.15213497973418796, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.0008810736677115988, |
|
"loss": 2.5562, |
|
"step": 807 |
|
}, |
|
{ |
|
"epoch": 0.1523234989160147, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.0008808777429467086, |
|
"loss": 2.5097, |
|
"step": 808 |
|
}, |
|
{ |
|
"epoch": 0.15251201809784146, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.0008806818181818182, |
|
"loss": 2.6022, |
|
"step": 809 |
|
}, |
|
{ |
|
"epoch": 0.1527005372796682, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.0008804858934169279, |
|
"loss": 2.6144, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.15288905646149495, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.0008802899686520376, |
|
"loss": 2.7956, |
|
"step": 811 |
|
}, |
|
{ |
|
"epoch": 0.1530775756433217, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.0008800940438871473, |
|
"loss": 2.6763, |
|
"step": 812 |
|
}, |
|
{ |
|
"epoch": 0.15326609482514847, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.000879898119122257, |
|
"loss": 2.7261, |
|
"step": 813 |
|
}, |
|
{ |
|
"epoch": 0.15345461400697522, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.0008797021943573668, |
|
"loss": 2.5969, |
|
"step": 814 |
|
}, |
|
{ |
|
"epoch": 0.15364313318880196, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.0008795062695924764, |
|
"loss": 2.5595, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.1538316523706287, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.0008793103448275862, |
|
"loss": 2.4956, |
|
"step": 816 |
|
}, |
|
{ |
|
"epoch": 0.15402017155245545, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.000879114420062696, |
|
"loss": 2.6691, |
|
"step": 817 |
|
}, |
|
{ |
|
"epoch": 0.1542086907342822, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 0.0008789184952978056, |
|
"loss": 2.5205, |
|
"step": 818 |
|
}, |
|
{ |
|
"epoch": 0.15439720991610897, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.0008787225705329154, |
|
"loss": 2.5277, |
|
"step": 819 |
|
}, |
|
{ |
|
"epoch": 0.15458572909793572, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 0.0008785266457680251, |
|
"loss": 2.6255, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.15477424827976247, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.0008783307210031348, |
|
"loss": 2.6624, |
|
"step": 821 |
|
}, |
|
{ |
|
"epoch": 0.1549627674615892, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 0.0008781347962382445, |
|
"loss": 2.6192, |
|
"step": 822 |
|
}, |
|
{ |
|
"epoch": 0.15515128664341596, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.0008779388714733543, |
|
"loss": 2.6637, |
|
"step": 823 |
|
}, |
|
{ |
|
"epoch": 0.1553398058252427, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.000877742946708464, |
|
"loss": 2.6116, |
|
"step": 824 |
|
}, |
|
{ |
|
"epoch": 0.15552832500706948, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.0008775470219435737, |
|
"loss": 2.7295, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.15571684418889623, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.0008773510971786835, |
|
"loss": 2.6604, |
|
"step": 826 |
|
}, |
|
{ |
|
"epoch": 0.15590536337072297, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.0008771551724137932, |
|
"loss": 2.661, |
|
"step": 827 |
|
}, |
|
{ |
|
"epoch": 0.15609388255254972, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.0008769592476489029, |
|
"loss": 2.6784, |
|
"step": 828 |
|
}, |
|
{ |
|
"epoch": 0.15628240173437646, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.0008767633228840125, |
|
"loss": 2.6322, |
|
"step": 829 |
|
}, |
|
{ |
|
"epoch": 0.1564709209162032, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.0008765673981191223, |
|
"loss": 2.6958, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.15665944009802998, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 0.0008763714733542319, |
|
"loss": 2.614, |
|
"step": 831 |
|
}, |
|
{ |
|
"epoch": 0.15684795927985673, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.0008761755485893417, |
|
"loss": 2.61, |
|
"step": 832 |
|
}, |
|
{ |
|
"epoch": 0.15703647846168348, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.0008759796238244514, |
|
"loss": 2.6638, |
|
"step": 833 |
|
}, |
|
{ |
|
"epoch": 0.15722499764351022, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.0008757836990595611, |
|
"loss": 2.5964, |
|
"step": 834 |
|
}, |
|
{ |
|
"epoch": 0.15741351682533697, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.0008755877742946709, |
|
"loss": 2.6957, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.15760203600716374, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 0.0008753918495297806, |
|
"loss": 2.5888, |
|
"step": 836 |
|
}, |
|
{ |
|
"epoch": 0.1577905551889905, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.0008751959247648903, |
|
"loss": 2.644, |
|
"step": 837 |
|
}, |
|
{ |
|
"epoch": 0.15797907437081724, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.000875, |
|
"loss": 2.772, |
|
"step": 838 |
|
}, |
|
{ |
|
"epoch": 0.15816759355264398, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.0008748040752351098, |
|
"loss": 2.6483, |
|
"step": 839 |
|
}, |
|
{ |
|
"epoch": 0.15835611273447073, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.0008746081504702194, |
|
"loss": 2.5755, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.15854463191629747, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.0008744122257053292, |
|
"loss": 2.6584, |
|
"step": 841 |
|
}, |
|
{ |
|
"epoch": 0.15873315109812425, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.0008742163009404389, |
|
"loss": 2.6448, |
|
"step": 842 |
|
}, |
|
{ |
|
"epoch": 0.158921670279951, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.0008740203761755486, |
|
"loss": 2.6187, |
|
"step": 843 |
|
}, |
|
{ |
|
"epoch": 0.15911018946177774, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.0008738244514106584, |
|
"loss": 2.7043, |
|
"step": 844 |
|
}, |
|
{ |
|
"epoch": 0.1592987086436045, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.0008736285266457681, |
|
"loss": 2.5718, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.15948722782543123, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.0008734326018808778, |
|
"loss": 2.6218, |
|
"step": 846 |
|
}, |
|
{ |
|
"epoch": 0.15967574700725798, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 0.0008732366771159875, |
|
"loss": 2.7162, |
|
"step": 847 |
|
}, |
|
{ |
|
"epoch": 0.15986426618908475, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.0008730407523510972, |
|
"loss": 2.7002, |
|
"step": 848 |
|
}, |
|
{ |
|
"epoch": 0.1600527853709115, |
|
"grad_norm": 0.984375, |
|
"learning_rate": 0.0008728448275862068, |
|
"loss": 2.5591, |
|
"step": 849 |
|
}, |
|
{ |
|
"epoch": 0.16024130455273825, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.0008726489028213166, |
|
"loss": 2.68, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.160429823734565, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.0008724529780564263, |
|
"loss": 2.6753, |
|
"step": 851 |
|
}, |
|
{ |
|
"epoch": 0.16061834291639174, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.000872257053291536, |
|
"loss": 2.6676, |
|
"step": 852 |
|
}, |
|
{ |
|
"epoch": 0.16080686209821848, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 0.0008720611285266457, |
|
"loss": 2.6622, |
|
"step": 853 |
|
}, |
|
{ |
|
"epoch": 0.16099538128004526, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 0.0008718652037617555, |
|
"loss": 2.8021, |
|
"step": 854 |
|
}, |
|
{ |
|
"epoch": 0.161183900461872, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.0008716692789968653, |
|
"loss": 2.6823, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.16137241964369875, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 0.0008714733542319749, |
|
"loss": 2.6915, |
|
"step": 856 |
|
}, |
|
{ |
|
"epoch": 0.1615609388255255, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.0008712774294670847, |
|
"loss": 2.6373, |
|
"step": 857 |
|
}, |
|
{ |
|
"epoch": 0.16174945800735224, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.0008710815047021944, |
|
"loss": 2.6126, |
|
"step": 858 |
|
}, |
|
{ |
|
"epoch": 0.161937977189179, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.0008708855799373041, |
|
"loss": 2.544, |
|
"step": 859 |
|
}, |
|
{ |
|
"epoch": 0.16212649637100576, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.0008706896551724138, |
|
"loss": 2.5462, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.1623150155528325, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.0008704937304075236, |
|
"loss": 2.6703, |
|
"step": 861 |
|
}, |
|
{ |
|
"epoch": 0.16250353473465926, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 0.0008702978056426332, |
|
"loss": 2.606, |
|
"step": 862 |
|
}, |
|
{ |
|
"epoch": 0.162692053916486, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.000870101880877743, |
|
"loss": 2.5978, |
|
"step": 863 |
|
}, |
|
{ |
|
"epoch": 0.16288057309831275, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.0008699059561128528, |
|
"loss": 2.6027, |
|
"step": 864 |
|
}, |
|
{ |
|
"epoch": 0.1630690922801395, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 0.0008697100313479624, |
|
"loss": 2.654, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.16325761146196627, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 0.0008695141065830722, |
|
"loss": 2.4922, |
|
"step": 866 |
|
}, |
|
{ |
|
"epoch": 0.16344613064379301, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.0008693181818181818, |
|
"loss": 2.54, |
|
"step": 867 |
|
}, |
|
{ |
|
"epoch": 0.16363464982561976, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.0008691222570532915, |
|
"loss": 2.6802, |
|
"step": 868 |
|
}, |
|
{ |
|
"epoch": 0.1638231690074465, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 0.0008689263322884012, |
|
"loss": 2.6451, |
|
"step": 869 |
|
}, |
|
{ |
|
"epoch": 0.16401168818927325, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.000868730407523511, |
|
"loss": 2.6301, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.1642002073711, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.0008685344827586206, |
|
"loss": 2.6155, |
|
"step": 871 |
|
}, |
|
{ |
|
"epoch": 0.16438872655292677, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 0.0008683385579937304, |
|
"loss": 2.5815, |
|
"step": 872 |
|
}, |
|
{ |
|
"epoch": 0.16457724573475352, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.0008681426332288402, |
|
"loss": 2.6225, |
|
"step": 873 |
|
}, |
|
{ |
|
"epoch": 0.16476576491658027, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.0008679467084639498, |
|
"loss": 2.6562, |
|
"step": 874 |
|
}, |
|
{ |
|
"epoch": 0.164954284098407, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.0008677507836990596, |
|
"loss": 2.6155, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.16514280328023376, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.0008675548589341693, |
|
"loss": 2.6055, |
|
"step": 876 |
|
}, |
|
{ |
|
"epoch": 0.1653313224620605, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 0.000867358934169279, |
|
"loss": 2.637, |
|
"step": 877 |
|
}, |
|
{ |
|
"epoch": 0.16551984164388728, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.0008671630094043887, |
|
"loss": 2.6913, |
|
"step": 878 |
|
}, |
|
{ |
|
"epoch": 0.16570836082571402, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.0008669670846394985, |
|
"loss": 2.5859, |
|
"step": 879 |
|
}, |
|
{ |
|
"epoch": 0.16589688000754077, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.0008667711598746082, |
|
"loss": 2.7039, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.16608539918936752, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.0008665752351097179, |
|
"loss": 2.662, |
|
"step": 881 |
|
}, |
|
{ |
|
"epoch": 0.16627391837119426, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.0008663793103448277, |
|
"loss": 2.6888, |
|
"step": 882 |
|
}, |
|
{ |
|
"epoch": 0.166462437553021, |
|
"grad_norm": 0.625, |
|
"learning_rate": 0.0008661833855799374, |
|
"loss": 2.6346, |
|
"step": 883 |
|
}, |
|
{ |
|
"epoch": 0.16665095673484778, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.0008659874608150471, |
|
"loss": 2.6536, |
|
"step": 884 |
|
}, |
|
{ |
|
"epoch": 0.16683947591667453, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.0008657915360501567, |
|
"loss": 2.5995, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.16702799509850128, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.0008655956112852665, |
|
"loss": 2.7111, |
|
"step": 886 |
|
}, |
|
{ |
|
"epoch": 0.16721651428032802, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.0008653996865203761, |
|
"loss": 2.6629, |
|
"step": 887 |
|
}, |
|
{ |
|
"epoch": 0.16740503346215477, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.0008652037617554859, |
|
"loss": 2.5553, |
|
"step": 888 |
|
}, |
|
{ |
|
"epoch": 0.1675935526439815, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.0008650078369905956, |
|
"loss": 2.7594, |
|
"step": 889 |
|
}, |
|
{ |
|
"epoch": 0.1677820718258083, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.0008648119122257053, |
|
"loss": 2.6729, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.16797059100763503, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.000864615987460815, |
|
"loss": 2.7915, |
|
"step": 891 |
|
}, |
|
{ |
|
"epoch": 0.16815911018946178, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.0008644200626959248, |
|
"loss": 2.5758, |
|
"step": 892 |
|
}, |
|
{ |
|
"epoch": 0.16834762937128853, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.0008642241379310345, |
|
"loss": 2.5708, |
|
"step": 893 |
|
}, |
|
{ |
|
"epoch": 0.16853614855311527, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.0008640282131661442, |
|
"loss": 2.6254, |
|
"step": 894 |
|
}, |
|
{ |
|
"epoch": 0.16872466773494202, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.000863832288401254, |
|
"loss": 2.5748, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.1689131869167688, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.0008636363636363636, |
|
"loss": 2.7726, |
|
"step": 896 |
|
}, |
|
{ |
|
"epoch": 0.16910170609859554, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.0008634404388714734, |
|
"loss": 2.6658, |
|
"step": 897 |
|
}, |
|
{ |
|
"epoch": 0.16929022528042229, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 0.0008632445141065831, |
|
"loss": 2.6508, |
|
"step": 898 |
|
}, |
|
{ |
|
"epoch": 0.16947874446224903, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 0.0008630485893416928, |
|
"loss": 2.6266, |
|
"step": 899 |
|
}, |
|
{ |
|
"epoch": 0.16966726364407578, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.0008628526645768026, |
|
"loss": 2.7459, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.16985578282590252, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.0008626567398119123, |
|
"loss": 2.587, |
|
"step": 901 |
|
}, |
|
{ |
|
"epoch": 0.1700443020077293, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.000862460815047022, |
|
"loss": 2.6079, |
|
"step": 902 |
|
}, |
|
{ |
|
"epoch": 0.17023282118955604, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.0008622648902821317, |
|
"loss": 2.7354, |
|
"step": 903 |
|
}, |
|
{ |
|
"epoch": 0.1704213403713828, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 0.0008620689655172414, |
|
"loss": 2.5455, |
|
"step": 904 |
|
}, |
|
{ |
|
"epoch": 0.17060985955320954, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.000861873040752351, |
|
"loss": 2.6302, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.17079837873503628, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 0.0008616771159874608, |
|
"loss": 2.4757, |
|
"step": 906 |
|
}, |
|
{ |
|
"epoch": 0.17098689791686303, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.0008614811912225705, |
|
"loss": 2.6884, |
|
"step": 907 |
|
}, |
|
{ |
|
"epoch": 0.1711754170986898, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.0008612852664576803, |
|
"loss": 2.5877, |
|
"step": 908 |
|
}, |
|
{ |
|
"epoch": 0.17136393628051655, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.0008610893416927899, |
|
"loss": 2.6323, |
|
"step": 909 |
|
}, |
|
{ |
|
"epoch": 0.1715524554623433, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.0008608934169278997, |
|
"loss": 2.657, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.17174097464417004, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.0008606974921630095, |
|
"loss": 2.6591, |
|
"step": 911 |
|
}, |
|
{ |
|
"epoch": 0.1719294938259968, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.0008605015673981191, |
|
"loss": 2.6385, |
|
"step": 912 |
|
}, |
|
{ |
|
"epoch": 0.17211801300782353, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 0.0008603056426332289, |
|
"loss": 2.7853, |
|
"step": 913 |
|
}, |
|
{ |
|
"epoch": 0.1723065321896503, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.0008601097178683386, |
|
"loss": 2.7299, |
|
"step": 914 |
|
}, |
|
{ |
|
"epoch": 0.17249505137147705, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.0008599137931034483, |
|
"loss": 2.7032, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.1726835705533038, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.000859717868338558, |
|
"loss": 2.6568, |
|
"step": 916 |
|
}, |
|
{ |
|
"epoch": 0.17287208973513055, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.0008595219435736678, |
|
"loss": 2.8574, |
|
"step": 917 |
|
}, |
|
{ |
|
"epoch": 0.1730606089169573, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.0008593260188087774, |
|
"loss": 2.7283, |
|
"step": 918 |
|
}, |
|
{ |
|
"epoch": 0.17324912809878404, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.0008591300940438872, |
|
"loss": 2.712, |
|
"step": 919 |
|
}, |
|
{ |
|
"epoch": 0.1734376472806108, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.000858934169278997, |
|
"loss": 2.5807, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.17362616646243756, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.0008587382445141066, |
|
"loss": 2.6998, |
|
"step": 921 |
|
}, |
|
{ |
|
"epoch": 0.1738146856442643, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.0008585423197492164, |
|
"loss": 2.5734, |
|
"step": 922 |
|
}, |
|
{ |
|
"epoch": 0.17400320482609105, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.000858346394984326, |
|
"loss": 2.6374, |
|
"step": 923 |
|
}, |
|
{ |
|
"epoch": 0.1741917240079178, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 0.0008581504702194357, |
|
"loss": 2.5408, |
|
"step": 924 |
|
}, |
|
{ |
|
"epoch": 0.17438024318974454, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.0008579545454545454, |
|
"loss": 2.661, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.17456876237157132, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.0008577586206896552, |
|
"loss": 2.701, |
|
"step": 926 |
|
}, |
|
{ |
|
"epoch": 0.17475728155339806, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.0008575626959247648, |
|
"loss": 2.71, |
|
"step": 927 |
|
}, |
|
{ |
|
"epoch": 0.1749458007352248, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.0008573667711598746, |
|
"loss": 2.6468, |
|
"step": 928 |
|
}, |
|
{ |
|
"epoch": 0.17513431991705156, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 0.0008571708463949844, |
|
"loss": 2.5039, |
|
"step": 929 |
|
}, |
|
{ |
|
"epoch": 0.1753228390988783, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.000856974921630094, |
|
"loss": 2.6299, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.17551135828070505, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.0008567789968652038, |
|
"loss": 2.7097, |
|
"step": 931 |
|
}, |
|
{ |
|
"epoch": 0.17569987746253182, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.0008565830721003135, |
|
"loss": 2.662, |
|
"step": 932 |
|
}, |
|
{ |
|
"epoch": 0.17588839664435857, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.0008563871473354232, |
|
"loss": 2.6554, |
|
"step": 933 |
|
}, |
|
{ |
|
"epoch": 0.17607691582618532, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.0008561912225705329, |
|
"loss": 2.5157, |
|
"step": 934 |
|
}, |
|
{ |
|
"epoch": 0.17626543500801206, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.0008559952978056427, |
|
"loss": 2.658, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.1764539541898388, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.0008557993730407524, |
|
"loss": 2.5697, |
|
"step": 936 |
|
}, |
|
{ |
|
"epoch": 0.17664247337166555, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.0008556034482758621, |
|
"loss": 2.6467, |
|
"step": 937 |
|
}, |
|
{ |
|
"epoch": 0.17683099255349233, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.0008554075235109719, |
|
"loss": 2.7308, |
|
"step": 938 |
|
}, |
|
{ |
|
"epoch": 0.17701951173531907, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.0008552115987460816, |
|
"loss": 2.5552, |
|
"step": 939 |
|
}, |
|
{ |
|
"epoch": 0.17720803091714582, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.0008550156739811913, |
|
"loss": 2.7391, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.17739655009897257, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 0.000854819749216301, |
|
"loss": 2.5914, |
|
"step": 941 |
|
}, |
|
{ |
|
"epoch": 0.1775850692807993, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.0008546238244514107, |
|
"loss": 2.6652, |
|
"step": 942 |
|
}, |
|
{ |
|
"epoch": 0.17777358846262606, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.0008544278996865203, |
|
"loss": 2.7119, |
|
"step": 943 |
|
}, |
|
{ |
|
"epoch": 0.17796210764445283, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.0008542319749216301, |
|
"loss": 2.7099, |
|
"step": 944 |
|
}, |
|
{ |
|
"epoch": 0.17815062682627958, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.0008540360501567398, |
|
"loss": 2.6452, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.17833914600810633, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.0008538401253918495, |
|
"loss": 2.5148, |
|
"step": 946 |
|
}, |
|
{ |
|
"epoch": 0.17852766518993307, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.0008536442006269592, |
|
"loss": 2.6379, |
|
"step": 947 |
|
}, |
|
{ |
|
"epoch": 0.17871618437175982, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.000853448275862069, |
|
"loss": 2.6155, |
|
"step": 948 |
|
}, |
|
{ |
|
"epoch": 0.17890470355358656, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.0008532523510971787, |
|
"loss": 2.5122, |
|
"step": 949 |
|
}, |
|
{ |
|
"epoch": 0.17909322273541334, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.0008530564263322884, |
|
"loss": 2.7232, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.17928174191724008, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.0008528605015673982, |
|
"loss": 2.7196, |
|
"step": 951 |
|
}, |
|
{ |
|
"epoch": 0.17947026109906683, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.0008526645768025078, |
|
"loss": 2.5332, |
|
"step": 952 |
|
}, |
|
{ |
|
"epoch": 0.17965878028089358, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.0008524686520376176, |
|
"loss": 2.7149, |
|
"step": 953 |
|
}, |
|
{ |
|
"epoch": 0.17984729946272032, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.0008522727272727273, |
|
"loss": 2.7601, |
|
"step": 954 |
|
}, |
|
{ |
|
"epoch": 0.1800358186445471, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.000852076802507837, |
|
"loss": 2.6729, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.18022433782637384, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.0008518808777429467, |
|
"loss": 2.5701, |
|
"step": 956 |
|
}, |
|
{ |
|
"epoch": 0.1804128570082006, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.0008516849529780565, |
|
"loss": 2.6001, |
|
"step": 957 |
|
}, |
|
{ |
|
"epoch": 0.18060137619002734, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.0008514890282131662, |
|
"loss": 2.7079, |
|
"step": 958 |
|
}, |
|
{ |
|
"epoch": 0.18078989537185408, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.0008512931034482759, |
|
"loss": 2.6712, |
|
"step": 959 |
|
}, |
|
{ |
|
"epoch": 0.18097841455368083, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 0.0008510971786833856, |
|
"loss": 2.722, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.1811669337355076, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.0008509012539184952, |
|
"loss": 2.5951, |
|
"step": 961 |
|
}, |
|
{ |
|
"epoch": 0.18135545291733435, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.000850705329153605, |
|
"loss": 2.6927, |
|
"step": 962 |
|
}, |
|
{ |
|
"epoch": 0.1815439720991611, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.0008505094043887147, |
|
"loss": 2.6872, |
|
"step": 963 |
|
}, |
|
{ |
|
"epoch": 0.18173249128098784, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.0008503134796238245, |
|
"loss": 2.498, |
|
"step": 964 |
|
}, |
|
{ |
|
"epoch": 0.1819210104628146, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.0008501175548589341, |
|
"loss": 2.7254, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.18210952964464133, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.0008499216300940439, |
|
"loss": 2.6044, |
|
"step": 966 |
|
}, |
|
{ |
|
"epoch": 0.1822980488264681, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 0.0008497257053291537, |
|
"loss": 2.6202, |
|
"step": 967 |
|
}, |
|
{ |
|
"epoch": 0.18248656800829485, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.0008495297805642633, |
|
"loss": 2.6671, |
|
"step": 968 |
|
}, |
|
{ |
|
"epoch": 0.1826750871901216, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.0008493338557993731, |
|
"loss": 2.7925, |
|
"step": 969 |
|
}, |
|
{ |
|
"epoch": 0.18286360637194835, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.0008491379310344828, |
|
"loss": 2.5893, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.1830521255537751, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 0.0008489420062695925, |
|
"loss": 2.6704, |
|
"step": 971 |
|
}, |
|
{ |
|
"epoch": 0.18324064473560184, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.0008487460815047022, |
|
"loss": 2.6642, |
|
"step": 972 |
|
}, |
|
{ |
|
"epoch": 0.1834291639174286, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.000848550156739812, |
|
"loss": 2.6579, |
|
"step": 973 |
|
}, |
|
{ |
|
"epoch": 0.18361768309925536, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 0.0008483542319749216, |
|
"loss": 2.47, |
|
"step": 974 |
|
}, |
|
{ |
|
"epoch": 0.1838062022810821, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.0008481583072100314, |
|
"loss": 2.5976, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.18399472146290885, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.0008479623824451412, |
|
"loss": 2.7708, |
|
"step": 976 |
|
}, |
|
{ |
|
"epoch": 0.1841832406447356, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.0008477664576802508, |
|
"loss": 2.6823, |
|
"step": 977 |
|
}, |
|
{ |
|
"epoch": 0.18437175982656234, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 0.0008475705329153606, |
|
"loss": 2.6927, |
|
"step": 978 |
|
}, |
|
{ |
|
"epoch": 0.18456027900838912, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.0008473746081504702, |
|
"loss": 2.682, |
|
"step": 979 |
|
}, |
|
{ |
|
"epoch": 0.18474879819021586, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.0008471786833855799, |
|
"loss": 2.6509, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.1849373173720426, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.0008469827586206896, |
|
"loss": 2.7625, |
|
"step": 981 |
|
}, |
|
{ |
|
"epoch": 0.18512583655386936, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.0008467868338557994, |
|
"loss": 2.7144, |
|
"step": 982 |
|
}, |
|
{ |
|
"epoch": 0.1853143557356961, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.000846590909090909, |
|
"loss": 2.7065, |
|
"step": 983 |
|
}, |
|
{ |
|
"epoch": 0.18550287491752285, |
|
"grad_norm": 0.625, |
|
"learning_rate": 0.0008463949843260188, |
|
"loss": 2.5608, |
|
"step": 984 |
|
}, |
|
{ |
|
"epoch": 0.18569139409934962, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 0.0008461990595611286, |
|
"loss": 2.587, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.18587991328117637, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.0008460031347962382, |
|
"loss": 2.5719, |
|
"step": 986 |
|
}, |
|
{ |
|
"epoch": 0.18606843246300311, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.000845807210031348, |
|
"loss": 2.6672, |
|
"step": 987 |
|
}, |
|
{ |
|
"epoch": 0.18625695164482986, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.0008456112852664577, |
|
"loss": 2.6857, |
|
"step": 988 |
|
}, |
|
{ |
|
"epoch": 0.1864454708266566, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.0008454153605015674, |
|
"loss": 2.6708, |
|
"step": 989 |
|
}, |
|
{ |
|
"epoch": 0.18663399000848335, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.0008452194357366771, |
|
"loss": 2.6183, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.18682250919031013, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.0008450235109717869, |
|
"loss": 2.6452, |
|
"step": 991 |
|
}, |
|
{ |
|
"epoch": 0.18701102837213687, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 0.0008448275862068966, |
|
"loss": 2.704, |
|
"step": 992 |
|
}, |
|
{ |
|
"epoch": 0.18719954755396362, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.0008446316614420063, |
|
"loss": 2.674, |
|
"step": 993 |
|
}, |
|
{ |
|
"epoch": 0.18738806673579037, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.000844435736677116, |
|
"loss": 2.7137, |
|
"step": 994 |
|
}, |
|
{ |
|
"epoch": 0.1875765859176171, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.0008442398119122258, |
|
"loss": 2.7582, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.18776510509944386, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.0008440438871473355, |
|
"loss": 2.6009, |
|
"step": 996 |
|
}, |
|
{ |
|
"epoch": 0.18795362428127063, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.0008438479623824452, |
|
"loss": 2.6595, |
|
"step": 997 |
|
}, |
|
{ |
|
"epoch": 0.18814214346309738, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.0008436520376175549, |
|
"loss": 2.552, |
|
"step": 998 |
|
}, |
|
{ |
|
"epoch": 0.18833066264492412, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.0008434561128526645, |
|
"loss": 2.5806, |
|
"step": 999 |
|
}, |
|
{ |
|
"epoch": 0.18851918182675087, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.0008432601880877743, |
|
"loss": 2.6469, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.18851918182675087, |
|
"eval_runtime": 16.219, |
|
"eval_samples_per_second": 63.136, |
|
"eval_steps_per_second": 1.973, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.18851918182675087, |
|
"eval/hellaswag_acc": 0.3743278231428002, |
|
"eval/hellaswag_acc_norm": 0.4706233817964549, |
|
"eval_hellaswag_elapsed_time": 116.27660393714905, |
|
"step": 1000 |
|
} |
|
  ],
  "logging_steps": 1,
  "max_steps": 5304,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.325965577388032e+18,
  "train_batch_size": 12,
  "trial_name": null,
  "trial_params": null
}