{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.01,
  "eval_steps": 500,
  "global_step": 1000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 1e-05, "grad_norm": 1.6317715205474108, "learning_rate": 3e-06, "loss": 10.867, "step": 1 },
    { "epoch": 2e-05, "grad_norm": 1.6173147870740345, "learning_rate": 6e-06, "loss": 10.8657, "step": 2 },
    { "epoch": 3e-05, "grad_norm": 1.6387509359885835, "learning_rate": 9e-06, "loss": 10.8658, "step": 3 },
    { "epoch": 4e-05, "grad_norm": 1.597552357815991, "learning_rate": 1.2e-05, "loss": 10.865, "step": 4 },
    { "epoch": 5e-05, "grad_norm": 1.6454471252189307, "learning_rate": 1.5e-05, "loss": 10.8617, "step": 5 },
    { "epoch": 6e-05, "grad_norm": 1.6407925722175996, "learning_rate": 1.8e-05, "loss": 10.8593, "step": 6 },
    { "epoch": 7e-05, "grad_norm": 1.6096088910322361, "learning_rate": 2.1000000000000002e-05, "loss": 10.8456, "step": 7 },
    { "epoch": 8e-05, "grad_norm": 1.4682698515009915, "learning_rate": 2.4e-05, "loss": 10.8184, "step": 8 },
    { "epoch": 9e-05, "grad_norm": 1.3934246427009196, "learning_rate": 2.7e-05, "loss": 10.8113, "step": 9 },
    { "epoch": 0.0001, "grad_norm": 1.3326087040550991, "learning_rate": 3e-05, "loss": 10.7969, "step": 10 },
    { "epoch": 0.00011, "grad_norm": 1.2173090118888668, "learning_rate": 3.2999999999999996e-05, "loss": 10.7801, "step": 11 },
    { "epoch": 0.00012, "grad_norm": 1.176457724285593, "learning_rate": 3.6e-05, "loss": 10.7688, "step": 12 },
    { "epoch": 0.00013, "grad_norm": 1.1304424318539597, "learning_rate": 3.9e-05, "loss": 10.7498, "step": 13 },
    { "epoch": 0.00014, "grad_norm": 1.1158244568462428, "learning_rate": 4.2000000000000004e-05, "loss": 10.739, "step": 14 },
    { "epoch": 0.00015, "grad_norm": 1.10654756415174, "learning_rate": 4.4999999999999996e-05, "loss": 10.7299, "step": 15 },
    { "epoch": 0.00016, "grad_norm": 1.0751770608444569, "learning_rate": 4.8e-05, "loss": 10.7126, "step": 16 },
    { "epoch": 0.00017, "grad_norm": 1.046143788290158, "learning_rate": 5.1000000000000006e-05, "loss": 10.6968, "step": 17 },
    { "epoch": 0.00018, "grad_norm": 1.0230391412556632, "learning_rate": 5.4e-05, "loss": 10.6806, "step": 18 },
    { "epoch": 0.00019, "grad_norm": 0.9869982733638543, "learning_rate": 5.7e-05, "loss": 10.6649, "step": 19 },
    { "epoch": 0.0002, "grad_norm": 0.9728818553338922, "learning_rate": 6e-05, "loss": 10.6526, "step": 20 },
    { "epoch": 0.00021, "grad_norm": 0.9439994749998407, "learning_rate": 6.3e-05, "loss": 10.639, "step": 21 },
    { "epoch": 0.00022, "grad_norm": 0.9279528910342688, "learning_rate": 6.599999999999999e-05, "loss": 10.6244, "step": 22 },
    { "epoch": 0.00023, "grad_norm": 0.9208663519602571, "learning_rate": 6.9e-05, "loss": 10.6103, "step": 23 },
    { "epoch": 0.00024, "grad_norm": 0.9178551557561957, "learning_rate": 7.2e-05, "loss": 10.5993, "step": 24 },
    { "epoch": 0.00025, "grad_norm": 0.9193923250060233, "learning_rate": 7.500000000000001e-05, "loss": 10.5847, "step": 25 },
    { "epoch": 0.00026, "grad_norm": 0.9190901609677985, "learning_rate": 7.8e-05, "loss": 10.5717, "step": 26 },
    { "epoch": 0.00027, "grad_norm": 0.913753327244254, "learning_rate": 8.1e-05, "loss": 10.5597, "step": 27 },
    { "epoch": 0.00028, "grad_norm": 0.9119625217070774, "learning_rate": 8.400000000000001e-05, "loss": 10.5467, "step": 28 },
    { "epoch": 0.00029, "grad_norm": 0.9131038863398008, "learning_rate": 8.7e-05, "loss": 10.5323, "step": 29 },
    { "epoch": 0.0003, "grad_norm": 0.9186172829723749, "learning_rate": 8.999999999999999e-05, "loss": 10.517, "step": 30 },
    { "epoch": 0.00031, "grad_norm": 0.9155453723962563, "learning_rate": 9.3e-05, "loss": 10.5024, "step": 31 },
    { "epoch": 0.00032, "grad_norm": 0.909575589137279, "learning_rate": 9.6e-05, "loss": 10.4882, "step": 32 },
    { "epoch": 0.00033, "grad_norm": 0.9102175928717151, "learning_rate": 9.900000000000001e-05, "loss": 10.4717, "step": 33 },
    { "epoch": 0.00034, "grad_norm": 0.9103991171564593, "learning_rate": 0.00010200000000000001, "loss": 10.4554, "step": 34 },
    { "epoch": 0.00035, "grad_norm": 0.9164468459870767, "learning_rate": 0.00010500000000000002, "loss": 10.4388, "step": 35 },
    { "epoch": 0.00036, "grad_norm": 0.9129850089149896, "learning_rate": 0.000108, "loss": 10.4213, "step": 36 },
    { "epoch": 0.00037, "grad_norm": 0.9029808327462479, "learning_rate": 0.000111, "loss": 10.4047, "step": 37 },
    { "epoch": 0.00038, "grad_norm": 0.9074583652458479, "learning_rate": 0.000114, "loss": 10.3859, "step": 38 },
    { "epoch": 0.00039, "grad_norm": 0.9101621604618185, "learning_rate": 0.000117, "loss": 10.3665, "step": 39 },
    { "epoch": 0.0004, "grad_norm": 0.9144345472354501, "learning_rate": 0.00012, "loss": 10.3443, "step": 40 },
    { "epoch": 0.00041, "grad_norm": 0.9045315909874942, "learning_rate": 0.000123, "loss": 10.3264, "step": 41 },
    { "epoch": 0.00042, "grad_norm": 0.9101518170592343, "learning_rate": 0.000126, "loss": 10.3059, "step": 42 },
    { "epoch": 0.00043, "grad_norm": 0.9147491310031046, "learning_rate": 0.000129, "loss": 10.2831, "step": 43 },
    { "epoch": 0.00044, "grad_norm": 0.9138166723808987, "learning_rate": 0.00013199999999999998, "loss": 10.2617, "step": 44 },
    { "epoch": 0.00045, "grad_norm": 0.9166634386783579, "learning_rate": 0.000135, "loss": 10.2388, "step": 45 },
    { "epoch": 0.00046, "grad_norm": 0.9061319871747918, "learning_rate": 0.000138, "loss": 10.2171, "step": 46 },
    { "epoch": 0.00047, "grad_norm": 0.9134144094551727, "learning_rate": 0.000141, "loss": 10.1906, "step": 47 },
    { "epoch": 0.00048, "grad_norm": 0.9111015850262806, "learning_rate": 0.000144, "loss": 10.1669, "step": 48 },
    { "epoch": 0.00049, "grad_norm": 0.9106086112424904, "learning_rate": 0.000147, "loss": 10.1439, "step": 49 },
    { "epoch": 0.0005, "grad_norm": 0.9135108001899231, "learning_rate": 0.00015000000000000001, "loss": 10.1178, "step": 50 },
    { "epoch": 0.00051, "grad_norm": 0.9152031108656089, "learning_rate": 0.000153, "loss": 10.0918, "step": 51 },
    { "epoch": 0.00052, "grad_norm": 0.9133043896843657, "learning_rate": 0.000156, "loss": 10.0658, "step": 52 },
    { "epoch": 0.00053, "grad_norm": 0.9039676544194273, "learning_rate": 0.000159, "loss": 10.0419, "step": 53 },
    { "epoch": 0.00054, "grad_norm": 0.9245050218484777, "learning_rate": 0.000162, "loss": 10.0099, "step": 54 },
    { "epoch": 0.00055, "grad_norm": 0.915653013423474, "learning_rate": 0.000165, "loss": 9.9858, "step": 55 },
    { "epoch": 0.00056, "grad_norm": 0.90743999026624, "learning_rate": 0.00016800000000000002, "loss": 9.9567, "step": 56 },
    { "epoch": 0.00057, "grad_norm": 0.9125740935300273, "learning_rate": 0.000171, "loss": 9.9292, "step": 57 },
    { "epoch": 0.00058, "grad_norm": 0.9103134473221595, "learning_rate": 0.000174, "loss": 9.9046, "step": 58 },
    { "epoch": 0.00059, "grad_norm": 0.91502039796166, "learning_rate": 0.000177, "loss": 9.8727, "step": 59 },
    { "epoch": 0.0006, "grad_norm": 0.902549468432534, "learning_rate": 0.00017999999999999998, "loss": 9.8467, "step": 60 },
    { "epoch": 0.00061, "grad_norm": 0.9011753674575653, "learning_rate": 0.000183, "loss": 9.8184, "step": 61 },
    { "epoch": 0.00062, "grad_norm": 0.9073829944096542, "learning_rate": 0.000186, "loss": 9.7865, "step": 62 },
    { "epoch": 0.00063, "grad_norm": 0.900277782228101, "learning_rate": 0.000189, "loss": 9.7594, "step": 63 },
    { "epoch": 0.00064, "grad_norm": 0.8964836174343672, "learning_rate": 0.000192, "loss": 9.7292, "step": 64 },
    { "epoch": 0.00065, "grad_norm": 0.9040253232587138, "learning_rate": 0.00019500000000000002, "loss": 9.6969, "step": 65 },
    { "epoch": 0.00066, "grad_norm": 0.8969700410935529, "learning_rate": 0.00019800000000000002, "loss": 9.6728, "step": 66 },
    { "epoch": 0.00067, "grad_norm": 0.8981910064021, "learning_rate": 0.000201, "loss": 9.643, "step": 67 },
    { "epoch": 0.00068, "grad_norm": 0.9049113812894196, "learning_rate": 0.00020400000000000003, "loss": 9.61, "step": 68 },
    { "epoch": 0.00069, "grad_norm": 0.8903730719674341, "learning_rate": 0.00020700000000000002, "loss": 9.5824, "step": 69 },
    { "epoch": 0.0007, "grad_norm": 0.9034192157313848, "learning_rate": 0.00021000000000000004, "loss": 9.5456, "step": 70 },
    { "epoch": 0.00071, "grad_norm": 0.8933343163190056, "learning_rate": 0.00021299999999999997, "loss": 9.5189, "step": 71 },
    { "epoch": 0.00072, "grad_norm": 0.8968593008835964, "learning_rate": 0.000216, "loss": 9.4914, "step": 72 },
    { "epoch": 0.00073, "grad_norm": 0.8960167287531013, "learning_rate": 0.00021899999999999998, "loss": 9.4574, "step": 73 },
    { "epoch": 0.00074, "grad_norm": 0.8965354004907367, "learning_rate": 0.000222, "loss": 9.4304, "step": 74 },
    { "epoch": 0.00075, "grad_norm": 0.8923958382533757, "learning_rate": 0.000225, "loss": 9.3946, "step": 75 },
    { "epoch": 0.00076, "grad_norm": 0.9011054851249423, "learning_rate": 0.000228, "loss": 9.3667, "step": 76 },
    { "epoch": 0.00077, "grad_norm": 0.8959973023769764, "learning_rate": 0.000231, "loss": 9.3374, "step": 77 },
    { "epoch": 0.00078, "grad_norm": 0.8901362352309407, "learning_rate": 0.000234, "loss": 9.3055, "step": 78 },
    { "epoch": 0.00079, "grad_norm": 0.8931237127830396, "learning_rate": 0.00023700000000000001, "loss": 9.2694, "step": 79 },
    { "epoch": 0.0008, "grad_norm": 0.8937972757596185, "learning_rate": 0.00024, "loss": 9.2355, "step": 80 },
    { "epoch": 0.00081, "grad_norm": 0.8984186266430717, "learning_rate": 0.00024300000000000002, "loss": 9.2033, "step": 81 },
    { "epoch": 0.00082, "grad_norm": 0.9060711515558022, "learning_rate": 0.000246, "loss": 9.1789, "step": 82 },
    { "epoch": 0.00083, "grad_norm": 0.8964035250431631, "learning_rate": 0.00024900000000000004, "loss": 9.1434, "step": 83 },
    { "epoch": 0.00084, "grad_norm": 0.8986377804082708, "learning_rate": 0.000252, "loss": 9.1119, "step": 84 },
    { "epoch": 0.00085, "grad_norm": 0.8933390971667627, "learning_rate": 0.000255, "loss": 9.0885, "step": 85 },
    { "epoch": 0.00086, "grad_norm": 0.8953786211031305, "learning_rate": 0.000258, "loss": 9.0521, "step": 86 },
    { "epoch": 0.00087, "grad_norm": 0.8876773666743288, "learning_rate": 0.000261, "loss": 9.0302, "step": 87 },
    { "epoch": 0.00088, "grad_norm": 0.8929152197347487, "learning_rate": 0.00026399999999999997, "loss": 8.9927, "step": 88 },
    { "epoch": 0.00089, "grad_norm": 0.8872457913370299, "learning_rate": 0.000267, "loss": 8.9669, "step": 89 },
    { "epoch": 0.0009, "grad_norm": 0.8862509419810688, "learning_rate": 0.00027, "loss": 8.9393, "step": 90 },
    { "epoch": 0.00091, "grad_norm": 0.8807880266520192, "learning_rate": 0.000273, "loss": 8.9089, "step": 91 },
    { "epoch": 0.00092, "grad_norm": 0.8831207589668301, "learning_rate": 0.000276, "loss": 8.876, "step": 92 },
    { "epoch": 0.00093, "grad_norm": 0.883866988348631, "learning_rate": 0.000279, "loss": 8.8462, "step": 93 },
    { "epoch": 0.00094, "grad_norm": 0.8824853161719922, "learning_rate": 0.000282, "loss": 8.8199, "step": 94 },
    { "epoch": 0.00095, "grad_norm": 0.8794572899807177, "learning_rate": 0.000285, "loss": 8.789, "step": 95 },
    { "epoch": 0.00096, "grad_norm": 0.8879520784944948, "learning_rate": 0.000288, "loss": 8.7571, "step": 96 },
    { "epoch": 0.00097, "grad_norm": 0.8798042444972031, "learning_rate": 0.000291, "loss": 8.7321, "step": 97 },
    { "epoch": 0.00098, "grad_norm": 0.8802597894834375, "learning_rate": 0.000294, "loss": 8.6954, "step": 98 },
    { "epoch": 0.00099, "grad_norm": 0.8806466323910314, "learning_rate": 0.000297, "loss": 8.6749, "step": 99 },
    { "epoch": 0.001, "grad_norm": 0.8777097574069823, "learning_rate": 0.00030000000000000003, "loss": 8.6485, "step": 100 },
    { "epoch": 0.00101, "grad_norm": 0.8786755147609817, "learning_rate": 0.00030300000000000005, "loss": 8.614, "step": 101 },
    { "epoch": 0.00102, "grad_norm": 0.8680143868447665, "learning_rate": 0.000306, "loss": 8.5949, "step": 102 },
    { "epoch": 0.00103, "grad_norm": 0.8748449452730288, "learning_rate": 0.000309, "loss": 8.5706, "step": 103 },
    { "epoch": 0.00104, "grad_norm": 0.8670215859388973, "learning_rate": 0.000312, "loss": 8.5498, "step": 104 },
    { "epoch": 0.00105, "grad_norm": 0.8687292008731472, "learning_rate": 0.000315, "loss": 8.5231, "step": 105 },
    { "epoch": 0.00106, "grad_norm": 0.8611486197845404, "learning_rate": 0.000318, "loss": 8.4945, "step": 106 },
    { "epoch": 0.00107, "grad_norm": 0.8521924348129856, "learning_rate": 0.000321, "loss": 8.4693, "step": 107 },
    { "epoch": 0.00108, "grad_norm": 0.8581933475380797, "learning_rate": 0.000324, "loss": 8.4407, "step": 108 },
    { "epoch": 0.00109, "grad_norm": 0.8524146875206363, "learning_rate": 0.000327, "loss": 8.421, "step": 109 },
    { "epoch": 0.0011, "grad_norm": 0.8682563584613229, "learning_rate": 0.00033, "loss": 8.3983, "step": 110 },
    { "epoch": 0.00111, "grad_norm": 0.8871469803064874, "learning_rate": 0.000333, "loss": 8.3685, "step": 111 },
    { "epoch": 0.00112, "grad_norm": 0.9236879668329372, "learning_rate": 0.00033600000000000004, "loss": 8.3463, "step": 112 },
    { "epoch": 0.00113, "grad_norm": 0.9129864456504505, "learning_rate": 0.000339, "loss": 8.3063, "step": 113 },
    { "epoch": 0.00114, "grad_norm": 0.8352263501003522, "learning_rate": 0.000342, "loss": 8.2966, "step": 114 },
    { "epoch": 0.00115, "grad_norm": 0.8592375580090957, "learning_rate": 0.00034500000000000004, "loss": 8.2718, "step": 115 },
    { "epoch": 0.00116, "grad_norm": 0.8674294753896091, "learning_rate": 0.000348, "loss": 8.2506, "step": 116 },
    { "epoch": 0.00117, "grad_norm": 0.8276917493567776, "learning_rate": 0.000351, "loss": 8.2188, "step": 117 },
    { "epoch": 0.00118, "grad_norm": 0.8476744963131545, "learning_rate": 0.000354, "loss": 8.2045, "step": 118 },
    { "epoch": 0.00119, "grad_norm": 0.844358071644388, "learning_rate": 0.000357, "loss": 8.1926, "step": 119 },
    { "epoch": 0.0012, "grad_norm": 0.8166594269287538, "learning_rate": 0.00035999999999999997, "loss": 8.1658, "step": 120 },
    { "epoch": 0.00121, "grad_norm": 0.8239930081024902, "learning_rate": 0.000363, "loss": 8.1389, "step": 121 },
    { "epoch": 0.00122, "grad_norm": 0.8099951959348987, "learning_rate": 0.000366, "loss": 8.1225, "step": 122 },
    { "epoch": 0.00123, "grad_norm": 0.830800320388625, "learning_rate": 0.000369, "loss": 8.1005, "step": 123 },
    { "epoch": 0.00124, "grad_norm": 0.8139169053139192, "learning_rate": 0.000372, "loss": 8.0791, "step": 124 },
    { "epoch": 0.00125, "grad_norm": 0.8112246790149765, "learning_rate": 0.000375, "loss": 8.0547, "step": 125 },
    { "epoch": 0.00126, "grad_norm": 0.7922278873371895, "learning_rate": 0.000378, "loss": 8.0424, "step": 126 },
    { "epoch": 0.00127, "grad_norm": 0.7755075943975184, "learning_rate": 0.000381, "loss": 8.0182, "step": 127 },
    { "epoch": 0.00128, "grad_norm": 0.8028212720713388, "learning_rate": 0.000384, "loss": 8.0, "step": 128 },
    { "epoch": 0.00129, "grad_norm": 0.8765129391436198, "learning_rate": 0.00038700000000000003, "loss": 7.9904, "step": 129 },
    { "epoch": 0.0013, "grad_norm": 1.1486399983200042, "learning_rate": 0.00039000000000000005, "loss": 7.9724, "step": 130 },
    { "epoch": 0.00131, "grad_norm": 1.0507777578095836, "learning_rate": 0.000393, "loss": 7.9382, "step": 131 },
    { "epoch": 0.00132, "grad_norm": 0.7511305165281239, "learning_rate": 0.00039600000000000003, "loss": 7.9248, "step": 132 },
    { "epoch": 0.00133, "grad_norm": 0.8108077692812641, "learning_rate": 0.00039900000000000005, "loss": 7.906, "step": 133 },
    { "epoch": 0.00134, "grad_norm": 0.8301313874662418, "learning_rate": 0.000402, "loss": 7.8952, "step": 134 },
    { "epoch": 0.00135, "grad_norm": 0.7123568807247732, "learning_rate": 0.00040500000000000003, "loss": 7.8651, "step": 135 },
    { "epoch": 0.00136, "grad_norm": 0.7697556904537746, "learning_rate": 0.00040800000000000005, "loss": 7.8515, "step": 136 },
    { "epoch": 0.00137, "grad_norm": 0.7190977621725152, "learning_rate": 0.000411, "loss": 7.8299, "step": 137 },
    { "epoch": 0.00138, "grad_norm": 0.7147305335216294, "learning_rate": 0.00041400000000000003, "loss": 7.8066, "step": 138 },
    { "epoch": 0.00139, "grad_norm": 0.747529428120578, "learning_rate": 0.00041700000000000005, "loss": 7.7954, "step": 139 },
    { "epoch": 0.0014, "grad_norm": 0.6748198408281931, "learning_rate": 0.00042000000000000007, "loss": 7.7774, "step": 140 },
    { "epoch": 0.00141, "grad_norm": 0.662142968172009, "learning_rate": 0.000423, "loss": 7.7644, "step": 141 },
    { "epoch": 0.00142, "grad_norm": 0.6859796391897652, "learning_rate": 0.00042599999999999995, "loss": 7.7534, "step": 142 },
    { "epoch": 0.00143, "grad_norm": 0.6828514310354903, "learning_rate": 0.00042899999999999997, "loss": 7.7255, "step": 143 },
    { "epoch": 0.00144, "grad_norm": 0.6490687938821236, "learning_rate": 0.000432, "loss": 7.7078, "step": 144 },
    { "epoch": 0.00145, "grad_norm": 0.6453156151137228, "learning_rate": 0.000435, "loss": 7.7035, "step": 145 },
    { "epoch": 0.00146, "grad_norm": 0.7166050341593803, "learning_rate": 0.00043799999999999997, "loss": 7.6672, "step": 146 },
    { "epoch": 0.00147, "grad_norm": 0.7443563124698165, "learning_rate": 0.000441, "loss": 7.6627, "step": 147 },
    { "epoch": 0.00148, "grad_norm": 0.7124453009383569, "learning_rate": 0.000444, "loss": 7.648, "step": 148 },
    { "epoch": 0.00149, "grad_norm": 0.6775306354557482, "learning_rate": 0.00044699999999999997, "loss": 7.6419, "step": 149 },
    { "epoch": 0.0015, "grad_norm": 0.8906380813028638, "learning_rate": 0.00045, "loss": 7.6135, "step": 150 },
    { "epoch": 0.00151, "grad_norm": 1.2892607147030477, "learning_rate": 0.000453, "loss": 7.605, "step": 151 },
    { "epoch": 0.00152, "grad_norm": 0.6492280537852009, "learning_rate": 0.000456, "loss": 7.5875, "step": 152 },
    { "epoch": 0.00153, "grad_norm": 0.8226748559002907, "learning_rate": 0.000459, "loss": 7.5783, "step": 153 },
    { "epoch": 0.00154, "grad_norm": 1.073498340899344, "learning_rate": 0.000462, "loss": 7.5662, "step": 154 },
    { "epoch": 0.00155, "grad_norm": 0.772416311968319, "learning_rate": 0.000465, "loss": 7.5481, "step": 155 },
    { "epoch": 0.00156, "grad_norm": 0.7999364848474875, "learning_rate": 0.000468, "loss": 7.5209, "step": 156 },
    { "epoch": 0.00157, "grad_norm": 0.9435570004081977, "learning_rate": 0.000471, "loss": 7.5196, "step": 157 },
    { "epoch": 0.00158, "grad_norm": 0.8364296006508578, "learning_rate": 0.00047400000000000003, "loss": 7.4997, "step": 158 },
    { "epoch": 0.00159, "grad_norm": 0.5596750978008624, "learning_rate": 0.000477, "loss": 7.4825, "step": 159 },
    { "epoch": 0.0016, "grad_norm": 0.5748856794890025, "learning_rate": 0.00048, "loss": 7.4616, "step": 160 },
    { "epoch": 0.00161, "grad_norm": 0.5986411025981033, "learning_rate": 0.00048300000000000003, "loss": 7.4419, "step": 161 },
    { "epoch": 0.00162, "grad_norm": 0.5117747222458712, "learning_rate": 0.00048600000000000005, "loss": 7.4429, "step": 162 },
    { "epoch": 0.00163, "grad_norm": 0.5509815158764758, "learning_rate": 0.0004890000000000001, "loss": 7.4259, "step": 163 },
    { "epoch": 0.00164, "grad_norm": 0.46393808675226217, "learning_rate": 0.000492, "loss": 7.4236, "step": 164 },
    { "epoch": 0.00165, "grad_norm": 0.5390679177469344, "learning_rate": 0.000495, "loss": 7.4006, "step": 165 },
    { "epoch": 0.00166, "grad_norm": 0.645354949444588, "learning_rate": 0.0004980000000000001, "loss": 7.3773, "step": 166 },
    { "epoch": 0.00167, "grad_norm": 1.0877276724000633, "learning_rate": 0.000501, "loss": 7.3741, "step": 167 },
    { "epoch": 0.00168, "grad_norm": 1.0909247587015876, "learning_rate": 0.000504, "loss": 7.3697, "step": 168 },
    { "epoch": 0.00169, "grad_norm": 0.5084368948335112, "learning_rate": 0.0005070000000000001, "loss": 7.3418, "step": 169 },
    { "epoch": 0.0017, "grad_norm": 1.5053615947271437, "learning_rate": 0.00051, "loss": 7.3492, "step": 170 },
    { "epoch": 0.00171, "grad_norm": 0.6120595065498764, "learning_rate": 0.000513, "loss": 7.3094, "step": 171 },
    { "epoch": 0.00172, "grad_norm": 0.9401489103136018, "learning_rate": 0.000516, "loss": 7.3189, "step": 172 },
    { "epoch": 0.00173, "grad_norm": 0.5558816443971099, "learning_rate": 0.0005189999999999999, "loss": 7.2906, "step": 173 },
    { "epoch": 0.00174, "grad_norm": 0.7810658223347703, "learning_rate": 0.000522, "loss": 7.314, "step": 174 },
    { "epoch": 0.00175, "grad_norm": 0.5409187817835382, "learning_rate": 0.000525, "loss": 7.2727, "step": 175 },
    { "epoch": 0.00176, "grad_norm": 0.630636606756854, "learning_rate": 0.0005279999999999999, "loss": 7.2484, "step": 176 },
    { "epoch": 0.00177, "grad_norm": 0.5206138592215499, "learning_rate": 0.000531, "loss": 7.2507, "step": 177 },
    { "epoch": 0.00178, "grad_norm": 0.5577166707239251, "learning_rate": 0.000534, "loss": 7.2428, "step": 178 },
    { "epoch": 0.00179, "grad_norm": 0.4980094016420717, "learning_rate": 0.000537, "loss": 7.2333, "step": 179 },
    { "epoch": 0.0018, "grad_norm": 0.5436132024149352, "learning_rate": 0.00054, "loss": 7.2053, "step": 180 },
    { "epoch": 0.00181, "grad_norm": 0.4850436660416719, "learning_rate": 0.000543, "loss": 7.1915, "step": 181 },
    { "epoch": 0.00182, "grad_norm": 0.686701132410735, "learning_rate": 0.000546, "loss": 7.1909, "step": 182 },
    { "epoch": 0.00183, "grad_norm": 0.68400997444651, "learning_rate": 0.000549, "loss": 7.1802, "step": 183 },
    { "epoch": 0.00184, "grad_norm": 0.8396027257351396, "learning_rate": 0.000552, "loss": 7.171, "step": 184 },
    { "epoch": 0.00185, "grad_norm": 0.7665365037410753, "learning_rate": 0.000555, "loss": 7.1476, "step": 185 },
    { "epoch": 0.00186, "grad_norm": 0.5359175667027454, "learning_rate": 0.000558, "loss": 7.1506, "step": 186 },
    { "epoch": 0.00187, "grad_norm": 0.5513800735908173, "learning_rate": 0.000561, "loss": 7.1261, "step": 187 },
    { "epoch": 0.00188, "grad_norm": 0.5352093004255375, "learning_rate": 0.000564, "loss": 7.1044, "step": 188 },
    { "epoch": 0.00189, "grad_norm": 0.5938457818726526, "learning_rate": 0.000567, "loss": 7.1004, "step": 189 },
    { "epoch": 0.0019, "grad_norm": 0.5273842405001533, "learning_rate": 0.00057, "loss": 7.0834, "step": 190 },
    { "epoch": 0.00191, "grad_norm": 0.47487115515279366, "learning_rate": 0.000573, "loss": 7.0721, "step": 191 },
    { "epoch": 0.00192, "grad_norm": 0.6036837698051599, "learning_rate": 0.000576, "loss": 7.0655, "step": 192 },
    { "epoch": 0.00193, "grad_norm": 0.3951184212196986, "learning_rate": 0.000579, "loss": 7.061, "step": 193 },
    { "epoch": 0.00194, "grad_norm": 0.442083567688087, "learning_rate": 0.000582, "loss": 7.0548, "step": 194 },
    { "epoch": 0.00195, "grad_norm": 0.439546882682468, "learning_rate": 0.000585, "loss": 7.0348, "step": 195 },
    { "epoch": 0.00196, "grad_norm": 0.46247531692771043, "learning_rate": 0.000588, "loss": 7.0228, "step": 196 },
    { "epoch": 0.00197, "grad_norm": 0.4140335072217301, "learning_rate": 0.000591, "loss": 7.0171, "step": 197 },
    { "epoch": 0.00198, "grad_norm": 0.3685986320410548, "learning_rate": 0.000594, "loss": 7.0081, "step": 198 },
    { "epoch": 0.00199, "grad_norm": 0.4020373564129086, "learning_rate": 0.0005970000000000001, "loss": 6.9898, "step": 199 },
    { "epoch": 0.002, "grad_norm": 0.37126410475941546, "learning_rate": 0.0006000000000000001, "loss": 6.9867, "step": 200 },
    { "epoch": 0.00201, "grad_norm": 0.3773154493828028, "learning_rate": 0.000603, "loss": 6.9617, "step": 201 },
    { "epoch": 0.00202, "grad_norm": 0.3540017416986532, "learning_rate": 0.0006060000000000001, "loss": 6.9491, "step": 202 },
    { "epoch": 0.00203, "grad_norm": 0.403279648640721, "learning_rate": 0.0006090000000000001, "loss": 6.9534, "step": 203 },
    { "epoch": 0.00204, "grad_norm": 0.5112949618253247, "learning_rate": 0.000612, "loss": 6.9385, "step": 204 },
    { "epoch": 0.00205, "grad_norm": 0.7200998739972175, "learning_rate": 0.000615, "loss": 6.931, "step": 205 },
    { "epoch": 0.00206, "grad_norm": 1.209379808685074, "learning_rate": 0.000618, "loss": 6.9351, "step": 206 },
    { "epoch": 0.00207, "grad_norm": 1.0040442357645134, "learning_rate": 0.000621, "loss": 6.9279, "step": 207 },
    { "epoch": 0.00208, "grad_norm": 0.7878464521205251, "learning_rate": 0.000624, "loss": 6.9129, "step": 208 },
    { "epoch": 0.00209, "grad_norm": 1.3096135062236434, "learning_rate": 0.000627, "loss": 6.9067, "step": 209 },
    { "epoch": 0.0021, "grad_norm": 0.910504669978176, "learning_rate": 0.00063, "loss": 6.8878, "step": 210 },
    { "epoch": 0.00211, "grad_norm": 1.3015817966038044, "learning_rate": 0.000633, "loss": 6.8987, "step": 211 },
    { "epoch": 0.00212, "grad_norm": 0.5587575104994011, "learning_rate": 0.000636, "loss": 6.8696, "step": 212 },
    { "epoch": 0.00213, "grad_norm": 0.8081412049208773, "learning_rate": 0.000639, "loss": 6.8608, "step": 213 },
    { "epoch": 0.00214, "grad_norm": 0.6397075273457759, "learning_rate": 0.000642, "loss": 6.8511, "step": 214 },
    { "epoch": 0.00215, "grad_norm": 0.5875043250740225, "learning_rate": 0.000645, "loss": 6.8482, "step": 215 },
    { "epoch": 0.00216, "grad_norm": 0.6060774535893669, "learning_rate": 0.000648, "loss": 6.8413, "step": 216 },
    { "epoch": 0.00217, "grad_norm": 0.5183751970166313, "learning_rate": 0.000651, "loss": 6.8119, "step": 217 },
    { "epoch": 0.00218, "grad_norm": 0.5539011900924167, "learning_rate": 0.000654, "loss": 6.8171, "step": 218 },
    { "epoch": 0.00219, "grad_norm": 0.5004312163301685, "learning_rate": 0.000657, "loss": 6.7882, "step": 219 },
    { "epoch": 0.0022, "grad_norm": 0.4762494747244133, "learning_rate": 0.00066, "loss": 6.8062, "step": 220 },
    { "epoch": 0.00221, "grad_norm": 0.353452768224107, "learning_rate": 0.0006630000000000001, "loss": 6.7814, "step": 221 },
    { "epoch": 0.00222, "grad_norm": 0.3879096289808107, "learning_rate": 0.000666, "loss": 6.7696, "step": 222 },
    { "epoch": 0.00223, "grad_norm": 0.4253518811476648, "learning_rate": 0.000669, "loss": 6.7658, "step": 223 },
    { "epoch": 0.00224, "grad_norm": 0.4076338977034843, "learning_rate": 0.0006720000000000001, "loss": 6.7609, "step": 224 },
    { "epoch": 0.00225, "grad_norm": 0.4491881195721512, "learning_rate": 0.000675, "loss": 6.7489, "step": 225 },
    { "epoch": 0.00226, "grad_norm": 0.4195112098951784, "learning_rate": 0.000678, "loss": 6.7444, "step": 226 },
    { "epoch": 0.00227, "grad_norm": 0.371663908330708, "learning_rate": 0.0006810000000000001, "loss": 6.7174, "step": 227 },
    { "epoch": 0.00228, "grad_norm": 0.3462952066328263, "learning_rate": 0.000684, "loss": 6.7197, "step": 228 },
    { "epoch": 0.00229, "grad_norm": 0.5288966810878937, "learning_rate": 0.000687, "loss": 6.7178, "step": 229 },
    { "epoch": 0.0023, "grad_norm": 0.7623592789505088, "learning_rate": 0.0006900000000000001, "loss": 6.6993, "step": 230 },
    { "epoch": 0.00231, "grad_norm": 0.9880780315432149, "learning_rate": 0.000693, "loss": 6.6923, "step": 231 },
    { "epoch": 0.00232, "grad_norm": 1.028691756937799, "learning_rate": 0.000696, "loss": 6.7142, "step": 232 },
    { "epoch": 0.00233, "grad_norm": 0.9501761132787978, "learning_rate": 0.0006990000000000001, "loss": 6.6946, "step": 233 },
    { "epoch": 0.00234, "grad_norm": 0.9999298841530961, "learning_rate": 0.000702, "loss": 6.6948, "step": 234 },
    { "epoch": 0.00235, "grad_norm": 0.6945930996245869, "learning_rate": 0.000705, "loss": 6.6675, "step": 235 },
    { "epoch": 0.00236, "grad_norm": 0.6422608041910052, "learning_rate": 0.000708, "loss": 6.6513, "step": 236 },
    { "epoch": 0.00237, "grad_norm": 0.47985797682797676, "learning_rate": 0.0007109999999999999, "loss": 6.657, "step": 237 },
    { "epoch": 0.00238, "grad_norm": 0.6191094472441181, "learning_rate": 0.000714, "loss": 6.6482, "step": 238 },
    { "epoch": 0.00239, "grad_norm": 0.5181523072026278, "learning_rate": 0.000717, "loss": 6.628, "step": 239 },
    { "epoch": 0.0024, "grad_norm": 0.5894783578801835, "learning_rate": 0.0007199999999999999, "loss": 6.645, "step": 240 },
    { "epoch": 0.00241, "grad_norm": 0.5639698176641863, "learning_rate": 0.000723, "loss": 6.6279, "step": 241 },
    { "epoch": 0.00242, "grad_norm": 0.9095170652542525, "learning_rate": 0.000726, "loss": 6.6087, "step": 242 },
    { "epoch": 0.00243, "grad_norm": 1.3373514416355459, "learning_rate": 0.000729, "loss": 6.6191, "step": 243 },
    { "epoch": 0.00244, "grad_norm": 0.5837452319331187, "learning_rate": 0.000732, "loss": 6.5991, "step": 244 },
    { "epoch": 0.00245, "grad_norm": 1.0261084178764917, "learning_rate": 0.000735, "loss": 6.6035, "step": 245 },
    { "epoch": 0.00246, "grad_norm": 0.5663979442820745, "learning_rate": 0.000738, "loss": 6.574, "step": 246 },
    { "epoch": 0.00247, "grad_norm": 0.5896636858778472, "learning_rate": 0.000741, "loss": 6.5719, "step": 247 },
    { "epoch": 0.00248, "grad_norm": 0.47326080430149503, "learning_rate": 0.000744, "loss": 6.5757, "step": 248 },
    { "epoch": 0.00249, "grad_norm": 0.6248164732961499, "learning_rate": 0.000747, "loss": 6.5582, "step": 249 },
    { "epoch": 0.0025, "grad_norm": 0.48362025373458484, "learning_rate": 0.00075, "loss": 6.5567, "step": 250 },
    { "epoch": 0.00251, "grad_norm": 0.4858269118610639, "learning_rate": 0.000753, "loss": 6.5227, "step": 251 },
    { "epoch": 0.00252, "grad_norm": 0.41862369731289734, "learning_rate": 0.000756, "loss": 6.5302, "step": 252 },
    { "epoch": 0.00253, "grad_norm": 0.47200180330590646, "learning_rate": 0.000759, "loss": 6.522, "step": 253 },
    { "epoch": 0.00254, "grad_norm": 0.3651546358356223, "learning_rate": 0.000762, "loss": 6.5136, "step": 254 },
    { "epoch": 0.00255, "grad_norm": 0.4285192093499382, "learning_rate": 0.0007650000000000001, "loss": 6.5149, "step": 255 },
    { "epoch": 0.00256, "grad_norm": 0.42767490819993825, "learning_rate": 0.000768, "loss": 6.4917, "step": 256 },
    { "epoch": 0.00257, "grad_norm": 0.48143237273687123, "learning_rate": 0.000771, "loss": 6.4974, "step": 257 },
    { "epoch": 0.00258, "grad_norm": 0.5467090762573645, "learning_rate": 0.0007740000000000001, "loss": 6.4744, "step": 258 },
    { "epoch": 0.00259, "grad_norm": 0.6750367098607939, "learning_rate": 0.000777, "loss": 6.4781, "step": 259 },
    { "epoch": 0.0026, "grad_norm": 0.8320211245129605, "learning_rate": 0.0007800000000000001, "loss": 6.4681, "step": 260 },
    { "epoch": 0.00261, "grad_norm": 0.905841412497731, "learning_rate": 0.0007830000000000001, "loss": 6.4809, "step": 261 },
    { "epoch": 0.00262, "grad_norm": 1.0687360869781928, "learning_rate": 0.000786, "loss": 6.4647, "step": 262 },
    { "epoch": 0.00263, "grad_norm": 1.2051408681263374, "learning_rate": 0.0007890000000000001, "loss": 6.4874, "step": 263 },
    { "epoch": 0.00264, "grad_norm": 0.8690142485653533, "learning_rate": 0.0007920000000000001, "loss": 6.455, "step": 264 },
    { "epoch": 0.00265, "grad_norm": 1.2774066489819682, "learning_rate": 0.000795, "loss": 6.4641, "step": 265 },
    { "epoch": 0.00266, "grad_norm": 0.7288978979341997, "learning_rate": 0.0007980000000000001, "loss": 6.4454, "step": 266 },
    { "epoch": 0.00267, "grad_norm": 0.6506287971604123, "learning_rate": 0.0008010000000000001, "loss": 6.4371, "step": 267 },
    { "epoch": 0.00268, "grad_norm": 0.6866545943797145, "learning_rate": 0.000804, "loss": 6.4338, "step": 268 },
    { "epoch": 0.00269, "grad_norm": 1.0440114151727509, "learning_rate": 0.0008070000000000001, "loss": 6.4227, "step": 269 },
    { "epoch": 0.0027, "grad_norm": 1.170351969791303, "learning_rate": 0.0008100000000000001, "loss": 6.4362, "step": 270 },
    { "epoch": 0.00271, "grad_norm": 0.6832222674646221, "learning_rate": 0.000813, "loss": 6.4253, "step": 271 },
    { "epoch": 0.00272, "grad_norm": 0.617936452008115, "learning_rate": 0.0008160000000000001, "loss": 6.4063, "step": 272 },
    { "epoch": 0.00273, "grad_norm": 0.6750066600068259, "learning_rate": 0.0008190000000000001, "loss": 6.4008, "step": 273 },
    { "epoch": 0.00274, "grad_norm": 0.6315560378177079, "learning_rate": 0.000822, "loss": 6.379, "step": 274 },
    { "epoch": 0.00275, "grad_norm": 0.6774266517760174, "learning_rate": 0.0008250000000000001, "loss": 6.3852, "step": 275 },
    { "epoch": 0.00276, "grad_norm": 0.7272895822836024, "learning_rate": 0.0008280000000000001, "loss": 6.3794, "step": 276 },
    { "epoch": 0.00277, "grad_norm": 0.6752471037637485, "learning_rate": 0.0008310000000000001, "loss": 6.3735, "step": 277 },
    { "epoch": 0.00278, "grad_norm": 0.5678457826039285, "learning_rate": 0.0008340000000000001, "loss": 6.354, "step": 278 },
    { "epoch": 0.00279, "grad_norm": 0.4611700189072147, "learning_rate": 0.0008370000000000001, "loss": 6.3529, "step": 279 },
    { "epoch": 0.0028, "grad_norm": 0.343285643042232, "learning_rate": 0.0008400000000000001, "loss": 6.3329, "step": 280 },
    { "epoch": 0.00281, "grad_norm": 0.4519631747446028, "learning_rate": 0.0008430000000000001, "loss": 6.3253, "step": 281 },
    { "epoch": 0.00282, "grad_norm": 0.3255189118052276, "learning_rate": 0.000846, "loss": 6.3232, "step": 282 },
    { "epoch": 0.00283, "grad_norm": 0.4297016476682907, "learning_rate": 0.0008489999999999999, "loss": 6.3149, "step": 283 },
    { "epoch": 0.00284, "grad_norm": 0.37515936714697207, "learning_rate": 0.0008519999999999999, "loss": 6.3058, "step": 284 },
    { "epoch": 0.00285, "grad_norm": 0.3458870104505622, "learning_rate": 0.000855, "loss": 6.3075, "step": 285 },
    { "epoch": 0.00286, "grad_norm": 0.39943359732325345, "learning_rate": 0.0008579999999999999, "loss": 6.2857, "step": 286 },
    { "epoch": 0.00287, "grad_norm": 0.3631751365570726, "learning_rate": 0.000861, "loss": 6.2875, "step": 287 },
    { "epoch": 0.00288, "grad_norm": 0.552280032213235, "learning_rate": 0.000864, "loss": 6.2707, "step": 288 },
    { "epoch": 0.00289, "grad_norm": 0.9396362484724781, "learning_rate": 0.0008669999999999999, "loss": 6.2923, "step": 289 },
    { "epoch": 0.0029, "grad_norm": 1.3959756273937387, "learning_rate": 0.00087, "loss": 6.2945, "step": 290 },
    { "epoch": 0.00291, "grad_norm": 0.6791923155853262, "learning_rate": 0.000873, "loss": 6.2663, "step": 291 },
    { "epoch": 0.00292, "grad_norm": 0.9879732835250642, "learning_rate": 0.0008759999999999999, "loss": 6.2794, "step": 292 },
    { "epoch": 0.00293, "grad_norm": 1.0140384817226566, "learning_rate": 0.000879, "loss": 6.2652, "step": 293 },
    { "epoch": 0.00294, "grad_norm": 0.9889225551341856, "learning_rate": 0.000882, "loss": 6.2822, "step": 294 },
    { "epoch": 0.00295, "grad_norm": 1.0429477871581094, "learning_rate": 0.0008849999999999999, "loss": 6.2468, "step": 295 },
    { "epoch": 0.00296, "grad_norm": 1.1774929945794055, "learning_rate": 0.000888, "loss": 6.2705, "step": 296 },
    { "epoch": 0.00297, "grad_norm": 0.7364156162079134, "learning_rate": 0.000891, "loss": 6.2278, "step": 297 },
    { "epoch": 0.00298, "grad_norm": 0.8424638476384282, "learning_rate": 0.0008939999999999999, "loss": 6.2455, "step": 298 },
    { "epoch": 0.00299, "grad_norm": 0.8668489286879963, "learning_rate": 0.000897, "loss": 6.2225, "step": 299 },
    { "epoch": 0.003, "grad_norm": 0.9039057952602142, "learning_rate": 0.0009, "loss": 6.2236, "step": 300 },
    { "epoch": 0.00301, "grad_norm": 1.0344858724084711, "learning_rate": 0.0009029999999999999, "loss": 6.2222, "step": 301 },
    { "epoch": 0.00302, "grad_norm": 0.801399966246171, "learning_rate": 0.000906, "loss": 6.2007, "step": 302 },
    { "epoch": 0.00303, "grad_norm": 0.7276255151675343, "learning_rate": 0.000909, "loss": 6.2106, "step": 303 },
    { "epoch": 0.00304, "grad_norm": 0.8306162070729353, "learning_rate": 0.000912, "loss": 6.205, "step": 304 },
    { "epoch": 0.00305, "grad_norm": 0.7650178489304597, "learning_rate": 0.000915, "loss": 6.2045, "step": 305 },
    { "epoch": 0.00306, "grad_norm": 0.7024963687074245, "learning_rate": 0.000918, "loss": 6.1878, "step": 306 },
    { "epoch": 0.00307, "grad_norm": 0.5687961336654864, "learning_rate": 0.000921, "loss": 6.1778, "step": 307 },
    { "epoch": 0.00308, "grad_norm": 0.4515866134049927, "learning_rate": 0.000924, "loss": 6.1586, "step": 308 },
    { "epoch": 0.00309, "grad_norm": 0.5454081565882548, "learning_rate": 0.000927, "loss": 6.1763, "step": 309 },
    { "epoch": 0.0031, "grad_norm": 0.4033959865123679, "learning_rate": 0.00093, "loss": 6.1549, "step": 310 },
    { "epoch": 0.00311, "grad_norm": 0.5004494164305024, "learning_rate": 0.000933, "loss": 6.1478, "step": 311 },
    { "epoch": 0.00312, "grad_norm": 0.470361361901935, "learning_rate": 0.000936, "loss": 6.1275, "step": 312 },
    { "epoch": 0.00313, "grad_norm": 0.4973667290148138, "learning_rate": 0.0009390000000000001, "loss": 6.1399, "step": 313 },
    { "epoch": 0.00314, "grad_norm": 0.47398584855820086, "learning_rate": 0.000942, "loss": 6.1453, "step": 314 },
    { "epoch": 0.00315, "grad_norm": 0.39081338977861474, "learning_rate": 0.000945, "loss": 6.1206, "step": 315 },
    { "epoch": 0.00316, "grad_norm": 0.308452368547838, "learning_rate": 0.0009480000000000001, "loss": 6.0925, "step": 316 },
    { "epoch": 0.00317, "grad_norm": 0.33735261718955184, "learning_rate": 0.000951, "loss": 6.1112, "step": 317 },
    { "epoch": 0.00318, "grad_norm": 0.3843792856632324, "learning_rate": 0.000954, "loss": 6.1055, "step": 318 },
    { "epoch": 0.00319, "grad_norm": 0.45015697169720664, "learning_rate": 0.0009570000000000001, "loss": 6.0951, "step": 319 },
    { "epoch": 0.0032, "grad_norm": 0.6347836654615971, "learning_rate": 0.00096, "loss": 6.097, "step": 320 },
    { "epoch": 0.00321, "grad_norm": 0.9264517947757075, "learning_rate": 0.000963, "loss": 6.0955, "step": 321 },
    { "epoch": 0.00322, "grad_norm": 1.3311662164937155, "learning_rate": 0.0009660000000000001, "loss": 6.0984, "step": 322 },
    { "epoch": 0.00323, "grad_norm": 0.7192768486088142, "learning_rate": 0.000969, "loss": 6.081, "step": 323 },
    { "epoch": 0.00324, "grad_norm": 0.8815686637890671, "learning_rate": 0.0009720000000000001, "loss": 6.0921, "step": 324 },
    { "epoch": 0.00325, "grad_norm": 0.9399377971403509, "learning_rate": 0.0009750000000000001, "loss": 6.0559, "step": 325 },
    { "epoch": 0.00326, "grad_norm": 1.1161084705724094, "learning_rate": 0.0009780000000000001, "loss": 6.0866, "step": 326 },
    { "epoch": 0.00327, "grad_norm": 0.9798577225908143, "learning_rate": 0.000981, "loss": 6.08, "step": 327 },
    { "epoch": 0.00328, "grad_norm": 1.4594200076446016, "learning_rate": 0.000984, "loss": 6.0949, "step": 328 },
    { "epoch": 0.00329, "grad_norm": 0.7530414187502621, "learning_rate": 0.000987, "loss": 6.0592, "step": 329 },
    { "epoch": 0.0033, "grad_norm": 0.9983029089238832, "learning_rate": 0.00099, "loss": 6.0707, "step": 330 },
    { "epoch": 0.00331, "grad_norm": 0.835334238631822, "learning_rate": 0.0009930000000000002, "loss": 6.0609, "step": 331 },
    { "epoch": 0.00332, "grad_norm": 1.2633352311024129, "learning_rate": 0.0009960000000000001, "loss": 6.0632, "step": 332 },
    { "epoch": 0.00333, "grad_norm": 0.9415989578137678, "learning_rate": 0.000999, "loss": 6.0388, "step": 333 },
    { "epoch": 0.00334, "grad_norm": 0.752057040577036, "learning_rate": 0.001002, "loss": 6.0434, "step": 334 },
    { "epoch": 0.00335, "grad_norm": 0.6368712711427076, "learning_rate": 0.001005, "loss": 6.039, "step": 335 },
    { "epoch": 0.00336, "grad_norm": 0.5277321252477392, "learning_rate": 0.001008, "loss": 6.0255, "step": 336 },
    { "epoch": 0.00337, "grad_norm": 0.545640031716998, "learning_rate": 0.0010110000000000002, "loss": 6.0051, "step": 337 },
    { "epoch": 0.00338, "grad_norm": 0.5605817392374447, "learning_rate": 0.0010140000000000001, "loss": 6.0183, "step": 338 },
    { "epoch": 0.00339, "grad_norm": 0.4963166785854256, "learning_rate": 0.0010170000000000001, "loss": 6.0214, "step": 339 },
    { "epoch": 0.0034, "grad_norm": 0.496225629593103, "learning_rate": 0.00102, "loss": 5.9915, "step": 340 },
    { "epoch": 0.00341, "grad_norm": 0.45013704339915594, "learning_rate": 0.001023, "loss": 5.9719, "step": 341 },
    { "epoch": 0.00342, "grad_norm": 0.37899658405778774, "learning_rate": 0.001026, "loss": 5.9765, "step": 342 },
    { "epoch": 0.00343, "grad_norm": 0.4216072327604619, "learning_rate": 0.0010290000000000002, "loss": 5.9773, "step": 343 },
    { "epoch": 0.00344, "grad_norm": 0.4964437378733662, "learning_rate": 0.001032, "loss": 5.9678, "step": 344 },
    { "epoch": 0.00345, "grad_norm": 0.6031142308234815, "learning_rate": 0.001035, "loss": 5.9605, "step": 345 },
    { "epoch": 0.00346, "grad_norm": 0.6349778009023861, "learning_rate": 0.0010379999999999999, "loss": 5.9675, "step": 346 },
    { "epoch": 0.00347, "grad_norm": 0.6213366174245898, "learning_rate": 0.001041, "loss": 5.9393, "step": 347 },
    { "epoch": 0.00348, "grad_norm": 0.684857817555668, "learning_rate": 0.001044, "loss": 5.9499, "step": 348 },
    { "epoch": 0.00349, "grad_norm": 0.8421661368559449, "learning_rate": 0.001047, "loss": 5.9398, "step": 349 },
    { "epoch": 0.0035, "grad_norm": 0.8062802732253019, "learning_rate": 0.00105, "loss": 5.9568, "step": 350 },
    { "epoch": 0.00351, "grad_norm": 0.7669273491138234, "learning_rate": 0.001053, "loss": 5.9549, "step": 351 },
    { "epoch": 0.00352, "grad_norm": 1.0673384228730578, "learning_rate": 0.0010559999999999999, "loss": 5.9491, "step": 352 },
    { "epoch": 0.00353, "grad_norm": 1.313057737158452, "learning_rate": 0.001059, "loss": 5.9596, "step": 353 },
    { "epoch": 0.00354, "grad_norm": 0.8748293544965601, "learning_rate": 0.001062, "loss": 5.9228, "step": 354 },
    { "epoch": 0.00355, "grad_norm": 0.7617258072087358, "learning_rate": 0.001065, "loss": 5.9348, "step": 355 },
    { "epoch": 0.00356, "grad_norm": 0.9561923051718775, "learning_rate": 0.001068, "loss": 5.9306, "step": 356 },
    { "epoch": 0.00357, "grad_norm": 0.8631216371911231, "learning_rate": 0.001071, "loss": 5.9199, "step": 357 },
    { "epoch": 0.00358, "grad_norm": 0.9525953351992685, "learning_rate": 0.001074, "loss": 5.9184, "step": 358 },
    { "epoch": 0.00359, "grad_norm": 0.6858307664974747, "learning_rate": 0.001077, "loss": 5.8973, "step": 359 },
    { "epoch": 0.0036, "grad_norm": 0.7167037627907079, "learning_rate": 0.00108, "loss": 5.9065, "step": 360 },
    { "epoch": 0.00361, "grad_norm": 0.7969047134484166, "learning_rate": 0.001083, "loss": 5.8986, "step": 361 },
    { "epoch": 0.00362, "grad_norm": 0.8186204145394074, "learning_rate": 0.001086, "loss": 5.888, "step": 362 },
    { "epoch": 0.00363, "grad_norm": 0.7436182013517663, "learning_rate": 0.001089, "loss": 5.8766, "step": 363 },
    { "epoch": 0.00364, "grad_norm": 0.6658023261534547, "learning_rate": 0.001092, "loss": 5.8872, "step": 364 },
    { "epoch": 0.00365, "grad_norm": 0.7206668214896482, "learning_rate": 0.001095, "loss": 5.8872, "step": 365 },
    { "epoch": 0.00366, "grad_norm": 0.6345413922647961, "learning_rate": 0.001098, "loss": 5.8617, "step": 366 },
    { "epoch": 0.00367, "grad_norm": 0.6094924231218852, "learning_rate": 0.001101, "loss": 5.8618, "step": 367 },
    { "epoch": 0.00368, "grad_norm": 0.7732260654201254, "learning_rate": 0.001104, "loss": 5.8653, "step": 368 },
    { "epoch": 0.00369, "grad_norm": 0.9451132874875877, "learning_rate": 0.001107, "loss": 5.8779, "step": 369 },
    { "epoch": 0.0037, "grad_norm": 1.047387946938651, "learning_rate": 0.00111, "loss": 5.8603, "step": 370 },
    { "epoch": 0.00371, "grad_norm": 0.8709497379832931, "learning_rate": 0.001113, "loss": 5.8262, "step": 371 },
    { "epoch": 0.00372, "grad_norm": 0.6993621521276565, "learning_rate": 0.001116, "loss": 5.8434, "step": 372 },
    { "epoch": 0.00373, "grad_norm": 0.6889019297226876, "learning_rate": 0.001119, "loss": 5.8344, "step": 373 },
    { "epoch": 0.00374, "grad_norm": 0.8187808414499578, "learning_rate": 0.001122, "loss": 5.8424, "step": 374 },
    { "epoch": 0.00375, "grad_norm": 0.7395556739972736, "learning_rate": 0.0011250000000000001, "loss": 5.8404, "step": 375 },
    { "epoch": 0.00376, "grad_norm": 0.5027446756427529, "learning_rate": 0.001128, "loss": 5.8296, "step": 376 },
    { "epoch": 0.00377, "grad_norm": 0.45297103786338255, "learning_rate": 0.001131, "loss": 5.8239, "step": 377 },
    { "epoch": 0.00378, "grad_norm": 0.5150298222384522, "learning_rate": 0.001134, "loss": 5.82, "step": 378 },
    { "epoch": 0.00379, "grad_norm": 0.4216428009711753, "learning_rate": 0.001137, "loss": 5.8036, "step": 379 },
    { "epoch": 0.0038, "grad_norm": 0.43574801532624385, "learning_rate": 0.00114, "loss": 5.8234, "step": 380 },
    { "epoch": 0.00381, "grad_norm": 0.4737821231317218, "learning_rate": 0.0011430000000000001, "loss": 5.7908, "step": 381 },
    { "epoch": 0.00382, "grad_norm": 0.5236457098681065, "learning_rate": 0.001146, "loss": 5.7778, "step": 382 },
    { "epoch": 0.00383, "grad_norm": 0.5006340354259897, "learning_rate": 0.001149, "loss": 5.787, "step": 383 },
    { "epoch": 0.00384, "grad_norm": 0.5762330042414852, "learning_rate": 0.001152, "loss": 5.7915, "step": 384 },
    { "epoch": 0.00385, "grad_norm": 0.6623935085819848, "learning_rate": 0.001155, "loss": 5.7808, "step": 385 },
    { "epoch": 0.00386, "grad_norm": 0.6780931079980513, "learning_rate": 0.001158, "loss": 5.7864, "step": 386 },
    { "epoch": 0.00387, "grad_norm": 0.6283557380392781, "learning_rate": 0.0011610000000000001, "loss": 5.76, "step": 387 },
    { "epoch": 0.00388, "grad_norm": 0.8661947460887196, "learning_rate": 0.001164, "loss": 5.7642, "step": 388 },
    { "epoch": 0.00389, "grad_norm": 1.1877567285678448, "learning_rate": 0.001167, "loss": 5.7898, "step": 389 },
    { "epoch": 0.0039, "grad_norm": 0.5285139722189788, "learning_rate": 0.00117, "loss": 5.7491, "step": 390 },
    { "epoch": 0.00391, "grad_norm": 0.6430118692881639, "learning_rate": 0.001173, "loss": 5.7622, "step": 391 },
    { "epoch": 0.00392, "grad_norm": 0.6748100403552353, "learning_rate": 0.001176, "loss": 5.7713, "step": 392 },
    { "epoch": 0.00393, "grad_norm": 0.7344199345621275, "learning_rate": 0.0011790000000000001, "loss": 5.7448, "step": 393 },
    { "epoch": 0.00394, "grad_norm": 0.8611653440542995, "learning_rate": 0.001182, "loss": 5.7377, "step": 394 },
    { "epoch": 0.00395, "grad_norm": 1.0200403565527223, "learning_rate": 0.001185, "loss": 5.7297, "step": 395 },
    { "epoch": 0.00396, "grad_norm": 1.3219082304761296, "learning_rate": 0.001188, "loss": 5.7571, "step": 396 },
    { "epoch": 0.00397, "grad_norm": 1.0648841185793536, "learning_rate": 0.001191, "loss": 5.7452, "step": 397 },
    { "epoch": 0.00398, "grad_norm": 0.8818852137031271, "learning_rate": 0.0011940000000000002, "loss": 5.7476, "step": 398 },
    { "epoch": 0.00399, "grad_norm": 0.7229082271352473, "learning_rate": 0.0011970000000000001, "loss": 5.7455, "step": 399 },
    { "epoch": 0.004, "grad_norm": 0.7464785543636239, "learning_rate": 0.0012000000000000001, "loss": 5.7335, "step": 400 },
    { "epoch": 0.00401, "grad_norm": 0.788008045277313, "learning_rate": 0.001203, "loss": 5.7147, "step": 401 },
    { "epoch": 0.00402, "grad_norm": 0.8844811042429299, "learning_rate": 0.001206, "loss": 5.7255, "step": 402 },
    { "epoch": 0.00403, "grad_norm": 0.8786909754931423, "learning_rate": 0.001209, "loss": 5.7201, "step": 403 },
    { "epoch": 0.00404, "grad_norm": 0.9763167280044874, "learning_rate": 0.0012120000000000002, "loss": 5.7188, "step": 404 },
    { "epoch": 0.00405, "grad_norm": 0.9708490732732808, "learning_rate": 0.0012150000000000002, "loss": 5.7216, "step": 405 },
    { "epoch": 0.00406, "grad_norm": 0.7953769652219763, "learning_rate": 0.0012180000000000001, "loss": 5.6987, "step": 406 },
    { "epoch": 0.00407, "grad_norm": 0.7231086032909518, "learning_rate": 0.0012209999999999999, "loss": 5.6945, "step": 407 },
    { "epoch": 0.00408, "grad_norm": 0.6539281657127057, "learning_rate": 0.001224, "loss": 5.7108, "step": 408 },
    { "epoch": 0.00409, "grad_norm": 0.48725126694443294, "learning_rate": 0.001227, "loss": 5.6827, "step": 409 },
    { "epoch": 0.0041, "grad_norm": 0.5935989194477762, "learning_rate": 0.00123, "loss": 5.6771, "step": 410 },
    { "epoch": 0.00411, "grad_norm": 0.6524672694061662, "learning_rate": 0.001233, "loss": 5.6945, "step": 411 },
    { "epoch": 0.00412, "grad_norm": 0.5686252583506046, "learning_rate": 0.001236, "loss": 5.6796, "step": 412 },
    { "epoch": 0.00413, "grad_norm": 0.5206794027062916, "learning_rate": 0.0012389999999999999, "loss": 5.676, "step": 413 },
    { "epoch": 0.00414, "grad_norm": 0.5100681259570212, "learning_rate": 0.001242, "loss": 5.6561, "step": 414 },
    { "epoch": 0.00415, "grad_norm": 0.5323494958966052, "learning_rate": 0.001245, "loss": 5.6572, "step": 415 },
    { "epoch": 0.00416, "grad_norm": 0.593833682795931, "learning_rate": 0.001248, "loss": 5.6419, "step": 416 },
    { "epoch": 0.00417, "grad_norm": 0.5881502982050868, "learning_rate": 0.001251, "loss": 5.6711, "step": 417 },
    { "epoch": 0.00418, "grad_norm": 0.6571826832460801, "learning_rate": 0.001254, "loss": 5.6451, "step": 418 },
    { "epoch": 0.00419, "grad_norm": 0.8242389183582979, "learning_rate": 0.0012569999999999999, "loss": 5.6393, "step": 419 },
    { "epoch": 0.0042, "grad_norm": 0.7322278119135938, "learning_rate": 0.00126, "loss": 5.6531, "step": 420 },
    { "epoch": 0.00421, "grad_norm": 0.6080486142741918, "learning_rate": 0.001263, "loss": 5.6415, "step": 421 },
    { "epoch": 0.00422, "grad_norm": 0.6271805214461816, "learning_rate": 0.001266, "loss": 5.6327, "step": 422 },
    { "epoch": 0.00423, "grad_norm": 0.5934268893913589, "learning_rate": 0.001269, "loss": 5.6286, "step": 423 },
    { "epoch": 0.00424, "grad_norm": 0.6795296945522938,
|
"learning_rate": 0.001272, |
|
"loss": 5.6299, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.00425, |
|
"grad_norm": 0.7845276448161947, |
|
"learning_rate": 0.001275, |
|
"loss": 5.6344, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.00426, |
|
"grad_norm": 0.9176562117983922, |
|
"learning_rate": 0.001278, |
|
"loss": 5.6402, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.00427, |
|
"grad_norm": 1.0306205830362438, |
|
"learning_rate": 0.001281, |
|
"loss": 5.6293, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.00428, |
|
"grad_norm": 1.0252381178312269, |
|
"learning_rate": 0.001284, |
|
"loss": 5.6086, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.00429, |
|
"grad_norm": 1.3332162612637855, |
|
"learning_rate": 0.001287, |
|
"loss": 5.6393, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 0.0043, |
|
"grad_norm": 0.8174291079939794, |
|
"learning_rate": 0.00129, |
|
"loss": 5.6202, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.00431, |
|
"grad_norm": 0.821028463418781, |
|
"learning_rate": 0.001293, |
|
"loss": 5.6102, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 0.00432, |
|
"grad_norm": 0.7475137161763143, |
|
"learning_rate": 0.001296, |
|
"loss": 5.6163, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.00433, |
|
"grad_norm": 0.7571870052992741, |
|
"learning_rate": 0.001299, |
|
"loss": 5.604, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 0.00434, |
|
"grad_norm": 1.0543560255015263, |
|
"learning_rate": 0.001302, |
|
"loss": 5.6184, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.00435, |
|
"grad_norm": 0.8758758304266553, |
|
"learning_rate": 0.001305, |
|
"loss": 5.6066, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.00436, |
|
"grad_norm": 0.9137062421440546, |
|
"learning_rate": 0.001308, |
|
"loss": 5.5859, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.00437, |
|
"grad_norm": 1.301736875083812, |
|
"learning_rate": 0.001311, |
|
"loss": 5.6173, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 0.00438, |
|
"grad_norm": 0.9356953917037294, |
|
"learning_rate": 0.001314, |
|
"loss": 5.5919, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.00439, |
|
"grad_norm": 0.8522821992819578, |
|
"learning_rate": 0.001317, |
|
"loss": 5.6158, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 0.0044, |
|
"grad_norm": 0.7182761753028103, |
|
"learning_rate": 0.00132, |
|
"loss": 5.5821, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.00441, |
|
"grad_norm": 0.6293266892726601, |
|
"learning_rate": 0.001323, |
|
"loss": 5.577, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.00442, |
|
"grad_norm": 0.8189921873128356, |
|
"learning_rate": 0.0013260000000000001, |
|
"loss": 5.5762, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.00443, |
|
"grad_norm": 1.0212422821054057, |
|
"learning_rate": 0.001329, |
|
"loss": 5.5904, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 0.00444, |
|
"grad_norm": 0.8951785018901781, |
|
"learning_rate": 0.001332, |
|
"loss": 5.5851, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.00445, |
|
"grad_norm": 0.7868769140150608, |
|
"learning_rate": 0.001335, |
|
"loss": 5.5661, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.00446, |
|
"grad_norm": 0.790102612629763, |
|
"learning_rate": 0.001338, |
|
"loss": 5.5718, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.00447, |
|
"grad_norm": 0.8396194874372788, |
|
"learning_rate": 0.001341, |
|
"loss": 5.5716, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 0.00448, |
|
"grad_norm": 0.9120841535821665, |
|
"learning_rate": 0.0013440000000000001, |
|
"loss": 5.5589, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.00449, |
|
"grad_norm": 0.8573073152890212, |
|
"learning_rate": 0.001347, |
|
"loss": 5.5603, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 0.0045, |
|
"grad_norm": 0.7420732830276576, |
|
"learning_rate": 0.00135, |
|
"loss": 5.5551, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.00451, |
|
"grad_norm": 0.7054051762730813, |
|
"learning_rate": 0.001353, |
|
"loss": 5.5451, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 0.00452, |
|
"grad_norm": 0.5383194985068459, |
|
"learning_rate": 0.001356, |
|
"loss": 5.5229, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.00453, |
|
"grad_norm": 0.5845224529530345, |
|
"learning_rate": 0.001359, |
|
"loss": 5.5144, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 0.00454, |
|
"grad_norm": 0.5197277965757966, |
|
"learning_rate": 0.0013620000000000001, |
|
"loss": 5.5269, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.00455, |
|
"grad_norm": 0.44901215490386587, |
|
"learning_rate": 0.0013650000000000001, |
|
"loss": 5.5227, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.00456, |
|
"grad_norm": 0.40823042735319937, |
|
"learning_rate": 0.001368, |
|
"loss": 5.5155, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.00457, |
|
"grad_norm": 0.4077054091161063, |
|
"learning_rate": 0.001371, |
|
"loss": 5.5045, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 0.00458, |
|
"grad_norm": 0.4051721280659754, |
|
"learning_rate": 0.001374, |
|
"loss": 5.5115, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.00459, |
|
"grad_norm": 0.3819222938644956, |
|
"learning_rate": 0.0013770000000000002, |
|
"loss": 5.4986, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 0.0046, |
|
"grad_norm": 0.4096791477469412, |
|
"learning_rate": 0.0013800000000000002, |
|
"loss": 5.4851, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.00461, |
|
"grad_norm": 0.44167018915391465, |
|
"learning_rate": 0.0013830000000000001, |
|
"loss": 5.4805, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 0.00462, |
|
"grad_norm": 0.49346826910496583, |
|
"learning_rate": 0.001386, |
|
"loss": 5.4928, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.00463, |
|
"grad_norm": 0.463489633996114, |
|
"learning_rate": 0.001389, |
|
"loss": 5.4652, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 0.00464, |
|
"grad_norm": 0.507205937400668, |
|
"learning_rate": 0.001392, |
|
"loss": 5.4859, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.00465, |
|
"grad_norm": 0.5318639728571777, |
|
"learning_rate": 0.0013950000000000002, |
|
"loss": 5.4572, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.00466, |
|
"grad_norm": 0.6489252257104292, |
|
"learning_rate": 0.0013980000000000002, |
|
"loss": 5.4792, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.00467, |
|
"grad_norm": 0.7957789059873086, |
|
"learning_rate": 0.0014010000000000001, |
|
"loss": 5.4727, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 0.00468, |
|
"grad_norm": 0.7621042481808248, |
|
"learning_rate": 0.001404, |
|
"loss": 5.462, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.00469, |
|
"grad_norm": 0.573822393217689, |
|
"learning_rate": 0.001407, |
|
"loss": 5.457, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 0.0047, |
|
"grad_norm": 0.7543746323219513, |
|
"learning_rate": 0.00141, |
|
"loss": 5.4709, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.00471, |
|
"grad_norm": 0.8641651762100534, |
|
"learning_rate": 0.001413, |
|
"loss": 5.4567, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 0.00472, |
|
"grad_norm": 0.9026674726231502, |
|
"learning_rate": 0.001416, |
|
"loss": 5.4699, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.00473, |
|
"grad_norm": 0.8212885101559565, |
|
"learning_rate": 0.001419, |
|
"loss": 5.4778, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 0.00474, |
|
"grad_norm": 0.8914030740906659, |
|
"learning_rate": 0.0014219999999999999, |
|
"loss": 5.461, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 0.00475, |
|
"grad_norm": 1.0570094425693455, |
|
"learning_rate": 0.001425, |
|
"loss": 5.4652, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.00476, |
|
"grad_norm": 0.9736444976311589, |
|
"learning_rate": 0.001428, |
|
"loss": 5.4875, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.00477, |
|
"grad_norm": 1.1550380737092787, |
|
"learning_rate": 0.001431, |
|
"loss": 5.4568, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 0.00478, |
|
"grad_norm": 0.848321570803796, |
|
"learning_rate": 0.001434, |
|
"loss": 5.4695, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 0.00479, |
|
"grad_norm": 0.9517827225501269, |
|
"learning_rate": 0.001437, |
|
"loss": 5.4501, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 0.0048, |
|
"grad_norm": 1.0883787540754652, |
|
"learning_rate": 0.0014399999999999999, |
|
"loss": 5.4562, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.00481, |
|
"grad_norm": 0.9422991164230814, |
|
"learning_rate": 0.001443, |
|
"loss": 5.4516, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 0.00482, |
|
"grad_norm": 0.9159499791385636, |
|
"learning_rate": 0.001446, |
|
"loss": 5.4273, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 0.00483, |
|
"grad_norm": 0.9688645055474768, |
|
"learning_rate": 0.001449, |
|
"loss": 5.44, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 0.00484, |
|
"grad_norm": 1.1114303023214132, |
|
"learning_rate": 0.001452, |
|
"loss": 5.4474, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.00485, |
|
"grad_norm": 0.9051569573634253, |
|
"learning_rate": 0.001455, |
|
"loss": 5.4468, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.00486, |
|
"grad_norm": 0.9247454458721566, |
|
"learning_rate": 0.001458, |
|
"loss": 5.4217, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 0.00487, |
|
"grad_norm": 0.852928162562673, |
|
"learning_rate": 0.001461, |
|
"loss": 5.4339, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 0.00488, |
|
"grad_norm": 0.8142513932978471, |
|
"learning_rate": 0.001464, |
|
"loss": 5.41, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.00489, |
|
"grad_norm": 0.9329231917883894, |
|
"learning_rate": 0.001467, |
|
"loss": 5.403, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 0.0049, |
|
"grad_norm": 0.9855383918059676, |
|
"learning_rate": 0.00147, |
|
"loss": 5.4217, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.00491, |
|
"grad_norm": 0.7649189111405409, |
|
"learning_rate": 0.001473, |
|
"loss": 5.4131, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 0.00492, |
|
"grad_norm": 0.7909135941762935, |
|
"learning_rate": 0.001476, |
|
"loss": 5.4088, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.00493, |
|
"grad_norm": 0.9895574066894314, |
|
"learning_rate": 0.001479, |
|
"loss": 5.412, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 0.00494, |
|
"grad_norm": 1.132991334341666, |
|
"learning_rate": 0.001482, |
|
"loss": 5.429, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 0.00495, |
|
"grad_norm": 0.761507737933228, |
|
"learning_rate": 0.001485, |
|
"loss": 5.4134, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.00496, |
|
"grad_norm": 0.9726159326361932, |
|
"learning_rate": 0.001488, |
|
"loss": 5.4067, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.00497, |
|
"grad_norm": 1.1482864163713484, |
|
"learning_rate": 0.001491, |
|
"loss": 5.3997, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 0.00498, |
|
"grad_norm": 0.8327332046897746, |
|
"learning_rate": 0.001494, |
|
"loss": 5.4147, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 0.00499, |
|
"grad_norm": 0.725916611519047, |
|
"learning_rate": 0.001497, |
|
"loss": 5.3792, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 0.005, |
|
"grad_norm": 0.6719782846245154, |
|
"learning_rate": 0.0015, |
|
"loss": 5.3842, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.00501, |
|
"grad_norm": 0.7463076928465905, |
|
"learning_rate": 0.001503, |
|
"loss": 5.3796, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 0.00502, |
|
"grad_norm": 0.8408104186601356, |
|
"learning_rate": 0.001506, |
|
"loss": 5.389, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 0.00503, |
|
"grad_norm": 0.860909611402231, |
|
"learning_rate": 0.0015090000000000001, |
|
"loss": 5.3926, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 0.00504, |
|
"grad_norm": 0.7026490274993983, |
|
"learning_rate": 0.001512, |
|
"loss": 5.3646, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 0.00505, |
|
"grad_norm": 0.7184807991697565, |
|
"learning_rate": 0.001515, |
|
"loss": 5.3547, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.00506, |
|
"grad_norm": 0.7839475253802514, |
|
"learning_rate": 0.001518, |
|
"loss": 5.3611, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 0.00507, |
|
"grad_norm": 0.7039499110993044, |
|
"learning_rate": 0.001521, |
|
"loss": 5.3522, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 0.00508, |
|
"grad_norm": 0.5587706859739108, |
|
"learning_rate": 0.001524, |
|
"loss": 5.3512, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 0.00509, |
|
"grad_norm": 0.4952941325228141, |
|
"learning_rate": 0.0015270000000000001, |
|
"loss": 5.326, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 0.0051, |
|
"grad_norm": 0.5131490664852795, |
|
"learning_rate": 0.0015300000000000001, |
|
"loss": 5.3428, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.00511, |
|
"grad_norm": 0.5539213487194597, |
|
"learning_rate": 0.001533, |
|
"loss": 5.3196, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 0.00512, |
|
"grad_norm": 0.5937876471208409, |
|
"learning_rate": 0.001536, |
|
"loss": 5.3158, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 0.00513, |
|
"grad_norm": 0.5441672327389838, |
|
"learning_rate": 0.001539, |
|
"loss": 5.327, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 0.00514, |
|
"grad_norm": 0.5371789671410057, |
|
"learning_rate": 0.001542, |
|
"loss": 5.3038, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 0.00515, |
|
"grad_norm": 0.5194765862771661, |
|
"learning_rate": 0.0015450000000000001, |
|
"loss": 5.3109, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.00516, |
|
"grad_norm": 0.5575198815834714, |
|
"learning_rate": 0.0015480000000000001, |
|
"loss": 5.3083, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 0.00517, |
|
"grad_norm": 0.5237583962443445, |
|
"learning_rate": 0.001551, |
|
"loss": 5.2962, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 0.00518, |
|
"grad_norm": 0.5237181167506534, |
|
"learning_rate": 0.001554, |
|
"loss": 5.2786, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 0.00519, |
|
"grad_norm": 0.6652146969315359, |
|
"learning_rate": 0.001557, |
|
"loss": 5.2847, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 0.0052, |
|
"grad_norm": 0.8817258231902962, |
|
"learning_rate": 0.0015600000000000002, |
|
"loss": 5.2966, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.00521, |
|
"grad_norm": 0.9047592596397305, |
|
"learning_rate": 0.0015630000000000002, |
|
"loss": 5.2732, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 0.00522, |
|
"grad_norm": 0.6139926424217688, |
|
"learning_rate": 0.0015660000000000001, |
|
"loss": 5.2701, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 0.00523, |
|
"grad_norm": 0.6292464658556638, |
|
"learning_rate": 0.001569, |
|
"loss": 5.2826, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 0.00524, |
|
"grad_norm": 0.752030715547053, |
|
"learning_rate": 0.001572, |
|
"loss": 5.2856, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 0.00525, |
|
"grad_norm": 0.929770536798091, |
|
"learning_rate": 0.001575, |
|
"loss": 5.287, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.00526, |
|
"grad_norm": 0.887370520628206, |
|
"learning_rate": 0.0015780000000000002, |
|
"loss": 5.2635, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 0.00527, |
|
"grad_norm": 0.7819104471156305, |
|
"learning_rate": 0.0015810000000000002, |
|
"loss": 5.2728, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 0.00528, |
|
"grad_norm": 0.9038037239389326, |
|
"learning_rate": 0.0015840000000000001, |
|
"loss": 5.2534, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 0.00529, |
|
"grad_norm": 0.8898068580069259, |
|
"learning_rate": 0.001587, |
|
"loss": 5.265, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 0.0053, |
|
"grad_norm": 1.0010848916209774, |
|
"learning_rate": 0.00159, |
|
"loss": 5.2764, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.00531, |
|
"grad_norm": 1.0010984765594055, |
|
"learning_rate": 0.001593, |
|
"loss": 5.2677, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 0.00532, |
|
"grad_norm": 1.026224264336229, |
|
"learning_rate": 0.0015960000000000002, |
|
"loss": 5.2779, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 0.00533, |
|
"grad_norm": 0.9759289805508353, |
|
"learning_rate": 0.0015990000000000002, |
|
"loss": 5.2678, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 0.00534, |
|
"grad_norm": 1.1376558518204782, |
|
"learning_rate": 0.0016020000000000001, |
|
"loss": 5.2612, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 0.00535, |
|
"grad_norm": 1.0517455067238486, |
|
"learning_rate": 0.001605, |
|
"loss": 5.2672, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.00536, |
|
"grad_norm": 0.9398243649272562, |
|
"learning_rate": 0.001608, |
|
"loss": 5.2627, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 0.00537, |
|
"grad_norm": 0.9512995727424398, |
|
"learning_rate": 0.0016110000000000002, |
|
"loss": 5.2558, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 0.00538, |
|
"grad_norm": 1.2746657535312511, |
|
"learning_rate": 0.0016140000000000002, |
|
"loss": 5.2451, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 0.00539, |
|
"grad_norm": 1.067518036012326, |
|
"learning_rate": 0.0016170000000000002, |
|
"loss": 5.2415, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 0.0054, |
|
"grad_norm": 1.1859628541053806, |
|
"learning_rate": 0.0016200000000000001, |
|
"loss": 5.2618, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.00541, |
|
"grad_norm": 0.8882936824028492, |
|
"learning_rate": 0.001623, |
|
"loss": 5.2308, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 0.00542, |
|
"grad_norm": 0.8517075205384302, |
|
"learning_rate": 0.001626, |
|
"loss": 5.2545, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 0.00543, |
|
"grad_norm": 0.8283552605034004, |
|
"learning_rate": 0.0016290000000000002, |
|
"loss": 5.2098, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 0.00544, |
|
"grad_norm": 0.9087829134911761, |
|
"learning_rate": 0.0016320000000000002, |
|
"loss": 5.2265, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 0.00545, |
|
"grad_norm": 0.8034144620978907, |
|
"learning_rate": 0.0016350000000000002, |
|
"loss": 5.2348, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.00546, |
|
"grad_norm": 0.7091235133563132, |
|
"learning_rate": 0.0016380000000000001, |
|
"loss": 5.2004, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 0.00547, |
|
"grad_norm": 0.6683331586466694, |
|
"learning_rate": 0.001641, |
|
"loss": 5.1915, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 0.00548, |
|
"grad_norm": 0.5441552662279447, |
|
"learning_rate": 0.001644, |
|
"loss": 5.1843, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 0.00549, |
|
"grad_norm": 0.5462993858197037, |
|
"learning_rate": 0.0016470000000000002, |
|
"loss": 5.1806, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 0.0055, |
|
"grad_norm": 0.5691406737163984, |
|
"learning_rate": 0.0016500000000000002, |
|
"loss": 5.1914, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.00551, |
|
"grad_norm": 0.57998143312047, |
|
"learning_rate": 0.0016530000000000002, |
|
"loss": 5.1789, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 0.00552, |
|
"grad_norm": 0.5286482487653069, |
|
"learning_rate": 0.0016560000000000001, |
|
"loss": 5.158, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 0.00553, |
|
"grad_norm": 0.48759014943874474, |
|
"learning_rate": 0.001659, |
|
"loss": 5.1465, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 0.00554, |
|
"grad_norm": 0.4792524526847805, |
|
"learning_rate": 0.0016620000000000003, |
|
"loss": 5.1537, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 0.00555, |
|
"grad_norm": 0.5054837577144806, |
|
"learning_rate": 0.0016650000000000002, |
|
"loss": 5.1496, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.00556, |
|
"grad_norm": 0.5148667639200912, |
|
"learning_rate": 0.0016680000000000002, |
|
"loss": 5.1288, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 0.00557, |
|
"grad_norm": 0.5122706189056161, |
|
"learning_rate": 0.0016710000000000002, |
|
"loss": 5.1153, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 0.00558, |
|
"grad_norm": 0.6046224327964763, |
|
"learning_rate": 0.0016740000000000001, |
|
"loss": 5.1358, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 0.00559, |
|
"grad_norm": 0.6851636176151574, |
|
"learning_rate": 0.001677, |
|
"loss": 5.1268, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 0.0056, |
|
"grad_norm": 0.6922547745331437, |
|
"learning_rate": 0.0016800000000000003, |
|
"loss": 5.096, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.00561, |
|
"grad_norm": 0.7411075733344746, |
|
"learning_rate": 0.0016830000000000003, |
|
"loss": 5.1176, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 0.00562, |
|
"grad_norm": 0.8981627852593407, |
|
"learning_rate": 0.0016860000000000002, |
|
"loss": 5.1206, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 0.00563, |
|
"grad_norm": 1.1260200882948381, |
|
"learning_rate": 0.001689, |
|
"loss": 5.1239, |
|
"step": 563 |
|
}, |
|
{ |
|
"epoch": 0.00564, |
|
"grad_norm": 1.1027210513289374, |
|
"learning_rate": 0.001692, |
|
"loss": 5.113, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 0.00565, |
|
"grad_norm": 0.890213024695838, |
|
"learning_rate": 0.001695, |
|
"loss": 5.1071, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.00566, |
|
"grad_norm": 0.8627979775394023, |
|
"learning_rate": 0.0016979999999999999, |
|
"loss": 5.0976, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 0.00567, |
|
"grad_norm": 0.9823945407746334, |
|
"learning_rate": 0.0017009999999999998, |
|
"loss": 5.101, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 0.00568, |
|
"grad_norm": 0.9715849000742567, |
|
"learning_rate": 0.0017039999999999998, |
|
"loss": 5.1139, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 0.00569, |
|
"grad_norm": 0.9989913016974431, |
|
"learning_rate": 0.001707, |
|
"loss": 5.0861, |
|
"step": 569 |
|
}, |
|
{ |
|
"epoch": 0.0057, |
|
"grad_norm": 1.0927877338999235, |
|
"learning_rate": 0.00171, |
|
"loss": 5.1262, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.00571, |
|
"grad_norm": 1.0328737042176641, |
|
"learning_rate": 0.001713, |
|
"loss": 5.1192, |
|
"step": 571 |
|
}, |
|
{ |
|
"epoch": 0.00572, |
|
"grad_norm": 1.3722388350456287, |
|
"learning_rate": 0.0017159999999999999, |
|
"loss": 5.1049, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 0.00573, |
|
"grad_norm": 0.9525979203379623, |
|
"learning_rate": 0.0017189999999999998, |
|
"loss": 5.1081, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 0.00574, |
|
"grad_norm": 1.0626128534442882, |
|
"learning_rate": 0.001722, |
|
"loss": 5.1048, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 0.00575, |
|
"grad_norm": 0.9331527599734185, |
|
"learning_rate": 0.001725, |
|
"loss": 5.074, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.00576, |
|
"grad_norm": 0.9277735357052385, |
|
"learning_rate": 0.001728, |
|
"loss": 5.0843, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 0.00577, |
|
"grad_norm": 0.9070967517565243, |
|
"learning_rate": 0.001731, |
|
"loss": 5.0908, |
|
"step": 577 |
|
}, |
|
{ |
|
"epoch": 0.00578, |
|
"grad_norm": 0.8451551366430134, |
|
"learning_rate": 0.0017339999999999999, |
|
"loss": 5.0704, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 0.00579, |
|
"grad_norm": 0.7590562073625285, |
|
"learning_rate": 0.0017369999999999998, |
|
"loss": 5.058, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 0.0058, |
|
"grad_norm": 0.6385326977156373, |
|
"learning_rate": 0.00174, |
|
"loss": 5.0662, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.00581, |
|
"grad_norm": 0.5982129206576257, |
|
"learning_rate": 0.001743, |
|
"loss": 5.0624, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 0.00582, |
|
"grad_norm": 0.7358321954275717, |
|
"learning_rate": 0.001746, |
|
"loss": 5.0371, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 0.00583, |
|
"grad_norm": 0.790704997209863, |
|
"learning_rate": 0.001749, |
|
"loss": 5.0641, |
|
"step": 583 |
|
}, |
|
{ |
|
"epoch": 0.00584, |
|
"grad_norm": 0.7351931257233056, |
|
"learning_rate": 0.0017519999999999999, |
|
"loss": 5.0549, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 0.00585, |
|
"grad_norm": 0.6167540927458872, |
|
"learning_rate": 0.0017549999999999998, |
|
"loss": 5.0362, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.00586, |
|
"grad_norm": 0.6143004439139178, |
|
"learning_rate": 0.001758, |
|
"loss": 5.0316, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 0.00587, |
|
"grad_norm": 0.619830997492515, |
|
"learning_rate": 0.001761, |
|
"loss": 5.018, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 0.00588, |
|
"grad_norm": 0.676754014852622, |
|
"learning_rate": 0.001764, |
|
"loss": 5.0113, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 0.00589, |
|
"grad_norm": 0.8152367183756798, |
|
"learning_rate": 0.001767, |
|
"loss": 5.0063, |
|
"step": 589 |
|
}, |
|
{ |
|
"epoch": 0.0059, |
|
"grad_norm": 0.8067730505459064, |
|
"learning_rate": 0.0017699999999999999, |
|
"loss": 5.0009, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.00591, |
|
"grad_norm": 0.7857115694134444, |
|
"learning_rate": 0.001773, |
|
"loss": 5.0014, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 0.00592, |
|
"grad_norm": 1.0305248086573016, |
|
"learning_rate": 0.001776, |
|
"loss": 5.0384, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 0.00593, |
|
"grad_norm": 1.1034322728795254, |
|
"learning_rate": 0.001779, |
|
"loss": 5.0387, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 0.00594, |
|
"grad_norm": 0.796782800390064, |
|
"learning_rate": 0.001782, |
|
"loss": 5.0138, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 0.00595, |
|
"grad_norm": 0.947951394101208, |
|
"learning_rate": 0.001785, |
|
"loss": 5.0288, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.00596, |
|
"grad_norm": 0.948001541672118, |
|
"learning_rate": 0.0017879999999999999, |
|
"loss": 5.0236, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 0.00597, |
|
"grad_norm": 0.8614311642183788, |
|
"learning_rate": 0.001791, |
|
"loss": 4.983, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 0.00598, |
|
"grad_norm": 0.7822502963641336, |
|
"learning_rate": 0.001794, |
|
"loss": 5.0032, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 0.00599, |
|
"grad_norm": 0.662502314672726, |
|
"learning_rate": 0.001797, |
|
"loss": 4.9824, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 0.006, |
|
"grad_norm": 0.612691797581944, |
|
"learning_rate": 0.0018, |
|
"loss": 4.9865, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.00601, |
|
"grad_norm": 0.6872791409552744, |
|
"learning_rate": 0.001803, |
|
"loss": 4.9768, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 0.00602, |
|
"grad_norm": 0.6805787880992017, |
|
"learning_rate": 0.0018059999999999999, |
|
"loss": 4.9611, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 0.00603, |
|
"grad_norm": 0.6726518476757681, |
|
"learning_rate": 0.001809, |
|
"loss": 4.9863, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 0.00604, |
|
"grad_norm": 0.6569683723636671, |
|
"learning_rate": 0.001812, |
|
"loss": 4.9594, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 0.00605, |
|
"grad_norm": 0.7635898698708745, |
|
"learning_rate": 0.001815, |
|
"loss": 4.9733, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.00606, |
|
"grad_norm": 0.7455554600622951, |
|
"learning_rate": 0.001818, |
|
"loss": 4.9743, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 0.00607, |
|
"grad_norm": 0.7493825131119356, |
|
"learning_rate": 0.001821, |
|
"loss": 4.968, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 0.00608, |
|
"grad_norm": 0.7957828614510537, |
|
"learning_rate": 0.001824, |
|
"loss": 4.9536, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 0.00609, |
|
"grad_norm": 0.8304899179729943, |
|
"learning_rate": 0.001827, |
|
"loss": 4.9564, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 0.0061, |
|
"grad_norm": 0.8109281284885322, |
|
"learning_rate": 0.00183, |
|
"loss": 4.9141, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.00611, |
|
"grad_norm": 0.7476158062529945, |
|
"learning_rate": 0.001833, |
|
"loss": 4.936, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 0.00612, |
|
"grad_norm": 0.7704555466663776, |
|
"learning_rate": 0.001836, |
|
"loss": 4.9262, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 0.00613, |
|
"grad_norm": 0.7179188189545482, |
|
"learning_rate": 0.001839, |
|
"loss": 4.9285, |
|
"step": 613 |
|
}, |
|
{ |
|
"epoch": 0.00614, |
|
"grad_norm": 0.6487628040476678, |
|
"learning_rate": 0.001842, |
|
"loss": 4.9483, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 0.00615, |
|
"grad_norm": 0.6346341426047015, |
|
"learning_rate": 0.001845, |
|
"loss": 4.9336, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.00616, |
|
"grad_norm": 0.6919482296512011, |
|
"learning_rate": 0.001848, |
|
"loss": 4.92, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 0.00617, |
|
"grad_norm": 0.8470324326487232, |
|
"learning_rate": 0.001851, |
|
"loss": 4.937, |
|
"step": 617 |
|
}, |
|
{ |
|
"epoch": 0.00618, |
|
"grad_norm": 0.9780451420741652, |
|
"learning_rate": 0.001854, |
|
"loss": 4.918, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 0.00619, |
|
"grad_norm": 1.0393940937639112, |
|
"learning_rate": 0.001857, |
|
"loss": 4.9393, |
|
"step": 619 |
|
}, |
|
{ |
|
"epoch": 0.0062, |
|
"grad_norm": 1.0864888594503201, |
|
"learning_rate": 0.00186, |
|
"loss": 4.9315, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.00621, |
|
"grad_norm": 1.1243573748169302, |
|
"learning_rate": 0.001863, |
|
"loss": 4.931, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 0.00622, |
|
"grad_norm": 1.2701399731469665, |
|
"learning_rate": 0.001866, |
|
"loss": 4.9502, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 0.00623, |
|
"grad_norm": 0.8273605340089976, |
|
"learning_rate": 0.001869, |
|
"loss": 4.9139, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 0.00624, |
|
"grad_norm": 0.9643311427939416, |
|
"learning_rate": 0.001872, |
|
"loss": 4.9253, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 0.00625, |
|
"grad_norm": 1.2310098040523034, |
|
"learning_rate": 0.001875, |
|
"loss": 4.9539, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.00626, |
|
"grad_norm": 0.8268848993203812, |
|
"learning_rate": 0.0018780000000000001, |
|
"loss": 4.9199, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 0.00627, |
|
"grad_norm": 0.9575639973355906, |
|
"learning_rate": 0.001881, |
|
"loss": 4.909, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 0.00628, |
|
"grad_norm": 0.9391450767019313, |
|
"learning_rate": 0.001884, |
|
"loss": 4.9487, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 0.00629, |
|
"grad_norm": 0.8387975784827085, |
|
"learning_rate": 0.001887, |
|
"loss": 4.8976, |
|
"step": 629 |
|
}, |
|
{ |
|
"epoch": 0.0063, |
|
"grad_norm": 0.7745062985579545, |
|
"learning_rate": 0.00189, |
|
"loss": 4.9062, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.00631, |
|
"grad_norm": 0.8213366097020988, |
|
"learning_rate": 0.0018930000000000002, |
|
"loss": 4.9006, |
|
"step": 631 |
|
}, |
|
{ |
|
"epoch": 0.00632, |
|
"grad_norm": 0.8670608316656828, |
|
"learning_rate": 0.0018960000000000001, |
|
"loss": 4.9226, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 0.00633, |
|
"grad_norm": 0.9183940724690223, |
|
"learning_rate": 0.001899, |
|
"loss": 4.9055, |
|
"step": 633 |
|
}, |
|
{ |
|
"epoch": 0.00634, |
|
"grad_norm": 0.8953336214105924, |
|
"learning_rate": 0.001902, |
|
"loss": 4.8931, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 0.00635, |
|
"grad_norm": 0.8869843611928877, |
|
"learning_rate": 0.001905, |
|
"loss": 4.8851, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.00636, |
|
"grad_norm": 1.014617544478078, |
|
"learning_rate": 0.001908, |
|
"loss": 4.911, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 0.00637, |
|
"grad_norm": 0.8681767489690893, |
|
"learning_rate": 0.0019110000000000002, |
|
"loss": 4.909, |
|
"step": 637 |
|
}, |
|
{ |
|
"epoch": 0.00638, |
|
"grad_norm": 0.9792692499883113, |
|
"learning_rate": 0.0019140000000000001, |
|
"loss": 4.9073, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 0.00639, |
|
"grad_norm": 0.8702384519183142, |
|
"learning_rate": 0.001917, |
|
"loss": 4.8724, |
|
"step": 639 |
|
}, |
|
{ |
|
"epoch": 0.0064, |
|
"grad_norm": 0.7656979903996737, |
|
"learning_rate": 0.00192, |
|
"loss": 4.8809, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.00641, |
|
"grad_norm": 0.7895438458212233, |
|
"learning_rate": 0.001923, |
|
"loss": 4.8762, |
|
"step": 641 |
|
}, |
|
{ |
|
"epoch": 0.00642, |
|
"grad_norm": 0.8281699495124627, |
|
"learning_rate": 0.001926, |
|
"loss": 4.8749, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 0.00643, |
|
"grad_norm": 0.7845360432833325, |
|
"learning_rate": 0.0019290000000000002, |
|
"loss": 4.8856, |
|
"step": 643 |
|
}, |
|
{ |
|
"epoch": 0.00644, |
|
"grad_norm": 0.6857666802048429, |
|
"learning_rate": 0.0019320000000000001, |
|
"loss": 4.8583, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 0.00645, |
|
"grad_norm": 0.5056941566313082, |
|
"learning_rate": 0.001935, |
|
"loss": 4.8522, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.00646, |
|
"grad_norm": 0.47964630717910517, |
|
"learning_rate": 0.001938, |
|
"loss": 4.8628, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 0.00647, |
|
"grad_norm": 0.519046406898008, |
|
"learning_rate": 0.001941, |
|
"loss": 4.8552, |
|
"step": 647 |
|
}, |
|
{ |
|
"epoch": 0.00648, |
|
"grad_norm": 0.4935185080219269, |
|
"learning_rate": 0.0019440000000000002, |
|
"loss": 4.8488, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 0.00649, |
|
"grad_norm": 0.5205599594018984, |
|
"learning_rate": 0.0019470000000000002, |
|
"loss": 4.8333, |
|
"step": 649 |
|
}, |
|
{ |
|
"epoch": 0.0065, |
|
"grad_norm": 0.45295352715192905, |
|
"learning_rate": 0.0019500000000000001, |
|
"loss": 4.8617, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.00651, |
|
"grad_norm": 0.4378817965282719, |
|
"learning_rate": 0.001953, |
|
"loss": 4.84, |
|
"step": 651 |
|
}, |
|
{ |
|
"epoch": 0.00652, |
|
"grad_norm": 0.48395025642257355, |
|
"learning_rate": 0.0019560000000000003, |
|
"loss": 4.8266, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 0.00653, |
|
"grad_norm": 0.5194845024092328, |
|
"learning_rate": 0.0019590000000000002, |
|
"loss": 4.8294, |
|
"step": 653 |
|
}, |
|
{ |
|
"epoch": 0.00654, |
|
"grad_norm": 0.5707068593609262, |
|
"learning_rate": 0.001962, |
|
"loss": 4.8026, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 0.00655, |
|
"grad_norm": 0.6725172209521785, |
|
"learning_rate": 0.001965, |
|
"loss": 4.834, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.00656, |
|
"grad_norm": 0.7729568432311581, |
|
"learning_rate": 0.001968, |
|
"loss": 4.8398, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 0.00657, |
|
"grad_norm": 0.9182317536069756, |
|
"learning_rate": 0.001971, |
|
"loss": 4.8175, |
|
"step": 657 |
|
}, |
|
{ |
|
"epoch": 0.00658, |
|
"grad_norm": 0.9241834469814327, |
|
"learning_rate": 0.001974, |
|
"loss": 4.8482, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 0.00659, |
|
"grad_norm": 0.7554289117841486, |
|
"learning_rate": 0.001977, |
|
"loss": 4.8199, |
|
"step": 659 |
|
}, |
|
{ |
|
"epoch": 0.0066, |
|
"grad_norm": 0.786222644002383, |
|
"learning_rate": 0.00198, |
|
"loss": 4.8336, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.00661, |
|
"grad_norm": 0.7486956414167568, |
|
"learning_rate": 0.001983, |
|
"loss": 4.8035, |
|
"step": 661 |
|
}, |
|
{ |
|
"epoch": 0.00662, |
|
"grad_norm": 0.8224936329580866, |
|
"learning_rate": 0.0019860000000000004, |
|
"loss": 4.8038, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 0.00663, |
|
"grad_norm": 1.0333891900276588, |
|
"learning_rate": 0.0019890000000000003, |
|
"loss": 4.8216, |
|
"step": 663 |
|
}, |
|
{ |
|
"epoch": 0.00664, |
|
"grad_norm": 1.0454184895583787, |
|
"learning_rate": 0.0019920000000000003, |
|
"loss": 4.826, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 0.00665, |
|
"grad_norm": 0.9684168839706867, |
|
"learning_rate": 0.0019950000000000002, |
|
"loss": 4.7952, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.00666, |
|
"grad_norm": 1.0353944208678731, |
|
"learning_rate": 0.001998, |
|
"loss": 4.8213, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 0.00667, |
|
"grad_norm": 0.9360230286599108, |
|
"learning_rate": 0.002001, |
|
"loss": 4.8166, |
|
"step": 667 |
|
}, |
|
{ |
|
"epoch": 0.00668, |
|
"grad_norm": 0.7256915823313501, |
|
"learning_rate": 0.002004, |
|
"loss": 4.83, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 0.00669, |
|
"grad_norm": 0.8394069732991961, |
|
"learning_rate": 0.002007, |
|
"loss": 4.8494, |
|
"step": 669 |
|
}, |
|
{ |
|
"epoch": 0.0067, |
|
"grad_norm": 1.077577977883947, |
|
"learning_rate": 0.00201, |
|
"loss": 4.8532, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.00671, |
|
"grad_norm": 0.944781077919628, |
|
"learning_rate": 0.002013, |
|
"loss": 4.8236, |
|
"step": 671 |
|
}, |
|
{ |
|
"epoch": 0.00672, |
|
"grad_norm": 0.7024086289087288, |
|
"learning_rate": 0.002016, |
|
"loss": 4.7966, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 0.00673, |
|
"grad_norm": 0.6931174948166909, |
|
"learning_rate": 0.002019, |
|
"loss": 4.809, |
|
"step": 673 |
|
}, |
|
{ |
|
"epoch": 0.00674, |
|
"grad_norm": 0.6108100930625565, |
|
"learning_rate": 0.0020220000000000004, |
|
"loss": 4.8005, |
|
"step": 674 |
|
}, |
|
{ |
|
"epoch": 0.00675, |
|
"grad_norm": 0.5498762136107717, |
|
"learning_rate": 0.0020250000000000003, |
|
"loss": 4.8171, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.00676, |
|
"grad_norm": 0.6244881777878202, |
|
"learning_rate": 0.0020280000000000003, |
|
"loss": 4.7951, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 0.00677, |
|
"grad_norm": 0.5815667573949113, |
|
"learning_rate": 0.0020310000000000003, |
|
"loss": 4.7913, |
|
"step": 677 |
|
}, |
|
{ |
|
"epoch": 0.00678, |
|
"grad_norm": 0.521491074875552, |
|
"learning_rate": 0.0020340000000000002, |
|
"loss": 4.7959, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 0.00679, |
|
"grad_norm": 0.5801192243287052, |
|
"learning_rate": 0.002037, |
|
"loss": 4.7864, |
|
"step": 679 |
|
}, |
|
{ |
|
"epoch": 0.0068, |
|
"grad_norm": 0.6398788453868361, |
|
"learning_rate": 0.00204, |
|
"loss": 4.7875, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.00681, |
|
"grad_norm": 0.6456567912241706, |
|
"learning_rate": 0.002043, |
|
"loss": 4.7807, |
|
"step": 681 |
|
}, |
|
{ |
|
"epoch": 0.00682, |
|
"grad_norm": 0.6889535888719373, |
|
"learning_rate": 0.002046, |
|
"loss": 4.7776, |
|
"step": 682 |
|
}, |
|
{ |
|
"epoch": 0.00683, |
|
"grad_norm": 0.68132819276425, |
|
"learning_rate": 0.002049, |
|
"loss": 4.7612, |
|
"step": 683 |
|
}, |
|
{ |
|
"epoch": 0.00684, |
|
"grad_norm": 0.6724100911918068, |
|
"learning_rate": 0.002052, |
|
"loss": 4.7814, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 0.00685, |
|
"grad_norm": 0.815928121483196, |
|
"learning_rate": 0.0020550000000000004, |
|
"loss": 4.7913, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.00686, |
|
"grad_norm": 0.7537150884203632, |
|
"learning_rate": 0.0020580000000000004, |
|
"loss": 4.7584, |
|
"step": 686 |
|
}, |
|
{ |
|
"epoch": 0.00687, |
|
"grad_norm": 0.7326770278926477, |
|
"learning_rate": 0.0020610000000000003, |
|
"loss": 4.7655, |
|
"step": 687 |
|
}, |
|
{ |
|
"epoch": 0.00688, |
|
"grad_norm": 0.7145131916015852, |
|
"learning_rate": 0.002064, |
|
"loss": 4.7631, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 0.00689, |
|
"grad_norm": 0.694828893634318, |
|
"learning_rate": 0.002067, |
|
"loss": 4.7685, |
|
"step": 689 |
|
}, |
|
{ |
|
"epoch": 0.0069, |
|
"grad_norm": 0.7402628986076095, |
|
"learning_rate": 0.00207, |
|
"loss": 4.7629, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.00691, |
|
"grad_norm": 0.7809038473743037, |
|
"learning_rate": 0.0020729999999999998, |
|
"loss": 4.734, |
|
"step": 691 |
|
}, |
|
{ |
|
"epoch": 0.00692, |
|
"grad_norm": 0.9321737974219415, |
|
"learning_rate": 0.0020759999999999997, |
|
"loss": 4.7623, |
|
"step": 692 |
|
}, |
|
{ |
|
"epoch": 0.00693, |
|
"grad_norm": 1.0940875671633443, |
|
"learning_rate": 0.0020789999999999997, |
|
"loss": 4.788, |
|
"step": 693 |
|
}, |
|
{ |
|
"epoch": 0.00694, |
|
"grad_norm": 0.9030072391970948, |
|
"learning_rate": 0.002082, |
|
"loss": 4.7582, |
|
"step": 694 |
|
}, |
|
{ |
|
"epoch": 0.00695, |
|
"grad_norm": 0.8857004379777869, |
|
"learning_rate": 0.002085, |
|
"loss": 4.7649, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.00696, |
|
"grad_norm": 0.8991843577664433, |
|
"learning_rate": 0.002088, |
|
"loss": 4.7712, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 0.00697, |
|
"grad_norm": 1.0094726318481275, |
|
"learning_rate": 0.002091, |
|
"loss": 4.7869, |
|
"step": 697 |
|
}, |
|
{ |
|
"epoch": 0.00698, |
|
"grad_norm": 1.09376996979269, |
|
"learning_rate": 0.002094, |
|
"loss": 4.7773, |
|
"step": 698 |
|
}, |
|
{ |
|
"epoch": 0.00699, |
|
"grad_norm": 0.8766228601427865, |
|
"learning_rate": 0.002097, |
|
"loss": 4.7548, |
|
"step": 699 |
|
}, |
|
{ |
|
"epoch": 0.007, |
|
"grad_norm": 0.8580408760742435, |
|
"learning_rate": 0.0021, |
|
"loss": 4.7811, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.00701, |
|
"grad_norm": 0.7398805390269545, |
|
"learning_rate": 0.002103, |
|
"loss": 4.7496, |
|
"step": 701 |
|
}, |
|
{ |
|
"epoch": 0.00702, |
|
"grad_norm": 0.8329034135969645, |
|
"learning_rate": 0.002106, |
|
"loss": 4.762, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 0.00703, |
|
"grad_norm": 0.8505784581176613, |
|
"learning_rate": 0.0021089999999999998, |
|
"loss": 4.7427, |
|
"step": 703 |
|
}, |
|
{ |
|
"epoch": 0.00704, |
|
"grad_norm": 0.8003313240374758, |
|
"learning_rate": 0.0021119999999999997, |
|
"loss": 4.7516, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 0.00705, |
|
"grad_norm": 0.7109629500256387, |
|
"learning_rate": 0.002115, |
|
"loss": 4.7635, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.00706, |
|
"grad_norm": 0.6289559048526784, |
|
"learning_rate": 0.002118, |
|
"loss": 4.7473, |
|
"step": 706 |
|
}, |
|
{ |
|
"epoch": 0.00707, |
|
"grad_norm": 0.5482382237711074, |
|
"learning_rate": 0.002121, |
|
"loss": 4.7193, |
|
"step": 707 |
|
}, |
|
{ |
|
"epoch": 0.00708, |
|
"grad_norm": 0.5598390325768088, |
|
"learning_rate": 0.002124, |
|
"loss": 4.7015, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 0.00709, |
|
"grad_norm": 0.5529551760214286, |
|
"learning_rate": 0.002127, |
|
"loss": 4.7331, |
|
"step": 709 |
|
}, |
|
{ |
|
"epoch": 0.0071, |
|
"grad_norm": 0.6671084870548735, |
|
"learning_rate": 0.00213, |
|
"loss": 4.7271, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.00711, |
|
"grad_norm": 1.1336515667563218, |
|
"learning_rate": 0.002133, |
|
"loss": 4.7225, |
|
"step": 711 |
|
}, |
|
{ |
|
"epoch": 0.00712, |
|
"grad_norm": 0.8878745395415352, |
|
"learning_rate": 0.002136, |
|
"loss": 4.761, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 0.00713, |
|
"grad_norm": 0.6767692274074788, |
|
"learning_rate": 0.002139, |
|
"loss": 4.7547, |
|
"step": 713 |
|
}, |
|
{ |
|
"epoch": 0.00714, |
|
"grad_norm": 0.7698487468076475, |
|
"learning_rate": 0.002142, |
|
"loss": 4.7544, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 0.00715, |
|
"grad_norm": 0.6189384208707256, |
|
"learning_rate": 0.0021449999999999998, |
|
"loss": 4.7077, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.00716, |
|
"grad_norm": 0.5968344999809513, |
|
"learning_rate": 0.002148, |
|
"loss": 4.7064, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 0.00717, |
|
"grad_norm": 0.5117629580972962, |
|
"learning_rate": 0.002151, |
|
"loss": 4.7111, |
|
"step": 717 |
|
}, |
|
{ |
|
"epoch": 0.00718, |
|
"grad_norm": 0.5586203984201494, |
|
"learning_rate": 0.002154, |
|
"loss": 4.7079, |
|
"step": 718 |
|
}, |
|
{ |
|
"epoch": 0.00719, |
|
"grad_norm": 0.5075479633130432, |
|
"learning_rate": 0.002157, |
|
"loss": 4.7121, |
|
"step": 719 |
|
}, |
|
{ |
|
"epoch": 0.0072, |
|
"grad_norm": 0.4840284295584723, |
|
"learning_rate": 0.00216, |
|
"loss": 4.7045, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.00721, |
|
"grad_norm": 0.4971167150526394, |
|
"learning_rate": 0.002163, |
|
"loss": 4.7056, |
|
"step": 721 |
|
}, |
|
{ |
|
"epoch": 0.00722, |
|
"grad_norm": 0.5347204422021664, |
|
"learning_rate": 0.002166, |
|
"loss": 4.6807, |
|
"step": 722 |
|
}, |
|
{ |
|
"epoch": 0.00723, |
|
"grad_norm": 0.5424206330484346, |
|
"learning_rate": 0.002169, |
|
"loss": 4.6942, |
|
"step": 723 |
|
}, |
|
{ |
|
"epoch": 0.00724, |
|
"grad_norm": 0.5219918511718452, |
|
"learning_rate": 0.002172, |
|
"loss": 4.6912, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 0.00725, |
|
"grad_norm": 0.4975444213381179, |
|
"learning_rate": 0.002175, |
|
"loss": 4.6834, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.00726, |
|
"grad_norm": 0.5840922524665152, |
|
"learning_rate": 0.002178, |
|
"loss": 4.7008, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 0.00727, |
|
"grad_norm": 0.7633698368434719, |
|
"learning_rate": 0.0021809999999999998, |
|
"loss": 4.6777, |
|
"step": 727 |
|
}, |
|
{ |
|
"epoch": 0.00728, |
|
"grad_norm": 0.9052599955462425, |
|
"learning_rate": 0.002184, |
|
"loss": 4.7084, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 0.00729, |
|
"grad_norm": 0.9052862109699649, |
|
"learning_rate": 0.002187, |
|
"loss": 4.7065, |
|
"step": 729 |
|
}, |
|
{ |
|
"epoch": 0.0073, |
|
"grad_norm": 0.9788098741089467, |
|
"learning_rate": 0.00219, |
|
"loss": 4.7163, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.00731, |
|
"grad_norm": 0.9218858714566311, |
|
"learning_rate": 0.002193, |
|
"loss": 4.7193, |
|
"step": 731 |
|
}, |
|
{ |
|
"epoch": 0.00732, |
|
"grad_norm": 1.1412145043722297, |
|
"learning_rate": 0.002196, |
|
"loss": 4.7513, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 0.00733, |
|
"grad_norm": 1.0851884555194036, |
|
"learning_rate": 0.002199, |
|
"loss": 4.7135, |
|
"step": 733 |
|
}, |
|
{ |
|
"epoch": 0.00734, |
|
"grad_norm": 0.9295060612046938, |
|
"learning_rate": 0.002202, |
|
"loss": 4.745, |
|
"step": 734 |
|
}, |
|
{ |
|
"epoch": 0.00735, |
|
"grad_norm": 1.1744848346988581, |
|
"learning_rate": 0.002205, |
|
"loss": 4.766, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.00736, |
|
"grad_norm": 0.8278931321658332, |
|
"learning_rate": 0.002208, |
|
"loss": 4.7413, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 0.00737, |
|
"grad_norm": 0.8170748474782394, |
|
"learning_rate": 0.002211, |
|
"loss": 4.701, |
|
"step": 737 |
|
}, |
|
{ |
|
"epoch": 0.00738, |
|
"grad_norm": 0.772640277783885, |
|
"learning_rate": 0.002214, |
|
"loss": 4.7083, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 0.00739, |
|
"grad_norm": 0.6328057525928463, |
|
"learning_rate": 0.0022170000000000002, |
|
"loss": 4.6886, |
|
"step": 739 |
|
}, |
|
{ |
|
"epoch": 0.0074, |
|
"grad_norm": 0.6456777453177256, |
|
"learning_rate": 0.00222, |
|
"loss": 4.7001, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.00741, |
|
"grad_norm": 0.5912580097221507, |
|
"learning_rate": 0.002223, |
|
"loss": 4.6652, |
|
"step": 741 |
|
}, |
|
{ |
|
"epoch": 0.00742, |
|
"grad_norm": 0.4622238730689768, |
|
"learning_rate": 0.002226, |
|
"loss": 4.6972, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 0.00743, |
|
"grad_norm": 0.4904966133460914, |
|
"learning_rate": 0.002229, |
|
"loss": 4.6585, |
|
"step": 743 |
|
}, |
|
{ |
|
"epoch": 0.00744, |
|
"grad_norm": 0.5611948954376664, |
|
"learning_rate": 0.002232, |
|
"loss": 4.6643, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 0.00745, |
|
"grad_norm": 0.6222946629538655, |
|
"learning_rate": 0.002235, |
|
"loss": 4.6777, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.00746, |
|
"grad_norm": 0.6678221674623648, |
|
"learning_rate": 0.002238, |
|
"loss": 4.6735, |
|
"step": 746 |
|
}, |
|
{ |
|
"epoch": 0.00747, |
|
"grad_norm": 0.7945455347777014, |
|
"learning_rate": 0.002241, |
|
"loss": 4.6637, |
|
"step": 747 |
|
}, |
|
{ |
|
"epoch": 0.00748, |
|
"grad_norm": 0.9027593299924651, |
|
"learning_rate": 0.002244, |
|
"loss": 4.683, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 0.00749, |
|
"grad_norm": 0.8224517051186598, |
|
"learning_rate": 0.002247, |
|
"loss": 4.6612, |
|
"step": 749 |
|
}, |
|
{ |
|
"epoch": 0.0075, |
|
"grad_norm": 0.6737194715747743, |
|
"learning_rate": 0.0022500000000000003, |
|
"loss": 4.6955, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.00751, |
|
"grad_norm": 0.7232822830562855, |
|
"learning_rate": 0.0022530000000000002, |
|
"loss": 4.6893, |
|
"step": 751 |
|
}, |
|
{ |
|
"epoch": 0.00752, |
|
"grad_norm": 0.6449225627821611, |
|
"learning_rate": 0.002256, |
|
"loss": 4.637, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 0.00753, |
|
"grad_norm": 0.6558342053271724, |
|
"learning_rate": 0.002259, |
|
"loss": 4.6782, |
|
"step": 753 |
|
}, |
|
{ |
|
"epoch": 0.00754, |
|
"grad_norm": 0.7056102529212358, |
|
"learning_rate": 0.002262, |
|
"loss": 4.6906, |
|
"step": 754 |
|
}, |
|
{ |
|
"epoch": 0.00755, |
|
"grad_norm": 0.9043185481447295, |
|
"learning_rate": 0.002265, |
|
"loss": 4.6606, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.00756, |
|
"grad_norm": 0.8770375990036015, |
|
"learning_rate": 0.002268, |
|
"loss": 4.6551, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 0.00757, |
|
"grad_norm": 0.784655485815756, |
|
"learning_rate": 0.002271, |
|
"loss": 4.6655, |
|
"step": 757 |
|
}, |
|
{ |
|
"epoch": 0.00758, |
|
"grad_norm": 0.6912079950422929, |
|
"learning_rate": 0.002274, |
|
"loss": 4.655, |
|
"step": 758 |
|
}, |
|
{ |
|
"epoch": 0.00759, |
|
"grad_norm": 0.6281371231708264, |
|
"learning_rate": 0.002277, |
|
"loss": 4.6629, |
|
"step": 759 |
|
}, |
|
{ |
|
"epoch": 0.0076, |
|
"grad_norm": 0.5992557531443571, |
|
"learning_rate": 0.00228, |
|
"loss": 4.6463, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.00761, |
|
"grad_norm": 0.6034804931159479, |
|
"learning_rate": 0.002283, |
|
"loss": 4.6146, |
|
"step": 761 |
|
}, |
|
{ |
|
"epoch": 0.00762, |
|
"grad_norm": 0.6060668832918386, |
|
"learning_rate": 0.0022860000000000003, |
|
"loss": 4.5953, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 0.00763, |
|
"grad_norm": 0.5816892837581542, |
|
"learning_rate": 0.0022890000000000002, |
|
"loss": 4.6459, |
|
"step": 763 |
|
}, |
|
{ |
|
"epoch": 0.00764, |
|
"grad_norm": 0.599786845424844, |
|
"learning_rate": 0.002292, |
|
"loss": 4.6554, |
|
"step": 764 |
|
}, |
|
{ |
|
"epoch": 0.00765, |
|
"grad_norm": 0.6640243443171511, |
|
"learning_rate": 0.002295, |
|
"loss": 4.6451, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.00766, |
|
"grad_norm": 0.6663176934335011, |
|
"learning_rate": 0.002298, |
|
"loss": 4.6438, |
|
"step": 766 |
|
}, |
|
{ |
|
"epoch": 0.00767, |
|
"grad_norm": 0.6384162747773746, |
|
"learning_rate": 0.002301, |
|
"loss": 4.6392, |
|
"step": 767 |
|
}, |
|
{ |
|
"epoch": 0.00768, |
|
"grad_norm": 0.8537658256313299, |
|
"learning_rate": 0.002304, |
|
"loss": 4.6457, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 0.00769, |
|
"grad_norm": 1.0098120726516953, |
|
"learning_rate": 0.002307, |
|
"loss": 4.6694, |
|
"step": 769 |
|
}, |
|
{ |
|
"epoch": 0.0077, |
|
"grad_norm": 0.7423003820485509, |
|
"learning_rate": 0.00231, |
|
"loss": 4.6223, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.00771, |
|
"grad_norm": 0.6691351211965346, |
|
"learning_rate": 0.002313, |
|
"loss": 4.6569, |
|
"step": 771 |
|
}, |
|
{ |
|
"epoch": 0.00772, |
|
"grad_norm": 0.8185609860643415, |
|
"learning_rate": 0.002316, |
|
"loss": 4.6531, |
|
"step": 772 |
|
}, |
|
{ |
|
"epoch": 0.00773, |
|
"grad_norm": 1.0039613061235502, |
|
"learning_rate": 0.0023190000000000003, |
|
"loss": 4.6664, |
|
"step": 773 |
|
}, |
|
{ |
|
"epoch": 0.00774, |
|
"grad_norm": 0.8500607234716588, |
|
"learning_rate": 0.0023220000000000003, |
|
"loss": 4.6845, |
|
"step": 774 |
|
}, |
|
{ |
|
"epoch": 0.00775, |
|
"grad_norm": 0.7057192127032131, |
|
"learning_rate": 0.0023250000000000002, |
|
"loss": 4.6688, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.00776, |
|
"grad_norm": 0.8527617729361273, |
|
"learning_rate": 0.002328, |
|
"loss": 4.6732, |
|
"step": 776 |
|
}, |
|
{ |
|
"epoch": 0.00777, |
|
"grad_norm": 0.7987783133918631, |
|
"learning_rate": 0.002331, |
|
"loss": 4.6469, |
|
"step": 777 |
|
}, |
|
{ |
|
"epoch": 0.00778, |
|
"grad_norm": 0.8221364860163118, |
|
"learning_rate": 0.002334, |
|
"loss": 4.6645, |
|
"step": 778 |
|
}, |
|
{ |
|
"epoch": 0.00779, |
|
"grad_norm": 0.8901185821524535, |
|
"learning_rate": 0.002337, |
|
"loss": 4.6243, |
|
"step": 779 |
|
}, |
|
{ |
|
"epoch": 0.0078, |
|
"grad_norm": 0.889956887452623, |
|
"learning_rate": 0.00234, |
|
"loss": 4.646, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.00781, |
|
"grad_norm": 0.763195026878423, |
|
"learning_rate": 0.002343, |
|
"loss": 4.6465, |
|
"step": 781 |
|
}, |
|
{ |
|
"epoch": 0.00782, |
|
"grad_norm": 0.6941360203492353, |
|
"learning_rate": 0.002346, |
|
"loss": 4.6378, |
|
"step": 782 |
|
}, |
|
{ |
|
"epoch": 0.00783, |
|
"grad_norm": 0.6504570243922513, |
|
"learning_rate": 0.002349, |
|
"loss": 4.6195, |
|
"step": 783 |
|
}, |
|
{ |
|
"epoch": 0.00784, |
|
"grad_norm": 0.5612286515783087, |
|
"learning_rate": 0.002352, |
|
"loss": 4.6293, |
|
"step": 784 |
|
}, |
|
{ |
|
"epoch": 0.00785, |
|
"grad_norm": 0.6491841258536649, |
|
"learning_rate": 0.0023550000000000003, |
|
"loss": 4.6258, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.00786, |
|
"grad_norm": 0.7618604140557245, |
|
"learning_rate": 0.0023580000000000003, |
|
"loss": 4.6353, |
|
"step": 786 |
|
}, |
|
{ |
|
"epoch": 0.00787, |
|
"grad_norm": 0.9289251225559676, |
|
"learning_rate": 0.0023610000000000003, |
|
"loss": 4.6478, |
|
"step": 787 |
|
}, |
|
{ |
|
"epoch": 0.00788, |
|
"grad_norm": 1.0160410825626025, |
|
"learning_rate": 0.002364, |
|
"loss": 4.6569, |
|
"step": 788 |
|
}, |
|
{ |
|
"epoch": 0.00789, |
|
"grad_norm": 0.9904632547103471, |
|
"learning_rate": 0.002367, |
|
"loss": 4.6841, |
|
"step": 789 |
|
}, |
|
{ |
|
"epoch": 0.0079, |
|
"grad_norm": 0.978549262142993, |
|
"learning_rate": 0.00237, |
|
"loss": 4.6525, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.00791, |
|
"grad_norm": 0.7626000247604355, |
|
"learning_rate": 0.002373, |
|
"loss": 4.6008, |
|
"step": 791 |
|
}, |
|
{ |
|
"epoch": 0.00792, |
|
"grad_norm": 0.7032433100729657, |
|
"learning_rate": 0.002376, |
|
"loss": 4.6361, |
|
"step": 792 |
|
}, |
|
{ |
|
"epoch": 0.00793, |
|
"grad_norm": 0.6164393933780811, |
|
"learning_rate": 0.002379, |
|
"loss": 4.6263, |
|
"step": 793 |
|
}, |
|
{ |
|
"epoch": 0.00794, |
|
"grad_norm": 0.6849838631141278, |
|
"learning_rate": 0.002382, |
|
"loss": 4.6402, |
|
"step": 794 |
|
}, |
|
{ |
|
"epoch": 0.00795, |
|
"grad_norm": 0.7541453624973187, |
|
"learning_rate": 0.002385, |
|
"loss": 4.6181, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.00796, |
|
"grad_norm": 0.7900314117496404, |
|
"learning_rate": 0.0023880000000000004, |
|
"loss": 4.6154, |
|
"step": 796 |
|
}, |
|
{ |
|
"epoch": 0.00797, |
|
"grad_norm": 0.8032302751261208, |
|
"learning_rate": 0.0023910000000000003, |
|
"loss": 4.6598, |
|
"step": 797 |
|
}, |
|
{ |
|
"epoch": 0.00798, |
|
"grad_norm": 0.7037506066431396, |
|
"learning_rate": 0.0023940000000000003, |
|
"loss": 4.6206, |
|
"step": 798 |
|
}, |
|
{ |
|
"epoch": 0.00799, |
|
"grad_norm": 0.6873143294249761, |
|
"learning_rate": 0.0023970000000000003, |
|
"loss": 4.6565, |
|
"step": 799 |
|
}, |
|
{ |
|
"epoch": 0.008, |
|
"grad_norm": 0.5611968184575689, |
|
"learning_rate": 0.0024000000000000002, |
|
"loss": 4.6002, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.00801, |
|
"grad_norm": 0.6431524925156966, |
|
"learning_rate": 0.002403, |
|
"loss": 4.6326, |
|
"step": 801 |
|
}, |
|
{ |
|
"epoch": 0.00802, |
|
"grad_norm": 0.664361851854149, |
|
"learning_rate": 0.002406, |
|
"loss": 4.624, |
|
"step": 802 |
|
}, |
|
{ |
|
"epoch": 0.00803, |
|
"grad_norm": 0.7385532765716779, |
|
"learning_rate": 0.002409, |
|
"loss": 4.6034, |
|
"step": 803 |
|
}, |
|
{ |
|
"epoch": 0.00804, |
|
"grad_norm": 0.6763586866376322, |
|
"learning_rate": 0.002412, |
|
"loss": 4.609, |
|
"step": 804 |
|
}, |
|
{ |
|
"epoch": 0.00805, |
|
"grad_norm": 0.4984516329675046, |
|
"learning_rate": 0.002415, |
|
"loss": 4.5958, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.00806, |
|
"grad_norm": 0.6422151194456196, |
|
"learning_rate": 0.002418, |
|
"loss": 4.6202, |
|
"step": 806 |
|
}, |
|
{ |
|
"epoch": 0.00807, |
|
"grad_norm": 0.6520347813599764, |
|
"learning_rate": 0.0024210000000000004, |
|
"loss": 4.6111, |
|
"step": 807 |
|
}, |
|
{ |
|
"epoch": 0.00808, |
|
"grad_norm": 0.6466117231395049, |
|
"learning_rate": 0.0024240000000000004, |
|
"loss": 4.5938, |
|
"step": 808 |
|
}, |
|
{ |
|
"epoch": 0.00809, |
|
"grad_norm": 0.7173763198404414, |
|
"learning_rate": 0.0024270000000000003, |
|
"loss": 4.5875, |
|
"step": 809 |
|
}, |
|
{ |
|
"epoch": 0.0081, |
|
"grad_norm": 0.7630708316428362, |
|
"learning_rate": 0.0024300000000000003, |
|
"loss": 4.6257, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.00811, |
|
"grad_norm": 0.6597258634486322, |
|
"learning_rate": 0.0024330000000000003, |
|
"loss": 4.5916, |
|
"step": 811 |
|
}, |
|
{ |
|
"epoch": 0.00812, |
|
"grad_norm": 0.5596135689444884, |
|
"learning_rate": 0.0024360000000000002, |
|
"loss": 4.5639, |
|
"step": 812 |
|
}, |
|
{ |
|
"epoch": 0.00813, |
|
"grad_norm": 0.5455291997717118, |
|
"learning_rate": 0.0024389999999999998, |
|
"loss": 4.6039, |
|
"step": 813 |
|
}, |
|
{ |
|
"epoch": 0.00814, |
|
"grad_norm": 0.643394229982758, |
|
"learning_rate": 0.0024419999999999997, |
|
"loss": 4.6125, |
|
"step": 814 |
|
}, |
|
{ |
|
"epoch": 0.00815, |
|
"grad_norm": 0.7154153020141445, |
|
"learning_rate": 0.0024449999999999997, |
|
"loss": 4.6204, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.00816, |
|
"grad_norm": 0.7061775094598466, |
|
"learning_rate": 0.002448, |
|
"loss": 4.6005, |
|
"step": 816 |
|
}, |
|
{ |
|
"epoch": 0.00817, |
|
"grad_norm": 0.7497949470997279, |
|
"learning_rate": 0.002451, |
|
"loss": 4.6054, |
|
"step": 817 |
|
}, |
|
{ |
|
"epoch": 0.00818, |
|
"grad_norm": 0.8303821582765404, |
|
"learning_rate": 0.002454, |
|
"loss": 4.6047, |
|
"step": 818 |
|
}, |
|
{ |
|
"epoch": 0.00819, |
|
"grad_norm": 0.7870772660210968, |
|
"learning_rate": 0.002457, |
|
"loss": 4.6129, |
|
"step": 819 |
|
}, |
|
{ |
|
"epoch": 0.0082, |
|
"grad_norm": 0.8045799279983024, |
|
"learning_rate": 0.00246, |
|
"loss": 4.5959, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.00821, |
|
"grad_norm": 0.6847814476141592, |
|
"learning_rate": 0.002463, |
|
"loss": 4.5753, |
|
"step": 821 |
|
}, |
|
{ |
|
"epoch": 0.00822, |
|
"grad_norm": 0.6767342952639601, |
|
"learning_rate": 0.002466, |
|
"loss": 4.5743, |
|
"step": 822 |
|
}, |
|
{ |
|
"epoch": 0.00823, |
|
"grad_norm": 0.7814972414925468, |
|
"learning_rate": 0.002469, |
|
"loss": 4.5892, |
|
"step": 823 |
|
}, |
|
{ |
|
"epoch": 0.00824, |
|
"grad_norm": 0.7970591329920511, |
|
"learning_rate": 0.002472, |
|
"loss": 4.6124, |
|
"step": 824 |
|
}, |
|
{ |
|
"epoch": 0.00825, |
|
"grad_norm": 0.7237059608391628, |
|
"learning_rate": 0.0024749999999999998, |
|
"loss": 4.6158, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.00826, |
|
"grad_norm": 0.8825063151832095, |
|
"learning_rate": 0.0024779999999999997, |
|
"loss": 4.5838, |
|
"step": 826 |
|
}, |
|
{ |
|
"epoch": 0.00827, |
|
"grad_norm": 0.895197450200386, |
|
"learning_rate": 0.002481, |
|
"loss": 4.6224, |
|
"step": 827 |
|
}, |
|
{ |
|
"epoch": 0.00828, |
|
"grad_norm": 0.7760924275661016, |
|
"learning_rate": 0.002484, |
|
"loss": 4.6087, |
|
"step": 828 |
|
}, |
|
{ |
|
"epoch": 0.00829, |
|
"grad_norm": 0.8835354563964355, |
|
"learning_rate": 0.002487, |
|
"loss": 4.6157, |
|
"step": 829 |
|
}, |
|
{ |
|
"epoch": 0.0083, |
|
"grad_norm": 0.8984013262238549, |
|
"learning_rate": 0.00249, |
|
"loss": 4.6232, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.00831, |
|
"grad_norm": 0.8653834946214679, |
|
"learning_rate": 0.002493, |
|
"loss": 4.6038, |
|
"step": 831 |
|
}, |
|
{ |
|
"epoch": 0.00832, |
|
"grad_norm": 0.8641563564291244, |
|
"learning_rate": 0.002496, |
|
"loss": 4.6089, |
|
"step": 832 |
|
}, |
|
{ |
|
"epoch": 0.00833, |
|
"grad_norm": 0.8849940351412556, |
|
"learning_rate": 0.002499, |
|
"loss": 4.6069, |
|
"step": 833 |
|
}, |
|
{ |
|
"epoch": 0.00834, |
|
"grad_norm": 0.9333438912141526, |
|
"learning_rate": 0.002502, |
|
"loss": 4.6165, |
|
"step": 834 |
|
}, |
|
{ |
|
"epoch": 0.00835, |
|
"grad_norm": 0.8853225829488065, |
|
"learning_rate": 0.002505, |
|
"loss": 4.6108, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.00836, |
|
"grad_norm": 0.9042165265638229, |
|
"learning_rate": 0.002508, |
|
"loss": 4.6433, |
|
"step": 836 |
|
}, |
|
{ |
|
"epoch": 0.00837, |
|
"grad_norm": 0.8034248166459536, |
|
"learning_rate": 0.0025109999999999998, |
|
"loss": 4.5745, |
|
"step": 837 |
|
}, |
|
{ |
|
"epoch": 0.00838, |
|
"grad_norm": 0.8086998635281932, |
|
"learning_rate": 0.0025139999999999997, |
|
"loss": 4.6114, |
|
"step": 838 |
|
}, |
|
{ |
|
"epoch": 0.00839, |
|
"grad_norm": 0.587375456181757, |
|
"learning_rate": 0.002517, |
|
"loss": 4.5954, |
|
"step": 839 |
|
}, |
|
{ |
|
"epoch": 0.0084, |
|
"grad_norm": 0.6561997865962053, |
|
"learning_rate": 0.00252, |
|
"loss": 4.5818, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.00841, |
|
"grad_norm": 0.6860624256233325, |
|
"learning_rate": 0.002523, |
|
"loss": 4.5813, |
|
"step": 841 |
|
}, |
|
{ |
|
"epoch": 0.00842, |
|
"grad_norm": 0.6384378914075728, |
|
"learning_rate": 0.002526, |
|
"loss": 4.5783, |
|
"step": 842 |
|
}, |
|
{ |
|
"epoch": 0.00843, |
|
"grad_norm": 0.5397302637540236, |
|
"learning_rate": 0.002529, |
|
"loss": 4.5728, |
|
"step": 843 |
|
}, |
|
{ |
|
"epoch": 0.00844, |
|
"grad_norm": 0.5753282450552862, |
|
"learning_rate": 0.002532, |
|
"loss": 4.5912, |
|
"step": 844 |
|
}, |
|
{ |
|
"epoch": 0.00845, |
|
"grad_norm": 0.605188801053789, |
|
"learning_rate": 0.002535, |
|
"loss": 4.5588, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.00846, |
|
"grad_norm": 0.523231421872222, |
|
"learning_rate": 0.002538, |
|
"loss": 4.5771, |
|
"step": 846 |
|
}, |
|
{ |
|
"epoch": 0.00847, |
|
"grad_norm": 0.4442194357732046, |
|
"learning_rate": 0.002541, |
|
"loss": 4.5539, |
|
"step": 847 |
|
}, |
|
{ |
|
"epoch": 0.00848, |
|
"grad_norm": 0.4340595323225716, |
|
"learning_rate": 0.002544, |
|
"loss": 4.5562, |
|
"step": 848 |
|
}, |
|
{ |
|
"epoch": 0.00849, |
|
"grad_norm": 0.40987686572717924, |
|
"learning_rate": 0.002547, |
|
"loss": 4.5471, |
|
"step": 849 |
|
}, |
|
{ |
|
"epoch": 0.0085, |
|
"grad_norm": 0.38823479672015115, |
|
"learning_rate": 0.00255, |
|
"loss": 4.5547, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.00851, |
|
"grad_norm": 0.42997075207568086, |
|
"learning_rate": 0.002553, |
|
"loss": 4.5733, |
|
"step": 851 |
|
}, |
|
{ |
|
"epoch": 0.00852, |
|
"grad_norm": 0.5192269912554525, |
|
"learning_rate": 0.002556, |
|
"loss": 4.5624, |
|
"step": 852 |
|
}, |
|
{ |
|
"epoch": 0.00853, |
|
"grad_norm": 0.6626364019198889, |
|
"learning_rate": 0.002559, |
|
"loss": 4.5465, |
|
"step": 853 |
|
}, |
|
{ |
|
"epoch": 0.00854, |
|
"grad_norm": 0.9475750496728129, |
|
"learning_rate": 0.002562, |
|
"loss": 4.5544, |
|
"step": 854 |
|
}, |
|
{ |
|
"epoch": 0.00855, |
|
"grad_norm": 0.9196861875673602, |
|
"learning_rate": 0.002565, |
|
"loss": 4.5554, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.00856, |
|
"grad_norm": 0.6972970172901616, |
|
"learning_rate": 0.002568, |
|
"loss": 4.5747, |
|
"step": 856 |
|
}, |
|
{ |
|
"epoch": 0.00857, |
|
"grad_norm": 0.7425487517589463, |
|
"learning_rate": 0.002571, |
|
"loss": 4.5717, |
|
"step": 857 |
|
}, |
|
{ |
|
"epoch": 0.00858, |
|
"grad_norm": 0.678911102843075, |
|
"learning_rate": 0.002574, |
|
"loss": 4.5576, |
|
"step": 858 |
|
}, |
|
{ |
|
"epoch": 0.00859, |
|
"grad_norm": 0.6680695283315139, |
|
"learning_rate": 0.002577, |
|
"loss": 4.5494, |
|
"step": 859 |
|
}, |
|
{ |
|
"epoch": 0.0086, |
|
"grad_norm": 0.7323389776740598, |
|
"learning_rate": 0.00258, |
|
"loss": 4.5797, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.00861, |
|
"grad_norm": 0.6564756555111457, |
|
"learning_rate": 0.0025830000000000002, |
|
"loss": 4.5928, |
|
"step": 861 |
|
}, |
|
{ |
|
"epoch": 0.00862, |
|
"grad_norm": 0.6458920384469554, |
|
"learning_rate": 0.002586, |
|
"loss": 4.5402, |
|
"step": 862 |
|
}, |
|
{ |
|
"epoch": 0.00863, |
|
"grad_norm": 0.56307833723133, |
|
"learning_rate": 0.002589, |
|
"loss": 4.5659, |
|
"step": 863 |
|
}, |
|
{ |
|
"epoch": 0.00864, |
|
"grad_norm": 0.6301527121328465, |
|
"learning_rate": 0.002592, |
|
"loss": 4.5169, |
|
"step": 864 |
|
}, |
|
{ |
|
"epoch": 0.00865, |
|
"grad_norm": 0.5741592962502369, |
|
"learning_rate": 0.002595, |
|
"loss": 4.5342, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.00866, |
|
"grad_norm": 0.6185631950272453, |
|
"learning_rate": 0.002598, |
|
"loss": 4.5624, |
|
"step": 866 |
|
}, |
|
{ |
|
"epoch": 0.00867, |
|
"grad_norm": 0.7226342011827854, |
|
"learning_rate": 0.002601, |
|
"loss": 4.5538, |
|
"step": 867 |
|
}, |
|
{ |
|
"epoch": 0.00868, |
|
"grad_norm": 0.6871063796143311, |
|
"learning_rate": 0.002604, |
|
"loss": 4.5354, |
|
"step": 868 |
|
}, |
|
{ |
|
"epoch": 0.00869, |
|
"grad_norm": 0.6146544871717051, |
|
"learning_rate": 0.002607, |
|
"loss": 4.5411, |
|
"step": 869 |
|
}, |
|
{ |
|
"epoch": 0.0087, |
|
"grad_norm": 0.5850681754527672, |
|
"learning_rate": 0.00261, |
|
"loss": 4.5279, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.00871, |
|
"grad_norm": 0.6619050810997609, |
|
"learning_rate": 0.002613, |
|
"loss": 4.5496, |
|
"step": 871 |
|
}, |
|
{ |
|
"epoch": 0.00872, |
|
"grad_norm": 0.6309510333250544, |
|
"learning_rate": 0.002616, |
|
"loss": 4.5312, |
|
"step": 872 |
|
}, |
|
{ |
|
"epoch": 0.00873, |
|
"grad_norm": 0.620458951387438, |
|
"learning_rate": 0.0026190000000000002, |
|
"loss": 4.5409, |
|
"step": 873 |
|
}, |
|
{ |
|
"epoch": 0.00874, |
|
"grad_norm": 0.7675582961358233, |
|
"learning_rate": 0.002622, |
|
"loss": 4.5679, |
|
"step": 874 |
|
}, |
|
{ |
|
"epoch": 0.00875, |
|
"grad_norm": 0.8413199287183839, |
|
"learning_rate": 0.002625, |
|
"loss": 4.5217, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.00876, |
|
"grad_norm": 0.6439305106538762, |
|
"learning_rate": 0.002628, |
|
"loss": 4.5435, |
|
"step": 876 |
|
}, |
|
{ |
|
"epoch": 0.00877, |
|
"grad_norm": 0.6155973146282673, |
|
"learning_rate": 0.002631, |
|
"loss": 4.5607, |
|
"step": 877 |
|
}, |
|
{ |
|
"epoch": 0.00878, |
|
"grad_norm": 0.6572664983016872, |
|
"learning_rate": 0.002634, |
|
"loss": 4.4959, |
|
"step": 878 |
|
}, |
|
{ |
|
"epoch": 0.00879, |
|
"grad_norm": 0.678308084092591, |
|
"learning_rate": 0.002637, |
|
"loss": 4.526, |
|
"step": 879 |
|
}, |
|
{ |
|
"epoch": 0.0088, |
|
"grad_norm": 0.682644138366955, |
|
"learning_rate": 0.00264, |
|
"loss": 4.5397, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.00881, |
|
"grad_norm": 0.6916960221367301, |
|
"learning_rate": 0.002643, |
|
"loss": 4.5424, |
|
"step": 881 |
|
}, |
|
{ |
|
"epoch": 0.00882, |
|
"grad_norm": 0.8625989079873627, |
|
"learning_rate": 0.002646, |
|
"loss": 4.5523, |
|
"step": 882 |
|
}, |
|
{ |
|
"epoch": 0.00883, |
|
"grad_norm": 0.9115056000231379, |
|
"learning_rate": 0.002649, |
|
"loss": 4.5538, |
|
"step": 883 |
|
}, |
|
{ |
|
"epoch": 0.00884, |
|
"grad_norm": 1.1386535317654105, |
|
"learning_rate": 0.0026520000000000003, |
|
"loss": 4.5528, |
|
"step": 884 |
|
}, |
|
{ |
|
"epoch": 0.00885, |
|
"grad_norm": 0.9401738564991132, |
|
"learning_rate": 0.0026550000000000002, |
|
"loss": 4.5478, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.00886, |
|
"grad_norm": 1.0261918984073035, |
|
"learning_rate": 0.002658, |
|
"loss": 4.5785, |
|
"step": 886 |
|
}, |
|
{ |
|
"epoch": 0.00887, |
|
"grad_norm": 1.2427377192417122, |
|
"learning_rate": 0.002661, |
|
"loss": 4.6, |
|
"step": 887 |
|
}, |
|
{ |
|
"epoch": 0.00888, |
|
"grad_norm": 1.061546131863703, |
|
"learning_rate": 0.002664, |
|
"loss": 4.5885, |
|
"step": 888 |
|
}, |
|
{ |
|
"epoch": 0.00889, |
|
"grad_norm": 0.907593068920597, |
|
"learning_rate": 0.002667, |
|
"loss": 4.535, |
|
"step": 889 |
|
}, |
|
{ |
|
"epoch": 0.0089, |
|
"grad_norm": 0.8129313603021157, |
|
"learning_rate": 0.00267, |
|
"loss": 4.5855, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.00891, |
|
"grad_norm": 0.8985699151758613, |
|
"learning_rate": 0.002673, |
|
"loss": 4.5762, |
|
"step": 891 |
|
}, |
|
{ |
|
"epoch": 0.00892, |
|
"grad_norm": 0.9328845387585828, |
|
"learning_rate": 0.002676, |
|
"loss": 4.5962, |
|
"step": 892 |
|
}, |
|
{ |
|
"epoch": 0.00893, |
|
"grad_norm": 0.9658499469408992, |
|
"learning_rate": 0.002679, |
|
"loss": 4.5743, |
|
"step": 893 |
|
}, |
|
{ |
|
"epoch": 0.00894, |
|
"grad_norm": 0.9262060176852961, |
|
"learning_rate": 0.002682, |
|
"loss": 4.5624, |
|
"step": 894 |
|
}, |
|
{ |
|
"epoch": 0.00895, |
|
"grad_norm": 1.0340643070320608, |
|
"learning_rate": 0.0026850000000000003, |
|
"loss": 4.5798, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.00896, |
|
"grad_norm": 0.961568519948311, |
|
"learning_rate": 0.0026880000000000003, |
|
"loss": 4.5724, |
|
"step": 896 |
|
}, |
|
{ |
|
"epoch": 0.00897, |
|
"grad_norm": 0.9522979094666822, |
|
"learning_rate": 0.0026910000000000002, |
|
"loss": 4.5901, |
|
"step": 897 |
|
}, |
|
{ |
|
"epoch": 0.00898, |
|
"grad_norm": 0.90558868323236, |
|
"learning_rate": 0.002694, |
|
"loss": 4.5802, |
|
"step": 898 |
|
}, |
|
{ |
|
"epoch": 0.00899, |
|
"grad_norm": 0.7752291080346148, |
|
"learning_rate": 0.002697, |
|
"loss": 4.5848, |
|
"step": 899 |
|
}, |
|
{ |
|
"epoch": 0.009, |
|
"grad_norm": 0.7198757797557334, |
|
"learning_rate": 0.0027, |
|
"loss": 4.5998, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.00901, |
|
"grad_norm": 0.7328950523365488, |
|
"learning_rate": 0.002703, |
|
"loss": 4.5552, |
|
"step": 901 |
|
}, |
|
{ |
|
"epoch": 0.00902, |
|
"grad_norm": 0.5974121750232518, |
|
"learning_rate": 0.002706, |
|
"loss": 4.558, |
|
"step": 902 |
|
}, |
|
{ |
|
"epoch": 0.00903, |
|
"grad_norm": 0.5753314236304967, |
|
"learning_rate": 0.002709, |
|
"loss": 4.5534, |
|
"step": 903 |
|
}, |
|
{ |
|
"epoch": 0.00904, |
|
"grad_norm": 0.6128643778394687, |
|
"learning_rate": 0.002712, |
|
"loss": 4.5329, |
|
"step": 904 |
|
}, |
|
{ |
|
"epoch": 0.00905, |
|
"grad_norm": 0.5691916638541509, |
|
"learning_rate": 0.002715, |
|
"loss": 4.5446, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.00906, |
|
"grad_norm": 0.5122473095411203, |
|
"learning_rate": 0.002718, |
|
"loss": 4.5538, |
|
"step": 906 |
|
}, |
|
{ |
|
"epoch": 0.00907, |
|
"grad_norm": 0.4479956748150508, |
|
"learning_rate": 0.0027210000000000003, |
|
"loss": 4.5603, |
|
"step": 907 |
|
}, |
|
{ |
|
"epoch": 0.00908, |
|
"grad_norm": 0.4387217017050332, |
|
"learning_rate": 0.0027240000000000003, |
|
"loss": 4.5304, |
|
"step": 908 |
|
}, |
|
{ |
|
"epoch": 0.00909, |
|
"grad_norm": 0.4358600325368994, |
|
"learning_rate": 0.0027270000000000003, |
|
"loss": 4.5454, |
|
"step": 909 |
|
}, |
|
{ |
|
"epoch": 0.0091, |
|
"grad_norm": 0.3849373702796595, |
|
"learning_rate": 0.0027300000000000002, |
|
"loss": 4.5477, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.00911, |
|
"grad_norm": 0.4574591880095763, |
|
"learning_rate": 0.002733, |
|
"loss": 4.5038, |
|
"step": 911 |
|
}, |
|
{ |
|
"epoch": 0.00912, |
|
"grad_norm": 0.6121326743360266, |
|
"learning_rate": 0.002736, |
|
"loss": 4.5131, |
|
"step": 912 |
|
}, |
|
{ |
|
"epoch": 0.00913, |
|
"grad_norm": 0.6845161679319078, |
|
"learning_rate": 0.002739, |
|
"loss": 4.513, |
|
"step": 913 |
|
}, |
|
{ |
|
"epoch": 0.00914, |
|
"grad_norm": 0.7333722528690678, |
|
"learning_rate": 0.002742, |
|
"loss": 4.5432, |
|
"step": 914 |
|
}, |
|
{ |
|
"epoch": 0.00915, |
|
"grad_norm": 0.6714442153045657, |
|
"learning_rate": 0.002745, |
|
"loss": 4.5048, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.00916, |
|
"grad_norm": 0.5861682792098877, |
|
"learning_rate": 0.002748, |
|
"loss": 4.4768, |
|
"step": 916 |
|
}, |
|
{ |
|
"epoch": 0.00917, |
|
"grad_norm": 0.7568015257717708, |
|
"learning_rate": 0.002751, |
|
"loss": 4.4985, |
|
"step": 917 |
|
}, |
|
{ |
|
"epoch": 0.00918, |
|
"grad_norm": 0.67845085346177, |
|
"learning_rate": 0.0027540000000000004, |
|
"loss": 4.5185, |
|
"step": 918 |
|
}, |
|
{ |
|
"epoch": 0.00919, |
|
"grad_norm": 0.6376249584521229, |
|
"learning_rate": 0.0027570000000000003, |
|
"loss": 4.4976, |
|
"step": 919 |
|
}, |
|
{ |
|
"epoch": 0.0092, |
|
"grad_norm": 0.5953295490629925, |
|
"learning_rate": 0.0027600000000000003, |
|
"loss": 4.5104, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.00921, |
|
"grad_norm": 0.5470113372090761, |
|
"learning_rate": 0.0027630000000000003, |
|
"loss": 4.4892, |
|
"step": 921 |
|
}, |
|
{ |
|
"epoch": 0.00922, |
|
"grad_norm": 0.5586775976924615, |
|
"learning_rate": 0.0027660000000000002, |
|
"loss": 4.5007, |
|
"step": 922 |
|
}, |
|
{ |
|
"epoch": 0.00923, |
|
"grad_norm": 0.5602976125001545, |
|
"learning_rate": 0.002769, |
|
"loss": 4.4767, |
|
"step": 923 |
|
}, |
|
{ |
|
"epoch": 0.00924, |
|
"grad_norm": 0.5512257756150963, |
|
"learning_rate": 0.002772, |
|
"loss": 4.4992, |
|
"step": 924 |
|
}, |
|
{ |
|
"epoch": 0.00925, |
|
"grad_norm": 0.6104014154605217, |
|
"learning_rate": 0.002775, |
|
"loss": 4.4804, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.00926, |
|
"grad_norm": 0.627882987139063, |
|
"learning_rate": 0.002778, |
|
"loss": 4.4994, |
|
"step": 926 |
|
}, |
|
{ |
|
"epoch": 0.00927, |
|
"grad_norm": 0.5637389395797978, |
|
"learning_rate": 0.002781, |
|
"loss": 4.4582, |
|
"step": 927 |
|
}, |
|
{ |
|
"epoch": 0.00928, |
|
"grad_norm": 0.5793012970445044, |
|
"learning_rate": 0.002784, |
|
"loss": 4.505, |
|
"step": 928 |
|
}, |
|
{ |
|
"epoch": 0.00929, |
|
"grad_norm": 0.5538600481661593, |
|
"learning_rate": 0.0027870000000000004, |
|
"loss": 4.4867, |
|
"step": 929 |
|
}, |
|
{ |
|
"epoch": 0.0093, |
|
"grad_norm": 0.4915384406563099, |
|
"learning_rate": 0.0027900000000000004, |
|
"loss": 4.4652, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.00931, |
|
"grad_norm": 0.5291025545236148, |
|
"learning_rate": 0.0027930000000000003, |
|
"loss": 4.4796, |
|
"step": 931 |
|
}, |
|
{ |
|
"epoch": 0.00932, |
|
"grad_norm": 0.5189072536981689, |
|
"learning_rate": 0.0027960000000000003, |
|
"loss": 4.4881, |
|
"step": 932 |
|
}, |
|
{ |
|
"epoch": 0.00933, |
|
"grad_norm": 0.4391692113712953, |
|
"learning_rate": 0.0027990000000000003, |
|
"loss": 4.4495, |
|
"step": 933 |
|
}, |
|
{ |
|
"epoch": 0.00934, |
|
"grad_norm": 0.5067956527040228, |
|
"learning_rate": 0.0028020000000000002, |
|
"loss": 4.4811, |
|
"step": 934 |
|
}, |
|
{ |
|
"epoch": 0.00935, |
|
"grad_norm": 0.5752082887405049, |
|
"learning_rate": 0.002805, |
|
"loss": 4.4893, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.00936, |
|
"grad_norm": 0.8597610157431208, |
|
"learning_rate": 0.002808, |
|
"loss": 4.5012, |
|
"step": 936 |
|
}, |
|
{ |
|
"epoch": 0.00937, |
|
"grad_norm": 1.0923006968336397, |
|
"learning_rate": 0.002811, |
|
"loss": 4.5462, |
|
"step": 937 |
|
}, |
|
{ |
|
"epoch": 0.00938, |
|
"grad_norm": 0.8211742236965874, |
|
"learning_rate": 0.002814, |
|
"loss": 4.5052, |
|
"step": 938 |
|
}, |
|
{ |
|
"epoch": 0.00939, |
|
"grad_norm": 0.699189062953733, |
|
"learning_rate": 0.002817, |
|
"loss": 4.4989, |
|
"step": 939 |
|
}, |
|
{ |
|
"epoch": 0.0094, |
|
"grad_norm": 0.6251265662758575, |
|
"learning_rate": 0.00282, |
|
"loss": 4.495, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.00941, |
|
"grad_norm": 0.8614570213641495, |
|
"learning_rate": 0.002823, |
|
"loss": 4.4823, |
|
"step": 941 |
|
}, |
|
{ |
|
"epoch": 0.00942, |
|
"grad_norm": 0.9710835908546839, |
|
"learning_rate": 0.002826, |
|
"loss": 4.5282, |
|
"step": 942 |
|
}, |
|
{ |
|
"epoch": 0.00943, |
|
"grad_norm": 0.8255156288963836, |
|
"learning_rate": 0.002829, |
|
"loss": 4.5175, |
|
"step": 943 |
|
}, |
|
{ |
|
"epoch": 0.00944, |
|
"grad_norm": 0.8529303551550831, |
|
"learning_rate": 0.002832, |
|
"loss": 4.5233, |
|
"step": 944 |
|
}, |
|
{ |
|
"epoch": 0.00945, |
|
"grad_norm": 0.8805636588256177, |
|
"learning_rate": 0.002835, |
|
"loss": 4.5307, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.00946, |
|
"grad_norm": 0.8815394682869104, |
|
"learning_rate": 0.002838, |
|
"loss": 4.545, |
|
"step": 946 |
|
}, |
|
{ |
|
"epoch": 0.00947, |
|
"grad_norm": 0.9792537736888025, |
|
"learning_rate": 0.0028409999999999998, |
|
"loss": 4.5259, |
|
"step": 947 |
|
}, |
|
{ |
|
"epoch": 0.00948, |
|
"grad_norm": 1.16480627354709, |
|
"learning_rate": 0.0028439999999999997, |
|
"loss": 4.5258, |
|
"step": 948 |
|
}, |
|
{ |
|
"epoch": 0.00949, |
|
"grad_norm": 0.9311558570406092, |
|
"learning_rate": 0.002847, |
|
"loss": 4.5678, |
|
"step": 949 |
|
}, |
|
{ |
|
"epoch": 0.0095, |
|
"grad_norm": 0.9151500308490546, |
|
"learning_rate": 0.00285, |
|
"loss": 4.5566, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.00951, |
|
"grad_norm": 0.8788038193040383, |
|
"learning_rate": 0.002853, |
|
"loss": 4.539, |
|
"step": 951 |
|
}, |
|
{ |
|
"epoch": 0.00952, |
|
"grad_norm": 0.6883343465241535, |
|
"learning_rate": 0.002856, |
|
"loss": 4.5492, |
|
"step": 952 |
|
}, |
|
{ |
|
"epoch": 0.00953, |
|
"grad_norm": 0.7428316161957196, |
|
"learning_rate": 0.002859, |
|
"loss": 4.4912, |
|
"step": 953 |
|
}, |
|
{ |
|
"epoch": 0.00954, |
|
"grad_norm": 0.6589075049765231, |
|
"learning_rate": 0.002862, |
|
"loss": 4.5478, |
|
"step": 954 |
|
}, |
|
{ |
|
"epoch": 0.00955, |
|
"grad_norm": 0.6374906159512967, |
|
"learning_rate": 0.002865, |
|
"loss": 4.5305, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.00956, |
|
"grad_norm": 0.7037593758453927, |
|
"learning_rate": 0.002868, |
|
"loss": 4.5123, |
|
"step": 956 |
|
}, |
|
{ |
|
"epoch": 0.00957, |
|
"grad_norm": 0.7575837270625414, |
|
"learning_rate": 0.002871, |
|
"loss": 4.5364, |
|
"step": 957 |
|
}, |
|
{ |
|
"epoch": 0.00958, |
|
"grad_norm": 0.8136537306721928, |
|
"learning_rate": 0.002874, |
|
"loss": 4.5221, |
|
"step": 958 |
|
}, |
|
{ |
|
"epoch": 0.00959, |
|
"grad_norm": 0.7529303499273822, |
|
"learning_rate": 0.002877, |
|
"loss": 4.5134, |
|
"step": 959 |
|
}, |
|
{ |
|
"epoch": 0.0096, |
|
"grad_norm": 0.7723176518474248, |
|
"learning_rate": 0.0028799999999999997, |
|
"loss": 4.5128, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.00961, |
|
"grad_norm": 0.6826701565045171, |
|
"learning_rate": 0.002883, |
|
"loss": 4.4802, |
|
"step": 961 |
|
}, |
|
{ |
|
"epoch": 0.00962, |
|
"grad_norm": 0.6022941634910601, |
|
"learning_rate": 0.002886, |
|
"loss": 4.5086, |
|
"step": 962 |
|
}, |
|
{ |
|
"epoch": 0.00963, |
|
"grad_norm": 0.5423609760386227, |
|
"learning_rate": 0.002889, |
|
"loss": 4.5025, |
|
"step": 963 |
|
}, |
|
{ |
|
"epoch": 0.00964, |
|
"grad_norm": 0.5430502092899187, |
|
"learning_rate": 0.002892, |
|
"loss": 4.493, |
|
"step": 964 |
|
}, |
|
{ |
|
"epoch": 0.00965, |
|
"grad_norm": 0.5046734063237629, |
|
"learning_rate": 0.002895, |
|
"loss": 4.4643, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.00966, |
|
"grad_norm": 0.4625107724617308, |
|
"learning_rate": 0.002898, |
|
"loss": 4.4758, |
|
"step": 966 |
|
}, |
|
{ |
|
"epoch": 0.00967, |
|
"grad_norm": 0.4403845150464526, |
|
"learning_rate": 0.002901, |
|
"loss": 4.5117, |
|
"step": 967 |
|
}, |
|
{ |
|
"epoch": 0.00968, |
|
"grad_norm": 0.40739936008519134, |
|
"learning_rate": 0.002904, |
|
"loss": 4.4717, |
|
"step": 968 |
|
}, |
|
{ |
|
"epoch": 0.00969, |
|
"grad_norm": 0.41620271754109195, |
|
"learning_rate": 0.002907, |
|
"loss": 4.5059, |
|
"step": 969 |
|
}, |
|
{ |
|
"epoch": 0.0097, |
|
"grad_norm": 0.3884080596123629, |
|
"learning_rate": 0.00291, |
|
"loss": 4.4561, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.00971, |
|
"grad_norm": 0.40963591049792675, |
|
"learning_rate": 0.002913, |
|
"loss": 4.4624, |
|
"step": 971 |
|
}, |
|
{ |
|
"epoch": 0.00972, |
|
"grad_norm": 0.427860611994679, |
|
"learning_rate": 0.002916, |
|
"loss": 4.4693, |
|
"step": 972 |
|
}, |
|
{ |
|
"epoch": 0.00973, |
|
"grad_norm": 0.6266830722962007, |
|
"learning_rate": 0.002919, |
|
"loss": 4.4689, |
|
"step": 973 |
|
}, |
|
{ |
|
"epoch": 0.00974, |
|
"grad_norm": 0.7795241673951117, |
|
"learning_rate": 0.002922, |
|
"loss": 4.4734, |
|
"step": 974 |
|
}, |
|
{ |
|
"epoch": 0.00975, |
|
"grad_norm": 0.7392088002205494, |
|
"learning_rate": 0.002925, |
|
"loss": 4.4915, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.00976, |
|
"grad_norm": 0.6090919745791438, |
|
"learning_rate": 0.002928, |
|
"loss": 4.4688, |
|
"step": 976 |
|
}, |
|
{ |
|
"epoch": 0.00977, |
|
"grad_norm": 0.6721667952819499, |
|
"learning_rate": 0.002931, |
|
"loss": 4.5114, |
|
"step": 977 |
|
}, |
|
{ |
|
"epoch": 0.00978, |
|
"grad_norm": 0.7157395676490952, |
|
"learning_rate": 0.002934, |
|
"loss": 4.4828, |
|
"step": 978 |
|
}, |
|
{ |
|
"epoch": 0.00979, |
|
"grad_norm": 0.6673604586868628, |
|
"learning_rate": 0.002937, |
|
"loss": 4.4724, |
|
"step": 979 |
|
}, |
|
{ |
|
"epoch": 0.0098, |
|
"grad_norm": 0.6241827666303977, |
|
"learning_rate": 0.00294, |
|
"loss": 4.4641, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.00981, |
|
"grad_norm": 0.6477735028588306, |
|
"learning_rate": 0.002943, |
|
"loss": 4.479, |
|
"step": 981 |
|
}, |
|
{ |
|
"epoch": 0.00982, |
|
"grad_norm": 0.5903229373466107, |
|
"learning_rate": 0.002946, |
|
"loss": 4.4717, |
|
"step": 982 |
|
}, |
|
{ |
|
"epoch": 0.00983, |
|
"grad_norm": 0.5367081021491611, |
|
"learning_rate": 0.0029490000000000002, |
|
"loss": 4.4684, |
|
"step": 983 |
|
}, |
|
{ |
|
"epoch": 0.00984, |
|
"grad_norm": 0.55855952891459, |
|
"learning_rate": 0.002952, |
|
"loss": 4.455, |
|
"step": 984 |
|
}, |
|
{ |
|
"epoch": 0.00985, |
|
"grad_norm": 0.6566800838357816, |
|
"learning_rate": 0.002955, |
|
"loss": 4.4789, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.00986, |
|
"grad_norm": 0.7602203626787503, |
|
"learning_rate": 0.002958, |
|
"loss": 4.4461, |
|
"step": 986 |
|
}, |
|
{ |
|
"epoch": 0.00987, |
|
"grad_norm": 0.7515002206199092, |
|
"learning_rate": 0.002961, |
|
"loss": 4.4656, |
|
"step": 987 |
|
}, |
|
{ |
|
"epoch": 0.00988, |
|
"grad_norm": 0.7510028045825011, |
|
"learning_rate": 0.002964, |
|
"loss": 4.497, |
|
"step": 988 |
|
}, |
|
{ |
|
"epoch": 0.00989, |
|
"grad_norm": 0.6686524266681442, |
|
"learning_rate": 0.002967, |
|
"loss": 4.438, |
|
"step": 989 |
|
}, |
|
{ |
|
"epoch": 0.0099, |
|
"grad_norm": 0.7110181433203292, |
|
"learning_rate": 0.00297, |
|
"loss": 4.4759, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.00991, |
|
"grad_norm": 0.9487700940206629, |
|
"learning_rate": 0.002973, |
|
"loss": 4.4909, |
|
"step": 991 |
|
}, |
|
{ |
|
"epoch": 0.00992, |
|
"grad_norm": 1.0128664486589338, |
|
"learning_rate": 0.002976, |
|
"loss": 4.4936, |
|
"step": 992 |
|
}, |
|
{ |
|
"epoch": 0.00993, |
|
"grad_norm": 0.776315556008045, |
|
"learning_rate": 0.002979, |
|
"loss": 4.4927, |
|
"step": 993 |
|
}, |
|
{ |
|
"epoch": 0.00994, |
|
"grad_norm": 0.6954881784010936, |
|
"learning_rate": 0.002982, |
|
"loss": 4.4894, |
|
"step": 994 |
|
}, |
|
{ |
|
"epoch": 0.00995, |
|
"grad_norm": 0.8343170542469196, |
|
"learning_rate": 0.0029850000000000002, |
|
"loss": 4.4966, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.00996, |
|
"grad_norm": 0.8064256733311401, |
|
"learning_rate": 0.002988, |
|
"loss": 4.4974, |
|
"step": 996 |
|
}, |
|
{ |
|
"epoch": 0.00997, |
|
"grad_norm": 0.7600837252115415, |
|
"learning_rate": 0.002991, |
|
"loss": 4.4859, |
|
"step": 997 |
|
}, |
|
{ |
|
"epoch": 0.00998, |
|
"grad_norm": 0.7413851219310601, |
|
"learning_rate": 0.002994, |
|
"loss": 4.4878, |
|
"step": 998 |
|
}, |
|
{ |
|
"epoch": 0.00999, |
|
"grad_norm": 0.7789142980889687, |
|
"learning_rate": 0.002997, |
|
"loss": 4.4774, |
|
"step": 999 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.5973093879700552, |
|
"learning_rate": 0.003, |
|
"loss": 4.4903, |
|
"step": 1000 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 100000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 1000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.9643642855424e+16, |
|
"train_batch_size": 1024, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|