diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,176542 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 50.0, + "eval_steps": 500, + "global_step": 25000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.002, + "grad_norm": 100.7913589477539, + "learning_rate": 2e-05, + "loss": 3.10091186, + "step": 1 + }, + { + "epoch": 0.004, + "grad_norm": 40.7878532409668, + "learning_rate": 2e-05, + "loss": 1.36577916, + "step": 2 + }, + { + "epoch": 0.006, + "grad_norm": 17.934667587280273, + "learning_rate": 2e-05, + "loss": 0.80921072, + "step": 3 + }, + { + "epoch": 0.008, + "grad_norm": 29.937301635742188, + "learning_rate": 2e-05, + "loss": 0.85672188, + "step": 4 + }, + { + "epoch": 0.01, + "grad_norm": 8.57142448425293, + "learning_rate": 2e-05, + "loss": 0.69120318, + "step": 5 + }, + { + "epoch": 0.012, + "grad_norm": 7.896178245544434, + "learning_rate": 2e-05, + "loss": 0.47293058, + "step": 6 + }, + { + "epoch": 0.014, + "grad_norm": 8.591035842895508, + "learning_rate": 2e-05, + "loss": 0.50240922, + "step": 7 + }, + { + "epoch": 0.016, + "grad_norm": 10.887709617614746, + "learning_rate": 2e-05, + "loss": 0.51509768, + "step": 8 + }, + { + "epoch": 0.018, + "grad_norm": 134.73768615722656, + "learning_rate": 2e-05, + "loss": 0.49554652, + "step": 9 + }, + { + "epoch": 0.02, + "grad_norm": 55.33390426635742, + "learning_rate": 2e-05, + "loss": 0.62111574, + "step": 10 + }, + { + "epoch": 0.022, + "grad_norm": 12.508940696716309, + "learning_rate": 2e-05, + "loss": 0.43727028, + "step": 11 + }, + { + "epoch": 0.024, + "grad_norm": 8.327451705932617, + "learning_rate": 2e-05, + "loss": 0.4894059, + "step": 12 + }, + { + "epoch": 0.026, + "grad_norm": 4.562747001647949, + "learning_rate": 2e-05, + "loss": 0.42299265, + "step": 13 + }, + { + "epoch": 0.028, + "grad_norm": 5.968645095825195, + "learning_rate": 2e-05, + "loss": 0.3810643, + "step": 14 + }, + { + "epoch": 0.03, + "grad_norm": 4.443109035491943, + "learning_rate": 2e-05, + "loss": 0.41381907, + "step": 15 + }, + { + "epoch": 0.032, + "grad_norm": 3.4299163818359375, + "learning_rate": 2e-05, + "loss": 0.42420715, + "step": 16 + }, + { + "epoch": 0.034, + "grad_norm": 4.944918155670166, + "learning_rate": 2e-05, + "loss": 0.46888059, + "step": 17 + }, + { + "epoch": 0.036, + "grad_norm": 4.6365180015563965, + "learning_rate": 2e-05, + "loss": 0.45326883, + "step": 18 + }, + { + "epoch": 0.038, + "grad_norm": 2.8536622524261475, + "learning_rate": 2e-05, + "loss": 0.44977522, + "step": 19 + }, + { + "epoch": 0.04, + "grad_norm": 5.375406265258789, + "learning_rate": 2e-05, + "loss": 0.44011864, + "step": 20 + }, + { + "epoch": 0.042, + "grad_norm": 3.37801456451416, + "learning_rate": 2e-05, + "loss": 0.40136099, + "step": 21 + }, + { + "epoch": 0.044, + "grad_norm": 3.705557346343994, + "learning_rate": 2e-05, + "loss": 0.37761164, + "step": 22 + }, + { + "epoch": 0.046, + "grad_norm": 3.8013367652893066, + "learning_rate": 2e-05, + "loss": 0.44508934, + "step": 23 + }, + { + "epoch": 0.048, + "grad_norm": 3.566617250442505, + "learning_rate": 2e-05, + "loss": 0.38218161, + "step": 24 + }, + { + "epoch": 0.05, + "grad_norm": 3.859208345413208, + "learning_rate": 2e-05, + "loss": 0.3878693, + "step": 25 + }, + { + "epoch": 0.052, + "grad_norm": 3.2768349647521973, + "learning_rate": 2e-05, + "loss": 0.37782881, + "step": 26 + }, + { + 
"epoch": 0.054, + "grad_norm": 2.3881168365478516, + "learning_rate": 2e-05, + "loss": 0.42464909, + "step": 27 + }, + { + "epoch": 0.056, + "grad_norm": 2.54347562789917, + "learning_rate": 2e-05, + "loss": 0.39481583, + "step": 28 + }, + { + "epoch": 0.058, + "grad_norm": 7.55833101272583, + "learning_rate": 2e-05, + "loss": 0.42484623, + "step": 29 + }, + { + "epoch": 0.06, + "grad_norm": 2.715081214904785, + "learning_rate": 2e-05, + "loss": 0.46655717, + "step": 30 + }, + { + "epoch": 0.062, + "grad_norm": 2.705334424972534, + "learning_rate": 2e-05, + "loss": 0.38881761, + "step": 31 + }, + { + "epoch": 0.064, + "grad_norm": 3.3475492000579834, + "learning_rate": 2e-05, + "loss": 0.42681402, + "step": 32 + }, + { + "epoch": 0.066, + "grad_norm": 3.141242504119873, + "learning_rate": 2e-05, + "loss": 0.42609936, + "step": 33 + }, + { + "epoch": 0.068, + "grad_norm": 2.4876580238342285, + "learning_rate": 2e-05, + "loss": 0.37850282, + "step": 34 + }, + { + "epoch": 0.07, + "grad_norm": 2.4182939529418945, + "learning_rate": 2e-05, + "loss": 0.377267, + "step": 35 + }, + { + "epoch": 0.072, + "grad_norm": 3.404860258102417, + "learning_rate": 2e-05, + "loss": 0.41409212, + "step": 36 + }, + { + "epoch": 0.074, + "grad_norm": 4.462337970733643, + "learning_rate": 2e-05, + "loss": 0.41622919, + "step": 37 + }, + { + "epoch": 0.076, + "grad_norm": 3.895780563354492, + "learning_rate": 2e-05, + "loss": 0.40852085, + "step": 38 + }, + { + "epoch": 0.078, + "grad_norm": 4.350463390350342, + "learning_rate": 2e-05, + "loss": 0.44786674, + "step": 39 + }, + { + "epoch": 0.08, + "grad_norm": 2.5350425243377686, + "learning_rate": 2e-05, + "loss": 0.40965152, + "step": 40 + }, + { + "epoch": 0.082, + "grad_norm": 5.017084121704102, + "learning_rate": 2e-05, + "loss": 0.45135465, + "step": 41 + }, + { + "epoch": 0.084, + "grad_norm": 3.472752332687378, + "learning_rate": 2e-05, + "loss": 0.42319882, + "step": 42 + }, + { + "epoch": 0.086, + "grad_norm": 3.5228331089019775, + "learning_rate": 2e-05, + "loss": 0.31802225, + "step": 43 + }, + { + "epoch": 0.088, + "grad_norm": 4.688898086547852, + "learning_rate": 2e-05, + "loss": 0.46733904, + "step": 44 + }, + { + "epoch": 0.09, + "grad_norm": 2.554978370666504, + "learning_rate": 2e-05, + "loss": 0.35162151, + "step": 45 + }, + { + "epoch": 0.092, + "grad_norm": 3.005284070968628, + "learning_rate": 2e-05, + "loss": 0.40723544, + "step": 46 + }, + { + "epoch": 0.094, + "grad_norm": 3.2923007011413574, + "learning_rate": 2e-05, + "loss": 0.35407171, + "step": 47 + }, + { + "epoch": 0.096, + "grad_norm": 7.087942600250244, + "learning_rate": 2e-05, + "loss": 0.43687314, + "step": 48 + }, + { + "epoch": 0.098, + "grad_norm": 3.748598098754883, + "learning_rate": 2e-05, + "loss": 0.40476981, + "step": 49 + }, + { + "epoch": 0.1, + "grad_norm": 3.1440889835357666, + "learning_rate": 2e-05, + "loss": 0.34471345, + "step": 50 + }, + { + "epoch": 0.102, + "grad_norm": 3.0694971084594727, + "learning_rate": 2e-05, + "loss": 0.39447641, + "step": 51 + }, + { + "epoch": 0.104, + "grad_norm": 2.8853402137756348, + "learning_rate": 2e-05, + "loss": 0.4363516, + "step": 52 + }, + { + "epoch": 0.106, + "grad_norm": 2.3027663230895996, + "learning_rate": 2e-05, + "loss": 0.42112815, + "step": 53 + }, + { + "epoch": 0.108, + "grad_norm": 3.492051124572754, + "learning_rate": 2e-05, + "loss": 0.36403522, + "step": 54 + }, + { + "epoch": 0.11, + "grad_norm": 3.2932403087615967, + "learning_rate": 2e-05, + "loss": 0.37265083, + "step": 55 + }, + { + "epoch": 0.112, 
+ "grad_norm": 2.5150811672210693, + "learning_rate": 2e-05, + "loss": 0.35803241, + "step": 56 + }, + { + "epoch": 0.114, + "grad_norm": 2.370377540588379, + "learning_rate": 2e-05, + "loss": 0.4209469, + "step": 57 + }, + { + "epoch": 0.116, + "grad_norm": 2.392200469970703, + "learning_rate": 2e-05, + "loss": 0.44664249, + "step": 58 + }, + { + "epoch": 0.118, + "grad_norm": 2.7458958625793457, + "learning_rate": 2e-05, + "loss": 0.41989598, + "step": 59 + }, + { + "epoch": 0.12, + "grad_norm": 2.5858850479125977, + "learning_rate": 2e-05, + "loss": 0.4336924, + "step": 60 + }, + { + "epoch": 0.122, + "grad_norm": 3.472402334213257, + "learning_rate": 2e-05, + "loss": 0.45059329, + "step": 61 + }, + { + "epoch": 0.124, + "grad_norm": 2.9056155681610107, + "learning_rate": 2e-05, + "loss": 0.43756476, + "step": 62 + }, + { + "epoch": 0.126, + "grad_norm": 2.1384313106536865, + "learning_rate": 2e-05, + "loss": 0.37061867, + "step": 63 + }, + { + "epoch": 0.128, + "grad_norm": 2.247955560684204, + "learning_rate": 2e-05, + "loss": 0.4254021, + "step": 64 + }, + { + "epoch": 0.13, + "grad_norm": 3.1384289264678955, + "learning_rate": 2e-05, + "loss": 0.48214421, + "step": 65 + }, + { + "epoch": 0.132, + "grad_norm": 3.4709699153900146, + "learning_rate": 2e-05, + "loss": 0.42113096, + "step": 66 + }, + { + "epoch": 0.134, + "grad_norm": 2.4829514026641846, + "learning_rate": 2e-05, + "loss": 0.3783536, + "step": 67 + }, + { + "epoch": 0.136, + "grad_norm": 3.276494264602661, + "learning_rate": 2e-05, + "loss": 0.42203033, + "step": 68 + }, + { + "epoch": 0.138, + "grad_norm": 2.3716046810150146, + "learning_rate": 2e-05, + "loss": 0.37840182, + "step": 69 + }, + { + "epoch": 0.14, + "grad_norm": 2.6660404205322266, + "learning_rate": 2e-05, + "loss": 0.3909995, + "step": 70 + }, + { + "epoch": 0.142, + "grad_norm": 2.4495251178741455, + "learning_rate": 2e-05, + "loss": 0.39900306, + "step": 71 + }, + { + "epoch": 0.144, + "grad_norm": 2.6454856395721436, + "learning_rate": 2e-05, + "loss": 0.40227687, + "step": 72 + }, + { + "epoch": 0.146, + "grad_norm": 3.020829916000366, + "learning_rate": 2e-05, + "loss": 0.43715149, + "step": 73 + }, + { + "epoch": 0.148, + "grad_norm": 2.465714454650879, + "learning_rate": 2e-05, + "loss": 0.43334347, + "step": 74 + }, + { + "epoch": 0.15, + "grad_norm": 3.1749234199523926, + "learning_rate": 2e-05, + "loss": 0.39349914, + "step": 75 + }, + { + "epoch": 0.152, + "grad_norm": 2.522357225418091, + "learning_rate": 2e-05, + "loss": 0.40576932, + "step": 76 + }, + { + "epoch": 0.154, + "grad_norm": 2.7155542373657227, + "learning_rate": 2e-05, + "loss": 0.45305899, + "step": 77 + }, + { + "epoch": 0.156, + "grad_norm": 2.727293014526367, + "learning_rate": 2e-05, + "loss": 0.44109389, + "step": 78 + }, + { + "epoch": 0.158, + "grad_norm": 2.575566291809082, + "learning_rate": 2e-05, + "loss": 0.44445091, + "step": 79 + }, + { + "epoch": 0.16, + "grad_norm": 2.941967487335205, + "learning_rate": 2e-05, + "loss": 0.41291177, + "step": 80 + }, + { + "epoch": 0.162, + "grad_norm": 2.702101707458496, + "learning_rate": 2e-05, + "loss": 0.3560887, + "step": 81 + }, + { + "epoch": 0.164, + "grad_norm": 2.959989547729492, + "learning_rate": 2e-05, + "loss": 0.3968443, + "step": 82 + }, + { + "epoch": 0.166, + "grad_norm": 3.733569860458374, + "learning_rate": 2e-05, + "loss": 0.4232977, + "step": 83 + }, + { + "epoch": 0.168, + "grad_norm": 2.6571216583251953, + "learning_rate": 2e-05, + "loss": 0.40444511, + "step": 84 + }, + { + "epoch": 0.17, + "grad_norm": 
2.2582576274871826, + "learning_rate": 2e-05, + "loss": 0.40064648, + "step": 85 + }, + { + "epoch": 0.172, + "grad_norm": 2.748850107192993, + "learning_rate": 2e-05, + "loss": 0.38729197, + "step": 86 + }, + { + "epoch": 0.174, + "grad_norm": 2.8563621044158936, + "learning_rate": 2e-05, + "loss": 0.41891837, + "step": 87 + }, + { + "epoch": 0.176, + "grad_norm": 2.7641937732696533, + "learning_rate": 2e-05, + "loss": 0.36975527, + "step": 88 + }, + { + "epoch": 0.178, + "grad_norm": 3.3558907508850098, + "learning_rate": 2e-05, + "loss": 0.40827322, + "step": 89 + }, + { + "epoch": 0.18, + "grad_norm": 2.4098260402679443, + "learning_rate": 2e-05, + "loss": 0.3943249, + "step": 90 + }, + { + "epoch": 0.182, + "grad_norm": 2.5855796337127686, + "learning_rate": 2e-05, + "loss": 0.43265003, + "step": 91 + }, + { + "epoch": 0.184, + "grad_norm": 3.2219607830047607, + "learning_rate": 2e-05, + "loss": 0.44435838, + "step": 92 + }, + { + "epoch": 0.186, + "grad_norm": 2.994154453277588, + "learning_rate": 2e-05, + "loss": 0.43490282, + "step": 93 + }, + { + "epoch": 0.188, + "grad_norm": 3.057532548904419, + "learning_rate": 2e-05, + "loss": 0.42345771, + "step": 94 + }, + { + "epoch": 0.19, + "grad_norm": 2.1149864196777344, + "learning_rate": 2e-05, + "loss": 0.40705955, + "step": 95 + }, + { + "epoch": 0.192, + "grad_norm": 3.158325672149658, + "learning_rate": 2e-05, + "loss": 0.46287918, + "step": 96 + }, + { + "epoch": 0.194, + "grad_norm": 2.7815473079681396, + "learning_rate": 2e-05, + "loss": 0.39676839, + "step": 97 + }, + { + "epoch": 0.196, + "grad_norm": 2.1724812984466553, + "learning_rate": 2e-05, + "loss": 0.38517368, + "step": 98 + }, + { + "epoch": 0.198, + "grad_norm": 2.216989040374756, + "learning_rate": 2e-05, + "loss": 0.41994014, + "step": 99 + }, + { + "epoch": 0.2, + "grad_norm": 2.4955761432647705, + "learning_rate": 2e-05, + "loss": 0.44885725, + "step": 100 + }, + { + "epoch": 0.202, + "grad_norm": 2.3539469242095947, + "learning_rate": 2e-05, + "loss": 0.39006793, + "step": 101 + }, + { + "epoch": 0.204, + "grad_norm": 2.2714924812316895, + "learning_rate": 2e-05, + "loss": 0.34156144, + "step": 102 + }, + { + "epoch": 0.206, + "grad_norm": 2.3460693359375, + "learning_rate": 2e-05, + "loss": 0.43049014, + "step": 103 + }, + { + "epoch": 0.208, + "grad_norm": 3.0970299243927, + "learning_rate": 2e-05, + "loss": 0.42005956, + "step": 104 + }, + { + "epoch": 0.21, + "grad_norm": 2.211514711380005, + "learning_rate": 2e-05, + "loss": 0.38698167, + "step": 105 + }, + { + "epoch": 0.212, + "grad_norm": 2.8566031455993652, + "learning_rate": 2e-05, + "loss": 0.39388683, + "step": 106 + }, + { + "epoch": 0.214, + "grad_norm": 4.965332508087158, + "learning_rate": 2e-05, + "loss": 0.32786798, + "step": 107 + }, + { + "epoch": 0.216, + "grad_norm": 2.8506662845611572, + "learning_rate": 2e-05, + "loss": 0.39046088, + "step": 108 + }, + { + "epoch": 0.218, + "grad_norm": 2.1420042514801025, + "learning_rate": 2e-05, + "loss": 0.39321035, + "step": 109 + }, + { + "epoch": 0.22, + "grad_norm": 2.7388463020324707, + "learning_rate": 2e-05, + "loss": 0.39163101, + "step": 110 + }, + { + "epoch": 0.222, + "grad_norm": 2.2771174907684326, + "learning_rate": 2e-05, + "loss": 0.41349089, + "step": 111 + }, + { + "epoch": 0.224, + "grad_norm": 4.137855529785156, + "learning_rate": 2e-05, + "loss": 0.37277463, + "step": 112 + }, + { + "epoch": 0.226, + "grad_norm": 2.8623104095458984, + "learning_rate": 2e-05, + "loss": 0.38881728, + "step": 113 + }, + { + "epoch": 0.228, + 
"grad_norm": 2.295194387435913, + "learning_rate": 2e-05, + "loss": 0.3589763, + "step": 114 + }, + { + "epoch": 0.23, + "grad_norm": 2.1582581996917725, + "learning_rate": 2e-05, + "loss": 0.42957473, + "step": 115 + }, + { + "epoch": 0.232, + "grad_norm": 3.635145664215088, + "learning_rate": 2e-05, + "loss": 0.39942896, + "step": 116 + }, + { + "epoch": 0.234, + "grad_norm": 3.435088872909546, + "learning_rate": 2e-05, + "loss": 0.41030282, + "step": 117 + }, + { + "epoch": 0.236, + "grad_norm": 2.8914260864257812, + "learning_rate": 2e-05, + "loss": 0.38662139, + "step": 118 + }, + { + "epoch": 0.238, + "grad_norm": 3.000392198562622, + "learning_rate": 2e-05, + "loss": 0.42550755, + "step": 119 + }, + { + "epoch": 0.24, + "grad_norm": 2.0846173763275146, + "learning_rate": 2e-05, + "loss": 0.3794331, + "step": 120 + }, + { + "epoch": 0.242, + "grad_norm": 2.7421340942382812, + "learning_rate": 2e-05, + "loss": 0.40051925, + "step": 121 + }, + { + "epoch": 0.244, + "grad_norm": 2.206716537475586, + "learning_rate": 2e-05, + "loss": 0.41522628, + "step": 122 + }, + { + "epoch": 0.246, + "grad_norm": 2.345612049102783, + "learning_rate": 2e-05, + "loss": 0.39287353, + "step": 123 + }, + { + "epoch": 0.248, + "grad_norm": 2.5863916873931885, + "learning_rate": 2e-05, + "loss": 0.44706243, + "step": 124 + }, + { + "epoch": 0.25, + "grad_norm": 2.4030227661132812, + "learning_rate": 2e-05, + "loss": 0.39214543, + "step": 125 + }, + { + "epoch": 0.252, + "grad_norm": 2.58503794670105, + "learning_rate": 2e-05, + "loss": 0.40951967, + "step": 126 + }, + { + "epoch": 0.254, + "grad_norm": 2.058553457260132, + "learning_rate": 2e-05, + "loss": 0.40562445, + "step": 127 + }, + { + "epoch": 0.256, + "grad_norm": 2.324352979660034, + "learning_rate": 2e-05, + "loss": 0.39885789, + "step": 128 + }, + { + "epoch": 0.258, + "grad_norm": 2.4435112476348877, + "learning_rate": 2e-05, + "loss": 0.4411369, + "step": 129 + }, + { + "epoch": 0.26, + "grad_norm": 2.4042422771453857, + "learning_rate": 2e-05, + "loss": 0.42098755, + "step": 130 + }, + { + "epoch": 0.262, + "grad_norm": 2.854393482208252, + "learning_rate": 2e-05, + "loss": 0.40045485, + "step": 131 + }, + { + "epoch": 0.264, + "grad_norm": 2.2807977199554443, + "learning_rate": 2e-05, + "loss": 0.44947705, + "step": 132 + }, + { + "epoch": 0.266, + "grad_norm": 2.3839499950408936, + "learning_rate": 2e-05, + "loss": 0.3765958, + "step": 133 + }, + { + "epoch": 0.268, + "grad_norm": 2.462369203567505, + "learning_rate": 2e-05, + "loss": 0.34250346, + "step": 134 + }, + { + "epoch": 0.27, + "grad_norm": 2.4267525672912598, + "learning_rate": 2e-05, + "loss": 0.4025071, + "step": 135 + }, + { + "epoch": 0.272, + "grad_norm": 2.5603864192962646, + "learning_rate": 2e-05, + "loss": 0.35927606, + "step": 136 + }, + { + "epoch": 0.274, + "grad_norm": 3.1467013359069824, + "learning_rate": 2e-05, + "loss": 0.42534155, + "step": 137 + }, + { + "epoch": 0.276, + "grad_norm": 2.4989264011383057, + "learning_rate": 2e-05, + "loss": 0.38834065, + "step": 138 + }, + { + "epoch": 0.278, + "grad_norm": 5.77341890335083, + "learning_rate": 2e-05, + "loss": 0.3826952, + "step": 139 + }, + { + "epoch": 0.28, + "grad_norm": 2.720933675765991, + "learning_rate": 2e-05, + "loss": 0.39199528, + "step": 140 + }, + { + "epoch": 0.282, + "grad_norm": 6.706570625305176, + "learning_rate": 2e-05, + "loss": 0.43876618, + "step": 141 + }, + { + "epoch": 0.284, + "grad_norm": 22.753585815429688, + "learning_rate": 2e-05, + "loss": 0.58754456, + "step": 142 + }, + { + 
"epoch": 0.286, + "grad_norm": 951.1945190429688, + "learning_rate": 2e-05, + "loss": 0.81678319, + "step": 143 + }, + { + "epoch": 0.288, + "grad_norm": 52.611961364746094, + "learning_rate": 2e-05, + "loss": 0.69098788, + "step": 144 + }, + { + "epoch": 0.29, + "grad_norm": 45.12958526611328, + "learning_rate": 2e-05, + "loss": 0.76437581, + "step": 145 + }, + { + "epoch": 0.292, + "grad_norm": 33.77717971801758, + "learning_rate": 2e-05, + "loss": 0.75502336, + "step": 146 + }, + { + "epoch": 0.294, + "grad_norm": 78.94849395751953, + "learning_rate": 2e-05, + "loss": 0.98386419, + "step": 147 + }, + { + "epoch": 0.296, + "grad_norm": 99.15904998779297, + "learning_rate": 2e-05, + "loss": 0.7885493, + "step": 148 + }, + { + "epoch": 0.298, + "grad_norm": 95.97003173828125, + "learning_rate": 2e-05, + "loss": 1.16757083, + "step": 149 + }, + { + "epoch": 0.3, + "grad_norm": 395.0550537109375, + "learning_rate": 2e-05, + "loss": 0.89327443, + "step": 150 + }, + { + "epoch": 0.302, + "grad_norm": 34.62553787231445, + "learning_rate": 2e-05, + "loss": 0.80242836, + "step": 151 + }, + { + "epoch": 0.304, + "grad_norm": 49.44704818725586, + "learning_rate": 2e-05, + "loss": 0.63106126, + "step": 152 + }, + { + "epoch": 0.306, + "grad_norm": 171.4626007080078, + "learning_rate": 2e-05, + "loss": 0.57824063, + "step": 153 + }, + { + "epoch": 0.308, + "grad_norm": 93.4944839477539, + "learning_rate": 2e-05, + "loss": 0.52180779, + "step": 154 + }, + { + "epoch": 0.31, + "grad_norm": 112.07191467285156, + "learning_rate": 2e-05, + "loss": 0.57057035, + "step": 155 + }, + { + "epoch": 0.312, + "grad_norm": 23.541261672973633, + "learning_rate": 2e-05, + "loss": 0.6415171, + "step": 156 + }, + { + "epoch": 0.314, + "grad_norm": 121.93998718261719, + "learning_rate": 2e-05, + "loss": 0.81909418, + "step": 157 + }, + { + "epoch": 0.316, + "grad_norm": 34.9930305480957, + "learning_rate": 2e-05, + "loss": 0.68764293, + "step": 158 + }, + { + "epoch": 0.318, + "grad_norm": 38.409847259521484, + "learning_rate": 2e-05, + "loss": 0.52681422, + "step": 159 + }, + { + "epoch": 0.32, + "grad_norm": 23.83429718017578, + "learning_rate": 2e-05, + "loss": 0.57993436, + "step": 160 + }, + { + "epoch": 0.322, + "grad_norm": 23.568622589111328, + "learning_rate": 2e-05, + "loss": 0.51327848, + "step": 161 + }, + { + "epoch": 0.324, + "grad_norm": 22.358970642089844, + "learning_rate": 2e-05, + "loss": 0.4694868, + "step": 162 + }, + { + "epoch": 0.326, + "grad_norm": 37.23964309692383, + "learning_rate": 2e-05, + "loss": 0.52104098, + "step": 163 + }, + { + "epoch": 0.328, + "grad_norm": 27.70103645324707, + "learning_rate": 2e-05, + "loss": 0.5051856, + "step": 164 + }, + { + "epoch": 0.33, + "grad_norm": 19.599693298339844, + "learning_rate": 2e-05, + "loss": 0.48360169, + "step": 165 + }, + { + "epoch": 0.332, + "grad_norm": 17.194791793823242, + "learning_rate": 2e-05, + "loss": 0.50848162, + "step": 166 + }, + { + "epoch": 0.334, + "grad_norm": 27.718290328979492, + "learning_rate": 2e-05, + "loss": 0.46776086, + "step": 167 + }, + { + "epoch": 0.336, + "grad_norm": 29.439424514770508, + "learning_rate": 2e-05, + "loss": 0.48417586, + "step": 168 + }, + { + "epoch": 0.338, + "grad_norm": 13.250664710998535, + "learning_rate": 2e-05, + "loss": 0.47358498, + "step": 169 + }, + { + "epoch": 0.34, + "grad_norm": 33.282222747802734, + "learning_rate": 2e-05, + "loss": 0.52936995, + "step": 170 + }, + { + "epoch": 0.342, + "grad_norm": 30.323564529418945, + "learning_rate": 2e-05, + "loss": 0.50996685, + "step": 
171 + }, + { + "epoch": 0.344, + "grad_norm": 26.364585876464844, + "learning_rate": 2e-05, + "loss": 0.45880085, + "step": 172 + }, + { + "epoch": 0.346, + "grad_norm": 16.831697463989258, + "learning_rate": 2e-05, + "loss": 0.44258234, + "step": 173 + }, + { + "epoch": 0.348, + "grad_norm": 15.406957626342773, + "learning_rate": 2e-05, + "loss": 0.42777783, + "step": 174 + }, + { + "epoch": 0.35, + "grad_norm": 27.91541862487793, + "learning_rate": 2e-05, + "loss": 0.44150254, + "step": 175 + }, + { + "epoch": 0.352, + "grad_norm": 11.729520797729492, + "learning_rate": 2e-05, + "loss": 0.45409214, + "step": 176 + }, + { + "epoch": 0.354, + "grad_norm": 10.498335838317871, + "learning_rate": 2e-05, + "loss": 0.45166379, + "step": 177 + }, + { + "epoch": 0.356, + "grad_norm": 13.207727432250977, + "learning_rate": 2e-05, + "loss": 0.48104641, + "step": 178 + }, + { + "epoch": 0.358, + "grad_norm": 38.07108688354492, + "learning_rate": 2e-05, + "loss": 0.4513554, + "step": 179 + }, + { + "epoch": 0.36, + "grad_norm": 9.624302864074707, + "learning_rate": 2e-05, + "loss": 0.41277581, + "step": 180 + }, + { + "epoch": 0.362, + "grad_norm": 8.47309398651123, + "learning_rate": 2e-05, + "loss": 0.42749909, + "step": 181 + }, + { + "epoch": 0.364, + "grad_norm": 5.230127811431885, + "learning_rate": 2e-05, + "loss": 0.39681259, + "step": 182 + }, + { + "epoch": 0.366, + "grad_norm": 11.410614013671875, + "learning_rate": 2e-05, + "loss": 0.39442861, + "step": 183 + }, + { + "epoch": 0.368, + "grad_norm": 12.072883605957031, + "learning_rate": 2e-05, + "loss": 0.40660879, + "step": 184 + }, + { + "epoch": 0.37, + "grad_norm": 13.550542831420898, + "learning_rate": 2e-05, + "loss": 0.40361917, + "step": 185 + }, + { + "epoch": 0.372, + "grad_norm": 6.943983554840088, + "learning_rate": 2e-05, + "loss": 0.45435604, + "step": 186 + }, + { + "epoch": 0.374, + "grad_norm": 7.458695888519287, + "learning_rate": 2e-05, + "loss": 0.36000335, + "step": 187 + }, + { + "epoch": 0.376, + "grad_norm": 8.648634910583496, + "learning_rate": 2e-05, + "loss": 0.43435773, + "step": 188 + }, + { + "epoch": 0.378, + "grad_norm": 5.483082294464111, + "learning_rate": 2e-05, + "loss": 0.40732422, + "step": 189 + }, + { + "epoch": 0.38, + "grad_norm": 6.177685737609863, + "learning_rate": 2e-05, + "loss": 0.42347455, + "step": 190 + }, + { + "epoch": 0.382, + "grad_norm": 3.8768444061279297, + "learning_rate": 2e-05, + "loss": 0.38617033, + "step": 191 + }, + { + "epoch": 0.384, + "grad_norm": 4.045779705047607, + "learning_rate": 2e-05, + "loss": 0.3738341, + "step": 192 + }, + { + "epoch": 0.386, + "grad_norm": 4.49653434753418, + "learning_rate": 2e-05, + "loss": 0.41778257, + "step": 193 + }, + { + "epoch": 0.388, + "grad_norm": 3.0218472480773926, + "learning_rate": 2e-05, + "loss": 0.37529367, + "step": 194 + }, + { + "epoch": 0.39, + "grad_norm": 4.7495503425598145, + "learning_rate": 2e-05, + "loss": 0.39258975, + "step": 195 + }, + { + "epoch": 0.392, + "grad_norm": 4.811206817626953, + "learning_rate": 2e-05, + "loss": 0.39241675, + "step": 196 + }, + { + "epoch": 0.394, + "grad_norm": 2.8001279830932617, + "learning_rate": 2e-05, + "loss": 0.42924637, + "step": 197 + }, + { + "epoch": 0.396, + "grad_norm": 4.613635063171387, + "learning_rate": 2e-05, + "loss": 0.40108818, + "step": 198 + }, + { + "epoch": 0.398, + "grad_norm": 6.350893497467041, + "learning_rate": 2e-05, + "loss": 0.45137745, + "step": 199 + }, + { + "epoch": 0.4, + "grad_norm": 4.332032203674316, + "learning_rate": 2e-05, + "loss": 
0.3409031, + "step": 200 + }, + { + "epoch": 0.402, + "grad_norm": 3.7277584075927734, + "learning_rate": 2e-05, + "loss": 0.4174028, + "step": 201 + }, + { + "epoch": 0.404, + "grad_norm": 2.3304224014282227, + "learning_rate": 2e-05, + "loss": 0.36742717, + "step": 202 + }, + { + "epoch": 0.406, + "grad_norm": 2.5752127170562744, + "learning_rate": 2e-05, + "loss": 0.36992675, + "step": 203 + }, + { + "epoch": 0.408, + "grad_norm": 2.8590283393859863, + "learning_rate": 2e-05, + "loss": 0.37675118, + "step": 204 + }, + { + "epoch": 0.41, + "grad_norm": 3.1647579669952393, + "learning_rate": 2e-05, + "loss": 0.3679235, + "step": 205 + }, + { + "epoch": 0.412, + "grad_norm": 2.6260170936584473, + "learning_rate": 2e-05, + "loss": 0.4498952, + "step": 206 + }, + { + "epoch": 0.414, + "grad_norm": 2.4622349739074707, + "learning_rate": 2e-05, + "loss": 0.35507852, + "step": 207 + }, + { + "epoch": 0.416, + "grad_norm": 2.9872968196868896, + "learning_rate": 2e-05, + "loss": 0.3549549, + "step": 208 + }, + { + "epoch": 0.418, + "grad_norm": 3.04270601272583, + "learning_rate": 2e-05, + "loss": 0.41885364, + "step": 209 + }, + { + "epoch": 0.42, + "grad_norm": 2.9570157527923584, + "learning_rate": 2e-05, + "loss": 0.43402624, + "step": 210 + }, + { + "epoch": 0.422, + "grad_norm": 2.0857791900634766, + "learning_rate": 2e-05, + "loss": 0.40431345, + "step": 211 + }, + { + "epoch": 0.424, + "grad_norm": 2.574413537979126, + "learning_rate": 2e-05, + "loss": 0.40234259, + "step": 212 + }, + { + "epoch": 0.426, + "grad_norm": 2.3002917766571045, + "learning_rate": 2e-05, + "loss": 0.4213936, + "step": 213 + }, + { + "epoch": 0.428, + "grad_norm": 2.598877429962158, + "learning_rate": 2e-05, + "loss": 0.42018652, + "step": 214 + }, + { + "epoch": 0.43, + "grad_norm": 2.410801887512207, + "learning_rate": 2e-05, + "loss": 0.41117987, + "step": 215 + }, + { + "epoch": 0.432, + "grad_norm": 2.146622657775879, + "learning_rate": 2e-05, + "loss": 0.37472421, + "step": 216 + }, + { + "epoch": 0.434, + "grad_norm": 2.1747183799743652, + "learning_rate": 2e-05, + "loss": 0.42943805, + "step": 217 + }, + { + "epoch": 0.436, + "grad_norm": 2.395761013031006, + "learning_rate": 2e-05, + "loss": 0.39487824, + "step": 218 + }, + { + "epoch": 0.438, + "grad_norm": 2.3509767055511475, + "learning_rate": 2e-05, + "loss": 0.33290654, + "step": 219 + }, + { + "epoch": 0.44, + "grad_norm": 3.0364584922790527, + "learning_rate": 2e-05, + "loss": 0.40460128, + "step": 220 + }, + { + "epoch": 0.442, + "grad_norm": 2.566840648651123, + "learning_rate": 2e-05, + "loss": 0.41224802, + "step": 221 + }, + { + "epoch": 0.444, + "grad_norm": 3.774826765060425, + "learning_rate": 2e-05, + "loss": 0.43019855, + "step": 222 + }, + { + "epoch": 0.446, + "grad_norm": 2.1061086654663086, + "learning_rate": 2e-05, + "loss": 0.4085083, + "step": 223 + }, + { + "epoch": 0.448, + "grad_norm": 2.4577176570892334, + "learning_rate": 2e-05, + "loss": 0.39225131, + "step": 224 + }, + { + "epoch": 0.45, + "grad_norm": 4.246354579925537, + "learning_rate": 2e-05, + "loss": 0.43242174, + "step": 225 + }, + { + "epoch": 0.452, + "grad_norm": 2.795057773590088, + "learning_rate": 2e-05, + "loss": 0.34006116, + "step": 226 + }, + { + "epoch": 0.454, + "grad_norm": 2.231876850128174, + "learning_rate": 2e-05, + "loss": 0.39721149, + "step": 227 + }, + { + "epoch": 0.456, + "grad_norm": 3.058765172958374, + "learning_rate": 2e-05, + "loss": 0.41030541, + "step": 228 + }, + { + "epoch": 0.458, + "grad_norm": 2.9272921085357666, + "learning_rate": 
2e-05, + "loss": 0.40642214, + "step": 229 + }, + { + "epoch": 0.46, + "grad_norm": 2.118684768676758, + "learning_rate": 2e-05, + "loss": 0.40627092, + "step": 230 + }, + { + "epoch": 0.462, + "grad_norm": 4.35416841506958, + "learning_rate": 2e-05, + "loss": 0.41149405, + "step": 231 + }, + { + "epoch": 0.464, + "grad_norm": 3.272524356842041, + "learning_rate": 2e-05, + "loss": 0.40240556, + "step": 232 + }, + { + "epoch": 0.466, + "grad_norm": 2.587944746017456, + "learning_rate": 2e-05, + "loss": 0.374861, + "step": 233 + }, + { + "epoch": 0.468, + "grad_norm": 2.573385238647461, + "learning_rate": 2e-05, + "loss": 0.42833674, + "step": 234 + }, + { + "epoch": 0.47, + "grad_norm": 2.941406726837158, + "learning_rate": 2e-05, + "loss": 0.44332218, + "step": 235 + }, + { + "epoch": 0.472, + "grad_norm": 2.2706453800201416, + "learning_rate": 2e-05, + "loss": 0.38916123, + "step": 236 + }, + { + "epoch": 0.474, + "grad_norm": 2.572369337081909, + "learning_rate": 2e-05, + "loss": 0.38100535, + "step": 237 + }, + { + "epoch": 0.476, + "grad_norm": 2.6199276447296143, + "learning_rate": 2e-05, + "loss": 0.39440346, + "step": 238 + }, + { + "epoch": 0.478, + "grad_norm": 2.281733512878418, + "learning_rate": 2e-05, + "loss": 0.3807506, + "step": 239 + }, + { + "epoch": 0.48, + "grad_norm": 2.7786262035369873, + "learning_rate": 2e-05, + "loss": 0.34910616, + "step": 240 + }, + { + "epoch": 0.482, + "grad_norm": 2.339578866958618, + "learning_rate": 2e-05, + "loss": 0.40123379, + "step": 241 + }, + { + "epoch": 0.484, + "grad_norm": 2.5612998008728027, + "learning_rate": 2e-05, + "loss": 0.34685874, + "step": 242 + }, + { + "epoch": 0.486, + "grad_norm": 2.650993824005127, + "learning_rate": 2e-05, + "loss": 0.37038237, + "step": 243 + }, + { + "epoch": 0.488, + "grad_norm": 2.3199408054351807, + "learning_rate": 2e-05, + "loss": 0.35883853, + "step": 244 + }, + { + "epoch": 0.49, + "grad_norm": 3.2961573600769043, + "learning_rate": 2e-05, + "loss": 0.4362112, + "step": 245 + }, + { + "epoch": 0.492, + "grad_norm": 2.3096096515655518, + "learning_rate": 2e-05, + "loss": 0.3606168, + "step": 246 + }, + { + "epoch": 0.494, + "grad_norm": 2.3645904064178467, + "learning_rate": 2e-05, + "loss": 0.42863727, + "step": 247 + }, + { + "epoch": 0.496, + "grad_norm": 2.2203831672668457, + "learning_rate": 2e-05, + "loss": 0.40489531, + "step": 248 + }, + { + "epoch": 0.498, + "grad_norm": 2.45977783203125, + "learning_rate": 2e-05, + "loss": 0.39940965, + "step": 249 + }, + { + "epoch": 0.5, + "grad_norm": 2.9246718883514404, + "learning_rate": 2e-05, + "loss": 0.39140785, + "step": 250 + }, + { + "epoch": 0.502, + "grad_norm": 1.8212493658065796, + "learning_rate": 2e-05, + "loss": 0.36203432, + "step": 251 + }, + { + "epoch": 0.504, + "grad_norm": 2.2326316833496094, + "learning_rate": 2e-05, + "loss": 0.44940746, + "step": 252 + }, + { + "epoch": 0.506, + "grad_norm": 2.1888906955718994, + "learning_rate": 2e-05, + "loss": 0.42821601, + "step": 253 + }, + { + "epoch": 0.508, + "grad_norm": 3.0814597606658936, + "learning_rate": 2e-05, + "loss": 0.36414945, + "step": 254 + }, + { + "epoch": 0.51, + "grad_norm": 2.7043652534484863, + "learning_rate": 2e-05, + "loss": 0.39156815, + "step": 255 + }, + { + "epoch": 0.512, + "grad_norm": 2.3231751918792725, + "learning_rate": 2e-05, + "loss": 0.37385294, + "step": 256 + }, + { + "epoch": 0.514, + "grad_norm": 2.275028705596924, + "learning_rate": 2e-05, + "loss": 0.38484564, + "step": 257 + }, + { + "epoch": 0.516, + "grad_norm": 2.472454071044922, + 
"learning_rate": 2e-05, + "loss": 0.36479205, + "step": 258 + }, + { + "epoch": 0.518, + "grad_norm": 2.479954481124878, + "learning_rate": 2e-05, + "loss": 0.38072658, + "step": 259 + }, + { + "epoch": 0.52, + "grad_norm": 2.311441659927368, + "learning_rate": 2e-05, + "loss": 0.33453172, + "step": 260 + }, + { + "epoch": 0.522, + "grad_norm": 3.8335723876953125, + "learning_rate": 2e-05, + "loss": 0.40032426, + "step": 261 + }, + { + "epoch": 0.524, + "grad_norm": 2.4983038902282715, + "learning_rate": 2e-05, + "loss": 0.37842286, + "step": 262 + }, + { + "epoch": 0.526, + "grad_norm": 2.8506362438201904, + "learning_rate": 2e-05, + "loss": 0.45623779, + "step": 263 + }, + { + "epoch": 0.528, + "grad_norm": 2.5922443866729736, + "learning_rate": 2e-05, + "loss": 0.42091221, + "step": 264 + }, + { + "epoch": 0.53, + "grad_norm": 2.4378294944763184, + "learning_rate": 2e-05, + "loss": 0.43036759, + "step": 265 + }, + { + "epoch": 0.532, + "grad_norm": 2.5051655769348145, + "learning_rate": 2e-05, + "loss": 0.36193722, + "step": 266 + }, + { + "epoch": 0.534, + "grad_norm": 2.4639880657196045, + "learning_rate": 2e-05, + "loss": 0.42388517, + "step": 267 + }, + { + "epoch": 0.536, + "grad_norm": 2.43228816986084, + "learning_rate": 2e-05, + "loss": 0.37180698, + "step": 268 + }, + { + "epoch": 0.538, + "grad_norm": 2.8706986904144287, + "learning_rate": 2e-05, + "loss": 0.3466678, + "step": 269 + }, + { + "epoch": 0.54, + "grad_norm": 2.8321847915649414, + "learning_rate": 2e-05, + "loss": 0.36745179, + "step": 270 + }, + { + "epoch": 0.542, + "grad_norm": 2.8276920318603516, + "learning_rate": 2e-05, + "loss": 0.38181525, + "step": 271 + }, + { + "epoch": 0.544, + "grad_norm": 2.267299175262451, + "learning_rate": 2e-05, + "loss": 0.36299157, + "step": 272 + }, + { + "epoch": 0.546, + "grad_norm": 2.176030158996582, + "learning_rate": 2e-05, + "loss": 0.35883152, + "step": 273 + }, + { + "epoch": 0.548, + "grad_norm": 2.4151179790496826, + "learning_rate": 2e-05, + "loss": 0.35068169, + "step": 274 + }, + { + "epoch": 0.55, + "grad_norm": 2.5002832412719727, + "learning_rate": 2e-05, + "loss": 0.4389441, + "step": 275 + }, + { + "epoch": 0.552, + "grad_norm": 3.062483072280884, + "learning_rate": 2e-05, + "loss": 0.38645649, + "step": 276 + }, + { + "epoch": 0.554, + "grad_norm": 1.8937181234359741, + "learning_rate": 2e-05, + "loss": 0.32340169, + "step": 277 + }, + { + "epoch": 0.556, + "grad_norm": 2.4653971195220947, + "learning_rate": 2e-05, + "loss": 0.44503152, + "step": 278 + }, + { + "epoch": 0.558, + "grad_norm": 2.399894952774048, + "learning_rate": 2e-05, + "loss": 0.3736915, + "step": 279 + }, + { + "epoch": 0.56, + "grad_norm": 3.2674710750579834, + "learning_rate": 2e-05, + "loss": 0.37925249, + "step": 280 + }, + { + "epoch": 0.562, + "grad_norm": 2.1981282234191895, + "learning_rate": 2e-05, + "loss": 0.379641, + "step": 281 + }, + { + "epoch": 0.564, + "grad_norm": 2.4101972579956055, + "learning_rate": 2e-05, + "loss": 0.32225233, + "step": 282 + }, + { + "epoch": 0.566, + "grad_norm": 3.3641390800476074, + "learning_rate": 2e-05, + "loss": 0.40978536, + "step": 283 + }, + { + "epoch": 0.568, + "grad_norm": 2.453429937362671, + "learning_rate": 2e-05, + "loss": 0.42054862, + "step": 284 + }, + { + "epoch": 0.57, + "grad_norm": 2.228393793106079, + "learning_rate": 2e-05, + "loss": 0.36940455, + "step": 285 + }, + { + "epoch": 0.572, + "grad_norm": 1.8759377002716064, + "learning_rate": 2e-05, + "loss": 0.3323741, + "step": 286 + }, + { + "epoch": 0.574, + "grad_norm": 
2.5758562088012695, + "learning_rate": 2e-05, + "loss": 0.40134084, + "step": 287 + }, + { + "epoch": 0.576, + "grad_norm": 2.2973790168762207, + "learning_rate": 2e-05, + "loss": 0.42725891, + "step": 288 + }, + { + "epoch": 0.578, + "grad_norm": 2.4652743339538574, + "learning_rate": 2e-05, + "loss": 0.35494244, + "step": 289 + }, + { + "epoch": 0.58, + "grad_norm": 2.2309014797210693, + "learning_rate": 2e-05, + "loss": 0.3866846, + "step": 290 + }, + { + "epoch": 0.582, + "grad_norm": 2.242001533508301, + "learning_rate": 2e-05, + "loss": 0.4193109, + "step": 291 + }, + { + "epoch": 0.584, + "grad_norm": 2.306638479232788, + "learning_rate": 2e-05, + "loss": 0.38435796, + "step": 292 + }, + { + "epoch": 0.586, + "grad_norm": 2.3204331398010254, + "learning_rate": 2e-05, + "loss": 0.32084846, + "step": 293 + }, + { + "epoch": 0.588, + "grad_norm": 2.4214060306549072, + "learning_rate": 2e-05, + "loss": 0.36136317, + "step": 294 + }, + { + "epoch": 0.59, + "grad_norm": 1.7441887855529785, + "learning_rate": 2e-05, + "loss": 0.38891983, + "step": 295 + }, + { + "epoch": 0.592, + "grad_norm": 2.1641783714294434, + "learning_rate": 2e-05, + "loss": 0.3675043, + "step": 296 + }, + { + "epoch": 0.594, + "grad_norm": 2.537519931793213, + "learning_rate": 2e-05, + "loss": 0.39061338, + "step": 297 + }, + { + "epoch": 0.596, + "grad_norm": 2.3807148933410645, + "learning_rate": 2e-05, + "loss": 0.38095194, + "step": 298 + }, + { + "epoch": 0.598, + "grad_norm": 3.760620594024658, + "learning_rate": 2e-05, + "loss": 0.38023266, + "step": 299 + }, + { + "epoch": 0.6, + "grad_norm": 2.8382818698883057, + "learning_rate": 2e-05, + "loss": 0.42183822, + "step": 300 + }, + { + "epoch": 0.602, + "grad_norm": 2.5104598999023438, + "learning_rate": 2e-05, + "loss": 0.38760763, + "step": 301 + }, + { + "epoch": 0.604, + "grad_norm": 3.4479098320007324, + "learning_rate": 2e-05, + "loss": 0.38792706, + "step": 302 + }, + { + "epoch": 0.606, + "grad_norm": 2.2696542739868164, + "learning_rate": 2e-05, + "loss": 0.43671882, + "step": 303 + }, + { + "epoch": 0.608, + "grad_norm": 2.4829628467559814, + "learning_rate": 2e-05, + "loss": 0.38020101, + "step": 304 + }, + { + "epoch": 0.61, + "grad_norm": 3.053079843521118, + "learning_rate": 2e-05, + "loss": 0.42556402, + "step": 305 + }, + { + "epoch": 0.612, + "grad_norm": 2.130146026611328, + "learning_rate": 2e-05, + "loss": 0.36177719, + "step": 306 + }, + { + "epoch": 0.614, + "grad_norm": 5.296567440032959, + "learning_rate": 2e-05, + "loss": 0.40961123, + "step": 307 + }, + { + "epoch": 0.616, + "grad_norm": 2.4534523487091064, + "learning_rate": 2e-05, + "loss": 0.34303677, + "step": 308 + }, + { + "epoch": 0.618, + "grad_norm": 1.9672064781188965, + "learning_rate": 2e-05, + "loss": 0.42068005, + "step": 309 + }, + { + "epoch": 0.62, + "grad_norm": 3.3204848766326904, + "learning_rate": 2e-05, + "loss": 0.37758303, + "step": 310 + }, + { + "epoch": 0.622, + "grad_norm": 2.4542860984802246, + "learning_rate": 2e-05, + "loss": 0.41424572, + "step": 311 + }, + { + "epoch": 0.624, + "grad_norm": 1.9260770082473755, + "learning_rate": 2e-05, + "loss": 0.34970552, + "step": 312 + }, + { + "epoch": 0.626, + "grad_norm": 2.6891448497772217, + "learning_rate": 2e-05, + "loss": 0.30088019, + "step": 313 + }, + { + "epoch": 0.628, + "grad_norm": 2.968409538269043, + "learning_rate": 2e-05, + "loss": 0.42999855, + "step": 314 + }, + { + "epoch": 0.63, + "grad_norm": 2.5964770317077637, + "learning_rate": 2e-05, + "loss": 0.41451907, + "step": 315 + }, + { + 
"epoch": 0.632, + "grad_norm": 2.4311039447784424, + "learning_rate": 2e-05, + "loss": 0.35747087, + "step": 316 + }, + { + "epoch": 0.634, + "grad_norm": 2.2850985527038574, + "learning_rate": 2e-05, + "loss": 0.45336699, + "step": 317 + }, + { + "epoch": 0.636, + "grad_norm": 2.128378391265869, + "learning_rate": 2e-05, + "loss": 0.40923813, + "step": 318 + }, + { + "epoch": 0.638, + "grad_norm": 2.9572834968566895, + "learning_rate": 2e-05, + "loss": 0.40831214, + "step": 319 + }, + { + "epoch": 0.64, + "grad_norm": 2.1294198036193848, + "learning_rate": 2e-05, + "loss": 0.352382, + "step": 320 + }, + { + "epoch": 0.642, + "grad_norm": 2.146904706954956, + "learning_rate": 2e-05, + "loss": 0.36986923, + "step": 321 + }, + { + "epoch": 0.644, + "grad_norm": 2.226712465286255, + "learning_rate": 2e-05, + "loss": 0.37009352, + "step": 322 + }, + { + "epoch": 0.646, + "grad_norm": 2.8748672008514404, + "learning_rate": 2e-05, + "loss": 0.39577782, + "step": 323 + }, + { + "epoch": 0.648, + "grad_norm": 2.0993311405181885, + "learning_rate": 2e-05, + "loss": 0.34025472, + "step": 324 + }, + { + "epoch": 0.65, + "grad_norm": 2.553697347640991, + "learning_rate": 2e-05, + "loss": 0.35979444, + "step": 325 + }, + { + "epoch": 0.652, + "grad_norm": 3.0230658054351807, + "learning_rate": 2e-05, + "loss": 0.38101768, + "step": 326 + }, + { + "epoch": 0.654, + "grad_norm": 2.227163076400757, + "learning_rate": 2e-05, + "loss": 0.36153871, + "step": 327 + }, + { + "epoch": 0.656, + "grad_norm": 2.366971731185913, + "learning_rate": 2e-05, + "loss": 0.40203893, + "step": 328 + }, + { + "epoch": 0.658, + "grad_norm": 4.39686393737793, + "learning_rate": 2e-05, + "loss": 0.34971178, + "step": 329 + }, + { + "epoch": 0.66, + "grad_norm": 3.1364448070526123, + "learning_rate": 2e-05, + "loss": 0.38524896, + "step": 330 + }, + { + "epoch": 0.662, + "grad_norm": 2.4799602031707764, + "learning_rate": 2e-05, + "loss": 0.3510325, + "step": 331 + }, + { + "epoch": 0.664, + "grad_norm": 3.8624725341796875, + "learning_rate": 2e-05, + "loss": 0.40343612, + "step": 332 + }, + { + "epoch": 0.666, + "grad_norm": 2.8651304244995117, + "learning_rate": 2e-05, + "loss": 0.3543312, + "step": 333 + }, + { + "epoch": 0.668, + "grad_norm": 2.507993221282959, + "learning_rate": 2e-05, + "loss": 0.4071638, + "step": 334 + }, + { + "epoch": 0.67, + "grad_norm": 2.307523488998413, + "learning_rate": 2e-05, + "loss": 0.34446114, + "step": 335 + }, + { + "epoch": 0.672, + "grad_norm": 2.8034613132476807, + "learning_rate": 2e-05, + "loss": 0.36103964, + "step": 336 + }, + { + "epoch": 0.674, + "grad_norm": 2.804739236831665, + "learning_rate": 2e-05, + "loss": 0.34058389, + "step": 337 + }, + { + "epoch": 0.676, + "grad_norm": 2.3788864612579346, + "learning_rate": 2e-05, + "loss": 0.3461957, + "step": 338 + }, + { + "epoch": 0.678, + "grad_norm": 2.2598109245300293, + "learning_rate": 2e-05, + "loss": 0.34844282, + "step": 339 + }, + { + "epoch": 0.68, + "grad_norm": 2.7016749382019043, + "learning_rate": 2e-05, + "loss": 0.37224805, + "step": 340 + }, + { + "epoch": 0.682, + "grad_norm": 2.102294445037842, + "learning_rate": 2e-05, + "loss": 0.34660944, + "step": 341 + }, + { + "epoch": 0.684, + "grad_norm": 2.256808280944824, + "learning_rate": 2e-05, + "loss": 0.36872256, + "step": 342 + }, + { + "epoch": 0.686, + "grad_norm": 2.219933032989502, + "learning_rate": 2e-05, + "loss": 0.41074491, + "step": 343 + }, + { + "epoch": 0.688, + "grad_norm": 38.59788131713867, + "learning_rate": 2e-05, + "loss": 0.42532402, + "step": 
344 + }, + { + "epoch": 0.69, + "grad_norm": 2.8445427417755127, + "learning_rate": 2e-05, + "loss": 0.40610889, + "step": 345 + }, + { + "epoch": 0.692, + "grad_norm": 3.2422025203704834, + "learning_rate": 2e-05, + "loss": 0.34015438, + "step": 346 + }, + { + "epoch": 0.694, + "grad_norm": 2.0103511810302734, + "learning_rate": 2e-05, + "loss": 0.33063939, + "step": 347 + }, + { + "epoch": 0.696, + "grad_norm": 2.821288585662842, + "learning_rate": 2e-05, + "loss": 0.40264255, + "step": 348 + }, + { + "epoch": 0.698, + "grad_norm": 3.3211405277252197, + "learning_rate": 2e-05, + "loss": 0.33633679, + "step": 349 + }, + { + "epoch": 0.7, + "grad_norm": 2.1281816959381104, + "learning_rate": 2e-05, + "loss": 0.36737266, + "step": 350 + }, + { + "epoch": 0.702, + "grad_norm": 2.546674966812134, + "learning_rate": 2e-05, + "loss": 0.39834434, + "step": 351 + }, + { + "epoch": 0.704, + "grad_norm": 1.9590989351272583, + "learning_rate": 2e-05, + "loss": 0.376955, + "step": 352 + }, + { + "epoch": 0.706, + "grad_norm": 2.5581297874450684, + "learning_rate": 2e-05, + "loss": 0.39899454, + "step": 353 + }, + { + "epoch": 0.708, + "grad_norm": 2.3324179649353027, + "learning_rate": 2e-05, + "loss": 0.37603098, + "step": 354 + }, + { + "epoch": 0.71, + "grad_norm": 2.334880828857422, + "learning_rate": 2e-05, + "loss": 0.42339247, + "step": 355 + }, + { + "epoch": 0.712, + "grad_norm": 3.104797601699829, + "learning_rate": 2e-05, + "loss": 0.36481744, + "step": 356 + }, + { + "epoch": 0.714, + "grad_norm": 6.291801929473877, + "learning_rate": 2e-05, + "loss": 0.40678573, + "step": 357 + }, + { + "epoch": 0.716, + "grad_norm": 4.675421237945557, + "learning_rate": 2e-05, + "loss": 0.39416665, + "step": 358 + }, + { + "epoch": 0.718, + "grad_norm": 2.531383752822876, + "learning_rate": 2e-05, + "loss": 0.35706043, + "step": 359 + }, + { + "epoch": 0.72, + "grad_norm": 5.778651237487793, + "learning_rate": 2e-05, + "loss": 0.41085088, + "step": 360 + }, + { + "epoch": 0.722, + "grad_norm": 2.785738945007324, + "learning_rate": 2e-05, + "loss": 0.31312498, + "step": 361 + }, + { + "epoch": 0.724, + "grad_norm": 4.2192063331604, + "learning_rate": 2e-05, + "loss": 0.43665931, + "step": 362 + }, + { + "epoch": 0.726, + "grad_norm": 2.268357515335083, + "learning_rate": 2e-05, + "loss": 0.41825864, + "step": 363 + }, + { + "epoch": 0.728, + "grad_norm": 2.246572494506836, + "learning_rate": 2e-05, + "loss": 0.36334115, + "step": 364 + }, + { + "epoch": 0.73, + "grad_norm": 2.2913174629211426, + "learning_rate": 2e-05, + "loss": 0.40367627, + "step": 365 + }, + { + "epoch": 0.732, + "grad_norm": 1.9627586603164673, + "learning_rate": 2e-05, + "loss": 0.36439764, + "step": 366 + }, + { + "epoch": 0.734, + "grad_norm": 2.0793967247009277, + "learning_rate": 2e-05, + "loss": 0.3633121, + "step": 367 + }, + { + "epoch": 0.736, + "grad_norm": 2.039069175720215, + "learning_rate": 2e-05, + "loss": 0.37875551, + "step": 368 + }, + { + "epoch": 0.738, + "grad_norm": 2.103626251220703, + "learning_rate": 2e-05, + "loss": 0.34807205, + "step": 369 + }, + { + "epoch": 0.74, + "grad_norm": 1.898775339126587, + "learning_rate": 2e-05, + "loss": 0.3396126, + "step": 370 + }, + { + "epoch": 0.742, + "grad_norm": 2.118276596069336, + "learning_rate": 2e-05, + "loss": 0.38108289, + "step": 371 + }, + { + "epoch": 0.744, + "grad_norm": 2.3276379108428955, + "learning_rate": 2e-05, + "loss": 0.35690206, + "step": 372 + }, + { + "epoch": 0.746, + "grad_norm": 2.173872232437134, + "learning_rate": 2e-05, + "loss": 
0.35335249, + "step": 373 + }, + { + "epoch": 0.748, + "grad_norm": 1.8903744220733643, + "learning_rate": 2e-05, + "loss": 0.37995228, + "step": 374 + }, + { + "epoch": 0.75, + "grad_norm": 2.2106189727783203, + "learning_rate": 2e-05, + "loss": 0.34882003, + "step": 375 + }, + { + "epoch": 0.752, + "grad_norm": 2.048971652984619, + "learning_rate": 2e-05, + "loss": 0.43542331, + "step": 376 + }, + { + "epoch": 0.754, + "grad_norm": 2.003777503967285, + "learning_rate": 2e-05, + "loss": 0.3117795, + "step": 377 + }, + { + "epoch": 0.756, + "grad_norm": 2.0449531078338623, + "learning_rate": 2e-05, + "loss": 0.33903271, + "step": 378 + }, + { + "epoch": 0.758, + "grad_norm": 2.3183586597442627, + "learning_rate": 2e-05, + "loss": 0.33581644, + "step": 379 + }, + { + "epoch": 0.76, + "grad_norm": 1.9782938957214355, + "learning_rate": 2e-05, + "loss": 0.31719434, + "step": 380 + }, + { + "epoch": 0.762, + "grad_norm": 2.3408687114715576, + "learning_rate": 2e-05, + "loss": 0.38454038, + "step": 381 + }, + { + "epoch": 0.764, + "grad_norm": 2.642451524734497, + "learning_rate": 2e-05, + "loss": 0.36350057, + "step": 382 + }, + { + "epoch": 0.766, + "grad_norm": 2.2716588973999023, + "learning_rate": 2e-05, + "loss": 0.3672176, + "step": 383 + }, + { + "epoch": 0.768, + "grad_norm": 2.143385410308838, + "learning_rate": 2e-05, + "loss": 0.35499257, + "step": 384 + }, + { + "epoch": 0.77, + "grad_norm": 2.251404285430908, + "learning_rate": 2e-05, + "loss": 0.33705157, + "step": 385 + }, + { + "epoch": 0.772, + "grad_norm": 2.599787712097168, + "learning_rate": 2e-05, + "loss": 0.40010357, + "step": 386 + }, + { + "epoch": 0.774, + "grad_norm": 2.932671308517456, + "learning_rate": 2e-05, + "loss": 0.41798162, + "step": 387 + }, + { + "epoch": 0.776, + "grad_norm": 3.084031343460083, + "learning_rate": 2e-05, + "loss": 0.40057978, + "step": 388 + }, + { + "epoch": 0.778, + "grad_norm": 2.370199680328369, + "learning_rate": 2e-05, + "loss": 0.37256229, + "step": 389 + }, + { + "epoch": 0.78, + "grad_norm": 2.715414524078369, + "learning_rate": 2e-05, + "loss": 0.3550342, + "step": 390 + }, + { + "epoch": 0.782, + "grad_norm": 2.370290756225586, + "learning_rate": 2e-05, + "loss": 0.40933335, + "step": 391 + }, + { + "epoch": 0.784, + "grad_norm": 2.753520965576172, + "learning_rate": 2e-05, + "loss": 0.37647349, + "step": 392 + }, + { + "epoch": 0.786, + "grad_norm": 3.428513526916504, + "learning_rate": 2e-05, + "loss": 0.37809077, + "step": 393 + }, + { + "epoch": 0.788, + "grad_norm": 3.0542285442352295, + "learning_rate": 2e-05, + "loss": 0.3635264, + "step": 394 + }, + { + "epoch": 0.79, + "grad_norm": 2.631666421890259, + "learning_rate": 2e-05, + "loss": 0.37626997, + "step": 395 + }, + { + "epoch": 0.792, + "grad_norm": 2.9206936359405518, + "learning_rate": 2e-05, + "loss": 0.3362987, + "step": 396 + }, + { + "epoch": 0.794, + "grad_norm": 2.600062608718872, + "learning_rate": 2e-05, + "loss": 0.36531377, + "step": 397 + }, + { + "epoch": 0.796, + "grad_norm": 2.959347724914551, + "learning_rate": 2e-05, + "loss": 0.39553091, + "step": 398 + }, + { + "epoch": 0.798, + "grad_norm": 2.826603651046753, + "learning_rate": 2e-05, + "loss": 0.35759783, + "step": 399 + }, + { + "epoch": 0.8, + "grad_norm": 4.756749153137207, + "learning_rate": 2e-05, + "loss": 0.3761026, + "step": 400 + }, + { + "epoch": 0.802, + "grad_norm": 2.015024185180664, + "learning_rate": 2e-05, + "loss": 0.37341845, + "step": 401 + }, + { + "epoch": 0.804, + "grad_norm": 2.473043203353882, + "learning_rate": 2e-05, + 
"loss": 0.38090366, + "step": 402 + }, + { + "epoch": 0.806, + "grad_norm": 3.260671854019165, + "learning_rate": 2e-05, + "loss": 0.40409699, + "step": 403 + }, + { + "epoch": 0.808, + "grad_norm": 1.9450372457504272, + "learning_rate": 2e-05, + "loss": 0.38324416, + "step": 404 + }, + { + "epoch": 0.81, + "grad_norm": 2.213120460510254, + "learning_rate": 2e-05, + "loss": 0.32400131, + "step": 405 + }, + { + "epoch": 0.812, + "grad_norm": 2.4603614807128906, + "learning_rate": 2e-05, + "loss": 0.38184336, + "step": 406 + }, + { + "epoch": 0.814, + "grad_norm": 2.590217351913452, + "learning_rate": 2e-05, + "loss": 0.31220454, + "step": 407 + }, + { + "epoch": 0.816, + "grad_norm": 2.3522443771362305, + "learning_rate": 2e-05, + "loss": 0.37040508, + "step": 408 + }, + { + "epoch": 0.818, + "grad_norm": 3.8462178707122803, + "learning_rate": 2e-05, + "loss": 0.31150436, + "step": 409 + }, + { + "epoch": 0.82, + "grad_norm": 2.3633556365966797, + "learning_rate": 2e-05, + "loss": 0.33486691, + "step": 410 + }, + { + "epoch": 0.822, + "grad_norm": 2.797116756439209, + "learning_rate": 2e-05, + "loss": 0.35024345, + "step": 411 + }, + { + "epoch": 0.824, + "grad_norm": 2.787532329559326, + "learning_rate": 2e-05, + "loss": 0.34601441, + "step": 412 + }, + { + "epoch": 0.826, + "grad_norm": 2.7806966304779053, + "learning_rate": 2e-05, + "loss": 0.37785235, + "step": 413 + }, + { + "epoch": 0.828, + "grad_norm": 3.3908019065856934, + "learning_rate": 2e-05, + "loss": 0.40772134, + "step": 414 + }, + { + "epoch": 0.83, + "grad_norm": 2.3474032878875732, + "learning_rate": 2e-05, + "loss": 0.31730652, + "step": 415 + }, + { + "epoch": 0.832, + "grad_norm": 3.695180892944336, + "learning_rate": 2e-05, + "loss": 0.38893342, + "step": 416 + }, + { + "epoch": 0.834, + "grad_norm": 2.969644784927368, + "learning_rate": 2e-05, + "loss": 0.37266415, + "step": 417 + }, + { + "epoch": 0.836, + "grad_norm": 3.3439037799835205, + "learning_rate": 2e-05, + "loss": 0.36170414, + "step": 418 + }, + { + "epoch": 0.838, + "grad_norm": 2.669370651245117, + "learning_rate": 2e-05, + "loss": 0.41537088, + "step": 419 + }, + { + "epoch": 0.84, + "grad_norm": 3.2042794227600098, + "learning_rate": 2e-05, + "loss": 0.34706631, + "step": 420 + }, + { + "epoch": 0.842, + "grad_norm": 3.0707359313964844, + "learning_rate": 2e-05, + "loss": 0.36334276, + "step": 421 + }, + { + "epoch": 0.844, + "grad_norm": 3.0460245609283447, + "learning_rate": 2e-05, + "loss": 0.3588101, + "step": 422 + }, + { + "epoch": 0.846, + "grad_norm": 3.4480361938476562, + "learning_rate": 2e-05, + "loss": 0.35593536, + "step": 423 + }, + { + "epoch": 0.848, + "grad_norm": 3.668825626373291, + "learning_rate": 2e-05, + "loss": 0.36293906, + "step": 424 + }, + { + "epoch": 0.85, + "grad_norm": 3.644979953765869, + "learning_rate": 2e-05, + "loss": 0.33053726, + "step": 425 + }, + { + "epoch": 0.852, + "grad_norm": 3.7507691383361816, + "learning_rate": 2e-05, + "loss": 0.33477077, + "step": 426 + }, + { + "epoch": 0.854, + "grad_norm": 3.2668840885162354, + "learning_rate": 2e-05, + "loss": 0.35934401, + "step": 427 + }, + { + "epoch": 0.856, + "grad_norm": 3.24027681350708, + "learning_rate": 2e-05, + "loss": 0.28396821, + "step": 428 + }, + { + "epoch": 0.858, + "grad_norm": 12.503198623657227, + "learning_rate": 2e-05, + "loss": 0.32771713, + "step": 429 + }, + { + "epoch": 0.86, + "grad_norm": 4.297608852386475, + "learning_rate": 2e-05, + "loss": 0.33745548, + "step": 430 + }, + { + "epoch": 0.862, + "grad_norm": 4.472741603851318, + 
"learning_rate": 2e-05, + "loss": 0.27140242, + "step": 431 + }, + { + "epoch": 0.864, + "grad_norm": 3.206968069076538, + "learning_rate": 2e-05, + "loss": 0.28080299, + "step": 432 + }, + { + "epoch": 0.866, + "grad_norm": 3.3867924213409424, + "learning_rate": 2e-05, + "loss": 0.3420102, + "step": 433 + }, + { + "epoch": 0.868, + "grad_norm": 3.833103656768799, + "learning_rate": 2e-05, + "loss": 0.27758431, + "step": 434 + }, + { + "epoch": 0.87, + "grad_norm": 3.7450990676879883, + "learning_rate": 2e-05, + "loss": 0.27860394, + "step": 435 + }, + { + "epoch": 0.872, + "grad_norm": 4.227891445159912, + "learning_rate": 2e-05, + "loss": 0.31056997, + "step": 436 + }, + { + "epoch": 0.874, + "grad_norm": 3.7188467979431152, + "learning_rate": 2e-05, + "loss": 0.26568019, + "step": 437 + }, + { + "epoch": 0.876, + "grad_norm": 3.5237154960632324, + "learning_rate": 2e-05, + "loss": 0.25259641, + "step": 438 + }, + { + "epoch": 0.878, + "grad_norm": 5.686617851257324, + "learning_rate": 2e-05, + "loss": 0.2755993, + "step": 439 + }, + { + "epoch": 0.88, + "grad_norm": 3.4197587966918945, + "learning_rate": 2e-05, + "loss": 0.23646541, + "step": 440 + }, + { + "epoch": 0.882, + "grad_norm": 4.551022052764893, + "learning_rate": 2e-05, + "loss": 0.22122362, + "step": 441 + }, + { + "epoch": 0.884, + "grad_norm": 4.930042266845703, + "learning_rate": 2e-05, + "loss": 0.27152666, + "step": 442 + }, + { + "epoch": 0.886, + "grad_norm": 3.824270725250244, + "learning_rate": 2e-05, + "loss": 0.2230306, + "step": 443 + }, + { + "epoch": 0.888, + "grad_norm": 3.917961597442627, + "learning_rate": 2e-05, + "loss": 0.27966443, + "step": 444 + }, + { + "epoch": 0.89, + "grad_norm": 3.9343419075012207, + "learning_rate": 2e-05, + "loss": 0.20023456, + "step": 445 + }, + { + "epoch": 0.892, + "grad_norm": 4.3064775466918945, + "learning_rate": 2e-05, + "loss": 0.19676761, + "step": 446 + }, + { + "epoch": 0.894, + "grad_norm": 3.1968557834625244, + "learning_rate": 2e-05, + "loss": 0.19466686, + "step": 447 + }, + { + "epoch": 0.896, + "grad_norm": 4.086765289306641, + "learning_rate": 2e-05, + "loss": 0.22548294, + "step": 448 + }, + { + "epoch": 0.898, + "grad_norm": 4.0280256271362305, + "learning_rate": 2e-05, + "loss": 0.22720584, + "step": 449 + }, + { + "epoch": 0.9, + "grad_norm": 4.620728492736816, + "learning_rate": 2e-05, + "loss": 0.17157443, + "step": 450 + }, + { + "epoch": 0.902, + "grad_norm": 3.6249382495880127, + "learning_rate": 2e-05, + "loss": 0.23641428, + "step": 451 + }, + { + "epoch": 0.904, + "grad_norm": 4.136668682098389, + "learning_rate": 2e-05, + "loss": 0.21479097, + "step": 452 + }, + { + "epoch": 0.906, + "grad_norm": 7.2505879402160645, + "learning_rate": 2e-05, + "loss": 0.27446824, + "step": 453 + }, + { + "epoch": 0.908, + "grad_norm": 5.988163471221924, + "learning_rate": 2e-05, + "loss": 0.21940503, + "step": 454 + }, + { + "epoch": 0.91, + "grad_norm": 6.0386505126953125, + "learning_rate": 2e-05, + "loss": 0.19830205, + "step": 455 + }, + { + "epoch": 0.912, + "grad_norm": 3.1702799797058105, + "learning_rate": 2e-05, + "loss": 0.1882799, + "step": 456 + }, + { + "epoch": 0.914, + "grad_norm": 4.0222015380859375, + "learning_rate": 2e-05, + "loss": 0.23622099, + "step": 457 + }, + { + "epoch": 0.916, + "grad_norm": 3.4282891750335693, + "learning_rate": 2e-05, + "loss": 0.18106145, + "step": 458 + }, + { + "epoch": 0.918, + "grad_norm": 4.8152337074279785, + "learning_rate": 2e-05, + "loss": 0.17646313, + "step": 459 + }, + { + "epoch": 0.92, + "grad_norm": 
3.1678340435028076, + "learning_rate": 2e-05, + "loss": 0.20366624, + "step": 460 + }, + { + "epoch": 0.922, + "grad_norm": 2.8468918800354004, + "learning_rate": 2e-05, + "loss": 0.20546392, + "step": 461 + }, + { + "epoch": 0.924, + "grad_norm": 3.2070858478546143, + "learning_rate": 2e-05, + "loss": 0.16359358, + "step": 462 + }, + { + "epoch": 0.926, + "grad_norm": 3.970043420791626, + "learning_rate": 2e-05, + "loss": 0.21810345, + "step": 463 + }, + { + "epoch": 0.928, + "grad_norm": 2.7538678646087646, + "learning_rate": 2e-05, + "loss": 0.15927938, + "step": 464 + }, + { + "epoch": 0.93, + "grad_norm": 3.887653112411499, + "learning_rate": 2e-05, + "loss": 0.17965358, + "step": 465 + }, + { + "epoch": 0.932, + "grad_norm": 2.4924097061157227, + "learning_rate": 2e-05, + "loss": 0.14478508, + "step": 466 + }, + { + "epoch": 0.934, + "grad_norm": 2.4270923137664795, + "learning_rate": 2e-05, + "loss": 0.1299592, + "step": 467 + }, + { + "epoch": 0.936, + "grad_norm": 2.980048179626465, + "learning_rate": 2e-05, + "loss": 0.16589662, + "step": 468 + }, + { + "epoch": 0.938, + "grad_norm": 2.9414331912994385, + "learning_rate": 2e-05, + "loss": 0.15968111, + "step": 469 + }, + { + "epoch": 0.94, + "grad_norm": 3.0330123901367188, + "learning_rate": 2e-05, + "loss": 0.17940134, + "step": 470 + }, + { + "epoch": 0.942, + "grad_norm": 4.506875514984131, + "learning_rate": 2e-05, + "loss": 0.17710808, + "step": 471 + }, + { + "epoch": 0.944, + "grad_norm": 4.033360481262207, + "learning_rate": 2e-05, + "loss": 0.16440117, + "step": 472 + }, + { + "epoch": 0.946, + "grad_norm": 3.041499376296997, + "learning_rate": 2e-05, + "loss": 0.18891403, + "step": 473 + }, + { + "epoch": 0.948, + "grad_norm": 3.819559335708618, + "learning_rate": 2e-05, + "loss": 0.17200641, + "step": 474 + }, + { + "epoch": 0.95, + "grad_norm": 2.71626615524292, + "learning_rate": 2e-05, + "loss": 0.14698125, + "step": 475 + }, + { + "epoch": 0.952, + "grad_norm": 4.742694854736328, + "learning_rate": 2e-05, + "loss": 0.18361038, + "step": 476 + }, + { + "epoch": 0.954, + "grad_norm": 3.403785228729248, + "learning_rate": 2e-05, + "loss": 0.15806127, + "step": 477 + }, + { + "epoch": 0.956, + "grad_norm": 3.088829278945923, + "learning_rate": 2e-05, + "loss": 0.19003233, + "step": 478 + }, + { + "epoch": 0.958, + "grad_norm": 4.706967353820801, + "learning_rate": 2e-05, + "loss": 0.19501597, + "step": 479 + }, + { + "epoch": 0.96, + "grad_norm": 2.4652099609375, + "learning_rate": 2e-05, + "loss": 0.16612351, + "step": 480 + }, + { + "epoch": 0.962, + "grad_norm": 4.185473918914795, + "learning_rate": 2e-05, + "loss": 0.14217728, + "step": 481 + }, + { + "epoch": 0.964, + "grad_norm": 3.6370861530303955, + "learning_rate": 2e-05, + "loss": 0.18404439, + "step": 482 + }, + { + "epoch": 0.966, + "grad_norm": 4.554826736450195, + "learning_rate": 2e-05, + "loss": 0.16499752, + "step": 483 + }, + { + "epoch": 0.968, + "grad_norm": 2.758190393447876, + "learning_rate": 2e-05, + "loss": 0.13831472, + "step": 484 + }, + { + "epoch": 0.97, + "grad_norm": 2.261568784713745, + "learning_rate": 2e-05, + "loss": 0.13929909, + "step": 485 + }, + { + "epoch": 0.972, + "grad_norm": 2.8820412158966064, + "learning_rate": 2e-05, + "loss": 0.14241502, + "step": 486 + }, + { + "epoch": 0.974, + "grad_norm": 3.352541208267212, + "learning_rate": 2e-05, + "loss": 0.16133608, + "step": 487 + }, + { + "epoch": 0.976, + "grad_norm": 3.834998846054077, + "learning_rate": 2e-05, + "loss": 0.17684533, + "step": 488 + }, + { + "epoch": 0.978, + 
"grad_norm": 2.977851152420044, + "learning_rate": 2e-05, + "loss": 0.12988587, + "step": 489 + }, + { + "epoch": 0.98, + "grad_norm": 6.011771202087402, + "learning_rate": 2e-05, + "loss": 0.15765052, + "step": 490 + }, + { + "epoch": 0.982, + "grad_norm": 2.6912081241607666, + "learning_rate": 2e-05, + "loss": 0.12373734, + "step": 491 + }, + { + "epoch": 0.984, + "grad_norm": 2.2780370712280273, + "learning_rate": 2e-05, + "loss": 0.15556982, + "step": 492 + }, + { + "epoch": 0.986, + "grad_norm": 2.763603448867798, + "learning_rate": 2e-05, + "loss": 0.14312422, + "step": 493 + }, + { + "epoch": 0.988, + "grad_norm": 2.51889705657959, + "learning_rate": 2e-05, + "loss": 0.14164892, + "step": 494 + }, + { + "epoch": 0.99, + "grad_norm": 2.4747464656829834, + "learning_rate": 2e-05, + "loss": 0.18990183, + "step": 495 + }, + { + "epoch": 0.992, + "grad_norm": 2.276155471801758, + "learning_rate": 2e-05, + "loss": 0.13020995, + "step": 496 + }, + { + "epoch": 0.994, + "grad_norm": 2.6515796184539795, + "learning_rate": 2e-05, + "loss": 0.15413743, + "step": 497 + }, + { + "epoch": 0.996, + "grad_norm": 2.347593069076538, + "learning_rate": 2e-05, + "loss": 0.15986988, + "step": 498 + }, + { + "epoch": 0.998, + "grad_norm": 2.6618576049804688, + "learning_rate": 2e-05, + "loss": 0.15566903, + "step": 499 + }, + { + "epoch": 1.0, + "grad_norm": 2.4387569427490234, + "learning_rate": 2e-05, + "loss": 0.15638649, + "step": 500 + }, + { + "epoch": 1.0, + "eval_performance": { + "AngleClassification_1": 0.976, + "AngleClassification_2": 0.634, + "AngleClassification_3": 0.499001996007984, + "Equal_1": 0.13, + "Equal_2": 0.08383233532934131, + "Equal_3": 0.14770459081836326, + "LineComparison_1": 0.498, + "LineComparison_2": 0.47305389221556887, + "LineComparison_3": 0.5069860279441117, + "Parallel_1": 0.40480961923847697, + "Parallel_2": 0.9038076152304609, + "Parallel_3": 0.254, + "Perpendicular_1": 0.522, + "Perpendicular_2": 0.16, + "Perpendicular_3": 0.1092184368737475, + "PointLiesOnCircle_1": 0.8086172344689379, + "PointLiesOnCircle_2": 0.5923333333333334, + "PointLiesOnCircle_3": 0.29560000000000003, + "PointLiesOnLine_1": 0.4529058116232465, + "PointLiesOnLine_2": 0.342685370741483, + "PointLiesOnLine_3": 0.2315369261477046 + }, + "eval_runtime": 324.0838, + "eval_samples_per_second": 32.399, + "eval_steps_per_second": 0.648, + "step": 500 + }, + { + "epoch": 1.002, + "grad_norm": 4.821013450622559, + "learning_rate": 2e-05, + "loss": 0.1556485, + "step": 501 + }, + { + "epoch": 1.004, + "grad_norm": 1.8802967071533203, + "learning_rate": 2e-05, + "loss": 0.11685692, + "step": 502 + }, + { + "epoch": 1.006, + "grad_norm": 1.7924933433532715, + "learning_rate": 2e-05, + "loss": 0.11074463, + "step": 503 + }, + { + "epoch": 1.008, + "grad_norm": 3.829883337020874, + "learning_rate": 2e-05, + "loss": 0.1522513, + "step": 504 + }, + { + "epoch": 1.01, + "grad_norm": 4.246246337890625, + "learning_rate": 2e-05, + "loss": 0.15618658, + "step": 505 + }, + { + "epoch": 1.012, + "grad_norm": 4.451237678527832, + "learning_rate": 2e-05, + "loss": 0.23163846, + "step": 506 + }, + { + "epoch": 1.014, + "grad_norm": 2.5041894912719727, + "learning_rate": 2e-05, + "loss": 0.13576819, + "step": 507 + }, + { + "epoch": 1.016, + "grad_norm": 2.6320042610168457, + "learning_rate": 2e-05, + "loss": 0.16223896, + "step": 508 + }, + { + "epoch": 1.018, + "grad_norm": 1.8667449951171875, + "learning_rate": 2e-05, + "loss": 0.12035898, + "step": 509 + }, + { + "epoch": 1.02, + "grad_norm": 2.622072458267212, 
+ "learning_rate": 2e-05, + "loss": 0.13647205, + "step": 510 + }, + { + "epoch": 1.022, + "grad_norm": 4.481700420379639, + "learning_rate": 2e-05, + "loss": 0.15954015, + "step": 511 + }, + { + "epoch": 1.024, + "grad_norm": 3.5658326148986816, + "learning_rate": 2e-05, + "loss": 0.12799534, + "step": 512 + }, + { + "epoch": 1.026, + "grad_norm": 6.7309370040893555, + "learning_rate": 2e-05, + "loss": 0.13949288, + "step": 513 + }, + { + "epoch": 1.028, + "grad_norm": 3.3907320499420166, + "learning_rate": 2e-05, + "loss": 0.1515884, + "step": 514 + }, + { + "epoch": 1.03, + "grad_norm": 2.2120602130889893, + "learning_rate": 2e-05, + "loss": 0.11528069, + "step": 515 + }, + { + "epoch": 1.032, + "grad_norm": 1.858370065689087, + "learning_rate": 2e-05, + "loss": 0.10049677, + "step": 516 + }, + { + "epoch": 1.034, + "grad_norm": 3.5174007415771484, + "learning_rate": 2e-05, + "loss": 0.16639367, + "step": 517 + }, + { + "epoch": 1.036, + "grad_norm": 2.5854098796844482, + "learning_rate": 2e-05, + "loss": 0.16602588, + "step": 518 + }, + { + "epoch": 1.038, + "grad_norm": 2.3209636211395264, + "learning_rate": 2e-05, + "loss": 0.11949471, + "step": 519 + }, + { + "epoch": 1.04, + "grad_norm": 2.098727226257324, + "learning_rate": 2e-05, + "loss": 0.1452582, + "step": 520 + }, + { + "epoch": 1.042, + "grad_norm": 4.866150379180908, + "learning_rate": 2e-05, + "loss": 0.12613255, + "step": 521 + }, + { + "epoch": 1.044, + "grad_norm": 2.6759192943573, + "learning_rate": 2e-05, + "loss": 0.16882387, + "step": 522 + }, + { + "epoch": 1.046, + "grad_norm": 2.8629543781280518, + "learning_rate": 2e-05, + "loss": 0.17408426, + "step": 523 + }, + { + "epoch": 1.048, + "grad_norm": 3.283302068710327, + "learning_rate": 2e-05, + "loss": 0.13273655, + "step": 524 + }, + { + "epoch": 1.05, + "grad_norm": 2.690016984939575, + "learning_rate": 2e-05, + "loss": 0.16916898, + "step": 525 + }, + { + "epoch": 1.052, + "grad_norm": 3.075875997543335, + "learning_rate": 2e-05, + "loss": 0.15706718, + "step": 526 + }, + { + "epoch": 1.054, + "grad_norm": 4.240052223205566, + "learning_rate": 2e-05, + "loss": 0.12036274, + "step": 527 + }, + { + "epoch": 1.056, + "grad_norm": 2.4694156646728516, + "learning_rate": 2e-05, + "loss": 0.16254291, + "step": 528 + }, + { + "epoch": 1.058, + "grad_norm": 2.702230930328369, + "learning_rate": 2e-05, + "loss": 0.14166299, + "step": 529 + }, + { + "epoch": 1.06, + "grad_norm": 2.753516912460327, + "learning_rate": 2e-05, + "loss": 0.15594101, + "step": 530 + }, + { + "epoch": 1.062, + "grad_norm": 3.5839767456054688, + "learning_rate": 2e-05, + "loss": 0.12621114, + "step": 531 + }, + { + "epoch": 1.064, + "grad_norm": 2.400475263595581, + "learning_rate": 2e-05, + "loss": 0.11684637, + "step": 532 + }, + { + "epoch": 1.066, + "grad_norm": 2.5850939750671387, + "learning_rate": 2e-05, + "loss": 0.16528159, + "step": 533 + }, + { + "epoch": 1.068, + "grad_norm": 2.623412609100342, + "learning_rate": 2e-05, + "loss": 0.11953619, + "step": 534 + }, + { + "epoch": 1.07, + "grad_norm": 2.529339075088501, + "learning_rate": 2e-05, + "loss": 0.14916751, + "step": 535 + }, + { + "epoch": 1.072, + "grad_norm": 3.2517716884613037, + "learning_rate": 2e-05, + "loss": 0.15048769, + "step": 536 + }, + { + "epoch": 1.074, + "grad_norm": 1.9640837907791138, + "learning_rate": 2e-05, + "loss": 0.13277957, + "step": 537 + }, + { + "epoch": 1.076, + "grad_norm": 2.368446111679077, + "learning_rate": 2e-05, + "loss": 0.14852183, + "step": 538 + }, + { + "epoch": 1.078, + "grad_norm": 
2.9297749996185303, + "learning_rate": 2e-05, + "loss": 0.15348649, + "step": 539 + }, + { + "epoch": 1.08, + "grad_norm": 5.759192943572998, + "learning_rate": 2e-05, + "loss": 0.13191144, + "step": 540 + }, + { + "epoch": 1.082, + "grad_norm": 2.021772623062134, + "learning_rate": 2e-05, + "loss": 0.1124662, + "step": 541 + }, + { + "epoch": 1.084, + "grad_norm": 2.8374252319335938, + "learning_rate": 2e-05, + "loss": 0.1071616, + "step": 542 + }, + { + "epoch": 1.086, + "grad_norm": 2.1634042263031006, + "learning_rate": 2e-05, + "loss": 0.11521625, + "step": 543 + }, + { + "epoch": 1.088, + "grad_norm": 2.589329719543457, + "learning_rate": 2e-05, + "loss": 0.13090719, + "step": 544 + }, + { + "epoch": 1.09, + "grad_norm": 2.291887044906616, + "learning_rate": 2e-05, + "loss": 0.09445122, + "step": 545 + }, + { + "epoch": 1.092, + "grad_norm": 2.3259284496307373, + "learning_rate": 2e-05, + "loss": 0.12547138, + "step": 546 + }, + { + "epoch": 1.094, + "grad_norm": 2.191845417022705, + "learning_rate": 2e-05, + "loss": 0.12708214, + "step": 547 + }, + { + "epoch": 1.096, + "grad_norm": 3.3253958225250244, + "learning_rate": 2e-05, + "loss": 0.14231913, + "step": 548 + }, + { + "epoch": 1.098, + "grad_norm": 2.9672670364379883, + "learning_rate": 2e-05, + "loss": 0.09965955, + "step": 549 + }, + { + "epoch": 1.1, + "grad_norm": 3.215210199356079, + "learning_rate": 2e-05, + "loss": 0.13290197, + "step": 550 + }, + { + "epoch": 1.102, + "grad_norm": 3.2850470542907715, + "learning_rate": 2e-05, + "loss": 0.10610727, + "step": 551 + }, + { + "epoch": 1.104, + "grad_norm": 2.7427141666412354, + "learning_rate": 2e-05, + "loss": 0.12695783, + "step": 552 + }, + { + "epoch": 1.106, + "grad_norm": 3.2531254291534424, + "learning_rate": 2e-05, + "loss": 0.12038823, + "step": 553 + }, + { + "epoch": 1.108, + "grad_norm": 2.5022897720336914, + "learning_rate": 2e-05, + "loss": 0.13614482, + "step": 554 + }, + { + "epoch": 1.11, + "grad_norm": 4.687258720397949, + "learning_rate": 2e-05, + "loss": 0.11169507, + "step": 555 + }, + { + "epoch": 1.112, + "grad_norm": 2.6494035720825195, + "learning_rate": 2e-05, + "loss": 0.11928535, + "step": 556 + }, + { + "epoch": 1.114, + "grad_norm": 2.6407148838043213, + "learning_rate": 2e-05, + "loss": 0.12640992, + "step": 557 + }, + { + "epoch": 1.116, + "grad_norm": 4.039032936096191, + "learning_rate": 2e-05, + "loss": 0.16989604, + "step": 558 + }, + { + "epoch": 1.1179999999999999, + "grad_norm": 3.1691837310791016, + "learning_rate": 2e-05, + "loss": 0.1739091, + "step": 559 + }, + { + "epoch": 1.12, + "grad_norm": 2.1099650859832764, + "learning_rate": 2e-05, + "loss": 0.13166931, + "step": 560 + }, + { + "epoch": 1.1219999999999999, + "grad_norm": 2.5065221786499023, + "learning_rate": 2e-05, + "loss": 0.14683142, + "step": 561 + }, + { + "epoch": 1.124, + "grad_norm": 2.24849534034729, + "learning_rate": 2e-05, + "loss": 0.10228881, + "step": 562 + }, + { + "epoch": 1.126, + "grad_norm": 3.08331298828125, + "learning_rate": 2e-05, + "loss": 0.15210506, + "step": 563 + }, + { + "epoch": 1.1280000000000001, + "grad_norm": 2.5049123764038086, + "learning_rate": 2e-05, + "loss": 0.12219332, + "step": 564 + }, + { + "epoch": 1.13, + "grad_norm": 2.139686346054077, + "learning_rate": 2e-05, + "loss": 0.11474234, + "step": 565 + }, + { + "epoch": 1.1320000000000001, + "grad_norm": 1.576886534690857, + "learning_rate": 2e-05, + "loss": 0.09722944, + "step": 566 + }, + { + "epoch": 1.134, + "grad_norm": 2.270977735519409, + "learning_rate": 2e-05, + "loss": 
0.13902695, + "step": 567 + }, + { + "epoch": 1.1360000000000001, + "grad_norm": 3.4217686653137207, + "learning_rate": 2e-05, + "loss": 0.13287134, + "step": 568 + }, + { + "epoch": 1.138, + "grad_norm": 2.5737199783325195, + "learning_rate": 2e-05, + "loss": 0.14081581, + "step": 569 + }, + { + "epoch": 1.1400000000000001, + "grad_norm": 1.9033153057098389, + "learning_rate": 2e-05, + "loss": 0.13062626, + "step": 570 + }, + { + "epoch": 1.142, + "grad_norm": 1.9676927328109741, + "learning_rate": 2e-05, + "loss": 0.10712667, + "step": 571 + }, + { + "epoch": 1.144, + "grad_norm": 2.840505599975586, + "learning_rate": 2e-05, + "loss": 0.12923452, + "step": 572 + }, + { + "epoch": 1.146, + "grad_norm": 2.5398242473602295, + "learning_rate": 2e-05, + "loss": 0.14771613, + "step": 573 + }, + { + "epoch": 1.148, + "grad_norm": 2.549650192260742, + "learning_rate": 2e-05, + "loss": 0.13140582, + "step": 574 + }, + { + "epoch": 1.15, + "grad_norm": 2.1299757957458496, + "learning_rate": 2e-05, + "loss": 0.12547702, + "step": 575 + }, + { + "epoch": 1.152, + "grad_norm": 2.358029842376709, + "learning_rate": 2e-05, + "loss": 0.12179442, + "step": 576 + }, + { + "epoch": 1.154, + "grad_norm": 1.966770052909851, + "learning_rate": 2e-05, + "loss": 0.10394298, + "step": 577 + }, + { + "epoch": 1.156, + "grad_norm": 2.0004608631134033, + "learning_rate": 2e-05, + "loss": 0.14016056, + "step": 578 + }, + { + "epoch": 1.158, + "grad_norm": 3.5331180095672607, + "learning_rate": 2e-05, + "loss": 0.12845185, + "step": 579 + }, + { + "epoch": 1.16, + "grad_norm": 2.299360513687134, + "learning_rate": 2e-05, + "loss": 0.12573363, + "step": 580 + }, + { + "epoch": 1.162, + "grad_norm": 2.8737940788269043, + "learning_rate": 2e-05, + "loss": 0.12317209, + "step": 581 + }, + { + "epoch": 1.164, + "grad_norm": 4.00046968460083, + "learning_rate": 2e-05, + "loss": 0.09280093, + "step": 582 + }, + { + "epoch": 1.166, + "grad_norm": 2.854579210281372, + "learning_rate": 2e-05, + "loss": 0.12138534, + "step": 583 + }, + { + "epoch": 1.168, + "grad_norm": 2.668538808822632, + "learning_rate": 2e-05, + "loss": 0.10895318, + "step": 584 + }, + { + "epoch": 1.17, + "grad_norm": 1.9555225372314453, + "learning_rate": 2e-05, + "loss": 0.10446753, + "step": 585 + }, + { + "epoch": 1.172, + "grad_norm": 2.943380117416382, + "learning_rate": 2e-05, + "loss": 0.09996414, + "step": 586 + }, + { + "epoch": 1.174, + "grad_norm": 2.4844489097595215, + "learning_rate": 2e-05, + "loss": 0.11357398, + "step": 587 + }, + { + "epoch": 1.176, + "grad_norm": 4.776824474334717, + "learning_rate": 2e-05, + "loss": 0.09684882, + "step": 588 + }, + { + "epoch": 1.178, + "grad_norm": 2.3888590335845947, + "learning_rate": 2e-05, + "loss": 0.10829578, + "step": 589 + }, + { + "epoch": 1.18, + "grad_norm": 3.8649983406066895, + "learning_rate": 2e-05, + "loss": 0.10984306, + "step": 590 + }, + { + "epoch": 1.182, + "grad_norm": 2.9575302600860596, + "learning_rate": 2e-05, + "loss": 0.08193485, + "step": 591 + }, + { + "epoch": 1.184, + "grad_norm": 3.222970724105835, + "learning_rate": 2e-05, + "loss": 0.12279116, + "step": 592 + }, + { + "epoch": 1.186, + "grad_norm": 2.6744375228881836, + "learning_rate": 2e-05, + "loss": 0.12870033, + "step": 593 + }, + { + "epoch": 1.188, + "grad_norm": 4.028379917144775, + "learning_rate": 2e-05, + "loss": 0.13311404, + "step": 594 + }, + { + "epoch": 1.19, + "grad_norm": 6.1691460609436035, + "learning_rate": 2e-05, + "loss": 0.12187042, + "step": 595 + }, + { + "epoch": 1.192, + "grad_norm": 
2.7343945503234863, + "learning_rate": 2e-05, + "loss": 0.15135875, + "step": 596 + }, + { + "epoch": 1.194, + "grad_norm": 2.8838765621185303, + "learning_rate": 2e-05, + "loss": 0.13852769, + "step": 597 + }, + { + "epoch": 1.196, + "grad_norm": 2.592514991760254, + "learning_rate": 2e-05, + "loss": 0.12420864, + "step": 598 + }, + { + "epoch": 1.198, + "grad_norm": 3.1650428771972656, + "learning_rate": 2e-05, + "loss": 0.1350922, + "step": 599 + }, + { + "epoch": 1.2, + "grad_norm": 2.812872886657715, + "learning_rate": 2e-05, + "loss": 0.11503953, + "step": 600 + }, + { + "epoch": 1.202, + "grad_norm": 2.564276933670044, + "learning_rate": 2e-05, + "loss": 0.13502818, + "step": 601 + }, + { + "epoch": 1.204, + "grad_norm": 2.3174049854278564, + "learning_rate": 2e-05, + "loss": 0.1046789, + "step": 602 + }, + { + "epoch": 1.206, + "grad_norm": 5.605673789978027, + "learning_rate": 2e-05, + "loss": 0.14738208, + "step": 603 + }, + { + "epoch": 1.208, + "grad_norm": 3.615462303161621, + "learning_rate": 2e-05, + "loss": 0.15257767, + "step": 604 + }, + { + "epoch": 1.21, + "grad_norm": 2.023568630218506, + "learning_rate": 2e-05, + "loss": 0.1027983, + "step": 605 + }, + { + "epoch": 1.212, + "grad_norm": 2.8860561847686768, + "learning_rate": 2e-05, + "loss": 0.14651015, + "step": 606 + }, + { + "epoch": 1.214, + "grad_norm": 2.644528388977051, + "learning_rate": 2e-05, + "loss": 0.13001838, + "step": 607 + }, + { + "epoch": 1.216, + "grad_norm": 1.84329092502594, + "learning_rate": 2e-05, + "loss": 0.09636261, + "step": 608 + }, + { + "epoch": 1.218, + "grad_norm": 2.386714220046997, + "learning_rate": 2e-05, + "loss": 0.11445861, + "step": 609 + }, + { + "epoch": 1.22, + "grad_norm": 2.4820950031280518, + "learning_rate": 2e-05, + "loss": 0.14000396, + "step": 610 + }, + { + "epoch": 1.222, + "grad_norm": 2.412814140319824, + "learning_rate": 2e-05, + "loss": 0.09793176, + "step": 611 + }, + { + "epoch": 1.224, + "grad_norm": 2.0066609382629395, + "learning_rate": 2e-05, + "loss": 0.11375158, + "step": 612 + }, + { + "epoch": 1.226, + "grad_norm": 2.4800400733947754, + "learning_rate": 2e-05, + "loss": 0.11607377, + "step": 613 + }, + { + "epoch": 1.228, + "grad_norm": 2.5596139430999756, + "learning_rate": 2e-05, + "loss": 0.11540417, + "step": 614 + }, + { + "epoch": 1.23, + "grad_norm": 1.8948832750320435, + "learning_rate": 2e-05, + "loss": 0.10477018, + "step": 615 + }, + { + "epoch": 1.232, + "grad_norm": 2.5590243339538574, + "learning_rate": 2e-05, + "loss": 0.11847038, + "step": 616 + }, + { + "epoch": 1.234, + "grad_norm": 2.4262442588806152, + "learning_rate": 2e-05, + "loss": 0.1339002, + "step": 617 + }, + { + "epoch": 1.236, + "grad_norm": 3.6982271671295166, + "learning_rate": 2e-05, + "loss": 0.15697673, + "step": 618 + }, + { + "epoch": 1.238, + "grad_norm": 2.4189836978912354, + "learning_rate": 2e-05, + "loss": 0.09398519, + "step": 619 + }, + { + "epoch": 1.24, + "grad_norm": 2.4480879306793213, + "learning_rate": 2e-05, + "loss": 0.10857891, + "step": 620 + }, + { + "epoch": 1.242, + "grad_norm": 2.9036366939544678, + "learning_rate": 2e-05, + "loss": 0.1304615, + "step": 621 + }, + { + "epoch": 1.244, + "grad_norm": 2.4365622997283936, + "learning_rate": 2e-05, + "loss": 0.09261293, + "step": 622 + }, + { + "epoch": 1.246, + "grad_norm": 3.2014307975769043, + "learning_rate": 2e-05, + "loss": 0.13648328, + "step": 623 + }, + { + "epoch": 1.248, + "grad_norm": 3.503472089767456, + "learning_rate": 2e-05, + "loss": 0.1255216, + "step": 624 + }, + { + "epoch": 
1.25, + "grad_norm": 3.31048583984375, + "learning_rate": 2e-05, + "loss": 0.09449267, + "step": 625 + }, + { + "epoch": 1.252, + "grad_norm": 3.5570435523986816, + "learning_rate": 2e-05, + "loss": 0.12387832, + "step": 626 + }, + { + "epoch": 1.254, + "grad_norm": 2.5230705738067627, + "learning_rate": 2e-05, + "loss": 0.11507116, + "step": 627 + }, + { + "epoch": 1.256, + "grad_norm": 2.516810655593872, + "learning_rate": 2e-05, + "loss": 0.09768143, + "step": 628 + }, + { + "epoch": 1.258, + "grad_norm": 2.518480062484741, + "learning_rate": 2e-05, + "loss": 0.10322925, + "step": 629 + }, + { + "epoch": 1.26, + "grad_norm": 3.009742259979248, + "learning_rate": 2e-05, + "loss": 0.13511106, + "step": 630 + }, + { + "epoch": 1.262, + "grad_norm": 2.5133047103881836, + "learning_rate": 2e-05, + "loss": 0.09286143, + "step": 631 + }, + { + "epoch": 1.264, + "grad_norm": 1.873865008354187, + "learning_rate": 2e-05, + "loss": 0.0830982, + "step": 632 + }, + { + "epoch": 1.266, + "grad_norm": 2.617849826812744, + "learning_rate": 2e-05, + "loss": 0.14133078, + "step": 633 + }, + { + "epoch": 1.268, + "grad_norm": 2.539656162261963, + "learning_rate": 2e-05, + "loss": 0.1445739, + "step": 634 + }, + { + "epoch": 1.27, + "grad_norm": 2.631986379623413, + "learning_rate": 2e-05, + "loss": 0.1087111, + "step": 635 + }, + { + "epoch": 1.272, + "grad_norm": 3.4899914264678955, + "learning_rate": 2e-05, + "loss": 0.10924721, + "step": 636 + }, + { + "epoch": 1.274, + "grad_norm": 2.475717067718506, + "learning_rate": 2e-05, + "loss": 0.11710069, + "step": 637 + }, + { + "epoch": 1.276, + "grad_norm": 2.376640796661377, + "learning_rate": 2e-05, + "loss": 0.12215403, + "step": 638 + }, + { + "epoch": 1.278, + "grad_norm": 3.3351945877075195, + "learning_rate": 2e-05, + "loss": 0.1109551, + "step": 639 + }, + { + "epoch": 1.28, + "grad_norm": 4.320706844329834, + "learning_rate": 2e-05, + "loss": 0.15159556, + "step": 640 + }, + { + "epoch": 1.282, + "grad_norm": 2.7680718898773193, + "learning_rate": 2e-05, + "loss": 0.1090942, + "step": 641 + }, + { + "epoch": 1.284, + "grad_norm": 3.066657304763794, + "learning_rate": 2e-05, + "loss": 0.12701407, + "step": 642 + }, + { + "epoch": 1.286, + "grad_norm": 2.19394588470459, + "learning_rate": 2e-05, + "loss": 0.1153975, + "step": 643 + }, + { + "epoch": 1.288, + "grad_norm": 3.781139612197876, + "learning_rate": 2e-05, + "loss": 0.13794988, + "step": 644 + }, + { + "epoch": 1.29, + "grad_norm": 3.143144369125366, + "learning_rate": 2e-05, + "loss": 0.10998823, + "step": 645 + }, + { + "epoch": 1.292, + "grad_norm": 2.4902939796447754, + "learning_rate": 2e-05, + "loss": 0.10579651, + "step": 646 + }, + { + "epoch": 1.294, + "grad_norm": 3.307321310043335, + "learning_rate": 2e-05, + "loss": 0.1062723, + "step": 647 + }, + { + "epoch": 1.296, + "grad_norm": 2.1090455055236816, + "learning_rate": 2e-05, + "loss": 0.09155425, + "step": 648 + }, + { + "epoch": 1.298, + "grad_norm": 2.296339988708496, + "learning_rate": 2e-05, + "loss": 0.09818932, + "step": 649 + }, + { + "epoch": 1.3, + "grad_norm": 3.05061936378479, + "learning_rate": 2e-05, + "loss": 0.10809691, + "step": 650 + }, + { + "epoch": 1.302, + "grad_norm": 3.311122417449951, + "learning_rate": 2e-05, + "loss": 0.16015843, + "step": 651 + }, + { + "epoch": 1.304, + "grad_norm": 3.0931217670440674, + "learning_rate": 2e-05, + "loss": 0.10492094, + "step": 652 + }, + { + "epoch": 1.306, + "grad_norm": 2.87888765335083, + "learning_rate": 2e-05, + "loss": 0.10102548, + "step": 653 + }, + { + 
"epoch": 1.308, + "grad_norm": 2.918794631958008, + "learning_rate": 2e-05, + "loss": 0.12097271, + "step": 654 + }, + { + "epoch": 1.31, + "grad_norm": 2.5513689517974854, + "learning_rate": 2e-05, + "loss": 0.10996251, + "step": 655 + }, + { + "epoch": 1.312, + "grad_norm": 2.10673189163208, + "learning_rate": 2e-05, + "loss": 0.13955916, + "step": 656 + }, + { + "epoch": 1.314, + "grad_norm": 2.614741563796997, + "learning_rate": 2e-05, + "loss": 0.09799536, + "step": 657 + }, + { + "epoch": 1.316, + "grad_norm": 3.142801523208618, + "learning_rate": 2e-05, + "loss": 0.09701319, + "step": 658 + }, + { + "epoch": 1.318, + "grad_norm": 4.541778564453125, + "learning_rate": 2e-05, + "loss": 0.13997841, + "step": 659 + }, + { + "epoch": 1.32, + "grad_norm": 3.7341322898864746, + "learning_rate": 2e-05, + "loss": 0.11460865, + "step": 660 + }, + { + "epoch": 1.322, + "grad_norm": 2.7065863609313965, + "learning_rate": 2e-05, + "loss": 0.11451581, + "step": 661 + }, + { + "epoch": 1.324, + "grad_norm": 2.814460515975952, + "learning_rate": 2e-05, + "loss": 0.11479338, + "step": 662 + }, + { + "epoch": 1.326, + "grad_norm": 2.700387477874756, + "learning_rate": 2e-05, + "loss": 0.11010472, + "step": 663 + }, + { + "epoch": 1.328, + "grad_norm": 2.559100866317749, + "learning_rate": 2e-05, + "loss": 0.11664652, + "step": 664 + }, + { + "epoch": 1.33, + "grad_norm": 2.2762906551361084, + "learning_rate": 2e-05, + "loss": 0.09327792, + "step": 665 + }, + { + "epoch": 1.332, + "grad_norm": 2.627418279647827, + "learning_rate": 2e-05, + "loss": 0.08959809, + "step": 666 + }, + { + "epoch": 1.334, + "grad_norm": 2.182978868484497, + "learning_rate": 2e-05, + "loss": 0.09787555, + "step": 667 + }, + { + "epoch": 1.336, + "grad_norm": 2.3923563957214355, + "learning_rate": 2e-05, + "loss": 0.10572146, + "step": 668 + }, + { + "epoch": 1.338, + "grad_norm": 2.8589816093444824, + "learning_rate": 2e-05, + "loss": 0.13722643, + "step": 669 + }, + { + "epoch": 1.34, + "grad_norm": 2.460916757583618, + "learning_rate": 2e-05, + "loss": 0.08740199, + "step": 670 + }, + { + "epoch": 1.342, + "grad_norm": 2.4553704261779785, + "learning_rate": 2e-05, + "loss": 0.10862092, + "step": 671 + }, + { + "epoch": 1.3439999999999999, + "grad_norm": 1.6441093683242798, + "learning_rate": 2e-05, + "loss": 0.06703743, + "step": 672 + }, + { + "epoch": 1.346, + "grad_norm": 2.292106866836548, + "learning_rate": 2e-05, + "loss": 0.10088767, + "step": 673 + }, + { + "epoch": 1.3479999999999999, + "grad_norm": 2.2479658126831055, + "learning_rate": 2e-05, + "loss": 0.09034087, + "step": 674 + }, + { + "epoch": 1.35, + "grad_norm": 2.2497737407684326, + "learning_rate": 2e-05, + "loss": 0.11896604, + "step": 675 + }, + { + "epoch": 1.3519999999999999, + "grad_norm": 2.4198803901672363, + "learning_rate": 2e-05, + "loss": 0.09351024, + "step": 676 + }, + { + "epoch": 1.354, + "grad_norm": 2.0135796070098877, + "learning_rate": 2e-05, + "loss": 0.08445922, + "step": 677 + }, + { + "epoch": 1.3559999999999999, + "grad_norm": 3.221177339553833, + "learning_rate": 2e-05, + "loss": 0.11014754, + "step": 678 + }, + { + "epoch": 1.358, + "grad_norm": 2.270320415496826, + "learning_rate": 2e-05, + "loss": 0.12037845, + "step": 679 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 2.6250479221343994, + "learning_rate": 2e-05, + "loss": 0.11658848, + "step": 680 + }, + { + "epoch": 1.362, + "grad_norm": 2.4321134090423584, + "learning_rate": 2e-05, + "loss": 0.09802853, + "step": 681 + }, + { + "epoch": 1.3639999999999999, + 
"grad_norm": 2.4564032554626465, + "learning_rate": 2e-05, + "loss": 0.10415519, + "step": 682 + }, + { + "epoch": 1.366, + "grad_norm": 1.970099925994873, + "learning_rate": 2e-05, + "loss": 0.08627912, + "step": 683 + }, + { + "epoch": 1.3679999999999999, + "grad_norm": 2.5896036624908447, + "learning_rate": 2e-05, + "loss": 0.0935052, + "step": 684 + }, + { + "epoch": 1.37, + "grad_norm": 2.6092915534973145, + "learning_rate": 2e-05, + "loss": 0.09677845, + "step": 685 + }, + { + "epoch": 1.3719999999999999, + "grad_norm": 3.095132827758789, + "learning_rate": 2e-05, + "loss": 0.11584838, + "step": 686 + }, + { + "epoch": 1.374, + "grad_norm": 2.705355167388916, + "learning_rate": 2e-05, + "loss": 0.10224438, + "step": 687 + }, + { + "epoch": 1.376, + "grad_norm": 1.8498618602752686, + "learning_rate": 2e-05, + "loss": 0.07257505, + "step": 688 + }, + { + "epoch": 1.3780000000000001, + "grad_norm": 2.658275842666626, + "learning_rate": 2e-05, + "loss": 0.095241, + "step": 689 + }, + { + "epoch": 1.38, + "grad_norm": 2.1657066345214844, + "learning_rate": 2e-05, + "loss": 0.08493382, + "step": 690 + }, + { + "epoch": 1.3820000000000001, + "grad_norm": 2.649157762527466, + "learning_rate": 2e-05, + "loss": 0.10965092, + "step": 691 + }, + { + "epoch": 1.384, + "grad_norm": 2.6509430408477783, + "learning_rate": 2e-05, + "loss": 0.10417461, + "step": 692 + }, + { + "epoch": 1.3860000000000001, + "grad_norm": 2.7123641967773438, + "learning_rate": 2e-05, + "loss": 0.0952239, + "step": 693 + }, + { + "epoch": 1.388, + "grad_norm": 3.1654226779937744, + "learning_rate": 2e-05, + "loss": 0.1278252, + "step": 694 + }, + { + "epoch": 1.3900000000000001, + "grad_norm": 3.4855122566223145, + "learning_rate": 2e-05, + "loss": 0.10266776, + "step": 695 + }, + { + "epoch": 1.392, + "grad_norm": 1.523314118385315, + "learning_rate": 2e-05, + "loss": 0.05538701, + "step": 696 + }, + { + "epoch": 1.3940000000000001, + "grad_norm": 2.838423490524292, + "learning_rate": 2e-05, + "loss": 0.11393388, + "step": 697 + }, + { + "epoch": 1.396, + "grad_norm": 3.0423550605773926, + "learning_rate": 2e-05, + "loss": 0.11277989, + "step": 698 + }, + { + "epoch": 1.3980000000000001, + "grad_norm": 2.595038414001465, + "learning_rate": 2e-05, + "loss": 0.1143982, + "step": 699 + }, + { + "epoch": 1.4, + "grad_norm": 2.106015682220459, + "learning_rate": 2e-05, + "loss": 0.08432181, + "step": 700 + }, + { + "epoch": 1.4020000000000001, + "grad_norm": 2.072103977203369, + "learning_rate": 2e-05, + "loss": 0.10295324, + "step": 701 + }, + { + "epoch": 1.404, + "grad_norm": 5.35106086730957, + "learning_rate": 2e-05, + "loss": 0.12211221, + "step": 702 + }, + { + "epoch": 1.4060000000000001, + "grad_norm": 2.595242738723755, + "learning_rate": 2e-05, + "loss": 0.11621975, + "step": 703 + }, + { + "epoch": 1.408, + "grad_norm": 2.3813533782958984, + "learning_rate": 2e-05, + "loss": 0.07120501, + "step": 704 + }, + { + "epoch": 1.41, + "grad_norm": 2.7448511123657227, + "learning_rate": 2e-05, + "loss": 0.08237515, + "step": 705 + }, + { + "epoch": 1.412, + "grad_norm": 2.4283218383789062, + "learning_rate": 2e-05, + "loss": 0.07894477, + "step": 706 + }, + { + "epoch": 1.414, + "grad_norm": 2.722013235092163, + "learning_rate": 2e-05, + "loss": 0.09057313, + "step": 707 + }, + { + "epoch": 1.416, + "grad_norm": 2.1725480556488037, + "learning_rate": 2e-05, + "loss": 0.07726441, + "step": 708 + }, + { + "epoch": 1.418, + "grad_norm": 2.412940502166748, + "learning_rate": 2e-05, + "loss": 0.07639, + "step": 709 + }, + { + 
"epoch": 1.42, + "grad_norm": 1.8505913019180298, + "learning_rate": 2e-05, + "loss": 0.09286863, + "step": 710 + }, + { + "epoch": 1.422, + "grad_norm": 2.121731996536255, + "learning_rate": 2e-05, + "loss": 0.07919374, + "step": 711 + }, + { + "epoch": 1.424, + "grad_norm": 2.7666337490081787, + "learning_rate": 2e-05, + "loss": 0.10220705, + "step": 712 + }, + { + "epoch": 1.426, + "grad_norm": 2.482006072998047, + "learning_rate": 2e-05, + "loss": 0.08181632, + "step": 713 + }, + { + "epoch": 1.428, + "grad_norm": 2.693687915802002, + "learning_rate": 2e-05, + "loss": 0.06798096, + "step": 714 + }, + { + "epoch": 1.43, + "grad_norm": 2.5155346393585205, + "learning_rate": 2e-05, + "loss": 0.08410256, + "step": 715 + }, + { + "epoch": 1.432, + "grad_norm": 3.095309257507324, + "learning_rate": 2e-05, + "loss": 0.1147206, + "step": 716 + }, + { + "epoch": 1.434, + "grad_norm": 2.1130447387695312, + "learning_rate": 2e-05, + "loss": 0.08827046, + "step": 717 + }, + { + "epoch": 1.436, + "grad_norm": 3.6553380489349365, + "learning_rate": 2e-05, + "loss": 0.09300891, + "step": 718 + }, + { + "epoch": 1.438, + "grad_norm": 2.1224894523620605, + "learning_rate": 2e-05, + "loss": 0.09277943, + "step": 719 + }, + { + "epoch": 1.44, + "grad_norm": 1.9273908138275146, + "learning_rate": 2e-05, + "loss": 0.08951817, + "step": 720 + }, + { + "epoch": 1.442, + "grad_norm": 2.018853187561035, + "learning_rate": 2e-05, + "loss": 0.07283738, + "step": 721 + }, + { + "epoch": 1.444, + "grad_norm": 2.079155206680298, + "learning_rate": 2e-05, + "loss": 0.08982038, + "step": 722 + }, + { + "epoch": 1.446, + "grad_norm": 2.438000440597534, + "learning_rate": 2e-05, + "loss": 0.10790306, + "step": 723 + }, + { + "epoch": 1.448, + "grad_norm": 2.1911613941192627, + "learning_rate": 2e-05, + "loss": 0.07532462, + "step": 724 + }, + { + "epoch": 1.45, + "grad_norm": 1.8854244947433472, + "learning_rate": 2e-05, + "loss": 0.05531625, + "step": 725 + }, + { + "epoch": 1.452, + "grad_norm": 2.0435564517974854, + "learning_rate": 2e-05, + "loss": 0.08569458, + "step": 726 + }, + { + "epoch": 1.454, + "grad_norm": 4.157101631164551, + "learning_rate": 2e-05, + "loss": 0.10588066, + "step": 727 + }, + { + "epoch": 1.456, + "grad_norm": 1.762305498123169, + "learning_rate": 2e-05, + "loss": 0.08237082, + "step": 728 + }, + { + "epoch": 1.458, + "grad_norm": 1.7560533285140991, + "learning_rate": 2e-05, + "loss": 0.08737503, + "step": 729 + }, + { + "epoch": 1.46, + "grad_norm": 1.803896427154541, + "learning_rate": 2e-05, + "loss": 0.07071456, + "step": 730 + }, + { + "epoch": 1.462, + "grad_norm": 2.671704053878784, + "learning_rate": 2e-05, + "loss": 0.08183609, + "step": 731 + }, + { + "epoch": 1.464, + "grad_norm": 2.725677728652954, + "learning_rate": 2e-05, + "loss": 0.09709771, + "step": 732 + }, + { + "epoch": 1.466, + "grad_norm": 2.230529308319092, + "learning_rate": 2e-05, + "loss": 0.093163, + "step": 733 + }, + { + "epoch": 1.468, + "grad_norm": 2.1992108821868896, + "learning_rate": 2e-05, + "loss": 0.08508515, + "step": 734 + }, + { + "epoch": 1.47, + "grad_norm": 1.9853242635726929, + "learning_rate": 2e-05, + "loss": 0.07874966, + "step": 735 + }, + { + "epoch": 1.472, + "grad_norm": 3.195524215698242, + "learning_rate": 2e-05, + "loss": 0.07465832, + "step": 736 + }, + { + "epoch": 1.474, + "grad_norm": 2.578688383102417, + "learning_rate": 2e-05, + "loss": 0.0810234, + "step": 737 + }, + { + "epoch": 1.476, + "grad_norm": 2.219371795654297, + "learning_rate": 2e-05, + "loss": 0.06290153, + "step": 
738 + }, + { + "epoch": 1.478, + "grad_norm": 3.0493226051330566, + "learning_rate": 2e-05, + "loss": 0.10852329, + "step": 739 + }, + { + "epoch": 1.48, + "grad_norm": 1.894547700881958, + "learning_rate": 2e-05, + "loss": 0.07198999, + "step": 740 + }, + { + "epoch": 1.482, + "grad_norm": 1.9765349626541138, + "learning_rate": 2e-05, + "loss": 0.07682022, + "step": 741 + }, + { + "epoch": 1.484, + "grad_norm": 2.758671522140503, + "learning_rate": 2e-05, + "loss": 0.08025845, + "step": 742 + }, + { + "epoch": 1.486, + "grad_norm": 5.092621803283691, + "learning_rate": 2e-05, + "loss": 0.11716142, + "step": 743 + }, + { + "epoch": 1.488, + "grad_norm": 1.7337188720703125, + "learning_rate": 2e-05, + "loss": 0.06889806, + "step": 744 + }, + { + "epoch": 1.49, + "grad_norm": 1.6227397918701172, + "learning_rate": 2e-05, + "loss": 0.06564939, + "step": 745 + }, + { + "epoch": 1.492, + "grad_norm": 1.8530882596969604, + "learning_rate": 2e-05, + "loss": 0.08244382, + "step": 746 + }, + { + "epoch": 1.494, + "grad_norm": 2.1974449157714844, + "learning_rate": 2e-05, + "loss": 0.10057726, + "step": 747 + }, + { + "epoch": 1.496, + "grad_norm": 1.9001625776290894, + "learning_rate": 2e-05, + "loss": 0.07622299, + "step": 748 + }, + { + "epoch": 1.498, + "grad_norm": 2.7725989818573, + "learning_rate": 2e-05, + "loss": 0.09877149, + "step": 749 + }, + { + "epoch": 1.5, + "grad_norm": 2.439131498336792, + "learning_rate": 2e-05, + "loss": 0.08865526, + "step": 750 + }, + { + "epoch": 1.502, + "grad_norm": 1.9627726078033447, + "learning_rate": 2e-05, + "loss": 0.0812942, + "step": 751 + }, + { + "epoch": 1.504, + "grad_norm": 2.461146831512451, + "learning_rate": 2e-05, + "loss": 0.08872318, + "step": 752 + }, + { + "epoch": 1.506, + "grad_norm": 2.2327353954315186, + "learning_rate": 2e-05, + "loss": 0.08371209, + "step": 753 + }, + { + "epoch": 1.508, + "grad_norm": 1.7057092189788818, + "learning_rate": 2e-05, + "loss": 0.08295096, + "step": 754 + }, + { + "epoch": 1.51, + "grad_norm": 1.592014193534851, + "learning_rate": 2e-05, + "loss": 0.06427182, + "step": 755 + }, + { + "epoch": 1.512, + "grad_norm": 2.1517910957336426, + "learning_rate": 2e-05, + "loss": 0.07752314, + "step": 756 + }, + { + "epoch": 1.514, + "grad_norm": 3.956221580505371, + "learning_rate": 2e-05, + "loss": 0.08947916, + "step": 757 + }, + { + "epoch": 1.516, + "grad_norm": 2.7721030712127686, + "learning_rate": 2e-05, + "loss": 0.0844971, + "step": 758 + }, + { + "epoch": 1.518, + "grad_norm": 1.7479230165481567, + "learning_rate": 2e-05, + "loss": 0.07433778, + "step": 759 + }, + { + "epoch": 1.52, + "grad_norm": 2.2564239501953125, + "learning_rate": 2e-05, + "loss": 0.08172427, + "step": 760 + }, + { + "epoch": 1.522, + "grad_norm": 2.6283817291259766, + "learning_rate": 2e-05, + "loss": 0.07976112, + "step": 761 + }, + { + "epoch": 1.524, + "grad_norm": 2.783998489379883, + "learning_rate": 2e-05, + "loss": 0.07664889, + "step": 762 + }, + { + "epoch": 1.526, + "grad_norm": 3.343437671661377, + "learning_rate": 2e-05, + "loss": 0.07145402, + "step": 763 + }, + { + "epoch": 1.528, + "grad_norm": 2.8911867141723633, + "learning_rate": 2e-05, + "loss": 0.06805974, + "step": 764 + }, + { + "epoch": 1.53, + "grad_norm": 2.998328685760498, + "learning_rate": 2e-05, + "loss": 0.07499643, + "step": 765 + }, + { + "epoch": 1.532, + "grad_norm": 2.484273672103882, + "learning_rate": 2e-05, + "loss": 0.07433078, + "step": 766 + }, + { + "epoch": 1.534, + "grad_norm": 2.329362392425537, + "learning_rate": 2e-05, + "loss": 
0.08358801, + "step": 767 + }, + { + "epoch": 1.536, + "grad_norm": 2.202545642852783, + "learning_rate": 2e-05, + "loss": 0.06542914, + "step": 768 + }, + { + "epoch": 1.538, + "grad_norm": 2.0014524459838867, + "learning_rate": 2e-05, + "loss": 0.08993404, + "step": 769 + }, + { + "epoch": 1.54, + "grad_norm": 2.2723772525787354, + "learning_rate": 2e-05, + "loss": 0.10550876, + "step": 770 + }, + { + "epoch": 1.542, + "grad_norm": 2.0692672729492188, + "learning_rate": 2e-05, + "loss": 0.07269676, + "step": 771 + }, + { + "epoch": 1.544, + "grad_norm": 2.0737030506134033, + "learning_rate": 2e-05, + "loss": 0.06703752, + "step": 772 + }, + { + "epoch": 1.546, + "grad_norm": 2.1744225025177, + "learning_rate": 2e-05, + "loss": 0.08306466, + "step": 773 + }, + { + "epoch": 1.548, + "grad_norm": 17.611337661743164, + "learning_rate": 2e-05, + "loss": 0.064059, + "step": 774 + }, + { + "epoch": 1.55, + "grad_norm": 1.8644952774047852, + "learning_rate": 2e-05, + "loss": 0.07994568, + "step": 775 + }, + { + "epoch": 1.552, + "grad_norm": 1.5328989028930664, + "learning_rate": 2e-05, + "loss": 0.07587386, + "step": 776 + }, + { + "epoch": 1.554, + "grad_norm": 2.235229015350342, + "learning_rate": 2e-05, + "loss": 0.08034088, + "step": 777 + }, + { + "epoch": 1.556, + "grad_norm": 2.2400894165039062, + "learning_rate": 2e-05, + "loss": 0.07154867, + "step": 778 + }, + { + "epoch": 1.558, + "grad_norm": 2.6439497470855713, + "learning_rate": 2e-05, + "loss": 0.08810341, + "step": 779 + }, + { + "epoch": 1.56, + "grad_norm": 2.811596632003784, + "learning_rate": 2e-05, + "loss": 0.09437238, + "step": 780 + }, + { + "epoch": 1.562, + "grad_norm": 2.309126138687134, + "learning_rate": 2e-05, + "loss": 0.07691263, + "step": 781 + }, + { + "epoch": 1.564, + "grad_norm": 2.3875679969787598, + "learning_rate": 2e-05, + "loss": 0.07415307, + "step": 782 + }, + { + "epoch": 1.5659999999999998, + "grad_norm": 1.5229976177215576, + "learning_rate": 2e-05, + "loss": 0.0736395, + "step": 783 + }, + { + "epoch": 1.568, + "grad_norm": 3.096762180328369, + "learning_rate": 2e-05, + "loss": 0.09484315, + "step": 784 + }, + { + "epoch": 1.5699999999999998, + "grad_norm": 1.823845386505127, + "learning_rate": 2e-05, + "loss": 0.08399117, + "step": 785 + }, + { + "epoch": 1.572, + "grad_norm": 2.103595018386841, + "learning_rate": 2e-05, + "loss": 0.060688, + "step": 786 + }, + { + "epoch": 1.5739999999999998, + "grad_norm": 2.618736505508423, + "learning_rate": 2e-05, + "loss": 0.08902743, + "step": 787 + }, + { + "epoch": 1.576, + "grad_norm": 2.025015115737915, + "learning_rate": 2e-05, + "loss": 0.08033721, + "step": 788 + }, + { + "epoch": 1.5779999999999998, + "grad_norm": 9.556914329528809, + "learning_rate": 2e-05, + "loss": 0.11010626, + "step": 789 + }, + { + "epoch": 1.58, + "grad_norm": 2.4782986640930176, + "learning_rate": 2e-05, + "loss": 0.12129696, + "step": 790 + }, + { + "epoch": 1.5819999999999999, + "grad_norm": 2.86425518989563, + "learning_rate": 2e-05, + "loss": 0.08272199, + "step": 791 + }, + { + "epoch": 1.584, + "grad_norm": 1.8563334941864014, + "learning_rate": 2e-05, + "loss": 0.06831618, + "step": 792 + }, + { + "epoch": 1.5859999999999999, + "grad_norm": 1.709964394569397, + "learning_rate": 2e-05, + "loss": 0.06362506, + "step": 793 + }, + { + "epoch": 1.588, + "grad_norm": 2.137883186340332, + "learning_rate": 2e-05, + "loss": 0.07587803, + "step": 794 + }, + { + "epoch": 1.5899999999999999, + "grad_norm": 2.363016128540039, + "learning_rate": 2e-05, + "loss": 0.10132524, + 
"step": 795 + }, + { + "epoch": 1.592, + "grad_norm": 2.9902260303497314, + "learning_rate": 2e-05, + "loss": 0.12351868, + "step": 796 + }, + { + "epoch": 1.5939999999999999, + "grad_norm": 1.7475019693374634, + "learning_rate": 2e-05, + "loss": 0.0698937, + "step": 797 + }, + { + "epoch": 1.596, + "grad_norm": 1.7452584505081177, + "learning_rate": 2e-05, + "loss": 0.07310887, + "step": 798 + }, + { + "epoch": 1.5979999999999999, + "grad_norm": 2.2125442028045654, + "learning_rate": 2e-05, + "loss": 0.08515593, + "step": 799 + }, + { + "epoch": 1.6, + "grad_norm": 1.9570682048797607, + "learning_rate": 2e-05, + "loss": 0.09576114, + "step": 800 + }, + { + "epoch": 1.6019999999999999, + "grad_norm": 2.4968690872192383, + "learning_rate": 2e-05, + "loss": 0.08693783, + "step": 801 + }, + { + "epoch": 1.604, + "grad_norm": 2.26326847076416, + "learning_rate": 2e-05, + "loss": 0.08654219, + "step": 802 + }, + { + "epoch": 1.6059999999999999, + "grad_norm": 2.3274145126342773, + "learning_rate": 2e-05, + "loss": 0.10750651, + "step": 803 + }, + { + "epoch": 1.608, + "grad_norm": 1.52241849899292, + "learning_rate": 2e-05, + "loss": 0.07020888, + "step": 804 + }, + { + "epoch": 1.6099999999999999, + "grad_norm": 2.112908363342285, + "learning_rate": 2e-05, + "loss": 0.07239047, + "step": 805 + }, + { + "epoch": 1.612, + "grad_norm": 2.029871702194214, + "learning_rate": 2e-05, + "loss": 0.08264256, + "step": 806 + }, + { + "epoch": 1.6139999999999999, + "grad_norm": 1.7689646482467651, + "learning_rate": 2e-05, + "loss": 0.08204137, + "step": 807 + }, + { + "epoch": 1.616, + "grad_norm": 2.38230562210083, + "learning_rate": 2e-05, + "loss": 0.07087668, + "step": 808 + }, + { + "epoch": 1.6179999999999999, + "grad_norm": 1.8665469884872437, + "learning_rate": 2e-05, + "loss": 0.0778988, + "step": 809 + }, + { + "epoch": 1.62, + "grad_norm": 2.146146774291992, + "learning_rate": 2e-05, + "loss": 0.08147563, + "step": 810 + }, + { + "epoch": 1.6219999999999999, + "grad_norm": 1.901824951171875, + "learning_rate": 2e-05, + "loss": 0.07722423, + "step": 811 + }, + { + "epoch": 1.624, + "grad_norm": 1.654176950454712, + "learning_rate": 2e-05, + "loss": 0.07397038, + "step": 812 + }, + { + "epoch": 1.626, + "grad_norm": 2.2095117568969727, + "learning_rate": 2e-05, + "loss": 0.09214038, + "step": 813 + }, + { + "epoch": 1.6280000000000001, + "grad_norm": 2.471964120864868, + "learning_rate": 2e-05, + "loss": 0.08440986, + "step": 814 + }, + { + "epoch": 1.63, + "grad_norm": 2.14119815826416, + "learning_rate": 2e-05, + "loss": 0.08491719, + "step": 815 + }, + { + "epoch": 1.6320000000000001, + "grad_norm": 1.472819209098816, + "learning_rate": 2e-05, + "loss": 0.06080973, + "step": 816 + }, + { + "epoch": 1.634, + "grad_norm": 4.0974297523498535, + "learning_rate": 2e-05, + "loss": 0.11212557, + "step": 817 + }, + { + "epoch": 1.6360000000000001, + "grad_norm": 1.893890142440796, + "learning_rate": 2e-05, + "loss": 0.06430957, + "step": 818 + }, + { + "epoch": 1.638, + "grad_norm": 1.8362897634506226, + "learning_rate": 2e-05, + "loss": 0.07269567, + "step": 819 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 2.3635354042053223, + "learning_rate": 2e-05, + "loss": 0.07982106, + "step": 820 + }, + { + "epoch": 1.642, + "grad_norm": 2.827873706817627, + "learning_rate": 2e-05, + "loss": 0.06442684, + "step": 821 + }, + { + "epoch": 1.6440000000000001, + "grad_norm": 1.9763258695602417, + "learning_rate": 2e-05, + "loss": 0.0594982, + "step": 822 + }, + { + "epoch": 1.646, + "grad_norm": 
1.7666466236114502, + "learning_rate": 2e-05, + "loss": 0.06877147, + "step": 823 + }, + { + "epoch": 1.6480000000000001, + "grad_norm": 3.103104829788208, + "learning_rate": 2e-05, + "loss": 0.07696585, + "step": 824 + }, + { + "epoch": 1.65, + "grad_norm": 3.5277068614959717, + "learning_rate": 2e-05, + "loss": 0.08776857, + "step": 825 + }, + { + "epoch": 1.6520000000000001, + "grad_norm": 2.6822338104248047, + "learning_rate": 2e-05, + "loss": 0.06033723, + "step": 826 + }, + { + "epoch": 1.654, + "grad_norm": 1.966208577156067, + "learning_rate": 2e-05, + "loss": 0.0782545, + "step": 827 + }, + { + "epoch": 1.6560000000000001, + "grad_norm": 2.1555910110473633, + "learning_rate": 2e-05, + "loss": 0.08536707, + "step": 828 + }, + { + "epoch": 1.658, + "grad_norm": 2.096773624420166, + "learning_rate": 2e-05, + "loss": 0.06903753, + "step": 829 + }, + { + "epoch": 1.6600000000000001, + "grad_norm": 1.9013687372207642, + "learning_rate": 2e-05, + "loss": 0.08171882, + "step": 830 + }, + { + "epoch": 1.662, + "grad_norm": 1.719090461730957, + "learning_rate": 2e-05, + "loss": 0.06542725, + "step": 831 + }, + { + "epoch": 1.6640000000000001, + "grad_norm": 2.0266594886779785, + "learning_rate": 2e-05, + "loss": 0.08807792, + "step": 832 + }, + { + "epoch": 1.666, + "grad_norm": 2.070847511291504, + "learning_rate": 2e-05, + "loss": 0.07473135, + "step": 833 + }, + { + "epoch": 1.6680000000000001, + "grad_norm": 2.2031726837158203, + "learning_rate": 2e-05, + "loss": 0.05795466, + "step": 834 + }, + { + "epoch": 1.67, + "grad_norm": 1.9450498819351196, + "learning_rate": 2e-05, + "loss": 0.05321917, + "step": 835 + }, + { + "epoch": 1.6720000000000002, + "grad_norm": 2.383620023727417, + "learning_rate": 2e-05, + "loss": 0.09668325, + "step": 836 + }, + { + "epoch": 1.674, + "grad_norm": 1.911126732826233, + "learning_rate": 2e-05, + "loss": 0.0709563, + "step": 837 + }, + { + "epoch": 1.6760000000000002, + "grad_norm": 1.8981558084487915, + "learning_rate": 2e-05, + "loss": 0.05752856, + "step": 838 + }, + { + "epoch": 1.678, + "grad_norm": 1.5697596073150635, + "learning_rate": 2e-05, + "loss": 0.06154868, + "step": 839 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 2.2556140422821045, + "learning_rate": 2e-05, + "loss": 0.06484008, + "step": 840 + }, + { + "epoch": 1.682, + "grad_norm": 2.297083616256714, + "learning_rate": 2e-05, + "loss": 0.06691624, + "step": 841 + }, + { + "epoch": 1.6840000000000002, + "grad_norm": 2.831334114074707, + "learning_rate": 2e-05, + "loss": 0.0824251, + "step": 842 + }, + { + "epoch": 1.686, + "grad_norm": 2.1341090202331543, + "learning_rate": 2e-05, + "loss": 0.08404815, + "step": 843 + }, + { + "epoch": 1.688, + "grad_norm": 1.4432538747787476, + "learning_rate": 2e-05, + "loss": 0.05486818, + "step": 844 + }, + { + "epoch": 1.69, + "grad_norm": 2.392119884490967, + "learning_rate": 2e-05, + "loss": 0.08042577, + "step": 845 + }, + { + "epoch": 1.692, + "grad_norm": 2.140087604522705, + "learning_rate": 2e-05, + "loss": 0.07316266, + "step": 846 + }, + { + "epoch": 1.694, + "grad_norm": 2.8727564811706543, + "learning_rate": 2e-05, + "loss": 0.09117547, + "step": 847 + }, + { + "epoch": 1.696, + "grad_norm": 2.5479319095611572, + "learning_rate": 2e-05, + "loss": 0.0947205, + "step": 848 + }, + { + "epoch": 1.698, + "grad_norm": 2.120222806930542, + "learning_rate": 2e-05, + "loss": 0.06924388, + "step": 849 + }, + { + "epoch": 1.7, + "grad_norm": 2.4992620944976807, + "learning_rate": 2e-05, + "loss": 0.09288818, + "step": 850 + }, + { + 
"epoch": 1.702, + "grad_norm": 1.9825425148010254, + "learning_rate": 2e-05, + "loss": 0.07824534, + "step": 851 + }, + { + "epoch": 1.704, + "grad_norm": 1.519212245941162, + "learning_rate": 2e-05, + "loss": 0.05538861, + "step": 852 + }, + { + "epoch": 1.706, + "grad_norm": 2.399399518966675, + "learning_rate": 2e-05, + "loss": 0.0779898, + "step": 853 + }, + { + "epoch": 1.708, + "grad_norm": 2.637857437133789, + "learning_rate": 2e-05, + "loss": 0.10369846, + "step": 854 + }, + { + "epoch": 1.71, + "grad_norm": 2.903981924057007, + "learning_rate": 2e-05, + "loss": 0.09455149, + "step": 855 + }, + { + "epoch": 1.712, + "grad_norm": 1.9127029180526733, + "learning_rate": 2e-05, + "loss": 0.06748534, + "step": 856 + }, + { + "epoch": 1.714, + "grad_norm": 1.633865237236023, + "learning_rate": 2e-05, + "loss": 0.07176484, + "step": 857 + }, + { + "epoch": 1.716, + "grad_norm": 2.8449621200561523, + "learning_rate": 2e-05, + "loss": 0.07032914, + "step": 858 + }, + { + "epoch": 1.718, + "grad_norm": 3.111711263656616, + "learning_rate": 2e-05, + "loss": 0.07360119, + "step": 859 + }, + { + "epoch": 1.72, + "grad_norm": 2.459895610809326, + "learning_rate": 2e-05, + "loss": 0.10556261, + "step": 860 + }, + { + "epoch": 1.722, + "grad_norm": 1.9931344985961914, + "learning_rate": 2e-05, + "loss": 0.06967217, + "step": 861 + }, + { + "epoch": 1.724, + "grad_norm": 1.429177165031433, + "learning_rate": 2e-05, + "loss": 0.04311872, + "step": 862 + }, + { + "epoch": 1.726, + "grad_norm": 1.8553142547607422, + "learning_rate": 2e-05, + "loss": 0.07448836, + "step": 863 + }, + { + "epoch": 1.728, + "grad_norm": 1.4926773309707642, + "learning_rate": 2e-05, + "loss": 0.0673956, + "step": 864 + }, + { + "epoch": 1.73, + "grad_norm": 2.5575432777404785, + "learning_rate": 2e-05, + "loss": 0.07588162, + "step": 865 + }, + { + "epoch": 1.732, + "grad_norm": 2.1007280349731445, + "learning_rate": 2e-05, + "loss": 0.05418327, + "step": 866 + }, + { + "epoch": 1.734, + "grad_norm": 3.0805535316467285, + "learning_rate": 2e-05, + "loss": 0.08815256, + "step": 867 + }, + { + "epoch": 1.736, + "grad_norm": 2.14973783493042, + "learning_rate": 2e-05, + "loss": 0.07602916, + "step": 868 + }, + { + "epoch": 1.738, + "grad_norm": 2.6056084632873535, + "learning_rate": 2e-05, + "loss": 0.08369519, + "step": 869 + }, + { + "epoch": 1.74, + "grad_norm": 1.980968952178955, + "learning_rate": 2e-05, + "loss": 0.06892893, + "step": 870 + }, + { + "epoch": 1.742, + "grad_norm": 2.932478427886963, + "learning_rate": 2e-05, + "loss": 0.08124112, + "step": 871 + }, + { + "epoch": 1.744, + "grad_norm": 1.6852787733078003, + "learning_rate": 2e-05, + "loss": 0.06830005, + "step": 872 + }, + { + "epoch": 1.746, + "grad_norm": 1.881437063217163, + "learning_rate": 2e-05, + "loss": 0.04541746, + "step": 873 + }, + { + "epoch": 1.748, + "grad_norm": 2.5663695335388184, + "learning_rate": 2e-05, + "loss": 0.07354845, + "step": 874 + }, + { + "epoch": 1.75, + "grad_norm": 2.884338617324829, + "learning_rate": 2e-05, + "loss": 0.07085122, + "step": 875 + }, + { + "epoch": 1.752, + "grad_norm": 2.173144578933716, + "learning_rate": 2e-05, + "loss": 0.06505815, + "step": 876 + }, + { + "epoch": 1.754, + "grad_norm": 2.744832754135132, + "learning_rate": 2e-05, + "loss": 0.10300224, + "step": 877 + }, + { + "epoch": 1.756, + "grad_norm": 2.6979944705963135, + "learning_rate": 2e-05, + "loss": 0.08862556, + "step": 878 + }, + { + "epoch": 1.758, + "grad_norm": 2.6769134998321533, + "learning_rate": 2e-05, + "loss": 0.07417379, + 
"step": 879 + }, + { + "epoch": 1.76, + "grad_norm": 2.0921900272369385, + "learning_rate": 2e-05, + "loss": 0.08049569, + "step": 880 + }, + { + "epoch": 1.762, + "grad_norm": 3.668351411819458, + "learning_rate": 2e-05, + "loss": 0.07346732, + "step": 881 + }, + { + "epoch": 1.764, + "grad_norm": 1.955716609954834, + "learning_rate": 2e-05, + "loss": 0.0588823, + "step": 882 + }, + { + "epoch": 1.766, + "grad_norm": 2.055715560913086, + "learning_rate": 2e-05, + "loss": 0.06786981, + "step": 883 + }, + { + "epoch": 1.768, + "grad_norm": 1.8874297142028809, + "learning_rate": 2e-05, + "loss": 0.06704556, + "step": 884 + }, + { + "epoch": 1.77, + "grad_norm": 2.452850103378296, + "learning_rate": 2e-05, + "loss": 0.0776285, + "step": 885 + }, + { + "epoch": 1.772, + "grad_norm": 1.6478520631790161, + "learning_rate": 2e-05, + "loss": 0.0682406, + "step": 886 + }, + { + "epoch": 1.774, + "grad_norm": 1.441072940826416, + "learning_rate": 2e-05, + "loss": 0.05489372, + "step": 887 + }, + { + "epoch": 1.776, + "grad_norm": 2.24617862701416, + "learning_rate": 2e-05, + "loss": 0.05962018, + "step": 888 + }, + { + "epoch": 1.778, + "grad_norm": 2.570594072341919, + "learning_rate": 2e-05, + "loss": 0.07772797, + "step": 889 + }, + { + "epoch": 1.78, + "grad_norm": 2.316793441772461, + "learning_rate": 2e-05, + "loss": 0.07414915, + "step": 890 + }, + { + "epoch": 1.782, + "grad_norm": 2.2875797748565674, + "learning_rate": 2e-05, + "loss": 0.06844224, + "step": 891 + }, + { + "epoch": 1.784, + "grad_norm": 3.2194788455963135, + "learning_rate": 2e-05, + "loss": 0.08942357, + "step": 892 + }, + { + "epoch": 1.786, + "grad_norm": 3.111416816711426, + "learning_rate": 2e-05, + "loss": 0.08543706, + "step": 893 + }, + { + "epoch": 1.788, + "grad_norm": 2.2496681213378906, + "learning_rate": 2e-05, + "loss": 0.06796233, + "step": 894 + }, + { + "epoch": 1.79, + "grad_norm": 2.393826723098755, + "learning_rate": 2e-05, + "loss": 0.06307848, + "step": 895 + }, + { + "epoch": 1.792, + "grad_norm": 1.8092612028121948, + "learning_rate": 2e-05, + "loss": 0.08083902, + "step": 896 + }, + { + "epoch": 1.794, + "grad_norm": 2.327549934387207, + "learning_rate": 2e-05, + "loss": 0.07006175, + "step": 897 + }, + { + "epoch": 1.796, + "grad_norm": 2.3291375637054443, + "learning_rate": 2e-05, + "loss": 0.07644182, + "step": 898 + }, + { + "epoch": 1.798, + "grad_norm": 1.8122745752334595, + "learning_rate": 2e-05, + "loss": 0.05956836, + "step": 899 + }, + { + "epoch": 1.8, + "grad_norm": 1.7686699628829956, + "learning_rate": 2e-05, + "loss": 0.0774643, + "step": 900 + }, + { + "epoch": 1.802, + "grad_norm": 3.383394956588745, + "learning_rate": 2e-05, + "loss": 0.09732038, + "step": 901 + }, + { + "epoch": 1.804, + "grad_norm": 1.4468183517456055, + "learning_rate": 2e-05, + "loss": 0.05280242, + "step": 902 + }, + { + "epoch": 1.806, + "grad_norm": 3.099715232849121, + "learning_rate": 2e-05, + "loss": 0.08549714, + "step": 903 + }, + { + "epoch": 1.808, + "grad_norm": 2.2685530185699463, + "learning_rate": 2e-05, + "loss": 0.08650636, + "step": 904 + }, + { + "epoch": 1.81, + "grad_norm": 2.1206886768341064, + "learning_rate": 2e-05, + "loss": 0.06690793, + "step": 905 + }, + { + "epoch": 1.812, + "grad_norm": 2.361654281616211, + "learning_rate": 2e-05, + "loss": 0.07197607, + "step": 906 + }, + { + "epoch": 1.814, + "grad_norm": 2.3998827934265137, + "learning_rate": 2e-05, + "loss": 0.06678525, + "step": 907 + }, + { + "epoch": 1.8159999999999998, + "grad_norm": 1.8265810012817383, + "learning_rate": 
2e-05, + "loss": 0.06269243, + "step": 908 + }, + { + "epoch": 1.818, + "grad_norm": 2.0034234523773193, + "learning_rate": 2e-05, + "loss": 0.07594454, + "step": 909 + }, + { + "epoch": 1.8199999999999998, + "grad_norm": 1.8148810863494873, + "learning_rate": 2e-05, + "loss": 0.06071473, + "step": 910 + }, + { + "epoch": 1.822, + "grad_norm": 1.751679539680481, + "learning_rate": 2e-05, + "loss": 0.06768817, + "step": 911 + }, + { + "epoch": 1.8239999999999998, + "grad_norm": 2.00803279876709, + "learning_rate": 2e-05, + "loss": 0.07907202, + "step": 912 + }, + { + "epoch": 1.826, + "grad_norm": 2.013383388519287, + "learning_rate": 2e-05, + "loss": 0.06719398, + "step": 913 + }, + { + "epoch": 1.8279999999999998, + "grad_norm": 2.5518107414245605, + "learning_rate": 2e-05, + "loss": 0.07807682, + "step": 914 + }, + { + "epoch": 1.83, + "grad_norm": 2.0092356204986572, + "learning_rate": 2e-05, + "loss": 0.05918414, + "step": 915 + }, + { + "epoch": 1.8319999999999999, + "grad_norm": 1.9702318906784058, + "learning_rate": 2e-05, + "loss": 0.05898762, + "step": 916 + }, + { + "epoch": 1.834, + "grad_norm": 2.974558115005493, + "learning_rate": 2e-05, + "loss": 0.09832944, + "step": 917 + }, + { + "epoch": 1.8359999999999999, + "grad_norm": 3.1490416526794434, + "learning_rate": 2e-05, + "loss": 0.08504972, + "step": 918 + }, + { + "epoch": 1.838, + "grad_norm": 3.151014804840088, + "learning_rate": 2e-05, + "loss": 0.05736318, + "step": 919 + }, + { + "epoch": 1.8399999999999999, + "grad_norm": 2.9925057888031006, + "learning_rate": 2e-05, + "loss": 0.07320988, + "step": 920 + }, + { + "epoch": 1.842, + "grad_norm": 2.636167287826538, + "learning_rate": 2e-05, + "loss": 0.09282872, + "step": 921 + }, + { + "epoch": 1.8439999999999999, + "grad_norm": 1.984940528869629, + "learning_rate": 2e-05, + "loss": 0.05574302, + "step": 922 + }, + { + "epoch": 1.846, + "grad_norm": 2.508833169937134, + "learning_rate": 2e-05, + "loss": 0.0758808, + "step": 923 + }, + { + "epoch": 1.8479999999999999, + "grad_norm": 2.5532920360565186, + "learning_rate": 2e-05, + "loss": 0.0828255, + "step": 924 + }, + { + "epoch": 1.85, + "grad_norm": 2.2903261184692383, + "learning_rate": 2e-05, + "loss": 0.07507607, + "step": 925 + }, + { + "epoch": 1.8519999999999999, + "grad_norm": 2.095050096511841, + "learning_rate": 2e-05, + "loss": 0.05968579, + "step": 926 + }, + { + "epoch": 1.854, + "grad_norm": 1.5813626050949097, + "learning_rate": 2e-05, + "loss": 0.06536861, + "step": 927 + }, + { + "epoch": 1.8559999999999999, + "grad_norm": 1.9922809600830078, + "learning_rate": 2e-05, + "loss": 0.06198128, + "step": 928 + }, + { + "epoch": 1.858, + "grad_norm": 2.0554776191711426, + "learning_rate": 2e-05, + "loss": 0.07751165, + "step": 929 + }, + { + "epoch": 1.8599999999999999, + "grad_norm": 1.9760997295379639, + "learning_rate": 2e-05, + "loss": 0.0767539, + "step": 930 + }, + { + "epoch": 1.862, + "grad_norm": 1.6336450576782227, + "learning_rate": 2e-05, + "loss": 0.06495491, + "step": 931 + }, + { + "epoch": 1.8639999999999999, + "grad_norm": 1.796708583831787, + "learning_rate": 2e-05, + "loss": 0.0650342, + "step": 932 + }, + { + "epoch": 1.866, + "grad_norm": 1.6166343688964844, + "learning_rate": 2e-05, + "loss": 0.06242783, + "step": 933 + }, + { + "epoch": 1.8679999999999999, + "grad_norm": 2.2365167140960693, + "learning_rate": 2e-05, + "loss": 0.08445454, + "step": 934 + }, + { + "epoch": 1.87, + "grad_norm": 1.795891284942627, + "learning_rate": 2e-05, + "loss": 0.0526274, + "step": 935 + }, + { + 
"epoch": 1.8719999999999999, + "grad_norm": 2.8587825298309326, + "learning_rate": 2e-05, + "loss": 0.06400409, + "step": 936 + }, + { + "epoch": 1.874, + "grad_norm": 2.213303565979004, + "learning_rate": 2e-05, + "loss": 0.06138328, + "step": 937 + }, + { + "epoch": 1.876, + "grad_norm": 2.25402569770813, + "learning_rate": 2e-05, + "loss": 0.07633629, + "step": 938 + }, + { + "epoch": 1.8780000000000001, + "grad_norm": 2.096250057220459, + "learning_rate": 2e-05, + "loss": 0.07336065, + "step": 939 + }, + { + "epoch": 1.88, + "grad_norm": 1.6095550060272217, + "learning_rate": 2e-05, + "loss": 0.057545, + "step": 940 + }, + { + "epoch": 1.8820000000000001, + "grad_norm": 2.503938913345337, + "learning_rate": 2e-05, + "loss": 0.07715003, + "step": 941 + }, + { + "epoch": 1.884, + "grad_norm": 1.7810205221176147, + "learning_rate": 2e-05, + "loss": 0.06386582, + "step": 942 + }, + { + "epoch": 1.8860000000000001, + "grad_norm": 3.030251979827881, + "learning_rate": 2e-05, + "loss": 0.09021285, + "step": 943 + }, + { + "epoch": 1.888, + "grad_norm": 2.048762798309326, + "learning_rate": 2e-05, + "loss": 0.07349489, + "step": 944 + }, + { + "epoch": 1.8900000000000001, + "grad_norm": 2.1592016220092773, + "learning_rate": 2e-05, + "loss": 0.06426996, + "step": 945 + }, + { + "epoch": 1.892, + "grad_norm": 1.1694790124893188, + "learning_rate": 2e-05, + "loss": 0.05598904, + "step": 946 + }, + { + "epoch": 1.8940000000000001, + "grad_norm": 2.5215678215026855, + "learning_rate": 2e-05, + "loss": 0.05859364, + "step": 947 + }, + { + "epoch": 1.896, + "grad_norm": 1.6138578653335571, + "learning_rate": 2e-05, + "loss": 0.05201582, + "step": 948 + }, + { + "epoch": 1.8980000000000001, + "grad_norm": 2.5577175617218018, + "learning_rate": 2e-05, + "loss": 0.07010947, + "step": 949 + }, + { + "epoch": 1.9, + "grad_norm": 3.392971992492676, + "learning_rate": 2e-05, + "loss": 0.07514925, + "step": 950 + }, + { + "epoch": 1.9020000000000001, + "grad_norm": 1.6294666528701782, + "learning_rate": 2e-05, + "loss": 0.06291882, + "step": 951 + }, + { + "epoch": 1.904, + "grad_norm": 2.229604482650757, + "learning_rate": 2e-05, + "loss": 0.06559543, + "step": 952 + }, + { + "epoch": 1.9060000000000001, + "grad_norm": 1.924761414527893, + "learning_rate": 2e-05, + "loss": 0.0862481, + "step": 953 + }, + { + "epoch": 1.908, + "grad_norm": 1.9866812229156494, + "learning_rate": 2e-05, + "loss": 0.06686676, + "step": 954 + }, + { + "epoch": 1.9100000000000001, + "grad_norm": 1.7814255952835083, + "learning_rate": 2e-05, + "loss": 0.05599992, + "step": 955 + }, + { + "epoch": 1.912, + "grad_norm": 2.1436808109283447, + "learning_rate": 2e-05, + "loss": 0.07030681, + "step": 956 + }, + { + "epoch": 1.9140000000000001, + "grad_norm": 1.3862833976745605, + "learning_rate": 2e-05, + "loss": 0.04867277, + "step": 957 + }, + { + "epoch": 1.916, + "grad_norm": 1.781009316444397, + "learning_rate": 2e-05, + "loss": 0.05244367, + "step": 958 + }, + { + "epoch": 1.9180000000000001, + "grad_norm": 1.6906291246414185, + "learning_rate": 2e-05, + "loss": 0.06690039, + "step": 959 + }, + { + "epoch": 1.92, + "grad_norm": 3.1098814010620117, + "learning_rate": 2e-05, + "loss": 0.11306722, + "step": 960 + }, + { + "epoch": 1.9220000000000002, + "grad_norm": 1.7126940488815308, + "learning_rate": 2e-05, + "loss": 0.05899595, + "step": 961 + }, + { + "epoch": 1.924, + "grad_norm": 2.6065030097961426, + "learning_rate": 2e-05, + "loss": 0.06164584, + "step": 962 + }, + { + "epoch": 1.9260000000000002, + "grad_norm": 
1.9014359712600708, + "learning_rate": 2e-05, + "loss": 0.07033944, + "step": 963 + }, + { + "epoch": 1.928, + "grad_norm": 1.3948487043380737, + "learning_rate": 2e-05, + "loss": 0.05189591, + "step": 964 + }, + { + "epoch": 1.9300000000000002, + "grad_norm": 2.2040138244628906, + "learning_rate": 2e-05, + "loss": 0.05501474, + "step": 965 + }, + { + "epoch": 1.932, + "grad_norm": 1.710676908493042, + "learning_rate": 2e-05, + "loss": 0.07464767, + "step": 966 + }, + { + "epoch": 1.9340000000000002, + "grad_norm": 2.508552312850952, + "learning_rate": 2e-05, + "loss": 0.07813706, + "step": 967 + }, + { + "epoch": 1.936, + "grad_norm": 1.4470033645629883, + "learning_rate": 2e-05, + "loss": 0.0598635, + "step": 968 + }, + { + "epoch": 1.938, + "grad_norm": 2.1353957653045654, + "learning_rate": 2e-05, + "loss": 0.05462658, + "step": 969 + }, + { + "epoch": 1.94, + "grad_norm": 2.136827230453491, + "learning_rate": 2e-05, + "loss": 0.0630064, + "step": 970 + }, + { + "epoch": 1.942, + "grad_norm": 2.935197591781616, + "learning_rate": 2e-05, + "loss": 0.09128846, + "step": 971 + }, + { + "epoch": 1.944, + "grad_norm": 2.3456857204437256, + "learning_rate": 2e-05, + "loss": 0.08022948, + "step": 972 + }, + { + "epoch": 1.946, + "grad_norm": 1.7174320220947266, + "learning_rate": 2e-05, + "loss": 0.06144142, + "step": 973 + }, + { + "epoch": 1.948, + "grad_norm": 1.8288911581039429, + "learning_rate": 2e-05, + "loss": 0.04687043, + "step": 974 + }, + { + "epoch": 1.95, + "grad_norm": 1.6098743677139282, + "learning_rate": 2e-05, + "loss": 0.06447228, + "step": 975 + }, + { + "epoch": 1.952, + "grad_norm": 2.4248886108398438, + "learning_rate": 2e-05, + "loss": 0.11559688, + "step": 976 + }, + { + "epoch": 1.954, + "grad_norm": 2.1920695304870605, + "learning_rate": 2e-05, + "loss": 0.0743776, + "step": 977 + }, + { + "epoch": 1.956, + "grad_norm": 1.9038764238357544, + "learning_rate": 2e-05, + "loss": 0.07296964, + "step": 978 + }, + { + "epoch": 1.958, + "grad_norm": 1.6800254583358765, + "learning_rate": 2e-05, + "loss": 0.06484336, + "step": 979 + }, + { + "epoch": 1.96, + "grad_norm": 3.069260597229004, + "learning_rate": 2e-05, + "loss": 0.07390512, + "step": 980 + }, + { + "epoch": 1.962, + "grad_norm": 2.405409336090088, + "learning_rate": 2e-05, + "loss": 0.09900182, + "step": 981 + }, + { + "epoch": 1.964, + "grad_norm": 1.6991225481033325, + "learning_rate": 2e-05, + "loss": 0.05558601, + "step": 982 + }, + { + "epoch": 1.966, + "grad_norm": 1.5244240760803223, + "learning_rate": 2e-05, + "loss": 0.06104971, + "step": 983 + }, + { + "epoch": 1.968, + "grad_norm": 1.8483918905258179, + "learning_rate": 2e-05, + "loss": 0.0812193, + "step": 984 + }, + { + "epoch": 1.97, + "grad_norm": 2.1218955516815186, + "learning_rate": 2e-05, + "loss": 0.08611986, + "step": 985 + }, + { + "epoch": 1.972, + "grad_norm": 1.5318834781646729, + "learning_rate": 2e-05, + "loss": 0.06183597, + "step": 986 + }, + { + "epoch": 1.974, + "grad_norm": 1.713119387626648, + "learning_rate": 2e-05, + "loss": 0.07985615, + "step": 987 + }, + { + "epoch": 1.976, + "grad_norm": 1.4061270952224731, + "learning_rate": 2e-05, + "loss": 0.04497521, + "step": 988 + }, + { + "epoch": 1.978, + "grad_norm": 2.804215431213379, + "learning_rate": 2e-05, + "loss": 0.06925857, + "step": 989 + }, + { + "epoch": 1.98, + "grad_norm": 1.997461199760437, + "learning_rate": 2e-05, + "loss": 0.06266791, + "step": 990 + }, + { + "epoch": 1.982, + "grad_norm": 1.849177598953247, + "learning_rate": 2e-05, + "loss": 0.08790208, + 
"step": 991 + }, + { + "epoch": 1.984, + "grad_norm": 1.7157963514328003, + "learning_rate": 2e-05, + "loss": 0.06307608, + "step": 992 + }, + { + "epoch": 1.986, + "grad_norm": 1.68929123878479, + "learning_rate": 2e-05, + "loss": 0.06697696, + "step": 993 + }, + { + "epoch": 1.988, + "grad_norm": 1.83262300491333, + "learning_rate": 2e-05, + "loss": 0.06532292, + "step": 994 + }, + { + "epoch": 1.99, + "grad_norm": 3.3675966262817383, + "learning_rate": 2e-05, + "loss": 0.09480947, + "step": 995 + }, + { + "epoch": 1.992, + "grad_norm": 2.172999143600464, + "learning_rate": 2e-05, + "loss": 0.06149952, + "step": 996 + }, + { + "epoch": 1.994, + "grad_norm": 1.6202707290649414, + "learning_rate": 2e-05, + "loss": 0.05982627, + "step": 997 + }, + { + "epoch": 1.996, + "grad_norm": 1.9917645454406738, + "learning_rate": 2e-05, + "loss": 0.07037735, + "step": 998 + }, + { + "epoch": 1.998, + "grad_norm": 2.8166284561157227, + "learning_rate": 2e-05, + "loss": 0.06254306, + "step": 999 + }, + { + "epoch": 2.0, + "grad_norm": 1.6974472999572754, + "learning_rate": 2e-05, + "loss": 0.07782187, + "step": 1000 + }, + { + "epoch": 2.0, + "eval_performance": { + "AngleClassification_1": 0.986, + "AngleClassification_2": 0.714, + "AngleClassification_3": 0.5269461077844312, + "Equal_1": 0.846, + "Equal_2": 0.6007984031936128, + "Equal_3": 0.6926147704590818, + "LineComparison_1": 0.966, + "LineComparison_2": 0.9600798403193613, + "LineComparison_3": 0.8363273453093812, + "Parallel_1": 0.8577154308617234, + "Parallel_2": 0.9438877755511023, + "Parallel_3": 0.444, + "Perpendicular_1": 0.888, + "Perpendicular_2": 0.342, + "Perpendicular_3": 0.08517034068136273, + "PointLiesOnCircle_1": 0.995691382765531, + "PointLiesOnCircle_2": 0.9887, + "PointLiesOnCircle_3": 0.7656, + "PointLiesOnLine_1": 0.811623246492986, + "PointLiesOnLine_2": 0.48096192384769537, + "PointLiesOnLine_3": 0.24750499001996007 + }, + "eval_runtime": 320.1345, + "eval_samples_per_second": 32.799, + "eval_steps_per_second": 0.656, + "step": 1000 + }, + { + "epoch": 2.002, + "grad_norm": 1.6161152124404907, + "learning_rate": 2e-05, + "loss": 0.0708351, + "step": 1001 + }, + { + "epoch": 2.004, + "grad_norm": 1.5361851453781128, + "learning_rate": 2e-05, + "loss": 0.0666085, + "step": 1002 + }, + { + "epoch": 2.006, + "grad_norm": 2.824312925338745, + "learning_rate": 2e-05, + "loss": 0.09088291, + "step": 1003 + }, + { + "epoch": 2.008, + "grad_norm": 1.716564416885376, + "learning_rate": 2e-05, + "loss": 0.0687227, + "step": 1004 + }, + { + "epoch": 2.01, + "grad_norm": 3.0519144535064697, + "learning_rate": 2e-05, + "loss": 0.07319454, + "step": 1005 + }, + { + "epoch": 2.012, + "grad_norm": 2.324007511138916, + "learning_rate": 2e-05, + "loss": 0.09257935, + "step": 1006 + }, + { + "epoch": 2.014, + "grad_norm": 4.496967792510986, + "learning_rate": 2e-05, + "loss": 0.08681659, + "step": 1007 + }, + { + "epoch": 2.016, + "grad_norm": 1.5570460557937622, + "learning_rate": 2e-05, + "loss": 0.04536861, + "step": 1008 + }, + { + "epoch": 2.018, + "grad_norm": 1.8645915985107422, + "learning_rate": 2e-05, + "loss": 0.05488719, + "step": 1009 + }, + { + "epoch": 2.02, + "grad_norm": 3.016777515411377, + "learning_rate": 2e-05, + "loss": 0.08436164, + "step": 1010 + }, + { + "epoch": 2.022, + "grad_norm": 1.9473544359207153, + "learning_rate": 2e-05, + "loss": 0.06556801, + "step": 1011 + }, + { + "epoch": 2.024, + "grad_norm": 4.963746547698975, + "learning_rate": 2e-05, + "loss": 0.08209211, + "step": 1012 + }, + { + "epoch": 2.026, + 
"grad_norm": 2.4789822101593018, + "learning_rate": 2e-05, + "loss": 0.0703003, + "step": 1013 + }, + { + "epoch": 2.028, + "grad_norm": 2.126289129257202, + "learning_rate": 2e-05, + "loss": 0.03831969, + "step": 1014 + }, + { + "epoch": 2.03, + "grad_norm": 2.6576788425445557, + "learning_rate": 2e-05, + "loss": 0.07527678, + "step": 1015 + }, + { + "epoch": 2.032, + "grad_norm": 1.8639591932296753, + "learning_rate": 2e-05, + "loss": 0.05922488, + "step": 1016 + }, + { + "epoch": 2.034, + "grad_norm": 1.5314511060714722, + "learning_rate": 2e-05, + "loss": 0.04786742, + "step": 1017 + }, + { + "epoch": 2.036, + "grad_norm": 2.3438453674316406, + "learning_rate": 2e-05, + "loss": 0.07262143, + "step": 1018 + }, + { + "epoch": 2.038, + "grad_norm": 1.8147228956222534, + "learning_rate": 2e-05, + "loss": 0.04851551, + "step": 1019 + }, + { + "epoch": 2.04, + "grad_norm": 2.4879653453826904, + "learning_rate": 2e-05, + "loss": 0.0645643, + "step": 1020 + }, + { + "epoch": 2.042, + "grad_norm": 2.240687608718872, + "learning_rate": 2e-05, + "loss": 0.07334122, + "step": 1021 + }, + { + "epoch": 2.044, + "grad_norm": 2.220008611679077, + "learning_rate": 2e-05, + "loss": 0.06366929, + "step": 1022 + }, + { + "epoch": 2.046, + "grad_norm": 2.523574113845825, + "learning_rate": 2e-05, + "loss": 0.07826661, + "step": 1023 + }, + { + "epoch": 2.048, + "grad_norm": 2.2099592685699463, + "learning_rate": 2e-05, + "loss": 0.09013459, + "step": 1024 + }, + { + "epoch": 2.05, + "grad_norm": 2.7173805236816406, + "learning_rate": 2e-05, + "loss": 0.08330725, + "step": 1025 + }, + { + "epoch": 2.052, + "grad_norm": 1.9037878513336182, + "learning_rate": 2e-05, + "loss": 0.04698469, + "step": 1026 + }, + { + "epoch": 2.054, + "grad_norm": 1.8339701890945435, + "learning_rate": 2e-05, + "loss": 0.07493852, + "step": 1027 + }, + { + "epoch": 2.056, + "grad_norm": 1.7632319927215576, + "learning_rate": 2e-05, + "loss": 0.05750106, + "step": 1028 + }, + { + "epoch": 2.058, + "grad_norm": 1.5772873163223267, + "learning_rate": 2e-05, + "loss": 0.05176328, + "step": 1029 + }, + { + "epoch": 2.06, + "grad_norm": 1.7405070066452026, + "learning_rate": 2e-05, + "loss": 0.05858688, + "step": 1030 + }, + { + "epoch": 2.062, + "grad_norm": 2.498608112335205, + "learning_rate": 2e-05, + "loss": 0.06766784, + "step": 1031 + }, + { + "epoch": 2.064, + "grad_norm": 1.690112590789795, + "learning_rate": 2e-05, + "loss": 0.06187538, + "step": 1032 + }, + { + "epoch": 2.066, + "grad_norm": 2.2642834186553955, + "learning_rate": 2e-05, + "loss": 0.07190007, + "step": 1033 + }, + { + "epoch": 2.068, + "grad_norm": 1.882744550704956, + "learning_rate": 2e-05, + "loss": 0.05131867, + "step": 1034 + }, + { + "epoch": 2.07, + "grad_norm": 1.5934783220291138, + "learning_rate": 2e-05, + "loss": 0.03696432, + "step": 1035 + }, + { + "epoch": 2.072, + "grad_norm": 2.2791152000427246, + "learning_rate": 2e-05, + "loss": 0.08653103, + "step": 1036 + }, + { + "epoch": 2.074, + "grad_norm": 1.565434217453003, + "learning_rate": 2e-05, + "loss": 0.05390278, + "step": 1037 + }, + { + "epoch": 2.076, + "grad_norm": 2.557328939437866, + "learning_rate": 2e-05, + "loss": 0.07149288, + "step": 1038 + }, + { + "epoch": 2.078, + "grad_norm": 2.0665178298950195, + "learning_rate": 2e-05, + "loss": 0.04702565, + "step": 1039 + }, + { + "epoch": 2.08, + "grad_norm": 1.5441120862960815, + "learning_rate": 2e-05, + "loss": 0.04806751, + "step": 1040 + }, + { + "epoch": 2.082, + "grad_norm": 1.4570614099502563, + "learning_rate": 2e-05, + "loss": 
0.05136944, + "step": 1041 + }, + { + "epoch": 2.084, + "grad_norm": 5.4056243896484375, + "learning_rate": 2e-05, + "loss": 0.07949159, + "step": 1042 + }, + { + "epoch": 2.086, + "grad_norm": 2.241541624069214, + "learning_rate": 2e-05, + "loss": 0.08863392, + "step": 1043 + }, + { + "epoch": 2.088, + "grad_norm": 1.510636568069458, + "learning_rate": 2e-05, + "loss": 0.05880911, + "step": 1044 + }, + { + "epoch": 2.09, + "grad_norm": 2.41506028175354, + "learning_rate": 2e-05, + "loss": 0.05853722, + "step": 1045 + }, + { + "epoch": 2.092, + "grad_norm": 1.5590910911560059, + "learning_rate": 2e-05, + "loss": 0.04493964, + "step": 1046 + }, + { + "epoch": 2.094, + "grad_norm": 2.0853965282440186, + "learning_rate": 2e-05, + "loss": 0.08284545, + "step": 1047 + }, + { + "epoch": 2.096, + "grad_norm": 1.5066843032836914, + "learning_rate": 2e-05, + "loss": 0.05551049, + "step": 1048 + }, + { + "epoch": 2.098, + "grad_norm": 2.1711227893829346, + "learning_rate": 2e-05, + "loss": 0.07818221, + "step": 1049 + }, + { + "epoch": 2.1, + "grad_norm": 1.308896541595459, + "learning_rate": 2e-05, + "loss": 0.04287009, + "step": 1050 + }, + { + "epoch": 2.102, + "grad_norm": 2.321077346801758, + "learning_rate": 2e-05, + "loss": 0.0904954, + "step": 1051 + }, + { + "epoch": 2.104, + "grad_norm": 1.5421305894851685, + "learning_rate": 2e-05, + "loss": 0.06068816, + "step": 1052 + }, + { + "epoch": 2.106, + "grad_norm": 1.8072997331619263, + "learning_rate": 2e-05, + "loss": 0.0773791, + "step": 1053 + }, + { + "epoch": 2.108, + "grad_norm": 2.3196661472320557, + "learning_rate": 2e-05, + "loss": 0.06575853, + "step": 1054 + }, + { + "epoch": 2.11, + "grad_norm": 1.4077317714691162, + "learning_rate": 2e-05, + "loss": 0.05838005, + "step": 1055 + }, + { + "epoch": 2.112, + "grad_norm": 1.5628201961517334, + "learning_rate": 2e-05, + "loss": 0.05414519, + "step": 1056 + }, + { + "epoch": 2.114, + "grad_norm": 1.9730654954910278, + "learning_rate": 2e-05, + "loss": 0.063483, + "step": 1057 + }, + { + "epoch": 2.116, + "grad_norm": 1.9468834400177002, + "learning_rate": 2e-05, + "loss": 0.0622715, + "step": 1058 + }, + { + "epoch": 2.118, + "grad_norm": 2.9658288955688477, + "learning_rate": 2e-05, + "loss": 0.09098145, + "step": 1059 + }, + { + "epoch": 2.12, + "grad_norm": 2.6334218978881836, + "learning_rate": 2e-05, + "loss": 0.06693131, + "step": 1060 + }, + { + "epoch": 2.122, + "grad_norm": 3.37898850440979, + "learning_rate": 2e-05, + "loss": 0.09015776, + "step": 1061 + }, + { + "epoch": 2.124, + "grad_norm": 2.2235124111175537, + "learning_rate": 2e-05, + "loss": 0.06049117, + "step": 1062 + }, + { + "epoch": 2.126, + "grad_norm": 1.7589530944824219, + "learning_rate": 2e-05, + "loss": 0.05549654, + "step": 1063 + }, + { + "epoch": 2.128, + "grad_norm": 2.0518884658813477, + "learning_rate": 2e-05, + "loss": 0.08317362, + "step": 1064 + }, + { + "epoch": 2.13, + "grad_norm": 1.3999139070510864, + "learning_rate": 2e-05, + "loss": 0.04877549, + "step": 1065 + }, + { + "epoch": 2.132, + "grad_norm": 2.0118892192840576, + "learning_rate": 2e-05, + "loss": 0.07196333, + "step": 1066 + }, + { + "epoch": 2.134, + "grad_norm": 1.9917757511138916, + "learning_rate": 2e-05, + "loss": 0.05575627, + "step": 1067 + }, + { + "epoch": 2.136, + "grad_norm": 1.17472505569458, + "learning_rate": 2e-05, + "loss": 0.04250437, + "step": 1068 + }, + { + "epoch": 2.138, + "grad_norm": 1.4256974458694458, + "learning_rate": 2e-05, + "loss": 0.0531082, + "step": 1069 + }, + { + "epoch": 2.14, + "grad_norm": 
2.143972396850586, + "learning_rate": 2e-05, + "loss": 0.0886039, + "step": 1070 + }, + { + "epoch": 2.142, + "grad_norm": 1.9579774141311646, + "learning_rate": 2e-05, + "loss": 0.06291603, + "step": 1071 + }, + { + "epoch": 2.144, + "grad_norm": 1.78086519241333, + "learning_rate": 2e-05, + "loss": 0.05680231, + "step": 1072 + }, + { + "epoch": 2.146, + "grad_norm": 1.8430737257003784, + "learning_rate": 2e-05, + "loss": 0.08492243, + "step": 1073 + }, + { + "epoch": 2.148, + "grad_norm": 1.4731837511062622, + "learning_rate": 2e-05, + "loss": 0.05535863, + "step": 1074 + }, + { + "epoch": 2.15, + "grad_norm": 1.6965607404708862, + "learning_rate": 2e-05, + "loss": 0.05869256, + "step": 1075 + }, + { + "epoch": 2.152, + "grad_norm": 1.960614800453186, + "learning_rate": 2e-05, + "loss": 0.0604171, + "step": 1076 + }, + { + "epoch": 2.154, + "grad_norm": 1.3592660427093506, + "learning_rate": 2e-05, + "loss": 0.04966343, + "step": 1077 + }, + { + "epoch": 2.156, + "grad_norm": 1.4302160739898682, + "learning_rate": 2e-05, + "loss": 0.05037902, + "step": 1078 + }, + { + "epoch": 2.158, + "grad_norm": 1.5716873407363892, + "learning_rate": 2e-05, + "loss": 0.06660844, + "step": 1079 + }, + { + "epoch": 2.16, + "grad_norm": 1.2582805156707764, + "learning_rate": 2e-05, + "loss": 0.04393882, + "step": 1080 + }, + { + "epoch": 2.162, + "grad_norm": 2.170337200164795, + "learning_rate": 2e-05, + "loss": 0.0725324, + "step": 1081 + }, + { + "epoch": 2.164, + "grad_norm": 3.0252792835235596, + "learning_rate": 2e-05, + "loss": 0.08293726, + "step": 1082 + }, + { + "epoch": 2.166, + "grad_norm": 2.281851053237915, + "learning_rate": 2e-05, + "loss": 0.09101221, + "step": 1083 + }, + { + "epoch": 2.168, + "grad_norm": 2.3761801719665527, + "learning_rate": 2e-05, + "loss": 0.06198543, + "step": 1084 + }, + { + "epoch": 2.17, + "grad_norm": 2.4593560695648193, + "learning_rate": 2e-05, + "loss": 0.08639136, + "step": 1085 + }, + { + "epoch": 2.172, + "grad_norm": 2.1723086833953857, + "learning_rate": 2e-05, + "loss": 0.06393465, + "step": 1086 + }, + { + "epoch": 2.174, + "grad_norm": 2.035813570022583, + "learning_rate": 2e-05, + "loss": 0.07111633, + "step": 1087 + }, + { + "epoch": 2.176, + "grad_norm": 2.0545802116394043, + "learning_rate": 2e-05, + "loss": 0.06639965, + "step": 1088 + }, + { + "epoch": 2.178, + "grad_norm": 2.862257957458496, + "learning_rate": 2e-05, + "loss": 0.08525625, + "step": 1089 + }, + { + "epoch": 2.18, + "grad_norm": 1.8956446647644043, + "learning_rate": 2e-05, + "loss": 0.06492867, + "step": 1090 + }, + { + "epoch": 2.182, + "grad_norm": 1.6777955293655396, + "learning_rate": 2e-05, + "loss": 0.06760434, + "step": 1091 + }, + { + "epoch": 2.184, + "grad_norm": 1.5188251733779907, + "learning_rate": 2e-05, + "loss": 0.06885065, + "step": 1092 + }, + { + "epoch": 2.186, + "grad_norm": 1.5363314151763916, + "learning_rate": 2e-05, + "loss": 0.06016498, + "step": 1093 + }, + { + "epoch": 2.188, + "grad_norm": 1.5654257535934448, + "learning_rate": 2e-05, + "loss": 0.0583441, + "step": 1094 + }, + { + "epoch": 2.19, + "grad_norm": 2.322394371032715, + "learning_rate": 2e-05, + "loss": 0.07740622, + "step": 1095 + }, + { + "epoch": 2.192, + "grad_norm": 1.8494588136672974, + "learning_rate": 2e-05, + "loss": 0.06826154, + "step": 1096 + }, + { + "epoch": 2.194, + "grad_norm": 1.8143563270568848, + "learning_rate": 2e-05, + "loss": 0.07409322, + "step": 1097 + }, + { + "epoch": 2.196, + "grad_norm": 2.239018440246582, + "learning_rate": 2e-05, + "loss": 0.06441543, + 
"step": 1098 + }, + { + "epoch": 2.198, + "grad_norm": 1.3814536333084106, + "learning_rate": 2e-05, + "loss": 0.06695756, + "step": 1099 + }, + { + "epoch": 2.2, + "grad_norm": 1.9067416191101074, + "learning_rate": 2e-05, + "loss": 0.0664328, + "step": 1100 + }, + { + "epoch": 2.202, + "grad_norm": 2.319135904312134, + "learning_rate": 2e-05, + "loss": 0.07544907, + "step": 1101 + }, + { + "epoch": 2.204, + "grad_norm": 1.6520764827728271, + "learning_rate": 2e-05, + "loss": 0.04925615, + "step": 1102 + }, + { + "epoch": 2.206, + "grad_norm": 2.0650670528411865, + "learning_rate": 2e-05, + "loss": 0.08087505, + "step": 1103 + }, + { + "epoch": 2.208, + "grad_norm": 2.419358730316162, + "learning_rate": 2e-05, + "loss": 0.08194488, + "step": 1104 + }, + { + "epoch": 2.21, + "grad_norm": 2.1177220344543457, + "learning_rate": 2e-05, + "loss": 0.06622218, + "step": 1105 + }, + { + "epoch": 2.212, + "grad_norm": 1.8905670642852783, + "learning_rate": 2e-05, + "loss": 0.05285282, + "step": 1106 + }, + { + "epoch": 2.214, + "grad_norm": 1.7442760467529297, + "learning_rate": 2e-05, + "loss": 0.07012412, + "step": 1107 + }, + { + "epoch": 2.216, + "grad_norm": 2.560086965560913, + "learning_rate": 2e-05, + "loss": 0.07056236, + "step": 1108 + }, + { + "epoch": 2.218, + "grad_norm": 1.3487662076950073, + "learning_rate": 2e-05, + "loss": 0.04359243, + "step": 1109 + }, + { + "epoch": 2.22, + "grad_norm": 2.0357956886291504, + "learning_rate": 2e-05, + "loss": 0.06766598, + "step": 1110 + }, + { + "epoch": 2.222, + "grad_norm": 1.6275434494018555, + "learning_rate": 2e-05, + "loss": 0.06916457, + "step": 1111 + }, + { + "epoch": 2.224, + "grad_norm": 1.9597185850143433, + "learning_rate": 2e-05, + "loss": 0.07099155, + "step": 1112 + }, + { + "epoch": 2.226, + "grad_norm": 1.4252787828445435, + "learning_rate": 2e-05, + "loss": 0.04932921, + "step": 1113 + }, + { + "epoch": 2.228, + "grad_norm": 1.5756709575653076, + "learning_rate": 2e-05, + "loss": 0.05278656, + "step": 1114 + }, + { + "epoch": 2.23, + "grad_norm": 2.331275701522827, + "learning_rate": 2e-05, + "loss": 0.07562015, + "step": 1115 + }, + { + "epoch": 2.232, + "grad_norm": 1.697662115097046, + "learning_rate": 2e-05, + "loss": 0.06566669, + "step": 1116 + }, + { + "epoch": 2.234, + "grad_norm": 1.760780692100525, + "learning_rate": 2e-05, + "loss": 0.04810528, + "step": 1117 + }, + { + "epoch": 2.2359999999999998, + "grad_norm": 2.7786638736724854, + "learning_rate": 2e-05, + "loss": 0.05388356, + "step": 1118 + }, + { + "epoch": 2.238, + "grad_norm": 2.2772700786590576, + "learning_rate": 2e-05, + "loss": 0.07156706, + "step": 1119 + }, + { + "epoch": 2.24, + "grad_norm": 1.3025233745574951, + "learning_rate": 2e-05, + "loss": 0.04904636, + "step": 1120 + }, + { + "epoch": 2.242, + "grad_norm": 2.5209176540374756, + "learning_rate": 2e-05, + "loss": 0.0652664, + "step": 1121 + }, + { + "epoch": 2.2439999999999998, + "grad_norm": 1.7902709245681763, + "learning_rate": 2e-05, + "loss": 0.061213, + "step": 1122 + }, + { + "epoch": 2.246, + "grad_norm": 1.2348299026489258, + "learning_rate": 2e-05, + "loss": 0.03768958, + "step": 1123 + }, + { + "epoch": 2.248, + "grad_norm": 1.4644815921783447, + "learning_rate": 2e-05, + "loss": 0.05811838, + "step": 1124 + }, + { + "epoch": 2.25, + "grad_norm": 1.8537580966949463, + "learning_rate": 2e-05, + "loss": 0.06129285, + "step": 1125 + }, + { + "epoch": 2.252, + "grad_norm": 1.9889532327651978, + "learning_rate": 2e-05, + "loss": 0.05362317, + "step": 1126 + }, + { + "epoch": 2.254, + 
"grad_norm": 1.2981350421905518, + "learning_rate": 2e-05, + "loss": 0.03556494, + "step": 1127 + }, + { + "epoch": 2.2560000000000002, + "grad_norm": 2.6438069343566895, + "learning_rate": 2e-05, + "loss": 0.07400218, + "step": 1128 + }, + { + "epoch": 2.258, + "grad_norm": 2.0551059246063232, + "learning_rate": 2e-05, + "loss": 0.06481735, + "step": 1129 + }, + { + "epoch": 2.26, + "grad_norm": 1.9357151985168457, + "learning_rate": 2e-05, + "loss": 0.07889327, + "step": 1130 + }, + { + "epoch": 2.262, + "grad_norm": 1.4854387044906616, + "learning_rate": 2e-05, + "loss": 0.04724164, + "step": 1131 + }, + { + "epoch": 2.2640000000000002, + "grad_norm": 3.739743947982788, + "learning_rate": 2e-05, + "loss": 0.07142067, + "step": 1132 + }, + { + "epoch": 2.266, + "grad_norm": 2.309699773788452, + "learning_rate": 2e-05, + "loss": 0.05937993, + "step": 1133 + }, + { + "epoch": 2.268, + "grad_norm": 3.0048446655273438, + "learning_rate": 2e-05, + "loss": 0.08490255, + "step": 1134 + }, + { + "epoch": 2.27, + "grad_norm": 1.9927393198013306, + "learning_rate": 2e-05, + "loss": 0.05297955, + "step": 1135 + }, + { + "epoch": 2.2720000000000002, + "grad_norm": 2.1314926147460938, + "learning_rate": 2e-05, + "loss": 0.05550206, + "step": 1136 + }, + { + "epoch": 2.274, + "grad_norm": 1.467367172241211, + "learning_rate": 2e-05, + "loss": 0.06153769, + "step": 1137 + }, + { + "epoch": 2.276, + "grad_norm": 2.6547863483428955, + "learning_rate": 2e-05, + "loss": 0.06249413, + "step": 1138 + }, + { + "epoch": 2.278, + "grad_norm": 2.3911125659942627, + "learning_rate": 2e-05, + "loss": 0.04478404, + "step": 1139 + }, + { + "epoch": 2.2800000000000002, + "grad_norm": 2.295572280883789, + "learning_rate": 2e-05, + "loss": 0.05000008, + "step": 1140 + }, + { + "epoch": 2.282, + "grad_norm": 1.823384165763855, + "learning_rate": 2e-05, + "loss": 0.05099558, + "step": 1141 + }, + { + "epoch": 2.284, + "grad_norm": 2.4514079093933105, + "learning_rate": 2e-05, + "loss": 0.07286972, + "step": 1142 + }, + { + "epoch": 2.286, + "grad_norm": 1.5815463066101074, + "learning_rate": 2e-05, + "loss": 0.06633377, + "step": 1143 + }, + { + "epoch": 2.288, + "grad_norm": 1.3394488096237183, + "learning_rate": 2e-05, + "loss": 0.05137707, + "step": 1144 + }, + { + "epoch": 2.29, + "grad_norm": 1.6430034637451172, + "learning_rate": 2e-05, + "loss": 0.03971995, + "step": 1145 + }, + { + "epoch": 2.292, + "grad_norm": 2.691917657852173, + "learning_rate": 2e-05, + "loss": 0.06252594, + "step": 1146 + }, + { + "epoch": 2.294, + "grad_norm": 1.7687281370162964, + "learning_rate": 2e-05, + "loss": 0.05269225, + "step": 1147 + }, + { + "epoch": 2.296, + "grad_norm": 1.8105900287628174, + "learning_rate": 2e-05, + "loss": 0.07036023, + "step": 1148 + }, + { + "epoch": 2.298, + "grad_norm": 1.4458359479904175, + "learning_rate": 2e-05, + "loss": 0.05376993, + "step": 1149 + }, + { + "epoch": 2.3, + "grad_norm": 1.6199760437011719, + "learning_rate": 2e-05, + "loss": 0.06054214, + "step": 1150 + }, + { + "epoch": 2.302, + "grad_norm": 1.98055100440979, + "learning_rate": 2e-05, + "loss": 0.07045031, + "step": 1151 + }, + { + "epoch": 2.304, + "grad_norm": 1.7091319561004639, + "learning_rate": 2e-05, + "loss": 0.04949348, + "step": 1152 + }, + { + "epoch": 2.306, + "grad_norm": 2.5169119834899902, + "learning_rate": 2e-05, + "loss": 0.06985944, + "step": 1153 + }, + { + "epoch": 2.308, + "grad_norm": 1.5572229623794556, + "learning_rate": 2e-05, + "loss": 0.0572869, + "step": 1154 + }, + { + "epoch": 2.31, + "grad_norm": 
1.3380517959594727, + "learning_rate": 2e-05, + "loss": 0.04210208, + "step": 1155 + }, + { + "epoch": 2.312, + "grad_norm": 2.2991576194763184, + "learning_rate": 2e-05, + "loss": 0.06161058, + "step": 1156 + }, + { + "epoch": 2.314, + "grad_norm": 1.362855076789856, + "learning_rate": 2e-05, + "loss": 0.05341607, + "step": 1157 + }, + { + "epoch": 2.316, + "grad_norm": 1.6171643733978271, + "learning_rate": 2e-05, + "loss": 0.05781926, + "step": 1158 + }, + { + "epoch": 2.318, + "grad_norm": 2.0822646617889404, + "learning_rate": 2e-05, + "loss": 0.05547912, + "step": 1159 + }, + { + "epoch": 2.32, + "grad_norm": 2.1096858978271484, + "learning_rate": 2e-05, + "loss": 0.06335936, + "step": 1160 + }, + { + "epoch": 2.322, + "grad_norm": 1.6990431547164917, + "learning_rate": 2e-05, + "loss": 0.06178097, + "step": 1161 + }, + { + "epoch": 2.324, + "grad_norm": 1.849997878074646, + "learning_rate": 2e-05, + "loss": 0.06346186, + "step": 1162 + }, + { + "epoch": 2.326, + "grad_norm": 1.6597445011138916, + "learning_rate": 2e-05, + "loss": 0.05579802, + "step": 1163 + }, + { + "epoch": 2.328, + "grad_norm": 1.805808663368225, + "learning_rate": 2e-05, + "loss": 0.08137181, + "step": 1164 + }, + { + "epoch": 2.33, + "grad_norm": 1.5715245008468628, + "learning_rate": 2e-05, + "loss": 0.07559416, + "step": 1165 + }, + { + "epoch": 2.332, + "grad_norm": 1.6368138790130615, + "learning_rate": 2e-05, + "loss": 0.06028969, + "step": 1166 + }, + { + "epoch": 2.334, + "grad_norm": 2.369986057281494, + "learning_rate": 2e-05, + "loss": 0.05975366, + "step": 1167 + }, + { + "epoch": 2.336, + "grad_norm": 1.567192792892456, + "learning_rate": 2e-05, + "loss": 0.05366561, + "step": 1168 + }, + { + "epoch": 2.338, + "grad_norm": 1.529199242591858, + "learning_rate": 2e-05, + "loss": 0.05094039, + "step": 1169 + }, + { + "epoch": 2.34, + "grad_norm": 2.299103260040283, + "learning_rate": 2e-05, + "loss": 0.05972379, + "step": 1170 + }, + { + "epoch": 2.342, + "grad_norm": 2.0813450813293457, + "learning_rate": 2e-05, + "loss": 0.06179533, + "step": 1171 + }, + { + "epoch": 2.344, + "grad_norm": 1.8993735313415527, + "learning_rate": 2e-05, + "loss": 0.05901102, + "step": 1172 + }, + { + "epoch": 2.346, + "grad_norm": 1.505812644958496, + "learning_rate": 2e-05, + "loss": 0.05369399, + "step": 1173 + }, + { + "epoch": 2.348, + "grad_norm": 42.221553802490234, + "learning_rate": 2e-05, + "loss": 0.1112899, + "step": 1174 + }, + { + "epoch": 2.35, + "grad_norm": 1.6444224119186401, + "learning_rate": 2e-05, + "loss": 0.05893474, + "step": 1175 + }, + { + "epoch": 2.352, + "grad_norm": 1.4802619218826294, + "learning_rate": 2e-05, + "loss": 0.05816791, + "step": 1176 + }, + { + "epoch": 2.354, + "grad_norm": 1.5738928318023682, + "learning_rate": 2e-05, + "loss": 0.07226036, + "step": 1177 + }, + { + "epoch": 2.356, + "grad_norm": 1.077085018157959, + "learning_rate": 2e-05, + "loss": 0.03205248, + "step": 1178 + }, + { + "epoch": 2.358, + "grad_norm": 48.9713134765625, + "learning_rate": 2e-05, + "loss": 0.17173615, + "step": 1179 + }, + { + "epoch": 2.36, + "grad_norm": 2.1683356761932373, + "learning_rate": 2e-05, + "loss": 0.04456889, + "step": 1180 + }, + { + "epoch": 2.362, + "grad_norm": 8.09849739074707, + "learning_rate": 2e-05, + "loss": 0.29480407, + "step": 1181 + }, + { + "epoch": 2.364, + "grad_norm": 1.4625880718231201, + "learning_rate": 2e-05, + "loss": 0.04769106, + "step": 1182 + }, + { + "epoch": 2.366, + "grad_norm": 1.876531720161438, + "learning_rate": 2e-05, + "loss": 0.0816292, + 
"step": 1183 + }, + { + "epoch": 2.368, + "grad_norm": 1.9798403978347778, + "learning_rate": 2e-05, + "loss": 0.06011814, + "step": 1184 + }, + { + "epoch": 2.37, + "grad_norm": 1.5905330181121826, + "learning_rate": 2e-05, + "loss": 0.05314508, + "step": 1185 + }, + { + "epoch": 2.372, + "grad_norm": 1.7121469974517822, + "learning_rate": 2e-05, + "loss": 0.04185162, + "step": 1186 + }, + { + "epoch": 2.374, + "grad_norm": 1.7982616424560547, + "learning_rate": 2e-05, + "loss": 0.04177887, + "step": 1187 + }, + { + "epoch": 2.376, + "grad_norm": 2.124553680419922, + "learning_rate": 2e-05, + "loss": 0.06069058, + "step": 1188 + }, + { + "epoch": 2.378, + "grad_norm": 1.8068010807037354, + "learning_rate": 2e-05, + "loss": 0.05539012, + "step": 1189 + }, + { + "epoch": 2.38, + "grad_norm": 2.0612502098083496, + "learning_rate": 2e-05, + "loss": 0.07466509, + "step": 1190 + }, + { + "epoch": 2.382, + "grad_norm": 1.1964912414550781, + "learning_rate": 2e-05, + "loss": 0.04185656, + "step": 1191 + }, + { + "epoch": 2.384, + "grad_norm": 2.028778314590454, + "learning_rate": 2e-05, + "loss": 0.05173714, + "step": 1192 + }, + { + "epoch": 2.386, + "grad_norm": 1.7989981174468994, + "learning_rate": 2e-05, + "loss": 0.05134147, + "step": 1193 + }, + { + "epoch": 2.388, + "grad_norm": 1.6559616327285767, + "learning_rate": 2e-05, + "loss": 0.06251637, + "step": 1194 + }, + { + "epoch": 2.39, + "grad_norm": 1.4277832508087158, + "learning_rate": 2e-05, + "loss": 0.05279864, + "step": 1195 + }, + { + "epoch": 2.392, + "grad_norm": 1.6468921899795532, + "learning_rate": 2e-05, + "loss": 0.06037339, + "step": 1196 + }, + { + "epoch": 2.394, + "grad_norm": 3.1690855026245117, + "learning_rate": 2e-05, + "loss": 0.08198987, + "step": 1197 + }, + { + "epoch": 2.396, + "grad_norm": 1.2297303676605225, + "learning_rate": 2e-05, + "loss": 0.04745869, + "step": 1198 + }, + { + "epoch": 2.398, + "grad_norm": 2.2883074283599854, + "learning_rate": 2e-05, + "loss": 0.04119899, + "step": 1199 + }, + { + "epoch": 2.4, + "grad_norm": 2.0673367977142334, + "learning_rate": 2e-05, + "loss": 0.05730217, + "step": 1200 + }, + { + "epoch": 2.402, + "grad_norm": 1.2708475589752197, + "learning_rate": 2e-05, + "loss": 0.04456803, + "step": 1201 + }, + { + "epoch": 2.404, + "grad_norm": 1.3558706045150757, + "learning_rate": 2e-05, + "loss": 0.05208869, + "step": 1202 + }, + { + "epoch": 2.406, + "grad_norm": 1.2289046049118042, + "learning_rate": 2e-05, + "loss": 0.04607398, + "step": 1203 + }, + { + "epoch": 2.408, + "grad_norm": 3.603008985519409, + "learning_rate": 2e-05, + "loss": 0.08612712, + "step": 1204 + }, + { + "epoch": 2.41, + "grad_norm": 4.123850345611572, + "learning_rate": 2e-05, + "loss": 0.06038028, + "step": 1205 + }, + { + "epoch": 2.412, + "grad_norm": 2.2089900970458984, + "learning_rate": 2e-05, + "loss": 0.06190875, + "step": 1206 + }, + { + "epoch": 2.414, + "grad_norm": 1.7213412523269653, + "learning_rate": 2e-05, + "loss": 0.05619193, + "step": 1207 + }, + { + "epoch": 2.416, + "grad_norm": 2.1176223754882812, + "learning_rate": 2e-05, + "loss": 0.06305473, + "step": 1208 + }, + { + "epoch": 2.418, + "grad_norm": 2.206848621368408, + "learning_rate": 2e-05, + "loss": 0.06026538, + "step": 1209 + }, + { + "epoch": 2.42, + "grad_norm": 1.4014763832092285, + "learning_rate": 2e-05, + "loss": 0.04447113, + "step": 1210 + }, + { + "epoch": 2.422, + "grad_norm": 1.3986562490463257, + "learning_rate": 2e-05, + "loss": 0.04321969, + "step": 1211 + }, + { + "epoch": 2.424, + "grad_norm": 
1.840933084487915, + "learning_rate": 2e-05, + "loss": 0.05479154, + "step": 1212 + }, + { + "epoch": 2.426, + "grad_norm": 2.6934752464294434, + "learning_rate": 2e-05, + "loss": 0.05985066, + "step": 1213 + }, + { + "epoch": 2.428, + "grad_norm": 1.4198068380355835, + "learning_rate": 2e-05, + "loss": 0.04191133, + "step": 1214 + }, + { + "epoch": 2.43, + "grad_norm": 2.204730272293091, + "learning_rate": 2e-05, + "loss": 0.04986127, + "step": 1215 + }, + { + "epoch": 2.432, + "grad_norm": 1.7930086851119995, + "learning_rate": 2e-05, + "loss": 0.03822114, + "step": 1216 + }, + { + "epoch": 2.434, + "grad_norm": 1.9965940713882446, + "learning_rate": 2e-05, + "loss": 0.07922506, + "step": 1217 + }, + { + "epoch": 2.436, + "grad_norm": 1.8581851720809937, + "learning_rate": 2e-05, + "loss": 0.06270727, + "step": 1218 + }, + { + "epoch": 2.438, + "grad_norm": 2.940757989883423, + "learning_rate": 2e-05, + "loss": 0.05123382, + "step": 1219 + }, + { + "epoch": 2.44, + "grad_norm": 1.8128314018249512, + "learning_rate": 2e-05, + "loss": 0.05950858, + "step": 1220 + }, + { + "epoch": 2.442, + "grad_norm": 2.7984111309051514, + "learning_rate": 2e-05, + "loss": 0.04888112, + "step": 1221 + }, + { + "epoch": 2.444, + "grad_norm": 1.7929303646087646, + "learning_rate": 2e-05, + "loss": 0.05642758, + "step": 1222 + }, + { + "epoch": 2.446, + "grad_norm": 3.5627307891845703, + "learning_rate": 2e-05, + "loss": 0.06712979, + "step": 1223 + }, + { + "epoch": 2.448, + "grad_norm": 2.1680853366851807, + "learning_rate": 2e-05, + "loss": 0.04695696, + "step": 1224 + }, + { + "epoch": 2.45, + "grad_norm": 1.543848991394043, + "learning_rate": 2e-05, + "loss": 0.0471656, + "step": 1225 + }, + { + "epoch": 2.452, + "grad_norm": 1.377380609512329, + "learning_rate": 2e-05, + "loss": 0.04254293, + "step": 1226 + }, + { + "epoch": 2.454, + "grad_norm": 2.116429090499878, + "learning_rate": 2e-05, + "loss": 0.0888541, + "step": 1227 + }, + { + "epoch": 2.456, + "grad_norm": 1.714355707168579, + "learning_rate": 2e-05, + "loss": 0.04009545, + "step": 1228 + }, + { + "epoch": 2.458, + "grad_norm": 2.4635934829711914, + "learning_rate": 2e-05, + "loss": 0.06196552, + "step": 1229 + }, + { + "epoch": 2.46, + "grad_norm": 1.6283334493637085, + "learning_rate": 2e-05, + "loss": 0.05298021, + "step": 1230 + }, + { + "epoch": 2.462, + "grad_norm": 2.1310596466064453, + "learning_rate": 2e-05, + "loss": 0.0582485, + "step": 1231 + }, + { + "epoch": 2.464, + "grad_norm": 2.8013837337493896, + "learning_rate": 2e-05, + "loss": 0.07070208, + "step": 1232 + }, + { + "epoch": 2.466, + "grad_norm": 2.706620693206787, + "learning_rate": 2e-05, + "loss": 0.05937808, + "step": 1233 + }, + { + "epoch": 2.468, + "grad_norm": 2.1183664798736572, + "learning_rate": 2e-05, + "loss": 0.07405435, + "step": 1234 + }, + { + "epoch": 2.4699999999999998, + "grad_norm": 1.8695333003997803, + "learning_rate": 2e-05, + "loss": 0.04990563, + "step": 1235 + }, + { + "epoch": 2.472, + "grad_norm": 1.3877243995666504, + "learning_rate": 2e-05, + "loss": 0.05714301, + "step": 1236 + }, + { + "epoch": 2.474, + "grad_norm": 2.160632371902466, + "learning_rate": 2e-05, + "loss": 0.05262284, + "step": 1237 + }, + { + "epoch": 2.476, + "grad_norm": 3.242225408554077, + "learning_rate": 2e-05, + "loss": 0.07168475, + "step": 1238 + }, + { + "epoch": 2.4779999999999998, + "grad_norm": 1.5651336908340454, + "learning_rate": 2e-05, + "loss": 0.03872868, + "step": 1239 + }, + { + "epoch": 2.48, + "grad_norm": 2.6729652881622314, + "learning_rate": 2e-05, 
+ "loss": 0.06602624, + "step": 1240 + }, + { + "epoch": 2.482, + "grad_norm": 1.9314351081848145, + "learning_rate": 2e-05, + "loss": 0.06864213, + "step": 1241 + }, + { + "epoch": 2.484, + "grad_norm": 2.9383625984191895, + "learning_rate": 2e-05, + "loss": 0.07590427, + "step": 1242 + }, + { + "epoch": 2.4859999999999998, + "grad_norm": 1.5178899765014648, + "learning_rate": 2e-05, + "loss": 0.04809076, + "step": 1243 + }, + { + "epoch": 2.488, + "grad_norm": 1.3784013986587524, + "learning_rate": 2e-05, + "loss": 0.04864044, + "step": 1244 + }, + { + "epoch": 2.49, + "grad_norm": 1.29901123046875, + "learning_rate": 2e-05, + "loss": 0.03983694, + "step": 1245 + }, + { + "epoch": 2.492, + "grad_norm": 2.0041773319244385, + "learning_rate": 2e-05, + "loss": 0.06119365, + "step": 1246 + }, + { + "epoch": 2.4939999999999998, + "grad_norm": 1.760473608970642, + "learning_rate": 2e-05, + "loss": 0.05547423, + "step": 1247 + }, + { + "epoch": 2.496, + "grad_norm": 1.782887578010559, + "learning_rate": 2e-05, + "loss": 0.05595577, + "step": 1248 + }, + { + "epoch": 2.498, + "grad_norm": 1.431018590927124, + "learning_rate": 2e-05, + "loss": 0.05510469, + "step": 1249 + }, + { + "epoch": 2.5, + "grad_norm": 2.7681326866149902, + "learning_rate": 2e-05, + "loss": 0.07547972, + "step": 1250 + }, + { + "epoch": 2.502, + "grad_norm": 3.4799344539642334, + "learning_rate": 2e-05, + "loss": 0.08792815, + "step": 1251 + }, + { + "epoch": 2.504, + "grad_norm": 1.4463987350463867, + "learning_rate": 2e-05, + "loss": 0.04625604, + "step": 1252 + }, + { + "epoch": 2.5060000000000002, + "grad_norm": 7.698814392089844, + "learning_rate": 2e-05, + "loss": 0.07450949, + "step": 1253 + }, + { + "epoch": 2.508, + "grad_norm": 2.0825157165527344, + "learning_rate": 2e-05, + "loss": 0.06824566, + "step": 1254 + }, + { + "epoch": 2.51, + "grad_norm": 2.138805389404297, + "learning_rate": 2e-05, + "loss": 0.06627047, + "step": 1255 + }, + { + "epoch": 2.512, + "grad_norm": 2.9669065475463867, + "learning_rate": 2e-05, + "loss": 0.07500726, + "step": 1256 + }, + { + "epoch": 2.5140000000000002, + "grad_norm": 1.8249704837799072, + "learning_rate": 2e-05, + "loss": 0.05518293, + "step": 1257 + }, + { + "epoch": 2.516, + "grad_norm": 1.9127455949783325, + "learning_rate": 2e-05, + "loss": 0.05548128, + "step": 1258 + }, + { + "epoch": 2.518, + "grad_norm": 2.1432528495788574, + "learning_rate": 2e-05, + "loss": 0.0882006, + "step": 1259 + }, + { + "epoch": 2.52, + "grad_norm": 1.9188241958618164, + "learning_rate": 2e-05, + "loss": 0.05358675, + "step": 1260 + }, + { + "epoch": 2.5220000000000002, + "grad_norm": 1.2570300102233887, + "learning_rate": 2e-05, + "loss": 0.04172009, + "step": 1261 + }, + { + "epoch": 2.524, + "grad_norm": 1.4593505859375, + "learning_rate": 2e-05, + "loss": 0.05119102, + "step": 1262 + }, + { + "epoch": 2.526, + "grad_norm": 1.6832845211029053, + "learning_rate": 2e-05, + "loss": 0.04923711, + "step": 1263 + }, + { + "epoch": 2.528, + "grad_norm": 2.0682425498962402, + "learning_rate": 2e-05, + "loss": 0.05748929, + "step": 1264 + }, + { + "epoch": 2.5300000000000002, + "grad_norm": 2.118595838546753, + "learning_rate": 2e-05, + "loss": 0.07105608, + "step": 1265 + }, + { + "epoch": 2.532, + "grad_norm": 1.294793725013733, + "learning_rate": 2e-05, + "loss": 0.05542181, + "step": 1266 + }, + { + "epoch": 2.534, + "grad_norm": 2.6284120082855225, + "learning_rate": 2e-05, + "loss": 0.04738271, + "step": 1267 + }, + { + "epoch": 2.536, + "grad_norm": 1.572341799736023, + "learning_rate": 
2e-05, + "loss": 0.04801615, + "step": 1268 + }, + { + "epoch": 2.5380000000000003, + "grad_norm": 1.7317556142807007, + "learning_rate": 2e-05, + "loss": 0.04552308, + "step": 1269 + }, + { + "epoch": 2.54, + "grad_norm": 1.7648011445999146, + "learning_rate": 2e-05, + "loss": 0.05321316, + "step": 1270 + }, + { + "epoch": 2.542, + "grad_norm": 3.3624472618103027, + "learning_rate": 2e-05, + "loss": 0.04517394, + "step": 1271 + }, + { + "epoch": 2.544, + "grad_norm": 1.198052167892456, + "learning_rate": 2e-05, + "loss": 0.03292371, + "step": 1272 + }, + { + "epoch": 2.5460000000000003, + "grad_norm": 1.9982932806015015, + "learning_rate": 2e-05, + "loss": 0.04539865, + "step": 1273 + }, + { + "epoch": 2.548, + "grad_norm": 2.1996402740478516, + "learning_rate": 2e-05, + "loss": 0.06276131, + "step": 1274 + }, + { + "epoch": 2.55, + "grad_norm": 2.3174142837524414, + "learning_rate": 2e-05, + "loss": 0.07670379, + "step": 1275 + }, + { + "epoch": 2.552, + "grad_norm": 2.5731277465820312, + "learning_rate": 2e-05, + "loss": 0.05952568, + "step": 1276 + }, + { + "epoch": 2.5540000000000003, + "grad_norm": 2.444291591644287, + "learning_rate": 2e-05, + "loss": 0.06525612, + "step": 1277 + }, + { + "epoch": 2.556, + "grad_norm": 2.2760965824127197, + "learning_rate": 2e-05, + "loss": 0.06015107, + "step": 1278 + }, + { + "epoch": 2.558, + "grad_norm": 1.7860162258148193, + "learning_rate": 2e-05, + "loss": 0.06085454, + "step": 1279 + }, + { + "epoch": 2.56, + "grad_norm": 1.5104821920394897, + "learning_rate": 2e-05, + "loss": 0.04777806, + "step": 1280 + }, + { + "epoch": 2.5620000000000003, + "grad_norm": 2.0118963718414307, + "learning_rate": 2e-05, + "loss": 0.06037318, + "step": 1281 + }, + { + "epoch": 2.564, + "grad_norm": 1.3859965801239014, + "learning_rate": 2e-05, + "loss": 0.05690242, + "step": 1282 + }, + { + "epoch": 2.566, + "grad_norm": 1.6642067432403564, + "learning_rate": 2e-05, + "loss": 0.06878574, + "step": 1283 + }, + { + "epoch": 2.568, + "grad_norm": 1.7605303525924683, + "learning_rate": 2e-05, + "loss": 0.04636247, + "step": 1284 + }, + { + "epoch": 2.57, + "grad_norm": 1.2372325658798218, + "learning_rate": 2e-05, + "loss": 0.0318442, + "step": 1285 + }, + { + "epoch": 2.572, + "grad_norm": 1.015973687171936, + "learning_rate": 2e-05, + "loss": 0.02155654, + "step": 1286 + }, + { + "epoch": 2.574, + "grad_norm": 1.6201320886611938, + "learning_rate": 2e-05, + "loss": 0.03975923, + "step": 1287 + }, + { + "epoch": 2.576, + "grad_norm": 3.8816092014312744, + "learning_rate": 2e-05, + "loss": 0.08194003, + "step": 1288 + }, + { + "epoch": 2.578, + "grad_norm": 1.7358951568603516, + "learning_rate": 2e-05, + "loss": 0.04974889, + "step": 1289 + }, + { + "epoch": 2.58, + "grad_norm": 2.392040252685547, + "learning_rate": 2e-05, + "loss": 0.06020691, + "step": 1290 + }, + { + "epoch": 2.582, + "grad_norm": 1.1086469888687134, + "learning_rate": 2e-05, + "loss": 0.04583845, + "step": 1291 + }, + { + "epoch": 2.584, + "grad_norm": 1.5086019039154053, + "learning_rate": 2e-05, + "loss": 0.04313815, + "step": 1292 + }, + { + "epoch": 2.586, + "grad_norm": 1.7809884548187256, + "learning_rate": 2e-05, + "loss": 0.05329616, + "step": 1293 + }, + { + "epoch": 2.588, + "grad_norm": 1.52059006690979, + "learning_rate": 2e-05, + "loss": 0.04525972, + "step": 1294 + }, + { + "epoch": 2.59, + "grad_norm": 2.2042572498321533, + "learning_rate": 2e-05, + "loss": 0.0593795, + "step": 1295 + }, + { + "epoch": 2.592, + "grad_norm": 1.675024151802063, + "learning_rate": 2e-05, + "loss": 
0.06052756, + "step": 1296 + }, + { + "epoch": 2.594, + "grad_norm": 2.8984549045562744, + "learning_rate": 2e-05, + "loss": 0.0687076, + "step": 1297 + }, + { + "epoch": 2.596, + "grad_norm": 2.2647969722747803, + "learning_rate": 2e-05, + "loss": 0.05009675, + "step": 1298 + }, + { + "epoch": 2.598, + "grad_norm": 1.8025486469268799, + "learning_rate": 2e-05, + "loss": 0.05575639, + "step": 1299 + }, + { + "epoch": 2.6, + "grad_norm": 1.8778318166732788, + "learning_rate": 2e-05, + "loss": 0.04730181, + "step": 1300 + }, + { + "epoch": 2.602, + "grad_norm": 1.6435505151748657, + "learning_rate": 2e-05, + "loss": 0.04583211, + "step": 1301 + }, + { + "epoch": 2.604, + "grad_norm": 1.7641148567199707, + "learning_rate": 2e-05, + "loss": 0.04589385, + "step": 1302 + }, + { + "epoch": 2.606, + "grad_norm": 2.1896462440490723, + "learning_rate": 2e-05, + "loss": 0.05265369, + "step": 1303 + }, + { + "epoch": 2.608, + "grad_norm": 2.137312173843384, + "learning_rate": 2e-05, + "loss": 0.05405066, + "step": 1304 + }, + { + "epoch": 2.61, + "grad_norm": 1.550561547279358, + "learning_rate": 2e-05, + "loss": 0.05104889, + "step": 1305 + }, + { + "epoch": 2.612, + "grad_norm": 1.8577524423599243, + "learning_rate": 2e-05, + "loss": 0.06039685, + "step": 1306 + }, + { + "epoch": 2.614, + "grad_norm": 2.0149245262145996, + "learning_rate": 2e-05, + "loss": 0.05752065, + "step": 1307 + }, + { + "epoch": 2.616, + "grad_norm": 1.2771774530410767, + "learning_rate": 2e-05, + "loss": 0.04627162, + "step": 1308 + }, + { + "epoch": 2.618, + "grad_norm": 1.2289670705795288, + "learning_rate": 2e-05, + "loss": 0.04631304, + "step": 1309 + }, + { + "epoch": 2.62, + "grad_norm": 1.5598198175430298, + "learning_rate": 2e-05, + "loss": 0.04497156, + "step": 1310 + }, + { + "epoch": 2.622, + "grad_norm": 2.00862717628479, + "learning_rate": 2e-05, + "loss": 0.0671872, + "step": 1311 + }, + { + "epoch": 2.624, + "grad_norm": 1.549716591835022, + "learning_rate": 2e-05, + "loss": 0.04912385, + "step": 1312 + }, + { + "epoch": 2.626, + "grad_norm": 1.7805136442184448, + "learning_rate": 2e-05, + "loss": 0.04953271, + "step": 1313 + }, + { + "epoch": 2.628, + "grad_norm": 1.4479440450668335, + "learning_rate": 2e-05, + "loss": 0.03930682, + "step": 1314 + }, + { + "epoch": 2.63, + "grad_norm": 2.026073932647705, + "learning_rate": 2e-05, + "loss": 0.0562569, + "step": 1315 + }, + { + "epoch": 2.632, + "grad_norm": 1.291940450668335, + "learning_rate": 2e-05, + "loss": 0.05585942, + "step": 1316 + }, + { + "epoch": 2.634, + "grad_norm": 1.6085025072097778, + "learning_rate": 2e-05, + "loss": 0.05520656, + "step": 1317 + }, + { + "epoch": 2.636, + "grad_norm": 1.7852100133895874, + "learning_rate": 2e-05, + "loss": 0.03830192, + "step": 1318 + }, + { + "epoch": 2.638, + "grad_norm": 2.1503891944885254, + "learning_rate": 2e-05, + "loss": 0.05418783, + "step": 1319 + }, + { + "epoch": 2.64, + "grad_norm": 1.5858957767486572, + "learning_rate": 2e-05, + "loss": 0.05161075, + "step": 1320 + }, + { + "epoch": 2.642, + "grad_norm": 1.517919659614563, + "learning_rate": 2e-05, + "loss": 0.04599277, + "step": 1321 + }, + { + "epoch": 2.644, + "grad_norm": 2.923825979232788, + "learning_rate": 2e-05, + "loss": 0.05314268, + "step": 1322 + }, + { + "epoch": 2.646, + "grad_norm": 1.6294347047805786, + "learning_rate": 2e-05, + "loss": 0.04354763, + "step": 1323 + }, + { + "epoch": 2.648, + "grad_norm": 2.0495193004608154, + "learning_rate": 2e-05, + "loss": 0.05836939, + "step": 1324 + }, + { + "epoch": 2.65, + "grad_norm": 
1.94076669216156, + "learning_rate": 2e-05, + "loss": 0.04274346, + "step": 1325 + }, + { + "epoch": 2.652, + "grad_norm": 2.360132932662964, + "learning_rate": 2e-05, + "loss": 0.0647784, + "step": 1326 + }, + { + "epoch": 2.654, + "grad_norm": 1.8153984546661377, + "learning_rate": 2e-05, + "loss": 0.04720091, + "step": 1327 + }, + { + "epoch": 2.656, + "grad_norm": 1.725906252861023, + "learning_rate": 2e-05, + "loss": 0.04768829, + "step": 1328 + }, + { + "epoch": 2.658, + "grad_norm": 3.0785069465637207, + "learning_rate": 2e-05, + "loss": 0.05972423, + "step": 1329 + }, + { + "epoch": 2.66, + "grad_norm": 2.830115556716919, + "learning_rate": 2e-05, + "loss": 0.07905363, + "step": 1330 + }, + { + "epoch": 2.662, + "grad_norm": 1.4237239360809326, + "learning_rate": 2e-05, + "loss": 0.05330458, + "step": 1331 + }, + { + "epoch": 2.664, + "grad_norm": 1.5219199657440186, + "learning_rate": 2e-05, + "loss": 0.05649319, + "step": 1332 + }, + { + "epoch": 2.666, + "grad_norm": 1.4314558506011963, + "learning_rate": 2e-05, + "loss": 0.03354269, + "step": 1333 + }, + { + "epoch": 2.668, + "grad_norm": 2.2276740074157715, + "learning_rate": 2e-05, + "loss": 0.07491125, + "step": 1334 + }, + { + "epoch": 2.67, + "grad_norm": 2.2556209564208984, + "learning_rate": 2e-05, + "loss": 0.0776452, + "step": 1335 + }, + { + "epoch": 2.672, + "grad_norm": 1.8496445417404175, + "learning_rate": 2e-05, + "loss": 0.04727801, + "step": 1336 + }, + { + "epoch": 2.674, + "grad_norm": 2.769913673400879, + "learning_rate": 2e-05, + "loss": 0.07160389, + "step": 1337 + }, + { + "epoch": 2.676, + "grad_norm": 1.8476206064224243, + "learning_rate": 2e-05, + "loss": 0.04567086, + "step": 1338 + }, + { + "epoch": 2.678, + "grad_norm": 1.8353848457336426, + "learning_rate": 2e-05, + "loss": 0.05308025, + "step": 1339 + }, + { + "epoch": 2.68, + "grad_norm": 1.6821805238723755, + "learning_rate": 2e-05, + "loss": 0.04728777, + "step": 1340 + }, + { + "epoch": 2.682, + "grad_norm": 2.2069616317749023, + "learning_rate": 2e-05, + "loss": 0.05139998, + "step": 1341 + }, + { + "epoch": 2.684, + "grad_norm": 3.2691590785980225, + "learning_rate": 2e-05, + "loss": 0.03943311, + "step": 1342 + }, + { + "epoch": 2.686, + "grad_norm": 1.286525845527649, + "learning_rate": 2e-05, + "loss": 0.04956998, + "step": 1343 + }, + { + "epoch": 2.6879999999999997, + "grad_norm": 1.5744187831878662, + "learning_rate": 2e-05, + "loss": 0.04305596, + "step": 1344 + }, + { + "epoch": 2.69, + "grad_norm": 2.1982498168945312, + "learning_rate": 2e-05, + "loss": 0.06531291, + "step": 1345 + }, + { + "epoch": 2.692, + "grad_norm": 1.5973800420761108, + "learning_rate": 2e-05, + "loss": 0.06064547, + "step": 1346 + }, + { + "epoch": 2.694, + "grad_norm": 1.2219719886779785, + "learning_rate": 2e-05, + "loss": 0.04524098, + "step": 1347 + }, + { + "epoch": 2.6959999999999997, + "grad_norm": 2.1111559867858887, + "learning_rate": 2e-05, + "loss": 0.07144919, + "step": 1348 + }, + { + "epoch": 2.698, + "grad_norm": 1.7122925519943237, + "learning_rate": 2e-05, + "loss": 0.04555336, + "step": 1349 + }, + { + "epoch": 2.7, + "grad_norm": 1.607475757598877, + "learning_rate": 2e-05, + "loss": 0.05808245, + "step": 1350 + }, + { + "epoch": 2.702, + "grad_norm": 1.6828935146331787, + "learning_rate": 2e-05, + "loss": 0.05782773, + "step": 1351 + }, + { + "epoch": 2.7039999999999997, + "grad_norm": 1.7249581813812256, + "learning_rate": 2e-05, + "loss": 0.0577189, + "step": 1352 + }, + { + "epoch": 2.706, + "grad_norm": 1.476163387298584, + 
"learning_rate": 2e-05, + "loss": 0.0563928, + "step": 1353 + }, + { + "epoch": 2.708, + "grad_norm": 1.3817315101623535, + "learning_rate": 2e-05, + "loss": 0.04125504, + "step": 1354 + }, + { + "epoch": 2.71, + "grad_norm": 1.9194447994232178, + "learning_rate": 2e-05, + "loss": 0.06330554, + "step": 1355 + }, + { + "epoch": 2.7119999999999997, + "grad_norm": 1.357925534248352, + "learning_rate": 2e-05, + "loss": 0.04602174, + "step": 1356 + }, + { + "epoch": 2.714, + "grad_norm": 2.3565990924835205, + "learning_rate": 2e-05, + "loss": 0.05163093, + "step": 1357 + }, + { + "epoch": 2.716, + "grad_norm": 1.6682748794555664, + "learning_rate": 2e-05, + "loss": 0.03820909, + "step": 1358 + }, + { + "epoch": 2.718, + "grad_norm": 2.4361672401428223, + "learning_rate": 2e-05, + "loss": 0.05794044, + "step": 1359 + }, + { + "epoch": 2.7199999999999998, + "grad_norm": 2.8087456226348877, + "learning_rate": 2e-05, + "loss": 0.0474041, + "step": 1360 + }, + { + "epoch": 2.722, + "grad_norm": 2.0830671787261963, + "learning_rate": 2e-05, + "loss": 0.0617595, + "step": 1361 + }, + { + "epoch": 2.724, + "grad_norm": 2.5234179496765137, + "learning_rate": 2e-05, + "loss": 0.07854919, + "step": 1362 + }, + { + "epoch": 2.726, + "grad_norm": 2.3360183238983154, + "learning_rate": 2e-05, + "loss": 0.0680959, + "step": 1363 + }, + { + "epoch": 2.7279999999999998, + "grad_norm": 1.46207594871521, + "learning_rate": 2e-05, + "loss": 0.05441956, + "step": 1364 + }, + { + "epoch": 2.73, + "grad_norm": 1.2000892162322998, + "learning_rate": 2e-05, + "loss": 0.03472488, + "step": 1365 + }, + { + "epoch": 2.732, + "grad_norm": 1.1089409589767456, + "learning_rate": 2e-05, + "loss": 0.03588956, + "step": 1366 + }, + { + "epoch": 2.734, + "grad_norm": 1.484050989151001, + "learning_rate": 2e-05, + "loss": 0.0420528, + "step": 1367 + }, + { + "epoch": 2.7359999999999998, + "grad_norm": 2.1856143474578857, + "learning_rate": 2e-05, + "loss": 0.07063307, + "step": 1368 + }, + { + "epoch": 2.738, + "grad_norm": 1.3538694381713867, + "learning_rate": 2e-05, + "loss": 0.04442645, + "step": 1369 + }, + { + "epoch": 2.74, + "grad_norm": 1.1728174686431885, + "learning_rate": 2e-05, + "loss": 0.03758407, + "step": 1370 + }, + { + "epoch": 2.742, + "grad_norm": 1.5417742729187012, + "learning_rate": 2e-05, + "loss": 0.05347462, + "step": 1371 + }, + { + "epoch": 2.7439999999999998, + "grad_norm": 1.2521121501922607, + "learning_rate": 2e-05, + "loss": 0.03596249, + "step": 1372 + }, + { + "epoch": 2.746, + "grad_norm": 1.7495527267456055, + "learning_rate": 2e-05, + "loss": 0.05753302, + "step": 1373 + }, + { + "epoch": 2.748, + "grad_norm": 1.933049201965332, + "learning_rate": 2e-05, + "loss": 0.04527674, + "step": 1374 + }, + { + "epoch": 2.75, + "grad_norm": 2.8900275230407715, + "learning_rate": 2e-05, + "loss": 0.06932513, + "step": 1375 + }, + { + "epoch": 2.752, + "grad_norm": 1.453918695449829, + "learning_rate": 2e-05, + "loss": 0.04357269, + "step": 1376 + }, + { + "epoch": 2.754, + "grad_norm": 1.4563038349151611, + "learning_rate": 2e-05, + "loss": 0.05056226, + "step": 1377 + }, + { + "epoch": 2.7560000000000002, + "grad_norm": 1.504270076751709, + "learning_rate": 2e-05, + "loss": 0.05600676, + "step": 1378 + }, + { + "epoch": 2.758, + "grad_norm": 1.9855608940124512, + "learning_rate": 2e-05, + "loss": 0.05795199, + "step": 1379 + }, + { + "epoch": 2.76, + "grad_norm": 1.656665563583374, + "learning_rate": 2e-05, + "loss": 0.05140248, + "step": 1380 + }, + { + "epoch": 2.762, + "grad_norm": 
1.7435170412063599, + "learning_rate": 2e-05, + "loss": 0.04258007, + "step": 1381 + }, + { + "epoch": 2.7640000000000002, + "grad_norm": 1.655081033706665, + "learning_rate": 2e-05, + "loss": 0.0503007, + "step": 1382 + }, + { + "epoch": 2.766, + "grad_norm": 1.523956298828125, + "learning_rate": 2e-05, + "loss": 0.0550363, + "step": 1383 + }, + { + "epoch": 2.768, + "grad_norm": 1.4899818897247314, + "learning_rate": 2e-05, + "loss": 0.04842805, + "step": 1384 + }, + { + "epoch": 2.77, + "grad_norm": 2.0813798904418945, + "learning_rate": 2e-05, + "loss": 0.05239079, + "step": 1385 + }, + { + "epoch": 2.7720000000000002, + "grad_norm": 1.5431971549987793, + "learning_rate": 2e-05, + "loss": 0.05558814, + "step": 1386 + }, + { + "epoch": 2.774, + "grad_norm": 1.7558424472808838, + "learning_rate": 2e-05, + "loss": 0.04031524, + "step": 1387 + }, + { + "epoch": 2.776, + "grad_norm": 0.9710761308670044, + "learning_rate": 2e-05, + "loss": 0.02542916, + "step": 1388 + }, + { + "epoch": 2.778, + "grad_norm": 1.3783509731292725, + "learning_rate": 2e-05, + "loss": 0.05585878, + "step": 1389 + }, + { + "epoch": 2.7800000000000002, + "grad_norm": 1.7113022804260254, + "learning_rate": 2e-05, + "loss": 0.06848748, + "step": 1390 + }, + { + "epoch": 2.782, + "grad_norm": 1.9026986360549927, + "learning_rate": 2e-05, + "loss": 0.04627277, + "step": 1391 + }, + { + "epoch": 2.784, + "grad_norm": 2.1564736366271973, + "learning_rate": 2e-05, + "loss": 0.04546306, + "step": 1392 + }, + { + "epoch": 2.786, + "grad_norm": 1.8871279954910278, + "learning_rate": 2e-05, + "loss": 0.05558582, + "step": 1393 + }, + { + "epoch": 2.7880000000000003, + "grad_norm": 1.3954287767410278, + "learning_rate": 2e-05, + "loss": 0.05387101, + "step": 1394 + }, + { + "epoch": 2.79, + "grad_norm": 1.9269911050796509, + "learning_rate": 2e-05, + "loss": 0.0572519, + "step": 1395 + }, + { + "epoch": 2.792, + "grad_norm": 1.7022475004196167, + "learning_rate": 2e-05, + "loss": 0.04831417, + "step": 1396 + }, + { + "epoch": 2.794, + "grad_norm": 1.7882295846939087, + "learning_rate": 2e-05, + "loss": 0.03874616, + "step": 1397 + }, + { + "epoch": 2.7960000000000003, + "grad_norm": 1.4796777963638306, + "learning_rate": 2e-05, + "loss": 0.03719701, + "step": 1398 + }, + { + "epoch": 2.798, + "grad_norm": 1.5420184135437012, + "learning_rate": 2e-05, + "loss": 0.05557998, + "step": 1399 + }, + { + "epoch": 2.8, + "grad_norm": 1.8424732685089111, + "learning_rate": 2e-05, + "loss": 0.05429619, + "step": 1400 + }, + { + "epoch": 2.802, + "grad_norm": 1.4261318445205688, + "learning_rate": 2e-05, + "loss": 0.05679541, + "step": 1401 + }, + { + "epoch": 2.8040000000000003, + "grad_norm": 1.93724524974823, + "learning_rate": 2e-05, + "loss": 0.04712646, + "step": 1402 + }, + { + "epoch": 2.806, + "grad_norm": 1.8298678398132324, + "learning_rate": 2e-05, + "loss": 0.0460495, + "step": 1403 + }, + { + "epoch": 2.808, + "grad_norm": 2.4823086261749268, + "learning_rate": 2e-05, + "loss": 0.05720939, + "step": 1404 + }, + { + "epoch": 2.81, + "grad_norm": 2.05409574508667, + "learning_rate": 2e-05, + "loss": 0.06287205, + "step": 1405 + }, + { + "epoch": 2.8120000000000003, + "grad_norm": 1.9300044775009155, + "learning_rate": 2e-05, + "loss": 0.05220521, + "step": 1406 + }, + { + "epoch": 2.814, + "grad_norm": 1.6382372379302979, + "learning_rate": 2e-05, + "loss": 0.04585889, + "step": 1407 + }, + { + "epoch": 2.816, + "grad_norm": 1.6744272708892822, + "learning_rate": 2e-05, + "loss": 0.05738734, + "step": 1408 + }, + { + "epoch": 
2.818, + "grad_norm": 1.097986102104187, + "learning_rate": 2e-05, + "loss": 0.03808478, + "step": 1409 + }, + { + "epoch": 2.82, + "grad_norm": 1.8194561004638672, + "learning_rate": 2e-05, + "loss": 0.05379384, + "step": 1410 + }, + { + "epoch": 2.822, + "grad_norm": 1.1608667373657227, + "learning_rate": 2e-05, + "loss": 0.04018103, + "step": 1411 + }, + { + "epoch": 2.824, + "grad_norm": 1.5536198616027832, + "learning_rate": 2e-05, + "loss": 0.0568902, + "step": 1412 + }, + { + "epoch": 2.826, + "grad_norm": 1.306771159172058, + "learning_rate": 2e-05, + "loss": 0.04700731, + "step": 1413 + }, + { + "epoch": 2.828, + "grad_norm": 1.4206809997558594, + "learning_rate": 2e-05, + "loss": 0.03569669, + "step": 1414 + }, + { + "epoch": 2.83, + "grad_norm": 2.00671124458313, + "learning_rate": 2e-05, + "loss": 0.0654131, + "step": 1415 + }, + { + "epoch": 2.832, + "grad_norm": 1.3739982843399048, + "learning_rate": 2e-05, + "loss": 0.05260873, + "step": 1416 + }, + { + "epoch": 2.834, + "grad_norm": 1.3874986171722412, + "learning_rate": 2e-05, + "loss": 0.038274, + "step": 1417 + }, + { + "epoch": 2.836, + "grad_norm": 1.056638479232788, + "learning_rate": 2e-05, + "loss": 0.03164873, + "step": 1418 + }, + { + "epoch": 2.838, + "grad_norm": 2.646418809890747, + "learning_rate": 2e-05, + "loss": 0.0538192, + "step": 1419 + }, + { + "epoch": 2.84, + "grad_norm": 1.1970181465148926, + "learning_rate": 2e-05, + "loss": 0.03613362, + "step": 1420 + }, + { + "epoch": 2.842, + "grad_norm": 1.7380998134613037, + "learning_rate": 2e-05, + "loss": 0.05502573, + "step": 1421 + }, + { + "epoch": 2.844, + "grad_norm": 1.64398193359375, + "learning_rate": 2e-05, + "loss": 0.03875776, + "step": 1422 + }, + { + "epoch": 2.846, + "grad_norm": 2.1740400791168213, + "learning_rate": 2e-05, + "loss": 0.05134468, + "step": 1423 + }, + { + "epoch": 2.848, + "grad_norm": 2.8908863067626953, + "learning_rate": 2e-05, + "loss": 0.08046682, + "step": 1424 + }, + { + "epoch": 2.85, + "grad_norm": 1.89336359500885, + "learning_rate": 2e-05, + "loss": 0.05308978, + "step": 1425 + }, + { + "epoch": 2.852, + "grad_norm": 2.1659324169158936, + "learning_rate": 2e-05, + "loss": 0.0364816, + "step": 1426 + }, + { + "epoch": 2.854, + "grad_norm": 1.402146339416504, + "learning_rate": 2e-05, + "loss": 0.04808577, + "step": 1427 + }, + { + "epoch": 2.856, + "grad_norm": 1.9512687921524048, + "learning_rate": 2e-05, + "loss": 0.05041477, + "step": 1428 + }, + { + "epoch": 2.858, + "grad_norm": 1.3872478008270264, + "learning_rate": 2e-05, + "loss": 0.04761492, + "step": 1429 + }, + { + "epoch": 2.86, + "grad_norm": 1.659584403038025, + "learning_rate": 2e-05, + "loss": 0.06967552, + "step": 1430 + }, + { + "epoch": 2.862, + "grad_norm": 1.8425308465957642, + "learning_rate": 2e-05, + "loss": 0.04844275, + "step": 1431 + }, + { + "epoch": 2.864, + "grad_norm": 1.3185906410217285, + "learning_rate": 2e-05, + "loss": 0.04250592, + "step": 1432 + }, + { + "epoch": 2.866, + "grad_norm": 1.1444284915924072, + "learning_rate": 2e-05, + "loss": 0.03224308, + "step": 1433 + }, + { + "epoch": 2.868, + "grad_norm": 1.4065706729888916, + "learning_rate": 2e-05, + "loss": 0.05243749, + "step": 1434 + }, + { + "epoch": 2.87, + "grad_norm": 2.1064388751983643, + "learning_rate": 2e-05, + "loss": 0.02927379, + "step": 1435 + }, + { + "epoch": 2.872, + "grad_norm": 1.6056631803512573, + "learning_rate": 2e-05, + "loss": 0.03909209, + "step": 1436 + }, + { + "epoch": 2.874, + "grad_norm": 1.3546314239501953, + "learning_rate": 2e-05, + "loss": 
0.0485357, + "step": 1437 + }, + { + "epoch": 2.876, + "grad_norm": 1.5769225358963013, + "learning_rate": 2e-05, + "loss": 0.05911487, + "step": 1438 + }, + { + "epoch": 2.878, + "grad_norm": 1.589120864868164, + "learning_rate": 2e-05, + "loss": 0.04771549, + "step": 1439 + }, + { + "epoch": 2.88, + "grad_norm": 2.1113979816436768, + "learning_rate": 2e-05, + "loss": 0.07277437, + "step": 1440 + }, + { + "epoch": 2.882, + "grad_norm": 2.021388530731201, + "learning_rate": 2e-05, + "loss": 0.04649474, + "step": 1441 + }, + { + "epoch": 2.884, + "grad_norm": 1.8449045419692993, + "learning_rate": 2e-05, + "loss": 0.04096931, + "step": 1442 + }, + { + "epoch": 2.886, + "grad_norm": 2.4654793739318848, + "learning_rate": 2e-05, + "loss": 0.06707292, + "step": 1443 + }, + { + "epoch": 2.888, + "grad_norm": 1.8038156032562256, + "learning_rate": 2e-05, + "loss": 0.05822627, + "step": 1444 + }, + { + "epoch": 2.89, + "grad_norm": 1.6336687803268433, + "learning_rate": 2e-05, + "loss": 0.04999952, + "step": 1445 + }, + { + "epoch": 2.892, + "grad_norm": 1.4008294343948364, + "learning_rate": 2e-05, + "loss": 0.04774325, + "step": 1446 + }, + { + "epoch": 2.894, + "grad_norm": 1.8084869384765625, + "learning_rate": 2e-05, + "loss": 0.07069649, + "step": 1447 + }, + { + "epoch": 2.896, + "grad_norm": 3.3638651371002197, + "learning_rate": 2e-05, + "loss": 0.09050111, + "step": 1448 + }, + { + "epoch": 2.898, + "grad_norm": 1.1725375652313232, + "learning_rate": 2e-05, + "loss": 0.04985755, + "step": 1449 + }, + { + "epoch": 2.9, + "grad_norm": 1.0440391302108765, + "learning_rate": 2e-05, + "loss": 0.05144562, + "step": 1450 + }, + { + "epoch": 2.902, + "grad_norm": 2.280170440673828, + "learning_rate": 2e-05, + "loss": 0.04932537, + "step": 1451 + }, + { + "epoch": 2.904, + "grad_norm": 1.461732268333435, + "learning_rate": 2e-05, + "loss": 0.04785193, + "step": 1452 + }, + { + "epoch": 2.906, + "grad_norm": 1.9254556894302368, + "learning_rate": 2e-05, + "loss": 0.05030049, + "step": 1453 + }, + { + "epoch": 2.908, + "grad_norm": 1.182039737701416, + "learning_rate": 2e-05, + "loss": 0.03006881, + "step": 1454 + }, + { + "epoch": 2.91, + "grad_norm": 2.5947253704071045, + "learning_rate": 2e-05, + "loss": 0.04492339, + "step": 1455 + }, + { + "epoch": 2.912, + "grad_norm": 1.543723225593567, + "learning_rate": 2e-05, + "loss": 0.05002248, + "step": 1456 + }, + { + "epoch": 2.914, + "grad_norm": 1.1024924516677856, + "learning_rate": 2e-05, + "loss": 0.041133, + "step": 1457 + }, + { + "epoch": 2.916, + "grad_norm": 1.7310285568237305, + "learning_rate": 2e-05, + "loss": 0.05967389, + "step": 1458 + }, + { + "epoch": 2.918, + "grad_norm": 2.281721830368042, + "learning_rate": 2e-05, + "loss": 0.06781383, + "step": 1459 + }, + { + "epoch": 2.92, + "grad_norm": 1.6971452236175537, + "learning_rate": 2e-05, + "loss": 0.05646857, + "step": 1460 + }, + { + "epoch": 2.922, + "grad_norm": 1.741514801979065, + "learning_rate": 2e-05, + "loss": 0.07292978, + "step": 1461 + }, + { + "epoch": 2.924, + "grad_norm": 1.4831644296646118, + "learning_rate": 2e-05, + "loss": 0.03878976, + "step": 1462 + }, + { + "epoch": 2.926, + "grad_norm": 1.676571011543274, + "learning_rate": 2e-05, + "loss": 0.05792604, + "step": 1463 + }, + { + "epoch": 2.928, + "grad_norm": 1.8864375352859497, + "learning_rate": 2e-05, + "loss": 0.06544764, + "step": 1464 + }, + { + "epoch": 2.93, + "grad_norm": 2.198659658432007, + "learning_rate": 2e-05, + "loss": 0.06713847, + "step": 1465 + }, + { + "epoch": 2.932, + "grad_norm": 
1.498012900352478, + "learning_rate": 2e-05, + "loss": 0.03821286, + "step": 1466 + }, + { + "epoch": 2.934, + "grad_norm": 1.4330593347549438, + "learning_rate": 2e-05, + "loss": 0.06282357, + "step": 1467 + }, + { + "epoch": 2.936, + "grad_norm": 1.3292213678359985, + "learning_rate": 2e-05, + "loss": 0.06121788, + "step": 1468 + }, + { + "epoch": 2.9379999999999997, + "grad_norm": 1.5773099660873413, + "learning_rate": 2e-05, + "loss": 0.03123096, + "step": 1469 + }, + { + "epoch": 2.94, + "grad_norm": 1.3884388208389282, + "learning_rate": 2e-05, + "loss": 0.04133505, + "step": 1470 + }, + { + "epoch": 2.942, + "grad_norm": 1.1826488971710205, + "learning_rate": 2e-05, + "loss": 0.03776342, + "step": 1471 + }, + { + "epoch": 2.944, + "grad_norm": 1.5211957693099976, + "learning_rate": 2e-05, + "loss": 0.04243449, + "step": 1472 + }, + { + "epoch": 2.9459999999999997, + "grad_norm": 2.148106336593628, + "learning_rate": 2e-05, + "loss": 0.06043831, + "step": 1473 + }, + { + "epoch": 2.948, + "grad_norm": 1.1765345335006714, + "learning_rate": 2e-05, + "loss": 0.04386732, + "step": 1474 + }, + { + "epoch": 2.95, + "grad_norm": 1.502057433128357, + "learning_rate": 2e-05, + "loss": 0.0411534, + "step": 1475 + }, + { + "epoch": 2.952, + "grad_norm": 1.6430046558380127, + "learning_rate": 2e-05, + "loss": 0.037657, + "step": 1476 + }, + { + "epoch": 2.9539999999999997, + "grad_norm": 1.535294771194458, + "learning_rate": 2e-05, + "loss": 0.05130748, + "step": 1477 + }, + { + "epoch": 2.956, + "grad_norm": 1.9392225742340088, + "learning_rate": 2e-05, + "loss": 0.05215064, + "step": 1478 + }, + { + "epoch": 2.958, + "grad_norm": 1.1696999073028564, + "learning_rate": 2e-05, + "loss": 0.02946492, + "step": 1479 + }, + { + "epoch": 2.96, + "grad_norm": 1.414081335067749, + "learning_rate": 2e-05, + "loss": 0.04422535, + "step": 1480 + }, + { + "epoch": 2.9619999999999997, + "grad_norm": 1.719420075416565, + "learning_rate": 2e-05, + "loss": 0.05473565, + "step": 1481 + }, + { + "epoch": 2.964, + "grad_norm": 2.0351169109344482, + "learning_rate": 2e-05, + "loss": 0.04414459, + "step": 1482 + }, + { + "epoch": 2.966, + "grad_norm": 1.012802004814148, + "learning_rate": 2e-05, + "loss": 0.03308199, + "step": 1483 + }, + { + "epoch": 2.968, + "grad_norm": 2.026688814163208, + "learning_rate": 2e-05, + "loss": 0.04603078, + "step": 1484 + }, + { + "epoch": 2.9699999999999998, + "grad_norm": 2.1403253078460693, + "learning_rate": 2e-05, + "loss": 0.04239823, + "step": 1485 + }, + { + "epoch": 2.972, + "grad_norm": 2.087897539138794, + "learning_rate": 2e-05, + "loss": 0.05140383, + "step": 1486 + }, + { + "epoch": 2.974, + "grad_norm": 3.251877546310425, + "learning_rate": 2e-05, + "loss": 0.05295721, + "step": 1487 + }, + { + "epoch": 2.976, + "grad_norm": 2.2770659923553467, + "learning_rate": 2e-05, + "loss": 0.06836365, + "step": 1488 + }, + { + "epoch": 2.9779999999999998, + "grad_norm": 1.4098083972930908, + "learning_rate": 2e-05, + "loss": 0.03099969, + "step": 1489 + }, + { + "epoch": 2.98, + "grad_norm": 1.4418541193008423, + "learning_rate": 2e-05, + "loss": 0.0420429, + "step": 1490 + }, + { + "epoch": 2.982, + "grad_norm": 1.7272592782974243, + "learning_rate": 2e-05, + "loss": 0.05751514, + "step": 1491 + }, + { + "epoch": 2.984, + "grad_norm": 1.1021864414215088, + "learning_rate": 2e-05, + "loss": 0.03865705, + "step": 1492 + }, + { + "epoch": 2.9859999999999998, + "grad_norm": 1.4109399318695068, + "learning_rate": 2e-05, + "loss": 0.05035442, + "step": 1493 + }, + { + "epoch": 
2.988, + "grad_norm": 1.5310124158859253, + "learning_rate": 2e-05, + "loss": 0.04429567, + "step": 1494 + }, + { + "epoch": 2.99, + "grad_norm": 1.1835428476333618, + "learning_rate": 2e-05, + "loss": 0.04300354, + "step": 1495 + }, + { + "epoch": 2.992, + "grad_norm": 1.6832672357559204, + "learning_rate": 2e-05, + "loss": 0.06267562, + "step": 1496 + }, + { + "epoch": 2.9939999999999998, + "grad_norm": 1.7960203886032104, + "learning_rate": 2e-05, + "loss": 0.04719928, + "step": 1497 + }, + { + "epoch": 2.996, + "grad_norm": 1.2733722925186157, + "learning_rate": 2e-05, + "loss": 0.04313634, + "step": 1498 + }, + { + "epoch": 2.998, + "grad_norm": 2.289060592651367, + "learning_rate": 2e-05, + "loss": 0.04968451, + "step": 1499 + }, + { + "epoch": 3.0, + "grad_norm": 1.4316084384918213, + "learning_rate": 2e-05, + "loss": 0.04869864, + "step": 1500 + }, + { + "epoch": 3.0, + "eval_performance": { + "AngleClassification_1": 0.992, + "AngleClassification_2": 0.52, + "AngleClassification_3": 0.49101796407185627, + "Equal_1": 0.97, + "Equal_2": 0.8323353293413174, + "Equal_3": 0.720558882235529, + "LineComparison_1": 0.992, + "LineComparison_2": 0.9740518962075848, + "LineComparison_3": 0.8582834331337326, + "Parallel_1": 0.9458917835671342, + "Parallel_2": 0.9438877755511023, + "Parallel_3": 0.744, + "Perpendicular_1": 0.934, + "Perpendicular_2": 0.41, + "Perpendicular_3": 0.19939879759519039, + "PointLiesOnCircle_1": 0.9859719438877755, + "PointLiesOnCircle_2": 0.9889333333333334, + "PointLiesOnCircle_3": 0.7892666666666667, + "PointLiesOnLine_1": 0.9799599198396793, + "PointLiesOnLine_2": 0.9238476953907816, + "PointLiesOnLine_3": 0.49101796407185627 + }, + "eval_runtime": 321.7153, + "eval_samples_per_second": 32.638, + "eval_steps_per_second": 0.653, + "step": 1500 + }, + { + "epoch": 3.002, + "grad_norm": 2.2160494327545166, + "learning_rate": 2e-05, + "loss": 0.04842204, + "step": 1501 + }, + { + "epoch": 3.004, + "grad_norm": 2.940279006958008, + "learning_rate": 2e-05, + "loss": 0.06389765, + "step": 1502 + }, + { + "epoch": 3.006, + "grad_norm": 1.4351305961608887, + "learning_rate": 2e-05, + "loss": 0.03972311, + "step": 1503 + }, + { + "epoch": 3.008, + "grad_norm": 1.8173421621322632, + "learning_rate": 2e-05, + "loss": 0.06714956, + "step": 1504 + }, + { + "epoch": 3.01, + "grad_norm": 1.6168220043182373, + "learning_rate": 2e-05, + "loss": 0.05728213, + "step": 1505 + }, + { + "epoch": 3.012, + "grad_norm": 2.713430166244507, + "learning_rate": 2e-05, + "loss": 0.08517258, + "step": 1506 + }, + { + "epoch": 3.014, + "grad_norm": 1.6535557508468628, + "learning_rate": 2e-05, + "loss": 0.05091209, + "step": 1507 + }, + { + "epoch": 3.016, + "grad_norm": 1.5731172561645508, + "learning_rate": 2e-05, + "loss": 0.04936452, + "step": 1508 + }, + { + "epoch": 3.018, + "grad_norm": 2.121694326400757, + "learning_rate": 2e-05, + "loss": 0.05214013, + "step": 1509 + }, + { + "epoch": 3.02, + "grad_norm": 1.7850323915481567, + "learning_rate": 2e-05, + "loss": 0.04803599, + "step": 1510 + }, + { + "epoch": 3.022, + "grad_norm": 1.0655592679977417, + "learning_rate": 2e-05, + "loss": 0.04061773, + "step": 1511 + }, + { + "epoch": 3.024, + "grad_norm": 1.7327584028244019, + "learning_rate": 2e-05, + "loss": 0.05965272, + "step": 1512 + }, + { + "epoch": 3.026, + "grad_norm": 1.4035505056381226, + "learning_rate": 2e-05, + "loss": 0.04205831, + "step": 1513 + }, + { + "epoch": 3.028, + "grad_norm": 1.2730896472930908, + "learning_rate": 2e-05, + "loss": 0.04025223, + "step": 1514 + }, + { 
+ "epoch": 3.03, + "grad_norm": 1.6347522735595703, + "learning_rate": 2e-05, + "loss": 0.06551866, + "step": 1515 + }, + { + "epoch": 3.032, + "grad_norm": 1.24717116355896, + "learning_rate": 2e-05, + "loss": 0.04352552, + "step": 1516 + }, + { + "epoch": 3.034, + "grad_norm": 2.378849744796753, + "learning_rate": 2e-05, + "loss": 0.05578585, + "step": 1517 + }, + { + "epoch": 3.036, + "grad_norm": 1.5153645277023315, + "learning_rate": 2e-05, + "loss": 0.06007598, + "step": 1518 + }, + { + "epoch": 3.038, + "grad_norm": 1.4317395687103271, + "learning_rate": 2e-05, + "loss": 0.04686758, + "step": 1519 + }, + { + "epoch": 3.04, + "grad_norm": 1.150730013847351, + "learning_rate": 2e-05, + "loss": 0.03575693, + "step": 1520 + }, + { + "epoch": 3.042, + "grad_norm": 1.221483588218689, + "learning_rate": 2e-05, + "loss": 0.03753293, + "step": 1521 + }, + { + "epoch": 3.044, + "grad_norm": 1.595436453819275, + "learning_rate": 2e-05, + "loss": 0.04876781, + "step": 1522 + }, + { + "epoch": 3.046, + "grad_norm": 1.5910844802856445, + "learning_rate": 2e-05, + "loss": 0.04243572, + "step": 1523 + }, + { + "epoch": 3.048, + "grad_norm": 1.333764672279358, + "learning_rate": 2e-05, + "loss": 0.04689807, + "step": 1524 + }, + { + "epoch": 3.05, + "grad_norm": 1.244879961013794, + "learning_rate": 2e-05, + "loss": 0.05209486, + "step": 1525 + }, + { + "epoch": 3.052, + "grad_norm": 2.523118734359741, + "learning_rate": 2e-05, + "loss": 0.05441052, + "step": 1526 + }, + { + "epoch": 3.054, + "grad_norm": 1.1830792427062988, + "learning_rate": 2e-05, + "loss": 0.03513453, + "step": 1527 + }, + { + "epoch": 3.056, + "grad_norm": 1.4500566720962524, + "learning_rate": 2e-05, + "loss": 0.03788603, + "step": 1528 + }, + { + "epoch": 3.058, + "grad_norm": 1.9590797424316406, + "learning_rate": 2e-05, + "loss": 0.07833979, + "step": 1529 + }, + { + "epoch": 3.06, + "grad_norm": 2.1240177154541016, + "learning_rate": 2e-05, + "loss": 0.06781566, + "step": 1530 + }, + { + "epoch": 3.062, + "grad_norm": 2.877790689468384, + "learning_rate": 2e-05, + "loss": 0.05031575, + "step": 1531 + }, + { + "epoch": 3.064, + "grad_norm": 1.827868938446045, + "learning_rate": 2e-05, + "loss": 0.05153899, + "step": 1532 + }, + { + "epoch": 3.066, + "grad_norm": 1.4657886028289795, + "learning_rate": 2e-05, + "loss": 0.05448882, + "step": 1533 + }, + { + "epoch": 3.068, + "grad_norm": 2.052285671234131, + "learning_rate": 2e-05, + "loss": 0.06243248, + "step": 1534 + }, + { + "epoch": 3.07, + "grad_norm": 0.7502951622009277, + "learning_rate": 2e-05, + "loss": 0.02465053, + "step": 1535 + }, + { + "epoch": 3.072, + "grad_norm": 1.1672446727752686, + "learning_rate": 2e-05, + "loss": 0.03131329, + "step": 1536 + }, + { + "epoch": 3.074, + "grad_norm": 6.168121337890625, + "learning_rate": 2e-05, + "loss": 0.05717981, + "step": 1537 + }, + { + "epoch": 3.076, + "grad_norm": 1.6621938943862915, + "learning_rate": 2e-05, + "loss": 0.04681021, + "step": 1538 + }, + { + "epoch": 3.078, + "grad_norm": 1.9949926137924194, + "learning_rate": 2e-05, + "loss": 0.05344428, + "step": 1539 + }, + { + "epoch": 3.08, + "grad_norm": 1.7297910451889038, + "learning_rate": 2e-05, + "loss": 0.06000186, + "step": 1540 + }, + { + "epoch": 3.082, + "grad_norm": 1.9841177463531494, + "learning_rate": 2e-05, + "loss": 0.05356345, + "step": 1541 + }, + { + "epoch": 3.084, + "grad_norm": 1.619390606880188, + "learning_rate": 2e-05, + "loss": 0.05409247, + "step": 1542 + }, + { + "epoch": 3.086, + "grad_norm": 2.220137357711792, + "learning_rate": 
2e-05, + "loss": 0.06488518, + "step": 1543 + }, + { + "epoch": 3.088, + "grad_norm": 1.2586517333984375, + "learning_rate": 2e-05, + "loss": 0.0334148, + "step": 1544 + }, + { + "epoch": 3.09, + "grad_norm": 2.6507513523101807, + "learning_rate": 2e-05, + "loss": 0.05396133, + "step": 1545 + }, + { + "epoch": 3.092, + "grad_norm": 2.484825372695923, + "learning_rate": 2e-05, + "loss": 0.05893756, + "step": 1546 + }, + { + "epoch": 3.094, + "grad_norm": 1.530316710472107, + "learning_rate": 2e-05, + "loss": 0.04786216, + "step": 1547 + }, + { + "epoch": 3.096, + "grad_norm": 1.5662510395050049, + "learning_rate": 2e-05, + "loss": 0.05811158, + "step": 1548 + }, + { + "epoch": 3.098, + "grad_norm": 1.7030950784683228, + "learning_rate": 2e-05, + "loss": 0.06066509, + "step": 1549 + }, + { + "epoch": 3.1, + "grad_norm": 1.5851967334747314, + "learning_rate": 2e-05, + "loss": 0.06345069, + "step": 1550 + }, + { + "epoch": 3.102, + "grad_norm": 2.7735044956207275, + "learning_rate": 2e-05, + "loss": 0.0733242, + "step": 1551 + }, + { + "epoch": 3.104, + "grad_norm": 1.4052716493606567, + "learning_rate": 2e-05, + "loss": 0.03768426, + "step": 1552 + }, + { + "epoch": 3.106, + "grad_norm": 2.072314977645874, + "learning_rate": 2e-05, + "loss": 0.06333002, + "step": 1553 + }, + { + "epoch": 3.108, + "grad_norm": 1.3217315673828125, + "learning_rate": 2e-05, + "loss": 0.04815859, + "step": 1554 + }, + { + "epoch": 3.11, + "grad_norm": 1.2543447017669678, + "learning_rate": 2e-05, + "loss": 0.04736142, + "step": 1555 + }, + { + "epoch": 3.112, + "grad_norm": 1.8522039651870728, + "learning_rate": 2e-05, + "loss": 0.04406268, + "step": 1556 + }, + { + "epoch": 3.114, + "grad_norm": 1.7204567193984985, + "learning_rate": 2e-05, + "loss": 0.04844861, + "step": 1557 + }, + { + "epoch": 3.116, + "grad_norm": 1.9749001264572144, + "learning_rate": 2e-05, + "loss": 0.05746056, + "step": 1558 + }, + { + "epoch": 3.118, + "grad_norm": 1.9995803833007812, + "learning_rate": 2e-05, + "loss": 0.05737771, + "step": 1559 + }, + { + "epoch": 3.12, + "grad_norm": 2.1211681365966797, + "learning_rate": 2e-05, + "loss": 0.04288347, + "step": 1560 + }, + { + "epoch": 3.122, + "grad_norm": 1.4146476984024048, + "learning_rate": 2e-05, + "loss": 0.05071546, + "step": 1561 + }, + { + "epoch": 3.124, + "grad_norm": 1.660503625869751, + "learning_rate": 2e-05, + "loss": 0.05359786, + "step": 1562 + }, + { + "epoch": 3.126, + "grad_norm": 2.073803186416626, + "learning_rate": 2e-05, + "loss": 0.0541298, + "step": 1563 + }, + { + "epoch": 3.128, + "grad_norm": 2.1681854724884033, + "learning_rate": 2e-05, + "loss": 0.03845184, + "step": 1564 + }, + { + "epoch": 3.13, + "grad_norm": 1.6317102909088135, + "learning_rate": 2e-05, + "loss": 0.04459539, + "step": 1565 + }, + { + "epoch": 3.132, + "grad_norm": 1.2800451517105103, + "learning_rate": 2e-05, + "loss": 0.05038659, + "step": 1566 + }, + { + "epoch": 3.134, + "grad_norm": 1.627054214477539, + "learning_rate": 2e-05, + "loss": 0.05846292, + "step": 1567 + }, + { + "epoch": 3.136, + "grad_norm": 1.5014915466308594, + "learning_rate": 2e-05, + "loss": 0.0461411, + "step": 1568 + }, + { + "epoch": 3.138, + "grad_norm": 1.4676257371902466, + "learning_rate": 2e-05, + "loss": 0.04794473, + "step": 1569 + }, + { + "epoch": 3.14, + "grad_norm": 0.8870722055435181, + "learning_rate": 2e-05, + "loss": 0.0262022, + "step": 1570 + }, + { + "epoch": 3.142, + "grad_norm": 1.5116006135940552, + "learning_rate": 2e-05, + "loss": 0.04896776, + "step": 1571 + }, + { + "epoch": 3.144, + 
"grad_norm": 2.2803430557250977, + "learning_rate": 2e-05, + "loss": 0.03939671, + "step": 1572 + }, + { + "epoch": 3.146, + "grad_norm": 1.6452845335006714, + "learning_rate": 2e-05, + "loss": 0.06173242, + "step": 1573 + }, + { + "epoch": 3.148, + "grad_norm": 1.3423179388046265, + "learning_rate": 2e-05, + "loss": 0.03313474, + "step": 1574 + }, + { + "epoch": 3.15, + "grad_norm": 1.491223692893982, + "learning_rate": 2e-05, + "loss": 0.04076514, + "step": 1575 + }, + { + "epoch": 3.152, + "grad_norm": 1.7405002117156982, + "learning_rate": 2e-05, + "loss": 0.06292631, + "step": 1576 + }, + { + "epoch": 3.154, + "grad_norm": 2.301616907119751, + "learning_rate": 2e-05, + "loss": 0.04174115, + "step": 1577 + }, + { + "epoch": 3.156, + "grad_norm": 1.4608789682388306, + "learning_rate": 2e-05, + "loss": 0.03542221, + "step": 1578 + }, + { + "epoch": 3.158, + "grad_norm": 1.6150435209274292, + "learning_rate": 2e-05, + "loss": 0.0463053, + "step": 1579 + }, + { + "epoch": 3.16, + "grad_norm": 1.156436562538147, + "learning_rate": 2e-05, + "loss": 0.03821928, + "step": 1580 + }, + { + "epoch": 3.162, + "grad_norm": 1.483613133430481, + "learning_rate": 2e-05, + "loss": 0.06052083, + "step": 1581 + }, + { + "epoch": 3.164, + "grad_norm": 3.3407390117645264, + "learning_rate": 2e-05, + "loss": 0.05487797, + "step": 1582 + }, + { + "epoch": 3.166, + "grad_norm": 1.3785268068313599, + "learning_rate": 2e-05, + "loss": 0.04208247, + "step": 1583 + }, + { + "epoch": 3.168, + "grad_norm": 2.0314083099365234, + "learning_rate": 2e-05, + "loss": 0.03377452, + "step": 1584 + }, + { + "epoch": 3.17, + "grad_norm": 2.2946577072143555, + "learning_rate": 2e-05, + "loss": 0.03864856, + "step": 1585 + }, + { + "epoch": 3.172, + "grad_norm": 1.5391055345535278, + "learning_rate": 2e-05, + "loss": 0.04968462, + "step": 1586 + }, + { + "epoch": 3.174, + "grad_norm": 2.3214282989501953, + "learning_rate": 2e-05, + "loss": 0.0490016, + "step": 1587 + }, + { + "epoch": 3.176, + "grad_norm": 2.729959487915039, + "learning_rate": 2e-05, + "loss": 0.06228638, + "step": 1588 + }, + { + "epoch": 3.178, + "grad_norm": 1.3159915208816528, + "learning_rate": 2e-05, + "loss": 0.04129592, + "step": 1589 + }, + { + "epoch": 3.18, + "grad_norm": 2.102053642272949, + "learning_rate": 2e-05, + "loss": 0.06420384, + "step": 1590 + }, + { + "epoch": 3.182, + "grad_norm": 1.6806936264038086, + "learning_rate": 2e-05, + "loss": 0.04690688, + "step": 1591 + }, + { + "epoch": 3.184, + "grad_norm": 1.5850759744644165, + "learning_rate": 2e-05, + "loss": 0.05270191, + "step": 1592 + }, + { + "epoch": 3.186, + "grad_norm": 1.6031419038772583, + "learning_rate": 2e-05, + "loss": 0.04544835, + "step": 1593 + }, + { + "epoch": 3.188, + "grad_norm": 1.542733907699585, + "learning_rate": 2e-05, + "loss": 0.05052012, + "step": 1594 + }, + { + "epoch": 3.19, + "grad_norm": 1.3358275890350342, + "learning_rate": 2e-05, + "loss": 0.04404087, + "step": 1595 + }, + { + "epoch": 3.192, + "grad_norm": 3.1650984287261963, + "learning_rate": 2e-05, + "loss": 0.0582697, + "step": 1596 + }, + { + "epoch": 3.194, + "grad_norm": 2.426821231842041, + "learning_rate": 2e-05, + "loss": 0.06109107, + "step": 1597 + }, + { + "epoch": 3.196, + "grad_norm": 1.221462607383728, + "learning_rate": 2e-05, + "loss": 0.03468029, + "step": 1598 + }, + { + "epoch": 3.198, + "grad_norm": 1.832042932510376, + "learning_rate": 2e-05, + "loss": 0.04503892, + "step": 1599 + }, + { + "epoch": 3.2, + "grad_norm": 1.6637980937957764, + "learning_rate": 2e-05, + "loss": 
0.05590947, + "step": 1600 + }, + { + "epoch": 3.202, + "grad_norm": 1.707131266593933, + "learning_rate": 2e-05, + "loss": 0.04391931, + "step": 1601 + }, + { + "epoch": 3.204, + "grad_norm": 1.2049839496612549, + "learning_rate": 2e-05, + "loss": 0.03543879, + "step": 1602 + }, + { + "epoch": 3.206, + "grad_norm": 1.2202122211456299, + "learning_rate": 2e-05, + "loss": 0.0452034, + "step": 1603 + }, + { + "epoch": 3.208, + "grad_norm": 1.4739149808883667, + "learning_rate": 2e-05, + "loss": 0.05101965, + "step": 1604 + }, + { + "epoch": 3.21, + "grad_norm": 1.7410937547683716, + "learning_rate": 2e-05, + "loss": 0.05315409, + "step": 1605 + }, + { + "epoch": 3.212, + "grad_norm": 1.943514108657837, + "learning_rate": 2e-05, + "loss": 0.05091185, + "step": 1606 + }, + { + "epoch": 3.214, + "grad_norm": 2.420563220977783, + "learning_rate": 2e-05, + "loss": 0.04633863, + "step": 1607 + }, + { + "epoch": 3.216, + "grad_norm": 1.5087945461273193, + "learning_rate": 2e-05, + "loss": 0.0458011, + "step": 1608 + }, + { + "epoch": 3.218, + "grad_norm": 1.2308595180511475, + "learning_rate": 2e-05, + "loss": 0.04334211, + "step": 1609 + }, + { + "epoch": 3.22, + "grad_norm": 1.7495505809783936, + "learning_rate": 2e-05, + "loss": 0.05809302, + "step": 1610 + }, + { + "epoch": 3.222, + "grad_norm": 1.3433486223220825, + "learning_rate": 2e-05, + "loss": 0.0264309, + "step": 1611 + }, + { + "epoch": 3.224, + "grad_norm": 2.1560001373291016, + "learning_rate": 2e-05, + "loss": 0.05610461, + "step": 1612 + }, + { + "epoch": 3.226, + "grad_norm": 1.4553375244140625, + "learning_rate": 2e-05, + "loss": 0.06240574, + "step": 1613 + }, + { + "epoch": 3.228, + "grad_norm": 1.6326318979263306, + "learning_rate": 2e-05, + "loss": 0.04741699, + "step": 1614 + }, + { + "epoch": 3.23, + "grad_norm": 2.0145437717437744, + "learning_rate": 2e-05, + "loss": 0.06130683, + "step": 1615 + }, + { + "epoch": 3.232, + "grad_norm": 2.395770311355591, + "learning_rate": 2e-05, + "loss": 0.06174326, + "step": 1616 + }, + { + "epoch": 3.234, + "grad_norm": 1.4244645833969116, + "learning_rate": 2e-05, + "loss": 0.04702844, + "step": 1617 + }, + { + "epoch": 3.2359999999999998, + "grad_norm": 1.611910104751587, + "learning_rate": 2e-05, + "loss": 0.05492343, + "step": 1618 + }, + { + "epoch": 3.238, + "grad_norm": 1.3452534675598145, + "learning_rate": 2e-05, + "loss": 0.04002935, + "step": 1619 + }, + { + "epoch": 3.24, + "grad_norm": 1.4256281852722168, + "learning_rate": 2e-05, + "loss": 0.06138282, + "step": 1620 + }, + { + "epoch": 3.242, + "grad_norm": 2.094010353088379, + "learning_rate": 2e-05, + "loss": 0.05636217, + "step": 1621 + }, + { + "epoch": 3.2439999999999998, + "grad_norm": 1.8228808641433716, + "learning_rate": 2e-05, + "loss": 0.05640348, + "step": 1622 + }, + { + "epoch": 3.246, + "grad_norm": 0.991633415222168, + "learning_rate": 2e-05, + "loss": 0.03676727, + "step": 1623 + }, + { + "epoch": 3.248, + "grad_norm": 1.5511411428451538, + "learning_rate": 2e-05, + "loss": 0.03195164, + "step": 1624 + }, + { + "epoch": 3.25, + "grad_norm": 1.3886339664459229, + "learning_rate": 2e-05, + "loss": 0.06085005, + "step": 1625 + }, + { + "epoch": 3.252, + "grad_norm": 1.6809405088424683, + "learning_rate": 2e-05, + "loss": 0.06114616, + "step": 1626 + }, + { + "epoch": 3.254, + "grad_norm": 1.2058086395263672, + "learning_rate": 2e-05, + "loss": 0.03140325, + "step": 1627 + }, + { + "epoch": 3.2560000000000002, + "grad_norm": 1.2712700366973877, + "learning_rate": 2e-05, + "loss": 0.05181075, + "step": 1628 + 
}, + { + "epoch": 3.258, + "grad_norm": 1.6511715650558472, + "learning_rate": 2e-05, + "loss": 0.03143429, + "step": 1629 + }, + { + "epoch": 3.26, + "grad_norm": 1.5783659219741821, + "learning_rate": 2e-05, + "loss": 0.04343471, + "step": 1630 + }, + { + "epoch": 3.262, + "grad_norm": 0.9034122824668884, + "learning_rate": 2e-05, + "loss": 0.02218392, + "step": 1631 + }, + { + "epoch": 3.2640000000000002, + "grad_norm": 1.5698509216308594, + "learning_rate": 2e-05, + "loss": 0.05552916, + "step": 1632 + }, + { + "epoch": 3.266, + "grad_norm": 1.4005275964736938, + "learning_rate": 2e-05, + "loss": 0.03984913, + "step": 1633 + }, + { + "epoch": 3.268, + "grad_norm": 1.3338640928268433, + "learning_rate": 2e-05, + "loss": 0.03644662, + "step": 1634 + }, + { + "epoch": 3.27, + "grad_norm": 1.7735025882720947, + "learning_rate": 2e-05, + "loss": 0.03694466, + "step": 1635 + }, + { + "epoch": 3.2720000000000002, + "grad_norm": 2.160663604736328, + "learning_rate": 2e-05, + "loss": 0.05129207, + "step": 1636 + }, + { + "epoch": 3.274, + "grad_norm": 2.0547683238983154, + "learning_rate": 2e-05, + "loss": 0.03293342, + "step": 1637 + }, + { + "epoch": 3.276, + "grad_norm": 1.309456467628479, + "learning_rate": 2e-05, + "loss": 0.04493797, + "step": 1638 + }, + { + "epoch": 3.278, + "grad_norm": 2.039910316467285, + "learning_rate": 2e-05, + "loss": 0.06677075, + "step": 1639 + }, + { + "epoch": 3.2800000000000002, + "grad_norm": 2.196495771408081, + "learning_rate": 2e-05, + "loss": 0.05029328, + "step": 1640 + }, + { + "epoch": 3.282, + "grad_norm": 2.049243927001953, + "learning_rate": 2e-05, + "loss": 0.05796874, + "step": 1641 + }, + { + "epoch": 3.284, + "grad_norm": 2.323168992996216, + "learning_rate": 2e-05, + "loss": 0.04754086, + "step": 1642 + }, + { + "epoch": 3.286, + "grad_norm": 1.9049650430679321, + "learning_rate": 2e-05, + "loss": 0.05514568, + "step": 1643 + }, + { + "epoch": 3.288, + "grad_norm": 1.898848295211792, + "learning_rate": 2e-05, + "loss": 0.03569252, + "step": 1644 + }, + { + "epoch": 3.29, + "grad_norm": 2.345487356185913, + "learning_rate": 2e-05, + "loss": 0.0490138, + "step": 1645 + }, + { + "epoch": 3.292, + "grad_norm": 2.001127004623413, + "learning_rate": 2e-05, + "loss": 0.05215742, + "step": 1646 + }, + { + "epoch": 3.294, + "grad_norm": 1.4057964086532593, + "learning_rate": 2e-05, + "loss": 0.03418482, + "step": 1647 + }, + { + "epoch": 3.296, + "grad_norm": 1.412276029586792, + "learning_rate": 2e-05, + "loss": 0.03291162, + "step": 1648 + }, + { + "epoch": 3.298, + "grad_norm": 1.5926165580749512, + "learning_rate": 2e-05, + "loss": 0.04363141, + "step": 1649 + }, + { + "epoch": 3.3, + "grad_norm": 1.9674831628799438, + "learning_rate": 2e-05, + "loss": 0.06002123, + "step": 1650 + }, + { + "epoch": 3.302, + "grad_norm": 1.581549048423767, + "learning_rate": 2e-05, + "loss": 0.07400438, + "step": 1651 + }, + { + "epoch": 3.304, + "grad_norm": 1.1746406555175781, + "learning_rate": 2e-05, + "loss": 0.03233917, + "step": 1652 + }, + { + "epoch": 3.306, + "grad_norm": 1.3594640493392944, + "learning_rate": 2e-05, + "loss": 0.06445151, + "step": 1653 + }, + { + "epoch": 3.308, + "grad_norm": 1.3005573749542236, + "learning_rate": 2e-05, + "loss": 0.03622051, + "step": 1654 + }, + { + "epoch": 3.31, + "grad_norm": 0.8649536371231079, + "learning_rate": 2e-05, + "loss": 0.03161732, + "step": 1655 + }, + { + "epoch": 3.312, + "grad_norm": 1.1120051145553589, + "learning_rate": 2e-05, + "loss": 0.03670903, + "step": 1656 + }, + { + "epoch": 3.314, + 
"grad_norm": 2.0635063648223877, + "learning_rate": 2e-05, + "loss": 0.03296005, + "step": 1657 + }, + { + "epoch": 3.316, + "grad_norm": 1.942264437675476, + "learning_rate": 2e-05, + "loss": 0.04086999, + "step": 1658 + }, + { + "epoch": 3.318, + "grad_norm": 1.2889881134033203, + "learning_rate": 2e-05, + "loss": 0.03653413, + "step": 1659 + }, + { + "epoch": 3.32, + "grad_norm": 1.5247739553451538, + "learning_rate": 2e-05, + "loss": 0.05022556, + "step": 1660 + }, + { + "epoch": 3.322, + "grad_norm": 1.208240270614624, + "learning_rate": 2e-05, + "loss": 0.04346113, + "step": 1661 + }, + { + "epoch": 3.324, + "grad_norm": 1.5146716833114624, + "learning_rate": 2e-05, + "loss": 0.04634988, + "step": 1662 + }, + { + "epoch": 3.326, + "grad_norm": 1.9538823366165161, + "learning_rate": 2e-05, + "loss": 0.04254812, + "step": 1663 + }, + { + "epoch": 3.328, + "grad_norm": 2.021803379058838, + "learning_rate": 2e-05, + "loss": 0.05505613, + "step": 1664 + }, + { + "epoch": 3.33, + "grad_norm": 1.4671275615692139, + "learning_rate": 2e-05, + "loss": 0.04185315, + "step": 1665 + }, + { + "epoch": 3.332, + "grad_norm": 1.3730452060699463, + "learning_rate": 2e-05, + "loss": 0.04696137, + "step": 1666 + }, + { + "epoch": 3.334, + "grad_norm": 1.9206724166870117, + "learning_rate": 2e-05, + "loss": 0.05052917, + "step": 1667 + }, + { + "epoch": 3.336, + "grad_norm": 1.4539846181869507, + "learning_rate": 2e-05, + "loss": 0.04785593, + "step": 1668 + }, + { + "epoch": 3.338, + "grad_norm": 1.1836252212524414, + "learning_rate": 2e-05, + "loss": 0.03441792, + "step": 1669 + }, + { + "epoch": 3.34, + "grad_norm": 1.3856287002563477, + "learning_rate": 2e-05, + "loss": 0.03965211, + "step": 1670 + }, + { + "epoch": 3.342, + "grad_norm": 1.7527414560317993, + "learning_rate": 2e-05, + "loss": 0.05044956, + "step": 1671 + }, + { + "epoch": 3.344, + "grad_norm": 1.460963249206543, + "learning_rate": 2e-05, + "loss": 0.0464677, + "step": 1672 + }, + { + "epoch": 3.346, + "grad_norm": 2.3444771766662598, + "learning_rate": 2e-05, + "loss": 0.05217274, + "step": 1673 + }, + { + "epoch": 3.348, + "grad_norm": 1.5559501647949219, + "learning_rate": 2e-05, + "loss": 0.04852623, + "step": 1674 + }, + { + "epoch": 3.35, + "grad_norm": 1.9685094356536865, + "learning_rate": 2e-05, + "loss": 0.07367245, + "step": 1675 + }, + { + "epoch": 3.352, + "grad_norm": 1.3340469598770142, + "learning_rate": 2e-05, + "loss": 0.03031318, + "step": 1676 + }, + { + "epoch": 3.354, + "grad_norm": 1.5422812700271606, + "learning_rate": 2e-05, + "loss": 0.04720259, + "step": 1677 + }, + { + "epoch": 3.356, + "grad_norm": 1.6444870233535767, + "learning_rate": 2e-05, + "loss": 0.05606821, + "step": 1678 + }, + { + "epoch": 3.358, + "grad_norm": 2.2023096084594727, + "learning_rate": 2e-05, + "loss": 0.06454101, + "step": 1679 + }, + { + "epoch": 3.36, + "grad_norm": 1.5785565376281738, + "learning_rate": 2e-05, + "loss": 0.04989371, + "step": 1680 + }, + { + "epoch": 3.362, + "grad_norm": 1.6977547407150269, + "learning_rate": 2e-05, + "loss": 0.06437075, + "step": 1681 + }, + { + "epoch": 3.364, + "grad_norm": 2.207683801651001, + "learning_rate": 2e-05, + "loss": 0.05562439, + "step": 1682 + }, + { + "epoch": 3.366, + "grad_norm": 1.5128238201141357, + "learning_rate": 2e-05, + "loss": 0.05253971, + "step": 1683 + }, + { + "epoch": 3.368, + "grad_norm": 1.5503803491592407, + "learning_rate": 2e-05, + "loss": 0.03294653, + "step": 1684 + }, + { + "epoch": 3.37, + "grad_norm": 2.744611978530884, + "learning_rate": 2e-05, + 
"loss": 0.0665991, + "step": 1685 + }, + { + "epoch": 3.372, + "grad_norm": 1.9426320791244507, + "learning_rate": 2e-05, + "loss": 0.03777114, + "step": 1686 + }, + { + "epoch": 3.374, + "grad_norm": 1.7858504056930542, + "learning_rate": 2e-05, + "loss": 0.06296871, + "step": 1687 + }, + { + "epoch": 3.376, + "grad_norm": 1.6751807928085327, + "learning_rate": 2e-05, + "loss": 0.0362752, + "step": 1688 + }, + { + "epoch": 3.378, + "grad_norm": 1.8033543825149536, + "learning_rate": 2e-05, + "loss": 0.05158364, + "step": 1689 + }, + { + "epoch": 3.38, + "grad_norm": 1.0224000215530396, + "learning_rate": 2e-05, + "loss": 0.03424178, + "step": 1690 + }, + { + "epoch": 3.382, + "grad_norm": 1.2834762334823608, + "learning_rate": 2e-05, + "loss": 0.04011747, + "step": 1691 + }, + { + "epoch": 3.384, + "grad_norm": 1.573851466178894, + "learning_rate": 2e-05, + "loss": 0.04472176, + "step": 1692 + }, + { + "epoch": 3.386, + "grad_norm": 1.0216726064682007, + "learning_rate": 2e-05, + "loss": 0.03067708, + "step": 1693 + }, + { + "epoch": 3.388, + "grad_norm": 1.6406019926071167, + "learning_rate": 2e-05, + "loss": 0.06715198, + "step": 1694 + }, + { + "epoch": 3.39, + "grad_norm": 1.9565666913986206, + "learning_rate": 2e-05, + "loss": 0.04441306, + "step": 1695 + }, + { + "epoch": 3.392, + "grad_norm": 1.312374234199524, + "learning_rate": 2e-05, + "loss": 0.04021595, + "step": 1696 + }, + { + "epoch": 3.394, + "grad_norm": 1.5732117891311646, + "learning_rate": 2e-05, + "loss": 0.05825553, + "step": 1697 + }, + { + "epoch": 3.396, + "grad_norm": 1.7435842752456665, + "learning_rate": 2e-05, + "loss": 0.03737545, + "step": 1698 + }, + { + "epoch": 3.398, + "grad_norm": 1.3936766386032104, + "learning_rate": 2e-05, + "loss": 0.04649488, + "step": 1699 + }, + { + "epoch": 3.4, + "grad_norm": 1.222420573234558, + "learning_rate": 2e-05, + "loss": 0.04838939, + "step": 1700 + }, + { + "epoch": 3.402, + "grad_norm": 1.7847157716751099, + "learning_rate": 2e-05, + "loss": 0.04798005, + "step": 1701 + }, + { + "epoch": 3.404, + "grad_norm": 1.8903214931488037, + "learning_rate": 2e-05, + "loss": 0.04042132, + "step": 1702 + }, + { + "epoch": 3.406, + "grad_norm": 1.9824771881103516, + "learning_rate": 2e-05, + "loss": 0.04405872, + "step": 1703 + }, + { + "epoch": 3.408, + "grad_norm": 2.978909730911255, + "learning_rate": 2e-05, + "loss": 0.06235117, + "step": 1704 + }, + { + "epoch": 3.41, + "grad_norm": 2.1281321048736572, + "learning_rate": 2e-05, + "loss": 0.0631174, + "step": 1705 + }, + { + "epoch": 3.412, + "grad_norm": 1.207214117050171, + "learning_rate": 2e-05, + "loss": 0.04360072, + "step": 1706 + }, + { + "epoch": 3.414, + "grad_norm": 2.4614744186401367, + "learning_rate": 2e-05, + "loss": 0.05545459, + "step": 1707 + }, + { + "epoch": 3.416, + "grad_norm": 2.2295455932617188, + "learning_rate": 2e-05, + "loss": 0.05138841, + "step": 1708 + }, + { + "epoch": 3.418, + "grad_norm": 1.6426289081573486, + "learning_rate": 2e-05, + "loss": 0.03976878, + "step": 1709 + }, + { + "epoch": 3.42, + "grad_norm": 2.6790168285369873, + "learning_rate": 2e-05, + "loss": 0.06980205, + "step": 1710 + }, + { + "epoch": 3.422, + "grad_norm": 1.8094228506088257, + "learning_rate": 2e-05, + "loss": 0.05368996, + "step": 1711 + }, + { + "epoch": 3.424, + "grad_norm": 2.0319266319274902, + "learning_rate": 2e-05, + "loss": 0.0543413, + "step": 1712 + }, + { + "epoch": 3.426, + "grad_norm": 1.3606593608856201, + "learning_rate": 2e-05, + "loss": 0.03617115, + "step": 1713 + }, + { + "epoch": 3.428, + 
"grad_norm": 4.077547550201416, + "learning_rate": 2e-05, + "loss": 0.0491549, + "step": 1714 + }, + { + "epoch": 3.43, + "grad_norm": 2.273555040359497, + "learning_rate": 2e-05, + "loss": 0.06512973, + "step": 1715 + }, + { + "epoch": 3.432, + "grad_norm": 1.533727765083313, + "learning_rate": 2e-05, + "loss": 0.05353583, + "step": 1716 + }, + { + "epoch": 3.434, + "grad_norm": 1.5889472961425781, + "learning_rate": 2e-05, + "loss": 0.05391718, + "step": 1717 + }, + { + "epoch": 3.436, + "grad_norm": 1.513043761253357, + "learning_rate": 2e-05, + "loss": 0.05352363, + "step": 1718 + }, + { + "epoch": 3.438, + "grad_norm": 1.708634853363037, + "learning_rate": 2e-05, + "loss": 0.042217, + "step": 1719 + }, + { + "epoch": 3.44, + "grad_norm": 1.5580065250396729, + "learning_rate": 2e-05, + "loss": 0.05138306, + "step": 1720 + }, + { + "epoch": 3.442, + "grad_norm": 1.3362562656402588, + "learning_rate": 2e-05, + "loss": 0.04820231, + "step": 1721 + }, + { + "epoch": 3.444, + "grad_norm": 1.8119944334030151, + "learning_rate": 2e-05, + "loss": 0.03760799, + "step": 1722 + }, + { + "epoch": 3.446, + "grad_norm": 1.7421449422836304, + "learning_rate": 2e-05, + "loss": 0.04001708, + "step": 1723 + }, + { + "epoch": 3.448, + "grad_norm": 1.6809686422348022, + "learning_rate": 2e-05, + "loss": 0.05122007, + "step": 1724 + }, + { + "epoch": 3.45, + "grad_norm": 3.9546453952789307, + "learning_rate": 2e-05, + "loss": 0.04566801, + "step": 1725 + }, + { + "epoch": 3.452, + "grad_norm": 1.462601661682129, + "learning_rate": 2e-05, + "loss": 0.04439667, + "step": 1726 + }, + { + "epoch": 3.454, + "grad_norm": 2.2390847206115723, + "learning_rate": 2e-05, + "loss": 0.03910905, + "step": 1727 + }, + { + "epoch": 3.456, + "grad_norm": 1.7394695281982422, + "learning_rate": 2e-05, + "loss": 0.05194352, + "step": 1728 + }, + { + "epoch": 3.458, + "grad_norm": 2.4958033561706543, + "learning_rate": 2e-05, + "loss": 0.05337561, + "step": 1729 + }, + { + "epoch": 3.46, + "grad_norm": 2.3025898933410645, + "learning_rate": 2e-05, + "loss": 0.04093119, + "step": 1730 + }, + { + "epoch": 3.462, + "grad_norm": 1.6952155828475952, + "learning_rate": 2e-05, + "loss": 0.0470874, + "step": 1731 + }, + { + "epoch": 3.464, + "grad_norm": 1.3717252016067505, + "learning_rate": 2e-05, + "loss": 0.02857566, + "step": 1732 + }, + { + "epoch": 3.466, + "grad_norm": 2.161407232284546, + "learning_rate": 2e-05, + "loss": 0.06686375, + "step": 1733 + }, + { + "epoch": 3.468, + "grad_norm": 1.595219373703003, + "learning_rate": 2e-05, + "loss": 0.05054267, + "step": 1734 + }, + { + "epoch": 3.4699999999999998, + "grad_norm": 1.613440752029419, + "learning_rate": 2e-05, + "loss": 0.02639636, + "step": 1735 + }, + { + "epoch": 3.472, + "grad_norm": 2.263852596282959, + "learning_rate": 2e-05, + "loss": 0.05445278, + "step": 1736 + }, + { + "epoch": 3.474, + "grad_norm": 2.566851854324341, + "learning_rate": 2e-05, + "loss": 0.05587868, + "step": 1737 + }, + { + "epoch": 3.476, + "grad_norm": 2.8093175888061523, + "learning_rate": 2e-05, + "loss": 0.05327387, + "step": 1738 + }, + { + "epoch": 3.4779999999999998, + "grad_norm": 1.6908395290374756, + "learning_rate": 2e-05, + "loss": 0.04662281, + "step": 1739 + }, + { + "epoch": 3.48, + "grad_norm": 3.4398069381713867, + "learning_rate": 2e-05, + "loss": 0.06926405, + "step": 1740 + }, + { + "epoch": 3.482, + "grad_norm": 1.7976182699203491, + "learning_rate": 2e-05, + "loss": 0.05067788, + "step": 1741 + }, + { + "epoch": 3.484, + "grad_norm": 1.8646904230117798, + 
"learning_rate": 2e-05, + "loss": 0.04594427, + "step": 1742 + }, + { + "epoch": 3.4859999999999998, + "grad_norm": 1.8577203750610352, + "learning_rate": 2e-05, + "loss": 0.05025358, + "step": 1743 + }, + { + "epoch": 3.488, + "grad_norm": 1.3906282186508179, + "learning_rate": 2e-05, + "loss": 0.04262529, + "step": 1744 + }, + { + "epoch": 3.49, + "grad_norm": 1.8370797634124756, + "learning_rate": 2e-05, + "loss": 0.05269311, + "step": 1745 + }, + { + "epoch": 3.492, + "grad_norm": 2.2525057792663574, + "learning_rate": 2e-05, + "loss": 0.04886598, + "step": 1746 + }, + { + "epoch": 3.4939999999999998, + "grad_norm": 1.8896589279174805, + "learning_rate": 2e-05, + "loss": 0.05545288, + "step": 1747 + }, + { + "epoch": 3.496, + "grad_norm": 1.3138065338134766, + "learning_rate": 2e-05, + "loss": 0.04557213, + "step": 1748 + }, + { + "epoch": 3.498, + "grad_norm": 2.4323270320892334, + "learning_rate": 2e-05, + "loss": 0.05506615, + "step": 1749 + }, + { + "epoch": 3.5, + "grad_norm": 1.2671258449554443, + "learning_rate": 2e-05, + "loss": 0.03962918, + "step": 1750 + }, + { + "epoch": 3.502, + "grad_norm": 1.2083110809326172, + "learning_rate": 2e-05, + "loss": 0.04052875, + "step": 1751 + }, + { + "epoch": 3.504, + "grad_norm": 2.204509735107422, + "learning_rate": 2e-05, + "loss": 0.05543607, + "step": 1752 + }, + { + "epoch": 3.5060000000000002, + "grad_norm": 2.0916733741760254, + "learning_rate": 2e-05, + "loss": 0.05030297, + "step": 1753 + }, + { + "epoch": 3.508, + "grad_norm": 1.2749862670898438, + "learning_rate": 2e-05, + "loss": 0.05265682, + "step": 1754 + }, + { + "epoch": 3.51, + "grad_norm": 1.2697100639343262, + "learning_rate": 2e-05, + "loss": 0.04578009, + "step": 1755 + }, + { + "epoch": 3.512, + "grad_norm": 2.575673818588257, + "learning_rate": 2e-05, + "loss": 0.05128983, + "step": 1756 + }, + { + "epoch": 3.5140000000000002, + "grad_norm": 1.548751711845398, + "learning_rate": 2e-05, + "loss": 0.04093613, + "step": 1757 + }, + { + "epoch": 3.516, + "grad_norm": 1.305117130279541, + "learning_rate": 2e-05, + "loss": 0.04004899, + "step": 1758 + }, + { + "epoch": 3.518, + "grad_norm": 1.4659463167190552, + "learning_rate": 2e-05, + "loss": 0.04068562, + "step": 1759 + }, + { + "epoch": 3.52, + "grad_norm": 2.3224215507507324, + "learning_rate": 2e-05, + "loss": 0.03831296, + "step": 1760 + }, + { + "epoch": 3.5220000000000002, + "grad_norm": 1.6149941682815552, + "learning_rate": 2e-05, + "loss": 0.0297474, + "step": 1761 + }, + { + "epoch": 3.524, + "grad_norm": 1.7875992059707642, + "learning_rate": 2e-05, + "loss": 0.04237871, + "step": 1762 + }, + { + "epoch": 3.526, + "grad_norm": 1.6973800659179688, + "learning_rate": 2e-05, + "loss": 0.0459117, + "step": 1763 + }, + { + "epoch": 3.528, + "grad_norm": 2.5080761909484863, + "learning_rate": 2e-05, + "loss": 0.05171615, + "step": 1764 + }, + { + "epoch": 3.5300000000000002, + "grad_norm": 1.5514436960220337, + "learning_rate": 2e-05, + "loss": 0.03801546, + "step": 1765 + }, + { + "epoch": 3.532, + "grad_norm": 1.5856919288635254, + "learning_rate": 2e-05, + "loss": 0.03361356, + "step": 1766 + }, + { + "epoch": 3.534, + "grad_norm": 1.5322203636169434, + "learning_rate": 2e-05, + "loss": 0.04025748, + "step": 1767 + }, + { + "epoch": 3.536, + "grad_norm": 2.3193933963775635, + "learning_rate": 2e-05, + "loss": 0.06852245, + "step": 1768 + }, + { + "epoch": 3.5380000000000003, + "grad_norm": 2.101360559463501, + "learning_rate": 2e-05, + "loss": 0.06704597, + "step": 1769 + }, + { + "epoch": 3.54, + 
"grad_norm": 2.1348562240600586, + "learning_rate": 2e-05, + "loss": 0.03175708, + "step": 1770 + }, + { + "epoch": 3.542, + "grad_norm": 1.613877296447754, + "learning_rate": 2e-05, + "loss": 0.05038162, + "step": 1771 + }, + { + "epoch": 3.544, + "grad_norm": 1.5137550830841064, + "learning_rate": 2e-05, + "loss": 0.04188209, + "step": 1772 + }, + { + "epoch": 3.5460000000000003, + "grad_norm": 1.4603201150894165, + "learning_rate": 2e-05, + "loss": 0.03608978, + "step": 1773 + }, + { + "epoch": 3.548, + "grad_norm": 1.6376720666885376, + "learning_rate": 2e-05, + "loss": 0.05980166, + "step": 1774 + }, + { + "epoch": 3.55, + "grad_norm": 2.329392671585083, + "learning_rate": 2e-05, + "loss": 0.04924781, + "step": 1775 + }, + { + "epoch": 3.552, + "grad_norm": 2.361161470413208, + "learning_rate": 2e-05, + "loss": 0.05286585, + "step": 1776 + }, + { + "epoch": 3.5540000000000003, + "grad_norm": 1.530290126800537, + "learning_rate": 2e-05, + "loss": 0.03231184, + "step": 1777 + }, + { + "epoch": 3.556, + "grad_norm": 1.1076228618621826, + "learning_rate": 2e-05, + "loss": 0.02891509, + "step": 1778 + }, + { + "epoch": 3.558, + "grad_norm": 1.950429916381836, + "learning_rate": 2e-05, + "loss": 0.06025671, + "step": 1779 + }, + { + "epoch": 3.56, + "grad_norm": 1.4713891744613647, + "learning_rate": 2e-05, + "loss": 0.04263363, + "step": 1780 + }, + { + "epoch": 3.5620000000000003, + "grad_norm": 1.9598925113677979, + "learning_rate": 2e-05, + "loss": 0.0472781, + "step": 1781 + }, + { + "epoch": 3.564, + "grad_norm": 1.9756190776824951, + "learning_rate": 2e-05, + "loss": 0.05379218, + "step": 1782 + }, + { + "epoch": 3.566, + "grad_norm": 1.9871381521224976, + "learning_rate": 2e-05, + "loss": 0.04629632, + "step": 1783 + }, + { + "epoch": 3.568, + "grad_norm": 2.374814510345459, + "learning_rate": 2e-05, + "loss": 0.05010781, + "step": 1784 + }, + { + "epoch": 3.57, + "grad_norm": 1.6160145998001099, + "learning_rate": 2e-05, + "loss": 0.04810391, + "step": 1785 + }, + { + "epoch": 3.572, + "grad_norm": 1.333565354347229, + "learning_rate": 2e-05, + "loss": 0.04096656, + "step": 1786 + }, + { + "epoch": 3.574, + "grad_norm": 1.2630254030227661, + "learning_rate": 2e-05, + "loss": 0.04580157, + "step": 1787 + }, + { + "epoch": 3.576, + "grad_norm": 2.068432569503784, + "learning_rate": 2e-05, + "loss": 0.06533036, + "step": 1788 + }, + { + "epoch": 3.578, + "grad_norm": 1.422972321510315, + "learning_rate": 2e-05, + "loss": 0.04293206, + "step": 1789 + }, + { + "epoch": 3.58, + "grad_norm": 1.5883013010025024, + "learning_rate": 2e-05, + "loss": 0.03554819, + "step": 1790 + }, + { + "epoch": 3.582, + "grad_norm": 1.2352162599563599, + "learning_rate": 2e-05, + "loss": 0.03722744, + "step": 1791 + }, + { + "epoch": 3.584, + "grad_norm": 1.440484642982483, + "learning_rate": 2e-05, + "loss": 0.03654459, + "step": 1792 + }, + { + "epoch": 3.586, + "grad_norm": 2.2191922664642334, + "learning_rate": 2e-05, + "loss": 0.05253022, + "step": 1793 + }, + { + "epoch": 3.588, + "grad_norm": 1.7797856330871582, + "learning_rate": 2e-05, + "loss": 0.04316117, + "step": 1794 + }, + { + "epoch": 3.59, + "grad_norm": 1.476678729057312, + "learning_rate": 2e-05, + "loss": 0.04034651, + "step": 1795 + }, + { + "epoch": 3.592, + "grad_norm": 1.2704788446426392, + "learning_rate": 2e-05, + "loss": 0.03967745, + "step": 1796 + }, + { + "epoch": 3.594, + "grad_norm": 2.31673002243042, + "learning_rate": 2e-05, + "loss": 0.0395114, + "step": 1797 + }, + { + "epoch": 3.596, + "grad_norm": 1.2823116779327393, + 
"learning_rate": 2e-05, + "loss": 0.04466548, + "step": 1798 + }, + { + "epoch": 3.598, + "grad_norm": 1.1849486827850342, + "learning_rate": 2e-05, + "loss": 0.04506912, + "step": 1799 + }, + { + "epoch": 3.6, + "grad_norm": 1.558552622795105, + "learning_rate": 2e-05, + "loss": 0.05311072, + "step": 1800 + }, + { + "epoch": 3.602, + "grad_norm": 1.5442121028900146, + "learning_rate": 2e-05, + "loss": 0.0526191, + "step": 1801 + }, + { + "epoch": 3.604, + "grad_norm": 1.8791892528533936, + "learning_rate": 2e-05, + "loss": 0.06307591, + "step": 1802 + }, + { + "epoch": 3.606, + "grad_norm": 1.6081328392028809, + "learning_rate": 2e-05, + "loss": 0.03651448, + "step": 1803 + }, + { + "epoch": 3.608, + "grad_norm": 1.4104983806610107, + "learning_rate": 2e-05, + "loss": 0.04336455, + "step": 1804 + }, + { + "epoch": 3.61, + "grad_norm": 2.381561756134033, + "learning_rate": 2e-05, + "loss": 0.05898841, + "step": 1805 + }, + { + "epoch": 3.612, + "grad_norm": 1.3324648141860962, + "learning_rate": 2e-05, + "loss": 0.03454057, + "step": 1806 + }, + { + "epoch": 3.614, + "grad_norm": 1.5297226905822754, + "learning_rate": 2e-05, + "loss": 0.05038363, + "step": 1807 + }, + { + "epoch": 3.616, + "grad_norm": 1.5506995916366577, + "learning_rate": 2e-05, + "loss": 0.05092872, + "step": 1808 + }, + { + "epoch": 3.618, + "grad_norm": 1.181553602218628, + "learning_rate": 2e-05, + "loss": 0.03173397, + "step": 1809 + }, + { + "epoch": 3.62, + "grad_norm": 1.6461181640625, + "learning_rate": 2e-05, + "loss": 0.05175463, + "step": 1810 + }, + { + "epoch": 3.622, + "grad_norm": 1.9184272289276123, + "learning_rate": 2e-05, + "loss": 0.06124686, + "step": 1811 + }, + { + "epoch": 3.624, + "grad_norm": 1.6065723896026611, + "learning_rate": 2e-05, + "loss": 0.04484671, + "step": 1812 + }, + { + "epoch": 3.626, + "grad_norm": 1.443337082862854, + "learning_rate": 2e-05, + "loss": 0.04267926, + "step": 1813 + }, + { + "epoch": 3.628, + "grad_norm": 1.178184151649475, + "learning_rate": 2e-05, + "loss": 0.02920187, + "step": 1814 + }, + { + "epoch": 3.63, + "grad_norm": 1.146735429763794, + "learning_rate": 2e-05, + "loss": 0.02786646, + "step": 1815 + }, + { + "epoch": 3.632, + "grad_norm": 1.1989175081253052, + "learning_rate": 2e-05, + "loss": 0.03249002, + "step": 1816 + }, + { + "epoch": 3.634, + "grad_norm": 1.540666103363037, + "learning_rate": 2e-05, + "loss": 0.03965627, + "step": 1817 + }, + { + "epoch": 3.636, + "grad_norm": 1.4439377784729004, + "learning_rate": 2e-05, + "loss": 0.04156953, + "step": 1818 + }, + { + "epoch": 3.638, + "grad_norm": 1.2657922506332397, + "learning_rate": 2e-05, + "loss": 0.03509516, + "step": 1819 + }, + { + "epoch": 3.64, + "grad_norm": 2.9393463134765625, + "learning_rate": 2e-05, + "loss": 0.04102836, + "step": 1820 + }, + { + "epoch": 3.642, + "grad_norm": 1.263395071029663, + "learning_rate": 2e-05, + "loss": 0.03908949, + "step": 1821 + }, + { + "epoch": 3.644, + "grad_norm": 1.1321371793746948, + "learning_rate": 2e-05, + "loss": 0.03896394, + "step": 1822 + }, + { + "epoch": 3.646, + "grad_norm": 1.1515556573867798, + "learning_rate": 2e-05, + "loss": 0.0317642, + "step": 1823 + }, + { + "epoch": 3.648, + "grad_norm": 1.4203664064407349, + "learning_rate": 2e-05, + "loss": 0.05269869, + "step": 1824 + }, + { + "epoch": 3.65, + "grad_norm": 1.6567198038101196, + "learning_rate": 2e-05, + "loss": 0.03793368, + "step": 1825 + }, + { + "epoch": 3.652, + "grad_norm": 2.2994821071624756, + "learning_rate": 2e-05, + "loss": 0.0429872, + "step": 1826 + }, + { + 
"epoch": 3.654, + "grad_norm": 1.6607061624526978, + "learning_rate": 2e-05, + "loss": 0.03379823, + "step": 1827 + }, + { + "epoch": 3.656, + "grad_norm": 1.7107027769088745, + "learning_rate": 2e-05, + "loss": 0.04675277, + "step": 1828 + }, + { + "epoch": 3.658, + "grad_norm": 1.8303852081298828, + "learning_rate": 2e-05, + "loss": 0.04841805, + "step": 1829 + }, + { + "epoch": 3.66, + "grad_norm": 1.8825572729110718, + "learning_rate": 2e-05, + "loss": 0.04874373, + "step": 1830 + }, + { + "epoch": 3.662, + "grad_norm": 1.7029788494110107, + "learning_rate": 2e-05, + "loss": 0.05050893, + "step": 1831 + }, + { + "epoch": 3.664, + "grad_norm": 1.1050704717636108, + "learning_rate": 2e-05, + "loss": 0.03281447, + "step": 1832 + }, + { + "epoch": 3.666, + "grad_norm": 1.4161999225616455, + "learning_rate": 2e-05, + "loss": 0.0544338, + "step": 1833 + }, + { + "epoch": 3.668, + "grad_norm": 1.964059829711914, + "learning_rate": 2e-05, + "loss": 0.03872099, + "step": 1834 + }, + { + "epoch": 3.67, + "grad_norm": 2.016484022140503, + "learning_rate": 2e-05, + "loss": 0.05875476, + "step": 1835 + }, + { + "epoch": 3.672, + "grad_norm": 1.473386287689209, + "learning_rate": 2e-05, + "loss": 0.03022585, + "step": 1836 + }, + { + "epoch": 3.674, + "grad_norm": 1.348415493965149, + "learning_rate": 2e-05, + "loss": 0.04690105, + "step": 1837 + }, + { + "epoch": 3.676, + "grad_norm": 0.9426001310348511, + "learning_rate": 2e-05, + "loss": 0.02384089, + "step": 1838 + }, + { + "epoch": 3.678, + "grad_norm": 3.1298086643218994, + "learning_rate": 2e-05, + "loss": 0.06468902, + "step": 1839 + }, + { + "epoch": 3.68, + "grad_norm": 1.0675461292266846, + "learning_rate": 2e-05, + "loss": 0.03523882, + "step": 1840 + }, + { + "epoch": 3.682, + "grad_norm": 0.8646126985549927, + "learning_rate": 2e-05, + "loss": 0.01838133, + "step": 1841 + }, + { + "epoch": 3.684, + "grad_norm": 1.141176700592041, + "learning_rate": 2e-05, + "loss": 0.03865944, + "step": 1842 + }, + { + "epoch": 3.686, + "grad_norm": 2.4700798988342285, + "learning_rate": 2e-05, + "loss": 0.04170835, + "step": 1843 + }, + { + "epoch": 3.6879999999999997, + "grad_norm": 1.2326807975769043, + "learning_rate": 2e-05, + "loss": 0.03476126, + "step": 1844 + }, + { + "epoch": 3.69, + "grad_norm": 1.714464545249939, + "learning_rate": 2e-05, + "loss": 0.03549024, + "step": 1845 + }, + { + "epoch": 3.692, + "grad_norm": 1.038811206817627, + "learning_rate": 2e-05, + "loss": 0.02916805, + "step": 1846 + }, + { + "epoch": 3.694, + "grad_norm": 2.0106890201568604, + "learning_rate": 2e-05, + "loss": 0.04282944, + "step": 1847 + }, + { + "epoch": 3.6959999999999997, + "grad_norm": 1.4869582653045654, + "learning_rate": 2e-05, + "loss": 0.0293393, + "step": 1848 + }, + { + "epoch": 3.698, + "grad_norm": 1.699388861656189, + "learning_rate": 2e-05, + "loss": 0.03086464, + "step": 1849 + }, + { + "epoch": 3.7, + "grad_norm": 1.3598302602767944, + "learning_rate": 2e-05, + "loss": 0.03127223, + "step": 1850 + }, + { + "epoch": 3.702, + "grad_norm": 1.3136847019195557, + "learning_rate": 2e-05, + "loss": 0.03167934, + "step": 1851 + }, + { + "epoch": 3.7039999999999997, + "grad_norm": 1.9143792390823364, + "learning_rate": 2e-05, + "loss": 0.03092597, + "step": 1852 + }, + { + "epoch": 3.706, + "grad_norm": 1.9676464796066284, + "learning_rate": 2e-05, + "loss": 0.04451537, + "step": 1853 + }, + { + "epoch": 3.708, + "grad_norm": 0.9787660241127014, + "learning_rate": 2e-05, + "loss": 0.02394712, + "step": 1854 + }, + { + "epoch": 3.71, + "grad_norm": 
2.257038116455078, + "learning_rate": 2e-05, + "loss": 0.04062683, + "step": 1855 + }, + { + "epoch": 3.7119999999999997, + "grad_norm": 1.5607311725616455, + "learning_rate": 2e-05, + "loss": 0.02945242, + "step": 1856 + }, + { + "epoch": 3.714, + "grad_norm": 1.4506772756576538, + "learning_rate": 2e-05, + "loss": 0.04430741, + "step": 1857 + }, + { + "epoch": 3.716, + "grad_norm": 2.7266697883605957, + "learning_rate": 2e-05, + "loss": 0.03880604, + "step": 1858 + }, + { + "epoch": 3.718, + "grad_norm": 1.774509072303772, + "learning_rate": 2e-05, + "loss": 0.04546557, + "step": 1859 + }, + { + "epoch": 3.7199999999999998, + "grad_norm": 2.044501543045044, + "learning_rate": 2e-05, + "loss": 0.05731916, + "step": 1860 + }, + { + "epoch": 3.722, + "grad_norm": 1.1676855087280273, + "learning_rate": 2e-05, + "loss": 0.03595718, + "step": 1861 + }, + { + "epoch": 3.724, + "grad_norm": 1.1329604387283325, + "learning_rate": 2e-05, + "loss": 0.03459671, + "step": 1862 + }, + { + "epoch": 3.726, + "grad_norm": 1.5749149322509766, + "learning_rate": 2e-05, + "loss": 0.03534314, + "step": 1863 + }, + { + "epoch": 3.7279999999999998, + "grad_norm": 2.428170680999756, + "learning_rate": 2e-05, + "loss": 0.08179246, + "step": 1864 + }, + { + "epoch": 3.73, + "grad_norm": 1.7956899404525757, + "learning_rate": 2e-05, + "loss": 0.06140782, + "step": 1865 + }, + { + "epoch": 3.732, + "grad_norm": 1.7499301433563232, + "learning_rate": 2e-05, + "loss": 0.04886729, + "step": 1866 + }, + { + "epoch": 3.734, + "grad_norm": 1.4074490070343018, + "learning_rate": 2e-05, + "loss": 0.04775254, + "step": 1867 + }, + { + "epoch": 3.7359999999999998, + "grad_norm": 1.8201322555541992, + "learning_rate": 2e-05, + "loss": 0.04804891, + "step": 1868 + }, + { + "epoch": 3.738, + "grad_norm": 1.9848655462265015, + "learning_rate": 2e-05, + "loss": 0.05379272, + "step": 1869 + }, + { + "epoch": 3.74, + "grad_norm": 1.2629104852676392, + "learning_rate": 2e-05, + "loss": 0.03760821, + "step": 1870 + }, + { + "epoch": 3.742, + "grad_norm": 1.1420437097549438, + "learning_rate": 2e-05, + "loss": 0.02983224, + "step": 1871 + }, + { + "epoch": 3.7439999999999998, + "grad_norm": 1.4850512742996216, + "learning_rate": 2e-05, + "loss": 0.04077002, + "step": 1872 + }, + { + "epoch": 3.746, + "grad_norm": 1.3481850624084473, + "learning_rate": 2e-05, + "loss": 0.04353747, + "step": 1873 + }, + { + "epoch": 3.748, + "grad_norm": 1.613179326057434, + "learning_rate": 2e-05, + "loss": 0.04657446, + "step": 1874 + }, + { + "epoch": 3.75, + "grad_norm": 1.436995029449463, + "learning_rate": 2e-05, + "loss": 0.04443494, + "step": 1875 + }, + { + "epoch": 3.752, + "grad_norm": 2.0734074115753174, + "learning_rate": 2e-05, + "loss": 0.05218915, + "step": 1876 + }, + { + "epoch": 3.754, + "grad_norm": 1.7502717971801758, + "learning_rate": 2e-05, + "loss": 0.0336446, + "step": 1877 + }, + { + "epoch": 3.7560000000000002, + "grad_norm": 1.8787667751312256, + "learning_rate": 2e-05, + "loss": 0.04104609, + "step": 1878 + }, + { + "epoch": 3.758, + "grad_norm": 2.121859550476074, + "learning_rate": 2e-05, + "loss": 0.04827151, + "step": 1879 + }, + { + "epoch": 3.76, + "grad_norm": 2.4621684551239014, + "learning_rate": 2e-05, + "loss": 0.04723193, + "step": 1880 + }, + { + "epoch": 3.762, + "grad_norm": 2.063389778137207, + "learning_rate": 2e-05, + "loss": 0.04516068, + "step": 1881 + }, + { + "epoch": 3.7640000000000002, + "grad_norm": 1.6342523097991943, + "learning_rate": 2e-05, + "loss": 0.05464017, + "step": 1882 + }, + { + 
"epoch": 3.766, + "grad_norm": 1.649129867553711, + "learning_rate": 2e-05, + "loss": 0.06015805, + "step": 1883 + }, + { + "epoch": 3.768, + "grad_norm": 1.4318526983261108, + "learning_rate": 2e-05, + "loss": 0.030782, + "step": 1884 + }, + { + "epoch": 3.77, + "grad_norm": 1.4619206190109253, + "learning_rate": 2e-05, + "loss": 0.04468732, + "step": 1885 + }, + { + "epoch": 3.7720000000000002, + "grad_norm": 2.3498082160949707, + "learning_rate": 2e-05, + "loss": 0.03591853, + "step": 1886 + }, + { + "epoch": 3.774, + "grad_norm": 2.933544397354126, + "learning_rate": 2e-05, + "loss": 0.04159092, + "step": 1887 + }, + { + "epoch": 3.776, + "grad_norm": 3.0242841243743896, + "learning_rate": 2e-05, + "loss": 0.07077288, + "step": 1888 + }, + { + "epoch": 3.778, + "grad_norm": 0.9896936416625977, + "learning_rate": 2e-05, + "loss": 0.0333928, + "step": 1889 + }, + { + "epoch": 3.7800000000000002, + "grad_norm": 2.85379958152771, + "learning_rate": 2e-05, + "loss": 0.04417305, + "step": 1890 + }, + { + "epoch": 3.782, + "grad_norm": 1.0257110595703125, + "learning_rate": 2e-05, + "loss": 0.03142691, + "step": 1891 + }, + { + "epoch": 3.784, + "grad_norm": 1.1791033744812012, + "learning_rate": 2e-05, + "loss": 0.03500428, + "step": 1892 + }, + { + "epoch": 3.786, + "grad_norm": 1.6889530420303345, + "learning_rate": 2e-05, + "loss": 0.05240297, + "step": 1893 + }, + { + "epoch": 3.7880000000000003, + "grad_norm": 1.7314518690109253, + "learning_rate": 2e-05, + "loss": 0.04307431, + "step": 1894 + }, + { + "epoch": 3.79, + "grad_norm": 1.1842002868652344, + "learning_rate": 2e-05, + "loss": 0.04247685, + "step": 1895 + }, + { + "epoch": 3.792, + "grad_norm": 2.100853681564331, + "learning_rate": 2e-05, + "loss": 0.05021702, + "step": 1896 + }, + { + "epoch": 3.794, + "grad_norm": 1.5216481685638428, + "learning_rate": 2e-05, + "loss": 0.03309655, + "step": 1897 + }, + { + "epoch": 3.7960000000000003, + "grad_norm": 2.276388168334961, + "learning_rate": 2e-05, + "loss": 0.05299261, + "step": 1898 + }, + { + "epoch": 3.798, + "grad_norm": 2.0013959407806396, + "learning_rate": 2e-05, + "loss": 0.04387742, + "step": 1899 + }, + { + "epoch": 3.8, + "grad_norm": 1.5426726341247559, + "learning_rate": 2e-05, + "loss": 0.04475781, + "step": 1900 + }, + { + "epoch": 3.802, + "grad_norm": 1.730944037437439, + "learning_rate": 2e-05, + "loss": 0.04797804, + "step": 1901 + }, + { + "epoch": 3.8040000000000003, + "grad_norm": 3.512629747390747, + "learning_rate": 2e-05, + "loss": 0.06400887, + "step": 1902 + }, + { + "epoch": 3.806, + "grad_norm": 1.5255076885223389, + "learning_rate": 2e-05, + "loss": 0.03512551, + "step": 1903 + }, + { + "epoch": 3.808, + "grad_norm": 1.6803914308547974, + "learning_rate": 2e-05, + "loss": 0.04707751, + "step": 1904 + }, + { + "epoch": 3.81, + "grad_norm": 1.6464742422103882, + "learning_rate": 2e-05, + "loss": 0.04364287, + "step": 1905 + }, + { + "epoch": 3.8120000000000003, + "grad_norm": 1.5913269519805908, + "learning_rate": 2e-05, + "loss": 0.05464795, + "step": 1906 + }, + { + "epoch": 3.814, + "grad_norm": 1.8374617099761963, + "learning_rate": 2e-05, + "loss": 0.04795897, + "step": 1907 + }, + { + "epoch": 3.816, + "grad_norm": 2.12955641746521, + "learning_rate": 2e-05, + "loss": 0.06177934, + "step": 1908 + }, + { + "epoch": 3.818, + "grad_norm": 1.9309619665145874, + "learning_rate": 2e-05, + "loss": 0.02248725, + "step": 1909 + }, + { + "epoch": 3.82, + "grad_norm": 1.7270091772079468, + "learning_rate": 2e-05, + "loss": 0.03895885, + "step": 1910 + }, + 
{ + "epoch": 3.822, + "grad_norm": 1.6948938369750977, + "learning_rate": 2e-05, + "loss": 0.06202386, + "step": 1911 + }, + { + "epoch": 3.824, + "grad_norm": 1.287054181098938, + "learning_rate": 2e-05, + "loss": 0.03878157, + "step": 1912 + }, + { + "epoch": 3.826, + "grad_norm": 1.2202260494232178, + "learning_rate": 2e-05, + "loss": 0.03554882, + "step": 1913 + }, + { + "epoch": 3.828, + "grad_norm": 0.9808116555213928, + "learning_rate": 2e-05, + "loss": 0.02788639, + "step": 1914 + }, + { + "epoch": 3.83, + "grad_norm": 1.3717739582061768, + "learning_rate": 2e-05, + "loss": 0.04405915, + "step": 1915 + }, + { + "epoch": 3.832, + "grad_norm": 1.2487013339996338, + "learning_rate": 2e-05, + "loss": 0.03764036, + "step": 1916 + }, + { + "epoch": 3.834, + "grad_norm": 1.122071623802185, + "learning_rate": 2e-05, + "loss": 0.0333156, + "step": 1917 + }, + { + "epoch": 3.836, + "grad_norm": 1.3125051259994507, + "learning_rate": 2e-05, + "loss": 0.04306629, + "step": 1918 + }, + { + "epoch": 3.838, + "grad_norm": 1.7904739379882812, + "learning_rate": 2e-05, + "loss": 0.0361197, + "step": 1919 + }, + { + "epoch": 3.84, + "grad_norm": 1.1583505868911743, + "learning_rate": 2e-05, + "loss": 0.03510105, + "step": 1920 + }, + { + "epoch": 3.842, + "grad_norm": 1.6849479675292969, + "learning_rate": 2e-05, + "loss": 0.04817548, + "step": 1921 + }, + { + "epoch": 3.844, + "grad_norm": 2.617076873779297, + "learning_rate": 2e-05, + "loss": 0.06620353, + "step": 1922 + }, + { + "epoch": 3.846, + "grad_norm": 2.478874683380127, + "learning_rate": 2e-05, + "loss": 0.04455588, + "step": 1923 + }, + { + "epoch": 3.848, + "grad_norm": 1.146331548690796, + "learning_rate": 2e-05, + "loss": 0.04490707, + "step": 1924 + }, + { + "epoch": 3.85, + "grad_norm": 1.0759429931640625, + "learning_rate": 2e-05, + "loss": 0.03621227, + "step": 1925 + }, + { + "epoch": 3.852, + "grad_norm": 1.6324303150177002, + "learning_rate": 2e-05, + "loss": 0.04010741, + "step": 1926 + }, + { + "epoch": 3.854, + "grad_norm": 2.0158345699310303, + "learning_rate": 2e-05, + "loss": 0.05399124, + "step": 1927 + }, + { + "epoch": 3.856, + "grad_norm": 1.5364727973937988, + "learning_rate": 2e-05, + "loss": 0.04785627, + "step": 1928 + }, + { + "epoch": 3.858, + "grad_norm": 1.6611193418502808, + "learning_rate": 2e-05, + "loss": 0.03403072, + "step": 1929 + }, + { + "epoch": 3.86, + "grad_norm": 2.6205263137817383, + "learning_rate": 2e-05, + "loss": 0.04760019, + "step": 1930 + }, + { + "epoch": 3.862, + "grad_norm": 1.1166229248046875, + "learning_rate": 2e-05, + "loss": 0.03396893, + "step": 1931 + }, + { + "epoch": 3.864, + "grad_norm": 2.4609854221343994, + "learning_rate": 2e-05, + "loss": 0.05028281, + "step": 1932 + }, + { + "epoch": 3.866, + "grad_norm": 1.2173758745193481, + "learning_rate": 2e-05, + "loss": 0.03039585, + "step": 1933 + }, + { + "epoch": 3.868, + "grad_norm": 1.3882139921188354, + "learning_rate": 2e-05, + "loss": 0.04112496, + "step": 1934 + }, + { + "epoch": 3.87, + "grad_norm": 1.486579418182373, + "learning_rate": 2e-05, + "loss": 0.03768492, + "step": 1935 + }, + { + "epoch": 3.872, + "grad_norm": 1.4759503602981567, + "learning_rate": 2e-05, + "loss": 0.03338876, + "step": 1936 + }, + { + "epoch": 3.874, + "grad_norm": 1.290807843208313, + "learning_rate": 2e-05, + "loss": 0.03643284, + "step": 1937 + }, + { + "epoch": 3.876, + "grad_norm": 1.520533800125122, + "learning_rate": 2e-05, + "loss": 0.04213966, + "step": 1938 + }, + { + "epoch": 3.878, + "grad_norm": 1.262356162071228, + 
"learning_rate": 2e-05, + "loss": 0.04538347, + "step": 1939 + }, + { + "epoch": 3.88, + "grad_norm": 2.381612777709961, + "learning_rate": 2e-05, + "loss": 0.03353626, + "step": 1940 + }, + { + "epoch": 3.882, + "grad_norm": 1.6339051723480225, + "learning_rate": 2e-05, + "loss": 0.04302477, + "step": 1941 + }, + { + "epoch": 3.884, + "grad_norm": 1.4285898208618164, + "learning_rate": 2e-05, + "loss": 0.05180392, + "step": 1942 + }, + { + "epoch": 3.886, + "grad_norm": 1.1873326301574707, + "learning_rate": 2e-05, + "loss": 0.03653756, + "step": 1943 + }, + { + "epoch": 3.888, + "grad_norm": 1.3497165441513062, + "learning_rate": 2e-05, + "loss": 0.03856751, + "step": 1944 + }, + { + "epoch": 3.89, + "grad_norm": 1.258852481842041, + "learning_rate": 2e-05, + "loss": 0.04039215, + "step": 1945 + }, + { + "epoch": 3.892, + "grad_norm": 1.983088731765747, + "learning_rate": 2e-05, + "loss": 0.04817604, + "step": 1946 + }, + { + "epoch": 3.894, + "grad_norm": 2.92964768409729, + "learning_rate": 2e-05, + "loss": 0.053841, + "step": 1947 + }, + { + "epoch": 3.896, + "grad_norm": 1.1992443799972534, + "learning_rate": 2e-05, + "loss": 0.04172537, + "step": 1948 + }, + { + "epoch": 3.898, + "grad_norm": 1.1795976161956787, + "learning_rate": 2e-05, + "loss": 0.03664827, + "step": 1949 + }, + { + "epoch": 3.9, + "grad_norm": 1.2616045475006104, + "learning_rate": 2e-05, + "loss": 0.04682916, + "step": 1950 + }, + { + "epoch": 3.902, + "grad_norm": 3.0420405864715576, + "learning_rate": 2e-05, + "loss": 0.0575822, + "step": 1951 + }, + { + "epoch": 3.904, + "grad_norm": 1.0600016117095947, + "learning_rate": 2e-05, + "loss": 0.03584382, + "step": 1952 + }, + { + "epoch": 3.906, + "grad_norm": 1.396815299987793, + "learning_rate": 2e-05, + "loss": 0.03886579, + "step": 1953 + }, + { + "epoch": 3.908, + "grad_norm": 2.188819408416748, + "learning_rate": 2e-05, + "loss": 0.06812514, + "step": 1954 + }, + { + "epoch": 3.91, + "grad_norm": 1.5482600927352905, + "learning_rate": 2e-05, + "loss": 0.03375176, + "step": 1955 + }, + { + "epoch": 3.912, + "grad_norm": 1.212693691253662, + "learning_rate": 2e-05, + "loss": 0.0395262, + "step": 1956 + }, + { + "epoch": 3.914, + "grad_norm": 2.2088680267333984, + "learning_rate": 2e-05, + "loss": 0.03797888, + "step": 1957 + }, + { + "epoch": 3.916, + "grad_norm": 1.1913862228393555, + "learning_rate": 2e-05, + "loss": 0.02954198, + "step": 1958 + }, + { + "epoch": 3.918, + "grad_norm": 1.874579906463623, + "learning_rate": 2e-05, + "loss": 0.05844161, + "step": 1959 + }, + { + "epoch": 3.92, + "grad_norm": 1.4673166275024414, + "learning_rate": 2e-05, + "loss": 0.05150342, + "step": 1960 + }, + { + "epoch": 3.922, + "grad_norm": 2.861666679382324, + "learning_rate": 2e-05, + "loss": 0.05179626, + "step": 1961 + }, + { + "epoch": 3.924, + "grad_norm": 1.771929144859314, + "learning_rate": 2e-05, + "loss": 0.04365551, + "step": 1962 + }, + { + "epoch": 3.926, + "grad_norm": 1.3273606300354004, + "learning_rate": 2e-05, + "loss": 0.05284184, + "step": 1963 + }, + { + "epoch": 3.928, + "grad_norm": 1.9596598148345947, + "learning_rate": 2e-05, + "loss": 0.05003749, + "step": 1964 + }, + { + "epoch": 3.93, + "grad_norm": 2.103850841522217, + "learning_rate": 2e-05, + "loss": 0.04942531, + "step": 1965 + }, + { + "epoch": 3.932, + "grad_norm": 1.1391785144805908, + "learning_rate": 2e-05, + "loss": 0.02786987, + "step": 1966 + }, + { + "epoch": 3.934, + "grad_norm": 1.7148336172103882, + "learning_rate": 2e-05, + "loss": 0.04135071, + "step": 1967 + }, + { + 
"epoch": 3.936, + "grad_norm": 1.2991632223129272, + "learning_rate": 2e-05, + "loss": 0.06019251, + "step": 1968 + }, + { + "epoch": 3.9379999999999997, + "grad_norm": 1.5509945154190063, + "learning_rate": 2e-05, + "loss": 0.03836875, + "step": 1969 + }, + { + "epoch": 3.94, + "grad_norm": 1.69589364528656, + "learning_rate": 2e-05, + "loss": 0.05219426, + "step": 1970 + }, + { + "epoch": 3.942, + "grad_norm": 1.6247663497924805, + "learning_rate": 2e-05, + "loss": 0.05570246, + "step": 1971 + }, + { + "epoch": 3.944, + "grad_norm": 1.2264633178710938, + "learning_rate": 2e-05, + "loss": 0.04570191, + "step": 1972 + }, + { + "epoch": 3.9459999999999997, + "grad_norm": 2.1913163661956787, + "learning_rate": 2e-05, + "loss": 0.05165469, + "step": 1973 + }, + { + "epoch": 3.948, + "grad_norm": 1.0560282468795776, + "learning_rate": 2e-05, + "loss": 0.02876104, + "step": 1974 + }, + { + "epoch": 3.95, + "grad_norm": 1.5731909275054932, + "learning_rate": 2e-05, + "loss": 0.04167511, + "step": 1975 + }, + { + "epoch": 3.952, + "grad_norm": 1.399192452430725, + "learning_rate": 2e-05, + "loss": 0.04265817, + "step": 1976 + }, + { + "epoch": 3.9539999999999997, + "grad_norm": 1.8346514701843262, + "learning_rate": 2e-05, + "loss": 0.04521923, + "step": 1977 + }, + { + "epoch": 3.956, + "grad_norm": 1.6072944402694702, + "learning_rate": 2e-05, + "loss": 0.04234926, + "step": 1978 + }, + { + "epoch": 3.958, + "grad_norm": 1.2165111303329468, + "learning_rate": 2e-05, + "loss": 0.03402507, + "step": 1979 + }, + { + "epoch": 3.96, + "grad_norm": 1.2799257040023804, + "learning_rate": 2e-05, + "loss": 0.0419311, + "step": 1980 + }, + { + "epoch": 3.9619999999999997, + "grad_norm": 1.89944589138031, + "learning_rate": 2e-05, + "loss": 0.06260209, + "step": 1981 + }, + { + "epoch": 3.964, + "grad_norm": 1.6802691221237183, + "learning_rate": 2e-05, + "loss": 0.06025416, + "step": 1982 + }, + { + "epoch": 3.966, + "grad_norm": 1.2507396936416626, + "learning_rate": 2e-05, + "loss": 0.03038237, + "step": 1983 + }, + { + "epoch": 3.968, + "grad_norm": 1.3977298736572266, + "learning_rate": 2e-05, + "loss": 0.04241063, + "step": 1984 + }, + { + "epoch": 3.9699999999999998, + "grad_norm": 1.399631381034851, + "learning_rate": 2e-05, + "loss": 0.03736674, + "step": 1985 + }, + { + "epoch": 3.972, + "grad_norm": 1.1037839651107788, + "learning_rate": 2e-05, + "loss": 0.0263209, + "step": 1986 + }, + { + "epoch": 3.974, + "grad_norm": 1.7092723846435547, + "learning_rate": 2e-05, + "loss": 0.04478872, + "step": 1987 + }, + { + "epoch": 3.976, + "grad_norm": 2.23612117767334, + "learning_rate": 2e-05, + "loss": 0.03863956, + "step": 1988 + }, + { + "epoch": 3.9779999999999998, + "grad_norm": 1.2527269124984741, + "learning_rate": 2e-05, + "loss": 0.04319048, + "step": 1989 + }, + { + "epoch": 3.98, + "grad_norm": 1.2724004983901978, + "learning_rate": 2e-05, + "loss": 0.04712515, + "step": 1990 + }, + { + "epoch": 3.982, + "grad_norm": 1.1430199146270752, + "learning_rate": 2e-05, + "loss": 0.03822499, + "step": 1991 + }, + { + "epoch": 3.984, + "grad_norm": 1.8308535814285278, + "learning_rate": 2e-05, + "loss": 0.0425014, + "step": 1992 + }, + { + "epoch": 3.9859999999999998, + "grad_norm": 1.5834559202194214, + "learning_rate": 2e-05, + "loss": 0.04654464, + "step": 1993 + }, + { + "epoch": 3.988, + "grad_norm": 1.4839645624160767, + "learning_rate": 2e-05, + "loss": 0.06130801, + "step": 1994 + }, + { + "epoch": 3.99, + "grad_norm": 1.290731430053711, + "learning_rate": 2e-05, + "loss": 0.03394639, + 
"step": 1995 + }, + { + "epoch": 3.992, + "grad_norm": 1.468553066253662, + "learning_rate": 2e-05, + "loss": 0.05200931, + "step": 1996 + }, + { + "epoch": 3.9939999999999998, + "grad_norm": 2.330862522125244, + "learning_rate": 2e-05, + "loss": 0.04680311, + "step": 1997 + }, + { + "epoch": 3.996, + "grad_norm": 1.8820016384124756, + "learning_rate": 2e-05, + "loss": 0.06257437, + "step": 1998 + }, + { + "epoch": 3.998, + "grad_norm": 1.4825798273086548, + "learning_rate": 2e-05, + "loss": 0.03755412, + "step": 1999 + }, + { + "epoch": 4.0, + "grad_norm": 1.8379441499710083, + "learning_rate": 2e-05, + "loss": 0.04851647, + "step": 2000 + }, + { + "epoch": 4.0, + "eval_performance": { + "AngleClassification_1": 0.986, + "AngleClassification_2": 0.978, + "AngleClassification_3": 0.7784431137724551, + "Equal_1": 0.972, + "Equal_2": 0.874251497005988, + "Equal_3": 0.7445109780439122, + "LineComparison_1": 0.988, + "LineComparison_2": 0.9860279441117764, + "LineComparison_3": 0.9481037924151696, + "Parallel_1": 0.9919839679358717, + "Parallel_2": 0.9779559118236473, + "Parallel_3": 0.752, + "Perpendicular_1": 0.974, + "Perpendicular_2": 0.512, + "Perpendicular_3": 0.2565130260521042, + "PointLiesOnCircle_1": 0.9979959919839679, + "PointLiesOnCircle_2": 0.9976666666666667, + "PointLiesOnCircle_3": 0.9385333333333333, + "PointLiesOnLine_1": 0.9819639278557114, + "PointLiesOnLine_2": 0.9579158316633266, + "PointLiesOnLine_3": 0.5249500998003992 + }, + "eval_runtime": 321.6565, + "eval_samples_per_second": 32.644, + "eval_steps_per_second": 0.653, + "step": 2000 + }, + { + "epoch": 4.002, + "grad_norm": 2.689896583557129, + "learning_rate": 2e-05, + "loss": 0.06689534, + "step": 2001 + }, + { + "epoch": 4.004, + "grad_norm": 2.3726253509521484, + "learning_rate": 2e-05, + "loss": 0.06310491, + "step": 2002 + }, + { + "epoch": 4.006, + "grad_norm": 1.7566965818405151, + "learning_rate": 2e-05, + "loss": 0.05225249, + "step": 2003 + }, + { + "epoch": 4.008, + "grad_norm": 1.3011726140975952, + "learning_rate": 2e-05, + "loss": 0.03929686, + "step": 2004 + }, + { + "epoch": 4.01, + "grad_norm": 1.2295416593551636, + "learning_rate": 2e-05, + "loss": 0.03298599, + "step": 2005 + }, + { + "epoch": 4.012, + "grad_norm": 1.9119962453842163, + "learning_rate": 2e-05, + "loss": 0.04884803, + "step": 2006 + }, + { + "epoch": 4.014, + "grad_norm": 1.5606404542922974, + "learning_rate": 2e-05, + "loss": 0.04282958, + "step": 2007 + }, + { + "epoch": 4.016, + "grad_norm": 2.755392074584961, + "learning_rate": 2e-05, + "loss": 0.05961449, + "step": 2008 + }, + { + "epoch": 4.018, + "grad_norm": 1.6373087167739868, + "learning_rate": 2e-05, + "loss": 0.05367161, + "step": 2009 + }, + { + "epoch": 4.02, + "grad_norm": 2.529672861099243, + "learning_rate": 2e-05, + "loss": 0.04300452, + "step": 2010 + }, + { + "epoch": 4.022, + "grad_norm": 1.8200832605361938, + "learning_rate": 2e-05, + "loss": 0.05623111, + "step": 2011 + }, + { + "epoch": 4.024, + "grad_norm": 1.3841335773468018, + "learning_rate": 2e-05, + "loss": 0.03216402, + "step": 2012 + }, + { + "epoch": 4.026, + "grad_norm": 2.039104700088501, + "learning_rate": 2e-05, + "loss": 0.0455704, + "step": 2013 + }, + { + "epoch": 4.028, + "grad_norm": 2.0491440296173096, + "learning_rate": 2e-05, + "loss": 0.03618041, + "step": 2014 + }, + { + "epoch": 4.03, + "grad_norm": 1.554465889930725, + "learning_rate": 2e-05, + "loss": 0.02181048, + "step": 2015 + }, + { + "epoch": 4.032, + "grad_norm": 1.2771095037460327, + "learning_rate": 2e-05, + "loss": 
0.03514237, + "step": 2016 + }, + { + "epoch": 4.034, + "grad_norm": 1.587876558303833, + "learning_rate": 2e-05, + "loss": 0.04905714, + "step": 2017 + }, + { + "epoch": 4.036, + "grad_norm": 2.0158097743988037, + "learning_rate": 2e-05, + "loss": 0.05387116, + "step": 2018 + }, + { + "epoch": 4.038, + "grad_norm": 2.1845550537109375, + "learning_rate": 2e-05, + "loss": 0.06457246, + "step": 2019 + }, + { + "epoch": 4.04, + "grad_norm": 1.155949354171753, + "learning_rate": 2e-05, + "loss": 0.02948808, + "step": 2020 + }, + { + "epoch": 4.042, + "grad_norm": 1.5860084295272827, + "learning_rate": 2e-05, + "loss": 0.05309289, + "step": 2021 + }, + { + "epoch": 4.044, + "grad_norm": 1.1510523557662964, + "learning_rate": 2e-05, + "loss": 0.04007091, + "step": 2022 + }, + { + "epoch": 4.046, + "grad_norm": 2.096463680267334, + "learning_rate": 2e-05, + "loss": 0.05365406, + "step": 2023 + }, + { + "epoch": 4.048, + "grad_norm": 1.4027256965637207, + "learning_rate": 2e-05, + "loss": 0.03679424, + "step": 2024 + }, + { + "epoch": 4.05, + "grad_norm": 1.503724455833435, + "learning_rate": 2e-05, + "loss": 0.03995445, + "step": 2025 + }, + { + "epoch": 4.052, + "grad_norm": 3.1506829261779785, + "learning_rate": 2e-05, + "loss": 0.07259705, + "step": 2026 + }, + { + "epoch": 4.054, + "grad_norm": 1.8490493297576904, + "learning_rate": 2e-05, + "loss": 0.04446945, + "step": 2027 + }, + { + "epoch": 4.056, + "grad_norm": 1.203890323638916, + "learning_rate": 2e-05, + "loss": 0.04504851, + "step": 2028 + }, + { + "epoch": 4.058, + "grad_norm": 2.1894469261169434, + "learning_rate": 2e-05, + "loss": 0.03918433, + "step": 2029 + }, + { + "epoch": 4.06, + "grad_norm": 1.990956425666809, + "learning_rate": 2e-05, + "loss": 0.03764991, + "step": 2030 + }, + { + "epoch": 4.062, + "grad_norm": 1.7570072412490845, + "learning_rate": 2e-05, + "loss": 0.04656107, + "step": 2031 + }, + { + "epoch": 4.064, + "grad_norm": 1.1472625732421875, + "learning_rate": 2e-05, + "loss": 0.02630271, + "step": 2032 + }, + { + "epoch": 4.066, + "grad_norm": 2.1167778968811035, + "learning_rate": 2e-05, + "loss": 0.03036687, + "step": 2033 + }, + { + "epoch": 4.068, + "grad_norm": 1.1994069814682007, + "learning_rate": 2e-05, + "loss": 0.03812218, + "step": 2034 + }, + { + "epoch": 4.07, + "grad_norm": 2.758052349090576, + "learning_rate": 2e-05, + "loss": 0.0450132, + "step": 2035 + }, + { + "epoch": 4.072, + "grad_norm": 1.4564862251281738, + "learning_rate": 2e-05, + "loss": 0.03845803, + "step": 2036 + }, + { + "epoch": 4.074, + "grad_norm": 1.1046653985977173, + "learning_rate": 2e-05, + "loss": 0.0331301, + "step": 2037 + }, + { + "epoch": 4.076, + "grad_norm": 2.1612513065338135, + "learning_rate": 2e-05, + "loss": 0.05518108, + "step": 2038 + }, + { + "epoch": 4.078, + "grad_norm": 1.5514698028564453, + "learning_rate": 2e-05, + "loss": 0.03647344, + "step": 2039 + }, + { + "epoch": 4.08, + "grad_norm": 2.8072474002838135, + "learning_rate": 2e-05, + "loss": 0.04705288, + "step": 2040 + }, + { + "epoch": 4.082, + "grad_norm": 1.4394696950912476, + "learning_rate": 2e-05, + "loss": 0.03800115, + "step": 2041 + }, + { + "epoch": 4.084, + "grad_norm": 1.580580711364746, + "learning_rate": 2e-05, + "loss": 0.03440983, + "step": 2042 + }, + { + "epoch": 4.086, + "grad_norm": 2.7896625995635986, + "learning_rate": 2e-05, + "loss": 0.0448301, + "step": 2043 + }, + { + "epoch": 4.088, + "grad_norm": 1.8632712364196777, + "learning_rate": 2e-05, + "loss": 0.03617346, + "step": 2044 + }, + { + "epoch": 4.09, + "grad_norm": 
1.8190380334854126, + "learning_rate": 2e-05, + "loss": 0.0343765, + "step": 2045 + }, + { + "epoch": 4.092, + "grad_norm": 1.6877806186676025, + "learning_rate": 2e-05, + "loss": 0.04055114, + "step": 2046 + }, + { + "epoch": 4.094, + "grad_norm": 2.117882013320923, + "learning_rate": 2e-05, + "loss": 0.048363, + "step": 2047 + }, + { + "epoch": 4.096, + "grad_norm": 2.1963632106781006, + "learning_rate": 2e-05, + "loss": 0.05769034, + "step": 2048 + }, + { + "epoch": 4.098, + "grad_norm": 2.3755226135253906, + "learning_rate": 2e-05, + "loss": 0.07200582, + "step": 2049 + }, + { + "epoch": 4.1, + "grad_norm": 1.4905271530151367, + "learning_rate": 2e-05, + "loss": 0.0467202, + "step": 2050 + }, + { + "epoch": 4.102, + "grad_norm": 1.6176154613494873, + "learning_rate": 2e-05, + "loss": 0.05290875, + "step": 2051 + }, + { + "epoch": 4.104, + "grad_norm": 1.9842934608459473, + "learning_rate": 2e-05, + "loss": 0.0542838, + "step": 2052 + }, + { + "epoch": 4.106, + "grad_norm": 1.1524161100387573, + "learning_rate": 2e-05, + "loss": 0.03312632, + "step": 2053 + }, + { + "epoch": 4.108, + "grad_norm": 1.7908684015274048, + "learning_rate": 2e-05, + "loss": 0.03798482, + "step": 2054 + }, + { + "epoch": 4.11, + "grad_norm": 1.8829431533813477, + "learning_rate": 2e-05, + "loss": 0.03899117, + "step": 2055 + }, + { + "epoch": 4.112, + "grad_norm": 1.7515336275100708, + "learning_rate": 2e-05, + "loss": 0.04303076, + "step": 2056 + }, + { + "epoch": 4.114, + "grad_norm": 1.6121231317520142, + "learning_rate": 2e-05, + "loss": 0.04086188, + "step": 2057 + }, + { + "epoch": 4.116, + "grad_norm": 1.3123854398727417, + "learning_rate": 2e-05, + "loss": 0.04525847, + "step": 2058 + }, + { + "epoch": 4.118, + "grad_norm": 2.75618314743042, + "learning_rate": 2e-05, + "loss": 0.05385438, + "step": 2059 + }, + { + "epoch": 4.12, + "grad_norm": 1.0119373798370361, + "learning_rate": 2e-05, + "loss": 0.02559848, + "step": 2060 + }, + { + "epoch": 4.122, + "grad_norm": 1.6642717123031616, + "learning_rate": 2e-05, + "loss": 0.0500881, + "step": 2061 + }, + { + "epoch": 4.124, + "grad_norm": 1.3954622745513916, + "learning_rate": 2e-05, + "loss": 0.03746237, + "step": 2062 + }, + { + "epoch": 4.126, + "grad_norm": 1.5439870357513428, + "learning_rate": 2e-05, + "loss": 0.05281479, + "step": 2063 + }, + { + "epoch": 4.128, + "grad_norm": 1.3099309206008911, + "learning_rate": 2e-05, + "loss": 0.03846513, + "step": 2064 + }, + { + "epoch": 4.13, + "grad_norm": 1.7383625507354736, + "learning_rate": 2e-05, + "loss": 0.04843453, + "step": 2065 + }, + { + "epoch": 4.132, + "grad_norm": 1.8208372592926025, + "learning_rate": 2e-05, + "loss": 0.05107617, + "step": 2066 + }, + { + "epoch": 4.134, + "grad_norm": 2.1992876529693604, + "learning_rate": 2e-05, + "loss": 0.05615732, + "step": 2067 + }, + { + "epoch": 4.136, + "grad_norm": 1.8214548826217651, + "learning_rate": 2e-05, + "loss": 0.05642605, + "step": 2068 + }, + { + "epoch": 4.138, + "grad_norm": 1.3719658851623535, + "learning_rate": 2e-05, + "loss": 0.04263125, + "step": 2069 + }, + { + "epoch": 4.14, + "grad_norm": 1.8242491483688354, + "learning_rate": 2e-05, + "loss": 0.0747356, + "step": 2070 + }, + { + "epoch": 4.142, + "grad_norm": 2.162917375564575, + "learning_rate": 2e-05, + "loss": 0.05902673, + "step": 2071 + }, + { + "epoch": 4.144, + "grad_norm": 1.3159345388412476, + "learning_rate": 2e-05, + "loss": 0.04596569, + "step": 2072 + }, + { + "epoch": 4.146, + "grad_norm": 1.8809359073638916, + "learning_rate": 2e-05, + "loss": 0.05340003, + 
"step": 2073 + }, + { + "epoch": 4.148, + "grad_norm": 1.67131769657135, + "learning_rate": 2e-05, + "loss": 0.03571043, + "step": 2074 + }, + { + "epoch": 4.15, + "grad_norm": 1.7250995635986328, + "learning_rate": 2e-05, + "loss": 0.0564668, + "step": 2075 + }, + { + "epoch": 4.152, + "grad_norm": 1.3855060338974, + "learning_rate": 2e-05, + "loss": 0.04370652, + "step": 2076 + }, + { + "epoch": 4.154, + "grad_norm": 1.1010857820510864, + "learning_rate": 2e-05, + "loss": 0.03384936, + "step": 2077 + }, + { + "epoch": 4.156, + "grad_norm": 2.0639870166778564, + "learning_rate": 2e-05, + "loss": 0.03384199, + "step": 2078 + }, + { + "epoch": 4.158, + "grad_norm": 1.4212632179260254, + "learning_rate": 2e-05, + "loss": 0.03396594, + "step": 2079 + }, + { + "epoch": 4.16, + "grad_norm": 2.1525704860687256, + "learning_rate": 2e-05, + "loss": 0.05491355, + "step": 2080 + }, + { + "epoch": 4.162, + "grad_norm": 1.7113659381866455, + "learning_rate": 2e-05, + "loss": 0.04376032, + "step": 2081 + }, + { + "epoch": 4.164, + "grad_norm": 1.223374366760254, + "learning_rate": 2e-05, + "loss": 0.03837118, + "step": 2082 + }, + { + "epoch": 4.166, + "grad_norm": 1.7024589776992798, + "learning_rate": 2e-05, + "loss": 0.04713647, + "step": 2083 + }, + { + "epoch": 4.168, + "grad_norm": 1.7157230377197266, + "learning_rate": 2e-05, + "loss": 0.05556125, + "step": 2084 + }, + { + "epoch": 4.17, + "grad_norm": 1.5272091627120972, + "learning_rate": 2e-05, + "loss": 0.04492594, + "step": 2085 + }, + { + "epoch": 4.172, + "grad_norm": 1.2956258058547974, + "learning_rate": 2e-05, + "loss": 0.04288995, + "step": 2086 + }, + { + "epoch": 4.174, + "grad_norm": 1.3768620491027832, + "learning_rate": 2e-05, + "loss": 0.02881315, + "step": 2087 + }, + { + "epoch": 4.176, + "grad_norm": 1.5189354419708252, + "learning_rate": 2e-05, + "loss": 0.05045771, + "step": 2088 + }, + { + "epoch": 4.178, + "grad_norm": 1.8950650691986084, + "learning_rate": 2e-05, + "loss": 0.05032203, + "step": 2089 + }, + { + "epoch": 4.18, + "grad_norm": 1.9955432415008545, + "learning_rate": 2e-05, + "loss": 0.02519334, + "step": 2090 + }, + { + "epoch": 4.182, + "grad_norm": 1.4326856136322021, + "learning_rate": 2e-05, + "loss": 0.03771212, + "step": 2091 + }, + { + "epoch": 4.184, + "grad_norm": 1.1596758365631104, + "learning_rate": 2e-05, + "loss": 0.02768318, + "step": 2092 + }, + { + "epoch": 4.186, + "grad_norm": 1.1794545650482178, + "learning_rate": 2e-05, + "loss": 0.03789561, + "step": 2093 + }, + { + "epoch": 4.188, + "grad_norm": 0.9308399558067322, + "learning_rate": 2e-05, + "loss": 0.02805153, + "step": 2094 + }, + { + "epoch": 4.19, + "grad_norm": 1.1102815866470337, + "learning_rate": 2e-05, + "loss": 0.02839583, + "step": 2095 + }, + { + "epoch": 4.192, + "grad_norm": 1.020172357559204, + "learning_rate": 2e-05, + "loss": 0.03357933, + "step": 2096 + }, + { + "epoch": 4.194, + "grad_norm": 1.7360633611679077, + "learning_rate": 2e-05, + "loss": 0.05581947, + "step": 2097 + }, + { + "epoch": 4.196, + "grad_norm": 1.2498151063919067, + "learning_rate": 2e-05, + "loss": 0.03497965, + "step": 2098 + }, + { + "epoch": 4.198, + "grad_norm": 1.4616661071777344, + "learning_rate": 2e-05, + "loss": 0.04537179, + "step": 2099 + }, + { + "epoch": 4.2, + "grad_norm": 1.6234368085861206, + "learning_rate": 2e-05, + "loss": 0.0405514, + "step": 2100 + }, + { + "epoch": 4.202, + "grad_norm": 1.50863516330719, + "learning_rate": 2e-05, + "loss": 0.03994677, + "step": 2101 + }, + { + "epoch": 4.204, + "grad_norm": 
2.2599105834960938, + "learning_rate": 2e-05, + "loss": 0.04103535, + "step": 2102 + }, + { + "epoch": 4.206, + "grad_norm": 1.642195224761963, + "learning_rate": 2e-05, + "loss": 0.02920062, + "step": 2103 + }, + { + "epoch": 4.208, + "grad_norm": 2.0362472534179688, + "learning_rate": 2e-05, + "loss": 0.03564665, + "step": 2104 + }, + { + "epoch": 4.21, + "grad_norm": 2.113086223602295, + "learning_rate": 2e-05, + "loss": 0.04320297, + "step": 2105 + }, + { + "epoch": 4.212, + "grad_norm": 1.1269563436508179, + "learning_rate": 2e-05, + "loss": 0.02520046, + "step": 2106 + }, + { + "epoch": 4.214, + "grad_norm": 2.32094144821167, + "learning_rate": 2e-05, + "loss": 0.06330884, + "step": 2107 + }, + { + "epoch": 4.216, + "grad_norm": 1.5615673065185547, + "learning_rate": 2e-05, + "loss": 0.0331628, + "step": 2108 + }, + { + "epoch": 4.218, + "grad_norm": 0.9397850632667542, + "learning_rate": 2e-05, + "loss": 0.02459784, + "step": 2109 + }, + { + "epoch": 4.22, + "grad_norm": 1.9889795780181885, + "learning_rate": 2e-05, + "loss": 0.03781548, + "step": 2110 + }, + { + "epoch": 4.222, + "grad_norm": 1.8364139795303345, + "learning_rate": 2e-05, + "loss": 0.03902372, + "step": 2111 + }, + { + "epoch": 4.224, + "grad_norm": 1.2772095203399658, + "learning_rate": 2e-05, + "loss": 0.0316204, + "step": 2112 + }, + { + "epoch": 4.226, + "grad_norm": 2.7459285259246826, + "learning_rate": 2e-05, + "loss": 0.06582908, + "step": 2113 + }, + { + "epoch": 4.228, + "grad_norm": 1.5086251497268677, + "learning_rate": 2e-05, + "loss": 0.04670456, + "step": 2114 + }, + { + "epoch": 4.23, + "grad_norm": 2.2771124839782715, + "learning_rate": 2e-05, + "loss": 0.07231095, + "step": 2115 + }, + { + "epoch": 4.232, + "grad_norm": 1.7897669076919556, + "learning_rate": 2e-05, + "loss": 0.04256113, + "step": 2116 + }, + { + "epoch": 4.234, + "grad_norm": 1.2214994430541992, + "learning_rate": 2e-05, + "loss": 0.03983855, + "step": 2117 + }, + { + "epoch": 4.236, + "grad_norm": 1.4263696670532227, + "learning_rate": 2e-05, + "loss": 0.03172107, + "step": 2118 + }, + { + "epoch": 4.2379999999999995, + "grad_norm": 2.4489498138427734, + "learning_rate": 2e-05, + "loss": 0.05162663, + "step": 2119 + }, + { + "epoch": 4.24, + "grad_norm": 1.7401384115219116, + "learning_rate": 2e-05, + "loss": 0.0567259, + "step": 2120 + }, + { + "epoch": 4.242, + "grad_norm": 2.355661153793335, + "learning_rate": 2e-05, + "loss": 0.04312699, + "step": 2121 + }, + { + "epoch": 4.244, + "grad_norm": 0.8899902701377869, + "learning_rate": 2e-05, + "loss": 0.02231044, + "step": 2122 + }, + { + "epoch": 4.246, + "grad_norm": 1.3299999237060547, + "learning_rate": 2e-05, + "loss": 0.03867707, + "step": 2123 + }, + { + "epoch": 4.248, + "grad_norm": 2.1780574321746826, + "learning_rate": 2e-05, + "loss": 0.05338889, + "step": 2124 + }, + { + "epoch": 4.25, + "grad_norm": 1.6817718744277954, + "learning_rate": 2e-05, + "loss": 0.04446897, + "step": 2125 + }, + { + "epoch": 4.252, + "grad_norm": 1.7029000520706177, + "learning_rate": 2e-05, + "loss": 0.03678191, + "step": 2126 + }, + { + "epoch": 4.254, + "grad_norm": 1.601068139076233, + "learning_rate": 2e-05, + "loss": 0.04554681, + "step": 2127 + }, + { + "epoch": 4.256, + "grad_norm": 2.417534828186035, + "learning_rate": 2e-05, + "loss": 0.06136102, + "step": 2128 + }, + { + "epoch": 4.258, + "grad_norm": 2.018188953399658, + "learning_rate": 2e-05, + "loss": 0.06119014, + "step": 2129 + }, + { + "epoch": 4.26, + "grad_norm": 2.963576078414917, + "learning_rate": 2e-05, + "loss": 
0.06493448, + "step": 2130 + }, + { + "epoch": 4.2620000000000005, + "grad_norm": 1.6151946783065796, + "learning_rate": 2e-05, + "loss": 0.05280596, + "step": 2131 + }, + { + "epoch": 4.264, + "grad_norm": 1.7862824201583862, + "learning_rate": 2e-05, + "loss": 0.03816576, + "step": 2132 + }, + { + "epoch": 4.266, + "grad_norm": 2.704598903656006, + "learning_rate": 2e-05, + "loss": 0.06200828, + "step": 2133 + }, + { + "epoch": 4.268, + "grad_norm": 1.2484264373779297, + "learning_rate": 2e-05, + "loss": 0.02782817, + "step": 2134 + }, + { + "epoch": 4.27, + "grad_norm": 1.6894198656082153, + "learning_rate": 2e-05, + "loss": 0.0363794, + "step": 2135 + }, + { + "epoch": 4.272, + "grad_norm": 1.383814811706543, + "learning_rate": 2e-05, + "loss": 0.0340181, + "step": 2136 + }, + { + "epoch": 4.274, + "grad_norm": 1.5126454830169678, + "learning_rate": 2e-05, + "loss": 0.04073911, + "step": 2137 + }, + { + "epoch": 4.276, + "grad_norm": 1.4518946409225464, + "learning_rate": 2e-05, + "loss": 0.04448681, + "step": 2138 + }, + { + "epoch": 4.2780000000000005, + "grad_norm": 1.3890987634658813, + "learning_rate": 2e-05, + "loss": 0.02615726, + "step": 2139 + }, + { + "epoch": 4.28, + "grad_norm": 1.6702042818069458, + "learning_rate": 2e-05, + "loss": 0.05457231, + "step": 2140 + }, + { + "epoch": 4.282, + "grad_norm": 2.1039538383483887, + "learning_rate": 2e-05, + "loss": 0.04261757, + "step": 2141 + }, + { + "epoch": 4.284, + "grad_norm": 1.6950783729553223, + "learning_rate": 2e-05, + "loss": 0.05137327, + "step": 2142 + }, + { + "epoch": 4.286, + "grad_norm": 1.2793657779693604, + "learning_rate": 2e-05, + "loss": 0.03890596, + "step": 2143 + }, + { + "epoch": 4.288, + "grad_norm": 1.0241049528121948, + "learning_rate": 2e-05, + "loss": 0.02962255, + "step": 2144 + }, + { + "epoch": 4.29, + "grad_norm": 1.3482152223587036, + "learning_rate": 2e-05, + "loss": 0.04371033, + "step": 2145 + }, + { + "epoch": 4.292, + "grad_norm": 1.825649619102478, + "learning_rate": 2e-05, + "loss": 0.05308119, + "step": 2146 + }, + { + "epoch": 4.294, + "grad_norm": 1.4800101518630981, + "learning_rate": 2e-05, + "loss": 0.0398606, + "step": 2147 + }, + { + "epoch": 4.296, + "grad_norm": 1.812041163444519, + "learning_rate": 2e-05, + "loss": 0.04485359, + "step": 2148 + }, + { + "epoch": 4.298, + "grad_norm": 1.0440144538879395, + "learning_rate": 2e-05, + "loss": 0.04142414, + "step": 2149 + }, + { + "epoch": 4.3, + "grad_norm": 1.4665697813034058, + "learning_rate": 2e-05, + "loss": 0.03760947, + "step": 2150 + }, + { + "epoch": 4.302, + "grad_norm": 1.5210576057434082, + "learning_rate": 2e-05, + "loss": 0.03952883, + "step": 2151 + }, + { + "epoch": 4.304, + "grad_norm": 1.5270603895187378, + "learning_rate": 2e-05, + "loss": 0.04734674, + "step": 2152 + }, + { + "epoch": 4.306, + "grad_norm": 2.6761133670806885, + "learning_rate": 2e-05, + "loss": 0.05432082, + "step": 2153 + }, + { + "epoch": 4.308, + "grad_norm": 1.2086443901062012, + "learning_rate": 2e-05, + "loss": 0.03031913, + "step": 2154 + }, + { + "epoch": 4.31, + "grad_norm": 1.5387802124023438, + "learning_rate": 2e-05, + "loss": 0.0282026, + "step": 2155 + }, + { + "epoch": 4.312, + "grad_norm": 1.694658875465393, + "learning_rate": 2e-05, + "loss": 0.05619892, + "step": 2156 + }, + { + "epoch": 4.314, + "grad_norm": 1.5027580261230469, + "learning_rate": 2e-05, + "loss": 0.04229368, + "step": 2157 + }, + { + "epoch": 4.316, + "grad_norm": 1.3247681856155396, + "learning_rate": 2e-05, + "loss": 0.0459209, + "step": 2158 + }, + { + 
"epoch": 4.318, + "grad_norm": 0.9879952669143677, + "learning_rate": 2e-05, + "loss": 0.03440651, + "step": 2159 + }, + { + "epoch": 4.32, + "grad_norm": 1.2517679929733276, + "learning_rate": 2e-05, + "loss": 0.0451726, + "step": 2160 + }, + { + "epoch": 4.322, + "grad_norm": 1.853196620941162, + "learning_rate": 2e-05, + "loss": 0.03265027, + "step": 2161 + }, + { + "epoch": 4.324, + "grad_norm": 1.710205316543579, + "learning_rate": 2e-05, + "loss": 0.04459624, + "step": 2162 + }, + { + "epoch": 4.326, + "grad_norm": 1.6105256080627441, + "learning_rate": 2e-05, + "loss": 0.06459425, + "step": 2163 + }, + { + "epoch": 4.328, + "grad_norm": 1.3017997741699219, + "learning_rate": 2e-05, + "loss": 0.05446584, + "step": 2164 + }, + { + "epoch": 4.33, + "grad_norm": 1.122078776359558, + "learning_rate": 2e-05, + "loss": 0.0316535, + "step": 2165 + }, + { + "epoch": 4.332, + "grad_norm": 1.4984065294265747, + "learning_rate": 2e-05, + "loss": 0.03722906, + "step": 2166 + }, + { + "epoch": 4.334, + "grad_norm": 1.2994331121444702, + "learning_rate": 2e-05, + "loss": 0.0416957, + "step": 2167 + }, + { + "epoch": 4.336, + "grad_norm": 2.6297388076782227, + "learning_rate": 2e-05, + "loss": 0.05040634, + "step": 2168 + }, + { + "epoch": 4.338, + "grad_norm": 1.6323825120925903, + "learning_rate": 2e-05, + "loss": 0.03700216, + "step": 2169 + }, + { + "epoch": 4.34, + "grad_norm": 1.054328203201294, + "learning_rate": 2e-05, + "loss": 0.03690439, + "step": 2170 + }, + { + "epoch": 4.342, + "grad_norm": 1.3024274110794067, + "learning_rate": 2e-05, + "loss": 0.04472798, + "step": 2171 + }, + { + "epoch": 4.344, + "grad_norm": 1.3375612497329712, + "learning_rate": 2e-05, + "loss": 0.03631008, + "step": 2172 + }, + { + "epoch": 4.346, + "grad_norm": 1.0233983993530273, + "learning_rate": 2e-05, + "loss": 0.03837822, + "step": 2173 + }, + { + "epoch": 4.348, + "grad_norm": 2.863868474960327, + "learning_rate": 2e-05, + "loss": 0.04723253, + "step": 2174 + }, + { + "epoch": 4.35, + "grad_norm": 1.1577509641647339, + "learning_rate": 2e-05, + "loss": 0.04589339, + "step": 2175 + }, + { + "epoch": 4.352, + "grad_norm": 1.5174366235733032, + "learning_rate": 2e-05, + "loss": 0.03971744, + "step": 2176 + }, + { + "epoch": 4.354, + "grad_norm": 1.822798490524292, + "learning_rate": 2e-05, + "loss": 0.04629911, + "step": 2177 + }, + { + "epoch": 4.356, + "grad_norm": 2.334824562072754, + "learning_rate": 2e-05, + "loss": 0.04560362, + "step": 2178 + }, + { + "epoch": 4.358, + "grad_norm": 1.554010033607483, + "learning_rate": 2e-05, + "loss": 0.04555508, + "step": 2179 + }, + { + "epoch": 4.36, + "grad_norm": 1.4686816930770874, + "learning_rate": 2e-05, + "loss": 0.0364374, + "step": 2180 + }, + { + "epoch": 4.362, + "grad_norm": 1.435239315032959, + "learning_rate": 2e-05, + "loss": 0.03441803, + "step": 2181 + }, + { + "epoch": 4.364, + "grad_norm": 1.2505661249160767, + "learning_rate": 2e-05, + "loss": 0.03555541, + "step": 2182 + }, + { + "epoch": 4.366, + "grad_norm": 2.002845048904419, + "learning_rate": 2e-05, + "loss": 0.07516761, + "step": 2183 + }, + { + "epoch": 4.368, + "grad_norm": 2.682558059692383, + "learning_rate": 2e-05, + "loss": 0.0619045, + "step": 2184 + }, + { + "epoch": 4.37, + "grad_norm": 1.4598617553710938, + "learning_rate": 2e-05, + "loss": 0.02849967, + "step": 2185 + }, + { + "epoch": 4.372, + "grad_norm": 2.2034389972686768, + "learning_rate": 2e-05, + "loss": 0.05034188, + "step": 2186 + }, + { + "epoch": 4.374, + "grad_norm": 1.2549974918365479, + "learning_rate": 2e-05, 
+ "loss": 0.03955576, + "step": 2187 + }, + { + "epoch": 4.376, + "grad_norm": 1.4328988790512085, + "learning_rate": 2e-05, + "loss": 0.04043427, + "step": 2188 + }, + { + "epoch": 4.378, + "grad_norm": 1.1671024560928345, + "learning_rate": 2e-05, + "loss": 0.03745596, + "step": 2189 + }, + { + "epoch": 4.38, + "grad_norm": 1.1555709838867188, + "learning_rate": 2e-05, + "loss": 0.03046387, + "step": 2190 + }, + { + "epoch": 4.382, + "grad_norm": 2.62123703956604, + "learning_rate": 2e-05, + "loss": 0.03433674, + "step": 2191 + }, + { + "epoch": 4.384, + "grad_norm": 1.5227280855178833, + "learning_rate": 2e-05, + "loss": 0.02560518, + "step": 2192 + }, + { + "epoch": 4.386, + "grad_norm": 1.8068288564682007, + "learning_rate": 2e-05, + "loss": 0.0400132, + "step": 2193 + }, + { + "epoch": 4.388, + "grad_norm": 1.201744794845581, + "learning_rate": 2e-05, + "loss": 0.04389308, + "step": 2194 + }, + { + "epoch": 4.39, + "grad_norm": 1.1598666906356812, + "learning_rate": 2e-05, + "loss": 0.04563789, + "step": 2195 + }, + { + "epoch": 4.392, + "grad_norm": 1.885544776916504, + "learning_rate": 2e-05, + "loss": 0.03769413, + "step": 2196 + }, + { + "epoch": 4.394, + "grad_norm": 1.3363451957702637, + "learning_rate": 2e-05, + "loss": 0.03677461, + "step": 2197 + }, + { + "epoch": 4.396, + "grad_norm": 1.197109341621399, + "learning_rate": 2e-05, + "loss": 0.03512957, + "step": 2198 + }, + { + "epoch": 4.398, + "grad_norm": 1.189475417137146, + "learning_rate": 2e-05, + "loss": 0.05237271, + "step": 2199 + }, + { + "epoch": 4.4, + "grad_norm": 0.9987121224403381, + "learning_rate": 2e-05, + "loss": 0.01947338, + "step": 2200 + }, + { + "epoch": 4.402, + "grad_norm": 1.0477992296218872, + "learning_rate": 2e-05, + "loss": 0.0338387, + "step": 2201 + }, + { + "epoch": 4.404, + "grad_norm": 2.6130008697509766, + "learning_rate": 2e-05, + "loss": 0.04995614, + "step": 2202 + }, + { + "epoch": 4.406, + "grad_norm": 1.40412437915802, + "learning_rate": 2e-05, + "loss": 0.04779441, + "step": 2203 + }, + { + "epoch": 4.408, + "grad_norm": 2.32883358001709, + "learning_rate": 2e-05, + "loss": 0.04897148, + "step": 2204 + }, + { + "epoch": 4.41, + "grad_norm": 3.4168405532836914, + "learning_rate": 2e-05, + "loss": 0.05897108, + "step": 2205 + }, + { + "epoch": 4.412, + "grad_norm": 1.6130071878433228, + "learning_rate": 2e-05, + "loss": 0.03696927, + "step": 2206 + }, + { + "epoch": 4.414, + "grad_norm": 1.5120912790298462, + "learning_rate": 2e-05, + "loss": 0.036088, + "step": 2207 + }, + { + "epoch": 4.416, + "grad_norm": 1.9834599494934082, + "learning_rate": 2e-05, + "loss": 0.06459089, + "step": 2208 + }, + { + "epoch": 4.418, + "grad_norm": 2.0346577167510986, + "learning_rate": 2e-05, + "loss": 0.05516463, + "step": 2209 + }, + { + "epoch": 4.42, + "grad_norm": 1.2212589979171753, + "learning_rate": 2e-05, + "loss": 0.0262365, + "step": 2210 + }, + { + "epoch": 4.422, + "grad_norm": 1.5008301734924316, + "learning_rate": 2e-05, + "loss": 0.04188941, + "step": 2211 + }, + { + "epoch": 4.424, + "grad_norm": 0.9288143515586853, + "learning_rate": 2e-05, + "loss": 0.02780385, + "step": 2212 + }, + { + "epoch": 4.426, + "grad_norm": 1.2318435907363892, + "learning_rate": 2e-05, + "loss": 0.02683347, + "step": 2213 + }, + { + "epoch": 4.428, + "grad_norm": 1.711323618888855, + "learning_rate": 2e-05, + "loss": 0.04659836, + "step": 2214 + }, + { + "epoch": 4.43, + "grad_norm": 1.3409011363983154, + "learning_rate": 2e-05, + "loss": 0.04741023, + "step": 2215 + }, + { + "epoch": 4.432, + 
"grad_norm": 1.6755443811416626, + "learning_rate": 2e-05, + "loss": 0.05947503, + "step": 2216 + }, + { + "epoch": 4.434, + "grad_norm": 2.3621582984924316, + "learning_rate": 2e-05, + "loss": 0.06439509, + "step": 2217 + }, + { + "epoch": 4.436, + "grad_norm": 1.113885760307312, + "learning_rate": 2e-05, + "loss": 0.03710642, + "step": 2218 + }, + { + "epoch": 4.438, + "grad_norm": 0.9438255429267883, + "learning_rate": 2e-05, + "loss": 0.02726935, + "step": 2219 + }, + { + "epoch": 4.44, + "grad_norm": 1.442383050918579, + "learning_rate": 2e-05, + "loss": 0.04566254, + "step": 2220 + }, + { + "epoch": 4.442, + "grad_norm": 0.9339209794998169, + "learning_rate": 2e-05, + "loss": 0.02968401, + "step": 2221 + }, + { + "epoch": 4.444, + "grad_norm": 1.852805256843567, + "learning_rate": 2e-05, + "loss": 0.05942626, + "step": 2222 + }, + { + "epoch": 4.446, + "grad_norm": 2.1776721477508545, + "learning_rate": 2e-05, + "loss": 0.06122681, + "step": 2223 + }, + { + "epoch": 4.448, + "grad_norm": 1.7097100019454956, + "learning_rate": 2e-05, + "loss": 0.02926885, + "step": 2224 + }, + { + "epoch": 4.45, + "grad_norm": 1.3947697877883911, + "learning_rate": 2e-05, + "loss": 0.03958159, + "step": 2225 + }, + { + "epoch": 4.452, + "grad_norm": 1.4915926456451416, + "learning_rate": 2e-05, + "loss": 0.03779658, + "step": 2226 + }, + { + "epoch": 4.454, + "grad_norm": 1.4534882307052612, + "learning_rate": 2e-05, + "loss": 0.02921861, + "step": 2227 + }, + { + "epoch": 4.456, + "grad_norm": 1.5068776607513428, + "learning_rate": 2e-05, + "loss": 0.03718809, + "step": 2228 + }, + { + "epoch": 4.458, + "grad_norm": 1.521750569343567, + "learning_rate": 2e-05, + "loss": 0.04906987, + "step": 2229 + }, + { + "epoch": 4.46, + "grad_norm": 1.9581565856933594, + "learning_rate": 2e-05, + "loss": 0.04215892, + "step": 2230 + }, + { + "epoch": 4.462, + "grad_norm": 1.3763012886047363, + "learning_rate": 2e-05, + "loss": 0.03585676, + "step": 2231 + }, + { + "epoch": 4.464, + "grad_norm": 1.682054042816162, + "learning_rate": 2e-05, + "loss": 0.03892795, + "step": 2232 + }, + { + "epoch": 4.466, + "grad_norm": 1.1888952255249023, + "learning_rate": 2e-05, + "loss": 0.02804256, + "step": 2233 + }, + { + "epoch": 4.468, + "grad_norm": 1.0312952995300293, + "learning_rate": 2e-05, + "loss": 0.04241008, + "step": 2234 + }, + { + "epoch": 4.47, + "grad_norm": 0.9027299284934998, + "learning_rate": 2e-05, + "loss": 0.02175211, + "step": 2235 + }, + { + "epoch": 4.4719999999999995, + "grad_norm": 1.6123967170715332, + "learning_rate": 2e-05, + "loss": 0.04372674, + "step": 2236 + }, + { + "epoch": 4.474, + "grad_norm": 1.2008719444274902, + "learning_rate": 2e-05, + "loss": 0.03309236, + "step": 2237 + }, + { + "epoch": 4.476, + "grad_norm": 1.4866458177566528, + "learning_rate": 2e-05, + "loss": 0.04360201, + "step": 2238 + }, + { + "epoch": 4.478, + "grad_norm": 1.139384150505066, + "learning_rate": 2e-05, + "loss": 0.02492278, + "step": 2239 + }, + { + "epoch": 4.48, + "grad_norm": 1.314674973487854, + "learning_rate": 2e-05, + "loss": 0.03167442, + "step": 2240 + }, + { + "epoch": 4.482, + "grad_norm": 1.1558375358581543, + "learning_rate": 2e-05, + "loss": 0.03370375, + "step": 2241 + }, + { + "epoch": 4.484, + "grad_norm": 0.98810875415802, + "learning_rate": 2e-05, + "loss": 0.03298265, + "step": 2242 + }, + { + "epoch": 4.486, + "grad_norm": 1.692529559135437, + "learning_rate": 2e-05, + "loss": 0.043402, + "step": 2243 + }, + { + "epoch": 4.4879999999999995, + "grad_norm": 2.226050853729248, + 
"learning_rate": 2e-05, + "loss": 0.05941268, + "step": 2244 + }, + { + "epoch": 4.49, + "grad_norm": 1.0783953666687012, + "learning_rate": 2e-05, + "loss": 0.02816807, + "step": 2245 + }, + { + "epoch": 4.492, + "grad_norm": 1.9201335906982422, + "learning_rate": 2e-05, + "loss": 0.04565334, + "step": 2246 + }, + { + "epoch": 4.494, + "grad_norm": 1.2199456691741943, + "learning_rate": 2e-05, + "loss": 0.03948256, + "step": 2247 + }, + { + "epoch": 4.496, + "grad_norm": 2.3383970260620117, + "learning_rate": 2e-05, + "loss": 0.0555183, + "step": 2248 + }, + { + "epoch": 4.498, + "grad_norm": 1.1793116331100464, + "learning_rate": 2e-05, + "loss": 0.03439267, + "step": 2249 + }, + { + "epoch": 4.5, + "grad_norm": 1.0222777128219604, + "learning_rate": 2e-05, + "loss": 0.02550181, + "step": 2250 + }, + { + "epoch": 4.502, + "grad_norm": 1.751338005065918, + "learning_rate": 2e-05, + "loss": 0.03421024, + "step": 2251 + }, + { + "epoch": 4.504, + "grad_norm": 1.8420368432998657, + "learning_rate": 2e-05, + "loss": 0.04032079, + "step": 2252 + }, + { + "epoch": 4.506, + "grad_norm": 1.5478731393814087, + "learning_rate": 2e-05, + "loss": 0.04113689, + "step": 2253 + }, + { + "epoch": 4.508, + "grad_norm": 1.9176809787750244, + "learning_rate": 2e-05, + "loss": 0.03211041, + "step": 2254 + }, + { + "epoch": 4.51, + "grad_norm": 1.356545090675354, + "learning_rate": 2e-05, + "loss": 0.03811408, + "step": 2255 + }, + { + "epoch": 4.5120000000000005, + "grad_norm": 1.470837950706482, + "learning_rate": 2e-05, + "loss": 0.04642755, + "step": 2256 + }, + { + "epoch": 4.514, + "grad_norm": 2.1132113933563232, + "learning_rate": 2e-05, + "loss": 0.06317526, + "step": 2257 + }, + { + "epoch": 4.516, + "grad_norm": 1.2848460674285889, + "learning_rate": 2e-05, + "loss": 0.02209956, + "step": 2258 + }, + { + "epoch": 4.518, + "grad_norm": 2.304513454437256, + "learning_rate": 2e-05, + "loss": 0.05873321, + "step": 2259 + }, + { + "epoch": 4.52, + "grad_norm": 1.1547423601150513, + "learning_rate": 2e-05, + "loss": 0.03009026, + "step": 2260 + }, + { + "epoch": 4.522, + "grad_norm": 2.2654247283935547, + "learning_rate": 2e-05, + "loss": 0.05305985, + "step": 2261 + }, + { + "epoch": 4.524, + "grad_norm": 1.3709189891815186, + "learning_rate": 2e-05, + "loss": 0.0413008, + "step": 2262 + }, + { + "epoch": 4.526, + "grad_norm": 1.2186251878738403, + "learning_rate": 2e-05, + "loss": 0.04457299, + "step": 2263 + }, + { + "epoch": 4.5280000000000005, + "grad_norm": 1.8440145254135132, + "learning_rate": 2e-05, + "loss": 0.04306265, + "step": 2264 + }, + { + "epoch": 4.53, + "grad_norm": 1.8175358772277832, + "learning_rate": 2e-05, + "loss": 0.04188886, + "step": 2265 + }, + { + "epoch": 4.532, + "grad_norm": 1.7477566003799438, + "learning_rate": 2e-05, + "loss": 0.03470108, + "step": 2266 + }, + { + "epoch": 4.534, + "grad_norm": 1.1122045516967773, + "learning_rate": 2e-05, + "loss": 0.03090529, + "step": 2267 + }, + { + "epoch": 4.536, + "grad_norm": 1.6068907976150513, + "learning_rate": 2e-05, + "loss": 0.04337908, + "step": 2268 + }, + { + "epoch": 4.538, + "grad_norm": 1.5025036334991455, + "learning_rate": 2e-05, + "loss": 0.04078036, + "step": 2269 + }, + { + "epoch": 4.54, + "grad_norm": 1.6896750926971436, + "learning_rate": 2e-05, + "loss": 0.05480632, + "step": 2270 + }, + { + "epoch": 4.542, + "grad_norm": 1.4953817129135132, + "learning_rate": 2e-05, + "loss": 0.02734038, + "step": 2271 + }, + { + "epoch": 4.5440000000000005, + "grad_norm": 1.010339617729187, + "learning_rate": 2e-05, + 
"loss": 0.03597734, + "step": 2272 + }, + { + "epoch": 4.546, + "grad_norm": 1.4324558973312378, + "learning_rate": 2e-05, + "loss": 0.03734388, + "step": 2273 + }, + { + "epoch": 4.548, + "grad_norm": 1.7131267786026, + "learning_rate": 2e-05, + "loss": 0.050556, + "step": 2274 + }, + { + "epoch": 4.55, + "grad_norm": 1.5647190809249878, + "learning_rate": 2e-05, + "loss": 0.03882205, + "step": 2275 + }, + { + "epoch": 4.552, + "grad_norm": 1.6061151027679443, + "learning_rate": 2e-05, + "loss": 0.05493605, + "step": 2276 + }, + { + "epoch": 4.554, + "grad_norm": 1.275571346282959, + "learning_rate": 2e-05, + "loss": 0.03493945, + "step": 2277 + }, + { + "epoch": 4.556, + "grad_norm": 1.767574429512024, + "learning_rate": 2e-05, + "loss": 0.03032027, + "step": 2278 + }, + { + "epoch": 4.558, + "grad_norm": 1.9915244579315186, + "learning_rate": 2e-05, + "loss": 0.04645093, + "step": 2279 + }, + { + "epoch": 4.5600000000000005, + "grad_norm": 1.441829800605774, + "learning_rate": 2e-05, + "loss": 0.03383998, + "step": 2280 + }, + { + "epoch": 4.562, + "grad_norm": 1.1743775606155396, + "learning_rate": 2e-05, + "loss": 0.03763331, + "step": 2281 + }, + { + "epoch": 4.564, + "grad_norm": 1.7172895669937134, + "learning_rate": 2e-05, + "loss": 0.0662686, + "step": 2282 + }, + { + "epoch": 4.566, + "grad_norm": 1.7259050607681274, + "learning_rate": 2e-05, + "loss": 0.03643933, + "step": 2283 + }, + { + "epoch": 4.568, + "grad_norm": 1.2674235105514526, + "learning_rate": 2e-05, + "loss": 0.03521616, + "step": 2284 + }, + { + "epoch": 4.57, + "grad_norm": 1.095694661140442, + "learning_rate": 2e-05, + "loss": 0.04045701, + "step": 2285 + }, + { + "epoch": 4.572, + "grad_norm": 1.3978180885314941, + "learning_rate": 2e-05, + "loss": 0.03319689, + "step": 2286 + }, + { + "epoch": 4.574, + "grad_norm": 1.4615137577056885, + "learning_rate": 2e-05, + "loss": 0.04723348, + "step": 2287 + }, + { + "epoch": 4.576, + "grad_norm": 2.235830783843994, + "learning_rate": 2e-05, + "loss": 0.02766045, + "step": 2288 + }, + { + "epoch": 4.578, + "grad_norm": 1.0262019634246826, + "learning_rate": 2e-05, + "loss": 0.02492511, + "step": 2289 + }, + { + "epoch": 4.58, + "grad_norm": 2.410766363143921, + "learning_rate": 2e-05, + "loss": 0.03837668, + "step": 2290 + }, + { + "epoch": 4.582, + "grad_norm": 1.3832848072052002, + "learning_rate": 2e-05, + "loss": 0.04708455, + "step": 2291 + }, + { + "epoch": 4.584, + "grad_norm": 1.5111563205718994, + "learning_rate": 2e-05, + "loss": 0.03212856, + "step": 2292 + }, + { + "epoch": 4.586, + "grad_norm": 1.8234524726867676, + "learning_rate": 2e-05, + "loss": 0.04155425, + "step": 2293 + }, + { + "epoch": 4.588, + "grad_norm": 1.8563976287841797, + "learning_rate": 2e-05, + "loss": 0.05284409, + "step": 2294 + }, + { + "epoch": 4.59, + "grad_norm": 1.0448572635650635, + "learning_rate": 2e-05, + "loss": 0.02735084, + "step": 2295 + }, + { + "epoch": 4.592, + "grad_norm": 0.9727917909622192, + "learning_rate": 2e-05, + "loss": 0.03186521, + "step": 2296 + }, + { + "epoch": 4.594, + "grad_norm": 1.7344650030136108, + "learning_rate": 2e-05, + "loss": 0.05732023, + "step": 2297 + }, + { + "epoch": 4.596, + "grad_norm": 1.630303144454956, + "learning_rate": 2e-05, + "loss": 0.03979348, + "step": 2298 + }, + { + "epoch": 4.598, + "grad_norm": 1.5773340463638306, + "learning_rate": 2e-05, + "loss": 0.04727698, + "step": 2299 + }, + { + "epoch": 4.6, + "grad_norm": 1.4864786863327026, + "learning_rate": 2e-05, + "loss": 0.0243044, + "step": 2300 + }, + { + "epoch": 4.602, 
+ "grad_norm": 1.6385360956192017, + "learning_rate": 2e-05, + "loss": 0.04001735, + "step": 2301 + }, + { + "epoch": 4.604, + "grad_norm": 0.9057992696762085, + "learning_rate": 2e-05, + "loss": 0.02840018, + "step": 2302 + }, + { + "epoch": 4.606, + "grad_norm": 2.1752712726593018, + "learning_rate": 2e-05, + "loss": 0.02728005, + "step": 2303 + }, + { + "epoch": 4.608, + "grad_norm": 1.4201445579528809, + "learning_rate": 2e-05, + "loss": 0.03847256, + "step": 2304 + }, + { + "epoch": 4.61, + "grad_norm": 1.6967096328735352, + "learning_rate": 2e-05, + "loss": 0.03130513, + "step": 2305 + }, + { + "epoch": 4.612, + "grad_norm": 2.1925299167633057, + "learning_rate": 2e-05, + "loss": 0.04984703, + "step": 2306 + }, + { + "epoch": 4.614, + "grad_norm": 1.6784030199050903, + "learning_rate": 2e-05, + "loss": 0.03006441, + "step": 2307 + }, + { + "epoch": 4.616, + "grad_norm": 1.3622230291366577, + "learning_rate": 2e-05, + "loss": 0.03182603, + "step": 2308 + }, + { + "epoch": 4.618, + "grad_norm": 1.5717957019805908, + "learning_rate": 2e-05, + "loss": 0.03701077, + "step": 2309 + }, + { + "epoch": 4.62, + "grad_norm": 1.0062084197998047, + "learning_rate": 2e-05, + "loss": 0.0248526, + "step": 2310 + }, + { + "epoch": 4.622, + "grad_norm": 2.1822187900543213, + "learning_rate": 2e-05, + "loss": 0.03146263, + "step": 2311 + }, + { + "epoch": 4.624, + "grad_norm": 3.6029553413391113, + "learning_rate": 2e-05, + "loss": 0.06397329, + "step": 2312 + }, + { + "epoch": 4.626, + "grad_norm": 2.1496500968933105, + "learning_rate": 2e-05, + "loss": 0.03725152, + "step": 2313 + }, + { + "epoch": 4.628, + "grad_norm": 1.8497250080108643, + "learning_rate": 2e-05, + "loss": 0.03161447, + "step": 2314 + }, + { + "epoch": 4.63, + "grad_norm": 1.3184853792190552, + "learning_rate": 2e-05, + "loss": 0.03692345, + "step": 2315 + }, + { + "epoch": 4.632, + "grad_norm": 1.4427586793899536, + "learning_rate": 2e-05, + "loss": 0.02692084, + "step": 2316 + }, + { + "epoch": 4.634, + "grad_norm": 1.8580230474472046, + "learning_rate": 2e-05, + "loss": 0.0654839, + "step": 2317 + }, + { + "epoch": 4.636, + "grad_norm": 1.1471973657608032, + "learning_rate": 2e-05, + "loss": 0.03153278, + "step": 2318 + }, + { + "epoch": 4.638, + "grad_norm": 1.2644822597503662, + "learning_rate": 2e-05, + "loss": 0.04941096, + "step": 2319 + }, + { + "epoch": 4.64, + "grad_norm": 2.7720656394958496, + "learning_rate": 2e-05, + "loss": 0.05895547, + "step": 2320 + }, + { + "epoch": 4.642, + "grad_norm": 1.2557971477508545, + "learning_rate": 2e-05, + "loss": 0.04365244, + "step": 2321 + }, + { + "epoch": 4.644, + "grad_norm": 1.6720325946807861, + "learning_rate": 2e-05, + "loss": 0.05442552, + "step": 2322 + }, + { + "epoch": 4.646, + "grad_norm": 1.0071033239364624, + "learning_rate": 2e-05, + "loss": 0.02480928, + "step": 2323 + }, + { + "epoch": 4.648, + "grad_norm": 1.183469295501709, + "learning_rate": 2e-05, + "loss": 0.03799593, + "step": 2324 + }, + { + "epoch": 4.65, + "grad_norm": 1.3475804328918457, + "learning_rate": 2e-05, + "loss": 0.05035153, + "step": 2325 + }, + { + "epoch": 4.652, + "grad_norm": 2.5962164402008057, + "learning_rate": 2e-05, + "loss": 0.04414929, + "step": 2326 + }, + { + "epoch": 4.654, + "grad_norm": 1.1905382871627808, + "learning_rate": 2e-05, + "loss": 0.04276285, + "step": 2327 + }, + { + "epoch": 4.656, + "grad_norm": 1.5951045751571655, + "learning_rate": 2e-05, + "loss": 0.03594982, + "step": 2328 + }, + { + "epoch": 4.658, + "grad_norm": 1.6772533655166626, + "learning_rate": 2e-05, + 
"loss": 0.05392138, + "step": 2329 + }, + { + "epoch": 4.66, + "grad_norm": 1.5733336210250854, + "learning_rate": 2e-05, + "loss": 0.03919678, + "step": 2330 + }, + { + "epoch": 4.662, + "grad_norm": 1.1291429996490479, + "learning_rate": 2e-05, + "loss": 0.02948717, + "step": 2331 + }, + { + "epoch": 4.664, + "grad_norm": 1.38038969039917, + "learning_rate": 2e-05, + "loss": 0.04529754, + "step": 2332 + }, + { + "epoch": 4.666, + "grad_norm": 1.4430439472198486, + "learning_rate": 2e-05, + "loss": 0.02968345, + "step": 2333 + }, + { + "epoch": 4.668, + "grad_norm": 1.0805678367614746, + "learning_rate": 2e-05, + "loss": 0.03496836, + "step": 2334 + }, + { + "epoch": 4.67, + "grad_norm": 1.4857618808746338, + "learning_rate": 2e-05, + "loss": 0.04253389, + "step": 2335 + }, + { + "epoch": 4.672, + "grad_norm": 1.8443632125854492, + "learning_rate": 2e-05, + "loss": 0.03913691, + "step": 2336 + }, + { + "epoch": 4.674, + "grad_norm": 1.596656084060669, + "learning_rate": 2e-05, + "loss": 0.03494718, + "step": 2337 + }, + { + "epoch": 4.676, + "grad_norm": 2.0523974895477295, + "learning_rate": 2e-05, + "loss": 0.05468136, + "step": 2338 + }, + { + "epoch": 4.678, + "grad_norm": 1.5666359663009644, + "learning_rate": 2e-05, + "loss": 0.0429353, + "step": 2339 + }, + { + "epoch": 4.68, + "grad_norm": 1.299415946006775, + "learning_rate": 2e-05, + "loss": 0.02959892, + "step": 2340 + }, + { + "epoch": 4.682, + "grad_norm": 1.6929570436477661, + "learning_rate": 2e-05, + "loss": 0.04222934, + "step": 2341 + }, + { + "epoch": 4.684, + "grad_norm": 2.128430128097534, + "learning_rate": 2e-05, + "loss": 0.04607785, + "step": 2342 + }, + { + "epoch": 4.686, + "grad_norm": 2.121476650238037, + "learning_rate": 2e-05, + "loss": 0.04864143, + "step": 2343 + }, + { + "epoch": 4.688, + "grad_norm": 2.3001673221588135, + "learning_rate": 2e-05, + "loss": 0.04368895, + "step": 2344 + }, + { + "epoch": 4.6899999999999995, + "grad_norm": 1.804023265838623, + "learning_rate": 2e-05, + "loss": 0.03963099, + "step": 2345 + }, + { + "epoch": 4.692, + "grad_norm": 1.1703206300735474, + "learning_rate": 2e-05, + "loss": 0.03038155, + "step": 2346 + }, + { + "epoch": 4.694, + "grad_norm": 1.123602032661438, + "learning_rate": 2e-05, + "loss": 0.04947871, + "step": 2347 + }, + { + "epoch": 4.696, + "grad_norm": 1.3002231121063232, + "learning_rate": 2e-05, + "loss": 0.03265772, + "step": 2348 + }, + { + "epoch": 4.698, + "grad_norm": 1.3386176824569702, + "learning_rate": 2e-05, + "loss": 0.0319346, + "step": 2349 + }, + { + "epoch": 4.7, + "grad_norm": 1.3578490018844604, + "learning_rate": 2e-05, + "loss": 0.0321762, + "step": 2350 + }, + { + "epoch": 4.702, + "grad_norm": 2.0441887378692627, + "learning_rate": 2e-05, + "loss": 0.0398183, + "step": 2351 + }, + { + "epoch": 4.704, + "grad_norm": 1.819500207901001, + "learning_rate": 2e-05, + "loss": 0.03708476, + "step": 2352 + }, + { + "epoch": 4.7059999999999995, + "grad_norm": 1.4998730421066284, + "learning_rate": 2e-05, + "loss": 0.03444158, + "step": 2353 + }, + { + "epoch": 4.708, + "grad_norm": 1.3147387504577637, + "learning_rate": 2e-05, + "loss": 0.02567411, + "step": 2354 + }, + { + "epoch": 4.71, + "grad_norm": 1.0613209009170532, + "learning_rate": 2e-05, + "loss": 0.02812085, + "step": 2355 + }, + { + "epoch": 4.712, + "grad_norm": 1.6744329929351807, + "learning_rate": 2e-05, + "loss": 0.04626666, + "step": 2356 + }, + { + "epoch": 4.714, + "grad_norm": 1.8703510761260986, + "learning_rate": 2e-05, + "loss": 0.05195956, + "step": 2357 + }, + { + 
"epoch": 4.716, + "grad_norm": 1.7580175399780273, + "learning_rate": 2e-05, + "loss": 0.03270845, + "step": 2358 + }, + { + "epoch": 4.718, + "grad_norm": 1.9400629997253418, + "learning_rate": 2e-05, + "loss": 0.05601585, + "step": 2359 + }, + { + "epoch": 4.72, + "grad_norm": 1.4302650690078735, + "learning_rate": 2e-05, + "loss": 0.03492381, + "step": 2360 + }, + { + "epoch": 4.7219999999999995, + "grad_norm": 2.32159161567688, + "learning_rate": 2e-05, + "loss": 0.04457737, + "step": 2361 + }, + { + "epoch": 4.724, + "grad_norm": 3.4112353324890137, + "learning_rate": 2e-05, + "loss": 0.08344968, + "step": 2362 + }, + { + "epoch": 4.726, + "grad_norm": 1.6217650175094604, + "learning_rate": 2e-05, + "loss": 0.02905689, + "step": 2363 + }, + { + "epoch": 4.728, + "grad_norm": 1.6228758096694946, + "learning_rate": 2e-05, + "loss": 0.04028218, + "step": 2364 + }, + { + "epoch": 4.73, + "grad_norm": 1.1981812715530396, + "learning_rate": 2e-05, + "loss": 0.04487208, + "step": 2365 + }, + { + "epoch": 4.732, + "grad_norm": 1.7020169496536255, + "learning_rate": 2e-05, + "loss": 0.0399951, + "step": 2366 + }, + { + "epoch": 4.734, + "grad_norm": 1.3388174772262573, + "learning_rate": 2e-05, + "loss": 0.04234082, + "step": 2367 + }, + { + "epoch": 4.736, + "grad_norm": 1.5462074279785156, + "learning_rate": 2e-05, + "loss": 0.0329011, + "step": 2368 + }, + { + "epoch": 4.7379999999999995, + "grad_norm": 1.1013768911361694, + "learning_rate": 2e-05, + "loss": 0.03339402, + "step": 2369 + }, + { + "epoch": 4.74, + "grad_norm": 1.4851659536361694, + "learning_rate": 2e-05, + "loss": 0.03826153, + "step": 2370 + }, + { + "epoch": 4.742, + "grad_norm": 1.2620880603790283, + "learning_rate": 2e-05, + "loss": 0.03477462, + "step": 2371 + }, + { + "epoch": 4.744, + "grad_norm": 1.2920024394989014, + "learning_rate": 2e-05, + "loss": 0.04607169, + "step": 2372 + }, + { + "epoch": 4.746, + "grad_norm": 1.521671175956726, + "learning_rate": 2e-05, + "loss": 0.04394786, + "step": 2373 + }, + { + "epoch": 4.748, + "grad_norm": 1.4648962020874023, + "learning_rate": 2e-05, + "loss": 0.04500762, + "step": 2374 + }, + { + "epoch": 4.75, + "grad_norm": 1.8178099393844604, + "learning_rate": 2e-05, + "loss": 0.03826898, + "step": 2375 + }, + { + "epoch": 4.752, + "grad_norm": 0.9506421685218811, + "learning_rate": 2e-05, + "loss": 0.03018279, + "step": 2376 + }, + { + "epoch": 4.754, + "grad_norm": 1.1572425365447998, + "learning_rate": 2e-05, + "loss": 0.03146629, + "step": 2377 + }, + { + "epoch": 4.756, + "grad_norm": 0.9127270579338074, + "learning_rate": 2e-05, + "loss": 0.0217318, + "step": 2378 + }, + { + "epoch": 4.758, + "grad_norm": 1.051289677619934, + "learning_rate": 2e-05, + "loss": 0.03464397, + "step": 2379 + }, + { + "epoch": 4.76, + "grad_norm": 2.1047472953796387, + "learning_rate": 2e-05, + "loss": 0.05494668, + "step": 2380 + }, + { + "epoch": 4.7620000000000005, + "grad_norm": 1.4690008163452148, + "learning_rate": 2e-05, + "loss": 0.02600802, + "step": 2381 + }, + { + "epoch": 4.764, + "grad_norm": 1.9520142078399658, + "learning_rate": 2e-05, + "loss": 0.03388739, + "step": 2382 + }, + { + "epoch": 4.766, + "grad_norm": 2.3827433586120605, + "learning_rate": 2e-05, + "loss": 0.0476454, + "step": 2383 + }, + { + "epoch": 4.768, + "grad_norm": 1.4814623594284058, + "learning_rate": 2e-05, + "loss": 0.03822245, + "step": 2384 + }, + { + "epoch": 4.77, + "grad_norm": 2.171255350112915, + "learning_rate": 2e-05, + "loss": 0.06004865, + "step": 2385 + }, + { + "epoch": 4.772, + "grad_norm": 
1.3165963888168335, + "learning_rate": 2e-05, + "loss": 0.03820898, + "step": 2386 + }, + { + "epoch": 4.774, + "grad_norm": 1.2138549089431763, + "learning_rate": 2e-05, + "loss": 0.03760416, + "step": 2387 + }, + { + "epoch": 4.776, + "grad_norm": 1.4797013998031616, + "learning_rate": 2e-05, + "loss": 0.03296691, + "step": 2388 + }, + { + "epoch": 4.7780000000000005, + "grad_norm": 1.994217038154602, + "learning_rate": 2e-05, + "loss": 0.05951263, + "step": 2389 + }, + { + "epoch": 4.78, + "grad_norm": 1.6225087642669678, + "learning_rate": 2e-05, + "loss": 0.03843221, + "step": 2390 + }, + { + "epoch": 4.782, + "grad_norm": 1.264367938041687, + "learning_rate": 2e-05, + "loss": 0.03937426, + "step": 2391 + }, + { + "epoch": 4.784, + "grad_norm": 2.4877493381500244, + "learning_rate": 2e-05, + "loss": 0.04250683, + "step": 2392 + }, + { + "epoch": 4.786, + "grad_norm": 1.6016268730163574, + "learning_rate": 2e-05, + "loss": 0.03200974, + "step": 2393 + }, + { + "epoch": 4.788, + "grad_norm": 1.2733501195907593, + "learning_rate": 2e-05, + "loss": 0.02490179, + "step": 2394 + }, + { + "epoch": 4.79, + "grad_norm": 2.5966386795043945, + "learning_rate": 2e-05, + "loss": 0.04996984, + "step": 2395 + }, + { + "epoch": 4.792, + "grad_norm": 1.5662956237792969, + "learning_rate": 2e-05, + "loss": 0.05331601, + "step": 2396 + }, + { + "epoch": 4.7940000000000005, + "grad_norm": 1.7879033088684082, + "learning_rate": 2e-05, + "loss": 0.04639386, + "step": 2397 + }, + { + "epoch": 4.796, + "grad_norm": 1.5951218605041504, + "learning_rate": 2e-05, + "loss": 0.04075851, + "step": 2398 + }, + { + "epoch": 4.798, + "grad_norm": 1.3353394269943237, + "learning_rate": 2e-05, + "loss": 0.03567629, + "step": 2399 + }, + { + "epoch": 4.8, + "grad_norm": 1.5784703493118286, + "learning_rate": 2e-05, + "loss": 0.0398753, + "step": 2400 + }, + { + "epoch": 4.802, + "grad_norm": 1.7636374235153198, + "learning_rate": 2e-05, + "loss": 0.03577316, + "step": 2401 + }, + { + "epoch": 4.804, + "grad_norm": 2.104825496673584, + "learning_rate": 2e-05, + "loss": 0.05970087, + "step": 2402 + }, + { + "epoch": 4.806, + "grad_norm": 1.0066531896591187, + "learning_rate": 2e-05, + "loss": 0.03378828, + "step": 2403 + }, + { + "epoch": 4.808, + "grad_norm": 1.1586098670959473, + "learning_rate": 2e-05, + "loss": 0.03477071, + "step": 2404 + }, + { + "epoch": 4.8100000000000005, + "grad_norm": 1.9183125495910645, + "learning_rate": 2e-05, + "loss": 0.04798802, + "step": 2405 + }, + { + "epoch": 4.812, + "grad_norm": 1.2498745918273926, + "learning_rate": 2e-05, + "loss": 0.04589451, + "step": 2406 + }, + { + "epoch": 4.814, + "grad_norm": 2.4411637783050537, + "learning_rate": 2e-05, + "loss": 0.06537707, + "step": 2407 + }, + { + "epoch": 4.816, + "grad_norm": 1.692878007888794, + "learning_rate": 2e-05, + "loss": 0.02775835, + "step": 2408 + }, + { + "epoch": 4.818, + "grad_norm": 1.5488640069961548, + "learning_rate": 2e-05, + "loss": 0.04157056, + "step": 2409 + }, + { + "epoch": 4.82, + "grad_norm": 1.456628680229187, + "learning_rate": 2e-05, + "loss": 0.04453836, + "step": 2410 + }, + { + "epoch": 4.822, + "grad_norm": 1.2620590925216675, + "learning_rate": 2e-05, + "loss": 0.04159302, + "step": 2411 + }, + { + "epoch": 4.824, + "grad_norm": 1.6489261388778687, + "learning_rate": 2e-05, + "loss": 0.03650364, + "step": 2412 + }, + { + "epoch": 4.826, + "grad_norm": 1.3085451126098633, + "learning_rate": 2e-05, + "loss": 0.03413782, + "step": 2413 + }, + { + "epoch": 4.828, + "grad_norm": 1.924852967262268, + 
"learning_rate": 2e-05, + "loss": 0.06830163, + "step": 2414 + }, + { + "epoch": 4.83, + "grad_norm": 1.020704746246338, + "learning_rate": 2e-05, + "loss": 0.02394637, + "step": 2415 + }, + { + "epoch": 4.832, + "grad_norm": 1.132530689239502, + "learning_rate": 2e-05, + "loss": 0.0420993, + "step": 2416 + }, + { + "epoch": 4.834, + "grad_norm": 0.9570915102958679, + "learning_rate": 2e-05, + "loss": 0.0311062, + "step": 2417 + }, + { + "epoch": 4.836, + "grad_norm": 1.0534124374389648, + "learning_rate": 2e-05, + "loss": 0.02990963, + "step": 2418 + }, + { + "epoch": 4.838, + "grad_norm": 1.1117393970489502, + "learning_rate": 2e-05, + "loss": 0.03667744, + "step": 2419 + }, + { + "epoch": 4.84, + "grad_norm": 1.32503342628479, + "learning_rate": 2e-05, + "loss": 0.03586256, + "step": 2420 + }, + { + "epoch": 4.842, + "grad_norm": 1.5087800025939941, + "learning_rate": 2e-05, + "loss": 0.03501411, + "step": 2421 + }, + { + "epoch": 4.844, + "grad_norm": 1.7611730098724365, + "learning_rate": 2e-05, + "loss": 0.03907187, + "step": 2422 + }, + { + "epoch": 4.846, + "grad_norm": 1.3522123098373413, + "learning_rate": 2e-05, + "loss": 0.04923314, + "step": 2423 + }, + { + "epoch": 4.848, + "grad_norm": 1.3680315017700195, + "learning_rate": 2e-05, + "loss": 0.03453797, + "step": 2424 + }, + { + "epoch": 4.85, + "grad_norm": 1.0860910415649414, + "learning_rate": 2e-05, + "loss": 0.03305909, + "step": 2425 + }, + { + "epoch": 4.852, + "grad_norm": 1.2761231660842896, + "learning_rate": 2e-05, + "loss": 0.03872238, + "step": 2426 + }, + { + "epoch": 4.854, + "grad_norm": 1.7196176052093506, + "learning_rate": 2e-05, + "loss": 0.03280783, + "step": 2427 + }, + { + "epoch": 4.856, + "grad_norm": 1.5164096355438232, + "learning_rate": 2e-05, + "loss": 0.03744507, + "step": 2428 + }, + { + "epoch": 4.858, + "grad_norm": 1.294891119003296, + "learning_rate": 2e-05, + "loss": 0.03750362, + "step": 2429 + }, + { + "epoch": 4.86, + "grad_norm": 1.5655124187469482, + "learning_rate": 2e-05, + "loss": 0.03739174, + "step": 2430 + }, + { + "epoch": 4.862, + "grad_norm": 1.806402325630188, + "learning_rate": 2e-05, + "loss": 0.04592914, + "step": 2431 + }, + { + "epoch": 4.864, + "grad_norm": 2.5879619121551514, + "learning_rate": 2e-05, + "loss": 0.03984898, + "step": 2432 + }, + { + "epoch": 4.866, + "grad_norm": 2.590233564376831, + "learning_rate": 2e-05, + "loss": 0.03624843, + "step": 2433 + }, + { + "epoch": 4.868, + "grad_norm": 1.4250051975250244, + "learning_rate": 2e-05, + "loss": 0.06627299, + "step": 2434 + }, + { + "epoch": 4.87, + "grad_norm": 2.142298936843872, + "learning_rate": 2e-05, + "loss": 0.04694505, + "step": 2435 + }, + { + "epoch": 4.872, + "grad_norm": 1.6512330770492554, + "learning_rate": 2e-05, + "loss": 0.04675486, + "step": 2436 + }, + { + "epoch": 4.874, + "grad_norm": 1.1466740369796753, + "learning_rate": 2e-05, + "loss": 0.0404415, + "step": 2437 + }, + { + "epoch": 4.876, + "grad_norm": 1.163206934928894, + "learning_rate": 2e-05, + "loss": 0.04484642, + "step": 2438 + }, + { + "epoch": 4.878, + "grad_norm": 1.4623184204101562, + "learning_rate": 2e-05, + "loss": 0.0501948, + "step": 2439 + }, + { + "epoch": 4.88, + "grad_norm": 1.4037455320358276, + "learning_rate": 2e-05, + "loss": 0.03190558, + "step": 2440 + }, + { + "epoch": 4.882, + "grad_norm": 1.6567379236221313, + "learning_rate": 2e-05, + "loss": 0.03677463, + "step": 2441 + }, + { + "epoch": 4.884, + "grad_norm": 1.5126616954803467, + "learning_rate": 2e-05, + "loss": 0.03623777, + "step": 2442 + }, + { + 
"epoch": 4.886, + "grad_norm": 1.434916615486145, + "learning_rate": 2e-05, + "loss": 0.04461944, + "step": 2443 + }, + { + "epoch": 4.888, + "grad_norm": 1.0425275564193726, + "learning_rate": 2e-05, + "loss": 0.03600231, + "step": 2444 + }, + { + "epoch": 4.89, + "grad_norm": 1.3526657819747925, + "learning_rate": 2e-05, + "loss": 0.05250793, + "step": 2445 + }, + { + "epoch": 4.892, + "grad_norm": 2.3579368591308594, + "learning_rate": 2e-05, + "loss": 0.04716105, + "step": 2446 + }, + { + "epoch": 4.894, + "grad_norm": 2.2220516204833984, + "learning_rate": 2e-05, + "loss": 0.04456067, + "step": 2447 + }, + { + "epoch": 4.896, + "grad_norm": 1.631173014640808, + "learning_rate": 2e-05, + "loss": 0.04533452, + "step": 2448 + }, + { + "epoch": 4.898, + "grad_norm": 1.5379027128219604, + "learning_rate": 2e-05, + "loss": 0.03291882, + "step": 2449 + }, + { + "epoch": 4.9, + "grad_norm": 1.6118535995483398, + "learning_rate": 2e-05, + "loss": 0.05840709, + "step": 2450 + }, + { + "epoch": 4.902, + "grad_norm": 1.0765273571014404, + "learning_rate": 2e-05, + "loss": 0.03540622, + "step": 2451 + }, + { + "epoch": 4.904, + "grad_norm": 1.0575835704803467, + "learning_rate": 2e-05, + "loss": 0.02790409, + "step": 2452 + }, + { + "epoch": 4.906, + "grad_norm": 0.8009814023971558, + "learning_rate": 2e-05, + "loss": 0.02813254, + "step": 2453 + }, + { + "epoch": 4.908, + "grad_norm": 1.1870671510696411, + "learning_rate": 2e-05, + "loss": 0.0324744, + "step": 2454 + }, + { + "epoch": 4.91, + "grad_norm": 1.583187460899353, + "learning_rate": 2e-05, + "loss": 0.0449949, + "step": 2455 + }, + { + "epoch": 4.912, + "grad_norm": 0.8084712028503418, + "learning_rate": 2e-05, + "loss": 0.02643416, + "step": 2456 + }, + { + "epoch": 4.914, + "grad_norm": 1.0200517177581787, + "learning_rate": 2e-05, + "loss": 0.02963091, + "step": 2457 + }, + { + "epoch": 4.916, + "grad_norm": 1.270112156867981, + "learning_rate": 2e-05, + "loss": 0.04167578, + "step": 2458 + }, + { + "epoch": 4.918, + "grad_norm": 1.7084966897964478, + "learning_rate": 2e-05, + "loss": 0.0530615, + "step": 2459 + }, + { + "epoch": 4.92, + "grad_norm": 0.9616237282752991, + "learning_rate": 2e-05, + "loss": 0.03146474, + "step": 2460 + }, + { + "epoch": 4.922, + "grad_norm": 1.698949933052063, + "learning_rate": 2e-05, + "loss": 0.04134315, + "step": 2461 + }, + { + "epoch": 4.924, + "grad_norm": 1.3575299978256226, + "learning_rate": 2e-05, + "loss": 0.04107556, + "step": 2462 + }, + { + "epoch": 4.926, + "grad_norm": 1.2565886974334717, + "learning_rate": 2e-05, + "loss": 0.02181477, + "step": 2463 + }, + { + "epoch": 4.928, + "grad_norm": 0.7854299545288086, + "learning_rate": 2e-05, + "loss": 0.0208516, + "step": 2464 + }, + { + "epoch": 4.93, + "grad_norm": 1.0150772333145142, + "learning_rate": 2e-05, + "loss": 0.03809316, + "step": 2465 + }, + { + "epoch": 4.932, + "grad_norm": 1.4074971675872803, + "learning_rate": 2e-05, + "loss": 0.04282606, + "step": 2466 + }, + { + "epoch": 4.934, + "grad_norm": 1.378995418548584, + "learning_rate": 2e-05, + "loss": 0.0498643, + "step": 2467 + }, + { + "epoch": 4.936, + "grad_norm": 1.3473279476165771, + "learning_rate": 2e-05, + "loss": 0.04881741, + "step": 2468 + }, + { + "epoch": 4.938, + "grad_norm": 1.5778177976608276, + "learning_rate": 2e-05, + "loss": 0.05556468, + "step": 2469 + }, + { + "epoch": 4.9399999999999995, + "grad_norm": 1.3268343210220337, + "learning_rate": 2e-05, + "loss": 0.04513773, + "step": 2470 + }, + { + "epoch": 4.942, + "grad_norm": 1.551581621170044, + 
"learning_rate": 2e-05, + "loss": 0.04969638, + "step": 2471 + }, + { + "epoch": 4.944, + "grad_norm": 1.280708909034729, + "learning_rate": 2e-05, + "loss": 0.04666021, + "step": 2472 + }, + { + "epoch": 4.946, + "grad_norm": 1.6760709285736084, + "learning_rate": 2e-05, + "loss": 0.04832494, + "step": 2473 + }, + { + "epoch": 4.948, + "grad_norm": 1.4590950012207031, + "learning_rate": 2e-05, + "loss": 0.05018194, + "step": 2474 + }, + { + "epoch": 4.95, + "grad_norm": 1.3153926134109497, + "learning_rate": 2e-05, + "loss": 0.03609195, + "step": 2475 + }, + { + "epoch": 4.952, + "grad_norm": 2.7595255374908447, + "learning_rate": 2e-05, + "loss": 0.02830896, + "step": 2476 + }, + { + "epoch": 4.954, + "grad_norm": 1.0296415090560913, + "learning_rate": 2e-05, + "loss": 0.02910339, + "step": 2477 + }, + { + "epoch": 4.9559999999999995, + "grad_norm": 1.205686330795288, + "learning_rate": 2e-05, + "loss": 0.02493732, + "step": 2478 + }, + { + "epoch": 4.958, + "grad_norm": 1.3554532527923584, + "learning_rate": 2e-05, + "loss": 0.03051371, + "step": 2479 + }, + { + "epoch": 4.96, + "grad_norm": 1.8102985620498657, + "learning_rate": 2e-05, + "loss": 0.05402344, + "step": 2480 + }, + { + "epoch": 4.962, + "grad_norm": 1.090096116065979, + "learning_rate": 2e-05, + "loss": 0.04083726, + "step": 2481 + }, + { + "epoch": 4.964, + "grad_norm": 1.3355975151062012, + "learning_rate": 2e-05, + "loss": 0.0298448, + "step": 2482 + }, + { + "epoch": 4.966, + "grad_norm": 1.1990944147109985, + "learning_rate": 2e-05, + "loss": 0.04024744, + "step": 2483 + }, + { + "epoch": 4.968, + "grad_norm": 1.599819540977478, + "learning_rate": 2e-05, + "loss": 0.04165243, + "step": 2484 + }, + { + "epoch": 4.97, + "grad_norm": 1.154665470123291, + "learning_rate": 2e-05, + "loss": 0.03814416, + "step": 2485 + }, + { + "epoch": 4.9719999999999995, + "grad_norm": 1.7538753747940063, + "learning_rate": 2e-05, + "loss": 0.04323652, + "step": 2486 + }, + { + "epoch": 4.974, + "grad_norm": 1.2068040370941162, + "learning_rate": 2e-05, + "loss": 0.02646199, + "step": 2487 + }, + { + "epoch": 4.976, + "grad_norm": 1.9645920991897583, + "learning_rate": 2e-05, + "loss": 0.04496079, + "step": 2488 + }, + { + "epoch": 4.978, + "grad_norm": 1.4738874435424805, + "learning_rate": 2e-05, + "loss": 0.04259995, + "step": 2489 + }, + { + "epoch": 4.98, + "grad_norm": 1.1883742809295654, + "learning_rate": 2e-05, + "loss": 0.02651443, + "step": 2490 + }, + { + "epoch": 4.982, + "grad_norm": 1.4614838361740112, + "learning_rate": 2e-05, + "loss": 0.0321265, + "step": 2491 + }, + { + "epoch": 4.984, + "grad_norm": 2.184630870819092, + "learning_rate": 2e-05, + "loss": 0.03757413, + "step": 2492 + }, + { + "epoch": 4.986, + "grad_norm": 1.9842206239700317, + "learning_rate": 2e-05, + "loss": 0.05184003, + "step": 2493 + }, + { + "epoch": 4.9879999999999995, + "grad_norm": 3.535095453262329, + "learning_rate": 2e-05, + "loss": 0.04131281, + "step": 2494 + }, + { + "epoch": 4.99, + "grad_norm": 2.6788275241851807, + "learning_rate": 2e-05, + "loss": 0.04076681, + "step": 2495 + }, + { + "epoch": 4.992, + "grad_norm": 1.9538054466247559, + "learning_rate": 2e-05, + "loss": 0.03534822, + "step": 2496 + }, + { + "epoch": 4.994, + "grad_norm": 2.29728364944458, + "learning_rate": 2e-05, + "loss": 0.0576341, + "step": 2497 + }, + { + "epoch": 4.996, + "grad_norm": 1.8153451681137085, + "learning_rate": 2e-05, + "loss": 0.04025466, + "step": 2498 + }, + { + "epoch": 4.998, + "grad_norm": 2.0112109184265137, + "learning_rate": 2e-05, + 
"loss": 0.05562697, + "step": 2499 + }, + { + "epoch": 5.0, + "grad_norm": 1.1833168268203735, + "learning_rate": 2e-05, + "loss": 0.04619126, + "step": 2500 + }, + { + "epoch": 5.0, + "eval_performance": { + "AngleClassification_1": 0.988, + "AngleClassification_2": 0.99, + "AngleClassification_3": 0.7265469061876247, + "Equal_1": 0.956, + "Equal_2": 0.8602794411177644, + "Equal_3": 0.7544910179640718, + "LineComparison_1": 1.0, + "LineComparison_2": 0.9900199600798403, + "LineComparison_3": 0.9481037924151696, + "Parallel_1": 0.9859719438877755, + "Parallel_2": 0.9979959919839679, + "Parallel_3": 0.9, + "Perpendicular_1": 0.974, + "Perpendicular_2": 0.638, + "Perpendicular_3": 0.38577154308617234, + "PointLiesOnCircle_1": 0.9966599866399466, + "PointLiesOnCircle_2": 0.9953333333333334, + "PointLiesOnCircle_3": 0.9768, + "PointLiesOnLine_1": 0.9939879759519038, + "PointLiesOnLine_2": 0.9819639278557114, + "PointLiesOnLine_3": 0.6207584830339321 + }, + "eval_runtime": 320.4044, + "eval_samples_per_second": 32.771, + "eval_steps_per_second": 0.655, + "step": 2500 + }, + { + "epoch": 5.002, + "grad_norm": 2.2324397563934326, + "learning_rate": 2e-05, + "loss": 0.05695269, + "step": 2501 + }, + { + "epoch": 5.004, + "grad_norm": 1.6174829006195068, + "learning_rate": 2e-05, + "loss": 0.05285802, + "step": 2502 + }, + { + "epoch": 5.006, + "grad_norm": 2.227994680404663, + "learning_rate": 2e-05, + "loss": 0.08210929, + "step": 2503 + }, + { + "epoch": 5.008, + "grad_norm": 2.022165298461914, + "learning_rate": 2e-05, + "loss": 0.0481383, + "step": 2504 + }, + { + "epoch": 5.01, + "grad_norm": 1.2331695556640625, + "learning_rate": 2e-05, + "loss": 0.03883846, + "step": 2505 + }, + { + "epoch": 5.012, + "grad_norm": 2.4110569953918457, + "learning_rate": 2e-05, + "loss": 0.0675078, + "step": 2506 + }, + { + "epoch": 5.014, + "grad_norm": 1.4140784740447998, + "learning_rate": 2e-05, + "loss": 0.02794993, + "step": 2507 + }, + { + "epoch": 5.016, + "grad_norm": 1.0707343816757202, + "learning_rate": 2e-05, + "loss": 0.02114813, + "step": 2508 + }, + { + "epoch": 5.018, + "grad_norm": 1.7338379621505737, + "learning_rate": 2e-05, + "loss": 0.04929041, + "step": 2509 + }, + { + "epoch": 5.02, + "grad_norm": 2.7071118354797363, + "learning_rate": 2e-05, + "loss": 0.06345211, + "step": 2510 + }, + { + "epoch": 5.022, + "grad_norm": 1.2723110914230347, + "learning_rate": 2e-05, + "loss": 0.04290574, + "step": 2511 + }, + { + "epoch": 5.024, + "grad_norm": 2.29909348487854, + "learning_rate": 2e-05, + "loss": 0.05433185, + "step": 2512 + }, + { + "epoch": 5.026, + "grad_norm": 1.5173856019973755, + "learning_rate": 2e-05, + "loss": 0.03184156, + "step": 2513 + }, + { + "epoch": 5.028, + "grad_norm": 1.4544750452041626, + "learning_rate": 2e-05, + "loss": 0.03113187, + "step": 2514 + }, + { + "epoch": 5.03, + "grad_norm": 1.0467116832733154, + "learning_rate": 2e-05, + "loss": 0.02658311, + "step": 2515 + }, + { + "epoch": 5.032, + "grad_norm": 1.4478943347930908, + "learning_rate": 2e-05, + "loss": 0.04346019, + "step": 2516 + }, + { + "epoch": 5.034, + "grad_norm": 1.587401270866394, + "learning_rate": 2e-05, + "loss": 0.04132868, + "step": 2517 + }, + { + "epoch": 5.036, + "grad_norm": 1.247644305229187, + "learning_rate": 2e-05, + "loss": 0.03109686, + "step": 2518 + }, + { + "epoch": 5.038, + "grad_norm": 1.2669824361801147, + "learning_rate": 2e-05, + "loss": 0.02979152, + "step": 2519 + }, + { + "epoch": 5.04, + "grad_norm": 2.6262893676757812, + "learning_rate": 2e-05, + "loss": 0.06778932, + 
"step": 2520 + }, + { + "epoch": 5.042, + "grad_norm": 1.6838678121566772, + "learning_rate": 2e-05, + "loss": 0.06396025, + "step": 2521 + }, + { + "epoch": 5.044, + "grad_norm": 1.3168069124221802, + "learning_rate": 2e-05, + "loss": 0.02535046, + "step": 2522 + }, + { + "epoch": 5.046, + "grad_norm": 2.7136900424957275, + "learning_rate": 2e-05, + "loss": 0.04555402, + "step": 2523 + }, + { + "epoch": 5.048, + "grad_norm": 1.461851716041565, + "learning_rate": 2e-05, + "loss": 0.03833253, + "step": 2524 + }, + { + "epoch": 5.05, + "grad_norm": 1.8288956880569458, + "learning_rate": 2e-05, + "loss": 0.03891191, + "step": 2525 + }, + { + "epoch": 5.052, + "grad_norm": 1.1910473108291626, + "learning_rate": 2e-05, + "loss": 0.03411533, + "step": 2526 + }, + { + "epoch": 5.054, + "grad_norm": 1.5257526636123657, + "learning_rate": 2e-05, + "loss": 0.04626542, + "step": 2527 + }, + { + "epoch": 5.056, + "grad_norm": 1.2953051328659058, + "learning_rate": 2e-05, + "loss": 0.02808543, + "step": 2528 + }, + { + "epoch": 5.058, + "grad_norm": 1.4993410110473633, + "learning_rate": 2e-05, + "loss": 0.04547715, + "step": 2529 + }, + { + "epoch": 5.06, + "grad_norm": 2.6157491207122803, + "learning_rate": 2e-05, + "loss": 0.09757001, + "step": 2530 + }, + { + "epoch": 5.062, + "grad_norm": 2.16626238822937, + "learning_rate": 2e-05, + "loss": 0.05185439, + "step": 2531 + }, + { + "epoch": 5.064, + "grad_norm": 5.067115783691406, + "learning_rate": 2e-05, + "loss": 0.04093373, + "step": 2532 + }, + { + "epoch": 5.066, + "grad_norm": 3.5406131744384766, + "learning_rate": 2e-05, + "loss": 0.05016257, + "step": 2533 + }, + { + "epoch": 5.068, + "grad_norm": 1.9250370264053345, + "learning_rate": 2e-05, + "loss": 0.04495588, + "step": 2534 + }, + { + "epoch": 5.07, + "grad_norm": 1.3364894390106201, + "learning_rate": 2e-05, + "loss": 0.05048854, + "step": 2535 + }, + { + "epoch": 5.072, + "grad_norm": 1.1228950023651123, + "learning_rate": 2e-05, + "loss": 0.03420167, + "step": 2536 + }, + { + "epoch": 5.074, + "grad_norm": 2.442943572998047, + "learning_rate": 2e-05, + "loss": 0.04751071, + "step": 2537 + }, + { + "epoch": 5.076, + "grad_norm": 1.1549925804138184, + "learning_rate": 2e-05, + "loss": 0.03477682, + "step": 2538 + }, + { + "epoch": 5.078, + "grad_norm": 1.890607237815857, + "learning_rate": 2e-05, + "loss": 0.04445625, + "step": 2539 + }, + { + "epoch": 5.08, + "grad_norm": 1.524712324142456, + "learning_rate": 2e-05, + "loss": 0.04532345, + "step": 2540 + }, + { + "epoch": 5.082, + "grad_norm": 1.9938740730285645, + "learning_rate": 2e-05, + "loss": 0.04844673, + "step": 2541 + }, + { + "epoch": 5.084, + "grad_norm": 2.3578040599823, + "learning_rate": 2e-05, + "loss": 0.06914338, + "step": 2542 + }, + { + "epoch": 5.086, + "grad_norm": 1.6852574348449707, + "learning_rate": 2e-05, + "loss": 0.03590863, + "step": 2543 + }, + { + "epoch": 5.088, + "grad_norm": 1.804057240486145, + "learning_rate": 2e-05, + "loss": 0.0503867, + "step": 2544 + }, + { + "epoch": 5.09, + "grad_norm": 1.832448959350586, + "learning_rate": 2e-05, + "loss": 0.05448397, + "step": 2545 + }, + { + "epoch": 5.092, + "grad_norm": 1.6885632276535034, + "learning_rate": 2e-05, + "loss": 0.0470449, + "step": 2546 + }, + { + "epoch": 5.094, + "grad_norm": 0.9903779625892639, + "learning_rate": 2e-05, + "loss": 0.03127839, + "step": 2547 + }, + { + "epoch": 5.096, + "grad_norm": 1.0127002000808716, + "learning_rate": 2e-05, + "loss": 0.03280874, + "step": 2548 + }, + { + "epoch": 5.098, + "grad_norm": 
1.8620246648788452, + "learning_rate": 2e-05, + "loss": 0.06030571, + "step": 2549 + }, + { + "epoch": 5.1, + "grad_norm": 1.3962090015411377, + "learning_rate": 2e-05, + "loss": 0.04562517, + "step": 2550 + }, + { + "epoch": 5.102, + "grad_norm": 1.8518095016479492, + "learning_rate": 2e-05, + "loss": 0.06386492, + "step": 2551 + }, + { + "epoch": 5.104, + "grad_norm": 1.2915924787521362, + "learning_rate": 2e-05, + "loss": 0.0317604, + "step": 2552 + }, + { + "epoch": 5.106, + "grad_norm": 1.6023142337799072, + "learning_rate": 2e-05, + "loss": 0.03814955, + "step": 2553 + }, + { + "epoch": 5.108, + "grad_norm": 1.1027803421020508, + "learning_rate": 2e-05, + "loss": 0.02706053, + "step": 2554 + }, + { + "epoch": 5.11, + "grad_norm": 2.314110517501831, + "learning_rate": 2e-05, + "loss": 0.0609056, + "step": 2555 + }, + { + "epoch": 5.112, + "grad_norm": 1.028228759765625, + "learning_rate": 2e-05, + "loss": 0.02673756, + "step": 2556 + }, + { + "epoch": 5.114, + "grad_norm": 1.5030735731124878, + "learning_rate": 2e-05, + "loss": 0.0449588, + "step": 2557 + }, + { + "epoch": 5.116, + "grad_norm": 1.3208764791488647, + "learning_rate": 2e-05, + "loss": 0.04690504, + "step": 2558 + }, + { + "epoch": 5.118, + "grad_norm": 2.08263897895813, + "learning_rate": 2e-05, + "loss": 0.04718242, + "step": 2559 + }, + { + "epoch": 5.12, + "grad_norm": 1.488623857498169, + "learning_rate": 2e-05, + "loss": 0.04909895, + "step": 2560 + }, + { + "epoch": 5.122, + "grad_norm": 1.8604114055633545, + "learning_rate": 2e-05, + "loss": 0.04605662, + "step": 2561 + }, + { + "epoch": 5.124, + "grad_norm": 1.0743006467819214, + "learning_rate": 2e-05, + "loss": 0.02543287, + "step": 2562 + }, + { + "epoch": 5.126, + "grad_norm": 1.3508437871932983, + "learning_rate": 2e-05, + "loss": 0.03909838, + "step": 2563 + }, + { + "epoch": 5.128, + "grad_norm": 1.4261709451675415, + "learning_rate": 2e-05, + "loss": 0.03505434, + "step": 2564 + }, + { + "epoch": 5.13, + "grad_norm": 2.1459856033325195, + "learning_rate": 2e-05, + "loss": 0.05590709, + "step": 2565 + }, + { + "epoch": 5.132, + "grad_norm": 2.010373115539551, + "learning_rate": 2e-05, + "loss": 0.03756604, + "step": 2566 + }, + { + "epoch": 5.134, + "grad_norm": 1.6585365533828735, + "learning_rate": 2e-05, + "loss": 0.03042065, + "step": 2567 + }, + { + "epoch": 5.136, + "grad_norm": 2.4542722702026367, + "learning_rate": 2e-05, + "loss": 0.05523026, + "step": 2568 + }, + { + "epoch": 5.138, + "grad_norm": 1.9174667596817017, + "learning_rate": 2e-05, + "loss": 0.06206827, + "step": 2569 + }, + { + "epoch": 5.14, + "grad_norm": 1.7819212675094604, + "learning_rate": 2e-05, + "loss": 0.0496488, + "step": 2570 + }, + { + "epoch": 5.142, + "grad_norm": 2.3288769721984863, + "learning_rate": 2e-05, + "loss": 0.05924601, + "step": 2571 + }, + { + "epoch": 5.144, + "grad_norm": 4.057807922363281, + "learning_rate": 2e-05, + "loss": 0.0507498, + "step": 2572 + }, + { + "epoch": 5.146, + "grad_norm": 1.5712443590164185, + "learning_rate": 2e-05, + "loss": 0.05386955, + "step": 2573 + }, + { + "epoch": 5.148, + "grad_norm": 1.6841580867767334, + "learning_rate": 2e-05, + "loss": 0.0409927, + "step": 2574 + }, + { + "epoch": 5.15, + "grad_norm": 2.0877368450164795, + "learning_rate": 2e-05, + "loss": 0.03837425, + "step": 2575 + }, + { + "epoch": 5.152, + "grad_norm": 1.4437499046325684, + "learning_rate": 2e-05, + "loss": 0.04897504, + "step": 2576 + }, + { + "epoch": 5.154, + "grad_norm": 1.4186006784439087, + "learning_rate": 2e-05, + "loss": 0.05277944, + 
"step": 2577 + }, + { + "epoch": 5.156, + "grad_norm": 6.804627418518066, + "learning_rate": 2e-05, + "loss": 0.05949213, + "step": 2578 + }, + { + "epoch": 5.158, + "grad_norm": 1.6603080034255981, + "learning_rate": 2e-05, + "loss": 0.05066625, + "step": 2579 + }, + { + "epoch": 5.16, + "grad_norm": 1.7450742721557617, + "learning_rate": 2e-05, + "loss": 0.05555181, + "step": 2580 + }, + { + "epoch": 5.162, + "grad_norm": 2.0960774421691895, + "learning_rate": 2e-05, + "loss": 0.05067931, + "step": 2581 + }, + { + "epoch": 5.164, + "grad_norm": 1.5471669435501099, + "learning_rate": 2e-05, + "loss": 0.04380035, + "step": 2582 + }, + { + "epoch": 5.166, + "grad_norm": 1.5839829444885254, + "learning_rate": 2e-05, + "loss": 0.0428129, + "step": 2583 + }, + { + "epoch": 5.168, + "grad_norm": 1.1979819536209106, + "learning_rate": 2e-05, + "loss": 0.03231291, + "step": 2584 + }, + { + "epoch": 5.17, + "grad_norm": 0.9489485621452332, + "learning_rate": 2e-05, + "loss": 0.03189579, + "step": 2585 + }, + { + "epoch": 5.172, + "grad_norm": 1.3143651485443115, + "learning_rate": 2e-05, + "loss": 0.03099883, + "step": 2586 + }, + { + "epoch": 5.174, + "grad_norm": 1.7608791589736938, + "learning_rate": 2e-05, + "loss": 0.06527974, + "step": 2587 + }, + { + "epoch": 5.176, + "grad_norm": 1.6358402967453003, + "learning_rate": 2e-05, + "loss": 0.05212991, + "step": 2588 + }, + { + "epoch": 5.178, + "grad_norm": 1.2819510698318481, + "learning_rate": 2e-05, + "loss": 0.03158101, + "step": 2589 + }, + { + "epoch": 5.18, + "grad_norm": 1.5717135667800903, + "learning_rate": 2e-05, + "loss": 0.04475843, + "step": 2590 + }, + { + "epoch": 5.182, + "grad_norm": 2.0249385833740234, + "learning_rate": 2e-05, + "loss": 0.05886701, + "step": 2591 + }, + { + "epoch": 5.184, + "grad_norm": 1.782658338546753, + "learning_rate": 2e-05, + "loss": 0.04498136, + "step": 2592 + }, + { + "epoch": 5.186, + "grad_norm": 1.6862472295761108, + "learning_rate": 2e-05, + "loss": 0.03234654, + "step": 2593 + }, + { + "epoch": 5.188, + "grad_norm": 1.2082955837249756, + "learning_rate": 2e-05, + "loss": 0.04066747, + "step": 2594 + }, + { + "epoch": 5.19, + "grad_norm": 1.7842987775802612, + "learning_rate": 2e-05, + "loss": 0.06287039, + "step": 2595 + }, + { + "epoch": 5.192, + "grad_norm": 1.250356674194336, + "learning_rate": 2e-05, + "loss": 0.04755474, + "step": 2596 + }, + { + "epoch": 5.194, + "grad_norm": 2.9072906970977783, + "learning_rate": 2e-05, + "loss": 0.05345254, + "step": 2597 + }, + { + "epoch": 5.196, + "grad_norm": 1.3460503816604614, + "learning_rate": 2e-05, + "loss": 0.05015785, + "step": 2598 + }, + { + "epoch": 5.198, + "grad_norm": 1.5144968032836914, + "learning_rate": 2e-05, + "loss": 0.05026857, + "step": 2599 + }, + { + "epoch": 5.2, + "grad_norm": 1.0161470174789429, + "learning_rate": 2e-05, + "loss": 0.03177109, + "step": 2600 + }, + { + "epoch": 5.202, + "grad_norm": 1.379644513130188, + "learning_rate": 2e-05, + "loss": 0.0431831, + "step": 2601 + }, + { + "epoch": 5.204, + "grad_norm": 1.5978702306747437, + "learning_rate": 2e-05, + "loss": 0.04161422, + "step": 2602 + }, + { + "epoch": 5.206, + "grad_norm": 1.7875207662582397, + "learning_rate": 2e-05, + "loss": 0.05222338, + "step": 2603 + }, + { + "epoch": 5.208, + "grad_norm": 1.0086873769760132, + "learning_rate": 2e-05, + "loss": 0.02815355, + "step": 2604 + }, + { + "epoch": 5.21, + "grad_norm": 1.4395867586135864, + "learning_rate": 2e-05, + "loss": 0.03475678, + "step": 2605 + }, + { + "epoch": 5.212, + "grad_norm": 
1.7755112648010254, + "learning_rate": 2e-05, + "loss": 0.05450004, + "step": 2606 + }, + { + "epoch": 5.214, + "grad_norm": 1.830944299697876, + "learning_rate": 2e-05, + "loss": 0.04738111, + "step": 2607 + }, + { + "epoch": 5.216, + "grad_norm": 1.3517119884490967, + "learning_rate": 2e-05, + "loss": 0.0596354, + "step": 2608 + }, + { + "epoch": 5.218, + "grad_norm": 1.0986253023147583, + "learning_rate": 2e-05, + "loss": 0.0300137, + "step": 2609 + }, + { + "epoch": 5.22, + "grad_norm": 2.0288379192352295, + "learning_rate": 2e-05, + "loss": 0.03250675, + "step": 2610 + }, + { + "epoch": 5.222, + "grad_norm": 2.0758872032165527, + "learning_rate": 2e-05, + "loss": 0.05346858, + "step": 2611 + }, + { + "epoch": 5.224, + "grad_norm": 1.4884560108184814, + "learning_rate": 2e-05, + "loss": 0.04885757, + "step": 2612 + }, + { + "epoch": 5.226, + "grad_norm": 2.067258596420288, + "learning_rate": 2e-05, + "loss": 0.04791706, + "step": 2613 + }, + { + "epoch": 5.228, + "grad_norm": 1.7092328071594238, + "learning_rate": 2e-05, + "loss": 0.06562018, + "step": 2614 + }, + { + "epoch": 5.23, + "grad_norm": 2.985393762588501, + "learning_rate": 2e-05, + "loss": 0.05072825, + "step": 2615 + }, + { + "epoch": 5.232, + "grad_norm": 1.6960763931274414, + "learning_rate": 2e-05, + "loss": 0.05636974, + "step": 2616 + }, + { + "epoch": 5.234, + "grad_norm": 1.1226341724395752, + "learning_rate": 2e-05, + "loss": 0.03822513, + "step": 2617 + }, + { + "epoch": 5.236, + "grad_norm": 1.404388427734375, + "learning_rate": 2e-05, + "loss": 0.03551162, + "step": 2618 + }, + { + "epoch": 5.2379999999999995, + "grad_norm": 1.3654124736785889, + "learning_rate": 2e-05, + "loss": 0.04120996, + "step": 2619 + }, + { + "epoch": 5.24, + "grad_norm": 2.9756903648376465, + "learning_rate": 2e-05, + "loss": 0.05003164, + "step": 2620 + }, + { + "epoch": 5.242, + "grad_norm": 1.9504529237747192, + "learning_rate": 2e-05, + "loss": 0.06165954, + "step": 2621 + }, + { + "epoch": 5.244, + "grad_norm": 1.5840363502502441, + "learning_rate": 2e-05, + "loss": 0.0396319, + "step": 2622 + }, + { + "epoch": 5.246, + "grad_norm": 1.6869369745254517, + "learning_rate": 2e-05, + "loss": 0.04542349, + "step": 2623 + }, + { + "epoch": 5.248, + "grad_norm": 1.6327499151229858, + "learning_rate": 2e-05, + "loss": 0.05939788, + "step": 2624 + }, + { + "epoch": 5.25, + "grad_norm": 2.854781150817871, + "learning_rate": 2e-05, + "loss": 0.03455864, + "step": 2625 + }, + { + "epoch": 5.252, + "grad_norm": 1.3008240461349487, + "learning_rate": 2e-05, + "loss": 0.02595445, + "step": 2626 + }, + { + "epoch": 5.254, + "grad_norm": 1.2028790712356567, + "learning_rate": 2e-05, + "loss": 0.02978789, + "step": 2627 + }, + { + "epoch": 5.256, + "grad_norm": 1.6047648191452026, + "learning_rate": 2e-05, + "loss": 0.05063823, + "step": 2628 + }, + { + "epoch": 5.258, + "grad_norm": 2.121851682662964, + "learning_rate": 2e-05, + "loss": 0.03812759, + "step": 2629 + }, + { + "epoch": 5.26, + "grad_norm": 1.4679371118545532, + "learning_rate": 2e-05, + "loss": 0.04398443, + "step": 2630 + }, + { + "epoch": 5.2620000000000005, + "grad_norm": 1.7496758699417114, + "learning_rate": 2e-05, + "loss": 0.04756953, + "step": 2631 + }, + { + "epoch": 5.264, + "grad_norm": 2.85517954826355, + "learning_rate": 2e-05, + "loss": 0.04580143, + "step": 2632 + }, + { + "epoch": 5.266, + "grad_norm": 1.5564442873001099, + "learning_rate": 2e-05, + "loss": 0.05306519, + "step": 2633 + }, + { + "epoch": 5.268, + "grad_norm": 1.9785518646240234, + "learning_rate": 
2e-05, + "loss": 0.0391811, + "step": 2634 + }, + { + "epoch": 5.27, + "grad_norm": 2.4866366386413574, + "learning_rate": 2e-05, + "loss": 0.05209032, + "step": 2635 + }, + { + "epoch": 5.272, + "grad_norm": 1.0321913957595825, + "learning_rate": 2e-05, + "loss": 0.02281884, + "step": 2636 + }, + { + "epoch": 5.274, + "grad_norm": 0.978119432926178, + "learning_rate": 2e-05, + "loss": 0.02463586, + "step": 2637 + }, + { + "epoch": 5.276, + "grad_norm": 1.338571548461914, + "learning_rate": 2e-05, + "loss": 0.05107386, + "step": 2638 + }, + { + "epoch": 5.2780000000000005, + "grad_norm": 1.172133207321167, + "learning_rate": 2e-05, + "loss": 0.03545574, + "step": 2639 + }, + { + "epoch": 5.28, + "grad_norm": 1.7161072492599487, + "learning_rate": 2e-05, + "loss": 0.0608919, + "step": 2640 + }, + { + "epoch": 5.282, + "grad_norm": 2.2566652297973633, + "learning_rate": 2e-05, + "loss": 0.06986575, + "step": 2641 + }, + { + "epoch": 5.284, + "grad_norm": 1.5361062288284302, + "learning_rate": 2e-05, + "loss": 0.0354271, + "step": 2642 + }, + { + "epoch": 5.286, + "grad_norm": 1.7138904333114624, + "learning_rate": 2e-05, + "loss": 0.05330596, + "step": 2643 + }, + { + "epoch": 5.288, + "grad_norm": 1.9515053033828735, + "learning_rate": 2e-05, + "loss": 0.05626545, + "step": 2644 + }, + { + "epoch": 5.29, + "grad_norm": 1.527269721031189, + "learning_rate": 2e-05, + "loss": 0.04371868, + "step": 2645 + }, + { + "epoch": 5.292, + "grad_norm": 1.7075636386871338, + "learning_rate": 2e-05, + "loss": 0.04032372, + "step": 2646 + }, + { + "epoch": 5.294, + "grad_norm": 1.0868964195251465, + "learning_rate": 2e-05, + "loss": 0.03899564, + "step": 2647 + }, + { + "epoch": 5.296, + "grad_norm": 3.068068504333496, + "learning_rate": 2e-05, + "loss": 0.04235051, + "step": 2648 + }, + { + "epoch": 5.298, + "grad_norm": 1.6693544387817383, + "learning_rate": 2e-05, + "loss": 0.05411904, + "step": 2649 + }, + { + "epoch": 5.3, + "grad_norm": 1.2736790180206299, + "learning_rate": 2e-05, + "loss": 0.0501396, + "step": 2650 + }, + { + "epoch": 5.302, + "grad_norm": 1.770841121673584, + "learning_rate": 2e-05, + "loss": 0.03934953, + "step": 2651 + }, + { + "epoch": 5.304, + "grad_norm": 1.1789344549179077, + "learning_rate": 2e-05, + "loss": 0.03890145, + "step": 2652 + }, + { + "epoch": 5.306, + "grad_norm": 1.8938695192337036, + "learning_rate": 2e-05, + "loss": 0.05737014, + "step": 2653 + }, + { + "epoch": 5.308, + "grad_norm": 1.4466159343719482, + "learning_rate": 2e-05, + "loss": 0.04105823, + "step": 2654 + }, + { + "epoch": 5.31, + "grad_norm": 1.9113630056381226, + "learning_rate": 2e-05, + "loss": 0.06063426, + "step": 2655 + }, + { + "epoch": 5.312, + "grad_norm": 1.341367483139038, + "learning_rate": 2e-05, + "loss": 0.03131312, + "step": 2656 + }, + { + "epoch": 5.314, + "grad_norm": 1.2381536960601807, + "learning_rate": 2e-05, + "loss": 0.03998591, + "step": 2657 + }, + { + "epoch": 5.316, + "grad_norm": 1.210856318473816, + "learning_rate": 2e-05, + "loss": 0.03550825, + "step": 2658 + }, + { + "epoch": 5.318, + "grad_norm": 1.4786920547485352, + "learning_rate": 2e-05, + "loss": 0.03781786, + "step": 2659 + }, + { + "epoch": 5.32, + "grad_norm": 1.7087515592575073, + "learning_rate": 2e-05, + "loss": 0.04967047, + "step": 2660 + }, + { + "epoch": 5.322, + "grad_norm": 1.5567851066589355, + "learning_rate": 2e-05, + "loss": 0.03445258, + "step": 2661 + }, + { + "epoch": 5.324, + "grad_norm": 0.9272580146789551, + "learning_rate": 2e-05, + "loss": 0.02637572, + "step": 2662 + }, + { + 
"epoch": 5.326, + "grad_norm": 2.146501302719116, + "learning_rate": 2e-05, + "loss": 0.03992863, + "step": 2663 + }, + { + "epoch": 5.328, + "grad_norm": 2.161367654800415, + "learning_rate": 2e-05, + "loss": 0.05867213, + "step": 2664 + }, + { + "epoch": 5.33, + "grad_norm": 1.6468945741653442, + "learning_rate": 2e-05, + "loss": 0.03345583, + "step": 2665 + }, + { + "epoch": 5.332, + "grad_norm": 1.8014037609100342, + "learning_rate": 2e-05, + "loss": 0.04314873, + "step": 2666 + }, + { + "epoch": 5.334, + "grad_norm": 1.926483392715454, + "learning_rate": 2e-05, + "loss": 0.05389591, + "step": 2667 + }, + { + "epoch": 5.336, + "grad_norm": 1.567809820175171, + "learning_rate": 2e-05, + "loss": 0.05018042, + "step": 2668 + }, + { + "epoch": 5.338, + "grad_norm": 0.9763085842132568, + "learning_rate": 2e-05, + "loss": 0.02220888, + "step": 2669 + }, + { + "epoch": 5.34, + "grad_norm": 1.6590309143066406, + "learning_rate": 2e-05, + "loss": 0.03101541, + "step": 2670 + }, + { + "epoch": 5.342, + "grad_norm": 1.1251074075698853, + "learning_rate": 2e-05, + "loss": 0.03375977, + "step": 2671 + }, + { + "epoch": 5.344, + "grad_norm": 1.7373671531677246, + "learning_rate": 2e-05, + "loss": 0.02912542, + "step": 2672 + }, + { + "epoch": 5.346, + "grad_norm": 1.122557282447815, + "learning_rate": 2e-05, + "loss": 0.02940472, + "step": 2673 + }, + { + "epoch": 5.348, + "grad_norm": 1.277514100074768, + "learning_rate": 2e-05, + "loss": 0.03232025, + "step": 2674 + }, + { + "epoch": 5.35, + "grad_norm": 1.4601491689682007, + "learning_rate": 2e-05, + "loss": 0.03994402, + "step": 2675 + }, + { + "epoch": 5.352, + "grad_norm": 0.8786239624023438, + "learning_rate": 2e-05, + "loss": 0.02341381, + "step": 2676 + }, + { + "epoch": 5.354, + "grad_norm": 1.1858524084091187, + "learning_rate": 2e-05, + "loss": 0.04786807, + "step": 2677 + }, + { + "epoch": 5.356, + "grad_norm": 1.1991043090820312, + "learning_rate": 2e-05, + "loss": 0.0410894, + "step": 2678 + }, + { + "epoch": 5.358, + "grad_norm": 0.9990038871765137, + "learning_rate": 2e-05, + "loss": 0.03214614, + "step": 2679 + }, + { + "epoch": 5.36, + "grad_norm": 1.499480128288269, + "learning_rate": 2e-05, + "loss": 0.04654964, + "step": 2680 + }, + { + "epoch": 5.362, + "grad_norm": 1.3104462623596191, + "learning_rate": 2e-05, + "loss": 0.03467711, + "step": 2681 + }, + { + "epoch": 5.364, + "grad_norm": 2.457524538040161, + "learning_rate": 2e-05, + "loss": 0.05111074, + "step": 2682 + }, + { + "epoch": 5.366, + "grad_norm": 1.4979830980300903, + "learning_rate": 2e-05, + "loss": 0.04499523, + "step": 2683 + }, + { + "epoch": 5.368, + "grad_norm": 4.056197166442871, + "learning_rate": 2e-05, + "loss": 0.02836193, + "step": 2684 + }, + { + "epoch": 5.37, + "grad_norm": 3.501704692840576, + "learning_rate": 2e-05, + "loss": 0.08720149, + "step": 2685 + }, + { + "epoch": 5.372, + "grad_norm": 1.552233099937439, + "learning_rate": 2e-05, + "loss": 0.04281237, + "step": 2686 + }, + { + "epoch": 5.374, + "grad_norm": 1.1963515281677246, + "learning_rate": 2e-05, + "loss": 0.02805083, + "step": 2687 + }, + { + "epoch": 5.376, + "grad_norm": 1.2588318586349487, + "learning_rate": 2e-05, + "loss": 0.03842116, + "step": 2688 + }, + { + "epoch": 5.378, + "grad_norm": 2.016977310180664, + "learning_rate": 2e-05, + "loss": 0.04068321, + "step": 2689 + }, + { + "epoch": 5.38, + "grad_norm": 1.2053779363632202, + "learning_rate": 2e-05, + "loss": 0.04211229, + "step": 2690 + }, + { + "epoch": 5.382, + "grad_norm": 1.485421895980835, + "learning_rate": 
2e-05, + "loss": 0.05219296, + "step": 2691 + }, + { + "epoch": 5.384, + "grad_norm": 1.2090396881103516, + "learning_rate": 2e-05, + "loss": 0.04025387, + "step": 2692 + }, + { + "epoch": 5.386, + "grad_norm": 1.3661246299743652, + "learning_rate": 2e-05, + "loss": 0.03243088, + "step": 2693 + }, + { + "epoch": 5.388, + "grad_norm": 1.7728279829025269, + "learning_rate": 2e-05, + "loss": 0.04520981, + "step": 2694 + }, + { + "epoch": 5.39, + "grad_norm": 1.839638590812683, + "learning_rate": 2e-05, + "loss": 0.04922792, + "step": 2695 + }, + { + "epoch": 5.392, + "grad_norm": 1.2756119966506958, + "learning_rate": 2e-05, + "loss": 0.03673655, + "step": 2696 + }, + { + "epoch": 5.394, + "grad_norm": 1.812853217124939, + "learning_rate": 2e-05, + "loss": 0.0451444, + "step": 2697 + }, + { + "epoch": 5.396, + "grad_norm": 1.732804298400879, + "learning_rate": 2e-05, + "loss": 0.0696558, + "step": 2698 + }, + { + "epoch": 5.398, + "grad_norm": 2.259817123413086, + "learning_rate": 2e-05, + "loss": 0.05728294, + "step": 2699 + }, + { + "epoch": 5.4, + "grad_norm": 3.6404404640197754, + "learning_rate": 2e-05, + "loss": 0.05975555, + "step": 2700 + }, + { + "epoch": 5.402, + "grad_norm": 1.1913822889328003, + "learning_rate": 2e-05, + "loss": 0.04311104, + "step": 2701 + }, + { + "epoch": 5.404, + "grad_norm": 1.7628343105316162, + "learning_rate": 2e-05, + "loss": 0.04324729, + "step": 2702 + }, + { + "epoch": 5.406, + "grad_norm": 2.0190365314483643, + "learning_rate": 2e-05, + "loss": 0.05802897, + "step": 2703 + }, + { + "epoch": 5.408, + "grad_norm": 1.1721633672714233, + "learning_rate": 2e-05, + "loss": 0.03633162, + "step": 2704 + }, + { + "epoch": 5.41, + "grad_norm": 1.600894808769226, + "learning_rate": 2e-05, + "loss": 0.05752605, + "step": 2705 + }, + { + "epoch": 5.412, + "grad_norm": 1.0390570163726807, + "learning_rate": 2e-05, + "loss": 0.03858446, + "step": 2706 + }, + { + "epoch": 5.414, + "grad_norm": 1.4907500743865967, + "learning_rate": 2e-05, + "loss": 0.02707991, + "step": 2707 + }, + { + "epoch": 5.416, + "grad_norm": 2.0031204223632812, + "learning_rate": 2e-05, + "loss": 0.03021158, + "step": 2708 + }, + { + "epoch": 5.418, + "grad_norm": 4.072817325592041, + "learning_rate": 2e-05, + "loss": 0.05053132, + "step": 2709 + }, + { + "epoch": 5.42, + "grad_norm": 1.7337241172790527, + "learning_rate": 2e-05, + "loss": 0.03692292, + "step": 2710 + }, + { + "epoch": 5.422, + "grad_norm": 1.754112720489502, + "learning_rate": 2e-05, + "loss": 0.03906107, + "step": 2711 + }, + { + "epoch": 5.424, + "grad_norm": 4.156593322753906, + "learning_rate": 2e-05, + "loss": 0.03578386, + "step": 2712 + }, + { + "epoch": 5.426, + "grad_norm": 1.2016631364822388, + "learning_rate": 2e-05, + "loss": 0.03455806, + "step": 2713 + }, + { + "epoch": 5.428, + "grad_norm": 0.9195474982261658, + "learning_rate": 2e-05, + "loss": 0.02441694, + "step": 2714 + }, + { + "epoch": 5.43, + "grad_norm": 1.091452717781067, + "learning_rate": 2e-05, + "loss": 0.0270972, + "step": 2715 + }, + { + "epoch": 5.432, + "grad_norm": 1.6297458410263062, + "learning_rate": 2e-05, + "loss": 0.04799194, + "step": 2716 + }, + { + "epoch": 5.434, + "grad_norm": 1.3862227201461792, + "learning_rate": 2e-05, + "loss": 0.04216187, + "step": 2717 + }, + { + "epoch": 5.436, + "grad_norm": 1.8939886093139648, + "learning_rate": 2e-05, + "loss": 0.05403071, + "step": 2718 + }, + { + "epoch": 5.438, + "grad_norm": 2.1752545833587646, + "learning_rate": 2e-05, + "loss": 0.05855702, + "step": 2719 + }, + { + "epoch": 5.44, + 
"grad_norm": 2.9975874423980713, + "learning_rate": 2e-05, + "loss": 0.0767413, + "step": 2720 + }, + { + "epoch": 5.442, + "grad_norm": 1.3926620483398438, + "learning_rate": 2e-05, + "loss": 0.03643538, + "step": 2721 + }, + { + "epoch": 5.444, + "grad_norm": 1.96034574508667, + "learning_rate": 2e-05, + "loss": 0.04681049, + "step": 2722 + }, + { + "epoch": 5.446, + "grad_norm": 1.1249243021011353, + "learning_rate": 2e-05, + "loss": 0.02556986, + "step": 2723 + }, + { + "epoch": 5.448, + "grad_norm": 1.8713010549545288, + "learning_rate": 2e-05, + "loss": 0.05697018, + "step": 2724 + }, + { + "epoch": 5.45, + "grad_norm": 1.1329511404037476, + "learning_rate": 2e-05, + "loss": 0.02677982, + "step": 2725 + }, + { + "epoch": 5.452, + "grad_norm": 2.1637654304504395, + "learning_rate": 2e-05, + "loss": 0.05468876, + "step": 2726 + }, + { + "epoch": 5.454, + "grad_norm": 1.6031872034072876, + "learning_rate": 2e-05, + "loss": 0.04942069, + "step": 2727 + }, + { + "epoch": 5.456, + "grad_norm": 1.4238561391830444, + "learning_rate": 2e-05, + "loss": 0.04586676, + "step": 2728 + }, + { + "epoch": 5.458, + "grad_norm": 0.7893091440200806, + "learning_rate": 2e-05, + "loss": 0.02090956, + "step": 2729 + }, + { + "epoch": 5.46, + "grad_norm": 1.485260248184204, + "learning_rate": 2e-05, + "loss": 0.04206961, + "step": 2730 + }, + { + "epoch": 5.462, + "grad_norm": 0.9618636965751648, + "learning_rate": 2e-05, + "loss": 0.02777426, + "step": 2731 + }, + { + "epoch": 5.464, + "grad_norm": 1.663711428642273, + "learning_rate": 2e-05, + "loss": 0.04290383, + "step": 2732 + }, + { + "epoch": 5.466, + "grad_norm": 2.107558250427246, + "learning_rate": 2e-05, + "loss": 0.05302426, + "step": 2733 + }, + { + "epoch": 5.468, + "grad_norm": 1.3562395572662354, + "learning_rate": 2e-05, + "loss": 0.04122175, + "step": 2734 + }, + { + "epoch": 5.47, + "grad_norm": 1.3446310758590698, + "learning_rate": 2e-05, + "loss": 0.04664951, + "step": 2735 + }, + { + "epoch": 5.4719999999999995, + "grad_norm": 1.7537676095962524, + "learning_rate": 2e-05, + "loss": 0.05597606, + "step": 2736 + }, + { + "epoch": 5.474, + "grad_norm": 1.2579759359359741, + "learning_rate": 2e-05, + "loss": 0.04209308, + "step": 2737 + }, + { + "epoch": 5.476, + "grad_norm": 1.2744957208633423, + "learning_rate": 2e-05, + "loss": 0.03771916, + "step": 2738 + }, + { + "epoch": 5.478, + "grad_norm": 1.1946172714233398, + "learning_rate": 2e-05, + "loss": 0.04514795, + "step": 2739 + }, + { + "epoch": 5.48, + "grad_norm": 1.2385889291763306, + "learning_rate": 2e-05, + "loss": 0.03430807, + "step": 2740 + }, + { + "epoch": 5.482, + "grad_norm": 1.1070294380187988, + "learning_rate": 2e-05, + "loss": 0.02781132, + "step": 2741 + }, + { + "epoch": 5.484, + "grad_norm": 2.086735248565674, + "learning_rate": 2e-05, + "loss": 0.04818981, + "step": 2742 + }, + { + "epoch": 5.486, + "grad_norm": 1.2726064920425415, + "learning_rate": 2e-05, + "loss": 0.04557905, + "step": 2743 + }, + { + "epoch": 5.4879999999999995, + "grad_norm": 1.2682517766952515, + "learning_rate": 2e-05, + "loss": 0.03911246, + "step": 2744 + }, + { + "epoch": 5.49, + "grad_norm": 1.2447410821914673, + "learning_rate": 2e-05, + "loss": 0.04097291, + "step": 2745 + }, + { + "epoch": 5.492, + "grad_norm": 1.2082518339157104, + "learning_rate": 2e-05, + "loss": 0.04021576, + "step": 2746 + }, + { + "epoch": 5.494, + "grad_norm": 1.4685015678405762, + "learning_rate": 2e-05, + "loss": 0.04541424, + "step": 2747 + }, + { + "epoch": 5.496, + "grad_norm": 1.4241282939910889, + 
"learning_rate": 2e-05, + "loss": 0.03883777, + "step": 2748 + }, + { + "epoch": 5.498, + "grad_norm": 1.5642436742782593, + "learning_rate": 2e-05, + "loss": 0.05445066, + "step": 2749 + }, + { + "epoch": 5.5, + "grad_norm": 1.600178837776184, + "learning_rate": 2e-05, + "loss": 0.04767666, + "step": 2750 + }, + { + "epoch": 5.502, + "grad_norm": 1.0440183877944946, + "learning_rate": 2e-05, + "loss": 0.03446297, + "step": 2751 + }, + { + "epoch": 5.504, + "grad_norm": 1.2513372898101807, + "learning_rate": 2e-05, + "loss": 0.03565361, + "step": 2752 + }, + { + "epoch": 5.506, + "grad_norm": 1.2894079685211182, + "learning_rate": 2e-05, + "loss": 0.04235047, + "step": 2753 + }, + { + "epoch": 5.508, + "grad_norm": 1.8253742456436157, + "learning_rate": 2e-05, + "loss": 0.05126983, + "step": 2754 + }, + { + "epoch": 5.51, + "grad_norm": 1.881759762763977, + "learning_rate": 2e-05, + "loss": 0.05438966, + "step": 2755 + }, + { + "epoch": 5.5120000000000005, + "grad_norm": 0.9773890972137451, + "learning_rate": 2e-05, + "loss": 0.02625747, + "step": 2756 + }, + { + "epoch": 5.514, + "grad_norm": 1.1596041917800903, + "learning_rate": 2e-05, + "loss": 0.03821389, + "step": 2757 + }, + { + "epoch": 5.516, + "grad_norm": 1.3653874397277832, + "learning_rate": 2e-05, + "loss": 0.04838298, + "step": 2758 + }, + { + "epoch": 5.518, + "grad_norm": 1.4374557733535767, + "learning_rate": 2e-05, + "loss": 0.04565241, + "step": 2759 + }, + { + "epoch": 5.52, + "grad_norm": 1.6063563823699951, + "learning_rate": 2e-05, + "loss": 0.053575, + "step": 2760 + }, + { + "epoch": 5.522, + "grad_norm": 1.2797068357467651, + "learning_rate": 2e-05, + "loss": 0.03941024, + "step": 2761 + }, + { + "epoch": 5.524, + "grad_norm": 1.0527799129486084, + "learning_rate": 2e-05, + "loss": 0.01956801, + "step": 2762 + }, + { + "epoch": 5.526, + "grad_norm": 1.218973994255066, + "learning_rate": 2e-05, + "loss": 0.03965396, + "step": 2763 + }, + { + "epoch": 5.5280000000000005, + "grad_norm": 2.3557660579681396, + "learning_rate": 2e-05, + "loss": 0.03057607, + "step": 2764 + }, + { + "epoch": 5.53, + "grad_norm": 1.2339812517166138, + "learning_rate": 2e-05, + "loss": 0.0315393, + "step": 2765 + }, + { + "epoch": 5.532, + "grad_norm": 1.698705792427063, + "learning_rate": 2e-05, + "loss": 0.03224694, + "step": 2766 + }, + { + "epoch": 5.534, + "grad_norm": 1.9057179689407349, + "learning_rate": 2e-05, + "loss": 0.05384257, + "step": 2767 + }, + { + "epoch": 5.536, + "grad_norm": 1.3799241781234741, + "learning_rate": 2e-05, + "loss": 0.04452458, + "step": 2768 + }, + { + "epoch": 5.538, + "grad_norm": 1.2078452110290527, + "learning_rate": 2e-05, + "loss": 0.04307923, + "step": 2769 + }, + { + "epoch": 5.54, + "grad_norm": 1.1021215915679932, + "learning_rate": 2e-05, + "loss": 0.03390051, + "step": 2770 + }, + { + "epoch": 5.542, + "grad_norm": 0.9905539751052856, + "learning_rate": 2e-05, + "loss": 0.03263092, + "step": 2771 + }, + { + "epoch": 5.5440000000000005, + "grad_norm": 2.1359174251556396, + "learning_rate": 2e-05, + "loss": 0.05425924, + "step": 2772 + }, + { + "epoch": 5.546, + "grad_norm": 1.9481074810028076, + "learning_rate": 2e-05, + "loss": 0.05403948, + "step": 2773 + }, + { + "epoch": 5.548, + "grad_norm": 2.03695011138916, + "learning_rate": 2e-05, + "loss": 0.03124078, + "step": 2774 + }, + { + "epoch": 5.55, + "grad_norm": 2.420414447784424, + "learning_rate": 2e-05, + "loss": 0.05495863, + "step": 2775 + }, + { + "epoch": 5.552, + "grad_norm": 2.0198957920074463, + "learning_rate": 2e-05, + 
"loss": 0.05129372, + "step": 2776 + }, + { + "epoch": 5.554, + "grad_norm": 1.7094142436981201, + "learning_rate": 2e-05, + "loss": 0.0504851, + "step": 2777 + }, + { + "epoch": 5.556, + "grad_norm": 3.827301502227783, + "learning_rate": 2e-05, + "loss": 0.05337991, + "step": 2778 + }, + { + "epoch": 5.558, + "grad_norm": 1.3300044536590576, + "learning_rate": 2e-05, + "loss": 0.03548397, + "step": 2779 + }, + { + "epoch": 5.5600000000000005, + "grad_norm": 1.937286376953125, + "learning_rate": 2e-05, + "loss": 0.04464417, + "step": 2780 + }, + { + "epoch": 5.562, + "grad_norm": 2.370962142944336, + "learning_rate": 2e-05, + "loss": 0.05774143, + "step": 2781 + }, + { + "epoch": 5.564, + "grad_norm": 1.7345980405807495, + "learning_rate": 2e-05, + "loss": 0.04462703, + "step": 2782 + }, + { + "epoch": 5.566, + "grad_norm": 1.5614328384399414, + "learning_rate": 2e-05, + "loss": 0.04742298, + "step": 2783 + }, + { + "epoch": 5.568, + "grad_norm": 1.480557918548584, + "learning_rate": 2e-05, + "loss": 0.04577837, + "step": 2784 + }, + { + "epoch": 5.57, + "grad_norm": 1.257659912109375, + "learning_rate": 2e-05, + "loss": 0.03243734, + "step": 2785 + }, + { + "epoch": 5.572, + "grad_norm": 2.447787046432495, + "learning_rate": 2e-05, + "loss": 0.01604201, + "step": 2786 + }, + { + "epoch": 5.574, + "grad_norm": 1.1269577741622925, + "learning_rate": 2e-05, + "loss": 0.02630051, + "step": 2787 + }, + { + "epoch": 5.576, + "grad_norm": 1.0902881622314453, + "learning_rate": 2e-05, + "loss": 0.02891617, + "step": 2788 + }, + { + "epoch": 5.578, + "grad_norm": 2.013070821762085, + "learning_rate": 2e-05, + "loss": 0.04982309, + "step": 2789 + }, + { + "epoch": 5.58, + "grad_norm": 1.520807147026062, + "learning_rate": 2e-05, + "loss": 0.06273451, + "step": 2790 + }, + { + "epoch": 5.582, + "grad_norm": 1.0118508338928223, + "learning_rate": 2e-05, + "loss": 0.0323143, + "step": 2791 + }, + { + "epoch": 5.584, + "grad_norm": 1.589319109916687, + "learning_rate": 2e-05, + "loss": 0.04532817, + "step": 2792 + }, + { + "epoch": 5.586, + "grad_norm": 1.4283770322799683, + "learning_rate": 2e-05, + "loss": 0.03695945, + "step": 2793 + }, + { + "epoch": 5.588, + "grad_norm": 1.5165592432022095, + "learning_rate": 2e-05, + "loss": 0.04063965, + "step": 2794 + }, + { + "epoch": 5.59, + "grad_norm": 2.1748976707458496, + "learning_rate": 2e-05, + "loss": 0.0483332, + "step": 2795 + }, + { + "epoch": 5.592, + "grad_norm": 1.4957975149154663, + "learning_rate": 2e-05, + "loss": 0.03734215, + "step": 2796 + }, + { + "epoch": 5.594, + "grad_norm": 1.5057545900344849, + "learning_rate": 2e-05, + "loss": 0.0448417, + "step": 2797 + }, + { + "epoch": 5.596, + "grad_norm": 1.6289266347885132, + "learning_rate": 2e-05, + "loss": 0.04035756, + "step": 2798 + }, + { + "epoch": 5.598, + "grad_norm": 1.945152759552002, + "learning_rate": 2e-05, + "loss": 0.04505145, + "step": 2799 + }, + { + "epoch": 5.6, + "grad_norm": 3.9745194911956787, + "learning_rate": 2e-05, + "loss": 0.05695663, + "step": 2800 + }, + { + "epoch": 5.602, + "grad_norm": 1.1209356784820557, + "learning_rate": 2e-05, + "loss": 0.04382028, + "step": 2801 + }, + { + "epoch": 5.604, + "grad_norm": 1.7319954633712769, + "learning_rate": 2e-05, + "loss": 0.07861836, + "step": 2802 + }, + { + "epoch": 5.606, + "grad_norm": 1.2670855522155762, + "learning_rate": 2e-05, + "loss": 0.03723117, + "step": 2803 + }, + { + "epoch": 5.608, + "grad_norm": 1.5532779693603516, + "learning_rate": 2e-05, + "loss": 0.03009872, + "step": 2804 + }, + { + "epoch": 5.61, 
+ "grad_norm": 1.9622262716293335, + "learning_rate": 2e-05, + "loss": 0.06740649, + "step": 2805 + }, + { + "epoch": 5.612, + "grad_norm": 1.1806871891021729, + "learning_rate": 2e-05, + "loss": 0.03466694, + "step": 2806 + }, + { + "epoch": 5.614, + "grad_norm": 2.1496856212615967, + "learning_rate": 2e-05, + "loss": 0.060385, + "step": 2807 + }, + { + "epoch": 5.616, + "grad_norm": 1.7937800884246826, + "learning_rate": 2e-05, + "loss": 0.04280568, + "step": 2808 + }, + { + "epoch": 5.618, + "grad_norm": 1.664449691772461, + "learning_rate": 2e-05, + "loss": 0.0573493, + "step": 2809 + }, + { + "epoch": 5.62, + "grad_norm": 2.113910436630249, + "learning_rate": 2e-05, + "loss": 0.05896275, + "step": 2810 + }, + { + "epoch": 5.622, + "grad_norm": 1.4774582386016846, + "learning_rate": 2e-05, + "loss": 0.05064362, + "step": 2811 + }, + { + "epoch": 5.624, + "grad_norm": 1.077660083770752, + "learning_rate": 2e-05, + "loss": 0.03990199, + "step": 2812 + }, + { + "epoch": 5.626, + "grad_norm": 2.0504021644592285, + "learning_rate": 2e-05, + "loss": 0.04368483, + "step": 2813 + }, + { + "epoch": 5.628, + "grad_norm": 1.3828272819519043, + "learning_rate": 2e-05, + "loss": 0.0452172, + "step": 2814 + }, + { + "epoch": 5.63, + "grad_norm": 1.3679569959640503, + "learning_rate": 2e-05, + "loss": 0.04204818, + "step": 2815 + }, + { + "epoch": 5.632, + "grad_norm": 1.904268503189087, + "learning_rate": 2e-05, + "loss": 0.03786581, + "step": 2816 + }, + { + "epoch": 5.634, + "grad_norm": 1.5074198246002197, + "learning_rate": 2e-05, + "loss": 0.05239385, + "step": 2817 + }, + { + "epoch": 5.636, + "grad_norm": 1.603095531463623, + "learning_rate": 2e-05, + "loss": 0.04265279, + "step": 2818 + }, + { + "epoch": 5.638, + "grad_norm": 1.2010900974273682, + "learning_rate": 2e-05, + "loss": 0.0357244, + "step": 2819 + }, + { + "epoch": 5.64, + "grad_norm": 1.8262673616409302, + "learning_rate": 2e-05, + "loss": 0.05815496, + "step": 2820 + }, + { + "epoch": 5.642, + "grad_norm": 1.6634687185287476, + "learning_rate": 2e-05, + "loss": 0.04178239, + "step": 2821 + }, + { + "epoch": 5.644, + "grad_norm": 1.9540555477142334, + "learning_rate": 2e-05, + "loss": 0.0514649, + "step": 2822 + }, + { + "epoch": 5.646, + "grad_norm": 1.463693618774414, + "learning_rate": 2e-05, + "loss": 0.0404924, + "step": 2823 + }, + { + "epoch": 5.648, + "grad_norm": 2.447908639907837, + "learning_rate": 2e-05, + "loss": 0.03566689, + "step": 2824 + }, + { + "epoch": 5.65, + "grad_norm": 1.290624737739563, + "learning_rate": 2e-05, + "loss": 0.03801401, + "step": 2825 + }, + { + "epoch": 5.652, + "grad_norm": 1.4527415037155151, + "learning_rate": 2e-05, + "loss": 0.03635446, + "step": 2826 + }, + { + "epoch": 5.654, + "grad_norm": 1.6195863485336304, + "learning_rate": 2e-05, + "loss": 0.05585508, + "step": 2827 + }, + { + "epoch": 5.656, + "grad_norm": 1.6211501359939575, + "learning_rate": 2e-05, + "loss": 0.06746172, + "step": 2828 + }, + { + "epoch": 5.658, + "grad_norm": 1.346248745918274, + "learning_rate": 2e-05, + "loss": 0.04963518, + "step": 2829 + }, + { + "epoch": 5.66, + "grad_norm": 1.062984585762024, + "learning_rate": 2e-05, + "loss": 0.03912539, + "step": 2830 + }, + { + "epoch": 5.662, + "grad_norm": 1.6748669147491455, + "learning_rate": 2e-05, + "loss": 0.06159768, + "step": 2831 + }, + { + "epoch": 5.664, + "grad_norm": 1.3549000024795532, + "learning_rate": 2e-05, + "loss": 0.04779611, + "step": 2832 + }, + { + "epoch": 5.666, + "grad_norm": 1.2861626148223877, + "learning_rate": 2e-05, + "loss": 
0.03312145, + "step": 2833 + }, + { + "epoch": 5.668, + "grad_norm": 2.1982734203338623, + "learning_rate": 2e-05, + "loss": 0.05529946, + "step": 2834 + }, + { + "epoch": 5.67, + "grad_norm": 1.2915890216827393, + "learning_rate": 2e-05, + "loss": 0.03705323, + "step": 2835 + }, + { + "epoch": 5.672, + "grad_norm": 1.5575082302093506, + "learning_rate": 2e-05, + "loss": 0.03686603, + "step": 2836 + }, + { + "epoch": 5.674, + "grad_norm": 1.038826823234558, + "learning_rate": 2e-05, + "loss": 0.03793515, + "step": 2837 + }, + { + "epoch": 5.676, + "grad_norm": 1.6118172407150269, + "learning_rate": 2e-05, + "loss": 0.03095808, + "step": 2838 + }, + { + "epoch": 5.678, + "grad_norm": 1.6042510271072388, + "learning_rate": 2e-05, + "loss": 0.05740377, + "step": 2839 + }, + { + "epoch": 5.68, + "grad_norm": 1.7100886106491089, + "learning_rate": 2e-05, + "loss": 0.05264761, + "step": 2840 + }, + { + "epoch": 5.682, + "grad_norm": 1.575619101524353, + "learning_rate": 2e-05, + "loss": 0.03263854, + "step": 2841 + }, + { + "epoch": 5.684, + "grad_norm": 1.4796384572982788, + "learning_rate": 2e-05, + "loss": 0.04719574, + "step": 2842 + }, + { + "epoch": 5.686, + "grad_norm": 1.3210619688034058, + "learning_rate": 2e-05, + "loss": 0.0271394, + "step": 2843 + }, + { + "epoch": 5.688, + "grad_norm": 1.0833356380462646, + "learning_rate": 2e-05, + "loss": 0.03883443, + "step": 2844 + }, + { + "epoch": 5.6899999999999995, + "grad_norm": 2.5778627395629883, + "learning_rate": 2e-05, + "loss": 0.05159726, + "step": 2845 + }, + { + "epoch": 5.692, + "grad_norm": 2.201190948486328, + "learning_rate": 2e-05, + "loss": 0.04031905, + "step": 2846 + }, + { + "epoch": 5.694, + "grad_norm": 1.1688532829284668, + "learning_rate": 2e-05, + "loss": 0.03291119, + "step": 2847 + }, + { + "epoch": 5.696, + "grad_norm": 1.796081304550171, + "learning_rate": 2e-05, + "loss": 0.05050949, + "step": 2848 + }, + { + "epoch": 5.698, + "grad_norm": 1.016455888748169, + "learning_rate": 2e-05, + "loss": 0.03628581, + "step": 2849 + }, + { + "epoch": 5.7, + "grad_norm": 1.9447333812713623, + "learning_rate": 2e-05, + "loss": 0.04152183, + "step": 2850 + }, + { + "epoch": 5.702, + "grad_norm": 1.3478142023086548, + "learning_rate": 2e-05, + "loss": 0.03443387, + "step": 2851 + }, + { + "epoch": 5.704, + "grad_norm": 1.6059876680374146, + "learning_rate": 2e-05, + "loss": 0.05241629, + "step": 2852 + }, + { + "epoch": 5.7059999999999995, + "grad_norm": 1.2685903310775757, + "learning_rate": 2e-05, + "loss": 0.03086682, + "step": 2853 + }, + { + "epoch": 5.708, + "grad_norm": 2.984708547592163, + "learning_rate": 2e-05, + "loss": 0.05078296, + "step": 2854 + }, + { + "epoch": 5.71, + "grad_norm": 3.1346423625946045, + "learning_rate": 2e-05, + "loss": 0.04925366, + "step": 2855 + }, + { + "epoch": 5.712, + "grad_norm": 1.1678012609481812, + "learning_rate": 2e-05, + "loss": 0.03549577, + "step": 2856 + }, + { + "epoch": 5.714, + "grad_norm": 1.408694863319397, + "learning_rate": 2e-05, + "loss": 0.03885608, + "step": 2857 + }, + { + "epoch": 5.716, + "grad_norm": 0.9981144070625305, + "learning_rate": 2e-05, + "loss": 0.02968807, + "step": 2858 + }, + { + "epoch": 5.718, + "grad_norm": 1.307047724723816, + "learning_rate": 2e-05, + "loss": 0.03957972, + "step": 2859 + }, + { + "epoch": 5.72, + "grad_norm": 2.804670572280884, + "learning_rate": 2e-05, + "loss": 0.04513305, + "step": 2860 + }, + { + "epoch": 5.7219999999999995, + "grad_norm": 1.1364901065826416, + "learning_rate": 2e-05, + "loss": 0.03480239, + "step": 2861 + }, 
+ { + "epoch": 5.724, + "grad_norm": 1.2232093811035156, + "learning_rate": 2e-05, + "loss": 0.03162998, + "step": 2862 + }, + { + "epoch": 5.726, + "grad_norm": 1.298130989074707, + "learning_rate": 2e-05, + "loss": 0.04124122, + "step": 2863 + }, + { + "epoch": 5.728, + "grad_norm": 1.2249360084533691, + "learning_rate": 2e-05, + "loss": 0.04093556, + "step": 2864 + }, + { + "epoch": 5.73, + "grad_norm": 1.261306881904602, + "learning_rate": 2e-05, + "loss": 0.04419369, + "step": 2865 + }, + { + "epoch": 5.732, + "grad_norm": 1.0616569519042969, + "learning_rate": 2e-05, + "loss": 0.02921419, + "step": 2866 + }, + { + "epoch": 5.734, + "grad_norm": 1.7513487339019775, + "learning_rate": 2e-05, + "loss": 0.03050586, + "step": 2867 + }, + { + "epoch": 5.736, + "grad_norm": 1.4477100372314453, + "learning_rate": 2e-05, + "loss": 0.0365976, + "step": 2868 + }, + { + "epoch": 5.7379999999999995, + "grad_norm": 2.1464011669158936, + "learning_rate": 2e-05, + "loss": 0.05555936, + "step": 2869 + }, + { + "epoch": 5.74, + "grad_norm": 1.7182115316390991, + "learning_rate": 2e-05, + "loss": 0.04245908, + "step": 2870 + }, + { + "epoch": 5.742, + "grad_norm": 1.0961130857467651, + "learning_rate": 2e-05, + "loss": 0.03913336, + "step": 2871 + }, + { + "epoch": 5.744, + "grad_norm": 1.87662935256958, + "learning_rate": 2e-05, + "loss": 0.04560525, + "step": 2872 + }, + { + "epoch": 5.746, + "grad_norm": 1.343624234199524, + "learning_rate": 2e-05, + "loss": 0.03066914, + "step": 2873 + }, + { + "epoch": 5.748, + "grad_norm": 2.7794783115386963, + "learning_rate": 2e-05, + "loss": 0.05809136, + "step": 2874 + }, + { + "epoch": 5.75, + "grad_norm": 1.3323760032653809, + "learning_rate": 2e-05, + "loss": 0.05136766, + "step": 2875 + }, + { + "epoch": 5.752, + "grad_norm": 1.9432049989700317, + "learning_rate": 2e-05, + "loss": 0.05197339, + "step": 2876 + }, + { + "epoch": 5.754, + "grad_norm": 1.1205425262451172, + "learning_rate": 2e-05, + "loss": 0.03465851, + "step": 2877 + }, + { + "epoch": 5.756, + "grad_norm": 1.6224393844604492, + "learning_rate": 2e-05, + "loss": 0.03670148, + "step": 2878 + }, + { + "epoch": 5.758, + "grad_norm": 1.3828800916671753, + "learning_rate": 2e-05, + "loss": 0.04724912, + "step": 2879 + }, + { + "epoch": 5.76, + "grad_norm": 1.6124157905578613, + "learning_rate": 2e-05, + "loss": 0.03899434, + "step": 2880 + }, + { + "epoch": 5.7620000000000005, + "grad_norm": 1.2006202936172485, + "learning_rate": 2e-05, + "loss": 0.04298317, + "step": 2881 + }, + { + "epoch": 5.764, + "grad_norm": 1.8390953540802002, + "learning_rate": 2e-05, + "loss": 0.04603451, + "step": 2882 + }, + { + "epoch": 5.766, + "grad_norm": 3.418121099472046, + "learning_rate": 2e-05, + "loss": 0.03081621, + "step": 2883 + }, + { + "epoch": 5.768, + "grad_norm": 0.9251627922058105, + "learning_rate": 2e-05, + "loss": 0.02699123, + "step": 2884 + }, + { + "epoch": 5.77, + "grad_norm": 1.5069046020507812, + "learning_rate": 2e-05, + "loss": 0.03988301, + "step": 2885 + }, + { + "epoch": 5.772, + "grad_norm": 2.365957498550415, + "learning_rate": 2e-05, + "loss": 0.03983999, + "step": 2886 + }, + { + "epoch": 5.774, + "grad_norm": 1.5599316358566284, + "learning_rate": 2e-05, + "loss": 0.04989076, + "step": 2887 + }, + { + "epoch": 5.776, + "grad_norm": 1.1228059530258179, + "learning_rate": 2e-05, + "loss": 0.03197101, + "step": 2888 + }, + { + "epoch": 5.7780000000000005, + "grad_norm": 1.517126202583313, + "learning_rate": 2e-05, + "loss": 0.03176156, + "step": 2889 + }, + { + "epoch": 5.78, + 
"grad_norm": 1.023742914199829, + "learning_rate": 2e-05, + "loss": 0.035194, + "step": 2890 + }, + { + "epoch": 5.782, + "grad_norm": 1.5021580457687378, + "learning_rate": 2e-05, + "loss": 0.0346195, + "step": 2891 + }, + { + "epoch": 5.784, + "grad_norm": 1.5000685453414917, + "learning_rate": 2e-05, + "loss": 0.03589319, + "step": 2892 + }, + { + "epoch": 5.786, + "grad_norm": 1.2418376207351685, + "learning_rate": 2e-05, + "loss": 0.03027872, + "step": 2893 + }, + { + "epoch": 5.788, + "grad_norm": 1.425329566001892, + "learning_rate": 2e-05, + "loss": 0.03400642, + "step": 2894 + }, + { + "epoch": 5.79, + "grad_norm": 1.4583309888839722, + "learning_rate": 2e-05, + "loss": 0.03597549, + "step": 2895 + }, + { + "epoch": 5.792, + "grad_norm": 1.5168836116790771, + "learning_rate": 2e-05, + "loss": 0.02261955, + "step": 2896 + }, + { + "epoch": 5.7940000000000005, + "grad_norm": 1.7382346391677856, + "learning_rate": 2e-05, + "loss": 0.03887838, + "step": 2897 + }, + { + "epoch": 5.796, + "grad_norm": 3.2048113346099854, + "learning_rate": 2e-05, + "loss": 0.07382256, + "step": 2898 + }, + { + "epoch": 5.798, + "grad_norm": 1.1313360929489136, + "learning_rate": 2e-05, + "loss": 0.02457133, + "step": 2899 + }, + { + "epoch": 5.8, + "grad_norm": 2.6181559562683105, + "learning_rate": 2e-05, + "loss": 0.04528946, + "step": 2900 + }, + { + "epoch": 5.802, + "grad_norm": 2.0275521278381348, + "learning_rate": 2e-05, + "loss": 0.04249582, + "step": 2901 + }, + { + "epoch": 5.804, + "grad_norm": 3.3684051036834717, + "learning_rate": 2e-05, + "loss": 0.04949026, + "step": 2902 + }, + { + "epoch": 5.806, + "grad_norm": 1.1052809953689575, + "learning_rate": 2e-05, + "loss": 0.03748845, + "step": 2903 + }, + { + "epoch": 5.808, + "grad_norm": 1.147633671760559, + "learning_rate": 2e-05, + "loss": 0.02890555, + "step": 2904 + }, + { + "epoch": 5.8100000000000005, + "grad_norm": 2.285583257675171, + "learning_rate": 2e-05, + "loss": 0.04941707, + "step": 2905 + }, + { + "epoch": 5.812, + "grad_norm": 2.404172658920288, + "learning_rate": 2e-05, + "loss": 0.02909764, + "step": 2906 + }, + { + "epoch": 5.814, + "grad_norm": 2.2711870670318604, + "learning_rate": 2e-05, + "loss": 0.06077723, + "step": 2907 + }, + { + "epoch": 5.816, + "grad_norm": 1.342374563217163, + "learning_rate": 2e-05, + "loss": 0.03685233, + "step": 2908 + }, + { + "epoch": 5.818, + "grad_norm": 1.1346458196640015, + "learning_rate": 2e-05, + "loss": 0.02703713, + "step": 2909 + }, + { + "epoch": 5.82, + "grad_norm": 0.9739170670509338, + "learning_rate": 2e-05, + "loss": 0.0288388, + "step": 2910 + }, + { + "epoch": 5.822, + "grad_norm": 1.538444995880127, + "learning_rate": 2e-05, + "loss": 0.0610106, + "step": 2911 + }, + { + "epoch": 5.824, + "grad_norm": 1.5088497400283813, + "learning_rate": 2e-05, + "loss": 0.03801049, + "step": 2912 + }, + { + "epoch": 5.826, + "grad_norm": 2.691092014312744, + "learning_rate": 2e-05, + "loss": 0.05319805, + "step": 2913 + }, + { + "epoch": 5.828, + "grad_norm": 1.7784383296966553, + "learning_rate": 2e-05, + "loss": 0.03665651, + "step": 2914 + }, + { + "epoch": 5.83, + "grad_norm": 2.699039936065674, + "learning_rate": 2e-05, + "loss": 0.06110234, + "step": 2915 + }, + { + "epoch": 5.832, + "grad_norm": 1.0186821222305298, + "learning_rate": 2e-05, + "loss": 0.0389097, + "step": 2916 + }, + { + "epoch": 5.834, + "grad_norm": 1.5855401754379272, + "learning_rate": 2e-05, + "loss": 0.05706736, + "step": 2917 + }, + { + "epoch": 5.836, + "grad_norm": 1.5186421871185303, + 
"learning_rate": 2e-05, + "loss": 0.04286093, + "step": 2918 + }, + { + "epoch": 5.838, + "grad_norm": 1.8473025560379028, + "learning_rate": 2e-05, + "loss": 0.04118644, + "step": 2919 + }, + { + "epoch": 5.84, + "grad_norm": 1.2848973274230957, + "learning_rate": 2e-05, + "loss": 0.03644439, + "step": 2920 + }, + { + "epoch": 5.842, + "grad_norm": 1.198118805885315, + "learning_rate": 2e-05, + "loss": 0.04760751, + "step": 2921 + }, + { + "epoch": 5.844, + "grad_norm": 1.7948402166366577, + "learning_rate": 2e-05, + "loss": 0.05257875, + "step": 2922 + }, + { + "epoch": 5.846, + "grad_norm": 1.2989088296890259, + "learning_rate": 2e-05, + "loss": 0.0287516, + "step": 2923 + }, + { + "epoch": 5.848, + "grad_norm": 0.9115301966667175, + "learning_rate": 2e-05, + "loss": 0.0334456, + "step": 2924 + }, + { + "epoch": 5.85, + "grad_norm": 11.285009384155273, + "learning_rate": 2e-05, + "loss": 0.03400384, + "step": 2925 + }, + { + "epoch": 5.852, + "grad_norm": 5.87296199798584, + "learning_rate": 2e-05, + "loss": 0.10033739, + "step": 2926 + }, + { + "epoch": 5.854, + "grad_norm": 1.882878065109253, + "learning_rate": 2e-05, + "loss": 0.04880483, + "step": 2927 + }, + { + "epoch": 5.856, + "grad_norm": 11.981039047241211, + "learning_rate": 2e-05, + "loss": 0.166832, + "step": 2928 + }, + { + "epoch": 5.858, + "grad_norm": 1.1964349746704102, + "learning_rate": 2e-05, + "loss": 0.02282231, + "step": 2929 + }, + { + "epoch": 5.86, + "grad_norm": 3.9903624057769775, + "learning_rate": 2e-05, + "loss": 0.06060695, + "step": 2930 + }, + { + "epoch": 5.862, + "grad_norm": 1.5105197429656982, + "learning_rate": 2e-05, + "loss": 0.04793704, + "step": 2931 + }, + { + "epoch": 5.864, + "grad_norm": 1.41489839553833, + "learning_rate": 2e-05, + "loss": 0.03938361, + "step": 2932 + }, + { + "epoch": 5.866, + "grad_norm": 2.26653790473938, + "learning_rate": 2e-05, + "loss": 0.0598973, + "step": 2933 + }, + { + "epoch": 5.868, + "grad_norm": 2.4137723445892334, + "learning_rate": 2e-05, + "loss": 0.05444659, + "step": 2934 + }, + { + "epoch": 5.87, + "grad_norm": 1.2001371383666992, + "learning_rate": 2e-05, + "loss": 0.03985805, + "step": 2935 + }, + { + "epoch": 5.872, + "grad_norm": 1.5971794128417969, + "learning_rate": 2e-05, + "loss": 0.06079929, + "step": 2936 + }, + { + "epoch": 5.874, + "grad_norm": 1.318945050239563, + "learning_rate": 2e-05, + "loss": 0.04854849, + "step": 2937 + }, + { + "epoch": 5.876, + "grad_norm": 1.246225118637085, + "learning_rate": 2e-05, + "loss": 0.03397983, + "step": 2938 + }, + { + "epoch": 5.878, + "grad_norm": 2.4939327239990234, + "learning_rate": 2e-05, + "loss": 0.04911096, + "step": 2939 + }, + { + "epoch": 5.88, + "grad_norm": 1.0084093809127808, + "learning_rate": 2e-05, + "loss": 0.03702591, + "step": 2940 + }, + { + "epoch": 5.882, + "grad_norm": 1.044545292854309, + "learning_rate": 2e-05, + "loss": 0.03148817, + "step": 2941 + }, + { + "epoch": 5.884, + "grad_norm": 1.6608963012695312, + "learning_rate": 2e-05, + "loss": 0.05824044, + "step": 2942 + }, + { + "epoch": 5.886, + "grad_norm": 2.168658494949341, + "learning_rate": 2e-05, + "loss": 0.05980438, + "step": 2943 + }, + { + "epoch": 5.888, + "grad_norm": 1.3249305486679077, + "learning_rate": 2e-05, + "loss": 0.04346288, + "step": 2944 + }, + { + "epoch": 5.89, + "grad_norm": 1.1912715435028076, + "learning_rate": 2e-05, + "loss": 0.03074624, + "step": 2945 + }, + { + "epoch": 5.892, + "grad_norm": 2.0439677238464355, + "learning_rate": 2e-05, + "loss": 0.06351136, + "step": 2946 + }, + { + 
"epoch": 5.894, + "grad_norm": 1.2318497896194458, + "learning_rate": 2e-05, + "loss": 0.03561502, + "step": 2947 + }, + { + "epoch": 5.896, + "grad_norm": 1.001007318496704, + "learning_rate": 2e-05, + "loss": 0.0264318, + "step": 2948 + }, + { + "epoch": 5.898, + "grad_norm": 1.6429765224456787, + "learning_rate": 2e-05, + "loss": 0.05054122, + "step": 2949 + }, + { + "epoch": 5.9, + "grad_norm": 1.8154584169387817, + "learning_rate": 2e-05, + "loss": 0.05611707, + "step": 2950 + }, + { + "epoch": 5.902, + "grad_norm": 1.8020012378692627, + "learning_rate": 2e-05, + "loss": 0.0445966, + "step": 2951 + }, + { + "epoch": 5.904, + "grad_norm": 1.1561640501022339, + "learning_rate": 2e-05, + "loss": 0.042308, + "step": 2952 + }, + { + "epoch": 5.906, + "grad_norm": 1.3049297332763672, + "learning_rate": 2e-05, + "loss": 0.02226608, + "step": 2953 + }, + { + "epoch": 5.908, + "grad_norm": 1.9397492408752441, + "learning_rate": 2e-05, + "loss": 0.04040186, + "step": 2954 + }, + { + "epoch": 5.91, + "grad_norm": 2.0794615745544434, + "learning_rate": 2e-05, + "loss": 0.04332725, + "step": 2955 + }, + { + "epoch": 5.912, + "grad_norm": 1.1256343126296997, + "learning_rate": 2e-05, + "loss": 0.03192374, + "step": 2956 + }, + { + "epoch": 5.914, + "grad_norm": 1.629341959953308, + "learning_rate": 2e-05, + "loss": 0.03662695, + "step": 2957 + }, + { + "epoch": 5.916, + "grad_norm": 1.8990249633789062, + "learning_rate": 2e-05, + "loss": 0.04180484, + "step": 2958 + }, + { + "epoch": 5.918, + "grad_norm": 1.3357110023498535, + "learning_rate": 2e-05, + "loss": 0.05098827, + "step": 2959 + }, + { + "epoch": 5.92, + "grad_norm": 2.420962333679199, + "learning_rate": 2e-05, + "loss": 0.04422121, + "step": 2960 + }, + { + "epoch": 5.922, + "grad_norm": 1.0313847064971924, + "learning_rate": 2e-05, + "loss": 0.02736597, + "step": 2961 + }, + { + "epoch": 5.924, + "grad_norm": 2.0497448444366455, + "learning_rate": 2e-05, + "loss": 0.04987185, + "step": 2962 + }, + { + "epoch": 5.926, + "grad_norm": 1.508475661277771, + "learning_rate": 2e-05, + "loss": 0.02963543, + "step": 2963 + }, + { + "epoch": 5.928, + "grad_norm": 2.0732882022857666, + "learning_rate": 2e-05, + "loss": 0.04629382, + "step": 2964 + }, + { + "epoch": 5.93, + "grad_norm": 1.1313819885253906, + "learning_rate": 2e-05, + "loss": 0.02837754, + "step": 2965 + }, + { + "epoch": 5.932, + "grad_norm": 1.3886733055114746, + "learning_rate": 2e-05, + "loss": 0.03276101, + "step": 2966 + }, + { + "epoch": 5.934, + "grad_norm": 1.8589578866958618, + "learning_rate": 2e-05, + "loss": 0.05601101, + "step": 2967 + }, + { + "epoch": 5.936, + "grad_norm": 1.4239290952682495, + "learning_rate": 2e-05, + "loss": 0.03772479, + "step": 2968 + }, + { + "epoch": 5.938, + "grad_norm": 1.3066622018814087, + "learning_rate": 2e-05, + "loss": 0.03739661, + "step": 2969 + }, + { + "epoch": 5.9399999999999995, + "grad_norm": 1.04325270652771, + "learning_rate": 2e-05, + "loss": 0.03995933, + "step": 2970 + }, + { + "epoch": 5.942, + "grad_norm": 1.5223791599273682, + "learning_rate": 2e-05, + "loss": 0.04225756, + "step": 2971 + }, + { + "epoch": 5.944, + "grad_norm": 2.6733899116516113, + "learning_rate": 2e-05, + "loss": 0.04441323, + "step": 2972 + }, + { + "epoch": 5.946, + "grad_norm": 1.6275644302368164, + "learning_rate": 2e-05, + "loss": 0.04762278, + "step": 2973 + }, + { + "epoch": 5.948, + "grad_norm": 1.2216103076934814, + "learning_rate": 2e-05, + "loss": 0.03107932, + "step": 2974 + }, + { + "epoch": 5.95, + "grad_norm": 1.3730415105819702, + 
"learning_rate": 2e-05, + "loss": 0.02952298, + "step": 2975 + }, + { + "epoch": 5.952, + "grad_norm": 1.223415493965149, + "learning_rate": 2e-05, + "loss": 0.03430998, + "step": 2976 + }, + { + "epoch": 5.954, + "grad_norm": 1.5398921966552734, + "learning_rate": 2e-05, + "loss": 0.05110143, + "step": 2977 + }, + { + "epoch": 5.9559999999999995, + "grad_norm": 1.7357475757598877, + "learning_rate": 2e-05, + "loss": 0.04328895, + "step": 2978 + }, + { + "epoch": 5.958, + "grad_norm": 1.7618504762649536, + "learning_rate": 2e-05, + "loss": 0.05958952, + "step": 2979 + }, + { + "epoch": 5.96, + "grad_norm": 2.2981998920440674, + "learning_rate": 2e-05, + "loss": 0.06046209, + "step": 2980 + }, + { + "epoch": 5.962, + "grad_norm": 1.7709860801696777, + "learning_rate": 2e-05, + "loss": 0.05310146, + "step": 2981 + }, + { + "epoch": 5.964, + "grad_norm": 1.4711432456970215, + "learning_rate": 2e-05, + "loss": 0.03057422, + "step": 2982 + }, + { + "epoch": 5.966, + "grad_norm": 2.1963133811950684, + "learning_rate": 2e-05, + "loss": 0.07748948, + "step": 2983 + }, + { + "epoch": 5.968, + "grad_norm": 1.2445276975631714, + "learning_rate": 2e-05, + "loss": 0.03456353, + "step": 2984 + }, + { + "epoch": 5.97, + "grad_norm": 1.2059154510498047, + "learning_rate": 2e-05, + "loss": 0.03177342, + "step": 2985 + }, + { + "epoch": 5.9719999999999995, + "grad_norm": 2.007486581802368, + "learning_rate": 2e-05, + "loss": 0.03973828, + "step": 2986 + }, + { + "epoch": 5.974, + "grad_norm": 1.5894452333450317, + "learning_rate": 2e-05, + "loss": 0.04387932, + "step": 2987 + }, + { + "epoch": 5.976, + "grad_norm": 1.9093830585479736, + "learning_rate": 2e-05, + "loss": 0.04532828, + "step": 2988 + }, + { + "epoch": 5.978, + "grad_norm": 1.9382355213165283, + "learning_rate": 2e-05, + "loss": 0.05527761, + "step": 2989 + }, + { + "epoch": 5.98, + "grad_norm": 1.3034459352493286, + "learning_rate": 2e-05, + "loss": 0.05401599, + "step": 2990 + }, + { + "epoch": 5.982, + "grad_norm": 1.4245580434799194, + "learning_rate": 2e-05, + "loss": 0.04059825, + "step": 2991 + }, + { + "epoch": 5.984, + "grad_norm": 2.453721523284912, + "learning_rate": 2e-05, + "loss": 0.04975076, + "step": 2992 + }, + { + "epoch": 5.986, + "grad_norm": 1.6775089502334595, + "learning_rate": 2e-05, + "loss": 0.03426765, + "step": 2993 + }, + { + "epoch": 5.9879999999999995, + "grad_norm": 1.31791090965271, + "learning_rate": 2e-05, + "loss": 0.0269012, + "step": 2994 + }, + { + "epoch": 5.99, + "grad_norm": 1.2786681652069092, + "learning_rate": 2e-05, + "loss": 0.04798415, + "step": 2995 + }, + { + "epoch": 5.992, + "grad_norm": 1.375746488571167, + "learning_rate": 2e-05, + "loss": 0.03704346, + "step": 2996 + }, + { + "epoch": 5.994, + "grad_norm": 2.2422120571136475, + "learning_rate": 2e-05, + "loss": 0.03538365, + "step": 2997 + }, + { + "epoch": 5.996, + "grad_norm": 1.6498358249664307, + "learning_rate": 2e-05, + "loss": 0.05676982, + "step": 2998 + }, + { + "epoch": 5.998, + "grad_norm": 0.9439936280250549, + "learning_rate": 2e-05, + "loss": 0.03124115, + "step": 2999 + }, + { + "epoch": 6.0, + "grad_norm": 1.866321086883545, + "learning_rate": 2e-05, + "loss": 0.04968661, + "step": 3000 + }, + { + "epoch": 6.0, + "eval_performance": { + "AngleClassification_1": 0.996, + "AngleClassification_2": 0.996, + "AngleClassification_3": 0.7944111776447106, + "Equal_1": 0.99, + "Equal_2": 0.9121756487025948, + "Equal_3": 0.7964071856287425, + "LineComparison_1": 0.986, + "LineComparison_2": 0.9960079840319361, + "LineComparison_3": 
0.9720558882235529, + "Parallel_1": 0.9819639278557114, + "Parallel_2": 0.9959919839679359, + "Parallel_3": 0.986, + "Perpendicular_1": 0.984, + "Perpendicular_2": 0.726, + "Perpendicular_3": 0.3717434869739479, + "PointLiesOnCircle_1": 0.9979959919839679, + "PointLiesOnCircle_2": 0.9976666666666667, + "PointLiesOnCircle_3": 0.9796, + "PointLiesOnLine_1": 0.9879759519038076, + "PointLiesOnLine_2": 0.9839679358717435, + "PointLiesOnLine_3": 0.7724550898203593 + }, + "eval_runtime": 319.7917, + "eval_samples_per_second": 32.834, + "eval_steps_per_second": 0.657, + "step": 3000 + }, + { + "epoch": 6.002, + "grad_norm": 1.5802181959152222, + "learning_rate": 2e-05, + "loss": 0.02955408, + "step": 3001 + }, + { + "epoch": 6.004, + "grad_norm": 1.1106842756271362, + "learning_rate": 2e-05, + "loss": 0.03170551, + "step": 3002 + }, + { + "epoch": 6.006, + "grad_norm": 1.0765694379806519, + "learning_rate": 2e-05, + "loss": 0.032282, + "step": 3003 + }, + { + "epoch": 6.008, + "grad_norm": 1.2182323932647705, + "learning_rate": 2e-05, + "loss": 0.04121379, + "step": 3004 + }, + { + "epoch": 6.01, + "grad_norm": 1.2825361490249634, + "learning_rate": 2e-05, + "loss": 0.04134614, + "step": 3005 + }, + { + "epoch": 6.012, + "grad_norm": 2.202105760574341, + "learning_rate": 2e-05, + "loss": 0.04897365, + "step": 3006 + }, + { + "epoch": 6.014, + "grad_norm": 2.587782382965088, + "learning_rate": 2e-05, + "loss": 0.06023916, + "step": 3007 + }, + { + "epoch": 6.016, + "grad_norm": 2.555541753768921, + "learning_rate": 2e-05, + "loss": 0.03511619, + "step": 3008 + }, + { + "epoch": 6.018, + "grad_norm": 1.417189598083496, + "learning_rate": 2e-05, + "loss": 0.05029716, + "step": 3009 + }, + { + "epoch": 6.02, + "grad_norm": 1.7279114723205566, + "learning_rate": 2e-05, + "loss": 0.0623187, + "step": 3010 + }, + { + "epoch": 6.022, + "grad_norm": 1.6738758087158203, + "learning_rate": 2e-05, + "loss": 0.03575193, + "step": 3011 + }, + { + "epoch": 6.024, + "grad_norm": 1.7566310167312622, + "learning_rate": 2e-05, + "loss": 0.05152703, + "step": 3012 + }, + { + "epoch": 6.026, + "grad_norm": 1.1162376403808594, + "learning_rate": 2e-05, + "loss": 0.03532304, + "step": 3013 + }, + { + "epoch": 6.028, + "grad_norm": 1.4076039791107178, + "learning_rate": 2e-05, + "loss": 0.04677567, + "step": 3014 + }, + { + "epoch": 6.03, + "grad_norm": 1.133365273475647, + "learning_rate": 2e-05, + "loss": 0.02753913, + "step": 3015 + }, + { + "epoch": 6.032, + "grad_norm": 1.4458855390548706, + "learning_rate": 2e-05, + "loss": 0.03868459, + "step": 3016 + }, + { + "epoch": 6.034, + "grad_norm": 2.3904032707214355, + "learning_rate": 2e-05, + "loss": 0.03503593, + "step": 3017 + }, + { + "epoch": 6.036, + "grad_norm": 1.949507713317871, + "learning_rate": 2e-05, + "loss": 0.04720395, + "step": 3018 + }, + { + "epoch": 6.038, + "grad_norm": 1.8610894680023193, + "learning_rate": 2e-05, + "loss": 0.05176724, + "step": 3019 + }, + { + "epoch": 6.04, + "grad_norm": 1.7843246459960938, + "learning_rate": 2e-05, + "loss": 0.0451156, + "step": 3020 + }, + { + "epoch": 6.042, + "grad_norm": 1.3640440702438354, + "learning_rate": 2e-05, + "loss": 0.04446036, + "step": 3021 + }, + { + "epoch": 6.044, + "grad_norm": 1.3423798084259033, + "learning_rate": 2e-05, + "loss": 0.04165779, + "step": 3022 + }, + { + "epoch": 6.046, + "grad_norm": 1.4849580526351929, + "learning_rate": 2e-05, + "loss": 0.03914479, + "step": 3023 + }, + { + "epoch": 6.048, + "grad_norm": 1.4148266315460205, + "learning_rate": 2e-05, + "loss": 0.03579354, + 
"step": 3024 + }, + { + "epoch": 6.05, + "grad_norm": 1.197055459022522, + "learning_rate": 2e-05, + "loss": 0.04568943, + "step": 3025 + }, + { + "epoch": 6.052, + "grad_norm": 0.9507676959037781, + "learning_rate": 2e-05, + "loss": 0.03244974, + "step": 3026 + }, + { + "epoch": 6.054, + "grad_norm": 1.1743727922439575, + "learning_rate": 2e-05, + "loss": 0.04001627, + "step": 3027 + }, + { + "epoch": 6.056, + "grad_norm": 1.06550931930542, + "learning_rate": 2e-05, + "loss": 0.0358919, + "step": 3028 + }, + { + "epoch": 6.058, + "grad_norm": 1.3072738647460938, + "learning_rate": 2e-05, + "loss": 0.04308274, + "step": 3029 + }, + { + "epoch": 6.06, + "grad_norm": 1.368091106414795, + "learning_rate": 2e-05, + "loss": 0.0464918, + "step": 3030 + }, + { + "epoch": 6.062, + "grad_norm": 1.4518086910247803, + "learning_rate": 2e-05, + "loss": 0.04207553, + "step": 3031 + }, + { + "epoch": 6.064, + "grad_norm": 0.9633432030677795, + "learning_rate": 2e-05, + "loss": 0.02657059, + "step": 3032 + }, + { + "epoch": 6.066, + "grad_norm": 2.7227838039398193, + "learning_rate": 2e-05, + "loss": 0.04998293, + "step": 3033 + }, + { + "epoch": 6.068, + "grad_norm": 3.5516669750213623, + "learning_rate": 2e-05, + "loss": 0.03489901, + "step": 3034 + }, + { + "epoch": 6.07, + "grad_norm": 1.5370341539382935, + "learning_rate": 2e-05, + "loss": 0.04453697, + "step": 3035 + }, + { + "epoch": 6.072, + "grad_norm": 1.7417138814926147, + "learning_rate": 2e-05, + "loss": 0.03877822, + "step": 3036 + }, + { + "epoch": 6.074, + "grad_norm": 1.3443036079406738, + "learning_rate": 2e-05, + "loss": 0.03986344, + "step": 3037 + }, + { + "epoch": 6.076, + "grad_norm": 1.4446654319763184, + "learning_rate": 2e-05, + "loss": 0.03199197, + "step": 3038 + }, + { + "epoch": 6.078, + "grad_norm": 1.7269781827926636, + "learning_rate": 2e-05, + "loss": 0.04078665, + "step": 3039 + }, + { + "epoch": 6.08, + "grad_norm": 1.3139877319335938, + "learning_rate": 2e-05, + "loss": 0.04576064, + "step": 3040 + }, + { + "epoch": 6.082, + "grad_norm": 1.6032078266143799, + "learning_rate": 2e-05, + "loss": 0.03815749, + "step": 3041 + }, + { + "epoch": 6.084, + "grad_norm": 2.1997110843658447, + "learning_rate": 2e-05, + "loss": 0.04005154, + "step": 3042 + }, + { + "epoch": 6.086, + "grad_norm": 1.4301249980926514, + "learning_rate": 2e-05, + "loss": 0.03681426, + "step": 3043 + }, + { + "epoch": 6.088, + "grad_norm": 0.9475594758987427, + "learning_rate": 2e-05, + "loss": 0.03235501, + "step": 3044 + }, + { + "epoch": 6.09, + "grad_norm": 1.1631615161895752, + "learning_rate": 2e-05, + "loss": 0.04111622, + "step": 3045 + }, + { + "epoch": 6.092, + "grad_norm": 1.8243446350097656, + "learning_rate": 2e-05, + "loss": 0.03791495, + "step": 3046 + }, + { + "epoch": 6.094, + "grad_norm": 1.5762276649475098, + "learning_rate": 2e-05, + "loss": 0.05622587, + "step": 3047 + }, + { + "epoch": 6.096, + "grad_norm": 1.136772632598877, + "learning_rate": 2e-05, + "loss": 0.03879281, + "step": 3048 + }, + { + "epoch": 6.098, + "grad_norm": 1.1505334377288818, + "learning_rate": 2e-05, + "loss": 0.03384344, + "step": 3049 + }, + { + "epoch": 6.1, + "grad_norm": 1.9480074644088745, + "learning_rate": 2e-05, + "loss": 0.03820986, + "step": 3050 + }, + { + "epoch": 6.102, + "grad_norm": 1.3107194900512695, + "learning_rate": 2e-05, + "loss": 0.03161674, + "step": 3051 + }, + { + "epoch": 6.104, + "grad_norm": 1.3566876649856567, + "learning_rate": 2e-05, + "loss": 0.0397422, + "step": 3052 + }, + { + "epoch": 6.106, + "grad_norm": 
2.022648811340332, + "learning_rate": 2e-05, + "loss": 0.05766981, + "step": 3053 + }, + { + "epoch": 6.108, + "grad_norm": 1.7005928754806519, + "learning_rate": 2e-05, + "loss": 0.03167719, + "step": 3054 + }, + { + "epoch": 6.11, + "grad_norm": 3.042736768722534, + "learning_rate": 2e-05, + "loss": 0.04883799, + "step": 3055 + }, + { + "epoch": 6.112, + "grad_norm": 1.171810269355774, + "learning_rate": 2e-05, + "loss": 0.03449272, + "step": 3056 + }, + { + "epoch": 6.114, + "grad_norm": 1.3196911811828613, + "learning_rate": 2e-05, + "loss": 0.03630933, + "step": 3057 + }, + { + "epoch": 6.116, + "grad_norm": 1.3632961511611938, + "learning_rate": 2e-05, + "loss": 0.02563158, + "step": 3058 + }, + { + "epoch": 6.118, + "grad_norm": 1.1939222812652588, + "learning_rate": 2e-05, + "loss": 0.03741849, + "step": 3059 + }, + { + "epoch": 6.12, + "grad_norm": 1.714534044265747, + "learning_rate": 2e-05, + "loss": 0.05540673, + "step": 3060 + }, + { + "epoch": 6.122, + "grad_norm": 1.329563021659851, + "learning_rate": 2e-05, + "loss": 0.04123599, + "step": 3061 + }, + { + "epoch": 6.124, + "grad_norm": 1.8710254430770874, + "learning_rate": 2e-05, + "loss": 0.04224695, + "step": 3062 + }, + { + "epoch": 6.126, + "grad_norm": 1.2661737203598022, + "learning_rate": 2e-05, + "loss": 0.03803711, + "step": 3063 + }, + { + "epoch": 6.128, + "grad_norm": 1.2511358261108398, + "learning_rate": 2e-05, + "loss": 0.04244345, + "step": 3064 + }, + { + "epoch": 6.13, + "grad_norm": 0.9187764525413513, + "learning_rate": 2e-05, + "loss": 0.03632679, + "step": 3065 + }, + { + "epoch": 6.132, + "grad_norm": 2.078164577484131, + "learning_rate": 2e-05, + "loss": 0.04006804, + "step": 3066 + }, + { + "epoch": 6.134, + "grad_norm": 1.1497598886489868, + "learning_rate": 2e-05, + "loss": 0.03985286, + "step": 3067 + }, + { + "epoch": 6.136, + "grad_norm": 1.7147870063781738, + "learning_rate": 2e-05, + "loss": 0.03666584, + "step": 3068 + }, + { + "epoch": 6.138, + "grad_norm": 1.467666506767273, + "learning_rate": 2e-05, + "loss": 0.03018375, + "step": 3069 + }, + { + "epoch": 6.14, + "grad_norm": 2.5385053157806396, + "learning_rate": 2e-05, + "loss": 0.05757154, + "step": 3070 + }, + { + "epoch": 6.142, + "grad_norm": 2.1830251216888428, + "learning_rate": 2e-05, + "loss": 0.0400609, + "step": 3071 + }, + { + "epoch": 6.144, + "grad_norm": 2.2541887760162354, + "learning_rate": 2e-05, + "loss": 0.04700291, + "step": 3072 + }, + { + "epoch": 6.146, + "grad_norm": 1.2078704833984375, + "learning_rate": 2e-05, + "loss": 0.03664304, + "step": 3073 + }, + { + "epoch": 6.148, + "grad_norm": 1.6265748739242554, + "learning_rate": 2e-05, + "loss": 0.03525054, + "step": 3074 + }, + { + "epoch": 6.15, + "grad_norm": 1.9453312158584595, + "learning_rate": 2e-05, + "loss": 0.05236892, + "step": 3075 + }, + { + "epoch": 6.152, + "grad_norm": 1.7163499593734741, + "learning_rate": 2e-05, + "loss": 0.03309094, + "step": 3076 + }, + { + "epoch": 6.154, + "grad_norm": 1.738598108291626, + "learning_rate": 2e-05, + "loss": 0.03554305, + "step": 3077 + }, + { + "epoch": 6.156, + "grad_norm": 1.6464965343475342, + "learning_rate": 2e-05, + "loss": 0.05227431, + "step": 3078 + }, + { + "epoch": 6.158, + "grad_norm": 2.7797718048095703, + "learning_rate": 2e-05, + "loss": 0.04286774, + "step": 3079 + }, + { + "epoch": 6.16, + "grad_norm": 1.9953299760818481, + "learning_rate": 2e-05, + "loss": 0.05062944, + "step": 3080 + }, + { + "epoch": 6.162, + "grad_norm": 1.4432344436645508, + "learning_rate": 2e-05, + "loss": 0.03490534, + 
"step": 3081 + }, + { + "epoch": 6.164, + "grad_norm": 1.4732416868209839, + "learning_rate": 2e-05, + "loss": 0.04236253, + "step": 3082 + }, + { + "epoch": 6.166, + "grad_norm": 2.069814682006836, + "learning_rate": 2e-05, + "loss": 0.04514054, + "step": 3083 + }, + { + "epoch": 6.168, + "grad_norm": 1.8395639657974243, + "learning_rate": 2e-05, + "loss": 0.04167663, + "step": 3084 + }, + { + "epoch": 6.17, + "grad_norm": 1.5798648595809937, + "learning_rate": 2e-05, + "loss": 0.05062833, + "step": 3085 + }, + { + "epoch": 6.172, + "grad_norm": 1.718724012374878, + "learning_rate": 2e-05, + "loss": 0.03674094, + "step": 3086 + }, + { + "epoch": 6.174, + "grad_norm": 1.8063639402389526, + "learning_rate": 2e-05, + "loss": 0.05062528, + "step": 3087 + }, + { + "epoch": 6.176, + "grad_norm": 1.3898078203201294, + "learning_rate": 2e-05, + "loss": 0.04505751, + "step": 3088 + }, + { + "epoch": 6.178, + "grad_norm": 1.5374902486801147, + "learning_rate": 2e-05, + "loss": 0.05387596, + "step": 3089 + }, + { + "epoch": 6.18, + "grad_norm": 1.6485440731048584, + "learning_rate": 2e-05, + "loss": 0.05111238, + "step": 3090 + }, + { + "epoch": 6.182, + "grad_norm": 1.6274720430374146, + "learning_rate": 2e-05, + "loss": 0.03595363, + "step": 3091 + }, + { + "epoch": 6.184, + "grad_norm": 1.5677777528762817, + "learning_rate": 2e-05, + "loss": 0.03086212, + "step": 3092 + }, + { + "epoch": 6.186, + "grad_norm": 2.0702250003814697, + "learning_rate": 2e-05, + "loss": 0.03619816, + "step": 3093 + }, + { + "epoch": 6.188, + "grad_norm": 1.2100623846054077, + "learning_rate": 2e-05, + "loss": 0.02968932, + "step": 3094 + }, + { + "epoch": 6.19, + "grad_norm": 1.6770201921463013, + "learning_rate": 2e-05, + "loss": 0.05100476, + "step": 3095 + }, + { + "epoch": 6.192, + "grad_norm": 1.4166276454925537, + "learning_rate": 2e-05, + "loss": 0.04519867, + "step": 3096 + }, + { + "epoch": 6.194, + "grad_norm": 1.882489562034607, + "learning_rate": 2e-05, + "loss": 0.04186785, + "step": 3097 + }, + { + "epoch": 6.196, + "grad_norm": 1.4164977073669434, + "learning_rate": 2e-05, + "loss": 0.04563984, + "step": 3098 + }, + { + "epoch": 6.198, + "grad_norm": 1.6581916809082031, + "learning_rate": 2e-05, + "loss": 0.04879126, + "step": 3099 + }, + { + "epoch": 6.2, + "grad_norm": 1.4858304262161255, + "learning_rate": 2e-05, + "loss": 0.03413234, + "step": 3100 + }, + { + "epoch": 6.202, + "grad_norm": 1.121500849723816, + "learning_rate": 2e-05, + "loss": 0.02813966, + "step": 3101 + }, + { + "epoch": 6.204, + "grad_norm": 1.5754960775375366, + "learning_rate": 2e-05, + "loss": 0.04899345, + "step": 3102 + }, + { + "epoch": 6.206, + "grad_norm": 1.356394648551941, + "learning_rate": 2e-05, + "loss": 0.03853102, + "step": 3103 + }, + { + "epoch": 6.208, + "grad_norm": 1.0517022609710693, + "learning_rate": 2e-05, + "loss": 0.03538657, + "step": 3104 + }, + { + "epoch": 6.21, + "grad_norm": 1.3921403884887695, + "learning_rate": 2e-05, + "loss": 0.03444414, + "step": 3105 + }, + { + "epoch": 6.212, + "grad_norm": 1.133500099182129, + "learning_rate": 2e-05, + "loss": 0.03595487, + "step": 3106 + }, + { + "epoch": 6.214, + "grad_norm": 1.1165953874588013, + "learning_rate": 2e-05, + "loss": 0.02767484, + "step": 3107 + }, + { + "epoch": 6.216, + "grad_norm": 2.7160420417785645, + "learning_rate": 2e-05, + "loss": 0.03668471, + "step": 3108 + }, + { + "epoch": 6.218, + "grad_norm": 1.8742766380310059, + "learning_rate": 2e-05, + "loss": 0.04762148, + "step": 3109 + }, + { + "epoch": 6.22, + "grad_norm": 
1.108489751815796, + "learning_rate": 2e-05, + "loss": 0.03803735, + "step": 3110 + }, + { + "epoch": 6.222, + "grad_norm": 1.4708930253982544, + "learning_rate": 2e-05, + "loss": 0.04263173, + "step": 3111 + }, + { + "epoch": 6.224, + "grad_norm": 2.706648349761963, + "learning_rate": 2e-05, + "loss": 0.05671699, + "step": 3112 + }, + { + "epoch": 6.226, + "grad_norm": 1.8484437465667725, + "learning_rate": 2e-05, + "loss": 0.02719938, + "step": 3113 + }, + { + "epoch": 6.228, + "grad_norm": 1.539387822151184, + "learning_rate": 2e-05, + "loss": 0.03141466, + "step": 3114 + }, + { + "epoch": 6.23, + "grad_norm": 1.7777189016342163, + "learning_rate": 2e-05, + "loss": 0.0490169, + "step": 3115 + }, + { + "epoch": 6.232, + "grad_norm": 1.6402760744094849, + "learning_rate": 2e-05, + "loss": 0.0498919, + "step": 3116 + }, + { + "epoch": 6.234, + "grad_norm": 0.8348276019096375, + "learning_rate": 2e-05, + "loss": 0.01968635, + "step": 3117 + }, + { + "epoch": 6.236, + "grad_norm": 1.0284675359725952, + "learning_rate": 2e-05, + "loss": 0.02943014, + "step": 3118 + }, + { + "epoch": 6.2379999999999995, + "grad_norm": 1.6384459733963013, + "learning_rate": 2e-05, + "loss": 0.05340105, + "step": 3119 + }, + { + "epoch": 6.24, + "grad_norm": 1.3227370977401733, + "learning_rate": 2e-05, + "loss": 0.03396504, + "step": 3120 + }, + { + "epoch": 6.242, + "grad_norm": 1.28560471534729, + "learning_rate": 2e-05, + "loss": 0.04309334, + "step": 3121 + }, + { + "epoch": 6.244, + "grad_norm": 1.1973211765289307, + "learning_rate": 2e-05, + "loss": 0.03165175, + "step": 3122 + }, + { + "epoch": 6.246, + "grad_norm": 1.492358922958374, + "learning_rate": 2e-05, + "loss": 0.04523762, + "step": 3123 + }, + { + "epoch": 6.248, + "grad_norm": 1.4751956462860107, + "learning_rate": 2e-05, + "loss": 0.04451723, + "step": 3124 + }, + { + "epoch": 6.25, + "grad_norm": 1.1065033674240112, + "learning_rate": 2e-05, + "loss": 0.04365182, + "step": 3125 + }, + { + "epoch": 6.252, + "grad_norm": 1.341362714767456, + "learning_rate": 2e-05, + "loss": 0.02606768, + "step": 3126 + }, + { + "epoch": 6.254, + "grad_norm": 1.7384096384048462, + "learning_rate": 2e-05, + "loss": 0.03969992, + "step": 3127 + }, + { + "epoch": 6.256, + "grad_norm": 1.5155085325241089, + "learning_rate": 2e-05, + "loss": 0.03229823, + "step": 3128 + }, + { + "epoch": 6.258, + "grad_norm": 1.0733712911605835, + "learning_rate": 2e-05, + "loss": 0.0306533, + "step": 3129 + }, + { + "epoch": 6.26, + "grad_norm": 2.0977604389190674, + "learning_rate": 2e-05, + "loss": 0.05317961, + "step": 3130 + }, + { + "epoch": 6.2620000000000005, + "grad_norm": 1.4438533782958984, + "learning_rate": 2e-05, + "loss": 0.05355418, + "step": 3131 + }, + { + "epoch": 6.264, + "grad_norm": 2.2710464000701904, + "learning_rate": 2e-05, + "loss": 0.04868567, + "step": 3132 + }, + { + "epoch": 6.266, + "grad_norm": 1.7031066417694092, + "learning_rate": 2e-05, + "loss": 0.04030811, + "step": 3133 + }, + { + "epoch": 6.268, + "grad_norm": 1.0892564058303833, + "learning_rate": 2e-05, + "loss": 0.0400371, + "step": 3134 + }, + { + "epoch": 6.27, + "grad_norm": 1.327901840209961, + "learning_rate": 2e-05, + "loss": 0.05018859, + "step": 3135 + }, + { + "epoch": 6.272, + "grad_norm": 1.1799272298812866, + "learning_rate": 2e-05, + "loss": 0.03606693, + "step": 3136 + }, + { + "epoch": 6.274, + "grad_norm": 1.553155541419983, + "learning_rate": 2e-05, + "loss": 0.06281506, + "step": 3137 + }, + { + "epoch": 6.276, + "grad_norm": 1.075716257095337, + "learning_rate": 2e-05, + 
"loss": 0.02753101, + "step": 3138 + }, + { + "epoch": 6.2780000000000005, + "grad_norm": 2.481788396835327, + "learning_rate": 2e-05, + "loss": 0.03427035, + "step": 3139 + }, + { + "epoch": 6.28, + "grad_norm": 1.606927752494812, + "learning_rate": 2e-05, + "loss": 0.03269802, + "step": 3140 + }, + { + "epoch": 6.282, + "grad_norm": 1.5751926898956299, + "learning_rate": 2e-05, + "loss": 0.03891, + "step": 3141 + }, + { + "epoch": 6.284, + "grad_norm": 2.1597113609313965, + "learning_rate": 2e-05, + "loss": 0.04709848, + "step": 3142 + }, + { + "epoch": 6.286, + "grad_norm": 1.4305531978607178, + "learning_rate": 2e-05, + "loss": 0.04866112, + "step": 3143 + }, + { + "epoch": 6.288, + "grad_norm": 1.1876941919326782, + "learning_rate": 2e-05, + "loss": 0.03890382, + "step": 3144 + }, + { + "epoch": 6.29, + "grad_norm": 2.109853744506836, + "learning_rate": 2e-05, + "loss": 0.04726175, + "step": 3145 + }, + { + "epoch": 6.292, + "grad_norm": 1.5774122476577759, + "learning_rate": 2e-05, + "loss": 0.04510861, + "step": 3146 + }, + { + "epoch": 6.294, + "grad_norm": 2.0001204013824463, + "learning_rate": 2e-05, + "loss": 0.03729511, + "step": 3147 + }, + { + "epoch": 6.296, + "grad_norm": 1.9148621559143066, + "learning_rate": 2e-05, + "loss": 0.05043909, + "step": 3148 + }, + { + "epoch": 6.298, + "grad_norm": 1.77232825756073, + "learning_rate": 2e-05, + "loss": 0.05203857, + "step": 3149 + }, + { + "epoch": 6.3, + "grad_norm": 1.7277973890304565, + "learning_rate": 2e-05, + "loss": 0.0506317, + "step": 3150 + }, + { + "epoch": 6.302, + "grad_norm": 1.567769169807434, + "learning_rate": 2e-05, + "loss": 0.04894867, + "step": 3151 + }, + { + "epoch": 6.304, + "grad_norm": 1.5426815748214722, + "learning_rate": 2e-05, + "loss": 0.04491549, + "step": 3152 + }, + { + "epoch": 6.306, + "grad_norm": 2.3619165420532227, + "learning_rate": 2e-05, + "loss": 0.04702245, + "step": 3153 + }, + { + "epoch": 6.308, + "grad_norm": 1.696313500404358, + "learning_rate": 2e-05, + "loss": 0.05087637, + "step": 3154 + }, + { + "epoch": 6.31, + "grad_norm": 1.100569248199463, + "learning_rate": 2e-05, + "loss": 0.03571167, + "step": 3155 + }, + { + "epoch": 6.312, + "grad_norm": 1.4361978769302368, + "learning_rate": 2e-05, + "loss": 0.03643346, + "step": 3156 + }, + { + "epoch": 6.314, + "grad_norm": 1.2051647901535034, + "learning_rate": 2e-05, + "loss": 0.05133547, + "step": 3157 + }, + { + "epoch": 6.316, + "grad_norm": 1.9132866859436035, + "learning_rate": 2e-05, + "loss": 0.03743671, + "step": 3158 + }, + { + "epoch": 6.318, + "grad_norm": 2.0840513706207275, + "learning_rate": 2e-05, + "loss": 0.05485996, + "step": 3159 + }, + { + "epoch": 6.32, + "grad_norm": 1.9084033966064453, + "learning_rate": 2e-05, + "loss": 0.05698278, + "step": 3160 + }, + { + "epoch": 6.322, + "grad_norm": 1.0253159999847412, + "learning_rate": 2e-05, + "loss": 0.02836904, + "step": 3161 + }, + { + "epoch": 6.324, + "grad_norm": 1.5270940065383911, + "learning_rate": 2e-05, + "loss": 0.0500167, + "step": 3162 + }, + { + "epoch": 6.326, + "grad_norm": 1.3842896223068237, + "learning_rate": 2e-05, + "loss": 0.03779293, + "step": 3163 + }, + { + "epoch": 6.328, + "grad_norm": 1.4352202415466309, + "learning_rate": 2e-05, + "loss": 0.03724783, + "step": 3164 + }, + { + "epoch": 6.33, + "grad_norm": 1.2283622026443481, + "learning_rate": 2e-05, + "loss": 0.04922156, + "step": 3165 + }, + { + "epoch": 6.332, + "grad_norm": 1.2028950452804565, + "learning_rate": 2e-05, + "loss": 0.03399231, + "step": 3166 + }, + { + "epoch": 6.334, 
+ "grad_norm": 1.8364872932434082, + "learning_rate": 2e-05, + "loss": 0.05715834, + "step": 3167 + }, + { + "epoch": 6.336, + "grad_norm": 1.926184058189392, + "learning_rate": 2e-05, + "loss": 0.052489, + "step": 3168 + }, + { + "epoch": 6.338, + "grad_norm": 2.94455623626709, + "learning_rate": 2e-05, + "loss": 0.05718009, + "step": 3169 + }, + { + "epoch": 6.34, + "grad_norm": 1.064375877380371, + "learning_rate": 2e-05, + "loss": 0.03787267, + "step": 3170 + }, + { + "epoch": 6.342, + "grad_norm": 1.227028250694275, + "learning_rate": 2e-05, + "loss": 0.0388754, + "step": 3171 + }, + { + "epoch": 6.344, + "grad_norm": 1.999516487121582, + "learning_rate": 2e-05, + "loss": 0.02867444, + "step": 3172 + }, + { + "epoch": 6.346, + "grad_norm": 1.4541538953781128, + "learning_rate": 2e-05, + "loss": 0.04546948, + "step": 3173 + }, + { + "epoch": 6.348, + "grad_norm": 1.3081698417663574, + "learning_rate": 2e-05, + "loss": 0.05439358, + "step": 3174 + }, + { + "epoch": 6.35, + "grad_norm": 1.7099684476852417, + "learning_rate": 2e-05, + "loss": 0.04584994, + "step": 3175 + }, + { + "epoch": 6.352, + "grad_norm": 1.3694244623184204, + "learning_rate": 2e-05, + "loss": 0.05263059, + "step": 3176 + }, + { + "epoch": 6.354, + "grad_norm": 2.0050711631774902, + "learning_rate": 2e-05, + "loss": 0.06513149, + "step": 3177 + }, + { + "epoch": 6.356, + "grad_norm": 2.2543318271636963, + "learning_rate": 2e-05, + "loss": 0.07143942, + "step": 3178 + }, + { + "epoch": 6.358, + "grad_norm": 0.9358059167861938, + "learning_rate": 2e-05, + "loss": 0.02930567, + "step": 3179 + }, + { + "epoch": 6.36, + "grad_norm": 1.4434269666671753, + "learning_rate": 2e-05, + "loss": 0.02489997, + "step": 3180 + }, + { + "epoch": 6.362, + "grad_norm": 1.233762264251709, + "learning_rate": 2e-05, + "loss": 0.0335849, + "step": 3181 + }, + { + "epoch": 6.364, + "grad_norm": 1.5462418794631958, + "learning_rate": 2e-05, + "loss": 0.03041675, + "step": 3182 + }, + { + "epoch": 6.366, + "grad_norm": 2.0180463790893555, + "learning_rate": 2e-05, + "loss": 0.05690164, + "step": 3183 + }, + { + "epoch": 6.368, + "grad_norm": 1.6697211265563965, + "learning_rate": 2e-05, + "loss": 0.04623037, + "step": 3184 + }, + { + "epoch": 6.37, + "grad_norm": 1.815030574798584, + "learning_rate": 2e-05, + "loss": 0.03556263, + "step": 3185 + }, + { + "epoch": 6.372, + "grad_norm": 1.826423168182373, + "learning_rate": 2e-05, + "loss": 0.04994491, + "step": 3186 + }, + { + "epoch": 6.374, + "grad_norm": 1.6917457580566406, + "learning_rate": 2e-05, + "loss": 0.03865033, + "step": 3187 + }, + { + "epoch": 6.376, + "grad_norm": 1.7738280296325684, + "learning_rate": 2e-05, + "loss": 0.04079893, + "step": 3188 + }, + { + "epoch": 6.378, + "grad_norm": 1.0954397916793823, + "learning_rate": 2e-05, + "loss": 0.03440213, + "step": 3189 + }, + { + "epoch": 6.38, + "grad_norm": 1.043359637260437, + "learning_rate": 2e-05, + "loss": 0.03818577, + "step": 3190 + }, + { + "epoch": 6.382, + "grad_norm": 0.9938315153121948, + "learning_rate": 2e-05, + "loss": 0.03229547, + "step": 3191 + }, + { + "epoch": 6.384, + "grad_norm": 2.5036749839782715, + "learning_rate": 2e-05, + "loss": 0.04423948, + "step": 3192 + }, + { + "epoch": 6.386, + "grad_norm": 1.164203405380249, + "learning_rate": 2e-05, + "loss": 0.03556975, + "step": 3193 + }, + { + "epoch": 6.388, + "grad_norm": 1.4395684003829956, + "learning_rate": 2e-05, + "loss": 0.04390024, + "step": 3194 + }, + { + "epoch": 6.39, + "grad_norm": 1.4073199033737183, + "learning_rate": 2e-05, + "loss": 
0.02870702, + "step": 3195 + }, + { + "epoch": 6.392, + "grad_norm": 1.905430793762207, + "learning_rate": 2e-05, + "loss": 0.05451259, + "step": 3196 + }, + { + "epoch": 6.394, + "grad_norm": 2.2426390647888184, + "learning_rate": 2e-05, + "loss": 0.04760137, + "step": 3197 + }, + { + "epoch": 6.396, + "grad_norm": 1.6508835554122925, + "learning_rate": 2e-05, + "loss": 0.03574256, + "step": 3198 + }, + { + "epoch": 6.398, + "grad_norm": 1.2062972784042358, + "learning_rate": 2e-05, + "loss": 0.03633127, + "step": 3199 + }, + { + "epoch": 6.4, + "grad_norm": 1.1305536031723022, + "learning_rate": 2e-05, + "loss": 0.03830415, + "step": 3200 + }, + { + "epoch": 6.402, + "grad_norm": 1.6477299928665161, + "learning_rate": 2e-05, + "loss": 0.04004838, + "step": 3201 + }, + { + "epoch": 6.404, + "grad_norm": 1.4459377527236938, + "learning_rate": 2e-05, + "loss": 0.0500464, + "step": 3202 + }, + { + "epoch": 6.406, + "grad_norm": 1.643071174621582, + "learning_rate": 2e-05, + "loss": 0.03596194, + "step": 3203 + }, + { + "epoch": 6.408, + "grad_norm": 1.4056624174118042, + "learning_rate": 2e-05, + "loss": 0.0451113, + "step": 3204 + }, + { + "epoch": 6.41, + "grad_norm": 1.7545050382614136, + "learning_rate": 2e-05, + "loss": 0.04658743, + "step": 3205 + }, + { + "epoch": 6.412, + "grad_norm": 1.7296162843704224, + "learning_rate": 2e-05, + "loss": 0.04593526, + "step": 3206 + }, + { + "epoch": 6.414, + "grad_norm": 1.6219792366027832, + "learning_rate": 2e-05, + "loss": 0.0362427, + "step": 3207 + }, + { + "epoch": 6.416, + "grad_norm": 1.0432740449905396, + "learning_rate": 2e-05, + "loss": 0.03033496, + "step": 3208 + }, + { + "epoch": 6.418, + "grad_norm": 1.3447455167770386, + "learning_rate": 2e-05, + "loss": 0.04312557, + "step": 3209 + }, + { + "epoch": 6.42, + "grad_norm": 1.140795350074768, + "learning_rate": 2e-05, + "loss": 0.02616213, + "step": 3210 + }, + { + "epoch": 6.422, + "grad_norm": 1.3150559663772583, + "learning_rate": 2e-05, + "loss": 0.03334409, + "step": 3211 + }, + { + "epoch": 6.424, + "grad_norm": 1.9361340999603271, + "learning_rate": 2e-05, + "loss": 0.06632831, + "step": 3212 + }, + { + "epoch": 6.426, + "grad_norm": 2.0635571479797363, + "learning_rate": 2e-05, + "loss": 0.04785598, + "step": 3213 + }, + { + "epoch": 6.428, + "grad_norm": 1.896443247795105, + "learning_rate": 2e-05, + "loss": 0.04902334, + "step": 3214 + }, + { + "epoch": 6.43, + "grad_norm": 1.3984146118164062, + "learning_rate": 2e-05, + "loss": 0.03529963, + "step": 3215 + }, + { + "epoch": 6.432, + "grad_norm": 1.1594336032867432, + "learning_rate": 2e-05, + "loss": 0.03067735, + "step": 3216 + }, + { + "epoch": 6.434, + "grad_norm": 1.5906693935394287, + "learning_rate": 2e-05, + "loss": 0.04803975, + "step": 3217 + }, + { + "epoch": 6.436, + "grad_norm": 1.211396336555481, + "learning_rate": 2e-05, + "loss": 0.02445906, + "step": 3218 + }, + { + "epoch": 6.438, + "grad_norm": 1.4170849323272705, + "learning_rate": 2e-05, + "loss": 0.03652653, + "step": 3219 + }, + { + "epoch": 6.44, + "grad_norm": 2.7853238582611084, + "learning_rate": 2e-05, + "loss": 0.05708388, + "step": 3220 + }, + { + "epoch": 6.442, + "grad_norm": 1.129001498222351, + "learning_rate": 2e-05, + "loss": 0.03645102, + "step": 3221 + }, + { + "epoch": 6.444, + "grad_norm": 1.4778531789779663, + "learning_rate": 2e-05, + "loss": 0.02655194, + "step": 3222 + }, + { + "epoch": 6.446, + "grad_norm": 1.6517877578735352, + "learning_rate": 2e-05, + "loss": 0.03924521, + "step": 3223 + }, + { + "epoch": 6.448, + "grad_norm": 
2.0939931869506836, + "learning_rate": 2e-05, + "loss": 0.03322685, + "step": 3224 + }, + { + "epoch": 6.45, + "grad_norm": 1.4540835618972778, + "learning_rate": 2e-05, + "loss": 0.04464151, + "step": 3225 + }, + { + "epoch": 6.452, + "grad_norm": 1.3042082786560059, + "learning_rate": 2e-05, + "loss": 0.04663192, + "step": 3226 + }, + { + "epoch": 6.454, + "grad_norm": 1.1038719415664673, + "learning_rate": 2e-05, + "loss": 0.03338126, + "step": 3227 + }, + { + "epoch": 6.456, + "grad_norm": 1.4457974433898926, + "learning_rate": 2e-05, + "loss": 0.03782085, + "step": 3228 + }, + { + "epoch": 6.458, + "grad_norm": 1.439251184463501, + "learning_rate": 2e-05, + "loss": 0.04930679, + "step": 3229 + }, + { + "epoch": 6.46, + "grad_norm": 1.6377997398376465, + "learning_rate": 2e-05, + "loss": 0.04300382, + "step": 3230 + }, + { + "epoch": 6.462, + "grad_norm": 3.27187442779541, + "learning_rate": 2e-05, + "loss": 0.03758849, + "step": 3231 + }, + { + "epoch": 6.464, + "grad_norm": 1.257215976715088, + "learning_rate": 2e-05, + "loss": 0.03490548, + "step": 3232 + }, + { + "epoch": 6.466, + "grad_norm": 1.3594287633895874, + "learning_rate": 2e-05, + "loss": 0.03412262, + "step": 3233 + }, + { + "epoch": 6.468, + "grad_norm": 1.1700828075408936, + "learning_rate": 2e-05, + "loss": 0.03058089, + "step": 3234 + }, + { + "epoch": 6.47, + "grad_norm": 0.9974095225334167, + "learning_rate": 2e-05, + "loss": 0.03268985, + "step": 3235 + }, + { + "epoch": 6.4719999999999995, + "grad_norm": 1.6861094236373901, + "learning_rate": 2e-05, + "loss": 0.04692501, + "step": 3236 + }, + { + "epoch": 6.474, + "grad_norm": 1.220467209815979, + "learning_rate": 2e-05, + "loss": 0.04217067, + "step": 3237 + }, + { + "epoch": 6.476, + "grad_norm": 1.1765286922454834, + "learning_rate": 2e-05, + "loss": 0.03455502, + "step": 3238 + }, + { + "epoch": 6.478, + "grad_norm": 1.4939866065979004, + "learning_rate": 2e-05, + "loss": 0.05220221, + "step": 3239 + }, + { + "epoch": 6.48, + "grad_norm": 1.1564301252365112, + "learning_rate": 2e-05, + "loss": 0.03657191, + "step": 3240 + }, + { + "epoch": 6.482, + "grad_norm": 1.5989612340927124, + "learning_rate": 2e-05, + "loss": 0.04166878, + "step": 3241 + }, + { + "epoch": 6.484, + "grad_norm": 1.5818308591842651, + "learning_rate": 2e-05, + "loss": 0.04151236, + "step": 3242 + }, + { + "epoch": 6.486, + "grad_norm": 1.6136821508407593, + "learning_rate": 2e-05, + "loss": 0.03971895, + "step": 3243 + }, + { + "epoch": 6.4879999999999995, + "grad_norm": 1.5467586517333984, + "learning_rate": 2e-05, + "loss": 0.04718712, + "step": 3244 + }, + { + "epoch": 6.49, + "grad_norm": 0.9672493934631348, + "learning_rate": 2e-05, + "loss": 0.02750623, + "step": 3245 + }, + { + "epoch": 6.492, + "grad_norm": 1.5886257886886597, + "learning_rate": 2e-05, + "loss": 0.043421, + "step": 3246 + }, + { + "epoch": 6.494, + "grad_norm": 1.9581189155578613, + "learning_rate": 2e-05, + "loss": 0.04324682, + "step": 3247 + }, + { + "epoch": 6.496, + "grad_norm": 1.1795440912246704, + "learning_rate": 2e-05, + "loss": 0.04421395, + "step": 3248 + }, + { + "epoch": 6.498, + "grad_norm": 1.8300138711929321, + "learning_rate": 2e-05, + "loss": 0.05202936, + "step": 3249 + }, + { + "epoch": 6.5, + "grad_norm": 1.1608667373657227, + "learning_rate": 2e-05, + "loss": 0.03892054, + "step": 3250 + }, + { + "epoch": 6.502, + "grad_norm": 1.8854470252990723, + "learning_rate": 2e-05, + "loss": 0.05796061, + "step": 3251 + }, + { + "epoch": 6.504, + "grad_norm": 5.84080696105957, + "learning_rate": 
2e-05, + "loss": 0.04915443, + "step": 3252 + }, + { + "epoch": 6.506, + "grad_norm": 1.2369056940078735, + "learning_rate": 2e-05, + "loss": 0.04068791, + "step": 3253 + }, + { + "epoch": 6.508, + "grad_norm": 2.1469545364379883, + "learning_rate": 2e-05, + "loss": 0.0260004, + "step": 3254 + }, + { + "epoch": 6.51, + "grad_norm": 1.370782494544983, + "learning_rate": 2e-05, + "loss": 0.03866827, + "step": 3255 + }, + { + "epoch": 6.5120000000000005, + "grad_norm": 1.2568261623382568, + "learning_rate": 2e-05, + "loss": 0.038529, + "step": 3256 + }, + { + "epoch": 6.514, + "grad_norm": 1.186782717704773, + "learning_rate": 2e-05, + "loss": 0.04556369, + "step": 3257 + }, + { + "epoch": 6.516, + "grad_norm": 0.8793965578079224, + "learning_rate": 2e-05, + "loss": 0.02259767, + "step": 3258 + }, + { + "epoch": 6.518, + "grad_norm": 1.2058748006820679, + "learning_rate": 2e-05, + "loss": 0.03813112, + "step": 3259 + }, + { + "epoch": 6.52, + "grad_norm": 1.2766646146774292, + "learning_rate": 2e-05, + "loss": 0.04175682, + "step": 3260 + }, + { + "epoch": 6.522, + "grad_norm": 1.3827567100524902, + "learning_rate": 2e-05, + "loss": 0.03931576, + "step": 3261 + }, + { + "epoch": 6.524, + "grad_norm": 1.741408109664917, + "learning_rate": 2e-05, + "loss": 0.03503319, + "step": 3262 + }, + { + "epoch": 6.526, + "grad_norm": 1.5188173055648804, + "learning_rate": 2e-05, + "loss": 0.03888669, + "step": 3263 + }, + { + "epoch": 6.5280000000000005, + "grad_norm": 1.9496771097183228, + "learning_rate": 2e-05, + "loss": 0.04322708, + "step": 3264 + }, + { + "epoch": 6.53, + "grad_norm": 2.4623851776123047, + "learning_rate": 2e-05, + "loss": 0.04068457, + "step": 3265 + }, + { + "epoch": 6.532, + "grad_norm": 0.9864256381988525, + "learning_rate": 2e-05, + "loss": 0.03565934, + "step": 3266 + }, + { + "epoch": 6.534, + "grad_norm": 0.9321268796920776, + "learning_rate": 2e-05, + "loss": 0.02948448, + "step": 3267 + }, + { + "epoch": 6.536, + "grad_norm": 1.1365487575531006, + "learning_rate": 2e-05, + "loss": 0.03684638, + "step": 3268 + }, + { + "epoch": 6.538, + "grad_norm": 1.8974064588546753, + "learning_rate": 2e-05, + "loss": 0.03136589, + "step": 3269 + }, + { + "epoch": 6.54, + "grad_norm": 1.6937148571014404, + "learning_rate": 2e-05, + "loss": 0.04182105, + "step": 3270 + }, + { + "epoch": 6.542, + "grad_norm": 1.4892710447311401, + "learning_rate": 2e-05, + "loss": 0.05436049, + "step": 3271 + }, + { + "epoch": 6.5440000000000005, + "grad_norm": 2.192354917526245, + "learning_rate": 2e-05, + "loss": 0.03655919, + "step": 3272 + }, + { + "epoch": 6.546, + "grad_norm": 1.3368217945098877, + "learning_rate": 2e-05, + "loss": 0.02830843, + "step": 3273 + }, + { + "epoch": 6.548, + "grad_norm": 1.422207236289978, + "learning_rate": 2e-05, + "loss": 0.02914708, + "step": 3274 + }, + { + "epoch": 6.55, + "grad_norm": 3.405113935470581, + "learning_rate": 2e-05, + "loss": 0.04792117, + "step": 3275 + }, + { + "epoch": 6.552, + "grad_norm": 2.973292112350464, + "learning_rate": 2e-05, + "loss": 0.050552, + "step": 3276 + }, + { + "epoch": 6.554, + "grad_norm": 1.7082622051239014, + "learning_rate": 2e-05, + "loss": 0.03328468, + "step": 3277 + }, + { + "epoch": 6.556, + "grad_norm": 1.2330087423324585, + "learning_rate": 2e-05, + "loss": 0.0288753, + "step": 3278 + }, + { + "epoch": 6.558, + "grad_norm": 1.4605454206466675, + "learning_rate": 2e-05, + "loss": 0.03361526, + "step": 3279 + }, + { + "epoch": 6.5600000000000005, + "grad_norm": 1.288313627243042, + "learning_rate": 2e-05, + "loss": 
0.04549043, + "step": 3280 + }, + { + "epoch": 6.562, + "grad_norm": 2.2067997455596924, + "learning_rate": 2e-05, + "loss": 0.0399694, + "step": 3281 + }, + { + "epoch": 6.564, + "grad_norm": 1.6156538724899292, + "learning_rate": 2e-05, + "loss": 0.07278688, + "step": 3282 + }, + { + "epoch": 6.566, + "grad_norm": 2.2792131900787354, + "learning_rate": 2e-05, + "loss": 0.05353578, + "step": 3283 + }, + { + "epoch": 6.568, + "grad_norm": 1.9242584705352783, + "learning_rate": 2e-05, + "loss": 0.0594984, + "step": 3284 + }, + { + "epoch": 6.57, + "grad_norm": 1.4676686525344849, + "learning_rate": 2e-05, + "loss": 0.04927154, + "step": 3285 + }, + { + "epoch": 6.572, + "grad_norm": 1.2734441757202148, + "learning_rate": 2e-05, + "loss": 0.04294922, + "step": 3286 + }, + { + "epoch": 6.574, + "grad_norm": 1.739187240600586, + "learning_rate": 2e-05, + "loss": 0.04884008, + "step": 3287 + }, + { + "epoch": 6.576, + "grad_norm": 1.88695228099823, + "learning_rate": 2e-05, + "loss": 0.05116354, + "step": 3288 + }, + { + "epoch": 6.578, + "grad_norm": 1.28707754611969, + "learning_rate": 2e-05, + "loss": 0.04390291, + "step": 3289 + }, + { + "epoch": 6.58, + "grad_norm": 1.3685258626937866, + "learning_rate": 2e-05, + "loss": 0.03953566, + "step": 3290 + }, + { + "epoch": 6.582, + "grad_norm": 1.109430193901062, + "learning_rate": 2e-05, + "loss": 0.04132858, + "step": 3291 + }, + { + "epoch": 6.584, + "grad_norm": 1.8510801792144775, + "learning_rate": 2e-05, + "loss": 0.05430602, + "step": 3292 + }, + { + "epoch": 6.586, + "grad_norm": 1.2035598754882812, + "learning_rate": 2e-05, + "loss": 0.03927475, + "step": 3293 + }, + { + "epoch": 6.588, + "grad_norm": 2.5072784423828125, + "learning_rate": 2e-05, + "loss": 0.04595596, + "step": 3294 + }, + { + "epoch": 6.59, + "grad_norm": 1.213221549987793, + "learning_rate": 2e-05, + "loss": 0.04527979, + "step": 3295 + }, + { + "epoch": 6.592, + "grad_norm": 1.2507351636886597, + "learning_rate": 2e-05, + "loss": 0.03918802, + "step": 3296 + }, + { + "epoch": 6.594, + "grad_norm": 1.0590146780014038, + "learning_rate": 2e-05, + "loss": 0.03508047, + "step": 3297 + }, + { + "epoch": 6.596, + "grad_norm": 1.703151822090149, + "learning_rate": 2e-05, + "loss": 0.04405899, + "step": 3298 + }, + { + "epoch": 6.598, + "grad_norm": 1.1094061136245728, + "learning_rate": 2e-05, + "loss": 0.03539534, + "step": 3299 + }, + { + "epoch": 6.6, + "grad_norm": 1.1324583292007446, + "learning_rate": 2e-05, + "loss": 0.04019186, + "step": 3300 + }, + { + "epoch": 6.602, + "grad_norm": 3.1161935329437256, + "learning_rate": 2e-05, + "loss": 0.04159467, + "step": 3301 + }, + { + "epoch": 6.604, + "grad_norm": 1.558793306350708, + "learning_rate": 2e-05, + "loss": 0.04492222, + "step": 3302 + }, + { + "epoch": 6.606, + "grad_norm": 1.1711833477020264, + "learning_rate": 2e-05, + "loss": 0.03407395, + "step": 3303 + }, + { + "epoch": 6.608, + "grad_norm": 1.5858479738235474, + "learning_rate": 2e-05, + "loss": 0.06137519, + "step": 3304 + }, + { + "epoch": 6.61, + "grad_norm": 1.5749893188476562, + "learning_rate": 2e-05, + "loss": 0.06341276, + "step": 3305 + }, + { + "epoch": 6.612, + "grad_norm": 1.4319220781326294, + "learning_rate": 2e-05, + "loss": 0.03183598, + "step": 3306 + }, + { + "epoch": 6.614, + "grad_norm": 1.9067035913467407, + "learning_rate": 2e-05, + "loss": 0.03418519, + "step": 3307 + }, + { + "epoch": 6.616, + "grad_norm": 1.5378934144973755, + "learning_rate": 2e-05, + "loss": 0.05683891, + "step": 3308 + }, + { + "epoch": 6.618, + "grad_norm": 
1.4987341165542603, + "learning_rate": 2e-05, + "loss": 0.0428179, + "step": 3309 + }, + { + "epoch": 6.62, + "grad_norm": 0.9848178625106812, + "learning_rate": 2e-05, + "loss": 0.03192403, + "step": 3310 + }, + { + "epoch": 6.622, + "grad_norm": 1.1798021793365479, + "learning_rate": 2e-05, + "loss": 0.04244391, + "step": 3311 + }, + { + "epoch": 6.624, + "grad_norm": 1.2790571451187134, + "learning_rate": 2e-05, + "loss": 0.05533645, + "step": 3312 + }, + { + "epoch": 6.626, + "grad_norm": 1.6922457218170166, + "learning_rate": 2e-05, + "loss": 0.04536498, + "step": 3313 + }, + { + "epoch": 6.628, + "grad_norm": 1.0179157257080078, + "learning_rate": 2e-05, + "loss": 0.03572208, + "step": 3314 + }, + { + "epoch": 6.63, + "grad_norm": 1.8201943635940552, + "learning_rate": 2e-05, + "loss": 0.03994879, + "step": 3315 + }, + { + "epoch": 6.632, + "grad_norm": 1.4098727703094482, + "learning_rate": 2e-05, + "loss": 0.04173999, + "step": 3316 + }, + { + "epoch": 6.634, + "grad_norm": 1.0485551357269287, + "learning_rate": 2e-05, + "loss": 0.03973451, + "step": 3317 + }, + { + "epoch": 6.636, + "grad_norm": 1.4638235569000244, + "learning_rate": 2e-05, + "loss": 0.04760775, + "step": 3318 + }, + { + "epoch": 6.638, + "grad_norm": 1.5323985815048218, + "learning_rate": 2e-05, + "loss": 0.04406105, + "step": 3319 + }, + { + "epoch": 6.64, + "grad_norm": 1.126651644706726, + "learning_rate": 2e-05, + "loss": 0.03304296, + "step": 3320 + }, + { + "epoch": 6.642, + "grad_norm": 1.278250813484192, + "learning_rate": 2e-05, + "loss": 0.02966734, + "step": 3321 + }, + { + "epoch": 6.644, + "grad_norm": 1.018739938735962, + "learning_rate": 2e-05, + "loss": 0.03991082, + "step": 3322 + }, + { + "epoch": 6.646, + "grad_norm": 1.2465643882751465, + "learning_rate": 2e-05, + "loss": 0.0458714, + "step": 3323 + }, + { + "epoch": 6.648, + "grad_norm": 29.821657180786133, + "learning_rate": 2e-05, + "loss": 0.0503132, + "step": 3324 + }, + { + "epoch": 6.65, + "grad_norm": 3.7819323539733887, + "learning_rate": 2e-05, + "loss": 0.04644343, + "step": 3325 + }, + { + "epoch": 6.652, + "grad_norm": 0.8932387232780457, + "learning_rate": 2e-05, + "loss": 0.02574427, + "step": 3326 + }, + { + "epoch": 6.654, + "grad_norm": 1.2899101972579956, + "learning_rate": 2e-05, + "loss": 0.0327214, + "step": 3327 + }, + { + "epoch": 6.656, + "grad_norm": 2.6038079261779785, + "learning_rate": 2e-05, + "loss": 0.05023351, + "step": 3328 + }, + { + "epoch": 6.658, + "grad_norm": 1.7134565114974976, + "learning_rate": 2e-05, + "loss": 0.04856233, + "step": 3329 + }, + { + "epoch": 6.66, + "grad_norm": 1.8361741304397583, + "learning_rate": 2e-05, + "loss": 0.06476304, + "step": 3330 + }, + { + "epoch": 6.662, + "grad_norm": 1.2037334442138672, + "learning_rate": 2e-05, + "loss": 0.04044564, + "step": 3331 + }, + { + "epoch": 6.664, + "grad_norm": 1.0349493026733398, + "learning_rate": 2e-05, + "loss": 0.03656833, + "step": 3332 + }, + { + "epoch": 6.666, + "grad_norm": 2.0848727226257324, + "learning_rate": 2e-05, + "loss": 0.03080843, + "step": 3333 + }, + { + "epoch": 6.668, + "grad_norm": 1.4211640357971191, + "learning_rate": 2e-05, + "loss": 0.04820876, + "step": 3334 + }, + { + "epoch": 6.67, + "grad_norm": 0.9767024517059326, + "learning_rate": 2e-05, + "loss": 0.03227487, + "step": 3335 + }, + { + "epoch": 6.672, + "grad_norm": 1.7927852869033813, + "learning_rate": 2e-05, + "loss": 0.03349268, + "step": 3336 + }, + { + "epoch": 6.674, + "grad_norm": 1.3334259986877441, + "learning_rate": 2e-05, + "loss": 0.03758476, 
+ "step": 3337 + }, + { + "epoch": 6.676, + "grad_norm": 1.1497528553009033, + "learning_rate": 2e-05, + "loss": 0.0366753, + "step": 3338 + }, + { + "epoch": 6.678, + "grad_norm": 1.3280630111694336, + "learning_rate": 2e-05, + "loss": 0.04031434, + "step": 3339 + }, + { + "epoch": 6.68, + "grad_norm": 1.5619535446166992, + "learning_rate": 2e-05, + "loss": 0.02004341, + "step": 3340 + }, + { + "epoch": 6.682, + "grad_norm": 1.5152435302734375, + "learning_rate": 2e-05, + "loss": 0.04714475, + "step": 3341 + }, + { + "epoch": 6.684, + "grad_norm": 1.263704776763916, + "learning_rate": 2e-05, + "loss": 0.04945374, + "step": 3342 + }, + { + "epoch": 6.686, + "grad_norm": 1.4276520013809204, + "learning_rate": 2e-05, + "loss": 0.03332176, + "step": 3343 + }, + { + "epoch": 6.688, + "grad_norm": 0.9606978297233582, + "learning_rate": 2e-05, + "loss": 0.03038597, + "step": 3344 + }, + { + "epoch": 6.6899999999999995, + "grad_norm": 1.5653752088546753, + "learning_rate": 2e-05, + "loss": 0.04423832, + "step": 3345 + }, + { + "epoch": 6.692, + "grad_norm": 1.6132287979125977, + "learning_rate": 2e-05, + "loss": 0.04920972, + "step": 3346 + }, + { + "epoch": 6.694, + "grad_norm": 2.4168407917022705, + "learning_rate": 2e-05, + "loss": 0.05681578, + "step": 3347 + }, + { + "epoch": 6.696, + "grad_norm": 1.8760751485824585, + "learning_rate": 2e-05, + "loss": 0.0534036, + "step": 3348 + }, + { + "epoch": 6.698, + "grad_norm": 1.2305333614349365, + "learning_rate": 2e-05, + "loss": 0.02969261, + "step": 3349 + }, + { + "epoch": 6.7, + "grad_norm": 1.4671058654785156, + "learning_rate": 2e-05, + "loss": 0.02868845, + "step": 3350 + }, + { + "epoch": 6.702, + "grad_norm": 1.1077961921691895, + "learning_rate": 2e-05, + "loss": 0.0313329, + "step": 3351 + }, + { + "epoch": 6.704, + "grad_norm": 1.2289212942123413, + "learning_rate": 2e-05, + "loss": 0.04141407, + "step": 3352 + }, + { + "epoch": 6.7059999999999995, + "grad_norm": 1.0401378870010376, + "learning_rate": 2e-05, + "loss": 0.02844977, + "step": 3353 + }, + { + "epoch": 6.708, + "grad_norm": 1.6821035146713257, + "learning_rate": 2e-05, + "loss": 0.033038, + "step": 3354 + }, + { + "epoch": 6.71, + "grad_norm": 1.4683276414871216, + "learning_rate": 2e-05, + "loss": 0.03478734, + "step": 3355 + }, + { + "epoch": 6.712, + "grad_norm": 0.9681516885757446, + "learning_rate": 2e-05, + "loss": 0.02334734, + "step": 3356 + }, + { + "epoch": 6.714, + "grad_norm": 1.9194122552871704, + "learning_rate": 2e-05, + "loss": 0.05143356, + "step": 3357 + }, + { + "epoch": 6.716, + "grad_norm": 1.9872151613235474, + "learning_rate": 2e-05, + "loss": 0.0525632, + "step": 3358 + }, + { + "epoch": 6.718, + "grad_norm": 1.8832703828811646, + "learning_rate": 2e-05, + "loss": 0.04416494, + "step": 3359 + }, + { + "epoch": 6.72, + "grad_norm": 1.498202919960022, + "learning_rate": 2e-05, + "loss": 0.04000438, + "step": 3360 + }, + { + "epoch": 6.7219999999999995, + "grad_norm": 1.6321178674697876, + "learning_rate": 2e-05, + "loss": 0.05430613, + "step": 3361 + }, + { + "epoch": 6.724, + "grad_norm": 2.441689968109131, + "learning_rate": 2e-05, + "loss": 0.0355437, + "step": 3362 + }, + { + "epoch": 6.726, + "grad_norm": 1.513107419013977, + "learning_rate": 2e-05, + "loss": 0.03850288, + "step": 3363 + }, + { + "epoch": 6.728, + "grad_norm": 0.9349620938301086, + "learning_rate": 2e-05, + "loss": 0.02798152, + "step": 3364 + }, + { + "epoch": 6.73, + "grad_norm": 1.3889509439468384, + "learning_rate": 2e-05, + "loss": 0.04935129, + "step": 3365 + }, + { + 
"epoch": 6.732, + "grad_norm": 1.4558771848678589, + "learning_rate": 2e-05, + "loss": 0.04684127, + "step": 3366 + }, + { + "epoch": 6.734, + "grad_norm": 1.614352822303772, + "learning_rate": 2e-05, + "loss": 0.05351394, + "step": 3367 + }, + { + "epoch": 6.736, + "grad_norm": 1.7101666927337646, + "learning_rate": 2e-05, + "loss": 0.05229837, + "step": 3368 + }, + { + "epoch": 6.7379999999999995, + "grad_norm": 1.225295901298523, + "learning_rate": 2e-05, + "loss": 0.04323405, + "step": 3369 + }, + { + "epoch": 6.74, + "grad_norm": 1.0662435293197632, + "learning_rate": 2e-05, + "loss": 0.03803292, + "step": 3370 + }, + { + "epoch": 6.742, + "grad_norm": 1.4984309673309326, + "learning_rate": 2e-05, + "loss": 0.043239, + "step": 3371 + }, + { + "epoch": 6.744, + "grad_norm": 1.9732307195663452, + "learning_rate": 2e-05, + "loss": 0.06063057, + "step": 3372 + }, + { + "epoch": 6.746, + "grad_norm": 0.8619161248207092, + "learning_rate": 2e-05, + "loss": 0.02019028, + "step": 3373 + }, + { + "epoch": 6.748, + "grad_norm": 1.5696371793746948, + "learning_rate": 2e-05, + "loss": 0.04602835, + "step": 3374 + }, + { + "epoch": 6.75, + "grad_norm": 1.1177750825881958, + "learning_rate": 2e-05, + "loss": 0.03560708, + "step": 3375 + }, + { + "epoch": 6.752, + "grad_norm": 0.9110558032989502, + "learning_rate": 2e-05, + "loss": 0.01933064, + "step": 3376 + }, + { + "epoch": 6.754, + "grad_norm": 1.2915785312652588, + "learning_rate": 2e-05, + "loss": 0.04963629, + "step": 3377 + }, + { + "epoch": 6.756, + "grad_norm": 1.2118951082229614, + "learning_rate": 2e-05, + "loss": 0.03098101, + "step": 3378 + }, + { + "epoch": 6.758, + "grad_norm": 1.0286513566970825, + "learning_rate": 2e-05, + "loss": 0.02711356, + "step": 3379 + }, + { + "epoch": 6.76, + "grad_norm": 1.4874508380889893, + "learning_rate": 2e-05, + "loss": 0.03703025, + "step": 3380 + }, + { + "epoch": 6.7620000000000005, + "grad_norm": 1.0302430391311646, + "learning_rate": 2e-05, + "loss": 0.02999306, + "step": 3381 + }, + { + "epoch": 6.764, + "grad_norm": 1.1743347644805908, + "learning_rate": 2e-05, + "loss": 0.03301746, + "step": 3382 + }, + { + "epoch": 6.766, + "grad_norm": 1.2912507057189941, + "learning_rate": 2e-05, + "loss": 0.02964731, + "step": 3383 + }, + { + "epoch": 6.768, + "grad_norm": 1.8387466669082642, + "learning_rate": 2e-05, + "loss": 0.03587981, + "step": 3384 + }, + { + "epoch": 6.77, + "grad_norm": 1.2493430376052856, + "learning_rate": 2e-05, + "loss": 0.03021685, + "step": 3385 + }, + { + "epoch": 6.772, + "grad_norm": 1.8405768871307373, + "learning_rate": 2e-05, + "loss": 0.03573044, + "step": 3386 + }, + { + "epoch": 6.774, + "grad_norm": 1.5832135677337646, + "learning_rate": 2e-05, + "loss": 0.05767541, + "step": 3387 + }, + { + "epoch": 6.776, + "grad_norm": 1.0562320947647095, + "learning_rate": 2e-05, + "loss": 0.02797987, + "step": 3388 + }, + { + "epoch": 6.7780000000000005, + "grad_norm": 1.513688325881958, + "learning_rate": 2e-05, + "loss": 0.04148609, + "step": 3389 + }, + { + "epoch": 6.78, + "grad_norm": 1.0690497159957886, + "learning_rate": 2e-05, + "loss": 0.03502911, + "step": 3390 + }, + { + "epoch": 6.782, + "grad_norm": 1.8190356492996216, + "learning_rate": 2e-05, + "loss": 0.03984679, + "step": 3391 + }, + { + "epoch": 6.784, + "grad_norm": 1.2146891355514526, + "learning_rate": 2e-05, + "loss": 0.02811867, + "step": 3392 + }, + { + "epoch": 6.786, + "grad_norm": 2.19791579246521, + "learning_rate": 2e-05, + "loss": 0.02688298, + "step": 3393 + }, + { + "epoch": 6.788, + 
"grad_norm": 2.66430926322937, + "learning_rate": 2e-05, + "loss": 0.05289562, + "step": 3394 + }, + { + "epoch": 6.79, + "grad_norm": 1.111362338066101, + "learning_rate": 2e-05, + "loss": 0.03009887, + "step": 3395 + }, + { + "epoch": 6.792, + "grad_norm": 2.333982229232788, + "learning_rate": 2e-05, + "loss": 0.0382598, + "step": 3396 + }, + { + "epoch": 6.7940000000000005, + "grad_norm": 1.1318389177322388, + "learning_rate": 2e-05, + "loss": 0.03336748, + "step": 3397 + }, + { + "epoch": 6.796, + "grad_norm": 1.1174737215042114, + "learning_rate": 2e-05, + "loss": 0.03350708, + "step": 3398 + }, + { + "epoch": 6.798, + "grad_norm": 1.5082918405532837, + "learning_rate": 2e-05, + "loss": 0.03707235, + "step": 3399 + }, + { + "epoch": 6.8, + "grad_norm": 2.6029469966888428, + "learning_rate": 2e-05, + "loss": 0.0542256, + "step": 3400 + }, + { + "epoch": 6.802, + "grad_norm": 1.5858310461044312, + "learning_rate": 2e-05, + "loss": 0.03691325, + "step": 3401 + }, + { + "epoch": 6.804, + "grad_norm": 1.3726555109024048, + "learning_rate": 2e-05, + "loss": 0.04375734, + "step": 3402 + }, + { + "epoch": 6.806, + "grad_norm": 1.2137669324874878, + "learning_rate": 2e-05, + "loss": 0.03287685, + "step": 3403 + }, + { + "epoch": 6.808, + "grad_norm": 1.421502709388733, + "learning_rate": 2e-05, + "loss": 0.04229927, + "step": 3404 + }, + { + "epoch": 6.8100000000000005, + "grad_norm": 0.9239016771316528, + "learning_rate": 2e-05, + "loss": 0.02439195, + "step": 3405 + }, + { + "epoch": 6.812, + "grad_norm": 1.5839835405349731, + "learning_rate": 2e-05, + "loss": 0.04999159, + "step": 3406 + }, + { + "epoch": 6.814, + "grad_norm": 1.8794617652893066, + "learning_rate": 2e-05, + "loss": 0.04069415, + "step": 3407 + }, + { + "epoch": 6.816, + "grad_norm": 1.6794978380203247, + "learning_rate": 2e-05, + "loss": 0.042319, + "step": 3408 + }, + { + "epoch": 6.818, + "grad_norm": 4.976267337799072, + "learning_rate": 2e-05, + "loss": 0.03227082, + "step": 3409 + }, + { + "epoch": 6.82, + "grad_norm": 1.4886870384216309, + "learning_rate": 2e-05, + "loss": 0.04275644, + "step": 3410 + }, + { + "epoch": 6.822, + "grad_norm": 7.621867656707764, + "learning_rate": 2e-05, + "loss": 0.04230917, + "step": 3411 + }, + { + "epoch": 6.824, + "grad_norm": 1.1310099363327026, + "learning_rate": 2e-05, + "loss": 0.038154, + "step": 3412 + }, + { + "epoch": 6.826, + "grad_norm": 0.9763595461845398, + "learning_rate": 2e-05, + "loss": 0.0353835, + "step": 3413 + }, + { + "epoch": 6.828, + "grad_norm": 1.6774576902389526, + "learning_rate": 2e-05, + "loss": 0.04004671, + "step": 3414 + }, + { + "epoch": 6.83, + "grad_norm": 1.4653470516204834, + "learning_rate": 2e-05, + "loss": 0.04310292, + "step": 3415 + }, + { + "epoch": 6.832, + "grad_norm": 1.3005256652832031, + "learning_rate": 2e-05, + "loss": 0.04270222, + "step": 3416 + }, + { + "epoch": 6.834, + "grad_norm": 1.2623956203460693, + "learning_rate": 2e-05, + "loss": 0.04610048, + "step": 3417 + }, + { + "epoch": 6.836, + "grad_norm": 3.5397067070007324, + "learning_rate": 2e-05, + "loss": 0.03764736, + "step": 3418 + }, + { + "epoch": 6.838, + "grad_norm": 1.3481019735336304, + "learning_rate": 2e-05, + "loss": 0.0253664, + "step": 3419 + }, + { + "epoch": 6.84, + "grad_norm": 1.1361665725708008, + "learning_rate": 2e-05, + "loss": 0.03342405, + "step": 3420 + }, + { + "epoch": 6.842, + "grad_norm": 1.141860842704773, + "learning_rate": 2e-05, + "loss": 0.03326521, + "step": 3421 + }, + { + "epoch": 6.844, + "grad_norm": 1.170899748802185, + "learning_rate": 
2e-05, + "loss": 0.03952732, + "step": 3422 + }, + { + "epoch": 6.846, + "grad_norm": 1.3554216623306274, + "learning_rate": 2e-05, + "loss": 0.04645754, + "step": 3423 + }, + { + "epoch": 6.848, + "grad_norm": 1.1913604736328125, + "learning_rate": 2e-05, + "loss": 0.02899063, + "step": 3424 + }, + { + "epoch": 6.85, + "grad_norm": 0.9958671927452087, + "learning_rate": 2e-05, + "loss": 0.03145525, + "step": 3425 + }, + { + "epoch": 6.852, + "grad_norm": 1.9133530855178833, + "learning_rate": 2e-05, + "loss": 0.05315136, + "step": 3426 + }, + { + "epoch": 6.854, + "grad_norm": 1.4105165004730225, + "learning_rate": 2e-05, + "loss": 0.03489037, + "step": 3427 + }, + { + "epoch": 6.856, + "grad_norm": 2.0334339141845703, + "learning_rate": 2e-05, + "loss": 0.03431166, + "step": 3428 + }, + { + "epoch": 6.858, + "grad_norm": 1.2631068229675293, + "learning_rate": 2e-05, + "loss": 0.04041663, + "step": 3429 + }, + { + "epoch": 6.86, + "grad_norm": 0.889681875705719, + "learning_rate": 2e-05, + "loss": 0.02558031, + "step": 3430 + }, + { + "epoch": 6.862, + "grad_norm": 1.6193792819976807, + "learning_rate": 2e-05, + "loss": 0.04380007, + "step": 3431 + }, + { + "epoch": 6.864, + "grad_norm": 1.2849256992340088, + "learning_rate": 2e-05, + "loss": 0.03113081, + "step": 3432 + }, + { + "epoch": 6.866, + "grad_norm": 1.6333054304122925, + "learning_rate": 2e-05, + "loss": 0.02930366, + "step": 3433 + }, + { + "epoch": 6.868, + "grad_norm": 1.2340630292892456, + "learning_rate": 2e-05, + "loss": 0.03456418, + "step": 3434 + }, + { + "epoch": 6.87, + "grad_norm": 1.9538198709487915, + "learning_rate": 2e-05, + "loss": 0.037586, + "step": 3435 + }, + { + "epoch": 6.872, + "grad_norm": 1.0432629585266113, + "learning_rate": 2e-05, + "loss": 0.02851261, + "step": 3436 + }, + { + "epoch": 6.874, + "grad_norm": 1.1603375673294067, + "learning_rate": 2e-05, + "loss": 0.034657, + "step": 3437 + }, + { + "epoch": 6.876, + "grad_norm": 1.1851812601089478, + "learning_rate": 2e-05, + "loss": 0.03753354, + "step": 3438 + }, + { + "epoch": 6.878, + "grad_norm": 1.6592788696289062, + "learning_rate": 2e-05, + "loss": 0.04491906, + "step": 3439 + }, + { + "epoch": 6.88, + "grad_norm": 1.2274245023727417, + "learning_rate": 2e-05, + "loss": 0.0340904, + "step": 3440 + }, + { + "epoch": 6.882, + "grad_norm": 1.3614306449890137, + "learning_rate": 2e-05, + "loss": 0.0444744, + "step": 3441 + }, + { + "epoch": 6.884, + "grad_norm": 1.531777262687683, + "learning_rate": 2e-05, + "loss": 0.04471203, + "step": 3442 + }, + { + "epoch": 6.886, + "grad_norm": 1.5285937786102295, + "learning_rate": 2e-05, + "loss": 0.05250137, + "step": 3443 + }, + { + "epoch": 6.888, + "grad_norm": 1.1851426362991333, + "learning_rate": 2e-05, + "loss": 0.02804793, + "step": 3444 + }, + { + "epoch": 6.89, + "grad_norm": 1.1896464824676514, + "learning_rate": 2e-05, + "loss": 0.03605233, + "step": 3445 + }, + { + "epoch": 6.892, + "grad_norm": 1.2750282287597656, + "learning_rate": 2e-05, + "loss": 0.03346939, + "step": 3446 + }, + { + "epoch": 6.894, + "grad_norm": 1.1519888639450073, + "learning_rate": 2e-05, + "loss": 0.0394849, + "step": 3447 + }, + { + "epoch": 6.896, + "grad_norm": 1.385860800743103, + "learning_rate": 2e-05, + "loss": 0.03840117, + "step": 3448 + }, + { + "epoch": 6.898, + "grad_norm": 1.8041616678237915, + "learning_rate": 2e-05, + "loss": 0.04675615, + "step": 3449 + }, + { + "epoch": 6.9, + "grad_norm": 2.1062817573547363, + "learning_rate": 2e-05, + "loss": 0.06331439, + "step": 3450 + }, + { + "epoch": 6.902, 
+ "grad_norm": 1.432571530342102, + "learning_rate": 2e-05, + "loss": 0.04114764, + "step": 3451 + }, + { + "epoch": 6.904, + "grad_norm": 1.6515777111053467, + "learning_rate": 2e-05, + "loss": 0.03684805, + "step": 3452 + }, + { + "epoch": 6.906, + "grad_norm": 1.2222999334335327, + "learning_rate": 2e-05, + "loss": 0.03101808, + "step": 3453 + }, + { + "epoch": 6.908, + "grad_norm": 1.390168309211731, + "learning_rate": 2e-05, + "loss": 0.0516277, + "step": 3454 + }, + { + "epoch": 6.91, + "grad_norm": 0.9776650667190552, + "learning_rate": 2e-05, + "loss": 0.02990855, + "step": 3455 + }, + { + "epoch": 6.912, + "grad_norm": 2.2317774295806885, + "learning_rate": 2e-05, + "loss": 0.03915323, + "step": 3456 + }, + { + "epoch": 6.914, + "grad_norm": 1.5829832553863525, + "learning_rate": 2e-05, + "loss": 0.04086168, + "step": 3457 + }, + { + "epoch": 6.916, + "grad_norm": 1.0172293186187744, + "learning_rate": 2e-05, + "loss": 0.02923257, + "step": 3458 + }, + { + "epoch": 6.918, + "grad_norm": 1.2105185985565186, + "learning_rate": 2e-05, + "loss": 0.02814864, + "step": 3459 + }, + { + "epoch": 6.92, + "grad_norm": 2.631154775619507, + "learning_rate": 2e-05, + "loss": 0.05200697, + "step": 3460 + }, + { + "epoch": 6.922, + "grad_norm": 1.0045074224472046, + "learning_rate": 2e-05, + "loss": 0.03085414, + "step": 3461 + }, + { + "epoch": 6.924, + "grad_norm": 1.0970081090927124, + "learning_rate": 2e-05, + "loss": 0.03664451, + "step": 3462 + }, + { + "epoch": 6.926, + "grad_norm": 1.9260581731796265, + "learning_rate": 2e-05, + "loss": 0.05451979, + "step": 3463 + }, + { + "epoch": 6.928, + "grad_norm": 1.4260892868041992, + "learning_rate": 2e-05, + "loss": 0.03202819, + "step": 3464 + }, + { + "epoch": 6.93, + "grad_norm": 1.867712378501892, + "learning_rate": 2e-05, + "loss": 0.0472446, + "step": 3465 + }, + { + "epoch": 6.932, + "grad_norm": 1.2931119203567505, + "learning_rate": 2e-05, + "loss": 0.03627722, + "step": 3466 + }, + { + "epoch": 6.934, + "grad_norm": 1.267653226852417, + "learning_rate": 2e-05, + "loss": 0.04178291, + "step": 3467 + }, + { + "epoch": 6.936, + "grad_norm": 1.8989514112472534, + "learning_rate": 2e-05, + "loss": 0.06069873, + "step": 3468 + }, + { + "epoch": 6.938, + "grad_norm": 1.369830846786499, + "learning_rate": 2e-05, + "loss": 0.03865648, + "step": 3469 + }, + { + "epoch": 6.9399999999999995, + "grad_norm": 2.1145782470703125, + "learning_rate": 2e-05, + "loss": 0.03145633, + "step": 3470 + }, + { + "epoch": 6.942, + "grad_norm": 1.4967544078826904, + "learning_rate": 2e-05, + "loss": 0.04484332, + "step": 3471 + }, + { + "epoch": 6.944, + "grad_norm": 0.9439011216163635, + "learning_rate": 2e-05, + "loss": 0.0252577, + "step": 3472 + }, + { + "epoch": 6.946, + "grad_norm": 1.3101210594177246, + "learning_rate": 2e-05, + "loss": 0.02739098, + "step": 3473 + }, + { + "epoch": 6.948, + "grad_norm": 1.3791102170944214, + "learning_rate": 2e-05, + "loss": 0.03050887, + "step": 3474 + }, + { + "epoch": 6.95, + "grad_norm": 1.8151313066482544, + "learning_rate": 2e-05, + "loss": 0.04466762, + "step": 3475 + }, + { + "epoch": 6.952, + "grad_norm": 1.7713474035263062, + "learning_rate": 2e-05, + "loss": 0.03592313, + "step": 3476 + }, + { + "epoch": 6.954, + "grad_norm": 1.4001665115356445, + "learning_rate": 2e-05, + "loss": 0.04347411, + "step": 3477 + }, + { + "epoch": 6.9559999999999995, + "grad_norm": 2.1821348667144775, + "learning_rate": 2e-05, + "loss": 0.04121158, + "step": 3478 + }, + { + "epoch": 6.958, + "grad_norm": 1.3329628705978394, + 
"learning_rate": 2e-05, + "loss": 0.0454905, + "step": 3479 + }, + { + "epoch": 6.96, + "grad_norm": 2.2498857975006104, + "learning_rate": 2e-05, + "loss": 0.05597805, + "step": 3480 + }, + { + "epoch": 6.962, + "grad_norm": 1.236721396446228, + "learning_rate": 2e-05, + "loss": 0.03585602, + "step": 3481 + }, + { + "epoch": 6.964, + "grad_norm": 2.087568521499634, + "learning_rate": 2e-05, + "loss": 0.03135363, + "step": 3482 + }, + { + "epoch": 6.966, + "grad_norm": 2.696734666824341, + "learning_rate": 2e-05, + "loss": 0.05923796, + "step": 3483 + }, + { + "epoch": 6.968, + "grad_norm": 1.5622137784957886, + "learning_rate": 2e-05, + "loss": 0.04081835, + "step": 3484 + }, + { + "epoch": 6.97, + "grad_norm": 1.6300699710845947, + "learning_rate": 2e-05, + "loss": 0.0384453, + "step": 3485 + }, + { + "epoch": 6.9719999999999995, + "grad_norm": 1.4294748306274414, + "learning_rate": 2e-05, + "loss": 0.05283351, + "step": 3486 + }, + { + "epoch": 6.974, + "grad_norm": 5.5012335777282715, + "learning_rate": 2e-05, + "loss": 0.04624298, + "step": 3487 + }, + { + "epoch": 6.976, + "grad_norm": 0.8324853181838989, + "learning_rate": 2e-05, + "loss": 0.0236811, + "step": 3488 + }, + { + "epoch": 6.978, + "grad_norm": 1.4482553005218506, + "learning_rate": 2e-05, + "loss": 0.04167052, + "step": 3489 + }, + { + "epoch": 6.98, + "grad_norm": 1.1689647436141968, + "learning_rate": 2e-05, + "loss": 0.03899954, + "step": 3490 + }, + { + "epoch": 6.982, + "grad_norm": 1.237749695777893, + "learning_rate": 2e-05, + "loss": 0.03976803, + "step": 3491 + }, + { + "epoch": 6.984, + "grad_norm": 0.9023482799530029, + "learning_rate": 2e-05, + "loss": 0.02601608, + "step": 3492 + }, + { + "epoch": 6.986, + "grad_norm": 1.958407998085022, + "learning_rate": 2e-05, + "loss": 0.0403572, + "step": 3493 + }, + { + "epoch": 6.9879999999999995, + "grad_norm": 1.4088008403778076, + "learning_rate": 2e-05, + "loss": 0.03802837, + "step": 3494 + }, + { + "epoch": 6.99, + "grad_norm": 1.656472086906433, + "learning_rate": 2e-05, + "loss": 0.04578206, + "step": 3495 + }, + { + "epoch": 6.992, + "grad_norm": 2.143359422683716, + "learning_rate": 2e-05, + "loss": 0.04628511, + "step": 3496 + }, + { + "epoch": 6.994, + "grad_norm": 1.4736688137054443, + "learning_rate": 2e-05, + "loss": 0.02740022, + "step": 3497 + }, + { + "epoch": 6.996, + "grad_norm": 1.2310930490493774, + "learning_rate": 2e-05, + "loss": 0.04610129, + "step": 3498 + }, + { + "epoch": 6.998, + "grad_norm": 1.3057410717010498, + "learning_rate": 2e-05, + "loss": 0.03868414, + "step": 3499 + }, + { + "epoch": 7.0, + "grad_norm": 1.592368483543396, + "learning_rate": 2e-05, + "loss": 0.0320503, + "step": 3500 + }, + { + "epoch": 7.0, + "eval_performance": { + "AngleClassification_1": 0.986, + "AngleClassification_2": 0.996, + "AngleClassification_3": 0.9021956087824351, + "Equal_1": 0.994, + "Equal_2": 0.9301397205588823, + "Equal_3": 0.7924151696606786, + "LineComparison_1": 0.998, + "LineComparison_2": 0.9960079840319361, + "LineComparison_3": 0.9920159680638723, + "Parallel_1": 0.9719438877755511, + "Parallel_2": 0.9919839679358717, + "Parallel_3": 0.988, + "Perpendicular_1": 0.982, + "Perpendicular_2": 0.74, + "Perpendicular_3": 0.3316633266533066, + "PointLiesOnCircle_1": 0.9979959919839679, + "PointLiesOnCircle_2": 0.9972666666666666, + "PointLiesOnCircle_3": 0.9936, + "PointLiesOnLine_1": 1.0, + "PointLiesOnLine_2": 0.9899799599198397, + "PointLiesOnLine_3": 0.8223552894211577 + }, + "eval_runtime": 319.1391, + "eval_samples_per_second": 32.901, + 
"eval_steps_per_second": 0.658, + "step": 3500 + }, + { + "epoch": 7.002, + "grad_norm": 2.186814069747925, + "learning_rate": 2e-05, + "loss": 0.06875495, + "step": 3501 + }, + { + "epoch": 7.004, + "grad_norm": 2.7851738929748535, + "learning_rate": 2e-05, + "loss": 0.07352186, + "step": 3502 + }, + { + "epoch": 7.006, + "grad_norm": 1.6021937131881714, + "learning_rate": 2e-05, + "loss": 0.04718731, + "step": 3503 + }, + { + "epoch": 7.008, + "grad_norm": 1.708878755569458, + "learning_rate": 2e-05, + "loss": 0.05183144, + "step": 3504 + }, + { + "epoch": 7.01, + "grad_norm": 1.2704074382781982, + "learning_rate": 2e-05, + "loss": 0.04325004, + "step": 3505 + }, + { + "epoch": 7.012, + "grad_norm": 1.7846604585647583, + "learning_rate": 2e-05, + "loss": 0.04165163, + "step": 3506 + }, + { + "epoch": 7.014, + "grad_norm": 2.000779390335083, + "learning_rate": 2e-05, + "loss": 0.05959515, + "step": 3507 + }, + { + "epoch": 7.016, + "grad_norm": 1.2613239288330078, + "learning_rate": 2e-05, + "loss": 0.04999759, + "step": 3508 + }, + { + "epoch": 7.018, + "grad_norm": 1.419448971748352, + "learning_rate": 2e-05, + "loss": 0.05279196, + "step": 3509 + }, + { + "epoch": 7.02, + "grad_norm": 1.1730986833572388, + "learning_rate": 2e-05, + "loss": 0.04814022, + "step": 3510 + }, + { + "epoch": 7.022, + "grad_norm": 1.3364038467407227, + "learning_rate": 2e-05, + "loss": 0.0524789, + "step": 3511 + }, + { + "epoch": 7.024, + "grad_norm": 1.4599989652633667, + "learning_rate": 2e-05, + "loss": 0.06457356, + "step": 3512 + }, + { + "epoch": 7.026, + "grad_norm": 1.2211647033691406, + "learning_rate": 2e-05, + "loss": 0.03781735, + "step": 3513 + }, + { + "epoch": 7.028, + "grad_norm": 1.916827917098999, + "learning_rate": 2e-05, + "loss": 0.04609166, + "step": 3514 + }, + { + "epoch": 7.03, + "grad_norm": 1.3852880001068115, + "learning_rate": 2e-05, + "loss": 0.05185224, + "step": 3515 + }, + { + "epoch": 7.032, + "grad_norm": 1.9086310863494873, + "learning_rate": 2e-05, + "loss": 0.06317137, + "step": 3516 + }, + { + "epoch": 7.034, + "grad_norm": 1.3954685926437378, + "learning_rate": 2e-05, + "loss": 0.04755458, + "step": 3517 + }, + { + "epoch": 7.036, + "grad_norm": 1.2691612243652344, + "learning_rate": 2e-05, + "loss": 0.04614236, + "step": 3518 + }, + { + "epoch": 7.038, + "grad_norm": 1.438931941986084, + "learning_rate": 2e-05, + "loss": 0.06192727, + "step": 3519 + }, + { + "epoch": 7.04, + "grad_norm": 1.505270004272461, + "learning_rate": 2e-05, + "loss": 0.04731307, + "step": 3520 + }, + { + "epoch": 7.042, + "grad_norm": 1.2275315523147583, + "learning_rate": 2e-05, + "loss": 0.05152331, + "step": 3521 + }, + { + "epoch": 7.044, + "grad_norm": 1.6780346632003784, + "learning_rate": 2e-05, + "loss": 0.04752352, + "step": 3522 + }, + { + "epoch": 7.046, + "grad_norm": 1.3278478384017944, + "learning_rate": 2e-05, + "loss": 0.03956543, + "step": 3523 + }, + { + "epoch": 7.048, + "grad_norm": 1.9214937686920166, + "learning_rate": 2e-05, + "loss": 0.05849922, + "step": 3524 + }, + { + "epoch": 7.05, + "grad_norm": 2.2144150733947754, + "learning_rate": 2e-05, + "loss": 0.04467774, + "step": 3525 + }, + { + "epoch": 7.052, + "grad_norm": 1.5400785207748413, + "learning_rate": 2e-05, + "loss": 0.05098802, + "step": 3526 + }, + { + "epoch": 7.054, + "grad_norm": 2.3327882289886475, + "learning_rate": 2e-05, + "loss": 0.04657032, + "step": 3527 + }, + { + "epoch": 7.056, + "grad_norm": 1.3429843187332153, + "learning_rate": 2e-05, + "loss": 0.04105748, + "step": 3528 + }, + { + "epoch": 
7.058, + "grad_norm": 1.845400094985962, + "learning_rate": 2e-05, + "loss": 0.04504231, + "step": 3529 + }, + { + "epoch": 7.06, + "grad_norm": 2.3795065879821777, + "learning_rate": 2e-05, + "loss": 0.0596103, + "step": 3530 + }, + { + "epoch": 7.062, + "grad_norm": 1.7181967496871948, + "learning_rate": 2e-05, + "loss": 0.05508241, + "step": 3531 + }, + { + "epoch": 7.064, + "grad_norm": 2.2078285217285156, + "learning_rate": 2e-05, + "loss": 0.07361597, + "step": 3532 + }, + { + "epoch": 7.066, + "grad_norm": 1.3147023916244507, + "learning_rate": 2e-05, + "loss": 0.04743645, + "step": 3533 + }, + { + "epoch": 7.068, + "grad_norm": 1.294423222541809, + "learning_rate": 2e-05, + "loss": 0.03428509, + "step": 3534 + }, + { + "epoch": 7.07, + "grad_norm": 1.5514196157455444, + "learning_rate": 2e-05, + "loss": 0.05675917, + "step": 3535 + }, + { + "epoch": 7.072, + "grad_norm": 2.4513447284698486, + "learning_rate": 2e-05, + "loss": 0.05515049, + "step": 3536 + }, + { + "epoch": 7.074, + "grad_norm": 1.598224401473999, + "learning_rate": 2e-05, + "loss": 0.07278877, + "step": 3537 + }, + { + "epoch": 7.076, + "grad_norm": 1.0842782258987427, + "learning_rate": 2e-05, + "loss": 0.04107867, + "step": 3538 + }, + { + "epoch": 7.078, + "grad_norm": 1.510959267616272, + "learning_rate": 2e-05, + "loss": 0.06507348, + "step": 3539 + }, + { + "epoch": 7.08, + "grad_norm": 1.5507309436798096, + "learning_rate": 2e-05, + "loss": 0.03261951, + "step": 3540 + }, + { + "epoch": 7.082, + "grad_norm": 1.7355281114578247, + "learning_rate": 2e-05, + "loss": 0.06146067, + "step": 3541 + }, + { + "epoch": 7.084, + "grad_norm": 4.931440830230713, + "learning_rate": 2e-05, + "loss": 0.04650098, + "step": 3542 + }, + { + "epoch": 7.086, + "grad_norm": 2.3309950828552246, + "learning_rate": 2e-05, + "loss": 0.09169232, + "step": 3543 + }, + { + "epoch": 7.088, + "grad_norm": 2.2715747356414795, + "learning_rate": 2e-05, + "loss": 0.05420232, + "step": 3544 + }, + { + "epoch": 7.09, + "grad_norm": 2.207563877105713, + "learning_rate": 2e-05, + "loss": 0.03945183, + "step": 3545 + }, + { + "epoch": 7.092, + "grad_norm": 1.6729447841644287, + "learning_rate": 2e-05, + "loss": 0.04845568, + "step": 3546 + }, + { + "epoch": 7.094, + "grad_norm": 1.796739101409912, + "learning_rate": 2e-05, + "loss": 0.06503285, + "step": 3547 + }, + { + "epoch": 7.096, + "grad_norm": 1.2983309030532837, + "learning_rate": 2e-05, + "loss": 0.04906647, + "step": 3548 + }, + { + "epoch": 7.098, + "grad_norm": 1.4331189393997192, + "learning_rate": 2e-05, + "loss": 0.05561874, + "step": 3549 + }, + { + "epoch": 7.1, + "grad_norm": 1.5238709449768066, + "learning_rate": 2e-05, + "loss": 0.05148921, + "step": 3550 + }, + { + "epoch": 7.102, + "grad_norm": 1.2250235080718994, + "learning_rate": 2e-05, + "loss": 0.04760787, + "step": 3551 + }, + { + "epoch": 7.104, + "grad_norm": 1.3822916746139526, + "learning_rate": 2e-05, + "loss": 0.0511881, + "step": 3552 + }, + { + "epoch": 7.106, + "grad_norm": 2.5087249279022217, + "learning_rate": 2e-05, + "loss": 0.06286559, + "step": 3553 + }, + { + "epoch": 7.108, + "grad_norm": 2.8455286026000977, + "learning_rate": 2e-05, + "loss": 0.05309931, + "step": 3554 + }, + { + "epoch": 7.11, + "grad_norm": 1.4749555587768555, + "learning_rate": 2e-05, + "loss": 0.05005356, + "step": 3555 + }, + { + "epoch": 7.112, + "grad_norm": 2.161928415298462, + "learning_rate": 2e-05, + "loss": 0.04809496, + "step": 3556 + }, + { + "epoch": 7.114, + "grad_norm": 1.3201868534088135, + "learning_rate": 2e-05, + 
"loss": 0.05066467, + "step": 3557 + }, + { + "epoch": 7.116, + "grad_norm": 1.2147846221923828, + "learning_rate": 2e-05, + "loss": 0.05296815, + "step": 3558 + }, + { + "epoch": 7.118, + "grad_norm": 1.3991172313690186, + "learning_rate": 2e-05, + "loss": 0.04677524, + "step": 3559 + }, + { + "epoch": 7.12, + "grad_norm": 6.09710693359375, + "learning_rate": 2e-05, + "loss": 0.04200501, + "step": 3560 + }, + { + "epoch": 7.122, + "grad_norm": 1.4460575580596924, + "learning_rate": 2e-05, + "loss": 0.0484569, + "step": 3561 + }, + { + "epoch": 7.124, + "grad_norm": 1.2892142534255981, + "learning_rate": 2e-05, + "loss": 0.04489253, + "step": 3562 + }, + { + "epoch": 7.126, + "grad_norm": 2.244136333465576, + "learning_rate": 2e-05, + "loss": 0.05174838, + "step": 3563 + }, + { + "epoch": 7.128, + "grad_norm": 0.9121345281600952, + "learning_rate": 2e-05, + "loss": 0.02768136, + "step": 3564 + }, + { + "epoch": 7.13, + "grad_norm": 1.626583218574524, + "learning_rate": 2e-05, + "loss": 0.05284789, + "step": 3565 + }, + { + "epoch": 7.132, + "grad_norm": 1.1587470769882202, + "learning_rate": 2e-05, + "loss": 0.04420138, + "step": 3566 + }, + { + "epoch": 7.134, + "grad_norm": 1.1360605955123901, + "learning_rate": 2e-05, + "loss": 0.03466136, + "step": 3567 + }, + { + "epoch": 7.136, + "grad_norm": 1.6288130283355713, + "learning_rate": 2e-05, + "loss": 0.05938084, + "step": 3568 + }, + { + "epoch": 7.138, + "grad_norm": 1.7451311349868774, + "learning_rate": 2e-05, + "loss": 0.07126134, + "step": 3569 + }, + { + "epoch": 7.14, + "grad_norm": 1.0442672967910767, + "learning_rate": 2e-05, + "loss": 0.0229485, + "step": 3570 + }, + { + "epoch": 7.142, + "grad_norm": 0.9555102586746216, + "learning_rate": 2e-05, + "loss": 0.03555539, + "step": 3571 + }, + { + "epoch": 7.144, + "grad_norm": 1.9347809553146362, + "learning_rate": 2e-05, + "loss": 0.065361, + "step": 3572 + }, + { + "epoch": 7.146, + "grad_norm": 1.5342516899108887, + "learning_rate": 2e-05, + "loss": 0.05312759, + "step": 3573 + }, + { + "epoch": 7.148, + "grad_norm": 2.1047842502593994, + "learning_rate": 2e-05, + "loss": 0.04418172, + "step": 3574 + }, + { + "epoch": 7.15, + "grad_norm": 1.7148213386535645, + "learning_rate": 2e-05, + "loss": 0.05144256, + "step": 3575 + }, + { + "epoch": 7.152, + "grad_norm": 2.749667167663574, + "learning_rate": 2e-05, + "loss": 0.07798346, + "step": 3576 + }, + { + "epoch": 7.154, + "grad_norm": 2.0609371662139893, + "learning_rate": 2e-05, + "loss": 0.04355539, + "step": 3577 + }, + { + "epoch": 7.156, + "grad_norm": 1.6791129112243652, + "learning_rate": 2e-05, + "loss": 0.05896265, + "step": 3578 + }, + { + "epoch": 7.158, + "grad_norm": 2.215697765350342, + "learning_rate": 2e-05, + "loss": 0.0662262, + "step": 3579 + }, + { + "epoch": 7.16, + "grad_norm": 1.0952160358428955, + "learning_rate": 2e-05, + "loss": 0.03965465, + "step": 3580 + }, + { + "epoch": 7.162, + "grad_norm": 1.4443844556808472, + "learning_rate": 2e-05, + "loss": 0.041635, + "step": 3581 + }, + { + "epoch": 7.164, + "grad_norm": 3.1026885509490967, + "learning_rate": 2e-05, + "loss": 0.06438316, + "step": 3582 + }, + { + "epoch": 7.166, + "grad_norm": 1.489847183227539, + "learning_rate": 2e-05, + "loss": 0.04829738, + "step": 3583 + }, + { + "epoch": 7.168, + "grad_norm": 1.4437965154647827, + "learning_rate": 2e-05, + "loss": 0.05083038, + "step": 3584 + }, + { + "epoch": 7.17, + "grad_norm": 1.0720726251602173, + "learning_rate": 2e-05, + "loss": 0.04749148, + "step": 3585 + }, + { + "epoch": 7.172, + 
"grad_norm": 1.5895586013793945, + "learning_rate": 2e-05, + "loss": 0.06001811, + "step": 3586 + }, + { + "epoch": 7.174, + "grad_norm": 1.490312933921814, + "learning_rate": 2e-05, + "loss": 0.03755351, + "step": 3587 + }, + { + "epoch": 7.176, + "grad_norm": 1.5626713037490845, + "learning_rate": 2e-05, + "loss": 0.05684235, + "step": 3588 + }, + { + "epoch": 7.178, + "grad_norm": 1.5374075174331665, + "learning_rate": 2e-05, + "loss": 0.05959694, + "step": 3589 + }, + { + "epoch": 7.18, + "grad_norm": 1.7578250169754028, + "learning_rate": 2e-05, + "loss": 0.05503277, + "step": 3590 + }, + { + "epoch": 7.182, + "grad_norm": 2.2269232273101807, + "learning_rate": 2e-05, + "loss": 0.05974087, + "step": 3591 + }, + { + "epoch": 7.184, + "grad_norm": 2.0154407024383545, + "learning_rate": 2e-05, + "loss": 0.064426, + "step": 3592 + }, + { + "epoch": 7.186, + "grad_norm": 4.343257427215576, + "learning_rate": 2e-05, + "loss": 0.06492545, + "step": 3593 + }, + { + "epoch": 7.188, + "grad_norm": 1.5611995458602905, + "learning_rate": 2e-05, + "loss": 0.05464255, + "step": 3594 + }, + { + "epoch": 7.19, + "grad_norm": 2.143812656402588, + "learning_rate": 2e-05, + "loss": 0.06049203, + "step": 3595 + }, + { + "epoch": 7.192, + "grad_norm": 1.4210190773010254, + "learning_rate": 2e-05, + "loss": 0.05442027, + "step": 3596 + }, + { + "epoch": 7.194, + "grad_norm": 1.7201104164123535, + "learning_rate": 2e-05, + "loss": 0.07043049, + "step": 3597 + }, + { + "epoch": 7.196, + "grad_norm": 1.361403226852417, + "learning_rate": 2e-05, + "loss": 0.04256427, + "step": 3598 + }, + { + "epoch": 7.198, + "grad_norm": 1.2954514026641846, + "learning_rate": 2e-05, + "loss": 0.04150357, + "step": 3599 + }, + { + "epoch": 7.2, + "grad_norm": 1.6332778930664062, + "learning_rate": 2e-05, + "loss": 0.04461187, + "step": 3600 + }, + { + "epoch": 7.202, + "grad_norm": 1.5767091512680054, + "learning_rate": 2e-05, + "loss": 0.05819118, + "step": 3601 + }, + { + "epoch": 7.204, + "grad_norm": 1.3004631996154785, + "learning_rate": 2e-05, + "loss": 0.03637975, + "step": 3602 + }, + { + "epoch": 7.206, + "grad_norm": 1.3462961912155151, + "learning_rate": 2e-05, + "loss": 0.03405475, + "step": 3603 + }, + { + "epoch": 7.208, + "grad_norm": 1.1177797317504883, + "learning_rate": 2e-05, + "loss": 0.03819214, + "step": 3604 + }, + { + "epoch": 7.21, + "grad_norm": 2.921354293823242, + "learning_rate": 2e-05, + "loss": 0.06285311, + "step": 3605 + }, + { + "epoch": 7.212, + "grad_norm": 1.4188235998153687, + "learning_rate": 2e-05, + "loss": 0.05607903, + "step": 3606 + }, + { + "epoch": 7.214, + "grad_norm": 2.9079253673553467, + "learning_rate": 2e-05, + "loss": 0.06680961, + "step": 3607 + }, + { + "epoch": 7.216, + "grad_norm": 1.4562344551086426, + "learning_rate": 2e-05, + "loss": 0.05460964, + "step": 3608 + }, + { + "epoch": 7.218, + "grad_norm": 1.5972447395324707, + "learning_rate": 2e-05, + "loss": 0.04806063, + "step": 3609 + }, + { + "epoch": 7.22, + "grad_norm": 1.8971936702728271, + "learning_rate": 2e-05, + "loss": 0.05641398, + "step": 3610 + }, + { + "epoch": 7.222, + "grad_norm": 1.4135507345199585, + "learning_rate": 2e-05, + "loss": 0.04675736, + "step": 3611 + }, + { + "epoch": 7.224, + "grad_norm": 1.3104100227355957, + "learning_rate": 2e-05, + "loss": 0.04616308, + "step": 3612 + }, + { + "epoch": 7.226, + "grad_norm": 1.6768646240234375, + "learning_rate": 2e-05, + "loss": 0.05637186, + "step": 3613 + }, + { + "epoch": 7.228, + "grad_norm": 1.6318180561065674, + "learning_rate": 2e-05, + 
"loss": 0.06836526, + "step": 3614 + }, + { + "epoch": 7.23, + "grad_norm": 1.0449095964431763, + "learning_rate": 2e-05, + "loss": 0.03408219, + "step": 3615 + }, + { + "epoch": 7.232, + "grad_norm": 1.6176615953445435, + "learning_rate": 2e-05, + "loss": 0.06785093, + "step": 3616 + }, + { + "epoch": 7.234, + "grad_norm": 1.3260332345962524, + "learning_rate": 2e-05, + "loss": 0.04694654, + "step": 3617 + }, + { + "epoch": 7.236, + "grad_norm": 1.478721261024475, + "learning_rate": 2e-05, + "loss": 0.04473628, + "step": 3618 + }, + { + "epoch": 7.2379999999999995, + "grad_norm": 1.9737378358840942, + "learning_rate": 2e-05, + "loss": 0.06412191, + "step": 3619 + }, + { + "epoch": 7.24, + "grad_norm": 1.4525940418243408, + "learning_rate": 2e-05, + "loss": 0.04360288, + "step": 3620 + }, + { + "epoch": 7.242, + "grad_norm": 2.22672963142395, + "learning_rate": 2e-05, + "loss": 0.06661811, + "step": 3621 + }, + { + "epoch": 7.244, + "grad_norm": 1.2235952615737915, + "learning_rate": 2e-05, + "loss": 0.0464549, + "step": 3622 + }, + { + "epoch": 7.246, + "grad_norm": 1.7983734607696533, + "learning_rate": 2e-05, + "loss": 0.04726644, + "step": 3623 + }, + { + "epoch": 7.248, + "grad_norm": 1.316263198852539, + "learning_rate": 2e-05, + "loss": 0.0438324, + "step": 3624 + }, + { + "epoch": 7.25, + "grad_norm": 1.3271807432174683, + "learning_rate": 2e-05, + "loss": 0.04459179, + "step": 3625 + }, + { + "epoch": 7.252, + "grad_norm": 1.7631975412368774, + "learning_rate": 2e-05, + "loss": 0.04965796, + "step": 3626 + }, + { + "epoch": 7.254, + "grad_norm": 2.7661640644073486, + "learning_rate": 2e-05, + "loss": 0.06493346, + "step": 3627 + }, + { + "epoch": 7.256, + "grad_norm": 1.1280755996704102, + "learning_rate": 2e-05, + "loss": 0.03161623, + "step": 3628 + }, + { + "epoch": 7.258, + "grad_norm": 1.7222224473953247, + "learning_rate": 2e-05, + "loss": 0.0412074, + "step": 3629 + }, + { + "epoch": 7.26, + "grad_norm": 1.4843904972076416, + "learning_rate": 2e-05, + "loss": 0.04300556, + "step": 3630 + }, + { + "epoch": 7.2620000000000005, + "grad_norm": 1.4301111698150635, + "learning_rate": 2e-05, + "loss": 0.05478829, + "step": 3631 + }, + { + "epoch": 7.264, + "grad_norm": 1.6331700086593628, + "learning_rate": 2e-05, + "loss": 0.04211323, + "step": 3632 + }, + { + "epoch": 7.266, + "grad_norm": 1.3613964319229126, + "learning_rate": 2e-05, + "loss": 0.04794502, + "step": 3633 + }, + { + "epoch": 7.268, + "grad_norm": 1.2767889499664307, + "learning_rate": 2e-05, + "loss": 0.04734127, + "step": 3634 + }, + { + "epoch": 7.27, + "grad_norm": 1.6509748697280884, + "learning_rate": 2e-05, + "loss": 0.05143131, + "step": 3635 + }, + { + "epoch": 7.272, + "grad_norm": 1.3792498111724854, + "learning_rate": 2e-05, + "loss": 0.04537631, + "step": 3636 + }, + { + "epoch": 7.274, + "grad_norm": 1.7637391090393066, + "learning_rate": 2e-05, + "loss": 0.04865746, + "step": 3637 + }, + { + "epoch": 7.276, + "grad_norm": 1.275904655456543, + "learning_rate": 2e-05, + "loss": 0.03691653, + "step": 3638 + }, + { + "epoch": 7.2780000000000005, + "grad_norm": 1.575995922088623, + "learning_rate": 2e-05, + "loss": 0.05559878, + "step": 3639 + }, + { + "epoch": 7.28, + "grad_norm": 2.00274920463562, + "learning_rate": 2e-05, + "loss": 0.06520915, + "step": 3640 + }, + { + "epoch": 7.282, + "grad_norm": 1.6603018045425415, + "learning_rate": 2e-05, + "loss": 0.04692579, + "step": 3641 + }, + { + "epoch": 7.284, + "grad_norm": 1.4534879922866821, + "learning_rate": 2e-05, + "loss": 0.05134547, + "step": 
3642 + }, + { + "epoch": 7.286, + "grad_norm": 1.8674389123916626, + "learning_rate": 2e-05, + "loss": 0.06286973, + "step": 3643 + }, + { + "epoch": 7.288, + "grad_norm": 1.2336664199829102, + "learning_rate": 2e-05, + "loss": 0.04018402, + "step": 3644 + }, + { + "epoch": 7.29, + "grad_norm": 1.0961309671401978, + "learning_rate": 2e-05, + "loss": 0.03630316, + "step": 3645 + }, + { + "epoch": 7.292, + "grad_norm": 1.4161972999572754, + "learning_rate": 2e-05, + "loss": 0.04280121, + "step": 3646 + }, + { + "epoch": 7.294, + "grad_norm": 1.8841363191604614, + "learning_rate": 2e-05, + "loss": 0.06651107, + "step": 3647 + }, + { + "epoch": 7.296, + "grad_norm": 1.3848674297332764, + "learning_rate": 2e-05, + "loss": 0.05075488, + "step": 3648 + }, + { + "epoch": 7.298, + "grad_norm": 2.675525665283203, + "learning_rate": 2e-05, + "loss": 0.0590144, + "step": 3649 + }, + { + "epoch": 7.3, + "grad_norm": 2.8043618202209473, + "learning_rate": 2e-05, + "loss": 0.05341516, + "step": 3650 + }, + { + "epoch": 7.302, + "grad_norm": 1.16990065574646, + "learning_rate": 2e-05, + "loss": 0.03649558, + "step": 3651 + }, + { + "epoch": 7.304, + "grad_norm": 1.3103454113006592, + "learning_rate": 2e-05, + "loss": 0.04651901, + "step": 3652 + }, + { + "epoch": 7.306, + "grad_norm": 1.39406156539917, + "learning_rate": 2e-05, + "loss": 0.04839561, + "step": 3653 + }, + { + "epoch": 7.308, + "grad_norm": 1.6239453554153442, + "learning_rate": 2e-05, + "loss": 0.05581132, + "step": 3654 + }, + { + "epoch": 7.31, + "grad_norm": 1.9287298917770386, + "learning_rate": 2e-05, + "loss": 0.05162586, + "step": 3655 + }, + { + "epoch": 7.312, + "grad_norm": 1.1683954000473022, + "learning_rate": 2e-05, + "loss": 0.04523386, + "step": 3656 + }, + { + "epoch": 7.314, + "grad_norm": 1.9765461683273315, + "learning_rate": 2e-05, + "loss": 0.05748572, + "step": 3657 + }, + { + "epoch": 7.316, + "grad_norm": 1.5653101205825806, + "learning_rate": 2e-05, + "loss": 0.05275439, + "step": 3658 + }, + { + "epoch": 7.318, + "grad_norm": 1.9025769233703613, + "learning_rate": 2e-05, + "loss": 0.0513374, + "step": 3659 + }, + { + "epoch": 7.32, + "grad_norm": 0.9924256801605225, + "learning_rate": 2e-05, + "loss": 0.02910082, + "step": 3660 + }, + { + "epoch": 7.322, + "grad_norm": 1.058456301689148, + "learning_rate": 2e-05, + "loss": 0.03298526, + "step": 3661 + }, + { + "epoch": 7.324, + "grad_norm": 1.0151259899139404, + "learning_rate": 2e-05, + "loss": 0.03118435, + "step": 3662 + }, + { + "epoch": 7.326, + "grad_norm": 2.0813844203948975, + "learning_rate": 2e-05, + "loss": 0.05353644, + "step": 3663 + }, + { + "epoch": 7.328, + "grad_norm": 2.497664213180542, + "learning_rate": 2e-05, + "loss": 0.07998416, + "step": 3664 + }, + { + "epoch": 7.33, + "grad_norm": 1.1892470121383667, + "learning_rate": 2e-05, + "loss": 0.04133311, + "step": 3665 + }, + { + "epoch": 7.332, + "grad_norm": 1.0578644275665283, + "learning_rate": 2e-05, + "loss": 0.03139057, + "step": 3666 + }, + { + "epoch": 7.334, + "grad_norm": 1.7718684673309326, + "learning_rate": 2e-05, + "loss": 0.06689329, + "step": 3667 + }, + { + "epoch": 7.336, + "grad_norm": 2.739551305770874, + "learning_rate": 2e-05, + "loss": 0.03666856, + "step": 3668 + }, + { + "epoch": 7.338, + "grad_norm": 1.403628945350647, + "learning_rate": 2e-05, + "loss": 0.0506552, + "step": 3669 + }, + { + "epoch": 7.34, + "grad_norm": 1.8418684005737305, + "learning_rate": 2e-05, + "loss": 0.05253516, + "step": 3670 + }, + { + "epoch": 7.342, + "grad_norm": 1.2462759017944336, + 
"learning_rate": 2e-05, + "loss": 0.04325264, + "step": 3671 + }, + { + "epoch": 7.344, + "grad_norm": 3.195005178451538, + "learning_rate": 2e-05, + "loss": 0.05133444, + "step": 3672 + }, + { + "epoch": 7.346, + "grad_norm": 1.543524146080017, + "learning_rate": 2e-05, + "loss": 0.04906403, + "step": 3673 + }, + { + "epoch": 7.348, + "grad_norm": 1.4386686086654663, + "learning_rate": 2e-05, + "loss": 0.04781891, + "step": 3674 + }, + { + "epoch": 7.35, + "grad_norm": 1.7783071994781494, + "learning_rate": 2e-05, + "loss": 0.0641686, + "step": 3675 + }, + { + "epoch": 7.352, + "grad_norm": 1.090715765953064, + "learning_rate": 2e-05, + "loss": 0.03242114, + "step": 3676 + }, + { + "epoch": 7.354, + "grad_norm": 1.3116369247436523, + "learning_rate": 2e-05, + "loss": 0.03984781, + "step": 3677 + }, + { + "epoch": 7.356, + "grad_norm": 1.894168734550476, + "learning_rate": 2e-05, + "loss": 0.03056693, + "step": 3678 + }, + { + "epoch": 7.358, + "grad_norm": 1.7085542678833008, + "learning_rate": 2e-05, + "loss": 0.05443097, + "step": 3679 + }, + { + "epoch": 7.36, + "grad_norm": 1.401723027229309, + "learning_rate": 2e-05, + "loss": 0.04774553, + "step": 3680 + }, + { + "epoch": 7.362, + "grad_norm": 1.4581307172775269, + "learning_rate": 2e-05, + "loss": 0.04450407, + "step": 3681 + }, + { + "epoch": 7.364, + "grad_norm": 1.6069996356964111, + "learning_rate": 2e-05, + "loss": 0.06170261, + "step": 3682 + }, + { + "epoch": 7.366, + "grad_norm": 1.324460506439209, + "learning_rate": 2e-05, + "loss": 0.04895044, + "step": 3683 + }, + { + "epoch": 7.368, + "grad_norm": 1.3643289804458618, + "learning_rate": 2e-05, + "loss": 0.04545024, + "step": 3684 + }, + { + "epoch": 7.37, + "grad_norm": 1.6585203409194946, + "learning_rate": 2e-05, + "loss": 0.04801652, + "step": 3685 + }, + { + "epoch": 7.372, + "grad_norm": 1.4287261962890625, + "learning_rate": 2e-05, + "loss": 0.0614093, + "step": 3686 + }, + { + "epoch": 7.374, + "grad_norm": 1.5790996551513672, + "learning_rate": 2e-05, + "loss": 0.05381008, + "step": 3687 + }, + { + "epoch": 7.376, + "grad_norm": 2.0287680625915527, + "learning_rate": 2e-05, + "loss": 0.06601522, + "step": 3688 + }, + { + "epoch": 7.378, + "grad_norm": 1.171452283859253, + "learning_rate": 2e-05, + "loss": 0.03029294, + "step": 3689 + }, + { + "epoch": 7.38, + "grad_norm": 1.8826758861541748, + "learning_rate": 2e-05, + "loss": 0.0547618, + "step": 3690 + }, + { + "epoch": 7.382, + "grad_norm": 1.565005898475647, + "learning_rate": 2e-05, + "loss": 0.0449411, + "step": 3691 + }, + { + "epoch": 7.384, + "grad_norm": 1.5869972705841064, + "learning_rate": 2e-05, + "loss": 0.04129549, + "step": 3692 + }, + { + "epoch": 7.386, + "grad_norm": 1.52235746383667, + "learning_rate": 2e-05, + "loss": 0.05102809, + "step": 3693 + }, + { + "epoch": 7.388, + "grad_norm": 1.656127691268921, + "learning_rate": 2e-05, + "loss": 0.04343271, + "step": 3694 + }, + { + "epoch": 7.39, + "grad_norm": 1.1580047607421875, + "learning_rate": 2e-05, + "loss": 0.04137683, + "step": 3695 + }, + { + "epoch": 7.392, + "grad_norm": 1.3034822940826416, + "learning_rate": 2e-05, + "loss": 0.04663299, + "step": 3696 + }, + { + "epoch": 7.394, + "grad_norm": 1.391483187675476, + "learning_rate": 2e-05, + "loss": 0.04537113, + "step": 3697 + }, + { + "epoch": 7.396, + "grad_norm": 1.4313862323760986, + "learning_rate": 2e-05, + "loss": 0.04555268, + "step": 3698 + }, + { + "epoch": 7.398, + "grad_norm": 3.1390175819396973, + "learning_rate": 2e-05, + "loss": 0.06643859, + "step": 3699 + }, + { + 
"epoch": 7.4, + "grad_norm": 1.6369024515151978, + "learning_rate": 2e-05, + "loss": 0.05245568, + "step": 3700 + }, + { + "epoch": 7.402, + "grad_norm": 1.2686548233032227, + "learning_rate": 2e-05, + "loss": 0.04974839, + "step": 3701 + }, + { + "epoch": 7.404, + "grad_norm": 1.2958829402923584, + "learning_rate": 2e-05, + "loss": 0.04743432, + "step": 3702 + }, + { + "epoch": 7.406, + "grad_norm": 1.9894742965698242, + "learning_rate": 2e-05, + "loss": 0.04571943, + "step": 3703 + }, + { + "epoch": 7.408, + "grad_norm": 1.0856343507766724, + "learning_rate": 2e-05, + "loss": 0.03500437, + "step": 3704 + }, + { + "epoch": 7.41, + "grad_norm": 1.949939250946045, + "learning_rate": 2e-05, + "loss": 0.05323801, + "step": 3705 + }, + { + "epoch": 7.412, + "grad_norm": 1.4476184844970703, + "learning_rate": 2e-05, + "loss": 0.04785695, + "step": 3706 + }, + { + "epoch": 7.414, + "grad_norm": 3.4324965476989746, + "learning_rate": 2e-05, + "loss": 0.06573795, + "step": 3707 + }, + { + "epoch": 7.416, + "grad_norm": 1.3931694030761719, + "learning_rate": 2e-05, + "loss": 0.04984997, + "step": 3708 + }, + { + "epoch": 7.418, + "grad_norm": 1.56455659866333, + "learning_rate": 2e-05, + "loss": 0.05046746, + "step": 3709 + }, + { + "epoch": 7.42, + "grad_norm": 1.200805425643921, + "learning_rate": 2e-05, + "loss": 0.03915904, + "step": 3710 + }, + { + "epoch": 7.422, + "grad_norm": 1.366477608680725, + "learning_rate": 2e-05, + "loss": 0.04656648, + "step": 3711 + }, + { + "epoch": 7.424, + "grad_norm": 1.8359090089797974, + "learning_rate": 2e-05, + "loss": 0.05103552, + "step": 3712 + }, + { + "epoch": 7.426, + "grad_norm": 1.4380766153335571, + "learning_rate": 2e-05, + "loss": 0.04777982, + "step": 3713 + }, + { + "epoch": 7.428, + "grad_norm": 1.0222777128219604, + "learning_rate": 2e-05, + "loss": 0.03150903, + "step": 3714 + }, + { + "epoch": 7.43, + "grad_norm": 2.749195098876953, + "learning_rate": 2e-05, + "loss": 0.05602058, + "step": 3715 + }, + { + "epoch": 7.432, + "grad_norm": 1.7111470699310303, + "learning_rate": 2e-05, + "loss": 0.04583771, + "step": 3716 + }, + { + "epoch": 7.434, + "grad_norm": 1.3404757976531982, + "learning_rate": 2e-05, + "loss": 0.04275534, + "step": 3717 + }, + { + "epoch": 7.436, + "grad_norm": 1.210221529006958, + "learning_rate": 2e-05, + "loss": 0.04058505, + "step": 3718 + }, + { + "epoch": 7.438, + "grad_norm": 2.068676471710205, + "learning_rate": 2e-05, + "loss": 0.04987239, + "step": 3719 + }, + { + "epoch": 7.44, + "grad_norm": 1.6463974714279175, + "learning_rate": 2e-05, + "loss": 0.0489587, + "step": 3720 + }, + { + "epoch": 7.442, + "grad_norm": 0.9529186487197876, + "learning_rate": 2e-05, + "loss": 0.02814676, + "step": 3721 + }, + { + "epoch": 7.444, + "grad_norm": 1.097665548324585, + "learning_rate": 2e-05, + "loss": 0.0485144, + "step": 3722 + }, + { + "epoch": 7.446, + "grad_norm": 1.5147879123687744, + "learning_rate": 2e-05, + "loss": 0.04698938, + "step": 3723 + }, + { + "epoch": 7.448, + "grad_norm": 1.2705267667770386, + "learning_rate": 2e-05, + "loss": 0.03390349, + "step": 3724 + }, + { + "epoch": 7.45, + "grad_norm": 2.0715646743774414, + "learning_rate": 2e-05, + "loss": 0.07898147, + "step": 3725 + }, + { + "epoch": 7.452, + "grad_norm": 1.3505516052246094, + "learning_rate": 2e-05, + "loss": 0.05181169, + "step": 3726 + }, + { + "epoch": 7.454, + "grad_norm": 1.6015242338180542, + "learning_rate": 2e-05, + "loss": 0.05964314, + "step": 3727 + }, + { + "epoch": 7.456, + "grad_norm": 1.358168601989746, + "learning_rate": 
2e-05, + "loss": 0.04774554, + "step": 3728 + }, + { + "epoch": 7.458, + "grad_norm": 1.53998863697052, + "learning_rate": 2e-05, + "loss": 0.04986886, + "step": 3729 + }, + { + "epoch": 7.46, + "grad_norm": 1.2373335361480713, + "learning_rate": 2e-05, + "loss": 0.05145576, + "step": 3730 + }, + { + "epoch": 7.462, + "grad_norm": 1.2821953296661377, + "learning_rate": 2e-05, + "loss": 0.05637913, + "step": 3731 + }, + { + "epoch": 7.464, + "grad_norm": 1.3105919361114502, + "learning_rate": 2e-05, + "loss": 0.04199533, + "step": 3732 + }, + { + "epoch": 7.466, + "grad_norm": 1.5130268335342407, + "learning_rate": 2e-05, + "loss": 0.04699155, + "step": 3733 + }, + { + "epoch": 7.468, + "grad_norm": 1.737265706062317, + "learning_rate": 2e-05, + "loss": 0.03969675, + "step": 3734 + }, + { + "epoch": 7.47, + "grad_norm": 1.4570841789245605, + "learning_rate": 2e-05, + "loss": 0.04642572, + "step": 3735 + }, + { + "epoch": 7.4719999999999995, + "grad_norm": 0.9005123376846313, + "learning_rate": 2e-05, + "loss": 0.03132126, + "step": 3736 + }, + { + "epoch": 7.474, + "grad_norm": 1.973960041999817, + "learning_rate": 2e-05, + "loss": 0.05362177, + "step": 3737 + }, + { + "epoch": 7.476, + "grad_norm": 2.2350363731384277, + "learning_rate": 2e-05, + "loss": 0.06357549, + "step": 3738 + }, + { + "epoch": 7.478, + "grad_norm": 1.4672166109085083, + "learning_rate": 2e-05, + "loss": 0.04843113, + "step": 3739 + }, + { + "epoch": 7.48, + "grad_norm": 1.9889647960662842, + "learning_rate": 2e-05, + "loss": 0.04717405, + "step": 3740 + }, + { + "epoch": 7.482, + "grad_norm": 1.1756142377853394, + "learning_rate": 2e-05, + "loss": 0.04274727, + "step": 3741 + }, + { + "epoch": 7.484, + "grad_norm": 1.004916787147522, + "learning_rate": 2e-05, + "loss": 0.03753413, + "step": 3742 + }, + { + "epoch": 7.486, + "grad_norm": 1.277570128440857, + "learning_rate": 2e-05, + "loss": 0.04001706, + "step": 3743 + }, + { + "epoch": 7.4879999999999995, + "grad_norm": 1.0165624618530273, + "learning_rate": 2e-05, + "loss": 0.03563813, + "step": 3744 + }, + { + "epoch": 7.49, + "grad_norm": 1.3447015285491943, + "learning_rate": 2e-05, + "loss": 0.04556485, + "step": 3745 + }, + { + "epoch": 7.492, + "grad_norm": 1.9407835006713867, + "learning_rate": 2e-05, + "loss": 0.05292122, + "step": 3746 + }, + { + "epoch": 7.494, + "grad_norm": 4.274045467376709, + "learning_rate": 2e-05, + "loss": 0.04551305, + "step": 3747 + }, + { + "epoch": 7.496, + "grad_norm": 1.2199167013168335, + "learning_rate": 2e-05, + "loss": 0.04236592, + "step": 3748 + }, + { + "epoch": 7.498, + "grad_norm": 1.2471674680709839, + "learning_rate": 2e-05, + "loss": 0.03323042, + "step": 3749 + }, + { + "epoch": 7.5, + "grad_norm": 1.1750595569610596, + "learning_rate": 2e-05, + "loss": 0.0397836, + "step": 3750 + }, + { + "epoch": 7.502, + "grad_norm": 1.7069101333618164, + "learning_rate": 2e-05, + "loss": 0.04767216, + "step": 3751 + }, + { + "epoch": 7.504, + "grad_norm": 1.3713217973709106, + "learning_rate": 2e-05, + "loss": 0.04113061, + "step": 3752 + }, + { + "epoch": 7.506, + "grad_norm": 1.2698354721069336, + "learning_rate": 2e-05, + "loss": 0.04726923, + "step": 3753 + }, + { + "epoch": 7.508, + "grad_norm": 1.3195385932922363, + "learning_rate": 2e-05, + "loss": 0.04144061, + "step": 3754 + }, + { + "epoch": 7.51, + "grad_norm": 1.405106782913208, + "learning_rate": 2e-05, + "loss": 0.04036184, + "step": 3755 + }, + { + "epoch": 7.5120000000000005, + "grad_norm": 1.3161296844482422, + "learning_rate": 2e-05, + "loss": 0.03146423, + 
"step": 3756 + }, + { + "epoch": 7.514, + "grad_norm": 1.8844631910324097, + "learning_rate": 2e-05, + "loss": 0.06133197, + "step": 3757 + }, + { + "epoch": 7.516, + "grad_norm": 2.0802526473999023, + "learning_rate": 2e-05, + "loss": 0.04892286, + "step": 3758 + }, + { + "epoch": 7.518, + "grad_norm": 3.128467082977295, + "learning_rate": 2e-05, + "loss": 0.06032344, + "step": 3759 + }, + { + "epoch": 7.52, + "grad_norm": 2.1159846782684326, + "learning_rate": 2e-05, + "loss": 0.07681625, + "step": 3760 + }, + { + "epoch": 7.522, + "grad_norm": 1.6027671098709106, + "learning_rate": 2e-05, + "loss": 0.05662168, + "step": 3761 + }, + { + "epoch": 7.524, + "grad_norm": 2.217183828353882, + "learning_rate": 2e-05, + "loss": 0.06822868, + "step": 3762 + }, + { + "epoch": 7.526, + "grad_norm": 1.5272791385650635, + "learning_rate": 2e-05, + "loss": 0.04064973, + "step": 3763 + }, + { + "epoch": 7.5280000000000005, + "grad_norm": 1.4808307886123657, + "learning_rate": 2e-05, + "loss": 0.04885202, + "step": 3764 + }, + { + "epoch": 7.53, + "grad_norm": 1.6249582767486572, + "learning_rate": 2e-05, + "loss": 0.05506381, + "step": 3765 + }, + { + "epoch": 7.532, + "grad_norm": 1.6478004455566406, + "learning_rate": 2e-05, + "loss": 0.04652187, + "step": 3766 + }, + { + "epoch": 7.534, + "grad_norm": 2.208526134490967, + "learning_rate": 2e-05, + "loss": 0.049362, + "step": 3767 + }, + { + "epoch": 7.536, + "grad_norm": 1.3688409328460693, + "learning_rate": 2e-05, + "loss": 0.04084139, + "step": 3768 + }, + { + "epoch": 7.538, + "grad_norm": 1.5067732334136963, + "learning_rate": 2e-05, + "loss": 0.04833493, + "step": 3769 + }, + { + "epoch": 7.54, + "grad_norm": 1.175787091255188, + "learning_rate": 2e-05, + "loss": 0.04301163, + "step": 3770 + }, + { + "epoch": 7.542, + "grad_norm": 2.0285022258758545, + "learning_rate": 2e-05, + "loss": 0.06193438, + "step": 3771 + }, + { + "epoch": 7.5440000000000005, + "grad_norm": 1.2851672172546387, + "learning_rate": 2e-05, + "loss": 0.05366874, + "step": 3772 + }, + { + "epoch": 7.546, + "grad_norm": 1.5538355112075806, + "learning_rate": 2e-05, + "loss": 0.06452001, + "step": 3773 + }, + { + "epoch": 7.548, + "grad_norm": 1.1925280094146729, + "learning_rate": 2e-05, + "loss": 0.04862928, + "step": 3774 + }, + { + "epoch": 7.55, + "grad_norm": 1.3164457082748413, + "learning_rate": 2e-05, + "loss": 0.04869618, + "step": 3775 + }, + { + "epoch": 7.552, + "grad_norm": 1.8876863718032837, + "learning_rate": 2e-05, + "loss": 0.04987069, + "step": 3776 + }, + { + "epoch": 7.554, + "grad_norm": 1.348827600479126, + "learning_rate": 2e-05, + "loss": 0.05645898, + "step": 3777 + }, + { + "epoch": 7.556, + "grad_norm": 1.2659227848052979, + "learning_rate": 2e-05, + "loss": 0.04560403, + "step": 3778 + }, + { + "epoch": 7.558, + "grad_norm": 1.553835391998291, + "learning_rate": 2e-05, + "loss": 0.05577955, + "step": 3779 + }, + { + "epoch": 7.5600000000000005, + "grad_norm": 2.5734472274780273, + "learning_rate": 2e-05, + "loss": 0.05894909, + "step": 3780 + }, + { + "epoch": 7.562, + "grad_norm": 1.9130885601043701, + "learning_rate": 2e-05, + "loss": 0.04794927, + "step": 3781 + }, + { + "epoch": 7.564, + "grad_norm": 1.092248797416687, + "learning_rate": 2e-05, + "loss": 0.03762339, + "step": 3782 + }, + { + "epoch": 7.566, + "grad_norm": 1.438640832901001, + "learning_rate": 2e-05, + "loss": 0.03907945, + "step": 3783 + }, + { + "epoch": 7.568, + "grad_norm": 1.047423005104065, + "learning_rate": 2e-05, + "loss": 0.0316382, + "step": 3784 + }, + { + 
"epoch": 7.57, + "grad_norm": 1.7063649892807007, + "learning_rate": 2e-05, + "loss": 0.04632139, + "step": 3785 + }, + { + "epoch": 7.572, + "grad_norm": 1.954330325126648, + "learning_rate": 2e-05, + "loss": 0.05917243, + "step": 3786 + }, + { + "epoch": 7.574, + "grad_norm": 1.586054801940918, + "learning_rate": 2e-05, + "loss": 0.06359352, + "step": 3787 + }, + { + "epoch": 7.576, + "grad_norm": 1.6239789724349976, + "learning_rate": 2e-05, + "loss": 0.04376218, + "step": 3788 + }, + { + "epoch": 7.578, + "grad_norm": 1.7396554946899414, + "learning_rate": 2e-05, + "loss": 0.04864378, + "step": 3789 + }, + { + "epoch": 7.58, + "grad_norm": 1.1192086935043335, + "learning_rate": 2e-05, + "loss": 0.03764372, + "step": 3790 + }, + { + "epoch": 7.582, + "grad_norm": 2.047725200653076, + "learning_rate": 2e-05, + "loss": 0.05432349, + "step": 3791 + }, + { + "epoch": 7.584, + "grad_norm": 1.4690004587173462, + "learning_rate": 2e-05, + "loss": 0.05431195, + "step": 3792 + }, + { + "epoch": 7.586, + "grad_norm": 1.6998225450515747, + "learning_rate": 2e-05, + "loss": 0.05252321, + "step": 3793 + }, + { + "epoch": 7.588, + "grad_norm": 1.0173852443695068, + "learning_rate": 2e-05, + "loss": 0.03292936, + "step": 3794 + }, + { + "epoch": 7.59, + "grad_norm": 1.2898446321487427, + "learning_rate": 2e-05, + "loss": 0.04756307, + "step": 3795 + }, + { + "epoch": 7.592, + "grad_norm": 1.3239659070968628, + "learning_rate": 2e-05, + "loss": 0.03519905, + "step": 3796 + }, + { + "epoch": 7.594, + "grad_norm": 0.9740890860557556, + "learning_rate": 2e-05, + "loss": 0.03270172, + "step": 3797 + }, + { + "epoch": 7.596, + "grad_norm": 1.0018112659454346, + "learning_rate": 2e-05, + "loss": 0.03196118, + "step": 3798 + }, + { + "epoch": 7.598, + "grad_norm": 1.5058925151824951, + "learning_rate": 2e-05, + "loss": 0.04523263, + "step": 3799 + }, + { + "epoch": 7.6, + "grad_norm": 1.1637154817581177, + "learning_rate": 2e-05, + "loss": 0.03923118, + "step": 3800 + }, + { + "epoch": 7.602, + "grad_norm": 1.1870934963226318, + "learning_rate": 2e-05, + "loss": 0.04084285, + "step": 3801 + }, + { + "epoch": 7.604, + "grad_norm": 2.0652079582214355, + "learning_rate": 2e-05, + "loss": 0.05630117, + "step": 3802 + }, + { + "epoch": 7.606, + "grad_norm": 2.6154839992523193, + "learning_rate": 2e-05, + "loss": 0.06760383, + "step": 3803 + }, + { + "epoch": 7.608, + "grad_norm": 1.4074623584747314, + "learning_rate": 2e-05, + "loss": 0.04475513, + "step": 3804 + }, + { + "epoch": 7.61, + "grad_norm": 1.3313047885894775, + "learning_rate": 2e-05, + "loss": 0.04672603, + "step": 3805 + }, + { + "epoch": 7.612, + "grad_norm": 1.1976416110992432, + "learning_rate": 2e-05, + "loss": 0.04390628, + "step": 3806 + }, + { + "epoch": 7.614, + "grad_norm": 2.146451950073242, + "learning_rate": 2e-05, + "loss": 0.05690853, + "step": 3807 + }, + { + "epoch": 7.616, + "grad_norm": 1.3076081275939941, + "learning_rate": 2e-05, + "loss": 0.05246904, + "step": 3808 + }, + { + "epoch": 7.618, + "grad_norm": 1.1059523820877075, + "learning_rate": 2e-05, + "loss": 0.02809783, + "step": 3809 + }, + { + "epoch": 7.62, + "grad_norm": 1.2201027870178223, + "learning_rate": 2e-05, + "loss": 0.03628596, + "step": 3810 + }, + { + "epoch": 7.622, + "grad_norm": 1.1609594821929932, + "learning_rate": 2e-05, + "loss": 0.03147845, + "step": 3811 + }, + { + "epoch": 7.624, + "grad_norm": 1.2720181941986084, + "learning_rate": 2e-05, + "loss": 0.04290503, + "step": 3812 + }, + { + "epoch": 7.626, + "grad_norm": 1.812412142753601, + 
"learning_rate": 2e-05, + "loss": 0.07085785, + "step": 3813 + }, + { + "epoch": 7.628, + "grad_norm": 1.4819214344024658, + "learning_rate": 2e-05, + "loss": 0.05656476, + "step": 3814 + }, + { + "epoch": 7.63, + "grad_norm": 1.9143245220184326, + "learning_rate": 2e-05, + "loss": 0.08178715, + "step": 3815 + }, + { + "epoch": 7.632, + "grad_norm": 1.3047770261764526, + "learning_rate": 2e-05, + "loss": 0.05004913, + "step": 3816 + }, + { + "epoch": 7.634, + "grad_norm": 1.793925404548645, + "learning_rate": 2e-05, + "loss": 0.04365033, + "step": 3817 + }, + { + "epoch": 7.636, + "grad_norm": 1.9158854484558105, + "learning_rate": 2e-05, + "loss": 0.05776561, + "step": 3818 + }, + { + "epoch": 7.638, + "grad_norm": 1.4316856861114502, + "learning_rate": 2e-05, + "loss": 0.04811051, + "step": 3819 + }, + { + "epoch": 7.64, + "grad_norm": 1.6033645868301392, + "learning_rate": 2e-05, + "loss": 0.04948676, + "step": 3820 + }, + { + "epoch": 7.642, + "grad_norm": 1.9721709489822388, + "learning_rate": 2e-05, + "loss": 0.07113369, + "step": 3821 + }, + { + "epoch": 7.644, + "grad_norm": 2.0091919898986816, + "learning_rate": 2e-05, + "loss": 0.04521521, + "step": 3822 + }, + { + "epoch": 7.646, + "grad_norm": 2.2790145874023438, + "learning_rate": 2e-05, + "loss": 0.06836452, + "step": 3823 + }, + { + "epoch": 7.648, + "grad_norm": 1.6919300556182861, + "learning_rate": 2e-05, + "loss": 0.04315101, + "step": 3824 + }, + { + "epoch": 7.65, + "grad_norm": 1.6368211507797241, + "learning_rate": 2e-05, + "loss": 0.06151176, + "step": 3825 + }, + { + "epoch": 7.652, + "grad_norm": 1.644728660583496, + "learning_rate": 2e-05, + "loss": 0.05243739, + "step": 3826 + }, + { + "epoch": 7.654, + "grad_norm": 2.1954734325408936, + "learning_rate": 2e-05, + "loss": 0.04884458, + "step": 3827 + }, + { + "epoch": 7.656, + "grad_norm": 1.4791172742843628, + "learning_rate": 2e-05, + "loss": 0.04870691, + "step": 3828 + }, + { + "epoch": 7.658, + "grad_norm": 1.6743173599243164, + "learning_rate": 2e-05, + "loss": 0.06012721, + "step": 3829 + }, + { + "epoch": 7.66, + "grad_norm": 1.8005884885787964, + "learning_rate": 2e-05, + "loss": 0.05182242, + "step": 3830 + }, + { + "epoch": 7.662, + "grad_norm": 1.710728645324707, + "learning_rate": 2e-05, + "loss": 0.05671708, + "step": 3831 + }, + { + "epoch": 7.664, + "grad_norm": 1.4567813873291016, + "learning_rate": 2e-05, + "loss": 0.05406286, + "step": 3832 + }, + { + "epoch": 7.666, + "grad_norm": 1.37957763671875, + "learning_rate": 2e-05, + "loss": 0.05166141, + "step": 3833 + }, + { + "epoch": 7.668, + "grad_norm": 1.3298771381378174, + "learning_rate": 2e-05, + "loss": 0.04657075, + "step": 3834 + }, + { + "epoch": 7.67, + "grad_norm": 1.678098440170288, + "learning_rate": 2e-05, + "loss": 0.06359571, + "step": 3835 + }, + { + "epoch": 7.672, + "grad_norm": 1.3091044425964355, + "learning_rate": 2e-05, + "loss": 0.04521374, + "step": 3836 + }, + { + "epoch": 7.674, + "grad_norm": 1.1188842058181763, + "learning_rate": 2e-05, + "loss": 0.05468755, + "step": 3837 + }, + { + "epoch": 7.676, + "grad_norm": 1.4298003911972046, + "learning_rate": 2e-05, + "loss": 0.05605362, + "step": 3838 + }, + { + "epoch": 7.678, + "grad_norm": 2.1561481952667236, + "learning_rate": 2e-05, + "loss": 0.05341341, + "step": 3839 + }, + { + "epoch": 7.68, + "grad_norm": 1.3226423263549805, + "learning_rate": 2e-05, + "loss": 0.04852953, + "step": 3840 + }, + { + "epoch": 7.682, + "grad_norm": 1.267012596130371, + "learning_rate": 2e-05, + "loss": 0.05608751, + "step": 3841 + }, + 
{ + "epoch": 7.684, + "grad_norm": 1.670479416847229, + "learning_rate": 2e-05, + "loss": 0.06202754, + "step": 3842 + }, + { + "epoch": 7.686, + "grad_norm": 1.5169838666915894, + "learning_rate": 2e-05, + "loss": 0.04942085, + "step": 3843 + }, + { + "epoch": 7.688, + "grad_norm": 1.5922930240631104, + "learning_rate": 2e-05, + "loss": 0.03477084, + "step": 3844 + }, + { + "epoch": 7.6899999999999995, + "grad_norm": 1.939310908317566, + "learning_rate": 2e-05, + "loss": 0.03845644, + "step": 3845 + }, + { + "epoch": 7.692, + "grad_norm": 1.3163906335830688, + "learning_rate": 2e-05, + "loss": 0.04649363, + "step": 3846 + }, + { + "epoch": 7.694, + "grad_norm": 2.3533008098602295, + "learning_rate": 2e-05, + "loss": 0.05440918, + "step": 3847 + }, + { + "epoch": 7.696, + "grad_norm": 2.0161502361297607, + "learning_rate": 2e-05, + "loss": 0.04734223, + "step": 3848 + }, + { + "epoch": 7.698, + "grad_norm": 1.932399868965149, + "learning_rate": 2e-05, + "loss": 0.04660235, + "step": 3849 + }, + { + "epoch": 7.7, + "grad_norm": 1.0067873001098633, + "learning_rate": 2e-05, + "loss": 0.03094147, + "step": 3850 + }, + { + "epoch": 7.702, + "grad_norm": 1.2816112041473389, + "learning_rate": 2e-05, + "loss": 0.04377981, + "step": 3851 + }, + { + "epoch": 7.704, + "grad_norm": 1.6109353303909302, + "learning_rate": 2e-05, + "loss": 0.04601968, + "step": 3852 + }, + { + "epoch": 7.7059999999999995, + "grad_norm": 1.514350175857544, + "learning_rate": 2e-05, + "loss": 0.06082528, + "step": 3853 + }, + { + "epoch": 7.708, + "grad_norm": 1.3705843687057495, + "learning_rate": 2e-05, + "loss": 0.04245031, + "step": 3854 + }, + { + "epoch": 7.71, + "grad_norm": 1.340019941329956, + "learning_rate": 2e-05, + "loss": 0.03780312, + "step": 3855 + }, + { + "epoch": 7.712, + "grad_norm": 1.9258254766464233, + "learning_rate": 2e-05, + "loss": 0.04252723, + "step": 3856 + }, + { + "epoch": 7.714, + "grad_norm": 2.0864787101745605, + "learning_rate": 2e-05, + "loss": 0.05551865, + "step": 3857 + }, + { + "epoch": 7.716, + "grad_norm": 1.5121456384658813, + "learning_rate": 2e-05, + "loss": 0.04484479, + "step": 3858 + }, + { + "epoch": 7.718, + "grad_norm": 2.2497289180755615, + "learning_rate": 2e-05, + "loss": 0.06071467, + "step": 3859 + }, + { + "epoch": 7.72, + "grad_norm": 2.1290595531463623, + "learning_rate": 2e-05, + "loss": 0.05557271, + "step": 3860 + }, + { + "epoch": 7.7219999999999995, + "grad_norm": 1.1759511232376099, + "learning_rate": 2e-05, + "loss": 0.03121856, + "step": 3861 + }, + { + "epoch": 7.724, + "grad_norm": 1.8583542108535767, + "learning_rate": 2e-05, + "loss": 0.04486027, + "step": 3862 + }, + { + "epoch": 7.726, + "grad_norm": 1.753166913986206, + "learning_rate": 2e-05, + "loss": 0.05814328, + "step": 3863 + }, + { + "epoch": 7.728, + "grad_norm": 1.2085702419281006, + "learning_rate": 2e-05, + "loss": 0.04606169, + "step": 3864 + }, + { + "epoch": 7.73, + "grad_norm": 0.9439897537231445, + "learning_rate": 2e-05, + "loss": 0.03185405, + "step": 3865 + }, + { + "epoch": 7.732, + "grad_norm": 1.1782246828079224, + "learning_rate": 2e-05, + "loss": 0.038643, + "step": 3866 + }, + { + "epoch": 7.734, + "grad_norm": 2.3399665355682373, + "learning_rate": 2e-05, + "loss": 0.04473191, + "step": 3867 + }, + { + "epoch": 7.736, + "grad_norm": 1.0290220975875854, + "learning_rate": 2e-05, + "loss": 0.03318851, + "step": 3868 + }, + { + "epoch": 7.7379999999999995, + "grad_norm": 2.829364061355591, + "learning_rate": 2e-05, + "loss": 0.05109379, + "step": 3869 + }, + { + "epoch": 
7.74, + "grad_norm": 1.3606927394866943, + "learning_rate": 2e-05, + "loss": 0.05341338, + "step": 3870 + }, + { + "epoch": 7.742, + "grad_norm": 1.5313520431518555, + "learning_rate": 2e-05, + "loss": 0.04662801, + "step": 3871 + }, + { + "epoch": 7.744, + "grad_norm": 1.2733118534088135, + "learning_rate": 2e-05, + "loss": 0.04062459, + "step": 3872 + }, + { + "epoch": 7.746, + "grad_norm": 1.3704501390457153, + "learning_rate": 2e-05, + "loss": 0.05422856, + "step": 3873 + }, + { + "epoch": 7.748, + "grad_norm": 1.1714882850646973, + "learning_rate": 2e-05, + "loss": 0.04571648, + "step": 3874 + }, + { + "epoch": 7.75, + "grad_norm": 1.7015575170516968, + "learning_rate": 2e-05, + "loss": 0.05158704, + "step": 3875 + }, + { + "epoch": 7.752, + "grad_norm": 1.004583716392517, + "learning_rate": 2e-05, + "loss": 0.0300518, + "step": 3876 + }, + { + "epoch": 7.754, + "grad_norm": 1.118895173072815, + "learning_rate": 2e-05, + "loss": 0.03498417, + "step": 3877 + }, + { + "epoch": 7.756, + "grad_norm": 1.5967800617218018, + "learning_rate": 2e-05, + "loss": 0.05754827, + "step": 3878 + }, + { + "epoch": 7.758, + "grad_norm": 2.8003337383270264, + "learning_rate": 2e-05, + "loss": 0.06484474, + "step": 3879 + }, + { + "epoch": 7.76, + "grad_norm": 1.5037521123886108, + "learning_rate": 2e-05, + "loss": 0.05595599, + "step": 3880 + }, + { + "epoch": 7.7620000000000005, + "grad_norm": 1.328269362449646, + "learning_rate": 2e-05, + "loss": 0.03779745, + "step": 3881 + }, + { + "epoch": 7.764, + "grad_norm": 1.3670969009399414, + "learning_rate": 2e-05, + "loss": 0.04492682, + "step": 3882 + }, + { + "epoch": 7.766, + "grad_norm": 1.441186785697937, + "learning_rate": 2e-05, + "loss": 0.04883204, + "step": 3883 + }, + { + "epoch": 7.768, + "grad_norm": 1.1069203615188599, + "learning_rate": 2e-05, + "loss": 0.03243296, + "step": 3884 + }, + { + "epoch": 7.77, + "grad_norm": 1.1207300424575806, + "learning_rate": 2e-05, + "loss": 0.0498887, + "step": 3885 + }, + { + "epoch": 7.772, + "grad_norm": 1.2130826711654663, + "learning_rate": 2e-05, + "loss": 0.03474583, + "step": 3886 + }, + { + "epoch": 7.774, + "grad_norm": 1.099841833114624, + "learning_rate": 2e-05, + "loss": 0.04760575, + "step": 3887 + }, + { + "epoch": 7.776, + "grad_norm": 1.401013731956482, + "learning_rate": 2e-05, + "loss": 0.04593216, + "step": 3888 + }, + { + "epoch": 7.7780000000000005, + "grad_norm": 0.9707529544830322, + "learning_rate": 2e-05, + "loss": 0.03742143, + "step": 3889 + }, + { + "epoch": 7.78, + "grad_norm": 1.4640308618545532, + "learning_rate": 2e-05, + "loss": 0.05783337, + "step": 3890 + }, + { + "epoch": 7.782, + "grad_norm": 1.4882322549819946, + "learning_rate": 2e-05, + "loss": 0.0567808, + "step": 3891 + }, + { + "epoch": 7.784, + "grad_norm": 1.206899642944336, + "learning_rate": 2e-05, + "loss": 0.0442479, + "step": 3892 + }, + { + "epoch": 7.786, + "grad_norm": 1.2967495918273926, + "learning_rate": 2e-05, + "loss": 0.04167457, + "step": 3893 + }, + { + "epoch": 7.788, + "grad_norm": 1.8896660804748535, + "learning_rate": 2e-05, + "loss": 0.04580644, + "step": 3894 + }, + { + "epoch": 7.79, + "grad_norm": 2.820158004760742, + "learning_rate": 2e-05, + "loss": 0.0647872, + "step": 3895 + }, + { + "epoch": 7.792, + "grad_norm": 2.5546226501464844, + "learning_rate": 2e-05, + "loss": 0.07351266, + "step": 3896 + }, + { + "epoch": 7.7940000000000005, + "grad_norm": 2.217388868331909, + "learning_rate": 2e-05, + "loss": 0.03897737, + "step": 3897 + }, + { + "epoch": 7.796, + "grad_norm": 
1.7939376831054688, + "learning_rate": 2e-05, + "loss": 0.06104586, + "step": 3898 + }, + { + "epoch": 7.798, + "grad_norm": 1.0939199924468994, + "learning_rate": 2e-05, + "loss": 0.03686985, + "step": 3899 + }, + { + "epoch": 7.8, + "grad_norm": 1.504327416419983, + "learning_rate": 2e-05, + "loss": 0.05772873, + "step": 3900 + }, + { + "epoch": 7.802, + "grad_norm": 1.402275562286377, + "learning_rate": 2e-05, + "loss": 0.05212471, + "step": 3901 + }, + { + "epoch": 7.804, + "grad_norm": 1.3643393516540527, + "learning_rate": 2e-05, + "loss": 0.04227594, + "step": 3902 + }, + { + "epoch": 7.806, + "grad_norm": 1.3468226194381714, + "learning_rate": 2e-05, + "loss": 0.04262753, + "step": 3903 + }, + { + "epoch": 7.808, + "grad_norm": 1.6918001174926758, + "learning_rate": 2e-05, + "loss": 0.04145945, + "step": 3904 + }, + { + "epoch": 7.8100000000000005, + "grad_norm": 1.0832053422927856, + "learning_rate": 2e-05, + "loss": 0.03417937, + "step": 3905 + }, + { + "epoch": 7.812, + "grad_norm": 2.516186237335205, + "learning_rate": 2e-05, + "loss": 0.07497337, + "step": 3906 + }, + { + "epoch": 7.814, + "grad_norm": 3.0091376304626465, + "learning_rate": 2e-05, + "loss": 0.05103327, + "step": 3907 + }, + { + "epoch": 7.816, + "grad_norm": 1.4199695587158203, + "learning_rate": 2e-05, + "loss": 0.04106236, + "step": 3908 + }, + { + "epoch": 7.818, + "grad_norm": 2.151015281677246, + "learning_rate": 2e-05, + "loss": 0.05017766, + "step": 3909 + }, + { + "epoch": 7.82, + "grad_norm": 1.086867094039917, + "learning_rate": 2e-05, + "loss": 0.04102978, + "step": 3910 + }, + { + "epoch": 7.822, + "grad_norm": 1.1980141401290894, + "learning_rate": 2e-05, + "loss": 0.04687583, + "step": 3911 + }, + { + "epoch": 7.824, + "grad_norm": 1.7949175834655762, + "learning_rate": 2e-05, + "loss": 0.04523139, + "step": 3912 + }, + { + "epoch": 7.826, + "grad_norm": 1.5633015632629395, + "learning_rate": 2e-05, + "loss": 0.0565387, + "step": 3913 + }, + { + "epoch": 7.828, + "grad_norm": 1.9414931535720825, + "learning_rate": 2e-05, + "loss": 0.03930401, + "step": 3914 + }, + { + "epoch": 7.83, + "grad_norm": 1.1001695394515991, + "learning_rate": 2e-05, + "loss": 0.03971751, + "step": 3915 + }, + { + "epoch": 7.832, + "grad_norm": 1.6626832485198975, + "learning_rate": 2e-05, + "loss": 0.06389378, + "step": 3916 + }, + { + "epoch": 7.834, + "grad_norm": 1.166463017463684, + "learning_rate": 2e-05, + "loss": 0.03996446, + "step": 3917 + }, + { + "epoch": 7.836, + "grad_norm": 1.091837763786316, + "learning_rate": 2e-05, + "loss": 0.03501406, + "step": 3918 + }, + { + "epoch": 7.838, + "grad_norm": 1.5070854425430298, + "learning_rate": 2e-05, + "loss": 0.04527948, + "step": 3919 + }, + { + "epoch": 7.84, + "grad_norm": 2.192519187927246, + "learning_rate": 2e-05, + "loss": 0.05735584, + "step": 3920 + }, + { + "epoch": 7.842, + "grad_norm": 3.073923349380493, + "learning_rate": 2e-05, + "loss": 0.06089948, + "step": 3921 + }, + { + "epoch": 7.844, + "grad_norm": 2.4988508224487305, + "learning_rate": 2e-05, + "loss": 0.06159104, + "step": 3922 + }, + { + "epoch": 7.846, + "grad_norm": 1.6986947059631348, + "learning_rate": 2e-05, + "loss": 0.06097404, + "step": 3923 + }, + { + "epoch": 7.848, + "grad_norm": 1.2336899042129517, + "learning_rate": 2e-05, + "loss": 0.0403936, + "step": 3924 + }, + { + "epoch": 7.85, + "grad_norm": 3.7979280948638916, + "learning_rate": 2e-05, + "loss": 0.05501074, + "step": 3925 + }, + { + "epoch": 7.852, + "grad_norm": 1.620132565498352, + "learning_rate": 2e-05, + "loss": 
0.05394637, + "step": 3926 + }, + { + "epoch": 7.854, + "grad_norm": 1.056098222732544, + "learning_rate": 2e-05, + "loss": 0.02777421, + "step": 3927 + }, + { + "epoch": 7.856, + "grad_norm": 1.5885734558105469, + "learning_rate": 2e-05, + "loss": 0.05239529, + "step": 3928 + }, + { + "epoch": 7.858, + "grad_norm": 1.4689480066299438, + "learning_rate": 2e-05, + "loss": 0.05046121, + "step": 3929 + }, + { + "epoch": 7.86, + "grad_norm": 2.288593053817749, + "learning_rate": 2e-05, + "loss": 0.04710715, + "step": 3930 + }, + { + "epoch": 7.862, + "grad_norm": 1.4412630796432495, + "learning_rate": 2e-05, + "loss": 0.04159814, + "step": 3931 + }, + { + "epoch": 7.864, + "grad_norm": 1.4466181993484497, + "learning_rate": 2e-05, + "loss": 0.03699116, + "step": 3932 + }, + { + "epoch": 7.866, + "grad_norm": 3.294027328491211, + "learning_rate": 2e-05, + "loss": 0.05937823, + "step": 3933 + }, + { + "epoch": 7.868, + "grad_norm": 2.1902272701263428, + "learning_rate": 2e-05, + "loss": 0.06694526, + "step": 3934 + }, + { + "epoch": 7.87, + "grad_norm": 2.0902066230773926, + "learning_rate": 2e-05, + "loss": 0.04954188, + "step": 3935 + }, + { + "epoch": 7.872, + "grad_norm": 1.534563422203064, + "learning_rate": 2e-05, + "loss": 0.05196944, + "step": 3936 + }, + { + "epoch": 7.874, + "grad_norm": 1.241812825202942, + "learning_rate": 2e-05, + "loss": 0.04576861, + "step": 3937 + }, + { + "epoch": 7.876, + "grad_norm": 2.344264507293701, + "learning_rate": 2e-05, + "loss": 0.04945996, + "step": 3938 + }, + { + "epoch": 7.878, + "grad_norm": 3.0257856845855713, + "learning_rate": 2e-05, + "loss": 0.05597422, + "step": 3939 + }, + { + "epoch": 7.88, + "grad_norm": 1.6022652387619019, + "learning_rate": 2e-05, + "loss": 0.05853411, + "step": 3940 + }, + { + "epoch": 7.882, + "grad_norm": 1.321745753288269, + "learning_rate": 2e-05, + "loss": 0.05200193, + "step": 3941 + }, + { + "epoch": 7.884, + "grad_norm": 1.6581352949142456, + "learning_rate": 2e-05, + "loss": 0.05210207, + "step": 3942 + }, + { + "epoch": 7.886, + "grad_norm": 2.1290030479431152, + "learning_rate": 2e-05, + "loss": 0.04898274, + "step": 3943 + }, + { + "epoch": 7.888, + "grad_norm": 1.2903791666030884, + "learning_rate": 2e-05, + "loss": 0.03680386, + "step": 3944 + }, + { + "epoch": 7.89, + "grad_norm": 1.1676714420318604, + "learning_rate": 2e-05, + "loss": 0.03573669, + "step": 3945 + }, + { + "epoch": 7.892, + "grad_norm": 3.1287758350372314, + "learning_rate": 2e-05, + "loss": 0.05708075, + "step": 3946 + }, + { + "epoch": 7.894, + "grad_norm": 1.6967209577560425, + "learning_rate": 2e-05, + "loss": 0.0523774, + "step": 3947 + }, + { + "epoch": 7.896, + "grad_norm": 1.3352283239364624, + "learning_rate": 2e-05, + "loss": 0.05093125, + "step": 3948 + }, + { + "epoch": 7.898, + "grad_norm": 1.4991921186447144, + "learning_rate": 2e-05, + "loss": 0.04498458, + "step": 3949 + }, + { + "epoch": 7.9, + "grad_norm": 1.7871036529541016, + "learning_rate": 2e-05, + "loss": 0.05441068, + "step": 3950 + }, + { + "epoch": 7.902, + "grad_norm": 1.8313108682632446, + "learning_rate": 2e-05, + "loss": 0.04300974, + "step": 3951 + }, + { + "epoch": 7.904, + "grad_norm": 2.545856475830078, + "learning_rate": 2e-05, + "loss": 0.0370265, + "step": 3952 + }, + { + "epoch": 7.906, + "grad_norm": 1.4351707696914673, + "learning_rate": 2e-05, + "loss": 0.04847009, + "step": 3953 + }, + { + "epoch": 7.908, + "grad_norm": 1.5962963104248047, + "learning_rate": 2e-05, + "loss": 0.05077235, + "step": 3954 + }, + { + "epoch": 7.91, + "grad_norm": 
1.3633484840393066, + "learning_rate": 2e-05, + "loss": 0.04732682, + "step": 3955 + }, + { + "epoch": 7.912, + "grad_norm": 1.209276556968689, + "learning_rate": 2e-05, + "loss": 0.03878414, + "step": 3956 + }, + { + "epoch": 7.914, + "grad_norm": 1.957251787185669, + "learning_rate": 2e-05, + "loss": 0.07924426, + "step": 3957 + }, + { + "epoch": 7.916, + "grad_norm": 3.1658568382263184, + "learning_rate": 2e-05, + "loss": 0.05959687, + "step": 3958 + }, + { + "epoch": 7.918, + "grad_norm": 0.9807082414627075, + "learning_rate": 2e-05, + "loss": 0.03472539, + "step": 3959 + }, + { + "epoch": 7.92, + "grad_norm": 2.003063917160034, + "learning_rate": 2e-05, + "loss": 0.05765334, + "step": 3960 + }, + { + "epoch": 7.922, + "grad_norm": 1.3743501901626587, + "learning_rate": 2e-05, + "loss": 0.048442, + "step": 3961 + }, + { + "epoch": 7.924, + "grad_norm": 1.9723905324935913, + "learning_rate": 2e-05, + "loss": 0.06652151, + "step": 3962 + }, + { + "epoch": 7.926, + "grad_norm": 1.4909707307815552, + "learning_rate": 2e-05, + "loss": 0.03205038, + "step": 3963 + }, + { + "epoch": 7.928, + "grad_norm": 1.1835213899612427, + "learning_rate": 2e-05, + "loss": 0.03661142, + "step": 3964 + }, + { + "epoch": 7.93, + "grad_norm": 1.435006856918335, + "learning_rate": 2e-05, + "loss": 0.05666805, + "step": 3965 + }, + { + "epoch": 7.932, + "grad_norm": 2.22232723236084, + "learning_rate": 2e-05, + "loss": 0.03182552, + "step": 3966 + }, + { + "epoch": 7.934, + "grad_norm": 1.0161634683609009, + "learning_rate": 2e-05, + "loss": 0.03687178, + "step": 3967 + }, + { + "epoch": 7.936, + "grad_norm": 1.145479679107666, + "learning_rate": 2e-05, + "loss": 0.04139556, + "step": 3968 + }, + { + "epoch": 7.938, + "grad_norm": 1.3257615566253662, + "learning_rate": 2e-05, + "loss": 0.03712116, + "step": 3969 + }, + { + "epoch": 7.9399999999999995, + "grad_norm": 1.5132187604904175, + "learning_rate": 2e-05, + "loss": 0.0525584, + "step": 3970 + }, + { + "epoch": 7.942, + "grad_norm": 1.7809244394302368, + "learning_rate": 2e-05, + "loss": 0.05465652, + "step": 3971 + }, + { + "epoch": 7.944, + "grad_norm": 1.6416901350021362, + "learning_rate": 2e-05, + "loss": 0.05641666, + "step": 3972 + }, + { + "epoch": 7.946, + "grad_norm": 1.5238450765609741, + "learning_rate": 2e-05, + "loss": 0.03642607, + "step": 3973 + }, + { + "epoch": 7.948, + "grad_norm": 2.732297420501709, + "learning_rate": 2e-05, + "loss": 0.05012231, + "step": 3974 + }, + { + "epoch": 7.95, + "grad_norm": 1.3529413938522339, + "learning_rate": 2e-05, + "loss": 0.03961353, + "step": 3975 + }, + { + "epoch": 7.952, + "grad_norm": 3.372781753540039, + "learning_rate": 2e-05, + "loss": 0.04853031, + "step": 3976 + }, + { + "epoch": 7.954, + "grad_norm": 2.362414836883545, + "learning_rate": 2e-05, + "loss": 0.0558218, + "step": 3977 + }, + { + "epoch": 7.9559999999999995, + "grad_norm": 1.9768744707107544, + "learning_rate": 2e-05, + "loss": 0.04110438, + "step": 3978 + }, + { + "epoch": 7.958, + "grad_norm": 1.477582335472107, + "learning_rate": 2e-05, + "loss": 0.04463062, + "step": 3979 + }, + { + "epoch": 7.96, + "grad_norm": 1.62968111038208, + "learning_rate": 2e-05, + "loss": 0.04137983, + "step": 3980 + }, + { + "epoch": 7.962, + "grad_norm": 2.0086047649383545, + "learning_rate": 2e-05, + "loss": 0.05147458, + "step": 3981 + }, + { + "epoch": 7.964, + "grad_norm": 2.941561222076416, + "learning_rate": 2e-05, + "loss": 0.05373877, + "step": 3982 + }, + { + "epoch": 7.966, + "grad_norm": 1.2433993816375732, + "learning_rate": 2e-05, + 
"loss": 0.03581764, + "step": 3983 + }, + { + "epoch": 7.968, + "grad_norm": 1.5138049125671387, + "learning_rate": 2e-05, + "loss": 0.04050705, + "step": 3984 + }, + { + "epoch": 7.97, + "grad_norm": 1.289192795753479, + "learning_rate": 2e-05, + "loss": 0.04422689, + "step": 3985 + }, + { + "epoch": 7.9719999999999995, + "grad_norm": 1.3445667028427124, + "learning_rate": 2e-05, + "loss": 0.03943577, + "step": 3986 + }, + { + "epoch": 7.974, + "grad_norm": 1.3681025505065918, + "learning_rate": 2e-05, + "loss": 0.04766712, + "step": 3987 + }, + { + "epoch": 7.976, + "grad_norm": 1.488904356956482, + "learning_rate": 2e-05, + "loss": 0.04435727, + "step": 3988 + }, + { + "epoch": 7.978, + "grad_norm": 1.3543517589569092, + "learning_rate": 2e-05, + "loss": 0.05017151, + "step": 3989 + }, + { + "epoch": 7.98, + "grad_norm": 1.1544119119644165, + "learning_rate": 2e-05, + "loss": 0.04686942, + "step": 3990 + }, + { + "epoch": 7.982, + "grad_norm": 1.188092589378357, + "learning_rate": 2e-05, + "loss": 0.0426802, + "step": 3991 + }, + { + "epoch": 7.984, + "grad_norm": 1.4243437051773071, + "learning_rate": 2e-05, + "loss": 0.06280725, + "step": 3992 + }, + { + "epoch": 7.986, + "grad_norm": 1.6010160446166992, + "learning_rate": 2e-05, + "loss": 0.0427033, + "step": 3993 + }, + { + "epoch": 7.9879999999999995, + "grad_norm": 1.1920783519744873, + "learning_rate": 2e-05, + "loss": 0.04067779, + "step": 3994 + }, + { + "epoch": 7.99, + "grad_norm": 1.6925147771835327, + "learning_rate": 2e-05, + "loss": 0.05137225, + "step": 3995 + }, + { + "epoch": 7.992, + "grad_norm": 1.1469167470932007, + "learning_rate": 2e-05, + "loss": 0.0329942, + "step": 3996 + }, + { + "epoch": 7.994, + "grad_norm": 1.2153685092926025, + "learning_rate": 2e-05, + "loss": 0.0425185, + "step": 3997 + }, + { + "epoch": 7.996, + "grad_norm": 1.2573164701461792, + "learning_rate": 2e-05, + "loss": 0.0418978, + "step": 3998 + }, + { + "epoch": 7.998, + "grad_norm": 1.892161250114441, + "learning_rate": 2e-05, + "loss": 0.04587626, + "step": 3999 + }, + { + "epoch": 8.0, + "grad_norm": 1.3999619483947754, + "learning_rate": 2e-05, + "loss": 0.0356279, + "step": 4000 + }, + { + "epoch": 8.0, + "eval_performance": { + "AngleClassification_1": 0.998, + "AngleClassification_2": 0.994, + "AngleClassification_3": 0.9241516966067864, + "Equal_1": 0.992, + "Equal_2": 0.9421157684630739, + "Equal_3": 0.8323353293413174, + "LineComparison_1": 0.996, + "LineComparison_2": 0.9960079840319361, + "LineComparison_3": 0.9760479041916168, + "Parallel_1": 0.9839679358717435, + "Parallel_2": 0.9939879759519038, + "Parallel_3": 0.988, + "Perpendicular_1": 0.98, + "Perpendicular_2": 0.838, + "Perpendicular_3": 0.5160320641282565, + "PointLiesOnCircle_1": 0.9979959919839679, + "PointLiesOnCircle_2": 0.9996666666666667, + "PointLiesOnCircle_3": 0.9848666666666667, + "PointLiesOnLine_1": 0.9919839679358717, + "PointLiesOnLine_2": 0.9899799599198397, + "PointLiesOnLine_3": 0.8702594810379242 + }, + "eval_runtime": 320.1064, + "eval_samples_per_second": 32.802, + "eval_steps_per_second": 0.656, + "step": 4000 + }, + { + "epoch": 8.002, + "grad_norm": 1.324432611465454, + "learning_rate": 2e-05, + "loss": 0.05207051, + "step": 4001 + }, + { + "epoch": 8.004, + "grad_norm": 1.0732799768447876, + "learning_rate": 2e-05, + "loss": 0.0246641, + "step": 4002 + }, + { + "epoch": 8.006, + "grad_norm": 2.0701658725738525, + "learning_rate": 2e-05, + "loss": 0.04946818, + "step": 4003 + }, + { + "epoch": 8.008, + "grad_norm": 1.8197208642959595, + 
"learning_rate": 2e-05, + "loss": 0.04853348, + "step": 4004 + }, + { + "epoch": 8.01, + "grad_norm": 1.2051986455917358, + "learning_rate": 2e-05, + "loss": 0.03389344, + "step": 4005 + }, + { + "epoch": 8.012, + "grad_norm": 1.289626121520996, + "learning_rate": 2e-05, + "loss": 0.04067516, + "step": 4006 + }, + { + "epoch": 8.014, + "grad_norm": 3.4996466636657715, + "learning_rate": 2e-05, + "loss": 0.07795639, + "step": 4007 + }, + { + "epoch": 8.016, + "grad_norm": 1.8532557487487793, + "learning_rate": 2e-05, + "loss": 0.04180763, + "step": 4008 + }, + { + "epoch": 8.018, + "grad_norm": 1.4671931266784668, + "learning_rate": 2e-05, + "loss": 0.04495455, + "step": 4009 + }, + { + "epoch": 8.02, + "grad_norm": 1.7721261978149414, + "learning_rate": 2e-05, + "loss": 0.04306563, + "step": 4010 + }, + { + "epoch": 8.022, + "grad_norm": 1.5724709033966064, + "learning_rate": 2e-05, + "loss": 0.03561451, + "step": 4011 + }, + { + "epoch": 8.024, + "grad_norm": 1.0541595220565796, + "learning_rate": 2e-05, + "loss": 0.03057979, + "step": 4012 + }, + { + "epoch": 8.026, + "grad_norm": 1.8422211408615112, + "learning_rate": 2e-05, + "loss": 0.05177515, + "step": 4013 + }, + { + "epoch": 8.028, + "grad_norm": 1.4579343795776367, + "learning_rate": 2e-05, + "loss": 0.04849966, + "step": 4014 + }, + { + "epoch": 8.03, + "grad_norm": 1.6735895872116089, + "learning_rate": 2e-05, + "loss": 0.03308116, + "step": 4015 + }, + { + "epoch": 8.032, + "grad_norm": 1.21261465549469, + "learning_rate": 2e-05, + "loss": 0.03874951, + "step": 4016 + }, + { + "epoch": 8.034, + "grad_norm": 3.218160390853882, + "learning_rate": 2e-05, + "loss": 0.03598247, + "step": 4017 + }, + { + "epoch": 8.036, + "grad_norm": 1.264754056930542, + "learning_rate": 2e-05, + "loss": 0.03381562, + "step": 4018 + }, + { + "epoch": 8.038, + "grad_norm": 2.2951626777648926, + "learning_rate": 2e-05, + "loss": 0.05828879, + "step": 4019 + }, + { + "epoch": 8.04, + "grad_norm": 1.1480064392089844, + "learning_rate": 2e-05, + "loss": 0.03656226, + "step": 4020 + }, + { + "epoch": 8.042, + "grad_norm": 1.7028883695602417, + "learning_rate": 2e-05, + "loss": 0.0495864, + "step": 4021 + }, + { + "epoch": 8.044, + "grad_norm": 2.4957919120788574, + "learning_rate": 2e-05, + "loss": 0.06445777, + "step": 4022 + }, + { + "epoch": 8.046, + "grad_norm": 2.0943565368652344, + "learning_rate": 2e-05, + "loss": 0.05261732, + "step": 4023 + }, + { + "epoch": 8.048, + "grad_norm": 2.1373465061187744, + "learning_rate": 2e-05, + "loss": 0.06218139, + "step": 4024 + }, + { + "epoch": 8.05, + "grad_norm": 1.4479154348373413, + "learning_rate": 2e-05, + "loss": 0.05500597, + "step": 4025 + }, + { + "epoch": 8.052, + "grad_norm": 1.107658863067627, + "learning_rate": 2e-05, + "loss": 0.04318401, + "step": 4026 + }, + { + "epoch": 8.054, + "grad_norm": 1.5696206092834473, + "learning_rate": 2e-05, + "loss": 0.04423702, + "step": 4027 + }, + { + "epoch": 8.056, + "grad_norm": 2.1380436420440674, + "learning_rate": 2e-05, + "loss": 0.04484853, + "step": 4028 + }, + { + "epoch": 8.058, + "grad_norm": 1.4376975297927856, + "learning_rate": 2e-05, + "loss": 0.03389053, + "step": 4029 + }, + { + "epoch": 8.06, + "grad_norm": 1.9726535081863403, + "learning_rate": 2e-05, + "loss": 0.05158387, + "step": 4030 + }, + { + "epoch": 8.062, + "grad_norm": 1.9487265348434448, + "learning_rate": 2e-05, + "loss": 0.05221076, + "step": 4031 + }, + { + "epoch": 8.064, + "grad_norm": 1.4403632879257202, + "learning_rate": 2e-05, + "loss": 0.04184157, + "step": 4032 + }, + 
{ + "epoch": 8.066, + "grad_norm": 1.2656970024108887, + "learning_rate": 2e-05, + "loss": 0.04085121, + "step": 4033 + }, + { + "epoch": 8.068, + "grad_norm": 1.3084996938705444, + "learning_rate": 2e-05, + "loss": 0.03328889, + "step": 4034 + }, + { + "epoch": 8.07, + "grad_norm": 1.760668396949768, + "learning_rate": 2e-05, + "loss": 0.04869121, + "step": 4035 + }, + { + "epoch": 8.072, + "grad_norm": 1.6372207403182983, + "learning_rate": 2e-05, + "loss": 0.04539852, + "step": 4036 + }, + { + "epoch": 8.074, + "grad_norm": 1.2221434116363525, + "learning_rate": 2e-05, + "loss": 0.03623684, + "step": 4037 + }, + { + "epoch": 8.076, + "grad_norm": 1.080573558807373, + "learning_rate": 2e-05, + "loss": 0.03736069, + "step": 4038 + }, + { + "epoch": 8.078, + "grad_norm": 1.135930061340332, + "learning_rate": 2e-05, + "loss": 0.02960359, + "step": 4039 + }, + { + "epoch": 8.08, + "grad_norm": 1.2680519819259644, + "learning_rate": 2e-05, + "loss": 0.04433823, + "step": 4040 + }, + { + "epoch": 8.082, + "grad_norm": 1.3253871202468872, + "learning_rate": 2e-05, + "loss": 0.03741252, + "step": 4041 + }, + { + "epoch": 8.084, + "grad_norm": 1.1967227458953857, + "learning_rate": 2e-05, + "loss": 0.03461279, + "step": 4042 + }, + { + "epoch": 8.086, + "grad_norm": 1.4370568990707397, + "learning_rate": 2e-05, + "loss": 0.05240814, + "step": 4043 + }, + { + "epoch": 8.088, + "grad_norm": 2.1426353454589844, + "learning_rate": 2e-05, + "loss": 0.04719769, + "step": 4044 + }, + { + "epoch": 8.09, + "grad_norm": 1.9252382516860962, + "learning_rate": 2e-05, + "loss": 0.05830365, + "step": 4045 + }, + { + "epoch": 8.092, + "grad_norm": 1.4431771039962769, + "learning_rate": 2e-05, + "loss": 0.05209182, + "step": 4046 + }, + { + "epoch": 8.094, + "grad_norm": 1.4242539405822754, + "learning_rate": 2e-05, + "loss": 0.04281589, + "step": 4047 + }, + { + "epoch": 8.096, + "grad_norm": 2.547609567642212, + "learning_rate": 2e-05, + "loss": 0.04535336, + "step": 4048 + }, + { + "epoch": 8.098, + "grad_norm": 5.5258965492248535, + "learning_rate": 2e-05, + "loss": 0.06601989, + "step": 4049 + }, + { + "epoch": 8.1, + "grad_norm": 1.6703463792800903, + "learning_rate": 2e-05, + "loss": 0.05124013, + "step": 4050 + }, + { + "epoch": 8.102, + "grad_norm": 1.6066875457763672, + "learning_rate": 2e-05, + "loss": 0.05647477, + "step": 4051 + }, + { + "epoch": 8.104, + "grad_norm": 1.0216000080108643, + "learning_rate": 2e-05, + "loss": 0.03139941, + "step": 4052 + }, + { + "epoch": 8.106, + "grad_norm": 1.099047303199768, + "learning_rate": 2e-05, + "loss": 0.0338769, + "step": 4053 + }, + { + "epoch": 8.108, + "grad_norm": 1.3956516981124878, + "learning_rate": 2e-05, + "loss": 0.04579592, + "step": 4054 + }, + { + "epoch": 8.11, + "grad_norm": 1.361737608909607, + "learning_rate": 2e-05, + "loss": 0.04546849, + "step": 4055 + }, + { + "epoch": 8.112, + "grad_norm": 1.3461675643920898, + "learning_rate": 2e-05, + "loss": 0.05312871, + "step": 4056 + }, + { + "epoch": 8.114, + "grad_norm": 1.6900047063827515, + "learning_rate": 2e-05, + "loss": 0.03799757, + "step": 4057 + }, + { + "epoch": 8.116, + "grad_norm": 1.488451361656189, + "learning_rate": 2e-05, + "loss": 0.04809422, + "step": 4058 + }, + { + "epoch": 8.118, + "grad_norm": 1.6791332960128784, + "learning_rate": 2e-05, + "loss": 0.05416831, + "step": 4059 + }, + { + "epoch": 8.12, + "grad_norm": 1.0899131298065186, + "learning_rate": 2e-05, + "loss": 0.03874314, + "step": 4060 + }, + { + "epoch": 8.122, + "grad_norm": 1.2762742042541504, + 
"learning_rate": 2e-05, + "loss": 0.04447027, + "step": 4061 + }, + { + "epoch": 8.124, + "grad_norm": 1.5424968004226685, + "learning_rate": 2e-05, + "loss": 0.04579141, + "step": 4062 + }, + { + "epoch": 8.126, + "grad_norm": 1.8162665367126465, + "learning_rate": 2e-05, + "loss": 0.04598666, + "step": 4063 + }, + { + "epoch": 8.128, + "grad_norm": 1.428122639656067, + "learning_rate": 2e-05, + "loss": 0.05174505, + "step": 4064 + }, + { + "epoch": 8.13, + "grad_norm": 1.538621187210083, + "learning_rate": 2e-05, + "loss": 0.03250303, + "step": 4065 + }, + { + "epoch": 8.132, + "grad_norm": 1.3809401988983154, + "learning_rate": 2e-05, + "loss": 0.03839519, + "step": 4066 + }, + { + "epoch": 8.134, + "grad_norm": 1.4454106092453003, + "learning_rate": 2e-05, + "loss": 0.0449599, + "step": 4067 + }, + { + "epoch": 8.136, + "grad_norm": 1.3159528970718384, + "learning_rate": 2e-05, + "loss": 0.04408343, + "step": 4068 + }, + { + "epoch": 8.138, + "grad_norm": 1.5110152959823608, + "learning_rate": 2e-05, + "loss": 0.03739053, + "step": 4069 + }, + { + "epoch": 8.14, + "grad_norm": 1.500827431678772, + "learning_rate": 2e-05, + "loss": 0.04702216, + "step": 4070 + }, + { + "epoch": 8.142, + "grad_norm": 1.5902290344238281, + "learning_rate": 2e-05, + "loss": 0.04143539, + "step": 4071 + }, + { + "epoch": 8.144, + "grad_norm": 1.9934486150741577, + "learning_rate": 2e-05, + "loss": 0.03529008, + "step": 4072 + }, + { + "epoch": 8.146, + "grad_norm": 1.8927077054977417, + "learning_rate": 2e-05, + "loss": 0.04800845, + "step": 4073 + }, + { + "epoch": 8.148, + "grad_norm": 1.6216754913330078, + "learning_rate": 2e-05, + "loss": 0.05250569, + "step": 4074 + }, + { + "epoch": 8.15, + "grad_norm": 1.7913944721221924, + "learning_rate": 2e-05, + "loss": 0.04616375, + "step": 4075 + }, + { + "epoch": 8.152, + "grad_norm": 1.5655014514923096, + "learning_rate": 2e-05, + "loss": 0.05108138, + "step": 4076 + }, + { + "epoch": 8.154, + "grad_norm": 1.6830389499664307, + "learning_rate": 2e-05, + "loss": 0.05072855, + "step": 4077 + }, + { + "epoch": 8.156, + "grad_norm": 1.5033944845199585, + "learning_rate": 2e-05, + "loss": 0.04459769, + "step": 4078 + }, + { + "epoch": 8.158, + "grad_norm": 1.5763124227523804, + "learning_rate": 2e-05, + "loss": 0.05702509, + "step": 4079 + }, + { + "epoch": 8.16, + "grad_norm": 1.5542982816696167, + "learning_rate": 2e-05, + "loss": 0.05888487, + "step": 4080 + }, + { + "epoch": 8.162, + "grad_norm": 1.8763879537582397, + "learning_rate": 2e-05, + "loss": 0.03832364, + "step": 4081 + }, + { + "epoch": 8.164, + "grad_norm": 2.169970989227295, + "learning_rate": 2e-05, + "loss": 0.06408481, + "step": 4082 + }, + { + "epoch": 8.166, + "grad_norm": 1.4158964157104492, + "learning_rate": 2e-05, + "loss": 0.0494574, + "step": 4083 + }, + { + "epoch": 8.168, + "grad_norm": 1.2305322885513306, + "learning_rate": 2e-05, + "loss": 0.04329508, + "step": 4084 + }, + { + "epoch": 8.17, + "grad_norm": 2.0390779972076416, + "learning_rate": 2e-05, + "loss": 0.03159539, + "step": 4085 + }, + { + "epoch": 8.172, + "grad_norm": 1.616227149963379, + "learning_rate": 2e-05, + "loss": 0.08061463, + "step": 4086 + }, + { + "epoch": 8.174, + "grad_norm": 2.116055965423584, + "learning_rate": 2e-05, + "loss": 0.05509023, + "step": 4087 + }, + { + "epoch": 8.176, + "grad_norm": 1.04578697681427, + "learning_rate": 2e-05, + "loss": 0.03397291, + "step": 4088 + }, + { + "epoch": 8.178, + "grad_norm": 1.4719500541687012, + "learning_rate": 2e-05, + "loss": 0.05373418, + "step": 4089 + }, + { 
+ "epoch": 8.18, + "grad_norm": 1.9806996583938599, + "learning_rate": 2e-05, + "loss": 0.07111174, + "step": 4090 + }, + { + "epoch": 8.182, + "grad_norm": 1.30508291721344, + "learning_rate": 2e-05, + "loss": 0.04480116, + "step": 4091 + }, + { + "epoch": 8.184, + "grad_norm": 0.9421553015708923, + "learning_rate": 2e-05, + "loss": 0.03321157, + "step": 4092 + }, + { + "epoch": 8.186, + "grad_norm": 1.0021604299545288, + "learning_rate": 2e-05, + "loss": 0.04082393, + "step": 4093 + }, + { + "epoch": 8.188, + "grad_norm": 1.6946673393249512, + "learning_rate": 2e-05, + "loss": 0.05384293, + "step": 4094 + }, + { + "epoch": 8.19, + "grad_norm": 1.0426790714263916, + "learning_rate": 2e-05, + "loss": 0.03759111, + "step": 4095 + }, + { + "epoch": 8.192, + "grad_norm": 2.0568313598632812, + "learning_rate": 2e-05, + "loss": 0.05517369, + "step": 4096 + }, + { + "epoch": 8.194, + "grad_norm": 1.5960932970046997, + "learning_rate": 2e-05, + "loss": 0.06000062, + "step": 4097 + }, + { + "epoch": 8.196, + "grad_norm": 1.7500308752059937, + "learning_rate": 2e-05, + "loss": 0.05646428, + "step": 4098 + }, + { + "epoch": 8.198, + "grad_norm": 1.4026926755905151, + "learning_rate": 2e-05, + "loss": 0.03786077, + "step": 4099 + }, + { + "epoch": 8.2, + "grad_norm": 1.0574769973754883, + "learning_rate": 2e-05, + "loss": 0.03918152, + "step": 4100 + }, + { + "epoch": 8.202, + "grad_norm": 1.5338131189346313, + "learning_rate": 2e-05, + "loss": 0.04909813, + "step": 4101 + }, + { + "epoch": 8.204, + "grad_norm": 1.2247587442398071, + "learning_rate": 2e-05, + "loss": 0.03550359, + "step": 4102 + }, + { + "epoch": 8.206, + "grad_norm": 0.9067367315292358, + "learning_rate": 2e-05, + "loss": 0.02538646, + "step": 4103 + }, + { + "epoch": 8.208, + "grad_norm": 1.0645413398742676, + "learning_rate": 2e-05, + "loss": 0.02908769, + "step": 4104 + }, + { + "epoch": 8.21, + "grad_norm": 1.0075726509094238, + "learning_rate": 2e-05, + "loss": 0.03003458, + "step": 4105 + }, + { + "epoch": 8.212, + "grad_norm": 1.2275352478027344, + "learning_rate": 2e-05, + "loss": 0.051549, + "step": 4106 + }, + { + "epoch": 8.214, + "grad_norm": 1.427625298500061, + "learning_rate": 2e-05, + "loss": 0.04538945, + "step": 4107 + }, + { + "epoch": 8.216, + "grad_norm": 1.29180908203125, + "learning_rate": 2e-05, + "loss": 0.04527157, + "step": 4108 + }, + { + "epoch": 8.218, + "grad_norm": 1.678622841835022, + "learning_rate": 2e-05, + "loss": 0.03252411, + "step": 4109 + }, + { + "epoch": 8.22, + "grad_norm": 2.289098024368286, + "learning_rate": 2e-05, + "loss": 0.0732346, + "step": 4110 + }, + { + "epoch": 8.222, + "grad_norm": 1.325034499168396, + "learning_rate": 2e-05, + "loss": 0.05234051, + "step": 4111 + }, + { + "epoch": 8.224, + "grad_norm": 1.978644847869873, + "learning_rate": 2e-05, + "loss": 0.05532134, + "step": 4112 + }, + { + "epoch": 8.226, + "grad_norm": 1.7182114124298096, + "learning_rate": 2e-05, + "loss": 0.0578098, + "step": 4113 + }, + { + "epoch": 8.228, + "grad_norm": 1.4846113920211792, + "learning_rate": 2e-05, + "loss": 0.04360331, + "step": 4114 + }, + { + "epoch": 8.23, + "grad_norm": 1.358889102935791, + "learning_rate": 2e-05, + "loss": 0.05047805, + "step": 4115 + }, + { + "epoch": 8.232, + "grad_norm": 1.1359084844589233, + "learning_rate": 2e-05, + "loss": 0.04568867, + "step": 4116 + }, + { + "epoch": 8.234, + "grad_norm": 1.419180154800415, + "learning_rate": 2e-05, + "loss": 0.03961842, + "step": 4117 + }, + { + "epoch": 8.236, + "grad_norm": 1.3813687562942505, + "learning_rate": 
2e-05, + "loss": 0.05298204, + "step": 4118 + }, + { + "epoch": 8.238, + "grad_norm": 1.8170478343963623, + "learning_rate": 2e-05, + "loss": 0.04838009, + "step": 4119 + }, + { + "epoch": 8.24, + "grad_norm": 2.8637313842773438, + "learning_rate": 2e-05, + "loss": 0.06887215, + "step": 4120 + }, + { + "epoch": 8.242, + "grad_norm": 1.631001591682434, + "learning_rate": 2e-05, + "loss": 0.04441953, + "step": 4121 + }, + { + "epoch": 8.244, + "grad_norm": 1.4597910642623901, + "learning_rate": 2e-05, + "loss": 0.04484698, + "step": 4122 + }, + { + "epoch": 8.246, + "grad_norm": 0.9799979329109192, + "learning_rate": 2e-05, + "loss": 0.02647844, + "step": 4123 + }, + { + "epoch": 8.248, + "grad_norm": 1.7152873277664185, + "learning_rate": 2e-05, + "loss": 0.06567133, + "step": 4124 + }, + { + "epoch": 8.25, + "grad_norm": 1.425673007965088, + "learning_rate": 2e-05, + "loss": 0.04011939, + "step": 4125 + }, + { + "epoch": 8.252, + "grad_norm": 2.331421375274658, + "learning_rate": 2e-05, + "loss": 0.06116243, + "step": 4126 + }, + { + "epoch": 8.254, + "grad_norm": 3.033942461013794, + "learning_rate": 2e-05, + "loss": 0.04823023, + "step": 4127 + }, + { + "epoch": 8.256, + "grad_norm": 1.2400274276733398, + "learning_rate": 2e-05, + "loss": 0.03877744, + "step": 4128 + }, + { + "epoch": 8.258, + "grad_norm": 1.145862340927124, + "learning_rate": 2e-05, + "loss": 0.04107977, + "step": 4129 + }, + { + "epoch": 8.26, + "grad_norm": 1.4702391624450684, + "learning_rate": 2e-05, + "loss": 0.06288863, + "step": 4130 + }, + { + "epoch": 8.262, + "grad_norm": 1.4819300174713135, + "learning_rate": 2e-05, + "loss": 0.05648782, + "step": 4131 + }, + { + "epoch": 8.264, + "grad_norm": 1.4082889556884766, + "learning_rate": 2e-05, + "loss": 0.05002226, + "step": 4132 + }, + { + "epoch": 8.266, + "grad_norm": 1.2530863285064697, + "learning_rate": 2e-05, + "loss": 0.04050343, + "step": 4133 + }, + { + "epoch": 8.268, + "grad_norm": 1.1162209510803223, + "learning_rate": 2e-05, + "loss": 0.03798089, + "step": 4134 + }, + { + "epoch": 8.27, + "grad_norm": 1.6518158912658691, + "learning_rate": 2e-05, + "loss": 0.04391007, + "step": 4135 + }, + { + "epoch": 8.272, + "grad_norm": 1.1009466648101807, + "learning_rate": 2e-05, + "loss": 0.04458437, + "step": 4136 + }, + { + "epoch": 8.274000000000001, + "grad_norm": 1.3596596717834473, + "learning_rate": 2e-05, + "loss": 0.0591647, + "step": 4137 + }, + { + "epoch": 8.276, + "grad_norm": 1.1786128282546997, + "learning_rate": 2e-05, + "loss": 0.03626633, + "step": 4138 + }, + { + "epoch": 8.278, + "grad_norm": 1.3034263849258423, + "learning_rate": 2e-05, + "loss": 0.03502483, + "step": 4139 + }, + { + "epoch": 8.28, + "grad_norm": 1.229113221168518, + "learning_rate": 2e-05, + "loss": 0.05113623, + "step": 4140 + }, + { + "epoch": 8.282, + "grad_norm": 1.813858985900879, + "learning_rate": 2e-05, + "loss": 0.04342901, + "step": 4141 + }, + { + "epoch": 8.284, + "grad_norm": 1.5317081212997437, + "learning_rate": 2e-05, + "loss": 0.05725561, + "step": 4142 + }, + { + "epoch": 8.286, + "grad_norm": 1.4534281492233276, + "learning_rate": 2e-05, + "loss": 0.04506959, + "step": 4143 + }, + { + "epoch": 8.288, + "grad_norm": 1.4005870819091797, + "learning_rate": 2e-05, + "loss": 0.05307934, + "step": 4144 + }, + { + "epoch": 8.29, + "grad_norm": 1.480741262435913, + "learning_rate": 2e-05, + "loss": 0.0603156, + "step": 4145 + }, + { + "epoch": 8.292, + "grad_norm": 1.7137929201126099, + "learning_rate": 2e-05, + "loss": 0.06312002, + "step": 4146 + }, + { + 
"epoch": 8.294, + "grad_norm": 1.4770913124084473, + "learning_rate": 2e-05, + "loss": 0.05373389, + "step": 4147 + }, + { + "epoch": 8.296, + "grad_norm": 1.1751221418380737, + "learning_rate": 2e-05, + "loss": 0.03505269, + "step": 4148 + }, + { + "epoch": 8.298, + "grad_norm": 1.420811653137207, + "learning_rate": 2e-05, + "loss": 0.03696169, + "step": 4149 + }, + { + "epoch": 8.3, + "grad_norm": 1.4480655193328857, + "learning_rate": 2e-05, + "loss": 0.04662473, + "step": 4150 + }, + { + "epoch": 8.302, + "grad_norm": 2.5621485710144043, + "learning_rate": 2e-05, + "loss": 0.04032537, + "step": 4151 + }, + { + "epoch": 8.304, + "grad_norm": 1.8456149101257324, + "learning_rate": 2e-05, + "loss": 0.05323803, + "step": 4152 + }, + { + "epoch": 8.306, + "grad_norm": 1.150632619857788, + "learning_rate": 2e-05, + "loss": 0.03429832, + "step": 4153 + }, + { + "epoch": 8.308, + "grad_norm": 1.3768160343170166, + "learning_rate": 2e-05, + "loss": 0.04700703, + "step": 4154 + }, + { + "epoch": 8.31, + "grad_norm": 3.1976876258850098, + "learning_rate": 2e-05, + "loss": 0.0506537, + "step": 4155 + }, + { + "epoch": 8.312, + "grad_norm": 1.144446611404419, + "learning_rate": 2e-05, + "loss": 0.04074134, + "step": 4156 + }, + { + "epoch": 8.314, + "grad_norm": 1.0326695442199707, + "learning_rate": 2e-05, + "loss": 0.03758462, + "step": 4157 + }, + { + "epoch": 8.316, + "grad_norm": 1.5255597829818726, + "learning_rate": 2e-05, + "loss": 0.05389597, + "step": 4158 + }, + { + "epoch": 8.318, + "grad_norm": 1.6066386699676514, + "learning_rate": 2e-05, + "loss": 0.04561181, + "step": 4159 + }, + { + "epoch": 8.32, + "grad_norm": 1.2478289604187012, + "learning_rate": 2e-05, + "loss": 0.05142663, + "step": 4160 + }, + { + "epoch": 8.322, + "grad_norm": 1.8503996133804321, + "learning_rate": 2e-05, + "loss": 0.04205081, + "step": 4161 + }, + { + "epoch": 8.324, + "grad_norm": 2.421180486679077, + "learning_rate": 2e-05, + "loss": 0.03882204, + "step": 4162 + }, + { + "epoch": 8.326, + "grad_norm": 1.386807918548584, + "learning_rate": 2e-05, + "loss": 0.05854031, + "step": 4163 + }, + { + "epoch": 8.328, + "grad_norm": 2.2929842472076416, + "learning_rate": 2e-05, + "loss": 0.04718957, + "step": 4164 + }, + { + "epoch": 8.33, + "grad_norm": 0.9168434739112854, + "learning_rate": 2e-05, + "loss": 0.02453806, + "step": 4165 + }, + { + "epoch": 8.332, + "grad_norm": 6.381651401519775, + "learning_rate": 2e-05, + "loss": 0.06762838, + "step": 4166 + }, + { + "epoch": 8.334, + "grad_norm": 1.249552607536316, + "learning_rate": 2e-05, + "loss": 0.0523265, + "step": 4167 + }, + { + "epoch": 8.336, + "grad_norm": 1.3933881521224976, + "learning_rate": 2e-05, + "loss": 0.04636804, + "step": 4168 + }, + { + "epoch": 8.338, + "grad_norm": 1.6516380310058594, + "learning_rate": 2e-05, + "loss": 0.04308794, + "step": 4169 + }, + { + "epoch": 8.34, + "grad_norm": 1.5074326992034912, + "learning_rate": 2e-05, + "loss": 0.03881112, + "step": 4170 + }, + { + "epoch": 8.342, + "grad_norm": 1.099740743637085, + "learning_rate": 2e-05, + "loss": 0.04978018, + "step": 4171 + }, + { + "epoch": 8.344, + "grad_norm": 1.2606548070907593, + "learning_rate": 2e-05, + "loss": 0.04098994, + "step": 4172 + }, + { + "epoch": 8.346, + "grad_norm": 1.293941617012024, + "learning_rate": 2e-05, + "loss": 0.04166168, + "step": 4173 + }, + { + "epoch": 8.348, + "grad_norm": 1.2298349142074585, + "learning_rate": 2e-05, + "loss": 0.04704496, + "step": 4174 + }, + { + "epoch": 8.35, + "grad_norm": 1.1148796081542969, + "learning_rate": 
2e-05, + "loss": 0.04025009, + "step": 4175 + }, + { + "epoch": 8.352, + "grad_norm": 1.6456156969070435, + "learning_rate": 2e-05, + "loss": 0.04465715, + "step": 4176 + }, + { + "epoch": 8.354, + "grad_norm": 1.3357329368591309, + "learning_rate": 2e-05, + "loss": 0.04234711, + "step": 4177 + }, + { + "epoch": 8.356, + "grad_norm": 1.0840578079223633, + "learning_rate": 2e-05, + "loss": 0.0413487, + "step": 4178 + }, + { + "epoch": 8.358, + "grad_norm": 1.1986706256866455, + "learning_rate": 2e-05, + "loss": 0.04364356, + "step": 4179 + }, + { + "epoch": 8.36, + "grad_norm": 1.2560433149337769, + "learning_rate": 2e-05, + "loss": 0.04892932, + "step": 4180 + }, + { + "epoch": 8.362, + "grad_norm": 1.2587891817092896, + "learning_rate": 2e-05, + "loss": 0.04414427, + "step": 4181 + }, + { + "epoch": 8.364, + "grad_norm": 1.3355039358139038, + "learning_rate": 2e-05, + "loss": 0.05022869, + "step": 4182 + }, + { + "epoch": 8.366, + "grad_norm": 1.277706503868103, + "learning_rate": 2e-05, + "loss": 0.04096631, + "step": 4183 + }, + { + "epoch": 8.368, + "grad_norm": 1.781320333480835, + "learning_rate": 2e-05, + "loss": 0.0590826, + "step": 4184 + }, + { + "epoch": 8.37, + "grad_norm": 1.9013862609863281, + "learning_rate": 2e-05, + "loss": 0.04787229, + "step": 4185 + }, + { + "epoch": 8.372, + "grad_norm": 1.6217079162597656, + "learning_rate": 2e-05, + "loss": 0.05203458, + "step": 4186 + }, + { + "epoch": 8.374, + "grad_norm": 1.394450068473816, + "learning_rate": 2e-05, + "loss": 0.05752437, + "step": 4187 + }, + { + "epoch": 8.376, + "grad_norm": 0.8604865670204163, + "learning_rate": 2e-05, + "loss": 0.02740677, + "step": 4188 + }, + { + "epoch": 8.378, + "grad_norm": 1.4956645965576172, + "learning_rate": 2e-05, + "loss": 0.0441338, + "step": 4189 + }, + { + "epoch": 8.38, + "grad_norm": 1.9489855766296387, + "learning_rate": 2e-05, + "loss": 0.07236366, + "step": 4190 + }, + { + "epoch": 8.382, + "grad_norm": 1.3766798973083496, + "learning_rate": 2e-05, + "loss": 0.04797775, + "step": 4191 + }, + { + "epoch": 8.384, + "grad_norm": 1.082892894744873, + "learning_rate": 2e-05, + "loss": 0.04361612, + "step": 4192 + }, + { + "epoch": 8.386, + "grad_norm": 1.0579158067703247, + "learning_rate": 2e-05, + "loss": 0.04247576, + "step": 4193 + }, + { + "epoch": 8.388, + "grad_norm": 2.016787052154541, + "learning_rate": 2e-05, + "loss": 0.05003262, + "step": 4194 + }, + { + "epoch": 8.39, + "grad_norm": 1.2439075708389282, + "learning_rate": 2e-05, + "loss": 0.04027151, + "step": 4195 + }, + { + "epoch": 8.392, + "grad_norm": 1.218614935874939, + "learning_rate": 2e-05, + "loss": 0.04031168, + "step": 4196 + }, + { + "epoch": 8.394, + "grad_norm": 1.2276456356048584, + "learning_rate": 2e-05, + "loss": 0.04959612, + "step": 4197 + }, + { + "epoch": 8.396, + "grad_norm": 1.965421199798584, + "learning_rate": 2e-05, + "loss": 0.04630712, + "step": 4198 + }, + { + "epoch": 8.398, + "grad_norm": 1.103851318359375, + "learning_rate": 2e-05, + "loss": 0.03305928, + "step": 4199 + }, + { + "epoch": 8.4, + "grad_norm": 1.183467149734497, + "learning_rate": 2e-05, + "loss": 0.04819242, + "step": 4200 + }, + { + "epoch": 8.402, + "grad_norm": 1.2708202600479126, + "learning_rate": 2e-05, + "loss": 0.04293144, + "step": 4201 + }, + { + "epoch": 8.404, + "grad_norm": 1.534544587135315, + "learning_rate": 2e-05, + "loss": 0.04298072, + "step": 4202 + }, + { + "epoch": 8.406, + "grad_norm": 1.9671140909194946, + "learning_rate": 2e-05, + "loss": 0.05531413, + "step": 4203 + }, + { + "epoch": 8.408, + 
"grad_norm": 1.3386366367340088, + "learning_rate": 2e-05, + "loss": 0.03748547, + "step": 4204 + }, + { + "epoch": 8.41, + "grad_norm": 2.237844467163086, + "learning_rate": 2e-05, + "loss": 0.05199295, + "step": 4205 + }, + { + "epoch": 8.412, + "grad_norm": 1.5903010368347168, + "learning_rate": 2e-05, + "loss": 0.04232503, + "step": 4206 + }, + { + "epoch": 8.414, + "grad_norm": 1.1368825435638428, + "learning_rate": 2e-05, + "loss": 0.03277536, + "step": 4207 + }, + { + "epoch": 8.416, + "grad_norm": 1.1275688409805298, + "learning_rate": 2e-05, + "loss": 0.04298157, + "step": 4208 + }, + { + "epoch": 8.418, + "grad_norm": 1.7428966760635376, + "learning_rate": 2e-05, + "loss": 0.05383879, + "step": 4209 + }, + { + "epoch": 8.42, + "grad_norm": 1.735756516456604, + "learning_rate": 2e-05, + "loss": 0.06661352, + "step": 4210 + }, + { + "epoch": 8.422, + "grad_norm": 1.376869797706604, + "learning_rate": 2e-05, + "loss": 0.04623829, + "step": 4211 + }, + { + "epoch": 8.424, + "grad_norm": 2.886859178543091, + "learning_rate": 2e-05, + "loss": 0.06156854, + "step": 4212 + }, + { + "epoch": 8.426, + "grad_norm": 1.3723516464233398, + "learning_rate": 2e-05, + "loss": 0.04914469, + "step": 4213 + }, + { + "epoch": 8.428, + "grad_norm": 1.2038379907608032, + "learning_rate": 2e-05, + "loss": 0.0332229, + "step": 4214 + }, + { + "epoch": 8.43, + "grad_norm": 1.1281368732452393, + "learning_rate": 2e-05, + "loss": 0.03839248, + "step": 4215 + }, + { + "epoch": 8.432, + "grad_norm": 1.2697054147720337, + "learning_rate": 2e-05, + "loss": 0.03925816, + "step": 4216 + }, + { + "epoch": 8.434, + "grad_norm": 1.1074013710021973, + "learning_rate": 2e-05, + "loss": 0.04592735, + "step": 4217 + }, + { + "epoch": 8.436, + "grad_norm": 1.722346305847168, + "learning_rate": 2e-05, + "loss": 0.05641073, + "step": 4218 + }, + { + "epoch": 8.438, + "grad_norm": 1.3256508111953735, + "learning_rate": 2e-05, + "loss": 0.04843621, + "step": 4219 + }, + { + "epoch": 8.44, + "grad_norm": 1.3698394298553467, + "learning_rate": 2e-05, + "loss": 0.04274657, + "step": 4220 + }, + { + "epoch": 8.442, + "grad_norm": 1.7315605878829956, + "learning_rate": 2e-05, + "loss": 0.05456549, + "step": 4221 + }, + { + "epoch": 8.444, + "grad_norm": 1.2529323101043701, + "learning_rate": 2e-05, + "loss": 0.02203387, + "step": 4222 + }, + { + "epoch": 8.446, + "grad_norm": 2.7959108352661133, + "learning_rate": 2e-05, + "loss": 0.05525438, + "step": 4223 + }, + { + "epoch": 8.448, + "grad_norm": 1.3126109838485718, + "learning_rate": 2e-05, + "loss": 0.0481948, + "step": 4224 + }, + { + "epoch": 8.45, + "grad_norm": 1.4437806606292725, + "learning_rate": 2e-05, + "loss": 0.05020737, + "step": 4225 + }, + { + "epoch": 8.452, + "grad_norm": 1.2649277448654175, + "learning_rate": 2e-05, + "loss": 0.04435945, + "step": 4226 + }, + { + "epoch": 8.454, + "grad_norm": 1.9862298965454102, + "learning_rate": 2e-05, + "loss": 0.04354966, + "step": 4227 + }, + { + "epoch": 8.456, + "grad_norm": 1.2959977388381958, + "learning_rate": 2e-05, + "loss": 0.04303737, + "step": 4228 + }, + { + "epoch": 8.458, + "grad_norm": 1.9561591148376465, + "learning_rate": 2e-05, + "loss": 0.04295287, + "step": 4229 + }, + { + "epoch": 8.46, + "grad_norm": 1.8894590139389038, + "learning_rate": 2e-05, + "loss": 0.04403112, + "step": 4230 + }, + { + "epoch": 8.462, + "grad_norm": 2.1889703273773193, + "learning_rate": 2e-05, + "loss": 0.06404384, + "step": 4231 + }, + { + "epoch": 8.464, + "grad_norm": 1.6557297706604004, + "learning_rate": 2e-05, + 
"loss": 0.05207235, + "step": 4232 + }, + { + "epoch": 8.466, + "grad_norm": 1.6295475959777832, + "learning_rate": 2e-05, + "loss": 0.04500223, + "step": 4233 + }, + { + "epoch": 8.468, + "grad_norm": 1.411564588546753, + "learning_rate": 2e-05, + "loss": 0.04487611, + "step": 4234 + }, + { + "epoch": 8.47, + "grad_norm": 1.886080026626587, + "learning_rate": 2e-05, + "loss": 0.03916345, + "step": 4235 + }, + { + "epoch": 8.472, + "grad_norm": 1.2690297365188599, + "learning_rate": 2e-05, + "loss": 0.03984193, + "step": 4236 + }, + { + "epoch": 8.474, + "grad_norm": 1.3404802083969116, + "learning_rate": 2e-05, + "loss": 0.05036929, + "step": 4237 + }, + { + "epoch": 8.475999999999999, + "grad_norm": 1.5249813795089722, + "learning_rate": 2e-05, + "loss": 0.04390936, + "step": 4238 + }, + { + "epoch": 8.478, + "grad_norm": 1.550538182258606, + "learning_rate": 2e-05, + "loss": 0.06574559, + "step": 4239 + }, + { + "epoch": 8.48, + "grad_norm": 1.3451995849609375, + "learning_rate": 2e-05, + "loss": 0.04779359, + "step": 4240 + }, + { + "epoch": 8.482, + "grad_norm": 1.6042060852050781, + "learning_rate": 2e-05, + "loss": 0.04354705, + "step": 4241 + }, + { + "epoch": 8.484, + "grad_norm": 1.3204594850540161, + "learning_rate": 2e-05, + "loss": 0.03058997, + "step": 4242 + }, + { + "epoch": 8.486, + "grad_norm": 1.1289502382278442, + "learning_rate": 2e-05, + "loss": 0.04202674, + "step": 4243 + }, + { + "epoch": 8.488, + "grad_norm": 1.6240679025650024, + "learning_rate": 2e-05, + "loss": 0.05241815, + "step": 4244 + }, + { + "epoch": 8.49, + "grad_norm": 1.0163280963897705, + "learning_rate": 2e-05, + "loss": 0.03246521, + "step": 4245 + }, + { + "epoch": 8.492, + "grad_norm": 1.3230700492858887, + "learning_rate": 2e-05, + "loss": 0.04476965, + "step": 4246 + }, + { + "epoch": 8.494, + "grad_norm": 1.686160683631897, + "learning_rate": 2e-05, + "loss": 0.05470574, + "step": 4247 + }, + { + "epoch": 8.496, + "grad_norm": 1.419310450553894, + "learning_rate": 2e-05, + "loss": 0.04859785, + "step": 4248 + }, + { + "epoch": 8.498, + "grad_norm": 1.431846261024475, + "learning_rate": 2e-05, + "loss": 0.04167376, + "step": 4249 + }, + { + "epoch": 8.5, + "grad_norm": 1.4465125799179077, + "learning_rate": 2e-05, + "loss": 0.06550825, + "step": 4250 + }, + { + "epoch": 8.502, + "grad_norm": 2.1822566986083984, + "learning_rate": 2e-05, + "loss": 0.05803905, + "step": 4251 + }, + { + "epoch": 8.504, + "grad_norm": 1.2648488283157349, + "learning_rate": 2e-05, + "loss": 0.05157655, + "step": 4252 + }, + { + "epoch": 8.506, + "grad_norm": 1.1298472881317139, + "learning_rate": 2e-05, + "loss": 0.04487451, + "step": 4253 + }, + { + "epoch": 8.508, + "grad_norm": 1.696395754814148, + "learning_rate": 2e-05, + "loss": 0.04066026, + "step": 4254 + }, + { + "epoch": 8.51, + "grad_norm": 1.068587064743042, + "learning_rate": 2e-05, + "loss": 0.02926126, + "step": 4255 + }, + { + "epoch": 8.512, + "grad_norm": 1.2515802383422852, + "learning_rate": 2e-05, + "loss": 0.04963365, + "step": 4256 + }, + { + "epoch": 8.514, + "grad_norm": 1.1776715517044067, + "learning_rate": 2e-05, + "loss": 0.03471005, + "step": 4257 + }, + { + "epoch": 8.516, + "grad_norm": 1.47025728225708, + "learning_rate": 2e-05, + "loss": 0.04580873, + "step": 4258 + }, + { + "epoch": 8.518, + "grad_norm": 1.3387584686279297, + "learning_rate": 2e-05, + "loss": 0.0392997, + "step": 4259 + }, + { + "epoch": 8.52, + "grad_norm": 1.7431012392044067, + "learning_rate": 2e-05, + "loss": 0.04956745, + "step": 4260 + }, + { + "epoch": 
8.522, + "grad_norm": 2.1029186248779297, + "learning_rate": 2e-05, + "loss": 0.0650689, + "step": 4261 + }, + { + "epoch": 8.524000000000001, + "grad_norm": 1.218745470046997, + "learning_rate": 2e-05, + "loss": 0.03940692, + "step": 4262 + }, + { + "epoch": 8.526, + "grad_norm": 1.8055042028427124, + "learning_rate": 2e-05, + "loss": 0.04726121, + "step": 4263 + }, + { + "epoch": 8.528, + "grad_norm": 1.2838687896728516, + "learning_rate": 2e-05, + "loss": 0.04223323, + "step": 4264 + }, + { + "epoch": 8.53, + "grad_norm": 1.1817346811294556, + "learning_rate": 2e-05, + "loss": 0.03910241, + "step": 4265 + }, + { + "epoch": 8.532, + "grad_norm": 2.022214651107788, + "learning_rate": 2e-05, + "loss": 0.05597316, + "step": 4266 + }, + { + "epoch": 8.534, + "grad_norm": 2.186800479888916, + "learning_rate": 2e-05, + "loss": 0.03829915, + "step": 4267 + }, + { + "epoch": 8.536, + "grad_norm": 1.6917003393173218, + "learning_rate": 2e-05, + "loss": 0.05600104, + "step": 4268 + }, + { + "epoch": 8.538, + "grad_norm": 1.4837727546691895, + "learning_rate": 2e-05, + "loss": 0.04096905, + "step": 4269 + }, + { + "epoch": 8.54, + "grad_norm": 1.8344814777374268, + "learning_rate": 2e-05, + "loss": 0.0464284, + "step": 4270 + }, + { + "epoch": 8.542, + "grad_norm": 1.7132585048675537, + "learning_rate": 2e-05, + "loss": 0.04327628, + "step": 4271 + }, + { + "epoch": 8.544, + "grad_norm": 1.674782395362854, + "learning_rate": 2e-05, + "loss": 0.04533742, + "step": 4272 + }, + { + "epoch": 8.546, + "grad_norm": 1.6211055517196655, + "learning_rate": 2e-05, + "loss": 0.04555016, + "step": 4273 + }, + { + "epoch": 8.548, + "grad_norm": 2.098546266555786, + "learning_rate": 2e-05, + "loss": 0.06424834, + "step": 4274 + }, + { + "epoch": 8.55, + "grad_norm": 1.812697172164917, + "learning_rate": 2e-05, + "loss": 0.05120952, + "step": 4275 + }, + { + "epoch": 8.552, + "grad_norm": 1.0520689487457275, + "learning_rate": 2e-05, + "loss": 0.03506008, + "step": 4276 + }, + { + "epoch": 8.554, + "grad_norm": 1.5413808822631836, + "learning_rate": 2e-05, + "loss": 0.03506286, + "step": 4277 + }, + { + "epoch": 8.556000000000001, + "grad_norm": 0.9895492196083069, + "learning_rate": 2e-05, + "loss": 0.03194591, + "step": 4278 + }, + { + "epoch": 8.558, + "grad_norm": 2.4871461391448975, + "learning_rate": 2e-05, + "loss": 0.04312088, + "step": 4279 + }, + { + "epoch": 8.56, + "grad_norm": 2.135133743286133, + "learning_rate": 2e-05, + "loss": 0.05813865, + "step": 4280 + }, + { + "epoch": 8.562, + "grad_norm": 1.7616969347000122, + "learning_rate": 2e-05, + "loss": 0.05905205, + "step": 4281 + }, + { + "epoch": 8.564, + "grad_norm": 1.1986794471740723, + "learning_rate": 2e-05, + "loss": 0.04172123, + "step": 4282 + }, + { + "epoch": 8.566, + "grad_norm": 1.0787440538406372, + "learning_rate": 2e-05, + "loss": 0.03688566, + "step": 4283 + }, + { + "epoch": 8.568, + "grad_norm": 1.3387644290924072, + "learning_rate": 2e-05, + "loss": 0.04537062, + "step": 4284 + }, + { + "epoch": 8.57, + "grad_norm": 2.0228538513183594, + "learning_rate": 2e-05, + "loss": 0.06465685, + "step": 4285 + }, + { + "epoch": 8.572, + "grad_norm": 1.7300091981887817, + "learning_rate": 2e-05, + "loss": 0.05473095, + "step": 4286 + }, + { + "epoch": 8.574, + "grad_norm": 1.032915711402893, + "learning_rate": 2e-05, + "loss": 0.0316976, + "step": 4287 + }, + { + "epoch": 8.576, + "grad_norm": 1.168629765510559, + "learning_rate": 2e-05, + "loss": 0.0534174, + "step": 4288 + }, + { + "epoch": 8.578, + "grad_norm": 1.6948527097702026, + 
"learning_rate": 2e-05, + "loss": 0.05644149, + "step": 4289 + }, + { + "epoch": 8.58, + "grad_norm": 1.5725301504135132, + "learning_rate": 2e-05, + "loss": 0.04966125, + "step": 4290 + }, + { + "epoch": 8.582, + "grad_norm": 1.6310228109359741, + "learning_rate": 2e-05, + "loss": 0.04760915, + "step": 4291 + }, + { + "epoch": 8.584, + "grad_norm": 1.232686996459961, + "learning_rate": 2e-05, + "loss": 0.03924007, + "step": 4292 + }, + { + "epoch": 8.586, + "grad_norm": 1.2287647724151611, + "learning_rate": 2e-05, + "loss": 0.03555786, + "step": 4293 + }, + { + "epoch": 8.588, + "grad_norm": 2.1337671279907227, + "learning_rate": 2e-05, + "loss": 0.03892121, + "step": 4294 + }, + { + "epoch": 8.59, + "grad_norm": 0.8796424865722656, + "learning_rate": 2e-05, + "loss": 0.02620396, + "step": 4295 + }, + { + "epoch": 8.592, + "grad_norm": 1.5610501766204834, + "learning_rate": 2e-05, + "loss": 0.05278179, + "step": 4296 + }, + { + "epoch": 8.594, + "grad_norm": 1.3259083032608032, + "learning_rate": 2e-05, + "loss": 0.05059385, + "step": 4297 + }, + { + "epoch": 8.596, + "grad_norm": 1.2965830564498901, + "learning_rate": 2e-05, + "loss": 0.04437011, + "step": 4298 + }, + { + "epoch": 8.598, + "grad_norm": 1.28528892993927, + "learning_rate": 2e-05, + "loss": 0.04673284, + "step": 4299 + }, + { + "epoch": 8.6, + "grad_norm": 1.567898154258728, + "learning_rate": 2e-05, + "loss": 0.05069918, + "step": 4300 + }, + { + "epoch": 8.602, + "grad_norm": 1.5278816223144531, + "learning_rate": 2e-05, + "loss": 0.04407776, + "step": 4301 + }, + { + "epoch": 8.604, + "grad_norm": 2.5078423023223877, + "learning_rate": 2e-05, + "loss": 0.04832874, + "step": 4302 + }, + { + "epoch": 8.606, + "grad_norm": 3.015632390975952, + "learning_rate": 2e-05, + "loss": 0.05268471, + "step": 4303 + }, + { + "epoch": 8.608, + "grad_norm": 1.8364429473876953, + "learning_rate": 2e-05, + "loss": 0.04438996, + "step": 4304 + }, + { + "epoch": 8.61, + "grad_norm": 2.421682119369507, + "learning_rate": 2e-05, + "loss": 0.05156955, + "step": 4305 + }, + { + "epoch": 8.612, + "grad_norm": 1.4038487672805786, + "learning_rate": 2e-05, + "loss": 0.03900445, + "step": 4306 + }, + { + "epoch": 8.614, + "grad_norm": 1.6436742544174194, + "learning_rate": 2e-05, + "loss": 0.05502493, + "step": 4307 + }, + { + "epoch": 8.616, + "grad_norm": 1.551708459854126, + "learning_rate": 2e-05, + "loss": 0.052692, + "step": 4308 + }, + { + "epoch": 8.618, + "grad_norm": 1.148949384689331, + "learning_rate": 2e-05, + "loss": 0.04152766, + "step": 4309 + }, + { + "epoch": 8.62, + "grad_norm": 1.9535950422286987, + "learning_rate": 2e-05, + "loss": 0.04875577, + "step": 4310 + }, + { + "epoch": 8.622, + "grad_norm": 2.445028066635132, + "learning_rate": 2e-05, + "loss": 0.05650291, + "step": 4311 + }, + { + "epoch": 8.624, + "grad_norm": 1.3438160419464111, + "learning_rate": 2e-05, + "loss": 0.04274925, + "step": 4312 + }, + { + "epoch": 8.626, + "grad_norm": 1.1872248649597168, + "learning_rate": 2e-05, + "loss": 0.05045023, + "step": 4313 + }, + { + "epoch": 8.628, + "grad_norm": 0.948291540145874, + "learning_rate": 2e-05, + "loss": 0.03525672, + "step": 4314 + }, + { + "epoch": 8.63, + "grad_norm": 1.2028381824493408, + "learning_rate": 2e-05, + "loss": 0.03516944, + "step": 4315 + }, + { + "epoch": 8.632, + "grad_norm": 1.463492512702942, + "learning_rate": 2e-05, + "loss": 0.05510323, + "step": 4316 + }, + { + "epoch": 8.634, + "grad_norm": 1.7272472381591797, + "learning_rate": 2e-05, + "loss": 0.04611889, + "step": 4317 + }, + { + 
"epoch": 8.636, + "grad_norm": 1.8609179258346558, + "learning_rate": 2e-05, + "loss": 0.05178494, + "step": 4318 + }, + { + "epoch": 8.638, + "grad_norm": 1.6433526277542114, + "learning_rate": 2e-05, + "loss": 0.05495635, + "step": 4319 + }, + { + "epoch": 8.64, + "grad_norm": 2.0367109775543213, + "learning_rate": 2e-05, + "loss": 0.05452634, + "step": 4320 + }, + { + "epoch": 8.642, + "grad_norm": 1.3078205585479736, + "learning_rate": 2e-05, + "loss": 0.04912869, + "step": 4321 + }, + { + "epoch": 8.644, + "grad_norm": 1.7824225425720215, + "learning_rate": 2e-05, + "loss": 0.06498539, + "step": 4322 + }, + { + "epoch": 8.646, + "grad_norm": 1.2698032855987549, + "learning_rate": 2e-05, + "loss": 0.03695104, + "step": 4323 + }, + { + "epoch": 8.648, + "grad_norm": 1.456852912902832, + "learning_rate": 2e-05, + "loss": 0.04406907, + "step": 4324 + }, + { + "epoch": 8.65, + "grad_norm": 1.1602133512496948, + "learning_rate": 2e-05, + "loss": 0.03222954, + "step": 4325 + }, + { + "epoch": 8.652, + "grad_norm": 1.945662021636963, + "learning_rate": 2e-05, + "loss": 0.05972052, + "step": 4326 + }, + { + "epoch": 8.654, + "grad_norm": 1.0064737796783447, + "learning_rate": 2e-05, + "loss": 0.03110239, + "step": 4327 + }, + { + "epoch": 8.656, + "grad_norm": 1.220845103263855, + "learning_rate": 2e-05, + "loss": 0.04152022, + "step": 4328 + }, + { + "epoch": 8.658, + "grad_norm": 1.2207328081130981, + "learning_rate": 2e-05, + "loss": 0.0476977, + "step": 4329 + }, + { + "epoch": 8.66, + "grad_norm": 2.3454084396362305, + "learning_rate": 2e-05, + "loss": 0.05494661, + "step": 4330 + }, + { + "epoch": 8.662, + "grad_norm": 1.1374601125717163, + "learning_rate": 2e-05, + "loss": 0.04281973, + "step": 4331 + }, + { + "epoch": 8.664, + "grad_norm": 1.3726214170455933, + "learning_rate": 2e-05, + "loss": 0.0457747, + "step": 4332 + }, + { + "epoch": 8.666, + "grad_norm": 2.3275725841522217, + "learning_rate": 2e-05, + "loss": 0.0412515, + "step": 4333 + }, + { + "epoch": 8.668, + "grad_norm": 1.3664522171020508, + "learning_rate": 2e-05, + "loss": 0.04096291, + "step": 4334 + }, + { + "epoch": 8.67, + "grad_norm": 1.359914779663086, + "learning_rate": 2e-05, + "loss": 0.04645447, + "step": 4335 + }, + { + "epoch": 8.672, + "grad_norm": 1.2743496894836426, + "learning_rate": 2e-05, + "loss": 0.03728715, + "step": 4336 + }, + { + "epoch": 8.674, + "grad_norm": 1.1057415008544922, + "learning_rate": 2e-05, + "loss": 0.03883886, + "step": 4337 + }, + { + "epoch": 8.676, + "grad_norm": 2.738429546356201, + "learning_rate": 2e-05, + "loss": 0.05373093, + "step": 4338 + }, + { + "epoch": 8.678, + "grad_norm": 1.3785386085510254, + "learning_rate": 2e-05, + "loss": 0.04660262, + "step": 4339 + }, + { + "epoch": 8.68, + "grad_norm": 1.4276515245437622, + "learning_rate": 2e-05, + "loss": 0.04423345, + "step": 4340 + }, + { + "epoch": 8.682, + "grad_norm": 1.684853196144104, + "learning_rate": 2e-05, + "loss": 0.05505417, + "step": 4341 + }, + { + "epoch": 8.684, + "grad_norm": 1.370588779449463, + "learning_rate": 2e-05, + "loss": 0.04044452, + "step": 4342 + }, + { + "epoch": 8.686, + "grad_norm": 1.3224636316299438, + "learning_rate": 2e-05, + "loss": 0.06086275, + "step": 4343 + }, + { + "epoch": 8.688, + "grad_norm": 1.181963324546814, + "learning_rate": 2e-05, + "loss": 0.04808854, + "step": 4344 + }, + { + "epoch": 8.69, + "grad_norm": 1.409883975982666, + "learning_rate": 2e-05, + "loss": 0.05268676, + "step": 4345 + }, + { + "epoch": 8.692, + "grad_norm": 5.802072525024414, + "learning_rate": 
2e-05, + "loss": 0.05748282, + "step": 4346 + }, + { + "epoch": 8.693999999999999, + "grad_norm": 1.2217000722885132, + "learning_rate": 2e-05, + "loss": 0.03751257, + "step": 4347 + }, + { + "epoch": 8.696, + "grad_norm": 1.3667843341827393, + "learning_rate": 2e-05, + "loss": 0.04032121, + "step": 4348 + }, + { + "epoch": 8.698, + "grad_norm": 1.1752148866653442, + "learning_rate": 2e-05, + "loss": 0.03708064, + "step": 4349 + }, + { + "epoch": 8.7, + "grad_norm": 0.9877530336380005, + "learning_rate": 2e-05, + "loss": 0.02924716, + "step": 4350 + }, + { + "epoch": 8.702, + "grad_norm": 1.4895782470703125, + "learning_rate": 2e-05, + "loss": 0.04259634, + "step": 4351 + }, + { + "epoch": 8.704, + "grad_norm": 1.0186421871185303, + "learning_rate": 2e-05, + "loss": 0.04007004, + "step": 4352 + }, + { + "epoch": 8.706, + "grad_norm": 1.8200626373291016, + "learning_rate": 2e-05, + "loss": 0.05589747, + "step": 4353 + }, + { + "epoch": 8.708, + "grad_norm": 1.385703444480896, + "learning_rate": 2e-05, + "loss": 0.04419235, + "step": 4354 + }, + { + "epoch": 8.71, + "grad_norm": 1.624614953994751, + "learning_rate": 2e-05, + "loss": 0.03844646, + "step": 4355 + }, + { + "epoch": 8.712, + "grad_norm": 1.4218086004257202, + "learning_rate": 2e-05, + "loss": 0.05011544, + "step": 4356 + }, + { + "epoch": 8.714, + "grad_norm": 1.4951738119125366, + "learning_rate": 2e-05, + "loss": 0.05101977, + "step": 4357 + }, + { + "epoch": 8.716, + "grad_norm": 1.324513554573059, + "learning_rate": 2e-05, + "loss": 0.04258473, + "step": 4358 + }, + { + "epoch": 8.718, + "grad_norm": 1.4158241748809814, + "learning_rate": 2e-05, + "loss": 0.06014551, + "step": 4359 + }, + { + "epoch": 8.72, + "grad_norm": 1.1224169731140137, + "learning_rate": 2e-05, + "loss": 0.04492978, + "step": 4360 + }, + { + "epoch": 8.722, + "grad_norm": 1.1736798286437988, + "learning_rate": 2e-05, + "loss": 0.04360911, + "step": 4361 + }, + { + "epoch": 8.724, + "grad_norm": 1.0621728897094727, + "learning_rate": 2e-05, + "loss": 0.03657208, + "step": 4362 + }, + { + "epoch": 8.725999999999999, + "grad_norm": 1.2689882516860962, + "learning_rate": 2e-05, + "loss": 0.05324752, + "step": 4363 + }, + { + "epoch": 8.728, + "grad_norm": 1.0777446031570435, + "learning_rate": 2e-05, + "loss": 0.0410941, + "step": 4364 + }, + { + "epoch": 8.73, + "grad_norm": 1.4247167110443115, + "learning_rate": 2e-05, + "loss": 0.04764949, + "step": 4365 + }, + { + "epoch": 8.732, + "grad_norm": 1.272257685661316, + "learning_rate": 2e-05, + "loss": 0.03613374, + "step": 4366 + }, + { + "epoch": 8.734, + "grad_norm": 0.9077991843223572, + "learning_rate": 2e-05, + "loss": 0.03062226, + "step": 4367 + }, + { + "epoch": 8.736, + "grad_norm": 1.2713913917541504, + "learning_rate": 2e-05, + "loss": 0.05430157, + "step": 4368 + }, + { + "epoch": 8.738, + "grad_norm": 1.4644341468811035, + "learning_rate": 2e-05, + "loss": 0.05187561, + "step": 4369 + }, + { + "epoch": 8.74, + "grad_norm": 1.0588116645812988, + "learning_rate": 2e-05, + "loss": 0.03304412, + "step": 4370 + }, + { + "epoch": 8.742, + "grad_norm": 2.1608660221099854, + "learning_rate": 2e-05, + "loss": 0.05963966, + "step": 4371 + }, + { + "epoch": 8.744, + "grad_norm": 1.0907118320465088, + "learning_rate": 2e-05, + "loss": 0.03733838, + "step": 4372 + }, + { + "epoch": 8.746, + "grad_norm": 2.0541343688964844, + "learning_rate": 2e-05, + "loss": 0.04198984, + "step": 4373 + }, + { + "epoch": 8.748, + "grad_norm": 1.6834027767181396, + "learning_rate": 2e-05, + "loss": 0.02988002, + "step": 
4374 + }, + { + "epoch": 8.75, + "grad_norm": 1.2698630094528198, + "learning_rate": 2e-05, + "loss": 0.04798315, + "step": 4375 + }, + { + "epoch": 8.752, + "grad_norm": 2.5312836170196533, + "learning_rate": 2e-05, + "loss": 0.0558813, + "step": 4376 + }, + { + "epoch": 8.754, + "grad_norm": 1.4217891693115234, + "learning_rate": 2e-05, + "loss": 0.05483246, + "step": 4377 + }, + { + "epoch": 8.756, + "grad_norm": 1.6704591512680054, + "learning_rate": 2e-05, + "loss": 0.0550387, + "step": 4378 + }, + { + "epoch": 8.758, + "grad_norm": 1.5293974876403809, + "learning_rate": 2e-05, + "loss": 0.05219758, + "step": 4379 + }, + { + "epoch": 8.76, + "grad_norm": 1.3517143726348877, + "learning_rate": 2e-05, + "loss": 0.04489984, + "step": 4380 + }, + { + "epoch": 8.762, + "grad_norm": 1.3553781509399414, + "learning_rate": 2e-05, + "loss": 0.04696585, + "step": 4381 + }, + { + "epoch": 8.764, + "grad_norm": 1.3556764125823975, + "learning_rate": 2e-05, + "loss": 0.03715477, + "step": 4382 + }, + { + "epoch": 8.766, + "grad_norm": 2.0850911140441895, + "learning_rate": 2e-05, + "loss": 0.05471366, + "step": 4383 + }, + { + "epoch": 8.768, + "grad_norm": 1.2767047882080078, + "learning_rate": 2e-05, + "loss": 0.03740011, + "step": 4384 + }, + { + "epoch": 8.77, + "grad_norm": 1.2590597867965698, + "learning_rate": 2e-05, + "loss": 0.0351716, + "step": 4385 + }, + { + "epoch": 8.772, + "grad_norm": 1.1498363018035889, + "learning_rate": 2e-05, + "loss": 0.04321297, + "step": 4386 + }, + { + "epoch": 8.774000000000001, + "grad_norm": 1.0147833824157715, + "learning_rate": 2e-05, + "loss": 0.03920402, + "step": 4387 + }, + { + "epoch": 8.776, + "grad_norm": 1.6818196773529053, + "learning_rate": 2e-05, + "loss": 0.05098328, + "step": 4388 + }, + { + "epoch": 8.778, + "grad_norm": 1.4138433933258057, + "learning_rate": 2e-05, + "loss": 0.03823352, + "step": 4389 + }, + { + "epoch": 8.78, + "grad_norm": 1.0351346731185913, + "learning_rate": 2e-05, + "loss": 0.03335971, + "step": 4390 + }, + { + "epoch": 8.782, + "grad_norm": 1.7980780601501465, + "learning_rate": 2e-05, + "loss": 0.07383931, + "step": 4391 + }, + { + "epoch": 8.784, + "grad_norm": 1.1801741123199463, + "learning_rate": 2e-05, + "loss": 0.04325704, + "step": 4392 + }, + { + "epoch": 8.786, + "grad_norm": 0.9786828756332397, + "learning_rate": 2e-05, + "loss": 0.03433499, + "step": 4393 + }, + { + "epoch": 8.788, + "grad_norm": 1.6378285884857178, + "learning_rate": 2e-05, + "loss": 0.03292269, + "step": 4394 + }, + { + "epoch": 8.79, + "grad_norm": 1.5976024866104126, + "learning_rate": 2e-05, + "loss": 0.05588251, + "step": 4395 + }, + { + "epoch": 8.792, + "grad_norm": 4.473025798797607, + "learning_rate": 2e-05, + "loss": 0.07918499, + "step": 4396 + }, + { + "epoch": 8.794, + "grad_norm": 1.5586071014404297, + "learning_rate": 2e-05, + "loss": 0.05898192, + "step": 4397 + }, + { + "epoch": 8.796, + "grad_norm": 2.6317224502563477, + "learning_rate": 2e-05, + "loss": 0.05829602, + "step": 4398 + }, + { + "epoch": 8.798, + "grad_norm": 1.3767268657684326, + "learning_rate": 2e-05, + "loss": 0.05414031, + "step": 4399 + }, + { + "epoch": 8.8, + "grad_norm": 0.9226973652839661, + "learning_rate": 2e-05, + "loss": 0.02986227, + "step": 4400 + }, + { + "epoch": 8.802, + "grad_norm": 2.223020315170288, + "learning_rate": 2e-05, + "loss": 0.05309587, + "step": 4401 + }, + { + "epoch": 8.804, + "grad_norm": 1.2311885356903076, + "learning_rate": 2e-05, + "loss": 0.04153631, + "step": 4402 + }, + { + "epoch": 8.806000000000001, + 
"grad_norm": 1.4580053091049194, + "learning_rate": 2e-05, + "loss": 0.0463781, + "step": 4403 + }, + { + "epoch": 8.808, + "grad_norm": 1.253326416015625, + "learning_rate": 2e-05, + "loss": 0.03931756, + "step": 4404 + }, + { + "epoch": 8.81, + "grad_norm": 1.2515385150909424, + "learning_rate": 2e-05, + "loss": 0.04841346, + "step": 4405 + }, + { + "epoch": 8.812, + "grad_norm": 1.8925061225891113, + "learning_rate": 2e-05, + "loss": 0.04482786, + "step": 4406 + }, + { + "epoch": 8.814, + "grad_norm": 1.552807331085205, + "learning_rate": 2e-05, + "loss": 0.05065498, + "step": 4407 + }, + { + "epoch": 8.816, + "grad_norm": 1.1485422849655151, + "learning_rate": 2e-05, + "loss": 0.04279656, + "step": 4408 + }, + { + "epoch": 8.818, + "grad_norm": 1.3679656982421875, + "learning_rate": 2e-05, + "loss": 0.04369261, + "step": 4409 + }, + { + "epoch": 8.82, + "grad_norm": 1.7776272296905518, + "learning_rate": 2e-05, + "loss": 0.03488866, + "step": 4410 + }, + { + "epoch": 8.822, + "grad_norm": 1.3123103380203247, + "learning_rate": 2e-05, + "loss": 0.0491695, + "step": 4411 + }, + { + "epoch": 8.824, + "grad_norm": 1.5892049074172974, + "learning_rate": 2e-05, + "loss": 0.04135371, + "step": 4412 + }, + { + "epoch": 8.826, + "grad_norm": 1.5765873193740845, + "learning_rate": 2e-05, + "loss": 0.04748004, + "step": 4413 + }, + { + "epoch": 8.828, + "grad_norm": 1.5640965700149536, + "learning_rate": 2e-05, + "loss": 0.06020439, + "step": 4414 + }, + { + "epoch": 8.83, + "grad_norm": 1.4658604860305786, + "learning_rate": 2e-05, + "loss": 0.04439153, + "step": 4415 + }, + { + "epoch": 8.832, + "grad_norm": 1.7687060832977295, + "learning_rate": 2e-05, + "loss": 0.05248412, + "step": 4416 + }, + { + "epoch": 8.834, + "grad_norm": 1.102990746498108, + "learning_rate": 2e-05, + "loss": 0.03225242, + "step": 4417 + }, + { + "epoch": 8.836, + "grad_norm": 1.2133313417434692, + "learning_rate": 2e-05, + "loss": 0.03089228, + "step": 4418 + }, + { + "epoch": 8.838, + "grad_norm": 2.1788408756256104, + "learning_rate": 2e-05, + "loss": 0.05756471, + "step": 4419 + }, + { + "epoch": 8.84, + "grad_norm": 1.5459779500961304, + "learning_rate": 2e-05, + "loss": 0.03825875, + "step": 4420 + }, + { + "epoch": 8.842, + "grad_norm": 2.9150943756103516, + "learning_rate": 2e-05, + "loss": 0.05558386, + "step": 4421 + }, + { + "epoch": 8.844, + "grad_norm": 1.6710635423660278, + "learning_rate": 2e-05, + "loss": 0.06062218, + "step": 4422 + }, + { + "epoch": 8.846, + "grad_norm": 1.0483447313308716, + "learning_rate": 2e-05, + "loss": 0.03988266, + "step": 4423 + }, + { + "epoch": 8.848, + "grad_norm": 1.760643720626831, + "learning_rate": 2e-05, + "loss": 0.04801723, + "step": 4424 + }, + { + "epoch": 8.85, + "grad_norm": 1.5282479524612427, + "learning_rate": 2e-05, + "loss": 0.05213196, + "step": 4425 + }, + { + "epoch": 8.852, + "grad_norm": 1.7912678718566895, + "learning_rate": 2e-05, + "loss": 0.04636016, + "step": 4426 + }, + { + "epoch": 8.854, + "grad_norm": 0.8681804537773132, + "learning_rate": 2e-05, + "loss": 0.03162609, + "step": 4427 + }, + { + "epoch": 8.856, + "grad_norm": 1.3080195188522339, + "learning_rate": 2e-05, + "loss": 0.05422079, + "step": 4428 + }, + { + "epoch": 8.858, + "grad_norm": 1.337620735168457, + "learning_rate": 2e-05, + "loss": 0.05549601, + "step": 4429 + }, + { + "epoch": 8.86, + "grad_norm": 1.0535590648651123, + "learning_rate": 2e-05, + "loss": 0.04026527, + "step": 4430 + }, + { + "epoch": 8.862, + "grad_norm": 1.1052578687667847, + "learning_rate": 2e-05, + 
"loss": 0.03884507, + "step": 4431 + }, + { + "epoch": 8.864, + "grad_norm": 1.7922765016555786, + "learning_rate": 2e-05, + "loss": 0.05645222, + "step": 4432 + }, + { + "epoch": 8.866, + "grad_norm": 1.553610920906067, + "learning_rate": 2e-05, + "loss": 0.055147, + "step": 4433 + }, + { + "epoch": 8.868, + "grad_norm": 1.6815420389175415, + "learning_rate": 2e-05, + "loss": 0.0513879, + "step": 4434 + }, + { + "epoch": 8.87, + "grad_norm": 2.2995853424072266, + "learning_rate": 2e-05, + "loss": 0.0562899, + "step": 4435 + }, + { + "epoch": 8.872, + "grad_norm": 1.9725992679595947, + "learning_rate": 2e-05, + "loss": 0.057873, + "step": 4436 + }, + { + "epoch": 8.874, + "grad_norm": 2.180466890335083, + "learning_rate": 2e-05, + "loss": 0.04819129, + "step": 4437 + }, + { + "epoch": 8.876, + "grad_norm": 1.4554449319839478, + "learning_rate": 2e-05, + "loss": 0.04344202, + "step": 4438 + }, + { + "epoch": 8.878, + "grad_norm": 1.6162256002426147, + "learning_rate": 2e-05, + "loss": 0.04701218, + "step": 4439 + }, + { + "epoch": 8.88, + "grad_norm": 0.8536230325698853, + "learning_rate": 2e-05, + "loss": 0.02937111, + "step": 4440 + }, + { + "epoch": 8.882, + "grad_norm": 1.6734143495559692, + "learning_rate": 2e-05, + "loss": 0.05779783, + "step": 4441 + }, + { + "epoch": 8.884, + "grad_norm": 1.528039574623108, + "learning_rate": 2e-05, + "loss": 0.0378891, + "step": 4442 + }, + { + "epoch": 8.886, + "grad_norm": 1.4532936811447144, + "learning_rate": 2e-05, + "loss": 0.04875976, + "step": 4443 + }, + { + "epoch": 8.888, + "grad_norm": 1.1509127616882324, + "learning_rate": 2e-05, + "loss": 0.04518646, + "step": 4444 + }, + { + "epoch": 8.89, + "grad_norm": 1.0691490173339844, + "learning_rate": 2e-05, + "loss": 0.0388993, + "step": 4445 + }, + { + "epoch": 8.892, + "grad_norm": 1.3075459003448486, + "learning_rate": 2e-05, + "loss": 0.0404879, + "step": 4446 + }, + { + "epoch": 8.894, + "grad_norm": 1.5355843305587769, + "learning_rate": 2e-05, + "loss": 0.06093806, + "step": 4447 + }, + { + "epoch": 8.896, + "grad_norm": 1.2035893201828003, + "learning_rate": 2e-05, + "loss": 0.03944387, + "step": 4448 + }, + { + "epoch": 8.898, + "grad_norm": 1.2087056636810303, + "learning_rate": 2e-05, + "loss": 0.04656249, + "step": 4449 + }, + { + "epoch": 8.9, + "grad_norm": 1.3271777629852295, + "learning_rate": 2e-05, + "loss": 0.04627533, + "step": 4450 + }, + { + "epoch": 8.902, + "grad_norm": 1.2811769247055054, + "learning_rate": 2e-05, + "loss": 0.04390157, + "step": 4451 + }, + { + "epoch": 8.904, + "grad_norm": 1.3496356010437012, + "learning_rate": 2e-05, + "loss": 0.05148339, + "step": 4452 + }, + { + "epoch": 8.906, + "grad_norm": 2.2644078731536865, + "learning_rate": 2e-05, + "loss": 0.06438026, + "step": 4453 + }, + { + "epoch": 8.908, + "grad_norm": 0.9715155363082886, + "learning_rate": 2e-05, + "loss": 0.02947888, + "step": 4454 + }, + { + "epoch": 8.91, + "grad_norm": 1.787411093711853, + "learning_rate": 2e-05, + "loss": 0.04107827, + "step": 4455 + }, + { + "epoch": 8.912, + "grad_norm": 1.1115169525146484, + "learning_rate": 2e-05, + "loss": 0.0399262, + "step": 4456 + }, + { + "epoch": 8.914, + "grad_norm": 1.2536200284957886, + "learning_rate": 2e-05, + "loss": 0.03909591, + "step": 4457 + }, + { + "epoch": 8.916, + "grad_norm": 1.3224055767059326, + "learning_rate": 2e-05, + "loss": 0.04420022, + "step": 4458 + }, + { + "epoch": 8.918, + "grad_norm": 1.4246692657470703, + "learning_rate": 2e-05, + "loss": 0.03702656, + "step": 4459 + }, + { + "epoch": 8.92, + 
"grad_norm": 1.9952268600463867, + "learning_rate": 2e-05, + "loss": 0.05900434, + "step": 4460 + }, + { + "epoch": 8.922, + "grad_norm": 2.0388376712799072, + "learning_rate": 2e-05, + "loss": 0.05166218, + "step": 4461 + }, + { + "epoch": 8.924, + "grad_norm": 1.1385499238967896, + "learning_rate": 2e-05, + "loss": 0.04556513, + "step": 4462 + }, + { + "epoch": 8.926, + "grad_norm": 1.5945521593093872, + "learning_rate": 2e-05, + "loss": 0.0496194, + "step": 4463 + }, + { + "epoch": 8.928, + "grad_norm": 1.6476668119430542, + "learning_rate": 2e-05, + "loss": 0.04806423, + "step": 4464 + }, + { + "epoch": 8.93, + "grad_norm": 1.9072840213775635, + "learning_rate": 2e-05, + "loss": 0.05876537, + "step": 4465 + }, + { + "epoch": 8.932, + "grad_norm": 1.7775014638900757, + "learning_rate": 2e-05, + "loss": 0.05251986, + "step": 4466 + }, + { + "epoch": 8.934, + "grad_norm": 1.2985152006149292, + "learning_rate": 2e-05, + "loss": 0.0463018, + "step": 4467 + }, + { + "epoch": 8.936, + "grad_norm": 1.8547874689102173, + "learning_rate": 2e-05, + "loss": 0.06041159, + "step": 4468 + }, + { + "epoch": 8.938, + "grad_norm": 2.1976423263549805, + "learning_rate": 2e-05, + "loss": 0.03414884, + "step": 4469 + }, + { + "epoch": 8.94, + "grad_norm": 0.9419854283332825, + "learning_rate": 2e-05, + "loss": 0.02789684, + "step": 4470 + }, + { + "epoch": 8.942, + "grad_norm": 1.1904888153076172, + "learning_rate": 2e-05, + "loss": 0.04499068, + "step": 4471 + }, + { + "epoch": 8.943999999999999, + "grad_norm": 1.1681838035583496, + "learning_rate": 2e-05, + "loss": 0.04271141, + "step": 4472 + }, + { + "epoch": 8.946, + "grad_norm": 1.388636827468872, + "learning_rate": 2e-05, + "loss": 0.04021178, + "step": 4473 + }, + { + "epoch": 8.948, + "grad_norm": 1.6245975494384766, + "learning_rate": 2e-05, + "loss": 0.05697055, + "step": 4474 + }, + { + "epoch": 8.95, + "grad_norm": 1.1198829412460327, + "learning_rate": 2e-05, + "loss": 0.03857917, + "step": 4475 + }, + { + "epoch": 8.952, + "grad_norm": 1.9190189838409424, + "learning_rate": 2e-05, + "loss": 0.04297666, + "step": 4476 + }, + { + "epoch": 8.954, + "grad_norm": 1.584479808807373, + "learning_rate": 2e-05, + "loss": 0.06143427, + "step": 4477 + }, + { + "epoch": 8.956, + "grad_norm": 1.6050777435302734, + "learning_rate": 2e-05, + "loss": 0.04966013, + "step": 4478 + }, + { + "epoch": 8.958, + "grad_norm": 2.1203346252441406, + "learning_rate": 2e-05, + "loss": 0.06286226, + "step": 4479 + }, + { + "epoch": 8.96, + "grad_norm": 1.5613123178482056, + "learning_rate": 2e-05, + "loss": 0.04335446, + "step": 4480 + }, + { + "epoch": 8.962, + "grad_norm": 1.7688831090927124, + "learning_rate": 2e-05, + "loss": 0.05140468, + "step": 4481 + }, + { + "epoch": 8.964, + "grad_norm": 1.1225483417510986, + "learning_rate": 2e-05, + "loss": 0.03839725, + "step": 4482 + }, + { + "epoch": 8.966, + "grad_norm": 1.546858549118042, + "learning_rate": 2e-05, + "loss": 0.04927346, + "step": 4483 + }, + { + "epoch": 8.968, + "grad_norm": 1.4144601821899414, + "learning_rate": 2e-05, + "loss": 0.0489133, + "step": 4484 + }, + { + "epoch": 8.97, + "grad_norm": 1.2596808671951294, + "learning_rate": 2e-05, + "loss": 0.04895072, + "step": 4485 + }, + { + "epoch": 8.972, + "grad_norm": 1.3473176956176758, + "learning_rate": 2e-05, + "loss": 0.04276847, + "step": 4486 + }, + { + "epoch": 8.974, + "grad_norm": 1.0893629789352417, + "learning_rate": 2e-05, + "loss": 0.02666626, + "step": 4487 + }, + { + "epoch": 8.975999999999999, + "grad_norm": 1.3393633365631104, + 
"learning_rate": 2e-05, + "loss": 0.04725213, + "step": 4488 + }, + { + "epoch": 8.978, + "grad_norm": 1.2494399547576904, + "learning_rate": 2e-05, + "loss": 0.04335026, + "step": 4489 + }, + { + "epoch": 8.98, + "grad_norm": 1.394800066947937, + "learning_rate": 2e-05, + "loss": 0.05108353, + "step": 4490 + }, + { + "epoch": 8.982, + "grad_norm": 2.5832700729370117, + "learning_rate": 2e-05, + "loss": 0.03944442, + "step": 4491 + }, + { + "epoch": 8.984, + "grad_norm": 2.0006890296936035, + "learning_rate": 2e-05, + "loss": 0.06387276, + "step": 4492 + }, + { + "epoch": 8.986, + "grad_norm": 1.664270043373108, + "learning_rate": 2e-05, + "loss": 0.0435832, + "step": 4493 + }, + { + "epoch": 8.988, + "grad_norm": 1.6541657447814941, + "learning_rate": 2e-05, + "loss": 0.05543558, + "step": 4494 + }, + { + "epoch": 8.99, + "grad_norm": 1.3456977605819702, + "learning_rate": 2e-05, + "loss": 0.03495708, + "step": 4495 + }, + { + "epoch": 8.992, + "grad_norm": 1.4035969972610474, + "learning_rate": 2e-05, + "loss": 0.04944149, + "step": 4496 + }, + { + "epoch": 8.994, + "grad_norm": 2.1596810817718506, + "learning_rate": 2e-05, + "loss": 0.0587335, + "step": 4497 + }, + { + "epoch": 8.996, + "grad_norm": 1.4159624576568604, + "learning_rate": 2e-05, + "loss": 0.04697796, + "step": 4498 + }, + { + "epoch": 8.998, + "grad_norm": 2.2335352897644043, + "learning_rate": 2e-05, + "loss": 0.05628205, + "step": 4499 + }, + { + "epoch": 9.0, + "grad_norm": 1.6927722692489624, + "learning_rate": 2e-05, + "loss": 0.04296227, + "step": 4500 + }, + { + "epoch": 9.0, + "eval_performance": { + "AngleClassification_1": 0.992, + "AngleClassification_2": 0.99, + "AngleClassification_3": 0.8922155688622755, + "Equal_1": 0.99, + "Equal_2": 0.9600798403193613, + "Equal_3": 0.8662674650698603, + "LineComparison_1": 0.998, + "LineComparison_2": 1.0, + "LineComparison_3": 0.9700598802395209, + "Parallel_1": 0.969939879759519, + "Parallel_2": 0.9919839679358717, + "Parallel_3": 0.988, + "Perpendicular_1": 0.992, + "Perpendicular_2": 0.828, + "Perpendicular_3": 0.533066132264529, + "PointLiesOnCircle_1": 0.9979959919839679, + "PointLiesOnCircle_2": 0.9896666666666667, + "PointLiesOnCircle_3": 0.9916, + "PointLiesOnLine_1": 0.9979959919839679, + "PointLiesOnLine_2": 0.9939879759519038, + "PointLiesOnLine_3": 0.9041916167664671 + }, + "eval_runtime": 320.19, + "eval_samples_per_second": 32.793, + "eval_steps_per_second": 0.656, + "step": 4500 + }, + { + "epoch": 9.002, + "grad_norm": 2.0088605880737305, + "learning_rate": 2e-05, + "loss": 0.05657805, + "step": 4501 + }, + { + "epoch": 9.004, + "grad_norm": 3.5889363288879395, + "learning_rate": 2e-05, + "loss": 0.08259207, + "step": 4502 + }, + { + "epoch": 9.006, + "grad_norm": 2.149068593978882, + "learning_rate": 2e-05, + "loss": 0.05930202, + "step": 4503 + }, + { + "epoch": 9.008, + "grad_norm": 4.037459373474121, + "learning_rate": 2e-05, + "loss": 0.0556262, + "step": 4504 + }, + { + "epoch": 9.01, + "grad_norm": 1.781816840171814, + "learning_rate": 2e-05, + "loss": 0.05204342, + "step": 4505 + }, + { + "epoch": 9.012, + "grad_norm": 2.256213903427124, + "learning_rate": 2e-05, + "loss": 0.07549164, + "step": 4506 + }, + { + "epoch": 9.014, + "grad_norm": 2.2270400524139404, + "learning_rate": 2e-05, + "loss": 0.10077457, + "step": 4507 + }, + { + "epoch": 9.016, + "grad_norm": 1.9802483320236206, + "learning_rate": 2e-05, + "loss": 0.0692708, + "step": 4508 + }, + { + "epoch": 9.018, + "grad_norm": 1.4657119512557983, + "learning_rate": 2e-05, + "loss": 
0.05361843, + "step": 4509 + }, + { + "epoch": 9.02, + "grad_norm": 1.6600518226623535, + "learning_rate": 2e-05, + "loss": 0.05417612, + "step": 4510 + }, + { + "epoch": 9.022, + "grad_norm": 1.6829038858413696, + "learning_rate": 2e-05, + "loss": 0.05836887, + "step": 4511 + }, + { + "epoch": 9.024, + "grad_norm": 1.7397302389144897, + "learning_rate": 2e-05, + "loss": 0.06527451, + "step": 4512 + }, + { + "epoch": 9.026, + "grad_norm": 2.1521365642547607, + "learning_rate": 2e-05, + "loss": 0.0745489, + "step": 4513 + }, + { + "epoch": 9.028, + "grad_norm": 1.5954670906066895, + "learning_rate": 2e-05, + "loss": 0.04902204, + "step": 4514 + }, + { + "epoch": 9.03, + "grad_norm": 1.6460261344909668, + "learning_rate": 2e-05, + "loss": 0.05216154, + "step": 4515 + }, + { + "epoch": 9.032, + "grad_norm": 1.4781848192214966, + "learning_rate": 2e-05, + "loss": 0.05158208, + "step": 4516 + }, + { + "epoch": 9.034, + "grad_norm": 1.3276011943817139, + "learning_rate": 2e-05, + "loss": 0.04840251, + "step": 4517 + }, + { + "epoch": 9.036, + "grad_norm": 1.4326094388961792, + "learning_rate": 2e-05, + "loss": 0.06191067, + "step": 4518 + }, + { + "epoch": 9.038, + "grad_norm": 1.330862045288086, + "learning_rate": 2e-05, + "loss": 0.05839656, + "step": 4519 + }, + { + "epoch": 9.04, + "grad_norm": 2.0986196994781494, + "learning_rate": 2e-05, + "loss": 0.06784381, + "step": 4520 + }, + { + "epoch": 9.042, + "grad_norm": 2.1825203895568848, + "learning_rate": 2e-05, + "loss": 0.04452966, + "step": 4521 + }, + { + "epoch": 9.044, + "grad_norm": 1.90194571018219, + "learning_rate": 2e-05, + "loss": 0.06857677, + "step": 4522 + }, + { + "epoch": 9.046, + "grad_norm": 2.0310020446777344, + "learning_rate": 2e-05, + "loss": 0.04652988, + "step": 4523 + }, + { + "epoch": 9.048, + "grad_norm": 1.5352383852005005, + "learning_rate": 2e-05, + "loss": 0.04297383, + "step": 4524 + }, + { + "epoch": 9.05, + "grad_norm": 2.4602177143096924, + "learning_rate": 2e-05, + "loss": 0.04376163, + "step": 4525 + }, + { + "epoch": 9.052, + "grad_norm": 1.6497063636779785, + "learning_rate": 2e-05, + "loss": 0.0518771, + "step": 4526 + }, + { + "epoch": 9.054, + "grad_norm": 1.3094106912612915, + "learning_rate": 2e-05, + "loss": 0.04263393, + "step": 4527 + }, + { + "epoch": 9.056, + "grad_norm": 2.1495842933654785, + "learning_rate": 2e-05, + "loss": 0.06488663, + "step": 4528 + }, + { + "epoch": 9.058, + "grad_norm": 1.4335927963256836, + "learning_rate": 2e-05, + "loss": 0.04027612, + "step": 4529 + }, + { + "epoch": 9.06, + "grad_norm": 1.6587024927139282, + "learning_rate": 2e-05, + "loss": 0.06693022, + "step": 4530 + }, + { + "epoch": 9.062, + "grad_norm": 3.3605124950408936, + "learning_rate": 2e-05, + "loss": 0.07780167, + "step": 4531 + }, + { + "epoch": 9.064, + "grad_norm": 1.3746273517608643, + "learning_rate": 2e-05, + "loss": 0.06263963, + "step": 4532 + }, + { + "epoch": 9.066, + "grad_norm": 1.8702830076217651, + "learning_rate": 2e-05, + "loss": 0.04445141, + "step": 4533 + }, + { + "epoch": 9.068, + "grad_norm": 1.272550344467163, + "learning_rate": 2e-05, + "loss": 0.04481889, + "step": 4534 + }, + { + "epoch": 9.07, + "grad_norm": 1.7748891115188599, + "learning_rate": 2e-05, + "loss": 0.04557388, + "step": 4535 + }, + { + "epoch": 9.072, + "grad_norm": 6.206629276275635, + "learning_rate": 2e-05, + "loss": 0.07212655, + "step": 4536 + }, + { + "epoch": 9.074, + "grad_norm": 1.981056809425354, + "learning_rate": 2e-05, + "loss": 0.05645072, + "step": 4537 + }, + { + "epoch": 9.076, + "grad_norm": 
2.8909831047058105, + "learning_rate": 2e-05, + "loss": 0.05474135, + "step": 4538 + }, + { + "epoch": 9.078, + "grad_norm": 1.981854796409607, + "learning_rate": 2e-05, + "loss": 0.05070494, + "step": 4539 + }, + { + "epoch": 9.08, + "grad_norm": 1.315761923789978, + "learning_rate": 2e-05, + "loss": 0.04502026, + "step": 4540 + }, + { + "epoch": 9.082, + "grad_norm": 1.2324833869934082, + "learning_rate": 2e-05, + "loss": 0.04134127, + "step": 4541 + }, + { + "epoch": 9.084, + "grad_norm": 2.135284185409546, + "learning_rate": 2e-05, + "loss": 0.05175925, + "step": 4542 + }, + { + "epoch": 9.086, + "grad_norm": 4.033361434936523, + "learning_rate": 2e-05, + "loss": 0.07532454, + "step": 4543 + }, + { + "epoch": 9.088, + "grad_norm": 1.9662163257598877, + "learning_rate": 2e-05, + "loss": 0.05289062, + "step": 4544 + }, + { + "epoch": 9.09, + "grad_norm": 1.4949467182159424, + "learning_rate": 2e-05, + "loss": 0.05135939, + "step": 4545 + }, + { + "epoch": 9.092, + "grad_norm": 1.5621041059494019, + "learning_rate": 2e-05, + "loss": 0.05257202, + "step": 4546 + }, + { + "epoch": 9.094, + "grad_norm": 2.7926948070526123, + "learning_rate": 2e-05, + "loss": 0.05770051, + "step": 4547 + }, + { + "epoch": 9.096, + "grad_norm": 1.2026070356369019, + "learning_rate": 2e-05, + "loss": 0.03686951, + "step": 4548 + }, + { + "epoch": 9.098, + "grad_norm": 1.5503953695297241, + "learning_rate": 2e-05, + "loss": 0.06764029, + "step": 4549 + }, + { + "epoch": 9.1, + "grad_norm": 1.4695518016815186, + "learning_rate": 2e-05, + "loss": 0.040582, + "step": 4550 + }, + { + "epoch": 9.102, + "grad_norm": 1.5409952402114868, + "learning_rate": 2e-05, + "loss": 0.04492978, + "step": 4551 + }, + { + "epoch": 9.104, + "grad_norm": 1.1860859394073486, + "learning_rate": 2e-05, + "loss": 0.03985904, + "step": 4552 + }, + { + "epoch": 9.106, + "grad_norm": 1.1103501319885254, + "learning_rate": 2e-05, + "loss": 0.03564502, + "step": 4553 + }, + { + "epoch": 9.108, + "grad_norm": 3.4660491943359375, + "learning_rate": 2e-05, + "loss": 0.07822317, + "step": 4554 + }, + { + "epoch": 9.11, + "grad_norm": 2.640744686126709, + "learning_rate": 2e-05, + "loss": 0.05545685, + "step": 4555 + }, + { + "epoch": 9.112, + "grad_norm": 2.2998898029327393, + "learning_rate": 2e-05, + "loss": 0.06051971, + "step": 4556 + }, + { + "epoch": 9.114, + "grad_norm": 2.3359665870666504, + "learning_rate": 2e-05, + "loss": 0.06999737, + "step": 4557 + }, + { + "epoch": 9.116, + "grad_norm": 1.607606291770935, + "learning_rate": 2e-05, + "loss": 0.05824485, + "step": 4558 + }, + { + "epoch": 9.118, + "grad_norm": 1.0786031484603882, + "learning_rate": 2e-05, + "loss": 0.04007483, + "step": 4559 + }, + { + "epoch": 9.12, + "grad_norm": 2.296783924102783, + "learning_rate": 2e-05, + "loss": 0.04448222, + "step": 4560 + }, + { + "epoch": 9.122, + "grad_norm": 1.5085766315460205, + "learning_rate": 2e-05, + "loss": 0.03726997, + "step": 4561 + }, + { + "epoch": 9.124, + "grad_norm": 2.368339776992798, + "learning_rate": 2e-05, + "loss": 0.05795577, + "step": 4562 + }, + { + "epoch": 9.126, + "grad_norm": 1.6994584798812866, + "learning_rate": 2e-05, + "loss": 0.04269854, + "step": 4563 + }, + { + "epoch": 9.128, + "grad_norm": 1.7187951803207397, + "learning_rate": 2e-05, + "loss": 0.05323853, + "step": 4564 + }, + { + "epoch": 9.13, + "grad_norm": 2.053581953048706, + "learning_rate": 2e-05, + "loss": 0.0678404, + "step": 4565 + }, + { + "epoch": 9.132, + "grad_norm": 1.663622260093689, + "learning_rate": 2e-05, + "loss": 0.04938076, + 
"step": 4566 + }, + { + "epoch": 9.134, + "grad_norm": 2.431302785873413, + "learning_rate": 2e-05, + "loss": 0.08686221, + "step": 4567 + }, + { + "epoch": 9.136, + "grad_norm": 1.7747234106063843, + "learning_rate": 2e-05, + "loss": 0.06363907, + "step": 4568 + }, + { + "epoch": 9.138, + "grad_norm": 2.0806329250335693, + "learning_rate": 2e-05, + "loss": 0.06777623, + "step": 4569 + }, + { + "epoch": 9.14, + "grad_norm": 1.9430458545684814, + "learning_rate": 2e-05, + "loss": 0.05345959, + "step": 4570 + }, + { + "epoch": 9.142, + "grad_norm": 1.6341060400009155, + "learning_rate": 2e-05, + "loss": 0.06348822, + "step": 4571 + }, + { + "epoch": 9.144, + "grad_norm": 2.909865140914917, + "learning_rate": 2e-05, + "loss": 0.06166807, + "step": 4572 + }, + { + "epoch": 9.146, + "grad_norm": 2.0202066898345947, + "learning_rate": 2e-05, + "loss": 0.06222503, + "step": 4573 + }, + { + "epoch": 9.148, + "grad_norm": 1.8997862339019775, + "learning_rate": 2e-05, + "loss": 0.04913793, + "step": 4574 + }, + { + "epoch": 9.15, + "grad_norm": 1.336957573890686, + "learning_rate": 2e-05, + "loss": 0.07197899, + "step": 4575 + }, + { + "epoch": 9.152, + "grad_norm": 1.426227331161499, + "learning_rate": 2e-05, + "loss": 0.05871036, + "step": 4576 + }, + { + "epoch": 9.154, + "grad_norm": 1.507077693939209, + "learning_rate": 2e-05, + "loss": 0.04596441, + "step": 4577 + }, + { + "epoch": 9.156, + "grad_norm": 1.3802088499069214, + "learning_rate": 2e-05, + "loss": 0.04962367, + "step": 4578 + }, + { + "epoch": 9.158, + "grad_norm": 1.2211660146713257, + "learning_rate": 2e-05, + "loss": 0.04041466, + "step": 4579 + }, + { + "epoch": 9.16, + "grad_norm": 1.5546852350234985, + "learning_rate": 2e-05, + "loss": 0.06333128, + "step": 4580 + }, + { + "epoch": 9.162, + "grad_norm": 1.5927609205245972, + "learning_rate": 2e-05, + "loss": 0.05327647, + "step": 4581 + }, + { + "epoch": 9.164, + "grad_norm": 1.427751898765564, + "learning_rate": 2e-05, + "loss": 0.04664875, + "step": 4582 + }, + { + "epoch": 9.166, + "grad_norm": 1.6107282638549805, + "learning_rate": 2e-05, + "loss": 0.06387123, + "step": 4583 + }, + { + "epoch": 9.168, + "grad_norm": 2.082274913787842, + "learning_rate": 2e-05, + "loss": 0.07043574, + "step": 4584 + }, + { + "epoch": 9.17, + "grad_norm": 1.6113282442092896, + "learning_rate": 2e-05, + "loss": 0.05279049, + "step": 4585 + }, + { + "epoch": 9.172, + "grad_norm": 1.2764414548873901, + "learning_rate": 2e-05, + "loss": 0.05512537, + "step": 4586 + }, + { + "epoch": 9.174, + "grad_norm": 1.731431007385254, + "learning_rate": 2e-05, + "loss": 0.05489038, + "step": 4587 + }, + { + "epoch": 9.176, + "grad_norm": 2.2795941829681396, + "learning_rate": 2e-05, + "loss": 0.05099697, + "step": 4588 + }, + { + "epoch": 9.178, + "grad_norm": 1.9231106042861938, + "learning_rate": 2e-05, + "loss": 0.0609756, + "step": 4589 + }, + { + "epoch": 9.18, + "grad_norm": 1.3888475894927979, + "learning_rate": 2e-05, + "loss": 0.04252925, + "step": 4590 + }, + { + "epoch": 9.182, + "grad_norm": 1.7997218370437622, + "learning_rate": 2e-05, + "loss": 0.0603526, + "step": 4591 + }, + { + "epoch": 9.184, + "grad_norm": 2.605591297149658, + "learning_rate": 2e-05, + "loss": 0.05147631, + "step": 4592 + }, + { + "epoch": 9.186, + "grad_norm": 1.1323707103729248, + "learning_rate": 2e-05, + "loss": 0.03506859, + "step": 4593 + }, + { + "epoch": 9.188, + "grad_norm": 1.9055380821228027, + "learning_rate": 2e-05, + "loss": 0.05622992, + "step": 4594 + }, + { + "epoch": 9.19, + "grad_norm": 
1.66822350025177, + "learning_rate": 2e-05, + "loss": 0.047398, + "step": 4595 + }, + { + "epoch": 9.192, + "grad_norm": 1.536523461341858, + "learning_rate": 2e-05, + "loss": 0.05335706, + "step": 4596 + }, + { + "epoch": 9.194, + "grad_norm": 2.4968795776367188, + "learning_rate": 2e-05, + "loss": 0.04762807, + "step": 4597 + }, + { + "epoch": 9.196, + "grad_norm": 1.6647475957870483, + "learning_rate": 2e-05, + "loss": 0.06161051, + "step": 4598 + }, + { + "epoch": 9.198, + "grad_norm": 2.187264919281006, + "learning_rate": 2e-05, + "loss": 0.08325572, + "step": 4599 + }, + { + "epoch": 9.2, + "grad_norm": 1.3294416666030884, + "learning_rate": 2e-05, + "loss": 0.05907375, + "step": 4600 + }, + { + "epoch": 9.202, + "grad_norm": 1.3682829141616821, + "learning_rate": 2e-05, + "loss": 0.0561332, + "step": 4601 + }, + { + "epoch": 9.204, + "grad_norm": 3.3193435668945312, + "learning_rate": 2e-05, + "loss": 0.06492063, + "step": 4602 + }, + { + "epoch": 9.206, + "grad_norm": 2.0114078521728516, + "learning_rate": 2e-05, + "loss": 0.06080581, + "step": 4603 + }, + { + "epoch": 9.208, + "grad_norm": 1.9517841339111328, + "learning_rate": 2e-05, + "loss": 0.06941114, + "step": 4604 + }, + { + "epoch": 9.21, + "grad_norm": 1.9020707607269287, + "learning_rate": 2e-05, + "loss": 0.06470014, + "step": 4605 + }, + { + "epoch": 9.212, + "grad_norm": 1.7281121015548706, + "learning_rate": 2e-05, + "loss": 0.05396245, + "step": 4606 + }, + { + "epoch": 9.214, + "grad_norm": 3.0623199939727783, + "learning_rate": 2e-05, + "loss": 0.08490207, + "step": 4607 + }, + { + "epoch": 9.216, + "grad_norm": 2.61993670463562, + "learning_rate": 2e-05, + "loss": 0.06834012, + "step": 4608 + }, + { + "epoch": 9.218, + "grad_norm": 1.3250088691711426, + "learning_rate": 2e-05, + "loss": 0.04134344, + "step": 4609 + }, + { + "epoch": 9.22, + "grad_norm": 2.761884927749634, + "learning_rate": 2e-05, + "loss": 0.06493714, + "step": 4610 + }, + { + "epoch": 9.222, + "grad_norm": 1.54371178150177, + "learning_rate": 2e-05, + "loss": 0.05743696, + "step": 4611 + }, + { + "epoch": 9.224, + "grad_norm": 5.61605167388916, + "learning_rate": 2e-05, + "loss": 0.06084044, + "step": 4612 + }, + { + "epoch": 9.226, + "grad_norm": 1.3761494159698486, + "learning_rate": 2e-05, + "loss": 0.0450317, + "step": 4613 + }, + { + "epoch": 9.228, + "grad_norm": 1.8002984523773193, + "learning_rate": 2e-05, + "loss": 0.0667226, + "step": 4614 + }, + { + "epoch": 9.23, + "grad_norm": 1.532179355621338, + "learning_rate": 2e-05, + "loss": 0.0425299, + "step": 4615 + }, + { + "epoch": 9.232, + "grad_norm": 1.6311419010162354, + "learning_rate": 2e-05, + "loss": 0.04938279, + "step": 4616 + }, + { + "epoch": 9.234, + "grad_norm": 1.4770641326904297, + "learning_rate": 2e-05, + "loss": 0.06095012, + "step": 4617 + }, + { + "epoch": 9.236, + "grad_norm": 1.8100465536117554, + "learning_rate": 2e-05, + "loss": 0.06146353, + "step": 4618 + }, + { + "epoch": 9.238, + "grad_norm": 2.3167026042938232, + "learning_rate": 2e-05, + "loss": 0.05474924, + "step": 4619 + }, + { + "epoch": 9.24, + "grad_norm": 1.2826061248779297, + "learning_rate": 2e-05, + "loss": 0.04267112, + "step": 4620 + }, + { + "epoch": 9.242, + "grad_norm": 1.6880052089691162, + "learning_rate": 2e-05, + "loss": 0.05456819, + "step": 4621 + }, + { + "epoch": 9.244, + "grad_norm": 1.711029052734375, + "learning_rate": 2e-05, + "loss": 0.0621344, + "step": 4622 + }, + { + "epoch": 9.246, + "grad_norm": 2.999143600463867, + "learning_rate": 2e-05, + "loss": 0.08942266, + "step": 
4623 + }, + { + "epoch": 9.248, + "grad_norm": 3.74527645111084, + "learning_rate": 2e-05, + "loss": 0.05644033, + "step": 4624 + }, + { + "epoch": 9.25, + "grad_norm": 1.9168704748153687, + "learning_rate": 2e-05, + "loss": 0.03920052, + "step": 4625 + }, + { + "epoch": 9.252, + "grad_norm": 1.4764798879623413, + "learning_rate": 2e-05, + "loss": 0.06123003, + "step": 4626 + }, + { + "epoch": 9.254, + "grad_norm": 1.1448794603347778, + "learning_rate": 2e-05, + "loss": 0.04291178, + "step": 4627 + }, + { + "epoch": 9.256, + "grad_norm": 1.2735017538070679, + "learning_rate": 2e-05, + "loss": 0.0421702, + "step": 4628 + }, + { + "epoch": 9.258, + "grad_norm": 1.3639416694641113, + "learning_rate": 2e-05, + "loss": 0.04493828, + "step": 4629 + }, + { + "epoch": 9.26, + "grad_norm": 1.3818625211715698, + "learning_rate": 2e-05, + "loss": 0.04133046, + "step": 4630 + }, + { + "epoch": 9.262, + "grad_norm": 1.436888575553894, + "learning_rate": 2e-05, + "loss": 0.05594487, + "step": 4631 + }, + { + "epoch": 9.264, + "grad_norm": 1.6045076847076416, + "learning_rate": 2e-05, + "loss": 0.07328442, + "step": 4632 + }, + { + "epoch": 9.266, + "grad_norm": 1.4170843362808228, + "learning_rate": 2e-05, + "loss": 0.04571041, + "step": 4633 + }, + { + "epoch": 9.268, + "grad_norm": 1.7131657600402832, + "learning_rate": 2e-05, + "loss": 0.06080949, + "step": 4634 + }, + { + "epoch": 9.27, + "grad_norm": 1.8091500997543335, + "learning_rate": 2e-05, + "loss": 0.0525728, + "step": 4635 + }, + { + "epoch": 9.272, + "grad_norm": 1.8526278734207153, + "learning_rate": 2e-05, + "loss": 0.06190364, + "step": 4636 + }, + { + "epoch": 9.274000000000001, + "grad_norm": 1.4877046346664429, + "learning_rate": 2e-05, + "loss": 0.06286225, + "step": 4637 + }, + { + "epoch": 9.276, + "grad_norm": 1.772066354751587, + "learning_rate": 2e-05, + "loss": 0.04873686, + "step": 4638 + }, + { + "epoch": 9.278, + "grad_norm": 2.045987606048584, + "learning_rate": 2e-05, + "loss": 0.06090255, + "step": 4639 + }, + { + "epoch": 9.28, + "grad_norm": 1.1641405820846558, + "learning_rate": 2e-05, + "loss": 0.03027676, + "step": 4640 + }, + { + "epoch": 9.282, + "grad_norm": 1.6448951959609985, + "learning_rate": 2e-05, + "loss": 0.0530159, + "step": 4641 + }, + { + "epoch": 9.284, + "grad_norm": 1.8884637355804443, + "learning_rate": 2e-05, + "loss": 0.0639272, + "step": 4642 + }, + { + "epoch": 9.286, + "grad_norm": 1.355390191078186, + "learning_rate": 2e-05, + "loss": 0.04228202, + "step": 4643 + }, + { + "epoch": 9.288, + "grad_norm": 2.2434418201446533, + "learning_rate": 2e-05, + "loss": 0.07046621, + "step": 4644 + }, + { + "epoch": 9.29, + "grad_norm": 2.844688892364502, + "learning_rate": 2e-05, + "loss": 0.08373692, + "step": 4645 + }, + { + "epoch": 9.292, + "grad_norm": 1.803123950958252, + "learning_rate": 2e-05, + "loss": 0.05633461, + "step": 4646 + }, + { + "epoch": 9.294, + "grad_norm": 2.8948237895965576, + "learning_rate": 2e-05, + "loss": 0.05847919, + "step": 4647 + }, + { + "epoch": 9.296, + "grad_norm": 2.4830639362335205, + "learning_rate": 2e-05, + "loss": 0.05488868, + "step": 4648 + }, + { + "epoch": 9.298, + "grad_norm": 1.4770337343215942, + "learning_rate": 2e-05, + "loss": 0.04579071, + "step": 4649 + }, + { + "epoch": 9.3, + "grad_norm": 2.6344194412231445, + "learning_rate": 2e-05, + "loss": 0.05781864, + "step": 4650 + }, + { + "epoch": 9.302, + "grad_norm": 1.8816792964935303, + "learning_rate": 2e-05, + "loss": 0.05238046, + "step": 4651 + }, + { + "epoch": 9.304, + "grad_norm": 
1.3099342584609985, + "learning_rate": 2e-05, + "loss": 0.04636721, + "step": 4652 + }, + { + "epoch": 9.306, + "grad_norm": 2.569141149520874, + "learning_rate": 2e-05, + "loss": 0.05769484, + "step": 4653 + }, + { + "epoch": 9.308, + "grad_norm": 1.5715465545654297, + "learning_rate": 2e-05, + "loss": 0.05822673, + "step": 4654 + }, + { + "epoch": 9.31, + "grad_norm": 2.693352222442627, + "learning_rate": 2e-05, + "loss": 0.08047836, + "step": 4655 + }, + { + "epoch": 9.312, + "grad_norm": 2.2986159324645996, + "learning_rate": 2e-05, + "loss": 0.06082643, + "step": 4656 + }, + { + "epoch": 9.314, + "grad_norm": 1.2851147651672363, + "learning_rate": 2e-05, + "loss": 0.04421428, + "step": 4657 + }, + { + "epoch": 9.316, + "grad_norm": 2.6191282272338867, + "learning_rate": 2e-05, + "loss": 0.05954355, + "step": 4658 + }, + { + "epoch": 9.318, + "grad_norm": 1.7637144327163696, + "learning_rate": 2e-05, + "loss": 0.06339554, + "step": 4659 + }, + { + "epoch": 9.32, + "grad_norm": 2.729525089263916, + "learning_rate": 2e-05, + "loss": 0.07117513, + "step": 4660 + }, + { + "epoch": 9.322, + "grad_norm": 1.4302338361740112, + "learning_rate": 2e-05, + "loss": 0.04973509, + "step": 4661 + }, + { + "epoch": 9.324, + "grad_norm": 1.4821285009384155, + "learning_rate": 2e-05, + "loss": 0.05108538, + "step": 4662 + }, + { + "epoch": 9.326, + "grad_norm": 1.5328965187072754, + "learning_rate": 2e-05, + "loss": 0.05789268, + "step": 4663 + }, + { + "epoch": 9.328, + "grad_norm": 1.9559437036514282, + "learning_rate": 2e-05, + "loss": 0.06492808, + "step": 4664 + }, + { + "epoch": 9.33, + "grad_norm": 1.2809813022613525, + "learning_rate": 2e-05, + "loss": 0.04128128, + "step": 4665 + }, + { + "epoch": 9.332, + "grad_norm": 3.450357675552368, + "learning_rate": 2e-05, + "loss": 0.05096133, + "step": 4666 + }, + { + "epoch": 9.334, + "grad_norm": 2.293715238571167, + "learning_rate": 2e-05, + "loss": 0.03788307, + "step": 4667 + }, + { + "epoch": 9.336, + "grad_norm": 1.2826162576675415, + "learning_rate": 2e-05, + "loss": 0.05124827, + "step": 4668 + }, + { + "epoch": 9.338, + "grad_norm": 1.501015305519104, + "learning_rate": 2e-05, + "loss": 0.04789577, + "step": 4669 + }, + { + "epoch": 9.34, + "grad_norm": 2.1823089122772217, + "learning_rate": 2e-05, + "loss": 0.06318521, + "step": 4670 + }, + { + "epoch": 9.342, + "grad_norm": 1.5042678117752075, + "learning_rate": 2e-05, + "loss": 0.0364994, + "step": 4671 + }, + { + "epoch": 9.344, + "grad_norm": 1.1942931413650513, + "learning_rate": 2e-05, + "loss": 0.04444024, + "step": 4672 + }, + { + "epoch": 9.346, + "grad_norm": 1.6051136255264282, + "learning_rate": 2e-05, + "loss": 0.06005847, + "step": 4673 + }, + { + "epoch": 9.348, + "grad_norm": 1.2597368955612183, + "learning_rate": 2e-05, + "loss": 0.04411948, + "step": 4674 + }, + { + "epoch": 9.35, + "grad_norm": 1.3099509477615356, + "learning_rate": 2e-05, + "loss": 0.05771332, + "step": 4675 + }, + { + "epoch": 9.352, + "grad_norm": 2.081918954849243, + "learning_rate": 2e-05, + "loss": 0.04082836, + "step": 4676 + }, + { + "epoch": 9.354, + "grad_norm": 1.5217939615249634, + "learning_rate": 2e-05, + "loss": 0.05541312, + "step": 4677 + }, + { + "epoch": 9.356, + "grad_norm": 1.5885696411132812, + "learning_rate": 2e-05, + "loss": 0.06058114, + "step": 4678 + }, + { + "epoch": 9.358, + "grad_norm": 1.2582283020019531, + "learning_rate": 2e-05, + "loss": 0.02798801, + "step": 4679 + }, + { + "epoch": 9.36, + "grad_norm": 1.9092825651168823, + "learning_rate": 2e-05, + "loss": 0.06019748, 
+ "step": 4680 + }, + { + "epoch": 9.362, + "grad_norm": 3.1406259536743164, + "learning_rate": 2e-05, + "loss": 0.04014409, + "step": 4681 + }, + { + "epoch": 9.364, + "grad_norm": 1.404429316520691, + "learning_rate": 2e-05, + "loss": 0.04144133, + "step": 4682 + }, + { + "epoch": 9.366, + "grad_norm": 3.4135115146636963, + "learning_rate": 2e-05, + "loss": 0.06071433, + "step": 4683 + }, + { + "epoch": 9.368, + "grad_norm": 2.7143144607543945, + "learning_rate": 2e-05, + "loss": 0.06220663, + "step": 4684 + }, + { + "epoch": 9.37, + "grad_norm": 1.7465615272521973, + "learning_rate": 2e-05, + "loss": 0.05392168, + "step": 4685 + }, + { + "epoch": 9.372, + "grad_norm": 2.8406643867492676, + "learning_rate": 2e-05, + "loss": 0.09703434, + "step": 4686 + }, + { + "epoch": 9.374, + "grad_norm": 1.889456868171692, + "learning_rate": 2e-05, + "loss": 0.04023182, + "step": 4687 + }, + { + "epoch": 9.376, + "grad_norm": 1.769338846206665, + "learning_rate": 2e-05, + "loss": 0.04535776, + "step": 4688 + }, + { + "epoch": 9.378, + "grad_norm": 1.3787952661514282, + "learning_rate": 2e-05, + "loss": 0.05142323, + "step": 4689 + }, + { + "epoch": 9.38, + "grad_norm": 1.353203296661377, + "learning_rate": 2e-05, + "loss": 0.04485345, + "step": 4690 + }, + { + "epoch": 9.382, + "grad_norm": 1.3988724946975708, + "learning_rate": 2e-05, + "loss": 0.03410086, + "step": 4691 + }, + { + "epoch": 9.384, + "grad_norm": 1.2784860134124756, + "learning_rate": 2e-05, + "loss": 0.05209333, + "step": 4692 + }, + { + "epoch": 9.386, + "grad_norm": 1.9616296291351318, + "learning_rate": 2e-05, + "loss": 0.0617491, + "step": 4693 + }, + { + "epoch": 9.388, + "grad_norm": 2.1018028259277344, + "learning_rate": 2e-05, + "loss": 0.06020827, + "step": 4694 + }, + { + "epoch": 9.39, + "grad_norm": 1.3357975482940674, + "learning_rate": 2e-05, + "loss": 0.03462115, + "step": 4695 + }, + { + "epoch": 9.392, + "grad_norm": 2.2010691165924072, + "learning_rate": 2e-05, + "loss": 0.05162614, + "step": 4696 + }, + { + "epoch": 9.394, + "grad_norm": 1.4281353950500488, + "learning_rate": 2e-05, + "loss": 0.04901909, + "step": 4697 + }, + { + "epoch": 9.396, + "grad_norm": 1.978698492050171, + "learning_rate": 2e-05, + "loss": 0.06718802, + "step": 4698 + }, + { + "epoch": 9.398, + "grad_norm": 1.465959906578064, + "learning_rate": 2e-05, + "loss": 0.03534799, + "step": 4699 + }, + { + "epoch": 9.4, + "grad_norm": 1.7106237411499023, + "learning_rate": 2e-05, + "loss": 0.06189927, + "step": 4700 + }, + { + "epoch": 9.402, + "grad_norm": 1.349387526512146, + "learning_rate": 2e-05, + "loss": 0.04897711, + "step": 4701 + }, + { + "epoch": 9.404, + "grad_norm": 1.9171662330627441, + "learning_rate": 2e-05, + "loss": 0.04699158, + "step": 4702 + }, + { + "epoch": 9.406, + "grad_norm": 2.517526388168335, + "learning_rate": 2e-05, + "loss": 0.07169571, + "step": 4703 + }, + { + "epoch": 9.408, + "grad_norm": 1.851466417312622, + "learning_rate": 2e-05, + "loss": 0.04784168, + "step": 4704 + }, + { + "epoch": 9.41, + "grad_norm": 2.378621816635132, + "learning_rate": 2e-05, + "loss": 0.04365069, + "step": 4705 + }, + { + "epoch": 9.412, + "grad_norm": 1.615919828414917, + "learning_rate": 2e-05, + "loss": 0.05020372, + "step": 4706 + }, + { + "epoch": 9.414, + "grad_norm": 1.691772222518921, + "learning_rate": 2e-05, + "loss": 0.04145588, + "step": 4707 + }, + { + "epoch": 9.416, + "grad_norm": 3.0600521564483643, + "learning_rate": 2e-05, + "loss": 0.04704102, + "step": 4708 + }, + { + "epoch": 9.418, + "grad_norm": 
1.2809059619903564, + "learning_rate": 2e-05, + "loss": 0.04573686, + "step": 4709 + }, + { + "epoch": 9.42, + "grad_norm": 1.2681528329849243, + "learning_rate": 2e-05, + "loss": 0.04140832, + "step": 4710 + }, + { + "epoch": 9.422, + "grad_norm": 1.5011893510818481, + "learning_rate": 2e-05, + "loss": 0.04901879, + "step": 4711 + }, + { + "epoch": 9.424, + "grad_norm": 1.7330307960510254, + "learning_rate": 2e-05, + "loss": 0.05393284, + "step": 4712 + }, + { + "epoch": 9.426, + "grad_norm": 1.1501388549804688, + "learning_rate": 2e-05, + "loss": 0.04429302, + "step": 4713 + }, + { + "epoch": 9.428, + "grad_norm": 1.5243293046951294, + "learning_rate": 2e-05, + "loss": 0.06208751, + "step": 4714 + }, + { + "epoch": 9.43, + "grad_norm": 1.6041468381881714, + "learning_rate": 2e-05, + "loss": 0.05365137, + "step": 4715 + }, + { + "epoch": 9.432, + "grad_norm": 1.411880612373352, + "learning_rate": 2e-05, + "loss": 0.04755591, + "step": 4716 + }, + { + "epoch": 9.434, + "grad_norm": 1.7034417390823364, + "learning_rate": 2e-05, + "loss": 0.05169332, + "step": 4717 + }, + { + "epoch": 9.436, + "grad_norm": 1.398370623588562, + "learning_rate": 2e-05, + "loss": 0.04505457, + "step": 4718 + }, + { + "epoch": 9.438, + "grad_norm": 1.618485927581787, + "learning_rate": 2e-05, + "loss": 0.05048877, + "step": 4719 + }, + { + "epoch": 9.44, + "grad_norm": 2.2976009845733643, + "learning_rate": 2e-05, + "loss": 0.05732608, + "step": 4720 + }, + { + "epoch": 9.442, + "grad_norm": 1.4977824687957764, + "learning_rate": 2e-05, + "loss": 0.05055287, + "step": 4721 + }, + { + "epoch": 9.444, + "grad_norm": 2.0112006664276123, + "learning_rate": 2e-05, + "loss": 0.05900105, + "step": 4722 + }, + { + "epoch": 9.446, + "grad_norm": 1.2168583869934082, + "learning_rate": 2e-05, + "loss": 0.04659557, + "step": 4723 + }, + { + "epoch": 9.448, + "grad_norm": 1.8147743940353394, + "learning_rate": 2e-05, + "loss": 0.04836474, + "step": 4724 + }, + { + "epoch": 9.45, + "grad_norm": 2.1160666942596436, + "learning_rate": 2e-05, + "loss": 0.06309254, + "step": 4725 + }, + { + "epoch": 9.452, + "grad_norm": 1.0232397317886353, + "learning_rate": 2e-05, + "loss": 0.03620723, + "step": 4726 + }, + { + "epoch": 9.454, + "grad_norm": 1.4367437362670898, + "learning_rate": 2e-05, + "loss": 0.04998949, + "step": 4727 + }, + { + "epoch": 9.456, + "grad_norm": 1.9640501737594604, + "learning_rate": 2e-05, + "loss": 0.07069734, + "step": 4728 + }, + { + "epoch": 9.458, + "grad_norm": 4.114424228668213, + "learning_rate": 2e-05, + "loss": 0.09708965, + "step": 4729 + }, + { + "epoch": 9.46, + "grad_norm": 1.8628021478652954, + "learning_rate": 2e-05, + "loss": 0.0480114, + "step": 4730 + }, + { + "epoch": 9.462, + "grad_norm": 1.896959662437439, + "learning_rate": 2e-05, + "loss": 0.057604, + "step": 4731 + }, + { + "epoch": 9.464, + "grad_norm": 1.6538161039352417, + "learning_rate": 2e-05, + "loss": 0.05080595, + "step": 4732 + }, + { + "epoch": 9.466, + "grad_norm": 1.931830883026123, + "learning_rate": 2e-05, + "loss": 0.053108, + "step": 4733 + }, + { + "epoch": 9.468, + "grad_norm": 1.452933430671692, + "learning_rate": 2e-05, + "loss": 0.041306, + "step": 4734 + }, + { + "epoch": 9.47, + "grad_norm": 1.8941861391067505, + "learning_rate": 2e-05, + "loss": 0.05748565, + "step": 4735 + }, + { + "epoch": 9.472, + "grad_norm": 1.3065882921218872, + "learning_rate": 2e-05, + "loss": 0.05725362, + "step": 4736 + }, + { + "epoch": 9.474, + "grad_norm": 1.3110909461975098, + "learning_rate": 2e-05, + "loss": 0.04384652, + 
"step": 4737 + }, + { + "epoch": 9.475999999999999, + "grad_norm": 1.1904391050338745, + "learning_rate": 2e-05, + "loss": 0.0409494, + "step": 4738 + }, + { + "epoch": 9.478, + "grad_norm": 1.359976053237915, + "learning_rate": 2e-05, + "loss": 0.05123883, + "step": 4739 + }, + { + "epoch": 9.48, + "grad_norm": 1.628815770149231, + "learning_rate": 2e-05, + "loss": 0.0685708, + "step": 4740 + }, + { + "epoch": 9.482, + "grad_norm": 1.5649138689041138, + "learning_rate": 2e-05, + "loss": 0.0603338, + "step": 4741 + }, + { + "epoch": 9.484, + "grad_norm": 1.9411360025405884, + "learning_rate": 2e-05, + "loss": 0.05299631, + "step": 4742 + }, + { + "epoch": 9.486, + "grad_norm": 2.5453522205352783, + "learning_rate": 2e-05, + "loss": 0.04924603, + "step": 4743 + }, + { + "epoch": 9.488, + "grad_norm": 1.5162389278411865, + "learning_rate": 2e-05, + "loss": 0.04518431, + "step": 4744 + }, + { + "epoch": 9.49, + "grad_norm": 1.0650982856750488, + "learning_rate": 2e-05, + "loss": 0.03788474, + "step": 4745 + }, + { + "epoch": 9.492, + "grad_norm": 1.4776169061660767, + "learning_rate": 2e-05, + "loss": 0.04502506, + "step": 4746 + }, + { + "epoch": 9.494, + "grad_norm": 4.540150165557861, + "learning_rate": 2e-05, + "loss": 0.06305524, + "step": 4747 + }, + { + "epoch": 9.496, + "grad_norm": 1.8420084714889526, + "learning_rate": 2e-05, + "loss": 0.05973725, + "step": 4748 + }, + { + "epoch": 9.498, + "grad_norm": 1.177133321762085, + "learning_rate": 2e-05, + "loss": 0.04210076, + "step": 4749 + }, + { + "epoch": 9.5, + "grad_norm": 1.4064905643463135, + "learning_rate": 2e-05, + "loss": 0.04571437, + "step": 4750 + }, + { + "epoch": 9.502, + "grad_norm": 1.7198388576507568, + "learning_rate": 2e-05, + "loss": 0.04831848, + "step": 4751 + }, + { + "epoch": 9.504, + "grad_norm": 1.373218297958374, + "learning_rate": 2e-05, + "loss": 0.04200636, + "step": 4752 + }, + { + "epoch": 9.506, + "grad_norm": 2.0869367122650146, + "learning_rate": 2e-05, + "loss": 0.05860278, + "step": 4753 + }, + { + "epoch": 9.508, + "grad_norm": 2.03802227973938, + "learning_rate": 2e-05, + "loss": 0.04599741, + "step": 4754 + }, + { + "epoch": 9.51, + "grad_norm": 2.718126058578491, + "learning_rate": 2e-05, + "loss": 0.05679199, + "step": 4755 + }, + { + "epoch": 9.512, + "grad_norm": 2.1345672607421875, + "learning_rate": 2e-05, + "loss": 0.07050245, + "step": 4756 + }, + { + "epoch": 9.514, + "grad_norm": 3.7088184356689453, + "learning_rate": 2e-05, + "loss": 0.07732576, + "step": 4757 + }, + { + "epoch": 9.516, + "grad_norm": 1.6699403524398804, + "learning_rate": 2e-05, + "loss": 0.04383042, + "step": 4758 + }, + { + "epoch": 9.518, + "grad_norm": 1.5457981824874878, + "learning_rate": 2e-05, + "loss": 0.03807002, + "step": 4759 + }, + { + "epoch": 9.52, + "grad_norm": 1.9819368124008179, + "learning_rate": 2e-05, + "loss": 0.07731517, + "step": 4760 + }, + { + "epoch": 9.522, + "grad_norm": 2.0579514503479004, + "learning_rate": 2e-05, + "loss": 0.06062069, + "step": 4761 + }, + { + "epoch": 9.524000000000001, + "grad_norm": 1.8381377458572388, + "learning_rate": 2e-05, + "loss": 0.04691198, + "step": 4762 + }, + { + "epoch": 9.526, + "grad_norm": 2.709359645843506, + "learning_rate": 2e-05, + "loss": 0.05731576, + "step": 4763 + }, + { + "epoch": 9.528, + "grad_norm": 1.4428166151046753, + "learning_rate": 2e-05, + "loss": 0.04605105, + "step": 4764 + }, + { + "epoch": 9.53, + "grad_norm": 1.5184147357940674, + "learning_rate": 2e-05, + "loss": 0.05696104, + "step": 4765 + }, + { + "epoch": 9.532, + 
"grad_norm": 1.3034058809280396, + "learning_rate": 2e-05, + "loss": 0.03874236, + "step": 4766 + }, + { + "epoch": 9.534, + "grad_norm": 1.7730201482772827, + "learning_rate": 2e-05, + "loss": 0.05022582, + "step": 4767 + }, + { + "epoch": 9.536, + "grad_norm": 1.5613415241241455, + "learning_rate": 2e-05, + "loss": 0.06682578, + "step": 4768 + }, + { + "epoch": 9.538, + "grad_norm": 1.9399430751800537, + "learning_rate": 2e-05, + "loss": 0.05712492, + "step": 4769 + }, + { + "epoch": 9.54, + "grad_norm": 2.3643603324890137, + "learning_rate": 2e-05, + "loss": 0.0664971, + "step": 4770 + }, + { + "epoch": 9.542, + "grad_norm": 1.1537812948226929, + "learning_rate": 2e-05, + "loss": 0.04907678, + "step": 4771 + }, + { + "epoch": 9.544, + "grad_norm": 1.476564884185791, + "learning_rate": 2e-05, + "loss": 0.05081899, + "step": 4772 + }, + { + "epoch": 9.546, + "grad_norm": 1.345429539680481, + "learning_rate": 2e-05, + "loss": 0.0485204, + "step": 4773 + }, + { + "epoch": 9.548, + "grad_norm": 1.3945995569229126, + "learning_rate": 2e-05, + "loss": 0.05414914, + "step": 4774 + }, + { + "epoch": 9.55, + "grad_norm": 1.2747801542282104, + "learning_rate": 2e-05, + "loss": 0.04904943, + "step": 4775 + }, + { + "epoch": 9.552, + "grad_norm": 1.364349603652954, + "learning_rate": 2e-05, + "loss": 0.05541695, + "step": 4776 + }, + { + "epoch": 9.554, + "grad_norm": 1.40326726436615, + "learning_rate": 2e-05, + "loss": 0.0485206, + "step": 4777 + }, + { + "epoch": 9.556000000000001, + "grad_norm": 2.4218220710754395, + "learning_rate": 2e-05, + "loss": 0.05936872, + "step": 4778 + }, + { + "epoch": 9.558, + "grad_norm": 1.5374068021774292, + "learning_rate": 2e-05, + "loss": 0.03438132, + "step": 4779 + }, + { + "epoch": 9.56, + "grad_norm": 2.0976953506469727, + "learning_rate": 2e-05, + "loss": 0.04670816, + "step": 4780 + }, + { + "epoch": 9.562, + "grad_norm": 1.6651930809020996, + "learning_rate": 2e-05, + "loss": 0.04262748, + "step": 4781 + }, + { + "epoch": 9.564, + "grad_norm": 1.433442234992981, + "learning_rate": 2e-05, + "loss": 0.05769576, + "step": 4782 + }, + { + "epoch": 9.566, + "grad_norm": 2.727137565612793, + "learning_rate": 2e-05, + "loss": 0.05369073, + "step": 4783 + }, + { + "epoch": 9.568, + "grad_norm": 2.403437852859497, + "learning_rate": 2e-05, + "loss": 0.05518568, + "step": 4784 + }, + { + "epoch": 9.57, + "grad_norm": 1.5393919944763184, + "learning_rate": 2e-05, + "loss": 0.05311519, + "step": 4785 + }, + { + "epoch": 9.572, + "grad_norm": 1.650131344795227, + "learning_rate": 2e-05, + "loss": 0.05369199, + "step": 4786 + }, + { + "epoch": 9.574, + "grad_norm": 1.6887848377227783, + "learning_rate": 2e-05, + "loss": 0.04836053, + "step": 4787 + }, + { + "epoch": 9.576, + "grad_norm": 1.3404792547225952, + "learning_rate": 2e-05, + "loss": 0.05586416, + "step": 4788 + }, + { + "epoch": 9.578, + "grad_norm": 1.5387517213821411, + "learning_rate": 2e-05, + "loss": 0.05821295, + "step": 4789 + }, + { + "epoch": 9.58, + "grad_norm": 1.3446617126464844, + "learning_rate": 2e-05, + "loss": 0.04860961, + "step": 4790 + }, + { + "epoch": 9.582, + "grad_norm": 1.9128057956695557, + "learning_rate": 2e-05, + "loss": 0.06139975, + "step": 4791 + }, + { + "epoch": 9.584, + "grad_norm": 1.8623543977737427, + "learning_rate": 2e-05, + "loss": 0.04838707, + "step": 4792 + }, + { + "epoch": 9.586, + "grad_norm": 2.0884079933166504, + "learning_rate": 2e-05, + "loss": 0.05497058, + "step": 4793 + }, + { + "epoch": 9.588, + "grad_norm": 1.26129949092865, + "learning_rate": 2e-05, + 
"loss": 0.05032674, + "step": 4794 + }, + { + "epoch": 9.59, + "grad_norm": 1.5007047653198242, + "learning_rate": 2e-05, + "loss": 0.0500331, + "step": 4795 + }, + { + "epoch": 9.592, + "grad_norm": 2.1904873847961426, + "learning_rate": 2e-05, + "loss": 0.05238812, + "step": 4796 + }, + { + "epoch": 9.594, + "grad_norm": 2.0764124393463135, + "learning_rate": 2e-05, + "loss": 0.05428065, + "step": 4797 + }, + { + "epoch": 9.596, + "grad_norm": 1.8335858583450317, + "learning_rate": 2e-05, + "loss": 0.05815455, + "step": 4798 + }, + { + "epoch": 9.598, + "grad_norm": 1.186977744102478, + "learning_rate": 2e-05, + "loss": 0.04371549, + "step": 4799 + }, + { + "epoch": 9.6, + "grad_norm": 6.732235908508301, + "learning_rate": 2e-05, + "loss": 0.06942256, + "step": 4800 + }, + { + "epoch": 9.602, + "grad_norm": 2.0716724395751953, + "learning_rate": 2e-05, + "loss": 0.0744697, + "step": 4801 + }, + { + "epoch": 9.604, + "grad_norm": 1.558214783668518, + "learning_rate": 2e-05, + "loss": 0.06264573, + "step": 4802 + }, + { + "epoch": 9.606, + "grad_norm": 4.206948280334473, + "learning_rate": 2e-05, + "loss": 0.055962, + "step": 4803 + }, + { + "epoch": 9.608, + "grad_norm": 1.3140082359313965, + "learning_rate": 2e-05, + "loss": 0.04739345, + "step": 4804 + }, + { + "epoch": 9.61, + "grad_norm": 1.58623468875885, + "learning_rate": 2e-05, + "loss": 0.0550376, + "step": 4805 + }, + { + "epoch": 9.612, + "grad_norm": 1.9726959466934204, + "learning_rate": 2e-05, + "loss": 0.07566231, + "step": 4806 + }, + { + "epoch": 9.614, + "grad_norm": 1.6119412183761597, + "learning_rate": 2e-05, + "loss": 0.05679318, + "step": 4807 + }, + { + "epoch": 9.616, + "grad_norm": 1.3407459259033203, + "learning_rate": 2e-05, + "loss": 0.04322591, + "step": 4808 + }, + { + "epoch": 9.618, + "grad_norm": 1.3803037405014038, + "learning_rate": 2e-05, + "loss": 0.04514508, + "step": 4809 + }, + { + "epoch": 9.62, + "grad_norm": 1.5106648206710815, + "learning_rate": 2e-05, + "loss": 0.06446012, + "step": 4810 + }, + { + "epoch": 9.622, + "grad_norm": 1.2000592947006226, + "learning_rate": 2e-05, + "loss": 0.04472655, + "step": 4811 + }, + { + "epoch": 9.624, + "grad_norm": 3.254822015762329, + "learning_rate": 2e-05, + "loss": 0.07974194, + "step": 4812 + }, + { + "epoch": 9.626, + "grad_norm": 1.7189629077911377, + "learning_rate": 2e-05, + "loss": 0.05568657, + "step": 4813 + }, + { + "epoch": 9.628, + "grad_norm": 1.4042153358459473, + "learning_rate": 2e-05, + "loss": 0.04132761, + "step": 4814 + }, + { + "epoch": 9.63, + "grad_norm": 2.279066562652588, + "learning_rate": 2e-05, + "loss": 0.04985248, + "step": 4815 + }, + { + "epoch": 9.632, + "grad_norm": 1.4138849973678589, + "learning_rate": 2e-05, + "loss": 0.04461198, + "step": 4816 + }, + { + "epoch": 9.634, + "grad_norm": 2.0648868083953857, + "learning_rate": 2e-05, + "loss": 0.04070833, + "step": 4817 + }, + { + "epoch": 9.636, + "grad_norm": 3.6020028591156006, + "learning_rate": 2e-05, + "loss": 0.05208477, + "step": 4818 + }, + { + "epoch": 9.638, + "grad_norm": 1.8781304359436035, + "learning_rate": 2e-05, + "loss": 0.05021086, + "step": 4819 + }, + { + "epoch": 9.64, + "grad_norm": 1.0614352226257324, + "learning_rate": 2e-05, + "loss": 0.03605043, + "step": 4820 + }, + { + "epoch": 9.642, + "grad_norm": 1.910072684288025, + "learning_rate": 2e-05, + "loss": 0.06666766, + "step": 4821 + }, + { + "epoch": 9.644, + "grad_norm": 1.2410168647766113, + "learning_rate": 2e-05, + "loss": 0.04226237, + "step": 4822 + }, + { + "epoch": 9.646, + 
"grad_norm": 1.6553242206573486, + "learning_rate": 2e-05, + "loss": 0.05368885, + "step": 4823 + }, + { + "epoch": 9.648, + "grad_norm": 1.3068299293518066, + "learning_rate": 2e-05, + "loss": 0.03962949, + "step": 4824 + }, + { + "epoch": 9.65, + "grad_norm": 2.591618537902832, + "learning_rate": 2e-05, + "loss": 0.06662943, + "step": 4825 + }, + { + "epoch": 9.652, + "grad_norm": 2.577542781829834, + "learning_rate": 2e-05, + "loss": 0.06383407, + "step": 4826 + }, + { + "epoch": 9.654, + "grad_norm": 1.6264952421188354, + "learning_rate": 2e-05, + "loss": 0.0523547, + "step": 4827 + }, + { + "epoch": 9.656, + "grad_norm": 2.4551913738250732, + "learning_rate": 2e-05, + "loss": 0.04400642, + "step": 4828 + }, + { + "epoch": 9.658, + "grad_norm": 3.1886560916900635, + "learning_rate": 2e-05, + "loss": 0.07130433, + "step": 4829 + }, + { + "epoch": 9.66, + "grad_norm": 1.5255032777786255, + "learning_rate": 2e-05, + "loss": 0.04827879, + "step": 4830 + }, + { + "epoch": 9.662, + "grad_norm": 2.4944541454315186, + "learning_rate": 2e-05, + "loss": 0.07604385, + "step": 4831 + }, + { + "epoch": 9.664, + "grad_norm": 1.9444361925125122, + "learning_rate": 2e-05, + "loss": 0.04895295, + "step": 4832 + }, + { + "epoch": 9.666, + "grad_norm": 1.206829309463501, + "learning_rate": 2e-05, + "loss": 0.04027706, + "step": 4833 + }, + { + "epoch": 9.668, + "grad_norm": 1.0655403137207031, + "learning_rate": 2e-05, + "loss": 0.03361625, + "step": 4834 + }, + { + "epoch": 9.67, + "grad_norm": 1.6680309772491455, + "learning_rate": 2e-05, + "loss": 0.05498701, + "step": 4835 + }, + { + "epoch": 9.672, + "grad_norm": 1.6686928272247314, + "learning_rate": 2e-05, + "loss": 0.05405208, + "step": 4836 + }, + { + "epoch": 9.674, + "grad_norm": 1.5973371267318726, + "learning_rate": 2e-05, + "loss": 0.03787687, + "step": 4837 + }, + { + "epoch": 9.676, + "grad_norm": 1.5057041645050049, + "learning_rate": 2e-05, + "loss": 0.05292775, + "step": 4838 + }, + { + "epoch": 9.678, + "grad_norm": 2.048788547515869, + "learning_rate": 2e-05, + "loss": 0.05188146, + "step": 4839 + }, + { + "epoch": 9.68, + "grad_norm": 1.5165146589279175, + "learning_rate": 2e-05, + "loss": 0.05076149, + "step": 4840 + }, + { + "epoch": 9.682, + "grad_norm": 1.3067723512649536, + "learning_rate": 2e-05, + "loss": 0.04745375, + "step": 4841 + }, + { + "epoch": 9.684, + "grad_norm": 2.40364670753479, + "learning_rate": 2e-05, + "loss": 0.06933152, + "step": 4842 + }, + { + "epoch": 9.686, + "grad_norm": 2.187824010848999, + "learning_rate": 2e-05, + "loss": 0.06234965, + "step": 4843 + }, + { + "epoch": 9.688, + "grad_norm": 2.3751494884490967, + "learning_rate": 2e-05, + "loss": 0.05942409, + "step": 4844 + }, + { + "epoch": 9.69, + "grad_norm": 1.3556272983551025, + "learning_rate": 2e-05, + "loss": 0.0271932, + "step": 4845 + }, + { + "epoch": 9.692, + "grad_norm": 2.0702574253082275, + "learning_rate": 2e-05, + "loss": 0.04416346, + "step": 4846 + }, + { + "epoch": 9.693999999999999, + "grad_norm": 2.776580572128296, + "learning_rate": 2e-05, + "loss": 0.06459181, + "step": 4847 + }, + { + "epoch": 9.696, + "grad_norm": 1.423103928565979, + "learning_rate": 2e-05, + "loss": 0.04712578, + "step": 4848 + }, + { + "epoch": 9.698, + "grad_norm": 1.3185054063796997, + "learning_rate": 2e-05, + "loss": 0.03745867, + "step": 4849 + }, + { + "epoch": 9.7, + "grad_norm": 1.4054712057113647, + "learning_rate": 2e-05, + "loss": 0.05372046, + "step": 4850 + }, + { + "epoch": 9.702, + "grad_norm": 1.5396513938903809, + "learning_rate": 2e-05, + 
"loss": 0.04799499, + "step": 4851 + }, + { + "epoch": 9.704, + "grad_norm": 1.3242496252059937, + "learning_rate": 2e-05, + "loss": 0.04319259, + "step": 4852 + }, + { + "epoch": 9.706, + "grad_norm": 1.6607216596603394, + "learning_rate": 2e-05, + "loss": 0.04303396, + "step": 4853 + }, + { + "epoch": 9.708, + "grad_norm": 1.570757508277893, + "learning_rate": 2e-05, + "loss": 0.06974973, + "step": 4854 + }, + { + "epoch": 9.71, + "grad_norm": 1.5978914499282837, + "learning_rate": 2e-05, + "loss": 0.05531528, + "step": 4855 + }, + { + "epoch": 9.712, + "grad_norm": 2.2332658767700195, + "learning_rate": 2e-05, + "loss": 0.05066529, + "step": 4856 + }, + { + "epoch": 9.714, + "grad_norm": 1.695459246635437, + "learning_rate": 2e-05, + "loss": 0.04276437, + "step": 4857 + }, + { + "epoch": 9.716, + "grad_norm": 1.4969645738601685, + "learning_rate": 2e-05, + "loss": 0.0565182, + "step": 4858 + }, + { + "epoch": 9.718, + "grad_norm": 2.46620512008667, + "learning_rate": 2e-05, + "loss": 0.04657774, + "step": 4859 + }, + { + "epoch": 9.72, + "grad_norm": 2.0684173107147217, + "learning_rate": 2e-05, + "loss": 0.05510051, + "step": 4860 + }, + { + "epoch": 9.722, + "grad_norm": 1.5146620273590088, + "learning_rate": 2e-05, + "loss": 0.04686711, + "step": 4861 + }, + { + "epoch": 9.724, + "grad_norm": 2.6633617877960205, + "learning_rate": 2e-05, + "loss": 0.05086003, + "step": 4862 + }, + { + "epoch": 9.725999999999999, + "grad_norm": 1.2176555395126343, + "learning_rate": 2e-05, + "loss": 0.0497752, + "step": 4863 + }, + { + "epoch": 9.728, + "grad_norm": 1.2809525728225708, + "learning_rate": 2e-05, + "loss": 0.04838467, + "step": 4864 + }, + { + "epoch": 9.73, + "grad_norm": 1.3932509422302246, + "learning_rate": 2e-05, + "loss": 0.0427762, + "step": 4865 + }, + { + "epoch": 9.732, + "grad_norm": 3.854522466659546, + "learning_rate": 2e-05, + "loss": 0.0666661, + "step": 4866 + }, + { + "epoch": 9.734, + "grad_norm": 1.537145972251892, + "learning_rate": 2e-05, + "loss": 0.03788938, + "step": 4867 + }, + { + "epoch": 9.736, + "grad_norm": 2.0760679244995117, + "learning_rate": 2e-05, + "loss": 0.05522866, + "step": 4868 + }, + { + "epoch": 9.738, + "grad_norm": 1.693951964378357, + "learning_rate": 2e-05, + "loss": 0.06659015, + "step": 4869 + }, + { + "epoch": 9.74, + "grad_norm": 1.2150261402130127, + "learning_rate": 2e-05, + "loss": 0.04565568, + "step": 4870 + }, + { + "epoch": 9.742, + "grad_norm": 1.8635746240615845, + "learning_rate": 2e-05, + "loss": 0.0635218, + "step": 4871 + }, + { + "epoch": 9.744, + "grad_norm": 1.8649309873580933, + "learning_rate": 2e-05, + "loss": 0.06209478, + "step": 4872 + }, + { + "epoch": 9.746, + "grad_norm": 1.2454828023910522, + "learning_rate": 2e-05, + "loss": 0.04050411, + "step": 4873 + }, + { + "epoch": 9.748, + "grad_norm": 1.7118374109268188, + "learning_rate": 2e-05, + "loss": 0.0501698, + "step": 4874 + }, + { + "epoch": 9.75, + "grad_norm": 1.9825464487075806, + "learning_rate": 2e-05, + "loss": 0.0551829, + "step": 4875 + }, + { + "epoch": 9.752, + "grad_norm": 1.6307551860809326, + "learning_rate": 2e-05, + "loss": 0.06142005, + "step": 4876 + }, + { + "epoch": 9.754, + "grad_norm": 2.2640645503997803, + "learning_rate": 2e-05, + "loss": 0.06031884, + "step": 4877 + }, + { + "epoch": 9.756, + "grad_norm": 1.0484297275543213, + "learning_rate": 2e-05, + "loss": 0.05024759, + "step": 4878 + }, + { + "epoch": 9.758, + "grad_norm": 1.469162940979004, + "learning_rate": 2e-05, + "loss": 0.04584752, + "step": 4879 + }, + { + "epoch": 9.76, + 
"grad_norm": 1.5308761596679688, + "learning_rate": 2e-05, + "loss": 0.03732128, + "step": 4880 + }, + { + "epoch": 9.762, + "grad_norm": 1.904474139213562, + "learning_rate": 2e-05, + "loss": 0.05384236, + "step": 4881 + }, + { + "epoch": 9.764, + "grad_norm": 6.474231243133545, + "learning_rate": 2e-05, + "loss": 0.07180381, + "step": 4882 + }, + { + "epoch": 9.766, + "grad_norm": 1.5289386510849, + "learning_rate": 2e-05, + "loss": 0.05518559, + "step": 4883 + }, + { + "epoch": 9.768, + "grad_norm": 1.3522764444351196, + "learning_rate": 2e-05, + "loss": 0.05258585, + "step": 4884 + }, + { + "epoch": 9.77, + "grad_norm": 1.4351115226745605, + "learning_rate": 2e-05, + "loss": 0.03858275, + "step": 4885 + }, + { + "epoch": 9.772, + "grad_norm": 2.1494507789611816, + "learning_rate": 2e-05, + "loss": 0.05100446, + "step": 4886 + }, + { + "epoch": 9.774000000000001, + "grad_norm": 2.7354249954223633, + "learning_rate": 2e-05, + "loss": 0.05425896, + "step": 4887 + }, + { + "epoch": 9.776, + "grad_norm": 1.6631423234939575, + "learning_rate": 2e-05, + "loss": 0.04931759, + "step": 4888 + }, + { + "epoch": 9.778, + "grad_norm": 2.6517434120178223, + "learning_rate": 2e-05, + "loss": 0.06315567, + "step": 4889 + }, + { + "epoch": 9.78, + "grad_norm": 2.4273080825805664, + "learning_rate": 2e-05, + "loss": 0.07165055, + "step": 4890 + }, + { + "epoch": 9.782, + "grad_norm": 1.5773106813430786, + "learning_rate": 2e-05, + "loss": 0.06337696, + "step": 4891 + }, + { + "epoch": 9.784, + "grad_norm": 1.3168998956680298, + "learning_rate": 2e-05, + "loss": 0.04947937, + "step": 4892 + }, + { + "epoch": 9.786, + "grad_norm": 1.380509614944458, + "learning_rate": 2e-05, + "loss": 0.04712594, + "step": 4893 + }, + { + "epoch": 9.788, + "grad_norm": 1.479285717010498, + "learning_rate": 2e-05, + "loss": 0.04698952, + "step": 4894 + }, + { + "epoch": 9.79, + "grad_norm": 1.8223832845687866, + "learning_rate": 2e-05, + "loss": 0.04973854, + "step": 4895 + }, + { + "epoch": 9.792, + "grad_norm": 1.8875789642333984, + "learning_rate": 2e-05, + "loss": 0.05310801, + "step": 4896 + }, + { + "epoch": 9.794, + "grad_norm": 2.261950969696045, + "learning_rate": 2e-05, + "loss": 0.05974211, + "step": 4897 + }, + { + "epoch": 9.796, + "grad_norm": 1.628933072090149, + "learning_rate": 2e-05, + "loss": 0.04541996, + "step": 4898 + }, + { + "epoch": 9.798, + "grad_norm": 2.749540328979492, + "learning_rate": 2e-05, + "loss": 0.06158432, + "step": 4899 + }, + { + "epoch": 9.8, + "grad_norm": 2.566983938217163, + "learning_rate": 2e-05, + "loss": 0.05563569, + "step": 4900 + }, + { + "epoch": 9.802, + "grad_norm": 1.902557134628296, + "learning_rate": 2e-05, + "loss": 0.04794583, + "step": 4901 + }, + { + "epoch": 9.804, + "grad_norm": 3.0628104209899902, + "learning_rate": 2e-05, + "loss": 0.06039095, + "step": 4902 + }, + { + "epoch": 9.806000000000001, + "grad_norm": 1.1863480806350708, + "learning_rate": 2e-05, + "loss": 0.03568749, + "step": 4903 + }, + { + "epoch": 9.808, + "grad_norm": 2.010585308074951, + "learning_rate": 2e-05, + "loss": 0.04390045, + "step": 4904 + }, + { + "epoch": 9.81, + "grad_norm": 1.8428614139556885, + "learning_rate": 2e-05, + "loss": 0.06692166, + "step": 4905 + }, + { + "epoch": 9.812, + "grad_norm": 1.32927668094635, + "learning_rate": 2e-05, + "loss": 0.04278745, + "step": 4906 + }, + { + "epoch": 9.814, + "grad_norm": 1.290330410003662, + "learning_rate": 2e-05, + "loss": 0.0594513, + "step": 4907 + }, + { + "epoch": 9.816, + "grad_norm": 0.9869194030761719, + "learning_rate": 
2e-05, + "loss": 0.0311925, + "step": 4908 + }, + { + "epoch": 9.818, + "grad_norm": 1.442022681236267, + "learning_rate": 2e-05, + "loss": 0.04770211, + "step": 4909 + }, + { + "epoch": 9.82, + "grad_norm": 1.677892804145813, + "learning_rate": 2e-05, + "loss": 0.04223461, + "step": 4910 + }, + { + "epoch": 9.822, + "grad_norm": 1.2613219022750854, + "learning_rate": 2e-05, + "loss": 0.04329883, + "step": 4911 + }, + { + "epoch": 9.824, + "grad_norm": 2.5619208812713623, + "learning_rate": 2e-05, + "loss": 0.04533758, + "step": 4912 + }, + { + "epoch": 9.826, + "grad_norm": 1.5010818243026733, + "learning_rate": 2e-05, + "loss": 0.0549286, + "step": 4913 + }, + { + "epoch": 9.828, + "grad_norm": 1.5867867469787598, + "learning_rate": 2e-05, + "loss": 0.05643577, + "step": 4914 + }, + { + "epoch": 9.83, + "grad_norm": 2.7611711025238037, + "learning_rate": 2e-05, + "loss": 0.04502559, + "step": 4915 + }, + { + "epoch": 9.832, + "grad_norm": 1.104813575744629, + "learning_rate": 2e-05, + "loss": 0.03817623, + "step": 4916 + }, + { + "epoch": 9.834, + "grad_norm": 1.927043080329895, + "learning_rate": 2e-05, + "loss": 0.05190308, + "step": 4917 + }, + { + "epoch": 9.836, + "grad_norm": 1.5192525386810303, + "learning_rate": 2e-05, + "loss": 0.03937745, + "step": 4918 + }, + { + "epoch": 9.838, + "grad_norm": 1.665751576423645, + "learning_rate": 2e-05, + "loss": 0.05758657, + "step": 4919 + }, + { + "epoch": 9.84, + "grad_norm": 2.303105354309082, + "learning_rate": 2e-05, + "loss": 0.05672823, + "step": 4920 + }, + { + "epoch": 9.842, + "grad_norm": 1.7795699834823608, + "learning_rate": 2e-05, + "loss": 0.05460572, + "step": 4921 + }, + { + "epoch": 9.844, + "grad_norm": 1.8190152645111084, + "learning_rate": 2e-05, + "loss": 0.04848269, + "step": 4922 + }, + { + "epoch": 9.846, + "grad_norm": 1.7064250707626343, + "learning_rate": 2e-05, + "loss": 0.05619481, + "step": 4923 + }, + { + "epoch": 9.848, + "grad_norm": 1.413002610206604, + "learning_rate": 2e-05, + "loss": 0.04609952, + "step": 4924 + }, + { + "epoch": 9.85, + "grad_norm": 1.523747444152832, + "learning_rate": 2e-05, + "loss": 0.06341836, + "step": 4925 + }, + { + "epoch": 9.852, + "grad_norm": 1.346929669380188, + "learning_rate": 2e-05, + "loss": 0.03450187, + "step": 4926 + }, + { + "epoch": 9.854, + "grad_norm": 1.8626230955123901, + "learning_rate": 2e-05, + "loss": 0.04854961, + "step": 4927 + }, + { + "epoch": 9.856, + "grad_norm": 1.1077322959899902, + "learning_rate": 2e-05, + "loss": 0.03540801, + "step": 4928 + }, + { + "epoch": 9.858, + "grad_norm": 1.6448397636413574, + "learning_rate": 2e-05, + "loss": 0.04669225, + "step": 4929 + }, + { + "epoch": 9.86, + "grad_norm": 2.1358819007873535, + "learning_rate": 2e-05, + "loss": 0.05960096, + "step": 4930 + }, + { + "epoch": 9.862, + "grad_norm": 4.25274658203125, + "learning_rate": 2e-05, + "loss": 0.08006147, + "step": 4931 + }, + { + "epoch": 9.864, + "grad_norm": 1.9246102571487427, + "learning_rate": 2e-05, + "loss": 0.07823011, + "step": 4932 + }, + { + "epoch": 9.866, + "grad_norm": 1.5427088737487793, + "learning_rate": 2e-05, + "loss": 0.04289813, + "step": 4933 + }, + { + "epoch": 9.868, + "grad_norm": 1.953423261642456, + "learning_rate": 2e-05, + "loss": 0.05337017, + "step": 4934 + }, + { + "epoch": 9.87, + "grad_norm": 1.5664440393447876, + "learning_rate": 2e-05, + "loss": 0.05721167, + "step": 4935 + }, + { + "epoch": 9.872, + "grad_norm": 2.196546792984009, + "learning_rate": 2e-05, + "loss": 0.05689994, + "step": 4936 + }, + { + "epoch": 9.874, + 
"grad_norm": 1.7827051877975464, + "learning_rate": 2e-05, + "loss": 0.05442892, + "step": 4937 + }, + { + "epoch": 9.876, + "grad_norm": 1.2307844161987305, + "learning_rate": 2e-05, + "loss": 0.04137726, + "step": 4938 + }, + { + "epoch": 9.878, + "grad_norm": 1.3965389728546143, + "learning_rate": 2e-05, + "loss": 0.04177804, + "step": 4939 + }, + { + "epoch": 9.88, + "grad_norm": 1.301183819770813, + "learning_rate": 2e-05, + "loss": 0.04382346, + "step": 4940 + }, + { + "epoch": 9.882, + "grad_norm": 2.091052293777466, + "learning_rate": 2e-05, + "loss": 0.05116628, + "step": 4941 + }, + { + "epoch": 9.884, + "grad_norm": 1.581408143043518, + "learning_rate": 2e-05, + "loss": 0.05110285, + "step": 4942 + }, + { + "epoch": 9.886, + "grad_norm": 2.7291102409362793, + "learning_rate": 2e-05, + "loss": 0.06126875, + "step": 4943 + }, + { + "epoch": 9.888, + "grad_norm": 2.2793638706207275, + "learning_rate": 2e-05, + "loss": 0.05009011, + "step": 4944 + }, + { + "epoch": 9.89, + "grad_norm": 3.306272506713867, + "learning_rate": 2e-05, + "loss": 0.06589854, + "step": 4945 + }, + { + "epoch": 9.892, + "grad_norm": 1.4898868799209595, + "learning_rate": 2e-05, + "loss": 0.05594926, + "step": 4946 + }, + { + "epoch": 9.894, + "grad_norm": 2.2353460788726807, + "learning_rate": 2e-05, + "loss": 0.06084714, + "step": 4947 + }, + { + "epoch": 9.896, + "grad_norm": 2.330730676651001, + "learning_rate": 2e-05, + "loss": 0.05106577, + "step": 4948 + }, + { + "epoch": 9.898, + "grad_norm": 1.3714518547058105, + "learning_rate": 2e-05, + "loss": 0.04674903, + "step": 4949 + }, + { + "epoch": 9.9, + "grad_norm": 1.4012672901153564, + "learning_rate": 2e-05, + "loss": 0.05167598, + "step": 4950 + }, + { + "epoch": 9.902, + "grad_norm": 1.0671659708023071, + "learning_rate": 2e-05, + "loss": 0.02830281, + "step": 4951 + }, + { + "epoch": 9.904, + "grad_norm": 1.8455958366394043, + "learning_rate": 2e-05, + "loss": 0.05752491, + "step": 4952 + }, + { + "epoch": 9.906, + "grad_norm": 1.4932456016540527, + "learning_rate": 2e-05, + "loss": 0.04646926, + "step": 4953 + }, + { + "epoch": 9.908, + "grad_norm": 1.6366294622421265, + "learning_rate": 2e-05, + "loss": 0.0468614, + "step": 4954 + }, + { + "epoch": 9.91, + "grad_norm": 2.687469720840454, + "learning_rate": 2e-05, + "loss": 0.0775534, + "step": 4955 + }, + { + "epoch": 9.912, + "grad_norm": 3.0272841453552246, + "learning_rate": 2e-05, + "loss": 0.06905685, + "step": 4956 + }, + { + "epoch": 9.914, + "grad_norm": 2.0984997749328613, + "learning_rate": 2e-05, + "loss": 0.07175948, + "step": 4957 + }, + { + "epoch": 9.916, + "grad_norm": 2.3406221866607666, + "learning_rate": 2e-05, + "loss": 0.06852732, + "step": 4958 + }, + { + "epoch": 9.918, + "grad_norm": 2.1772301197052, + "learning_rate": 2e-05, + "loss": 0.0689801, + "step": 4959 + }, + { + "epoch": 9.92, + "grad_norm": 1.6257960796356201, + "learning_rate": 2e-05, + "loss": 0.05941542, + "step": 4960 + }, + { + "epoch": 9.922, + "grad_norm": 2.138094663619995, + "learning_rate": 2e-05, + "loss": 0.04775422, + "step": 4961 + }, + { + "epoch": 9.924, + "grad_norm": 2.711124897003174, + "learning_rate": 2e-05, + "loss": 0.06104578, + "step": 4962 + }, + { + "epoch": 9.926, + "grad_norm": 2.045482873916626, + "learning_rate": 2e-05, + "loss": 0.06203432, + "step": 4963 + }, + { + "epoch": 9.928, + "grad_norm": 1.3155605792999268, + "learning_rate": 2e-05, + "loss": 0.05150554, + "step": 4964 + }, + { + "epoch": 9.93, + "grad_norm": 1.79209566116333, + "learning_rate": 2e-05, + "loss": 
0.04305602, + "step": 4965 + }, + { + "epoch": 9.932, + "grad_norm": 2.037360191345215, + "learning_rate": 2e-05, + "loss": 0.05929771, + "step": 4966 + }, + { + "epoch": 9.934, + "grad_norm": 1.8728766441345215, + "learning_rate": 2e-05, + "loss": 0.05034747, + "step": 4967 + }, + { + "epoch": 9.936, + "grad_norm": 1.58304762840271, + "learning_rate": 2e-05, + "loss": 0.04624342, + "step": 4968 + }, + { + "epoch": 9.938, + "grad_norm": 1.070703387260437, + "learning_rate": 2e-05, + "loss": 0.04769868, + "step": 4969 + }, + { + "epoch": 9.94, + "grad_norm": 1.5392509698867798, + "learning_rate": 2e-05, + "loss": 0.04278378, + "step": 4970 + }, + { + "epoch": 9.942, + "grad_norm": 1.853967308998108, + "learning_rate": 2e-05, + "loss": 0.06206827, + "step": 4971 + }, + { + "epoch": 9.943999999999999, + "grad_norm": 1.6598646640777588, + "learning_rate": 2e-05, + "loss": 0.05348482, + "step": 4972 + }, + { + "epoch": 9.946, + "grad_norm": 1.3386852741241455, + "learning_rate": 2e-05, + "loss": 0.05379867, + "step": 4973 + }, + { + "epoch": 9.948, + "grad_norm": 1.0884183645248413, + "learning_rate": 2e-05, + "loss": 0.04097927, + "step": 4974 + }, + { + "epoch": 9.95, + "grad_norm": 2.1080918312072754, + "learning_rate": 2e-05, + "loss": 0.05956004, + "step": 4975 + }, + { + "epoch": 9.952, + "grad_norm": 3.454415798187256, + "learning_rate": 2e-05, + "loss": 0.05866547, + "step": 4976 + }, + { + "epoch": 9.954, + "grad_norm": 1.8443541526794434, + "learning_rate": 2e-05, + "loss": 0.05498111, + "step": 4977 + }, + { + "epoch": 9.956, + "grad_norm": 1.2437260150909424, + "learning_rate": 2e-05, + "loss": 0.04896142, + "step": 4978 + }, + { + "epoch": 9.958, + "grad_norm": 2.130713701248169, + "learning_rate": 2e-05, + "loss": 0.05966695, + "step": 4979 + }, + { + "epoch": 9.96, + "grad_norm": 2.0753307342529297, + "learning_rate": 2e-05, + "loss": 0.0618369, + "step": 4980 + }, + { + "epoch": 9.962, + "grad_norm": 2.517958879470825, + "learning_rate": 2e-05, + "loss": 0.06341064, + "step": 4981 + }, + { + "epoch": 9.964, + "grad_norm": 1.4736145734786987, + "learning_rate": 2e-05, + "loss": 0.05388065, + "step": 4982 + }, + { + "epoch": 9.966, + "grad_norm": 2.1390750408172607, + "learning_rate": 2e-05, + "loss": 0.0556998, + "step": 4983 + }, + { + "epoch": 9.968, + "grad_norm": 1.417447566986084, + "learning_rate": 2e-05, + "loss": 0.04931127, + "step": 4984 + }, + { + "epoch": 9.97, + "grad_norm": 1.8981767892837524, + "learning_rate": 2e-05, + "loss": 0.04368909, + "step": 4985 + }, + { + "epoch": 9.972, + "grad_norm": 2.0923216342926025, + "learning_rate": 2e-05, + "loss": 0.0481521, + "step": 4986 + }, + { + "epoch": 9.974, + "grad_norm": 2.3560774326324463, + "learning_rate": 2e-05, + "loss": 0.04617333, + "step": 4987 + }, + { + "epoch": 9.975999999999999, + "grad_norm": 2.1225790977478027, + "learning_rate": 2e-05, + "loss": 0.06341647, + "step": 4988 + }, + { + "epoch": 9.978, + "grad_norm": 1.4702603816986084, + "learning_rate": 2e-05, + "loss": 0.06608592, + "step": 4989 + }, + { + "epoch": 9.98, + "grad_norm": 1.437772512435913, + "learning_rate": 2e-05, + "loss": 0.0440442, + "step": 4990 + }, + { + "epoch": 9.982, + "grad_norm": 1.7781963348388672, + "learning_rate": 2e-05, + "loss": 0.04864323, + "step": 4991 + }, + { + "epoch": 9.984, + "grad_norm": 2.0641047954559326, + "learning_rate": 2e-05, + "loss": 0.05038781, + "step": 4992 + }, + { + "epoch": 9.986, + "grad_norm": 1.471038818359375, + "learning_rate": 2e-05, + "loss": 0.05805187, + "step": 4993 + }, + { + "epoch": 
9.988, + "grad_norm": 1.7074260711669922, + "learning_rate": 2e-05, + "loss": 0.04913766, + "step": 4994 + }, + { + "epoch": 9.99, + "grad_norm": 1.5524414777755737, + "learning_rate": 2e-05, + "loss": 0.04417773, + "step": 4995 + }, + { + "epoch": 9.992, + "grad_norm": 2.4501919746398926, + "learning_rate": 2e-05, + "loss": 0.05671235, + "step": 4996 + }, + { + "epoch": 9.994, + "grad_norm": 1.801255226135254, + "learning_rate": 2e-05, + "loss": 0.06374493, + "step": 4997 + }, + { + "epoch": 9.996, + "grad_norm": 1.989261507987976, + "learning_rate": 2e-05, + "loss": 0.04868555, + "step": 4998 + }, + { + "epoch": 9.998, + "grad_norm": 1.389630675315857, + "learning_rate": 2e-05, + "loss": 0.0455512, + "step": 4999 + }, + { + "epoch": 10.0, + "grad_norm": 1.467817783355713, + "learning_rate": 2e-05, + "loss": 0.06919758, + "step": 5000 + }, + { + "epoch": 10.0, + "eval_performance": { + "AngleClassification_1": 0.988, + "AngleClassification_2": 0.994, + "AngleClassification_3": 0.9560878243512974, + "Equal_1": 0.998, + "Equal_2": 0.9600798403193613, + "Equal_3": 0.8542914171656687, + "LineComparison_1": 1.0, + "LineComparison_2": 1.0, + "LineComparison_3": 0.9900199600798403, + "Parallel_1": 0.9899799599198397, + "Parallel_2": 0.9959919839679359, + "Parallel_3": 0.988, + "Perpendicular_1": 0.988, + "Perpendicular_2": 0.916, + "Perpendicular_3": 0.5741482965931863, + "PointLiesOnCircle_1": 0.9979959919839679, + "PointLiesOnCircle_2": 0.994, + "PointLiesOnCircle_3": 0.996, + "PointLiesOnLine_1": 0.9979959919839679, + "PointLiesOnLine_2": 0.9899799599198397, + "PointLiesOnLine_3": 0.9580838323353293 + }, + "eval_runtime": 320.6792, + "eval_samples_per_second": 32.743, + "eval_steps_per_second": 0.655, + "step": 5000 + }, + { + "epoch": 10.002, + "grad_norm": 1.6074891090393066, + "learning_rate": 2e-05, + "loss": 0.05639043, + "step": 5001 + }, + { + "epoch": 10.004, + "grad_norm": 1.6433279514312744, + "learning_rate": 2e-05, + "loss": 0.06391891, + "step": 5002 + }, + { + "epoch": 10.006, + "grad_norm": 1.0623618364334106, + "learning_rate": 2e-05, + "loss": 0.0363953, + "step": 5003 + }, + { + "epoch": 10.008, + "grad_norm": 2.0282094478607178, + "learning_rate": 2e-05, + "loss": 0.05439749, + "step": 5004 + }, + { + "epoch": 10.01, + "grad_norm": 2.1043431758880615, + "learning_rate": 2e-05, + "loss": 0.05599514, + "step": 5005 + }, + { + "epoch": 10.012, + "grad_norm": 1.4989700317382812, + "learning_rate": 2e-05, + "loss": 0.04970724, + "step": 5006 + }, + { + "epoch": 10.014, + "grad_norm": 1.6386247873306274, + "learning_rate": 2e-05, + "loss": 0.04516533, + "step": 5007 + }, + { + "epoch": 10.016, + "grad_norm": 1.3612985610961914, + "learning_rate": 2e-05, + "loss": 0.05006377, + "step": 5008 + }, + { + "epoch": 10.018, + "grad_norm": 1.6199952363967896, + "learning_rate": 2e-05, + "loss": 0.0529905, + "step": 5009 + }, + { + "epoch": 10.02, + "grad_norm": 1.690873384475708, + "learning_rate": 2e-05, + "loss": 0.06474019, + "step": 5010 + }, + { + "epoch": 10.022, + "grad_norm": 1.0925859212875366, + "learning_rate": 2e-05, + "loss": 0.04544307, + "step": 5011 + }, + { + "epoch": 10.024, + "grad_norm": 2.489445447921753, + "learning_rate": 2e-05, + "loss": 0.04316767, + "step": 5012 + }, + { + "epoch": 10.026, + "grad_norm": 1.0583429336547852, + "learning_rate": 2e-05, + "loss": 0.03954223, + "step": 5013 + }, + { + "epoch": 10.028, + "grad_norm": 2.259428024291992, + "learning_rate": 2e-05, + "loss": 0.05867039, + "step": 5014 + }, + { + "epoch": 10.03, + "grad_norm": 
1.1833878755569458, + "learning_rate": 2e-05, + "loss": 0.04461697, + "step": 5015 + }, + { + "epoch": 10.032, + "grad_norm": 1.7030174732208252, + "learning_rate": 2e-05, + "loss": 0.06960414, + "step": 5016 + }, + { + "epoch": 10.034, + "grad_norm": 2.5388693809509277, + "learning_rate": 2e-05, + "loss": 0.04828655, + "step": 5017 + }, + { + "epoch": 10.036, + "grad_norm": 1.4048974514007568, + "learning_rate": 2e-05, + "loss": 0.03989777, + "step": 5018 + }, + { + "epoch": 10.038, + "grad_norm": 1.6983534097671509, + "learning_rate": 2e-05, + "loss": 0.04251143, + "step": 5019 + }, + { + "epoch": 10.04, + "grad_norm": 2.109666347503662, + "learning_rate": 2e-05, + "loss": 0.04322487, + "step": 5020 + }, + { + "epoch": 10.042, + "grad_norm": 1.6029553413391113, + "learning_rate": 2e-05, + "loss": 0.06342378, + "step": 5021 + }, + { + "epoch": 10.044, + "grad_norm": 1.3456430435180664, + "learning_rate": 2e-05, + "loss": 0.04814564, + "step": 5022 + }, + { + "epoch": 10.046, + "grad_norm": 1.7737743854522705, + "learning_rate": 2e-05, + "loss": 0.0458786, + "step": 5023 + }, + { + "epoch": 10.048, + "grad_norm": 1.2273495197296143, + "learning_rate": 2e-05, + "loss": 0.03703401, + "step": 5024 + }, + { + "epoch": 10.05, + "grad_norm": 2.4264330863952637, + "learning_rate": 2e-05, + "loss": 0.07618839, + "step": 5025 + }, + { + "epoch": 10.052, + "grad_norm": 1.851207971572876, + "learning_rate": 2e-05, + "loss": 0.07478073, + "step": 5026 + }, + { + "epoch": 10.054, + "grad_norm": 1.3361573219299316, + "learning_rate": 2e-05, + "loss": 0.05228508, + "step": 5027 + }, + { + "epoch": 10.056, + "grad_norm": 2.115432024002075, + "learning_rate": 2e-05, + "loss": 0.06497653, + "step": 5028 + }, + { + "epoch": 10.058, + "grad_norm": 1.5752816200256348, + "learning_rate": 2e-05, + "loss": 0.04064302, + "step": 5029 + }, + { + "epoch": 10.06, + "grad_norm": 1.224853754043579, + "learning_rate": 2e-05, + "loss": 0.04076491, + "step": 5030 + }, + { + "epoch": 10.062, + "grad_norm": 1.0174431800842285, + "learning_rate": 2e-05, + "loss": 0.02776841, + "step": 5031 + }, + { + "epoch": 10.064, + "grad_norm": 1.4870493412017822, + "learning_rate": 2e-05, + "loss": 0.05205495, + "step": 5032 + }, + { + "epoch": 10.066, + "grad_norm": 1.233358383178711, + "learning_rate": 2e-05, + "loss": 0.04476997, + "step": 5033 + }, + { + "epoch": 10.068, + "grad_norm": 1.4965168237686157, + "learning_rate": 2e-05, + "loss": 0.06173176, + "step": 5034 + }, + { + "epoch": 10.07, + "grad_norm": 1.2926126718521118, + "learning_rate": 2e-05, + "loss": 0.05273168, + "step": 5035 + }, + { + "epoch": 10.072, + "grad_norm": 1.9668138027191162, + "learning_rate": 2e-05, + "loss": 0.05729368, + "step": 5036 + }, + { + "epoch": 10.074, + "grad_norm": 1.9118866920471191, + "learning_rate": 2e-05, + "loss": 0.07089907, + "step": 5037 + }, + { + "epoch": 10.076, + "grad_norm": 1.567081093788147, + "learning_rate": 2e-05, + "loss": 0.04848254, + "step": 5038 + }, + { + "epoch": 10.078, + "grad_norm": 1.3504266738891602, + "learning_rate": 2e-05, + "loss": 0.04428206, + "step": 5039 + }, + { + "epoch": 10.08, + "grad_norm": 1.5226638317108154, + "learning_rate": 2e-05, + "loss": 0.05267453, + "step": 5040 + }, + { + "epoch": 10.082, + "grad_norm": 2.2543962001800537, + "learning_rate": 2e-05, + "loss": 0.05228227, + "step": 5041 + }, + { + "epoch": 10.084, + "grad_norm": 1.4806350469589233, + "learning_rate": 2e-05, + "loss": 0.04181156, + "step": 5042 + }, + { + "epoch": 10.086, + "grad_norm": 1.2908692359924316, + "learning_rate": 
2e-05, + "loss": 0.04454116, + "step": 5043 + }, + { + "epoch": 10.088, + "grad_norm": 2.006575584411621, + "learning_rate": 2e-05, + "loss": 0.05524797, + "step": 5044 + }, + { + "epoch": 10.09, + "grad_norm": 1.949083685874939, + "learning_rate": 2e-05, + "loss": 0.04127013, + "step": 5045 + }, + { + "epoch": 10.092, + "grad_norm": 1.6486314535140991, + "learning_rate": 2e-05, + "loss": 0.06116631, + "step": 5046 + }, + { + "epoch": 10.094, + "grad_norm": 1.8648558855056763, + "learning_rate": 2e-05, + "loss": 0.04682179, + "step": 5047 + }, + { + "epoch": 10.096, + "grad_norm": 2.304664134979248, + "learning_rate": 2e-05, + "loss": 0.05680639, + "step": 5048 + }, + { + "epoch": 10.098, + "grad_norm": 1.4848788976669312, + "learning_rate": 2e-05, + "loss": 0.03981915, + "step": 5049 + }, + { + "epoch": 10.1, + "grad_norm": 1.9800957441329956, + "learning_rate": 2e-05, + "loss": 0.03908926, + "step": 5050 + }, + { + "epoch": 10.102, + "grad_norm": 1.5394706726074219, + "learning_rate": 2e-05, + "loss": 0.05664773, + "step": 5051 + }, + { + "epoch": 10.104, + "grad_norm": 2.171658992767334, + "learning_rate": 2e-05, + "loss": 0.05971353, + "step": 5052 + }, + { + "epoch": 10.106, + "grad_norm": 2.1404213905334473, + "learning_rate": 2e-05, + "loss": 0.05590626, + "step": 5053 + }, + { + "epoch": 10.108, + "grad_norm": 3.454113483428955, + "learning_rate": 2e-05, + "loss": 0.0404954, + "step": 5054 + }, + { + "epoch": 10.11, + "grad_norm": 4.064882278442383, + "learning_rate": 2e-05, + "loss": 0.0535365, + "step": 5055 + }, + { + "epoch": 10.112, + "grad_norm": 1.671109914779663, + "learning_rate": 2e-05, + "loss": 0.05773462, + "step": 5056 + }, + { + "epoch": 10.114, + "grad_norm": 2.3881421089172363, + "learning_rate": 2e-05, + "loss": 0.05546438, + "step": 5057 + }, + { + "epoch": 10.116, + "grad_norm": 2.157351493835449, + "learning_rate": 2e-05, + "loss": 0.05277005, + "step": 5058 + }, + { + "epoch": 10.118, + "grad_norm": 2.0532100200653076, + "learning_rate": 2e-05, + "loss": 0.05053755, + "step": 5059 + }, + { + "epoch": 10.12, + "grad_norm": 1.3669548034667969, + "learning_rate": 2e-05, + "loss": 0.05167579, + "step": 5060 + }, + { + "epoch": 10.122, + "grad_norm": 1.5119102001190186, + "learning_rate": 2e-05, + "loss": 0.05561122, + "step": 5061 + }, + { + "epoch": 10.124, + "grad_norm": 1.244271159172058, + "learning_rate": 2e-05, + "loss": 0.03419596, + "step": 5062 + }, + { + "epoch": 10.126, + "grad_norm": 1.6181620359420776, + "learning_rate": 2e-05, + "loss": 0.06645474, + "step": 5063 + }, + { + "epoch": 10.128, + "grad_norm": 2.0490105152130127, + "learning_rate": 2e-05, + "loss": 0.05366251, + "step": 5064 + }, + { + "epoch": 10.13, + "grad_norm": 1.2846488952636719, + "learning_rate": 2e-05, + "loss": 0.04805609, + "step": 5065 + }, + { + "epoch": 10.132, + "grad_norm": 1.5411098003387451, + "learning_rate": 2e-05, + "loss": 0.04068913, + "step": 5066 + }, + { + "epoch": 10.134, + "grad_norm": 1.4525972604751587, + "learning_rate": 2e-05, + "loss": 0.04148395, + "step": 5067 + }, + { + "epoch": 10.136, + "grad_norm": 1.9913413524627686, + "learning_rate": 2e-05, + "loss": 0.05251299, + "step": 5068 + }, + { + "epoch": 10.138, + "grad_norm": 1.329590082168579, + "learning_rate": 2e-05, + "loss": 0.03482792, + "step": 5069 + }, + { + "epoch": 10.14, + "grad_norm": 1.917649507522583, + "learning_rate": 2e-05, + "loss": 0.0389032, + "step": 5070 + }, + { + "epoch": 10.142, + "grad_norm": 1.8143230676651, + "learning_rate": 2e-05, + "loss": 0.06296816, + "step": 5071 + }, + 
{ + "epoch": 10.144, + "grad_norm": 1.9044404029846191, + "learning_rate": 2e-05, + "loss": 0.07797446, + "step": 5072 + }, + { + "epoch": 10.146, + "grad_norm": 1.3988782167434692, + "learning_rate": 2e-05, + "loss": 0.03789392, + "step": 5073 + }, + { + "epoch": 10.148, + "grad_norm": 2.06538987159729, + "learning_rate": 2e-05, + "loss": 0.05376375, + "step": 5074 + }, + { + "epoch": 10.15, + "grad_norm": 1.908994436264038, + "learning_rate": 2e-05, + "loss": 0.05527605, + "step": 5075 + }, + { + "epoch": 10.152, + "grad_norm": 1.5503228902816772, + "learning_rate": 2e-05, + "loss": 0.04854936, + "step": 5076 + }, + { + "epoch": 10.154, + "grad_norm": 1.1571024656295776, + "learning_rate": 2e-05, + "loss": 0.03305773, + "step": 5077 + }, + { + "epoch": 10.156, + "grad_norm": 2.0785417556762695, + "learning_rate": 2e-05, + "loss": 0.06657989, + "step": 5078 + }, + { + "epoch": 10.158, + "grad_norm": 2.85941743850708, + "learning_rate": 2e-05, + "loss": 0.05014766, + "step": 5079 + }, + { + "epoch": 10.16, + "grad_norm": 2.1629788875579834, + "learning_rate": 2e-05, + "loss": 0.05455408, + "step": 5080 + }, + { + "epoch": 10.162, + "grad_norm": 3.536186933517456, + "learning_rate": 2e-05, + "loss": 0.04600221, + "step": 5081 + }, + { + "epoch": 10.164, + "grad_norm": 1.7894084453582764, + "learning_rate": 2e-05, + "loss": 0.03903076, + "step": 5082 + }, + { + "epoch": 10.166, + "grad_norm": 1.6507161855697632, + "learning_rate": 2e-05, + "loss": 0.03541517, + "step": 5083 + }, + { + "epoch": 10.168, + "grad_norm": 1.4610087871551514, + "learning_rate": 2e-05, + "loss": 0.05326772, + "step": 5084 + }, + { + "epoch": 10.17, + "grad_norm": 1.1885812282562256, + "learning_rate": 2e-05, + "loss": 0.04884129, + "step": 5085 + }, + { + "epoch": 10.172, + "grad_norm": 1.5193572044372559, + "learning_rate": 2e-05, + "loss": 0.05214652, + "step": 5086 + }, + { + "epoch": 10.174, + "grad_norm": 2.79036808013916, + "learning_rate": 2e-05, + "loss": 0.07177282, + "step": 5087 + }, + { + "epoch": 10.176, + "grad_norm": 2.4795050621032715, + "learning_rate": 2e-05, + "loss": 0.05598135, + "step": 5088 + }, + { + "epoch": 10.178, + "grad_norm": 1.8323618173599243, + "learning_rate": 2e-05, + "loss": 0.04974123, + "step": 5089 + }, + { + "epoch": 10.18, + "grad_norm": 1.8436342477798462, + "learning_rate": 2e-05, + "loss": 0.05738843, + "step": 5090 + }, + { + "epoch": 10.182, + "grad_norm": 1.8771424293518066, + "learning_rate": 2e-05, + "loss": 0.06057268, + "step": 5091 + }, + { + "epoch": 10.184, + "grad_norm": 1.8508983850479126, + "learning_rate": 2e-05, + "loss": 0.05036641, + "step": 5092 + }, + { + "epoch": 10.186, + "grad_norm": 1.3412444591522217, + "learning_rate": 2e-05, + "loss": 0.03264994, + "step": 5093 + }, + { + "epoch": 10.188, + "grad_norm": 1.8948533535003662, + "learning_rate": 2e-05, + "loss": 0.04661043, + "step": 5094 + }, + { + "epoch": 10.19, + "grad_norm": 1.1658676862716675, + "learning_rate": 2e-05, + "loss": 0.0359153, + "step": 5095 + }, + { + "epoch": 10.192, + "grad_norm": 2.3413681983947754, + "learning_rate": 2e-05, + "loss": 0.06752145, + "step": 5096 + }, + { + "epoch": 10.194, + "grad_norm": 2.0791563987731934, + "learning_rate": 2e-05, + "loss": 0.03742487, + "step": 5097 + }, + { + "epoch": 10.196, + "grad_norm": 2.3035924434661865, + "learning_rate": 2e-05, + "loss": 0.05462205, + "step": 5098 + }, + { + "epoch": 10.198, + "grad_norm": 1.5185017585754395, + "learning_rate": 2e-05, + "loss": 0.0364113, + "step": 5099 + }, + { + "epoch": 10.2, + "grad_norm": 
2.8155453205108643, + "learning_rate": 2e-05, + "loss": 0.06460752, + "step": 5100 + }, + { + "epoch": 10.202, + "grad_norm": 2.565056800842285, + "learning_rate": 2e-05, + "loss": 0.04938511, + "step": 5101 + }, + { + "epoch": 10.204, + "grad_norm": 2.4186198711395264, + "learning_rate": 2e-05, + "loss": 0.05762579, + "step": 5102 + }, + { + "epoch": 10.206, + "grad_norm": 1.7323216199874878, + "learning_rate": 2e-05, + "loss": 0.04478491, + "step": 5103 + }, + { + "epoch": 10.208, + "grad_norm": 2.4618570804595947, + "learning_rate": 2e-05, + "loss": 0.0601164, + "step": 5104 + }, + { + "epoch": 10.21, + "grad_norm": 1.4251370429992676, + "learning_rate": 2e-05, + "loss": 0.05556741, + "step": 5105 + }, + { + "epoch": 10.212, + "grad_norm": 2.33821177482605, + "learning_rate": 2e-05, + "loss": 0.07309254, + "step": 5106 + }, + { + "epoch": 10.214, + "grad_norm": 1.3154981136322021, + "learning_rate": 2e-05, + "loss": 0.04135425, + "step": 5107 + }, + { + "epoch": 10.216, + "grad_norm": 2.5417816638946533, + "learning_rate": 2e-05, + "loss": 0.06982885, + "step": 5108 + }, + { + "epoch": 10.218, + "grad_norm": 2.1002843379974365, + "learning_rate": 2e-05, + "loss": 0.05075446, + "step": 5109 + }, + { + "epoch": 10.22, + "grad_norm": 1.574785828590393, + "learning_rate": 2e-05, + "loss": 0.04077484, + "step": 5110 + }, + { + "epoch": 10.222, + "grad_norm": 1.7429002523422241, + "learning_rate": 2e-05, + "loss": 0.04333501, + "step": 5111 + }, + { + "epoch": 10.224, + "grad_norm": 1.4146769046783447, + "learning_rate": 2e-05, + "loss": 0.04594901, + "step": 5112 + }, + { + "epoch": 10.226, + "grad_norm": 2.0971226692199707, + "learning_rate": 2e-05, + "loss": 0.06643675, + "step": 5113 + }, + { + "epoch": 10.228, + "grad_norm": 1.6116442680358887, + "learning_rate": 2e-05, + "loss": 0.04431507, + "step": 5114 + }, + { + "epoch": 10.23, + "grad_norm": 3.321749210357666, + "learning_rate": 2e-05, + "loss": 0.06666962, + "step": 5115 + }, + { + "epoch": 10.232, + "grad_norm": 1.248750925064087, + "learning_rate": 2e-05, + "loss": 0.05188263, + "step": 5116 + }, + { + "epoch": 10.234, + "grad_norm": 1.4745240211486816, + "learning_rate": 2e-05, + "loss": 0.0565898, + "step": 5117 + }, + { + "epoch": 10.236, + "grad_norm": 1.7704201936721802, + "learning_rate": 2e-05, + "loss": 0.04397082, + "step": 5118 + }, + { + "epoch": 10.238, + "grad_norm": 1.7633501291275024, + "learning_rate": 2e-05, + "loss": 0.0520197, + "step": 5119 + }, + { + "epoch": 10.24, + "grad_norm": 1.8023216724395752, + "learning_rate": 2e-05, + "loss": 0.06178421, + "step": 5120 + }, + { + "epoch": 10.242, + "grad_norm": 1.9784626960754395, + "learning_rate": 2e-05, + "loss": 0.05770921, + "step": 5121 + }, + { + "epoch": 10.244, + "grad_norm": 1.6166801452636719, + "learning_rate": 2e-05, + "loss": 0.05028439, + "step": 5122 + }, + { + "epoch": 10.246, + "grad_norm": 1.2488207817077637, + "learning_rate": 2e-05, + "loss": 0.05257055, + "step": 5123 + }, + { + "epoch": 10.248, + "grad_norm": 1.3418329954147339, + "learning_rate": 2e-05, + "loss": 0.03734386, + "step": 5124 + }, + { + "epoch": 10.25, + "grad_norm": 1.5167311429977417, + "learning_rate": 2e-05, + "loss": 0.0557439, + "step": 5125 + }, + { + "epoch": 10.252, + "grad_norm": 1.087599515914917, + "learning_rate": 2e-05, + "loss": 0.02959022, + "step": 5126 + }, + { + "epoch": 10.254, + "grad_norm": 1.6808693408966064, + "learning_rate": 2e-05, + "loss": 0.04390074, + "step": 5127 + }, + { + "epoch": 10.256, + "grad_norm": 1.909277319908142, + "learning_rate": 
2e-05, + "loss": 0.05320806, + "step": 5128 + }, + { + "epoch": 10.258, + "grad_norm": 2.03977370262146, + "learning_rate": 2e-05, + "loss": 0.05649532, + "step": 5129 + }, + { + "epoch": 10.26, + "grad_norm": 1.8067188262939453, + "learning_rate": 2e-05, + "loss": 0.06265157, + "step": 5130 + }, + { + "epoch": 10.262, + "grad_norm": 2.781466007232666, + "learning_rate": 2e-05, + "loss": 0.06059079, + "step": 5131 + }, + { + "epoch": 10.264, + "grad_norm": 2.2374839782714844, + "learning_rate": 2e-05, + "loss": 0.04488031, + "step": 5132 + }, + { + "epoch": 10.266, + "grad_norm": 1.5937252044677734, + "learning_rate": 2e-05, + "loss": 0.04937337, + "step": 5133 + }, + { + "epoch": 10.268, + "grad_norm": 1.5034781694412231, + "learning_rate": 2e-05, + "loss": 0.04895452, + "step": 5134 + }, + { + "epoch": 10.27, + "grad_norm": 2.400557279586792, + "learning_rate": 2e-05, + "loss": 0.06906497, + "step": 5135 + }, + { + "epoch": 10.272, + "grad_norm": 1.635102391242981, + "learning_rate": 2e-05, + "loss": 0.03693788, + "step": 5136 + }, + { + "epoch": 10.274000000000001, + "grad_norm": 1.3570059537887573, + "learning_rate": 2e-05, + "loss": 0.05358264, + "step": 5137 + }, + { + "epoch": 10.276, + "grad_norm": 1.6576945781707764, + "learning_rate": 2e-05, + "loss": 0.05347702, + "step": 5138 + }, + { + "epoch": 10.278, + "grad_norm": 1.7237348556518555, + "learning_rate": 2e-05, + "loss": 0.04945024, + "step": 5139 + }, + { + "epoch": 10.28, + "grad_norm": 1.9865562915802002, + "learning_rate": 2e-05, + "loss": 0.05404085, + "step": 5140 + }, + { + "epoch": 10.282, + "grad_norm": 2.0699520111083984, + "learning_rate": 2e-05, + "loss": 0.04398727, + "step": 5141 + }, + { + "epoch": 10.284, + "grad_norm": 1.870590329170227, + "learning_rate": 2e-05, + "loss": 0.05769954, + "step": 5142 + }, + { + "epoch": 10.286, + "grad_norm": 1.6009098291397095, + "learning_rate": 2e-05, + "loss": 0.05270899, + "step": 5143 + }, + { + "epoch": 10.288, + "grad_norm": 1.0542221069335938, + "learning_rate": 2e-05, + "loss": 0.03725312, + "step": 5144 + }, + { + "epoch": 10.29, + "grad_norm": 1.4165904521942139, + "learning_rate": 2e-05, + "loss": 0.04188281, + "step": 5145 + }, + { + "epoch": 10.292, + "grad_norm": 3.5521252155303955, + "learning_rate": 2e-05, + "loss": 0.06759865, + "step": 5146 + }, + { + "epoch": 10.294, + "grad_norm": 1.433131217956543, + "learning_rate": 2e-05, + "loss": 0.05106829, + "step": 5147 + }, + { + "epoch": 10.296, + "grad_norm": 1.4577850103378296, + "learning_rate": 2e-05, + "loss": 0.05083516, + "step": 5148 + }, + { + "epoch": 10.298, + "grad_norm": 1.3248356580734253, + "learning_rate": 2e-05, + "loss": 0.04911773, + "step": 5149 + }, + { + "epoch": 10.3, + "grad_norm": 1.37599778175354, + "learning_rate": 2e-05, + "loss": 0.04473313, + "step": 5150 + }, + { + "epoch": 10.302, + "grad_norm": 1.5513116121292114, + "learning_rate": 2e-05, + "loss": 0.04096273, + "step": 5151 + }, + { + "epoch": 10.304, + "grad_norm": 1.411054015159607, + "learning_rate": 2e-05, + "loss": 0.04515607, + "step": 5152 + }, + { + "epoch": 10.306, + "grad_norm": 2.094132661819458, + "learning_rate": 2e-05, + "loss": 0.04543452, + "step": 5153 + }, + { + "epoch": 10.308, + "grad_norm": 1.480191707611084, + "learning_rate": 2e-05, + "loss": 0.04611487, + "step": 5154 + }, + { + "epoch": 10.31, + "grad_norm": 1.5969020128250122, + "learning_rate": 2e-05, + "loss": 0.05584262, + "step": 5155 + }, + { + "epoch": 10.312, + "grad_norm": 1.505147099494934, + "learning_rate": 2e-05, + "loss": 0.05068387, + 
"step": 5156 + }, + { + "epoch": 10.314, + "grad_norm": 1.7500067949295044, + "learning_rate": 2e-05, + "loss": 0.04701269, + "step": 5157 + }, + { + "epoch": 10.316, + "grad_norm": 1.2172950506210327, + "learning_rate": 2e-05, + "loss": 0.05006229, + "step": 5158 + }, + { + "epoch": 10.318, + "grad_norm": 1.3415688276290894, + "learning_rate": 2e-05, + "loss": 0.04002773, + "step": 5159 + }, + { + "epoch": 10.32, + "grad_norm": 1.4847948551177979, + "learning_rate": 2e-05, + "loss": 0.05143999, + "step": 5160 + }, + { + "epoch": 10.322, + "grad_norm": 1.2036330699920654, + "learning_rate": 2e-05, + "loss": 0.05749707, + "step": 5161 + }, + { + "epoch": 10.324, + "grad_norm": 1.8553268909454346, + "learning_rate": 2e-05, + "loss": 0.06461729, + "step": 5162 + }, + { + "epoch": 10.326, + "grad_norm": 1.2511320114135742, + "learning_rate": 2e-05, + "loss": 0.04641117, + "step": 5163 + }, + { + "epoch": 10.328, + "grad_norm": 3.30887770652771, + "learning_rate": 2e-05, + "loss": 0.0428102, + "step": 5164 + }, + { + "epoch": 10.33, + "grad_norm": 3.343966007232666, + "learning_rate": 2e-05, + "loss": 0.06479934, + "step": 5165 + }, + { + "epoch": 10.332, + "grad_norm": 1.185491919517517, + "learning_rate": 2e-05, + "loss": 0.0378601, + "step": 5166 + }, + { + "epoch": 10.334, + "grad_norm": 1.7531931400299072, + "learning_rate": 2e-05, + "loss": 0.04356651, + "step": 5167 + }, + { + "epoch": 10.336, + "grad_norm": 1.5299067497253418, + "learning_rate": 2e-05, + "loss": 0.0489331, + "step": 5168 + }, + { + "epoch": 10.338, + "grad_norm": 1.7191680669784546, + "learning_rate": 2e-05, + "loss": 0.05703234, + "step": 5169 + }, + { + "epoch": 10.34, + "grad_norm": 2.2998485565185547, + "learning_rate": 2e-05, + "loss": 0.05554244, + "step": 5170 + }, + { + "epoch": 10.342, + "grad_norm": 1.4917652606964111, + "learning_rate": 2e-05, + "loss": 0.0459786, + "step": 5171 + }, + { + "epoch": 10.344, + "grad_norm": 2.3985610008239746, + "learning_rate": 2e-05, + "loss": 0.07849705, + "step": 5172 + }, + { + "epoch": 10.346, + "grad_norm": 1.3253332376480103, + "learning_rate": 2e-05, + "loss": 0.03654364, + "step": 5173 + }, + { + "epoch": 10.348, + "grad_norm": 1.9052636623382568, + "learning_rate": 2e-05, + "loss": 0.06166274, + "step": 5174 + }, + { + "epoch": 10.35, + "grad_norm": 1.6939311027526855, + "learning_rate": 2e-05, + "loss": 0.04354223, + "step": 5175 + }, + { + "epoch": 10.352, + "grad_norm": 2.325695753097534, + "learning_rate": 2e-05, + "loss": 0.06419559, + "step": 5176 + }, + { + "epoch": 10.354, + "grad_norm": 2.277292013168335, + "learning_rate": 2e-05, + "loss": 0.05193496, + "step": 5177 + }, + { + "epoch": 10.356, + "grad_norm": 1.4175512790679932, + "learning_rate": 2e-05, + "loss": 0.04061021, + "step": 5178 + }, + { + "epoch": 10.358, + "grad_norm": 2.847827434539795, + "learning_rate": 2e-05, + "loss": 0.05605824, + "step": 5179 + }, + { + "epoch": 10.36, + "grad_norm": 1.4422760009765625, + "learning_rate": 2e-05, + "loss": 0.05320713, + "step": 5180 + }, + { + "epoch": 10.362, + "grad_norm": 1.720396637916565, + "learning_rate": 2e-05, + "loss": 0.05881133, + "step": 5181 + }, + { + "epoch": 10.364, + "grad_norm": 1.4462811946868896, + "learning_rate": 2e-05, + "loss": 0.05044965, + "step": 5182 + }, + { + "epoch": 10.366, + "grad_norm": 1.1778342723846436, + "learning_rate": 2e-05, + "loss": 0.03916883, + "step": 5183 + }, + { + "epoch": 10.368, + "grad_norm": 0.9606398344039917, + "learning_rate": 2e-05, + "loss": 0.0260303, + "step": 5184 + }, + { + "epoch": 10.37, + 
"grad_norm": 3.025618553161621, + "learning_rate": 2e-05, + "loss": 0.07479943, + "step": 5185 + }, + { + "epoch": 10.372, + "grad_norm": 1.7541829347610474, + "learning_rate": 2e-05, + "loss": 0.05436843, + "step": 5186 + }, + { + "epoch": 10.374, + "grad_norm": 1.5163887739181519, + "learning_rate": 2e-05, + "loss": 0.06724942, + "step": 5187 + }, + { + "epoch": 10.376, + "grad_norm": 1.2254157066345215, + "learning_rate": 2e-05, + "loss": 0.03878136, + "step": 5188 + }, + { + "epoch": 10.378, + "grad_norm": 1.7440659999847412, + "learning_rate": 2e-05, + "loss": 0.04667617, + "step": 5189 + }, + { + "epoch": 10.38, + "grad_norm": 1.921148657798767, + "learning_rate": 2e-05, + "loss": 0.0511181, + "step": 5190 + }, + { + "epoch": 10.382, + "grad_norm": 2.249518871307373, + "learning_rate": 2e-05, + "loss": 0.05295843, + "step": 5191 + }, + { + "epoch": 10.384, + "grad_norm": 1.9301743507385254, + "learning_rate": 2e-05, + "loss": 0.05415469, + "step": 5192 + }, + { + "epoch": 10.386, + "grad_norm": 2.933361291885376, + "learning_rate": 2e-05, + "loss": 0.05823458, + "step": 5193 + }, + { + "epoch": 10.388, + "grad_norm": 1.8775449991226196, + "learning_rate": 2e-05, + "loss": 0.03668153, + "step": 5194 + }, + { + "epoch": 10.39, + "grad_norm": 1.2913427352905273, + "learning_rate": 2e-05, + "loss": 0.03676874, + "step": 5195 + }, + { + "epoch": 10.392, + "grad_norm": 3.2641937732696533, + "learning_rate": 2e-05, + "loss": 0.03394835, + "step": 5196 + }, + { + "epoch": 10.394, + "grad_norm": 1.5311733484268188, + "learning_rate": 2e-05, + "loss": 0.05097717, + "step": 5197 + }, + { + "epoch": 10.396, + "grad_norm": 1.488006830215454, + "learning_rate": 2e-05, + "loss": 0.06110777, + "step": 5198 + }, + { + "epoch": 10.398, + "grad_norm": 1.3041068315505981, + "learning_rate": 2e-05, + "loss": 0.04042555, + "step": 5199 + }, + { + "epoch": 10.4, + "grad_norm": 1.8183988332748413, + "learning_rate": 2e-05, + "loss": 0.04754098, + "step": 5200 + }, + { + "epoch": 10.402, + "grad_norm": 2.3569672107696533, + "learning_rate": 2e-05, + "loss": 0.04439374, + "step": 5201 + }, + { + "epoch": 10.404, + "grad_norm": 1.4490054845809937, + "learning_rate": 2e-05, + "loss": 0.0474748, + "step": 5202 + }, + { + "epoch": 10.406, + "grad_norm": 1.513960838317871, + "learning_rate": 2e-05, + "loss": 0.05048071, + "step": 5203 + }, + { + "epoch": 10.408, + "grad_norm": 1.6078133583068848, + "learning_rate": 2e-05, + "loss": 0.06171016, + "step": 5204 + }, + { + "epoch": 10.41, + "grad_norm": 2.1041479110717773, + "learning_rate": 2e-05, + "loss": 0.07458056, + "step": 5205 + }, + { + "epoch": 10.412, + "grad_norm": 1.2768281698226929, + "learning_rate": 2e-05, + "loss": 0.05074968, + "step": 5206 + }, + { + "epoch": 10.414, + "grad_norm": 1.2827049493789673, + "learning_rate": 2e-05, + "loss": 0.03884158, + "step": 5207 + }, + { + "epoch": 10.416, + "grad_norm": 1.3199437856674194, + "learning_rate": 2e-05, + "loss": 0.04091244, + "step": 5208 + }, + { + "epoch": 10.418, + "grad_norm": 1.4176160097122192, + "learning_rate": 2e-05, + "loss": 0.05431768, + "step": 5209 + }, + { + "epoch": 10.42, + "grad_norm": 1.8141944408416748, + "learning_rate": 2e-05, + "loss": 0.04813956, + "step": 5210 + }, + { + "epoch": 10.422, + "grad_norm": 1.3609751462936401, + "learning_rate": 2e-05, + "loss": 0.05530354, + "step": 5211 + }, + { + "epoch": 10.424, + "grad_norm": 1.780599594116211, + "learning_rate": 2e-05, + "loss": 0.0568007, + "step": 5212 + }, + { + "epoch": 10.426, + "grad_norm": 1.6819827556610107, + 
"learning_rate": 2e-05, + "loss": 0.03991935, + "step": 5213 + }, + { + "epoch": 10.428, + "grad_norm": 2.557753801345825, + "learning_rate": 2e-05, + "loss": 0.0696613, + "step": 5214 + }, + { + "epoch": 10.43, + "grad_norm": 1.8446578979492188, + "learning_rate": 2e-05, + "loss": 0.04753678, + "step": 5215 + }, + { + "epoch": 10.432, + "grad_norm": 0.939564049243927, + "learning_rate": 2e-05, + "loss": 0.02770604, + "step": 5216 + }, + { + "epoch": 10.434, + "grad_norm": 1.930539846420288, + "learning_rate": 2e-05, + "loss": 0.05070189, + "step": 5217 + }, + { + "epoch": 10.436, + "grad_norm": 1.480115532875061, + "learning_rate": 2e-05, + "loss": 0.03808409, + "step": 5218 + }, + { + "epoch": 10.438, + "grad_norm": 1.2075250148773193, + "learning_rate": 2e-05, + "loss": 0.03466361, + "step": 5219 + }, + { + "epoch": 10.44, + "grad_norm": 2.400930166244507, + "learning_rate": 2e-05, + "loss": 0.05821132, + "step": 5220 + }, + { + "epoch": 10.442, + "grad_norm": 1.5590664148330688, + "learning_rate": 2e-05, + "loss": 0.04925883, + "step": 5221 + }, + { + "epoch": 10.444, + "grad_norm": 1.9377199411392212, + "learning_rate": 2e-05, + "loss": 0.05342076, + "step": 5222 + }, + { + "epoch": 10.446, + "grad_norm": 1.9182746410369873, + "learning_rate": 2e-05, + "loss": 0.06361227, + "step": 5223 + }, + { + "epoch": 10.448, + "grad_norm": 1.3133718967437744, + "learning_rate": 2e-05, + "loss": 0.04293478, + "step": 5224 + }, + { + "epoch": 10.45, + "grad_norm": 1.6044734716415405, + "learning_rate": 2e-05, + "loss": 0.06643161, + "step": 5225 + }, + { + "epoch": 10.452, + "grad_norm": 1.3510301113128662, + "learning_rate": 2e-05, + "loss": 0.0424097, + "step": 5226 + }, + { + "epoch": 10.454, + "grad_norm": 2.926586389541626, + "learning_rate": 2e-05, + "loss": 0.05616639, + "step": 5227 + }, + { + "epoch": 10.456, + "grad_norm": 1.7717212438583374, + "learning_rate": 2e-05, + "loss": 0.04580856, + "step": 5228 + }, + { + "epoch": 10.458, + "grad_norm": 1.2072011232376099, + "learning_rate": 2e-05, + "loss": 0.05067045, + "step": 5229 + }, + { + "epoch": 10.46, + "grad_norm": 2.7207653522491455, + "learning_rate": 2e-05, + "loss": 0.05808522, + "step": 5230 + }, + { + "epoch": 10.462, + "grad_norm": 1.8373502492904663, + "learning_rate": 2e-05, + "loss": 0.0528776, + "step": 5231 + }, + { + "epoch": 10.464, + "grad_norm": 1.2759414911270142, + "learning_rate": 2e-05, + "loss": 0.04170848, + "step": 5232 + }, + { + "epoch": 10.466, + "grad_norm": 1.9643059968948364, + "learning_rate": 2e-05, + "loss": 0.04919844, + "step": 5233 + }, + { + "epoch": 10.468, + "grad_norm": 3.1205170154571533, + "learning_rate": 2e-05, + "loss": 0.06371839, + "step": 5234 + }, + { + "epoch": 10.47, + "grad_norm": 13.818527221679688, + "learning_rate": 2e-05, + "loss": 0.0568444, + "step": 5235 + }, + { + "epoch": 10.472, + "grad_norm": 5.399960517883301, + "learning_rate": 2e-05, + "loss": 0.04257676, + "step": 5236 + }, + { + "epoch": 10.474, + "grad_norm": 1.6575089693069458, + "learning_rate": 2e-05, + "loss": 0.05032737, + "step": 5237 + }, + { + "epoch": 10.475999999999999, + "grad_norm": 1.2673888206481934, + "learning_rate": 2e-05, + "loss": 0.04592997, + "step": 5238 + }, + { + "epoch": 10.478, + "grad_norm": 1.8720520734786987, + "learning_rate": 2e-05, + "loss": 0.05344952, + "step": 5239 + }, + { + "epoch": 10.48, + "grad_norm": 5.716723918914795, + "learning_rate": 2e-05, + "loss": 0.05336998, + "step": 5240 + }, + { + "epoch": 10.482, + "grad_norm": 1.8104397058486938, + "learning_rate": 2e-05, + 
"loss": 0.05812651, + "step": 5241 + }, + { + "epoch": 10.484, + "grad_norm": 1.6072144508361816, + "learning_rate": 2e-05, + "loss": 0.05022549, + "step": 5242 + }, + { + "epoch": 10.486, + "grad_norm": 1.7834798097610474, + "learning_rate": 2e-05, + "loss": 0.0548949, + "step": 5243 + }, + { + "epoch": 10.488, + "grad_norm": 1.4747883081436157, + "learning_rate": 2e-05, + "loss": 0.07916712, + "step": 5244 + }, + { + "epoch": 10.49, + "grad_norm": 1.4801729917526245, + "learning_rate": 2e-05, + "loss": 0.0434589, + "step": 5245 + }, + { + "epoch": 10.492, + "grad_norm": 1.3738034963607788, + "learning_rate": 2e-05, + "loss": 0.04868829, + "step": 5246 + }, + { + "epoch": 10.494, + "grad_norm": 1.1676543951034546, + "learning_rate": 2e-05, + "loss": 0.03874259, + "step": 5247 + }, + { + "epoch": 10.496, + "grad_norm": 1.4973816871643066, + "learning_rate": 2e-05, + "loss": 0.04599489, + "step": 5248 + }, + { + "epoch": 10.498, + "grad_norm": 1.5680755376815796, + "learning_rate": 2e-05, + "loss": 0.04904544, + "step": 5249 + }, + { + "epoch": 10.5, + "grad_norm": 1.9003678560256958, + "learning_rate": 2e-05, + "loss": 0.07093397, + "step": 5250 + }, + { + "epoch": 10.502, + "grad_norm": 1.6360976696014404, + "learning_rate": 2e-05, + "loss": 0.04844421, + "step": 5251 + }, + { + "epoch": 10.504, + "grad_norm": 1.8371424674987793, + "learning_rate": 2e-05, + "loss": 0.0682577, + "step": 5252 + }, + { + "epoch": 10.506, + "grad_norm": 2.010197401046753, + "learning_rate": 2e-05, + "loss": 0.04919277, + "step": 5253 + }, + { + "epoch": 10.508, + "grad_norm": 1.8417555093765259, + "learning_rate": 2e-05, + "loss": 0.04783285, + "step": 5254 + }, + { + "epoch": 10.51, + "grad_norm": 1.6858887672424316, + "learning_rate": 2e-05, + "loss": 0.05456799, + "step": 5255 + }, + { + "epoch": 10.512, + "grad_norm": 3.253695011138916, + "learning_rate": 2e-05, + "loss": 0.03823632, + "step": 5256 + }, + { + "epoch": 10.514, + "grad_norm": 1.8431847095489502, + "learning_rate": 2e-05, + "loss": 0.06339058, + "step": 5257 + }, + { + "epoch": 10.516, + "grad_norm": 1.3688884973526, + "learning_rate": 2e-05, + "loss": 0.06101316, + "step": 5258 + }, + { + "epoch": 10.518, + "grad_norm": 0.9589589834213257, + "learning_rate": 2e-05, + "loss": 0.03332622, + "step": 5259 + }, + { + "epoch": 10.52, + "grad_norm": 1.5747522115707397, + "learning_rate": 2e-05, + "loss": 0.05914874, + "step": 5260 + }, + { + "epoch": 10.522, + "grad_norm": 1.884902834892273, + "learning_rate": 2e-05, + "loss": 0.07614301, + "step": 5261 + }, + { + "epoch": 10.524000000000001, + "grad_norm": 1.4136667251586914, + "learning_rate": 2e-05, + "loss": 0.05749642, + "step": 5262 + }, + { + "epoch": 10.526, + "grad_norm": 1.4973257780075073, + "learning_rate": 2e-05, + "loss": 0.05710443, + "step": 5263 + }, + { + "epoch": 10.528, + "grad_norm": 2.1103341579437256, + "learning_rate": 2e-05, + "loss": 0.06917971, + "step": 5264 + }, + { + "epoch": 10.53, + "grad_norm": 1.6362884044647217, + "learning_rate": 2e-05, + "loss": 0.04992104, + "step": 5265 + }, + { + "epoch": 10.532, + "grad_norm": 2.8185596466064453, + "learning_rate": 2e-05, + "loss": 0.05004818, + "step": 5266 + }, + { + "epoch": 10.534, + "grad_norm": 1.6403809785842896, + "learning_rate": 2e-05, + "loss": 0.05555485, + "step": 5267 + }, + { + "epoch": 10.536, + "grad_norm": 2.137666702270508, + "learning_rate": 2e-05, + "loss": 0.05246295, + "step": 5268 + }, + { + "epoch": 10.538, + "grad_norm": 1.6086887121200562, + "learning_rate": 2e-05, + "loss": 0.04962714, + "step": 
5269 + }, + { + "epoch": 10.54, + "grad_norm": 1.4475809335708618, + "learning_rate": 2e-05, + "loss": 0.0453802, + "step": 5270 + }, + { + "epoch": 10.542, + "grad_norm": 1.6052064895629883, + "learning_rate": 2e-05, + "loss": 0.04843373, + "step": 5271 + }, + { + "epoch": 10.544, + "grad_norm": 1.8348846435546875, + "learning_rate": 2e-05, + "loss": 0.05354032, + "step": 5272 + }, + { + "epoch": 10.546, + "grad_norm": 1.5598926544189453, + "learning_rate": 2e-05, + "loss": 0.04522219, + "step": 5273 + }, + { + "epoch": 10.548, + "grad_norm": 1.7176995277404785, + "learning_rate": 2e-05, + "loss": 0.06097797, + "step": 5274 + }, + { + "epoch": 10.55, + "grad_norm": 1.6066391468048096, + "learning_rate": 2e-05, + "loss": 0.04459933, + "step": 5275 + }, + { + "epoch": 10.552, + "grad_norm": 1.2835681438446045, + "learning_rate": 2e-05, + "loss": 0.04766644, + "step": 5276 + }, + { + "epoch": 10.554, + "grad_norm": 1.5211418867111206, + "learning_rate": 2e-05, + "loss": 0.05268958, + "step": 5277 + }, + { + "epoch": 10.556000000000001, + "grad_norm": 1.3323153257369995, + "learning_rate": 2e-05, + "loss": 0.04630678, + "step": 5278 + }, + { + "epoch": 10.558, + "grad_norm": 2.196124315261841, + "learning_rate": 2e-05, + "loss": 0.04796411, + "step": 5279 + }, + { + "epoch": 10.56, + "grad_norm": 6.703586101531982, + "learning_rate": 2e-05, + "loss": 0.08236083, + "step": 5280 + }, + { + "epoch": 10.562, + "grad_norm": 2.39101505279541, + "learning_rate": 2e-05, + "loss": 0.0583214, + "step": 5281 + }, + { + "epoch": 10.564, + "grad_norm": 1.674933671951294, + "learning_rate": 2e-05, + "loss": 0.05167703, + "step": 5282 + }, + { + "epoch": 10.566, + "grad_norm": 1.468446135520935, + "learning_rate": 2e-05, + "loss": 0.04566929, + "step": 5283 + }, + { + "epoch": 10.568, + "grad_norm": 1.4748491048812866, + "learning_rate": 2e-05, + "loss": 0.04062827, + "step": 5284 + }, + { + "epoch": 10.57, + "grad_norm": 2.5267021656036377, + "learning_rate": 2e-05, + "loss": 0.05230495, + "step": 5285 + }, + { + "epoch": 10.572, + "grad_norm": 2.1810038089752197, + "learning_rate": 2e-05, + "loss": 0.04218719, + "step": 5286 + }, + { + "epoch": 10.574, + "grad_norm": 1.3321385383605957, + "learning_rate": 2e-05, + "loss": 0.04871097, + "step": 5287 + }, + { + "epoch": 10.576, + "grad_norm": 1.7109707593917847, + "learning_rate": 2e-05, + "loss": 0.04826786, + "step": 5288 + }, + { + "epoch": 10.578, + "grad_norm": 1.9152201414108276, + "learning_rate": 2e-05, + "loss": 0.04686288, + "step": 5289 + }, + { + "epoch": 10.58, + "grad_norm": 5.031193256378174, + "learning_rate": 2e-05, + "loss": 0.04337011, + "step": 5290 + }, + { + "epoch": 10.582, + "grad_norm": 3.426795482635498, + "learning_rate": 2e-05, + "loss": 0.03990654, + "step": 5291 + }, + { + "epoch": 10.584, + "grad_norm": 2.406569719314575, + "learning_rate": 2e-05, + "loss": 0.03668533, + "step": 5292 + }, + { + "epoch": 10.586, + "grad_norm": 3.1804933547973633, + "learning_rate": 2e-05, + "loss": 0.06070712, + "step": 5293 + }, + { + "epoch": 10.588, + "grad_norm": 2.061739206314087, + "learning_rate": 2e-05, + "loss": 0.02787476, + "step": 5294 + }, + { + "epoch": 10.59, + "grad_norm": 1.6928627490997314, + "learning_rate": 2e-05, + "loss": 0.04576159, + "step": 5295 + }, + { + "epoch": 10.592, + "grad_norm": 2.2229623794555664, + "learning_rate": 2e-05, + "loss": 0.05602458, + "step": 5296 + }, + { + "epoch": 10.594, + "grad_norm": 1.289467692375183, + "learning_rate": 2e-05, + "loss": 0.05179123, + "step": 5297 + }, + { + "epoch": 10.596, 
+ "grad_norm": 1.5103563070297241, + "learning_rate": 2e-05, + "loss": 0.05836169, + "step": 5298 + }, + { + "epoch": 10.598, + "grad_norm": 1.6638998985290527, + "learning_rate": 2e-05, + "loss": 0.04362776, + "step": 5299 + }, + { + "epoch": 10.6, + "grad_norm": 1.3501155376434326, + "learning_rate": 2e-05, + "loss": 0.04407869, + "step": 5300 + }, + { + "epoch": 10.602, + "grad_norm": 1.708865761756897, + "learning_rate": 2e-05, + "loss": 0.05075856, + "step": 5301 + }, + { + "epoch": 10.604, + "grad_norm": 10.003942489624023, + "learning_rate": 2e-05, + "loss": 0.07709937, + "step": 5302 + }, + { + "epoch": 10.606, + "grad_norm": 1.8898708820343018, + "learning_rate": 2e-05, + "loss": 0.03913818, + "step": 5303 + }, + { + "epoch": 10.608, + "grad_norm": 1.95207679271698, + "learning_rate": 2e-05, + "loss": 0.05876492, + "step": 5304 + }, + { + "epoch": 10.61, + "grad_norm": 1.6811554431915283, + "learning_rate": 2e-05, + "loss": 0.03337777, + "step": 5305 + }, + { + "epoch": 10.612, + "grad_norm": 2.222196102142334, + "learning_rate": 2e-05, + "loss": 0.0575141, + "step": 5306 + }, + { + "epoch": 10.614, + "grad_norm": 2.2834959030151367, + "learning_rate": 2e-05, + "loss": 0.05859252, + "step": 5307 + }, + { + "epoch": 10.616, + "grad_norm": 2.21087646484375, + "learning_rate": 2e-05, + "loss": 0.05571397, + "step": 5308 + }, + { + "epoch": 10.618, + "grad_norm": 3.0877723693847656, + "learning_rate": 2e-05, + "loss": 0.10596378, + "step": 5309 + }, + { + "epoch": 10.62, + "grad_norm": 3.603444814682007, + "learning_rate": 2e-05, + "loss": 0.06696412, + "step": 5310 + }, + { + "epoch": 10.622, + "grad_norm": 1.8475040197372437, + "learning_rate": 2e-05, + "loss": 0.04399508, + "step": 5311 + }, + { + "epoch": 10.624, + "grad_norm": 1.4669568538665771, + "learning_rate": 2e-05, + "loss": 0.0431044, + "step": 5312 + }, + { + "epoch": 10.626, + "grad_norm": 1.3643040657043457, + "learning_rate": 2e-05, + "loss": 0.0470515, + "step": 5313 + }, + { + "epoch": 10.628, + "grad_norm": 1.2287578582763672, + "learning_rate": 2e-05, + "loss": 0.05811661, + "step": 5314 + }, + { + "epoch": 10.63, + "grad_norm": 1.0902774333953857, + "learning_rate": 2e-05, + "loss": 0.03484159, + "step": 5315 + }, + { + "epoch": 10.632, + "grad_norm": 1.6369050741195679, + "learning_rate": 2e-05, + "loss": 0.04110289, + "step": 5316 + }, + { + "epoch": 10.634, + "grad_norm": 1.275800108909607, + "learning_rate": 2e-05, + "loss": 0.05027898, + "step": 5317 + }, + { + "epoch": 10.636, + "grad_norm": 1.4583206176757812, + "learning_rate": 2e-05, + "loss": 0.03882696, + "step": 5318 + }, + { + "epoch": 10.638, + "grad_norm": 1.3193122148513794, + "learning_rate": 2e-05, + "loss": 0.03158198, + "step": 5319 + }, + { + "epoch": 10.64, + "grad_norm": 1.4229142665863037, + "learning_rate": 2e-05, + "loss": 0.04588968, + "step": 5320 + }, + { + "epoch": 10.642, + "grad_norm": 1.5470317602157593, + "learning_rate": 2e-05, + "loss": 0.04177276, + "step": 5321 + }, + { + "epoch": 10.644, + "grad_norm": 1.322806477546692, + "learning_rate": 2e-05, + "loss": 0.04808362, + "step": 5322 + }, + { + "epoch": 10.646, + "grad_norm": 1.7866078615188599, + "learning_rate": 2e-05, + "loss": 0.04570613, + "step": 5323 + }, + { + "epoch": 10.648, + "grad_norm": 1.8436026573181152, + "learning_rate": 2e-05, + "loss": 0.06046174, + "step": 5324 + }, + { + "epoch": 10.65, + "grad_norm": 1.6099350452423096, + "learning_rate": 2e-05, + "loss": 0.05201571, + "step": 5325 + }, + { + "epoch": 10.652, + "grad_norm": 1.4118268489837646, + 
"learning_rate": 2e-05, + "loss": 0.04571307, + "step": 5326 + }, + { + "epoch": 10.654, + "grad_norm": 1.7007311582565308, + "learning_rate": 2e-05, + "loss": 0.05410545, + "step": 5327 + }, + { + "epoch": 10.656, + "grad_norm": 1.3747484683990479, + "learning_rate": 2e-05, + "loss": 0.04396842, + "step": 5328 + }, + { + "epoch": 10.658, + "grad_norm": 1.5627105236053467, + "learning_rate": 2e-05, + "loss": 0.05439685, + "step": 5329 + }, + { + "epoch": 10.66, + "grad_norm": 1.6469447612762451, + "learning_rate": 2e-05, + "loss": 0.05709826, + "step": 5330 + }, + { + "epoch": 10.662, + "grad_norm": 3.3534231185913086, + "learning_rate": 2e-05, + "loss": 0.05348397, + "step": 5331 + }, + { + "epoch": 10.664, + "grad_norm": 1.3663334846496582, + "learning_rate": 2e-05, + "loss": 0.04078261, + "step": 5332 + }, + { + "epoch": 10.666, + "grad_norm": 1.6048120260238647, + "learning_rate": 2e-05, + "loss": 0.05477417, + "step": 5333 + }, + { + "epoch": 10.668, + "grad_norm": 1.6299818754196167, + "learning_rate": 2e-05, + "loss": 0.05402359, + "step": 5334 + }, + { + "epoch": 10.67, + "grad_norm": 1.9940632581710815, + "learning_rate": 2e-05, + "loss": 0.05533578, + "step": 5335 + }, + { + "epoch": 10.672, + "grad_norm": 1.1183878183364868, + "learning_rate": 2e-05, + "loss": 0.03753264, + "step": 5336 + }, + { + "epoch": 10.674, + "grad_norm": 1.5522422790527344, + "learning_rate": 2e-05, + "loss": 0.05828452, + "step": 5337 + }, + { + "epoch": 10.676, + "grad_norm": 1.3103322982788086, + "learning_rate": 2e-05, + "loss": 0.04702647, + "step": 5338 + }, + { + "epoch": 10.678, + "grad_norm": 1.2099779844284058, + "learning_rate": 2e-05, + "loss": 0.04113594, + "step": 5339 + }, + { + "epoch": 10.68, + "grad_norm": 3.985682964324951, + "learning_rate": 2e-05, + "loss": 0.07098438, + "step": 5340 + }, + { + "epoch": 10.682, + "grad_norm": 1.8771806955337524, + "learning_rate": 2e-05, + "loss": 0.04530764, + "step": 5341 + }, + { + "epoch": 10.684, + "grad_norm": 2.1567442417144775, + "learning_rate": 2e-05, + "loss": 0.04229917, + "step": 5342 + }, + { + "epoch": 10.686, + "grad_norm": 1.6238269805908203, + "learning_rate": 2e-05, + "loss": 0.04225535, + "step": 5343 + }, + { + "epoch": 10.688, + "grad_norm": 1.869265079498291, + "learning_rate": 2e-05, + "loss": 0.05382503, + "step": 5344 + }, + { + "epoch": 10.69, + "grad_norm": 1.8355045318603516, + "learning_rate": 2e-05, + "loss": 0.0423015, + "step": 5345 + }, + { + "epoch": 10.692, + "grad_norm": 1.3736673593521118, + "learning_rate": 2e-05, + "loss": 0.04386007, + "step": 5346 + }, + { + "epoch": 10.693999999999999, + "grad_norm": 2.0322253704071045, + "learning_rate": 2e-05, + "loss": 0.04041972, + "step": 5347 + }, + { + "epoch": 10.696, + "grad_norm": 2.336951732635498, + "learning_rate": 2e-05, + "loss": 0.05086974, + "step": 5348 + }, + { + "epoch": 10.698, + "grad_norm": 2.0457851886749268, + "learning_rate": 2e-05, + "loss": 0.05006742, + "step": 5349 + }, + { + "epoch": 10.7, + "grad_norm": 1.992970585823059, + "learning_rate": 2e-05, + "loss": 0.06358136, + "step": 5350 + }, + { + "epoch": 10.702, + "grad_norm": 1.2050204277038574, + "learning_rate": 2e-05, + "loss": 0.04032322, + "step": 5351 + }, + { + "epoch": 10.704, + "grad_norm": 1.4412949085235596, + "learning_rate": 2e-05, + "loss": 0.04633742, + "step": 5352 + }, + { + "epoch": 10.706, + "grad_norm": 1.5254440307617188, + "learning_rate": 2e-05, + "loss": 0.05168087, + "step": 5353 + }, + { + "epoch": 10.708, + "grad_norm": 1.4087555408477783, + "learning_rate": 2e-05, + 
"loss": 0.05477347, + "step": 5354 + }, + { + "epoch": 10.71, + "grad_norm": 1.431321382522583, + "learning_rate": 2e-05, + "loss": 0.03768836, + "step": 5355 + }, + { + "epoch": 10.712, + "grad_norm": 1.1506340503692627, + "learning_rate": 2e-05, + "loss": 0.04004525, + "step": 5356 + }, + { + "epoch": 10.714, + "grad_norm": 2.427537441253662, + "learning_rate": 2e-05, + "loss": 0.06314078, + "step": 5357 + }, + { + "epoch": 10.716, + "grad_norm": 1.5225038528442383, + "learning_rate": 2e-05, + "loss": 0.05141336, + "step": 5358 + }, + { + "epoch": 10.718, + "grad_norm": 3.697816848754883, + "learning_rate": 2e-05, + "loss": 0.05468082, + "step": 5359 + }, + { + "epoch": 10.72, + "grad_norm": 1.5800566673278809, + "learning_rate": 2e-05, + "loss": 0.058851, + "step": 5360 + }, + { + "epoch": 10.722, + "grad_norm": 0.8254619240760803, + "learning_rate": 2e-05, + "loss": 0.02738049, + "step": 5361 + }, + { + "epoch": 10.724, + "grad_norm": 1.369780421257019, + "learning_rate": 2e-05, + "loss": 0.04381582, + "step": 5362 + }, + { + "epoch": 10.725999999999999, + "grad_norm": 1.2137796878814697, + "learning_rate": 2e-05, + "loss": 0.0480298, + "step": 5363 + }, + { + "epoch": 10.728, + "grad_norm": 1.4857935905456543, + "learning_rate": 2e-05, + "loss": 0.03931804, + "step": 5364 + }, + { + "epoch": 10.73, + "grad_norm": 1.4152106046676636, + "learning_rate": 2e-05, + "loss": 0.03949886, + "step": 5365 + }, + { + "epoch": 10.732, + "grad_norm": 1.3397774696350098, + "learning_rate": 2e-05, + "loss": 0.05219603, + "step": 5366 + }, + { + "epoch": 10.734, + "grad_norm": 1.2872146368026733, + "learning_rate": 2e-05, + "loss": 0.042691, + "step": 5367 + }, + { + "epoch": 10.736, + "grad_norm": 2.1003661155700684, + "learning_rate": 2e-05, + "loss": 0.05165257, + "step": 5368 + }, + { + "epoch": 10.738, + "grad_norm": 2.007760524749756, + "learning_rate": 2e-05, + "loss": 0.06440516, + "step": 5369 + }, + { + "epoch": 10.74, + "grad_norm": 1.6404235363006592, + "learning_rate": 2e-05, + "loss": 0.05111123, + "step": 5370 + }, + { + "epoch": 10.742, + "grad_norm": 2.8288562297821045, + "learning_rate": 2e-05, + "loss": 0.05282591, + "step": 5371 + }, + { + "epoch": 10.744, + "grad_norm": 2.6334850788116455, + "learning_rate": 2e-05, + "loss": 0.06181185, + "step": 5372 + }, + { + "epoch": 10.746, + "grad_norm": 1.4054573774337769, + "learning_rate": 2e-05, + "loss": 0.05052005, + "step": 5373 + }, + { + "epoch": 10.748, + "grad_norm": 1.4174977540969849, + "learning_rate": 2e-05, + "loss": 0.04603645, + "step": 5374 + }, + { + "epoch": 10.75, + "grad_norm": 1.3553341627120972, + "learning_rate": 2e-05, + "loss": 0.04983129, + "step": 5375 + }, + { + "epoch": 10.752, + "grad_norm": 2.03845477104187, + "learning_rate": 2e-05, + "loss": 0.06038283, + "step": 5376 + }, + { + "epoch": 10.754, + "grad_norm": 2.294628858566284, + "learning_rate": 2e-05, + "loss": 0.04681225, + "step": 5377 + }, + { + "epoch": 10.756, + "grad_norm": 1.303605079650879, + "learning_rate": 2e-05, + "loss": 0.04528525, + "step": 5378 + }, + { + "epoch": 10.758, + "grad_norm": 1.6137866973876953, + "learning_rate": 2e-05, + "loss": 0.0557767, + "step": 5379 + }, + { + "epoch": 10.76, + "grad_norm": 1.5829466581344604, + "learning_rate": 2e-05, + "loss": 0.04809469, + "step": 5380 + }, + { + "epoch": 10.762, + "grad_norm": 1.2748074531555176, + "learning_rate": 2e-05, + "loss": 0.04331703, + "step": 5381 + }, + { + "epoch": 10.764, + "grad_norm": 1.5433510541915894, + "learning_rate": 2e-05, + "loss": 0.06725293, + "step": 5382 
+ }, + { + "epoch": 10.766, + "grad_norm": 1.6846797466278076, + "learning_rate": 2e-05, + "loss": 0.04823402, + "step": 5383 + }, + { + "epoch": 10.768, + "grad_norm": 2.141554355621338, + "learning_rate": 2e-05, + "loss": 0.06744844, + "step": 5384 + }, + { + "epoch": 10.77, + "grad_norm": 1.4289065599441528, + "learning_rate": 2e-05, + "loss": 0.05341191, + "step": 5385 + }, + { + "epoch": 10.772, + "grad_norm": 1.3509269952774048, + "learning_rate": 2e-05, + "loss": 0.05020216, + "step": 5386 + }, + { + "epoch": 10.774000000000001, + "grad_norm": 2.071425676345825, + "learning_rate": 2e-05, + "loss": 0.06101374, + "step": 5387 + }, + { + "epoch": 10.776, + "grad_norm": 1.1163301467895508, + "learning_rate": 2e-05, + "loss": 0.04933468, + "step": 5388 + }, + { + "epoch": 10.778, + "grad_norm": 1.2288962602615356, + "learning_rate": 2e-05, + "loss": 0.04980216, + "step": 5389 + }, + { + "epoch": 10.78, + "grad_norm": 1.9011704921722412, + "learning_rate": 2e-05, + "loss": 0.05300172, + "step": 5390 + }, + { + "epoch": 10.782, + "grad_norm": 1.5152117013931274, + "learning_rate": 2e-05, + "loss": 0.04199405, + "step": 5391 + }, + { + "epoch": 10.784, + "grad_norm": 1.2738306522369385, + "learning_rate": 2e-05, + "loss": 0.04918165, + "step": 5392 + }, + { + "epoch": 10.786, + "grad_norm": 1.598197102546692, + "learning_rate": 2e-05, + "loss": 0.05998203, + "step": 5393 + }, + { + "epoch": 10.788, + "grad_norm": 1.4320733547210693, + "learning_rate": 2e-05, + "loss": 0.04948084, + "step": 5394 + }, + { + "epoch": 10.79, + "grad_norm": 1.6892844438552856, + "learning_rate": 2e-05, + "loss": 0.06663223, + "step": 5395 + }, + { + "epoch": 10.792, + "grad_norm": 1.3951140642166138, + "learning_rate": 2e-05, + "loss": 0.05481715, + "step": 5396 + }, + { + "epoch": 10.794, + "grad_norm": 1.895914077758789, + "learning_rate": 2e-05, + "loss": 0.050419, + "step": 5397 + }, + { + "epoch": 10.796, + "grad_norm": 1.6895439624786377, + "learning_rate": 2e-05, + "loss": 0.05903822, + "step": 5398 + }, + { + "epoch": 10.798, + "grad_norm": 1.3120181560516357, + "learning_rate": 2e-05, + "loss": 0.03543907, + "step": 5399 + }, + { + "epoch": 10.8, + "grad_norm": 1.598132610321045, + "learning_rate": 2e-05, + "loss": 0.03839812, + "step": 5400 + }, + { + "epoch": 10.802, + "grad_norm": 1.5897235870361328, + "learning_rate": 2e-05, + "loss": 0.04522756, + "step": 5401 + }, + { + "epoch": 10.804, + "grad_norm": 1.8060530424118042, + "learning_rate": 2e-05, + "loss": 0.06058746, + "step": 5402 + }, + { + "epoch": 10.806000000000001, + "grad_norm": 1.4628931283950806, + "learning_rate": 2e-05, + "loss": 0.03918256, + "step": 5403 + }, + { + "epoch": 10.808, + "grad_norm": 1.6146814823150635, + "learning_rate": 2e-05, + "loss": 0.05647987, + "step": 5404 + }, + { + "epoch": 10.81, + "grad_norm": 2.082378387451172, + "learning_rate": 2e-05, + "loss": 0.04440005, + "step": 5405 + }, + { + "epoch": 10.812, + "grad_norm": 2.013496160507202, + "learning_rate": 2e-05, + "loss": 0.06099815, + "step": 5406 + }, + { + "epoch": 10.814, + "grad_norm": 2.267378807067871, + "learning_rate": 2e-05, + "loss": 0.06538484, + "step": 5407 + }, + { + "epoch": 10.816, + "grad_norm": 1.9403990507125854, + "learning_rate": 2e-05, + "loss": 0.05472123, + "step": 5408 + }, + { + "epoch": 10.818, + "grad_norm": 1.1750283241271973, + "learning_rate": 2e-05, + "loss": 0.03978173, + "step": 5409 + }, + { + "epoch": 10.82, + "grad_norm": 1.5307939052581787, + "learning_rate": 2e-05, + "loss": 0.06439765, + "step": 5410 + }, + { + "epoch": 
10.822, + "grad_norm": 1.0061407089233398, + "learning_rate": 2e-05, + "loss": 0.03479624, + "step": 5411 + }, + { + "epoch": 10.824, + "grad_norm": 1.3248248100280762, + "learning_rate": 2e-05, + "loss": 0.04627767, + "step": 5412 + }, + { + "epoch": 10.826, + "grad_norm": 2.050506114959717, + "learning_rate": 2e-05, + "loss": 0.03434724, + "step": 5413 + }, + { + "epoch": 10.828, + "grad_norm": 1.4149460792541504, + "learning_rate": 2e-05, + "loss": 0.05991215, + "step": 5414 + }, + { + "epoch": 10.83, + "grad_norm": 1.5017156600952148, + "learning_rate": 2e-05, + "loss": 0.05083961, + "step": 5415 + }, + { + "epoch": 10.832, + "grad_norm": 1.0457175970077515, + "learning_rate": 2e-05, + "loss": 0.03952884, + "step": 5416 + }, + { + "epoch": 10.834, + "grad_norm": 1.5834999084472656, + "learning_rate": 2e-05, + "loss": 0.05933148, + "step": 5417 + }, + { + "epoch": 10.836, + "grad_norm": 1.1834666728973389, + "learning_rate": 2e-05, + "loss": 0.03205921, + "step": 5418 + }, + { + "epoch": 10.838, + "grad_norm": 1.2450237274169922, + "learning_rate": 2e-05, + "loss": 0.04426446, + "step": 5419 + }, + { + "epoch": 10.84, + "grad_norm": 1.2155392169952393, + "learning_rate": 2e-05, + "loss": 0.03819549, + "step": 5420 + }, + { + "epoch": 10.842, + "grad_norm": 1.4611936807632446, + "learning_rate": 2e-05, + "loss": 0.04256415, + "step": 5421 + }, + { + "epoch": 10.844, + "grad_norm": 0.9903550744056702, + "learning_rate": 2e-05, + "loss": 0.03737554, + "step": 5422 + }, + { + "epoch": 10.846, + "grad_norm": 1.4878544807434082, + "learning_rate": 2e-05, + "loss": 0.04571939, + "step": 5423 + }, + { + "epoch": 10.848, + "grad_norm": 3.3160300254821777, + "learning_rate": 2e-05, + "loss": 0.06830984, + "step": 5424 + }, + { + "epoch": 10.85, + "grad_norm": 1.3133734464645386, + "learning_rate": 2e-05, + "loss": 0.05649231, + "step": 5425 + }, + { + "epoch": 10.852, + "grad_norm": 1.1060919761657715, + "learning_rate": 2e-05, + "loss": 0.03377821, + "step": 5426 + }, + { + "epoch": 10.854, + "grad_norm": 1.686246395111084, + "learning_rate": 2e-05, + "loss": 0.04799533, + "step": 5427 + }, + { + "epoch": 10.856, + "grad_norm": 2.3602640628814697, + "learning_rate": 2e-05, + "loss": 0.03780733, + "step": 5428 + }, + { + "epoch": 10.858, + "grad_norm": 1.5096981525421143, + "learning_rate": 2e-05, + "loss": 0.04364054, + "step": 5429 + }, + { + "epoch": 10.86, + "grad_norm": 3.4624457359313965, + "learning_rate": 2e-05, + "loss": 0.07765651, + "step": 5430 + }, + { + "epoch": 10.862, + "grad_norm": 1.9215983152389526, + "learning_rate": 2e-05, + "loss": 0.0564341, + "step": 5431 + }, + { + "epoch": 10.864, + "grad_norm": 2.2769806385040283, + "learning_rate": 2e-05, + "loss": 0.0673317, + "step": 5432 + }, + { + "epoch": 10.866, + "grad_norm": 1.4568067789077759, + "learning_rate": 2e-05, + "loss": 0.04599699, + "step": 5433 + }, + { + "epoch": 10.868, + "grad_norm": 1.3727821111679077, + "learning_rate": 2e-05, + "loss": 0.05360325, + "step": 5434 + }, + { + "epoch": 10.87, + "grad_norm": 2.1646981239318848, + "learning_rate": 2e-05, + "loss": 0.05432039, + "step": 5435 + }, + { + "epoch": 10.872, + "grad_norm": 1.466217041015625, + "learning_rate": 2e-05, + "loss": 0.05962711, + "step": 5436 + }, + { + "epoch": 10.874, + "grad_norm": 1.4684398174285889, + "learning_rate": 2e-05, + "loss": 0.03964592, + "step": 5437 + }, + { + "epoch": 10.876, + "grad_norm": 1.3081820011138916, + "learning_rate": 2e-05, + "loss": 0.03898221, + "step": 5438 + }, + { + "epoch": 10.878, + "grad_norm": 
1.5860309600830078, + "learning_rate": 2e-05, + "loss": 0.03976344, + "step": 5439 + }, + { + "epoch": 10.88, + "grad_norm": 1.7701555490493774, + "learning_rate": 2e-05, + "loss": 0.04799116, + "step": 5440 + }, + { + "epoch": 10.882, + "grad_norm": 2.2064380645751953, + "learning_rate": 2e-05, + "loss": 0.0660604, + "step": 5441 + }, + { + "epoch": 10.884, + "grad_norm": 1.732148289680481, + "learning_rate": 2e-05, + "loss": 0.05492194, + "step": 5442 + }, + { + "epoch": 10.886, + "grad_norm": 1.9355525970458984, + "learning_rate": 2e-05, + "loss": 0.04734231, + "step": 5443 + }, + { + "epoch": 10.888, + "grad_norm": 1.2587823867797852, + "learning_rate": 2e-05, + "loss": 0.03634328, + "step": 5444 + }, + { + "epoch": 10.89, + "grad_norm": 1.5590167045593262, + "learning_rate": 2e-05, + "loss": 0.06726646, + "step": 5445 + }, + { + "epoch": 10.892, + "grad_norm": 1.3101967573165894, + "learning_rate": 2e-05, + "loss": 0.04764952, + "step": 5446 + }, + { + "epoch": 10.894, + "grad_norm": 2.4456593990325928, + "learning_rate": 2e-05, + "loss": 0.05624555, + "step": 5447 + }, + { + "epoch": 10.896, + "grad_norm": 2.824155330657959, + "learning_rate": 2e-05, + "loss": 0.05465455, + "step": 5448 + }, + { + "epoch": 10.898, + "grad_norm": 1.4769179821014404, + "learning_rate": 2e-05, + "loss": 0.05461916, + "step": 5449 + }, + { + "epoch": 10.9, + "grad_norm": 1.4648699760437012, + "learning_rate": 2e-05, + "loss": 0.05058068, + "step": 5450 + }, + { + "epoch": 10.902, + "grad_norm": 1.2384024858474731, + "learning_rate": 2e-05, + "loss": 0.04764497, + "step": 5451 + }, + { + "epoch": 10.904, + "grad_norm": 1.6182745695114136, + "learning_rate": 2e-05, + "loss": 0.05357751, + "step": 5452 + }, + { + "epoch": 10.906, + "grad_norm": 1.5405651330947876, + "learning_rate": 2e-05, + "loss": 0.0363545, + "step": 5453 + }, + { + "epoch": 10.908, + "grad_norm": 1.2049387693405151, + "learning_rate": 2e-05, + "loss": 0.0472638, + "step": 5454 + }, + { + "epoch": 10.91, + "grad_norm": 2.5308351516723633, + "learning_rate": 2e-05, + "loss": 0.04407152, + "step": 5455 + }, + { + "epoch": 10.912, + "grad_norm": 1.3933305740356445, + "learning_rate": 2e-05, + "loss": 0.04460661, + "step": 5456 + }, + { + "epoch": 10.914, + "grad_norm": 1.4004764556884766, + "learning_rate": 2e-05, + "loss": 0.04893076, + "step": 5457 + }, + { + "epoch": 10.916, + "grad_norm": 1.8205981254577637, + "learning_rate": 2e-05, + "loss": 0.04703292, + "step": 5458 + }, + { + "epoch": 10.918, + "grad_norm": 1.8013707399368286, + "learning_rate": 2e-05, + "loss": 0.055125, + "step": 5459 + }, + { + "epoch": 10.92, + "grad_norm": 1.451092004776001, + "learning_rate": 2e-05, + "loss": 0.04543486, + "step": 5460 + }, + { + "epoch": 10.922, + "grad_norm": 1.1553832292556763, + "learning_rate": 2e-05, + "loss": 0.04265869, + "step": 5461 + }, + { + "epoch": 10.924, + "grad_norm": 1.960945725440979, + "learning_rate": 2e-05, + "loss": 0.05424058, + "step": 5462 + }, + { + "epoch": 10.926, + "grad_norm": 1.4518814086914062, + "learning_rate": 2e-05, + "loss": 0.04993463, + "step": 5463 + }, + { + "epoch": 10.928, + "grad_norm": 2.5878188610076904, + "learning_rate": 2e-05, + "loss": 0.04935042, + "step": 5464 + }, + { + "epoch": 10.93, + "grad_norm": 1.3575770854949951, + "learning_rate": 2e-05, + "loss": 0.05218004, + "step": 5465 + }, + { + "epoch": 10.932, + "grad_norm": 1.4629638195037842, + "learning_rate": 2e-05, + "loss": 0.05007394, + "step": 5466 + }, + { + "epoch": 10.934, + "grad_norm": 1.2704107761383057, + "learning_rate": 
2e-05, + "loss": 0.04607791, + "step": 5467 + }, + { + "epoch": 10.936, + "grad_norm": 1.5008957386016846, + "learning_rate": 2e-05, + "loss": 0.04755886, + "step": 5468 + }, + { + "epoch": 10.938, + "grad_norm": 1.6490647792816162, + "learning_rate": 2e-05, + "loss": 0.04356069, + "step": 5469 + }, + { + "epoch": 10.94, + "grad_norm": 1.9687342643737793, + "learning_rate": 2e-05, + "loss": 0.04906989, + "step": 5470 + }, + { + "epoch": 10.942, + "grad_norm": 2.3474905490875244, + "learning_rate": 2e-05, + "loss": 0.04697709, + "step": 5471 + }, + { + "epoch": 10.943999999999999, + "grad_norm": 1.7721339464187622, + "learning_rate": 2e-05, + "loss": 0.06682569, + "step": 5472 + }, + { + "epoch": 10.946, + "grad_norm": 1.4311152696609497, + "learning_rate": 2e-05, + "loss": 0.04959798, + "step": 5473 + }, + { + "epoch": 10.948, + "grad_norm": 1.3908685445785522, + "learning_rate": 2e-05, + "loss": 0.04007864, + "step": 5474 + }, + { + "epoch": 10.95, + "grad_norm": 1.1253409385681152, + "learning_rate": 2e-05, + "loss": 0.0392089, + "step": 5475 + }, + { + "epoch": 10.952, + "grad_norm": 1.5628206729888916, + "learning_rate": 2e-05, + "loss": 0.03705291, + "step": 5476 + }, + { + "epoch": 10.954, + "grad_norm": 2.015963077545166, + "learning_rate": 2e-05, + "loss": 0.05837575, + "step": 5477 + }, + { + "epoch": 10.956, + "grad_norm": 1.1806350946426392, + "learning_rate": 2e-05, + "loss": 0.04036895, + "step": 5478 + }, + { + "epoch": 10.958, + "grad_norm": 1.8110302686691284, + "learning_rate": 2e-05, + "loss": 0.05655763, + "step": 5479 + }, + { + "epoch": 10.96, + "grad_norm": 1.6625659465789795, + "learning_rate": 2e-05, + "loss": 0.04185361, + "step": 5480 + }, + { + "epoch": 10.962, + "grad_norm": 1.8857502937316895, + "learning_rate": 2e-05, + "loss": 0.05384925, + "step": 5481 + }, + { + "epoch": 10.964, + "grad_norm": 1.3210104703903198, + "learning_rate": 2e-05, + "loss": 0.0597494, + "step": 5482 + }, + { + "epoch": 10.966, + "grad_norm": 1.587018609046936, + "learning_rate": 2e-05, + "loss": 0.05472574, + "step": 5483 + }, + { + "epoch": 10.968, + "grad_norm": 1.3318067789077759, + "learning_rate": 2e-05, + "loss": 0.0541456, + "step": 5484 + }, + { + "epoch": 10.97, + "grad_norm": 1.4794989824295044, + "learning_rate": 2e-05, + "loss": 0.05960969, + "step": 5485 + }, + { + "epoch": 10.972, + "grad_norm": 3.501803398132324, + "learning_rate": 2e-05, + "loss": 0.0588432, + "step": 5486 + }, + { + "epoch": 10.974, + "grad_norm": 1.6838804483413696, + "learning_rate": 2e-05, + "loss": 0.05511768, + "step": 5487 + }, + { + "epoch": 10.975999999999999, + "grad_norm": 1.2759146690368652, + "learning_rate": 2e-05, + "loss": 0.02767239, + "step": 5488 + }, + { + "epoch": 10.978, + "grad_norm": 1.4586148262023926, + "learning_rate": 2e-05, + "loss": 0.05381429, + "step": 5489 + }, + { + "epoch": 10.98, + "grad_norm": 1.5755432844161987, + "learning_rate": 2e-05, + "loss": 0.03358914, + "step": 5490 + }, + { + "epoch": 10.982, + "grad_norm": 1.175610899925232, + "learning_rate": 2e-05, + "loss": 0.04267032, + "step": 5491 + }, + { + "epoch": 10.984, + "grad_norm": 1.1766899824142456, + "learning_rate": 2e-05, + "loss": 0.03875332, + "step": 5492 + }, + { + "epoch": 10.986, + "grad_norm": 2.118785858154297, + "learning_rate": 2e-05, + "loss": 0.05230087, + "step": 5493 + }, + { + "epoch": 10.988, + "grad_norm": 1.6943100690841675, + "learning_rate": 2e-05, + "loss": 0.05274594, + "step": 5494 + }, + { + "epoch": 10.99, + "grad_norm": 1.087948203086853, + "learning_rate": 2e-05, + "loss": 
0.03773314, + "step": 5495 + }, + { + "epoch": 10.992, + "grad_norm": 1.6124333143234253, + "learning_rate": 2e-05, + "loss": 0.04908092, + "step": 5496 + }, + { + "epoch": 10.994, + "grad_norm": 1.851080298423767, + "learning_rate": 2e-05, + "loss": 0.05457485, + "step": 5497 + }, + { + "epoch": 10.996, + "grad_norm": 1.4189625978469849, + "learning_rate": 2e-05, + "loss": 0.04965939, + "step": 5498 + }, + { + "epoch": 10.998, + "grad_norm": 1.9393243789672852, + "learning_rate": 2e-05, + "loss": 0.05298079, + "step": 5499 + }, + { + "epoch": 11.0, + "grad_norm": 1.709896445274353, + "learning_rate": 2e-05, + "loss": 0.05041164, + "step": 5500 + }, + { + "epoch": 11.0, + "eval_performance": { + "AngleClassification_1": 0.998, + "AngleClassification_2": 0.996, + "AngleClassification_3": 0.9600798403193613, + "Equal_1": 0.994, + "Equal_2": 0.9660678642714571, + "Equal_3": 0.874251497005988, + "LineComparison_1": 0.996, + "LineComparison_2": 1.0, + "LineComparison_3": 0.9740518962075848, + "Parallel_1": 0.9879759519038076, + "Parallel_2": 0.9939879759519038, + "Parallel_3": 0.986, + "Perpendicular_1": 0.988, + "Perpendicular_2": 0.948, + "Perpendicular_3": 0.6513026052104208, + "PointLiesOnCircle_1": 0.9979959919839679, + "PointLiesOnCircle_2": 0.9956666666666667, + "PointLiesOnCircle_3": 0.9892666666666666, + "PointLiesOnLine_1": 0.9959919839679359, + "PointLiesOnLine_2": 0.9859719438877755, + "PointLiesOnLine_3": 0.9740518962075848 + }, + "eval_runtime": 319.8654, + "eval_samples_per_second": 32.826, + "eval_steps_per_second": 0.657, + "step": 5500 + }, + { + "epoch": 11.002, + "grad_norm": 1.7501477003097534, + "learning_rate": 2e-05, + "loss": 0.04067567, + "step": 5501 + }, + { + "epoch": 11.004, + "grad_norm": 2.942864179611206, + "learning_rate": 2e-05, + "loss": 0.05952626, + "step": 5502 + }, + { + "epoch": 11.006, + "grad_norm": 1.5498440265655518, + "learning_rate": 2e-05, + "loss": 0.03488794, + "step": 5503 + }, + { + "epoch": 11.008, + "grad_norm": 1.4755291938781738, + "learning_rate": 2e-05, + "loss": 0.05729527, + "step": 5504 + }, + { + "epoch": 11.01, + "grad_norm": 1.4485598802566528, + "learning_rate": 2e-05, + "loss": 0.05340921, + "step": 5505 + }, + { + "epoch": 11.012, + "grad_norm": 1.3793925046920776, + "learning_rate": 2e-05, + "loss": 0.04637571, + "step": 5506 + }, + { + "epoch": 11.014, + "grad_norm": 1.4103147983551025, + "learning_rate": 2e-05, + "loss": 0.04511734, + "step": 5507 + }, + { + "epoch": 11.016, + "grad_norm": 1.3534992933273315, + "learning_rate": 2e-05, + "loss": 0.04359635, + "step": 5508 + }, + { + "epoch": 11.018, + "grad_norm": 1.5268181562423706, + "learning_rate": 2e-05, + "loss": 0.07277807, + "step": 5509 + }, + { + "epoch": 11.02, + "grad_norm": 1.0915262699127197, + "learning_rate": 2e-05, + "loss": 0.05696312, + "step": 5510 + }, + { + "epoch": 11.022, + "grad_norm": 1.7027065753936768, + "learning_rate": 2e-05, + "loss": 0.05292184, + "step": 5511 + }, + { + "epoch": 11.024, + "grad_norm": 1.5341792106628418, + "learning_rate": 2e-05, + "loss": 0.05033635, + "step": 5512 + }, + { + "epoch": 11.026, + "grad_norm": 1.545507788658142, + "learning_rate": 2e-05, + "loss": 0.06413613, + "step": 5513 + }, + { + "epoch": 11.028, + "grad_norm": 1.227196455001831, + "learning_rate": 2e-05, + "loss": 0.04655745, + "step": 5514 + }, + { + "epoch": 11.03, + "grad_norm": 1.5401439666748047, + "learning_rate": 2e-05, + "loss": 0.05112302, + "step": 5515 + }, + { + "epoch": 11.032, + "grad_norm": 0.9862892627716064, + "learning_rate": 2e-05, + 
"loss": 0.03179028, + "step": 5516 + }, + { + "epoch": 11.034, + "grad_norm": 1.793843388557434, + "learning_rate": 2e-05, + "loss": 0.06312604, + "step": 5517 + }, + { + "epoch": 11.036, + "grad_norm": 2.1920554637908936, + "learning_rate": 2e-05, + "loss": 0.06224691, + "step": 5518 + }, + { + "epoch": 11.038, + "grad_norm": 1.6671439409255981, + "learning_rate": 2e-05, + "loss": 0.05866244, + "step": 5519 + }, + { + "epoch": 11.04, + "grad_norm": 1.550345540046692, + "learning_rate": 2e-05, + "loss": 0.04544277, + "step": 5520 + }, + { + "epoch": 11.042, + "grad_norm": 1.3246674537658691, + "learning_rate": 2e-05, + "loss": 0.03805883, + "step": 5521 + }, + { + "epoch": 11.044, + "grad_norm": 1.9008183479309082, + "learning_rate": 2e-05, + "loss": 0.04575905, + "step": 5522 + }, + { + "epoch": 11.046, + "grad_norm": 1.9028370380401611, + "learning_rate": 2e-05, + "loss": 0.05989815, + "step": 5523 + }, + { + "epoch": 11.048, + "grad_norm": 1.2603625059127808, + "learning_rate": 2e-05, + "loss": 0.03706544, + "step": 5524 + }, + { + "epoch": 11.05, + "grad_norm": 1.408327579498291, + "learning_rate": 2e-05, + "loss": 0.05105807, + "step": 5525 + }, + { + "epoch": 11.052, + "grad_norm": 1.762861728668213, + "learning_rate": 2e-05, + "loss": 0.03200347, + "step": 5526 + }, + { + "epoch": 11.054, + "grad_norm": 1.0934480428695679, + "learning_rate": 2e-05, + "loss": 0.04322592, + "step": 5527 + }, + { + "epoch": 11.056, + "grad_norm": 2.1491646766662598, + "learning_rate": 2e-05, + "loss": 0.04375915, + "step": 5528 + }, + { + "epoch": 11.058, + "grad_norm": 1.0617128610610962, + "learning_rate": 2e-05, + "loss": 0.03229889, + "step": 5529 + }, + { + "epoch": 11.06, + "grad_norm": 1.9952315092086792, + "learning_rate": 2e-05, + "loss": 0.0592179, + "step": 5530 + }, + { + "epoch": 11.062, + "grad_norm": 2.140627861022949, + "learning_rate": 2e-05, + "loss": 0.04644826, + "step": 5531 + }, + { + "epoch": 11.064, + "grad_norm": 1.437983512878418, + "learning_rate": 2e-05, + "loss": 0.03985, + "step": 5532 + }, + { + "epoch": 11.066, + "grad_norm": 2.316026210784912, + "learning_rate": 2e-05, + "loss": 0.05902199, + "step": 5533 + }, + { + "epoch": 11.068, + "grad_norm": 1.4913086891174316, + "learning_rate": 2e-05, + "loss": 0.03914908, + "step": 5534 + }, + { + "epoch": 11.07, + "grad_norm": 1.6400469541549683, + "learning_rate": 2e-05, + "loss": 0.04192843, + "step": 5535 + }, + { + "epoch": 11.072, + "grad_norm": 1.754105567932129, + "learning_rate": 2e-05, + "loss": 0.04989998, + "step": 5536 + }, + { + "epoch": 11.074, + "grad_norm": 2.085247039794922, + "learning_rate": 2e-05, + "loss": 0.04821778, + "step": 5537 + }, + { + "epoch": 11.076, + "grad_norm": 1.0525646209716797, + "learning_rate": 2e-05, + "loss": 0.03182067, + "step": 5538 + }, + { + "epoch": 11.078, + "grad_norm": 1.7224996089935303, + "learning_rate": 2e-05, + "loss": 0.04517365, + "step": 5539 + }, + { + "epoch": 11.08, + "grad_norm": 1.285914659500122, + "learning_rate": 2e-05, + "loss": 0.0277714, + "step": 5540 + }, + { + "epoch": 11.082, + "grad_norm": 1.9249039888381958, + "learning_rate": 2e-05, + "loss": 0.03731865, + "step": 5541 + }, + { + "epoch": 11.084, + "grad_norm": 3.5124082565307617, + "learning_rate": 2e-05, + "loss": 0.04889687, + "step": 5542 + }, + { + "epoch": 11.086, + "grad_norm": 1.3310890197753906, + "learning_rate": 2e-05, + "loss": 0.04091962, + "step": 5543 + }, + { + "epoch": 11.088, + "grad_norm": 1.5621048212051392, + "learning_rate": 2e-05, + "loss": 0.05734289, + "step": 5544 + }, + { + 
"epoch": 11.09, + "grad_norm": 2.3096556663513184, + "learning_rate": 2e-05, + "loss": 0.0544033, + "step": 5545 + }, + { + "epoch": 11.092, + "grad_norm": 2.0304203033447266, + "learning_rate": 2e-05, + "loss": 0.03412958, + "step": 5546 + }, + { + "epoch": 11.094, + "grad_norm": 3.0657801628112793, + "learning_rate": 2e-05, + "loss": 0.05514826, + "step": 5547 + }, + { + "epoch": 11.096, + "grad_norm": 1.4915958642959595, + "learning_rate": 2e-05, + "loss": 0.04357203, + "step": 5548 + }, + { + "epoch": 11.098, + "grad_norm": 2.101444959640503, + "learning_rate": 2e-05, + "loss": 0.05817562, + "step": 5549 + }, + { + "epoch": 11.1, + "grad_norm": 1.9840679168701172, + "learning_rate": 2e-05, + "loss": 0.05163846, + "step": 5550 + }, + { + "epoch": 11.102, + "grad_norm": 1.5036526918411255, + "learning_rate": 2e-05, + "loss": 0.03977584, + "step": 5551 + }, + { + "epoch": 11.104, + "grad_norm": 3.665069103240967, + "learning_rate": 2e-05, + "loss": 0.05804493, + "step": 5552 + }, + { + "epoch": 11.106, + "grad_norm": 6.136903762817383, + "learning_rate": 2e-05, + "loss": 0.05603513, + "step": 5553 + }, + { + "epoch": 11.108, + "grad_norm": 2.01459002494812, + "learning_rate": 2e-05, + "loss": 0.04759218, + "step": 5554 + }, + { + "epoch": 11.11, + "grad_norm": 1.221979022026062, + "learning_rate": 2e-05, + "loss": 0.04913111, + "step": 5555 + }, + { + "epoch": 11.112, + "grad_norm": 2.235368490219116, + "learning_rate": 2e-05, + "loss": 0.06046454, + "step": 5556 + }, + { + "epoch": 11.114, + "grad_norm": 2.291501998901367, + "learning_rate": 2e-05, + "loss": 0.05759356, + "step": 5557 + }, + { + "epoch": 11.116, + "grad_norm": 1.4968833923339844, + "learning_rate": 2e-05, + "loss": 0.06014612, + "step": 5558 + }, + { + "epoch": 11.118, + "grad_norm": 1.9257349967956543, + "learning_rate": 2e-05, + "loss": 0.0344676, + "step": 5559 + }, + { + "epoch": 11.12, + "grad_norm": 1.1876585483551025, + "learning_rate": 2e-05, + "loss": 0.03832501, + "step": 5560 + }, + { + "epoch": 11.122, + "grad_norm": 1.0107924938201904, + "learning_rate": 2e-05, + "loss": 0.03613226, + "step": 5561 + }, + { + "epoch": 11.124, + "grad_norm": 1.3432567119598389, + "learning_rate": 2e-05, + "loss": 0.0471199, + "step": 5562 + }, + { + "epoch": 11.126, + "grad_norm": 1.2003259658813477, + "learning_rate": 2e-05, + "loss": 0.04361069, + "step": 5563 + }, + { + "epoch": 11.128, + "grad_norm": 0.9868248701095581, + "learning_rate": 2e-05, + "loss": 0.03666285, + "step": 5564 + }, + { + "epoch": 11.13, + "grad_norm": 1.2956124544143677, + "learning_rate": 2e-05, + "loss": 0.04453753, + "step": 5565 + }, + { + "epoch": 11.132, + "grad_norm": 1.5581343173980713, + "learning_rate": 2e-05, + "loss": 0.04929563, + "step": 5566 + }, + { + "epoch": 11.134, + "grad_norm": 2.0106360912323, + "learning_rate": 2e-05, + "loss": 0.04062722, + "step": 5567 + }, + { + "epoch": 11.136, + "grad_norm": 1.4985320568084717, + "learning_rate": 2e-05, + "loss": 0.05832936, + "step": 5568 + }, + { + "epoch": 11.138, + "grad_norm": 1.6845251321792603, + "learning_rate": 2e-05, + "loss": 0.03393908, + "step": 5569 + }, + { + "epoch": 11.14, + "grad_norm": 1.7386165857315063, + "learning_rate": 2e-05, + "loss": 0.05278086, + "step": 5570 + }, + { + "epoch": 11.142, + "grad_norm": 1.629763126373291, + "learning_rate": 2e-05, + "loss": 0.05002792, + "step": 5571 + }, + { + "epoch": 11.144, + "grad_norm": 1.2016594409942627, + "learning_rate": 2e-05, + "loss": 0.05499132, + "step": 5572 + }, + { + "epoch": 11.146, + "grad_norm": 
1.2156981229782104, + "learning_rate": 2e-05, + "loss": 0.04719136, + "step": 5573 + }, + { + "epoch": 11.148, + "grad_norm": 1.1108115911483765, + "learning_rate": 2e-05, + "loss": 0.04154814, + "step": 5574 + }, + { + "epoch": 11.15, + "grad_norm": 2.7240407466888428, + "learning_rate": 2e-05, + "loss": 0.07720132, + "step": 5575 + }, + { + "epoch": 11.152, + "grad_norm": 1.6736434698104858, + "learning_rate": 2e-05, + "loss": 0.0617914, + "step": 5576 + }, + { + "epoch": 11.154, + "grad_norm": 1.8674448728561401, + "learning_rate": 2e-05, + "loss": 0.05250496, + "step": 5577 + }, + { + "epoch": 11.156, + "grad_norm": 2.2997653484344482, + "learning_rate": 2e-05, + "loss": 0.08813807, + "step": 5578 + }, + { + "epoch": 11.158, + "grad_norm": 1.312296748161316, + "learning_rate": 2e-05, + "loss": 0.03612276, + "step": 5579 + }, + { + "epoch": 11.16, + "grad_norm": 1.4704864025115967, + "learning_rate": 2e-05, + "loss": 0.05287594, + "step": 5580 + }, + { + "epoch": 11.162, + "grad_norm": 0.9963863492012024, + "learning_rate": 2e-05, + "loss": 0.03753687, + "step": 5581 + }, + { + "epoch": 11.164, + "grad_norm": 2.919243097305298, + "learning_rate": 2e-05, + "loss": 0.05586375, + "step": 5582 + }, + { + "epoch": 11.166, + "grad_norm": 2.9407522678375244, + "learning_rate": 2e-05, + "loss": 0.04070038, + "step": 5583 + }, + { + "epoch": 11.168, + "grad_norm": 1.1414661407470703, + "learning_rate": 2e-05, + "loss": 0.03115014, + "step": 5584 + }, + { + "epoch": 11.17, + "grad_norm": 3.4734015464782715, + "learning_rate": 2e-05, + "loss": 0.05622058, + "step": 5585 + }, + { + "epoch": 11.172, + "grad_norm": 1.3070592880249023, + "learning_rate": 2e-05, + "loss": 0.05956008, + "step": 5586 + }, + { + "epoch": 11.174, + "grad_norm": 1.8945657014846802, + "learning_rate": 2e-05, + "loss": 0.05505826, + "step": 5587 + }, + { + "epoch": 11.176, + "grad_norm": 2.134795665740967, + "learning_rate": 2e-05, + "loss": 0.05037409, + "step": 5588 + }, + { + "epoch": 11.178, + "grad_norm": 1.618909239768982, + "learning_rate": 2e-05, + "loss": 0.04687338, + "step": 5589 + }, + { + "epoch": 11.18, + "grad_norm": 1.6525713205337524, + "learning_rate": 2e-05, + "loss": 0.03109693, + "step": 5590 + }, + { + "epoch": 11.182, + "grad_norm": 1.4128676652908325, + "learning_rate": 2e-05, + "loss": 0.04869646, + "step": 5591 + }, + { + "epoch": 11.184, + "grad_norm": 1.8516077995300293, + "learning_rate": 2e-05, + "loss": 0.05194153, + "step": 5592 + }, + { + "epoch": 11.186, + "grad_norm": 0.9598081111907959, + "learning_rate": 2e-05, + "loss": 0.03175935, + "step": 5593 + }, + { + "epoch": 11.188, + "grad_norm": 2.366396903991699, + "learning_rate": 2e-05, + "loss": 0.04697275, + "step": 5594 + }, + { + "epoch": 11.19, + "grad_norm": 1.4049228429794312, + "learning_rate": 2e-05, + "loss": 0.04712133, + "step": 5595 + }, + { + "epoch": 11.192, + "grad_norm": 1.392557978630066, + "learning_rate": 2e-05, + "loss": 0.04036619, + "step": 5596 + }, + { + "epoch": 11.194, + "grad_norm": 2.7289698123931885, + "learning_rate": 2e-05, + "loss": 0.05585587, + "step": 5597 + }, + { + "epoch": 11.196, + "grad_norm": 1.3307260274887085, + "learning_rate": 2e-05, + "loss": 0.04757424, + "step": 5598 + }, + { + "epoch": 11.198, + "grad_norm": 1.4445418119430542, + "learning_rate": 2e-05, + "loss": 0.04362855, + "step": 5599 + }, + { + "epoch": 11.2, + "grad_norm": 1.3054895401000977, + "learning_rate": 2e-05, + "loss": 0.0476041, + "step": 5600 + }, + { + "epoch": 11.202, + "grad_norm": 1.1574424505233765, + "learning_rate": 
2e-05, + "loss": 0.04523313, + "step": 5601 + }, + { + "epoch": 11.204, + "grad_norm": 1.6531107425689697, + "learning_rate": 2e-05, + "loss": 0.07256036, + "step": 5602 + }, + { + "epoch": 11.206, + "grad_norm": 2.6283984184265137, + "learning_rate": 2e-05, + "loss": 0.05259442, + "step": 5603 + }, + { + "epoch": 11.208, + "grad_norm": 1.1295219659805298, + "learning_rate": 2e-05, + "loss": 0.02722023, + "step": 5604 + }, + { + "epoch": 11.21, + "grad_norm": 1.7580029964447021, + "learning_rate": 2e-05, + "loss": 0.0378816, + "step": 5605 + }, + { + "epoch": 11.212, + "grad_norm": 1.1564671993255615, + "learning_rate": 2e-05, + "loss": 0.04453285, + "step": 5606 + }, + { + "epoch": 11.214, + "grad_norm": 2.0429930686950684, + "learning_rate": 2e-05, + "loss": 0.06690899, + "step": 5607 + }, + { + "epoch": 11.216, + "grad_norm": 3.2511401176452637, + "learning_rate": 2e-05, + "loss": 0.07473037, + "step": 5608 + }, + { + "epoch": 11.218, + "grad_norm": 1.5251747369766235, + "learning_rate": 2e-05, + "loss": 0.06034661, + "step": 5609 + }, + { + "epoch": 11.22, + "grad_norm": 1.3258527517318726, + "learning_rate": 2e-05, + "loss": 0.04761764, + "step": 5610 + }, + { + "epoch": 11.222, + "grad_norm": 1.9736108779907227, + "learning_rate": 2e-05, + "loss": 0.04002544, + "step": 5611 + }, + { + "epoch": 11.224, + "grad_norm": 1.7155126333236694, + "learning_rate": 2e-05, + "loss": 0.05041002, + "step": 5612 + }, + { + "epoch": 11.226, + "grad_norm": 1.2253735065460205, + "learning_rate": 2e-05, + "loss": 0.04136164, + "step": 5613 + }, + { + "epoch": 11.228, + "grad_norm": 1.5986520051956177, + "learning_rate": 2e-05, + "loss": 0.04651842, + "step": 5614 + }, + { + "epoch": 11.23, + "grad_norm": 1.277672529220581, + "learning_rate": 2e-05, + "loss": 0.04242517, + "step": 5615 + }, + { + "epoch": 11.232, + "grad_norm": 1.531924843788147, + "learning_rate": 2e-05, + "loss": 0.0391548, + "step": 5616 + }, + { + "epoch": 11.234, + "grad_norm": 1.1350847482681274, + "learning_rate": 2e-05, + "loss": 0.04663714, + "step": 5617 + }, + { + "epoch": 11.236, + "grad_norm": 1.4929993152618408, + "learning_rate": 2e-05, + "loss": 0.04786985, + "step": 5618 + }, + { + "epoch": 11.238, + "grad_norm": 1.8032375574111938, + "learning_rate": 2e-05, + "loss": 0.0391562, + "step": 5619 + }, + { + "epoch": 11.24, + "grad_norm": 1.310897946357727, + "learning_rate": 2e-05, + "loss": 0.0419879, + "step": 5620 + }, + { + "epoch": 11.242, + "grad_norm": 1.8344627618789673, + "learning_rate": 2e-05, + "loss": 0.05171813, + "step": 5621 + }, + { + "epoch": 11.244, + "grad_norm": 1.4439165592193604, + "learning_rate": 2e-05, + "loss": 0.04475713, + "step": 5622 + }, + { + "epoch": 11.246, + "grad_norm": 1.3868870735168457, + "learning_rate": 2e-05, + "loss": 0.0524424, + "step": 5623 + }, + { + "epoch": 11.248, + "grad_norm": 1.4693684577941895, + "learning_rate": 2e-05, + "loss": 0.03726972, + "step": 5624 + }, + { + "epoch": 11.25, + "grad_norm": 1.3319610357284546, + "learning_rate": 2e-05, + "loss": 0.0376013, + "step": 5625 + }, + { + "epoch": 11.252, + "grad_norm": 1.1958826780319214, + "learning_rate": 2e-05, + "loss": 0.03406876, + "step": 5626 + }, + { + "epoch": 11.254, + "grad_norm": 1.5152924060821533, + "learning_rate": 2e-05, + "loss": 0.05023153, + "step": 5627 + }, + { + "epoch": 11.256, + "grad_norm": 1.8399534225463867, + "learning_rate": 2e-05, + "loss": 0.05513574, + "step": 5628 + }, + { + "epoch": 11.258, + "grad_norm": 1.1258716583251953, + "learning_rate": 2e-05, + "loss": 0.02846676, + "step": 
5629 + }, + { + "epoch": 11.26, + "grad_norm": 1.4945526123046875, + "learning_rate": 2e-05, + "loss": 0.04922044, + "step": 5630 + }, + { + "epoch": 11.262, + "grad_norm": 1.420878291130066, + "learning_rate": 2e-05, + "loss": 0.05598453, + "step": 5631 + }, + { + "epoch": 11.264, + "grad_norm": 1.74343740940094, + "learning_rate": 2e-05, + "loss": 0.05555171, + "step": 5632 + }, + { + "epoch": 11.266, + "grad_norm": 1.4211103916168213, + "learning_rate": 2e-05, + "loss": 0.04510403, + "step": 5633 + }, + { + "epoch": 11.268, + "grad_norm": 1.2378079891204834, + "learning_rate": 2e-05, + "loss": 0.04460875, + "step": 5634 + }, + { + "epoch": 11.27, + "grad_norm": 2.1717655658721924, + "learning_rate": 2e-05, + "loss": 0.04952989, + "step": 5635 + }, + { + "epoch": 11.272, + "grad_norm": 1.3453963994979858, + "learning_rate": 2e-05, + "loss": 0.04727048, + "step": 5636 + }, + { + "epoch": 11.274000000000001, + "grad_norm": 1.2908873558044434, + "learning_rate": 2e-05, + "loss": 0.04669511, + "step": 5637 + }, + { + "epoch": 11.276, + "grad_norm": 2.5825376510620117, + "learning_rate": 2e-05, + "loss": 0.07910918, + "step": 5638 + }, + { + "epoch": 11.278, + "grad_norm": 1.3286540508270264, + "learning_rate": 2e-05, + "loss": 0.05208978, + "step": 5639 + }, + { + "epoch": 11.28, + "grad_norm": 2.3290231227874756, + "learning_rate": 2e-05, + "loss": 0.0465233, + "step": 5640 + }, + { + "epoch": 11.282, + "grad_norm": 3.7450168132781982, + "learning_rate": 2e-05, + "loss": 0.04494961, + "step": 5641 + }, + { + "epoch": 11.284, + "grad_norm": 2.037266492843628, + "learning_rate": 2e-05, + "loss": 0.05497518, + "step": 5642 + }, + { + "epoch": 11.286, + "grad_norm": 1.5916911363601685, + "learning_rate": 2e-05, + "loss": 0.04094953, + "step": 5643 + }, + { + "epoch": 11.288, + "grad_norm": 1.6948169469833374, + "learning_rate": 2e-05, + "loss": 0.0476352, + "step": 5644 + }, + { + "epoch": 11.29, + "grad_norm": 2.1372830867767334, + "learning_rate": 2e-05, + "loss": 0.05398251, + "step": 5645 + }, + { + "epoch": 11.292, + "grad_norm": 1.3128230571746826, + "learning_rate": 2e-05, + "loss": 0.04471845, + "step": 5646 + }, + { + "epoch": 11.294, + "grad_norm": 1.647568941116333, + "learning_rate": 2e-05, + "loss": 0.04331946, + "step": 5647 + }, + { + "epoch": 11.296, + "grad_norm": 1.923680067062378, + "learning_rate": 2e-05, + "loss": 0.05029312, + "step": 5648 + }, + { + "epoch": 11.298, + "grad_norm": 1.4542899131774902, + "learning_rate": 2e-05, + "loss": 0.05089693, + "step": 5649 + }, + { + "epoch": 11.3, + "grad_norm": 1.5298106670379639, + "learning_rate": 2e-05, + "loss": 0.05881848, + "step": 5650 + }, + { + "epoch": 11.302, + "grad_norm": 1.7057805061340332, + "learning_rate": 2e-05, + "loss": 0.03789561, + "step": 5651 + }, + { + "epoch": 11.304, + "grad_norm": 1.3690357208251953, + "learning_rate": 2e-05, + "loss": 0.04559196, + "step": 5652 + }, + { + "epoch": 11.306, + "grad_norm": 1.7711060047149658, + "learning_rate": 2e-05, + "loss": 0.04960579, + "step": 5653 + }, + { + "epoch": 11.308, + "grad_norm": 2.154360294342041, + "learning_rate": 2e-05, + "loss": 0.04761767, + "step": 5654 + }, + { + "epoch": 11.31, + "grad_norm": 1.1826550960540771, + "learning_rate": 2e-05, + "loss": 0.04088816, + "step": 5655 + }, + { + "epoch": 11.312, + "grad_norm": 2.5155763626098633, + "learning_rate": 2e-05, + "loss": 0.04958711, + "step": 5656 + }, + { + "epoch": 11.314, + "grad_norm": 1.2392568588256836, + "learning_rate": 2e-05, + "loss": 0.05496053, + "step": 5657 + }, + { + "epoch": 
11.316, + "grad_norm": 1.723213791847229, + "learning_rate": 2e-05, + "loss": 0.04695657, + "step": 5658 + }, + { + "epoch": 11.318, + "grad_norm": 1.6298481225967407, + "learning_rate": 2e-05, + "loss": 0.03813414, + "step": 5659 + }, + { + "epoch": 11.32, + "grad_norm": 1.4247621297836304, + "learning_rate": 2e-05, + "loss": 0.04710161, + "step": 5660 + }, + { + "epoch": 11.322, + "grad_norm": 1.5706455707550049, + "learning_rate": 2e-05, + "loss": 0.0463535, + "step": 5661 + }, + { + "epoch": 11.324, + "grad_norm": 2.462221622467041, + "learning_rate": 2e-05, + "loss": 0.05929623, + "step": 5662 + }, + { + "epoch": 11.326, + "grad_norm": 1.4576634168624878, + "learning_rate": 2e-05, + "loss": 0.05897439, + "step": 5663 + }, + { + "epoch": 11.328, + "grad_norm": 1.6032121181488037, + "learning_rate": 2e-05, + "loss": 0.04866658, + "step": 5664 + }, + { + "epoch": 11.33, + "grad_norm": 1.4402192831039429, + "learning_rate": 2e-05, + "loss": 0.03332867, + "step": 5665 + }, + { + "epoch": 11.332, + "grad_norm": 2.8734233379364014, + "learning_rate": 2e-05, + "loss": 0.06615589, + "step": 5666 + }, + { + "epoch": 11.334, + "grad_norm": 1.652347445487976, + "learning_rate": 2e-05, + "loss": 0.07197806, + "step": 5667 + }, + { + "epoch": 11.336, + "grad_norm": 1.8642592430114746, + "learning_rate": 2e-05, + "loss": 0.0569929, + "step": 5668 + }, + { + "epoch": 11.338, + "grad_norm": 1.2723792791366577, + "learning_rate": 2e-05, + "loss": 0.04522455, + "step": 5669 + }, + { + "epoch": 11.34, + "grad_norm": 1.5746574401855469, + "learning_rate": 2e-05, + "loss": 0.05542893, + "step": 5670 + }, + { + "epoch": 11.342, + "grad_norm": 1.4517905712127686, + "learning_rate": 2e-05, + "loss": 0.04733806, + "step": 5671 + }, + { + "epoch": 11.344, + "grad_norm": 1.3056635856628418, + "learning_rate": 2e-05, + "loss": 0.04615809, + "step": 5672 + }, + { + "epoch": 11.346, + "grad_norm": 2.0126426219940186, + "learning_rate": 2e-05, + "loss": 0.04903982, + "step": 5673 + }, + { + "epoch": 11.348, + "grad_norm": 2.094006299972534, + "learning_rate": 2e-05, + "loss": 0.0404023, + "step": 5674 + }, + { + "epoch": 11.35, + "grad_norm": 2.433847188949585, + "learning_rate": 2e-05, + "loss": 0.03881406, + "step": 5675 + }, + { + "epoch": 11.352, + "grad_norm": 1.2057197093963623, + "learning_rate": 2e-05, + "loss": 0.0405013, + "step": 5676 + }, + { + "epoch": 11.354, + "grad_norm": 1.8133881092071533, + "learning_rate": 2e-05, + "loss": 0.04236561, + "step": 5677 + }, + { + "epoch": 11.356, + "grad_norm": 2.568739891052246, + "learning_rate": 2e-05, + "loss": 0.05626953, + "step": 5678 + }, + { + "epoch": 11.358, + "grad_norm": 1.2331525087356567, + "learning_rate": 2e-05, + "loss": 0.05501791, + "step": 5679 + }, + { + "epoch": 11.36, + "grad_norm": 1.5900580883026123, + "learning_rate": 2e-05, + "loss": 0.03576569, + "step": 5680 + }, + { + "epoch": 11.362, + "grad_norm": 1.9113343954086304, + "learning_rate": 2e-05, + "loss": 0.06783809, + "step": 5681 + }, + { + "epoch": 11.364, + "grad_norm": 2.187023639678955, + "learning_rate": 2e-05, + "loss": 0.04357416, + "step": 5682 + }, + { + "epoch": 11.366, + "grad_norm": 1.2699072360992432, + "learning_rate": 2e-05, + "loss": 0.03943901, + "step": 5683 + }, + { + "epoch": 11.368, + "grad_norm": 1.9693132638931274, + "learning_rate": 2e-05, + "loss": 0.05115809, + "step": 5684 + }, + { + "epoch": 11.37, + "grad_norm": 1.3677825927734375, + "learning_rate": 2e-05, + "loss": 0.04716478, + "step": 5685 + }, + { + "epoch": 11.372, + "grad_norm": 2.4023549556732178, 
+ "learning_rate": 2e-05, + "loss": 0.04504751, + "step": 5686 + }, + { + "epoch": 11.374, + "grad_norm": 2.511744737625122, + "learning_rate": 2e-05, + "loss": 0.04527961, + "step": 5687 + }, + { + "epoch": 11.376, + "grad_norm": 3.562417984008789, + "learning_rate": 2e-05, + "loss": 0.05556261, + "step": 5688 + }, + { + "epoch": 11.378, + "grad_norm": 1.733673095703125, + "learning_rate": 2e-05, + "loss": 0.0592088, + "step": 5689 + }, + { + "epoch": 11.38, + "grad_norm": 1.9149365425109863, + "learning_rate": 2e-05, + "loss": 0.04439528, + "step": 5690 + }, + { + "epoch": 11.382, + "grad_norm": 2.3991384506225586, + "learning_rate": 2e-05, + "loss": 0.06331401, + "step": 5691 + }, + { + "epoch": 11.384, + "grad_norm": 1.122333288192749, + "learning_rate": 2e-05, + "loss": 0.04532647, + "step": 5692 + }, + { + "epoch": 11.386, + "grad_norm": 1.6301887035369873, + "learning_rate": 2e-05, + "loss": 0.05703401, + "step": 5693 + }, + { + "epoch": 11.388, + "grad_norm": 1.636753797531128, + "learning_rate": 2e-05, + "loss": 0.05419588, + "step": 5694 + }, + { + "epoch": 11.39, + "grad_norm": 1.2946252822875977, + "learning_rate": 2e-05, + "loss": 0.05132918, + "step": 5695 + }, + { + "epoch": 11.392, + "grad_norm": 2.3236074447631836, + "learning_rate": 2e-05, + "loss": 0.05070593, + "step": 5696 + }, + { + "epoch": 11.394, + "grad_norm": 1.109092116355896, + "learning_rate": 2e-05, + "loss": 0.0335158, + "step": 5697 + }, + { + "epoch": 11.396, + "grad_norm": 1.7937777042388916, + "learning_rate": 2e-05, + "loss": 0.06048029, + "step": 5698 + }, + { + "epoch": 11.398, + "grad_norm": 1.0988600254058838, + "learning_rate": 2e-05, + "loss": 0.04137257, + "step": 5699 + }, + { + "epoch": 11.4, + "grad_norm": 1.4302674531936646, + "learning_rate": 2e-05, + "loss": 0.04906035, + "step": 5700 + }, + { + "epoch": 11.402, + "grad_norm": 1.5210566520690918, + "learning_rate": 2e-05, + "loss": 0.03527754, + "step": 5701 + }, + { + "epoch": 11.404, + "grad_norm": 2.1926465034484863, + "learning_rate": 2e-05, + "loss": 0.04686191, + "step": 5702 + }, + { + "epoch": 11.406, + "grad_norm": 1.6266396045684814, + "learning_rate": 2e-05, + "loss": 0.05651092, + "step": 5703 + }, + { + "epoch": 11.408, + "grad_norm": 2.796839952468872, + "learning_rate": 2e-05, + "loss": 0.07000058, + "step": 5704 + }, + { + "epoch": 11.41, + "grad_norm": 1.4611526727676392, + "learning_rate": 2e-05, + "loss": 0.05321472, + "step": 5705 + }, + { + "epoch": 11.412, + "grad_norm": 1.8903465270996094, + "learning_rate": 2e-05, + "loss": 0.05226947, + "step": 5706 + }, + { + "epoch": 11.414, + "grad_norm": 1.4791998863220215, + "learning_rate": 2e-05, + "loss": 0.05257621, + "step": 5707 + }, + { + "epoch": 11.416, + "grad_norm": 1.5878609418869019, + "learning_rate": 2e-05, + "loss": 0.06273852, + "step": 5708 + }, + { + "epoch": 11.418, + "grad_norm": 2.1732583045959473, + "learning_rate": 2e-05, + "loss": 0.05149956, + "step": 5709 + }, + { + "epoch": 11.42, + "grad_norm": 1.677105188369751, + "learning_rate": 2e-05, + "loss": 0.0615867, + "step": 5710 + }, + { + "epoch": 11.422, + "grad_norm": 1.29386305809021, + "learning_rate": 2e-05, + "loss": 0.04953182, + "step": 5711 + }, + { + "epoch": 11.424, + "grad_norm": 1.7477467060089111, + "learning_rate": 2e-05, + "loss": 0.06783712, + "step": 5712 + }, + { + "epoch": 11.426, + "grad_norm": 3.544480085372925, + "learning_rate": 2e-05, + "loss": 0.04404292, + "step": 5713 + }, + { + "epoch": 11.428, + "grad_norm": 2.753350019454956, + "learning_rate": 2e-05, + "loss": 0.07701689, 
+ "step": 5714 + }, + { + "epoch": 11.43, + "grad_norm": 1.8815979957580566, + "learning_rate": 2e-05, + "loss": 0.05695759, + "step": 5715 + }, + { + "epoch": 11.432, + "grad_norm": 1.5264737606048584, + "learning_rate": 2e-05, + "loss": 0.05297385, + "step": 5716 + }, + { + "epoch": 11.434, + "grad_norm": 1.047830581665039, + "learning_rate": 2e-05, + "loss": 0.03878511, + "step": 5717 + }, + { + "epoch": 11.436, + "grad_norm": 1.0375819206237793, + "learning_rate": 2e-05, + "loss": 0.03545886, + "step": 5718 + }, + { + "epoch": 11.438, + "grad_norm": 1.4977059364318848, + "learning_rate": 2e-05, + "loss": 0.0493038, + "step": 5719 + }, + { + "epoch": 11.44, + "grad_norm": 1.3288768529891968, + "learning_rate": 2e-05, + "loss": 0.0505427, + "step": 5720 + }, + { + "epoch": 11.442, + "grad_norm": 1.954521894454956, + "learning_rate": 2e-05, + "loss": 0.06295772, + "step": 5721 + }, + { + "epoch": 11.444, + "grad_norm": 1.0402206182479858, + "learning_rate": 2e-05, + "loss": 0.03475888, + "step": 5722 + }, + { + "epoch": 11.446, + "grad_norm": 2.404123544692993, + "learning_rate": 2e-05, + "loss": 0.0562352, + "step": 5723 + }, + { + "epoch": 11.448, + "grad_norm": 1.7195630073547363, + "learning_rate": 2e-05, + "loss": 0.05285126, + "step": 5724 + }, + { + "epoch": 11.45, + "grad_norm": 1.2412029504776, + "learning_rate": 2e-05, + "loss": 0.05088343, + "step": 5725 + }, + { + "epoch": 11.452, + "grad_norm": 2.3141374588012695, + "learning_rate": 2e-05, + "loss": 0.0701945, + "step": 5726 + }, + { + "epoch": 11.454, + "grad_norm": 1.1392027139663696, + "learning_rate": 2e-05, + "loss": 0.03937323, + "step": 5727 + }, + { + "epoch": 11.456, + "grad_norm": 1.5960547924041748, + "learning_rate": 2e-05, + "loss": 0.04383022, + "step": 5728 + }, + { + "epoch": 11.458, + "grad_norm": 1.1272916793823242, + "learning_rate": 2e-05, + "loss": 0.04322441, + "step": 5729 + }, + { + "epoch": 11.46, + "grad_norm": 1.3566938638687134, + "learning_rate": 2e-05, + "loss": 0.04182547, + "step": 5730 + }, + { + "epoch": 11.462, + "grad_norm": 1.7132365703582764, + "learning_rate": 2e-05, + "loss": 0.04979304, + "step": 5731 + }, + { + "epoch": 11.464, + "grad_norm": 1.125922679901123, + "learning_rate": 2e-05, + "loss": 0.04207063, + "step": 5732 + }, + { + "epoch": 11.466, + "grad_norm": 1.6913458108901978, + "learning_rate": 2e-05, + "loss": 0.04294857, + "step": 5733 + }, + { + "epoch": 11.468, + "grad_norm": 1.0827990770339966, + "learning_rate": 2e-05, + "loss": 0.05028411, + "step": 5734 + }, + { + "epoch": 11.47, + "grad_norm": 1.2500609159469604, + "learning_rate": 2e-05, + "loss": 0.04735212, + "step": 5735 + }, + { + "epoch": 11.472, + "grad_norm": 1.5820868015289307, + "learning_rate": 2e-05, + "loss": 0.04795214, + "step": 5736 + }, + { + "epoch": 11.474, + "grad_norm": 4.20005464553833, + "learning_rate": 2e-05, + "loss": 0.05945217, + "step": 5737 + }, + { + "epoch": 11.475999999999999, + "grad_norm": 1.4440017938613892, + "learning_rate": 2e-05, + "loss": 0.0505971, + "step": 5738 + }, + { + "epoch": 11.478, + "grad_norm": 1.6714645624160767, + "learning_rate": 2e-05, + "loss": 0.04327673, + "step": 5739 + }, + { + "epoch": 11.48, + "grad_norm": 1.5207058191299438, + "learning_rate": 2e-05, + "loss": 0.03629073, + "step": 5740 + }, + { + "epoch": 11.482, + "grad_norm": 2.2845335006713867, + "learning_rate": 2e-05, + "loss": 0.04524094, + "step": 5741 + }, + { + "epoch": 11.484, + "grad_norm": 1.6629916429519653, + "learning_rate": 2e-05, + "loss": 0.03859144, + "step": 5742 + }, + { + "epoch": 
11.486, + "grad_norm": 1.1069337129592896, + "learning_rate": 2e-05, + "loss": 0.03897121, + "step": 5743 + }, + { + "epoch": 11.488, + "grad_norm": 1.555224895477295, + "learning_rate": 2e-05, + "loss": 0.03899939, + "step": 5744 + }, + { + "epoch": 11.49, + "grad_norm": 2.02616548538208, + "learning_rate": 2e-05, + "loss": 0.05024801, + "step": 5745 + }, + { + "epoch": 11.492, + "grad_norm": 1.102475881576538, + "learning_rate": 2e-05, + "loss": 0.03961539, + "step": 5746 + }, + { + "epoch": 11.494, + "grad_norm": 1.6318930387496948, + "learning_rate": 2e-05, + "loss": 0.04914928, + "step": 5747 + }, + { + "epoch": 11.496, + "grad_norm": 1.7205697298049927, + "learning_rate": 2e-05, + "loss": 0.04520516, + "step": 5748 + }, + { + "epoch": 11.498, + "grad_norm": 2.896815538406372, + "learning_rate": 2e-05, + "loss": 0.06124, + "step": 5749 + }, + { + "epoch": 11.5, + "grad_norm": 1.4367793798446655, + "learning_rate": 2e-05, + "loss": 0.03653619, + "step": 5750 + }, + { + "epoch": 11.502, + "grad_norm": 1.8099347352981567, + "learning_rate": 2e-05, + "loss": 0.04635201, + "step": 5751 + }, + { + "epoch": 11.504, + "grad_norm": 1.852498173713684, + "learning_rate": 2e-05, + "loss": 0.05065921, + "step": 5752 + }, + { + "epoch": 11.506, + "grad_norm": 1.1768702268600464, + "learning_rate": 2e-05, + "loss": 0.03448585, + "step": 5753 + }, + { + "epoch": 11.508, + "grad_norm": 1.5099678039550781, + "learning_rate": 2e-05, + "loss": 0.03958462, + "step": 5754 + }, + { + "epoch": 11.51, + "grad_norm": 3.429736375808716, + "learning_rate": 2e-05, + "loss": 0.04971122, + "step": 5755 + }, + { + "epoch": 11.512, + "grad_norm": 1.356461524963379, + "learning_rate": 2e-05, + "loss": 0.04755508, + "step": 5756 + }, + { + "epoch": 11.514, + "grad_norm": 2.2165658473968506, + "learning_rate": 2e-05, + "loss": 0.0588036, + "step": 5757 + }, + { + "epoch": 11.516, + "grad_norm": 1.2915750741958618, + "learning_rate": 2e-05, + "loss": 0.04737089, + "step": 5758 + }, + { + "epoch": 11.518, + "grad_norm": 1.303098440170288, + "learning_rate": 2e-05, + "loss": 0.04432829, + "step": 5759 + }, + { + "epoch": 11.52, + "grad_norm": 1.4214109182357788, + "learning_rate": 2e-05, + "loss": 0.04704283, + "step": 5760 + }, + { + "epoch": 11.522, + "grad_norm": 1.403090476989746, + "learning_rate": 2e-05, + "loss": 0.03694802, + "step": 5761 + }, + { + "epoch": 11.524000000000001, + "grad_norm": 4.832217216491699, + "learning_rate": 2e-05, + "loss": 0.05118045, + "step": 5762 + }, + { + "epoch": 11.526, + "grad_norm": 1.1842046976089478, + "learning_rate": 2e-05, + "loss": 0.0342038, + "step": 5763 + }, + { + "epoch": 11.528, + "grad_norm": 1.0605454444885254, + "learning_rate": 2e-05, + "loss": 0.03051936, + "step": 5764 + }, + { + "epoch": 11.53, + "grad_norm": 1.4966093301773071, + "learning_rate": 2e-05, + "loss": 0.05437325, + "step": 5765 + }, + { + "epoch": 11.532, + "grad_norm": 1.8924329280853271, + "learning_rate": 2e-05, + "loss": 0.0331119, + "step": 5766 + }, + { + "epoch": 11.534, + "grad_norm": 1.384433388710022, + "learning_rate": 2e-05, + "loss": 0.05945661, + "step": 5767 + }, + { + "epoch": 11.536, + "grad_norm": 1.6517621278762817, + "learning_rate": 2e-05, + "loss": 0.04912163, + "step": 5768 + }, + { + "epoch": 11.538, + "grad_norm": 1.414260983467102, + "learning_rate": 2e-05, + "loss": 0.05474976, + "step": 5769 + }, + { + "epoch": 11.54, + "grad_norm": 1.017541527748108, + "learning_rate": 2e-05, + "loss": 0.0367628, + "step": 5770 + }, + { + "epoch": 11.542, + "grad_norm": 1.0602508783340454, 
+ "learning_rate": 2e-05, + "loss": 0.03636168, + "step": 5771 + }, + { + "epoch": 11.544, + "grad_norm": 1.6005361080169678, + "learning_rate": 2e-05, + "loss": 0.06516649, + "step": 5772 + }, + { + "epoch": 11.546, + "grad_norm": 1.2333847284317017, + "learning_rate": 2e-05, + "loss": 0.0381854, + "step": 5773 + }, + { + "epoch": 11.548, + "grad_norm": 1.438815951347351, + "learning_rate": 2e-05, + "loss": 0.04053817, + "step": 5774 + }, + { + "epoch": 11.55, + "grad_norm": 1.6449198722839355, + "learning_rate": 2e-05, + "loss": 0.04813613, + "step": 5775 + }, + { + "epoch": 11.552, + "grad_norm": 2.104741096496582, + "learning_rate": 2e-05, + "loss": 0.04231735, + "step": 5776 + }, + { + "epoch": 11.554, + "grad_norm": 1.2205615043640137, + "learning_rate": 2e-05, + "loss": 0.05345644, + "step": 5777 + }, + { + "epoch": 11.556000000000001, + "grad_norm": 1.7869209051132202, + "learning_rate": 2e-05, + "loss": 0.04566547, + "step": 5778 + }, + { + "epoch": 11.558, + "grad_norm": 2.4200775623321533, + "learning_rate": 2e-05, + "loss": 0.0442078, + "step": 5779 + }, + { + "epoch": 11.56, + "grad_norm": 1.3662333488464355, + "learning_rate": 2e-05, + "loss": 0.0457233, + "step": 5780 + }, + { + "epoch": 11.562, + "grad_norm": 1.3299016952514648, + "learning_rate": 2e-05, + "loss": 0.04756445, + "step": 5781 + }, + { + "epoch": 11.564, + "grad_norm": 1.2130399942398071, + "learning_rate": 2e-05, + "loss": 0.0316097, + "step": 5782 + }, + { + "epoch": 11.566, + "grad_norm": 1.805086612701416, + "learning_rate": 2e-05, + "loss": 0.060819, + "step": 5783 + }, + { + "epoch": 11.568, + "grad_norm": 1.9846824407577515, + "learning_rate": 2e-05, + "loss": 0.04482648, + "step": 5784 + }, + { + "epoch": 11.57, + "grad_norm": 2.4053966999053955, + "learning_rate": 2e-05, + "loss": 0.04749952, + "step": 5785 + }, + { + "epoch": 11.572, + "grad_norm": 1.3613585233688354, + "learning_rate": 2e-05, + "loss": 0.04724826, + "step": 5786 + }, + { + "epoch": 11.574, + "grad_norm": 2.5251240730285645, + "learning_rate": 2e-05, + "loss": 0.03613253, + "step": 5787 + }, + { + "epoch": 11.576, + "grad_norm": 2.14925217628479, + "learning_rate": 2e-05, + "loss": 0.04898756, + "step": 5788 + }, + { + "epoch": 11.578, + "grad_norm": 1.379772424697876, + "learning_rate": 2e-05, + "loss": 0.05590723, + "step": 5789 + }, + { + "epoch": 11.58, + "grad_norm": 2.1570491790771484, + "learning_rate": 2e-05, + "loss": 0.06367472, + "step": 5790 + }, + { + "epoch": 11.582, + "grad_norm": 1.7565891742706299, + "learning_rate": 2e-05, + "loss": 0.05175413, + "step": 5791 + }, + { + "epoch": 11.584, + "grad_norm": 1.842274785041809, + "learning_rate": 2e-05, + "loss": 0.05350547, + "step": 5792 + }, + { + "epoch": 11.586, + "grad_norm": 2.831634283065796, + "learning_rate": 2e-05, + "loss": 0.06426252, + "step": 5793 + }, + { + "epoch": 11.588, + "grad_norm": 1.5474140644073486, + "learning_rate": 2e-05, + "loss": 0.04553394, + "step": 5794 + }, + { + "epoch": 11.59, + "grad_norm": 2.2714931964874268, + "learning_rate": 2e-05, + "loss": 0.02121895, + "step": 5795 + }, + { + "epoch": 11.592, + "grad_norm": 1.8448201417922974, + "learning_rate": 2e-05, + "loss": 0.0600131, + "step": 5796 + }, + { + "epoch": 11.594, + "grad_norm": 1.1388860940933228, + "learning_rate": 2e-05, + "loss": 0.03663793, + "step": 5797 + }, + { + "epoch": 11.596, + "grad_norm": 1.9692730903625488, + "learning_rate": 2e-05, + "loss": 0.04725102, + "step": 5798 + }, + { + "epoch": 11.598, + "grad_norm": 1.9636507034301758, + "learning_rate": 2e-05, + 
"loss": 0.04308451, + "step": 5799 + }, + { + "epoch": 11.6, + "grad_norm": 1.0663782358169556, + "learning_rate": 2e-05, + "loss": 0.0288805, + "step": 5800 + }, + { + "epoch": 11.602, + "grad_norm": 1.4374663829803467, + "learning_rate": 2e-05, + "loss": 0.04004197, + "step": 5801 + }, + { + "epoch": 11.604, + "grad_norm": 1.7417012453079224, + "learning_rate": 2e-05, + "loss": 0.05123825, + "step": 5802 + }, + { + "epoch": 11.606, + "grad_norm": 2.1125712394714355, + "learning_rate": 2e-05, + "loss": 0.0459488, + "step": 5803 + }, + { + "epoch": 11.608, + "grad_norm": 1.3598778247833252, + "learning_rate": 2e-05, + "loss": 0.03898705, + "step": 5804 + }, + { + "epoch": 11.61, + "grad_norm": 1.2977287769317627, + "learning_rate": 2e-05, + "loss": 0.05425401, + "step": 5805 + }, + { + "epoch": 11.612, + "grad_norm": 1.863133430480957, + "learning_rate": 2e-05, + "loss": 0.05389511, + "step": 5806 + }, + { + "epoch": 11.614, + "grad_norm": 2.131502389907837, + "learning_rate": 2e-05, + "loss": 0.05331743, + "step": 5807 + }, + { + "epoch": 11.616, + "grad_norm": 2.4704604148864746, + "learning_rate": 2e-05, + "loss": 0.05750574, + "step": 5808 + }, + { + "epoch": 11.618, + "grad_norm": 1.9358444213867188, + "learning_rate": 2e-05, + "loss": 0.04222788, + "step": 5809 + }, + { + "epoch": 11.62, + "grad_norm": 1.9284040927886963, + "learning_rate": 2e-05, + "loss": 0.04325092, + "step": 5810 + }, + { + "epoch": 11.622, + "grad_norm": 2.1048789024353027, + "learning_rate": 2e-05, + "loss": 0.05008987, + "step": 5811 + }, + { + "epoch": 11.624, + "grad_norm": 2.60477614402771, + "learning_rate": 2e-05, + "loss": 0.04310272, + "step": 5812 + }, + { + "epoch": 11.626, + "grad_norm": 1.3271856307983398, + "learning_rate": 2e-05, + "loss": 0.03580116, + "step": 5813 + }, + { + "epoch": 11.628, + "grad_norm": 1.6767809391021729, + "learning_rate": 2e-05, + "loss": 0.03502236, + "step": 5814 + }, + { + "epoch": 11.63, + "grad_norm": 1.5544588565826416, + "learning_rate": 2e-05, + "loss": 0.05321189, + "step": 5815 + }, + { + "epoch": 11.632, + "grad_norm": 1.3371565341949463, + "learning_rate": 2e-05, + "loss": 0.04384248, + "step": 5816 + }, + { + "epoch": 11.634, + "grad_norm": 1.6395615339279175, + "learning_rate": 2e-05, + "loss": 0.04614739, + "step": 5817 + }, + { + "epoch": 11.636, + "grad_norm": 2.506648302078247, + "learning_rate": 2e-05, + "loss": 0.06705838, + "step": 5818 + }, + { + "epoch": 11.638, + "grad_norm": 1.640137791633606, + "learning_rate": 2e-05, + "loss": 0.05612247, + "step": 5819 + }, + { + "epoch": 11.64, + "grad_norm": 2.4301540851593018, + "learning_rate": 2e-05, + "loss": 0.05863875, + "step": 5820 + }, + { + "epoch": 11.642, + "grad_norm": 1.5058404207229614, + "learning_rate": 2e-05, + "loss": 0.04215011, + "step": 5821 + }, + { + "epoch": 11.644, + "grad_norm": 1.2542659044265747, + "learning_rate": 2e-05, + "loss": 0.04177867, + "step": 5822 + }, + { + "epoch": 11.646, + "grad_norm": 1.107443928718567, + "learning_rate": 2e-05, + "loss": 0.04239887, + "step": 5823 + }, + { + "epoch": 11.648, + "grad_norm": 1.2233611345291138, + "learning_rate": 2e-05, + "loss": 0.04678968, + "step": 5824 + }, + { + "epoch": 11.65, + "grad_norm": 1.8364397287368774, + "learning_rate": 2e-05, + "loss": 0.04483078, + "step": 5825 + }, + { + "epoch": 11.652, + "grad_norm": 1.9486418962478638, + "learning_rate": 2e-05, + "loss": 0.04975449, + "step": 5826 + }, + { + "epoch": 11.654, + "grad_norm": 1.555195689201355, + "learning_rate": 2e-05, + "loss": 0.04872251, + "step": 5827 + }, + { 
+ "epoch": 11.656, + "grad_norm": 1.7315301895141602, + "learning_rate": 2e-05, + "loss": 0.04223534, + "step": 5828 + }, + { + "epoch": 11.658, + "grad_norm": 1.342836856842041, + "learning_rate": 2e-05, + "loss": 0.06062792, + "step": 5829 + }, + { + "epoch": 11.66, + "grad_norm": 1.3641505241394043, + "learning_rate": 2e-05, + "loss": 0.05880754, + "step": 5830 + }, + { + "epoch": 11.662, + "grad_norm": 1.6247472763061523, + "learning_rate": 2e-05, + "loss": 0.05718165, + "step": 5831 + }, + { + "epoch": 11.664, + "grad_norm": 1.6310220956802368, + "learning_rate": 2e-05, + "loss": 0.0669639, + "step": 5832 + }, + { + "epoch": 11.666, + "grad_norm": 1.7226531505584717, + "learning_rate": 2e-05, + "loss": 0.04655602, + "step": 5833 + }, + { + "epoch": 11.668, + "grad_norm": 1.3371641635894775, + "learning_rate": 2e-05, + "loss": 0.05352715, + "step": 5834 + }, + { + "epoch": 11.67, + "grad_norm": 1.9057135581970215, + "learning_rate": 2e-05, + "loss": 0.06148591, + "step": 5835 + }, + { + "epoch": 11.672, + "grad_norm": 2.0866620540618896, + "learning_rate": 2e-05, + "loss": 0.04995357, + "step": 5836 + }, + { + "epoch": 11.674, + "grad_norm": 2.5532031059265137, + "learning_rate": 2e-05, + "loss": 0.07622005, + "step": 5837 + }, + { + "epoch": 11.676, + "grad_norm": 2.778372287750244, + "learning_rate": 2e-05, + "loss": 0.06307095, + "step": 5838 + }, + { + "epoch": 11.678, + "grad_norm": 1.43252432346344, + "learning_rate": 2e-05, + "loss": 0.04999834, + "step": 5839 + }, + { + "epoch": 11.68, + "grad_norm": 1.3844120502471924, + "learning_rate": 2e-05, + "loss": 0.05112729, + "step": 5840 + }, + { + "epoch": 11.682, + "grad_norm": 1.2430758476257324, + "learning_rate": 2e-05, + "loss": 0.04392055, + "step": 5841 + }, + { + "epoch": 11.684, + "grad_norm": 1.273295521736145, + "learning_rate": 2e-05, + "loss": 0.04273405, + "step": 5842 + }, + { + "epoch": 11.686, + "grad_norm": 1.3100574016571045, + "learning_rate": 2e-05, + "loss": 0.05339495, + "step": 5843 + }, + { + "epoch": 11.688, + "grad_norm": 1.267080545425415, + "learning_rate": 2e-05, + "loss": 0.0397922, + "step": 5844 + }, + { + "epoch": 11.69, + "grad_norm": 1.1280689239501953, + "learning_rate": 2e-05, + "loss": 0.04549348, + "step": 5845 + }, + { + "epoch": 11.692, + "grad_norm": 1.0839488506317139, + "learning_rate": 2e-05, + "loss": 0.03936727, + "step": 5846 + }, + { + "epoch": 11.693999999999999, + "grad_norm": 1.552826166152954, + "learning_rate": 2e-05, + "loss": 0.04919448, + "step": 5847 + }, + { + "epoch": 11.696, + "grad_norm": 1.6215623617172241, + "learning_rate": 2e-05, + "loss": 0.03126223, + "step": 5848 + }, + { + "epoch": 11.698, + "grad_norm": 1.1549961566925049, + "learning_rate": 2e-05, + "loss": 0.03589753, + "step": 5849 + }, + { + "epoch": 11.7, + "grad_norm": 1.782952070236206, + "learning_rate": 2e-05, + "loss": 0.0509676, + "step": 5850 + }, + { + "epoch": 11.702, + "grad_norm": 1.7008768320083618, + "learning_rate": 2e-05, + "loss": 0.07416837, + "step": 5851 + }, + { + "epoch": 11.704, + "grad_norm": 1.7450941801071167, + "learning_rate": 2e-05, + "loss": 0.04925626, + "step": 5852 + }, + { + "epoch": 11.706, + "grad_norm": 2.7231221199035645, + "learning_rate": 2e-05, + "loss": 0.06783025, + "step": 5853 + }, + { + "epoch": 11.708, + "grad_norm": 1.7462410926818848, + "learning_rate": 2e-05, + "loss": 0.05293039, + "step": 5854 + }, + { + "epoch": 11.71, + "grad_norm": 1.803440809249878, + "learning_rate": 2e-05, + "loss": 0.05988405, + "step": 5855 + }, + { + "epoch": 11.712, + "grad_norm": 
1.4763165712356567, + "learning_rate": 2e-05, + "loss": 0.04566108, + "step": 5856 + }, + { + "epoch": 11.714, + "grad_norm": 1.44134521484375, + "learning_rate": 2e-05, + "loss": 0.04696288, + "step": 5857 + }, + { + "epoch": 11.716, + "grad_norm": 1.0796514749526978, + "learning_rate": 2e-05, + "loss": 0.03282233, + "step": 5858 + }, + { + "epoch": 11.718, + "grad_norm": 1.3339581489562988, + "learning_rate": 2e-05, + "loss": 0.04751251, + "step": 5859 + }, + { + "epoch": 11.72, + "grad_norm": 1.997147560119629, + "learning_rate": 2e-05, + "loss": 0.04504245, + "step": 5860 + }, + { + "epoch": 11.722, + "grad_norm": 1.335020899772644, + "learning_rate": 2e-05, + "loss": 0.04243895, + "step": 5861 + }, + { + "epoch": 11.724, + "grad_norm": 2.0907578468322754, + "learning_rate": 2e-05, + "loss": 0.04816663, + "step": 5862 + }, + { + "epoch": 11.725999999999999, + "grad_norm": 1.7296607494354248, + "learning_rate": 2e-05, + "loss": 0.04865206, + "step": 5863 + }, + { + "epoch": 11.728, + "grad_norm": 1.610846996307373, + "learning_rate": 2e-05, + "loss": 0.0545282, + "step": 5864 + }, + { + "epoch": 11.73, + "grad_norm": 1.4066860675811768, + "learning_rate": 2e-05, + "loss": 0.047715, + "step": 5865 + }, + { + "epoch": 11.732, + "grad_norm": 1.3445024490356445, + "learning_rate": 2e-05, + "loss": 0.04617265, + "step": 5866 + }, + { + "epoch": 11.734, + "grad_norm": 2.7535033226013184, + "learning_rate": 2e-05, + "loss": 0.04723052, + "step": 5867 + }, + { + "epoch": 11.736, + "grad_norm": 1.5304971933364868, + "learning_rate": 2e-05, + "loss": 0.03638636, + "step": 5868 + }, + { + "epoch": 11.738, + "grad_norm": 1.0276997089385986, + "learning_rate": 2e-05, + "loss": 0.03411288, + "step": 5869 + }, + { + "epoch": 11.74, + "grad_norm": 1.4574671983718872, + "learning_rate": 2e-05, + "loss": 0.05382932, + "step": 5870 + }, + { + "epoch": 11.742, + "grad_norm": 3.5009210109710693, + "learning_rate": 2e-05, + "loss": 0.06978729, + "step": 5871 + }, + { + "epoch": 11.744, + "grad_norm": 2.0347979068756104, + "learning_rate": 2e-05, + "loss": 0.0829041, + "step": 5872 + }, + { + "epoch": 11.746, + "grad_norm": 2.443451404571533, + "learning_rate": 2e-05, + "loss": 0.06375959, + "step": 5873 + }, + { + "epoch": 11.748, + "grad_norm": 2.023491859436035, + "learning_rate": 2e-05, + "loss": 0.06856895, + "step": 5874 + }, + { + "epoch": 11.75, + "grad_norm": 1.3537729978561401, + "learning_rate": 2e-05, + "loss": 0.04351505, + "step": 5875 + }, + { + "epoch": 11.752, + "grad_norm": 1.664186716079712, + "learning_rate": 2e-05, + "loss": 0.03763783, + "step": 5876 + }, + { + "epoch": 11.754, + "grad_norm": 1.5077100992202759, + "learning_rate": 2e-05, + "loss": 0.03997686, + "step": 5877 + }, + { + "epoch": 11.756, + "grad_norm": 2.3517305850982666, + "learning_rate": 2e-05, + "loss": 0.03420842, + "step": 5878 + }, + { + "epoch": 11.758, + "grad_norm": 1.4957901239395142, + "learning_rate": 2e-05, + "loss": 0.04347527, + "step": 5879 + }, + { + "epoch": 11.76, + "grad_norm": 1.5121381282806396, + "learning_rate": 2e-05, + "loss": 0.05459063, + "step": 5880 + }, + { + "epoch": 11.762, + "grad_norm": 1.957458734512329, + "learning_rate": 2e-05, + "loss": 0.05225588, + "step": 5881 + }, + { + "epoch": 11.764, + "grad_norm": 1.498754858970642, + "learning_rate": 2e-05, + "loss": 0.03588563, + "step": 5882 + }, + { + "epoch": 11.766, + "grad_norm": 1.5608701705932617, + "learning_rate": 2e-05, + "loss": 0.04653958, + "step": 5883 + }, + { + "epoch": 11.768, + "grad_norm": 2.1723520755767822, + 
"learning_rate": 2e-05, + "loss": 0.05516068, + "step": 5884 + }, + { + "epoch": 11.77, + "grad_norm": 2.607795000076294, + "learning_rate": 2e-05, + "loss": 0.05331718, + "step": 5885 + }, + { + "epoch": 11.772, + "grad_norm": 1.2783042192459106, + "learning_rate": 2e-05, + "loss": 0.04722802, + "step": 5886 + }, + { + "epoch": 11.774000000000001, + "grad_norm": 1.131067156791687, + "learning_rate": 2e-05, + "loss": 0.04217054, + "step": 5887 + }, + { + "epoch": 11.776, + "grad_norm": 2.056450366973877, + "learning_rate": 2e-05, + "loss": 0.04825941, + "step": 5888 + }, + { + "epoch": 11.778, + "grad_norm": 1.3246713876724243, + "learning_rate": 2e-05, + "loss": 0.0532944, + "step": 5889 + }, + { + "epoch": 11.78, + "grad_norm": 1.6754448413848877, + "learning_rate": 2e-05, + "loss": 0.05571581, + "step": 5890 + }, + { + "epoch": 11.782, + "grad_norm": 1.6864081621170044, + "learning_rate": 2e-05, + "loss": 0.04703726, + "step": 5891 + }, + { + "epoch": 11.784, + "grad_norm": 1.1856135129928589, + "learning_rate": 2e-05, + "loss": 0.03456947, + "step": 5892 + }, + { + "epoch": 11.786, + "grad_norm": 1.1907005310058594, + "learning_rate": 2e-05, + "loss": 0.03893632, + "step": 5893 + }, + { + "epoch": 11.788, + "grad_norm": 1.570899248123169, + "learning_rate": 2e-05, + "loss": 0.05421094, + "step": 5894 + }, + { + "epoch": 11.79, + "grad_norm": 2.6174049377441406, + "learning_rate": 2e-05, + "loss": 0.05405217, + "step": 5895 + }, + { + "epoch": 11.792, + "grad_norm": 1.9722440242767334, + "learning_rate": 2e-05, + "loss": 0.04381296, + "step": 5896 + }, + { + "epoch": 11.794, + "grad_norm": 1.0976735353469849, + "learning_rate": 2e-05, + "loss": 0.02918898, + "step": 5897 + }, + { + "epoch": 11.796, + "grad_norm": 1.4373865127563477, + "learning_rate": 2e-05, + "loss": 0.05200865, + "step": 5898 + }, + { + "epoch": 11.798, + "grad_norm": 1.3867244720458984, + "learning_rate": 2e-05, + "loss": 0.03751809, + "step": 5899 + }, + { + "epoch": 11.8, + "grad_norm": 1.449845314025879, + "learning_rate": 2e-05, + "loss": 0.04667427, + "step": 5900 + }, + { + "epoch": 11.802, + "grad_norm": 1.7023156881332397, + "learning_rate": 2e-05, + "loss": 0.03977741, + "step": 5901 + }, + { + "epoch": 11.804, + "grad_norm": 1.2221662998199463, + "learning_rate": 2e-05, + "loss": 0.04769532, + "step": 5902 + }, + { + "epoch": 11.806000000000001, + "grad_norm": 3.443532943725586, + "learning_rate": 2e-05, + "loss": 0.06289086, + "step": 5903 + }, + { + "epoch": 11.808, + "grad_norm": 1.2141717672348022, + "learning_rate": 2e-05, + "loss": 0.0353791, + "step": 5904 + }, + { + "epoch": 11.81, + "grad_norm": 1.126047134399414, + "learning_rate": 2e-05, + "loss": 0.03271578, + "step": 5905 + }, + { + "epoch": 11.812, + "grad_norm": 1.419371485710144, + "learning_rate": 2e-05, + "loss": 0.0543963, + "step": 5906 + }, + { + "epoch": 11.814, + "grad_norm": 2.132117748260498, + "learning_rate": 2e-05, + "loss": 0.06309062, + "step": 5907 + }, + { + "epoch": 11.816, + "grad_norm": 1.5774179697036743, + "learning_rate": 2e-05, + "loss": 0.05101613, + "step": 5908 + }, + { + "epoch": 11.818, + "grad_norm": 1.1032048463821411, + "learning_rate": 2e-05, + "loss": 0.03164658, + "step": 5909 + }, + { + "epoch": 11.82, + "grad_norm": 1.8389184474945068, + "learning_rate": 2e-05, + "loss": 0.05535435, + "step": 5910 + }, + { + "epoch": 11.822, + "grad_norm": 1.3901278972625732, + "learning_rate": 2e-05, + "loss": 0.04581359, + "step": 5911 + }, + { + "epoch": 11.824, + "grad_norm": 1.5085663795471191, + "learning_rate": 
2e-05, + "loss": 0.04446094, + "step": 5912 + }, + { + "epoch": 11.826, + "grad_norm": 1.282586932182312, + "learning_rate": 2e-05, + "loss": 0.04857932, + "step": 5913 + }, + { + "epoch": 11.828, + "grad_norm": 1.1354501247406006, + "learning_rate": 2e-05, + "loss": 0.04103622, + "step": 5914 + }, + { + "epoch": 11.83, + "grad_norm": 1.4101886749267578, + "learning_rate": 2e-05, + "loss": 0.03275158, + "step": 5915 + }, + { + "epoch": 11.832, + "grad_norm": 1.4309121370315552, + "learning_rate": 2e-05, + "loss": 0.04146826, + "step": 5916 + }, + { + "epoch": 11.834, + "grad_norm": 1.0470571517944336, + "learning_rate": 2e-05, + "loss": 0.03013843, + "step": 5917 + }, + { + "epoch": 11.836, + "grad_norm": 2.17950177192688, + "learning_rate": 2e-05, + "loss": 0.06400843, + "step": 5918 + }, + { + "epoch": 11.838, + "grad_norm": 1.5850987434387207, + "learning_rate": 2e-05, + "loss": 0.04890703, + "step": 5919 + }, + { + "epoch": 11.84, + "grad_norm": 2.1424400806427, + "learning_rate": 2e-05, + "loss": 0.04672211, + "step": 5920 + }, + { + "epoch": 11.842, + "grad_norm": 1.6447819471359253, + "learning_rate": 2e-05, + "loss": 0.05776293, + "step": 5921 + }, + { + "epoch": 11.844, + "grad_norm": 1.2918273210525513, + "learning_rate": 2e-05, + "loss": 0.04792162, + "step": 5922 + }, + { + "epoch": 11.846, + "grad_norm": 1.787546157836914, + "learning_rate": 2e-05, + "loss": 0.04592175, + "step": 5923 + }, + { + "epoch": 11.848, + "grad_norm": 2.0610523223876953, + "learning_rate": 2e-05, + "loss": 0.0371214, + "step": 5924 + }, + { + "epoch": 11.85, + "grad_norm": 1.302527666091919, + "learning_rate": 2e-05, + "loss": 0.0393346, + "step": 5925 + }, + { + "epoch": 11.852, + "grad_norm": 1.6477103233337402, + "learning_rate": 2e-05, + "loss": 0.05504544, + "step": 5926 + }, + { + "epoch": 11.854, + "grad_norm": 1.8336002826690674, + "learning_rate": 2e-05, + "loss": 0.05120177, + "step": 5927 + }, + { + "epoch": 11.856, + "grad_norm": 1.318718671798706, + "learning_rate": 2e-05, + "loss": 0.05148219, + "step": 5928 + }, + { + "epoch": 11.858, + "grad_norm": 1.6090713739395142, + "learning_rate": 2e-05, + "loss": 0.05758989, + "step": 5929 + }, + { + "epoch": 11.86, + "grad_norm": 1.4209413528442383, + "learning_rate": 2e-05, + "loss": 0.03771012, + "step": 5930 + }, + { + "epoch": 11.862, + "grad_norm": 1.147481083869934, + "learning_rate": 2e-05, + "loss": 0.05305842, + "step": 5931 + }, + { + "epoch": 11.864, + "grad_norm": 5.9689130783081055, + "learning_rate": 2e-05, + "loss": 0.03842753, + "step": 5932 + }, + { + "epoch": 11.866, + "grad_norm": 2.704475164413452, + "learning_rate": 2e-05, + "loss": 0.04317836, + "step": 5933 + }, + { + "epoch": 11.868, + "grad_norm": 13.987378120422363, + "learning_rate": 2e-05, + "loss": 0.04295823, + "step": 5934 + }, + { + "epoch": 11.87, + "grad_norm": 2.0759031772613525, + "learning_rate": 2e-05, + "loss": 0.05019844, + "step": 5935 + }, + { + "epoch": 11.872, + "grad_norm": 2.631631851196289, + "learning_rate": 2e-05, + "loss": 0.04314917, + "step": 5936 + }, + { + "epoch": 11.874, + "grad_norm": 1.2975753545761108, + "learning_rate": 2e-05, + "loss": 0.0499692, + "step": 5937 + }, + { + "epoch": 11.876, + "grad_norm": 1.5606701374053955, + "learning_rate": 2e-05, + "loss": 0.04614861, + "step": 5938 + }, + { + "epoch": 11.878, + "grad_norm": 1.3647533655166626, + "learning_rate": 2e-05, + "loss": 0.03900645, + "step": 5939 + }, + { + "epoch": 11.88, + "grad_norm": 1.462010145187378, + "learning_rate": 2e-05, + "loss": 0.05590525, + "step": 5940 + }, 
+ { + "epoch": 11.882, + "grad_norm": 2.939069986343384, + "learning_rate": 2e-05, + "loss": 0.06512713, + "step": 5941 + }, + { + "epoch": 11.884, + "grad_norm": 2.307407855987549, + "learning_rate": 2e-05, + "loss": 0.04771993, + "step": 5942 + }, + { + "epoch": 11.886, + "grad_norm": 1.784456491470337, + "learning_rate": 2e-05, + "loss": 0.05009575, + "step": 5943 + }, + { + "epoch": 11.888, + "grad_norm": 1.886611819267273, + "learning_rate": 2e-05, + "loss": 0.05641106, + "step": 5944 + }, + { + "epoch": 11.89, + "grad_norm": 1.1477409601211548, + "learning_rate": 2e-05, + "loss": 0.0363906, + "step": 5945 + }, + { + "epoch": 11.892, + "grad_norm": 1.2578458786010742, + "learning_rate": 2e-05, + "loss": 0.04756481, + "step": 5946 + }, + { + "epoch": 11.894, + "grad_norm": 1.938241958618164, + "learning_rate": 2e-05, + "loss": 0.0651107, + "step": 5947 + }, + { + "epoch": 11.896, + "grad_norm": 2.048572063446045, + "learning_rate": 2e-05, + "loss": 0.05753088, + "step": 5948 + }, + { + "epoch": 11.898, + "grad_norm": 1.759401559829712, + "learning_rate": 2e-05, + "loss": 0.0439103, + "step": 5949 + }, + { + "epoch": 11.9, + "grad_norm": 1.6436753273010254, + "learning_rate": 2e-05, + "loss": 0.04948621, + "step": 5950 + }, + { + "epoch": 11.902, + "grad_norm": 1.4181915521621704, + "learning_rate": 2e-05, + "loss": 0.0447426, + "step": 5951 + }, + { + "epoch": 11.904, + "grad_norm": 1.2882474660873413, + "learning_rate": 2e-05, + "loss": 0.05199945, + "step": 5952 + }, + { + "epoch": 11.906, + "grad_norm": 1.0682871341705322, + "learning_rate": 2e-05, + "loss": 0.03927372, + "step": 5953 + }, + { + "epoch": 11.908, + "grad_norm": 1.2409359216690063, + "learning_rate": 2e-05, + "loss": 0.05247876, + "step": 5954 + }, + { + "epoch": 11.91, + "grad_norm": 1.4313156604766846, + "learning_rate": 2e-05, + "loss": 0.04420187, + "step": 5955 + }, + { + "epoch": 11.912, + "grad_norm": 1.0682390928268433, + "learning_rate": 2e-05, + "loss": 0.03606921, + "step": 5956 + }, + { + "epoch": 11.914, + "grad_norm": 1.4402589797973633, + "learning_rate": 2e-05, + "loss": 0.0467343, + "step": 5957 + }, + { + "epoch": 11.916, + "grad_norm": 1.4133102893829346, + "learning_rate": 2e-05, + "loss": 0.04049528, + "step": 5958 + }, + { + "epoch": 11.918, + "grad_norm": 1.3268779516220093, + "learning_rate": 2e-05, + "loss": 0.06124356, + "step": 5959 + }, + { + "epoch": 11.92, + "grad_norm": 1.5016638040542603, + "learning_rate": 2e-05, + "loss": 0.04653203, + "step": 5960 + }, + { + "epoch": 11.922, + "grad_norm": 1.280086874961853, + "learning_rate": 2e-05, + "loss": 0.04292293, + "step": 5961 + }, + { + "epoch": 11.924, + "grad_norm": 1.5201990604400635, + "learning_rate": 2e-05, + "loss": 0.05881717, + "step": 5962 + }, + { + "epoch": 11.926, + "grad_norm": 2.3044865131378174, + "learning_rate": 2e-05, + "loss": 0.06573568, + "step": 5963 + }, + { + "epoch": 11.928, + "grad_norm": 1.8041393756866455, + "learning_rate": 2e-05, + "loss": 0.05998692, + "step": 5964 + }, + { + "epoch": 11.93, + "grad_norm": 1.9705407619476318, + "learning_rate": 2e-05, + "loss": 0.05393011, + "step": 5965 + }, + { + "epoch": 11.932, + "grad_norm": 1.5527033805847168, + "learning_rate": 2e-05, + "loss": 0.05403456, + "step": 5966 + }, + { + "epoch": 11.934, + "grad_norm": 1.115352749824524, + "learning_rate": 2e-05, + "loss": 0.03984626, + "step": 5967 + }, + { + "epoch": 11.936, + "grad_norm": 2.1990714073181152, + "learning_rate": 2e-05, + "loss": 0.05849539, + "step": 5968 + }, + { + "epoch": 11.938, + "grad_norm": 
2.753941535949707, + "learning_rate": 2e-05, + "loss": 0.05485374, + "step": 5969 + }, + { + "epoch": 11.94, + "grad_norm": 1.1634223461151123, + "learning_rate": 2e-05, + "loss": 0.04611646, + "step": 5970 + }, + { + "epoch": 11.942, + "grad_norm": 1.3184393644332886, + "learning_rate": 2e-05, + "loss": 0.04198894, + "step": 5971 + }, + { + "epoch": 11.943999999999999, + "grad_norm": 1.2988426685333252, + "learning_rate": 2e-05, + "loss": 0.05265917, + "step": 5972 + }, + { + "epoch": 11.946, + "grad_norm": 1.7585318088531494, + "learning_rate": 2e-05, + "loss": 0.05654293, + "step": 5973 + }, + { + "epoch": 11.948, + "grad_norm": 1.6478968858718872, + "learning_rate": 2e-05, + "loss": 0.03530508, + "step": 5974 + }, + { + "epoch": 11.95, + "grad_norm": 2.3206682205200195, + "learning_rate": 2e-05, + "loss": 0.04617747, + "step": 5975 + }, + { + "epoch": 11.952, + "grad_norm": 1.2929635047912598, + "learning_rate": 2e-05, + "loss": 0.03368628, + "step": 5976 + }, + { + "epoch": 11.954, + "grad_norm": 1.047960877418518, + "learning_rate": 2e-05, + "loss": 0.03170536, + "step": 5977 + }, + { + "epoch": 11.956, + "grad_norm": 1.0071247816085815, + "learning_rate": 2e-05, + "loss": 0.02869906, + "step": 5978 + }, + { + "epoch": 11.958, + "grad_norm": 2.0578083992004395, + "learning_rate": 2e-05, + "loss": 0.04879463, + "step": 5979 + }, + { + "epoch": 11.96, + "grad_norm": 1.6822428703308105, + "learning_rate": 2e-05, + "loss": 0.0515471, + "step": 5980 + }, + { + "epoch": 11.962, + "grad_norm": 1.1344988346099854, + "learning_rate": 2e-05, + "loss": 0.03721001, + "step": 5981 + }, + { + "epoch": 11.964, + "grad_norm": 1.0933568477630615, + "learning_rate": 2e-05, + "loss": 0.03774923, + "step": 5982 + }, + { + "epoch": 11.966, + "grad_norm": 1.6839399337768555, + "learning_rate": 2e-05, + "loss": 0.03457904, + "step": 5983 + }, + { + "epoch": 11.968, + "grad_norm": 1.0689489841461182, + "learning_rate": 2e-05, + "loss": 0.0464288, + "step": 5984 + }, + { + "epoch": 11.97, + "grad_norm": 1.0576542615890503, + "learning_rate": 2e-05, + "loss": 0.03478286, + "step": 5985 + }, + { + "epoch": 11.972, + "grad_norm": 1.7853341102600098, + "learning_rate": 2e-05, + "loss": 0.05685054, + "step": 5986 + }, + { + "epoch": 11.974, + "grad_norm": 1.4036171436309814, + "learning_rate": 2e-05, + "loss": 0.04735323, + "step": 5987 + }, + { + "epoch": 11.975999999999999, + "grad_norm": 2.2815001010894775, + "learning_rate": 2e-05, + "loss": 0.06094185, + "step": 5988 + }, + { + "epoch": 11.978, + "grad_norm": 1.4660301208496094, + "learning_rate": 2e-05, + "loss": 0.04432792, + "step": 5989 + }, + { + "epoch": 11.98, + "grad_norm": 2.011078119277954, + "learning_rate": 2e-05, + "loss": 0.05223164, + "step": 5990 + }, + { + "epoch": 11.982, + "grad_norm": 1.2670975923538208, + "learning_rate": 2e-05, + "loss": 0.03780036, + "step": 5991 + }, + { + "epoch": 11.984, + "grad_norm": 1.5381466150283813, + "learning_rate": 2e-05, + "loss": 0.04062825, + "step": 5992 + }, + { + "epoch": 11.986, + "grad_norm": 1.1946245431900024, + "learning_rate": 2e-05, + "loss": 0.03976649, + "step": 5993 + }, + { + "epoch": 11.988, + "grad_norm": 2.6480839252471924, + "learning_rate": 2e-05, + "loss": 0.04942157, + "step": 5994 + }, + { + "epoch": 11.99, + "grad_norm": 1.3954662084579468, + "learning_rate": 2e-05, + "loss": 0.04232785, + "step": 5995 + }, + { + "epoch": 11.992, + "grad_norm": 1.0573922395706177, + "learning_rate": 2e-05, + "loss": 0.03846195, + "step": 5996 + }, + { + "epoch": 11.994, + "grad_norm": 
1.5346850156784058, + "learning_rate": 2e-05, + "loss": 0.04501946, + "step": 5997 + }, + { + "epoch": 11.996, + "grad_norm": 1.567658543586731, + "learning_rate": 2e-05, + "loss": 0.04125125, + "step": 5998 + }, + { + "epoch": 11.998, + "grad_norm": 1.4279097318649292, + "learning_rate": 2e-05, + "loss": 0.03719701, + "step": 5999 + }, + { + "epoch": 12.0, + "grad_norm": 1.4664994478225708, + "learning_rate": 2e-05, + "loss": 0.06088963, + "step": 6000 + }, + { + "epoch": 12.0, + "eval_performance": { + "AngleClassification_1": 0.978, + "AngleClassification_2": 0.992, + "AngleClassification_3": 0.9540918163672655, + "Equal_1": 0.998, + "Equal_2": 0.9800399201596807, + "Equal_3": 0.906187624750499, + "LineComparison_1": 1.0, + "LineComparison_2": 0.998003992015968, + "LineComparison_3": 0.9920159680638723, + "Parallel_1": 0.9919839679358717, + "Parallel_2": 0.9979959919839679, + "Parallel_3": 0.992, + "Perpendicular_1": 0.986, + "Perpendicular_2": 0.946, + "Perpendicular_3": 0.6663326653306614, + "PointLiesOnCircle_1": 1.0, + "PointLiesOnCircle_2": 0.9976666666666667, + "PointLiesOnCircle_3": 0.9856666666666667, + "PointLiesOnLine_1": 0.9959919839679359, + "PointLiesOnLine_2": 0.9979959919839679, + "PointLiesOnLine_3": 0.9680638722554891 + }, + "eval_runtime": 319.0536, + "eval_samples_per_second": 32.91, + "eval_steps_per_second": 0.658, + "step": 6000 + }, + { + "epoch": 12.002, + "grad_norm": 2.38565731048584, + "learning_rate": 2e-05, + "loss": 0.0569401, + "step": 6001 + }, + { + "epoch": 12.004, + "grad_norm": 2.440852403640747, + "learning_rate": 2e-05, + "loss": 0.05189714, + "step": 6002 + }, + { + "epoch": 12.006, + "grad_norm": 1.0071848630905151, + "learning_rate": 2e-05, + "loss": 0.03481928, + "step": 6003 + }, + { + "epoch": 12.008, + "grad_norm": 1.0588632822036743, + "learning_rate": 2e-05, + "loss": 0.03610472, + "step": 6004 + }, + { + "epoch": 12.01, + "grad_norm": 1.5201109647750854, + "learning_rate": 2e-05, + "loss": 0.04367625, + "step": 6005 + }, + { + "epoch": 12.012, + "grad_norm": 1.7423279285430908, + "learning_rate": 2e-05, + "loss": 0.03819691, + "step": 6006 + }, + { + "epoch": 12.014, + "grad_norm": 1.3542839288711548, + "learning_rate": 2e-05, + "loss": 0.0486775, + "step": 6007 + }, + { + "epoch": 12.016, + "grad_norm": 1.939989447593689, + "learning_rate": 2e-05, + "loss": 0.05491658, + "step": 6008 + }, + { + "epoch": 12.018, + "grad_norm": 2.302846670150757, + "learning_rate": 2e-05, + "loss": 0.04849474, + "step": 6009 + }, + { + "epoch": 12.02, + "grad_norm": 1.8097832202911377, + "learning_rate": 2e-05, + "loss": 0.04880159, + "step": 6010 + }, + { + "epoch": 12.022, + "grad_norm": 0.9699602723121643, + "learning_rate": 2e-05, + "loss": 0.03032275, + "step": 6011 + }, + { + "epoch": 12.024, + "grad_norm": 1.2547293901443481, + "learning_rate": 2e-05, + "loss": 0.03023205, + "step": 6012 + }, + { + "epoch": 12.026, + "grad_norm": 1.7490140199661255, + "learning_rate": 2e-05, + "loss": 0.03436543, + "step": 6013 + }, + { + "epoch": 12.028, + "grad_norm": 1.5913435220718384, + "learning_rate": 2e-05, + "loss": 0.03803439, + "step": 6014 + }, + { + "epoch": 12.03, + "grad_norm": 1.5961428880691528, + "learning_rate": 2e-05, + "loss": 0.04362968, + "step": 6015 + }, + { + "epoch": 12.032, + "grad_norm": 1.9875961542129517, + "learning_rate": 2e-05, + "loss": 0.0427319, + "step": 6016 + }, + { + "epoch": 12.034, + "grad_norm": 1.6006630659103394, + "learning_rate": 2e-05, + "loss": 0.04221214, + "step": 6017 + }, + { + "epoch": 12.036, + "grad_norm": 
1.985170602798462, + "learning_rate": 2e-05, + "loss": 0.04991886, + "step": 6018 + }, + { + "epoch": 12.038, + "grad_norm": 1.3990066051483154, + "learning_rate": 2e-05, + "loss": 0.0360059, + "step": 6019 + }, + { + "epoch": 12.04, + "grad_norm": 1.732647180557251, + "learning_rate": 2e-05, + "loss": 0.03687097, + "step": 6020 + }, + { + "epoch": 12.042, + "grad_norm": 1.6655960083007812, + "learning_rate": 2e-05, + "loss": 0.05822805, + "step": 6021 + }, + { + "epoch": 12.044, + "grad_norm": 1.6695098876953125, + "learning_rate": 2e-05, + "loss": 0.04767611, + "step": 6022 + }, + { + "epoch": 12.046, + "grad_norm": 1.7412924766540527, + "learning_rate": 2e-05, + "loss": 0.04930716, + "step": 6023 + }, + { + "epoch": 12.048, + "grad_norm": 1.7823171615600586, + "learning_rate": 2e-05, + "loss": 0.03755423, + "step": 6024 + }, + { + "epoch": 12.05, + "grad_norm": 1.2226248979568481, + "learning_rate": 2e-05, + "loss": 0.04179034, + "step": 6025 + }, + { + "epoch": 12.052, + "grad_norm": 1.5548951625823975, + "learning_rate": 2e-05, + "loss": 0.03161469, + "step": 6026 + }, + { + "epoch": 12.054, + "grad_norm": 1.9556974172592163, + "learning_rate": 2e-05, + "loss": 0.04450768, + "step": 6027 + }, + { + "epoch": 12.056, + "grad_norm": 1.177003026008606, + "learning_rate": 2e-05, + "loss": 0.04270407, + "step": 6028 + }, + { + "epoch": 12.058, + "grad_norm": 1.3655465841293335, + "learning_rate": 2e-05, + "loss": 0.05215304, + "step": 6029 + }, + { + "epoch": 12.06, + "grad_norm": 3.6705446243286133, + "learning_rate": 2e-05, + "loss": 0.05514611, + "step": 6030 + }, + { + "epoch": 12.062, + "grad_norm": 7.220978260040283, + "learning_rate": 2e-05, + "loss": 0.05021281, + "step": 6031 + }, + { + "epoch": 12.064, + "grad_norm": 1.2202095985412598, + "learning_rate": 2e-05, + "loss": 0.03365953, + "step": 6032 + }, + { + "epoch": 12.066, + "grad_norm": 1.2675375938415527, + "learning_rate": 2e-05, + "loss": 0.03978067, + "step": 6033 + }, + { + "epoch": 12.068, + "grad_norm": 1.05901300907135, + "learning_rate": 2e-05, + "loss": 0.03192661, + "step": 6034 + }, + { + "epoch": 12.07, + "grad_norm": 1.8929624557495117, + "learning_rate": 2e-05, + "loss": 0.03948138, + "step": 6035 + }, + { + "epoch": 12.072, + "grad_norm": 1.3291373252868652, + "learning_rate": 2e-05, + "loss": 0.04005025, + "step": 6036 + }, + { + "epoch": 12.074, + "grad_norm": 2.3758695125579834, + "learning_rate": 2e-05, + "loss": 0.04173733, + "step": 6037 + }, + { + "epoch": 12.076, + "grad_norm": 2.500800848007202, + "learning_rate": 2e-05, + "loss": 0.05645321, + "step": 6038 + }, + { + "epoch": 12.078, + "grad_norm": 1.4360809326171875, + "learning_rate": 2e-05, + "loss": 0.04782884, + "step": 6039 + }, + { + "epoch": 12.08, + "grad_norm": 2.3848273754119873, + "learning_rate": 2e-05, + "loss": 0.04527626, + "step": 6040 + }, + { + "epoch": 12.082, + "grad_norm": 1.747944951057434, + "learning_rate": 2e-05, + "loss": 0.07185665, + "step": 6041 + }, + { + "epoch": 12.084, + "grad_norm": 3.546881675720215, + "learning_rate": 2e-05, + "loss": 0.05210949, + "step": 6042 + }, + { + "epoch": 12.086, + "grad_norm": 2.6596226692199707, + "learning_rate": 2e-05, + "loss": 0.04689426, + "step": 6043 + }, + { + "epoch": 12.088, + "grad_norm": 2.573843479156494, + "learning_rate": 2e-05, + "loss": 0.05877293, + "step": 6044 + }, + { + "epoch": 12.09, + "grad_norm": 2.237309694290161, + "learning_rate": 2e-05, + "loss": 0.03652829, + "step": 6045 + }, + { + "epoch": 12.092, + "grad_norm": 1.193651795387268, + "learning_rate": 2e-05, 
+ "loss": 0.03118789, + "step": 6046 + }, + { + "epoch": 12.094, + "grad_norm": 2.136948823928833, + "learning_rate": 2e-05, + "loss": 0.03762619, + "step": 6047 + }, + { + "epoch": 12.096, + "grad_norm": 1.9296658039093018, + "learning_rate": 2e-05, + "loss": 0.05173202, + "step": 6048 + }, + { + "epoch": 12.098, + "grad_norm": 1.0394606590270996, + "learning_rate": 2e-05, + "loss": 0.03266693, + "step": 6049 + }, + { + "epoch": 12.1, + "grad_norm": 1.249806523323059, + "learning_rate": 2e-05, + "loss": 0.03473232, + "step": 6050 + }, + { + "epoch": 12.102, + "grad_norm": 1.4066314697265625, + "learning_rate": 2e-05, + "loss": 0.04402903, + "step": 6051 + }, + { + "epoch": 12.104, + "grad_norm": 1.833040475845337, + "learning_rate": 2e-05, + "loss": 0.03956625, + "step": 6052 + }, + { + "epoch": 12.106, + "grad_norm": 1.421608805656433, + "learning_rate": 2e-05, + "loss": 0.04748692, + "step": 6053 + }, + { + "epoch": 12.108, + "grad_norm": 2.0671706199645996, + "learning_rate": 2e-05, + "loss": 0.04925867, + "step": 6054 + }, + { + "epoch": 12.11, + "grad_norm": 1.106614589691162, + "learning_rate": 2e-05, + "loss": 0.04091745, + "step": 6055 + }, + { + "epoch": 12.112, + "grad_norm": 1.2263094186782837, + "learning_rate": 2e-05, + "loss": 0.04056298, + "step": 6056 + }, + { + "epoch": 12.114, + "grad_norm": 3.7029409408569336, + "learning_rate": 2e-05, + "loss": 0.05836046, + "step": 6057 + }, + { + "epoch": 12.116, + "grad_norm": 1.413335919380188, + "learning_rate": 2e-05, + "loss": 0.05062287, + "step": 6058 + }, + { + "epoch": 12.118, + "grad_norm": 3.4559438228607178, + "learning_rate": 2e-05, + "loss": 0.0437443, + "step": 6059 + }, + { + "epoch": 12.12, + "grad_norm": 1.7302478551864624, + "learning_rate": 2e-05, + "loss": 0.05316646, + "step": 6060 + }, + { + "epoch": 12.122, + "grad_norm": 1.5175634622573853, + "learning_rate": 2e-05, + "loss": 0.04142622, + "step": 6061 + }, + { + "epoch": 12.124, + "grad_norm": 1.6398677825927734, + "learning_rate": 2e-05, + "loss": 0.05954774, + "step": 6062 + }, + { + "epoch": 12.126, + "grad_norm": 1.5120245218276978, + "learning_rate": 2e-05, + "loss": 0.05301241, + "step": 6063 + }, + { + "epoch": 12.128, + "grad_norm": 1.1674778461456299, + "learning_rate": 2e-05, + "loss": 0.04489746, + "step": 6064 + }, + { + "epoch": 12.13, + "grad_norm": 1.299045205116272, + "learning_rate": 2e-05, + "loss": 0.03792151, + "step": 6065 + }, + { + "epoch": 12.132, + "grad_norm": 0.9443352222442627, + "learning_rate": 2e-05, + "loss": 0.02998193, + "step": 6066 + }, + { + "epoch": 12.134, + "grad_norm": 1.6018176078796387, + "learning_rate": 2e-05, + "loss": 0.04062822, + "step": 6067 + }, + { + "epoch": 12.136, + "grad_norm": 2.1333489418029785, + "learning_rate": 2e-05, + "loss": 0.03400786, + "step": 6068 + }, + { + "epoch": 12.138, + "grad_norm": 1.8581465482711792, + "learning_rate": 2e-05, + "loss": 0.05270678, + "step": 6069 + }, + { + "epoch": 12.14, + "grad_norm": 1.1669179201126099, + "learning_rate": 2e-05, + "loss": 0.04782745, + "step": 6070 + }, + { + "epoch": 12.142, + "grad_norm": 2.276296615600586, + "learning_rate": 2e-05, + "loss": 0.03568403, + "step": 6071 + }, + { + "epoch": 12.144, + "grad_norm": 1.1378567218780518, + "learning_rate": 2e-05, + "loss": 0.03819509, + "step": 6072 + }, + { + "epoch": 12.146, + "grad_norm": 2.1486968994140625, + "learning_rate": 2e-05, + "loss": 0.04405282, + "step": 6073 + }, + { + "epoch": 12.148, + "grad_norm": 1.2601561546325684, + "learning_rate": 2e-05, + "loss": 0.03810599, + "step": 6074 + }, 
+ { + "epoch": 12.15, + "grad_norm": 1.7045177221298218, + "learning_rate": 2e-05, + "loss": 0.04431679, + "step": 6075 + }, + { + "epoch": 12.152, + "grad_norm": 1.4039207696914673, + "learning_rate": 2e-05, + "loss": 0.05113788, + "step": 6076 + }, + { + "epoch": 12.154, + "grad_norm": 1.8916300535202026, + "learning_rate": 2e-05, + "loss": 0.0379601, + "step": 6077 + }, + { + "epoch": 12.156, + "grad_norm": 1.337270736694336, + "learning_rate": 2e-05, + "loss": 0.04206997, + "step": 6078 + }, + { + "epoch": 12.158, + "grad_norm": 1.3554550409317017, + "learning_rate": 2e-05, + "loss": 0.04288577, + "step": 6079 + }, + { + "epoch": 12.16, + "grad_norm": 1.9474183320999146, + "learning_rate": 2e-05, + "loss": 0.04348049, + "step": 6080 + }, + { + "epoch": 12.162, + "grad_norm": 0.8301889300346375, + "learning_rate": 2e-05, + "loss": 0.02466136, + "step": 6081 + }, + { + "epoch": 12.164, + "grad_norm": 1.5788853168487549, + "learning_rate": 2e-05, + "loss": 0.05005291, + "step": 6082 + }, + { + "epoch": 12.166, + "grad_norm": 2.1936469078063965, + "learning_rate": 2e-05, + "loss": 0.05456689, + "step": 6083 + }, + { + "epoch": 12.168, + "grad_norm": 1.8480530977249146, + "learning_rate": 2e-05, + "loss": 0.05015424, + "step": 6084 + }, + { + "epoch": 12.17, + "grad_norm": 1.3784325122833252, + "learning_rate": 2e-05, + "loss": 0.03511147, + "step": 6085 + }, + { + "epoch": 12.172, + "grad_norm": 1.4679827690124512, + "learning_rate": 2e-05, + "loss": 0.028723, + "step": 6086 + }, + { + "epoch": 12.174, + "grad_norm": 1.677220344543457, + "learning_rate": 2e-05, + "loss": 0.04889953, + "step": 6087 + }, + { + "epoch": 12.176, + "grad_norm": 1.5468508005142212, + "learning_rate": 2e-05, + "loss": 0.04408753, + "step": 6088 + }, + { + "epoch": 12.178, + "grad_norm": 1.36641526222229, + "learning_rate": 2e-05, + "loss": 0.03904226, + "step": 6089 + }, + { + "epoch": 12.18, + "grad_norm": 2.0277597904205322, + "learning_rate": 2e-05, + "loss": 0.03122667, + "step": 6090 + }, + { + "epoch": 12.182, + "grad_norm": 2.095195770263672, + "learning_rate": 2e-05, + "loss": 0.04639265, + "step": 6091 + }, + { + "epoch": 12.184, + "grad_norm": 1.4929248094558716, + "learning_rate": 2e-05, + "loss": 0.04378875, + "step": 6092 + }, + { + "epoch": 12.186, + "grad_norm": 1.040937066078186, + "learning_rate": 2e-05, + "loss": 0.02706816, + "step": 6093 + }, + { + "epoch": 12.188, + "grad_norm": 1.5594635009765625, + "learning_rate": 2e-05, + "loss": 0.02982716, + "step": 6094 + }, + { + "epoch": 12.19, + "grad_norm": 1.0690845251083374, + "learning_rate": 2e-05, + "loss": 0.02932034, + "step": 6095 + }, + { + "epoch": 12.192, + "grad_norm": 1.3992670774459839, + "learning_rate": 2e-05, + "loss": 0.04999903, + "step": 6096 + }, + { + "epoch": 12.194, + "grad_norm": 1.2843531370162964, + "learning_rate": 2e-05, + "loss": 0.02511716, + "step": 6097 + }, + { + "epoch": 12.196, + "grad_norm": 1.8418145179748535, + "learning_rate": 2e-05, + "loss": 0.04045618, + "step": 6098 + }, + { + "epoch": 12.198, + "grad_norm": 1.3255176544189453, + "learning_rate": 2e-05, + "loss": 0.04162053, + "step": 6099 + }, + { + "epoch": 12.2, + "grad_norm": 1.1222180128097534, + "learning_rate": 2e-05, + "loss": 0.02524855, + "step": 6100 + }, + { + "epoch": 12.202, + "grad_norm": 1.5370876789093018, + "learning_rate": 2e-05, + "loss": 0.04771005, + "step": 6101 + }, + { + "epoch": 12.204, + "grad_norm": 1.1293209791183472, + "learning_rate": 2e-05, + "loss": 0.03034871, + "step": 6102 + }, + { + "epoch": 12.206, + "grad_norm": 
1.13023042678833, + "learning_rate": 2e-05, + "loss": 0.02627375, + "step": 6103 + }, + { + "epoch": 12.208, + "grad_norm": 1.3612079620361328, + "learning_rate": 2e-05, + "loss": 0.04360874, + "step": 6104 + }, + { + "epoch": 12.21, + "grad_norm": 3.503488779067993, + "learning_rate": 2e-05, + "loss": 0.03669972, + "step": 6105 + }, + { + "epoch": 12.212, + "grad_norm": 2.4305996894836426, + "learning_rate": 2e-05, + "loss": 0.05017614, + "step": 6106 + }, + { + "epoch": 12.214, + "grad_norm": 1.4109909534454346, + "learning_rate": 2e-05, + "loss": 0.03262326, + "step": 6107 + }, + { + "epoch": 12.216, + "grad_norm": 1.8456965684890747, + "learning_rate": 2e-05, + "loss": 0.0493413, + "step": 6108 + }, + { + "epoch": 12.218, + "grad_norm": 1.6295368671417236, + "learning_rate": 2e-05, + "loss": 0.04406694, + "step": 6109 + }, + { + "epoch": 12.22, + "grad_norm": 1.2703592777252197, + "learning_rate": 2e-05, + "loss": 0.04533841, + "step": 6110 + }, + { + "epoch": 12.222, + "grad_norm": 3.6067559719085693, + "learning_rate": 2e-05, + "loss": 0.05607805, + "step": 6111 + }, + { + "epoch": 12.224, + "grad_norm": 1.2701184749603271, + "learning_rate": 2e-05, + "loss": 0.04785171, + "step": 6112 + }, + { + "epoch": 12.226, + "grad_norm": 1.1288963556289673, + "learning_rate": 2e-05, + "loss": 0.04206613, + "step": 6113 + }, + { + "epoch": 12.228, + "grad_norm": 1.6207784414291382, + "learning_rate": 2e-05, + "loss": 0.04109996, + "step": 6114 + }, + { + "epoch": 12.23, + "grad_norm": 1.5829488039016724, + "learning_rate": 2e-05, + "loss": 0.03511421, + "step": 6115 + }, + { + "epoch": 12.232, + "grad_norm": 1.4869654178619385, + "learning_rate": 2e-05, + "loss": 0.04815879, + "step": 6116 + }, + { + "epoch": 12.234, + "grad_norm": 1.412255048751831, + "learning_rate": 2e-05, + "loss": 0.0408961, + "step": 6117 + }, + { + "epoch": 12.236, + "grad_norm": 0.9086294174194336, + "learning_rate": 2e-05, + "loss": 0.03307983, + "step": 6118 + }, + { + "epoch": 12.238, + "grad_norm": 1.4410475492477417, + "learning_rate": 2e-05, + "loss": 0.04874083, + "step": 6119 + }, + { + "epoch": 12.24, + "grad_norm": 2.1542203426361084, + "learning_rate": 2e-05, + "loss": 0.03492495, + "step": 6120 + }, + { + "epoch": 12.242, + "grad_norm": 1.4290752410888672, + "learning_rate": 2e-05, + "loss": 0.03959789, + "step": 6121 + }, + { + "epoch": 12.244, + "grad_norm": 3.0469701290130615, + "learning_rate": 2e-05, + "loss": 0.07812107, + "step": 6122 + }, + { + "epoch": 12.246, + "grad_norm": 1.6953679323196411, + "learning_rate": 2e-05, + "loss": 0.02141916, + "step": 6123 + }, + { + "epoch": 12.248, + "grad_norm": 1.8195632696151733, + "learning_rate": 2e-05, + "loss": 0.0405052, + "step": 6124 + }, + { + "epoch": 12.25, + "grad_norm": 2.004796266555786, + "learning_rate": 2e-05, + "loss": 0.03984334, + "step": 6125 + }, + { + "epoch": 12.252, + "grad_norm": 0.8493677973747253, + "learning_rate": 2e-05, + "loss": 0.03030949, + "step": 6126 + }, + { + "epoch": 12.254, + "grad_norm": 1.603154182434082, + "learning_rate": 2e-05, + "loss": 0.03452389, + "step": 6127 + }, + { + "epoch": 12.256, + "grad_norm": 1.4318041801452637, + "learning_rate": 2e-05, + "loss": 0.05320135, + "step": 6128 + }, + { + "epoch": 12.258, + "grad_norm": 1.305381178855896, + "learning_rate": 2e-05, + "loss": 0.04058621, + "step": 6129 + }, + { + "epoch": 12.26, + "grad_norm": 1.0062555074691772, + "learning_rate": 2e-05, + "loss": 0.03081638, + "step": 6130 + }, + { + "epoch": 12.262, + "grad_norm": 1.51310133934021, + "learning_rate": 
2e-05, + "loss": 0.03336924, + "step": 6131 + }, + { + "epoch": 12.264, + "grad_norm": 1.28446626663208, + "learning_rate": 2e-05, + "loss": 0.03799494, + "step": 6132 + }, + { + "epoch": 12.266, + "grad_norm": 2.550501823425293, + "learning_rate": 2e-05, + "loss": 0.05581835, + "step": 6133 + }, + { + "epoch": 12.268, + "grad_norm": 1.476683259010315, + "learning_rate": 2e-05, + "loss": 0.04637709, + "step": 6134 + }, + { + "epoch": 12.27, + "grad_norm": 2.046898126602173, + "learning_rate": 2e-05, + "loss": 0.04047813, + "step": 6135 + }, + { + "epoch": 12.272, + "grad_norm": 1.6879256963729858, + "learning_rate": 2e-05, + "loss": 0.050128, + "step": 6136 + }, + { + "epoch": 12.274000000000001, + "grad_norm": 1.1848565340042114, + "learning_rate": 2e-05, + "loss": 0.03365932, + "step": 6137 + }, + { + "epoch": 12.276, + "grad_norm": 1.4317147731781006, + "learning_rate": 2e-05, + "loss": 0.04208836, + "step": 6138 + }, + { + "epoch": 12.278, + "grad_norm": 0.8532785773277283, + "learning_rate": 2e-05, + "loss": 0.02701728, + "step": 6139 + }, + { + "epoch": 12.28, + "grad_norm": 2.347813129425049, + "learning_rate": 2e-05, + "loss": 0.05657571, + "step": 6140 + }, + { + "epoch": 12.282, + "grad_norm": 1.616212248802185, + "learning_rate": 2e-05, + "loss": 0.0320894, + "step": 6141 + }, + { + "epoch": 12.284, + "grad_norm": 1.4436291456222534, + "learning_rate": 2e-05, + "loss": 0.03790341, + "step": 6142 + }, + { + "epoch": 12.286, + "grad_norm": 1.915105938911438, + "learning_rate": 2e-05, + "loss": 0.04711257, + "step": 6143 + }, + { + "epoch": 12.288, + "grad_norm": 1.6643471717834473, + "learning_rate": 2e-05, + "loss": 0.02922626, + "step": 6144 + }, + { + "epoch": 12.29, + "grad_norm": 1.9651648998260498, + "learning_rate": 2e-05, + "loss": 0.06149007, + "step": 6145 + }, + { + "epoch": 12.292, + "grad_norm": 1.5634087324142456, + "learning_rate": 2e-05, + "loss": 0.03441588, + "step": 6146 + }, + { + "epoch": 12.294, + "grad_norm": 3.6139161586761475, + "learning_rate": 2e-05, + "loss": 0.0543981, + "step": 6147 + }, + { + "epoch": 12.296, + "grad_norm": 1.771533489227295, + "learning_rate": 2e-05, + "loss": 0.04886418, + "step": 6148 + }, + { + "epoch": 12.298, + "grad_norm": 1.4337700605392456, + "learning_rate": 2e-05, + "loss": 0.02946185, + "step": 6149 + }, + { + "epoch": 12.3, + "grad_norm": 1.214970350265503, + "learning_rate": 2e-05, + "loss": 0.03541949, + "step": 6150 + }, + { + "epoch": 12.302, + "grad_norm": 1.8696826696395874, + "learning_rate": 2e-05, + "loss": 0.04219283, + "step": 6151 + }, + { + "epoch": 12.304, + "grad_norm": 1.1375106573104858, + "learning_rate": 2e-05, + "loss": 0.02643434, + "step": 6152 + }, + { + "epoch": 12.306, + "grad_norm": 1.9617277383804321, + "learning_rate": 2e-05, + "loss": 0.04668794, + "step": 6153 + }, + { + "epoch": 12.308, + "grad_norm": 1.219461441040039, + "learning_rate": 2e-05, + "loss": 0.02614435, + "step": 6154 + }, + { + "epoch": 12.31, + "grad_norm": 2.0026473999023438, + "learning_rate": 2e-05, + "loss": 0.05046513, + "step": 6155 + }, + { + "epoch": 12.312, + "grad_norm": 2.2148213386535645, + "learning_rate": 2e-05, + "loss": 0.03219437, + "step": 6156 + }, + { + "epoch": 12.314, + "grad_norm": 1.3021752834320068, + "learning_rate": 2e-05, + "loss": 0.03959469, + "step": 6157 + }, + { + "epoch": 12.316, + "grad_norm": 2.0185019969940186, + "learning_rate": 2e-05, + "loss": 0.04784639, + "step": 6158 + }, + { + "epoch": 12.318, + "grad_norm": 1.835060954093933, + "learning_rate": 2e-05, + "loss": 0.04800402, + 
"step": 6159 + }, + { + "epoch": 12.32, + "grad_norm": 1.1471303701400757, + "learning_rate": 2e-05, + "loss": 0.04062325, + "step": 6160 + }, + { + "epoch": 12.322, + "grad_norm": 1.5545427799224854, + "learning_rate": 2e-05, + "loss": 0.05014087, + "step": 6161 + }, + { + "epoch": 12.324, + "grad_norm": 1.3972580432891846, + "learning_rate": 2e-05, + "loss": 0.03715761, + "step": 6162 + }, + { + "epoch": 12.326, + "grad_norm": 1.1503256559371948, + "learning_rate": 2e-05, + "loss": 0.03644323, + "step": 6163 + }, + { + "epoch": 12.328, + "grad_norm": 2.5401759147644043, + "learning_rate": 2e-05, + "loss": 0.04062868, + "step": 6164 + }, + { + "epoch": 12.33, + "grad_norm": 1.3734387159347534, + "learning_rate": 2e-05, + "loss": 0.04200601, + "step": 6165 + }, + { + "epoch": 12.332, + "grad_norm": 1.1165435314178467, + "learning_rate": 2e-05, + "loss": 0.03473271, + "step": 6166 + }, + { + "epoch": 12.334, + "grad_norm": 1.3993453979492188, + "learning_rate": 2e-05, + "loss": 0.0432322, + "step": 6167 + }, + { + "epoch": 12.336, + "grad_norm": 1.486961841583252, + "learning_rate": 2e-05, + "loss": 0.04133502, + "step": 6168 + }, + { + "epoch": 12.338, + "grad_norm": 1.8763487339019775, + "learning_rate": 2e-05, + "loss": 0.04380414, + "step": 6169 + }, + { + "epoch": 12.34, + "grad_norm": 3.696859121322632, + "learning_rate": 2e-05, + "loss": 0.05829802, + "step": 6170 + }, + { + "epoch": 12.342, + "grad_norm": 1.087334156036377, + "learning_rate": 2e-05, + "loss": 0.03167205, + "step": 6171 + }, + { + "epoch": 12.344, + "grad_norm": 1.7672123908996582, + "learning_rate": 2e-05, + "loss": 0.0549523, + "step": 6172 + }, + { + "epoch": 12.346, + "grad_norm": 1.4898287057876587, + "learning_rate": 2e-05, + "loss": 0.05321936, + "step": 6173 + }, + { + "epoch": 12.348, + "grad_norm": 1.1989002227783203, + "learning_rate": 2e-05, + "loss": 0.03441871, + "step": 6174 + }, + { + "epoch": 12.35, + "grad_norm": 1.5749406814575195, + "learning_rate": 2e-05, + "loss": 0.04303573, + "step": 6175 + }, + { + "epoch": 12.352, + "grad_norm": 1.9614742994308472, + "learning_rate": 2e-05, + "loss": 0.06837121, + "step": 6176 + }, + { + "epoch": 12.354, + "grad_norm": 2.225933313369751, + "learning_rate": 2e-05, + "loss": 0.04865203, + "step": 6177 + }, + { + "epoch": 12.356, + "grad_norm": 2.307185173034668, + "learning_rate": 2e-05, + "loss": 0.05228138, + "step": 6178 + }, + { + "epoch": 12.358, + "grad_norm": 1.664549708366394, + "learning_rate": 2e-05, + "loss": 0.04851029, + "step": 6179 + }, + { + "epoch": 12.36, + "grad_norm": 1.9192323684692383, + "learning_rate": 2e-05, + "loss": 0.03343064, + "step": 6180 + }, + { + "epoch": 12.362, + "grad_norm": 1.0249700546264648, + "learning_rate": 2e-05, + "loss": 0.02511407, + "step": 6181 + }, + { + "epoch": 12.364, + "grad_norm": 1.6150821447372437, + "learning_rate": 2e-05, + "loss": 0.0319114, + "step": 6182 + }, + { + "epoch": 12.366, + "grad_norm": 1.0298298597335815, + "learning_rate": 2e-05, + "loss": 0.02553884, + "step": 6183 + }, + { + "epoch": 12.368, + "grad_norm": 2.103635787963867, + "learning_rate": 2e-05, + "loss": 0.05173234, + "step": 6184 + }, + { + "epoch": 12.37, + "grad_norm": 3.9606614112854004, + "learning_rate": 2e-05, + "loss": 0.04179762, + "step": 6185 + }, + { + "epoch": 12.372, + "grad_norm": 1.0706768035888672, + "learning_rate": 2e-05, + "loss": 0.03646227, + "step": 6186 + }, + { + "epoch": 12.374, + "grad_norm": 1.6147788763046265, + "learning_rate": 2e-05, + "loss": 0.03327171, + "step": 6187 + }, + { + "epoch": 12.376, + 
"grad_norm": 2.25844407081604, + "learning_rate": 2e-05, + "loss": 0.05061604, + "step": 6188 + }, + { + "epoch": 12.378, + "grad_norm": 1.574654459953308, + "learning_rate": 2e-05, + "loss": 0.04572042, + "step": 6189 + }, + { + "epoch": 12.38, + "grad_norm": 1.6320792436599731, + "learning_rate": 2e-05, + "loss": 0.04378324, + "step": 6190 + }, + { + "epoch": 12.382, + "grad_norm": 2.3261327743530273, + "learning_rate": 2e-05, + "loss": 0.04833006, + "step": 6191 + }, + { + "epoch": 12.384, + "grad_norm": 1.5708943605422974, + "learning_rate": 2e-05, + "loss": 0.03576447, + "step": 6192 + }, + { + "epoch": 12.386, + "grad_norm": 1.543707251548767, + "learning_rate": 2e-05, + "loss": 0.03951041, + "step": 6193 + }, + { + "epoch": 12.388, + "grad_norm": 1.1679742336273193, + "learning_rate": 2e-05, + "loss": 0.03864168, + "step": 6194 + }, + { + "epoch": 12.39, + "grad_norm": 2.291689157485962, + "learning_rate": 2e-05, + "loss": 0.04182493, + "step": 6195 + }, + { + "epoch": 12.392, + "grad_norm": 1.7392657995224, + "learning_rate": 2e-05, + "loss": 0.04270639, + "step": 6196 + }, + { + "epoch": 12.394, + "grad_norm": 1.5574685335159302, + "learning_rate": 2e-05, + "loss": 0.03201068, + "step": 6197 + }, + { + "epoch": 12.396, + "grad_norm": 1.4459373950958252, + "learning_rate": 2e-05, + "loss": 0.05233443, + "step": 6198 + }, + { + "epoch": 12.398, + "grad_norm": 1.459169626235962, + "learning_rate": 2e-05, + "loss": 0.0575183, + "step": 6199 + }, + { + "epoch": 12.4, + "grad_norm": 1.6888347864151, + "learning_rate": 2e-05, + "loss": 0.03846366, + "step": 6200 + }, + { + "epoch": 12.402, + "grad_norm": 2.6593565940856934, + "learning_rate": 2e-05, + "loss": 0.03823034, + "step": 6201 + }, + { + "epoch": 12.404, + "grad_norm": 1.6569043397903442, + "learning_rate": 2e-05, + "loss": 0.04339511, + "step": 6202 + }, + { + "epoch": 12.406, + "grad_norm": 0.9647795557975769, + "learning_rate": 2e-05, + "loss": 0.03080498, + "step": 6203 + }, + { + "epoch": 12.408, + "grad_norm": 2.6168878078460693, + "learning_rate": 2e-05, + "loss": 0.04123443, + "step": 6204 + }, + { + "epoch": 12.41, + "grad_norm": 1.4847103357315063, + "learning_rate": 2e-05, + "loss": 0.04044367, + "step": 6205 + }, + { + "epoch": 12.412, + "grad_norm": 2.002779483795166, + "learning_rate": 2e-05, + "loss": 0.06478892, + "step": 6206 + }, + { + "epoch": 12.414, + "grad_norm": 1.692628026008606, + "learning_rate": 2e-05, + "loss": 0.04533452, + "step": 6207 + }, + { + "epoch": 12.416, + "grad_norm": 2.04227876663208, + "learning_rate": 2e-05, + "loss": 0.05172842, + "step": 6208 + }, + { + "epoch": 12.418, + "grad_norm": 1.5471951961517334, + "learning_rate": 2e-05, + "loss": 0.04024774, + "step": 6209 + }, + { + "epoch": 12.42, + "grad_norm": 3.0287656784057617, + "learning_rate": 2e-05, + "loss": 0.07643662, + "step": 6210 + }, + { + "epoch": 12.422, + "grad_norm": 1.17807137966156, + "learning_rate": 2e-05, + "loss": 0.03349002, + "step": 6211 + }, + { + "epoch": 12.424, + "grad_norm": 1.3760405778884888, + "learning_rate": 2e-05, + "loss": 0.04548774, + "step": 6212 + }, + { + "epoch": 12.426, + "grad_norm": 1.2264763116836548, + "learning_rate": 2e-05, + "loss": 0.03662616, + "step": 6213 + }, + { + "epoch": 12.428, + "grad_norm": 1.1853759288787842, + "learning_rate": 2e-05, + "loss": 0.03420371, + "step": 6214 + }, + { + "epoch": 12.43, + "grad_norm": 1.136852502822876, + "learning_rate": 2e-05, + "loss": 0.04038435, + "step": 6215 + }, + { + "epoch": 12.432, + "grad_norm": 1.1933867931365967, + "learning_rate": 
2e-05, + "loss": 0.04216532, + "step": 6216 + }, + { + "epoch": 12.434, + "grad_norm": 4.264523506164551, + "learning_rate": 2e-05, + "loss": 0.0426877, + "step": 6217 + }, + { + "epoch": 12.436, + "grad_norm": 1.2900351285934448, + "learning_rate": 2e-05, + "loss": 0.03366522, + "step": 6218 + }, + { + "epoch": 12.438, + "grad_norm": 2.727051258087158, + "learning_rate": 2e-05, + "loss": 0.05351751, + "step": 6219 + }, + { + "epoch": 12.44, + "grad_norm": 1.5386077165603638, + "learning_rate": 2e-05, + "loss": 0.05369819, + "step": 6220 + }, + { + "epoch": 12.442, + "grad_norm": 1.445359468460083, + "learning_rate": 2e-05, + "loss": 0.04263123, + "step": 6221 + }, + { + "epoch": 12.444, + "grad_norm": 1.123322606086731, + "learning_rate": 2e-05, + "loss": 0.04013848, + "step": 6222 + }, + { + "epoch": 12.446, + "grad_norm": 0.9369562268257141, + "learning_rate": 2e-05, + "loss": 0.02055129, + "step": 6223 + }, + { + "epoch": 12.448, + "grad_norm": 1.31740140914917, + "learning_rate": 2e-05, + "loss": 0.04068302, + "step": 6224 + }, + { + "epoch": 12.45, + "grad_norm": 1.307610034942627, + "learning_rate": 2e-05, + "loss": 0.05438361, + "step": 6225 + }, + { + "epoch": 12.452, + "grad_norm": 1.2957713603973389, + "learning_rate": 2e-05, + "loss": 0.02663429, + "step": 6226 + }, + { + "epoch": 12.454, + "grad_norm": 1.6599071025848389, + "learning_rate": 2e-05, + "loss": 0.04203624, + "step": 6227 + }, + { + "epoch": 12.456, + "grad_norm": 1.427505373954773, + "learning_rate": 2e-05, + "loss": 0.04064513, + "step": 6228 + }, + { + "epoch": 12.458, + "grad_norm": 1.9157131910324097, + "learning_rate": 2e-05, + "loss": 0.05884474, + "step": 6229 + }, + { + "epoch": 12.46, + "grad_norm": 1.3565418720245361, + "learning_rate": 2e-05, + "loss": 0.04177081, + "step": 6230 + }, + { + "epoch": 12.462, + "grad_norm": 1.1073006391525269, + "learning_rate": 2e-05, + "loss": 0.03636095, + "step": 6231 + }, + { + "epoch": 12.464, + "grad_norm": 1.0298948287963867, + "learning_rate": 2e-05, + "loss": 0.03898697, + "step": 6232 + }, + { + "epoch": 12.466, + "grad_norm": 1.1140865087509155, + "learning_rate": 2e-05, + "loss": 0.03568536, + "step": 6233 + }, + { + "epoch": 12.468, + "grad_norm": 1.7985575199127197, + "learning_rate": 2e-05, + "loss": 0.04029547, + "step": 6234 + }, + { + "epoch": 12.47, + "grad_norm": 1.2943682670593262, + "learning_rate": 2e-05, + "loss": 0.03818323, + "step": 6235 + }, + { + "epoch": 12.472, + "grad_norm": 1.2726134061813354, + "learning_rate": 2e-05, + "loss": 0.04815301, + "step": 6236 + }, + { + "epoch": 12.474, + "grad_norm": 1.8308722972869873, + "learning_rate": 2e-05, + "loss": 0.05556687, + "step": 6237 + }, + { + "epoch": 12.475999999999999, + "grad_norm": 1.4734604358673096, + "learning_rate": 2e-05, + "loss": 0.05400109, + "step": 6238 + }, + { + "epoch": 12.478, + "grad_norm": 1.909253478050232, + "learning_rate": 2e-05, + "loss": 0.04208795, + "step": 6239 + }, + { + "epoch": 12.48, + "grad_norm": 1.3197088241577148, + "learning_rate": 2e-05, + "loss": 0.03566179, + "step": 6240 + }, + { + "epoch": 12.482, + "grad_norm": 1.8218048810958862, + "learning_rate": 2e-05, + "loss": 0.05417576, + "step": 6241 + }, + { + "epoch": 12.484, + "grad_norm": 1.8446364402770996, + "learning_rate": 2e-05, + "loss": 0.04411314, + "step": 6242 + }, + { + "epoch": 12.486, + "grad_norm": 1.5696487426757812, + "learning_rate": 2e-05, + "loss": 0.04941075, + "step": 6243 + }, + { + "epoch": 12.488, + "grad_norm": 3.357449531555176, + "learning_rate": 2e-05, + "loss": 0.05497446, + 
"step": 6244 + }, + { + "epoch": 12.49, + "grad_norm": 3.2609894275665283, + "learning_rate": 2e-05, + "loss": 0.06280646, + "step": 6245 + }, + { + "epoch": 12.492, + "grad_norm": 1.8737690448760986, + "learning_rate": 2e-05, + "loss": 0.03448436, + "step": 6246 + }, + { + "epoch": 12.494, + "grad_norm": 1.4557076692581177, + "learning_rate": 2e-05, + "loss": 0.05713212, + "step": 6247 + }, + { + "epoch": 12.496, + "grad_norm": 1.4441951513290405, + "learning_rate": 2e-05, + "loss": 0.04061989, + "step": 6248 + }, + { + "epoch": 12.498, + "grad_norm": 1.1646778583526611, + "learning_rate": 2e-05, + "loss": 0.02681767, + "step": 6249 + }, + { + "epoch": 12.5, + "grad_norm": 1.1207839250564575, + "learning_rate": 2e-05, + "loss": 0.03733048, + "step": 6250 + }, + { + "epoch": 12.502, + "grad_norm": 2.0258734226226807, + "learning_rate": 2e-05, + "loss": 0.05226999, + "step": 6251 + }, + { + "epoch": 12.504, + "grad_norm": 2.436753034591675, + "learning_rate": 2e-05, + "loss": 0.06572986, + "step": 6252 + }, + { + "epoch": 12.506, + "grad_norm": 1.5565096139907837, + "learning_rate": 2e-05, + "loss": 0.04375243, + "step": 6253 + }, + { + "epoch": 12.508, + "grad_norm": 2.353769302368164, + "learning_rate": 2e-05, + "loss": 0.06041701, + "step": 6254 + }, + { + "epoch": 12.51, + "grad_norm": 1.1774601936340332, + "learning_rate": 2e-05, + "loss": 0.04639146, + "step": 6255 + }, + { + "epoch": 12.512, + "grad_norm": 2.4573864936828613, + "learning_rate": 2e-05, + "loss": 0.04883677, + "step": 6256 + }, + { + "epoch": 12.514, + "grad_norm": 2.6108806133270264, + "learning_rate": 2e-05, + "loss": 0.06272373, + "step": 6257 + }, + { + "epoch": 12.516, + "grad_norm": 1.03660249710083, + "learning_rate": 2e-05, + "loss": 0.02872313, + "step": 6258 + }, + { + "epoch": 12.518, + "grad_norm": 2.0671160221099854, + "learning_rate": 2e-05, + "loss": 0.03815708, + "step": 6259 + }, + { + "epoch": 12.52, + "grad_norm": 2.072272777557373, + "learning_rate": 2e-05, + "loss": 0.03585609, + "step": 6260 + }, + { + "epoch": 12.522, + "grad_norm": 1.6502577066421509, + "learning_rate": 2e-05, + "loss": 0.05801409, + "step": 6261 + }, + { + "epoch": 12.524000000000001, + "grad_norm": 4.274342060089111, + "learning_rate": 2e-05, + "loss": 0.04877973, + "step": 6262 + }, + { + "epoch": 12.526, + "grad_norm": 1.2209831476211548, + "learning_rate": 2e-05, + "loss": 0.03942978, + "step": 6263 + }, + { + "epoch": 12.528, + "grad_norm": 1.6523079872131348, + "learning_rate": 2e-05, + "loss": 0.0488749, + "step": 6264 + }, + { + "epoch": 12.53, + "grad_norm": 2.2305943965911865, + "learning_rate": 2e-05, + "loss": 0.05138541, + "step": 6265 + }, + { + "epoch": 12.532, + "grad_norm": 1.3322969675064087, + "learning_rate": 2e-05, + "loss": 0.04032963, + "step": 6266 + }, + { + "epoch": 12.534, + "grad_norm": 2.0621213912963867, + "learning_rate": 2e-05, + "loss": 0.05739632, + "step": 6267 + }, + { + "epoch": 12.536, + "grad_norm": 0.9136289358139038, + "learning_rate": 2e-05, + "loss": 0.03144271, + "step": 6268 + }, + { + "epoch": 12.538, + "grad_norm": 1.2017995119094849, + "learning_rate": 2e-05, + "loss": 0.0261726, + "step": 6269 + }, + { + "epoch": 12.54, + "grad_norm": 1.861670732498169, + "learning_rate": 2e-05, + "loss": 0.05213568, + "step": 6270 + }, + { + "epoch": 12.542, + "grad_norm": 3.512301206588745, + "learning_rate": 2e-05, + "loss": 0.10029002, + "step": 6271 + }, + { + "epoch": 12.544, + "grad_norm": 1.3964629173278809, + "learning_rate": 2e-05, + "loss": 0.0487773, + "step": 6272 + }, + { + "epoch": 
12.546, + "grad_norm": 1.554492473602295, + "learning_rate": 2e-05, + "loss": 0.04269033, + "step": 6273 + }, + { + "epoch": 12.548, + "grad_norm": 1.0274790525436401, + "learning_rate": 2e-05, + "loss": 0.02957164, + "step": 6274 + }, + { + "epoch": 12.55, + "grad_norm": 1.4311920404434204, + "learning_rate": 2e-05, + "loss": 0.03769149, + "step": 6275 + }, + { + "epoch": 12.552, + "grad_norm": 1.9928914308547974, + "learning_rate": 2e-05, + "loss": 0.04190875, + "step": 6276 + }, + { + "epoch": 12.554, + "grad_norm": 1.5183537006378174, + "learning_rate": 2e-05, + "loss": 0.04576535, + "step": 6277 + }, + { + "epoch": 12.556000000000001, + "grad_norm": 2.0962085723876953, + "learning_rate": 2e-05, + "loss": 0.04964176, + "step": 6278 + }, + { + "epoch": 12.558, + "grad_norm": 2.3136391639709473, + "learning_rate": 2e-05, + "loss": 0.03684264, + "step": 6279 + }, + { + "epoch": 12.56, + "grad_norm": 2.110243558883667, + "learning_rate": 2e-05, + "loss": 0.05677021, + "step": 6280 + }, + { + "epoch": 12.562, + "grad_norm": 3.119753360748291, + "learning_rate": 2e-05, + "loss": 0.04740483, + "step": 6281 + }, + { + "epoch": 12.564, + "grad_norm": 1.0421844720840454, + "learning_rate": 2e-05, + "loss": 0.03042347, + "step": 6282 + }, + { + "epoch": 12.566, + "grad_norm": 1.7684757709503174, + "learning_rate": 2e-05, + "loss": 0.04349629, + "step": 6283 + }, + { + "epoch": 12.568, + "grad_norm": 1.600565791130066, + "learning_rate": 2e-05, + "loss": 0.04671545, + "step": 6284 + }, + { + "epoch": 12.57, + "grad_norm": 1.4540395736694336, + "learning_rate": 2e-05, + "loss": 0.04411473, + "step": 6285 + }, + { + "epoch": 12.572, + "grad_norm": 1.092216968536377, + "learning_rate": 2e-05, + "loss": 0.03701637, + "step": 6286 + }, + { + "epoch": 12.574, + "grad_norm": 1.4555920362472534, + "learning_rate": 2e-05, + "loss": 0.05183559, + "step": 6287 + }, + { + "epoch": 12.576, + "grad_norm": 2.1019439697265625, + "learning_rate": 2e-05, + "loss": 0.06192996, + "step": 6288 + }, + { + "epoch": 12.578, + "grad_norm": 1.7618634700775146, + "learning_rate": 2e-05, + "loss": 0.03472544, + "step": 6289 + }, + { + "epoch": 12.58, + "grad_norm": 1.7783581018447876, + "learning_rate": 2e-05, + "loss": 0.04450977, + "step": 6290 + }, + { + "epoch": 12.582, + "grad_norm": 1.32155179977417, + "learning_rate": 2e-05, + "loss": 0.04913367, + "step": 6291 + }, + { + "epoch": 12.584, + "grad_norm": 1.7167394161224365, + "learning_rate": 2e-05, + "loss": 0.03525726, + "step": 6292 + }, + { + "epoch": 12.586, + "grad_norm": 1.5468295812606812, + "learning_rate": 2e-05, + "loss": 0.04274698, + "step": 6293 + }, + { + "epoch": 12.588, + "grad_norm": 1.0729691982269287, + "learning_rate": 2e-05, + "loss": 0.04672535, + "step": 6294 + }, + { + "epoch": 12.59, + "grad_norm": 1.9018189907073975, + "learning_rate": 2e-05, + "loss": 0.04148772, + "step": 6295 + }, + { + "epoch": 12.592, + "grad_norm": 1.4016549587249756, + "learning_rate": 2e-05, + "loss": 0.04215005, + "step": 6296 + }, + { + "epoch": 12.594, + "grad_norm": 1.5528912544250488, + "learning_rate": 2e-05, + "loss": 0.03888253, + "step": 6297 + }, + { + "epoch": 12.596, + "grad_norm": 1.093119740486145, + "learning_rate": 2e-05, + "loss": 0.03328623, + "step": 6298 + }, + { + "epoch": 12.598, + "grad_norm": 1.691413402557373, + "learning_rate": 2e-05, + "loss": 0.04086442, + "step": 6299 + }, + { + "epoch": 12.6, + "grad_norm": 1.854809284210205, + "learning_rate": 2e-05, + "loss": 0.03869913, + "step": 6300 + }, + { + "epoch": 12.602, + "grad_norm": 
3.5889766216278076, + "learning_rate": 2e-05, + "loss": 0.03339052, + "step": 6301 + }, + { + "epoch": 12.604, + "grad_norm": 1.0106459856033325, + "learning_rate": 2e-05, + "loss": 0.03362305, + "step": 6302 + }, + { + "epoch": 12.606, + "grad_norm": 1.3339776992797852, + "learning_rate": 2e-05, + "loss": 0.03432526, + "step": 6303 + }, + { + "epoch": 12.608, + "grad_norm": 1.3809486627578735, + "learning_rate": 2e-05, + "loss": 0.0381794, + "step": 6304 + }, + { + "epoch": 12.61, + "grad_norm": 4.405636787414551, + "learning_rate": 2e-05, + "loss": 0.05852031, + "step": 6305 + }, + { + "epoch": 12.612, + "grad_norm": 2.022235870361328, + "learning_rate": 2e-05, + "loss": 0.04848814, + "step": 6306 + }, + { + "epoch": 12.614, + "grad_norm": 2.3099355697631836, + "learning_rate": 2e-05, + "loss": 0.05684424, + "step": 6307 + }, + { + "epoch": 12.616, + "grad_norm": 1.1495301723480225, + "learning_rate": 2e-05, + "loss": 0.02855151, + "step": 6308 + }, + { + "epoch": 12.618, + "grad_norm": 1.6715397834777832, + "learning_rate": 2e-05, + "loss": 0.03850598, + "step": 6309 + }, + { + "epoch": 12.62, + "grad_norm": 1.773354172706604, + "learning_rate": 2e-05, + "loss": 0.04509133, + "step": 6310 + }, + { + "epoch": 12.622, + "grad_norm": 1.419111967086792, + "learning_rate": 2e-05, + "loss": 0.03800103, + "step": 6311 + }, + { + "epoch": 12.624, + "grad_norm": 1.9277119636535645, + "learning_rate": 2e-05, + "loss": 0.05758407, + "step": 6312 + }, + { + "epoch": 12.626, + "grad_norm": 2.3455944061279297, + "learning_rate": 2e-05, + "loss": 0.05955437, + "step": 6313 + }, + { + "epoch": 12.628, + "grad_norm": 1.5641530752182007, + "learning_rate": 2e-05, + "loss": 0.05085967, + "step": 6314 + }, + { + "epoch": 12.63, + "grad_norm": 4.214975833892822, + "learning_rate": 2e-05, + "loss": 0.04824074, + "step": 6315 + }, + { + "epoch": 12.632, + "grad_norm": 2.380524158477783, + "learning_rate": 2e-05, + "loss": 0.0585499, + "step": 6316 + }, + { + "epoch": 12.634, + "grad_norm": 1.3643995523452759, + "learning_rate": 2e-05, + "loss": 0.04846994, + "step": 6317 + }, + { + "epoch": 12.636, + "grad_norm": 1.191190242767334, + "learning_rate": 2e-05, + "loss": 0.03240199, + "step": 6318 + }, + { + "epoch": 12.638, + "grad_norm": 2.9533886909484863, + "learning_rate": 2e-05, + "loss": 0.0543364, + "step": 6319 + }, + { + "epoch": 12.64, + "grad_norm": 1.2633097171783447, + "learning_rate": 2e-05, + "loss": 0.03655366, + "step": 6320 + }, + { + "epoch": 12.642, + "grad_norm": 1.2036083936691284, + "learning_rate": 2e-05, + "loss": 0.03070158, + "step": 6321 + }, + { + "epoch": 12.644, + "grad_norm": 2.5942792892456055, + "learning_rate": 2e-05, + "loss": 0.04942968, + "step": 6322 + }, + { + "epoch": 12.646, + "grad_norm": 1.0475519895553589, + "learning_rate": 2e-05, + "loss": 0.03110964, + "step": 6323 + }, + { + "epoch": 12.648, + "grad_norm": 1.921798825263977, + "learning_rate": 2e-05, + "loss": 0.06349019, + "step": 6324 + }, + { + "epoch": 12.65, + "grad_norm": 1.2762218713760376, + "learning_rate": 2e-05, + "loss": 0.04748666, + "step": 6325 + }, + { + "epoch": 12.652, + "grad_norm": 1.2782567739486694, + "learning_rate": 2e-05, + "loss": 0.04201008, + "step": 6326 + }, + { + "epoch": 12.654, + "grad_norm": 0.7562774419784546, + "learning_rate": 2e-05, + "loss": 0.02028156, + "step": 6327 + }, + { + "epoch": 12.656, + "grad_norm": 1.9511414766311646, + "learning_rate": 2e-05, + "loss": 0.04364496, + "step": 6328 + }, + { + "epoch": 12.658, + "grad_norm": 1.7642337083816528, + "learning_rate": 
2e-05, + "loss": 0.05323292, + "step": 6329 + }, + { + "epoch": 12.66, + "grad_norm": 2.2341959476470947, + "learning_rate": 2e-05, + "loss": 0.03851413, + "step": 6330 + }, + { + "epoch": 12.662, + "grad_norm": 1.5995410680770874, + "learning_rate": 2e-05, + "loss": 0.0516777, + "step": 6331 + }, + { + "epoch": 12.664, + "grad_norm": 1.8847028017044067, + "learning_rate": 2e-05, + "loss": 0.04388159, + "step": 6332 + }, + { + "epoch": 12.666, + "grad_norm": 2.132364511489868, + "learning_rate": 2e-05, + "loss": 0.04297321, + "step": 6333 + }, + { + "epoch": 12.668, + "grad_norm": 2.32963228225708, + "learning_rate": 2e-05, + "loss": 0.03326223, + "step": 6334 + }, + { + "epoch": 12.67, + "grad_norm": 1.9717942476272583, + "learning_rate": 2e-05, + "loss": 0.04060198, + "step": 6335 + }, + { + "epoch": 12.672, + "grad_norm": 1.5272952318191528, + "learning_rate": 2e-05, + "loss": 0.05175653, + "step": 6336 + }, + { + "epoch": 12.674, + "grad_norm": 1.090294361114502, + "learning_rate": 2e-05, + "loss": 0.05121554, + "step": 6337 + }, + { + "epoch": 12.676, + "grad_norm": 0.9349586367607117, + "learning_rate": 2e-05, + "loss": 0.03004047, + "step": 6338 + }, + { + "epoch": 12.678, + "grad_norm": 3.6723544597625732, + "learning_rate": 2e-05, + "loss": 0.05083375, + "step": 6339 + }, + { + "epoch": 12.68, + "grad_norm": 1.8542966842651367, + "learning_rate": 2e-05, + "loss": 0.05500766, + "step": 6340 + }, + { + "epoch": 12.682, + "grad_norm": 1.6407077312469482, + "learning_rate": 2e-05, + "loss": 0.0521637, + "step": 6341 + }, + { + "epoch": 12.684, + "grad_norm": 1.1183030605316162, + "learning_rate": 2e-05, + "loss": 0.03896394, + "step": 6342 + }, + { + "epoch": 12.686, + "grad_norm": 1.987824559211731, + "learning_rate": 2e-05, + "loss": 0.05758013, + "step": 6343 + }, + { + "epoch": 12.688, + "grad_norm": 1.398348331451416, + "learning_rate": 2e-05, + "loss": 0.04054468, + "step": 6344 + }, + { + "epoch": 12.69, + "grad_norm": 1.3957326412200928, + "learning_rate": 2e-05, + "loss": 0.03729579, + "step": 6345 + }, + { + "epoch": 12.692, + "grad_norm": 1.2762089967727661, + "learning_rate": 2e-05, + "loss": 0.03696339, + "step": 6346 + }, + { + "epoch": 12.693999999999999, + "grad_norm": 2.0938408374786377, + "learning_rate": 2e-05, + "loss": 0.03400442, + "step": 6347 + }, + { + "epoch": 12.696, + "grad_norm": 1.4813979864120483, + "learning_rate": 2e-05, + "loss": 0.03928123, + "step": 6348 + }, + { + "epoch": 12.698, + "grad_norm": 1.0480759143829346, + "learning_rate": 2e-05, + "loss": 0.03268198, + "step": 6349 + }, + { + "epoch": 12.7, + "grad_norm": 1.0556561946868896, + "learning_rate": 2e-05, + "loss": 0.03014071, + "step": 6350 + }, + { + "epoch": 12.702, + "grad_norm": 1.721622347831726, + "learning_rate": 2e-05, + "loss": 0.04010597, + "step": 6351 + }, + { + "epoch": 12.704, + "grad_norm": 1.3392608165740967, + "learning_rate": 2e-05, + "loss": 0.04958211, + "step": 6352 + }, + { + "epoch": 12.706, + "grad_norm": 2.3841845989227295, + "learning_rate": 2e-05, + "loss": 0.06484979, + "step": 6353 + }, + { + "epoch": 12.708, + "grad_norm": 1.1281391382217407, + "learning_rate": 2e-05, + "loss": 0.03068016, + "step": 6354 + }, + { + "epoch": 12.71, + "grad_norm": 1.1351035833358765, + "learning_rate": 2e-05, + "loss": 0.03658585, + "step": 6355 + }, + { + "epoch": 12.712, + "grad_norm": 2.5402114391326904, + "learning_rate": 2e-05, + "loss": 0.04548267, + "step": 6356 + }, + { + "epoch": 12.714, + "grad_norm": 2.3668107986450195, + "learning_rate": 2e-05, + "loss": 0.03807458, + 
"step": 6357 + }, + { + "epoch": 12.716, + "grad_norm": 1.4179434776306152, + "learning_rate": 2e-05, + "loss": 0.03555714, + "step": 6358 + }, + { + "epoch": 12.718, + "grad_norm": 1.4804911613464355, + "learning_rate": 2e-05, + "loss": 0.04498214, + "step": 6359 + }, + { + "epoch": 12.72, + "grad_norm": 2.23551344871521, + "learning_rate": 2e-05, + "loss": 0.05175463, + "step": 6360 + }, + { + "epoch": 12.722, + "grad_norm": 1.7548456192016602, + "learning_rate": 2e-05, + "loss": 0.06676908, + "step": 6361 + }, + { + "epoch": 12.724, + "grad_norm": 1.852115511894226, + "learning_rate": 2e-05, + "loss": 0.05606053, + "step": 6362 + }, + { + "epoch": 12.725999999999999, + "grad_norm": 1.3961422443389893, + "learning_rate": 2e-05, + "loss": 0.04076095, + "step": 6363 + }, + { + "epoch": 12.728, + "grad_norm": 2.0832810401916504, + "learning_rate": 2e-05, + "loss": 0.06329206, + "step": 6364 + }, + { + "epoch": 12.73, + "grad_norm": 2.148437976837158, + "learning_rate": 2e-05, + "loss": 0.06662284, + "step": 6365 + }, + { + "epoch": 12.732, + "grad_norm": 1.9320807456970215, + "learning_rate": 2e-05, + "loss": 0.04591805, + "step": 6366 + }, + { + "epoch": 12.734, + "grad_norm": 1.0681511163711548, + "learning_rate": 2e-05, + "loss": 0.03428921, + "step": 6367 + }, + { + "epoch": 12.736, + "grad_norm": 1.1943943500518799, + "learning_rate": 2e-05, + "loss": 0.0293221, + "step": 6368 + }, + { + "epoch": 12.738, + "grad_norm": 1.1118965148925781, + "learning_rate": 2e-05, + "loss": 0.03196052, + "step": 6369 + }, + { + "epoch": 12.74, + "grad_norm": 1.5805182456970215, + "learning_rate": 2e-05, + "loss": 0.04865031, + "step": 6370 + }, + { + "epoch": 12.742, + "grad_norm": 3.262758255004883, + "learning_rate": 2e-05, + "loss": 0.05576367, + "step": 6371 + }, + { + "epoch": 12.744, + "grad_norm": 1.1225249767303467, + "learning_rate": 2e-05, + "loss": 0.03483301, + "step": 6372 + }, + { + "epoch": 12.746, + "grad_norm": 1.2867493629455566, + "learning_rate": 2e-05, + "loss": 0.04086279, + "step": 6373 + }, + { + "epoch": 12.748, + "grad_norm": 1.298018217086792, + "learning_rate": 2e-05, + "loss": 0.04258086, + "step": 6374 + }, + { + "epoch": 12.75, + "grad_norm": 1.216958999633789, + "learning_rate": 2e-05, + "loss": 0.04485769, + "step": 6375 + }, + { + "epoch": 12.752, + "grad_norm": 1.4479326009750366, + "learning_rate": 2e-05, + "loss": 0.04460638, + "step": 6376 + }, + { + "epoch": 12.754, + "grad_norm": 1.5058059692382812, + "learning_rate": 2e-05, + "loss": 0.0504184, + "step": 6377 + }, + { + "epoch": 12.756, + "grad_norm": 1.393099308013916, + "learning_rate": 2e-05, + "loss": 0.03610604, + "step": 6378 + }, + { + "epoch": 12.758, + "grad_norm": 1.2349895238876343, + "learning_rate": 2e-05, + "loss": 0.04152373, + "step": 6379 + }, + { + "epoch": 12.76, + "grad_norm": 2.2473106384277344, + "learning_rate": 2e-05, + "loss": 0.06257766, + "step": 6380 + }, + { + "epoch": 12.762, + "grad_norm": 4.311264514923096, + "learning_rate": 2e-05, + "loss": 0.04196393, + "step": 6381 + }, + { + "epoch": 12.764, + "grad_norm": 1.3708561658859253, + "learning_rate": 2e-05, + "loss": 0.03996716, + "step": 6382 + }, + { + "epoch": 12.766, + "grad_norm": 1.4658493995666504, + "learning_rate": 2e-05, + "loss": 0.03841384, + "step": 6383 + }, + { + "epoch": 12.768, + "grad_norm": 1.5609458684921265, + "learning_rate": 2e-05, + "loss": 0.04608671, + "step": 6384 + }, + { + "epoch": 12.77, + "grad_norm": 1.023798942565918, + "learning_rate": 2e-05, + "loss": 0.03047959, + "step": 6385 + }, + { + "epoch": 
12.772, + "grad_norm": 2.012105703353882, + "learning_rate": 2e-05, + "loss": 0.04641195, + "step": 6386 + }, + { + "epoch": 12.774000000000001, + "grad_norm": 1.3531718254089355, + "learning_rate": 2e-05, + "loss": 0.04071771, + "step": 6387 + }, + { + "epoch": 12.776, + "grad_norm": 1.2917602062225342, + "learning_rate": 2e-05, + "loss": 0.03840836, + "step": 6388 + }, + { + "epoch": 12.778, + "grad_norm": 1.412302851676941, + "learning_rate": 2e-05, + "loss": 0.0444256, + "step": 6389 + }, + { + "epoch": 12.78, + "grad_norm": 2.208261489868164, + "learning_rate": 2e-05, + "loss": 0.03917177, + "step": 6390 + }, + { + "epoch": 12.782, + "grad_norm": 2.6841533184051514, + "learning_rate": 2e-05, + "loss": 0.04568644, + "step": 6391 + }, + { + "epoch": 12.784, + "grad_norm": 2.0989646911621094, + "learning_rate": 2e-05, + "loss": 0.04213623, + "step": 6392 + }, + { + "epoch": 12.786, + "grad_norm": 1.1875020265579224, + "learning_rate": 2e-05, + "loss": 0.03316896, + "step": 6393 + }, + { + "epoch": 12.788, + "grad_norm": 1.4427107572555542, + "learning_rate": 2e-05, + "loss": 0.04718685, + "step": 6394 + }, + { + "epoch": 12.79, + "grad_norm": 2.2982850074768066, + "learning_rate": 2e-05, + "loss": 0.06501245, + "step": 6395 + }, + { + "epoch": 12.792, + "grad_norm": 1.4120808839797974, + "learning_rate": 2e-05, + "loss": 0.04840774, + "step": 6396 + }, + { + "epoch": 12.794, + "grad_norm": 1.1496741771697998, + "learning_rate": 2e-05, + "loss": 0.02569684, + "step": 6397 + }, + { + "epoch": 12.796, + "grad_norm": 1.4502733945846558, + "learning_rate": 2e-05, + "loss": 0.0588194, + "step": 6398 + }, + { + "epoch": 12.798, + "grad_norm": 1.6323798894882202, + "learning_rate": 2e-05, + "loss": 0.04662085, + "step": 6399 + }, + { + "epoch": 12.8, + "grad_norm": 1.459981083869934, + "learning_rate": 2e-05, + "loss": 0.0337071, + "step": 6400 + }, + { + "epoch": 12.802, + "grad_norm": 1.2611836194992065, + "learning_rate": 2e-05, + "loss": 0.03413685, + "step": 6401 + }, + { + "epoch": 12.804, + "grad_norm": 1.0556434392929077, + "learning_rate": 2e-05, + "loss": 0.03022856, + "step": 6402 + }, + { + "epoch": 12.806000000000001, + "grad_norm": 1.1885935068130493, + "learning_rate": 2e-05, + "loss": 0.03547438, + "step": 6403 + }, + { + "epoch": 12.808, + "grad_norm": 1.4913263320922852, + "learning_rate": 2e-05, + "loss": 0.04301264, + "step": 6404 + }, + { + "epoch": 12.81, + "grad_norm": 1.678415298461914, + "learning_rate": 2e-05, + "loss": 0.0440227, + "step": 6405 + }, + { + "epoch": 12.812, + "grad_norm": 2.0774550437927246, + "learning_rate": 2e-05, + "loss": 0.03051781, + "step": 6406 + }, + { + "epoch": 12.814, + "grad_norm": 1.5015497207641602, + "learning_rate": 2e-05, + "loss": 0.04394282, + "step": 6407 + }, + { + "epoch": 12.816, + "grad_norm": 1.5424638986587524, + "learning_rate": 2e-05, + "loss": 0.0474889, + "step": 6408 + }, + { + "epoch": 12.818, + "grad_norm": 1.2423845529556274, + "learning_rate": 2e-05, + "loss": 0.03328826, + "step": 6409 + }, + { + "epoch": 12.82, + "grad_norm": 1.3648911714553833, + "learning_rate": 2e-05, + "loss": 0.05593203, + "step": 6410 + }, + { + "epoch": 12.822, + "grad_norm": 1.6377047300338745, + "learning_rate": 2e-05, + "loss": 0.04586127, + "step": 6411 + }, + { + "epoch": 12.824, + "grad_norm": 1.2745712995529175, + "learning_rate": 2e-05, + "loss": 0.02756302, + "step": 6412 + }, + { + "epoch": 12.826, + "grad_norm": 1.0557851791381836, + "learning_rate": 2e-05, + "loss": 0.02719074, + "step": 6413 + }, + { + "epoch": 12.828, + 
"grad_norm": 1.3740254640579224, + "learning_rate": 2e-05, + "loss": 0.03186225, + "step": 6414 + }, + { + "epoch": 12.83, + "grad_norm": 2.600973129272461, + "learning_rate": 2e-05, + "loss": 0.04746537, + "step": 6415 + }, + { + "epoch": 12.832, + "grad_norm": 1.1148313283920288, + "learning_rate": 2e-05, + "loss": 0.02911303, + "step": 6416 + }, + { + "epoch": 12.834, + "grad_norm": 1.085671067237854, + "learning_rate": 2e-05, + "loss": 0.03521315, + "step": 6417 + }, + { + "epoch": 12.836, + "grad_norm": 2.3671929836273193, + "learning_rate": 2e-05, + "loss": 0.04562847, + "step": 6418 + }, + { + "epoch": 12.838, + "grad_norm": 2.1059203147888184, + "learning_rate": 2e-05, + "loss": 0.05967949, + "step": 6419 + }, + { + "epoch": 12.84, + "grad_norm": 2.0058083534240723, + "learning_rate": 2e-05, + "loss": 0.03334425, + "step": 6420 + }, + { + "epoch": 12.842, + "grad_norm": 1.4104883670806885, + "learning_rate": 2e-05, + "loss": 0.03960668, + "step": 6421 + }, + { + "epoch": 12.844, + "grad_norm": 1.3945311307907104, + "learning_rate": 2e-05, + "loss": 0.05327337, + "step": 6422 + }, + { + "epoch": 12.846, + "grad_norm": 1.1917275190353394, + "learning_rate": 2e-05, + "loss": 0.03146118, + "step": 6423 + }, + { + "epoch": 12.848, + "grad_norm": 1.3188499212265015, + "learning_rate": 2e-05, + "loss": 0.03480065, + "step": 6424 + }, + { + "epoch": 12.85, + "grad_norm": 1.2820628881454468, + "learning_rate": 2e-05, + "loss": 0.02856812, + "step": 6425 + }, + { + "epoch": 12.852, + "grad_norm": 1.3507776260375977, + "learning_rate": 2e-05, + "loss": 0.0419952, + "step": 6426 + }, + { + "epoch": 12.854, + "grad_norm": 1.5551594495773315, + "learning_rate": 2e-05, + "loss": 0.06390089, + "step": 6427 + }, + { + "epoch": 12.856, + "grad_norm": 2.567291259765625, + "learning_rate": 2e-05, + "loss": 0.04246731, + "step": 6428 + }, + { + "epoch": 12.858, + "grad_norm": 1.707859754562378, + "learning_rate": 2e-05, + "loss": 0.05553305, + "step": 6429 + }, + { + "epoch": 12.86, + "grad_norm": 1.1673024892807007, + "learning_rate": 2e-05, + "loss": 0.04302656, + "step": 6430 + }, + { + "epoch": 12.862, + "grad_norm": 1.1114625930786133, + "learning_rate": 2e-05, + "loss": 0.03706741, + "step": 6431 + }, + { + "epoch": 12.864, + "grad_norm": 1.044904112815857, + "learning_rate": 2e-05, + "loss": 0.03192709, + "step": 6432 + }, + { + "epoch": 12.866, + "grad_norm": 1.69417142868042, + "learning_rate": 2e-05, + "loss": 0.05506884, + "step": 6433 + }, + { + "epoch": 12.868, + "grad_norm": 1.2619975805282593, + "learning_rate": 2e-05, + "loss": 0.03792794, + "step": 6434 + }, + { + "epoch": 12.87, + "grad_norm": 2.1460132598876953, + "learning_rate": 2e-05, + "loss": 0.05722627, + "step": 6435 + }, + { + "epoch": 12.872, + "grad_norm": 1.2090710401535034, + "learning_rate": 2e-05, + "loss": 0.05044911, + "step": 6436 + }, + { + "epoch": 12.874, + "grad_norm": 1.7237330675125122, + "learning_rate": 2e-05, + "loss": 0.0452129, + "step": 6437 + }, + { + "epoch": 12.876, + "grad_norm": 1.0395435094833374, + "learning_rate": 2e-05, + "loss": 0.02874579, + "step": 6438 + }, + { + "epoch": 12.878, + "grad_norm": 1.4696030616760254, + "learning_rate": 2e-05, + "loss": 0.04806498, + "step": 6439 + }, + { + "epoch": 12.88, + "grad_norm": 1.6383066177368164, + "learning_rate": 2e-05, + "loss": 0.03882147, + "step": 6440 + }, + { + "epoch": 12.882, + "grad_norm": 1.9339408874511719, + "learning_rate": 2e-05, + "loss": 0.06159985, + "step": 6441 + }, + { + "epoch": 12.884, + "grad_norm": 3.267814874649048, + 
"learning_rate": 2e-05, + "loss": 0.05857627, + "step": 6442 + }, + { + "epoch": 12.886, + "grad_norm": 2.2561628818511963, + "learning_rate": 2e-05, + "loss": 0.05107491, + "step": 6443 + }, + { + "epoch": 12.888, + "grad_norm": 1.9300895929336548, + "learning_rate": 2e-05, + "loss": 0.05386838, + "step": 6444 + }, + { + "epoch": 12.89, + "grad_norm": 3.2582273483276367, + "learning_rate": 2e-05, + "loss": 0.04117388, + "step": 6445 + }, + { + "epoch": 12.892, + "grad_norm": 1.428175449371338, + "learning_rate": 2e-05, + "loss": 0.03108019, + "step": 6446 + }, + { + "epoch": 12.894, + "grad_norm": 2.0222373008728027, + "learning_rate": 2e-05, + "loss": 0.03365955, + "step": 6447 + }, + { + "epoch": 12.896, + "grad_norm": 2.43520188331604, + "learning_rate": 2e-05, + "loss": 0.04152467, + "step": 6448 + }, + { + "epoch": 12.898, + "grad_norm": 1.3054020404815674, + "learning_rate": 2e-05, + "loss": 0.03886139, + "step": 6449 + }, + { + "epoch": 12.9, + "grad_norm": 1.3858981132507324, + "learning_rate": 2e-05, + "loss": 0.03895619, + "step": 6450 + }, + { + "epoch": 12.902, + "grad_norm": 0.9143564105033875, + "learning_rate": 2e-05, + "loss": 0.01918302, + "step": 6451 + }, + { + "epoch": 12.904, + "grad_norm": 1.4373931884765625, + "learning_rate": 2e-05, + "loss": 0.04344804, + "step": 6452 + }, + { + "epoch": 12.906, + "grad_norm": 1.0674983263015747, + "learning_rate": 2e-05, + "loss": 0.03236082, + "step": 6453 + }, + { + "epoch": 12.908, + "grad_norm": 1.2760518789291382, + "learning_rate": 2e-05, + "loss": 0.04398597, + "step": 6454 + }, + { + "epoch": 12.91, + "grad_norm": 2.119486093521118, + "learning_rate": 2e-05, + "loss": 0.03598646, + "step": 6455 + }, + { + "epoch": 12.912, + "grad_norm": 1.0327049493789673, + "learning_rate": 2e-05, + "loss": 0.02299444, + "step": 6456 + }, + { + "epoch": 12.914, + "grad_norm": 1.3598545789718628, + "learning_rate": 2e-05, + "loss": 0.04965741, + "step": 6457 + }, + { + "epoch": 12.916, + "grad_norm": 1.109059453010559, + "learning_rate": 2e-05, + "loss": 0.0339277, + "step": 6458 + }, + { + "epoch": 12.918, + "grad_norm": 1.2943098545074463, + "learning_rate": 2e-05, + "loss": 0.03731314, + "step": 6459 + }, + { + "epoch": 12.92, + "grad_norm": 2.1976876258850098, + "learning_rate": 2e-05, + "loss": 0.0418474, + "step": 6460 + }, + { + "epoch": 12.922, + "grad_norm": 0.9695811867713928, + "learning_rate": 2e-05, + "loss": 0.02404466, + "step": 6461 + }, + { + "epoch": 12.924, + "grad_norm": 1.4028820991516113, + "learning_rate": 2e-05, + "loss": 0.04700348, + "step": 6462 + }, + { + "epoch": 12.926, + "grad_norm": 2.074523448944092, + "learning_rate": 2e-05, + "loss": 0.04677882, + "step": 6463 + }, + { + "epoch": 12.928, + "grad_norm": 0.876520574092865, + "learning_rate": 2e-05, + "loss": 0.02563859, + "step": 6464 + }, + { + "epoch": 12.93, + "grad_norm": 1.8341282606124878, + "learning_rate": 2e-05, + "loss": 0.04422235, + "step": 6465 + }, + { + "epoch": 12.932, + "grad_norm": 1.3097070455551147, + "learning_rate": 2e-05, + "loss": 0.03325322, + "step": 6466 + }, + { + "epoch": 12.934, + "grad_norm": 1.4689263105392456, + "learning_rate": 2e-05, + "loss": 0.04611597, + "step": 6467 + }, + { + "epoch": 12.936, + "grad_norm": 1.5380563735961914, + "learning_rate": 2e-05, + "loss": 0.05122569, + "step": 6468 + }, + { + "epoch": 12.938, + "grad_norm": 0.8768236637115479, + "learning_rate": 2e-05, + "loss": 0.01557497, + "step": 6469 + }, + { + "epoch": 12.94, + "grad_norm": 3.0059239864349365, + "learning_rate": 2e-05, + "loss": 
0.03941962, + "step": 6470 + }, + { + "epoch": 12.942, + "grad_norm": 4.760193347930908, + "learning_rate": 2e-05, + "loss": 0.03318344, + "step": 6471 + }, + { + "epoch": 12.943999999999999, + "grad_norm": 2.0453317165374756, + "learning_rate": 2e-05, + "loss": 0.04006668, + "step": 6472 + }, + { + "epoch": 12.946, + "grad_norm": 2.809612512588501, + "learning_rate": 2e-05, + "loss": 0.04919286, + "step": 6473 + }, + { + "epoch": 12.948, + "grad_norm": 1.04146146774292, + "learning_rate": 2e-05, + "loss": 0.0355149, + "step": 6474 + }, + { + "epoch": 12.95, + "grad_norm": 1.4496123790740967, + "learning_rate": 2e-05, + "loss": 0.04766847, + "step": 6475 + }, + { + "epoch": 12.952, + "grad_norm": 1.7231749296188354, + "learning_rate": 2e-05, + "loss": 0.03629901, + "step": 6476 + }, + { + "epoch": 12.954, + "grad_norm": 1.4813939332962036, + "learning_rate": 2e-05, + "loss": 0.05657486, + "step": 6477 + }, + { + "epoch": 12.956, + "grad_norm": 1.8923465013504028, + "learning_rate": 2e-05, + "loss": 0.03107157, + "step": 6478 + }, + { + "epoch": 12.958, + "grad_norm": 1.1124436855316162, + "learning_rate": 2e-05, + "loss": 0.03300238, + "step": 6479 + }, + { + "epoch": 12.96, + "grad_norm": 1.609631061553955, + "learning_rate": 2e-05, + "loss": 0.05744763, + "step": 6480 + }, + { + "epoch": 12.962, + "grad_norm": 1.092146396636963, + "learning_rate": 2e-05, + "loss": 0.0354538, + "step": 6481 + }, + { + "epoch": 12.964, + "grad_norm": 1.0714373588562012, + "learning_rate": 2e-05, + "loss": 0.03718103, + "step": 6482 + }, + { + "epoch": 12.966, + "grad_norm": 1.5786356925964355, + "learning_rate": 2e-05, + "loss": 0.04812677, + "step": 6483 + }, + { + "epoch": 12.968, + "grad_norm": 2.6054508686065674, + "learning_rate": 2e-05, + "loss": 0.0359851, + "step": 6484 + }, + { + "epoch": 12.97, + "grad_norm": 0.9874547123908997, + "learning_rate": 2e-05, + "loss": 0.03105699, + "step": 6485 + }, + { + "epoch": 12.972, + "grad_norm": 2.4184865951538086, + "learning_rate": 2e-05, + "loss": 0.05367502, + "step": 6486 + }, + { + "epoch": 12.974, + "grad_norm": 3.007775068283081, + "learning_rate": 2e-05, + "loss": 0.0452963, + "step": 6487 + }, + { + "epoch": 12.975999999999999, + "grad_norm": 1.4173858165740967, + "learning_rate": 2e-05, + "loss": 0.05711803, + "step": 6488 + }, + { + "epoch": 12.978, + "grad_norm": 1.6504642963409424, + "learning_rate": 2e-05, + "loss": 0.04591904, + "step": 6489 + }, + { + "epoch": 12.98, + "grad_norm": 1.0074936151504517, + "learning_rate": 2e-05, + "loss": 0.02326766, + "step": 6490 + }, + { + "epoch": 12.982, + "grad_norm": 1.0605522394180298, + "learning_rate": 2e-05, + "loss": 0.02780467, + "step": 6491 + }, + { + "epoch": 12.984, + "grad_norm": 2.823035717010498, + "learning_rate": 2e-05, + "loss": 0.0511883, + "step": 6492 + }, + { + "epoch": 12.986, + "grad_norm": 1.2774420976638794, + "learning_rate": 2e-05, + "loss": 0.0274408, + "step": 6493 + }, + { + "epoch": 12.988, + "grad_norm": 0.9702324867248535, + "learning_rate": 2e-05, + "loss": 0.02415428, + "step": 6494 + }, + { + "epoch": 12.99, + "grad_norm": 1.212349534034729, + "learning_rate": 2e-05, + "loss": 0.04195923, + "step": 6495 + }, + { + "epoch": 12.992, + "grad_norm": 1.8738391399383545, + "learning_rate": 2e-05, + "loss": 0.03428925, + "step": 6496 + }, + { + "epoch": 12.994, + "grad_norm": 1.183615803718567, + "learning_rate": 2e-05, + "loss": 0.03930584, + "step": 6497 + }, + { + "epoch": 12.996, + "grad_norm": 3.319544553756714, + "learning_rate": 2e-05, + "loss": 0.0459759, + "step": 
6498 + }, + { + "epoch": 12.998, + "grad_norm": 1.1456108093261719, + "learning_rate": 2e-05, + "loss": 0.03367121, + "step": 6499 + }, + { + "epoch": 13.0, + "grad_norm": 1.9047518968582153, + "learning_rate": 2e-05, + "loss": 0.04684749, + "step": 6500 + }, + { + "epoch": 13.0, + "eval_performance": { + "AngleClassification_1": 0.996, + "AngleClassification_2": 0.996, + "AngleClassification_3": 0.9580838323353293, + "Equal_1": 0.996, + "Equal_2": 0.9700598802395209, + "Equal_3": 0.8882235528942116, + "LineComparison_1": 0.998, + "LineComparison_2": 1.0, + "LineComparison_3": 0.9860279441117764, + "Parallel_1": 0.9939879759519038, + "Parallel_2": 0.9979959919839679, + "Parallel_3": 0.992, + "Perpendicular_1": 0.984, + "Perpendicular_2": 0.966, + "Perpendicular_3": 0.7144288577154309, + "PointLiesOnCircle_1": 0.9979959919839679, + "PointLiesOnCircle_2": 0.9916666666666667, + "PointLiesOnCircle_3": 0.9896666666666667, + "PointLiesOnLine_1": 0.9819639278557114, + "PointLiesOnLine_2": 0.9919839679358717, + "PointLiesOnLine_3": 0.9680638722554891 + }, + "eval_runtime": 319.563, + "eval_samples_per_second": 32.857, + "eval_steps_per_second": 0.657, + "step": 6500 + }, + { + "epoch": 13.002, + "grad_norm": 2.025676727294922, + "learning_rate": 2e-05, + "loss": 0.05510117, + "step": 6501 + }, + { + "epoch": 13.004, + "grad_norm": 1.67191481590271, + "learning_rate": 2e-05, + "loss": 0.05629362, + "step": 6502 + }, + { + "epoch": 13.006, + "grad_norm": 1.4045917987823486, + "learning_rate": 2e-05, + "loss": 0.0437433, + "step": 6503 + }, + { + "epoch": 13.008, + "grad_norm": 2.3563649654388428, + "learning_rate": 2e-05, + "loss": 0.04698532, + "step": 6504 + }, + { + "epoch": 13.01, + "grad_norm": 1.7476520538330078, + "learning_rate": 2e-05, + "loss": 0.05161941, + "step": 6505 + }, + { + "epoch": 13.012, + "grad_norm": 1.3390698432922363, + "learning_rate": 2e-05, + "loss": 0.03999062, + "step": 6506 + }, + { + "epoch": 13.014, + "grad_norm": 1.905305027961731, + "learning_rate": 2e-05, + "loss": 0.04302887, + "step": 6507 + }, + { + "epoch": 13.016, + "grad_norm": 2.7333922386169434, + "learning_rate": 2e-05, + "loss": 0.04309512, + "step": 6508 + }, + { + "epoch": 13.018, + "grad_norm": 1.2732406854629517, + "learning_rate": 2e-05, + "loss": 0.04579434, + "step": 6509 + }, + { + "epoch": 13.02, + "grad_norm": 2.010542631149292, + "learning_rate": 2e-05, + "loss": 0.04244982, + "step": 6510 + }, + { + "epoch": 13.022, + "grad_norm": 1.6808205842971802, + "learning_rate": 2e-05, + "loss": 0.04541232, + "step": 6511 + }, + { + "epoch": 13.024, + "grad_norm": 1.3999242782592773, + "learning_rate": 2e-05, + "loss": 0.05283143, + "step": 6512 + }, + { + "epoch": 13.026, + "grad_norm": 1.0839227437973022, + "learning_rate": 2e-05, + "loss": 0.03332079, + "step": 6513 + }, + { + "epoch": 13.028, + "grad_norm": 1.7581884860992432, + "learning_rate": 2e-05, + "loss": 0.03697137, + "step": 6514 + }, + { + "epoch": 13.03, + "grad_norm": 2.3362925052642822, + "learning_rate": 2e-05, + "loss": 0.04086022, + "step": 6515 + }, + { + "epoch": 13.032, + "grad_norm": 1.2541543245315552, + "learning_rate": 2e-05, + "loss": 0.04349049, + "step": 6516 + }, + { + "epoch": 13.034, + "grad_norm": 3.33878231048584, + "learning_rate": 2e-05, + "loss": 0.04347111, + "step": 6517 + }, + { + "epoch": 13.036, + "grad_norm": 1.553660273551941, + "learning_rate": 2e-05, + "loss": 0.03859802, + "step": 6518 + }, + { + "epoch": 13.038, + "grad_norm": 1.1606173515319824, + "learning_rate": 2e-05, + "loss": 0.04000216, + "step": 
6519 + }, + { + "epoch": 13.04, + "grad_norm": 1.0634113550186157, + "learning_rate": 2e-05, + "loss": 0.02386756, + "step": 6520 + }, + { + "epoch": 13.042, + "grad_norm": 1.356263518333435, + "learning_rate": 2e-05, + "loss": 0.03155567, + "step": 6521 + }, + { + "epoch": 13.044, + "grad_norm": 2.3392069339752197, + "learning_rate": 2e-05, + "loss": 0.06134815, + "step": 6522 + }, + { + "epoch": 13.046, + "grad_norm": 4.142246723175049, + "learning_rate": 2e-05, + "loss": 0.06078862, + "step": 6523 + }, + { + "epoch": 13.048, + "grad_norm": 2.212329387664795, + "learning_rate": 2e-05, + "loss": 0.04393127, + "step": 6524 + }, + { + "epoch": 13.05, + "grad_norm": 1.69950270652771, + "learning_rate": 2e-05, + "loss": 0.04409553, + "step": 6525 + }, + { + "epoch": 13.052, + "grad_norm": 1.850864052772522, + "learning_rate": 2e-05, + "loss": 0.03946983, + "step": 6526 + }, + { + "epoch": 13.054, + "grad_norm": 1.1603494882583618, + "learning_rate": 2e-05, + "loss": 0.02313782, + "step": 6527 + }, + { + "epoch": 13.056, + "grad_norm": 2.081894636154175, + "learning_rate": 2e-05, + "loss": 0.02774069, + "step": 6528 + }, + { + "epoch": 13.058, + "grad_norm": 1.8797224760055542, + "learning_rate": 2e-05, + "loss": 0.05306084, + "step": 6529 + }, + { + "epoch": 13.06, + "grad_norm": 1.903994083404541, + "learning_rate": 2e-05, + "loss": 0.05133477, + "step": 6530 + }, + { + "epoch": 13.062, + "grad_norm": 1.7151387929916382, + "learning_rate": 2e-05, + "loss": 0.04850292, + "step": 6531 + }, + { + "epoch": 13.064, + "grad_norm": 2.1803505420684814, + "learning_rate": 2e-05, + "loss": 0.03637154, + "step": 6532 + }, + { + "epoch": 13.066, + "grad_norm": 1.204275369644165, + "learning_rate": 2e-05, + "loss": 0.04634327, + "step": 6533 + }, + { + "epoch": 13.068, + "grad_norm": 1.7479034662246704, + "learning_rate": 2e-05, + "loss": 0.04111974, + "step": 6534 + }, + { + "epoch": 13.07, + "grad_norm": 1.4405919313430786, + "learning_rate": 2e-05, + "loss": 0.03936667, + "step": 6535 + }, + { + "epoch": 13.072, + "grad_norm": 2.053112030029297, + "learning_rate": 2e-05, + "loss": 0.05977479, + "step": 6536 + }, + { + "epoch": 13.074, + "grad_norm": 3.923091173171997, + "learning_rate": 2e-05, + "loss": 0.03060072, + "step": 6537 + }, + { + "epoch": 13.076, + "grad_norm": 1.1463172435760498, + "learning_rate": 2e-05, + "loss": 0.03833826, + "step": 6538 + }, + { + "epoch": 13.078, + "grad_norm": 1.4521420001983643, + "learning_rate": 2e-05, + "loss": 0.02565883, + "step": 6539 + }, + { + "epoch": 13.08, + "grad_norm": 1.5646151304244995, + "learning_rate": 2e-05, + "loss": 0.04049197, + "step": 6540 + }, + { + "epoch": 13.082, + "grad_norm": 1.1040815114974976, + "learning_rate": 2e-05, + "loss": 0.04397867, + "step": 6541 + }, + { + "epoch": 13.084, + "grad_norm": 0.9532331228256226, + "learning_rate": 2e-05, + "loss": 0.02361744, + "step": 6542 + }, + { + "epoch": 13.086, + "grad_norm": 3.3594508171081543, + "learning_rate": 2e-05, + "loss": 0.06016827, + "step": 6543 + }, + { + "epoch": 13.088, + "grad_norm": 1.7429322004318237, + "learning_rate": 2e-05, + "loss": 0.04198964, + "step": 6544 + }, + { + "epoch": 13.09, + "grad_norm": 1.6168110370635986, + "learning_rate": 2e-05, + "loss": 0.02668218, + "step": 6545 + }, + { + "epoch": 13.092, + "grad_norm": 1.9579505920410156, + "learning_rate": 2e-05, + "loss": 0.0469409, + "step": 6546 + }, + { + "epoch": 13.094, + "grad_norm": 1.6008516550064087, + "learning_rate": 2e-05, + "loss": 0.04403362, + "step": 6547 + }, + { + "epoch": 13.096, + 
"grad_norm": 2.0377395153045654, + "learning_rate": 2e-05, + "loss": 0.04835338, + "step": 6548 + }, + { + "epoch": 13.098, + "grad_norm": 2.0079846382141113, + "learning_rate": 2e-05, + "loss": 0.04270446, + "step": 6549 + }, + { + "epoch": 13.1, + "grad_norm": 1.2587766647338867, + "learning_rate": 2e-05, + "loss": 0.04245213, + "step": 6550 + }, + { + "epoch": 13.102, + "grad_norm": 1.3419069051742554, + "learning_rate": 2e-05, + "loss": 0.03273055, + "step": 6551 + }, + { + "epoch": 13.104, + "grad_norm": 1.0695240497589111, + "learning_rate": 2e-05, + "loss": 0.02955955, + "step": 6552 + }, + { + "epoch": 13.106, + "grad_norm": 1.2960338592529297, + "learning_rate": 2e-05, + "loss": 0.03386617, + "step": 6553 + }, + { + "epoch": 13.108, + "grad_norm": 1.3421612977981567, + "learning_rate": 2e-05, + "loss": 0.05039021, + "step": 6554 + }, + { + "epoch": 13.11, + "grad_norm": 2.230473518371582, + "learning_rate": 2e-05, + "loss": 0.05388996, + "step": 6555 + }, + { + "epoch": 13.112, + "grad_norm": 1.5305428504943848, + "learning_rate": 2e-05, + "loss": 0.03787341, + "step": 6556 + }, + { + "epoch": 13.114, + "grad_norm": 1.2190325260162354, + "learning_rate": 2e-05, + "loss": 0.04556718, + "step": 6557 + }, + { + "epoch": 13.116, + "grad_norm": 1.5937390327453613, + "learning_rate": 2e-05, + "loss": 0.04280414, + "step": 6558 + }, + { + "epoch": 13.118, + "grad_norm": 1.9277565479278564, + "learning_rate": 2e-05, + "loss": 0.04627956, + "step": 6559 + }, + { + "epoch": 13.12, + "grad_norm": 1.2560091018676758, + "learning_rate": 2e-05, + "loss": 0.04250285, + "step": 6560 + }, + { + "epoch": 13.122, + "grad_norm": 1.1613867282867432, + "learning_rate": 2e-05, + "loss": 0.0368138, + "step": 6561 + }, + { + "epoch": 13.124, + "grad_norm": 1.8732789754867554, + "learning_rate": 2e-05, + "loss": 0.04468385, + "step": 6562 + }, + { + "epoch": 13.126, + "grad_norm": 1.9750572443008423, + "learning_rate": 2e-05, + "loss": 0.03603955, + "step": 6563 + }, + { + "epoch": 13.128, + "grad_norm": 1.3057758808135986, + "learning_rate": 2e-05, + "loss": 0.03955033, + "step": 6564 + }, + { + "epoch": 13.13, + "grad_norm": 2.027482032775879, + "learning_rate": 2e-05, + "loss": 0.05527831, + "step": 6565 + }, + { + "epoch": 13.132, + "grad_norm": 1.953943133354187, + "learning_rate": 2e-05, + "loss": 0.03958747, + "step": 6566 + }, + { + "epoch": 13.134, + "grad_norm": 1.0840398073196411, + "learning_rate": 2e-05, + "loss": 0.03809543, + "step": 6567 + }, + { + "epoch": 13.136, + "grad_norm": 2.0900790691375732, + "learning_rate": 2e-05, + "loss": 0.04004861, + "step": 6568 + }, + { + "epoch": 13.138, + "grad_norm": 1.2721304893493652, + "learning_rate": 2e-05, + "loss": 0.0365794, + "step": 6569 + }, + { + "epoch": 13.14, + "grad_norm": 1.4358141422271729, + "learning_rate": 2e-05, + "loss": 0.04888467, + "step": 6570 + }, + { + "epoch": 13.142, + "grad_norm": 1.0475200414657593, + "learning_rate": 2e-05, + "loss": 0.03867028, + "step": 6571 + }, + { + "epoch": 13.144, + "grad_norm": 1.4733918905258179, + "learning_rate": 2e-05, + "loss": 0.03798759, + "step": 6572 + }, + { + "epoch": 13.146, + "grad_norm": 1.2983602285385132, + "learning_rate": 2e-05, + "loss": 0.04571147, + "step": 6573 + }, + { + "epoch": 13.148, + "grad_norm": 1.9528359174728394, + "learning_rate": 2e-05, + "loss": 0.06328858, + "step": 6574 + }, + { + "epoch": 13.15, + "grad_norm": 2.274611234664917, + "learning_rate": 2e-05, + "loss": 0.05201408, + "step": 6575 + }, + { + "epoch": 13.152, + "grad_norm": 1.6580023765563965, + 
"learning_rate": 2e-05, + "loss": 0.04882085, + "step": 6576 + }, + { + "epoch": 13.154, + "grad_norm": 1.701705813407898, + "learning_rate": 2e-05, + "loss": 0.06361747, + "step": 6577 + }, + { + "epoch": 13.156, + "grad_norm": 1.3338755369186401, + "learning_rate": 2e-05, + "loss": 0.03825006, + "step": 6578 + }, + { + "epoch": 13.158, + "grad_norm": 1.2740284204483032, + "learning_rate": 2e-05, + "loss": 0.04740117, + "step": 6579 + }, + { + "epoch": 13.16, + "grad_norm": 0.855344831943512, + "learning_rate": 2e-05, + "loss": 0.02979115, + "step": 6580 + }, + { + "epoch": 13.162, + "grad_norm": 1.0931096076965332, + "learning_rate": 2e-05, + "loss": 0.03413231, + "step": 6581 + }, + { + "epoch": 13.164, + "grad_norm": 1.7539303302764893, + "learning_rate": 2e-05, + "loss": 0.03713946, + "step": 6582 + }, + { + "epoch": 13.166, + "grad_norm": 1.2781082391738892, + "learning_rate": 2e-05, + "loss": 0.0324952, + "step": 6583 + }, + { + "epoch": 13.168, + "grad_norm": 1.100340723991394, + "learning_rate": 2e-05, + "loss": 0.04220153, + "step": 6584 + }, + { + "epoch": 13.17, + "grad_norm": 1.4666935205459595, + "learning_rate": 2e-05, + "loss": 0.04995844, + "step": 6585 + }, + { + "epoch": 13.172, + "grad_norm": 0.8306145668029785, + "learning_rate": 2e-05, + "loss": 0.0289952, + "step": 6586 + }, + { + "epoch": 13.174, + "grad_norm": 1.7973495721817017, + "learning_rate": 2e-05, + "loss": 0.03114134, + "step": 6587 + }, + { + "epoch": 13.176, + "grad_norm": 1.2895548343658447, + "learning_rate": 2e-05, + "loss": 0.04376809, + "step": 6588 + }, + { + "epoch": 13.178, + "grad_norm": 4.245728015899658, + "learning_rate": 2e-05, + "loss": 0.07614356, + "step": 6589 + }, + { + "epoch": 13.18, + "grad_norm": 1.9204843044281006, + "learning_rate": 2e-05, + "loss": 0.05193898, + "step": 6590 + }, + { + "epoch": 13.182, + "grad_norm": 1.8683027029037476, + "learning_rate": 2e-05, + "loss": 0.02822795, + "step": 6591 + }, + { + "epoch": 13.184, + "grad_norm": 0.9471633434295654, + "learning_rate": 2e-05, + "loss": 0.02922757, + "step": 6592 + }, + { + "epoch": 13.186, + "grad_norm": 1.7378665208816528, + "learning_rate": 2e-05, + "loss": 0.05698539, + "step": 6593 + }, + { + "epoch": 13.188, + "grad_norm": 1.6019706726074219, + "learning_rate": 2e-05, + "loss": 0.05355989, + "step": 6594 + }, + { + "epoch": 13.19, + "grad_norm": 1.856338381767273, + "learning_rate": 2e-05, + "loss": 0.04503391, + "step": 6595 + }, + { + "epoch": 13.192, + "grad_norm": 1.4183052778244019, + "learning_rate": 2e-05, + "loss": 0.05698609, + "step": 6596 + }, + { + "epoch": 13.194, + "grad_norm": 2.9402050971984863, + "learning_rate": 2e-05, + "loss": 0.05300699, + "step": 6597 + }, + { + "epoch": 13.196, + "grad_norm": 1.2691911458969116, + "learning_rate": 2e-05, + "loss": 0.04289351, + "step": 6598 + }, + { + "epoch": 13.198, + "grad_norm": 1.9946510791778564, + "learning_rate": 2e-05, + "loss": 0.03787369, + "step": 6599 + }, + { + "epoch": 13.2, + "grad_norm": 1.5389318466186523, + "learning_rate": 2e-05, + "loss": 0.04896133, + "step": 6600 + }, + { + "epoch": 13.202, + "grad_norm": 1.1009681224822998, + "learning_rate": 2e-05, + "loss": 0.04099545, + "step": 6601 + }, + { + "epoch": 13.204, + "grad_norm": 1.298155665397644, + "learning_rate": 2e-05, + "loss": 0.04521285, + "step": 6602 + }, + { + "epoch": 13.206, + "grad_norm": 1.194268822669983, + "learning_rate": 2e-05, + "loss": 0.02646183, + "step": 6603 + }, + { + "epoch": 13.208, + "grad_norm": 1.166735053062439, + "learning_rate": 2e-05, + "loss": 
0.03302169, + "step": 6604 + }, + { + "epoch": 13.21, + "grad_norm": 1.1501606702804565, + "learning_rate": 2e-05, + "loss": 0.03696949, + "step": 6605 + }, + { + "epoch": 13.212, + "grad_norm": 1.2692323923110962, + "learning_rate": 2e-05, + "loss": 0.04398295, + "step": 6606 + }, + { + "epoch": 13.214, + "grad_norm": 1.6035974025726318, + "learning_rate": 2e-05, + "loss": 0.03624269, + "step": 6607 + }, + { + "epoch": 13.216, + "grad_norm": 1.2560043334960938, + "learning_rate": 2e-05, + "loss": 0.03458342, + "step": 6608 + }, + { + "epoch": 13.218, + "grad_norm": 2.6903185844421387, + "learning_rate": 2e-05, + "loss": 0.03339992, + "step": 6609 + }, + { + "epoch": 13.22, + "grad_norm": 1.8403470516204834, + "learning_rate": 2e-05, + "loss": 0.03888453, + "step": 6610 + }, + { + "epoch": 13.222, + "grad_norm": 1.1549571752548218, + "learning_rate": 2e-05, + "loss": 0.03554495, + "step": 6611 + }, + { + "epoch": 13.224, + "grad_norm": 2.7406699657440186, + "learning_rate": 2e-05, + "loss": 0.04288154, + "step": 6612 + }, + { + "epoch": 13.226, + "grad_norm": 1.6124176979064941, + "learning_rate": 2e-05, + "loss": 0.04322715, + "step": 6613 + }, + { + "epoch": 13.228, + "grad_norm": 1.271803617477417, + "learning_rate": 2e-05, + "loss": 0.04564525, + "step": 6614 + }, + { + "epoch": 13.23, + "grad_norm": 2.073120594024658, + "learning_rate": 2e-05, + "loss": 0.04652428, + "step": 6615 + }, + { + "epoch": 13.232, + "grad_norm": 1.5669819116592407, + "learning_rate": 2e-05, + "loss": 0.04134148, + "step": 6616 + }, + { + "epoch": 13.234, + "grad_norm": 0.8931474685668945, + "learning_rate": 2e-05, + "loss": 0.03444675, + "step": 6617 + }, + { + "epoch": 13.236, + "grad_norm": 2.4369468688964844, + "learning_rate": 2e-05, + "loss": 0.04590879, + "step": 6618 + }, + { + "epoch": 13.238, + "grad_norm": 2.894359588623047, + "learning_rate": 2e-05, + "loss": 0.04611806, + "step": 6619 + }, + { + "epoch": 13.24, + "grad_norm": 1.5769118070602417, + "learning_rate": 2e-05, + "loss": 0.03493973, + "step": 6620 + }, + { + "epoch": 13.242, + "grad_norm": 1.3417714834213257, + "learning_rate": 2e-05, + "loss": 0.04988127, + "step": 6621 + }, + { + "epoch": 13.244, + "grad_norm": 1.8980623483657837, + "learning_rate": 2e-05, + "loss": 0.05576314, + "step": 6622 + }, + { + "epoch": 13.246, + "grad_norm": 1.0693109035491943, + "learning_rate": 2e-05, + "loss": 0.03685558, + "step": 6623 + }, + { + "epoch": 13.248, + "grad_norm": 1.5265095233917236, + "learning_rate": 2e-05, + "loss": 0.05537786, + "step": 6624 + }, + { + "epoch": 13.25, + "grad_norm": 2.378041982650757, + "learning_rate": 2e-05, + "loss": 0.03925188, + "step": 6625 + }, + { + "epoch": 13.252, + "grad_norm": 1.2043821811676025, + "learning_rate": 2e-05, + "loss": 0.030987, + "step": 6626 + }, + { + "epoch": 13.254, + "grad_norm": 1.6501712799072266, + "learning_rate": 2e-05, + "loss": 0.04117061, + "step": 6627 + }, + { + "epoch": 13.256, + "grad_norm": 2.1596996784210205, + "learning_rate": 2e-05, + "loss": 0.05108644, + "step": 6628 + }, + { + "epoch": 13.258, + "grad_norm": 2.158721446990967, + "learning_rate": 2e-05, + "loss": 0.04933317, + "step": 6629 + }, + { + "epoch": 13.26, + "grad_norm": 1.9402283430099487, + "learning_rate": 2e-05, + "loss": 0.04095349, + "step": 6630 + }, + { + "epoch": 13.262, + "grad_norm": 1.688955307006836, + "learning_rate": 2e-05, + "loss": 0.03976238, + "step": 6631 + }, + { + "epoch": 13.264, + "grad_norm": 1.8904914855957031, + "learning_rate": 2e-05, + "loss": 0.05259514, + "step": 6632 + }, + { + 
"epoch": 13.266, + "grad_norm": 1.3631291389465332, + "learning_rate": 2e-05, + "loss": 0.04219753, + "step": 6633 + }, + { + "epoch": 13.268, + "grad_norm": 1.141372799873352, + "learning_rate": 2e-05, + "loss": 0.03875969, + "step": 6634 + }, + { + "epoch": 13.27, + "grad_norm": 1.0180898904800415, + "learning_rate": 2e-05, + "loss": 0.02585149, + "step": 6635 + }, + { + "epoch": 13.272, + "grad_norm": 1.6760014295578003, + "learning_rate": 2e-05, + "loss": 0.04044281, + "step": 6636 + }, + { + "epoch": 13.274000000000001, + "grad_norm": 2.0628297328948975, + "learning_rate": 2e-05, + "loss": 0.05332665, + "step": 6637 + }, + { + "epoch": 13.276, + "grad_norm": 1.05376398563385, + "learning_rate": 2e-05, + "loss": 0.03772529, + "step": 6638 + }, + { + "epoch": 13.278, + "grad_norm": 1.2965894937515259, + "learning_rate": 2e-05, + "loss": 0.04613126, + "step": 6639 + }, + { + "epoch": 13.28, + "grad_norm": 1.0610442161560059, + "learning_rate": 2e-05, + "loss": 0.02312069, + "step": 6640 + }, + { + "epoch": 13.282, + "grad_norm": 1.1080660820007324, + "learning_rate": 2e-05, + "loss": 0.03545141, + "step": 6641 + }, + { + "epoch": 13.284, + "grad_norm": 2.201037883758545, + "learning_rate": 2e-05, + "loss": 0.03849367, + "step": 6642 + }, + { + "epoch": 13.286, + "grad_norm": 1.9222381114959717, + "learning_rate": 2e-05, + "loss": 0.0556365, + "step": 6643 + }, + { + "epoch": 13.288, + "grad_norm": 1.5982733964920044, + "learning_rate": 2e-05, + "loss": 0.03892791, + "step": 6644 + }, + { + "epoch": 13.29, + "grad_norm": 1.0715982913970947, + "learning_rate": 2e-05, + "loss": 0.02838083, + "step": 6645 + }, + { + "epoch": 13.292, + "grad_norm": 1.6198405027389526, + "learning_rate": 2e-05, + "loss": 0.05888985, + "step": 6646 + }, + { + "epoch": 13.294, + "grad_norm": 1.8655623197555542, + "learning_rate": 2e-05, + "loss": 0.0356196, + "step": 6647 + }, + { + "epoch": 13.296, + "grad_norm": 1.737970232963562, + "learning_rate": 2e-05, + "loss": 0.02180437, + "step": 6648 + }, + { + "epoch": 13.298, + "grad_norm": 0.9337108135223389, + "learning_rate": 2e-05, + "loss": 0.02707865, + "step": 6649 + }, + { + "epoch": 13.3, + "grad_norm": 0.9037566781044006, + "learning_rate": 2e-05, + "loss": 0.02494653, + "step": 6650 + }, + { + "epoch": 13.302, + "grad_norm": 0.9231191277503967, + "learning_rate": 2e-05, + "loss": 0.03142556, + "step": 6651 + }, + { + "epoch": 13.304, + "grad_norm": 3.1863584518432617, + "learning_rate": 2e-05, + "loss": 0.05755781, + "step": 6652 + }, + { + "epoch": 13.306, + "grad_norm": 0.9532139897346497, + "learning_rate": 2e-05, + "loss": 0.03102466, + "step": 6653 + }, + { + "epoch": 13.308, + "grad_norm": 1.7882435321807861, + "learning_rate": 2e-05, + "loss": 0.04434324, + "step": 6654 + }, + { + "epoch": 13.31, + "grad_norm": 1.4374666213989258, + "learning_rate": 2e-05, + "loss": 0.04442374, + "step": 6655 + }, + { + "epoch": 13.312, + "grad_norm": 1.6483453512191772, + "learning_rate": 2e-05, + "loss": 0.04483134, + "step": 6656 + }, + { + "epoch": 13.314, + "grad_norm": 1.6020697355270386, + "learning_rate": 2e-05, + "loss": 0.03660455, + "step": 6657 + }, + { + "epoch": 13.316, + "grad_norm": 2.8730368614196777, + "learning_rate": 2e-05, + "loss": 0.04435084, + "step": 6658 + }, + { + "epoch": 13.318, + "grad_norm": 1.9684791564941406, + "learning_rate": 2e-05, + "loss": 0.04246651, + "step": 6659 + }, + { + "epoch": 13.32, + "grad_norm": 1.097242832183838, + "learning_rate": 2e-05, + "loss": 0.03018874, + "step": 6660 + }, + { + "epoch": 13.322, + 
"grad_norm": 0.9127419590950012, + "learning_rate": 2e-05, + "loss": 0.03180817, + "step": 6661 + }, + { + "epoch": 13.324, + "grad_norm": 1.5429625511169434, + "learning_rate": 2e-05, + "loss": 0.05303571, + "step": 6662 + }, + { + "epoch": 13.326, + "grad_norm": 1.4380091428756714, + "learning_rate": 2e-05, + "loss": 0.05016965, + "step": 6663 + }, + { + "epoch": 13.328, + "grad_norm": 1.411498785018921, + "learning_rate": 2e-05, + "loss": 0.05673229, + "step": 6664 + }, + { + "epoch": 13.33, + "grad_norm": 1.6350773572921753, + "learning_rate": 2e-05, + "loss": 0.04110835, + "step": 6665 + }, + { + "epoch": 13.332, + "grad_norm": 1.2463691234588623, + "learning_rate": 2e-05, + "loss": 0.05402338, + "step": 6666 + }, + { + "epoch": 13.334, + "grad_norm": 2.4059319496154785, + "learning_rate": 2e-05, + "loss": 0.044544, + "step": 6667 + }, + { + "epoch": 13.336, + "grad_norm": 1.8783429861068726, + "learning_rate": 2e-05, + "loss": 0.04299849, + "step": 6668 + }, + { + "epoch": 13.338, + "grad_norm": 2.927933692932129, + "learning_rate": 2e-05, + "loss": 0.04565507, + "step": 6669 + }, + { + "epoch": 13.34, + "grad_norm": 1.7480547428131104, + "learning_rate": 2e-05, + "loss": 0.03766192, + "step": 6670 + }, + { + "epoch": 13.342, + "grad_norm": 2.074599266052246, + "learning_rate": 2e-05, + "loss": 0.04872315, + "step": 6671 + }, + { + "epoch": 13.344, + "grad_norm": 2.1086673736572266, + "learning_rate": 2e-05, + "loss": 0.06374644, + "step": 6672 + }, + { + "epoch": 13.346, + "grad_norm": 2.058213949203491, + "learning_rate": 2e-05, + "loss": 0.03495604, + "step": 6673 + }, + { + "epoch": 13.348, + "grad_norm": 1.0207751989364624, + "learning_rate": 2e-05, + "loss": 0.03142983, + "step": 6674 + }, + { + "epoch": 13.35, + "grad_norm": 2.3387722969055176, + "learning_rate": 2e-05, + "loss": 0.04296496, + "step": 6675 + }, + { + "epoch": 13.352, + "grad_norm": 1.1725760698318481, + "learning_rate": 2e-05, + "loss": 0.03329622, + "step": 6676 + }, + { + "epoch": 13.354, + "grad_norm": 1.225332498550415, + "learning_rate": 2e-05, + "loss": 0.03967672, + "step": 6677 + }, + { + "epoch": 13.356, + "grad_norm": 1.1926147937774658, + "learning_rate": 2e-05, + "loss": 0.03046719, + "step": 6678 + }, + { + "epoch": 13.358, + "grad_norm": 1.0180567502975464, + "learning_rate": 2e-05, + "loss": 0.03923386, + "step": 6679 + }, + { + "epoch": 13.36, + "grad_norm": 1.3657896518707275, + "learning_rate": 2e-05, + "loss": 0.03993597, + "step": 6680 + }, + { + "epoch": 13.362, + "grad_norm": 1.367834448814392, + "learning_rate": 2e-05, + "loss": 0.0454083, + "step": 6681 + }, + { + "epoch": 13.364, + "grad_norm": 1.1526918411254883, + "learning_rate": 2e-05, + "loss": 0.03554086, + "step": 6682 + }, + { + "epoch": 13.366, + "grad_norm": 1.4416218996047974, + "learning_rate": 2e-05, + "loss": 0.04877956, + "step": 6683 + }, + { + "epoch": 13.368, + "grad_norm": 1.735988736152649, + "learning_rate": 2e-05, + "loss": 0.0392142, + "step": 6684 + }, + { + "epoch": 13.37, + "grad_norm": 1.4720538854599, + "learning_rate": 2e-05, + "loss": 0.04314131, + "step": 6685 + }, + { + "epoch": 13.372, + "grad_norm": 1.0763517618179321, + "learning_rate": 2e-05, + "loss": 0.03109873, + "step": 6686 + }, + { + "epoch": 13.374, + "grad_norm": 1.1306136846542358, + "learning_rate": 2e-05, + "loss": 0.04201383, + "step": 6687 + }, + { + "epoch": 13.376, + "grad_norm": 1.4741770029067993, + "learning_rate": 2e-05, + "loss": 0.04738778, + "step": 6688 + }, + { + "epoch": 13.378, + "grad_norm": 1.487601399421692, + 
"learning_rate": 2e-05, + "loss": 0.03376631, + "step": 6689 + }, + { + "epoch": 13.38, + "grad_norm": 1.2999999523162842, + "learning_rate": 2e-05, + "loss": 0.04101618, + "step": 6690 + }, + { + "epoch": 13.382, + "grad_norm": 1.1709288358688354, + "learning_rate": 2e-05, + "loss": 0.03829162, + "step": 6691 + }, + { + "epoch": 13.384, + "grad_norm": 1.452234148979187, + "learning_rate": 2e-05, + "loss": 0.03547079, + "step": 6692 + }, + { + "epoch": 13.386, + "grad_norm": 0.8371902108192444, + "learning_rate": 2e-05, + "loss": 0.02435498, + "step": 6693 + }, + { + "epoch": 13.388, + "grad_norm": 1.7057963609695435, + "learning_rate": 2e-05, + "loss": 0.0380133, + "step": 6694 + }, + { + "epoch": 13.39, + "grad_norm": 1.3966196775436401, + "learning_rate": 2e-05, + "loss": 0.04224955, + "step": 6695 + }, + { + "epoch": 13.392, + "grad_norm": 1.1923191547393799, + "learning_rate": 2e-05, + "loss": 0.03705193, + "step": 6696 + }, + { + "epoch": 13.394, + "grad_norm": 0.9897952079772949, + "learning_rate": 2e-05, + "loss": 0.03031838, + "step": 6697 + }, + { + "epoch": 13.396, + "grad_norm": 1.9399019479751587, + "learning_rate": 2e-05, + "loss": 0.05891667, + "step": 6698 + }, + { + "epoch": 13.398, + "grad_norm": 1.0432417392730713, + "learning_rate": 2e-05, + "loss": 0.03055418, + "step": 6699 + }, + { + "epoch": 13.4, + "grad_norm": 0.9006956219673157, + "learning_rate": 2e-05, + "loss": 0.02497819, + "step": 6700 + }, + { + "epoch": 13.402, + "grad_norm": 1.248022198677063, + "learning_rate": 2e-05, + "loss": 0.02955998, + "step": 6701 + }, + { + "epoch": 13.404, + "grad_norm": 1.5496799945831299, + "learning_rate": 2e-05, + "loss": 0.06074677, + "step": 6702 + }, + { + "epoch": 13.406, + "grad_norm": 1.4711843729019165, + "learning_rate": 2e-05, + "loss": 0.04381853, + "step": 6703 + }, + { + "epoch": 13.408, + "grad_norm": 1.325208067893982, + "learning_rate": 2e-05, + "loss": 0.04054024, + "step": 6704 + }, + { + "epoch": 13.41, + "grad_norm": 1.5213847160339355, + "learning_rate": 2e-05, + "loss": 0.03479857, + "step": 6705 + }, + { + "epoch": 13.412, + "grad_norm": 1.5465024709701538, + "learning_rate": 2e-05, + "loss": 0.03222397, + "step": 6706 + }, + { + "epoch": 13.414, + "grad_norm": 1.4514657258987427, + "learning_rate": 2e-05, + "loss": 0.03864145, + "step": 6707 + }, + { + "epoch": 13.416, + "grad_norm": 1.5519639253616333, + "learning_rate": 2e-05, + "loss": 0.04974742, + "step": 6708 + }, + { + "epoch": 13.418, + "grad_norm": 1.543533205986023, + "learning_rate": 2e-05, + "loss": 0.04505913, + "step": 6709 + }, + { + "epoch": 13.42, + "grad_norm": 1.244042158126831, + "learning_rate": 2e-05, + "loss": 0.03692468, + "step": 6710 + }, + { + "epoch": 13.422, + "grad_norm": 1.8627500534057617, + "learning_rate": 2e-05, + "loss": 0.04253852, + "step": 6711 + }, + { + "epoch": 13.424, + "grad_norm": 1.4597536325454712, + "learning_rate": 2e-05, + "loss": 0.03926862, + "step": 6712 + }, + { + "epoch": 13.426, + "grad_norm": 1.841884732246399, + "learning_rate": 2e-05, + "loss": 0.0296302, + "step": 6713 + }, + { + "epoch": 13.428, + "grad_norm": 2.594111204147339, + "learning_rate": 2e-05, + "loss": 0.05455346, + "step": 6714 + }, + { + "epoch": 13.43, + "grad_norm": 1.466094732284546, + "learning_rate": 2e-05, + "loss": 0.03086076, + "step": 6715 + }, + { + "epoch": 13.432, + "grad_norm": 2.093371868133545, + "learning_rate": 2e-05, + "loss": 0.05538841, + "step": 6716 + }, + { + "epoch": 13.434, + "grad_norm": 1.1023695468902588, + "learning_rate": 2e-05, + "loss": 
0.04121345, + "step": 6717 + }, + { + "epoch": 13.436, + "grad_norm": 2.032099485397339, + "learning_rate": 2e-05, + "loss": 0.05646689, + "step": 6718 + }, + { + "epoch": 13.438, + "grad_norm": 1.150732398033142, + "learning_rate": 2e-05, + "loss": 0.03391449, + "step": 6719 + }, + { + "epoch": 13.44, + "grad_norm": 2.225139617919922, + "learning_rate": 2e-05, + "loss": 0.05393933, + "step": 6720 + }, + { + "epoch": 13.442, + "grad_norm": 3.746210813522339, + "learning_rate": 2e-05, + "loss": 0.04924157, + "step": 6721 + }, + { + "epoch": 13.444, + "grad_norm": 1.8030920028686523, + "learning_rate": 2e-05, + "loss": 0.04365423, + "step": 6722 + }, + { + "epoch": 13.446, + "grad_norm": 0.9142201542854309, + "learning_rate": 2e-05, + "loss": 0.02320842, + "step": 6723 + }, + { + "epoch": 13.448, + "grad_norm": 1.0032247304916382, + "learning_rate": 2e-05, + "loss": 0.03920649, + "step": 6724 + }, + { + "epoch": 13.45, + "grad_norm": 1.1446279287338257, + "learning_rate": 2e-05, + "loss": 0.0443483, + "step": 6725 + }, + { + "epoch": 13.452, + "grad_norm": 1.371544599533081, + "learning_rate": 2e-05, + "loss": 0.05964202, + "step": 6726 + }, + { + "epoch": 13.454, + "grad_norm": 1.4129619598388672, + "learning_rate": 2e-05, + "loss": 0.0317092, + "step": 6727 + }, + { + "epoch": 13.456, + "grad_norm": 2.262296438217163, + "learning_rate": 2e-05, + "loss": 0.04459076, + "step": 6728 + }, + { + "epoch": 13.458, + "grad_norm": 1.2286860942840576, + "learning_rate": 2e-05, + "loss": 0.04396465, + "step": 6729 + }, + { + "epoch": 13.46, + "grad_norm": 1.2002842426300049, + "learning_rate": 2e-05, + "loss": 0.02897931, + "step": 6730 + }, + { + "epoch": 13.462, + "grad_norm": 1.1000972986221313, + "learning_rate": 2e-05, + "loss": 0.04501866, + "step": 6731 + }, + { + "epoch": 13.464, + "grad_norm": 1.1960804462432861, + "learning_rate": 2e-05, + "loss": 0.02948781, + "step": 6732 + }, + { + "epoch": 13.466, + "grad_norm": 1.0015963315963745, + "learning_rate": 2e-05, + "loss": 0.03117497, + "step": 6733 + }, + { + "epoch": 13.468, + "grad_norm": 1.2673149108886719, + "learning_rate": 2e-05, + "loss": 0.03281594, + "step": 6734 + }, + { + "epoch": 13.47, + "grad_norm": 1.048366665840149, + "learning_rate": 2e-05, + "loss": 0.02636184, + "step": 6735 + }, + { + "epoch": 13.472, + "grad_norm": 2.1534323692321777, + "learning_rate": 2e-05, + "loss": 0.06946588, + "step": 6736 + }, + { + "epoch": 13.474, + "grad_norm": 1.2543253898620605, + "learning_rate": 2e-05, + "loss": 0.03551532, + "step": 6737 + }, + { + "epoch": 13.475999999999999, + "grad_norm": 2.5918846130371094, + "learning_rate": 2e-05, + "loss": 0.04691193, + "step": 6738 + }, + { + "epoch": 13.478, + "grad_norm": 0.7969970107078552, + "learning_rate": 2e-05, + "loss": 0.01839239, + "step": 6739 + }, + { + "epoch": 13.48, + "grad_norm": 2.0504300594329834, + "learning_rate": 2e-05, + "loss": 0.06359866, + "step": 6740 + }, + { + "epoch": 13.482, + "grad_norm": 0.8780750036239624, + "learning_rate": 2e-05, + "loss": 0.0225176, + "step": 6741 + }, + { + "epoch": 13.484, + "grad_norm": 1.1083356142044067, + "learning_rate": 2e-05, + "loss": 0.0324852, + "step": 6742 + }, + { + "epoch": 13.486, + "grad_norm": 2.280796527862549, + "learning_rate": 2e-05, + "loss": 0.0467263, + "step": 6743 + }, + { + "epoch": 13.488, + "grad_norm": 1.2050200700759888, + "learning_rate": 2e-05, + "loss": 0.02860709, + "step": 6744 + }, + { + "epoch": 13.49, + "grad_norm": 1.8593145608901978, + "learning_rate": 2e-05, + "loss": 0.03531153, + "step": 6745 + }, + 
{ + "epoch": 13.492, + "grad_norm": 1.798421025276184, + "learning_rate": 2e-05, + "loss": 0.0394549, + "step": 6746 + }, + { + "epoch": 13.494, + "grad_norm": 1.5639235973358154, + "learning_rate": 2e-05, + "loss": 0.05283806, + "step": 6747 + }, + { + "epoch": 13.496, + "grad_norm": 1.026511549949646, + "learning_rate": 2e-05, + "loss": 0.02420056, + "step": 6748 + }, + { + "epoch": 13.498, + "grad_norm": 2.5854506492614746, + "learning_rate": 2e-05, + "loss": 0.06519131, + "step": 6749 + }, + { + "epoch": 13.5, + "grad_norm": 1.8973256349563599, + "learning_rate": 2e-05, + "loss": 0.04225761, + "step": 6750 + }, + { + "epoch": 13.502, + "grad_norm": 2.533794403076172, + "learning_rate": 2e-05, + "loss": 0.02219105, + "step": 6751 + }, + { + "epoch": 13.504, + "grad_norm": 1.8227717876434326, + "learning_rate": 2e-05, + "loss": 0.04808221, + "step": 6752 + }, + { + "epoch": 13.506, + "grad_norm": 1.0413390398025513, + "learning_rate": 2e-05, + "loss": 0.03295096, + "step": 6753 + }, + { + "epoch": 13.508, + "grad_norm": 1.9891551733016968, + "learning_rate": 2e-05, + "loss": 0.03309003, + "step": 6754 + }, + { + "epoch": 13.51, + "grad_norm": 1.173137903213501, + "learning_rate": 2e-05, + "loss": 0.03191693, + "step": 6755 + }, + { + "epoch": 13.512, + "grad_norm": 1.9114857912063599, + "learning_rate": 2e-05, + "loss": 0.05413975, + "step": 6756 + }, + { + "epoch": 13.514, + "grad_norm": 1.889272689819336, + "learning_rate": 2e-05, + "loss": 0.0451472, + "step": 6757 + }, + { + "epoch": 13.516, + "grad_norm": 2.4712700843811035, + "learning_rate": 2e-05, + "loss": 0.07723338, + "step": 6758 + }, + { + "epoch": 13.518, + "grad_norm": 2.2369396686553955, + "learning_rate": 2e-05, + "loss": 0.03549297, + "step": 6759 + }, + { + "epoch": 13.52, + "grad_norm": 2.845893144607544, + "learning_rate": 2e-05, + "loss": 0.05792838, + "step": 6760 + }, + { + "epoch": 13.522, + "grad_norm": 1.513317346572876, + "learning_rate": 2e-05, + "loss": 0.04458487, + "step": 6761 + }, + { + "epoch": 13.524000000000001, + "grad_norm": 1.106540560722351, + "learning_rate": 2e-05, + "loss": 0.03790012, + "step": 6762 + }, + { + "epoch": 13.526, + "grad_norm": 1.4870750904083252, + "learning_rate": 2e-05, + "loss": 0.02564323, + "step": 6763 + }, + { + "epoch": 13.528, + "grad_norm": 1.8651138544082642, + "learning_rate": 2e-05, + "loss": 0.05272502, + "step": 6764 + }, + { + "epoch": 13.53, + "grad_norm": 1.0633503198623657, + "learning_rate": 2e-05, + "loss": 0.04007112, + "step": 6765 + }, + { + "epoch": 13.532, + "grad_norm": 1.0299681425094604, + "learning_rate": 2e-05, + "loss": 0.03321412, + "step": 6766 + }, + { + "epoch": 13.534, + "grad_norm": 2.307370185852051, + "learning_rate": 2e-05, + "loss": 0.04456577, + "step": 6767 + }, + { + "epoch": 13.536, + "grad_norm": 0.9137228727340698, + "learning_rate": 2e-05, + "loss": 0.02563262, + "step": 6768 + }, + { + "epoch": 13.538, + "grad_norm": 1.7050751447677612, + "learning_rate": 2e-05, + "loss": 0.05230839, + "step": 6769 + }, + { + "epoch": 13.54, + "grad_norm": 1.070886492729187, + "learning_rate": 2e-05, + "loss": 0.02357959, + "step": 6770 + }, + { + "epoch": 13.542, + "grad_norm": 1.695765495300293, + "learning_rate": 2e-05, + "loss": 0.04813949, + "step": 6771 + }, + { + "epoch": 13.544, + "grad_norm": 1.1727943420410156, + "learning_rate": 2e-05, + "loss": 0.04054714, + "step": 6772 + }, + { + "epoch": 13.546, + "grad_norm": 0.9893969297409058, + "learning_rate": 2e-05, + "loss": 0.02479752, + "step": 6773 + }, + { + "epoch": 13.548, + 
"grad_norm": 1.2786779403686523, + "learning_rate": 2e-05, + "loss": 0.04855724, + "step": 6774 + }, + { + "epoch": 13.55, + "grad_norm": 2.4117813110351562, + "learning_rate": 2e-05, + "loss": 0.06347778, + "step": 6775 + }, + { + "epoch": 13.552, + "grad_norm": 1.0595612525939941, + "learning_rate": 2e-05, + "loss": 0.04373883, + "step": 6776 + }, + { + "epoch": 13.554, + "grad_norm": 1.1950441598892212, + "learning_rate": 2e-05, + "loss": 0.04355627, + "step": 6777 + }, + { + "epoch": 13.556000000000001, + "grad_norm": 1.147063136100769, + "learning_rate": 2e-05, + "loss": 0.03419853, + "step": 6778 + }, + { + "epoch": 13.558, + "grad_norm": 1.4784750938415527, + "learning_rate": 2e-05, + "loss": 0.04642799, + "step": 6779 + }, + { + "epoch": 13.56, + "grad_norm": 1.2722127437591553, + "learning_rate": 2e-05, + "loss": 0.03892024, + "step": 6780 + }, + { + "epoch": 13.562, + "grad_norm": 1.6152026653289795, + "learning_rate": 2e-05, + "loss": 0.04028412, + "step": 6781 + }, + { + "epoch": 13.564, + "grad_norm": 1.1551095247268677, + "learning_rate": 2e-05, + "loss": 0.03943279, + "step": 6782 + }, + { + "epoch": 13.566, + "grad_norm": 2.295308828353882, + "learning_rate": 2e-05, + "loss": 0.054483, + "step": 6783 + }, + { + "epoch": 13.568, + "grad_norm": 1.0823659896850586, + "learning_rate": 2e-05, + "loss": 0.0385061, + "step": 6784 + }, + { + "epoch": 13.57, + "grad_norm": 1.4659216403961182, + "learning_rate": 2e-05, + "loss": 0.0353567, + "step": 6785 + }, + { + "epoch": 13.572, + "grad_norm": 1.425752878189087, + "learning_rate": 2e-05, + "loss": 0.03669987, + "step": 6786 + }, + { + "epoch": 13.574, + "grad_norm": 2.6012632846832275, + "learning_rate": 2e-05, + "loss": 0.04362625, + "step": 6787 + }, + { + "epoch": 13.576, + "grad_norm": 1.162837266921997, + "learning_rate": 2e-05, + "loss": 0.03585674, + "step": 6788 + }, + { + "epoch": 13.578, + "grad_norm": 1.107102394104004, + "learning_rate": 2e-05, + "loss": 0.03959781, + "step": 6789 + }, + { + "epoch": 13.58, + "grad_norm": 1.6900997161865234, + "learning_rate": 2e-05, + "loss": 0.06017102, + "step": 6790 + }, + { + "epoch": 13.582, + "grad_norm": 1.4423255920410156, + "learning_rate": 2e-05, + "loss": 0.04416816, + "step": 6791 + }, + { + "epoch": 13.584, + "grad_norm": 1.5928566455841064, + "learning_rate": 2e-05, + "loss": 0.03386574, + "step": 6792 + }, + { + "epoch": 13.586, + "grad_norm": 1.8104335069656372, + "learning_rate": 2e-05, + "loss": 0.0400222, + "step": 6793 + }, + { + "epoch": 13.588, + "grad_norm": 1.451529860496521, + "learning_rate": 2e-05, + "loss": 0.04577955, + "step": 6794 + }, + { + "epoch": 13.59, + "grad_norm": 1.7088810205459595, + "learning_rate": 2e-05, + "loss": 0.03847235, + "step": 6795 + }, + { + "epoch": 13.592, + "grad_norm": 1.926962971687317, + "learning_rate": 2e-05, + "loss": 0.03429279, + "step": 6796 + }, + { + "epoch": 13.594, + "grad_norm": 2.025388479232788, + "learning_rate": 2e-05, + "loss": 0.04720273, + "step": 6797 + }, + { + "epoch": 13.596, + "grad_norm": 1.016129732131958, + "learning_rate": 2e-05, + "loss": 0.03385385, + "step": 6798 + }, + { + "epoch": 13.598, + "grad_norm": 1.0439811944961548, + "learning_rate": 2e-05, + "loss": 0.02241655, + "step": 6799 + }, + { + "epoch": 13.6, + "grad_norm": 0.7543059587478638, + "learning_rate": 2e-05, + "loss": 0.01822535, + "step": 6800 + }, + { + "epoch": 13.602, + "grad_norm": 1.9524015188217163, + "learning_rate": 2e-05, + "loss": 0.04379565, + "step": 6801 + }, + { + "epoch": 13.604, + "grad_norm": 1.0359333753585815, + 
"learning_rate": 2e-05, + "loss": 0.0270812, + "step": 6802 + }, + { + "epoch": 13.606, + "grad_norm": 1.362512469291687, + "learning_rate": 2e-05, + "loss": 0.03527067, + "step": 6803 + }, + { + "epoch": 13.608, + "grad_norm": 2.679394245147705, + "learning_rate": 2e-05, + "loss": 0.06183048, + "step": 6804 + }, + { + "epoch": 13.61, + "grad_norm": 1.175911784172058, + "learning_rate": 2e-05, + "loss": 0.03599912, + "step": 6805 + }, + { + "epoch": 13.612, + "grad_norm": 2.0313289165496826, + "learning_rate": 2e-05, + "loss": 0.06214502, + "step": 6806 + }, + { + "epoch": 13.614, + "grad_norm": 1.8251034021377563, + "learning_rate": 2e-05, + "loss": 0.04615673, + "step": 6807 + }, + { + "epoch": 13.616, + "grad_norm": 1.214990258216858, + "learning_rate": 2e-05, + "loss": 0.03257399, + "step": 6808 + }, + { + "epoch": 13.618, + "grad_norm": 3.3506529331207275, + "learning_rate": 2e-05, + "loss": 0.04304178, + "step": 6809 + }, + { + "epoch": 13.62, + "grad_norm": 1.2393721342086792, + "learning_rate": 2e-05, + "loss": 0.04934491, + "step": 6810 + }, + { + "epoch": 13.622, + "grad_norm": 2.080261468887329, + "learning_rate": 2e-05, + "loss": 0.04631698, + "step": 6811 + }, + { + "epoch": 13.624, + "grad_norm": 1.2759827375411987, + "learning_rate": 2e-05, + "loss": 0.03095847, + "step": 6812 + }, + { + "epoch": 13.626, + "grad_norm": 1.7272143363952637, + "learning_rate": 2e-05, + "loss": 0.03796447, + "step": 6813 + }, + { + "epoch": 13.628, + "grad_norm": 1.4961469173431396, + "learning_rate": 2e-05, + "loss": 0.04623634, + "step": 6814 + }, + { + "epoch": 13.63, + "grad_norm": 1.548699140548706, + "learning_rate": 2e-05, + "loss": 0.0516596, + "step": 6815 + }, + { + "epoch": 13.632, + "grad_norm": 1.285744309425354, + "learning_rate": 2e-05, + "loss": 0.03057302, + "step": 6816 + }, + { + "epoch": 13.634, + "grad_norm": 1.525428295135498, + "learning_rate": 2e-05, + "loss": 0.05115666, + "step": 6817 + }, + { + "epoch": 13.636, + "grad_norm": 1.7536441087722778, + "learning_rate": 2e-05, + "loss": 0.04631253, + "step": 6818 + }, + { + "epoch": 13.638, + "grad_norm": 1.1306259632110596, + "learning_rate": 2e-05, + "loss": 0.04088272, + "step": 6819 + }, + { + "epoch": 13.64, + "grad_norm": 1.1280131340026855, + "learning_rate": 2e-05, + "loss": 0.04373218, + "step": 6820 + }, + { + "epoch": 13.642, + "grad_norm": 3.139237642288208, + "learning_rate": 2e-05, + "loss": 0.04641888, + "step": 6821 + }, + { + "epoch": 13.644, + "grad_norm": 1.6292080879211426, + "learning_rate": 2e-05, + "loss": 0.04293904, + "step": 6822 + }, + { + "epoch": 13.646, + "grad_norm": 1.7208157777786255, + "learning_rate": 2e-05, + "loss": 0.04404939, + "step": 6823 + }, + { + "epoch": 13.648, + "grad_norm": 1.2277165651321411, + "learning_rate": 2e-05, + "loss": 0.03341801, + "step": 6824 + }, + { + "epoch": 13.65, + "grad_norm": 1.2899742126464844, + "learning_rate": 2e-05, + "loss": 0.03240266, + "step": 6825 + }, + { + "epoch": 13.652, + "grad_norm": 1.4660921096801758, + "learning_rate": 2e-05, + "loss": 0.03453447, + "step": 6826 + }, + { + "epoch": 13.654, + "grad_norm": 1.2889078855514526, + "learning_rate": 2e-05, + "loss": 0.03558958, + "step": 6827 + }, + { + "epoch": 13.656, + "grad_norm": 1.38223397731781, + "learning_rate": 2e-05, + "loss": 0.05393769, + "step": 6828 + }, + { + "epoch": 13.658, + "grad_norm": 1.3276984691619873, + "learning_rate": 2e-05, + "loss": 0.03997407, + "step": 6829 + }, + { + "epoch": 13.66, + "grad_norm": 1.929145097732544, + "learning_rate": 2e-05, + "loss": 0.0430852, + 
"step": 6830 + }, + { + "epoch": 13.662, + "grad_norm": 1.0870046615600586, + "learning_rate": 2e-05, + "loss": 0.03382558, + "step": 6831 + }, + { + "epoch": 13.664, + "grad_norm": 1.1914732456207275, + "learning_rate": 2e-05, + "loss": 0.0535154, + "step": 6832 + }, + { + "epoch": 13.666, + "grad_norm": 2.2056117057800293, + "learning_rate": 2e-05, + "loss": 0.05700465, + "step": 6833 + }, + { + "epoch": 13.668, + "grad_norm": 1.1149606704711914, + "learning_rate": 2e-05, + "loss": 0.03322265, + "step": 6834 + }, + { + "epoch": 13.67, + "grad_norm": 2.3128271102905273, + "learning_rate": 2e-05, + "loss": 0.07779928, + "step": 6835 + }, + { + "epoch": 13.672, + "grad_norm": 1.0935770273208618, + "learning_rate": 2e-05, + "loss": 0.03507974, + "step": 6836 + }, + { + "epoch": 13.674, + "grad_norm": 1.0744456052780151, + "learning_rate": 2e-05, + "loss": 0.04397395, + "step": 6837 + }, + { + "epoch": 13.676, + "grad_norm": 1.3249156475067139, + "learning_rate": 2e-05, + "loss": 0.04180006, + "step": 6838 + }, + { + "epoch": 13.678, + "grad_norm": 1.1021473407745361, + "learning_rate": 2e-05, + "loss": 0.03645816, + "step": 6839 + }, + { + "epoch": 13.68, + "grad_norm": 1.929029941558838, + "learning_rate": 2e-05, + "loss": 0.02640364, + "step": 6840 + }, + { + "epoch": 13.682, + "grad_norm": 0.9546433687210083, + "learning_rate": 2e-05, + "loss": 0.02705884, + "step": 6841 + }, + { + "epoch": 13.684, + "grad_norm": 1.073384165763855, + "learning_rate": 2e-05, + "loss": 0.04102583, + "step": 6842 + }, + { + "epoch": 13.686, + "grad_norm": 1.257896900177002, + "learning_rate": 2e-05, + "loss": 0.04304846, + "step": 6843 + }, + { + "epoch": 13.688, + "grad_norm": 1.318122148513794, + "learning_rate": 2e-05, + "loss": 0.04488578, + "step": 6844 + }, + { + "epoch": 13.69, + "grad_norm": 0.9590141773223877, + "learning_rate": 2e-05, + "loss": 0.02748418, + "step": 6845 + }, + { + "epoch": 13.692, + "grad_norm": 1.7860455513000488, + "learning_rate": 2e-05, + "loss": 0.05183525, + "step": 6846 + }, + { + "epoch": 13.693999999999999, + "grad_norm": 0.9630289673805237, + "learning_rate": 2e-05, + "loss": 0.03061388, + "step": 6847 + }, + { + "epoch": 13.696, + "grad_norm": 1.2916054725646973, + "learning_rate": 2e-05, + "loss": 0.03421018, + "step": 6848 + }, + { + "epoch": 13.698, + "grad_norm": 2.0008859634399414, + "learning_rate": 2e-05, + "loss": 0.04936755, + "step": 6849 + }, + { + "epoch": 13.7, + "grad_norm": 2.6435530185699463, + "learning_rate": 2e-05, + "loss": 0.06780395, + "step": 6850 + }, + { + "epoch": 13.702, + "grad_norm": 0.9989877939224243, + "learning_rate": 2e-05, + "loss": 0.03243184, + "step": 6851 + }, + { + "epoch": 13.704, + "grad_norm": 1.6317846775054932, + "learning_rate": 2e-05, + "loss": 0.0594992, + "step": 6852 + }, + { + "epoch": 13.706, + "grad_norm": 1.2833094596862793, + "learning_rate": 2e-05, + "loss": 0.03055147, + "step": 6853 + }, + { + "epoch": 13.708, + "grad_norm": 0.9552610516548157, + "learning_rate": 2e-05, + "loss": 0.0297171, + "step": 6854 + }, + { + "epoch": 13.71, + "grad_norm": 1.4244675636291504, + "learning_rate": 2e-05, + "loss": 0.04444023, + "step": 6855 + }, + { + "epoch": 13.712, + "grad_norm": 1.2428985834121704, + "learning_rate": 2e-05, + "loss": 0.04104583, + "step": 6856 + }, + { + "epoch": 13.714, + "grad_norm": 1.4677211046218872, + "learning_rate": 2e-05, + "loss": 0.05508178, + "step": 6857 + }, + { + "epoch": 13.716, + "grad_norm": 1.8807919025421143, + "learning_rate": 2e-05, + "loss": 0.0591247, + "step": 6858 + }, + { + 
"epoch": 13.718, + "grad_norm": 1.9581363201141357, + "learning_rate": 2e-05, + "loss": 0.04779023, + "step": 6859 + }, + { + "epoch": 13.72, + "grad_norm": 2.0055696964263916, + "learning_rate": 2e-05, + "loss": 0.0408286, + "step": 6860 + }, + { + "epoch": 13.722, + "grad_norm": 1.830039143562317, + "learning_rate": 2e-05, + "loss": 0.04613936, + "step": 6861 + }, + { + "epoch": 13.724, + "grad_norm": 2.8258206844329834, + "learning_rate": 2e-05, + "loss": 0.04065733, + "step": 6862 + }, + { + "epoch": 13.725999999999999, + "grad_norm": 1.871401309967041, + "learning_rate": 2e-05, + "loss": 0.04201019, + "step": 6863 + }, + { + "epoch": 13.728, + "grad_norm": 1.449409008026123, + "learning_rate": 2e-05, + "loss": 0.03992969, + "step": 6864 + }, + { + "epoch": 13.73, + "grad_norm": 0.9711210131645203, + "learning_rate": 2e-05, + "loss": 0.0266035, + "step": 6865 + }, + { + "epoch": 13.732, + "grad_norm": 3.511932849884033, + "learning_rate": 2e-05, + "loss": 0.04639168, + "step": 6866 + }, + { + "epoch": 13.734, + "grad_norm": 1.1094722747802734, + "learning_rate": 2e-05, + "loss": 0.0350898, + "step": 6867 + }, + { + "epoch": 13.736, + "grad_norm": 1.4972585439682007, + "learning_rate": 2e-05, + "loss": 0.04061377, + "step": 6868 + }, + { + "epoch": 13.738, + "grad_norm": 1.1617933511734009, + "learning_rate": 2e-05, + "loss": 0.0347375, + "step": 6869 + }, + { + "epoch": 13.74, + "grad_norm": 1.5653934478759766, + "learning_rate": 2e-05, + "loss": 0.04277583, + "step": 6870 + }, + { + "epoch": 13.742, + "grad_norm": 1.7562743425369263, + "learning_rate": 2e-05, + "loss": 0.04624029, + "step": 6871 + }, + { + "epoch": 13.744, + "grad_norm": 1.6248326301574707, + "learning_rate": 2e-05, + "loss": 0.05260355, + "step": 6872 + }, + { + "epoch": 13.746, + "grad_norm": 1.7137155532836914, + "learning_rate": 2e-05, + "loss": 0.05001482, + "step": 6873 + }, + { + "epoch": 13.748, + "grad_norm": 1.3740524053573608, + "learning_rate": 2e-05, + "loss": 0.04304221, + "step": 6874 + }, + { + "epoch": 13.75, + "grad_norm": 1.070404291152954, + "learning_rate": 2e-05, + "loss": 0.02320381, + "step": 6875 + }, + { + "epoch": 13.752, + "grad_norm": 2.6879231929779053, + "learning_rate": 2e-05, + "loss": 0.04556799, + "step": 6876 + }, + { + "epoch": 13.754, + "grad_norm": 2.933372974395752, + "learning_rate": 2e-05, + "loss": 0.04154959, + "step": 6877 + }, + { + "epoch": 13.756, + "grad_norm": 1.3636802434921265, + "learning_rate": 2e-05, + "loss": 0.03800491, + "step": 6878 + }, + { + "epoch": 13.758, + "grad_norm": 1.157073736190796, + "learning_rate": 2e-05, + "loss": 0.03626024, + "step": 6879 + }, + { + "epoch": 13.76, + "grad_norm": 1.7253233194351196, + "learning_rate": 2e-05, + "loss": 0.04370718, + "step": 6880 + }, + { + "epoch": 13.762, + "grad_norm": 1.3973212242126465, + "learning_rate": 2e-05, + "loss": 0.03621276, + "step": 6881 + }, + { + "epoch": 13.764, + "grad_norm": 1.7019752264022827, + "learning_rate": 2e-05, + "loss": 0.03458362, + "step": 6882 + }, + { + "epoch": 13.766, + "grad_norm": 1.227394700050354, + "learning_rate": 2e-05, + "loss": 0.04752816, + "step": 6883 + }, + { + "epoch": 13.768, + "grad_norm": 1.493351697921753, + "learning_rate": 2e-05, + "loss": 0.03362212, + "step": 6884 + }, + { + "epoch": 13.77, + "grad_norm": 1.694974422454834, + "learning_rate": 2e-05, + "loss": 0.06347952, + "step": 6885 + }, + { + "epoch": 13.772, + "grad_norm": 1.4916173219680786, + "learning_rate": 2e-05, + "loss": 0.04393812, + "step": 6886 + }, + { + "epoch": 13.774000000000001, + 
"grad_norm": 1.1646591424942017, + "learning_rate": 2e-05, + "loss": 0.03405294, + "step": 6887 + }, + { + "epoch": 13.776, + "grad_norm": 1.1886578798294067, + "learning_rate": 2e-05, + "loss": 0.02821852, + "step": 6888 + }, + { + "epoch": 13.778, + "grad_norm": 1.2024027109146118, + "learning_rate": 2e-05, + "loss": 0.0455913, + "step": 6889 + }, + { + "epoch": 13.78, + "grad_norm": 1.684903621673584, + "learning_rate": 2e-05, + "loss": 0.04784643, + "step": 6890 + }, + { + "epoch": 13.782, + "grad_norm": 0.9186260104179382, + "learning_rate": 2e-05, + "loss": 0.02574374, + "step": 6891 + }, + { + "epoch": 13.784, + "grad_norm": 1.2807894945144653, + "learning_rate": 2e-05, + "loss": 0.03100124, + "step": 6892 + }, + { + "epoch": 13.786, + "grad_norm": 2.1691181659698486, + "learning_rate": 2e-05, + "loss": 0.02056332, + "step": 6893 + }, + { + "epoch": 13.788, + "grad_norm": 1.8777505159378052, + "learning_rate": 2e-05, + "loss": 0.04661623, + "step": 6894 + }, + { + "epoch": 13.79, + "grad_norm": 1.4597963094711304, + "learning_rate": 2e-05, + "loss": 0.05256153, + "step": 6895 + }, + { + "epoch": 13.792, + "grad_norm": 1.8397506475448608, + "learning_rate": 2e-05, + "loss": 0.05605321, + "step": 6896 + }, + { + "epoch": 13.794, + "grad_norm": 1.1106833219528198, + "learning_rate": 2e-05, + "loss": 0.03157957, + "step": 6897 + }, + { + "epoch": 13.796, + "grad_norm": 1.118843674659729, + "learning_rate": 2e-05, + "loss": 0.0361166, + "step": 6898 + }, + { + "epoch": 13.798, + "grad_norm": 1.4842638969421387, + "learning_rate": 2e-05, + "loss": 0.04007118, + "step": 6899 + }, + { + "epoch": 13.8, + "grad_norm": 1.4503836631774902, + "learning_rate": 2e-05, + "loss": 0.04612395, + "step": 6900 + }, + { + "epoch": 13.802, + "grad_norm": 1.2816123962402344, + "learning_rate": 2e-05, + "loss": 0.04101133, + "step": 6901 + }, + { + "epoch": 13.804, + "grad_norm": 1.273935317993164, + "learning_rate": 2e-05, + "loss": 0.03615731, + "step": 6902 + }, + { + "epoch": 13.806000000000001, + "grad_norm": 1.308467984199524, + "learning_rate": 2e-05, + "loss": 0.04421499, + "step": 6903 + }, + { + "epoch": 13.808, + "grad_norm": 1.472285509109497, + "learning_rate": 2e-05, + "loss": 0.04577631, + "step": 6904 + }, + { + "epoch": 13.81, + "grad_norm": 1.998026728630066, + "learning_rate": 2e-05, + "loss": 0.0379182, + "step": 6905 + }, + { + "epoch": 13.812, + "grad_norm": 1.1190993785858154, + "learning_rate": 2e-05, + "loss": 0.03913017, + "step": 6906 + }, + { + "epoch": 13.814, + "grad_norm": 3.0139517784118652, + "learning_rate": 2e-05, + "loss": 0.0456089, + "step": 6907 + }, + { + "epoch": 13.816, + "grad_norm": 1.1194919347763062, + "learning_rate": 2e-05, + "loss": 0.04423294, + "step": 6908 + }, + { + "epoch": 13.818, + "grad_norm": 1.2570995092391968, + "learning_rate": 2e-05, + "loss": 0.03813727, + "step": 6909 + }, + { + "epoch": 13.82, + "grad_norm": 1.317740797996521, + "learning_rate": 2e-05, + "loss": 0.04127891, + "step": 6910 + }, + { + "epoch": 13.822, + "grad_norm": 1.1890889406204224, + "learning_rate": 2e-05, + "loss": 0.0370289, + "step": 6911 + }, + { + "epoch": 13.824, + "grad_norm": 3.053663730621338, + "learning_rate": 2e-05, + "loss": 0.06167036, + "step": 6912 + }, + { + "epoch": 13.826, + "grad_norm": 1.2635729312896729, + "learning_rate": 2e-05, + "loss": 0.05142916, + "step": 6913 + }, + { + "epoch": 13.828, + "grad_norm": 2.6170880794525146, + "learning_rate": 2e-05, + "loss": 0.0427838, + "step": 6914 + }, + { + "epoch": 13.83, + "grad_norm": 1.7016270160675049, + 
"learning_rate": 2e-05, + "loss": 0.02756583, + "step": 6915 + }, + { + "epoch": 13.832, + "grad_norm": 1.1118059158325195, + "learning_rate": 2e-05, + "loss": 0.04283454, + "step": 6916 + }, + { + "epoch": 13.834, + "grad_norm": 1.7809730768203735, + "learning_rate": 2e-05, + "loss": 0.04107636, + "step": 6917 + }, + { + "epoch": 13.836, + "grad_norm": 1.776057243347168, + "learning_rate": 2e-05, + "loss": 0.06522802, + "step": 6918 + }, + { + "epoch": 13.838, + "grad_norm": 2.162116765975952, + "learning_rate": 2e-05, + "loss": 0.0393892, + "step": 6919 + }, + { + "epoch": 13.84, + "grad_norm": 1.280896544456482, + "learning_rate": 2e-05, + "loss": 0.05017384, + "step": 6920 + }, + { + "epoch": 13.842, + "grad_norm": 1.4492366313934326, + "learning_rate": 2e-05, + "loss": 0.04526861, + "step": 6921 + }, + { + "epoch": 13.844, + "grad_norm": 1.8244743347167969, + "learning_rate": 2e-05, + "loss": 0.06141206, + "step": 6922 + }, + { + "epoch": 13.846, + "grad_norm": 1.2145180702209473, + "learning_rate": 2e-05, + "loss": 0.03708153, + "step": 6923 + }, + { + "epoch": 13.848, + "grad_norm": 1.1694141626358032, + "learning_rate": 2e-05, + "loss": 0.03655352, + "step": 6924 + }, + { + "epoch": 13.85, + "grad_norm": 2.419154644012451, + "learning_rate": 2e-05, + "loss": 0.06464878, + "step": 6925 + }, + { + "epoch": 13.852, + "grad_norm": 1.4023338556289673, + "learning_rate": 2e-05, + "loss": 0.03684681, + "step": 6926 + }, + { + "epoch": 13.854, + "grad_norm": 1.3352280855178833, + "learning_rate": 2e-05, + "loss": 0.03953333, + "step": 6927 + }, + { + "epoch": 13.856, + "grad_norm": 0.8786143660545349, + "learning_rate": 2e-05, + "loss": 0.0229125, + "step": 6928 + }, + { + "epoch": 13.858, + "grad_norm": 1.3116097450256348, + "learning_rate": 2e-05, + "loss": 0.04031592, + "step": 6929 + }, + { + "epoch": 13.86, + "grad_norm": 1.9184274673461914, + "learning_rate": 2e-05, + "loss": 0.0427547, + "step": 6930 + }, + { + "epoch": 13.862, + "grad_norm": 1.721928596496582, + "learning_rate": 2e-05, + "loss": 0.06148085, + "step": 6931 + }, + { + "epoch": 13.864, + "grad_norm": 1.0398845672607422, + "learning_rate": 2e-05, + "loss": 0.0353225, + "step": 6932 + }, + { + "epoch": 13.866, + "grad_norm": 0.929514467716217, + "learning_rate": 2e-05, + "loss": 0.02477574, + "step": 6933 + }, + { + "epoch": 13.868, + "grad_norm": 1.3540958166122437, + "learning_rate": 2e-05, + "loss": 0.03898114, + "step": 6934 + }, + { + "epoch": 13.87, + "grad_norm": 1.7967790365219116, + "learning_rate": 2e-05, + "loss": 0.04478693, + "step": 6935 + }, + { + "epoch": 13.872, + "grad_norm": 1.5192323923110962, + "learning_rate": 2e-05, + "loss": 0.04696514, + "step": 6936 + }, + { + "epoch": 13.874, + "grad_norm": 1.3892743587493896, + "learning_rate": 2e-05, + "loss": 0.04166014, + "step": 6937 + }, + { + "epoch": 13.876, + "grad_norm": 1.4207967519760132, + "learning_rate": 2e-05, + "loss": 0.03712838, + "step": 6938 + }, + { + "epoch": 13.878, + "grad_norm": 1.5932114124298096, + "learning_rate": 2e-05, + "loss": 0.03432607, + "step": 6939 + }, + { + "epoch": 13.88, + "grad_norm": 1.9691510200500488, + "learning_rate": 2e-05, + "loss": 0.03547703, + "step": 6940 + }, + { + "epoch": 13.882, + "grad_norm": 1.0378472805023193, + "learning_rate": 2e-05, + "loss": 0.02458699, + "step": 6941 + }, + { + "epoch": 13.884, + "grad_norm": 2.503615140914917, + "learning_rate": 2e-05, + "loss": 0.04526497, + "step": 6942 + }, + { + "epoch": 13.886, + "grad_norm": 1.3788117170333862, + "learning_rate": 2e-05, + "loss": 
0.04195864, + "step": 6943 + }, + { + "epoch": 13.888, + "grad_norm": 1.6336097717285156, + "learning_rate": 2e-05, + "loss": 0.03892026, + "step": 6944 + }, + { + "epoch": 13.89, + "grad_norm": 1.337511420249939, + "learning_rate": 2e-05, + "loss": 0.03862485, + "step": 6945 + }, + { + "epoch": 13.892, + "grad_norm": 1.2629504203796387, + "learning_rate": 2e-05, + "loss": 0.03670635, + "step": 6946 + }, + { + "epoch": 13.894, + "grad_norm": 1.307521104812622, + "learning_rate": 2e-05, + "loss": 0.04121678, + "step": 6947 + }, + { + "epoch": 13.896, + "grad_norm": 1.2313232421875, + "learning_rate": 2e-05, + "loss": 0.02896846, + "step": 6948 + }, + { + "epoch": 13.898, + "grad_norm": 1.50730299949646, + "learning_rate": 2e-05, + "loss": 0.04983806, + "step": 6949 + }, + { + "epoch": 13.9, + "grad_norm": 1.2840025424957275, + "learning_rate": 2e-05, + "loss": 0.03766822, + "step": 6950 + }, + { + "epoch": 13.902, + "grad_norm": 1.5729337930679321, + "learning_rate": 2e-05, + "loss": 0.04883525, + "step": 6951 + }, + { + "epoch": 13.904, + "grad_norm": 1.3678689002990723, + "learning_rate": 2e-05, + "loss": 0.02985792, + "step": 6952 + }, + { + "epoch": 13.906, + "grad_norm": 1.845995545387268, + "learning_rate": 2e-05, + "loss": 0.03586386, + "step": 6953 + }, + { + "epoch": 13.908, + "grad_norm": 1.7878332138061523, + "learning_rate": 2e-05, + "loss": 0.03654428, + "step": 6954 + }, + { + "epoch": 13.91, + "grad_norm": 3.085512161254883, + "learning_rate": 2e-05, + "loss": 0.05120496, + "step": 6955 + }, + { + "epoch": 13.912, + "grad_norm": 3.0082015991210938, + "learning_rate": 2e-05, + "loss": 0.05480274, + "step": 6956 + }, + { + "epoch": 13.914, + "grad_norm": 1.2339752912521362, + "learning_rate": 2e-05, + "loss": 0.03606438, + "step": 6957 + }, + { + "epoch": 13.916, + "grad_norm": 1.9148188829421997, + "learning_rate": 2e-05, + "loss": 0.04195002, + "step": 6958 + }, + { + "epoch": 13.918, + "grad_norm": 1.1235597133636475, + "learning_rate": 2e-05, + "loss": 0.0217904, + "step": 6959 + }, + { + "epoch": 13.92, + "grad_norm": 2.466542959213257, + "learning_rate": 2e-05, + "loss": 0.06078681, + "step": 6960 + }, + { + "epoch": 13.922, + "grad_norm": 1.385965347290039, + "learning_rate": 2e-05, + "loss": 0.02699665, + "step": 6961 + }, + { + "epoch": 13.924, + "grad_norm": 1.464664340019226, + "learning_rate": 2e-05, + "loss": 0.03695335, + "step": 6962 + }, + { + "epoch": 13.926, + "grad_norm": 1.523497462272644, + "learning_rate": 2e-05, + "loss": 0.04739627, + "step": 6963 + }, + { + "epoch": 13.928, + "grad_norm": 2.7052083015441895, + "learning_rate": 2e-05, + "loss": 0.05714166, + "step": 6964 + }, + { + "epoch": 13.93, + "grad_norm": 1.4660812616348267, + "learning_rate": 2e-05, + "loss": 0.03287406, + "step": 6965 + }, + { + "epoch": 13.932, + "grad_norm": 1.166961669921875, + "learning_rate": 2e-05, + "loss": 0.03073209, + "step": 6966 + }, + { + "epoch": 13.934, + "grad_norm": 2.519378423690796, + "learning_rate": 2e-05, + "loss": 0.06768637, + "step": 6967 + }, + { + "epoch": 13.936, + "grad_norm": 1.424590826034546, + "learning_rate": 2e-05, + "loss": 0.03274088, + "step": 6968 + }, + { + "epoch": 13.938, + "grad_norm": 1.7873649597167969, + "learning_rate": 2e-05, + "loss": 0.0431348, + "step": 6969 + }, + { + "epoch": 13.94, + "grad_norm": 1.6397364139556885, + "learning_rate": 2e-05, + "loss": 0.05216152, + "step": 6970 + }, + { + "epoch": 13.942, + "grad_norm": 1.9869099855422974, + "learning_rate": 2e-05, + "loss": 0.042454, + "step": 6971 + }, + { + "epoch": 
13.943999999999999, + "grad_norm": 1.582697868347168, + "learning_rate": 2e-05, + "loss": 0.04447088, + "step": 6972 + }, + { + "epoch": 13.946, + "grad_norm": 1.1062511205673218, + "learning_rate": 2e-05, + "loss": 0.03642577, + "step": 6973 + }, + { + "epoch": 13.948, + "grad_norm": 1.2900327444076538, + "learning_rate": 2e-05, + "loss": 0.04951657, + "step": 6974 + }, + { + "epoch": 13.95, + "grad_norm": 1.1730434894561768, + "learning_rate": 2e-05, + "loss": 0.03196269, + "step": 6975 + }, + { + "epoch": 13.952, + "grad_norm": 1.5335745811462402, + "learning_rate": 2e-05, + "loss": 0.05588776, + "step": 6976 + }, + { + "epoch": 13.954, + "grad_norm": 2.73171329498291, + "learning_rate": 2e-05, + "loss": 0.0453019, + "step": 6977 + }, + { + "epoch": 13.956, + "grad_norm": 1.3756558895111084, + "learning_rate": 2e-05, + "loss": 0.04042321, + "step": 6978 + }, + { + "epoch": 13.958, + "grad_norm": 1.025495171546936, + "learning_rate": 2e-05, + "loss": 0.03254474, + "step": 6979 + }, + { + "epoch": 13.96, + "grad_norm": 0.8832976818084717, + "learning_rate": 2e-05, + "loss": 0.0262373, + "step": 6980 + }, + { + "epoch": 13.962, + "grad_norm": 1.3270471096038818, + "learning_rate": 2e-05, + "loss": 0.03066928, + "step": 6981 + }, + { + "epoch": 13.964, + "grad_norm": 1.8798532485961914, + "learning_rate": 2e-05, + "loss": 0.03866819, + "step": 6982 + }, + { + "epoch": 13.966, + "grad_norm": 1.69099760055542, + "learning_rate": 2e-05, + "loss": 0.05059724, + "step": 6983 + }, + { + "epoch": 13.968, + "grad_norm": 1.9903326034545898, + "learning_rate": 2e-05, + "loss": 0.05271498, + "step": 6984 + }, + { + "epoch": 13.97, + "grad_norm": 1.4454317092895508, + "learning_rate": 2e-05, + "loss": 0.03909637, + "step": 6985 + }, + { + "epoch": 13.972, + "grad_norm": 1.4461326599121094, + "learning_rate": 2e-05, + "loss": 0.04804809, + "step": 6986 + }, + { + "epoch": 13.974, + "grad_norm": 1.59605872631073, + "learning_rate": 2e-05, + "loss": 0.04185, + "step": 6987 + }, + { + "epoch": 13.975999999999999, + "grad_norm": 1.0969065427780151, + "learning_rate": 2e-05, + "loss": 0.03130201, + "step": 6988 + }, + { + "epoch": 13.978, + "grad_norm": 1.317733645439148, + "learning_rate": 2e-05, + "loss": 0.03786597, + "step": 6989 + }, + { + "epoch": 13.98, + "grad_norm": 1.1903131008148193, + "learning_rate": 2e-05, + "loss": 0.03440776, + "step": 6990 + }, + { + "epoch": 13.982, + "grad_norm": 1.5716062784194946, + "learning_rate": 2e-05, + "loss": 0.03851765, + "step": 6991 + }, + { + "epoch": 13.984, + "grad_norm": 1.9664435386657715, + "learning_rate": 2e-05, + "loss": 0.05616831, + "step": 6992 + }, + { + "epoch": 13.986, + "grad_norm": 2.0219123363494873, + "learning_rate": 2e-05, + "loss": 0.04097972, + "step": 6993 + }, + { + "epoch": 13.988, + "grad_norm": 1.4964755773544312, + "learning_rate": 2e-05, + "loss": 0.05085189, + "step": 6994 + }, + { + "epoch": 13.99, + "grad_norm": 1.520508885383606, + "learning_rate": 2e-05, + "loss": 0.03831444, + "step": 6995 + }, + { + "epoch": 13.992, + "grad_norm": 1.3518579006195068, + "learning_rate": 2e-05, + "loss": 0.0365081, + "step": 6996 + }, + { + "epoch": 13.994, + "grad_norm": 2.745910882949829, + "learning_rate": 2e-05, + "loss": 0.03853264, + "step": 6997 + }, + { + "epoch": 13.996, + "grad_norm": 1.2294520139694214, + "learning_rate": 2e-05, + "loss": 0.0548241, + "step": 6998 + }, + { + "epoch": 13.998, + "grad_norm": 1.276281714439392, + "learning_rate": 2e-05, + "loss": 0.04351184, + "step": 6999 + }, + { + "epoch": 14.0, + "grad_norm": 
1.2956624031066895, + "learning_rate": 2e-05, + "loss": 0.04201851, + "step": 7000 + }, + { + "epoch": 14.0, + "eval_performance": { + "AngleClassification_1": 0.99, + "AngleClassification_2": 0.998, + "AngleClassification_3": 0.9600798403193613, + "Equal_1": 0.996, + "Equal_2": 0.9720558882235529, + "Equal_3": 0.9001996007984032, + "LineComparison_1": 1.0, + "LineComparison_2": 1.0, + "LineComparison_3": 0.9900199600798403, + "Parallel_1": 0.9939879759519038, + "Parallel_2": 0.9939879759519038, + "Parallel_3": 0.994, + "Perpendicular_1": 0.994, + "Perpendicular_2": 0.976, + "Perpendicular_3": 0.7324649298597194, + "PointLiesOnCircle_1": 1.0, + "PointLiesOnCircle_2": 0.9976666666666667, + "PointLiesOnCircle_3": 0.9916, + "PointLiesOnLine_1": 0.9979959919839679, + "PointLiesOnLine_2": 0.9859719438877755, + "PointLiesOnLine_3": 0.9720558882235529 + }, + "eval_runtime": 318.9857, + "eval_samples_per_second": 32.917, + "eval_steps_per_second": 0.658, + "step": 7000 + }, + { + "epoch": 14.002, + "grad_norm": 1.2843459844589233, + "learning_rate": 2e-05, + "loss": 0.03288523, + "step": 7001 + }, + { + "epoch": 14.004, + "grad_norm": 1.4108099937438965, + "learning_rate": 2e-05, + "loss": 0.04203776, + "step": 7002 + }, + { + "epoch": 14.006, + "grad_norm": 1.3905504941940308, + "learning_rate": 2e-05, + "loss": 0.03418431, + "step": 7003 + }, + { + "epoch": 14.008, + "grad_norm": 1.3126283884048462, + "learning_rate": 2e-05, + "loss": 0.03478189, + "step": 7004 + }, + { + "epoch": 14.01, + "grad_norm": 1.513098120689392, + "learning_rate": 2e-05, + "loss": 0.02606986, + "step": 7005 + }, + { + "epoch": 14.012, + "grad_norm": 0.7904066443443298, + "learning_rate": 2e-05, + "loss": 0.01787382, + "step": 7006 + }, + { + "epoch": 14.014, + "grad_norm": 3.054617166519165, + "learning_rate": 2e-05, + "loss": 0.04634954, + "step": 7007 + }, + { + "epoch": 14.016, + "grad_norm": 1.0060993432998657, + "learning_rate": 2e-05, + "loss": 0.03051211, + "step": 7008 + }, + { + "epoch": 14.018, + "grad_norm": 1.1262900829315186, + "learning_rate": 2e-05, + "loss": 0.04555785, + "step": 7009 + }, + { + "epoch": 14.02, + "grad_norm": 1.0693005323410034, + "learning_rate": 2e-05, + "loss": 0.02980948, + "step": 7010 + }, + { + "epoch": 14.022, + "grad_norm": 2.0782830715179443, + "learning_rate": 2e-05, + "loss": 0.04862558, + "step": 7011 + }, + { + "epoch": 14.024, + "grad_norm": 1.079984188079834, + "learning_rate": 2e-05, + "loss": 0.03432882, + "step": 7012 + }, + { + "epoch": 14.026, + "grad_norm": 1.2360132932662964, + "learning_rate": 2e-05, + "loss": 0.02996394, + "step": 7013 + }, + { + "epoch": 14.028, + "grad_norm": 1.3257410526275635, + "learning_rate": 2e-05, + "loss": 0.04203, + "step": 7014 + }, + { + "epoch": 14.03, + "grad_norm": 1.318507194519043, + "learning_rate": 2e-05, + "loss": 0.04230007, + "step": 7015 + }, + { + "epoch": 14.032, + "grad_norm": 1.3276084661483765, + "learning_rate": 2e-05, + "loss": 0.03894487, + "step": 7016 + }, + { + "epoch": 14.034, + "grad_norm": 3.99560809135437, + "learning_rate": 2e-05, + "loss": 0.04019061, + "step": 7017 + }, + { + "epoch": 14.036, + "grad_norm": 0.97989422082901, + "learning_rate": 2e-05, + "loss": 0.03579887, + "step": 7018 + }, + { + "epoch": 14.038, + "grad_norm": 3.5206971168518066, + "learning_rate": 2e-05, + "loss": 0.04952036, + "step": 7019 + }, + { + "epoch": 14.04, + "grad_norm": 1.5767422914505005, + "learning_rate": 2e-05, + "loss": 0.04102598, + "step": 7020 + }, + { + "epoch": 14.042, + "grad_norm": 2.001638412475586, + 
"learning_rate": 2e-05, + "loss": 0.04028576, + "step": 7021 + }, + { + "epoch": 14.044, + "grad_norm": 1.0773245096206665, + "learning_rate": 2e-05, + "loss": 0.04221611, + "step": 7022 + }, + { + "epoch": 14.046, + "grad_norm": 1.02744460105896, + "learning_rate": 2e-05, + "loss": 0.02617998, + "step": 7023 + }, + { + "epoch": 14.048, + "grad_norm": 1.0974265336990356, + "learning_rate": 2e-05, + "loss": 0.02758575, + "step": 7024 + }, + { + "epoch": 14.05, + "grad_norm": 1.1479986906051636, + "learning_rate": 2e-05, + "loss": 0.03531769, + "step": 7025 + }, + { + "epoch": 14.052, + "grad_norm": 1.6043879985809326, + "learning_rate": 2e-05, + "loss": 0.04604035, + "step": 7026 + }, + { + "epoch": 14.054, + "grad_norm": 1.2420005798339844, + "learning_rate": 2e-05, + "loss": 0.04582789, + "step": 7027 + }, + { + "epoch": 14.056, + "grad_norm": 2.2049918174743652, + "learning_rate": 2e-05, + "loss": 0.05222199, + "step": 7028 + }, + { + "epoch": 14.058, + "grad_norm": 2.92496395111084, + "learning_rate": 2e-05, + "loss": 0.04773784, + "step": 7029 + }, + { + "epoch": 14.06, + "grad_norm": 1.3038673400878906, + "learning_rate": 2e-05, + "loss": 0.0424346, + "step": 7030 + }, + { + "epoch": 14.062, + "grad_norm": 1.3948389291763306, + "learning_rate": 2e-05, + "loss": 0.0277782, + "step": 7031 + }, + { + "epoch": 14.064, + "grad_norm": 1.42351233959198, + "learning_rate": 2e-05, + "loss": 0.04358821, + "step": 7032 + }, + { + "epoch": 14.066, + "grad_norm": 1.726454257965088, + "learning_rate": 2e-05, + "loss": 0.050067, + "step": 7033 + }, + { + "epoch": 14.068, + "grad_norm": 1.3726768493652344, + "learning_rate": 2e-05, + "loss": 0.04407897, + "step": 7034 + }, + { + "epoch": 14.07, + "grad_norm": 1.9279018640518188, + "learning_rate": 2e-05, + "loss": 0.0469468, + "step": 7035 + }, + { + "epoch": 14.072, + "grad_norm": 1.3610163927078247, + "learning_rate": 2e-05, + "loss": 0.0371495, + "step": 7036 + }, + { + "epoch": 14.074, + "grad_norm": 2.023139238357544, + "learning_rate": 2e-05, + "loss": 0.05333347, + "step": 7037 + }, + { + "epoch": 14.076, + "grad_norm": 1.167094349861145, + "learning_rate": 2e-05, + "loss": 0.03193357, + "step": 7038 + }, + { + "epoch": 14.078, + "grad_norm": 1.3578746318817139, + "learning_rate": 2e-05, + "loss": 0.04010125, + "step": 7039 + }, + { + "epoch": 14.08, + "grad_norm": 2.4089791774749756, + "learning_rate": 2e-05, + "loss": 0.05072846, + "step": 7040 + }, + { + "epoch": 14.082, + "grad_norm": 1.022179365158081, + "learning_rate": 2e-05, + "loss": 0.03089194, + "step": 7041 + }, + { + "epoch": 14.084, + "grad_norm": 1.3668341636657715, + "learning_rate": 2e-05, + "loss": 0.03569974, + "step": 7042 + }, + { + "epoch": 14.086, + "grad_norm": 1.1925535202026367, + "learning_rate": 2e-05, + "loss": 0.03366784, + "step": 7043 + }, + { + "epoch": 14.088, + "grad_norm": 1.3144326210021973, + "learning_rate": 2e-05, + "loss": 0.03443348, + "step": 7044 + }, + { + "epoch": 14.09, + "grad_norm": 1.794022560119629, + "learning_rate": 2e-05, + "loss": 0.04681649, + "step": 7045 + }, + { + "epoch": 14.092, + "grad_norm": 1.2918729782104492, + "learning_rate": 2e-05, + "loss": 0.04749175, + "step": 7046 + }, + { + "epoch": 14.094, + "grad_norm": 1.18765389919281, + "learning_rate": 2e-05, + "loss": 0.03726712, + "step": 7047 + }, + { + "epoch": 14.096, + "grad_norm": 1.0554254055023193, + "learning_rate": 2e-05, + "loss": 0.03222162, + "step": 7048 + }, + { + "epoch": 14.098, + "grad_norm": 0.9862664937973022, + "learning_rate": 2e-05, + "loss": 0.02374917, + 
"step": 7049 + }, + { + "epoch": 14.1, + "grad_norm": 1.021124005317688, + "learning_rate": 2e-05, + "loss": 0.02776304, + "step": 7050 + }, + { + "epoch": 14.102, + "grad_norm": 1.7811987400054932, + "learning_rate": 2e-05, + "loss": 0.04543906, + "step": 7051 + }, + { + "epoch": 14.104, + "grad_norm": 1.463122844696045, + "learning_rate": 2e-05, + "loss": 0.02648041, + "step": 7052 + }, + { + "epoch": 14.106, + "grad_norm": 0.7668428421020508, + "learning_rate": 2e-05, + "loss": 0.02454238, + "step": 7053 + }, + { + "epoch": 14.108, + "grad_norm": 1.272748589515686, + "learning_rate": 2e-05, + "loss": 0.0323597, + "step": 7054 + }, + { + "epoch": 14.11, + "grad_norm": 1.0245872735977173, + "learning_rate": 2e-05, + "loss": 0.03127266, + "step": 7055 + }, + { + "epoch": 14.112, + "grad_norm": 1.9159295558929443, + "learning_rate": 2e-05, + "loss": 0.04461393, + "step": 7056 + }, + { + "epoch": 14.114, + "grad_norm": 1.5680136680603027, + "learning_rate": 2e-05, + "loss": 0.04500045, + "step": 7057 + }, + { + "epoch": 14.116, + "grad_norm": 1.5329288244247437, + "learning_rate": 2e-05, + "loss": 0.03792113, + "step": 7058 + }, + { + "epoch": 14.118, + "grad_norm": 1.3381918668746948, + "learning_rate": 2e-05, + "loss": 0.04293994, + "step": 7059 + }, + { + "epoch": 14.12, + "grad_norm": 1.2890726327896118, + "learning_rate": 2e-05, + "loss": 0.04811954, + "step": 7060 + }, + { + "epoch": 14.122, + "grad_norm": 1.3316044807434082, + "learning_rate": 2e-05, + "loss": 0.04357966, + "step": 7061 + }, + { + "epoch": 14.124, + "grad_norm": 1.3575929403305054, + "learning_rate": 2e-05, + "loss": 0.03795036, + "step": 7062 + }, + { + "epoch": 14.126, + "grad_norm": 1.5020532608032227, + "learning_rate": 2e-05, + "loss": 0.04436047, + "step": 7063 + }, + { + "epoch": 14.128, + "grad_norm": 1.5422673225402832, + "learning_rate": 2e-05, + "loss": 0.04654428, + "step": 7064 + }, + { + "epoch": 14.13, + "grad_norm": 2.029569149017334, + "learning_rate": 2e-05, + "loss": 0.04016326, + "step": 7065 + }, + { + "epoch": 14.132, + "grad_norm": 1.814108967781067, + "learning_rate": 2e-05, + "loss": 0.04536037, + "step": 7066 + }, + { + "epoch": 14.134, + "grad_norm": 1.3232728242874146, + "learning_rate": 2e-05, + "loss": 0.02865727, + "step": 7067 + }, + { + "epoch": 14.136, + "grad_norm": 3.152207136154175, + "learning_rate": 2e-05, + "loss": 0.04350835, + "step": 7068 + }, + { + "epoch": 14.138, + "grad_norm": 1.4238955974578857, + "learning_rate": 2e-05, + "loss": 0.03182974, + "step": 7069 + }, + { + "epoch": 14.14, + "grad_norm": 1.6935443878173828, + "learning_rate": 2e-05, + "loss": 0.05719973, + "step": 7070 + }, + { + "epoch": 14.142, + "grad_norm": 1.262613296508789, + "learning_rate": 2e-05, + "loss": 0.04014334, + "step": 7071 + }, + { + "epoch": 14.144, + "grad_norm": 1.5854763984680176, + "learning_rate": 2e-05, + "loss": 0.05335325, + "step": 7072 + }, + { + "epoch": 14.146, + "grad_norm": 0.964347779750824, + "learning_rate": 2e-05, + "loss": 0.02348195, + "step": 7073 + }, + { + "epoch": 14.148, + "grad_norm": 1.284382700920105, + "learning_rate": 2e-05, + "loss": 0.04065283, + "step": 7074 + }, + { + "epoch": 14.15, + "grad_norm": 1.5653077363967896, + "learning_rate": 2e-05, + "loss": 0.06096688, + "step": 7075 + }, + { + "epoch": 14.152, + "grad_norm": 1.2976857423782349, + "learning_rate": 2e-05, + "loss": 0.04557998, + "step": 7076 + }, + { + "epoch": 14.154, + "grad_norm": 1.6498374938964844, + "learning_rate": 2e-05, + "loss": 0.04869898, + "step": 7077 + }, + { + "epoch": 14.156, + 
"grad_norm": 1.2892625331878662, + "learning_rate": 2e-05, + "loss": 0.03722683, + "step": 7078 + }, + { + "epoch": 14.158, + "grad_norm": 1.2497565746307373, + "learning_rate": 2e-05, + "loss": 0.04251852, + "step": 7079 + }, + { + "epoch": 14.16, + "grad_norm": 1.8106011152267456, + "learning_rate": 2e-05, + "loss": 0.02925546, + "step": 7080 + }, + { + "epoch": 14.162, + "grad_norm": 1.6762701272964478, + "learning_rate": 2e-05, + "loss": 0.06018007, + "step": 7081 + }, + { + "epoch": 14.164, + "grad_norm": 1.2910103797912598, + "learning_rate": 2e-05, + "loss": 0.04167475, + "step": 7082 + }, + { + "epoch": 14.166, + "grad_norm": 1.5248066186904907, + "learning_rate": 2e-05, + "loss": 0.04084751, + "step": 7083 + }, + { + "epoch": 14.168, + "grad_norm": 1.5304296016693115, + "learning_rate": 2e-05, + "loss": 0.04385792, + "step": 7084 + }, + { + "epoch": 14.17, + "grad_norm": 1.8973009586334229, + "learning_rate": 2e-05, + "loss": 0.04529943, + "step": 7085 + }, + { + "epoch": 14.172, + "grad_norm": 1.4018219709396362, + "learning_rate": 2e-05, + "loss": 0.03316261, + "step": 7086 + }, + { + "epoch": 14.174, + "grad_norm": 2.388532876968384, + "learning_rate": 2e-05, + "loss": 0.04198411, + "step": 7087 + }, + { + "epoch": 14.176, + "grad_norm": 1.7253050804138184, + "learning_rate": 2e-05, + "loss": 0.03549905, + "step": 7088 + }, + { + "epoch": 14.178, + "grad_norm": 1.5690933465957642, + "learning_rate": 2e-05, + "loss": 0.043695, + "step": 7089 + }, + { + "epoch": 14.18, + "grad_norm": 1.1658984422683716, + "learning_rate": 2e-05, + "loss": 0.03999247, + "step": 7090 + }, + { + "epoch": 14.182, + "grad_norm": 1.1882442235946655, + "learning_rate": 2e-05, + "loss": 0.04268787, + "step": 7091 + }, + { + "epoch": 14.184, + "grad_norm": 1.894171118736267, + "learning_rate": 2e-05, + "loss": 0.05581329, + "step": 7092 + }, + { + "epoch": 14.186, + "grad_norm": 1.7583495378494263, + "learning_rate": 2e-05, + "loss": 0.05542745, + "step": 7093 + }, + { + "epoch": 14.188, + "grad_norm": 1.0371955633163452, + "learning_rate": 2e-05, + "loss": 0.03195368, + "step": 7094 + }, + { + "epoch": 14.19, + "grad_norm": 1.2478240728378296, + "learning_rate": 2e-05, + "loss": 0.0411337, + "step": 7095 + }, + { + "epoch": 14.192, + "grad_norm": 1.3688163757324219, + "learning_rate": 2e-05, + "loss": 0.04424831, + "step": 7096 + }, + { + "epoch": 14.194, + "grad_norm": 1.3419855833053589, + "learning_rate": 2e-05, + "loss": 0.03607039, + "step": 7097 + }, + { + "epoch": 14.196, + "grad_norm": 1.9281373023986816, + "learning_rate": 2e-05, + "loss": 0.05275191, + "step": 7098 + }, + { + "epoch": 14.198, + "grad_norm": 1.3586539030075073, + "learning_rate": 2e-05, + "loss": 0.03638598, + "step": 7099 + }, + { + "epoch": 14.2, + "grad_norm": 1.104002833366394, + "learning_rate": 2e-05, + "loss": 0.03526532, + "step": 7100 + }, + { + "epoch": 14.202, + "grad_norm": 1.1839067935943604, + "learning_rate": 2e-05, + "loss": 0.03894392, + "step": 7101 + }, + { + "epoch": 14.204, + "grad_norm": 1.7897748947143555, + "learning_rate": 2e-05, + "loss": 0.0426039, + "step": 7102 + }, + { + "epoch": 14.206, + "grad_norm": 0.8262537717819214, + "learning_rate": 2e-05, + "loss": 0.02584026, + "step": 7103 + }, + { + "epoch": 14.208, + "grad_norm": 1.3617100715637207, + "learning_rate": 2e-05, + "loss": 0.03045266, + "step": 7104 + }, + { + "epoch": 14.21, + "grad_norm": 1.8911073207855225, + "learning_rate": 2e-05, + "loss": 0.04614431, + "step": 7105 + }, + { + "epoch": 14.212, + "grad_norm": 2.970533847808838, + 
"learning_rate": 2e-05, + "loss": 0.05797109, + "step": 7106 + }, + { + "epoch": 14.214, + "grad_norm": 1.425166368484497, + "learning_rate": 2e-05, + "loss": 0.03592297, + "step": 7107 + }, + { + "epoch": 14.216, + "grad_norm": 1.0085773468017578, + "learning_rate": 2e-05, + "loss": 0.03424015, + "step": 7108 + }, + { + "epoch": 14.218, + "grad_norm": 1.4457972049713135, + "learning_rate": 2e-05, + "loss": 0.0365461, + "step": 7109 + }, + { + "epoch": 14.22, + "grad_norm": 1.1994272470474243, + "learning_rate": 2e-05, + "loss": 0.02875265, + "step": 7110 + }, + { + "epoch": 14.222, + "grad_norm": 0.986005425453186, + "learning_rate": 2e-05, + "loss": 0.03896586, + "step": 7111 + }, + { + "epoch": 14.224, + "grad_norm": 1.2924270629882812, + "learning_rate": 2e-05, + "loss": 0.03565193, + "step": 7112 + }, + { + "epoch": 14.226, + "grad_norm": 1.6049319505691528, + "learning_rate": 2e-05, + "loss": 0.0383412, + "step": 7113 + }, + { + "epoch": 14.228, + "grad_norm": 1.1894758939743042, + "learning_rate": 2e-05, + "loss": 0.03838333, + "step": 7114 + }, + { + "epoch": 14.23, + "grad_norm": 2.1524245738983154, + "learning_rate": 2e-05, + "loss": 0.05140578, + "step": 7115 + }, + { + "epoch": 14.232, + "grad_norm": 1.6528891324996948, + "learning_rate": 2e-05, + "loss": 0.05099215, + "step": 7116 + }, + { + "epoch": 14.234, + "grad_norm": 1.2502809762954712, + "learning_rate": 2e-05, + "loss": 0.04991131, + "step": 7117 + }, + { + "epoch": 14.236, + "grad_norm": 2.5262179374694824, + "learning_rate": 2e-05, + "loss": 0.03753746, + "step": 7118 + }, + { + "epoch": 14.238, + "grad_norm": 1.5351922512054443, + "learning_rate": 2e-05, + "loss": 0.02967945, + "step": 7119 + }, + { + "epoch": 14.24, + "grad_norm": 0.9251724481582642, + "learning_rate": 2e-05, + "loss": 0.02450234, + "step": 7120 + }, + { + "epoch": 14.242, + "grad_norm": 1.6973142623901367, + "learning_rate": 2e-05, + "loss": 0.0343626, + "step": 7121 + }, + { + "epoch": 14.244, + "grad_norm": 0.960382878780365, + "learning_rate": 2e-05, + "loss": 0.02384898, + "step": 7122 + }, + { + "epoch": 14.246, + "grad_norm": 4.224244117736816, + "learning_rate": 2e-05, + "loss": 0.03810407, + "step": 7123 + }, + { + "epoch": 14.248, + "grad_norm": 1.0069541931152344, + "learning_rate": 2e-05, + "loss": 0.03257271, + "step": 7124 + }, + { + "epoch": 14.25, + "grad_norm": 1.2396907806396484, + "learning_rate": 2e-05, + "loss": 0.03927642, + "step": 7125 + }, + { + "epoch": 14.252, + "grad_norm": 1.199365258216858, + "learning_rate": 2e-05, + "loss": 0.04600928, + "step": 7126 + }, + { + "epoch": 14.254, + "grad_norm": 1.4900243282318115, + "learning_rate": 2e-05, + "loss": 0.03931838, + "step": 7127 + }, + { + "epoch": 14.256, + "grad_norm": 1.528289556503296, + "learning_rate": 2e-05, + "loss": 0.02803968, + "step": 7128 + }, + { + "epoch": 14.258, + "grad_norm": 1.347772240638733, + "learning_rate": 2e-05, + "loss": 0.03262594, + "step": 7129 + }, + { + "epoch": 14.26, + "grad_norm": 2.532207489013672, + "learning_rate": 2e-05, + "loss": 0.05416807, + "step": 7130 + }, + { + "epoch": 14.262, + "grad_norm": 1.2191015481948853, + "learning_rate": 2e-05, + "loss": 0.03117691, + "step": 7131 + }, + { + "epoch": 14.264, + "grad_norm": 2.1217501163482666, + "learning_rate": 2e-05, + "loss": 0.03810145, + "step": 7132 + }, + { + "epoch": 14.266, + "grad_norm": 1.506582498550415, + "learning_rate": 2e-05, + "loss": 0.0460251, + "step": 7133 + }, + { + "epoch": 14.268, + "grad_norm": 1.2872868776321411, + "learning_rate": 2e-05, + "loss": 
0.04550285, + "step": 7134 + }, + { + "epoch": 14.27, + "grad_norm": 1.8632010221481323, + "learning_rate": 2e-05, + "loss": 0.04599176, + "step": 7135 + }, + { + "epoch": 14.272, + "grad_norm": 1.4420409202575684, + "learning_rate": 2e-05, + "loss": 0.04499146, + "step": 7136 + }, + { + "epoch": 14.274000000000001, + "grad_norm": 2.0591557025909424, + "learning_rate": 2e-05, + "loss": 0.04166877, + "step": 7137 + }, + { + "epoch": 14.276, + "grad_norm": 2.0912609100341797, + "learning_rate": 2e-05, + "loss": 0.05226006, + "step": 7138 + }, + { + "epoch": 14.278, + "grad_norm": 1.5423698425292969, + "learning_rate": 2e-05, + "loss": 0.03722187, + "step": 7139 + }, + { + "epoch": 14.28, + "grad_norm": 2.2733895778656006, + "learning_rate": 2e-05, + "loss": 0.04837454, + "step": 7140 + }, + { + "epoch": 14.282, + "grad_norm": 1.218680739402771, + "learning_rate": 2e-05, + "loss": 0.03407823, + "step": 7141 + }, + { + "epoch": 14.284, + "grad_norm": 1.3564642667770386, + "learning_rate": 2e-05, + "loss": 0.04712793, + "step": 7142 + }, + { + "epoch": 14.286, + "grad_norm": 1.7670422792434692, + "learning_rate": 2e-05, + "loss": 0.04199456, + "step": 7143 + }, + { + "epoch": 14.288, + "grad_norm": 1.2026097774505615, + "learning_rate": 2e-05, + "loss": 0.03444853, + "step": 7144 + }, + { + "epoch": 14.29, + "grad_norm": 1.443743348121643, + "learning_rate": 2e-05, + "loss": 0.04106067, + "step": 7145 + }, + { + "epoch": 14.292, + "grad_norm": 1.2027446031570435, + "learning_rate": 2e-05, + "loss": 0.03707718, + "step": 7146 + }, + { + "epoch": 14.294, + "grad_norm": 1.0716592073440552, + "learning_rate": 2e-05, + "loss": 0.04112831, + "step": 7147 + }, + { + "epoch": 14.296, + "grad_norm": 1.7344224452972412, + "learning_rate": 2e-05, + "loss": 0.03029624, + "step": 7148 + }, + { + "epoch": 14.298, + "grad_norm": 2.744713068008423, + "learning_rate": 2e-05, + "loss": 0.03705482, + "step": 7149 + }, + { + "epoch": 14.3, + "grad_norm": 1.1636744737625122, + "learning_rate": 2e-05, + "loss": 0.0361629, + "step": 7150 + }, + { + "epoch": 14.302, + "grad_norm": 1.6198397874832153, + "learning_rate": 2e-05, + "loss": 0.05550705, + "step": 7151 + }, + { + "epoch": 14.304, + "grad_norm": 1.4462108612060547, + "learning_rate": 2e-05, + "loss": 0.03110327, + "step": 7152 + }, + { + "epoch": 14.306, + "grad_norm": 2.7012014389038086, + "learning_rate": 2e-05, + "loss": 0.05303191, + "step": 7153 + }, + { + "epoch": 14.308, + "grad_norm": 0.8201614022254944, + "learning_rate": 2e-05, + "loss": 0.02300132, + "step": 7154 + }, + { + "epoch": 14.31, + "grad_norm": 1.4760310649871826, + "learning_rate": 2e-05, + "loss": 0.04489978, + "step": 7155 + }, + { + "epoch": 14.312, + "grad_norm": 1.3183153867721558, + "learning_rate": 2e-05, + "loss": 0.04012213, + "step": 7156 + }, + { + "epoch": 14.314, + "grad_norm": 1.5775775909423828, + "learning_rate": 2e-05, + "loss": 0.04109515, + "step": 7157 + }, + { + "epoch": 14.316, + "grad_norm": 1.6039639711380005, + "learning_rate": 2e-05, + "loss": 0.04469662, + "step": 7158 + }, + { + "epoch": 14.318, + "grad_norm": 1.8237558603286743, + "learning_rate": 2e-05, + "loss": 0.05006624, + "step": 7159 + }, + { + "epoch": 14.32, + "grad_norm": 1.929965615272522, + "learning_rate": 2e-05, + "loss": 0.05642921, + "step": 7160 + }, + { + "epoch": 14.322, + "grad_norm": 1.932225227355957, + "learning_rate": 2e-05, + "loss": 0.05633236, + "step": 7161 + }, + { + "epoch": 14.324, + "grad_norm": 1.1771067380905151, + "learning_rate": 2e-05, + "loss": 0.03297538, + "step": 7162 
+ }, + { + "epoch": 14.326, + "grad_norm": 1.7193629741668701, + "learning_rate": 2e-05, + "loss": 0.04758242, + "step": 7163 + }, + { + "epoch": 14.328, + "grad_norm": 1.122786521911621, + "learning_rate": 2e-05, + "loss": 0.04002685, + "step": 7164 + }, + { + "epoch": 14.33, + "grad_norm": 1.0502139329910278, + "learning_rate": 2e-05, + "loss": 0.03614987, + "step": 7165 + }, + { + "epoch": 14.332, + "grad_norm": 1.1357371807098389, + "learning_rate": 2e-05, + "loss": 0.03864967, + "step": 7166 + }, + { + "epoch": 14.334, + "grad_norm": 1.074387788772583, + "learning_rate": 2e-05, + "loss": 0.03561043, + "step": 7167 + }, + { + "epoch": 14.336, + "grad_norm": 1.5098060369491577, + "learning_rate": 2e-05, + "loss": 0.04273804, + "step": 7168 + }, + { + "epoch": 14.338, + "grad_norm": 1.3217259645462036, + "learning_rate": 2e-05, + "loss": 0.05451931, + "step": 7169 + }, + { + "epoch": 14.34, + "grad_norm": 1.2757598161697388, + "learning_rate": 2e-05, + "loss": 0.03788436, + "step": 7170 + }, + { + "epoch": 14.342, + "grad_norm": 0.9583383798599243, + "learning_rate": 2e-05, + "loss": 0.03389092, + "step": 7171 + }, + { + "epoch": 14.344, + "grad_norm": 1.8409156799316406, + "learning_rate": 2e-05, + "loss": 0.05510804, + "step": 7172 + }, + { + "epoch": 14.346, + "grad_norm": 1.3510578870773315, + "learning_rate": 2e-05, + "loss": 0.03992224, + "step": 7173 + }, + { + "epoch": 14.348, + "grad_norm": 1.52592933177948, + "learning_rate": 2e-05, + "loss": 0.04563883, + "step": 7174 + }, + { + "epoch": 14.35, + "grad_norm": 1.946403980255127, + "learning_rate": 2e-05, + "loss": 0.03222995, + "step": 7175 + }, + { + "epoch": 14.352, + "grad_norm": 1.9366340637207031, + "learning_rate": 2e-05, + "loss": 0.05663209, + "step": 7176 + }, + { + "epoch": 14.354, + "grad_norm": 0.7667877078056335, + "learning_rate": 2e-05, + "loss": 0.01848971, + "step": 7177 + }, + { + "epoch": 14.356, + "grad_norm": 2.2338192462921143, + "learning_rate": 2e-05, + "loss": 0.0433307, + "step": 7178 + }, + { + "epoch": 14.358, + "grad_norm": 3.6205711364746094, + "learning_rate": 2e-05, + "loss": 0.0502264, + "step": 7179 + }, + { + "epoch": 14.36, + "grad_norm": 1.8776448965072632, + "learning_rate": 2e-05, + "loss": 0.05654108, + "step": 7180 + }, + { + "epoch": 14.362, + "grad_norm": 1.7132173776626587, + "learning_rate": 2e-05, + "loss": 0.03880975, + "step": 7181 + }, + { + "epoch": 14.364, + "grad_norm": 1.5955792665481567, + "learning_rate": 2e-05, + "loss": 0.04336097, + "step": 7182 + }, + { + "epoch": 14.366, + "grad_norm": 1.046798825263977, + "learning_rate": 2e-05, + "loss": 0.03609883, + "step": 7183 + }, + { + "epoch": 14.368, + "grad_norm": 1.6806223392486572, + "learning_rate": 2e-05, + "loss": 0.04325953, + "step": 7184 + }, + { + "epoch": 14.37, + "grad_norm": 1.8983646631240845, + "learning_rate": 2e-05, + "loss": 0.04521517, + "step": 7185 + }, + { + "epoch": 14.372, + "grad_norm": 2.0524489879608154, + "learning_rate": 2e-05, + "loss": 0.03215559, + "step": 7186 + }, + { + "epoch": 14.374, + "grad_norm": 1.1298598051071167, + "learning_rate": 2e-05, + "loss": 0.03601304, + "step": 7187 + }, + { + "epoch": 14.376, + "grad_norm": 1.4891972541809082, + "learning_rate": 2e-05, + "loss": 0.03756123, + "step": 7188 + }, + { + "epoch": 14.378, + "grad_norm": 1.3420482873916626, + "learning_rate": 2e-05, + "loss": 0.04583802, + "step": 7189 + }, + { + "epoch": 14.38, + "grad_norm": 5.2002716064453125, + "learning_rate": 2e-05, + "loss": 0.04100659, + "step": 7190 + }, + { + "epoch": 14.382, + 
"grad_norm": 1.079121470451355, + "learning_rate": 2e-05, + "loss": 0.03196656, + "step": 7191 + }, + { + "epoch": 14.384, + "grad_norm": 2.1130754947662354, + "learning_rate": 2e-05, + "loss": 0.05068072, + "step": 7192 + }, + { + "epoch": 14.386, + "grad_norm": 1.6315432786941528, + "learning_rate": 2e-05, + "loss": 0.03901272, + "step": 7193 + }, + { + "epoch": 14.388, + "grad_norm": 1.7703206539154053, + "learning_rate": 2e-05, + "loss": 0.03575943, + "step": 7194 + }, + { + "epoch": 14.39, + "grad_norm": 1.4890453815460205, + "learning_rate": 2e-05, + "loss": 0.03911097, + "step": 7195 + }, + { + "epoch": 14.392, + "grad_norm": 1.2721261978149414, + "learning_rate": 2e-05, + "loss": 0.04248876, + "step": 7196 + }, + { + "epoch": 14.394, + "grad_norm": 1.8303420543670654, + "learning_rate": 2e-05, + "loss": 0.05177449, + "step": 7197 + }, + { + "epoch": 14.396, + "grad_norm": 1.322717308998108, + "learning_rate": 2e-05, + "loss": 0.02881128, + "step": 7198 + }, + { + "epoch": 14.398, + "grad_norm": 1.5059839487075806, + "learning_rate": 2e-05, + "loss": 0.04383879, + "step": 7199 + }, + { + "epoch": 14.4, + "grad_norm": 1.64779794216156, + "learning_rate": 2e-05, + "loss": 0.0525568, + "step": 7200 + }, + { + "epoch": 14.402, + "grad_norm": 2.455418586730957, + "learning_rate": 2e-05, + "loss": 0.04642213, + "step": 7201 + }, + { + "epoch": 14.404, + "grad_norm": 1.2725389003753662, + "learning_rate": 2e-05, + "loss": 0.03739651, + "step": 7202 + }, + { + "epoch": 14.406, + "grad_norm": 1.4967567920684814, + "learning_rate": 2e-05, + "loss": 0.04378046, + "step": 7203 + }, + { + "epoch": 14.408, + "grad_norm": 1.1510059833526611, + "learning_rate": 2e-05, + "loss": 0.03842362, + "step": 7204 + }, + { + "epoch": 14.41, + "grad_norm": 1.4995372295379639, + "learning_rate": 2e-05, + "loss": 0.04412612, + "step": 7205 + }, + { + "epoch": 14.412, + "grad_norm": 0.94809490442276, + "learning_rate": 2e-05, + "loss": 0.03201088, + "step": 7206 + }, + { + "epoch": 14.414, + "grad_norm": 1.5775501728057861, + "learning_rate": 2e-05, + "loss": 0.03421389, + "step": 7207 + }, + { + "epoch": 14.416, + "grad_norm": 3.60548996925354, + "learning_rate": 2e-05, + "loss": 0.0545315, + "step": 7208 + }, + { + "epoch": 14.418, + "grad_norm": 1.0393273830413818, + "learning_rate": 2e-05, + "loss": 0.03813236, + "step": 7209 + }, + { + "epoch": 14.42, + "grad_norm": 1.0074549913406372, + "learning_rate": 2e-05, + "loss": 0.02983594, + "step": 7210 + }, + { + "epoch": 14.422, + "grad_norm": 1.203730583190918, + "learning_rate": 2e-05, + "loss": 0.04196813, + "step": 7211 + }, + { + "epoch": 14.424, + "grad_norm": 1.2036176919937134, + "learning_rate": 2e-05, + "loss": 0.02699343, + "step": 7212 + }, + { + "epoch": 14.426, + "grad_norm": 1.3977973461151123, + "learning_rate": 2e-05, + "loss": 0.02824752, + "step": 7213 + }, + { + "epoch": 14.428, + "grad_norm": 1.019425392150879, + "learning_rate": 2e-05, + "loss": 0.03191173, + "step": 7214 + }, + { + "epoch": 14.43, + "grad_norm": 0.8422673940658569, + "learning_rate": 2e-05, + "loss": 0.02386178, + "step": 7215 + }, + { + "epoch": 14.432, + "grad_norm": 1.4867335557937622, + "learning_rate": 2e-05, + "loss": 0.03482441, + "step": 7216 + }, + { + "epoch": 14.434, + "grad_norm": 2.1186928749084473, + "learning_rate": 2e-05, + "loss": 0.05948957, + "step": 7217 + }, + { + "epoch": 14.436, + "grad_norm": 1.7550668716430664, + "learning_rate": 2e-05, + "loss": 0.04442699, + "step": 7218 + }, + { + "epoch": 14.438, + "grad_norm": 1.0397560596466064, + 
"learning_rate": 2e-05, + "loss": 0.0290125, + "step": 7219 + }, + { + "epoch": 14.44, + "grad_norm": 1.3454571962356567, + "learning_rate": 2e-05, + "loss": 0.03583448, + "step": 7220 + }, + { + "epoch": 14.442, + "grad_norm": 1.2962967157363892, + "learning_rate": 2e-05, + "loss": 0.0369962, + "step": 7221 + }, + { + "epoch": 14.444, + "grad_norm": 1.420045256614685, + "learning_rate": 2e-05, + "loss": 0.03899933, + "step": 7222 + }, + { + "epoch": 14.446, + "grad_norm": 2.546292781829834, + "learning_rate": 2e-05, + "loss": 0.05628195, + "step": 7223 + }, + { + "epoch": 14.448, + "grad_norm": 1.8872132301330566, + "learning_rate": 2e-05, + "loss": 0.03206664, + "step": 7224 + }, + { + "epoch": 14.45, + "grad_norm": 0.9982779622077942, + "learning_rate": 2e-05, + "loss": 0.01746661, + "step": 7225 + }, + { + "epoch": 14.452, + "grad_norm": 2.069234609603882, + "learning_rate": 2e-05, + "loss": 0.04197174, + "step": 7226 + }, + { + "epoch": 14.454, + "grad_norm": 1.5738203525543213, + "learning_rate": 2e-05, + "loss": 0.04109071, + "step": 7227 + }, + { + "epoch": 14.456, + "grad_norm": 1.1500931978225708, + "learning_rate": 2e-05, + "loss": 0.03633848, + "step": 7228 + }, + { + "epoch": 14.458, + "grad_norm": 1.2482191324234009, + "learning_rate": 2e-05, + "loss": 0.04544804, + "step": 7229 + }, + { + "epoch": 14.46, + "grad_norm": 1.5206077098846436, + "learning_rate": 2e-05, + "loss": 0.03966599, + "step": 7230 + }, + { + "epoch": 14.462, + "grad_norm": 1.76610267162323, + "learning_rate": 2e-05, + "loss": 0.04585, + "step": 7231 + }, + { + "epoch": 14.464, + "grad_norm": 1.5877853631973267, + "learning_rate": 2e-05, + "loss": 0.03790272, + "step": 7232 + }, + { + "epoch": 14.466, + "grad_norm": 1.1064239740371704, + "learning_rate": 2e-05, + "loss": 0.03053769, + "step": 7233 + }, + { + "epoch": 14.468, + "grad_norm": 1.0071042776107788, + "learning_rate": 2e-05, + "loss": 0.03742989, + "step": 7234 + }, + { + "epoch": 14.47, + "grad_norm": 2.6944468021392822, + "learning_rate": 2e-05, + "loss": 0.05254611, + "step": 7235 + }, + { + "epoch": 14.472, + "grad_norm": 1.4075301885604858, + "learning_rate": 2e-05, + "loss": 0.05554926, + "step": 7236 + }, + { + "epoch": 14.474, + "grad_norm": 1.048970103263855, + "learning_rate": 2e-05, + "loss": 0.0344836, + "step": 7237 + }, + { + "epoch": 14.475999999999999, + "grad_norm": 0.9587178826332092, + "learning_rate": 2e-05, + "loss": 0.03106603, + "step": 7238 + }, + { + "epoch": 14.478, + "grad_norm": 1.0842715501785278, + "learning_rate": 2e-05, + "loss": 0.03258051, + "step": 7239 + }, + { + "epoch": 14.48, + "grad_norm": 1.1892338991165161, + "learning_rate": 2e-05, + "loss": 0.04680737, + "step": 7240 + }, + { + "epoch": 14.482, + "grad_norm": 1.856372594833374, + "learning_rate": 2e-05, + "loss": 0.05747771, + "step": 7241 + }, + { + "epoch": 14.484, + "grad_norm": 1.1554561853408813, + "learning_rate": 2e-05, + "loss": 0.02923879, + "step": 7242 + }, + { + "epoch": 14.486, + "grad_norm": 1.0455029010772705, + "learning_rate": 2e-05, + "loss": 0.03191773, + "step": 7243 + }, + { + "epoch": 14.488, + "grad_norm": 2.2019166946411133, + "learning_rate": 2e-05, + "loss": 0.04987948, + "step": 7244 + }, + { + "epoch": 14.49, + "grad_norm": 1.404489278793335, + "learning_rate": 2e-05, + "loss": 0.03941087, + "step": 7245 + }, + { + "epoch": 14.492, + "grad_norm": 1.0887120962142944, + "learning_rate": 2e-05, + "loss": 0.03620033, + "step": 7246 + }, + { + "epoch": 14.494, + "grad_norm": 1.727051019668579, + "learning_rate": 2e-05, + "loss": 
0.03951932, + "step": 7247 + }, + { + "epoch": 14.496, + "grad_norm": 1.1405435800552368, + "learning_rate": 2e-05, + "loss": 0.04180471, + "step": 7248 + }, + { + "epoch": 14.498, + "grad_norm": 1.45590341091156, + "learning_rate": 2e-05, + "loss": 0.035485, + "step": 7249 + }, + { + "epoch": 14.5, + "grad_norm": 1.9948844909667969, + "learning_rate": 2e-05, + "loss": 0.04214467, + "step": 7250 + }, + { + "epoch": 14.502, + "grad_norm": 1.238334059715271, + "learning_rate": 2e-05, + "loss": 0.03087533, + "step": 7251 + }, + { + "epoch": 14.504, + "grad_norm": 1.136520504951477, + "learning_rate": 2e-05, + "loss": 0.03111343, + "step": 7252 + }, + { + "epoch": 14.506, + "grad_norm": 1.5107890367507935, + "learning_rate": 2e-05, + "loss": 0.03291022, + "step": 7253 + }, + { + "epoch": 14.508, + "grad_norm": 1.3895598649978638, + "learning_rate": 2e-05, + "loss": 0.04216557, + "step": 7254 + }, + { + "epoch": 14.51, + "grad_norm": 2.2805137634277344, + "learning_rate": 2e-05, + "loss": 0.04727894, + "step": 7255 + }, + { + "epoch": 14.512, + "grad_norm": 1.5463753938674927, + "learning_rate": 2e-05, + "loss": 0.03055894, + "step": 7256 + }, + { + "epoch": 14.514, + "grad_norm": 0.9866666793823242, + "learning_rate": 2e-05, + "loss": 0.02884018, + "step": 7257 + }, + { + "epoch": 14.516, + "grad_norm": 1.2003675699234009, + "learning_rate": 2e-05, + "loss": 0.03621934, + "step": 7258 + }, + { + "epoch": 14.518, + "grad_norm": 2.279313802719116, + "learning_rate": 2e-05, + "loss": 0.08503312, + "step": 7259 + }, + { + "epoch": 14.52, + "grad_norm": 2.878683090209961, + "learning_rate": 2e-05, + "loss": 0.03383691, + "step": 7260 + }, + { + "epoch": 14.522, + "grad_norm": 1.457414150238037, + "learning_rate": 2e-05, + "loss": 0.04102724, + "step": 7261 + }, + { + "epoch": 14.524000000000001, + "grad_norm": 1.2874579429626465, + "learning_rate": 2e-05, + "loss": 0.02973592, + "step": 7262 + }, + { + "epoch": 14.526, + "grad_norm": 1.9015241861343384, + "learning_rate": 2e-05, + "loss": 0.04436665, + "step": 7263 + }, + { + "epoch": 14.528, + "grad_norm": 1.2383368015289307, + "learning_rate": 2e-05, + "loss": 0.03040004, + "step": 7264 + }, + { + "epoch": 14.53, + "grad_norm": 1.5358835458755493, + "learning_rate": 2e-05, + "loss": 0.05725804, + "step": 7265 + }, + { + "epoch": 14.532, + "grad_norm": 2.0177321434020996, + "learning_rate": 2e-05, + "loss": 0.04995024, + "step": 7266 + }, + { + "epoch": 14.534, + "grad_norm": 1.6162405014038086, + "learning_rate": 2e-05, + "loss": 0.03423398, + "step": 7267 + }, + { + "epoch": 14.536, + "grad_norm": 1.3776979446411133, + "learning_rate": 2e-05, + "loss": 0.03925561, + "step": 7268 + }, + { + "epoch": 14.538, + "grad_norm": 1.2735576629638672, + "learning_rate": 2e-05, + "loss": 0.03249297, + "step": 7269 + }, + { + "epoch": 14.54, + "grad_norm": 1.3519257307052612, + "learning_rate": 2e-05, + "loss": 0.03134355, + "step": 7270 + }, + { + "epoch": 14.542, + "grad_norm": 1.2011692523956299, + "learning_rate": 2e-05, + "loss": 0.03818227, + "step": 7271 + }, + { + "epoch": 14.544, + "grad_norm": 1.6174978017807007, + "learning_rate": 2e-05, + "loss": 0.04904899, + "step": 7272 + }, + { + "epoch": 14.546, + "grad_norm": 1.3549933433532715, + "learning_rate": 2e-05, + "loss": 0.04195243, + "step": 7273 + }, + { + "epoch": 14.548, + "grad_norm": 1.4086804389953613, + "learning_rate": 2e-05, + "loss": 0.03927261, + "step": 7274 + }, + { + "epoch": 14.55, + "grad_norm": 1.495468258857727, + "learning_rate": 2e-05, + "loss": 0.03486994, + "step": 7275 + }, 
+ { + "epoch": 14.552, + "grad_norm": 1.4550012350082397, + "learning_rate": 2e-05, + "loss": 0.04446776, + "step": 7276 + }, + { + "epoch": 14.554, + "grad_norm": 1.0507636070251465, + "learning_rate": 2e-05, + "loss": 0.02539073, + "step": 7277 + }, + { + "epoch": 14.556000000000001, + "grad_norm": 6.91660213470459, + "learning_rate": 2e-05, + "loss": 0.03748736, + "step": 7278 + }, + { + "epoch": 14.558, + "grad_norm": 2.021453857421875, + "learning_rate": 2e-05, + "loss": 0.05216369, + "step": 7279 + }, + { + "epoch": 14.56, + "grad_norm": 1.6024717092514038, + "learning_rate": 2e-05, + "loss": 0.05282374, + "step": 7280 + }, + { + "epoch": 14.562, + "grad_norm": 1.2320939302444458, + "learning_rate": 2e-05, + "loss": 0.03876211, + "step": 7281 + }, + { + "epoch": 14.564, + "grad_norm": 1.198306679725647, + "learning_rate": 2e-05, + "loss": 0.03282074, + "step": 7282 + }, + { + "epoch": 14.566, + "grad_norm": 1.5773409605026245, + "learning_rate": 2e-05, + "loss": 0.03945959, + "step": 7283 + }, + { + "epoch": 14.568, + "grad_norm": 1.21599543094635, + "learning_rate": 2e-05, + "loss": 0.04125897, + "step": 7284 + }, + { + "epoch": 14.57, + "grad_norm": 2.8407862186431885, + "learning_rate": 2e-05, + "loss": 0.04945897, + "step": 7285 + }, + { + "epoch": 14.572, + "grad_norm": 1.447121500968933, + "learning_rate": 2e-05, + "loss": 0.03155572, + "step": 7286 + }, + { + "epoch": 14.574, + "grad_norm": 1.1770319938659668, + "learning_rate": 2e-05, + "loss": 0.03424954, + "step": 7287 + }, + { + "epoch": 14.576, + "grad_norm": 1.8243449926376343, + "learning_rate": 2e-05, + "loss": 0.04884764, + "step": 7288 + }, + { + "epoch": 14.578, + "grad_norm": 0.871478259563446, + "learning_rate": 2e-05, + "loss": 0.02348526, + "step": 7289 + }, + { + "epoch": 14.58, + "grad_norm": 1.1215943098068237, + "learning_rate": 2e-05, + "loss": 0.03072283, + "step": 7290 + }, + { + "epoch": 14.582, + "grad_norm": 1.7615373134613037, + "learning_rate": 2e-05, + "loss": 0.05309131, + "step": 7291 + }, + { + "epoch": 14.584, + "grad_norm": 2.3434910774230957, + "learning_rate": 2e-05, + "loss": 0.03328982, + "step": 7292 + }, + { + "epoch": 14.586, + "grad_norm": 2.394543409347534, + "learning_rate": 2e-05, + "loss": 0.03665608, + "step": 7293 + }, + { + "epoch": 14.588, + "grad_norm": 1.7310210466384888, + "learning_rate": 2e-05, + "loss": 0.04254977, + "step": 7294 + }, + { + "epoch": 14.59, + "grad_norm": 1.5410484075546265, + "learning_rate": 2e-05, + "loss": 0.03172578, + "step": 7295 + }, + { + "epoch": 14.592, + "grad_norm": 1.7711609601974487, + "learning_rate": 2e-05, + "loss": 0.04883514, + "step": 7296 + }, + { + "epoch": 14.594, + "grad_norm": 1.606502652168274, + "learning_rate": 2e-05, + "loss": 0.03369958, + "step": 7297 + }, + { + "epoch": 14.596, + "grad_norm": 1.9675401449203491, + "learning_rate": 2e-05, + "loss": 0.03461675, + "step": 7298 + }, + { + "epoch": 14.598, + "grad_norm": 1.6831592321395874, + "learning_rate": 2e-05, + "loss": 0.04358295, + "step": 7299 + }, + { + "epoch": 14.6, + "grad_norm": 1.846706748008728, + "learning_rate": 2e-05, + "loss": 0.04121499, + "step": 7300 + }, + { + "epoch": 14.602, + "grad_norm": 1.6495356559753418, + "learning_rate": 2e-05, + "loss": 0.04087245, + "step": 7301 + }, + { + "epoch": 14.604, + "grad_norm": 1.3375145196914673, + "learning_rate": 2e-05, + "loss": 0.02481322, + "step": 7302 + }, + { + "epoch": 14.606, + "grad_norm": 1.3421443700790405, + "learning_rate": 2e-05, + "loss": 0.04807069, + "step": 7303 + }, + { + "epoch": 14.608, + 
"grad_norm": 1.5804858207702637, + "learning_rate": 2e-05, + "loss": 0.03445378, + "step": 7304 + }, + { + "epoch": 14.61, + "grad_norm": 3.078519344329834, + "learning_rate": 2e-05, + "loss": 0.05874301, + "step": 7305 + }, + { + "epoch": 14.612, + "grad_norm": 1.269014596939087, + "learning_rate": 2e-05, + "loss": 0.03813095, + "step": 7306 + }, + { + "epoch": 14.614, + "grad_norm": 1.667083978652954, + "learning_rate": 2e-05, + "loss": 0.05303952, + "step": 7307 + }, + { + "epoch": 14.616, + "grad_norm": 1.014502763748169, + "learning_rate": 2e-05, + "loss": 0.0317478, + "step": 7308 + }, + { + "epoch": 14.618, + "grad_norm": 1.066616177558899, + "learning_rate": 2e-05, + "loss": 0.02247791, + "step": 7309 + }, + { + "epoch": 14.62, + "grad_norm": 1.208592176437378, + "learning_rate": 2e-05, + "loss": 0.02994377, + "step": 7310 + }, + { + "epoch": 14.622, + "grad_norm": 0.8381512761116028, + "learning_rate": 2e-05, + "loss": 0.02272216, + "step": 7311 + }, + { + "epoch": 14.624, + "grad_norm": 1.4301813840866089, + "learning_rate": 2e-05, + "loss": 0.04580442, + "step": 7312 + }, + { + "epoch": 14.626, + "grad_norm": 1.006054401397705, + "learning_rate": 2e-05, + "loss": 0.02406761, + "step": 7313 + }, + { + "epoch": 14.628, + "grad_norm": 2.2879605293273926, + "learning_rate": 2e-05, + "loss": 0.05421574, + "step": 7314 + }, + { + "epoch": 14.63, + "grad_norm": 1.347216010093689, + "learning_rate": 2e-05, + "loss": 0.03223877, + "step": 7315 + }, + { + "epoch": 14.632, + "grad_norm": 1.3925851583480835, + "learning_rate": 2e-05, + "loss": 0.04525027, + "step": 7316 + }, + { + "epoch": 14.634, + "grad_norm": 1.5058016777038574, + "learning_rate": 2e-05, + "loss": 0.05671232, + "step": 7317 + }, + { + "epoch": 14.636, + "grad_norm": 1.2585381269454956, + "learning_rate": 2e-05, + "loss": 0.04648329, + "step": 7318 + }, + { + "epoch": 14.638, + "grad_norm": 2.345365047454834, + "learning_rate": 2e-05, + "loss": 0.02963935, + "step": 7319 + }, + { + "epoch": 14.64, + "grad_norm": 1.1654967069625854, + "learning_rate": 2e-05, + "loss": 0.0446506, + "step": 7320 + }, + { + "epoch": 14.642, + "grad_norm": 2.1558237075805664, + "learning_rate": 2e-05, + "loss": 0.04308884, + "step": 7321 + }, + { + "epoch": 14.644, + "grad_norm": 1.271641492843628, + "learning_rate": 2e-05, + "loss": 0.0442902, + "step": 7322 + }, + { + "epoch": 14.646, + "grad_norm": 1.7214982509613037, + "learning_rate": 2e-05, + "loss": 0.0474602, + "step": 7323 + }, + { + "epoch": 14.648, + "grad_norm": 1.1854116916656494, + "learning_rate": 2e-05, + "loss": 0.04027828, + "step": 7324 + }, + { + "epoch": 14.65, + "grad_norm": 1.9680629968643188, + "learning_rate": 2e-05, + "loss": 0.04050666, + "step": 7325 + }, + { + "epoch": 14.652, + "grad_norm": 1.021872878074646, + "learning_rate": 2e-05, + "loss": 0.03418166, + "step": 7326 + }, + { + "epoch": 14.654, + "grad_norm": 1.3861567974090576, + "learning_rate": 2e-05, + "loss": 0.04764885, + "step": 7327 + }, + { + "epoch": 14.656, + "grad_norm": 0.8934645056724548, + "learning_rate": 2e-05, + "loss": 0.02765589, + "step": 7328 + }, + { + "epoch": 14.658, + "grad_norm": 1.2780135869979858, + "learning_rate": 2e-05, + "loss": 0.03148212, + "step": 7329 + }, + { + "epoch": 14.66, + "grad_norm": 1.3805195093154907, + "learning_rate": 2e-05, + "loss": 0.04608523, + "step": 7330 + }, + { + "epoch": 14.662, + "grad_norm": 1.4747601747512817, + "learning_rate": 2e-05, + "loss": 0.0399397, + "step": 7331 + }, + { + "epoch": 14.664, + "grad_norm": 2.1372389793395996, + 
"learning_rate": 2e-05, + "loss": 0.05007102, + "step": 7332 + }, + { + "epoch": 14.666, + "grad_norm": 1.1365556716918945, + "learning_rate": 2e-05, + "loss": 0.0315646, + "step": 7333 + }, + { + "epoch": 14.668, + "grad_norm": 1.1190814971923828, + "learning_rate": 2e-05, + "loss": 0.03757735, + "step": 7334 + }, + { + "epoch": 14.67, + "grad_norm": 1.0525307655334473, + "learning_rate": 2e-05, + "loss": 0.02962991, + "step": 7335 + }, + { + "epoch": 14.672, + "grad_norm": 1.0830663442611694, + "learning_rate": 2e-05, + "loss": 0.03256768, + "step": 7336 + }, + { + "epoch": 14.674, + "grad_norm": 1.400888204574585, + "learning_rate": 2e-05, + "loss": 0.03843899, + "step": 7337 + }, + { + "epoch": 14.676, + "grad_norm": 1.4905816316604614, + "learning_rate": 2e-05, + "loss": 0.0448906, + "step": 7338 + }, + { + "epoch": 14.678, + "grad_norm": 1.2425509691238403, + "learning_rate": 2e-05, + "loss": 0.0399595, + "step": 7339 + }, + { + "epoch": 14.68, + "grad_norm": 1.1389684677124023, + "learning_rate": 2e-05, + "loss": 0.03919307, + "step": 7340 + }, + { + "epoch": 14.682, + "grad_norm": 1.3145670890808105, + "learning_rate": 2e-05, + "loss": 0.03573439, + "step": 7341 + }, + { + "epoch": 14.684, + "grad_norm": 1.6601872444152832, + "learning_rate": 2e-05, + "loss": 0.04692688, + "step": 7342 + }, + { + "epoch": 14.686, + "grad_norm": 1.6752896308898926, + "learning_rate": 2e-05, + "loss": 0.02998948, + "step": 7343 + }, + { + "epoch": 14.688, + "grad_norm": 1.4124311208724976, + "learning_rate": 2e-05, + "loss": 0.0381167, + "step": 7344 + }, + { + "epoch": 14.69, + "grad_norm": 1.7222316265106201, + "learning_rate": 2e-05, + "loss": 0.04596563, + "step": 7345 + }, + { + "epoch": 14.692, + "grad_norm": 1.5030279159545898, + "learning_rate": 2e-05, + "loss": 0.03170253, + "step": 7346 + }, + { + "epoch": 14.693999999999999, + "grad_norm": 1.9802442789077759, + "learning_rate": 2e-05, + "loss": 0.05240085, + "step": 7347 + }, + { + "epoch": 14.696, + "grad_norm": 1.9767365455627441, + "learning_rate": 2e-05, + "loss": 0.0302583, + "step": 7348 + }, + { + "epoch": 14.698, + "grad_norm": 1.5814634561538696, + "learning_rate": 2e-05, + "loss": 0.03863395, + "step": 7349 + }, + { + "epoch": 14.7, + "grad_norm": 0.9202812910079956, + "learning_rate": 2e-05, + "loss": 0.02484436, + "step": 7350 + }, + { + "epoch": 14.702, + "grad_norm": 1.5224653482437134, + "learning_rate": 2e-05, + "loss": 0.04415604, + "step": 7351 + }, + { + "epoch": 14.704, + "grad_norm": 1.4382812976837158, + "learning_rate": 2e-05, + "loss": 0.04574153, + "step": 7352 + }, + { + "epoch": 14.706, + "grad_norm": 1.1333105564117432, + "learning_rate": 2e-05, + "loss": 0.03003317, + "step": 7353 + }, + { + "epoch": 14.708, + "grad_norm": 1.1706465482711792, + "learning_rate": 2e-05, + "loss": 0.03788555, + "step": 7354 + }, + { + "epoch": 14.71, + "grad_norm": 2.07646107673645, + "learning_rate": 2e-05, + "loss": 0.04297365, + "step": 7355 + }, + { + "epoch": 14.712, + "grad_norm": 1.1796342134475708, + "learning_rate": 2e-05, + "loss": 0.02585716, + "step": 7356 + }, + { + "epoch": 14.714, + "grad_norm": 1.6022311449050903, + "learning_rate": 2e-05, + "loss": 0.03057653, + "step": 7357 + }, + { + "epoch": 14.716, + "grad_norm": 1.4186314344406128, + "learning_rate": 2e-05, + "loss": 0.04783071, + "step": 7358 + }, + { + "epoch": 14.718, + "grad_norm": 1.163080096244812, + "learning_rate": 2e-05, + "loss": 0.04012655, + "step": 7359 + }, + { + "epoch": 14.72, + "grad_norm": 1.7531554698944092, + "learning_rate": 2e-05, + 
"loss": 0.06195234, + "step": 7360 + }, + { + "epoch": 14.722, + "grad_norm": 0.8606005311012268, + "learning_rate": 2e-05, + "loss": 0.02377318, + "step": 7361 + }, + { + "epoch": 14.724, + "grad_norm": 2.0389902591705322, + "learning_rate": 2e-05, + "loss": 0.03832042, + "step": 7362 + }, + { + "epoch": 14.725999999999999, + "grad_norm": 2.013206720352173, + "learning_rate": 2e-05, + "loss": 0.03251911, + "step": 7363 + }, + { + "epoch": 14.728, + "grad_norm": 1.606460690498352, + "learning_rate": 2e-05, + "loss": 0.03653719, + "step": 7364 + }, + { + "epoch": 14.73, + "grad_norm": 0.8811898827552795, + "learning_rate": 2e-05, + "loss": 0.03039603, + "step": 7365 + }, + { + "epoch": 14.732, + "grad_norm": 1.03880774974823, + "learning_rate": 2e-05, + "loss": 0.0278089, + "step": 7366 + }, + { + "epoch": 14.734, + "grad_norm": 1.3265225887298584, + "learning_rate": 2e-05, + "loss": 0.03921036, + "step": 7367 + }, + { + "epoch": 14.736, + "grad_norm": 1.2806612253189087, + "learning_rate": 2e-05, + "loss": 0.04968328, + "step": 7368 + }, + { + "epoch": 14.738, + "grad_norm": 1.5343149900436401, + "learning_rate": 2e-05, + "loss": 0.04367102, + "step": 7369 + }, + { + "epoch": 14.74, + "grad_norm": 1.47340726852417, + "learning_rate": 2e-05, + "loss": 0.03140695, + "step": 7370 + }, + { + "epoch": 14.742, + "grad_norm": 1.4524576663970947, + "learning_rate": 2e-05, + "loss": 0.03347445, + "step": 7371 + }, + { + "epoch": 14.744, + "grad_norm": 2.033576250076294, + "learning_rate": 2e-05, + "loss": 0.04897091, + "step": 7372 + }, + { + "epoch": 14.746, + "grad_norm": 0.9620274305343628, + "learning_rate": 2e-05, + "loss": 0.02743071, + "step": 7373 + }, + { + "epoch": 14.748, + "grad_norm": 1.0374791622161865, + "learning_rate": 2e-05, + "loss": 0.04549268, + "step": 7374 + }, + { + "epoch": 14.75, + "grad_norm": 1.0183595418930054, + "learning_rate": 2e-05, + "loss": 0.02789947, + "step": 7375 + }, + { + "epoch": 14.752, + "grad_norm": 0.9441385269165039, + "learning_rate": 2e-05, + "loss": 0.03046933, + "step": 7376 + }, + { + "epoch": 14.754, + "grad_norm": 1.2050881385803223, + "learning_rate": 2e-05, + "loss": 0.04037102, + "step": 7377 + }, + { + "epoch": 14.756, + "grad_norm": 1.4334502220153809, + "learning_rate": 2e-05, + "loss": 0.04295122, + "step": 7378 + }, + { + "epoch": 14.758, + "grad_norm": 1.652840495109558, + "learning_rate": 2e-05, + "loss": 0.03737336, + "step": 7379 + }, + { + "epoch": 14.76, + "grad_norm": 1.331262469291687, + "learning_rate": 2e-05, + "loss": 0.04912826, + "step": 7380 + }, + { + "epoch": 14.762, + "grad_norm": 1.5204346179962158, + "learning_rate": 2e-05, + "loss": 0.04136185, + "step": 7381 + }, + { + "epoch": 14.764, + "grad_norm": 1.3094161748886108, + "learning_rate": 2e-05, + "loss": 0.04010442, + "step": 7382 + }, + { + "epoch": 14.766, + "grad_norm": 1.716664433479309, + "learning_rate": 2e-05, + "loss": 0.04056903, + "step": 7383 + }, + { + "epoch": 14.768, + "grad_norm": 2.7664754390716553, + "learning_rate": 2e-05, + "loss": 0.04453249, + "step": 7384 + }, + { + "epoch": 14.77, + "grad_norm": 1.1787066459655762, + "learning_rate": 2e-05, + "loss": 0.04342939, + "step": 7385 + }, + { + "epoch": 14.772, + "grad_norm": 1.3815988302230835, + "learning_rate": 2e-05, + "loss": 0.04383612, + "step": 7386 + }, + { + "epoch": 14.774000000000001, + "grad_norm": 1.0485109090805054, + "learning_rate": 2e-05, + "loss": 0.03212638, + "step": 7387 + }, + { + "epoch": 14.776, + "grad_norm": 1.444206714630127, + "learning_rate": 2e-05, + "loss": 0.04447151, 
+ "step": 7388 + }, + { + "epoch": 14.778, + "grad_norm": 0.9816635847091675, + "learning_rate": 2e-05, + "loss": 0.03149556, + "step": 7389 + }, + { + "epoch": 14.78, + "grad_norm": 1.0881997346878052, + "learning_rate": 2e-05, + "loss": 0.03420866, + "step": 7390 + }, + { + "epoch": 14.782, + "grad_norm": 1.6856642961502075, + "learning_rate": 2e-05, + "loss": 0.04961979, + "step": 7391 + }, + { + "epoch": 14.784, + "grad_norm": 4.939731597900391, + "learning_rate": 2e-05, + "loss": 0.05440149, + "step": 7392 + }, + { + "epoch": 14.786, + "grad_norm": 1.1305186748504639, + "learning_rate": 2e-05, + "loss": 0.03616676, + "step": 7393 + }, + { + "epoch": 14.788, + "grad_norm": 1.8306798934936523, + "learning_rate": 2e-05, + "loss": 0.04979054, + "step": 7394 + }, + { + "epoch": 14.79, + "grad_norm": 1.601699948310852, + "learning_rate": 2e-05, + "loss": 0.0460626, + "step": 7395 + }, + { + "epoch": 14.792, + "grad_norm": 1.8894065618515015, + "learning_rate": 2e-05, + "loss": 0.05282786, + "step": 7396 + }, + { + "epoch": 14.794, + "grad_norm": 1.5035409927368164, + "learning_rate": 2e-05, + "loss": 0.03724657, + "step": 7397 + }, + { + "epoch": 14.796, + "grad_norm": 2.4653127193450928, + "learning_rate": 2e-05, + "loss": 0.03604883, + "step": 7398 + }, + { + "epoch": 14.798, + "grad_norm": 1.3967053890228271, + "learning_rate": 2e-05, + "loss": 0.04081305, + "step": 7399 + }, + { + "epoch": 14.8, + "grad_norm": 1.280266523361206, + "learning_rate": 2e-05, + "loss": 0.03488631, + "step": 7400 + }, + { + "epoch": 14.802, + "grad_norm": 1.1659793853759766, + "learning_rate": 2e-05, + "loss": 0.03118264, + "step": 7401 + }, + { + "epoch": 14.804, + "grad_norm": 1.2721163034439087, + "learning_rate": 2e-05, + "loss": 0.0195914, + "step": 7402 + }, + { + "epoch": 14.806000000000001, + "grad_norm": 0.9028842449188232, + "learning_rate": 2e-05, + "loss": 0.02684617, + "step": 7403 + }, + { + "epoch": 14.808, + "grad_norm": 1.3119491338729858, + "learning_rate": 2e-05, + "loss": 0.0373154, + "step": 7404 + }, + { + "epoch": 14.81, + "grad_norm": 2.559068202972412, + "learning_rate": 2e-05, + "loss": 0.03095316, + "step": 7405 + }, + { + "epoch": 14.812, + "grad_norm": 1.241502046585083, + "learning_rate": 2e-05, + "loss": 0.02786666, + "step": 7406 + }, + { + "epoch": 14.814, + "grad_norm": 1.2434546947479248, + "learning_rate": 2e-05, + "loss": 0.03359643, + "step": 7407 + }, + { + "epoch": 14.816, + "grad_norm": 1.3161190748214722, + "learning_rate": 2e-05, + "loss": 0.03395464, + "step": 7408 + }, + { + "epoch": 14.818, + "grad_norm": 1.1681431531906128, + "learning_rate": 2e-05, + "loss": 0.03358738, + "step": 7409 + }, + { + "epoch": 14.82, + "grad_norm": 2.086354970932007, + "learning_rate": 2e-05, + "loss": 0.0546408, + "step": 7410 + }, + { + "epoch": 14.822, + "grad_norm": 1.3456135988235474, + "learning_rate": 2e-05, + "loss": 0.04479208, + "step": 7411 + }, + { + "epoch": 14.824, + "grad_norm": 1.053308367729187, + "learning_rate": 2e-05, + "loss": 0.02650757, + "step": 7412 + }, + { + "epoch": 14.826, + "grad_norm": 1.4386178255081177, + "learning_rate": 2e-05, + "loss": 0.03358426, + "step": 7413 + }, + { + "epoch": 14.828, + "grad_norm": 1.846922755241394, + "learning_rate": 2e-05, + "loss": 0.037901, + "step": 7414 + }, + { + "epoch": 14.83, + "grad_norm": 1.393807053565979, + "learning_rate": 2e-05, + "loss": 0.05257457, + "step": 7415 + }, + { + "epoch": 14.832, + "grad_norm": 1.5299506187438965, + "learning_rate": 2e-05, + "loss": 0.04238684, + "step": 7416 + }, + { + "epoch": 
14.834, + "grad_norm": 1.1853361129760742, + "learning_rate": 2e-05, + "loss": 0.04620481, + "step": 7417 + }, + { + "epoch": 14.836, + "grad_norm": 1.9609429836273193, + "learning_rate": 2e-05, + "loss": 0.05831449, + "step": 7418 + }, + { + "epoch": 14.838, + "grad_norm": 0.9670793414115906, + "learning_rate": 2e-05, + "loss": 0.0317945, + "step": 7419 + }, + { + "epoch": 14.84, + "grad_norm": 2.3657424449920654, + "learning_rate": 2e-05, + "loss": 0.06117197, + "step": 7420 + }, + { + "epoch": 14.842, + "grad_norm": 1.2410660982131958, + "learning_rate": 2e-05, + "loss": 0.03587608, + "step": 7421 + }, + { + "epoch": 14.844, + "grad_norm": 1.5402605533599854, + "learning_rate": 2e-05, + "loss": 0.03572353, + "step": 7422 + }, + { + "epoch": 14.846, + "grad_norm": 1.4933197498321533, + "learning_rate": 2e-05, + "loss": 0.03981678, + "step": 7423 + }, + { + "epoch": 14.848, + "grad_norm": 1.4539066553115845, + "learning_rate": 2e-05, + "loss": 0.05267172, + "step": 7424 + }, + { + "epoch": 14.85, + "grad_norm": 1.239725947380066, + "learning_rate": 2e-05, + "loss": 0.04814597, + "step": 7425 + }, + { + "epoch": 14.852, + "grad_norm": 1.062360405921936, + "learning_rate": 2e-05, + "loss": 0.03394141, + "step": 7426 + }, + { + "epoch": 14.854, + "grad_norm": 1.3617041110992432, + "learning_rate": 2e-05, + "loss": 0.04762523, + "step": 7427 + }, + { + "epoch": 14.856, + "grad_norm": 1.218767762184143, + "learning_rate": 2e-05, + "loss": 0.0463365, + "step": 7428 + }, + { + "epoch": 14.858, + "grad_norm": 1.1264668703079224, + "learning_rate": 2e-05, + "loss": 0.02673528, + "step": 7429 + }, + { + "epoch": 14.86, + "grad_norm": 1.9460324048995972, + "learning_rate": 2e-05, + "loss": 0.03690345, + "step": 7430 + }, + { + "epoch": 14.862, + "grad_norm": 1.3760199546813965, + "learning_rate": 2e-05, + "loss": 0.04101796, + "step": 7431 + }, + { + "epoch": 14.864, + "grad_norm": 1.8361059427261353, + "learning_rate": 2e-05, + "loss": 0.05019148, + "step": 7432 + }, + { + "epoch": 14.866, + "grad_norm": 2.861609697341919, + "learning_rate": 2e-05, + "loss": 0.04384901, + "step": 7433 + }, + { + "epoch": 14.868, + "grad_norm": 2.696000814437866, + "learning_rate": 2e-05, + "loss": 0.04344347, + "step": 7434 + }, + { + "epoch": 14.87, + "grad_norm": 1.0894790887832642, + "learning_rate": 2e-05, + "loss": 0.03905128, + "step": 7435 + }, + { + "epoch": 14.872, + "grad_norm": 1.1012567281723022, + "learning_rate": 2e-05, + "loss": 0.0368652, + "step": 7436 + }, + { + "epoch": 14.874, + "grad_norm": 1.441910982131958, + "learning_rate": 2e-05, + "loss": 0.04437996, + "step": 7437 + }, + { + "epoch": 14.876, + "grad_norm": 1.305418610572815, + "learning_rate": 2e-05, + "loss": 0.03403091, + "step": 7438 + }, + { + "epoch": 14.878, + "grad_norm": 0.936972975730896, + "learning_rate": 2e-05, + "loss": 0.03193845, + "step": 7439 + }, + { + "epoch": 14.88, + "grad_norm": 1.2693448066711426, + "learning_rate": 2e-05, + "loss": 0.03593332, + "step": 7440 + }, + { + "epoch": 14.882, + "grad_norm": 0.9230713844299316, + "learning_rate": 2e-05, + "loss": 0.02694279, + "step": 7441 + }, + { + "epoch": 14.884, + "grad_norm": 1.4311070442199707, + "learning_rate": 2e-05, + "loss": 0.03495328, + "step": 7442 + }, + { + "epoch": 14.886, + "grad_norm": 2.3351352214813232, + "learning_rate": 2e-05, + "loss": 0.04533555, + "step": 7443 + }, + { + "epoch": 14.888, + "grad_norm": 1.6613550186157227, + "learning_rate": 2e-05, + "loss": 0.0545169, + "step": 7444 + }, + { + "epoch": 14.89, + "grad_norm": 3.134610414505005, + 
"learning_rate": 2e-05, + "loss": 0.06269054, + "step": 7445 + }, + { + "epoch": 14.892, + "grad_norm": 1.2900604009628296, + "learning_rate": 2e-05, + "loss": 0.0403505, + "step": 7446 + }, + { + "epoch": 14.894, + "grad_norm": 1.0350453853607178, + "learning_rate": 2e-05, + "loss": 0.02884517, + "step": 7447 + }, + { + "epoch": 14.896, + "grad_norm": 2.1216492652893066, + "learning_rate": 2e-05, + "loss": 0.05400493, + "step": 7448 + }, + { + "epoch": 14.898, + "grad_norm": 1.5325112342834473, + "learning_rate": 2e-05, + "loss": 0.03483828, + "step": 7449 + }, + { + "epoch": 14.9, + "grad_norm": 1.1589800119400024, + "learning_rate": 2e-05, + "loss": 0.03094343, + "step": 7450 + }, + { + "epoch": 14.902, + "grad_norm": 1.6227322816848755, + "learning_rate": 2e-05, + "loss": 0.0469828, + "step": 7451 + }, + { + "epoch": 14.904, + "grad_norm": 1.6878936290740967, + "learning_rate": 2e-05, + "loss": 0.04225905, + "step": 7452 + }, + { + "epoch": 14.906, + "grad_norm": 1.1886628866195679, + "learning_rate": 2e-05, + "loss": 0.03824829, + "step": 7453 + }, + { + "epoch": 14.908, + "grad_norm": 1.4336858987808228, + "learning_rate": 2e-05, + "loss": 0.04599863, + "step": 7454 + }, + { + "epoch": 14.91, + "grad_norm": 1.2423670291900635, + "learning_rate": 2e-05, + "loss": 0.05265063, + "step": 7455 + }, + { + "epoch": 14.912, + "grad_norm": 1.2401492595672607, + "learning_rate": 2e-05, + "loss": 0.03636886, + "step": 7456 + }, + { + "epoch": 14.914, + "grad_norm": 0.7528156638145447, + "learning_rate": 2e-05, + "loss": 0.02554782, + "step": 7457 + }, + { + "epoch": 14.916, + "grad_norm": 1.5916630029678345, + "learning_rate": 2e-05, + "loss": 0.03105753, + "step": 7458 + }, + { + "epoch": 14.918, + "grad_norm": 1.8795676231384277, + "learning_rate": 2e-05, + "loss": 0.03935018, + "step": 7459 + }, + { + "epoch": 14.92, + "grad_norm": 1.8167335987091064, + "learning_rate": 2e-05, + "loss": 0.04380289, + "step": 7460 + }, + { + "epoch": 14.922, + "grad_norm": 1.1691914796829224, + "learning_rate": 2e-05, + "loss": 0.03274843, + "step": 7461 + }, + { + "epoch": 14.924, + "grad_norm": 1.3650342226028442, + "learning_rate": 2e-05, + "loss": 0.04208003, + "step": 7462 + }, + { + "epoch": 14.926, + "grad_norm": 1.24591863155365, + "learning_rate": 2e-05, + "loss": 0.04226547, + "step": 7463 + }, + { + "epoch": 14.928, + "grad_norm": 1.3983882665634155, + "learning_rate": 2e-05, + "loss": 0.05762651, + "step": 7464 + }, + { + "epoch": 14.93, + "grad_norm": 1.1452008485794067, + "learning_rate": 2e-05, + "loss": 0.02998473, + "step": 7465 + }, + { + "epoch": 14.932, + "grad_norm": 0.9939238429069519, + "learning_rate": 2e-05, + "loss": 0.02803001, + "step": 7466 + }, + { + "epoch": 14.934, + "grad_norm": 1.5362166166305542, + "learning_rate": 2e-05, + "loss": 0.05231043, + "step": 7467 + }, + { + "epoch": 14.936, + "grad_norm": 1.7269541025161743, + "learning_rate": 2e-05, + "loss": 0.04780792, + "step": 7468 + }, + { + "epoch": 14.938, + "grad_norm": 1.1512809991836548, + "learning_rate": 2e-05, + "loss": 0.03671274, + "step": 7469 + }, + { + "epoch": 14.94, + "grad_norm": 1.2893776893615723, + "learning_rate": 2e-05, + "loss": 0.03543502, + "step": 7470 + }, + { + "epoch": 14.942, + "grad_norm": 2.2255067825317383, + "learning_rate": 2e-05, + "loss": 0.05238653, + "step": 7471 + }, + { + "epoch": 14.943999999999999, + "grad_norm": 0.9970213770866394, + "learning_rate": 2e-05, + "loss": 0.02268671, + "step": 7472 + }, + { + "epoch": 14.946, + "grad_norm": 1.0443769693374634, + "learning_rate": 2e-05, 
+ "loss": 0.03668858, + "step": 7473 + }, + { + "epoch": 14.948, + "grad_norm": 0.9335085153579712, + "learning_rate": 2e-05, + "loss": 0.03486544, + "step": 7474 + }, + { + "epoch": 14.95, + "grad_norm": 2.1096580028533936, + "learning_rate": 2e-05, + "loss": 0.06607223, + "step": 7475 + }, + { + "epoch": 14.952, + "grad_norm": 2.1149537563323975, + "learning_rate": 2e-05, + "loss": 0.03609411, + "step": 7476 + }, + { + "epoch": 14.954, + "grad_norm": 1.5304067134857178, + "learning_rate": 2e-05, + "loss": 0.03244342, + "step": 7477 + }, + { + "epoch": 14.956, + "grad_norm": 1.4157954454421997, + "learning_rate": 2e-05, + "loss": 0.04437434, + "step": 7478 + }, + { + "epoch": 14.958, + "grad_norm": 1.3668354749679565, + "learning_rate": 2e-05, + "loss": 0.03368328, + "step": 7479 + }, + { + "epoch": 14.96, + "grad_norm": 1.1357492208480835, + "learning_rate": 2e-05, + "loss": 0.03183534, + "step": 7480 + }, + { + "epoch": 14.962, + "grad_norm": 1.2593474388122559, + "learning_rate": 2e-05, + "loss": 0.03054814, + "step": 7481 + }, + { + "epoch": 14.964, + "grad_norm": 1.0651172399520874, + "learning_rate": 2e-05, + "loss": 0.03272149, + "step": 7482 + }, + { + "epoch": 14.966, + "grad_norm": 1.3296829462051392, + "learning_rate": 2e-05, + "loss": 0.02990423, + "step": 7483 + }, + { + "epoch": 14.968, + "grad_norm": 1.153295874595642, + "learning_rate": 2e-05, + "loss": 0.04132929, + "step": 7484 + }, + { + "epoch": 14.97, + "grad_norm": 1.8791780471801758, + "learning_rate": 2e-05, + "loss": 0.0307649, + "step": 7485 + }, + { + "epoch": 14.972, + "grad_norm": 1.850902795791626, + "learning_rate": 2e-05, + "loss": 0.05484937, + "step": 7486 + }, + { + "epoch": 14.974, + "grad_norm": 1.170993685722351, + "learning_rate": 2e-05, + "loss": 0.03880947, + "step": 7487 + }, + { + "epoch": 14.975999999999999, + "grad_norm": 1.0112059116363525, + "learning_rate": 2e-05, + "loss": 0.05152441, + "step": 7488 + }, + { + "epoch": 14.978, + "grad_norm": 0.9202816486358643, + "learning_rate": 2e-05, + "loss": 0.03082973, + "step": 7489 + }, + { + "epoch": 14.98, + "grad_norm": 1.5370155572891235, + "learning_rate": 2e-05, + "loss": 0.03976063, + "step": 7490 + }, + { + "epoch": 14.982, + "grad_norm": 1.6214145421981812, + "learning_rate": 2e-05, + "loss": 0.03335922, + "step": 7491 + }, + { + "epoch": 14.984, + "grad_norm": 1.5763835906982422, + "learning_rate": 2e-05, + "loss": 0.05680116, + "step": 7492 + }, + { + "epoch": 14.986, + "grad_norm": 1.2244529724121094, + "learning_rate": 2e-05, + "loss": 0.03010012, + "step": 7493 + }, + { + "epoch": 14.988, + "grad_norm": 1.0402621030807495, + "learning_rate": 2e-05, + "loss": 0.03202302, + "step": 7494 + }, + { + "epoch": 14.99, + "grad_norm": 1.1672792434692383, + "learning_rate": 2e-05, + "loss": 0.0357578, + "step": 7495 + }, + { + "epoch": 14.992, + "grad_norm": 1.5200659036636353, + "learning_rate": 2e-05, + "loss": 0.04126337, + "step": 7496 + }, + { + "epoch": 14.994, + "grad_norm": 3.4498698711395264, + "learning_rate": 2e-05, + "loss": 0.06399542, + "step": 7497 + }, + { + "epoch": 14.996, + "grad_norm": 2.6905369758605957, + "learning_rate": 2e-05, + "loss": 0.05302805, + "step": 7498 + }, + { + "epoch": 14.998, + "grad_norm": 1.3441663980484009, + "learning_rate": 2e-05, + "loss": 0.03965209, + "step": 7499 + }, + { + "epoch": 15.0, + "grad_norm": 1.0610202550888062, + "learning_rate": 2e-05, + "loss": 0.03335898, + "step": 7500 + }, + { + "epoch": 15.0, + "eval_performance": { + "AngleClassification_1": 0.982, + "AngleClassification_2": 
0.998, + "AngleClassification_3": 0.9600798403193613, + "Equal_1": 0.994, + "Equal_2": 0.9640718562874252, + "Equal_3": 0.8982035928143712, + "LineComparison_1": 1.0, + "LineComparison_2": 1.0, + "LineComparison_3": 0.9960079840319361, + "Parallel_1": 0.9939879759519038, + "Parallel_2": 0.9939879759519038, + "Parallel_3": 0.99, + "Perpendicular_1": 0.992, + "Perpendicular_2": 0.972, + "Perpendicular_3": 0.7114228456913828, + "PointLiesOnCircle_1": 0.9939879759519038, + "PointLiesOnCircle_2": 0.996, + "PointLiesOnCircle_3": 0.99, + "PointLiesOnLine_1": 0.9959919839679359, + "PointLiesOnLine_2": 0.9959919839679359, + "PointLiesOnLine_3": 0.9860279441117764 + }, + "eval_runtime": 320.0741, + "eval_samples_per_second": 32.805, + "eval_steps_per_second": 0.656, + "step": 7500 + }, + { + "epoch": 15.002, + "grad_norm": 1.5323970317840576, + "learning_rate": 2e-05, + "loss": 0.03357163, + "step": 7501 + }, + { + "epoch": 15.004, + "grad_norm": 1.106204628944397, + "learning_rate": 2e-05, + "loss": 0.02693918, + "step": 7502 + }, + { + "epoch": 15.006, + "grad_norm": 2.7281370162963867, + "learning_rate": 2e-05, + "loss": 0.07801263, + "step": 7503 + }, + { + "epoch": 15.008, + "grad_norm": 0.9546897411346436, + "learning_rate": 2e-05, + "loss": 0.0307308, + "step": 7504 + }, + { + "epoch": 15.01, + "grad_norm": 1.819274663925171, + "learning_rate": 2e-05, + "loss": 0.02562927, + "step": 7505 + }, + { + "epoch": 15.012, + "grad_norm": 2.0995659828186035, + "learning_rate": 2e-05, + "loss": 0.05087785, + "step": 7506 + }, + { + "epoch": 15.014, + "grad_norm": 1.5401643514633179, + "learning_rate": 2e-05, + "loss": 0.05163966, + "step": 7507 + }, + { + "epoch": 15.016, + "grad_norm": 3.4083080291748047, + "learning_rate": 2e-05, + "loss": 0.04428158, + "step": 7508 + }, + { + "epoch": 15.018, + "grad_norm": 3.521087646484375, + "learning_rate": 2e-05, + "loss": 0.04372428, + "step": 7509 + }, + { + "epoch": 15.02, + "grad_norm": 1.4367711544036865, + "learning_rate": 2e-05, + "loss": 0.0497531, + "step": 7510 + }, + { + "epoch": 15.022, + "grad_norm": 1.270221471786499, + "learning_rate": 2e-05, + "loss": 0.03810829, + "step": 7511 + }, + { + "epoch": 15.024, + "grad_norm": 1.317637324333191, + "learning_rate": 2e-05, + "loss": 0.0325052, + "step": 7512 + }, + { + "epoch": 15.026, + "grad_norm": 1.2471699714660645, + "learning_rate": 2e-05, + "loss": 0.02949313, + "step": 7513 + }, + { + "epoch": 15.028, + "grad_norm": 1.197861671447754, + "learning_rate": 2e-05, + "loss": 0.03684509, + "step": 7514 + }, + { + "epoch": 15.03, + "grad_norm": 1.8183475732803345, + "learning_rate": 2e-05, + "loss": 0.06737492, + "step": 7515 + }, + { + "epoch": 15.032, + "grad_norm": 1.03377366065979, + "learning_rate": 2e-05, + "loss": 0.03917066, + "step": 7516 + }, + { + "epoch": 15.034, + "grad_norm": 2.038936138153076, + "learning_rate": 2e-05, + "loss": 0.03911228, + "step": 7517 + }, + { + "epoch": 15.036, + "grad_norm": 0.945249617099762, + "learning_rate": 2e-05, + "loss": 0.02501803, + "step": 7518 + }, + { + "epoch": 15.038, + "grad_norm": 3.087181568145752, + "learning_rate": 2e-05, + "loss": 0.0438556, + "step": 7519 + }, + { + "epoch": 15.04, + "grad_norm": 1.3081676959991455, + "learning_rate": 2e-05, + "loss": 0.04287355, + "step": 7520 + }, + { + "epoch": 15.042, + "grad_norm": 1.7691004276275635, + "learning_rate": 2e-05, + "loss": 0.03438368, + "step": 7521 + }, + { + "epoch": 15.044, + "grad_norm": 1.5667086839675903, + "learning_rate": 2e-05, + "loss": 0.02922621, + "step": 7522 + }, + { + "epoch": 
15.046, + "grad_norm": 2.0105979442596436, + "learning_rate": 2e-05, + "loss": 0.04000949, + "step": 7523 + }, + { + "epoch": 15.048, + "grad_norm": 1.2400439977645874, + "learning_rate": 2e-05, + "loss": 0.0464882, + "step": 7524 + }, + { + "epoch": 15.05, + "grad_norm": 1.4981132745742798, + "learning_rate": 2e-05, + "loss": 0.0353303, + "step": 7525 + }, + { + "epoch": 15.052, + "grad_norm": 1.7976100444793701, + "learning_rate": 2e-05, + "loss": 0.04217862, + "step": 7526 + }, + { + "epoch": 15.054, + "grad_norm": 1.2558151483535767, + "learning_rate": 2e-05, + "loss": 0.03909191, + "step": 7527 + }, + { + "epoch": 15.056, + "grad_norm": 0.9325153231620789, + "learning_rate": 2e-05, + "loss": 0.03274619, + "step": 7528 + }, + { + "epoch": 15.058, + "grad_norm": 1.0450353622436523, + "learning_rate": 2e-05, + "loss": 0.03170678, + "step": 7529 + }, + { + "epoch": 15.06, + "grad_norm": 2.4308180809020996, + "learning_rate": 2e-05, + "loss": 0.04601308, + "step": 7530 + }, + { + "epoch": 15.062, + "grad_norm": 1.7169846296310425, + "learning_rate": 2e-05, + "loss": 0.03847077, + "step": 7531 + }, + { + "epoch": 15.064, + "grad_norm": 1.9664361476898193, + "learning_rate": 2e-05, + "loss": 0.06695174, + "step": 7532 + }, + { + "epoch": 15.066, + "grad_norm": 1.3882360458374023, + "learning_rate": 2e-05, + "loss": 0.03631517, + "step": 7533 + }, + { + "epoch": 15.068, + "grad_norm": 1.6631497144699097, + "learning_rate": 2e-05, + "loss": 0.03166562, + "step": 7534 + }, + { + "epoch": 15.07, + "grad_norm": 1.215599536895752, + "learning_rate": 2e-05, + "loss": 0.04256611, + "step": 7535 + }, + { + "epoch": 15.072, + "grad_norm": 1.148917555809021, + "learning_rate": 2e-05, + "loss": 0.04094999, + "step": 7536 + }, + { + "epoch": 15.074, + "grad_norm": 1.262528419494629, + "learning_rate": 2e-05, + "loss": 0.02669748, + "step": 7537 + }, + { + "epoch": 15.076, + "grad_norm": 1.4280608892440796, + "learning_rate": 2e-05, + "loss": 0.03824629, + "step": 7538 + }, + { + "epoch": 15.078, + "grad_norm": 1.1914440393447876, + "learning_rate": 2e-05, + "loss": 0.03420604, + "step": 7539 + }, + { + "epoch": 15.08, + "grad_norm": 1.5418035984039307, + "learning_rate": 2e-05, + "loss": 0.04447289, + "step": 7540 + }, + { + "epoch": 15.082, + "grad_norm": 1.7850476503372192, + "learning_rate": 2e-05, + "loss": 0.05204378, + "step": 7541 + }, + { + "epoch": 15.084, + "grad_norm": 1.7155487537384033, + "learning_rate": 2e-05, + "loss": 0.04036555, + "step": 7542 + }, + { + "epoch": 15.086, + "grad_norm": 1.1043587923049927, + "learning_rate": 2e-05, + "loss": 0.03954956, + "step": 7543 + }, + { + "epoch": 15.088, + "grad_norm": 1.4283198118209839, + "learning_rate": 2e-05, + "loss": 0.04688943, + "step": 7544 + }, + { + "epoch": 15.09, + "grad_norm": 1.2153453826904297, + "learning_rate": 2e-05, + "loss": 0.03587115, + "step": 7545 + }, + { + "epoch": 15.092, + "grad_norm": 1.3699722290039062, + "learning_rate": 2e-05, + "loss": 0.04313603, + "step": 7546 + }, + { + "epoch": 15.094, + "grad_norm": 1.0918267965316772, + "learning_rate": 2e-05, + "loss": 0.03959726, + "step": 7547 + }, + { + "epoch": 15.096, + "grad_norm": 1.2050516605377197, + "learning_rate": 2e-05, + "loss": 0.03665368, + "step": 7548 + }, + { + "epoch": 15.098, + "grad_norm": 1.611580729484558, + "learning_rate": 2e-05, + "loss": 0.04823206, + "step": 7549 + }, + { + "epoch": 15.1, + "grad_norm": 1.3868486881256104, + "learning_rate": 2e-05, + "loss": 0.04774088, + "step": 7550 + }, + { + "epoch": 15.102, + "grad_norm": 
2.077451229095459, + "learning_rate": 2e-05, + "loss": 0.03221234, + "step": 7551 + }, + { + "epoch": 15.104, + "grad_norm": 1.458793044090271, + "learning_rate": 2e-05, + "loss": 0.02752698, + "step": 7552 + }, + { + "epoch": 15.106, + "grad_norm": 1.0230228900909424, + "learning_rate": 2e-05, + "loss": 0.03414164, + "step": 7553 + }, + { + "epoch": 15.108, + "grad_norm": 1.8648970127105713, + "learning_rate": 2e-05, + "loss": 0.04431803, + "step": 7554 + }, + { + "epoch": 15.11, + "grad_norm": 1.213136076927185, + "learning_rate": 2e-05, + "loss": 0.03528218, + "step": 7555 + }, + { + "epoch": 15.112, + "grad_norm": 1.1067938804626465, + "learning_rate": 2e-05, + "loss": 0.03371948, + "step": 7556 + }, + { + "epoch": 15.114, + "grad_norm": 1.099187970161438, + "learning_rate": 2e-05, + "loss": 0.04380881, + "step": 7557 + }, + { + "epoch": 15.116, + "grad_norm": 1.4904146194458008, + "learning_rate": 2e-05, + "loss": 0.04857348, + "step": 7558 + }, + { + "epoch": 15.118, + "grad_norm": 1.9208067655563354, + "learning_rate": 2e-05, + "loss": 0.02925631, + "step": 7559 + }, + { + "epoch": 15.12, + "grad_norm": 1.3808106184005737, + "learning_rate": 2e-05, + "loss": 0.05164766, + "step": 7560 + }, + { + "epoch": 15.122, + "grad_norm": 1.7450789213180542, + "learning_rate": 2e-05, + "loss": 0.05939767, + "step": 7561 + }, + { + "epoch": 15.124, + "grad_norm": 1.3187388181686401, + "learning_rate": 2e-05, + "loss": 0.04064866, + "step": 7562 + }, + { + "epoch": 15.126, + "grad_norm": 2.158761501312256, + "learning_rate": 2e-05, + "loss": 0.03640309, + "step": 7563 + }, + { + "epoch": 15.128, + "grad_norm": 1.668043613433838, + "learning_rate": 2e-05, + "loss": 0.03924328, + "step": 7564 + }, + { + "epoch": 15.13, + "grad_norm": 1.2158105373382568, + "learning_rate": 2e-05, + "loss": 0.03478661, + "step": 7565 + }, + { + "epoch": 15.132, + "grad_norm": 1.681146264076233, + "learning_rate": 2e-05, + "loss": 0.05600887, + "step": 7566 + }, + { + "epoch": 15.134, + "grad_norm": 0.9708520770072937, + "learning_rate": 2e-05, + "loss": 0.01993345, + "step": 7567 + }, + { + "epoch": 15.136, + "grad_norm": 0.926152765750885, + "learning_rate": 2e-05, + "loss": 0.03074206, + "step": 7568 + }, + { + "epoch": 15.138, + "grad_norm": 1.0853475332260132, + "learning_rate": 2e-05, + "loss": 0.03965416, + "step": 7569 + }, + { + "epoch": 15.14, + "grad_norm": 1.4762166738510132, + "learning_rate": 2e-05, + "loss": 0.0375259, + "step": 7570 + }, + { + "epoch": 15.142, + "grad_norm": 1.307599425315857, + "learning_rate": 2e-05, + "loss": 0.04117063, + "step": 7571 + }, + { + "epoch": 15.144, + "grad_norm": 1.364780068397522, + "learning_rate": 2e-05, + "loss": 0.03955507, + "step": 7572 + }, + { + "epoch": 15.146, + "grad_norm": 1.8140677213668823, + "learning_rate": 2e-05, + "loss": 0.04545042, + "step": 7573 + }, + { + "epoch": 15.148, + "grad_norm": 1.234650731086731, + "learning_rate": 2e-05, + "loss": 0.03562243, + "step": 7574 + }, + { + "epoch": 15.15, + "grad_norm": 1.4778765439987183, + "learning_rate": 2e-05, + "loss": 0.05000856, + "step": 7575 + }, + { + "epoch": 15.152, + "grad_norm": 1.323702096939087, + "learning_rate": 2e-05, + "loss": 0.02919011, + "step": 7576 + }, + { + "epoch": 15.154, + "grad_norm": 0.9931415319442749, + "learning_rate": 2e-05, + "loss": 0.03235832, + "step": 7577 + }, + { + "epoch": 15.156, + "grad_norm": 1.086990237236023, + "learning_rate": 2e-05, + "loss": 0.02611094, + "step": 7578 + }, + { + "epoch": 15.158, + "grad_norm": 1.284932255744934, + "learning_rate": 2e-05, 
+ "loss": 0.04641385, + "step": 7579 + }, + { + "epoch": 15.16, + "grad_norm": 0.9622644782066345, + "learning_rate": 2e-05, + "loss": 0.02520053, + "step": 7580 + }, + { + "epoch": 15.162, + "grad_norm": 1.0503311157226562, + "learning_rate": 2e-05, + "loss": 0.03852223, + "step": 7581 + }, + { + "epoch": 15.164, + "grad_norm": 1.5426743030548096, + "learning_rate": 2e-05, + "loss": 0.04453558, + "step": 7582 + }, + { + "epoch": 15.166, + "grad_norm": 1.0924078226089478, + "learning_rate": 2e-05, + "loss": 0.03900549, + "step": 7583 + }, + { + "epoch": 15.168, + "grad_norm": 1.5701236724853516, + "learning_rate": 2e-05, + "loss": 0.03460903, + "step": 7584 + }, + { + "epoch": 15.17, + "grad_norm": 1.7909417152404785, + "learning_rate": 2e-05, + "loss": 0.02750588, + "step": 7585 + }, + { + "epoch": 15.172, + "grad_norm": 3.5445761680603027, + "learning_rate": 2e-05, + "loss": 0.05145767, + "step": 7586 + }, + { + "epoch": 15.174, + "grad_norm": 1.7344753742218018, + "learning_rate": 2e-05, + "loss": 0.04307804, + "step": 7587 + }, + { + "epoch": 15.176, + "grad_norm": 1.1254594326019287, + "learning_rate": 2e-05, + "loss": 0.03898653, + "step": 7588 + }, + { + "epoch": 15.178, + "grad_norm": 1.2491363286972046, + "learning_rate": 2e-05, + "loss": 0.0345161, + "step": 7589 + }, + { + "epoch": 15.18, + "grad_norm": 1.5165410041809082, + "learning_rate": 2e-05, + "loss": 0.04044965, + "step": 7590 + }, + { + "epoch": 15.182, + "grad_norm": 1.306868553161621, + "learning_rate": 2e-05, + "loss": 0.04725231, + "step": 7591 + }, + { + "epoch": 15.184, + "grad_norm": 1.8153162002563477, + "learning_rate": 2e-05, + "loss": 0.0434998, + "step": 7592 + }, + { + "epoch": 15.186, + "grad_norm": 2.0734148025512695, + "learning_rate": 2e-05, + "loss": 0.04822708, + "step": 7593 + }, + { + "epoch": 15.188, + "grad_norm": 1.193581223487854, + "learning_rate": 2e-05, + "loss": 0.04005368, + "step": 7594 + }, + { + "epoch": 15.19, + "grad_norm": 1.9884793758392334, + "learning_rate": 2e-05, + "loss": 0.03887493, + "step": 7595 + }, + { + "epoch": 15.192, + "grad_norm": 2.539429187774658, + "learning_rate": 2e-05, + "loss": 0.03816453, + "step": 7596 + }, + { + "epoch": 15.194, + "grad_norm": 1.7256981134414673, + "learning_rate": 2e-05, + "loss": 0.02610845, + "step": 7597 + }, + { + "epoch": 15.196, + "grad_norm": 0.9873558282852173, + "learning_rate": 2e-05, + "loss": 0.02836427, + "step": 7598 + }, + { + "epoch": 15.198, + "grad_norm": 3.33852481842041, + "learning_rate": 2e-05, + "loss": 0.06002682, + "step": 7599 + }, + { + "epoch": 15.2, + "grad_norm": 2.537189483642578, + "learning_rate": 2e-05, + "loss": 0.07417429, + "step": 7600 + }, + { + "epoch": 15.202, + "grad_norm": 0.9124858379364014, + "learning_rate": 2e-05, + "loss": 0.03152982, + "step": 7601 + }, + { + "epoch": 15.204, + "grad_norm": 1.0165115594863892, + "learning_rate": 2e-05, + "loss": 0.03425337, + "step": 7602 + }, + { + "epoch": 15.206, + "grad_norm": 1.362256407737732, + "learning_rate": 2e-05, + "loss": 0.04571758, + "step": 7603 + }, + { + "epoch": 15.208, + "grad_norm": 1.2816169261932373, + "learning_rate": 2e-05, + "loss": 0.04699683, + "step": 7604 + }, + { + "epoch": 15.21, + "grad_norm": 1.6668884754180908, + "learning_rate": 2e-05, + "loss": 0.03962279, + "step": 7605 + }, + { + "epoch": 15.212, + "grad_norm": 1.2966806888580322, + "learning_rate": 2e-05, + "loss": 0.04509297, + "step": 7606 + }, + { + "epoch": 15.214, + "grad_norm": 1.191419243812561, + "learning_rate": 2e-05, + "loss": 0.04549539, + "step": 7607 + }, + 
{ + "epoch": 15.216, + "grad_norm": 0.941321611404419, + "learning_rate": 2e-05, + "loss": 0.02918254, + "step": 7608 + }, + { + "epoch": 15.218, + "grad_norm": 1.7886515855789185, + "learning_rate": 2e-05, + "loss": 0.04620847, + "step": 7609 + }, + { + "epoch": 15.22, + "grad_norm": 1.8236616849899292, + "learning_rate": 2e-05, + "loss": 0.04852483, + "step": 7610 + }, + { + "epoch": 15.222, + "grad_norm": 1.224780797958374, + "learning_rate": 2e-05, + "loss": 0.0375507, + "step": 7611 + }, + { + "epoch": 15.224, + "grad_norm": 1.335411548614502, + "learning_rate": 2e-05, + "loss": 0.04444863, + "step": 7612 + }, + { + "epoch": 15.226, + "grad_norm": 1.0695840120315552, + "learning_rate": 2e-05, + "loss": 0.03443304, + "step": 7613 + }, + { + "epoch": 15.228, + "grad_norm": 1.1740689277648926, + "learning_rate": 2e-05, + "loss": 0.03335565, + "step": 7614 + }, + { + "epoch": 15.23, + "grad_norm": 2.6707682609558105, + "learning_rate": 2e-05, + "loss": 0.05265085, + "step": 7615 + }, + { + "epoch": 15.232, + "grad_norm": 4.135421276092529, + "learning_rate": 2e-05, + "loss": 0.0563576, + "step": 7616 + }, + { + "epoch": 15.234, + "grad_norm": 1.716046690940857, + "learning_rate": 2e-05, + "loss": 0.04460949, + "step": 7617 + }, + { + "epoch": 15.236, + "grad_norm": 1.721646785736084, + "learning_rate": 2e-05, + "loss": 0.04767875, + "step": 7618 + }, + { + "epoch": 15.238, + "grad_norm": 1.284906029701233, + "learning_rate": 2e-05, + "loss": 0.01877031, + "step": 7619 + }, + { + "epoch": 15.24, + "grad_norm": 1.517042636871338, + "learning_rate": 2e-05, + "loss": 0.04870673, + "step": 7620 + }, + { + "epoch": 15.242, + "grad_norm": 0.9522220492362976, + "learning_rate": 2e-05, + "loss": 0.02781921, + "step": 7621 + }, + { + "epoch": 15.244, + "grad_norm": 1.4575544595718384, + "learning_rate": 2e-05, + "loss": 0.04900608, + "step": 7622 + }, + { + "epoch": 15.246, + "grad_norm": 2.664926052093506, + "learning_rate": 2e-05, + "loss": 0.05986939, + "step": 7623 + }, + { + "epoch": 15.248, + "grad_norm": 1.0192816257476807, + "learning_rate": 2e-05, + "loss": 0.03001048, + "step": 7624 + }, + { + "epoch": 15.25, + "grad_norm": 2.431614637374878, + "learning_rate": 2e-05, + "loss": 0.05669653, + "step": 7625 + }, + { + "epoch": 15.252, + "grad_norm": 1.4480475187301636, + "learning_rate": 2e-05, + "loss": 0.0410952, + "step": 7626 + }, + { + "epoch": 15.254, + "grad_norm": 1.0749616622924805, + "learning_rate": 2e-05, + "loss": 0.03243171, + "step": 7627 + }, + { + "epoch": 15.256, + "grad_norm": 1.4723212718963623, + "learning_rate": 2e-05, + "loss": 0.04396664, + "step": 7628 + }, + { + "epoch": 15.258, + "grad_norm": 1.4612725973129272, + "learning_rate": 2e-05, + "loss": 0.05846308, + "step": 7629 + }, + { + "epoch": 15.26, + "grad_norm": 1.0961060523986816, + "learning_rate": 2e-05, + "loss": 0.03180789, + "step": 7630 + }, + { + "epoch": 15.262, + "grad_norm": 1.0098501443862915, + "learning_rate": 2e-05, + "loss": 0.02932501, + "step": 7631 + }, + { + "epoch": 15.264, + "grad_norm": 1.500989317893982, + "learning_rate": 2e-05, + "loss": 0.05016597, + "step": 7632 + }, + { + "epoch": 15.266, + "grad_norm": 1.462710976600647, + "learning_rate": 2e-05, + "loss": 0.04754984, + "step": 7633 + }, + { + "epoch": 15.268, + "grad_norm": 1.3882156610488892, + "learning_rate": 2e-05, + "loss": 0.04774838, + "step": 7634 + }, + { + "epoch": 15.27, + "grad_norm": 1.2340351343154907, + "learning_rate": 2e-05, + "loss": 0.05167802, + "step": 7635 + }, + { + "epoch": 15.272, + "grad_norm": 
1.213295817375183, + "learning_rate": 2e-05, + "loss": 0.04757159, + "step": 7636 + }, + { + "epoch": 15.274000000000001, + "grad_norm": 1.2619282007217407, + "learning_rate": 2e-05, + "loss": 0.05256531, + "step": 7637 + }, + { + "epoch": 15.276, + "grad_norm": 1.552124261856079, + "learning_rate": 2e-05, + "loss": 0.04303239, + "step": 7638 + }, + { + "epoch": 15.278, + "grad_norm": 1.615976333618164, + "learning_rate": 2e-05, + "loss": 0.05144659, + "step": 7639 + }, + { + "epoch": 15.28, + "grad_norm": 1.2104065418243408, + "learning_rate": 2e-05, + "loss": 0.02734979, + "step": 7640 + }, + { + "epoch": 15.282, + "grad_norm": 2.24019455909729, + "learning_rate": 2e-05, + "loss": 0.04673993, + "step": 7641 + }, + { + "epoch": 15.284, + "grad_norm": 1.7466453313827515, + "learning_rate": 2e-05, + "loss": 0.03623344, + "step": 7642 + }, + { + "epoch": 15.286, + "grad_norm": 1.443344235420227, + "learning_rate": 2e-05, + "loss": 0.06160107, + "step": 7643 + }, + { + "epoch": 15.288, + "grad_norm": 0.9850043654441833, + "learning_rate": 2e-05, + "loss": 0.03402284, + "step": 7644 + }, + { + "epoch": 15.29, + "grad_norm": 1.4699482917785645, + "learning_rate": 2e-05, + "loss": 0.04960431, + "step": 7645 + }, + { + "epoch": 15.292, + "grad_norm": 0.8527708053588867, + "learning_rate": 2e-05, + "loss": 0.02158963, + "step": 7646 + }, + { + "epoch": 15.294, + "grad_norm": 0.7715446352958679, + "learning_rate": 2e-05, + "loss": 0.02164961, + "step": 7647 + }, + { + "epoch": 15.296, + "grad_norm": 1.1812517642974854, + "learning_rate": 2e-05, + "loss": 0.04358544, + "step": 7648 + }, + { + "epoch": 15.298, + "grad_norm": 1.6417049169540405, + "learning_rate": 2e-05, + "loss": 0.05505785, + "step": 7649 + }, + { + "epoch": 15.3, + "grad_norm": 0.9022408723831177, + "learning_rate": 2e-05, + "loss": 0.02869208, + "step": 7650 + }, + { + "epoch": 15.302, + "grad_norm": 1.2955352067947388, + "learning_rate": 2e-05, + "loss": 0.05353476, + "step": 7651 + }, + { + "epoch": 15.304, + "grad_norm": 2.33793044090271, + "learning_rate": 2e-05, + "loss": 0.02959348, + "step": 7652 + }, + { + "epoch": 15.306, + "grad_norm": 1.1095025539398193, + "learning_rate": 2e-05, + "loss": 0.03602049, + "step": 7653 + }, + { + "epoch": 15.308, + "grad_norm": 1.6043916940689087, + "learning_rate": 2e-05, + "loss": 0.03583729, + "step": 7654 + }, + { + "epoch": 15.31, + "grad_norm": 1.0492905378341675, + "learning_rate": 2e-05, + "loss": 0.03110175, + "step": 7655 + }, + { + "epoch": 15.312, + "grad_norm": 1.7085450887680054, + "learning_rate": 2e-05, + "loss": 0.04954446, + "step": 7656 + }, + { + "epoch": 15.314, + "grad_norm": 1.9505372047424316, + "learning_rate": 2e-05, + "loss": 0.03658181, + "step": 7657 + }, + { + "epoch": 15.316, + "grad_norm": 0.9936202168464661, + "learning_rate": 2e-05, + "loss": 0.02535901, + "step": 7658 + }, + { + "epoch": 15.318, + "grad_norm": 1.0207105875015259, + "learning_rate": 2e-05, + "loss": 0.03539135, + "step": 7659 + }, + { + "epoch": 15.32, + "grad_norm": 1.5546672344207764, + "learning_rate": 2e-05, + "loss": 0.04938481, + "step": 7660 + }, + { + "epoch": 15.322, + "grad_norm": 1.247316837310791, + "learning_rate": 2e-05, + "loss": 0.03931482, + "step": 7661 + }, + { + "epoch": 15.324, + "grad_norm": 1.2438799142837524, + "learning_rate": 2e-05, + "loss": 0.02923568, + "step": 7662 + }, + { + "epoch": 15.326, + "grad_norm": 1.87889564037323, + "learning_rate": 2e-05, + "loss": 0.04272472, + "step": 7663 + }, + { + "epoch": 15.328, + "grad_norm": 1.0760419368743896, + 
"learning_rate": 2e-05, + "loss": 0.03573871, + "step": 7664 + }, + { + "epoch": 15.33, + "grad_norm": 2.030569314956665, + "learning_rate": 2e-05, + "loss": 0.04339175, + "step": 7665 + }, + { + "epoch": 15.332, + "grad_norm": 1.7053855657577515, + "learning_rate": 2e-05, + "loss": 0.04916566, + "step": 7666 + }, + { + "epoch": 15.334, + "grad_norm": 1.4660038948059082, + "learning_rate": 2e-05, + "loss": 0.03246354, + "step": 7667 + }, + { + "epoch": 15.336, + "grad_norm": 1.257335901260376, + "learning_rate": 2e-05, + "loss": 0.04967248, + "step": 7668 + }, + { + "epoch": 15.338, + "grad_norm": 1.4573662281036377, + "learning_rate": 2e-05, + "loss": 0.04021806, + "step": 7669 + }, + { + "epoch": 15.34, + "grad_norm": 1.271093487739563, + "learning_rate": 2e-05, + "loss": 0.04290817, + "step": 7670 + }, + { + "epoch": 15.342, + "grad_norm": 0.9368219971656799, + "learning_rate": 2e-05, + "loss": 0.0271676, + "step": 7671 + }, + { + "epoch": 15.344, + "grad_norm": 1.1794713735580444, + "learning_rate": 2e-05, + "loss": 0.03208122, + "step": 7672 + }, + { + "epoch": 15.346, + "grad_norm": 0.9926052689552307, + "learning_rate": 2e-05, + "loss": 0.02915346, + "step": 7673 + }, + { + "epoch": 15.348, + "grad_norm": 1.1521005630493164, + "learning_rate": 2e-05, + "loss": 0.03530318, + "step": 7674 + }, + { + "epoch": 15.35, + "grad_norm": 1.2285836935043335, + "learning_rate": 2e-05, + "loss": 0.03318744, + "step": 7675 + }, + { + "epoch": 15.352, + "grad_norm": 2.876605987548828, + "learning_rate": 2e-05, + "loss": 0.02903116, + "step": 7676 + }, + { + "epoch": 15.354, + "grad_norm": 2.023113489151001, + "learning_rate": 2e-05, + "loss": 0.04902362, + "step": 7677 + }, + { + "epoch": 15.356, + "grad_norm": 1.7338629961013794, + "learning_rate": 2e-05, + "loss": 0.03628127, + "step": 7678 + }, + { + "epoch": 15.358, + "grad_norm": 1.0066721439361572, + "learning_rate": 2e-05, + "loss": 0.0289756, + "step": 7679 + }, + { + "epoch": 15.36, + "grad_norm": 1.7639458179473877, + "learning_rate": 2e-05, + "loss": 0.04403928, + "step": 7680 + }, + { + "epoch": 15.362, + "grad_norm": 1.4575209617614746, + "learning_rate": 2e-05, + "loss": 0.0353655, + "step": 7681 + }, + { + "epoch": 15.364, + "grad_norm": 1.1432892084121704, + "learning_rate": 2e-05, + "loss": 0.03814424, + "step": 7682 + }, + { + "epoch": 15.366, + "grad_norm": 1.755494236946106, + "learning_rate": 2e-05, + "loss": 0.0562299, + "step": 7683 + }, + { + "epoch": 15.368, + "grad_norm": 1.7338649034500122, + "learning_rate": 2e-05, + "loss": 0.03804263, + "step": 7684 + }, + { + "epoch": 15.37, + "grad_norm": 2.1905031204223633, + "learning_rate": 2e-05, + "loss": 0.03826035, + "step": 7685 + }, + { + "epoch": 15.372, + "grad_norm": 3.7067763805389404, + "learning_rate": 2e-05, + "loss": 0.05445283, + "step": 7686 + }, + { + "epoch": 15.374, + "grad_norm": 2.2767043113708496, + "learning_rate": 2e-05, + "loss": 0.06008461, + "step": 7687 + }, + { + "epoch": 15.376, + "grad_norm": 1.2579028606414795, + "learning_rate": 2e-05, + "loss": 0.03845136, + "step": 7688 + }, + { + "epoch": 15.378, + "grad_norm": 1.5080291032791138, + "learning_rate": 2e-05, + "loss": 0.04014179, + "step": 7689 + }, + { + "epoch": 15.38, + "grad_norm": 1.7374647855758667, + "learning_rate": 2e-05, + "loss": 0.03181672, + "step": 7690 + }, + { + "epoch": 15.382, + "grad_norm": 1.6790274381637573, + "learning_rate": 2e-05, + "loss": 0.03847949, + "step": 7691 + }, + { + "epoch": 15.384, + "grad_norm": 2.135026216506958, + "learning_rate": 2e-05, + "loss": 
0.04469639, + "step": 7692 + }, + { + "epoch": 15.386, + "grad_norm": 1.4211525917053223, + "learning_rate": 2e-05, + "loss": 0.04342383, + "step": 7693 + }, + { + "epoch": 15.388, + "grad_norm": 1.0718973875045776, + "learning_rate": 2e-05, + "loss": 0.03107191, + "step": 7694 + }, + { + "epoch": 15.39, + "grad_norm": 0.8555018901824951, + "learning_rate": 2e-05, + "loss": 0.02420723, + "step": 7695 + }, + { + "epoch": 15.392, + "grad_norm": 1.2442560195922852, + "learning_rate": 2e-05, + "loss": 0.04132719, + "step": 7696 + }, + { + "epoch": 15.394, + "grad_norm": 1.0570979118347168, + "learning_rate": 2e-05, + "loss": 0.03066903, + "step": 7697 + }, + { + "epoch": 15.396, + "grad_norm": 1.171759009361267, + "learning_rate": 2e-05, + "loss": 0.03119994, + "step": 7698 + }, + { + "epoch": 15.398, + "grad_norm": 1.6701782941818237, + "learning_rate": 2e-05, + "loss": 0.03852317, + "step": 7699 + }, + { + "epoch": 15.4, + "grad_norm": 1.8172271251678467, + "learning_rate": 2e-05, + "loss": 0.02544076, + "step": 7700 + }, + { + "epoch": 15.402, + "grad_norm": 1.2106478214263916, + "learning_rate": 2e-05, + "loss": 0.03819906, + "step": 7701 + }, + { + "epoch": 15.404, + "grad_norm": 2.4221479892730713, + "learning_rate": 2e-05, + "loss": 0.04959261, + "step": 7702 + }, + { + "epoch": 15.406, + "grad_norm": 1.6806089878082275, + "learning_rate": 2e-05, + "loss": 0.04468667, + "step": 7703 + }, + { + "epoch": 15.408, + "grad_norm": 1.3105989694595337, + "learning_rate": 2e-05, + "loss": 0.03792799, + "step": 7704 + }, + { + "epoch": 15.41, + "grad_norm": 1.3647056818008423, + "learning_rate": 2e-05, + "loss": 0.04311409, + "step": 7705 + }, + { + "epoch": 15.412, + "grad_norm": 1.6955467462539673, + "learning_rate": 2e-05, + "loss": 0.04090759, + "step": 7706 + }, + { + "epoch": 15.414, + "grad_norm": 1.6445177793502808, + "learning_rate": 2e-05, + "loss": 0.050005, + "step": 7707 + }, + { + "epoch": 15.416, + "grad_norm": 2.115290403366089, + "learning_rate": 2e-05, + "loss": 0.04707139, + "step": 7708 + }, + { + "epoch": 15.418, + "grad_norm": 1.0016669034957886, + "learning_rate": 2e-05, + "loss": 0.02786621, + "step": 7709 + }, + { + "epoch": 15.42, + "grad_norm": 1.5075523853302002, + "learning_rate": 2e-05, + "loss": 0.04682674, + "step": 7710 + }, + { + "epoch": 15.422, + "grad_norm": 1.5943173170089722, + "learning_rate": 2e-05, + "loss": 0.0265224, + "step": 7711 + }, + { + "epoch": 15.424, + "grad_norm": 1.5050104856491089, + "learning_rate": 2e-05, + "loss": 0.04416554, + "step": 7712 + }, + { + "epoch": 15.426, + "grad_norm": 0.9919576644897461, + "learning_rate": 2e-05, + "loss": 0.02424739, + "step": 7713 + }, + { + "epoch": 15.428, + "grad_norm": 1.0542445182800293, + "learning_rate": 2e-05, + "loss": 0.03701505, + "step": 7714 + }, + { + "epoch": 15.43, + "grad_norm": 0.9411125779151917, + "learning_rate": 2e-05, + "loss": 0.03089548, + "step": 7715 + }, + { + "epoch": 15.432, + "grad_norm": 1.2335981130599976, + "learning_rate": 2e-05, + "loss": 0.04131325, + "step": 7716 + }, + { + "epoch": 15.434, + "grad_norm": 1.460959553718567, + "learning_rate": 2e-05, + "loss": 0.03338287, + "step": 7717 + }, + { + "epoch": 15.436, + "grad_norm": 0.9841430187225342, + "learning_rate": 2e-05, + "loss": 0.03597587, + "step": 7718 + }, + { + "epoch": 15.438, + "grad_norm": 1.1647802591323853, + "learning_rate": 2e-05, + "loss": 0.03573506, + "step": 7719 + }, + { + "epoch": 15.44, + "grad_norm": 1.4322996139526367, + "learning_rate": 2e-05, + "loss": 0.04496816, + "step": 7720 + }, + { + 
"epoch": 15.442, + "grad_norm": 3.0110630989074707, + "learning_rate": 2e-05, + "loss": 0.04119972, + "step": 7721 + }, + { + "epoch": 15.444, + "grad_norm": 1.2038313150405884, + "learning_rate": 2e-05, + "loss": 0.03304677, + "step": 7722 + }, + { + "epoch": 15.446, + "grad_norm": 1.3562027215957642, + "learning_rate": 2e-05, + "loss": 0.04422673, + "step": 7723 + }, + { + "epoch": 15.448, + "grad_norm": 1.345652461051941, + "learning_rate": 2e-05, + "loss": 0.04346151, + "step": 7724 + }, + { + "epoch": 15.45, + "grad_norm": 0.9705491662025452, + "learning_rate": 2e-05, + "loss": 0.02673662, + "step": 7725 + }, + { + "epoch": 15.452, + "grad_norm": 1.6109822988510132, + "learning_rate": 2e-05, + "loss": 0.04332538, + "step": 7726 + }, + { + "epoch": 15.454, + "grad_norm": 1.4140931367874146, + "learning_rate": 2e-05, + "loss": 0.04497489, + "step": 7727 + }, + { + "epoch": 15.456, + "grad_norm": 1.2737836837768555, + "learning_rate": 2e-05, + "loss": 0.03585126, + "step": 7728 + }, + { + "epoch": 15.458, + "grad_norm": 1.0965650081634521, + "learning_rate": 2e-05, + "loss": 0.02634549, + "step": 7729 + }, + { + "epoch": 15.46, + "grad_norm": 1.3999693393707275, + "learning_rate": 2e-05, + "loss": 0.04551007, + "step": 7730 + }, + { + "epoch": 15.462, + "grad_norm": 1.013411045074463, + "learning_rate": 2e-05, + "loss": 0.02215614, + "step": 7731 + }, + { + "epoch": 15.464, + "grad_norm": 1.3173936605453491, + "learning_rate": 2e-05, + "loss": 0.04206047, + "step": 7732 + }, + { + "epoch": 15.466, + "grad_norm": 1.0382493734359741, + "learning_rate": 2e-05, + "loss": 0.02827413, + "step": 7733 + }, + { + "epoch": 15.468, + "grad_norm": 2.1286673545837402, + "learning_rate": 2e-05, + "loss": 0.04167566, + "step": 7734 + }, + { + "epoch": 15.47, + "grad_norm": 2.1657168865203857, + "learning_rate": 2e-05, + "loss": 0.04439145, + "step": 7735 + }, + { + "epoch": 15.472, + "grad_norm": 1.6205779314041138, + "learning_rate": 2e-05, + "loss": 0.04393222, + "step": 7736 + }, + { + "epoch": 15.474, + "grad_norm": 1.3515499830245972, + "learning_rate": 2e-05, + "loss": 0.04231175, + "step": 7737 + }, + { + "epoch": 15.475999999999999, + "grad_norm": 1.089202880859375, + "learning_rate": 2e-05, + "loss": 0.02547337, + "step": 7738 + }, + { + "epoch": 15.478, + "grad_norm": 1.4004868268966675, + "learning_rate": 2e-05, + "loss": 0.0451882, + "step": 7739 + }, + { + "epoch": 15.48, + "grad_norm": 1.272910714149475, + "learning_rate": 2e-05, + "loss": 0.01924961, + "step": 7740 + }, + { + "epoch": 15.482, + "grad_norm": 1.2643826007843018, + "learning_rate": 2e-05, + "loss": 0.04131962, + "step": 7741 + }, + { + "epoch": 15.484, + "grad_norm": 2.1027934551239014, + "learning_rate": 2e-05, + "loss": 0.06375703, + "step": 7742 + }, + { + "epoch": 15.486, + "grad_norm": 1.2265628576278687, + "learning_rate": 2e-05, + "loss": 0.02747299, + "step": 7743 + }, + { + "epoch": 15.488, + "grad_norm": 2.7004458904266357, + "learning_rate": 2e-05, + "loss": 0.06089799, + "step": 7744 + }, + { + "epoch": 15.49, + "grad_norm": 0.9138990044593811, + "learning_rate": 2e-05, + "loss": 0.02951707, + "step": 7745 + }, + { + "epoch": 15.492, + "grad_norm": 1.3887770175933838, + "learning_rate": 2e-05, + "loss": 0.03592655, + "step": 7746 + }, + { + "epoch": 15.494, + "grad_norm": 1.399113655090332, + "learning_rate": 2e-05, + "loss": 0.03822273, + "step": 7747 + }, + { + "epoch": 15.496, + "grad_norm": 1.7515449523925781, + "learning_rate": 2e-05, + "loss": 0.04199392, + "step": 7748 + }, + { + "epoch": 15.498, + 
"grad_norm": 1.4153516292572021, + "learning_rate": 2e-05, + "loss": 0.04716617, + "step": 7749 + }, + { + "epoch": 15.5, + "grad_norm": 2.602876901626587, + "learning_rate": 2e-05, + "loss": 0.04844777, + "step": 7750 + }, + { + "epoch": 15.502, + "grad_norm": 1.2918318510055542, + "learning_rate": 2e-05, + "loss": 0.04199758, + "step": 7751 + }, + { + "epoch": 15.504, + "grad_norm": 0.7387645244598389, + "learning_rate": 2e-05, + "loss": 0.01926706, + "step": 7752 + }, + { + "epoch": 15.506, + "grad_norm": 1.028178095817566, + "learning_rate": 2e-05, + "loss": 0.02961706, + "step": 7753 + }, + { + "epoch": 15.508, + "grad_norm": 1.2695460319519043, + "learning_rate": 2e-05, + "loss": 0.03514853, + "step": 7754 + }, + { + "epoch": 15.51, + "grad_norm": 2.6682016849517822, + "learning_rate": 2e-05, + "loss": 0.06611276, + "step": 7755 + }, + { + "epoch": 15.512, + "grad_norm": 1.2907907962799072, + "learning_rate": 2e-05, + "loss": 0.03901846, + "step": 7756 + }, + { + "epoch": 15.514, + "grad_norm": 1.2247368097305298, + "learning_rate": 2e-05, + "loss": 0.03462955, + "step": 7757 + }, + { + "epoch": 15.516, + "grad_norm": 1.1045002937316895, + "learning_rate": 2e-05, + "loss": 0.02914516, + "step": 7758 + }, + { + "epoch": 15.518, + "grad_norm": 1.3329474925994873, + "learning_rate": 2e-05, + "loss": 0.03299643, + "step": 7759 + }, + { + "epoch": 15.52, + "grad_norm": 0.9705315828323364, + "learning_rate": 2e-05, + "loss": 0.0249468, + "step": 7760 + }, + { + "epoch": 15.522, + "grad_norm": 1.3083375692367554, + "learning_rate": 2e-05, + "loss": 0.04079135, + "step": 7761 + }, + { + "epoch": 15.524000000000001, + "grad_norm": 1.0932661294937134, + "learning_rate": 2e-05, + "loss": 0.03963382, + "step": 7762 + }, + { + "epoch": 15.526, + "grad_norm": 1.1822131872177124, + "learning_rate": 2e-05, + "loss": 0.03383718, + "step": 7763 + }, + { + "epoch": 15.528, + "grad_norm": 1.1285481452941895, + "learning_rate": 2e-05, + "loss": 0.0485891, + "step": 7764 + }, + { + "epoch": 15.53, + "grad_norm": 2.6022520065307617, + "learning_rate": 2e-05, + "loss": 0.04467677, + "step": 7765 + }, + { + "epoch": 15.532, + "grad_norm": 2.1171436309814453, + "learning_rate": 2e-05, + "loss": 0.05000626, + "step": 7766 + }, + { + "epoch": 15.534, + "grad_norm": 0.948603093624115, + "learning_rate": 2e-05, + "loss": 0.02811343, + "step": 7767 + }, + { + "epoch": 15.536, + "grad_norm": 3.1038739681243896, + "learning_rate": 2e-05, + "loss": 0.04482667, + "step": 7768 + }, + { + "epoch": 15.538, + "grad_norm": 1.6356004476547241, + "learning_rate": 2e-05, + "loss": 0.05364137, + "step": 7769 + }, + { + "epoch": 15.54, + "grad_norm": 1.1389317512512207, + "learning_rate": 2e-05, + "loss": 0.03385459, + "step": 7770 + }, + { + "epoch": 15.542, + "grad_norm": 1.53994619846344, + "learning_rate": 2e-05, + "loss": 0.04406751, + "step": 7771 + }, + { + "epoch": 15.544, + "grad_norm": 1.5611374378204346, + "learning_rate": 2e-05, + "loss": 0.0473385, + "step": 7772 + }, + { + "epoch": 15.546, + "grad_norm": 1.0507432222366333, + "learning_rate": 2e-05, + "loss": 0.02707745, + "step": 7773 + }, + { + "epoch": 15.548, + "grad_norm": 1.1123173236846924, + "learning_rate": 2e-05, + "loss": 0.03449173, + "step": 7774 + }, + { + "epoch": 15.55, + "grad_norm": 1.3029831647872925, + "learning_rate": 2e-05, + "loss": 0.03552601, + "step": 7775 + }, + { + "epoch": 15.552, + "grad_norm": 1.6921583414077759, + "learning_rate": 2e-05, + "loss": 0.05067451, + "step": 7776 + }, + { + "epoch": 15.554, + "grad_norm": 
1.275075078010559, + "learning_rate": 2e-05, + "loss": 0.02784466, + "step": 7777 + }, + { + "epoch": 15.556000000000001, + "grad_norm": 1.4142465591430664, + "learning_rate": 2e-05, + "loss": 0.04801814, + "step": 7778 + }, + { + "epoch": 15.558, + "grad_norm": 1.0732700824737549, + "learning_rate": 2e-05, + "loss": 0.03766344, + "step": 7779 + }, + { + "epoch": 15.56, + "grad_norm": 1.280580997467041, + "learning_rate": 2e-05, + "loss": 0.03040556, + "step": 7780 + }, + { + "epoch": 15.562, + "grad_norm": 1.8353358507156372, + "learning_rate": 2e-05, + "loss": 0.05565303, + "step": 7781 + }, + { + "epoch": 15.564, + "grad_norm": 1.4312429428100586, + "learning_rate": 2e-05, + "loss": 0.03630222, + "step": 7782 + }, + { + "epoch": 15.566, + "grad_norm": 0.9180557131767273, + "learning_rate": 2e-05, + "loss": 0.03424197, + "step": 7783 + }, + { + "epoch": 15.568, + "grad_norm": 1.8518563508987427, + "learning_rate": 2e-05, + "loss": 0.05018555, + "step": 7784 + }, + { + "epoch": 15.57, + "grad_norm": 1.6587504148483276, + "learning_rate": 2e-05, + "loss": 0.05404566, + "step": 7785 + }, + { + "epoch": 15.572, + "grad_norm": 1.2998889684677124, + "learning_rate": 2e-05, + "loss": 0.03402172, + "step": 7786 + }, + { + "epoch": 15.574, + "grad_norm": 1.43665611743927, + "learning_rate": 2e-05, + "loss": 0.03349872, + "step": 7787 + }, + { + "epoch": 15.576, + "grad_norm": 1.3342171907424927, + "learning_rate": 2e-05, + "loss": 0.04050234, + "step": 7788 + }, + { + "epoch": 15.578, + "grad_norm": 1.462536096572876, + "learning_rate": 2e-05, + "loss": 0.03564973, + "step": 7789 + }, + { + "epoch": 15.58, + "grad_norm": 1.421918511390686, + "learning_rate": 2e-05, + "loss": 0.02840041, + "step": 7790 + }, + { + "epoch": 15.582, + "grad_norm": 2.16733980178833, + "learning_rate": 2e-05, + "loss": 0.04823796, + "step": 7791 + }, + { + "epoch": 15.584, + "grad_norm": 1.9504356384277344, + "learning_rate": 2e-05, + "loss": 0.04157623, + "step": 7792 + }, + { + "epoch": 15.586, + "grad_norm": 2.9325337409973145, + "learning_rate": 2e-05, + "loss": 0.03892434, + "step": 7793 + }, + { + "epoch": 15.588, + "grad_norm": 1.5955098867416382, + "learning_rate": 2e-05, + "loss": 0.04307658, + "step": 7794 + }, + { + "epoch": 15.59, + "grad_norm": 1.1761583089828491, + "learning_rate": 2e-05, + "loss": 0.03487573, + "step": 7795 + }, + { + "epoch": 15.592, + "grad_norm": 1.8221702575683594, + "learning_rate": 2e-05, + "loss": 0.05856226, + "step": 7796 + }, + { + "epoch": 15.594, + "grad_norm": 1.3372985124588013, + "learning_rate": 2e-05, + "loss": 0.04135559, + "step": 7797 + }, + { + "epoch": 15.596, + "grad_norm": 1.5400853157043457, + "learning_rate": 2e-05, + "loss": 0.04417802, + "step": 7798 + }, + { + "epoch": 15.598, + "grad_norm": 1.4287244081497192, + "learning_rate": 2e-05, + "loss": 0.03800631, + "step": 7799 + }, + { + "epoch": 15.6, + "grad_norm": 1.5493749380111694, + "learning_rate": 2e-05, + "loss": 0.03243243, + "step": 7800 + }, + { + "epoch": 15.602, + "grad_norm": 1.8770220279693604, + "learning_rate": 2e-05, + "loss": 0.04838708, + "step": 7801 + }, + { + "epoch": 15.604, + "grad_norm": 1.8443878889083862, + "learning_rate": 2e-05, + "loss": 0.04210856, + "step": 7802 + }, + { + "epoch": 15.606, + "grad_norm": 1.1599087715148926, + "learning_rate": 2e-05, + "loss": 0.03727244, + "step": 7803 + }, + { + "epoch": 15.608, + "grad_norm": 0.8143905997276306, + "learning_rate": 2e-05, + "loss": 0.02723217, + "step": 7804 + }, + { + "epoch": 15.61, + "grad_norm": 1.514428734779358, + 
"learning_rate": 2e-05, + "loss": 0.04333169, + "step": 7805 + }, + { + "epoch": 15.612, + "grad_norm": 1.0530762672424316, + "learning_rate": 2e-05, + "loss": 0.02669099, + "step": 7806 + }, + { + "epoch": 15.614, + "grad_norm": 1.2812447547912598, + "learning_rate": 2e-05, + "loss": 0.03541653, + "step": 7807 + }, + { + "epoch": 15.616, + "grad_norm": 0.8917434215545654, + "learning_rate": 2e-05, + "loss": 0.02033772, + "step": 7808 + }, + { + "epoch": 15.618, + "grad_norm": 1.388106346130371, + "learning_rate": 2e-05, + "loss": 0.0356602, + "step": 7809 + }, + { + "epoch": 15.62, + "grad_norm": 1.629955530166626, + "learning_rate": 2e-05, + "loss": 0.04595727, + "step": 7810 + }, + { + "epoch": 15.622, + "grad_norm": 1.964969277381897, + "learning_rate": 2e-05, + "loss": 0.04757115, + "step": 7811 + }, + { + "epoch": 15.624, + "grad_norm": 1.00034761428833, + "learning_rate": 2e-05, + "loss": 0.02988995, + "step": 7812 + }, + { + "epoch": 15.626, + "grad_norm": 1.4420907497406006, + "learning_rate": 2e-05, + "loss": 0.03713204, + "step": 7813 + }, + { + "epoch": 15.628, + "grad_norm": 1.7254669666290283, + "learning_rate": 2e-05, + "loss": 0.05012753, + "step": 7814 + }, + { + "epoch": 15.63, + "grad_norm": 1.617090106010437, + "learning_rate": 2e-05, + "loss": 0.04258318, + "step": 7815 + }, + { + "epoch": 15.632, + "grad_norm": 1.8159074783325195, + "learning_rate": 2e-05, + "loss": 0.0533081, + "step": 7816 + }, + { + "epoch": 15.634, + "grad_norm": 1.0557678937911987, + "learning_rate": 2e-05, + "loss": 0.03816307, + "step": 7817 + }, + { + "epoch": 15.636, + "grad_norm": 1.1911511421203613, + "learning_rate": 2e-05, + "loss": 0.02825535, + "step": 7818 + }, + { + "epoch": 15.638, + "grad_norm": 1.1290611028671265, + "learning_rate": 2e-05, + "loss": 0.03365977, + "step": 7819 + }, + { + "epoch": 15.64, + "grad_norm": 1.4087905883789062, + "learning_rate": 2e-05, + "loss": 0.03113106, + "step": 7820 + }, + { + "epoch": 15.642, + "grad_norm": 1.3932369947433472, + "learning_rate": 2e-05, + "loss": 0.03929411, + "step": 7821 + }, + { + "epoch": 15.644, + "grad_norm": 1.611405611038208, + "learning_rate": 2e-05, + "loss": 0.05581418, + "step": 7822 + }, + { + "epoch": 15.646, + "grad_norm": 1.394471526145935, + "learning_rate": 2e-05, + "loss": 0.03460547, + "step": 7823 + }, + { + "epoch": 15.648, + "grad_norm": 1.114952564239502, + "learning_rate": 2e-05, + "loss": 0.03128414, + "step": 7824 + }, + { + "epoch": 15.65, + "grad_norm": 1.2153035402297974, + "learning_rate": 2e-05, + "loss": 0.03026392, + "step": 7825 + }, + { + "epoch": 15.652, + "grad_norm": 1.245772361755371, + "learning_rate": 2e-05, + "loss": 0.04067977, + "step": 7826 + }, + { + "epoch": 15.654, + "grad_norm": 1.2498624324798584, + "learning_rate": 2e-05, + "loss": 0.0399544, + "step": 7827 + }, + { + "epoch": 15.656, + "grad_norm": 1.4734687805175781, + "learning_rate": 2e-05, + "loss": 0.0403645, + "step": 7828 + }, + { + "epoch": 15.658, + "grad_norm": 1.184058666229248, + "learning_rate": 2e-05, + "loss": 0.03515352, + "step": 7829 + }, + { + "epoch": 15.66, + "grad_norm": 1.6777468919754028, + "learning_rate": 2e-05, + "loss": 0.04460485, + "step": 7830 + }, + { + "epoch": 15.662, + "grad_norm": 1.869699478149414, + "learning_rate": 2e-05, + "loss": 0.03454949, + "step": 7831 + }, + { + "epoch": 15.664, + "grad_norm": 1.1511545181274414, + "learning_rate": 2e-05, + "loss": 0.03649542, + "step": 7832 + }, + { + "epoch": 15.666, + "grad_norm": 1.5351507663726807, + "learning_rate": 2e-05, + "loss": 0.03351032, + 
"step": 7833 + }, + { + "epoch": 15.668, + "grad_norm": 1.0590790510177612, + "learning_rate": 2e-05, + "loss": 0.04073928, + "step": 7834 + }, + { + "epoch": 15.67, + "grad_norm": 1.9279193878173828, + "learning_rate": 2e-05, + "loss": 0.05023941, + "step": 7835 + }, + { + "epoch": 15.672, + "grad_norm": 1.389474630355835, + "learning_rate": 2e-05, + "loss": 0.0375604, + "step": 7836 + }, + { + "epoch": 15.674, + "grad_norm": 1.5739963054656982, + "learning_rate": 2e-05, + "loss": 0.05597922, + "step": 7837 + }, + { + "epoch": 15.676, + "grad_norm": 2.872504472732544, + "learning_rate": 2e-05, + "loss": 0.03245638, + "step": 7838 + }, + { + "epoch": 15.678, + "grad_norm": 1.1751714944839478, + "learning_rate": 2e-05, + "loss": 0.04733478, + "step": 7839 + }, + { + "epoch": 15.68, + "grad_norm": 1.265234351158142, + "learning_rate": 2e-05, + "loss": 0.04258277, + "step": 7840 + }, + { + "epoch": 15.682, + "grad_norm": 1.1896003484725952, + "learning_rate": 2e-05, + "loss": 0.03169346, + "step": 7841 + }, + { + "epoch": 15.684, + "grad_norm": 1.7013142108917236, + "learning_rate": 2e-05, + "loss": 0.05300517, + "step": 7842 + }, + { + "epoch": 15.686, + "grad_norm": 1.666005253791809, + "learning_rate": 2e-05, + "loss": 0.04027229, + "step": 7843 + }, + { + "epoch": 15.688, + "grad_norm": 1.2857798337936401, + "learning_rate": 2e-05, + "loss": 0.02931785, + "step": 7844 + }, + { + "epoch": 15.69, + "grad_norm": 1.989986777305603, + "learning_rate": 2e-05, + "loss": 0.06056058, + "step": 7845 + }, + { + "epoch": 15.692, + "grad_norm": 1.0686649084091187, + "learning_rate": 2e-05, + "loss": 0.03009918, + "step": 7846 + }, + { + "epoch": 15.693999999999999, + "grad_norm": 1.2238051891326904, + "learning_rate": 2e-05, + "loss": 0.03506152, + "step": 7847 + }, + { + "epoch": 15.696, + "grad_norm": 1.7078624963760376, + "learning_rate": 2e-05, + "loss": 0.04438978, + "step": 7848 + }, + { + "epoch": 15.698, + "grad_norm": 2.4543190002441406, + "learning_rate": 2e-05, + "loss": 0.03001017, + "step": 7849 + }, + { + "epoch": 15.7, + "grad_norm": 1.3647072315216064, + "learning_rate": 2e-05, + "loss": 0.04168668, + "step": 7850 + }, + { + "epoch": 15.702, + "grad_norm": 1.3066121339797974, + "learning_rate": 2e-05, + "loss": 0.04153648, + "step": 7851 + }, + { + "epoch": 15.704, + "grad_norm": 1.0811482667922974, + "learning_rate": 2e-05, + "loss": 0.03107862, + "step": 7852 + }, + { + "epoch": 15.706, + "grad_norm": 3.0416204929351807, + "learning_rate": 2e-05, + "loss": 0.05520962, + "step": 7853 + }, + { + "epoch": 15.708, + "grad_norm": 2.0119881629943848, + "learning_rate": 2e-05, + "loss": 0.03293385, + "step": 7854 + }, + { + "epoch": 15.71, + "grad_norm": 1.3695378303527832, + "learning_rate": 2e-05, + "loss": 0.03456626, + "step": 7855 + }, + { + "epoch": 15.712, + "grad_norm": 1.410203218460083, + "learning_rate": 2e-05, + "loss": 0.0467374, + "step": 7856 + }, + { + "epoch": 15.714, + "grad_norm": 0.9683268666267395, + "learning_rate": 2e-05, + "loss": 0.02218588, + "step": 7857 + }, + { + "epoch": 15.716, + "grad_norm": 1.1725350618362427, + "learning_rate": 2e-05, + "loss": 0.0399307, + "step": 7858 + }, + { + "epoch": 15.718, + "grad_norm": 1.156903862953186, + "learning_rate": 2e-05, + "loss": 0.04084864, + "step": 7859 + }, + { + "epoch": 15.72, + "grad_norm": 1.099974274635315, + "learning_rate": 2e-05, + "loss": 0.03805087, + "step": 7860 + }, + { + "epoch": 15.722, + "grad_norm": 0.9585137367248535, + "learning_rate": 2e-05, + "loss": 0.02989551, + "step": 7861 + }, + { + "epoch": 
15.724, + "grad_norm": 0.9844270348548889, + "learning_rate": 2e-05, + "loss": 0.02908732, + "step": 7862 + }, + { + "epoch": 15.725999999999999, + "grad_norm": 1.854518175125122, + "learning_rate": 2e-05, + "loss": 0.04460219, + "step": 7863 + }, + { + "epoch": 15.728, + "grad_norm": 1.2902519702911377, + "learning_rate": 2e-05, + "loss": 0.0421297, + "step": 7864 + }, + { + "epoch": 15.73, + "grad_norm": 0.9230936169624329, + "learning_rate": 2e-05, + "loss": 0.02882043, + "step": 7865 + }, + { + "epoch": 15.732, + "grad_norm": 1.5599287748336792, + "learning_rate": 2e-05, + "loss": 0.03928513, + "step": 7866 + }, + { + "epoch": 15.734, + "grad_norm": 1.1548335552215576, + "learning_rate": 2e-05, + "loss": 0.03820485, + "step": 7867 + }, + { + "epoch": 15.736, + "grad_norm": 1.2006397247314453, + "learning_rate": 2e-05, + "loss": 0.02806983, + "step": 7868 + }, + { + "epoch": 15.738, + "grad_norm": 2.920689582824707, + "learning_rate": 2e-05, + "loss": 0.0410319, + "step": 7869 + }, + { + "epoch": 15.74, + "grad_norm": 1.2876091003417969, + "learning_rate": 2e-05, + "loss": 0.03893904, + "step": 7870 + }, + { + "epoch": 15.742, + "grad_norm": 1.2860698699951172, + "learning_rate": 2e-05, + "loss": 0.03412844, + "step": 7871 + }, + { + "epoch": 15.744, + "grad_norm": 1.402205467224121, + "learning_rate": 2e-05, + "loss": 0.04057181, + "step": 7872 + }, + { + "epoch": 15.746, + "grad_norm": 1.643515706062317, + "learning_rate": 2e-05, + "loss": 0.03522312, + "step": 7873 + }, + { + "epoch": 15.748, + "grad_norm": 1.7808668613433838, + "learning_rate": 2e-05, + "loss": 0.051851, + "step": 7874 + }, + { + "epoch": 15.75, + "grad_norm": 1.259395956993103, + "learning_rate": 2e-05, + "loss": 0.03302209, + "step": 7875 + }, + { + "epoch": 15.752, + "grad_norm": 1.9360871315002441, + "learning_rate": 2e-05, + "loss": 0.03717834, + "step": 7876 + }, + { + "epoch": 15.754, + "grad_norm": 1.372341513633728, + "learning_rate": 2e-05, + "loss": 0.04391986, + "step": 7877 + }, + { + "epoch": 15.756, + "grad_norm": 2.2007229328155518, + "learning_rate": 2e-05, + "loss": 0.03659112, + "step": 7878 + }, + { + "epoch": 15.758, + "grad_norm": 2.7030446529388428, + "learning_rate": 2e-05, + "loss": 0.05342212, + "step": 7879 + }, + { + "epoch": 15.76, + "grad_norm": 1.5437792539596558, + "learning_rate": 2e-05, + "loss": 0.03884444, + "step": 7880 + }, + { + "epoch": 15.762, + "grad_norm": 0.9608680605888367, + "learning_rate": 2e-05, + "loss": 0.0303247, + "step": 7881 + }, + { + "epoch": 15.764, + "grad_norm": 1.3708651065826416, + "learning_rate": 2e-05, + "loss": 0.03259217, + "step": 7882 + }, + { + "epoch": 15.766, + "grad_norm": 1.9985793828964233, + "learning_rate": 2e-05, + "loss": 0.04025662, + "step": 7883 + }, + { + "epoch": 15.768, + "grad_norm": 1.3265514373779297, + "learning_rate": 2e-05, + "loss": 0.04500604, + "step": 7884 + }, + { + "epoch": 15.77, + "grad_norm": 1.3119704723358154, + "learning_rate": 2e-05, + "loss": 0.03770127, + "step": 7885 + }, + { + "epoch": 15.772, + "grad_norm": 1.0361597537994385, + "learning_rate": 2e-05, + "loss": 0.02482234, + "step": 7886 + }, + { + "epoch": 15.774000000000001, + "grad_norm": 1.0766968727111816, + "learning_rate": 2e-05, + "loss": 0.03262966, + "step": 7887 + }, + { + "epoch": 15.776, + "grad_norm": 1.372813105583191, + "learning_rate": 2e-05, + "loss": 0.04390799, + "step": 7888 + }, + { + "epoch": 15.778, + "grad_norm": 1.996690273284912, + "learning_rate": 2e-05, + "loss": 0.04705422, + "step": 7889 + }, + { + "epoch": 15.78, + 
"grad_norm": 1.3125027418136597, + "learning_rate": 2e-05, + "loss": 0.05340045, + "step": 7890 + }, + { + "epoch": 15.782, + "grad_norm": 1.0259758234024048, + "learning_rate": 2e-05, + "loss": 0.03187435, + "step": 7891 + }, + { + "epoch": 15.784, + "grad_norm": 1.176693320274353, + "learning_rate": 2e-05, + "loss": 0.03257523, + "step": 7892 + }, + { + "epoch": 15.786, + "grad_norm": 0.9599425196647644, + "learning_rate": 2e-05, + "loss": 0.02640433, + "step": 7893 + }, + { + "epoch": 15.788, + "grad_norm": 1.4598140716552734, + "learning_rate": 2e-05, + "loss": 0.03303267, + "step": 7894 + }, + { + "epoch": 15.79, + "grad_norm": 1.3222160339355469, + "learning_rate": 2e-05, + "loss": 0.03670667, + "step": 7895 + }, + { + "epoch": 15.792, + "grad_norm": 0.9463991522789001, + "learning_rate": 2e-05, + "loss": 0.02554088, + "step": 7896 + }, + { + "epoch": 15.794, + "grad_norm": 1.3123785257339478, + "learning_rate": 2e-05, + "loss": 0.0461954, + "step": 7897 + }, + { + "epoch": 15.796, + "grad_norm": 1.9543800354003906, + "learning_rate": 2e-05, + "loss": 0.0520162, + "step": 7898 + }, + { + "epoch": 15.798, + "grad_norm": 1.3964611291885376, + "learning_rate": 2e-05, + "loss": 0.03359364, + "step": 7899 + }, + { + "epoch": 15.8, + "grad_norm": 1.7428091764450073, + "learning_rate": 2e-05, + "loss": 0.04074088, + "step": 7900 + }, + { + "epoch": 15.802, + "grad_norm": 1.9242103099822998, + "learning_rate": 2e-05, + "loss": 0.02629791, + "step": 7901 + }, + { + "epoch": 15.804, + "grad_norm": 2.1996188163757324, + "learning_rate": 2e-05, + "loss": 0.04311199, + "step": 7902 + }, + { + "epoch": 15.806000000000001, + "grad_norm": 1.1592961549758911, + "learning_rate": 2e-05, + "loss": 0.0427461, + "step": 7903 + }, + { + "epoch": 15.808, + "grad_norm": 1.3940585851669312, + "learning_rate": 2e-05, + "loss": 0.04456202, + "step": 7904 + }, + { + "epoch": 15.81, + "grad_norm": 1.175157070159912, + "learning_rate": 2e-05, + "loss": 0.02939461, + "step": 7905 + }, + { + "epoch": 15.812, + "grad_norm": 1.855078935623169, + "learning_rate": 2e-05, + "loss": 0.06233601, + "step": 7906 + }, + { + "epoch": 15.814, + "grad_norm": 1.4420384168624878, + "learning_rate": 2e-05, + "loss": 0.04433277, + "step": 7907 + }, + { + "epoch": 15.816, + "grad_norm": 1.6261996030807495, + "learning_rate": 2e-05, + "loss": 0.05150939, + "step": 7908 + }, + { + "epoch": 15.818, + "grad_norm": 0.9924352765083313, + "learning_rate": 2e-05, + "loss": 0.03078244, + "step": 7909 + }, + { + "epoch": 15.82, + "grad_norm": 1.1630253791809082, + "learning_rate": 2e-05, + "loss": 0.02466009, + "step": 7910 + }, + { + "epoch": 15.822, + "grad_norm": 1.1106728315353394, + "learning_rate": 2e-05, + "loss": 0.04763328, + "step": 7911 + }, + { + "epoch": 15.824, + "grad_norm": 1.9522145986557007, + "learning_rate": 2e-05, + "loss": 0.04942587, + "step": 7912 + }, + { + "epoch": 15.826, + "grad_norm": 0.9739556312561035, + "learning_rate": 2e-05, + "loss": 0.03213318, + "step": 7913 + }, + { + "epoch": 15.828, + "grad_norm": 2.2910735607147217, + "learning_rate": 2e-05, + "loss": 0.04557914, + "step": 7914 + }, + { + "epoch": 15.83, + "grad_norm": 2.2116222381591797, + "learning_rate": 2e-05, + "loss": 0.04482364, + "step": 7915 + }, + { + "epoch": 15.832, + "grad_norm": 1.637697696685791, + "learning_rate": 2e-05, + "loss": 0.03125627, + "step": 7916 + }, + { + "epoch": 15.834, + "grad_norm": 1.0995755195617676, + "learning_rate": 2e-05, + "loss": 0.03378412, + "step": 7917 + }, + { + "epoch": 15.836, + "grad_norm": 
2.1459877490997314, + "learning_rate": 2e-05, + "loss": 0.03629637, + "step": 7918 + }, + { + "epoch": 15.838, + "grad_norm": 1.2889842987060547, + "learning_rate": 2e-05, + "loss": 0.04306239, + "step": 7919 + }, + { + "epoch": 15.84, + "grad_norm": 4.523320198059082, + "learning_rate": 2e-05, + "loss": 0.04296661, + "step": 7920 + }, + { + "epoch": 15.842, + "grad_norm": 1.7894271612167358, + "learning_rate": 2e-05, + "loss": 0.03566499, + "step": 7921 + }, + { + "epoch": 15.844, + "grad_norm": 1.401320219039917, + "learning_rate": 2e-05, + "loss": 0.02614049, + "step": 7922 + }, + { + "epoch": 15.846, + "grad_norm": 1.4759875535964966, + "learning_rate": 2e-05, + "loss": 0.02819363, + "step": 7923 + }, + { + "epoch": 15.848, + "grad_norm": 1.3998092412948608, + "learning_rate": 2e-05, + "loss": 0.03906362, + "step": 7924 + }, + { + "epoch": 15.85, + "grad_norm": 1.1265228986740112, + "learning_rate": 2e-05, + "loss": 0.03560297, + "step": 7925 + }, + { + "epoch": 15.852, + "grad_norm": 2.067176342010498, + "learning_rate": 2e-05, + "loss": 0.04355159, + "step": 7926 + }, + { + "epoch": 15.854, + "grad_norm": 1.7233660221099854, + "learning_rate": 2e-05, + "loss": 0.029684, + "step": 7927 + }, + { + "epoch": 15.856, + "grad_norm": 2.1673214435577393, + "learning_rate": 2e-05, + "loss": 0.04684749, + "step": 7928 + }, + { + "epoch": 15.858, + "grad_norm": 3.047663450241089, + "learning_rate": 2e-05, + "loss": 0.04030456, + "step": 7929 + }, + { + "epoch": 15.86, + "grad_norm": 1.677051305770874, + "learning_rate": 2e-05, + "loss": 0.03150005, + "step": 7930 + }, + { + "epoch": 15.862, + "grad_norm": 1.5520917177200317, + "learning_rate": 2e-05, + "loss": 0.03935989, + "step": 7931 + }, + { + "epoch": 15.864, + "grad_norm": 2.1706817150115967, + "learning_rate": 2e-05, + "loss": 0.03663088, + "step": 7932 + }, + { + "epoch": 15.866, + "grad_norm": 1.029218316078186, + "learning_rate": 2e-05, + "loss": 0.03442633, + "step": 7933 + }, + { + "epoch": 15.868, + "grad_norm": 0.9123724699020386, + "learning_rate": 2e-05, + "loss": 0.02511028, + "step": 7934 + }, + { + "epoch": 15.87, + "grad_norm": 1.1108697652816772, + "learning_rate": 2e-05, + "loss": 0.04436678, + "step": 7935 + }, + { + "epoch": 15.872, + "grad_norm": 1.1441642045974731, + "learning_rate": 2e-05, + "loss": 0.03152635, + "step": 7936 + }, + { + "epoch": 15.874, + "grad_norm": 0.9864327907562256, + "learning_rate": 2e-05, + "loss": 0.02202858, + "step": 7937 + }, + { + "epoch": 15.876, + "grad_norm": 0.9307705760002136, + "learning_rate": 2e-05, + "loss": 0.02411098, + "step": 7938 + }, + { + "epoch": 15.878, + "grad_norm": 1.2462923526763916, + "learning_rate": 2e-05, + "loss": 0.03807527, + "step": 7939 + }, + { + "epoch": 15.88, + "grad_norm": 0.9074599146842957, + "learning_rate": 2e-05, + "loss": 0.02559845, + "step": 7940 + }, + { + "epoch": 15.882, + "grad_norm": 1.3017256259918213, + "learning_rate": 2e-05, + "loss": 0.04016859, + "step": 7941 + }, + { + "epoch": 15.884, + "grad_norm": 2.846317768096924, + "learning_rate": 2e-05, + "loss": 0.0447731, + "step": 7942 + }, + { + "epoch": 15.886, + "grad_norm": 2.893179416656494, + "learning_rate": 2e-05, + "loss": 0.05213889, + "step": 7943 + }, + { + "epoch": 15.888, + "grad_norm": 1.6494814157485962, + "learning_rate": 2e-05, + "loss": 0.0391501, + "step": 7944 + }, + { + "epoch": 15.89, + "grad_norm": 1.257524847984314, + "learning_rate": 2e-05, + "loss": 0.05023734, + "step": 7945 + }, + { + "epoch": 15.892, + "grad_norm": 1.13717782497406, + "learning_rate": 2e-05, + 
"loss": 0.0257063, + "step": 7946 + }, + { + "epoch": 15.894, + "grad_norm": 1.18727707862854, + "learning_rate": 2e-05, + "loss": 0.03686336, + "step": 7947 + }, + { + "epoch": 15.896, + "grad_norm": 1.3855704069137573, + "learning_rate": 2e-05, + "loss": 0.03288057, + "step": 7948 + }, + { + "epoch": 15.898, + "grad_norm": 1.4399868249893188, + "learning_rate": 2e-05, + "loss": 0.04080542, + "step": 7949 + }, + { + "epoch": 15.9, + "grad_norm": 1.5103329420089722, + "learning_rate": 2e-05, + "loss": 0.04284345, + "step": 7950 + }, + { + "epoch": 15.902, + "grad_norm": 1.8133221864700317, + "learning_rate": 2e-05, + "loss": 0.04338891, + "step": 7951 + }, + { + "epoch": 15.904, + "grad_norm": 1.0757697820663452, + "learning_rate": 2e-05, + "loss": 0.03184106, + "step": 7952 + }, + { + "epoch": 15.906, + "grad_norm": 1.2824019193649292, + "learning_rate": 2e-05, + "loss": 0.03566642, + "step": 7953 + }, + { + "epoch": 15.908, + "grad_norm": 1.087025761604309, + "learning_rate": 2e-05, + "loss": 0.02856821, + "step": 7954 + }, + { + "epoch": 15.91, + "grad_norm": 3.879120349884033, + "learning_rate": 2e-05, + "loss": 0.0441326, + "step": 7955 + }, + { + "epoch": 15.912, + "grad_norm": 1.081153392791748, + "learning_rate": 2e-05, + "loss": 0.04057573, + "step": 7956 + }, + { + "epoch": 15.914, + "grad_norm": 2.346388578414917, + "learning_rate": 2e-05, + "loss": 0.04326249, + "step": 7957 + }, + { + "epoch": 15.916, + "grad_norm": 1.8399007320404053, + "learning_rate": 2e-05, + "loss": 0.05309239, + "step": 7958 + }, + { + "epoch": 15.918, + "grad_norm": 1.1926047801971436, + "learning_rate": 2e-05, + "loss": 0.02422775, + "step": 7959 + }, + { + "epoch": 15.92, + "grad_norm": 2.166907787322998, + "learning_rate": 2e-05, + "loss": 0.03924531, + "step": 7960 + }, + { + "epoch": 15.922, + "grad_norm": 2.182678461074829, + "learning_rate": 2e-05, + "loss": 0.03237635, + "step": 7961 + }, + { + "epoch": 15.924, + "grad_norm": 1.284359335899353, + "learning_rate": 2e-05, + "loss": 0.03156743, + "step": 7962 + }, + { + "epoch": 15.926, + "grad_norm": 2.093848705291748, + "learning_rate": 2e-05, + "loss": 0.05261733, + "step": 7963 + }, + { + "epoch": 15.928, + "grad_norm": 2.00311541557312, + "learning_rate": 2e-05, + "loss": 0.03726339, + "step": 7964 + }, + { + "epoch": 15.93, + "grad_norm": 1.8436921834945679, + "learning_rate": 2e-05, + "loss": 0.05885259, + "step": 7965 + }, + { + "epoch": 15.932, + "grad_norm": 1.5446103811264038, + "learning_rate": 2e-05, + "loss": 0.0387977, + "step": 7966 + }, + { + "epoch": 15.934, + "grad_norm": 1.519590973854065, + "learning_rate": 2e-05, + "loss": 0.03424424, + "step": 7967 + }, + { + "epoch": 15.936, + "grad_norm": 1.6442177295684814, + "learning_rate": 2e-05, + "loss": 0.05894388, + "step": 7968 + }, + { + "epoch": 15.938, + "grad_norm": 2.041292905807495, + "learning_rate": 2e-05, + "loss": 0.04844762, + "step": 7969 + }, + { + "epoch": 15.94, + "grad_norm": 1.1668561697006226, + "learning_rate": 2e-05, + "loss": 0.03171788, + "step": 7970 + }, + { + "epoch": 15.942, + "grad_norm": 2.4591891765594482, + "learning_rate": 2e-05, + "loss": 0.05597981, + "step": 7971 + }, + { + "epoch": 15.943999999999999, + "grad_norm": 1.7161046266555786, + "learning_rate": 2e-05, + "loss": 0.04831922, + "step": 7972 + }, + { + "epoch": 15.946, + "grad_norm": 1.3476585149765015, + "learning_rate": 2e-05, + "loss": 0.03861203, + "step": 7973 + }, + { + "epoch": 15.948, + "grad_norm": 1.5570892095565796, + "learning_rate": 2e-05, + "loss": 0.04717612, + "step": 7974 + 
}, + { + "epoch": 15.95, + "grad_norm": 2.5573222637176514, + "learning_rate": 2e-05, + "loss": 0.04429121, + "step": 7975 + }, + { + "epoch": 15.952, + "grad_norm": 1.9069948196411133, + "learning_rate": 2e-05, + "loss": 0.04974307, + "step": 7976 + }, + { + "epoch": 15.954, + "grad_norm": 3.160170316696167, + "learning_rate": 2e-05, + "loss": 0.03945491, + "step": 7977 + }, + { + "epoch": 15.956, + "grad_norm": 1.0503807067871094, + "learning_rate": 2e-05, + "loss": 0.03511816, + "step": 7978 + }, + { + "epoch": 15.958, + "grad_norm": 1.0775737762451172, + "learning_rate": 2e-05, + "loss": 0.03104691, + "step": 7979 + }, + { + "epoch": 15.96, + "grad_norm": 1.1653412580490112, + "learning_rate": 2e-05, + "loss": 0.04440337, + "step": 7980 + }, + { + "epoch": 15.962, + "grad_norm": 1.088532567024231, + "learning_rate": 2e-05, + "loss": 0.04312788, + "step": 7981 + }, + { + "epoch": 15.964, + "grad_norm": 1.1584064960479736, + "learning_rate": 2e-05, + "loss": 0.02283434, + "step": 7982 + }, + { + "epoch": 15.966, + "grad_norm": 1.043766975402832, + "learning_rate": 2e-05, + "loss": 0.02896805, + "step": 7983 + }, + { + "epoch": 15.968, + "grad_norm": 0.9442967176437378, + "learning_rate": 2e-05, + "loss": 0.02886542, + "step": 7984 + }, + { + "epoch": 15.97, + "grad_norm": 1.3705523014068604, + "learning_rate": 2e-05, + "loss": 0.04226013, + "step": 7985 + }, + { + "epoch": 15.972, + "grad_norm": 3.2384965419769287, + "learning_rate": 2e-05, + "loss": 0.05349871, + "step": 7986 + }, + { + "epoch": 15.974, + "grad_norm": 1.393133282661438, + "learning_rate": 2e-05, + "loss": 0.02859312, + "step": 7987 + }, + { + "epoch": 15.975999999999999, + "grad_norm": 1.0506641864776611, + "learning_rate": 2e-05, + "loss": 0.03882669, + "step": 7988 + }, + { + "epoch": 15.978, + "grad_norm": 1.3604516983032227, + "learning_rate": 2e-05, + "loss": 0.04398809, + "step": 7989 + }, + { + "epoch": 15.98, + "grad_norm": 1.3526203632354736, + "learning_rate": 2e-05, + "loss": 0.03669821, + "step": 7990 + }, + { + "epoch": 15.982, + "grad_norm": 1.1345425844192505, + "learning_rate": 2e-05, + "loss": 0.03266115, + "step": 7991 + }, + { + "epoch": 15.984, + "grad_norm": 1.23661470413208, + "learning_rate": 2e-05, + "loss": 0.02663805, + "step": 7992 + }, + { + "epoch": 15.986, + "grad_norm": 2.231520652770996, + "learning_rate": 2e-05, + "loss": 0.034388, + "step": 7993 + }, + { + "epoch": 15.988, + "grad_norm": 1.1724952459335327, + "learning_rate": 2e-05, + "loss": 0.0357694, + "step": 7994 + }, + { + "epoch": 15.99, + "grad_norm": 1.698375940322876, + "learning_rate": 2e-05, + "loss": 0.04616644, + "step": 7995 + }, + { + "epoch": 15.992, + "grad_norm": 1.5685293674468994, + "learning_rate": 2e-05, + "loss": 0.03160156, + "step": 7996 + }, + { + "epoch": 15.994, + "grad_norm": 2.4769506454467773, + "learning_rate": 2e-05, + "loss": 0.04571313, + "step": 7997 + }, + { + "epoch": 15.996, + "grad_norm": 1.7058403491973877, + "learning_rate": 2e-05, + "loss": 0.03262362, + "step": 7998 + }, + { + "epoch": 15.998, + "grad_norm": 1.0402840375900269, + "learning_rate": 2e-05, + "loss": 0.02683425, + "step": 7999 + }, + { + "epoch": 16.0, + "grad_norm": 1.4686156511306763, + "learning_rate": 2e-05, + "loss": 0.03669425, + "step": 8000 + }, + { + "epoch": 16.0, + "eval_performance": { + "AngleClassification_1": 0.996, + "AngleClassification_2": 1.0, + "AngleClassification_3": 0.9481037924151696, + "Equal_1": 0.994, + "Equal_2": 0.9660678642714571, + "Equal_3": 0.8902195608782435, + "LineComparison_1": 1.0, + 
"LineComparison_2": 1.0, + "LineComparison_3": 0.9940119760479041, + "Parallel_1": 0.9879759519038076, + "Parallel_2": 0.9939879759519038, + "Parallel_3": 0.986, + "Perpendicular_1": 0.998, + "Perpendicular_2": 0.982, + "Perpendicular_3": 0.7274549098196392, + "PointLiesOnCircle_1": 0.9979959919839679, + "PointLiesOnCircle_2": 0.9996666666666667, + "PointLiesOnCircle_3": 0.9912666666666666, + "PointLiesOnLine_1": 0.9959919839679359, + "PointLiesOnLine_2": 0.9939879759519038, + "PointLiesOnLine_3": 0.9780439121756487 + }, + "eval_runtime": 319.9457, + "eval_samples_per_second": 32.818, + "eval_steps_per_second": 0.656, + "step": 8000 + }, + { + "epoch": 16.002, + "grad_norm": 1.1055775880813599, + "learning_rate": 2e-05, + "loss": 0.0335177, + "step": 8001 + }, + { + "epoch": 16.004, + "grad_norm": 1.2921119928359985, + "learning_rate": 2e-05, + "loss": 0.02823064, + "step": 8002 + }, + { + "epoch": 16.006, + "grad_norm": 2.824982166290283, + "learning_rate": 2e-05, + "loss": 0.02834114, + "step": 8003 + }, + { + "epoch": 16.008, + "grad_norm": 2.268012523651123, + "learning_rate": 2e-05, + "loss": 0.03194273, + "step": 8004 + }, + { + "epoch": 16.01, + "grad_norm": 1.4545838832855225, + "learning_rate": 2e-05, + "loss": 0.02953038, + "step": 8005 + }, + { + "epoch": 16.012, + "grad_norm": 1.845914363861084, + "learning_rate": 2e-05, + "loss": 0.03039683, + "step": 8006 + }, + { + "epoch": 16.014, + "grad_norm": 1.2593659162521362, + "learning_rate": 2e-05, + "loss": 0.03985323, + "step": 8007 + }, + { + "epoch": 16.016, + "grad_norm": 1.1578806638717651, + "learning_rate": 2e-05, + "loss": 0.04368738, + "step": 8008 + }, + { + "epoch": 16.018, + "grad_norm": 2.670903444290161, + "learning_rate": 2e-05, + "loss": 0.05551417, + "step": 8009 + }, + { + "epoch": 16.02, + "grad_norm": 1.4139609336853027, + "learning_rate": 2e-05, + "loss": 0.03750867, + "step": 8010 + }, + { + "epoch": 16.022, + "grad_norm": 1.3667073249816895, + "learning_rate": 2e-05, + "loss": 0.05370542, + "step": 8011 + }, + { + "epoch": 16.024, + "grad_norm": 2.2714273929595947, + "learning_rate": 2e-05, + "loss": 0.0414988, + "step": 8012 + }, + { + "epoch": 16.026, + "grad_norm": 2.840731382369995, + "learning_rate": 2e-05, + "loss": 0.03340321, + "step": 8013 + }, + { + "epoch": 16.028, + "grad_norm": 2.810915946960449, + "learning_rate": 2e-05, + "loss": 0.05696411, + "step": 8014 + }, + { + "epoch": 16.03, + "grad_norm": 1.45564866065979, + "learning_rate": 2e-05, + "loss": 0.04313691, + "step": 8015 + }, + { + "epoch": 16.032, + "grad_norm": 1.4301260709762573, + "learning_rate": 2e-05, + "loss": 0.04525199, + "step": 8016 + }, + { + "epoch": 16.034, + "grad_norm": 1.1844078302383423, + "learning_rate": 2e-05, + "loss": 0.0316691, + "step": 8017 + }, + { + "epoch": 16.036, + "grad_norm": 1.4068323373794556, + "learning_rate": 2e-05, + "loss": 0.03633066, + "step": 8018 + }, + { + "epoch": 16.038, + "grad_norm": 1.5108731985092163, + "learning_rate": 2e-05, + "loss": 0.04725716, + "step": 8019 + }, + { + "epoch": 16.04, + "grad_norm": 1.654616355895996, + "learning_rate": 2e-05, + "loss": 0.04446092, + "step": 8020 + }, + { + "epoch": 16.042, + "grad_norm": 1.4129464626312256, + "learning_rate": 2e-05, + "loss": 0.03359355, + "step": 8021 + }, + { + "epoch": 16.044, + "grad_norm": 1.0702934265136719, + "learning_rate": 2e-05, + "loss": 0.02224928, + "step": 8022 + }, + { + "epoch": 16.046, + "grad_norm": 1.9897451400756836, + "learning_rate": 2e-05, + "loss": 0.04445896, + "step": 8023 + }, + { + "epoch": 16.048, + 
"grad_norm": 1.2325469255447388, + "learning_rate": 2e-05, + "loss": 0.03780691, + "step": 8024 + }, + { + "epoch": 16.05, + "grad_norm": 1.2062650918960571, + "learning_rate": 2e-05, + "loss": 0.03396706, + "step": 8025 + }, + { + "epoch": 16.052, + "grad_norm": 1.6414982080459595, + "learning_rate": 2e-05, + "loss": 0.04513337, + "step": 8026 + }, + { + "epoch": 16.054, + "grad_norm": 3.0704195499420166, + "learning_rate": 2e-05, + "loss": 0.05739868, + "step": 8027 + }, + { + "epoch": 16.056, + "grad_norm": 1.746216893196106, + "learning_rate": 2e-05, + "loss": 0.04020358, + "step": 8028 + }, + { + "epoch": 16.058, + "grad_norm": 1.0398463010787964, + "learning_rate": 2e-05, + "loss": 0.02914593, + "step": 8029 + }, + { + "epoch": 16.06, + "grad_norm": 1.3823471069335938, + "learning_rate": 2e-05, + "loss": 0.0427549, + "step": 8030 + }, + { + "epoch": 16.062, + "grad_norm": 0.9713700413703918, + "learning_rate": 2e-05, + "loss": 0.02133579, + "step": 8031 + }, + { + "epoch": 16.064, + "grad_norm": 2.3745758533477783, + "learning_rate": 2e-05, + "loss": 0.03649534, + "step": 8032 + }, + { + "epoch": 16.066, + "grad_norm": 1.386365294456482, + "learning_rate": 2e-05, + "loss": 0.04500858, + "step": 8033 + }, + { + "epoch": 16.068, + "grad_norm": 2.186955690383911, + "learning_rate": 2e-05, + "loss": 0.05879531, + "step": 8034 + }, + { + "epoch": 16.07, + "grad_norm": 2.0882043838500977, + "learning_rate": 2e-05, + "loss": 0.04632348, + "step": 8035 + }, + { + "epoch": 16.072, + "grad_norm": 1.3914663791656494, + "learning_rate": 2e-05, + "loss": 0.03216641, + "step": 8036 + }, + { + "epoch": 16.074, + "grad_norm": 1.4008915424346924, + "learning_rate": 2e-05, + "loss": 0.03341067, + "step": 8037 + }, + { + "epoch": 16.076, + "grad_norm": 1.3791346549987793, + "learning_rate": 2e-05, + "loss": 0.0505546, + "step": 8038 + }, + { + "epoch": 16.078, + "grad_norm": 1.302667498588562, + "learning_rate": 2e-05, + "loss": 0.04582535, + "step": 8039 + }, + { + "epoch": 16.08, + "grad_norm": 1.6824592351913452, + "learning_rate": 2e-05, + "loss": 0.0420664, + "step": 8040 + }, + { + "epoch": 16.082, + "grad_norm": 1.354354977607727, + "learning_rate": 2e-05, + "loss": 0.02892293, + "step": 8041 + }, + { + "epoch": 16.084, + "grad_norm": 1.0839284658432007, + "learning_rate": 2e-05, + "loss": 0.03057658, + "step": 8042 + }, + { + "epoch": 16.086, + "grad_norm": 1.1779735088348389, + "learning_rate": 2e-05, + "loss": 0.02183773, + "step": 8043 + }, + { + "epoch": 16.088, + "grad_norm": 1.1066539287567139, + "learning_rate": 2e-05, + "loss": 0.03328647, + "step": 8044 + }, + { + "epoch": 16.09, + "grad_norm": 1.5935084819793701, + "learning_rate": 2e-05, + "loss": 0.04813398, + "step": 8045 + }, + { + "epoch": 16.092, + "grad_norm": 1.7510586977005005, + "learning_rate": 2e-05, + "loss": 0.03419684, + "step": 8046 + }, + { + "epoch": 16.094, + "grad_norm": 1.2361801862716675, + "learning_rate": 2e-05, + "loss": 0.03396645, + "step": 8047 + }, + { + "epoch": 16.096, + "grad_norm": 1.069996953010559, + "learning_rate": 2e-05, + "loss": 0.02971686, + "step": 8048 + }, + { + "epoch": 16.098, + "grad_norm": 1.4288445711135864, + "learning_rate": 2e-05, + "loss": 0.03799113, + "step": 8049 + }, + { + "epoch": 16.1, + "grad_norm": 0.9517697691917419, + "learning_rate": 2e-05, + "loss": 0.02629692, + "step": 8050 + }, + { + "epoch": 16.102, + "grad_norm": 1.0445160865783691, + "learning_rate": 2e-05, + "loss": 0.03541003, + "step": 8051 + }, + { + "epoch": 16.104, + "grad_norm": 1.2751644849777222, + 
"learning_rate": 2e-05, + "loss": 0.03903734, + "step": 8052 + }, + { + "epoch": 16.106, + "grad_norm": 1.3192799091339111, + "learning_rate": 2e-05, + "loss": 0.04088945, + "step": 8053 + }, + { + "epoch": 16.108, + "grad_norm": 0.8523271679878235, + "learning_rate": 2e-05, + "loss": 0.02510538, + "step": 8054 + }, + { + "epoch": 16.11, + "grad_norm": 1.4209495782852173, + "learning_rate": 2e-05, + "loss": 0.03651667, + "step": 8055 + }, + { + "epoch": 16.112, + "grad_norm": 2.66572904586792, + "learning_rate": 2e-05, + "loss": 0.02939776, + "step": 8056 + }, + { + "epoch": 16.114, + "grad_norm": 1.7730138301849365, + "learning_rate": 2e-05, + "loss": 0.05316716, + "step": 8057 + }, + { + "epoch": 16.116, + "grad_norm": 1.0424127578735352, + "learning_rate": 2e-05, + "loss": 0.03889843, + "step": 8058 + }, + { + "epoch": 16.118, + "grad_norm": 1.0265769958496094, + "learning_rate": 2e-05, + "loss": 0.03288617, + "step": 8059 + }, + { + "epoch": 16.12, + "grad_norm": 1.0434588193893433, + "learning_rate": 2e-05, + "loss": 0.04037582, + "step": 8060 + }, + { + "epoch": 16.122, + "grad_norm": 1.2070543766021729, + "learning_rate": 2e-05, + "loss": 0.02952406, + "step": 8061 + }, + { + "epoch": 16.124, + "grad_norm": 1.59628427028656, + "learning_rate": 2e-05, + "loss": 0.04046272, + "step": 8062 + }, + { + "epoch": 16.126, + "grad_norm": 1.1951675415039062, + "learning_rate": 2e-05, + "loss": 0.03478213, + "step": 8063 + }, + { + "epoch": 16.128, + "grad_norm": 1.0106141567230225, + "learning_rate": 2e-05, + "loss": 0.03325225, + "step": 8064 + }, + { + "epoch": 16.13, + "grad_norm": 0.993449330329895, + "learning_rate": 2e-05, + "loss": 0.01964446, + "step": 8065 + }, + { + "epoch": 16.132, + "grad_norm": 1.4265450239181519, + "learning_rate": 2e-05, + "loss": 0.03322139, + "step": 8066 + }, + { + "epoch": 16.134, + "grad_norm": 1.0770517587661743, + "learning_rate": 2e-05, + "loss": 0.03661602, + "step": 8067 + }, + { + "epoch": 16.136, + "grad_norm": 1.4938647747039795, + "learning_rate": 2e-05, + "loss": 0.03427796, + "step": 8068 + }, + { + "epoch": 16.138, + "grad_norm": 1.4040374755859375, + "learning_rate": 2e-05, + "loss": 0.03253736, + "step": 8069 + }, + { + "epoch": 16.14, + "grad_norm": 1.381060242652893, + "learning_rate": 2e-05, + "loss": 0.03683833, + "step": 8070 + }, + { + "epoch": 16.142, + "grad_norm": 1.1228476762771606, + "learning_rate": 2e-05, + "loss": 0.04204592, + "step": 8071 + }, + { + "epoch": 16.144, + "grad_norm": 1.1399726867675781, + "learning_rate": 2e-05, + "loss": 0.0393802, + "step": 8072 + }, + { + "epoch": 16.146, + "grad_norm": 1.6664113998413086, + "learning_rate": 2e-05, + "loss": 0.03525341, + "step": 8073 + }, + { + "epoch": 16.148, + "grad_norm": 1.0361666679382324, + "learning_rate": 2e-05, + "loss": 0.03461679, + "step": 8074 + }, + { + "epoch": 16.15, + "grad_norm": 1.3984792232513428, + "learning_rate": 2e-05, + "loss": 0.04337152, + "step": 8075 + }, + { + "epoch": 16.152, + "grad_norm": 1.3185367584228516, + "learning_rate": 2e-05, + "loss": 0.04654099, + "step": 8076 + }, + { + "epoch": 16.154, + "grad_norm": 2.2266247272491455, + "learning_rate": 2e-05, + "loss": 0.05011526, + "step": 8077 + }, + { + "epoch": 16.156, + "grad_norm": 1.3377346992492676, + "learning_rate": 2e-05, + "loss": 0.04082262, + "step": 8078 + }, + { + "epoch": 16.158, + "grad_norm": 3.1692075729370117, + "learning_rate": 2e-05, + "loss": 0.05622632, + "step": 8079 + }, + { + "epoch": 16.16, + "grad_norm": 1.0942643880844116, + "learning_rate": 2e-05, + "loss": 
0.02746512, + "step": 8080 + }, + { + "epoch": 16.162, + "grad_norm": 1.217752456665039, + "learning_rate": 2e-05, + "loss": 0.03617308, + "step": 8081 + }, + { + "epoch": 16.164, + "grad_norm": 1.8734045028686523, + "learning_rate": 2e-05, + "loss": 0.04952653, + "step": 8082 + }, + { + "epoch": 16.166, + "grad_norm": 1.50890052318573, + "learning_rate": 2e-05, + "loss": 0.04156789, + "step": 8083 + }, + { + "epoch": 16.168, + "grad_norm": 1.4338157176971436, + "learning_rate": 2e-05, + "loss": 0.05392047, + "step": 8084 + }, + { + "epoch": 16.17, + "grad_norm": 1.7338756322860718, + "learning_rate": 2e-05, + "loss": 0.04553423, + "step": 8085 + }, + { + "epoch": 16.172, + "grad_norm": 1.5352826118469238, + "learning_rate": 2e-05, + "loss": 0.04915183, + "step": 8086 + }, + { + "epoch": 16.174, + "grad_norm": 1.7722485065460205, + "learning_rate": 2e-05, + "loss": 0.03585491, + "step": 8087 + }, + { + "epoch": 16.176, + "grad_norm": 0.9967431426048279, + "learning_rate": 2e-05, + "loss": 0.02942163, + "step": 8088 + }, + { + "epoch": 16.178, + "grad_norm": 1.2177917957305908, + "learning_rate": 2e-05, + "loss": 0.04380299, + "step": 8089 + }, + { + "epoch": 16.18, + "grad_norm": 1.2559581995010376, + "learning_rate": 2e-05, + "loss": 0.02480926, + "step": 8090 + }, + { + "epoch": 16.182, + "grad_norm": 1.5222054719924927, + "learning_rate": 2e-05, + "loss": 0.04941765, + "step": 8091 + }, + { + "epoch": 16.184, + "grad_norm": 1.0515002012252808, + "learning_rate": 2e-05, + "loss": 0.04209872, + "step": 8092 + }, + { + "epoch": 16.186, + "grad_norm": 1.8783875703811646, + "learning_rate": 2e-05, + "loss": 0.03835857, + "step": 8093 + }, + { + "epoch": 16.188, + "grad_norm": 1.2935268878936768, + "learning_rate": 2e-05, + "loss": 0.05065227, + "step": 8094 + }, + { + "epoch": 16.19, + "grad_norm": 1.6219897270202637, + "learning_rate": 2e-05, + "loss": 0.02984177, + "step": 8095 + }, + { + "epoch": 16.192, + "grad_norm": 1.3317407369613647, + "learning_rate": 2e-05, + "loss": 0.0331154, + "step": 8096 + }, + { + "epoch": 16.194, + "grad_norm": 1.2899963855743408, + "learning_rate": 2e-05, + "loss": 0.03418003, + "step": 8097 + }, + { + "epoch": 16.196, + "grad_norm": 1.1565256118774414, + "learning_rate": 2e-05, + "loss": 0.03147539, + "step": 8098 + }, + { + "epoch": 16.198, + "grad_norm": 1.2315301895141602, + "learning_rate": 2e-05, + "loss": 0.0391296, + "step": 8099 + }, + { + "epoch": 16.2, + "grad_norm": 1.1125531196594238, + "learning_rate": 2e-05, + "loss": 0.03473473, + "step": 8100 + }, + { + "epoch": 16.202, + "grad_norm": 1.5574398040771484, + "learning_rate": 2e-05, + "loss": 0.0310582, + "step": 8101 + }, + { + "epoch": 16.204, + "grad_norm": 1.4571691751480103, + "learning_rate": 2e-05, + "loss": 0.03549339, + "step": 8102 + }, + { + "epoch": 16.206, + "grad_norm": 1.2197107076644897, + "learning_rate": 2e-05, + "loss": 0.04195347, + "step": 8103 + }, + { + "epoch": 16.208, + "grad_norm": 2.789921760559082, + "learning_rate": 2e-05, + "loss": 0.03652881, + "step": 8104 + }, + { + "epoch": 16.21, + "grad_norm": 4.851873397827148, + "learning_rate": 2e-05, + "loss": 0.03688388, + "step": 8105 + }, + { + "epoch": 16.212, + "grad_norm": 1.457202434539795, + "learning_rate": 2e-05, + "loss": 0.03209008, + "step": 8106 + }, + { + "epoch": 16.214, + "grad_norm": 1.0909388065338135, + "learning_rate": 2e-05, + "loss": 0.02918109, + "step": 8107 + }, + { + "epoch": 16.216, + "grad_norm": 2.0140578746795654, + "learning_rate": 2e-05, + "loss": 0.05105082, + "step": 8108 + }, + { + 
"epoch": 16.218, + "grad_norm": 1.9580172300338745, + "learning_rate": 2e-05, + "loss": 0.04800195, + "step": 8109 + }, + { + "epoch": 16.22, + "grad_norm": 3.5140187740325928, + "learning_rate": 2e-05, + "loss": 0.04388273, + "step": 8110 + }, + { + "epoch": 16.222, + "grad_norm": 1.116141676902771, + "learning_rate": 2e-05, + "loss": 0.03350087, + "step": 8111 + }, + { + "epoch": 16.224, + "grad_norm": 2.4265329837799072, + "learning_rate": 2e-05, + "loss": 0.04413512, + "step": 8112 + }, + { + "epoch": 16.226, + "grad_norm": 1.9018596410751343, + "learning_rate": 2e-05, + "loss": 0.0421547, + "step": 8113 + }, + { + "epoch": 16.228, + "grad_norm": 1.3331454992294312, + "learning_rate": 2e-05, + "loss": 0.04760406, + "step": 8114 + }, + { + "epoch": 16.23, + "grad_norm": 0.9366099834442139, + "learning_rate": 2e-05, + "loss": 0.02583786, + "step": 8115 + }, + { + "epoch": 16.232, + "grad_norm": 2.820237636566162, + "learning_rate": 2e-05, + "loss": 0.02830683, + "step": 8116 + }, + { + "epoch": 16.234, + "grad_norm": 1.1773334741592407, + "learning_rate": 2e-05, + "loss": 0.03467422, + "step": 8117 + }, + { + "epoch": 16.236, + "grad_norm": 1.685434103012085, + "learning_rate": 2e-05, + "loss": 0.04194804, + "step": 8118 + }, + { + "epoch": 16.238, + "grad_norm": 1.8741134405136108, + "learning_rate": 2e-05, + "loss": 0.0288431, + "step": 8119 + }, + { + "epoch": 16.24, + "grad_norm": 1.3268035650253296, + "learning_rate": 2e-05, + "loss": 0.0287002, + "step": 8120 + }, + { + "epoch": 16.242, + "grad_norm": 1.3486356735229492, + "learning_rate": 2e-05, + "loss": 0.03664606, + "step": 8121 + }, + { + "epoch": 16.244, + "grad_norm": 1.631987452507019, + "learning_rate": 2e-05, + "loss": 0.05182088, + "step": 8122 + }, + { + "epoch": 16.246, + "grad_norm": 2.88786244392395, + "learning_rate": 2e-05, + "loss": 0.05489088, + "step": 8123 + }, + { + "epoch": 16.248, + "grad_norm": 2.0187666416168213, + "learning_rate": 2e-05, + "loss": 0.04556795, + "step": 8124 + }, + { + "epoch": 16.25, + "grad_norm": 1.089003562927246, + "learning_rate": 2e-05, + "loss": 0.03417504, + "step": 8125 + }, + { + "epoch": 16.252, + "grad_norm": 1.238042950630188, + "learning_rate": 2e-05, + "loss": 0.03643361, + "step": 8126 + }, + { + "epoch": 16.254, + "grad_norm": 1.347186803817749, + "learning_rate": 2e-05, + "loss": 0.0338303, + "step": 8127 + }, + { + "epoch": 16.256, + "grad_norm": 1.772897720336914, + "learning_rate": 2e-05, + "loss": 0.05571172, + "step": 8128 + }, + { + "epoch": 16.258, + "grad_norm": 0.9345927834510803, + "learning_rate": 2e-05, + "loss": 0.02508531, + "step": 8129 + }, + { + "epoch": 16.26, + "grad_norm": 1.491607904434204, + "learning_rate": 2e-05, + "loss": 0.03616347, + "step": 8130 + }, + { + "epoch": 16.262, + "grad_norm": 1.141614317893982, + "learning_rate": 2e-05, + "loss": 0.03462991, + "step": 8131 + }, + { + "epoch": 16.264, + "grad_norm": 1.2147083282470703, + "learning_rate": 2e-05, + "loss": 0.03725596, + "step": 8132 + }, + { + "epoch": 16.266, + "grad_norm": 1.7106729745864868, + "learning_rate": 2e-05, + "loss": 0.03632404, + "step": 8133 + }, + { + "epoch": 16.268, + "grad_norm": 1.0277926921844482, + "learning_rate": 2e-05, + "loss": 0.0347311, + "step": 8134 + }, + { + "epoch": 16.27, + "grad_norm": 1.2781480550765991, + "learning_rate": 2e-05, + "loss": 0.03461632, + "step": 8135 + }, + { + "epoch": 16.272, + "grad_norm": 1.013649582862854, + "learning_rate": 2e-05, + "loss": 0.03419242, + "step": 8136 + }, + { + "epoch": 16.274, + "grad_norm": 
1.7054558992385864, + "learning_rate": 2e-05, + "loss": 0.04766083, + "step": 8137 + }, + { + "epoch": 16.276, + "grad_norm": 1.1981877088546753, + "learning_rate": 2e-05, + "loss": 0.03875602, + "step": 8138 + }, + { + "epoch": 16.278, + "grad_norm": 1.4614924192428589, + "learning_rate": 2e-05, + "loss": 0.04386155, + "step": 8139 + }, + { + "epoch": 16.28, + "grad_norm": 0.8466894626617432, + "learning_rate": 2e-05, + "loss": 0.01991762, + "step": 8140 + }, + { + "epoch": 16.282, + "grad_norm": 2.6902008056640625, + "learning_rate": 2e-05, + "loss": 0.04461161, + "step": 8141 + }, + { + "epoch": 16.284, + "grad_norm": 1.712175965309143, + "learning_rate": 2e-05, + "loss": 0.04943609, + "step": 8142 + }, + { + "epoch": 16.286, + "grad_norm": 1.3116623163223267, + "learning_rate": 2e-05, + "loss": 0.03155245, + "step": 8143 + }, + { + "epoch": 16.288, + "grad_norm": 2.068293809890747, + "learning_rate": 2e-05, + "loss": 0.04026989, + "step": 8144 + }, + { + "epoch": 16.29, + "grad_norm": 2.457705020904541, + "learning_rate": 2e-05, + "loss": 0.05160169, + "step": 8145 + }, + { + "epoch": 16.292, + "grad_norm": 1.1721792221069336, + "learning_rate": 2e-05, + "loss": 0.03861858, + "step": 8146 + }, + { + "epoch": 16.294, + "grad_norm": 1.3003913164138794, + "learning_rate": 2e-05, + "loss": 0.03377434, + "step": 8147 + }, + { + "epoch": 16.296, + "grad_norm": 1.6648625135421753, + "learning_rate": 2e-05, + "loss": 0.05673098, + "step": 8148 + }, + { + "epoch": 16.298, + "grad_norm": 1.6256797313690186, + "learning_rate": 2e-05, + "loss": 0.03393706, + "step": 8149 + }, + { + "epoch": 16.3, + "grad_norm": 1.4245861768722534, + "learning_rate": 2e-05, + "loss": 0.03638397, + "step": 8150 + }, + { + "epoch": 16.302, + "grad_norm": 3.2517616748809814, + "learning_rate": 2e-05, + "loss": 0.04278335, + "step": 8151 + }, + { + "epoch": 16.304, + "grad_norm": 0.8710333108901978, + "learning_rate": 2e-05, + "loss": 0.02735967, + "step": 8152 + }, + { + "epoch": 16.306, + "grad_norm": 3.8048529624938965, + "learning_rate": 2e-05, + "loss": 0.04639032, + "step": 8153 + }, + { + "epoch": 16.308, + "grad_norm": 1.7357240915298462, + "learning_rate": 2e-05, + "loss": 0.04244914, + "step": 8154 + }, + { + "epoch": 16.31, + "grad_norm": 1.8791494369506836, + "learning_rate": 2e-05, + "loss": 0.04231622, + "step": 8155 + }, + { + "epoch": 16.312, + "grad_norm": 1.1524310111999512, + "learning_rate": 2e-05, + "loss": 0.03676322, + "step": 8156 + }, + { + "epoch": 16.314, + "grad_norm": 2.3864612579345703, + "learning_rate": 2e-05, + "loss": 0.04578413, + "step": 8157 + }, + { + "epoch": 16.316, + "grad_norm": 1.7564839124679565, + "learning_rate": 2e-05, + "loss": 0.04500315, + "step": 8158 + }, + { + "epoch": 16.318, + "grad_norm": 2.045438766479492, + "learning_rate": 2e-05, + "loss": 0.0306298, + "step": 8159 + }, + { + "epoch": 16.32, + "grad_norm": 0.8264663815498352, + "learning_rate": 2e-05, + "loss": 0.02070304, + "step": 8160 + }, + { + "epoch": 16.322, + "grad_norm": 1.1368653774261475, + "learning_rate": 2e-05, + "loss": 0.03928949, + "step": 8161 + }, + { + "epoch": 16.324, + "grad_norm": 0.9940206408500671, + "learning_rate": 2e-05, + "loss": 0.034535, + "step": 8162 + }, + { + "epoch": 16.326, + "grad_norm": 1.2441214323043823, + "learning_rate": 2e-05, + "loss": 0.0337163, + "step": 8163 + }, + { + "epoch": 16.328, + "grad_norm": 1.5334255695343018, + "learning_rate": 2e-05, + "loss": 0.04570875, + "step": 8164 + }, + { + "epoch": 16.33, + "grad_norm": 1.4286037683486938, + "learning_rate": 
2e-05, + "loss": 0.05645541, + "step": 8165 + }, + { + "epoch": 16.332, + "grad_norm": 2.280270576477051, + "learning_rate": 2e-05, + "loss": 0.0485545, + "step": 8166 + }, + { + "epoch": 16.334, + "grad_norm": 1.9893189668655396, + "learning_rate": 2e-05, + "loss": 0.03430265, + "step": 8167 + }, + { + "epoch": 16.336, + "grad_norm": 1.4144779443740845, + "learning_rate": 2e-05, + "loss": 0.04236887, + "step": 8168 + }, + { + "epoch": 16.338, + "grad_norm": 1.0118204355239868, + "learning_rate": 2e-05, + "loss": 0.02984769, + "step": 8169 + }, + { + "epoch": 16.34, + "grad_norm": 1.6643238067626953, + "learning_rate": 2e-05, + "loss": 0.04789416, + "step": 8170 + }, + { + "epoch": 16.342, + "grad_norm": 0.8409438729286194, + "learning_rate": 2e-05, + "loss": 0.01973891, + "step": 8171 + }, + { + "epoch": 16.344, + "grad_norm": 1.0004379749298096, + "learning_rate": 2e-05, + "loss": 0.03158588, + "step": 8172 + }, + { + "epoch": 16.346, + "grad_norm": 2.194171667098999, + "learning_rate": 2e-05, + "loss": 0.06659742, + "step": 8173 + }, + { + "epoch": 16.348, + "grad_norm": 1.4098418951034546, + "learning_rate": 2e-05, + "loss": 0.03775009, + "step": 8174 + }, + { + "epoch": 16.35, + "grad_norm": 1.5424364805221558, + "learning_rate": 2e-05, + "loss": 0.04169576, + "step": 8175 + }, + { + "epoch": 16.352, + "grad_norm": 1.8581490516662598, + "learning_rate": 2e-05, + "loss": 0.07275459, + "step": 8176 + }, + { + "epoch": 16.354, + "grad_norm": 1.3709255456924438, + "learning_rate": 2e-05, + "loss": 0.03472374, + "step": 8177 + }, + { + "epoch": 16.356, + "grad_norm": 1.9072877168655396, + "learning_rate": 2e-05, + "loss": 0.0435157, + "step": 8178 + }, + { + "epoch": 16.358, + "grad_norm": 2.198082208633423, + "learning_rate": 2e-05, + "loss": 0.02928953, + "step": 8179 + }, + { + "epoch": 16.36, + "grad_norm": 0.9633461833000183, + "learning_rate": 2e-05, + "loss": 0.03266443, + "step": 8180 + }, + { + "epoch": 16.362, + "grad_norm": 2.41123366355896, + "learning_rate": 2e-05, + "loss": 0.04052667, + "step": 8181 + }, + { + "epoch": 16.364, + "grad_norm": 1.4949405193328857, + "learning_rate": 2e-05, + "loss": 0.05271856, + "step": 8182 + }, + { + "epoch": 16.366, + "grad_norm": 1.5633461475372314, + "learning_rate": 2e-05, + "loss": 0.04020818, + "step": 8183 + }, + { + "epoch": 16.368, + "grad_norm": 1.0727436542510986, + "learning_rate": 2e-05, + "loss": 0.03464281, + "step": 8184 + }, + { + "epoch": 16.37, + "grad_norm": 1.2527438402175903, + "learning_rate": 2e-05, + "loss": 0.04834267, + "step": 8185 + }, + { + "epoch": 16.372, + "grad_norm": 1.9610435962677002, + "learning_rate": 2e-05, + "loss": 0.05433511, + "step": 8186 + }, + { + "epoch": 16.374, + "grad_norm": 1.6253420114517212, + "learning_rate": 2e-05, + "loss": 0.04721621, + "step": 8187 + }, + { + "epoch": 16.376, + "grad_norm": 2.0063889026641846, + "learning_rate": 2e-05, + "loss": 0.04768866, + "step": 8188 + }, + { + "epoch": 16.378, + "grad_norm": 1.5373892784118652, + "learning_rate": 2e-05, + "loss": 0.04332329, + "step": 8189 + }, + { + "epoch": 16.38, + "grad_norm": 1.7891055345535278, + "learning_rate": 2e-05, + "loss": 0.0316738, + "step": 8190 + }, + { + "epoch": 16.382, + "grad_norm": 1.626381516456604, + "learning_rate": 2e-05, + "loss": 0.0384635, + "step": 8191 + }, + { + "epoch": 16.384, + "grad_norm": 1.2403956651687622, + "learning_rate": 2e-05, + "loss": 0.03793911, + "step": 8192 + }, + { + "epoch": 16.386, + "grad_norm": 1.388485074043274, + "learning_rate": 2e-05, + "loss": 0.05434792, + "step": 
8193 + }, + { + "epoch": 16.388, + "grad_norm": 0.903146505355835, + "learning_rate": 2e-05, + "loss": 0.02984499, + "step": 8194 + }, + { + "epoch": 16.39, + "grad_norm": 1.14999258518219, + "learning_rate": 2e-05, + "loss": 0.0324681, + "step": 8195 + }, + { + "epoch": 16.392, + "grad_norm": 0.8718903660774231, + "learning_rate": 2e-05, + "loss": 0.03075159, + "step": 8196 + }, + { + "epoch": 16.394, + "grad_norm": 0.9332501888275146, + "learning_rate": 2e-05, + "loss": 0.03167652, + "step": 8197 + }, + { + "epoch": 16.396, + "grad_norm": 1.2964439392089844, + "learning_rate": 2e-05, + "loss": 0.05365559, + "step": 8198 + }, + { + "epoch": 16.398, + "grad_norm": 1.2190895080566406, + "learning_rate": 2e-05, + "loss": 0.04543667, + "step": 8199 + }, + { + "epoch": 16.4, + "grad_norm": 1.297627568244934, + "learning_rate": 2e-05, + "loss": 0.03421879, + "step": 8200 + }, + { + "epoch": 16.402, + "grad_norm": 1.3913570642471313, + "learning_rate": 2e-05, + "loss": 0.03622192, + "step": 8201 + }, + { + "epoch": 16.404, + "grad_norm": 0.967580258846283, + "learning_rate": 2e-05, + "loss": 0.03260601, + "step": 8202 + }, + { + "epoch": 16.406, + "grad_norm": 1.093234896659851, + "learning_rate": 2e-05, + "loss": 0.04168686, + "step": 8203 + }, + { + "epoch": 16.408, + "grad_norm": 1.2416380643844604, + "learning_rate": 2e-05, + "loss": 0.04545043, + "step": 8204 + }, + { + "epoch": 16.41, + "grad_norm": 2.081451892852783, + "learning_rate": 2e-05, + "loss": 0.03653099, + "step": 8205 + }, + { + "epoch": 16.412, + "grad_norm": 1.4831323623657227, + "learning_rate": 2e-05, + "loss": 0.04202024, + "step": 8206 + }, + { + "epoch": 16.414, + "grad_norm": 0.8882843852043152, + "learning_rate": 2e-05, + "loss": 0.02640081, + "step": 8207 + }, + { + "epoch": 16.416, + "grad_norm": 1.2519268989562988, + "learning_rate": 2e-05, + "loss": 0.03396625, + "step": 8208 + }, + { + "epoch": 16.418, + "grad_norm": 1.0374751091003418, + "learning_rate": 2e-05, + "loss": 0.03408733, + "step": 8209 + }, + { + "epoch": 16.42, + "grad_norm": 1.2353500127792358, + "learning_rate": 2e-05, + "loss": 0.04257344, + "step": 8210 + }, + { + "epoch": 16.422, + "grad_norm": 1.310669183731079, + "learning_rate": 2e-05, + "loss": 0.03693325, + "step": 8211 + }, + { + "epoch": 16.424, + "grad_norm": 2.0331413745880127, + "learning_rate": 2e-05, + "loss": 0.03563491, + "step": 8212 + }, + { + "epoch": 16.426, + "grad_norm": 1.2886682748794556, + "learning_rate": 2e-05, + "loss": 0.03784215, + "step": 8213 + }, + { + "epoch": 16.428, + "grad_norm": 1.1843023300170898, + "learning_rate": 2e-05, + "loss": 0.04167389, + "step": 8214 + }, + { + "epoch": 16.43, + "grad_norm": 2.5486996173858643, + "learning_rate": 2e-05, + "loss": 0.05189614, + "step": 8215 + }, + { + "epoch": 16.432, + "grad_norm": 1.908674955368042, + "learning_rate": 2e-05, + "loss": 0.02384158, + "step": 8216 + }, + { + "epoch": 16.434, + "grad_norm": 1.7661586999893188, + "learning_rate": 2e-05, + "loss": 0.03849943, + "step": 8217 + }, + { + "epoch": 16.436, + "grad_norm": 1.0904186964035034, + "learning_rate": 2e-05, + "loss": 0.04790416, + "step": 8218 + }, + { + "epoch": 16.438, + "grad_norm": 1.0966798067092896, + "learning_rate": 2e-05, + "loss": 0.03781455, + "step": 8219 + }, + { + "epoch": 16.44, + "grad_norm": 0.9001188278198242, + "learning_rate": 2e-05, + "loss": 0.02561498, + "step": 8220 + }, + { + "epoch": 16.442, + "grad_norm": 0.9903857111930847, + "learning_rate": 2e-05, + "loss": 0.0389318, + "step": 8221 + }, + { + "epoch": 16.444, + 
"grad_norm": 1.5551356077194214, + "learning_rate": 2e-05, + "loss": 0.05141037, + "step": 8222 + }, + { + "epoch": 16.446, + "grad_norm": 1.2812175750732422, + "learning_rate": 2e-05, + "loss": 0.03643136, + "step": 8223 + }, + { + "epoch": 16.448, + "grad_norm": 1.0215216875076294, + "learning_rate": 2e-05, + "loss": 0.02926203, + "step": 8224 + }, + { + "epoch": 16.45, + "grad_norm": 1.4955259561538696, + "learning_rate": 2e-05, + "loss": 0.03899873, + "step": 8225 + }, + { + "epoch": 16.452, + "grad_norm": 1.4788180589675903, + "learning_rate": 2e-05, + "loss": 0.05681079, + "step": 8226 + }, + { + "epoch": 16.454, + "grad_norm": 1.0436245203018188, + "learning_rate": 2e-05, + "loss": 0.03019481, + "step": 8227 + }, + { + "epoch": 16.456, + "grad_norm": 0.839566707611084, + "learning_rate": 2e-05, + "loss": 0.02787675, + "step": 8228 + }, + { + "epoch": 16.458, + "grad_norm": 1.0782071352005005, + "learning_rate": 2e-05, + "loss": 0.0280141, + "step": 8229 + }, + { + "epoch": 16.46, + "grad_norm": 1.0634349584579468, + "learning_rate": 2e-05, + "loss": 0.03917925, + "step": 8230 + }, + { + "epoch": 16.462, + "grad_norm": 1.0668877363204956, + "learning_rate": 2e-05, + "loss": 0.03235231, + "step": 8231 + }, + { + "epoch": 16.464, + "grad_norm": 1.0869591236114502, + "learning_rate": 2e-05, + "loss": 0.03207518, + "step": 8232 + }, + { + "epoch": 16.466, + "grad_norm": 1.7971811294555664, + "learning_rate": 2e-05, + "loss": 0.04344584, + "step": 8233 + }, + { + "epoch": 16.468, + "grad_norm": 2.0052719116210938, + "learning_rate": 2e-05, + "loss": 0.03197655, + "step": 8234 + }, + { + "epoch": 16.47, + "grad_norm": 1.3378233909606934, + "learning_rate": 2e-05, + "loss": 0.03234541, + "step": 8235 + }, + { + "epoch": 16.472, + "grad_norm": 2.523751735687256, + "learning_rate": 2e-05, + "loss": 0.04990536, + "step": 8236 + }, + { + "epoch": 16.474, + "grad_norm": 1.0454021692276, + "learning_rate": 2e-05, + "loss": 0.03331465, + "step": 8237 + }, + { + "epoch": 16.476, + "grad_norm": 1.1137912273406982, + "learning_rate": 2e-05, + "loss": 0.03970309, + "step": 8238 + }, + { + "epoch": 16.478, + "grad_norm": 1.585533618927002, + "learning_rate": 2e-05, + "loss": 0.0488726, + "step": 8239 + }, + { + "epoch": 16.48, + "grad_norm": 3.0131523609161377, + "learning_rate": 2e-05, + "loss": 0.04734076, + "step": 8240 + }, + { + "epoch": 16.482, + "grad_norm": 0.9826610684394836, + "learning_rate": 2e-05, + "loss": 0.0252513, + "step": 8241 + }, + { + "epoch": 16.484, + "grad_norm": 1.9169968366622925, + "learning_rate": 2e-05, + "loss": 0.03856509, + "step": 8242 + }, + { + "epoch": 16.486, + "grad_norm": 1.338022232055664, + "learning_rate": 2e-05, + "loss": 0.03761451, + "step": 8243 + }, + { + "epoch": 16.488, + "grad_norm": 1.4171205759048462, + "learning_rate": 2e-05, + "loss": 0.04478035, + "step": 8244 + }, + { + "epoch": 16.49, + "grad_norm": 1.2033617496490479, + "learning_rate": 2e-05, + "loss": 0.03369937, + "step": 8245 + }, + { + "epoch": 16.492, + "grad_norm": 1.676818609237671, + "learning_rate": 2e-05, + "loss": 0.03760742, + "step": 8246 + }, + { + "epoch": 16.494, + "grad_norm": 1.0126457214355469, + "learning_rate": 2e-05, + "loss": 0.03277501, + "step": 8247 + }, + { + "epoch": 16.496, + "grad_norm": 1.122615933418274, + "learning_rate": 2e-05, + "loss": 0.03475213, + "step": 8248 + }, + { + "epoch": 16.498, + "grad_norm": 1.2609951496124268, + "learning_rate": 2e-05, + "loss": 0.03638831, + "step": 8249 + }, + { + "epoch": 16.5, + "grad_norm": 1.3109709024429321, + 
"learning_rate": 2e-05, + "loss": 0.04073593, + "step": 8250 + }, + { + "epoch": 16.502, + "grad_norm": 1.6363704204559326, + "learning_rate": 2e-05, + "loss": 0.04546898, + "step": 8251 + }, + { + "epoch": 16.504, + "grad_norm": 1.0490771532058716, + "learning_rate": 2e-05, + "loss": 0.03132774, + "step": 8252 + }, + { + "epoch": 16.506, + "grad_norm": 1.9119375944137573, + "learning_rate": 2e-05, + "loss": 0.04283144, + "step": 8253 + }, + { + "epoch": 16.508, + "grad_norm": 1.7922325134277344, + "learning_rate": 2e-05, + "loss": 0.04706988, + "step": 8254 + }, + { + "epoch": 16.51, + "grad_norm": 2.0722665786743164, + "learning_rate": 2e-05, + "loss": 0.04967377, + "step": 8255 + }, + { + "epoch": 16.512, + "grad_norm": 1.6956374645233154, + "learning_rate": 2e-05, + "loss": 0.03436553, + "step": 8256 + }, + { + "epoch": 16.514, + "grad_norm": 1.2904715538024902, + "learning_rate": 2e-05, + "loss": 0.03789183, + "step": 8257 + }, + { + "epoch": 16.516, + "grad_norm": 1.4763702154159546, + "learning_rate": 2e-05, + "loss": 0.04574104, + "step": 8258 + }, + { + "epoch": 16.518, + "grad_norm": 1.6451772451400757, + "learning_rate": 2e-05, + "loss": 0.05261879, + "step": 8259 + }, + { + "epoch": 16.52, + "grad_norm": 1.7381635904312134, + "learning_rate": 2e-05, + "loss": 0.03526191, + "step": 8260 + }, + { + "epoch": 16.522, + "grad_norm": 1.263259768486023, + "learning_rate": 2e-05, + "loss": 0.03281062, + "step": 8261 + }, + { + "epoch": 16.524, + "grad_norm": 1.220733642578125, + "learning_rate": 2e-05, + "loss": 0.03505442, + "step": 8262 + }, + { + "epoch": 16.526, + "grad_norm": 1.2341818809509277, + "learning_rate": 2e-05, + "loss": 0.04311585, + "step": 8263 + }, + { + "epoch": 16.528, + "grad_norm": 0.9565370678901672, + "learning_rate": 2e-05, + "loss": 0.02928256, + "step": 8264 + }, + { + "epoch": 16.53, + "grad_norm": 0.8718287944793701, + "learning_rate": 2e-05, + "loss": 0.02539262, + "step": 8265 + }, + { + "epoch": 16.532, + "grad_norm": 1.1354038715362549, + "learning_rate": 2e-05, + "loss": 0.03417098, + "step": 8266 + }, + { + "epoch": 16.534, + "grad_norm": 1.1788674592971802, + "learning_rate": 2e-05, + "loss": 0.03965496, + "step": 8267 + }, + { + "epoch": 16.536, + "grad_norm": 1.3101824522018433, + "learning_rate": 2e-05, + "loss": 0.03684475, + "step": 8268 + }, + { + "epoch": 16.538, + "grad_norm": 1.3392512798309326, + "learning_rate": 2e-05, + "loss": 0.04036241, + "step": 8269 + }, + { + "epoch": 16.54, + "grad_norm": 2.521176338195801, + "learning_rate": 2e-05, + "loss": 0.04675987, + "step": 8270 + }, + { + "epoch": 16.542, + "grad_norm": 1.11541748046875, + "learning_rate": 2e-05, + "loss": 0.03730694, + "step": 8271 + }, + { + "epoch": 16.544, + "grad_norm": 1.528278112411499, + "learning_rate": 2e-05, + "loss": 0.06005888, + "step": 8272 + }, + { + "epoch": 16.546, + "grad_norm": 1.2424397468566895, + "learning_rate": 2e-05, + "loss": 0.05591384, + "step": 8273 + }, + { + "epoch": 16.548000000000002, + "grad_norm": 1.0814480781555176, + "learning_rate": 2e-05, + "loss": 0.03427697, + "step": 8274 + }, + { + "epoch": 16.55, + "grad_norm": 2.6472551822662354, + "learning_rate": 2e-05, + "loss": 0.0521223, + "step": 8275 + }, + { + "epoch": 16.552, + "grad_norm": 1.3621563911437988, + "learning_rate": 2e-05, + "loss": 0.04933754, + "step": 8276 + }, + { + "epoch": 16.554, + "grad_norm": 2.2583024501800537, + "learning_rate": 2e-05, + "loss": 0.05284938, + "step": 8277 + }, + { + "epoch": 16.556, + "grad_norm": 1.0987781286239624, + "learning_rate": 2e-05, + 
"loss": 0.02926936, + "step": 8278 + }, + { + "epoch": 16.558, + "grad_norm": 1.283735752105713, + "learning_rate": 2e-05, + "loss": 0.03353404, + "step": 8279 + }, + { + "epoch": 16.56, + "grad_norm": 1.0234546661376953, + "learning_rate": 2e-05, + "loss": 0.03110966, + "step": 8280 + }, + { + "epoch": 16.562, + "grad_norm": 1.325693130493164, + "learning_rate": 2e-05, + "loss": 0.04735526, + "step": 8281 + }, + { + "epoch": 16.564, + "grad_norm": 1.5526974201202393, + "learning_rate": 2e-05, + "loss": 0.04662727, + "step": 8282 + }, + { + "epoch": 16.566, + "grad_norm": 2.4452273845672607, + "learning_rate": 2e-05, + "loss": 0.03830102, + "step": 8283 + }, + { + "epoch": 16.568, + "grad_norm": 1.2660468816757202, + "learning_rate": 2e-05, + "loss": 0.04160655, + "step": 8284 + }, + { + "epoch": 16.57, + "grad_norm": 1.1161909103393555, + "learning_rate": 2e-05, + "loss": 0.04231364, + "step": 8285 + }, + { + "epoch": 16.572, + "grad_norm": 1.012315273284912, + "learning_rate": 2e-05, + "loss": 0.03184524, + "step": 8286 + }, + { + "epoch": 16.574, + "grad_norm": 1.1147305965423584, + "learning_rate": 2e-05, + "loss": 0.03896227, + "step": 8287 + }, + { + "epoch": 16.576, + "grad_norm": 1.0445970296859741, + "learning_rate": 2e-05, + "loss": 0.03200065, + "step": 8288 + }, + { + "epoch": 16.578, + "grad_norm": 1.7669461965560913, + "learning_rate": 2e-05, + "loss": 0.04407949, + "step": 8289 + }, + { + "epoch": 16.58, + "grad_norm": 1.550229549407959, + "learning_rate": 2e-05, + "loss": 0.02869645, + "step": 8290 + }, + { + "epoch": 16.582, + "grad_norm": 1.0623831748962402, + "learning_rate": 2e-05, + "loss": 0.0305838, + "step": 8291 + }, + { + "epoch": 16.584, + "grad_norm": 1.5914502143859863, + "learning_rate": 2e-05, + "loss": 0.04625592, + "step": 8292 + }, + { + "epoch": 16.586, + "grad_norm": 1.5053743124008179, + "learning_rate": 2e-05, + "loss": 0.03940097, + "step": 8293 + }, + { + "epoch": 16.588, + "grad_norm": 1.9061319828033447, + "learning_rate": 2e-05, + "loss": 0.05159453, + "step": 8294 + }, + { + "epoch": 16.59, + "grad_norm": 1.4841290712356567, + "learning_rate": 2e-05, + "loss": 0.05897345, + "step": 8295 + }, + { + "epoch": 16.592, + "grad_norm": 2.136908769607544, + "learning_rate": 2e-05, + "loss": 0.05323712, + "step": 8296 + }, + { + "epoch": 16.594, + "grad_norm": 1.3379402160644531, + "learning_rate": 2e-05, + "loss": 0.03460371, + "step": 8297 + }, + { + "epoch": 16.596, + "grad_norm": 1.020004391670227, + "learning_rate": 2e-05, + "loss": 0.03321714, + "step": 8298 + }, + { + "epoch": 16.598, + "grad_norm": 2.1319894790649414, + "learning_rate": 2e-05, + "loss": 0.04367604, + "step": 8299 + }, + { + "epoch": 16.6, + "grad_norm": 1.1898785829544067, + "learning_rate": 2e-05, + "loss": 0.03668055, + "step": 8300 + }, + { + "epoch": 16.602, + "grad_norm": 1.096915364265442, + "learning_rate": 2e-05, + "loss": 0.03318947, + "step": 8301 + }, + { + "epoch": 16.604, + "grad_norm": 1.0413960218429565, + "learning_rate": 2e-05, + "loss": 0.03442251, + "step": 8302 + }, + { + "epoch": 16.606, + "grad_norm": 1.2012441158294678, + "learning_rate": 2e-05, + "loss": 0.02705516, + "step": 8303 + }, + { + "epoch": 16.608, + "grad_norm": 1.2894916534423828, + "learning_rate": 2e-05, + "loss": 0.03271394, + "step": 8304 + }, + { + "epoch": 16.61, + "grad_norm": 1.2636003494262695, + "learning_rate": 2e-05, + "loss": 0.04137167, + "step": 8305 + }, + { + "epoch": 16.612, + "grad_norm": 1.2958347797393799, + "learning_rate": 2e-05, + "loss": 0.04770758, + "step": 8306 + }, + 
{ + "epoch": 16.614, + "grad_norm": 2.002460479736328, + "learning_rate": 2e-05, + "loss": 0.0475505, + "step": 8307 + }, + { + "epoch": 16.616, + "grad_norm": 0.9963358640670776, + "learning_rate": 2e-05, + "loss": 0.03287155, + "step": 8308 + }, + { + "epoch": 16.618, + "grad_norm": 1.1301119327545166, + "learning_rate": 2e-05, + "loss": 0.04497526, + "step": 8309 + }, + { + "epoch": 16.62, + "grad_norm": 1.0189284086227417, + "learning_rate": 2e-05, + "loss": 0.0293178, + "step": 8310 + }, + { + "epoch": 16.622, + "grad_norm": 0.8906993269920349, + "learning_rate": 2e-05, + "loss": 0.01907645, + "step": 8311 + }, + { + "epoch": 16.624, + "grad_norm": 1.2180598974227905, + "learning_rate": 2e-05, + "loss": 0.03251302, + "step": 8312 + }, + { + "epoch": 16.626, + "grad_norm": 1.7638144493103027, + "learning_rate": 2e-05, + "loss": 0.03417107, + "step": 8313 + }, + { + "epoch": 16.628, + "grad_norm": 1.3163055181503296, + "learning_rate": 2e-05, + "loss": 0.03712931, + "step": 8314 + }, + { + "epoch": 16.63, + "grad_norm": 0.903484046459198, + "learning_rate": 2e-05, + "loss": 0.02302654, + "step": 8315 + }, + { + "epoch": 16.632, + "grad_norm": 2.890394926071167, + "learning_rate": 2e-05, + "loss": 0.05785254, + "step": 8316 + }, + { + "epoch": 16.634, + "grad_norm": 1.6316351890563965, + "learning_rate": 2e-05, + "loss": 0.05490664, + "step": 8317 + }, + { + "epoch": 16.636, + "grad_norm": 1.3253639936447144, + "learning_rate": 2e-05, + "loss": 0.03378769, + "step": 8318 + }, + { + "epoch": 16.638, + "grad_norm": 0.8841352462768555, + "learning_rate": 2e-05, + "loss": 0.02580001, + "step": 8319 + }, + { + "epoch": 16.64, + "grad_norm": 1.5405997037887573, + "learning_rate": 2e-05, + "loss": 0.0418197, + "step": 8320 + }, + { + "epoch": 16.642, + "grad_norm": 1.6014611721038818, + "learning_rate": 2e-05, + "loss": 0.05297592, + "step": 8321 + }, + { + "epoch": 16.644, + "grad_norm": 0.9699270129203796, + "learning_rate": 2e-05, + "loss": 0.03074204, + "step": 8322 + }, + { + "epoch": 16.646, + "grad_norm": 1.0691285133361816, + "learning_rate": 2e-05, + "loss": 0.02828666, + "step": 8323 + }, + { + "epoch": 16.648, + "grad_norm": 1.4978876113891602, + "learning_rate": 2e-05, + "loss": 0.04264086, + "step": 8324 + }, + { + "epoch": 16.65, + "grad_norm": 1.067944884300232, + "learning_rate": 2e-05, + "loss": 0.04064353, + "step": 8325 + }, + { + "epoch": 16.652, + "grad_norm": 1.609089732170105, + "learning_rate": 2e-05, + "loss": 0.03844542, + "step": 8326 + }, + { + "epoch": 16.654, + "grad_norm": 1.3730612993240356, + "learning_rate": 2e-05, + "loss": 0.03860483, + "step": 8327 + }, + { + "epoch": 16.656, + "grad_norm": 0.989010214805603, + "learning_rate": 2e-05, + "loss": 0.02395894, + "step": 8328 + }, + { + "epoch": 16.658, + "grad_norm": 0.89674311876297, + "learning_rate": 2e-05, + "loss": 0.02376273, + "step": 8329 + }, + { + "epoch": 16.66, + "grad_norm": 1.1137092113494873, + "learning_rate": 2e-05, + "loss": 0.01789567, + "step": 8330 + }, + { + "epoch": 16.662, + "grad_norm": 1.0320215225219727, + "learning_rate": 2e-05, + "loss": 0.0323988, + "step": 8331 + }, + { + "epoch": 16.664, + "grad_norm": 2.489551067352295, + "learning_rate": 2e-05, + "loss": 0.05527273, + "step": 8332 + }, + { + "epoch": 16.666, + "grad_norm": 1.2354681491851807, + "learning_rate": 2e-05, + "loss": 0.03027255, + "step": 8333 + }, + { + "epoch": 16.668, + "grad_norm": 1.5036636590957642, + "learning_rate": 2e-05, + "loss": 0.03653241, + "step": 8334 + }, + { + "epoch": 16.67, + "grad_norm": 
1.5149270296096802, + "learning_rate": 2e-05, + "loss": 0.04814313, + "step": 8335 + }, + { + "epoch": 16.672, + "grad_norm": 1.3045963048934937, + "learning_rate": 2e-05, + "loss": 0.03031799, + "step": 8336 + }, + { + "epoch": 16.674, + "grad_norm": 1.2806516885757446, + "learning_rate": 2e-05, + "loss": 0.04692192, + "step": 8337 + }, + { + "epoch": 16.676, + "grad_norm": 1.1512733697891235, + "learning_rate": 2e-05, + "loss": 0.04004252, + "step": 8338 + }, + { + "epoch": 16.678, + "grad_norm": 3.1383249759674072, + "learning_rate": 2e-05, + "loss": 0.0324553, + "step": 8339 + }, + { + "epoch": 16.68, + "grad_norm": 1.2203539609909058, + "learning_rate": 2e-05, + "loss": 0.03611985, + "step": 8340 + }, + { + "epoch": 16.682, + "grad_norm": 1.2353651523590088, + "learning_rate": 2e-05, + "loss": 0.03453249, + "step": 8341 + }, + { + "epoch": 16.684, + "grad_norm": 1.2120214700698853, + "learning_rate": 2e-05, + "loss": 0.04656072, + "step": 8342 + }, + { + "epoch": 16.686, + "grad_norm": 1.1942620277404785, + "learning_rate": 2e-05, + "loss": 0.04031057, + "step": 8343 + }, + { + "epoch": 16.688, + "grad_norm": 1.174004077911377, + "learning_rate": 2e-05, + "loss": 0.03165752, + "step": 8344 + }, + { + "epoch": 16.69, + "grad_norm": 2.1396734714508057, + "learning_rate": 2e-05, + "loss": 0.04839662, + "step": 8345 + }, + { + "epoch": 16.692, + "grad_norm": 1.1642301082611084, + "learning_rate": 2e-05, + "loss": 0.0288971, + "step": 8346 + }, + { + "epoch": 16.694, + "grad_norm": 1.6050286293029785, + "learning_rate": 2e-05, + "loss": 0.03548468, + "step": 8347 + }, + { + "epoch": 16.696, + "grad_norm": 1.416585922241211, + "learning_rate": 2e-05, + "loss": 0.03986949, + "step": 8348 + }, + { + "epoch": 16.698, + "grad_norm": 1.3496794700622559, + "learning_rate": 2e-05, + "loss": 0.04378842, + "step": 8349 + }, + { + "epoch": 16.7, + "grad_norm": 3.526726722717285, + "learning_rate": 2e-05, + "loss": 0.04224211, + "step": 8350 + }, + { + "epoch": 16.701999999999998, + "grad_norm": 2.085320234298706, + "learning_rate": 2e-05, + "loss": 0.02257501, + "step": 8351 + }, + { + "epoch": 16.704, + "grad_norm": 1.418104648590088, + "learning_rate": 2e-05, + "loss": 0.03896306, + "step": 8352 + }, + { + "epoch": 16.706, + "grad_norm": 1.8346154689788818, + "learning_rate": 2e-05, + "loss": 0.04356946, + "step": 8353 + }, + { + "epoch": 16.708, + "grad_norm": 1.2784098386764526, + "learning_rate": 2e-05, + "loss": 0.04127508, + "step": 8354 + }, + { + "epoch": 16.71, + "grad_norm": 1.3677000999450684, + "learning_rate": 2e-05, + "loss": 0.03775769, + "step": 8355 + }, + { + "epoch": 16.712, + "grad_norm": 1.7542495727539062, + "learning_rate": 2e-05, + "loss": 0.05272632, + "step": 8356 + }, + { + "epoch": 16.714, + "grad_norm": 1.9329322576522827, + "learning_rate": 2e-05, + "loss": 0.041315, + "step": 8357 + }, + { + "epoch": 16.716, + "grad_norm": 1.4365530014038086, + "learning_rate": 2e-05, + "loss": 0.03562573, + "step": 8358 + }, + { + "epoch": 16.718, + "grad_norm": 1.7461820840835571, + "learning_rate": 2e-05, + "loss": 0.04044671, + "step": 8359 + }, + { + "epoch": 16.72, + "grad_norm": 5.9637556076049805, + "learning_rate": 2e-05, + "loss": 0.04565287, + "step": 8360 + }, + { + "epoch": 16.722, + "grad_norm": 1.428763747215271, + "learning_rate": 2e-05, + "loss": 0.03354582, + "step": 8361 + }, + { + "epoch": 16.724, + "grad_norm": 1.4038552045822144, + "learning_rate": 2e-05, + "loss": 0.0424789, + "step": 8362 + }, + { + "epoch": 16.726, + "grad_norm": 1.2637503147125244, + 
"learning_rate": 2e-05, + "loss": 0.03532393, + "step": 8363 + }, + { + "epoch": 16.728, + "grad_norm": 1.817488670349121, + "learning_rate": 2e-05, + "loss": 0.04140956, + "step": 8364 + }, + { + "epoch": 16.73, + "grad_norm": 1.4110076427459717, + "learning_rate": 2e-05, + "loss": 0.03238542, + "step": 8365 + }, + { + "epoch": 16.732, + "grad_norm": 1.412273645401001, + "learning_rate": 2e-05, + "loss": 0.0464592, + "step": 8366 + }, + { + "epoch": 16.734, + "grad_norm": 1.112044334411621, + "learning_rate": 2e-05, + "loss": 0.0404929, + "step": 8367 + }, + { + "epoch": 16.736, + "grad_norm": 1.867051362991333, + "learning_rate": 2e-05, + "loss": 0.06586152, + "step": 8368 + }, + { + "epoch": 16.738, + "grad_norm": 6.789832592010498, + "learning_rate": 2e-05, + "loss": 0.03430699, + "step": 8369 + }, + { + "epoch": 16.74, + "grad_norm": 1.4201772212982178, + "learning_rate": 2e-05, + "loss": 0.03015824, + "step": 8370 + }, + { + "epoch": 16.742, + "grad_norm": 1.1724557876586914, + "learning_rate": 2e-05, + "loss": 0.03560322, + "step": 8371 + }, + { + "epoch": 16.744, + "grad_norm": 1.1797523498535156, + "learning_rate": 2e-05, + "loss": 0.02636663, + "step": 8372 + }, + { + "epoch": 16.746, + "grad_norm": 1.3166332244873047, + "learning_rate": 2e-05, + "loss": 0.03184573, + "step": 8373 + }, + { + "epoch": 16.748, + "grad_norm": 1.034335970878601, + "learning_rate": 2e-05, + "loss": 0.02896601, + "step": 8374 + }, + { + "epoch": 16.75, + "grad_norm": 1.0973848104476929, + "learning_rate": 2e-05, + "loss": 0.03897912, + "step": 8375 + }, + { + "epoch": 16.752, + "grad_norm": 1.1487135887145996, + "learning_rate": 2e-05, + "loss": 0.03742709, + "step": 8376 + }, + { + "epoch": 16.754, + "grad_norm": 1.0410828590393066, + "learning_rate": 2e-05, + "loss": 0.03710461, + "step": 8377 + }, + { + "epoch": 16.756, + "grad_norm": 1.2444871664047241, + "learning_rate": 2e-05, + "loss": 0.03904767, + "step": 8378 + }, + { + "epoch": 16.758, + "grad_norm": 1.5975366830825806, + "learning_rate": 2e-05, + "loss": 0.04068123, + "step": 8379 + }, + { + "epoch": 16.76, + "grad_norm": 1.1661255359649658, + "learning_rate": 2e-05, + "loss": 0.02516171, + "step": 8380 + }, + { + "epoch": 16.762, + "grad_norm": 2.5861120223999023, + "learning_rate": 2e-05, + "loss": 0.04131169, + "step": 8381 + }, + { + "epoch": 16.764, + "grad_norm": 2.497673749923706, + "learning_rate": 2e-05, + "loss": 0.04161269, + "step": 8382 + }, + { + "epoch": 16.766, + "grad_norm": 0.9345721006393433, + "learning_rate": 2e-05, + "loss": 0.02630493, + "step": 8383 + }, + { + "epoch": 16.768, + "grad_norm": 1.4174412488937378, + "learning_rate": 2e-05, + "loss": 0.03265426, + "step": 8384 + }, + { + "epoch": 16.77, + "grad_norm": 1.2070684432983398, + "learning_rate": 2e-05, + "loss": 0.03450275, + "step": 8385 + }, + { + "epoch": 16.772, + "grad_norm": 1.3117280006408691, + "learning_rate": 2e-05, + "loss": 0.04178444, + "step": 8386 + }, + { + "epoch": 16.774, + "grad_norm": 1.5097566843032837, + "learning_rate": 2e-05, + "loss": 0.04680046, + "step": 8387 + }, + { + "epoch": 16.776, + "grad_norm": 1.4381500482559204, + "learning_rate": 2e-05, + "loss": 0.04752283, + "step": 8388 + }, + { + "epoch": 16.778, + "grad_norm": 1.1138615608215332, + "learning_rate": 2e-05, + "loss": 0.03561661, + "step": 8389 + }, + { + "epoch": 16.78, + "grad_norm": 1.2291978597640991, + "learning_rate": 2e-05, + "loss": 0.03155664, + "step": 8390 + }, + { + "epoch": 16.782, + "grad_norm": 1.3546652793884277, + "learning_rate": 2e-05, + "loss": 
0.03348469, + "step": 8391 + }, + { + "epoch": 16.784, + "grad_norm": 1.1350245475769043, + "learning_rate": 2e-05, + "loss": 0.04207388, + "step": 8392 + }, + { + "epoch": 16.786, + "grad_norm": 1.6995071172714233, + "learning_rate": 2e-05, + "loss": 0.05210853, + "step": 8393 + }, + { + "epoch": 16.788, + "grad_norm": 2.271347761154175, + "learning_rate": 2e-05, + "loss": 0.05933637, + "step": 8394 + }, + { + "epoch": 16.79, + "grad_norm": 0.8941410779953003, + "learning_rate": 2e-05, + "loss": 0.02386924, + "step": 8395 + }, + { + "epoch": 16.792, + "grad_norm": 1.6003661155700684, + "learning_rate": 2e-05, + "loss": 0.05291647, + "step": 8396 + }, + { + "epoch": 16.794, + "grad_norm": 1.3317174911499023, + "learning_rate": 2e-05, + "loss": 0.03087507, + "step": 8397 + }, + { + "epoch": 16.796, + "grad_norm": 0.9487977623939514, + "learning_rate": 2e-05, + "loss": 0.02690414, + "step": 8398 + }, + { + "epoch": 16.798000000000002, + "grad_norm": 1.009123682975769, + "learning_rate": 2e-05, + "loss": 0.02644867, + "step": 8399 + }, + { + "epoch": 16.8, + "grad_norm": 1.2167582511901855, + "learning_rate": 2e-05, + "loss": 0.03899329, + "step": 8400 + }, + { + "epoch": 16.802, + "grad_norm": 1.482376217842102, + "learning_rate": 2e-05, + "loss": 0.02873115, + "step": 8401 + }, + { + "epoch": 16.804, + "grad_norm": 1.1082066297531128, + "learning_rate": 2e-05, + "loss": 0.03039532, + "step": 8402 + }, + { + "epoch": 16.806, + "grad_norm": 1.2599750757217407, + "learning_rate": 2e-05, + "loss": 0.04293299, + "step": 8403 + }, + { + "epoch": 16.808, + "grad_norm": 0.9543814063072205, + "learning_rate": 2e-05, + "loss": 0.03469695, + "step": 8404 + }, + { + "epoch": 16.81, + "grad_norm": 1.0824414491653442, + "learning_rate": 2e-05, + "loss": 0.03982303, + "step": 8405 + }, + { + "epoch": 16.812, + "grad_norm": 1.099599838256836, + "learning_rate": 2e-05, + "loss": 0.02591001, + "step": 8406 + }, + { + "epoch": 16.814, + "grad_norm": 1.7804385423660278, + "learning_rate": 2e-05, + "loss": 0.0323559, + "step": 8407 + }, + { + "epoch": 16.816, + "grad_norm": 1.8087066411972046, + "learning_rate": 2e-05, + "loss": 0.04229058, + "step": 8408 + }, + { + "epoch": 16.818, + "grad_norm": 1.2992652654647827, + "learning_rate": 2e-05, + "loss": 0.0409187, + "step": 8409 + }, + { + "epoch": 16.82, + "grad_norm": 1.1120244264602661, + "learning_rate": 2e-05, + "loss": 0.03508178, + "step": 8410 + }, + { + "epoch": 16.822, + "grad_norm": 0.9184687733650208, + "learning_rate": 2e-05, + "loss": 0.02861979, + "step": 8411 + }, + { + "epoch": 16.824, + "grad_norm": 1.2980962991714478, + "learning_rate": 2e-05, + "loss": 0.02535828, + "step": 8412 + }, + { + "epoch": 16.826, + "grad_norm": 1.9372985363006592, + "learning_rate": 2e-05, + "loss": 0.03568355, + "step": 8413 + }, + { + "epoch": 16.828, + "grad_norm": 1.514122486114502, + "learning_rate": 2e-05, + "loss": 0.03849922, + "step": 8414 + }, + { + "epoch": 16.83, + "grad_norm": 1.285515308380127, + "learning_rate": 2e-05, + "loss": 0.03315242, + "step": 8415 + }, + { + "epoch": 16.832, + "grad_norm": 0.9878953099250793, + "learning_rate": 2e-05, + "loss": 0.02930378, + "step": 8416 + }, + { + "epoch": 16.834, + "grad_norm": 1.765859842300415, + "learning_rate": 2e-05, + "loss": 0.05877309, + "step": 8417 + }, + { + "epoch": 16.836, + "grad_norm": 1.3087562322616577, + "learning_rate": 2e-05, + "loss": 0.04466981, + "step": 8418 + }, + { + "epoch": 16.838, + "grad_norm": 3.0149006843566895, + "learning_rate": 2e-05, + "loss": 0.05240371, + "step": 8419 + 
}, + { + "epoch": 16.84, + "grad_norm": 1.1996604204177856, + "learning_rate": 2e-05, + "loss": 0.04400566, + "step": 8420 + }, + { + "epoch": 16.842, + "grad_norm": 1.4025583267211914, + "learning_rate": 2e-05, + "loss": 0.04432599, + "step": 8421 + }, + { + "epoch": 16.844, + "grad_norm": 2.9091427326202393, + "learning_rate": 2e-05, + "loss": 0.04849138, + "step": 8422 + }, + { + "epoch": 16.846, + "grad_norm": 2.77856707572937, + "learning_rate": 2e-05, + "loss": 0.05462526, + "step": 8423 + }, + { + "epoch": 16.848, + "grad_norm": 1.108500361442566, + "learning_rate": 2e-05, + "loss": 0.03238573, + "step": 8424 + }, + { + "epoch": 16.85, + "grad_norm": 1.0004764795303345, + "learning_rate": 2e-05, + "loss": 0.02566298, + "step": 8425 + }, + { + "epoch": 16.852, + "grad_norm": 6.166355133056641, + "learning_rate": 2e-05, + "loss": 0.05295902, + "step": 8426 + }, + { + "epoch": 16.854, + "grad_norm": 1.1335303783416748, + "learning_rate": 2e-05, + "loss": 0.0286171, + "step": 8427 + }, + { + "epoch": 16.856, + "grad_norm": 1.1951411962509155, + "learning_rate": 2e-05, + "loss": 0.03944296, + "step": 8428 + }, + { + "epoch": 16.858, + "grad_norm": 1.430793046951294, + "learning_rate": 2e-05, + "loss": 0.03612977, + "step": 8429 + }, + { + "epoch": 16.86, + "grad_norm": 1.3680472373962402, + "learning_rate": 2e-05, + "loss": 0.03450888, + "step": 8430 + }, + { + "epoch": 16.862, + "grad_norm": 1.0445443391799927, + "learning_rate": 2e-05, + "loss": 0.03357092, + "step": 8431 + }, + { + "epoch": 16.864, + "grad_norm": 1.1048623323440552, + "learning_rate": 2e-05, + "loss": 0.02782473, + "step": 8432 + }, + { + "epoch": 16.866, + "grad_norm": 1.264748215675354, + "learning_rate": 2e-05, + "loss": 0.05136178, + "step": 8433 + }, + { + "epoch": 16.868, + "grad_norm": 3.1316561698913574, + "learning_rate": 2e-05, + "loss": 0.0381931, + "step": 8434 + }, + { + "epoch": 16.87, + "grad_norm": 0.8829813599586487, + "learning_rate": 2e-05, + "loss": 0.02602118, + "step": 8435 + }, + { + "epoch": 16.872, + "grad_norm": 1.240877389907837, + "learning_rate": 2e-05, + "loss": 0.0411177, + "step": 8436 + }, + { + "epoch": 16.874, + "grad_norm": 0.9366046786308289, + "learning_rate": 2e-05, + "loss": 0.02752039, + "step": 8437 + }, + { + "epoch": 16.876, + "grad_norm": 1.1578431129455566, + "learning_rate": 2e-05, + "loss": 0.03548286, + "step": 8438 + }, + { + "epoch": 16.878, + "grad_norm": 1.1822301149368286, + "learning_rate": 2e-05, + "loss": 0.03064628, + "step": 8439 + }, + { + "epoch": 16.88, + "grad_norm": 2.111692428588867, + "learning_rate": 2e-05, + "loss": 0.05242971, + "step": 8440 + }, + { + "epoch": 16.882, + "grad_norm": 1.7009152173995972, + "learning_rate": 2e-05, + "loss": 0.03527777, + "step": 8441 + }, + { + "epoch": 16.884, + "grad_norm": 1.7795860767364502, + "learning_rate": 2e-05, + "loss": 0.04584951, + "step": 8442 + }, + { + "epoch": 16.886, + "grad_norm": 1.7482233047485352, + "learning_rate": 2e-05, + "loss": 0.04331917, + "step": 8443 + }, + { + "epoch": 16.888, + "grad_norm": 1.1555484533309937, + "learning_rate": 2e-05, + "loss": 0.03320149, + "step": 8444 + }, + { + "epoch": 16.89, + "grad_norm": 1.5613912343978882, + "learning_rate": 2e-05, + "loss": 0.03231899, + "step": 8445 + }, + { + "epoch": 16.892, + "grad_norm": 1.4290865659713745, + "learning_rate": 2e-05, + "loss": 0.03894955, + "step": 8446 + }, + { + "epoch": 16.894, + "grad_norm": 1.1222296953201294, + "learning_rate": 2e-05, + "loss": 0.03836842, + "step": 8447 + }, + { + "epoch": 16.896, + "grad_norm": 
2.30586838722229, + "learning_rate": 2e-05, + "loss": 0.04976833, + "step": 8448 + }, + { + "epoch": 16.898, + "grad_norm": 1.0983035564422607, + "learning_rate": 2e-05, + "loss": 0.03981961, + "step": 8449 + }, + { + "epoch": 16.9, + "grad_norm": 1.6317425966262817, + "learning_rate": 2e-05, + "loss": 0.05492754, + "step": 8450 + }, + { + "epoch": 16.902, + "grad_norm": 1.9192370176315308, + "learning_rate": 2e-05, + "loss": 0.03952098, + "step": 8451 + }, + { + "epoch": 16.904, + "grad_norm": 1.1875005960464478, + "learning_rate": 2e-05, + "loss": 0.03277217, + "step": 8452 + }, + { + "epoch": 16.906, + "grad_norm": 1.4212887287139893, + "learning_rate": 2e-05, + "loss": 0.02984127, + "step": 8453 + }, + { + "epoch": 16.908, + "grad_norm": 1.5091922283172607, + "learning_rate": 2e-05, + "loss": 0.03316611, + "step": 8454 + }, + { + "epoch": 16.91, + "grad_norm": 2.024031400680542, + "learning_rate": 2e-05, + "loss": 0.04787176, + "step": 8455 + }, + { + "epoch": 16.912, + "grad_norm": 2.3478596210479736, + "learning_rate": 2e-05, + "loss": 0.0493004, + "step": 8456 + }, + { + "epoch": 16.914, + "grad_norm": 1.2963180541992188, + "learning_rate": 2e-05, + "loss": 0.02935142, + "step": 8457 + }, + { + "epoch": 16.916, + "grad_norm": 1.0010120868682861, + "learning_rate": 2e-05, + "loss": 0.02706702, + "step": 8458 + }, + { + "epoch": 16.918, + "grad_norm": 1.579359531402588, + "learning_rate": 2e-05, + "loss": 0.03226943, + "step": 8459 + }, + { + "epoch": 16.92, + "grad_norm": 1.0996006727218628, + "learning_rate": 2e-05, + "loss": 0.03097832, + "step": 8460 + }, + { + "epoch": 16.922, + "grad_norm": 1.2066538333892822, + "learning_rate": 2e-05, + "loss": 0.03681121, + "step": 8461 + }, + { + "epoch": 16.924, + "grad_norm": 1.255723476409912, + "learning_rate": 2e-05, + "loss": 0.03210659, + "step": 8462 + }, + { + "epoch": 16.926, + "grad_norm": 1.2862356901168823, + "learning_rate": 2e-05, + "loss": 0.03344401, + "step": 8463 + }, + { + "epoch": 16.928, + "grad_norm": 1.046955943107605, + "learning_rate": 2e-05, + "loss": 0.02581613, + "step": 8464 + }, + { + "epoch": 16.93, + "grad_norm": 1.5476090908050537, + "learning_rate": 2e-05, + "loss": 0.04508192, + "step": 8465 + }, + { + "epoch": 16.932, + "grad_norm": 1.1604325771331787, + "learning_rate": 2e-05, + "loss": 0.03390781, + "step": 8466 + }, + { + "epoch": 16.934, + "grad_norm": 1.613224983215332, + "learning_rate": 2e-05, + "loss": 0.05440042, + "step": 8467 + }, + { + "epoch": 16.936, + "grad_norm": 1.0437114238739014, + "learning_rate": 2e-05, + "loss": 0.03564305, + "step": 8468 + }, + { + "epoch": 16.938, + "grad_norm": 1.7123421430587769, + "learning_rate": 2e-05, + "loss": 0.03162618, + "step": 8469 + }, + { + "epoch": 16.94, + "grad_norm": 1.2123863697052002, + "learning_rate": 2e-05, + "loss": 0.03680079, + "step": 8470 + }, + { + "epoch": 16.942, + "grad_norm": 1.7789182662963867, + "learning_rate": 2e-05, + "loss": 0.05218393, + "step": 8471 + }, + { + "epoch": 16.944, + "grad_norm": 1.4410932064056396, + "learning_rate": 2e-05, + "loss": 0.03325985, + "step": 8472 + }, + { + "epoch": 16.946, + "grad_norm": 0.8621723055839539, + "learning_rate": 2e-05, + "loss": 0.0242934, + "step": 8473 + }, + { + "epoch": 16.948, + "grad_norm": 1.3893159627914429, + "learning_rate": 2e-05, + "loss": 0.05090608, + "step": 8474 + }, + { + "epoch": 16.95, + "grad_norm": 1.0170800685882568, + "learning_rate": 2e-05, + "loss": 0.03791419, + "step": 8475 + }, + { + "epoch": 16.951999999999998, + "grad_norm": 1.146407127380371, + 
"learning_rate": 2e-05, + "loss": 0.04361141, + "step": 8476 + }, + { + "epoch": 16.954, + "grad_norm": 1.6457027196884155, + "learning_rate": 2e-05, + "loss": 0.04759943, + "step": 8477 + }, + { + "epoch": 16.956, + "grad_norm": 1.1141819953918457, + "learning_rate": 2e-05, + "loss": 0.02985225, + "step": 8478 + }, + { + "epoch": 16.958, + "grad_norm": 1.0538251399993896, + "learning_rate": 2e-05, + "loss": 0.02504754, + "step": 8479 + }, + { + "epoch": 16.96, + "grad_norm": 1.3906612396240234, + "learning_rate": 2e-05, + "loss": 0.04034654, + "step": 8480 + }, + { + "epoch": 16.962, + "grad_norm": 1.228425145149231, + "learning_rate": 2e-05, + "loss": 0.05102978, + "step": 8481 + }, + { + "epoch": 16.964, + "grad_norm": 1.1037037372589111, + "learning_rate": 2e-05, + "loss": 0.02688233, + "step": 8482 + }, + { + "epoch": 16.966, + "grad_norm": 1.2161399126052856, + "learning_rate": 2e-05, + "loss": 0.03387916, + "step": 8483 + }, + { + "epoch": 16.968, + "grad_norm": 2.475914716720581, + "learning_rate": 2e-05, + "loss": 0.04503019, + "step": 8484 + }, + { + "epoch": 16.97, + "grad_norm": 2.1681854724884033, + "learning_rate": 2e-05, + "loss": 0.03996513, + "step": 8485 + }, + { + "epoch": 16.972, + "grad_norm": 2.7576403617858887, + "learning_rate": 2e-05, + "loss": 0.06164122, + "step": 8486 + }, + { + "epoch": 16.974, + "grad_norm": 1.2548010349273682, + "learning_rate": 2e-05, + "loss": 0.0354467, + "step": 8487 + }, + { + "epoch": 16.976, + "grad_norm": 1.4516505002975464, + "learning_rate": 2e-05, + "loss": 0.03556127, + "step": 8488 + }, + { + "epoch": 16.978, + "grad_norm": 2.353919744491577, + "learning_rate": 2e-05, + "loss": 0.03970582, + "step": 8489 + }, + { + "epoch": 16.98, + "grad_norm": 1.0244505405426025, + "learning_rate": 2e-05, + "loss": 0.03089332, + "step": 8490 + }, + { + "epoch": 16.982, + "grad_norm": 1.5581693649291992, + "learning_rate": 2e-05, + "loss": 0.03672716, + "step": 8491 + }, + { + "epoch": 16.984, + "grad_norm": 1.7653076648712158, + "learning_rate": 2e-05, + "loss": 0.04039037, + "step": 8492 + }, + { + "epoch": 16.986, + "grad_norm": 1.0333586931228638, + "learning_rate": 2e-05, + "loss": 0.03089204, + "step": 8493 + }, + { + "epoch": 16.988, + "grad_norm": 1.0320618152618408, + "learning_rate": 2e-05, + "loss": 0.0225993, + "step": 8494 + }, + { + "epoch": 16.99, + "grad_norm": 1.119234561920166, + "learning_rate": 2e-05, + "loss": 0.03324748, + "step": 8495 + }, + { + "epoch": 16.992, + "grad_norm": 1.9788683652877808, + "learning_rate": 2e-05, + "loss": 0.03121308, + "step": 8496 + }, + { + "epoch": 16.994, + "grad_norm": 1.3806347846984863, + "learning_rate": 2e-05, + "loss": 0.04654794, + "step": 8497 + }, + { + "epoch": 16.996, + "grad_norm": 1.1207362413406372, + "learning_rate": 2e-05, + "loss": 0.04140043, + "step": 8498 + }, + { + "epoch": 16.998, + "grad_norm": 1.039371371269226, + "learning_rate": 2e-05, + "loss": 0.03301085, + "step": 8499 + }, + { + "epoch": 17.0, + "grad_norm": 2.4387052059173584, + "learning_rate": 2e-05, + "loss": 0.04641984, + "step": 8500 + }, + { + "epoch": 17.0, + "eval_performance": { + "AngleClassification_1": 0.998, + "AngleClassification_2": 1.0, + "AngleClassification_3": 0.9760479041916168, + "Equal_1": 0.996, + "Equal_2": 0.9680638722554891, + "Equal_3": 0.9181636726546906, + "LineComparison_1": 1.0, + "LineComparison_2": 0.998003992015968, + "LineComparison_3": 0.9940119760479041, + "Parallel_1": 0.9939879759519038, + "Parallel_2": 0.9939879759519038, + "Parallel_3": 0.99, + "Perpendicular_1": 0.998, + 
"Perpendicular_2": 0.986, + "Perpendicular_3": 0.7755511022044088, + "PointLiesOnCircle_1": 0.9979959919839679, + "PointLiesOnCircle_2": 0.9936666666666667, + "PointLiesOnCircle_3": 0.99, + "PointLiesOnLine_1": 0.9959919839679359, + "PointLiesOnLine_2": 0.9919839679358717, + "PointLiesOnLine_3": 0.9820359281437125 + }, + "eval_runtime": 320.1697, + "eval_samples_per_second": 32.795, + "eval_steps_per_second": 0.656, + "step": 8500 + }, + { + "epoch": 17.002, + "grad_norm": 1.4980146884918213, + "learning_rate": 2e-05, + "loss": 0.04539901, + "step": 8501 + }, + { + "epoch": 17.004, + "grad_norm": 1.0780891180038452, + "learning_rate": 2e-05, + "loss": 0.03377684, + "step": 8502 + }, + { + "epoch": 17.006, + "grad_norm": 1.0425775051116943, + "learning_rate": 2e-05, + "loss": 0.02465243, + "step": 8503 + }, + { + "epoch": 17.008, + "grad_norm": 1.0172406435012817, + "learning_rate": 2e-05, + "loss": 0.02660679, + "step": 8504 + }, + { + "epoch": 17.01, + "grad_norm": 1.2166515588760376, + "learning_rate": 2e-05, + "loss": 0.04529484, + "step": 8505 + }, + { + "epoch": 17.012, + "grad_norm": 1.4045565128326416, + "learning_rate": 2e-05, + "loss": 0.04070731, + "step": 8506 + }, + { + "epoch": 17.014, + "grad_norm": 1.53168785572052, + "learning_rate": 2e-05, + "loss": 0.04683904, + "step": 8507 + }, + { + "epoch": 17.016, + "grad_norm": 1.4374054670333862, + "learning_rate": 2e-05, + "loss": 0.05705344, + "step": 8508 + }, + { + "epoch": 17.018, + "grad_norm": 1.0057858228683472, + "learning_rate": 2e-05, + "loss": 0.03421285, + "step": 8509 + }, + { + "epoch": 17.02, + "grad_norm": 1.1364305019378662, + "learning_rate": 2e-05, + "loss": 0.03909264, + "step": 8510 + }, + { + "epoch": 17.022, + "grad_norm": 1.0018776655197144, + "learning_rate": 2e-05, + "loss": 0.03444162, + "step": 8511 + }, + { + "epoch": 17.024, + "grad_norm": 1.4022345542907715, + "learning_rate": 2e-05, + "loss": 0.05768448, + "step": 8512 + }, + { + "epoch": 17.026, + "grad_norm": 2.5111844539642334, + "learning_rate": 2e-05, + "loss": 0.0339235, + "step": 8513 + }, + { + "epoch": 17.028, + "grad_norm": 1.921712040901184, + "learning_rate": 2e-05, + "loss": 0.03785212, + "step": 8514 + }, + { + "epoch": 17.03, + "grad_norm": 1.1148395538330078, + "learning_rate": 2e-05, + "loss": 0.02421642, + "step": 8515 + }, + { + "epoch": 17.032, + "grad_norm": 1.0764633417129517, + "learning_rate": 2e-05, + "loss": 0.03142115, + "step": 8516 + }, + { + "epoch": 17.034, + "grad_norm": 1.4113231897354126, + "learning_rate": 2e-05, + "loss": 0.03779317, + "step": 8517 + }, + { + "epoch": 17.036, + "grad_norm": 1.0931544303894043, + "learning_rate": 2e-05, + "loss": 0.02576648, + "step": 8518 + }, + { + "epoch": 17.038, + "grad_norm": 1.8094255924224854, + "learning_rate": 2e-05, + "loss": 0.04606591, + "step": 8519 + }, + { + "epoch": 17.04, + "grad_norm": 1.4644229412078857, + "learning_rate": 2e-05, + "loss": 0.043492, + "step": 8520 + }, + { + "epoch": 17.042, + "grad_norm": 1.4266523122787476, + "learning_rate": 2e-05, + "loss": 0.03921729, + "step": 8521 + }, + { + "epoch": 17.044, + "grad_norm": 1.3539372682571411, + "learning_rate": 2e-05, + "loss": 0.03920222, + "step": 8522 + }, + { + "epoch": 17.046, + "grad_norm": 1.028294563293457, + "learning_rate": 2e-05, + "loss": 0.03639862, + "step": 8523 + }, + { + "epoch": 17.048, + "grad_norm": 1.2716553211212158, + "learning_rate": 2e-05, + "loss": 0.03918849, + "step": 8524 + }, + { + "epoch": 17.05, + "grad_norm": 0.8702855706214905, + "learning_rate": 2e-05, + "loss": 
0.02787583, + "step": 8525 + }, + { + "epoch": 17.052, + "grad_norm": 1.2607735395431519, + "learning_rate": 2e-05, + "loss": 0.04798802, + "step": 8526 + }, + { + "epoch": 17.054, + "grad_norm": 1.8052009344100952, + "learning_rate": 2e-05, + "loss": 0.04934546, + "step": 8527 + }, + { + "epoch": 17.056, + "grad_norm": 1.3691691160202026, + "learning_rate": 2e-05, + "loss": 0.02806879, + "step": 8528 + }, + { + "epoch": 17.058, + "grad_norm": 2.2015974521636963, + "learning_rate": 2e-05, + "loss": 0.03393783, + "step": 8529 + }, + { + "epoch": 17.06, + "grad_norm": 1.7185814380645752, + "learning_rate": 2e-05, + "loss": 0.04044708, + "step": 8530 + }, + { + "epoch": 17.062, + "grad_norm": 1.6913961172103882, + "learning_rate": 2e-05, + "loss": 0.04489831, + "step": 8531 + }, + { + "epoch": 17.064, + "grad_norm": 1.6114678382873535, + "learning_rate": 2e-05, + "loss": 0.06014886, + "step": 8532 + }, + { + "epoch": 17.066, + "grad_norm": 1.866416335105896, + "learning_rate": 2e-05, + "loss": 0.05753294, + "step": 8533 + }, + { + "epoch": 17.068, + "grad_norm": 2.823669910430908, + "learning_rate": 2e-05, + "loss": 0.0364582, + "step": 8534 + }, + { + "epoch": 17.07, + "grad_norm": 1.3200300931930542, + "learning_rate": 2e-05, + "loss": 0.02338468, + "step": 8535 + }, + { + "epoch": 17.072, + "grad_norm": 1.4321677684783936, + "learning_rate": 2e-05, + "loss": 0.03681827, + "step": 8536 + }, + { + "epoch": 17.074, + "grad_norm": 1.1837363243103027, + "learning_rate": 2e-05, + "loss": 0.03107641, + "step": 8537 + }, + { + "epoch": 17.076, + "grad_norm": 1.5008882284164429, + "learning_rate": 2e-05, + "loss": 0.04763446, + "step": 8538 + }, + { + "epoch": 17.078, + "grad_norm": 0.9194531440734863, + "learning_rate": 2e-05, + "loss": 0.03133226, + "step": 8539 + }, + { + "epoch": 17.08, + "grad_norm": 1.7025022506713867, + "learning_rate": 2e-05, + "loss": 0.0493839, + "step": 8540 + }, + { + "epoch": 17.082, + "grad_norm": 1.8602598905563354, + "learning_rate": 2e-05, + "loss": 0.0486247, + "step": 8541 + }, + { + "epoch": 17.084, + "grad_norm": 6.84015417098999, + "learning_rate": 2e-05, + "loss": 0.02411661, + "step": 8542 + }, + { + "epoch": 17.086, + "grad_norm": 1.2023508548736572, + "learning_rate": 2e-05, + "loss": 0.03706846, + "step": 8543 + }, + { + "epoch": 17.088, + "grad_norm": 1.4911589622497559, + "learning_rate": 2e-05, + "loss": 0.05094575, + "step": 8544 + }, + { + "epoch": 17.09, + "grad_norm": 0.8137642741203308, + "learning_rate": 2e-05, + "loss": 0.02076185, + "step": 8545 + }, + { + "epoch": 17.092, + "grad_norm": 1.6235312223434448, + "learning_rate": 2e-05, + "loss": 0.04601521, + "step": 8546 + }, + { + "epoch": 17.094, + "grad_norm": 2.2012600898742676, + "learning_rate": 2e-05, + "loss": 0.04425218, + "step": 8547 + }, + { + "epoch": 17.096, + "grad_norm": 1.039987564086914, + "learning_rate": 2e-05, + "loss": 0.02848525, + "step": 8548 + }, + { + "epoch": 17.098, + "grad_norm": 1.7109042406082153, + "learning_rate": 2e-05, + "loss": 0.06454369, + "step": 8549 + }, + { + "epoch": 17.1, + "grad_norm": 1.8760957717895508, + "learning_rate": 2e-05, + "loss": 0.02889769, + "step": 8550 + }, + { + "epoch": 17.102, + "grad_norm": 1.0814695358276367, + "learning_rate": 2e-05, + "loss": 0.0295868, + "step": 8551 + }, + { + "epoch": 17.104, + "grad_norm": 1.5878161191940308, + "learning_rate": 2e-05, + "loss": 0.03735021, + "step": 8552 + }, + { + "epoch": 17.106, + "grad_norm": 1.4773187637329102, + "learning_rate": 2e-05, + "loss": 0.04390514, + "step": 8553 + }, + { + 
"epoch": 17.108, + "grad_norm": 1.8776631355285645, + "learning_rate": 2e-05, + "loss": 0.04321947, + "step": 8554 + }, + { + "epoch": 17.11, + "grad_norm": 1.309813141822815, + "learning_rate": 2e-05, + "loss": 0.03866652, + "step": 8555 + }, + { + "epoch": 17.112, + "grad_norm": 0.9536056518554688, + "learning_rate": 2e-05, + "loss": 0.02671996, + "step": 8556 + }, + { + "epoch": 17.114, + "grad_norm": 1.3391567468643188, + "learning_rate": 2e-05, + "loss": 0.02907237, + "step": 8557 + }, + { + "epoch": 17.116, + "grad_norm": 1.0060744285583496, + "learning_rate": 2e-05, + "loss": 0.03439762, + "step": 8558 + }, + { + "epoch": 17.118, + "grad_norm": 0.9535341262817383, + "learning_rate": 2e-05, + "loss": 0.02899779, + "step": 8559 + }, + { + "epoch": 17.12, + "grad_norm": 0.7711392045021057, + "learning_rate": 2e-05, + "loss": 0.02512438, + "step": 8560 + }, + { + "epoch": 17.122, + "grad_norm": 1.0597705841064453, + "learning_rate": 2e-05, + "loss": 0.03412605, + "step": 8561 + }, + { + "epoch": 17.124, + "grad_norm": 1.8042422533035278, + "learning_rate": 2e-05, + "loss": 0.05712911, + "step": 8562 + }, + { + "epoch": 17.126, + "grad_norm": 1.9867520332336426, + "learning_rate": 2e-05, + "loss": 0.04097403, + "step": 8563 + }, + { + "epoch": 17.128, + "grad_norm": 1.8169209957122803, + "learning_rate": 2e-05, + "loss": 0.05447955, + "step": 8564 + }, + { + "epoch": 17.13, + "grad_norm": 1.8252878189086914, + "learning_rate": 2e-05, + "loss": 0.03339032, + "step": 8565 + }, + { + "epoch": 17.132, + "grad_norm": 2.2705142498016357, + "learning_rate": 2e-05, + "loss": 0.04022797, + "step": 8566 + }, + { + "epoch": 17.134, + "grad_norm": 1.9331085681915283, + "learning_rate": 2e-05, + "loss": 0.03970163, + "step": 8567 + }, + { + "epoch": 17.136, + "grad_norm": 1.5278258323669434, + "learning_rate": 2e-05, + "loss": 0.03794595, + "step": 8568 + }, + { + "epoch": 17.138, + "grad_norm": 2.026817560195923, + "learning_rate": 2e-05, + "loss": 0.0471566, + "step": 8569 + }, + { + "epoch": 17.14, + "grad_norm": 1.5275219678878784, + "learning_rate": 2e-05, + "loss": 0.04161655, + "step": 8570 + }, + { + "epoch": 17.142, + "grad_norm": 1.6281226873397827, + "learning_rate": 2e-05, + "loss": 0.03525121, + "step": 8571 + }, + { + "epoch": 17.144, + "grad_norm": 2.1644468307495117, + "learning_rate": 2e-05, + "loss": 0.05372618, + "step": 8572 + }, + { + "epoch": 17.146, + "grad_norm": 1.3852871656417847, + "learning_rate": 2e-05, + "loss": 0.04645745, + "step": 8573 + }, + { + "epoch": 17.148, + "grad_norm": 1.2143161296844482, + "learning_rate": 2e-05, + "loss": 0.02777028, + "step": 8574 + }, + { + "epoch": 17.15, + "grad_norm": 1.1853028535842896, + "learning_rate": 2e-05, + "loss": 0.03904351, + "step": 8575 + }, + { + "epoch": 17.152, + "grad_norm": 1.9432642459869385, + "learning_rate": 2e-05, + "loss": 0.04074139, + "step": 8576 + }, + { + "epoch": 17.154, + "grad_norm": 1.6851083040237427, + "learning_rate": 2e-05, + "loss": 0.05192745, + "step": 8577 + }, + { + "epoch": 17.156, + "grad_norm": 1.3564956188201904, + "learning_rate": 2e-05, + "loss": 0.03743386, + "step": 8578 + }, + { + "epoch": 17.158, + "grad_norm": 1.7471429109573364, + "learning_rate": 2e-05, + "loss": 0.0443368, + "step": 8579 + }, + { + "epoch": 17.16, + "grad_norm": 1.41841459274292, + "learning_rate": 2e-05, + "loss": 0.04000156, + "step": 8580 + }, + { + "epoch": 17.162, + "grad_norm": 0.9492871761322021, + "learning_rate": 2e-05, + "loss": 0.0318904, + "step": 8581 + }, + { + "epoch": 17.164, + "grad_norm": 
1.0863397121429443, + "learning_rate": 2e-05, + "loss": 0.02380545, + "step": 8582 + }, + { + "epoch": 17.166, + "grad_norm": 3.9473729133605957, + "learning_rate": 2e-05, + "loss": 0.03920302, + "step": 8583 + }, + { + "epoch": 17.168, + "grad_norm": 1.176370620727539, + "learning_rate": 2e-05, + "loss": 0.04776629, + "step": 8584 + }, + { + "epoch": 17.17, + "grad_norm": 1.3243099451065063, + "learning_rate": 2e-05, + "loss": 0.03770189, + "step": 8585 + }, + { + "epoch": 17.172, + "grad_norm": 1.1418389081954956, + "learning_rate": 2e-05, + "loss": 0.02544946, + "step": 8586 + }, + { + "epoch": 17.174, + "grad_norm": 1.9030910730361938, + "learning_rate": 2e-05, + "loss": 0.03700698, + "step": 8587 + }, + { + "epoch": 17.176, + "grad_norm": 1.3709733486175537, + "learning_rate": 2e-05, + "loss": 0.04343133, + "step": 8588 + }, + { + "epoch": 17.178, + "grad_norm": 1.4506902694702148, + "learning_rate": 2e-05, + "loss": 0.03628192, + "step": 8589 + }, + { + "epoch": 17.18, + "grad_norm": 1.3070485591888428, + "learning_rate": 2e-05, + "loss": 0.04146688, + "step": 8590 + }, + { + "epoch": 17.182, + "grad_norm": 1.234598994255066, + "learning_rate": 2e-05, + "loss": 0.03088631, + "step": 8591 + }, + { + "epoch": 17.184, + "grad_norm": 2.7125167846679688, + "learning_rate": 2e-05, + "loss": 0.03890502, + "step": 8592 + }, + { + "epoch": 17.186, + "grad_norm": 1.535182237625122, + "learning_rate": 2e-05, + "loss": 0.04280488, + "step": 8593 + }, + { + "epoch": 17.188, + "grad_norm": 1.0927315950393677, + "learning_rate": 2e-05, + "loss": 0.0398199, + "step": 8594 + }, + { + "epoch": 17.19, + "grad_norm": 2.3022119998931885, + "learning_rate": 2e-05, + "loss": 0.04995221, + "step": 8595 + }, + { + "epoch": 17.192, + "grad_norm": 1.2546242475509644, + "learning_rate": 2e-05, + "loss": 0.03015491, + "step": 8596 + }, + { + "epoch": 17.194, + "grad_norm": 1.2892979383468628, + "learning_rate": 2e-05, + "loss": 0.02786731, + "step": 8597 + }, + { + "epoch": 17.196, + "grad_norm": 1.2983757257461548, + "learning_rate": 2e-05, + "loss": 0.03557572, + "step": 8598 + }, + { + "epoch": 17.198, + "grad_norm": 1.1861157417297363, + "learning_rate": 2e-05, + "loss": 0.03920743, + "step": 8599 + }, + { + "epoch": 17.2, + "grad_norm": 1.5425999164581299, + "learning_rate": 2e-05, + "loss": 0.03938346, + "step": 8600 + }, + { + "epoch": 17.202, + "grad_norm": 1.861615538597107, + "learning_rate": 2e-05, + "loss": 0.03425018, + "step": 8601 + }, + { + "epoch": 17.204, + "grad_norm": 1.7203572988510132, + "learning_rate": 2e-05, + "loss": 0.04107557, + "step": 8602 + }, + { + "epoch": 17.206, + "grad_norm": 1.869676113128662, + "learning_rate": 2e-05, + "loss": 0.02804127, + "step": 8603 + }, + { + "epoch": 17.208, + "grad_norm": 1.1111414432525635, + "learning_rate": 2e-05, + "loss": 0.03163296, + "step": 8604 + }, + { + "epoch": 17.21, + "grad_norm": 1.1464769840240479, + "learning_rate": 2e-05, + "loss": 0.03462142, + "step": 8605 + }, + { + "epoch": 17.212, + "grad_norm": 1.6060582399368286, + "learning_rate": 2e-05, + "loss": 0.06490205, + "step": 8606 + }, + { + "epoch": 17.214, + "grad_norm": 1.082814335823059, + "learning_rate": 2e-05, + "loss": 0.02265838, + "step": 8607 + }, + { + "epoch": 17.216, + "grad_norm": 1.1353710889816284, + "learning_rate": 2e-05, + "loss": 0.03856842, + "step": 8608 + }, + { + "epoch": 17.218, + "grad_norm": 1.3957148790359497, + "learning_rate": 2e-05, + "loss": 0.03138808, + "step": 8609 + }, + { + "epoch": 17.22, + "grad_norm": 1.2920960187911987, + "learning_rate": 
2e-05, + "loss": 0.0430844, + "step": 8610 + }, + { + "epoch": 17.222, + "grad_norm": 1.2752081155776978, + "learning_rate": 2e-05, + "loss": 0.04843235, + "step": 8611 + }, + { + "epoch": 17.224, + "grad_norm": 1.4055542945861816, + "learning_rate": 2e-05, + "loss": 0.03793164, + "step": 8612 + }, + { + "epoch": 17.226, + "grad_norm": 1.3925482034683228, + "learning_rate": 2e-05, + "loss": 0.03343806, + "step": 8613 + }, + { + "epoch": 17.228, + "grad_norm": 1.2577686309814453, + "learning_rate": 2e-05, + "loss": 0.03357682, + "step": 8614 + }, + { + "epoch": 17.23, + "grad_norm": 1.5118682384490967, + "learning_rate": 2e-05, + "loss": 0.05048358, + "step": 8615 + }, + { + "epoch": 17.232, + "grad_norm": 1.094104528427124, + "learning_rate": 2e-05, + "loss": 0.02639047, + "step": 8616 + }, + { + "epoch": 17.234, + "grad_norm": 1.337664008140564, + "learning_rate": 2e-05, + "loss": 0.03451654, + "step": 8617 + }, + { + "epoch": 17.236, + "grad_norm": 1.4902300834655762, + "learning_rate": 2e-05, + "loss": 0.03678834, + "step": 8618 + }, + { + "epoch": 17.238, + "grad_norm": 0.9843270778656006, + "learning_rate": 2e-05, + "loss": 0.02527812, + "step": 8619 + }, + { + "epoch": 17.24, + "grad_norm": 1.1602085828781128, + "learning_rate": 2e-05, + "loss": 0.02774549, + "step": 8620 + }, + { + "epoch": 17.242, + "grad_norm": 1.4860682487487793, + "learning_rate": 2e-05, + "loss": 0.04466415, + "step": 8621 + }, + { + "epoch": 17.244, + "grad_norm": 1.3373606204986572, + "learning_rate": 2e-05, + "loss": 0.03324435, + "step": 8622 + }, + { + "epoch": 17.246, + "grad_norm": 1.4892737865447998, + "learning_rate": 2e-05, + "loss": 0.03874163, + "step": 8623 + }, + { + "epoch": 17.248, + "grad_norm": 1.1000738143920898, + "learning_rate": 2e-05, + "loss": 0.03442953, + "step": 8624 + }, + { + "epoch": 17.25, + "grad_norm": 1.588263988494873, + "learning_rate": 2e-05, + "loss": 0.05064608, + "step": 8625 + }, + { + "epoch": 17.252, + "grad_norm": 2.7729272842407227, + "learning_rate": 2e-05, + "loss": 0.03416018, + "step": 8626 + }, + { + "epoch": 17.254, + "grad_norm": 1.785430669784546, + "learning_rate": 2e-05, + "loss": 0.04151814, + "step": 8627 + }, + { + "epoch": 17.256, + "grad_norm": 1.0420557260513306, + "learning_rate": 2e-05, + "loss": 0.03002853, + "step": 8628 + }, + { + "epoch": 17.258, + "grad_norm": 1.5617362260818481, + "learning_rate": 2e-05, + "loss": 0.0326589, + "step": 8629 + }, + { + "epoch": 17.26, + "grad_norm": 1.3772300481796265, + "learning_rate": 2e-05, + "loss": 0.03914788, + "step": 8630 + }, + { + "epoch": 17.262, + "grad_norm": 1.2935206890106201, + "learning_rate": 2e-05, + "loss": 0.03735418, + "step": 8631 + }, + { + "epoch": 17.264, + "grad_norm": 1.2990622520446777, + "learning_rate": 2e-05, + "loss": 0.03097917, + "step": 8632 + }, + { + "epoch": 17.266, + "grad_norm": 1.4346680641174316, + "learning_rate": 2e-05, + "loss": 0.03221875, + "step": 8633 + }, + { + "epoch": 17.268, + "grad_norm": 1.139218807220459, + "learning_rate": 2e-05, + "loss": 0.03914971, + "step": 8634 + }, + { + "epoch": 17.27, + "grad_norm": 1.8043378591537476, + "learning_rate": 2e-05, + "loss": 0.05375669, + "step": 8635 + }, + { + "epoch": 17.272, + "grad_norm": 1.302628993988037, + "learning_rate": 2e-05, + "loss": 0.03872466, + "step": 8636 + }, + { + "epoch": 17.274, + "grad_norm": 1.0800704956054688, + "learning_rate": 2e-05, + "loss": 0.03145566, + "step": 8637 + }, + { + "epoch": 17.276, + "grad_norm": 1.0463593006134033, + "learning_rate": 2e-05, + "loss": 0.03460503, + "step": 
8638 + }, + { + "epoch": 17.278, + "grad_norm": 1.3378190994262695, + "learning_rate": 2e-05, + "loss": 0.05418106, + "step": 8639 + }, + { + "epoch": 17.28, + "grad_norm": 1.3952049016952515, + "learning_rate": 2e-05, + "loss": 0.04103397, + "step": 8640 + }, + { + "epoch": 17.282, + "grad_norm": 1.4025013446807861, + "learning_rate": 2e-05, + "loss": 0.03793554, + "step": 8641 + }, + { + "epoch": 17.284, + "grad_norm": 1.229583978652954, + "learning_rate": 2e-05, + "loss": 0.05052347, + "step": 8642 + }, + { + "epoch": 17.286, + "grad_norm": 1.4108766317367554, + "learning_rate": 2e-05, + "loss": 0.02928887, + "step": 8643 + }, + { + "epoch": 17.288, + "grad_norm": 1.744924783706665, + "learning_rate": 2e-05, + "loss": 0.04452213, + "step": 8644 + }, + { + "epoch": 17.29, + "grad_norm": 1.015102744102478, + "learning_rate": 2e-05, + "loss": 0.0326981, + "step": 8645 + }, + { + "epoch": 17.292, + "grad_norm": 2.1634633541107178, + "learning_rate": 2e-05, + "loss": 0.04052316, + "step": 8646 + }, + { + "epoch": 17.294, + "grad_norm": 0.9109699130058289, + "learning_rate": 2e-05, + "loss": 0.02265767, + "step": 8647 + }, + { + "epoch": 17.296, + "grad_norm": 1.964977502822876, + "learning_rate": 2e-05, + "loss": 0.07206996, + "step": 8648 + }, + { + "epoch": 17.298, + "grad_norm": 1.3658807277679443, + "learning_rate": 2e-05, + "loss": 0.04043622, + "step": 8649 + }, + { + "epoch": 17.3, + "grad_norm": 1.3623096942901611, + "learning_rate": 2e-05, + "loss": 0.02930884, + "step": 8650 + }, + { + "epoch": 17.302, + "grad_norm": 1.2337470054626465, + "learning_rate": 2e-05, + "loss": 0.03874534, + "step": 8651 + }, + { + "epoch": 17.304, + "grad_norm": 0.892915666103363, + "learning_rate": 2e-05, + "loss": 0.03198083, + "step": 8652 + }, + { + "epoch": 17.306, + "grad_norm": 0.9995888471603394, + "learning_rate": 2e-05, + "loss": 0.03072457, + "step": 8653 + }, + { + "epoch": 17.308, + "grad_norm": 1.4063588380813599, + "learning_rate": 2e-05, + "loss": 0.04875659, + "step": 8654 + }, + { + "epoch": 17.31, + "grad_norm": 1.2465662956237793, + "learning_rate": 2e-05, + "loss": 0.03184005, + "step": 8655 + }, + { + "epoch": 17.312, + "grad_norm": 1.2158682346343994, + "learning_rate": 2e-05, + "loss": 0.03214359, + "step": 8656 + }, + { + "epoch": 17.314, + "grad_norm": 0.8416528701782227, + "learning_rate": 2e-05, + "loss": 0.02059457, + "step": 8657 + }, + { + "epoch": 17.316, + "grad_norm": 1.3326908349990845, + "learning_rate": 2e-05, + "loss": 0.04002319, + "step": 8658 + }, + { + "epoch": 17.318, + "grad_norm": 1.201299786567688, + "learning_rate": 2e-05, + "loss": 0.03972005, + "step": 8659 + }, + { + "epoch": 17.32, + "grad_norm": 0.8614779114723206, + "learning_rate": 2e-05, + "loss": 0.02517816, + "step": 8660 + }, + { + "epoch": 17.322, + "grad_norm": 1.0429071187973022, + "learning_rate": 2e-05, + "loss": 0.0332981, + "step": 8661 + }, + { + "epoch": 17.324, + "grad_norm": 1.0974829196929932, + "learning_rate": 2e-05, + "loss": 0.0395813, + "step": 8662 + }, + { + "epoch": 17.326, + "grad_norm": 1.7790201902389526, + "learning_rate": 2e-05, + "loss": 0.04459688, + "step": 8663 + }, + { + "epoch": 17.328, + "grad_norm": 2.299643039703369, + "learning_rate": 2e-05, + "loss": 0.04014507, + "step": 8664 + }, + { + "epoch": 17.33, + "grad_norm": 1.0453588962554932, + "learning_rate": 2e-05, + "loss": 0.02698977, + "step": 8665 + }, + { + "epoch": 17.332, + "grad_norm": 0.8015127778053284, + "learning_rate": 2e-05, + "loss": 0.02366449, + "step": 8666 + }, + { + "epoch": 17.334, + 
"grad_norm": 1.241165280342102, + "learning_rate": 2e-05, + "loss": 0.03793594, + "step": 8667 + }, + { + "epoch": 17.336, + "grad_norm": 3.191179037094116, + "learning_rate": 2e-05, + "loss": 0.05078402, + "step": 8668 + }, + { + "epoch": 17.338, + "grad_norm": 1.3432191610336304, + "learning_rate": 2e-05, + "loss": 0.03950462, + "step": 8669 + }, + { + "epoch": 17.34, + "grad_norm": 1.2250608205795288, + "learning_rate": 2e-05, + "loss": 0.04277131, + "step": 8670 + }, + { + "epoch": 17.342, + "grad_norm": 1.4782729148864746, + "learning_rate": 2e-05, + "loss": 0.03562623, + "step": 8671 + }, + { + "epoch": 17.344, + "grad_norm": 1.5145196914672852, + "learning_rate": 2e-05, + "loss": 0.03654065, + "step": 8672 + }, + { + "epoch": 17.346, + "grad_norm": 1.0246665477752686, + "learning_rate": 2e-05, + "loss": 0.03273585, + "step": 8673 + }, + { + "epoch": 17.348, + "grad_norm": 0.9382162690162659, + "learning_rate": 2e-05, + "loss": 0.03068813, + "step": 8674 + }, + { + "epoch": 17.35, + "grad_norm": 1.0336867570877075, + "learning_rate": 2e-05, + "loss": 0.03111012, + "step": 8675 + }, + { + "epoch": 17.352, + "grad_norm": 1.593807339668274, + "learning_rate": 2e-05, + "loss": 0.0379719, + "step": 8676 + }, + { + "epoch": 17.354, + "grad_norm": 1.819577693939209, + "learning_rate": 2e-05, + "loss": 0.03360122, + "step": 8677 + }, + { + "epoch": 17.356, + "grad_norm": 1.5745559930801392, + "learning_rate": 2e-05, + "loss": 0.05366817, + "step": 8678 + }, + { + "epoch": 17.358, + "grad_norm": 1.227429986000061, + "learning_rate": 2e-05, + "loss": 0.04186604, + "step": 8679 + }, + { + "epoch": 17.36, + "grad_norm": 1.5559841394424438, + "learning_rate": 2e-05, + "loss": 0.0397791, + "step": 8680 + }, + { + "epoch": 17.362, + "grad_norm": 1.0777933597564697, + "learning_rate": 2e-05, + "loss": 0.02661498, + "step": 8681 + }, + { + "epoch": 17.364, + "grad_norm": 1.2830750942230225, + "learning_rate": 2e-05, + "loss": 0.03172344, + "step": 8682 + }, + { + "epoch": 17.366, + "grad_norm": 0.9228249788284302, + "learning_rate": 2e-05, + "loss": 0.03080669, + "step": 8683 + }, + { + "epoch": 17.368, + "grad_norm": 1.545691967010498, + "learning_rate": 2e-05, + "loss": 0.04729048, + "step": 8684 + }, + { + "epoch": 17.37, + "grad_norm": 2.842621326446533, + "learning_rate": 2e-05, + "loss": 0.03637499, + "step": 8685 + }, + { + "epoch": 17.372, + "grad_norm": 2.176117181777954, + "learning_rate": 2e-05, + "loss": 0.04287196, + "step": 8686 + }, + { + "epoch": 17.374, + "grad_norm": 2.0805728435516357, + "learning_rate": 2e-05, + "loss": 0.03709119, + "step": 8687 + }, + { + "epoch": 17.376, + "grad_norm": 1.0376954078674316, + "learning_rate": 2e-05, + "loss": 0.02746601, + "step": 8688 + }, + { + "epoch": 17.378, + "grad_norm": 4.563030242919922, + "learning_rate": 2e-05, + "loss": 0.0574781, + "step": 8689 + }, + { + "epoch": 17.38, + "grad_norm": 1.4875022172927856, + "learning_rate": 2e-05, + "loss": 0.0384874, + "step": 8690 + }, + { + "epoch": 17.382, + "grad_norm": 1.295417070388794, + "learning_rate": 2e-05, + "loss": 0.04185217, + "step": 8691 + }, + { + "epoch": 17.384, + "grad_norm": 1.428702712059021, + "learning_rate": 2e-05, + "loss": 0.04110858, + "step": 8692 + }, + { + "epoch": 17.386, + "grad_norm": 1.7342407703399658, + "learning_rate": 2e-05, + "loss": 0.04000304, + "step": 8693 + }, + { + "epoch": 17.388, + "grad_norm": 1.2365176677703857, + "learning_rate": 2e-05, + "loss": 0.0342959, + "step": 8694 + }, + { + "epoch": 17.39, + "grad_norm": 2.3056838512420654, + 
"learning_rate": 2e-05, + "loss": 0.04070599, + "step": 8695 + }, + { + "epoch": 17.392, + "grad_norm": 1.948683261871338, + "learning_rate": 2e-05, + "loss": 0.03871055, + "step": 8696 + }, + { + "epoch": 17.394, + "grad_norm": 3.8803670406341553, + "learning_rate": 2e-05, + "loss": 0.03387075, + "step": 8697 + }, + { + "epoch": 17.396, + "grad_norm": 1.0368173122406006, + "learning_rate": 2e-05, + "loss": 0.03241309, + "step": 8698 + }, + { + "epoch": 17.398, + "grad_norm": 3.3204972743988037, + "learning_rate": 2e-05, + "loss": 0.06573039, + "step": 8699 + }, + { + "epoch": 17.4, + "grad_norm": 1.3388797044754028, + "learning_rate": 2e-05, + "loss": 0.03789306, + "step": 8700 + }, + { + "epoch": 17.402, + "grad_norm": 2.9801526069641113, + "learning_rate": 2e-05, + "loss": 0.03678794, + "step": 8701 + }, + { + "epoch": 17.404, + "grad_norm": 1.5335742235183716, + "learning_rate": 2e-05, + "loss": 0.0323316, + "step": 8702 + }, + { + "epoch": 17.406, + "grad_norm": 1.2214679718017578, + "learning_rate": 2e-05, + "loss": 0.02849279, + "step": 8703 + }, + { + "epoch": 17.408, + "grad_norm": 1.08639395236969, + "learning_rate": 2e-05, + "loss": 0.04005399, + "step": 8704 + }, + { + "epoch": 17.41, + "grad_norm": 1.7272155284881592, + "learning_rate": 2e-05, + "loss": 0.04677524, + "step": 8705 + }, + { + "epoch": 17.412, + "grad_norm": 1.4193323850631714, + "learning_rate": 2e-05, + "loss": 0.04003004, + "step": 8706 + }, + { + "epoch": 17.414, + "grad_norm": 1.1759463548660278, + "learning_rate": 2e-05, + "loss": 0.0381237, + "step": 8707 + }, + { + "epoch": 17.416, + "grad_norm": 2.2209203243255615, + "learning_rate": 2e-05, + "loss": 0.04388089, + "step": 8708 + }, + { + "epoch": 17.418, + "grad_norm": 1.5851181745529175, + "learning_rate": 2e-05, + "loss": 0.02392046, + "step": 8709 + }, + { + "epoch": 17.42, + "grad_norm": 0.9647265076637268, + "learning_rate": 2e-05, + "loss": 0.02776775, + "step": 8710 + }, + { + "epoch": 17.422, + "grad_norm": 1.6217008829116821, + "learning_rate": 2e-05, + "loss": 0.04731276, + "step": 8711 + }, + { + "epoch": 17.424, + "grad_norm": 0.8883227109909058, + "learning_rate": 2e-05, + "loss": 0.02925518, + "step": 8712 + }, + { + "epoch": 17.426, + "grad_norm": 1.7081272602081299, + "learning_rate": 2e-05, + "loss": 0.03178982, + "step": 8713 + }, + { + "epoch": 17.428, + "grad_norm": 1.0577318668365479, + "learning_rate": 2e-05, + "loss": 0.02738732, + "step": 8714 + }, + { + "epoch": 17.43, + "grad_norm": 1.5216280221939087, + "learning_rate": 2e-05, + "loss": 0.0358897, + "step": 8715 + }, + { + "epoch": 17.432, + "grad_norm": 1.733811855316162, + "learning_rate": 2e-05, + "loss": 0.05327031, + "step": 8716 + }, + { + "epoch": 17.434, + "grad_norm": 1.0504800081253052, + "learning_rate": 2e-05, + "loss": 0.02647152, + "step": 8717 + }, + { + "epoch": 17.436, + "grad_norm": 1.4641132354736328, + "learning_rate": 2e-05, + "loss": 0.03930076, + "step": 8718 + }, + { + "epoch": 17.438, + "grad_norm": 0.9525814652442932, + "learning_rate": 2e-05, + "loss": 0.03154223, + "step": 8719 + }, + { + "epoch": 17.44, + "grad_norm": 1.2682076692581177, + "learning_rate": 2e-05, + "loss": 0.04108154, + "step": 8720 + }, + { + "epoch": 17.442, + "grad_norm": 0.9335964918136597, + "learning_rate": 2e-05, + "loss": 0.0300378, + "step": 8721 + }, + { + "epoch": 17.444, + "grad_norm": 2.1530227661132812, + "learning_rate": 2e-05, + "loss": 0.0590164, + "step": 8722 + }, + { + "epoch": 17.446, + "grad_norm": 1.495179295539856, + "learning_rate": 2e-05, + "loss": 
0.04352273, + "step": 8723 + }, + { + "epoch": 17.448, + "grad_norm": 1.4746037721633911, + "learning_rate": 2e-05, + "loss": 0.04175236, + "step": 8724 + }, + { + "epoch": 17.45, + "grad_norm": 1.263691782951355, + "learning_rate": 2e-05, + "loss": 0.03465231, + "step": 8725 + }, + { + "epoch": 17.452, + "grad_norm": 1.277445912361145, + "learning_rate": 2e-05, + "loss": 0.0429166, + "step": 8726 + }, + { + "epoch": 17.454, + "grad_norm": 0.9855414628982544, + "learning_rate": 2e-05, + "loss": 0.02733266, + "step": 8727 + }, + { + "epoch": 17.456, + "grad_norm": 1.5489836931228638, + "learning_rate": 2e-05, + "loss": 0.04620903, + "step": 8728 + }, + { + "epoch": 17.458, + "grad_norm": 1.0529900789260864, + "learning_rate": 2e-05, + "loss": 0.03364294, + "step": 8729 + }, + { + "epoch": 17.46, + "grad_norm": 1.1789546012878418, + "learning_rate": 2e-05, + "loss": 0.03275872, + "step": 8730 + }, + { + "epoch": 17.462, + "grad_norm": 0.9303401112556458, + "learning_rate": 2e-05, + "loss": 0.0350357, + "step": 8731 + }, + { + "epoch": 17.464, + "grad_norm": 0.8787309527397156, + "learning_rate": 2e-05, + "loss": 0.02384308, + "step": 8732 + }, + { + "epoch": 17.466, + "grad_norm": 1.6333367824554443, + "learning_rate": 2e-05, + "loss": 0.03150726, + "step": 8733 + }, + { + "epoch": 17.468, + "grad_norm": 1.1511269807815552, + "learning_rate": 2e-05, + "loss": 0.03846743, + "step": 8734 + }, + { + "epoch": 17.47, + "grad_norm": 1.0783816576004028, + "learning_rate": 2e-05, + "loss": 0.02969732, + "step": 8735 + }, + { + "epoch": 17.472, + "grad_norm": 1.126899003982544, + "learning_rate": 2e-05, + "loss": 0.03826762, + "step": 8736 + }, + { + "epoch": 17.474, + "grad_norm": 0.972001314163208, + "learning_rate": 2e-05, + "loss": 0.0226456, + "step": 8737 + }, + { + "epoch": 17.476, + "grad_norm": 2.0911567211151123, + "learning_rate": 2e-05, + "loss": 0.04673334, + "step": 8738 + }, + { + "epoch": 17.478, + "grad_norm": 1.2967848777770996, + "learning_rate": 2e-05, + "loss": 0.03325557, + "step": 8739 + }, + { + "epoch": 17.48, + "grad_norm": 1.4275823831558228, + "learning_rate": 2e-05, + "loss": 0.03418783, + "step": 8740 + }, + { + "epoch": 17.482, + "grad_norm": 1.6219322681427002, + "learning_rate": 2e-05, + "loss": 0.03391339, + "step": 8741 + }, + { + "epoch": 17.484, + "grad_norm": 2.5044708251953125, + "learning_rate": 2e-05, + "loss": 0.04077835, + "step": 8742 + }, + { + "epoch": 17.486, + "grad_norm": 1.286624789237976, + "learning_rate": 2e-05, + "loss": 0.03713045, + "step": 8743 + }, + { + "epoch": 17.488, + "grad_norm": 1.0048493146896362, + "learning_rate": 2e-05, + "loss": 0.02878037, + "step": 8744 + }, + { + "epoch": 17.49, + "grad_norm": 1.4934651851654053, + "learning_rate": 2e-05, + "loss": 0.03150274, + "step": 8745 + }, + { + "epoch": 17.492, + "grad_norm": 1.1032167673110962, + "learning_rate": 2e-05, + "loss": 0.03681361, + "step": 8746 + }, + { + "epoch": 17.494, + "grad_norm": 1.4902517795562744, + "learning_rate": 2e-05, + "loss": 0.04430451, + "step": 8747 + }, + { + "epoch": 17.496, + "grad_norm": 2.1936278343200684, + "learning_rate": 2e-05, + "loss": 0.05962869, + "step": 8748 + }, + { + "epoch": 17.498, + "grad_norm": 1.479569673538208, + "learning_rate": 2e-05, + "loss": 0.04059211, + "step": 8749 + }, + { + "epoch": 17.5, + "grad_norm": 1.139105200767517, + "learning_rate": 2e-05, + "loss": 0.03791492, + "step": 8750 + }, + { + "epoch": 17.502, + "grad_norm": 1.4118876457214355, + "learning_rate": 2e-05, + "loss": 0.03690355, + "step": 8751 + }, + { + 
"epoch": 17.504, + "grad_norm": 1.1906758546829224, + "learning_rate": 2e-05, + "loss": 0.03069144, + "step": 8752 + }, + { + "epoch": 17.506, + "grad_norm": 1.3194007873535156, + "learning_rate": 2e-05, + "loss": 0.04116385, + "step": 8753 + }, + { + "epoch": 17.508, + "grad_norm": 2.3798933029174805, + "learning_rate": 2e-05, + "loss": 0.03834019, + "step": 8754 + }, + { + "epoch": 17.51, + "grad_norm": 1.7592543363571167, + "learning_rate": 2e-05, + "loss": 0.05264074, + "step": 8755 + }, + { + "epoch": 17.512, + "grad_norm": 1.1170209646224976, + "learning_rate": 2e-05, + "loss": 0.03822346, + "step": 8756 + }, + { + "epoch": 17.514, + "grad_norm": 1.0373886823654175, + "learning_rate": 2e-05, + "loss": 0.03700188, + "step": 8757 + }, + { + "epoch": 17.516, + "grad_norm": 1.619546890258789, + "learning_rate": 2e-05, + "loss": 0.03848478, + "step": 8758 + }, + { + "epoch": 17.518, + "grad_norm": 0.9552158117294312, + "learning_rate": 2e-05, + "loss": 0.0324291, + "step": 8759 + }, + { + "epoch": 17.52, + "grad_norm": 1.3211631774902344, + "learning_rate": 2e-05, + "loss": 0.04355022, + "step": 8760 + }, + { + "epoch": 17.522, + "grad_norm": 1.0997872352600098, + "learning_rate": 2e-05, + "loss": 0.03464405, + "step": 8761 + }, + { + "epoch": 17.524, + "grad_norm": 1.546210527420044, + "learning_rate": 2e-05, + "loss": 0.03780903, + "step": 8762 + }, + { + "epoch": 17.526, + "grad_norm": 1.3221256732940674, + "learning_rate": 2e-05, + "loss": 0.03689417, + "step": 8763 + }, + { + "epoch": 17.528, + "grad_norm": 1.8055446147918701, + "learning_rate": 2e-05, + "loss": 0.04009835, + "step": 8764 + }, + { + "epoch": 17.53, + "grad_norm": 1.3256739377975464, + "learning_rate": 2e-05, + "loss": 0.02718241, + "step": 8765 + }, + { + "epoch": 17.532, + "grad_norm": 1.0728418827056885, + "learning_rate": 2e-05, + "loss": 0.02585372, + "step": 8766 + }, + { + "epoch": 17.534, + "grad_norm": 2.8010523319244385, + "learning_rate": 2e-05, + "loss": 0.03689571, + "step": 8767 + }, + { + "epoch": 17.536, + "grad_norm": 1.8129520416259766, + "learning_rate": 2e-05, + "loss": 0.04276496, + "step": 8768 + }, + { + "epoch": 17.538, + "grad_norm": 1.5692745447158813, + "learning_rate": 2e-05, + "loss": 0.03856734, + "step": 8769 + }, + { + "epoch": 17.54, + "grad_norm": 1.0610696077346802, + "learning_rate": 2e-05, + "loss": 0.02780731, + "step": 8770 + }, + { + "epoch": 17.542, + "grad_norm": 2.569103479385376, + "learning_rate": 2e-05, + "loss": 0.04206748, + "step": 8771 + }, + { + "epoch": 17.544, + "grad_norm": 2.11444354057312, + "learning_rate": 2e-05, + "loss": 0.03847713, + "step": 8772 + }, + { + "epoch": 17.546, + "grad_norm": 1.2605483531951904, + "learning_rate": 2e-05, + "loss": 0.03328384, + "step": 8773 + }, + { + "epoch": 17.548000000000002, + "grad_norm": 2.3000173568725586, + "learning_rate": 2e-05, + "loss": 0.0356028, + "step": 8774 + }, + { + "epoch": 17.55, + "grad_norm": 1.8785638809204102, + "learning_rate": 2e-05, + "loss": 0.04467007, + "step": 8775 + }, + { + "epoch": 17.552, + "grad_norm": 1.260977864265442, + "learning_rate": 2e-05, + "loss": 0.02562046, + "step": 8776 + }, + { + "epoch": 17.554, + "grad_norm": 9.660441398620605, + "learning_rate": 2e-05, + "loss": 0.04144814, + "step": 8777 + }, + { + "epoch": 17.556, + "grad_norm": 1.8776023387908936, + "learning_rate": 2e-05, + "loss": 0.04732294, + "step": 8778 + }, + { + "epoch": 17.558, + "grad_norm": 1.1368457078933716, + "learning_rate": 2e-05, + "loss": 0.04169676, + "step": 8779 + }, + { + "epoch": 17.56, + 
"grad_norm": 1.5295716524124146, + "learning_rate": 2e-05, + "loss": 0.05010715, + "step": 8780 + }, + { + "epoch": 17.562, + "grad_norm": 0.9344932436943054, + "learning_rate": 2e-05, + "loss": 0.02512026, + "step": 8781 + }, + { + "epoch": 17.564, + "grad_norm": 1.0945249795913696, + "learning_rate": 2e-05, + "loss": 0.03761927, + "step": 8782 + }, + { + "epoch": 17.566, + "grad_norm": 1.4906353950500488, + "learning_rate": 2e-05, + "loss": 0.04850148, + "step": 8783 + }, + { + "epoch": 17.568, + "grad_norm": 1.0239425897598267, + "learning_rate": 2e-05, + "loss": 0.0349384, + "step": 8784 + }, + { + "epoch": 17.57, + "grad_norm": 1.3439842462539673, + "learning_rate": 2e-05, + "loss": 0.02740002, + "step": 8785 + }, + { + "epoch": 17.572, + "grad_norm": 1.8573307991027832, + "learning_rate": 2e-05, + "loss": 0.03505997, + "step": 8786 + }, + { + "epoch": 17.574, + "grad_norm": 1.1892509460449219, + "learning_rate": 2e-05, + "loss": 0.03444887, + "step": 8787 + }, + { + "epoch": 17.576, + "grad_norm": 1.6804590225219727, + "learning_rate": 2e-05, + "loss": 0.03725114, + "step": 8788 + }, + { + "epoch": 17.578, + "grad_norm": 1.5094798803329468, + "learning_rate": 2e-05, + "loss": 0.03691319, + "step": 8789 + }, + { + "epoch": 17.58, + "grad_norm": 1.433280110359192, + "learning_rate": 2e-05, + "loss": 0.03601858, + "step": 8790 + }, + { + "epoch": 17.582, + "grad_norm": 0.9142979383468628, + "learning_rate": 2e-05, + "loss": 0.03045445, + "step": 8791 + }, + { + "epoch": 17.584, + "grad_norm": 0.9239221811294556, + "learning_rate": 2e-05, + "loss": 0.02360635, + "step": 8792 + }, + { + "epoch": 17.586, + "grad_norm": 2.742828369140625, + "learning_rate": 2e-05, + "loss": 0.04096455, + "step": 8793 + }, + { + "epoch": 17.588, + "grad_norm": 1.469237208366394, + "learning_rate": 2e-05, + "loss": 0.03894402, + "step": 8794 + }, + { + "epoch": 17.59, + "grad_norm": 1.224602222442627, + "learning_rate": 2e-05, + "loss": 0.03480183, + "step": 8795 + }, + { + "epoch": 17.592, + "grad_norm": 2.1132779121398926, + "learning_rate": 2e-05, + "loss": 0.04112736, + "step": 8796 + }, + { + "epoch": 17.594, + "grad_norm": 1.4963428974151611, + "learning_rate": 2e-05, + "loss": 0.03678083, + "step": 8797 + }, + { + "epoch": 17.596, + "grad_norm": 1.1321431398391724, + "learning_rate": 2e-05, + "loss": 0.02224684, + "step": 8798 + }, + { + "epoch": 17.598, + "grad_norm": 1.7322660684585571, + "learning_rate": 2e-05, + "loss": 0.04057566, + "step": 8799 + }, + { + "epoch": 17.6, + "grad_norm": 1.437656283378601, + "learning_rate": 2e-05, + "loss": 0.03213751, + "step": 8800 + }, + { + "epoch": 17.602, + "grad_norm": 1.932984471321106, + "learning_rate": 2e-05, + "loss": 0.04008548, + "step": 8801 + }, + { + "epoch": 17.604, + "grad_norm": 1.0793958902359009, + "learning_rate": 2e-05, + "loss": 0.02873497, + "step": 8802 + }, + { + "epoch": 17.606, + "grad_norm": 1.9617685079574585, + "learning_rate": 2e-05, + "loss": 0.04457264, + "step": 8803 + }, + { + "epoch": 17.608, + "grad_norm": 1.7142366170883179, + "learning_rate": 2e-05, + "loss": 0.03925028, + "step": 8804 + }, + { + "epoch": 17.61, + "grad_norm": 1.638058066368103, + "learning_rate": 2e-05, + "loss": 0.04914237, + "step": 8805 + }, + { + "epoch": 17.612, + "grad_norm": 1.3433510065078735, + "learning_rate": 2e-05, + "loss": 0.03332677, + "step": 8806 + }, + { + "epoch": 17.614, + "grad_norm": 0.9329344034194946, + "learning_rate": 2e-05, + "loss": 0.03115493, + "step": 8807 + }, + { + "epoch": 17.616, + "grad_norm": 0.9696981310844421, + 
"learning_rate": 2e-05, + "loss": 0.0317462, + "step": 8808 + }, + { + "epoch": 17.618, + "grad_norm": 2.5762224197387695, + "learning_rate": 2e-05, + "loss": 0.04256003, + "step": 8809 + }, + { + "epoch": 17.62, + "grad_norm": 1.8832440376281738, + "learning_rate": 2e-05, + "loss": 0.03043187, + "step": 8810 + }, + { + "epoch": 17.622, + "grad_norm": 2.1648950576782227, + "learning_rate": 2e-05, + "loss": 0.04386858, + "step": 8811 + }, + { + "epoch": 17.624, + "grad_norm": 1.578770637512207, + "learning_rate": 2e-05, + "loss": 0.03511237, + "step": 8812 + }, + { + "epoch": 17.626, + "grad_norm": 1.0861693620681763, + "learning_rate": 2e-05, + "loss": 0.04000392, + "step": 8813 + }, + { + "epoch": 17.628, + "grad_norm": 1.1005021333694458, + "learning_rate": 2e-05, + "loss": 0.03653765, + "step": 8814 + }, + { + "epoch": 17.63, + "grad_norm": 1.5705111026763916, + "learning_rate": 2e-05, + "loss": 0.04100737, + "step": 8815 + }, + { + "epoch": 17.632, + "grad_norm": 0.9939731359481812, + "learning_rate": 2e-05, + "loss": 0.02468416, + "step": 8816 + }, + { + "epoch": 17.634, + "grad_norm": 1.6492228507995605, + "learning_rate": 2e-05, + "loss": 0.03122869, + "step": 8817 + }, + { + "epoch": 17.636, + "grad_norm": 1.1242082118988037, + "learning_rate": 2e-05, + "loss": 0.03778775, + "step": 8818 + }, + { + "epoch": 17.638, + "grad_norm": 1.0076687335968018, + "learning_rate": 2e-05, + "loss": 0.03823631, + "step": 8819 + }, + { + "epoch": 17.64, + "grad_norm": 1.2714289426803589, + "learning_rate": 2e-05, + "loss": 0.03733125, + "step": 8820 + }, + { + "epoch": 17.642, + "grad_norm": 1.4574763774871826, + "learning_rate": 2e-05, + "loss": 0.04100376, + "step": 8821 + }, + { + "epoch": 17.644, + "grad_norm": 1.4745090007781982, + "learning_rate": 2e-05, + "loss": 0.04590669, + "step": 8822 + }, + { + "epoch": 17.646, + "grad_norm": 1.2290929555892944, + "learning_rate": 2e-05, + "loss": 0.02955338, + "step": 8823 + }, + { + "epoch": 17.648, + "grad_norm": 1.5989232063293457, + "learning_rate": 2e-05, + "loss": 0.04057942, + "step": 8824 + }, + { + "epoch": 17.65, + "grad_norm": 1.0911918878555298, + "learning_rate": 2e-05, + "loss": 0.0314585, + "step": 8825 + }, + { + "epoch": 17.652, + "grad_norm": 1.1875574588775635, + "learning_rate": 2e-05, + "loss": 0.02796382, + "step": 8826 + }, + { + "epoch": 17.654, + "grad_norm": 0.8497743010520935, + "learning_rate": 2e-05, + "loss": 0.02476349, + "step": 8827 + }, + { + "epoch": 17.656, + "grad_norm": 1.1606796979904175, + "learning_rate": 2e-05, + "loss": 0.03331729, + "step": 8828 + }, + { + "epoch": 17.658, + "grad_norm": 1.929567575454712, + "learning_rate": 2e-05, + "loss": 0.04759387, + "step": 8829 + }, + { + "epoch": 17.66, + "grad_norm": 1.10890531539917, + "learning_rate": 2e-05, + "loss": 0.03875232, + "step": 8830 + }, + { + "epoch": 17.662, + "grad_norm": 1.1237422227859497, + "learning_rate": 2e-05, + "loss": 0.02639496, + "step": 8831 + }, + { + "epoch": 17.664, + "grad_norm": 1.3612616062164307, + "learning_rate": 2e-05, + "loss": 0.04055498, + "step": 8832 + }, + { + "epoch": 17.666, + "grad_norm": 1.0668309926986694, + "learning_rate": 2e-05, + "loss": 0.02782569, + "step": 8833 + }, + { + "epoch": 17.668, + "grad_norm": 1.186408519744873, + "learning_rate": 2e-05, + "loss": 0.03598, + "step": 8834 + }, + { + "epoch": 17.67, + "grad_norm": 2.71864914894104, + "learning_rate": 2e-05, + "loss": 0.05295214, + "step": 8835 + }, + { + "epoch": 17.672, + "grad_norm": 2.222414016723633, + "learning_rate": 2e-05, + "loss": 0.04588395, 
+ "step": 8836 + }, + { + "epoch": 17.674, + "grad_norm": 1.272736668586731, + "learning_rate": 2e-05, + "loss": 0.03839596, + "step": 8837 + }, + { + "epoch": 17.676, + "grad_norm": 1.5483832359313965, + "learning_rate": 2e-05, + "loss": 0.0436643, + "step": 8838 + }, + { + "epoch": 17.678, + "grad_norm": 3.551170587539673, + "learning_rate": 2e-05, + "loss": 0.03059383, + "step": 8839 + }, + { + "epoch": 17.68, + "grad_norm": 1.3416359424591064, + "learning_rate": 2e-05, + "loss": 0.03237988, + "step": 8840 + }, + { + "epoch": 17.682, + "grad_norm": 2.128422975540161, + "learning_rate": 2e-05, + "loss": 0.04019484, + "step": 8841 + }, + { + "epoch": 17.684, + "grad_norm": 1.4956728219985962, + "learning_rate": 2e-05, + "loss": 0.03178658, + "step": 8842 + }, + { + "epoch": 17.686, + "grad_norm": 1.644662618637085, + "learning_rate": 2e-05, + "loss": 0.05027148, + "step": 8843 + }, + { + "epoch": 17.688, + "grad_norm": 3.315823554992676, + "learning_rate": 2e-05, + "loss": 0.03754386, + "step": 8844 + }, + { + "epoch": 17.69, + "grad_norm": 2.4120471477508545, + "learning_rate": 2e-05, + "loss": 0.03160707, + "step": 8845 + }, + { + "epoch": 17.692, + "grad_norm": 3.103656053543091, + "learning_rate": 2e-05, + "loss": 0.04504135, + "step": 8846 + }, + { + "epoch": 17.694, + "grad_norm": 1.1503428220748901, + "learning_rate": 2e-05, + "loss": 0.03259352, + "step": 8847 + }, + { + "epoch": 17.696, + "grad_norm": 1.2756330966949463, + "learning_rate": 2e-05, + "loss": 0.03456133, + "step": 8848 + }, + { + "epoch": 17.698, + "grad_norm": 1.1104017496109009, + "learning_rate": 2e-05, + "loss": 0.02761788, + "step": 8849 + }, + { + "epoch": 17.7, + "grad_norm": 0.9424893856048584, + "learning_rate": 2e-05, + "loss": 0.02640409, + "step": 8850 + }, + { + "epoch": 17.701999999999998, + "grad_norm": 1.0416557788848877, + "learning_rate": 2e-05, + "loss": 0.0349526, + "step": 8851 + }, + { + "epoch": 17.704, + "grad_norm": 1.6049892902374268, + "learning_rate": 2e-05, + "loss": 0.02749394, + "step": 8852 + }, + { + "epoch": 17.706, + "grad_norm": 1.6006180047988892, + "learning_rate": 2e-05, + "loss": 0.03950756, + "step": 8853 + }, + { + "epoch": 17.708, + "grad_norm": 1.0300740003585815, + "learning_rate": 2e-05, + "loss": 0.03133941, + "step": 8854 + }, + { + "epoch": 17.71, + "grad_norm": 3.491128444671631, + "learning_rate": 2e-05, + "loss": 0.04869714, + "step": 8855 + }, + { + "epoch": 17.712, + "grad_norm": 1.389901041984558, + "learning_rate": 2e-05, + "loss": 0.04399059, + "step": 8856 + }, + { + "epoch": 17.714, + "grad_norm": 1.09212327003479, + "learning_rate": 2e-05, + "loss": 0.03761019, + "step": 8857 + }, + { + "epoch": 17.716, + "grad_norm": 1.3363914489746094, + "learning_rate": 2e-05, + "loss": 0.0412604, + "step": 8858 + }, + { + "epoch": 17.718, + "grad_norm": 1.353605031967163, + "learning_rate": 2e-05, + "loss": 0.04291609, + "step": 8859 + }, + { + "epoch": 17.72, + "grad_norm": 1.0849765539169312, + "learning_rate": 2e-05, + "loss": 0.03004822, + "step": 8860 + }, + { + "epoch": 17.722, + "grad_norm": 1.2996422052383423, + "learning_rate": 2e-05, + "loss": 0.04499619, + "step": 8861 + }, + { + "epoch": 17.724, + "grad_norm": 1.1636970043182373, + "learning_rate": 2e-05, + "loss": 0.03615176, + "step": 8862 + }, + { + "epoch": 17.726, + "grad_norm": 0.9753794074058533, + "learning_rate": 2e-05, + "loss": 0.03914835, + "step": 8863 + }, + { + "epoch": 17.728, + "grad_norm": 1.3837984800338745, + "learning_rate": 2e-05, + "loss": 0.05555439, + "step": 8864 + }, + { + "epoch": 
17.73, + "grad_norm": 1.9436976909637451, + "learning_rate": 2e-05, + "loss": 0.05068126, + "step": 8865 + }, + { + "epoch": 17.732, + "grad_norm": 1.1782814264297485, + "learning_rate": 2e-05, + "loss": 0.03269964, + "step": 8866 + }, + { + "epoch": 17.734, + "grad_norm": 1.342545747756958, + "learning_rate": 2e-05, + "loss": 0.051434, + "step": 8867 + }, + { + "epoch": 17.736, + "grad_norm": 1.1777751445770264, + "learning_rate": 2e-05, + "loss": 0.03173701, + "step": 8868 + }, + { + "epoch": 17.738, + "grad_norm": 1.666817307472229, + "learning_rate": 2e-05, + "loss": 0.03970077, + "step": 8869 + }, + { + "epoch": 17.74, + "grad_norm": 1.2977221012115479, + "learning_rate": 2e-05, + "loss": 0.04394735, + "step": 8870 + }, + { + "epoch": 17.742, + "grad_norm": 1.076141357421875, + "learning_rate": 2e-05, + "loss": 0.03446401, + "step": 8871 + }, + { + "epoch": 17.744, + "grad_norm": 1.037126898765564, + "learning_rate": 2e-05, + "loss": 0.03374257, + "step": 8872 + }, + { + "epoch": 17.746, + "grad_norm": 1.9888211488723755, + "learning_rate": 2e-05, + "loss": 0.05602897, + "step": 8873 + }, + { + "epoch": 17.748, + "grad_norm": 1.2631540298461914, + "learning_rate": 2e-05, + "loss": 0.03305461, + "step": 8874 + }, + { + "epoch": 17.75, + "grad_norm": 1.383660078048706, + "learning_rate": 2e-05, + "loss": 0.03023957, + "step": 8875 + }, + { + "epoch": 17.752, + "grad_norm": 1.6274633407592773, + "learning_rate": 2e-05, + "loss": 0.02720205, + "step": 8876 + }, + { + "epoch": 17.754, + "grad_norm": 2.328664541244507, + "learning_rate": 2e-05, + "loss": 0.04531138, + "step": 8877 + }, + { + "epoch": 17.756, + "grad_norm": 1.0562291145324707, + "learning_rate": 2e-05, + "loss": 0.02581457, + "step": 8878 + }, + { + "epoch": 17.758, + "grad_norm": 1.2635678052902222, + "learning_rate": 2e-05, + "loss": 0.03207887, + "step": 8879 + }, + { + "epoch": 17.76, + "grad_norm": 1.1033519506454468, + "learning_rate": 2e-05, + "loss": 0.03914376, + "step": 8880 + }, + { + "epoch": 17.762, + "grad_norm": 2.1870508193969727, + "learning_rate": 2e-05, + "loss": 0.04750444, + "step": 8881 + }, + { + "epoch": 17.764, + "grad_norm": 1.3173456192016602, + "learning_rate": 2e-05, + "loss": 0.04112824, + "step": 8882 + }, + { + "epoch": 17.766, + "grad_norm": 1.5373280048370361, + "learning_rate": 2e-05, + "loss": 0.03573046, + "step": 8883 + }, + { + "epoch": 17.768, + "grad_norm": 1.249330759048462, + "learning_rate": 2e-05, + "loss": 0.03755806, + "step": 8884 + }, + { + "epoch": 17.77, + "grad_norm": 1.2358646392822266, + "learning_rate": 2e-05, + "loss": 0.02392829, + "step": 8885 + }, + { + "epoch": 17.772, + "grad_norm": 0.6719095706939697, + "learning_rate": 2e-05, + "loss": 0.01649454, + "step": 8886 + }, + { + "epoch": 17.774, + "grad_norm": 1.3092347383499146, + "learning_rate": 2e-05, + "loss": 0.04142372, + "step": 8887 + }, + { + "epoch": 17.776, + "grad_norm": 1.4076433181762695, + "learning_rate": 2e-05, + "loss": 0.03731369, + "step": 8888 + }, + { + "epoch": 17.778, + "grad_norm": 1.1862058639526367, + "learning_rate": 2e-05, + "loss": 0.02469937, + "step": 8889 + }, + { + "epoch": 17.78, + "grad_norm": 1.0884705781936646, + "learning_rate": 2e-05, + "loss": 0.03976347, + "step": 8890 + }, + { + "epoch": 17.782, + "grad_norm": 1.3288317918777466, + "learning_rate": 2e-05, + "loss": 0.03909069, + "step": 8891 + }, + { + "epoch": 17.784, + "grad_norm": 1.437835693359375, + "learning_rate": 2e-05, + "loss": 0.04604793, + "step": 8892 + }, + { + "epoch": 17.786, + "grad_norm": 1.2429791688919067, 
+ "learning_rate": 2e-05, + "loss": 0.03789811, + "step": 8893 + }, + { + "epoch": 17.788, + "grad_norm": 0.8929753303527832, + "learning_rate": 2e-05, + "loss": 0.02409481, + "step": 8894 + }, + { + "epoch": 17.79, + "grad_norm": 1.292466163635254, + "learning_rate": 2e-05, + "loss": 0.04581561, + "step": 8895 + }, + { + "epoch": 17.792, + "grad_norm": 1.1110408306121826, + "learning_rate": 2e-05, + "loss": 0.03448091, + "step": 8896 + }, + { + "epoch": 17.794, + "grad_norm": 1.2385011911392212, + "learning_rate": 2e-05, + "loss": 0.03106187, + "step": 8897 + }, + { + "epoch": 17.796, + "grad_norm": 2.670729637145996, + "learning_rate": 2e-05, + "loss": 0.04714116, + "step": 8898 + }, + { + "epoch": 17.798000000000002, + "grad_norm": 0.9738274216651917, + "learning_rate": 2e-05, + "loss": 0.02809829, + "step": 8899 + }, + { + "epoch": 17.8, + "grad_norm": 1.753575325012207, + "learning_rate": 2e-05, + "loss": 0.04472911, + "step": 8900 + }, + { + "epoch": 17.802, + "grad_norm": 1.6126110553741455, + "learning_rate": 2e-05, + "loss": 0.0548849, + "step": 8901 + }, + { + "epoch": 17.804, + "grad_norm": 1.9280973672866821, + "learning_rate": 2e-05, + "loss": 0.0414143, + "step": 8902 + }, + { + "epoch": 17.806, + "grad_norm": 0.8208984136581421, + "learning_rate": 2e-05, + "loss": 0.01933501, + "step": 8903 + }, + { + "epoch": 17.808, + "grad_norm": 2.380815029144287, + "learning_rate": 2e-05, + "loss": 0.03992394, + "step": 8904 + }, + { + "epoch": 17.81, + "grad_norm": 2.3603808879852295, + "learning_rate": 2e-05, + "loss": 0.06256126, + "step": 8905 + }, + { + "epoch": 17.812, + "grad_norm": 1.170880675315857, + "learning_rate": 2e-05, + "loss": 0.0337522, + "step": 8906 + }, + { + "epoch": 17.814, + "grad_norm": 1.7238221168518066, + "learning_rate": 2e-05, + "loss": 0.0442452, + "step": 8907 + }, + { + "epoch": 17.816, + "grad_norm": 1.3815233707427979, + "learning_rate": 2e-05, + "loss": 0.03570531, + "step": 8908 + }, + { + "epoch": 17.818, + "grad_norm": 1.077028751373291, + "learning_rate": 2e-05, + "loss": 0.02792567, + "step": 8909 + }, + { + "epoch": 17.82, + "grad_norm": 1.8009798526763916, + "learning_rate": 2e-05, + "loss": 0.03502089, + "step": 8910 + }, + { + "epoch": 17.822, + "grad_norm": 1.4753614664077759, + "learning_rate": 2e-05, + "loss": 0.04613937, + "step": 8911 + }, + { + "epoch": 17.824, + "grad_norm": 1.8474282026290894, + "learning_rate": 2e-05, + "loss": 0.05605477, + "step": 8912 + }, + { + "epoch": 17.826, + "grad_norm": 1.3313781023025513, + "learning_rate": 2e-05, + "loss": 0.04386368, + "step": 8913 + }, + { + "epoch": 17.828, + "grad_norm": 1.3753377199172974, + "learning_rate": 2e-05, + "loss": 0.0385115, + "step": 8914 + }, + { + "epoch": 17.83, + "grad_norm": 0.8378182053565979, + "learning_rate": 2e-05, + "loss": 0.02854342, + "step": 8915 + }, + { + "epoch": 17.832, + "grad_norm": 1.0748872756958008, + "learning_rate": 2e-05, + "loss": 0.02578593, + "step": 8916 + }, + { + "epoch": 17.834, + "grad_norm": 0.881872832775116, + "learning_rate": 2e-05, + "loss": 0.02822345, + "step": 8917 + }, + { + "epoch": 17.836, + "grad_norm": 2.561361074447632, + "learning_rate": 2e-05, + "loss": 0.04112708, + "step": 8918 + }, + { + "epoch": 17.838, + "grad_norm": 1.0299837589263916, + "learning_rate": 2e-05, + "loss": 0.030092, + "step": 8919 + }, + { + "epoch": 17.84, + "grad_norm": 1.040921688079834, + "learning_rate": 2e-05, + "loss": 0.02578293, + "step": 8920 + }, + { + "epoch": 17.842, + "grad_norm": 1.1591360569000244, + "learning_rate": 2e-05, + "loss": 
0.04461095, + "step": 8921 + }, + { + "epoch": 17.844, + "grad_norm": 0.929568886756897, + "learning_rate": 2e-05, + "loss": 0.03212486, + "step": 8922 + }, + { + "epoch": 17.846, + "grad_norm": 1.0860724449157715, + "learning_rate": 2e-05, + "loss": 0.02192564, + "step": 8923 + }, + { + "epoch": 17.848, + "grad_norm": 1.2266968488693237, + "learning_rate": 2e-05, + "loss": 0.0363798, + "step": 8924 + }, + { + "epoch": 17.85, + "grad_norm": 1.622202754020691, + "learning_rate": 2e-05, + "loss": 0.03073082, + "step": 8925 + }, + { + "epoch": 17.852, + "grad_norm": 1.0483282804489136, + "learning_rate": 2e-05, + "loss": 0.03715392, + "step": 8926 + }, + { + "epoch": 17.854, + "grad_norm": 1.1092106103897095, + "learning_rate": 2e-05, + "loss": 0.03876258, + "step": 8927 + }, + { + "epoch": 17.856, + "grad_norm": 0.9673099517822266, + "learning_rate": 2e-05, + "loss": 0.02583649, + "step": 8928 + }, + { + "epoch": 17.858, + "grad_norm": 1.635252833366394, + "learning_rate": 2e-05, + "loss": 0.05052019, + "step": 8929 + }, + { + "epoch": 17.86, + "grad_norm": 1.153358817100525, + "learning_rate": 2e-05, + "loss": 0.03466346, + "step": 8930 + }, + { + "epoch": 17.862, + "grad_norm": 2.039275884628296, + "learning_rate": 2e-05, + "loss": 0.05241006, + "step": 8931 + }, + { + "epoch": 17.864, + "grad_norm": 1.3057268857955933, + "learning_rate": 2e-05, + "loss": 0.04647853, + "step": 8932 + }, + { + "epoch": 17.866, + "grad_norm": 1.776149868965149, + "learning_rate": 2e-05, + "loss": 0.03174097, + "step": 8933 + }, + { + "epoch": 17.868, + "grad_norm": 1.7656913995742798, + "learning_rate": 2e-05, + "loss": 0.03435258, + "step": 8934 + }, + { + "epoch": 17.87, + "grad_norm": 1.3331656455993652, + "learning_rate": 2e-05, + "loss": 0.04026199, + "step": 8935 + }, + { + "epoch": 17.872, + "grad_norm": 1.0311089754104614, + "learning_rate": 2e-05, + "loss": 0.02966706, + "step": 8936 + }, + { + "epoch": 17.874, + "grad_norm": 1.0116089582443237, + "learning_rate": 2e-05, + "loss": 0.03720299, + "step": 8937 + }, + { + "epoch": 17.876, + "grad_norm": 1.1523536443710327, + "learning_rate": 2e-05, + "loss": 0.0370546, + "step": 8938 + }, + { + "epoch": 17.878, + "grad_norm": 1.1194231510162354, + "learning_rate": 2e-05, + "loss": 0.03367734, + "step": 8939 + }, + { + "epoch": 17.88, + "grad_norm": 1.4645055532455444, + "learning_rate": 2e-05, + "loss": 0.04109664, + "step": 8940 + }, + { + "epoch": 17.882, + "grad_norm": 1.3934130668640137, + "learning_rate": 2e-05, + "loss": 0.04116001, + "step": 8941 + }, + { + "epoch": 17.884, + "grad_norm": 1.2432390451431274, + "learning_rate": 2e-05, + "loss": 0.0318905, + "step": 8942 + }, + { + "epoch": 17.886, + "grad_norm": 1.9031294584274292, + "learning_rate": 2e-05, + "loss": 0.04899106, + "step": 8943 + }, + { + "epoch": 17.888, + "grad_norm": 1.3559234142303467, + "learning_rate": 2e-05, + "loss": 0.04302301, + "step": 8944 + }, + { + "epoch": 17.89, + "grad_norm": 1.1823681592941284, + "learning_rate": 2e-05, + "loss": 0.03750154, + "step": 8945 + }, + { + "epoch": 17.892, + "grad_norm": 2.3975961208343506, + "learning_rate": 2e-05, + "loss": 0.04103041, + "step": 8946 + }, + { + "epoch": 17.894, + "grad_norm": 1.2826277017593384, + "learning_rate": 2e-05, + "loss": 0.03688804, + "step": 8947 + }, + { + "epoch": 17.896, + "grad_norm": 2.822355270385742, + "learning_rate": 2e-05, + "loss": 0.05060627, + "step": 8948 + }, + { + "epoch": 17.898, + "grad_norm": 1.0254136323928833, + "learning_rate": 2e-05, + "loss": 0.03011078, + "step": 8949 + }, + { + 
"epoch": 17.9, + "grad_norm": 1.1992510557174683, + "learning_rate": 2e-05, + "loss": 0.04118895, + "step": 8950 + }, + { + "epoch": 17.902, + "grad_norm": 1.697759747505188, + "learning_rate": 2e-05, + "loss": 0.04727785, + "step": 8951 + }, + { + "epoch": 17.904, + "grad_norm": 1.5532244443893433, + "learning_rate": 2e-05, + "loss": 0.05363618, + "step": 8952 + }, + { + "epoch": 17.906, + "grad_norm": 1.0093700885772705, + "learning_rate": 2e-05, + "loss": 0.03118528, + "step": 8953 + }, + { + "epoch": 17.908, + "grad_norm": 1.895651936531067, + "learning_rate": 2e-05, + "loss": 0.06020194, + "step": 8954 + }, + { + "epoch": 17.91, + "grad_norm": 1.0814530849456787, + "learning_rate": 2e-05, + "loss": 0.0355964, + "step": 8955 + }, + { + "epoch": 17.912, + "grad_norm": 1.521867275238037, + "learning_rate": 2e-05, + "loss": 0.04056902, + "step": 8956 + }, + { + "epoch": 17.914, + "grad_norm": 1.2673386335372925, + "learning_rate": 2e-05, + "loss": 0.03534425, + "step": 8957 + }, + { + "epoch": 17.916, + "grad_norm": 2.1670548915863037, + "learning_rate": 2e-05, + "loss": 0.03837797, + "step": 8958 + }, + { + "epoch": 17.918, + "grad_norm": 0.9772452116012573, + "learning_rate": 2e-05, + "loss": 0.02949987, + "step": 8959 + }, + { + "epoch": 17.92, + "grad_norm": 1.940874695777893, + "learning_rate": 2e-05, + "loss": 0.06990305, + "step": 8960 + }, + { + "epoch": 17.922, + "grad_norm": 2.148874044418335, + "learning_rate": 2e-05, + "loss": 0.03925563, + "step": 8961 + }, + { + "epoch": 17.924, + "grad_norm": 1.2279850244522095, + "learning_rate": 2e-05, + "loss": 0.03267869, + "step": 8962 + }, + { + "epoch": 17.926, + "grad_norm": 1.2713209390640259, + "learning_rate": 2e-05, + "loss": 0.03438141, + "step": 8963 + }, + { + "epoch": 17.928, + "grad_norm": 1.0285924673080444, + "learning_rate": 2e-05, + "loss": 0.03503043, + "step": 8964 + }, + { + "epoch": 17.93, + "grad_norm": 1.2333204746246338, + "learning_rate": 2e-05, + "loss": 0.03405837, + "step": 8965 + }, + { + "epoch": 17.932, + "grad_norm": 2.2821717262268066, + "learning_rate": 2e-05, + "loss": 0.03725312, + "step": 8966 + }, + { + "epoch": 17.934, + "grad_norm": 3.1810193061828613, + "learning_rate": 2e-05, + "loss": 0.0413457, + "step": 8967 + }, + { + "epoch": 17.936, + "grad_norm": 1.8636484146118164, + "learning_rate": 2e-05, + "loss": 0.0562732, + "step": 8968 + }, + { + "epoch": 17.938, + "grad_norm": 2.1253907680511475, + "learning_rate": 2e-05, + "loss": 0.04173985, + "step": 8969 + }, + { + "epoch": 17.94, + "grad_norm": 1.4649451971054077, + "learning_rate": 2e-05, + "loss": 0.04346614, + "step": 8970 + }, + { + "epoch": 17.942, + "grad_norm": 1.5256109237670898, + "learning_rate": 2e-05, + "loss": 0.05348255, + "step": 8971 + }, + { + "epoch": 17.944, + "grad_norm": 1.0634779930114746, + "learning_rate": 2e-05, + "loss": 0.04019362, + "step": 8972 + }, + { + "epoch": 17.946, + "grad_norm": 1.4219046831130981, + "learning_rate": 2e-05, + "loss": 0.03420949, + "step": 8973 + }, + { + "epoch": 17.948, + "grad_norm": 1.1320511102676392, + "learning_rate": 2e-05, + "loss": 0.03867266, + "step": 8974 + }, + { + "epoch": 17.95, + "grad_norm": 1.6765214204788208, + "learning_rate": 2e-05, + "loss": 0.04724558, + "step": 8975 + }, + { + "epoch": 17.951999999999998, + "grad_norm": 1.8259295225143433, + "learning_rate": 2e-05, + "loss": 0.03461309, + "step": 8976 + }, + { + "epoch": 17.954, + "grad_norm": 1.2042784690856934, + "learning_rate": 2e-05, + "loss": 0.04839892, + "step": 8977 + }, + { + "epoch": 17.956, + 
"grad_norm": 1.5858327150344849, + "learning_rate": 2e-05, + "loss": 0.0500534, + "step": 8978 + }, + { + "epoch": 17.958, + "grad_norm": 1.3547513484954834, + "learning_rate": 2e-05, + "loss": 0.02594247, + "step": 8979 + }, + { + "epoch": 17.96, + "grad_norm": 1.0181814432144165, + "learning_rate": 2e-05, + "loss": 0.02916558, + "step": 8980 + }, + { + "epoch": 17.962, + "grad_norm": 1.466145634651184, + "learning_rate": 2e-05, + "loss": 0.04672647, + "step": 8981 + }, + { + "epoch": 17.964, + "grad_norm": 1.3053315877914429, + "learning_rate": 2e-05, + "loss": 0.03587544, + "step": 8982 + }, + { + "epoch": 17.966, + "grad_norm": 1.0494365692138672, + "learning_rate": 2e-05, + "loss": 0.02380068, + "step": 8983 + }, + { + "epoch": 17.968, + "grad_norm": 1.8024916648864746, + "learning_rate": 2e-05, + "loss": 0.03359849, + "step": 8984 + }, + { + "epoch": 17.97, + "grad_norm": 0.9630800485610962, + "learning_rate": 2e-05, + "loss": 0.03356433, + "step": 8985 + }, + { + "epoch": 17.972, + "grad_norm": 1.1515108346939087, + "learning_rate": 2e-05, + "loss": 0.03505271, + "step": 8986 + }, + { + "epoch": 17.974, + "grad_norm": 1.3020286560058594, + "learning_rate": 2e-05, + "loss": 0.05472892, + "step": 8987 + }, + { + "epoch": 17.976, + "grad_norm": 1.7581522464752197, + "learning_rate": 2e-05, + "loss": 0.0482644, + "step": 8988 + }, + { + "epoch": 17.978, + "grad_norm": 1.5518864393234253, + "learning_rate": 2e-05, + "loss": 0.04105756, + "step": 8989 + }, + { + "epoch": 17.98, + "grad_norm": 0.9530649781227112, + "learning_rate": 2e-05, + "loss": 0.02671498, + "step": 8990 + }, + { + "epoch": 17.982, + "grad_norm": 1.1680177450180054, + "learning_rate": 2e-05, + "loss": 0.02665569, + "step": 8991 + }, + { + "epoch": 17.984, + "grad_norm": 1.260867714881897, + "learning_rate": 2e-05, + "loss": 0.02883239, + "step": 8992 + }, + { + "epoch": 17.986, + "grad_norm": 1.3985310792922974, + "learning_rate": 2e-05, + "loss": 0.03463531, + "step": 8993 + }, + { + "epoch": 17.988, + "grad_norm": 1.3712165355682373, + "learning_rate": 2e-05, + "loss": 0.04792679, + "step": 8994 + }, + { + "epoch": 17.99, + "grad_norm": 1.8040891885757446, + "learning_rate": 2e-05, + "loss": 0.04169717, + "step": 8995 + }, + { + "epoch": 17.992, + "grad_norm": 2.077030897140503, + "learning_rate": 2e-05, + "loss": 0.04134944, + "step": 8996 + }, + { + "epoch": 17.994, + "grad_norm": 0.8118720054626465, + "learning_rate": 2e-05, + "loss": 0.01849389, + "step": 8997 + }, + { + "epoch": 17.996, + "grad_norm": 0.983228862285614, + "learning_rate": 2e-05, + "loss": 0.02795386, + "step": 8998 + }, + { + "epoch": 17.998, + "grad_norm": 1.1231356859207153, + "learning_rate": 2e-05, + "loss": 0.02436145, + "step": 8999 + }, + { + "epoch": 18.0, + "grad_norm": 1.8219008445739746, + "learning_rate": 2e-05, + "loss": 0.03983248, + "step": 9000 + }, + { + "epoch": 18.0, + "eval_performance": { + "AngleClassification_1": 1.0, + "AngleClassification_2": 0.998, + "AngleClassification_3": 0.9740518962075848, + "Equal_1": 0.996, + "Equal_2": 0.9700598802395209, + "Equal_3": 0.93812375249501, + "LineComparison_1": 0.998, + "LineComparison_2": 1.0, + "LineComparison_3": 0.9940119760479041, + "Parallel_1": 0.9879759519038076, + "Parallel_2": 0.9979959919839679, + "Parallel_3": 0.992, + "Perpendicular_1": 0.996, + "Perpendicular_2": 0.978, + "Perpendicular_3": 0.7464929859719439, + "PointLiesOnCircle_1": 0.9979959919839679, + "PointLiesOnCircle_2": 0.9973333333333333, + "PointLiesOnCircle_3": 0.9896666666666666, + "PointLiesOnLine_1": 
0.9979959919839679, + "PointLiesOnLine_2": 0.9959919839679359, + "PointLiesOnLine_3": 0.9740518962075848 + }, + "eval_runtime": 320.1045, + "eval_samples_per_second": 32.802, + "eval_steps_per_second": 0.656, + "step": 9000 + }, + { + "epoch": 18.002, + "grad_norm": 1.8216357231140137, + "learning_rate": 2e-05, + "loss": 0.03083301, + "step": 9001 + }, + { + "epoch": 18.004, + "grad_norm": 0.8909358978271484, + "learning_rate": 2e-05, + "loss": 0.02501141, + "step": 9002 + }, + { + "epoch": 18.006, + "grad_norm": 1.4257256984710693, + "learning_rate": 2e-05, + "loss": 0.03546458, + "step": 9003 + }, + { + "epoch": 18.008, + "grad_norm": 1.231162667274475, + "learning_rate": 2e-05, + "loss": 0.04436444, + "step": 9004 + }, + { + "epoch": 18.01, + "grad_norm": 2.071355104446411, + "learning_rate": 2e-05, + "loss": 0.05257452, + "step": 9005 + }, + { + "epoch": 18.012, + "grad_norm": 2.303741455078125, + "learning_rate": 2e-05, + "loss": 0.04662026, + "step": 9006 + }, + { + "epoch": 18.014, + "grad_norm": 1.0220463275909424, + "learning_rate": 2e-05, + "loss": 0.03330886, + "step": 9007 + }, + { + "epoch": 18.016, + "grad_norm": 2.040062665939331, + "learning_rate": 2e-05, + "loss": 0.04663801, + "step": 9008 + }, + { + "epoch": 18.018, + "grad_norm": 2.4291489124298096, + "learning_rate": 2e-05, + "loss": 0.05256157, + "step": 9009 + }, + { + "epoch": 18.02, + "grad_norm": 2.1016221046447754, + "learning_rate": 2e-05, + "loss": 0.04151364, + "step": 9010 + }, + { + "epoch": 18.022, + "grad_norm": 1.0601978302001953, + "learning_rate": 2e-05, + "loss": 0.04119922, + "step": 9011 + }, + { + "epoch": 18.024, + "grad_norm": 1.0514633655548096, + "learning_rate": 2e-05, + "loss": 0.03135135, + "step": 9012 + }, + { + "epoch": 18.026, + "grad_norm": 1.4121065139770508, + "learning_rate": 2e-05, + "loss": 0.04619808, + "step": 9013 + }, + { + "epoch": 18.028, + "grad_norm": 0.9003342986106873, + "learning_rate": 2e-05, + "loss": 0.02542493, + "step": 9014 + }, + { + "epoch": 18.03, + "grad_norm": 2.492168664932251, + "learning_rate": 2e-05, + "loss": 0.04310292, + "step": 9015 + }, + { + "epoch": 18.032, + "grad_norm": 1.2616229057312012, + "learning_rate": 2e-05, + "loss": 0.03716831, + "step": 9016 + }, + { + "epoch": 18.034, + "grad_norm": 1.5388479232788086, + "learning_rate": 2e-05, + "loss": 0.05547727, + "step": 9017 + }, + { + "epoch": 18.036, + "grad_norm": 2.8383405208587646, + "learning_rate": 2e-05, + "loss": 0.04588626, + "step": 9018 + }, + { + "epoch": 18.038, + "grad_norm": 1.0203205347061157, + "learning_rate": 2e-05, + "loss": 0.03126208, + "step": 9019 + }, + { + "epoch": 18.04, + "grad_norm": 1.4815860986709595, + "learning_rate": 2e-05, + "loss": 0.04387635, + "step": 9020 + }, + { + "epoch": 18.042, + "grad_norm": 1.163259744644165, + "learning_rate": 2e-05, + "loss": 0.0341371, + "step": 9021 + }, + { + "epoch": 18.044, + "grad_norm": 1.398336410522461, + "learning_rate": 2e-05, + "loss": 0.03288767, + "step": 9022 + }, + { + "epoch": 18.046, + "grad_norm": 1.4146158695220947, + "learning_rate": 2e-05, + "loss": 0.03783367, + "step": 9023 + }, + { + "epoch": 18.048, + "grad_norm": 1.2341772317886353, + "learning_rate": 2e-05, + "loss": 0.0348506, + "step": 9024 + }, + { + "epoch": 18.05, + "grad_norm": 1.4742604494094849, + "learning_rate": 2e-05, + "loss": 0.04025548, + "step": 9025 + }, + { + "epoch": 18.052, + "grad_norm": 1.2109090089797974, + "learning_rate": 2e-05, + "loss": 0.02520112, + "step": 9026 + }, + { + "epoch": 18.054, + "grad_norm": 0.9972143173217773, + 
"learning_rate": 2e-05, + "loss": 0.03415427, + "step": 9027 + }, + { + "epoch": 18.056, + "grad_norm": 1.110215425491333, + "learning_rate": 2e-05, + "loss": 0.03701129, + "step": 9028 + }, + { + "epoch": 18.058, + "grad_norm": 1.2750699520111084, + "learning_rate": 2e-05, + "loss": 0.03478452, + "step": 9029 + }, + { + "epoch": 18.06, + "grad_norm": 0.906257152557373, + "learning_rate": 2e-05, + "loss": 0.02598533, + "step": 9030 + }, + { + "epoch": 18.062, + "grad_norm": 2.5207362174987793, + "learning_rate": 2e-05, + "loss": 0.04807776, + "step": 9031 + }, + { + "epoch": 18.064, + "grad_norm": 1.246948480606079, + "learning_rate": 2e-05, + "loss": 0.03488917, + "step": 9032 + }, + { + "epoch": 18.066, + "grad_norm": 2.9162485599517822, + "learning_rate": 2e-05, + "loss": 0.03648629, + "step": 9033 + }, + { + "epoch": 18.068, + "grad_norm": 1.169908046722412, + "learning_rate": 2e-05, + "loss": 0.02641434, + "step": 9034 + }, + { + "epoch": 18.07, + "grad_norm": 1.152937889099121, + "learning_rate": 2e-05, + "loss": 0.03226584, + "step": 9035 + }, + { + "epoch": 18.072, + "grad_norm": 1.7191685438156128, + "learning_rate": 2e-05, + "loss": 0.05071743, + "step": 9036 + }, + { + "epoch": 18.074, + "grad_norm": 1.6077672243118286, + "learning_rate": 2e-05, + "loss": 0.02946869, + "step": 9037 + }, + { + "epoch": 18.076, + "grad_norm": 1.787205457687378, + "learning_rate": 2e-05, + "loss": 0.02539303, + "step": 9038 + }, + { + "epoch": 18.078, + "grad_norm": 1.3960199356079102, + "learning_rate": 2e-05, + "loss": 0.03590056, + "step": 9039 + }, + { + "epoch": 18.08, + "grad_norm": 1.8743473291397095, + "learning_rate": 2e-05, + "loss": 0.03464347, + "step": 9040 + }, + { + "epoch": 18.082, + "grad_norm": 1.2113933563232422, + "learning_rate": 2e-05, + "loss": 0.03156437, + "step": 9041 + }, + { + "epoch": 18.084, + "grad_norm": 1.8189163208007812, + "learning_rate": 2e-05, + "loss": 0.04444864, + "step": 9042 + }, + { + "epoch": 18.086, + "grad_norm": 1.960848331451416, + "learning_rate": 2e-05, + "loss": 0.03387512, + "step": 9043 + }, + { + "epoch": 18.088, + "grad_norm": 1.3409546613693237, + "learning_rate": 2e-05, + "loss": 0.03057274, + "step": 9044 + }, + { + "epoch": 18.09, + "grad_norm": 1.4044934511184692, + "learning_rate": 2e-05, + "loss": 0.03606112, + "step": 9045 + }, + { + "epoch": 18.092, + "grad_norm": 1.1396024227142334, + "learning_rate": 2e-05, + "loss": 0.03115062, + "step": 9046 + }, + { + "epoch": 18.094, + "grad_norm": 1.4510269165039062, + "learning_rate": 2e-05, + "loss": 0.02674197, + "step": 9047 + }, + { + "epoch": 18.096, + "grad_norm": 1.1940547227859497, + "learning_rate": 2e-05, + "loss": 0.03157366, + "step": 9048 + }, + { + "epoch": 18.098, + "grad_norm": 1.4505969285964966, + "learning_rate": 2e-05, + "loss": 0.04264753, + "step": 9049 + }, + { + "epoch": 18.1, + "grad_norm": 1.8302192687988281, + "learning_rate": 2e-05, + "loss": 0.04496709, + "step": 9050 + }, + { + "epoch": 18.102, + "grad_norm": 1.3621551990509033, + "learning_rate": 2e-05, + "loss": 0.03655332, + "step": 9051 + }, + { + "epoch": 18.104, + "grad_norm": 0.982716977596283, + "learning_rate": 2e-05, + "loss": 0.03123835, + "step": 9052 + }, + { + "epoch": 18.106, + "grad_norm": 1.5590128898620605, + "learning_rate": 2e-05, + "loss": 0.04147382, + "step": 9053 + }, + { + "epoch": 18.108, + "grad_norm": 1.2122939825057983, + "learning_rate": 2e-05, + "loss": 0.03553297, + "step": 9054 + }, + { + "epoch": 18.11, + "grad_norm": 1.2034052610397339, + "learning_rate": 2e-05, + "loss": 
0.04050352, + "step": 9055 + }, + { + "epoch": 18.112, + "grad_norm": 1.3900257349014282, + "learning_rate": 2e-05, + "loss": 0.05008593, + "step": 9056 + }, + { + "epoch": 18.114, + "grad_norm": 1.8129817247390747, + "learning_rate": 2e-05, + "loss": 0.03101974, + "step": 9057 + }, + { + "epoch": 18.116, + "grad_norm": 1.3509774208068848, + "learning_rate": 2e-05, + "loss": 0.03191446, + "step": 9058 + }, + { + "epoch": 18.118, + "grad_norm": 1.6840113401412964, + "learning_rate": 2e-05, + "loss": 0.03167528, + "step": 9059 + }, + { + "epoch": 18.12, + "grad_norm": 1.132699966430664, + "learning_rate": 2e-05, + "loss": 0.03625121, + "step": 9060 + }, + { + "epoch": 18.122, + "grad_norm": 1.4251673221588135, + "learning_rate": 2e-05, + "loss": 0.04312788, + "step": 9061 + }, + { + "epoch": 18.124, + "grad_norm": 1.160861611366272, + "learning_rate": 2e-05, + "loss": 0.03482513, + "step": 9062 + }, + { + "epoch": 18.126, + "grad_norm": 1.3063910007476807, + "learning_rate": 2e-05, + "loss": 0.03584398, + "step": 9063 + }, + { + "epoch": 18.128, + "grad_norm": 1.410151481628418, + "learning_rate": 2e-05, + "loss": 0.04368694, + "step": 9064 + }, + { + "epoch": 18.13, + "grad_norm": 2.592433452606201, + "learning_rate": 2e-05, + "loss": 0.03493256, + "step": 9065 + }, + { + "epoch": 18.132, + "grad_norm": 1.192308783531189, + "learning_rate": 2e-05, + "loss": 0.03010586, + "step": 9066 + }, + { + "epoch": 18.134, + "grad_norm": 1.1783413887023926, + "learning_rate": 2e-05, + "loss": 0.04549911, + "step": 9067 + }, + { + "epoch": 18.136, + "grad_norm": 0.9659681916236877, + "learning_rate": 2e-05, + "loss": 0.02878298, + "step": 9068 + }, + { + "epoch": 18.138, + "grad_norm": 1.4795477390289307, + "learning_rate": 2e-05, + "loss": 0.03338488, + "step": 9069 + }, + { + "epoch": 18.14, + "grad_norm": 1.3997915983200073, + "learning_rate": 2e-05, + "loss": 0.04527793, + "step": 9070 + }, + { + "epoch": 18.142, + "grad_norm": 0.974898099899292, + "learning_rate": 2e-05, + "loss": 0.02718613, + "step": 9071 + }, + { + "epoch": 18.144, + "grad_norm": 1.643044114112854, + "learning_rate": 2e-05, + "loss": 0.04503238, + "step": 9072 + }, + { + "epoch": 18.146, + "grad_norm": 2.4663209915161133, + "learning_rate": 2e-05, + "loss": 0.05017319, + "step": 9073 + }, + { + "epoch": 18.148, + "grad_norm": 1.3137898445129395, + "learning_rate": 2e-05, + "loss": 0.03679071, + "step": 9074 + }, + { + "epoch": 18.15, + "grad_norm": 1.2270176410675049, + "learning_rate": 2e-05, + "loss": 0.02902906, + "step": 9075 + }, + { + "epoch": 18.152, + "grad_norm": 1.5809822082519531, + "learning_rate": 2e-05, + "loss": 0.02031413, + "step": 9076 + }, + { + "epoch": 18.154, + "grad_norm": 1.235666275024414, + "learning_rate": 2e-05, + "loss": 0.0429434, + "step": 9077 + }, + { + "epoch": 18.156, + "grad_norm": 1.3485252857208252, + "learning_rate": 2e-05, + "loss": 0.04158372, + "step": 9078 + }, + { + "epoch": 18.158, + "grad_norm": 1.5450528860092163, + "learning_rate": 2e-05, + "loss": 0.03034053, + "step": 9079 + }, + { + "epoch": 18.16, + "grad_norm": 1.102346420288086, + "learning_rate": 2e-05, + "loss": 0.03850158, + "step": 9080 + }, + { + "epoch": 18.162, + "grad_norm": 1.7495108842849731, + "learning_rate": 2e-05, + "loss": 0.04332937, + "step": 9081 + }, + { + "epoch": 18.164, + "grad_norm": 1.059772253036499, + "learning_rate": 2e-05, + "loss": 0.04180691, + "step": 9082 + }, + { + "epoch": 18.166, + "grad_norm": 2.6385281085968018, + "learning_rate": 2e-05, + "loss": 0.04071419, + "step": 9083 + }, + { + 
"epoch": 18.168, + "grad_norm": 1.432405948638916, + "learning_rate": 2e-05, + "loss": 0.02765241, + "step": 9084 + }, + { + "epoch": 18.17, + "grad_norm": 1.8066904544830322, + "learning_rate": 2e-05, + "loss": 0.04745378, + "step": 9085 + }, + { + "epoch": 18.172, + "grad_norm": 0.885301411151886, + "learning_rate": 2e-05, + "loss": 0.02941084, + "step": 9086 + }, + { + "epoch": 18.174, + "grad_norm": 1.0743720531463623, + "learning_rate": 2e-05, + "loss": 0.03165472, + "step": 9087 + }, + { + "epoch": 18.176, + "grad_norm": 1.8421827554702759, + "learning_rate": 2e-05, + "loss": 0.03389416, + "step": 9088 + }, + { + "epoch": 18.178, + "grad_norm": 1.3630011081695557, + "learning_rate": 2e-05, + "loss": 0.03734118, + "step": 9089 + }, + { + "epoch": 18.18, + "grad_norm": 1.5092341899871826, + "learning_rate": 2e-05, + "loss": 0.04258218, + "step": 9090 + }, + { + "epoch": 18.182, + "grad_norm": 2.4805521965026855, + "learning_rate": 2e-05, + "loss": 0.05131678, + "step": 9091 + }, + { + "epoch": 18.184, + "grad_norm": 1.2178826332092285, + "learning_rate": 2e-05, + "loss": 0.03363734, + "step": 9092 + }, + { + "epoch": 18.186, + "grad_norm": 1.2770280838012695, + "learning_rate": 2e-05, + "loss": 0.04184774, + "step": 9093 + }, + { + "epoch": 18.188, + "grad_norm": 2.7290360927581787, + "learning_rate": 2e-05, + "loss": 0.04432489, + "step": 9094 + }, + { + "epoch": 18.19, + "grad_norm": 1.8310285806655884, + "learning_rate": 2e-05, + "loss": 0.03530759, + "step": 9095 + }, + { + "epoch": 18.192, + "grad_norm": 1.0748549699783325, + "learning_rate": 2e-05, + "loss": 0.03645511, + "step": 9096 + }, + { + "epoch": 18.194, + "grad_norm": 1.8210999965667725, + "learning_rate": 2e-05, + "loss": 0.04495174, + "step": 9097 + }, + { + "epoch": 18.196, + "grad_norm": 1.0958075523376465, + "learning_rate": 2e-05, + "loss": 0.02407749, + "step": 9098 + }, + { + "epoch": 18.198, + "grad_norm": 1.8292564153671265, + "learning_rate": 2e-05, + "loss": 0.0362148, + "step": 9099 + }, + { + "epoch": 18.2, + "grad_norm": 1.0899569988250732, + "learning_rate": 2e-05, + "loss": 0.03374222, + "step": 9100 + }, + { + "epoch": 18.202, + "grad_norm": 1.0096633434295654, + "learning_rate": 2e-05, + "loss": 0.03209589, + "step": 9101 + }, + { + "epoch": 18.204, + "grad_norm": 1.1050734519958496, + "learning_rate": 2e-05, + "loss": 0.03580528, + "step": 9102 + }, + { + "epoch": 18.206, + "grad_norm": 2.117532253265381, + "learning_rate": 2e-05, + "loss": 0.04968563, + "step": 9103 + }, + { + "epoch": 18.208, + "grad_norm": 2.2677221298217773, + "learning_rate": 2e-05, + "loss": 0.06845517, + "step": 9104 + }, + { + "epoch": 18.21, + "grad_norm": 0.8368435502052307, + "learning_rate": 2e-05, + "loss": 0.02537123, + "step": 9105 + }, + { + "epoch": 18.212, + "grad_norm": 1.149839162826538, + "learning_rate": 2e-05, + "loss": 0.03183338, + "step": 9106 + }, + { + "epoch": 18.214, + "grad_norm": 1.2164126634597778, + "learning_rate": 2e-05, + "loss": 0.0268788, + "step": 9107 + }, + { + "epoch": 18.216, + "grad_norm": 1.1083381175994873, + "learning_rate": 2e-05, + "loss": 0.0347072, + "step": 9108 + }, + { + "epoch": 18.218, + "grad_norm": 1.0997363328933716, + "learning_rate": 2e-05, + "loss": 0.03136883, + "step": 9109 + }, + { + "epoch": 18.22, + "grad_norm": 1.3243005275726318, + "learning_rate": 2e-05, + "loss": 0.04105141, + "step": 9110 + }, + { + "epoch": 18.222, + "grad_norm": 1.123883843421936, + "learning_rate": 2e-05, + "loss": 0.04037739, + "step": 9111 + }, + { + "epoch": 18.224, + "grad_norm": 
1.3631707429885864, + "learning_rate": 2e-05, + "loss": 0.02595056, + "step": 9112 + }, + { + "epoch": 18.226, + "grad_norm": 1.5019488334655762, + "learning_rate": 2e-05, + "loss": 0.03916592, + "step": 9113 + }, + { + "epoch": 18.228, + "grad_norm": 1.153467059135437, + "learning_rate": 2e-05, + "loss": 0.03865172, + "step": 9114 + }, + { + "epoch": 18.23, + "grad_norm": 1.4573817253112793, + "learning_rate": 2e-05, + "loss": 0.02908707, + "step": 9115 + }, + { + "epoch": 18.232, + "grad_norm": 1.7540217638015747, + "learning_rate": 2e-05, + "loss": 0.03963351, + "step": 9116 + }, + { + "epoch": 18.234, + "grad_norm": 2.5012366771698, + "learning_rate": 2e-05, + "loss": 0.04361957, + "step": 9117 + }, + { + "epoch": 18.236, + "grad_norm": 1.4801292419433594, + "learning_rate": 2e-05, + "loss": 0.03021835, + "step": 9118 + }, + { + "epoch": 18.238, + "grad_norm": 1.3169306516647339, + "learning_rate": 2e-05, + "loss": 0.03268102, + "step": 9119 + }, + { + "epoch": 18.24, + "grad_norm": 1.0948139429092407, + "learning_rate": 2e-05, + "loss": 0.02789821, + "step": 9120 + }, + { + "epoch": 18.242, + "grad_norm": 1.2871372699737549, + "learning_rate": 2e-05, + "loss": 0.03492365, + "step": 9121 + }, + { + "epoch": 18.244, + "grad_norm": 1.7145776748657227, + "learning_rate": 2e-05, + "loss": 0.03364137, + "step": 9122 + }, + { + "epoch": 18.246, + "grad_norm": 1.5056641101837158, + "learning_rate": 2e-05, + "loss": 0.03888101, + "step": 9123 + }, + { + "epoch": 18.248, + "grad_norm": 2.125732660293579, + "learning_rate": 2e-05, + "loss": 0.04138645, + "step": 9124 + }, + { + "epoch": 18.25, + "grad_norm": 2.0914463996887207, + "learning_rate": 2e-05, + "loss": 0.04323914, + "step": 9125 + }, + { + "epoch": 18.252, + "grad_norm": 1.3840261697769165, + "learning_rate": 2e-05, + "loss": 0.04744683, + "step": 9126 + }, + { + "epoch": 18.254, + "grad_norm": 1.3877594470977783, + "learning_rate": 2e-05, + "loss": 0.03428207, + "step": 9127 + }, + { + "epoch": 18.256, + "grad_norm": 1.9282671213150024, + "learning_rate": 2e-05, + "loss": 0.03370817, + "step": 9128 + }, + { + "epoch": 18.258, + "grad_norm": 1.2625566720962524, + "learning_rate": 2e-05, + "loss": 0.03845701, + "step": 9129 + }, + { + "epoch": 18.26, + "grad_norm": 1.497373104095459, + "learning_rate": 2e-05, + "loss": 0.02765183, + "step": 9130 + }, + { + "epoch": 18.262, + "grad_norm": 0.938334584236145, + "learning_rate": 2e-05, + "loss": 0.0283683, + "step": 9131 + }, + { + "epoch": 18.264, + "grad_norm": 0.8075963258743286, + "learning_rate": 2e-05, + "loss": 0.0257271, + "step": 9132 + }, + { + "epoch": 18.266, + "grad_norm": 2.6727170944213867, + "learning_rate": 2e-05, + "loss": 0.03528313, + "step": 9133 + }, + { + "epoch": 18.268, + "grad_norm": 1.1714603900909424, + "learning_rate": 2e-05, + "loss": 0.03839287, + "step": 9134 + }, + { + "epoch": 18.27, + "grad_norm": 1.3799819946289062, + "learning_rate": 2e-05, + "loss": 0.04040816, + "step": 9135 + }, + { + "epoch": 18.272, + "grad_norm": 0.8756110668182373, + "learning_rate": 2e-05, + "loss": 0.02800023, + "step": 9136 + }, + { + "epoch": 18.274, + "grad_norm": 1.278294563293457, + "learning_rate": 2e-05, + "loss": 0.03553412, + "step": 9137 + }, + { + "epoch": 18.276, + "grad_norm": 1.649017333984375, + "learning_rate": 2e-05, + "loss": 0.0240197, + "step": 9138 + }, + { + "epoch": 18.278, + "grad_norm": 1.2307323217391968, + "learning_rate": 2e-05, + "loss": 0.03196525, + "step": 9139 + }, + { + "epoch": 18.28, + "grad_norm": 1.2029914855957031, + "learning_rate": 
2e-05, + "loss": 0.04263403, + "step": 9140 + }, + { + "epoch": 18.282, + "grad_norm": 1.1589373350143433, + "learning_rate": 2e-05, + "loss": 0.04057025, + "step": 9141 + }, + { + "epoch": 18.284, + "grad_norm": 1.0348948240280151, + "learning_rate": 2e-05, + "loss": 0.04266765, + "step": 9142 + }, + { + "epoch": 18.286, + "grad_norm": 0.8373733162879944, + "learning_rate": 2e-05, + "loss": 0.01983244, + "step": 9143 + }, + { + "epoch": 18.288, + "grad_norm": 2.752629041671753, + "learning_rate": 2e-05, + "loss": 0.04166656, + "step": 9144 + }, + { + "epoch": 18.29, + "grad_norm": 1.1935979127883911, + "learning_rate": 2e-05, + "loss": 0.04558745, + "step": 9145 + }, + { + "epoch": 18.292, + "grad_norm": 0.9616577625274658, + "learning_rate": 2e-05, + "loss": 0.02630688, + "step": 9146 + }, + { + "epoch": 18.294, + "grad_norm": 1.3004382848739624, + "learning_rate": 2e-05, + "loss": 0.04482454, + "step": 9147 + }, + { + "epoch": 18.296, + "grad_norm": 2.1716208457946777, + "learning_rate": 2e-05, + "loss": 0.03779503, + "step": 9148 + }, + { + "epoch": 18.298, + "grad_norm": 1.1647388935089111, + "learning_rate": 2e-05, + "loss": 0.03616937, + "step": 9149 + }, + { + "epoch": 18.3, + "grad_norm": 1.6994318962097168, + "learning_rate": 2e-05, + "loss": 0.03434645, + "step": 9150 + }, + { + "epoch": 18.302, + "grad_norm": 1.055916666984558, + "learning_rate": 2e-05, + "loss": 0.04265877, + "step": 9151 + }, + { + "epoch": 18.304, + "grad_norm": 1.1713130474090576, + "learning_rate": 2e-05, + "loss": 0.03589529, + "step": 9152 + }, + { + "epoch": 18.306, + "grad_norm": 1.2549328804016113, + "learning_rate": 2e-05, + "loss": 0.03371196, + "step": 9153 + }, + { + "epoch": 18.308, + "grad_norm": 0.9361233115196228, + "learning_rate": 2e-05, + "loss": 0.02483992, + "step": 9154 + }, + { + "epoch": 18.31, + "grad_norm": 1.0725373029708862, + "learning_rate": 2e-05, + "loss": 0.03467751, + "step": 9155 + }, + { + "epoch": 18.312, + "grad_norm": 1.8279783725738525, + "learning_rate": 2e-05, + "loss": 0.05167603, + "step": 9156 + }, + { + "epoch": 18.314, + "grad_norm": 0.995499312877655, + "learning_rate": 2e-05, + "loss": 0.02847271, + "step": 9157 + }, + { + "epoch": 18.316, + "grad_norm": 1.0232125520706177, + "learning_rate": 2e-05, + "loss": 0.0327347, + "step": 9158 + }, + { + "epoch": 18.318, + "grad_norm": 1.25034761428833, + "learning_rate": 2e-05, + "loss": 0.04012694, + "step": 9159 + }, + { + "epoch": 18.32, + "grad_norm": 2.3386640548706055, + "learning_rate": 2e-05, + "loss": 0.03319935, + "step": 9160 + }, + { + "epoch": 18.322, + "grad_norm": 1.7142624855041504, + "learning_rate": 2e-05, + "loss": 0.03917816, + "step": 9161 + }, + { + "epoch": 18.324, + "grad_norm": 1.209825038909912, + "learning_rate": 2e-05, + "loss": 0.03275396, + "step": 9162 + }, + { + "epoch": 18.326, + "grad_norm": 1.3008837699890137, + "learning_rate": 2e-05, + "loss": 0.05405723, + "step": 9163 + }, + { + "epoch": 18.328, + "grad_norm": 1.4746818542480469, + "learning_rate": 2e-05, + "loss": 0.05087238, + "step": 9164 + }, + { + "epoch": 18.33, + "grad_norm": 1.1503615379333496, + "learning_rate": 2e-05, + "loss": 0.03378589, + "step": 9165 + }, + { + "epoch": 18.332, + "grad_norm": 1.8128838539123535, + "learning_rate": 2e-05, + "loss": 0.0437137, + "step": 9166 + }, + { + "epoch": 18.334, + "grad_norm": 0.9782925248146057, + "learning_rate": 2e-05, + "loss": 0.02864728, + "step": 9167 + }, + { + "epoch": 18.336, + "grad_norm": 1.0823737382888794, + "learning_rate": 2e-05, + "loss": 0.03686111, + "step": 
9168 + }, + { + "epoch": 18.338, + "grad_norm": 1.7901039123535156, + "learning_rate": 2e-05, + "loss": 0.05045727, + "step": 9169 + }, + { + "epoch": 18.34, + "grad_norm": 1.2802319526672363, + "learning_rate": 2e-05, + "loss": 0.031836, + "step": 9170 + }, + { + "epoch": 18.342, + "grad_norm": 1.067213535308838, + "learning_rate": 2e-05, + "loss": 0.04307377, + "step": 9171 + }, + { + "epoch": 18.344, + "grad_norm": 2.6261112689971924, + "learning_rate": 2e-05, + "loss": 0.03852256, + "step": 9172 + }, + { + "epoch": 18.346, + "grad_norm": 2.581172466278076, + "learning_rate": 2e-05, + "loss": 0.04666492, + "step": 9173 + }, + { + "epoch": 18.348, + "grad_norm": 1.2471551895141602, + "learning_rate": 2e-05, + "loss": 0.03403299, + "step": 9174 + }, + { + "epoch": 18.35, + "grad_norm": 0.8967499732971191, + "learning_rate": 2e-05, + "loss": 0.02881518, + "step": 9175 + }, + { + "epoch": 18.352, + "grad_norm": 1.7108482122421265, + "learning_rate": 2e-05, + "loss": 0.03858823, + "step": 9176 + }, + { + "epoch": 18.354, + "grad_norm": 1.9164936542510986, + "learning_rate": 2e-05, + "loss": 0.02809501, + "step": 9177 + }, + { + "epoch": 18.356, + "grad_norm": 1.0232477188110352, + "learning_rate": 2e-05, + "loss": 0.02891929, + "step": 9178 + }, + { + "epoch": 18.358, + "grad_norm": 1.1295270919799805, + "learning_rate": 2e-05, + "loss": 0.0412226, + "step": 9179 + }, + { + "epoch": 18.36, + "grad_norm": 1.951155662536621, + "learning_rate": 2e-05, + "loss": 0.05396903, + "step": 9180 + }, + { + "epoch": 18.362, + "grad_norm": 1.2782940864562988, + "learning_rate": 2e-05, + "loss": 0.03953922, + "step": 9181 + }, + { + "epoch": 18.364, + "grad_norm": 1.5052493810653687, + "learning_rate": 2e-05, + "loss": 0.04199969, + "step": 9182 + }, + { + "epoch": 18.366, + "grad_norm": 2.334589958190918, + "learning_rate": 2e-05, + "loss": 0.03334442, + "step": 9183 + }, + { + "epoch": 18.368, + "grad_norm": 1.534926176071167, + "learning_rate": 2e-05, + "loss": 0.03135791, + "step": 9184 + }, + { + "epoch": 18.37, + "grad_norm": 2.0098283290863037, + "learning_rate": 2e-05, + "loss": 0.04160352, + "step": 9185 + }, + { + "epoch": 18.372, + "grad_norm": 0.8825958967208862, + "learning_rate": 2e-05, + "loss": 0.02310374, + "step": 9186 + }, + { + "epoch": 18.374, + "grad_norm": 1.5516126155853271, + "learning_rate": 2e-05, + "loss": 0.03449169, + "step": 9187 + }, + { + "epoch": 18.376, + "grad_norm": 1.0211329460144043, + "learning_rate": 2e-05, + "loss": 0.03518451, + "step": 9188 + }, + { + "epoch": 18.378, + "grad_norm": 1.4555100202560425, + "learning_rate": 2e-05, + "loss": 0.03537372, + "step": 9189 + }, + { + "epoch": 18.38, + "grad_norm": 1.6372886896133423, + "learning_rate": 2e-05, + "loss": 0.03058396, + "step": 9190 + }, + { + "epoch": 18.382, + "grad_norm": 1.5627180337905884, + "learning_rate": 2e-05, + "loss": 0.02930931, + "step": 9191 + }, + { + "epoch": 18.384, + "grad_norm": 1.7449994087219238, + "learning_rate": 2e-05, + "loss": 0.03934868, + "step": 9192 + }, + { + "epoch": 18.386, + "grad_norm": 0.9360613226890564, + "learning_rate": 2e-05, + "loss": 0.02767797, + "step": 9193 + }, + { + "epoch": 18.388, + "grad_norm": 1.376953125, + "learning_rate": 2e-05, + "loss": 0.02869485, + "step": 9194 + }, + { + "epoch": 18.39, + "grad_norm": 0.9833387732505798, + "learning_rate": 2e-05, + "loss": 0.02881843, + "step": 9195 + }, + { + "epoch": 18.392, + "grad_norm": 1.3985074758529663, + "learning_rate": 2e-05, + "loss": 0.03320839, + "step": 9196 + }, + { + "epoch": 18.394, + "grad_norm": 
1.116469144821167, + "learning_rate": 2e-05, + "loss": 0.031327, + "step": 9197 + }, + { + "epoch": 18.396, + "grad_norm": 1.913405418395996, + "learning_rate": 2e-05, + "loss": 0.03109054, + "step": 9198 + }, + { + "epoch": 18.398, + "grad_norm": 0.9129771590232849, + "learning_rate": 2e-05, + "loss": 0.02565073, + "step": 9199 + }, + { + "epoch": 18.4, + "grad_norm": 0.8751816749572754, + "learning_rate": 2e-05, + "loss": 0.02509234, + "step": 9200 + }, + { + "epoch": 18.402, + "grad_norm": 1.0799328088760376, + "learning_rate": 2e-05, + "loss": 0.02929278, + "step": 9201 + }, + { + "epoch": 18.404, + "grad_norm": 1.4381532669067383, + "learning_rate": 2e-05, + "loss": 0.03288494, + "step": 9202 + }, + { + "epoch": 18.406, + "grad_norm": 1.2862842082977295, + "learning_rate": 2e-05, + "loss": 0.04411345, + "step": 9203 + }, + { + "epoch": 18.408, + "grad_norm": 1.2750723361968994, + "learning_rate": 2e-05, + "loss": 0.04329745, + "step": 9204 + }, + { + "epoch": 18.41, + "grad_norm": 1.3029229640960693, + "learning_rate": 2e-05, + "loss": 0.03379948, + "step": 9205 + }, + { + "epoch": 18.412, + "grad_norm": 1.3032692670822144, + "learning_rate": 2e-05, + "loss": 0.04243733, + "step": 9206 + }, + { + "epoch": 18.414, + "grad_norm": 1.4686411619186401, + "learning_rate": 2e-05, + "loss": 0.04486408, + "step": 9207 + }, + { + "epoch": 18.416, + "grad_norm": 2.9766223430633545, + "learning_rate": 2e-05, + "loss": 0.04959375, + "step": 9208 + }, + { + "epoch": 18.418, + "grad_norm": 1.8033760786056519, + "learning_rate": 2e-05, + "loss": 0.04121406, + "step": 9209 + }, + { + "epoch": 18.42, + "grad_norm": 1.320888876914978, + "learning_rate": 2e-05, + "loss": 0.03473954, + "step": 9210 + }, + { + "epoch": 18.422, + "grad_norm": 1.6498913764953613, + "learning_rate": 2e-05, + "loss": 0.04184615, + "step": 9211 + }, + { + "epoch": 18.424, + "grad_norm": 1.0657768249511719, + "learning_rate": 2e-05, + "loss": 0.03823342, + "step": 9212 + }, + { + "epoch": 18.426, + "grad_norm": 1.5349171161651611, + "learning_rate": 2e-05, + "loss": 0.04052274, + "step": 9213 + }, + { + "epoch": 18.428, + "grad_norm": 1.3626680374145508, + "learning_rate": 2e-05, + "loss": 0.03621456, + "step": 9214 + }, + { + "epoch": 18.43, + "grad_norm": 2.154515027999878, + "learning_rate": 2e-05, + "loss": 0.05104381, + "step": 9215 + }, + { + "epoch": 18.432, + "grad_norm": 2.7905545234680176, + "learning_rate": 2e-05, + "loss": 0.04180931, + "step": 9216 + }, + { + "epoch": 18.434, + "grad_norm": 1.2199686765670776, + "learning_rate": 2e-05, + "loss": 0.0278045, + "step": 9217 + }, + { + "epoch": 18.436, + "grad_norm": 1.7791190147399902, + "learning_rate": 2e-05, + "loss": 0.03621379, + "step": 9218 + }, + { + "epoch": 18.438, + "grad_norm": 2.6824915409088135, + "learning_rate": 2e-05, + "loss": 0.04269816, + "step": 9219 + }, + { + "epoch": 18.44, + "grad_norm": 1.4576973915100098, + "learning_rate": 2e-05, + "loss": 0.04696831, + "step": 9220 + }, + { + "epoch": 18.442, + "grad_norm": 1.0133570432662964, + "learning_rate": 2e-05, + "loss": 0.03169259, + "step": 9221 + }, + { + "epoch": 18.444, + "grad_norm": 1.0140568017959595, + "learning_rate": 2e-05, + "loss": 0.02977751, + "step": 9222 + }, + { + "epoch": 18.446, + "grad_norm": 1.2628326416015625, + "learning_rate": 2e-05, + "loss": 0.0529662, + "step": 9223 + }, + { + "epoch": 18.448, + "grad_norm": 1.7104601860046387, + "learning_rate": 2e-05, + "loss": 0.05581909, + "step": 9224 + }, + { + "epoch": 18.45, + "grad_norm": 0.982793390750885, + "learning_rate": 
2e-05, + "loss": 0.03106131, + "step": 9225 + }, + { + "epoch": 18.452, + "grad_norm": 1.2446315288543701, + "learning_rate": 2e-05, + "loss": 0.04647019, + "step": 9226 + }, + { + "epoch": 18.454, + "grad_norm": 1.1005967855453491, + "learning_rate": 2e-05, + "loss": 0.02941687, + "step": 9227 + }, + { + "epoch": 18.456, + "grad_norm": 1.3427083492279053, + "learning_rate": 2e-05, + "loss": 0.03301994, + "step": 9228 + }, + { + "epoch": 18.458, + "grad_norm": 0.9936813712120056, + "learning_rate": 2e-05, + "loss": 0.03047378, + "step": 9229 + }, + { + "epoch": 18.46, + "grad_norm": 1.9145174026489258, + "learning_rate": 2e-05, + "loss": 0.03703724, + "step": 9230 + }, + { + "epoch": 18.462, + "grad_norm": 1.308974027633667, + "learning_rate": 2e-05, + "loss": 0.04389992, + "step": 9231 + }, + { + "epoch": 18.464, + "grad_norm": 0.7866896390914917, + "learning_rate": 2e-05, + "loss": 0.0197884, + "step": 9232 + }, + { + "epoch": 18.466, + "grad_norm": 1.48411226272583, + "learning_rate": 2e-05, + "loss": 0.03440548, + "step": 9233 + }, + { + "epoch": 18.468, + "grad_norm": 1.2252159118652344, + "learning_rate": 2e-05, + "loss": 0.03730087, + "step": 9234 + }, + { + "epoch": 18.47, + "grad_norm": 2.652885675430298, + "learning_rate": 2e-05, + "loss": 0.04888747, + "step": 9235 + }, + { + "epoch": 18.472, + "grad_norm": 0.9633825421333313, + "learning_rate": 2e-05, + "loss": 0.03162334, + "step": 9236 + }, + { + "epoch": 18.474, + "grad_norm": 1.207777500152588, + "learning_rate": 2e-05, + "loss": 0.03481243, + "step": 9237 + }, + { + "epoch": 18.476, + "grad_norm": 2.7908196449279785, + "learning_rate": 2e-05, + "loss": 0.06109419, + "step": 9238 + }, + { + "epoch": 18.478, + "grad_norm": 1.291577935218811, + "learning_rate": 2e-05, + "loss": 0.04738308, + "step": 9239 + }, + { + "epoch": 18.48, + "grad_norm": 1.2779804468154907, + "learning_rate": 2e-05, + "loss": 0.03293496, + "step": 9240 + }, + { + "epoch": 18.482, + "grad_norm": 1.388176679611206, + "learning_rate": 2e-05, + "loss": 0.03266312, + "step": 9241 + }, + { + "epoch": 18.484, + "grad_norm": 1.3886147737503052, + "learning_rate": 2e-05, + "loss": 0.02937507, + "step": 9242 + }, + { + "epoch": 18.486, + "grad_norm": 1.400505781173706, + "learning_rate": 2e-05, + "loss": 0.04522613, + "step": 9243 + }, + { + "epoch": 18.488, + "grad_norm": 1.2880887985229492, + "learning_rate": 2e-05, + "loss": 0.03753714, + "step": 9244 + }, + { + "epoch": 18.49, + "grad_norm": 1.0358575582504272, + "learning_rate": 2e-05, + "loss": 0.03539129, + "step": 9245 + }, + { + "epoch": 18.492, + "grad_norm": 3.62886905670166, + "learning_rate": 2e-05, + "loss": 0.03471145, + "step": 9246 + }, + { + "epoch": 18.494, + "grad_norm": 1.1280561685562134, + "learning_rate": 2e-05, + "loss": 0.03343772, + "step": 9247 + }, + { + "epoch": 18.496, + "grad_norm": 1.5975375175476074, + "learning_rate": 2e-05, + "loss": 0.0507987, + "step": 9248 + }, + { + "epoch": 18.498, + "grad_norm": 1.5075079202651978, + "learning_rate": 2e-05, + "loss": 0.05328429, + "step": 9249 + }, + { + "epoch": 18.5, + "grad_norm": 0.8867833614349365, + "learning_rate": 2e-05, + "loss": 0.02282133, + "step": 9250 + }, + { + "epoch": 18.502, + "grad_norm": 0.9173626899719238, + "learning_rate": 2e-05, + "loss": 0.02405676, + "step": 9251 + }, + { + "epoch": 18.504, + "grad_norm": 0.9300764799118042, + "learning_rate": 2e-05, + "loss": 0.03014572, + "step": 9252 + }, + { + "epoch": 18.506, + "grad_norm": 0.6862570643424988, + "learning_rate": 2e-05, + "loss": 0.01409816, + "step": 9253 
+ }, + { + "epoch": 18.508, + "grad_norm": 1.5550457239151, + "learning_rate": 2e-05, + "loss": 0.04886139, + "step": 9254 + }, + { + "epoch": 18.51, + "grad_norm": 1.2698708772659302, + "learning_rate": 2e-05, + "loss": 0.03988787, + "step": 9255 + }, + { + "epoch": 18.512, + "grad_norm": 1.4811738729476929, + "learning_rate": 2e-05, + "loss": 0.03565231, + "step": 9256 + }, + { + "epoch": 18.514, + "grad_norm": 2.2947709560394287, + "learning_rate": 2e-05, + "loss": 0.03940463, + "step": 9257 + }, + { + "epoch": 18.516, + "grad_norm": 1.0649924278259277, + "learning_rate": 2e-05, + "loss": 0.02794782, + "step": 9258 + }, + { + "epoch": 18.518, + "grad_norm": 1.2015745639801025, + "learning_rate": 2e-05, + "loss": 0.03615477, + "step": 9259 + }, + { + "epoch": 18.52, + "grad_norm": 1.1461511850357056, + "learning_rate": 2e-05, + "loss": 0.03559923, + "step": 9260 + }, + { + "epoch": 18.522, + "grad_norm": 1.8255705833435059, + "learning_rate": 2e-05, + "loss": 0.04908207, + "step": 9261 + }, + { + "epoch": 18.524, + "grad_norm": 1.0880002975463867, + "learning_rate": 2e-05, + "loss": 0.02855574, + "step": 9262 + }, + { + "epoch": 18.526, + "grad_norm": 2.4838008880615234, + "learning_rate": 2e-05, + "loss": 0.03444799, + "step": 9263 + }, + { + "epoch": 18.528, + "grad_norm": 1.1246633529663086, + "learning_rate": 2e-05, + "loss": 0.02719912, + "step": 9264 + }, + { + "epoch": 18.53, + "grad_norm": 0.9572923183441162, + "learning_rate": 2e-05, + "loss": 0.02941136, + "step": 9265 + }, + { + "epoch": 18.532, + "grad_norm": 1.3162039518356323, + "learning_rate": 2e-05, + "loss": 0.0248658, + "step": 9266 + }, + { + "epoch": 18.534, + "grad_norm": 1.1983442306518555, + "learning_rate": 2e-05, + "loss": 0.04123002, + "step": 9267 + }, + { + "epoch": 18.536, + "grad_norm": 1.4255578517913818, + "learning_rate": 2e-05, + "loss": 0.03819581, + "step": 9268 + }, + { + "epoch": 18.538, + "grad_norm": 1.2587826251983643, + "learning_rate": 2e-05, + "loss": 0.03482603, + "step": 9269 + }, + { + "epoch": 18.54, + "grad_norm": 1.2358354330062866, + "learning_rate": 2e-05, + "loss": 0.03826098, + "step": 9270 + }, + { + "epoch": 18.542, + "grad_norm": 1.1539580821990967, + "learning_rate": 2e-05, + "loss": 0.03745293, + "step": 9271 + }, + { + "epoch": 18.544, + "grad_norm": 1.8408349752426147, + "learning_rate": 2e-05, + "loss": 0.04294346, + "step": 9272 + }, + { + "epoch": 18.546, + "grad_norm": 2.7005791664123535, + "learning_rate": 2e-05, + "loss": 0.03372211, + "step": 9273 + }, + { + "epoch": 18.548000000000002, + "grad_norm": 2.7007863521575928, + "learning_rate": 2e-05, + "loss": 0.0421975, + "step": 9274 + }, + { + "epoch": 18.55, + "grad_norm": 1.045002818107605, + "learning_rate": 2e-05, + "loss": 0.03461481, + "step": 9275 + }, + { + "epoch": 18.552, + "grad_norm": 1.1427747011184692, + "learning_rate": 2e-05, + "loss": 0.0413534, + "step": 9276 + }, + { + "epoch": 18.554, + "grad_norm": 1.6876646280288696, + "learning_rate": 2e-05, + "loss": 0.04642463, + "step": 9277 + }, + { + "epoch": 18.556, + "grad_norm": 0.9396681189537048, + "learning_rate": 2e-05, + "loss": 0.02529829, + "step": 9278 + }, + { + "epoch": 18.558, + "grad_norm": 1.2221046686172485, + "learning_rate": 2e-05, + "loss": 0.03684171, + "step": 9279 + }, + { + "epoch": 18.56, + "grad_norm": 1.505953073501587, + "learning_rate": 2e-05, + "loss": 0.04012023, + "step": 9280 + }, + { + "epoch": 18.562, + "grad_norm": 1.1318151950836182, + "learning_rate": 2e-05, + "loss": 0.03027987, + "step": 9281 + }, + { + "epoch": 18.564, 
+ "grad_norm": 2.6939613819122314, + "learning_rate": 2e-05, + "loss": 0.0387833, + "step": 9282 + }, + { + "epoch": 18.566, + "grad_norm": 1.3286248445510864, + "learning_rate": 2e-05, + "loss": 0.03856936, + "step": 9283 + }, + { + "epoch": 18.568, + "grad_norm": 2.2467446327209473, + "learning_rate": 2e-05, + "loss": 0.03416197, + "step": 9284 + }, + { + "epoch": 18.57, + "grad_norm": 0.8738142848014832, + "learning_rate": 2e-05, + "loss": 0.030899, + "step": 9285 + }, + { + "epoch": 18.572, + "grad_norm": 1.4438823461532593, + "learning_rate": 2e-05, + "loss": 0.03023734, + "step": 9286 + }, + { + "epoch": 18.574, + "grad_norm": 1.0707905292510986, + "learning_rate": 2e-05, + "loss": 0.03663265, + "step": 9287 + }, + { + "epoch": 18.576, + "grad_norm": 3.671177864074707, + "learning_rate": 2e-05, + "loss": 0.03615709, + "step": 9288 + }, + { + "epoch": 18.578, + "grad_norm": 1.270842432975769, + "learning_rate": 2e-05, + "loss": 0.03984865, + "step": 9289 + }, + { + "epoch": 18.58, + "grad_norm": 1.2456284761428833, + "learning_rate": 2e-05, + "loss": 0.03165886, + "step": 9290 + }, + { + "epoch": 18.582, + "grad_norm": 1.7004910707473755, + "learning_rate": 2e-05, + "loss": 0.0358137, + "step": 9291 + }, + { + "epoch": 18.584, + "grad_norm": 1.0085270404815674, + "learning_rate": 2e-05, + "loss": 0.03215361, + "step": 9292 + }, + { + "epoch": 18.586, + "grad_norm": 1.081712245941162, + "learning_rate": 2e-05, + "loss": 0.02386444, + "step": 9293 + }, + { + "epoch": 18.588, + "grad_norm": 1.1721680164337158, + "learning_rate": 2e-05, + "loss": 0.02527738, + "step": 9294 + }, + { + "epoch": 18.59, + "grad_norm": 0.7640558481216431, + "learning_rate": 2e-05, + "loss": 0.02131698, + "step": 9295 + }, + { + "epoch": 18.592, + "grad_norm": 1.278498888015747, + "learning_rate": 2e-05, + "loss": 0.03705659, + "step": 9296 + }, + { + "epoch": 18.594, + "grad_norm": 1.4128645658493042, + "learning_rate": 2e-05, + "loss": 0.02741608, + "step": 9297 + }, + { + "epoch": 18.596, + "grad_norm": 0.9089370369911194, + "learning_rate": 2e-05, + "loss": 0.02473187, + "step": 9298 + }, + { + "epoch": 18.598, + "grad_norm": 1.2044442892074585, + "learning_rate": 2e-05, + "loss": 0.03941024, + "step": 9299 + }, + { + "epoch": 18.6, + "grad_norm": 1.4648476839065552, + "learning_rate": 2e-05, + "loss": 0.02977973, + "step": 9300 + }, + { + "epoch": 18.602, + "grad_norm": 2.9288134574890137, + "learning_rate": 2e-05, + "loss": 0.05101031, + "step": 9301 + }, + { + "epoch": 18.604, + "grad_norm": 1.2922507524490356, + "learning_rate": 2e-05, + "loss": 0.03375695, + "step": 9302 + }, + { + "epoch": 18.606, + "grad_norm": 3.628523588180542, + "learning_rate": 2e-05, + "loss": 0.03046746, + "step": 9303 + }, + { + "epoch": 18.608, + "grad_norm": 1.5554486513137817, + "learning_rate": 2e-05, + "loss": 0.03474073, + "step": 9304 + }, + { + "epoch": 18.61, + "grad_norm": 1.2011704444885254, + "learning_rate": 2e-05, + "loss": 0.03348228, + "step": 9305 + }, + { + "epoch": 18.612, + "grad_norm": 1.0347377061843872, + "learning_rate": 2e-05, + "loss": 0.03440222, + "step": 9306 + }, + { + "epoch": 18.614, + "grad_norm": 1.410341739654541, + "learning_rate": 2e-05, + "loss": 0.04505805, + "step": 9307 + }, + { + "epoch": 18.616, + "grad_norm": 1.230013370513916, + "learning_rate": 2e-05, + "loss": 0.04353497, + "step": 9308 + }, + { + "epoch": 18.618, + "grad_norm": 0.8737069964408875, + "learning_rate": 2e-05, + "loss": 0.02157835, + "step": 9309 + }, + { + "epoch": 18.62, + "grad_norm": 1.7111151218414307, + 
"learning_rate": 2e-05, + "loss": 0.0389607, + "step": 9310 + }, + { + "epoch": 18.622, + "grad_norm": 1.636101245880127, + "learning_rate": 2e-05, + "loss": 0.04191142, + "step": 9311 + }, + { + "epoch": 18.624, + "grad_norm": 2.3487284183502197, + "learning_rate": 2e-05, + "loss": 0.04457171, + "step": 9312 + }, + { + "epoch": 18.626, + "grad_norm": 1.1017884016036987, + "learning_rate": 2e-05, + "loss": 0.03230072, + "step": 9313 + }, + { + "epoch": 18.628, + "grad_norm": 1.428139567375183, + "learning_rate": 2e-05, + "loss": 0.03711209, + "step": 9314 + }, + { + "epoch": 18.63, + "grad_norm": 1.0108582973480225, + "learning_rate": 2e-05, + "loss": 0.03145947, + "step": 9315 + }, + { + "epoch": 18.632, + "grad_norm": 1.5939596891403198, + "learning_rate": 2e-05, + "loss": 0.03615252, + "step": 9316 + }, + { + "epoch": 18.634, + "grad_norm": 1.2237117290496826, + "learning_rate": 2e-05, + "loss": 0.03127633, + "step": 9317 + }, + { + "epoch": 18.636, + "grad_norm": 0.8785711526870728, + "learning_rate": 2e-05, + "loss": 0.01814137, + "step": 9318 + }, + { + "epoch": 18.638, + "grad_norm": 1.659043550491333, + "learning_rate": 2e-05, + "loss": 0.03371783, + "step": 9319 + }, + { + "epoch": 18.64, + "grad_norm": 1.5230258703231812, + "learning_rate": 2e-05, + "loss": 0.04253473, + "step": 9320 + }, + { + "epoch": 18.642, + "grad_norm": 1.0703048706054688, + "learning_rate": 2e-05, + "loss": 0.02780952, + "step": 9321 + }, + { + "epoch": 18.644, + "grad_norm": 2.55739688873291, + "learning_rate": 2e-05, + "loss": 0.05347919, + "step": 9322 + }, + { + "epoch": 18.646, + "grad_norm": 1.2097032070159912, + "learning_rate": 2e-05, + "loss": 0.03133424, + "step": 9323 + }, + { + "epoch": 18.648, + "grad_norm": 1.7412601709365845, + "learning_rate": 2e-05, + "loss": 0.04346561, + "step": 9324 + }, + { + "epoch": 18.65, + "grad_norm": 0.9796632528305054, + "learning_rate": 2e-05, + "loss": 0.02903066, + "step": 9325 + }, + { + "epoch": 18.652, + "grad_norm": 1.5429102182388306, + "learning_rate": 2e-05, + "loss": 0.03083556, + "step": 9326 + }, + { + "epoch": 18.654, + "grad_norm": 3.025692939758301, + "learning_rate": 2e-05, + "loss": 0.04267042, + "step": 9327 + }, + { + "epoch": 18.656, + "grad_norm": 1.8448047637939453, + "learning_rate": 2e-05, + "loss": 0.04758323, + "step": 9328 + }, + { + "epoch": 18.658, + "grad_norm": 1.5989882946014404, + "learning_rate": 2e-05, + "loss": 0.04394848, + "step": 9329 + }, + { + "epoch": 18.66, + "grad_norm": 1.270090103149414, + "learning_rate": 2e-05, + "loss": 0.03506234, + "step": 9330 + }, + { + "epoch": 18.662, + "grad_norm": 1.0124268531799316, + "learning_rate": 2e-05, + "loss": 0.02979084, + "step": 9331 + }, + { + "epoch": 18.664, + "grad_norm": 1.4778650999069214, + "learning_rate": 2e-05, + "loss": 0.03793935, + "step": 9332 + }, + { + "epoch": 18.666, + "grad_norm": 1.5023845434188843, + "learning_rate": 2e-05, + "loss": 0.0416206, + "step": 9333 + }, + { + "epoch": 18.668, + "grad_norm": 0.8940832614898682, + "learning_rate": 2e-05, + "loss": 0.01142349, + "step": 9334 + }, + { + "epoch": 18.67, + "grad_norm": 1.727463960647583, + "learning_rate": 2e-05, + "loss": 0.03897662, + "step": 9335 + }, + { + "epoch": 18.672, + "grad_norm": 1.1958836317062378, + "learning_rate": 2e-05, + "loss": 0.04261857, + "step": 9336 + }, + { + "epoch": 18.674, + "grad_norm": 1.020642638206482, + "learning_rate": 2e-05, + "loss": 0.02642402, + "step": 9337 + }, + { + "epoch": 18.676, + "grad_norm": 2.0219318866729736, + "learning_rate": 2e-05, + "loss": 
0.0680473, + "step": 9338 + }, + { + "epoch": 18.678, + "grad_norm": 1.318962574005127, + "learning_rate": 2e-05, + "loss": 0.02908128, + "step": 9339 + }, + { + "epoch": 18.68, + "grad_norm": 1.432193636894226, + "learning_rate": 2e-05, + "loss": 0.03741326, + "step": 9340 + }, + { + "epoch": 18.682, + "grad_norm": 1.0566816329956055, + "learning_rate": 2e-05, + "loss": 0.03124825, + "step": 9341 + }, + { + "epoch": 18.684, + "grad_norm": 1.3076931238174438, + "learning_rate": 2e-05, + "loss": 0.04089387, + "step": 9342 + }, + { + "epoch": 18.686, + "grad_norm": 1.0010727643966675, + "learning_rate": 2e-05, + "loss": 0.03195993, + "step": 9343 + }, + { + "epoch": 18.688, + "grad_norm": 1.1595393419265747, + "learning_rate": 2e-05, + "loss": 0.03774239, + "step": 9344 + }, + { + "epoch": 18.69, + "grad_norm": 0.8064015507698059, + "learning_rate": 2e-05, + "loss": 0.02562628, + "step": 9345 + }, + { + "epoch": 18.692, + "grad_norm": 1.1677731275558472, + "learning_rate": 2e-05, + "loss": 0.02810512, + "step": 9346 + }, + { + "epoch": 18.694, + "grad_norm": 2.073591709136963, + "learning_rate": 2e-05, + "loss": 0.04026121, + "step": 9347 + }, + { + "epoch": 18.696, + "grad_norm": 0.8951675891876221, + "learning_rate": 2e-05, + "loss": 0.02346078, + "step": 9348 + }, + { + "epoch": 18.698, + "grad_norm": 1.4167735576629639, + "learning_rate": 2e-05, + "loss": 0.04481205, + "step": 9349 + }, + { + "epoch": 18.7, + "grad_norm": 1.254514217376709, + "learning_rate": 2e-05, + "loss": 0.02620357, + "step": 9350 + }, + { + "epoch": 18.701999999999998, + "grad_norm": 2.4272637367248535, + "learning_rate": 2e-05, + "loss": 0.04456335, + "step": 9351 + }, + { + "epoch": 18.704, + "grad_norm": 1.1000620126724243, + "learning_rate": 2e-05, + "loss": 0.04522509, + "step": 9352 + }, + { + "epoch": 18.706, + "grad_norm": 1.0736738443374634, + "learning_rate": 2e-05, + "loss": 0.04045687, + "step": 9353 + }, + { + "epoch": 18.708, + "grad_norm": 2.3483619689941406, + "learning_rate": 2e-05, + "loss": 0.05076428, + "step": 9354 + }, + { + "epoch": 18.71, + "grad_norm": 0.972253680229187, + "learning_rate": 2e-05, + "loss": 0.02645556, + "step": 9355 + }, + { + "epoch": 18.712, + "grad_norm": 0.9229649305343628, + "learning_rate": 2e-05, + "loss": 0.02200065, + "step": 9356 + }, + { + "epoch": 18.714, + "grad_norm": 1.2092173099517822, + "learning_rate": 2e-05, + "loss": 0.03871221, + "step": 9357 + }, + { + "epoch": 18.716, + "grad_norm": 1.9008631706237793, + "learning_rate": 2e-05, + "loss": 0.04458017, + "step": 9358 + }, + { + "epoch": 18.718, + "grad_norm": 1.0636965036392212, + "learning_rate": 2e-05, + "loss": 0.02840264, + "step": 9359 + }, + { + "epoch": 18.72, + "grad_norm": 1.2430663108825684, + "learning_rate": 2e-05, + "loss": 0.03148837, + "step": 9360 + }, + { + "epoch": 18.722, + "grad_norm": 1.123664140701294, + "learning_rate": 2e-05, + "loss": 0.03394098, + "step": 9361 + }, + { + "epoch": 18.724, + "grad_norm": 1.0054221153259277, + "learning_rate": 2e-05, + "loss": 0.03370092, + "step": 9362 + }, + { + "epoch": 18.726, + "grad_norm": 1.5903475284576416, + "learning_rate": 2e-05, + "loss": 0.04029074, + "step": 9363 + }, + { + "epoch": 18.728, + "grad_norm": 1.9477814435958862, + "learning_rate": 2e-05, + "loss": 0.06019651, + "step": 9364 + }, + { + "epoch": 18.73, + "grad_norm": 2.084622383117676, + "learning_rate": 2e-05, + "loss": 0.02988496, + "step": 9365 + }, + { + "epoch": 18.732, + "grad_norm": 1.7150975465774536, + "learning_rate": 2e-05, + "loss": 0.05392132, + "step": 9366 + 
}, + { + "epoch": 18.734, + "grad_norm": 0.9419837594032288, + "learning_rate": 2e-05, + "loss": 0.02924297, + "step": 9367 + }, + { + "epoch": 18.736, + "grad_norm": 1.4422987699508667, + "learning_rate": 2e-05, + "loss": 0.03685901, + "step": 9368 + }, + { + "epoch": 18.738, + "grad_norm": 0.9840115904808044, + "learning_rate": 2e-05, + "loss": 0.02506851, + "step": 9369 + }, + { + "epoch": 18.74, + "grad_norm": 1.052154302597046, + "learning_rate": 2e-05, + "loss": 0.03087404, + "step": 9370 + }, + { + "epoch": 18.742, + "grad_norm": 1.2569382190704346, + "learning_rate": 2e-05, + "loss": 0.04886915, + "step": 9371 + }, + { + "epoch": 18.744, + "grad_norm": 1.5744709968566895, + "learning_rate": 2e-05, + "loss": 0.04542425, + "step": 9372 + }, + { + "epoch": 18.746, + "grad_norm": 2.963470458984375, + "learning_rate": 2e-05, + "loss": 0.03224288, + "step": 9373 + }, + { + "epoch": 18.748, + "grad_norm": 1.8991296291351318, + "learning_rate": 2e-05, + "loss": 0.046722, + "step": 9374 + }, + { + "epoch": 18.75, + "grad_norm": 0.9656346440315247, + "learning_rate": 2e-05, + "loss": 0.03440123, + "step": 9375 + }, + { + "epoch": 18.752, + "grad_norm": 1.0450048446655273, + "learning_rate": 2e-05, + "loss": 0.03775831, + "step": 9376 + }, + { + "epoch": 18.754, + "grad_norm": 1.7124624252319336, + "learning_rate": 2e-05, + "loss": 0.05066157, + "step": 9377 + }, + { + "epoch": 18.756, + "grad_norm": 1.501320242881775, + "learning_rate": 2e-05, + "loss": 0.03876662, + "step": 9378 + }, + { + "epoch": 18.758, + "grad_norm": 1.2065678834915161, + "learning_rate": 2e-05, + "loss": 0.03976551, + "step": 9379 + }, + { + "epoch": 18.76, + "grad_norm": 1.4827100038528442, + "learning_rate": 2e-05, + "loss": 0.03961009, + "step": 9380 + }, + { + "epoch": 18.762, + "grad_norm": 1.3919841051101685, + "learning_rate": 2e-05, + "loss": 0.0386041, + "step": 9381 + }, + { + "epoch": 18.764, + "grad_norm": 1.1867378950119019, + "learning_rate": 2e-05, + "loss": 0.03478377, + "step": 9382 + }, + { + "epoch": 18.766, + "grad_norm": 1.0938994884490967, + "learning_rate": 2e-05, + "loss": 0.03122077, + "step": 9383 + }, + { + "epoch": 18.768, + "grad_norm": 2.3887717723846436, + "learning_rate": 2e-05, + "loss": 0.04409286, + "step": 9384 + }, + { + "epoch": 18.77, + "grad_norm": 1.0273598432540894, + "learning_rate": 2e-05, + "loss": 0.02485752, + "step": 9385 + }, + { + "epoch": 18.772, + "grad_norm": 1.7304757833480835, + "learning_rate": 2e-05, + "loss": 0.03049651, + "step": 9386 + }, + { + "epoch": 18.774, + "grad_norm": 1.9166194200515747, + "learning_rate": 2e-05, + "loss": 0.04044746, + "step": 9387 + }, + { + "epoch": 18.776, + "grad_norm": 1.7198108434677124, + "learning_rate": 2e-05, + "loss": 0.0405063, + "step": 9388 + }, + { + "epoch": 18.778, + "grad_norm": 1.8082114458084106, + "learning_rate": 2e-05, + "loss": 0.03769893, + "step": 9389 + }, + { + "epoch": 18.78, + "grad_norm": 0.9580773115158081, + "learning_rate": 2e-05, + "loss": 0.03357112, + "step": 9390 + }, + { + "epoch": 18.782, + "grad_norm": 1.059820532798767, + "learning_rate": 2e-05, + "loss": 0.02640706, + "step": 9391 + }, + { + "epoch": 18.784, + "grad_norm": 4.033216953277588, + "learning_rate": 2e-05, + "loss": 0.03864349, + "step": 9392 + }, + { + "epoch": 18.786, + "grad_norm": 1.1333658695220947, + "learning_rate": 2e-05, + "loss": 0.03008582, + "step": 9393 + }, + { + "epoch": 18.788, + "grad_norm": 2.1095573902130127, + "learning_rate": 2e-05, + "loss": 0.03836686, + "step": 9394 + }, + { + "epoch": 18.79, + "grad_norm": 
0.7854803204536438, + "learning_rate": 2e-05, + "loss": 0.01916845, + "step": 9395 + }, + { + "epoch": 18.792, + "grad_norm": 1.424535870552063, + "learning_rate": 2e-05, + "loss": 0.03236981, + "step": 9396 + }, + { + "epoch": 18.794, + "grad_norm": 1.234863042831421, + "learning_rate": 2e-05, + "loss": 0.0439956, + "step": 9397 + }, + { + "epoch": 18.796, + "grad_norm": 1.0828542709350586, + "learning_rate": 2e-05, + "loss": 0.03070717, + "step": 9398 + }, + { + "epoch": 18.798000000000002, + "grad_norm": 1.125207781791687, + "learning_rate": 2e-05, + "loss": 0.03891785, + "step": 9399 + }, + { + "epoch": 18.8, + "grad_norm": 1.4716780185699463, + "learning_rate": 2e-05, + "loss": 0.04283679, + "step": 9400 + }, + { + "epoch": 18.802, + "grad_norm": 2.2038402557373047, + "learning_rate": 2e-05, + "loss": 0.04184093, + "step": 9401 + }, + { + "epoch": 18.804, + "grad_norm": 2.594099283218384, + "learning_rate": 2e-05, + "loss": 0.0254508, + "step": 9402 + }, + { + "epoch": 18.806, + "grad_norm": 0.977802038192749, + "learning_rate": 2e-05, + "loss": 0.02622373, + "step": 9403 + }, + { + "epoch": 18.808, + "grad_norm": 1.5012508630752563, + "learning_rate": 2e-05, + "loss": 0.05293182, + "step": 9404 + }, + { + "epoch": 18.81, + "grad_norm": 1.1779042482376099, + "learning_rate": 2e-05, + "loss": 0.03067445, + "step": 9405 + }, + { + "epoch": 18.812, + "grad_norm": 1.9061391353607178, + "learning_rate": 2e-05, + "loss": 0.04364157, + "step": 9406 + }, + { + "epoch": 18.814, + "grad_norm": 1.0848644971847534, + "learning_rate": 2e-05, + "loss": 0.03666593, + "step": 9407 + }, + { + "epoch": 18.816, + "grad_norm": 4.1378960609436035, + "learning_rate": 2e-05, + "loss": 0.03240601, + "step": 9408 + }, + { + "epoch": 18.818, + "grad_norm": 1.168457269668579, + "learning_rate": 2e-05, + "loss": 0.03614372, + "step": 9409 + }, + { + "epoch": 18.82, + "grad_norm": 1.1519936323165894, + "learning_rate": 2e-05, + "loss": 0.04695103, + "step": 9410 + }, + { + "epoch": 18.822, + "grad_norm": 1.9238035678863525, + "learning_rate": 2e-05, + "loss": 0.04975787, + "step": 9411 + }, + { + "epoch": 18.824, + "grad_norm": 1.304675579071045, + "learning_rate": 2e-05, + "loss": 0.04074424, + "step": 9412 + }, + { + "epoch": 18.826, + "grad_norm": 2.158198833465576, + "learning_rate": 2e-05, + "loss": 0.0502829, + "step": 9413 + }, + { + "epoch": 18.828, + "grad_norm": 1.3099114894866943, + "learning_rate": 2e-05, + "loss": 0.04483989, + "step": 9414 + }, + { + "epoch": 18.83, + "grad_norm": 1.9079896211624146, + "learning_rate": 2e-05, + "loss": 0.04286634, + "step": 9415 + }, + { + "epoch": 18.832, + "grad_norm": 1.7033705711364746, + "learning_rate": 2e-05, + "loss": 0.03799369, + "step": 9416 + }, + { + "epoch": 18.834, + "grad_norm": 0.991951584815979, + "learning_rate": 2e-05, + "loss": 0.03253823, + "step": 9417 + }, + { + "epoch": 18.836, + "grad_norm": 1.4874738454818726, + "learning_rate": 2e-05, + "loss": 0.05053546, + "step": 9418 + }, + { + "epoch": 18.838, + "grad_norm": 1.4518730640411377, + "learning_rate": 2e-05, + "loss": 0.03553774, + "step": 9419 + }, + { + "epoch": 18.84, + "grad_norm": 1.0209287405014038, + "learning_rate": 2e-05, + "loss": 0.03443102, + "step": 9420 + }, + { + "epoch": 18.842, + "grad_norm": 1.0967309474945068, + "learning_rate": 2e-05, + "loss": 0.02932144, + "step": 9421 + }, + { + "epoch": 18.844, + "grad_norm": 2.3560574054718018, + "learning_rate": 2e-05, + "loss": 0.04702324, + "step": 9422 + }, + { + "epoch": 18.846, + "grad_norm": 1.2899200916290283, + 
"learning_rate": 2e-05, + "loss": 0.0305502, + "step": 9423 + }, + { + "epoch": 18.848, + "grad_norm": 1.8223811388015747, + "learning_rate": 2e-05, + "loss": 0.05424388, + "step": 9424 + }, + { + "epoch": 18.85, + "grad_norm": 1.6738855838775635, + "learning_rate": 2e-05, + "loss": 0.04101273, + "step": 9425 + }, + { + "epoch": 18.852, + "grad_norm": 1.2631632089614868, + "learning_rate": 2e-05, + "loss": 0.03289703, + "step": 9426 + }, + { + "epoch": 18.854, + "grad_norm": 1.1822715997695923, + "learning_rate": 2e-05, + "loss": 0.03998087, + "step": 9427 + }, + { + "epoch": 18.856, + "grad_norm": 1.7935429811477661, + "learning_rate": 2e-05, + "loss": 0.04708499, + "step": 9428 + }, + { + "epoch": 18.858, + "grad_norm": 1.6989531517028809, + "learning_rate": 2e-05, + "loss": 0.03702974, + "step": 9429 + }, + { + "epoch": 18.86, + "grad_norm": 1.411005973815918, + "learning_rate": 2e-05, + "loss": 0.03301138, + "step": 9430 + }, + { + "epoch": 18.862, + "grad_norm": 1.575526475906372, + "learning_rate": 2e-05, + "loss": 0.0518712, + "step": 9431 + }, + { + "epoch": 18.864, + "grad_norm": 0.8966975808143616, + "learning_rate": 2e-05, + "loss": 0.02268358, + "step": 9432 + }, + { + "epoch": 18.866, + "grad_norm": 1.3550505638122559, + "learning_rate": 2e-05, + "loss": 0.03749962, + "step": 9433 + }, + { + "epoch": 18.868, + "grad_norm": 1.207885980606079, + "learning_rate": 2e-05, + "loss": 0.0400594, + "step": 9434 + }, + { + "epoch": 18.87, + "grad_norm": 1.0403681993484497, + "learning_rate": 2e-05, + "loss": 0.0228872, + "step": 9435 + }, + { + "epoch": 18.872, + "grad_norm": 2.362938404083252, + "learning_rate": 2e-05, + "loss": 0.06088825, + "step": 9436 + }, + { + "epoch": 18.874, + "grad_norm": 0.9768615365028381, + "learning_rate": 2e-05, + "loss": 0.03623533, + "step": 9437 + }, + { + "epoch": 18.876, + "grad_norm": 0.7810379862785339, + "learning_rate": 2e-05, + "loss": 0.01865301, + "step": 9438 + }, + { + "epoch": 18.878, + "grad_norm": 1.4565064907073975, + "learning_rate": 2e-05, + "loss": 0.04857381, + "step": 9439 + }, + { + "epoch": 18.88, + "grad_norm": 1.5316739082336426, + "learning_rate": 2e-05, + "loss": 0.04568139, + "step": 9440 + }, + { + "epoch": 18.882, + "grad_norm": 1.5420665740966797, + "learning_rate": 2e-05, + "loss": 0.03343572, + "step": 9441 + }, + { + "epoch": 18.884, + "grad_norm": 1.7624716758728027, + "learning_rate": 2e-05, + "loss": 0.03562377, + "step": 9442 + }, + { + "epoch": 18.886, + "grad_norm": 1.3358241319656372, + "learning_rate": 2e-05, + "loss": 0.06009006, + "step": 9443 + }, + { + "epoch": 18.888, + "grad_norm": 1.3039106130599976, + "learning_rate": 2e-05, + "loss": 0.04013005, + "step": 9444 + }, + { + "epoch": 18.89, + "grad_norm": 0.8214094638824463, + "learning_rate": 2e-05, + "loss": 0.02453827, + "step": 9445 + }, + { + "epoch": 18.892, + "grad_norm": 2.3487486839294434, + "learning_rate": 2e-05, + "loss": 0.04817913, + "step": 9446 + }, + { + "epoch": 18.894, + "grad_norm": 2.1647088527679443, + "learning_rate": 2e-05, + "loss": 0.03035383, + "step": 9447 + }, + { + "epoch": 18.896, + "grad_norm": 1.254131555557251, + "learning_rate": 2e-05, + "loss": 0.03613605, + "step": 9448 + }, + { + "epoch": 18.898, + "grad_norm": 1.1320441961288452, + "learning_rate": 2e-05, + "loss": 0.03620258, + "step": 9449 + }, + { + "epoch": 18.9, + "grad_norm": 1.6133835315704346, + "learning_rate": 2e-05, + "loss": 0.03855583, + "step": 9450 + }, + { + "epoch": 18.902, + "grad_norm": 0.9080222845077515, + "learning_rate": 2e-05, + "loss": 
0.02581986, + "step": 9451 + }, + { + "epoch": 18.904, + "grad_norm": 1.1648834943771362, + "learning_rate": 2e-05, + "loss": 0.04252192, + "step": 9452 + }, + { + "epoch": 18.906, + "grad_norm": 1.1894398927688599, + "learning_rate": 2e-05, + "loss": 0.03881192, + "step": 9453 + }, + { + "epoch": 18.908, + "grad_norm": 2.000978469848633, + "learning_rate": 2e-05, + "loss": 0.03810741, + "step": 9454 + }, + { + "epoch": 18.91, + "grad_norm": 1.288516879081726, + "learning_rate": 2e-05, + "loss": 0.0437531, + "step": 9455 + }, + { + "epoch": 18.912, + "grad_norm": 1.3083631992340088, + "learning_rate": 2e-05, + "loss": 0.03637118, + "step": 9456 + }, + { + "epoch": 18.914, + "grad_norm": 1.1857115030288696, + "learning_rate": 2e-05, + "loss": 0.04493545, + "step": 9457 + }, + { + "epoch": 18.916, + "grad_norm": 1.9343324899673462, + "learning_rate": 2e-05, + "loss": 0.04948426, + "step": 9458 + }, + { + "epoch": 18.918, + "grad_norm": 1.187826156616211, + "learning_rate": 2e-05, + "loss": 0.03746122, + "step": 9459 + }, + { + "epoch": 18.92, + "grad_norm": 2.1284165382385254, + "learning_rate": 2e-05, + "loss": 0.04689957, + "step": 9460 + }, + { + "epoch": 18.922, + "grad_norm": 1.2563556432724, + "learning_rate": 2e-05, + "loss": 0.03775541, + "step": 9461 + }, + { + "epoch": 18.924, + "grad_norm": 1.2920955419540405, + "learning_rate": 2e-05, + "loss": 0.02915121, + "step": 9462 + }, + { + "epoch": 18.926, + "grad_norm": 1.910418152809143, + "learning_rate": 2e-05, + "loss": 0.04141461, + "step": 9463 + }, + { + "epoch": 18.928, + "grad_norm": 0.9791347980499268, + "learning_rate": 2e-05, + "loss": 0.03308304, + "step": 9464 + }, + { + "epoch": 18.93, + "grad_norm": 2.0668258666992188, + "learning_rate": 2e-05, + "loss": 0.04485352, + "step": 9465 + }, + { + "epoch": 18.932, + "grad_norm": 0.8224574327468872, + "learning_rate": 2e-05, + "loss": 0.02453919, + "step": 9466 + }, + { + "epoch": 18.934, + "grad_norm": 0.975167453289032, + "learning_rate": 2e-05, + "loss": 0.03143321, + "step": 9467 + }, + { + "epoch": 18.936, + "grad_norm": 1.0642677545547485, + "learning_rate": 2e-05, + "loss": 0.03415136, + "step": 9468 + }, + { + "epoch": 18.938, + "grad_norm": 0.8453977704048157, + "learning_rate": 2e-05, + "loss": 0.02789571, + "step": 9469 + }, + { + "epoch": 18.94, + "grad_norm": 2.6224365234375, + "learning_rate": 2e-05, + "loss": 0.03373963, + "step": 9470 + }, + { + "epoch": 18.942, + "grad_norm": 1.6013566255569458, + "learning_rate": 2e-05, + "loss": 0.02791347, + "step": 9471 + }, + { + "epoch": 18.944, + "grad_norm": 1.6802016496658325, + "learning_rate": 2e-05, + "loss": 0.04182333, + "step": 9472 + }, + { + "epoch": 18.946, + "grad_norm": 1.069710373878479, + "learning_rate": 2e-05, + "loss": 0.03297665, + "step": 9473 + }, + { + "epoch": 18.948, + "grad_norm": 1.6262844800949097, + "learning_rate": 2e-05, + "loss": 0.05678248, + "step": 9474 + }, + { + "epoch": 18.95, + "grad_norm": 1.6136468648910522, + "learning_rate": 2e-05, + "loss": 0.03860488, + "step": 9475 + }, + { + "epoch": 18.951999999999998, + "grad_norm": 1.6454910039901733, + "learning_rate": 2e-05, + "loss": 0.0360266, + "step": 9476 + }, + { + "epoch": 18.954, + "grad_norm": 1.3030211925506592, + "learning_rate": 2e-05, + "loss": 0.03737117, + "step": 9477 + }, + { + "epoch": 18.956, + "grad_norm": 1.0022649765014648, + "learning_rate": 2e-05, + "loss": 0.03304816, + "step": 9478 + }, + { + "epoch": 18.958, + "grad_norm": 2.1013195514678955, + "learning_rate": 2e-05, + "loss": 0.04374126, + "step": 9479 + }, + 
{ + "epoch": 18.96, + "grad_norm": 0.9510260224342346, + "learning_rate": 2e-05, + "loss": 0.03056635, + "step": 9480 + }, + { + "epoch": 18.962, + "grad_norm": 1.7025002241134644, + "learning_rate": 2e-05, + "loss": 0.04591848, + "step": 9481 + }, + { + "epoch": 18.964, + "grad_norm": 2.784985303878784, + "learning_rate": 2e-05, + "loss": 0.04702611, + "step": 9482 + }, + { + "epoch": 18.966, + "grad_norm": 1.0321415662765503, + "learning_rate": 2e-05, + "loss": 0.03965008, + "step": 9483 + }, + { + "epoch": 18.968, + "grad_norm": 1.3998851776123047, + "learning_rate": 2e-05, + "loss": 0.04656575, + "step": 9484 + }, + { + "epoch": 18.97, + "grad_norm": 1.053924322128296, + "learning_rate": 2e-05, + "loss": 0.04150043, + "step": 9485 + }, + { + "epoch": 18.972, + "grad_norm": 1.0211212635040283, + "learning_rate": 2e-05, + "loss": 0.03201551, + "step": 9486 + }, + { + "epoch": 18.974, + "grad_norm": 1.109754204750061, + "learning_rate": 2e-05, + "loss": 0.03495897, + "step": 9487 + }, + { + "epoch": 18.976, + "grad_norm": 0.9835900664329529, + "learning_rate": 2e-05, + "loss": 0.03508302, + "step": 9488 + }, + { + "epoch": 18.978, + "grad_norm": 1.2372853755950928, + "learning_rate": 2e-05, + "loss": 0.02763392, + "step": 9489 + }, + { + "epoch": 18.98, + "grad_norm": 1.1497222185134888, + "learning_rate": 2e-05, + "loss": 0.04323982, + "step": 9490 + }, + { + "epoch": 18.982, + "grad_norm": 0.9017625451087952, + "learning_rate": 2e-05, + "loss": 0.03254882, + "step": 9491 + }, + { + "epoch": 18.984, + "grad_norm": 1.0615708827972412, + "learning_rate": 2e-05, + "loss": 0.02894014, + "step": 9492 + }, + { + "epoch": 18.986, + "grad_norm": 1.5303292274475098, + "learning_rate": 2e-05, + "loss": 0.0392815, + "step": 9493 + }, + { + "epoch": 18.988, + "grad_norm": 1.4052002429962158, + "learning_rate": 2e-05, + "loss": 0.0530249, + "step": 9494 + }, + { + "epoch": 18.99, + "grad_norm": 1.3222413063049316, + "learning_rate": 2e-05, + "loss": 0.03406934, + "step": 9495 + }, + { + "epoch": 18.992, + "grad_norm": 1.365356683731079, + "learning_rate": 2e-05, + "loss": 0.03618102, + "step": 9496 + }, + { + "epoch": 18.994, + "grad_norm": 1.526374101638794, + "learning_rate": 2e-05, + "loss": 0.03938029, + "step": 9497 + }, + { + "epoch": 18.996, + "grad_norm": 1.2375332117080688, + "learning_rate": 2e-05, + "loss": 0.04605676, + "step": 9498 + }, + { + "epoch": 18.998, + "grad_norm": 1.3788480758666992, + "learning_rate": 2e-05, + "loss": 0.04089382, + "step": 9499 + }, + { + "epoch": 19.0, + "grad_norm": 1.1874690055847168, + "learning_rate": 2e-05, + "loss": 0.03390002, + "step": 9500 + }, + { + "epoch": 19.0, + "eval_performance": { + "AngleClassification_1": 0.992, + "AngleClassification_2": 1.0, + "AngleClassification_3": 0.9720558882235529, + "Equal_1": 0.994, + "Equal_2": 0.9720558882235529, + "Equal_3": 0.9620758483033932, + "LineComparison_1": 1.0, + "LineComparison_2": 1.0, + "LineComparison_3": 0.9940119760479041, + "Parallel_1": 0.9899799599198397, + "Parallel_2": 0.9979959919839679, + "Parallel_3": 0.992, + "Perpendicular_1": 0.994, + "Perpendicular_2": 0.986, + "Perpendicular_3": 0.7935871743486974, + "PointLiesOnCircle_1": 0.9979959919839679, + "PointLiesOnCircle_2": 0.9973333333333333, + "PointLiesOnCircle_3": 0.9933333333333333, + "PointLiesOnLine_1": 0.9979959919839679, + "PointLiesOnLine_2": 0.9959919839679359, + "PointLiesOnLine_3": 0.9780439121756487 + }, + "eval_runtime": 319.3618, + "eval_samples_per_second": 32.878, + "eval_steps_per_second": 0.658, + "step": 9500 + }, + { 
+ "epoch": 19.002, + "grad_norm": 0.7328594923019409, + "learning_rate": 2e-05, + "loss": 0.02319442, + "step": 9501 + }, + { + "epoch": 19.004, + "grad_norm": 1.1689836978912354, + "learning_rate": 2e-05, + "loss": 0.03362505, + "step": 9502 + }, + { + "epoch": 19.006, + "grad_norm": 1.2494908571243286, + "learning_rate": 2e-05, + "loss": 0.04948422, + "step": 9503 + }, + { + "epoch": 19.008, + "grad_norm": 1.5237233638763428, + "learning_rate": 2e-05, + "loss": 0.03897671, + "step": 9504 + }, + { + "epoch": 19.01, + "grad_norm": 1.9613434076309204, + "learning_rate": 2e-05, + "loss": 0.0261043, + "step": 9505 + }, + { + "epoch": 19.012, + "grad_norm": 0.9122869372367859, + "learning_rate": 2e-05, + "loss": 0.0235957, + "step": 9506 + }, + { + "epoch": 19.014, + "grad_norm": 2.309724807739258, + "learning_rate": 2e-05, + "loss": 0.04885814, + "step": 9507 + }, + { + "epoch": 19.016, + "grad_norm": 1.0690571069717407, + "learning_rate": 2e-05, + "loss": 0.0407089, + "step": 9508 + }, + { + "epoch": 19.018, + "grad_norm": 1.2194948196411133, + "learning_rate": 2e-05, + "loss": 0.03530276, + "step": 9509 + }, + { + "epoch": 19.02, + "grad_norm": 1.3204545974731445, + "learning_rate": 2e-05, + "loss": 0.03693368, + "step": 9510 + }, + { + "epoch": 19.022, + "grad_norm": 1.1874244213104248, + "learning_rate": 2e-05, + "loss": 0.03080766, + "step": 9511 + }, + { + "epoch": 19.024, + "grad_norm": 2.5260133743286133, + "learning_rate": 2e-05, + "loss": 0.04506269, + "step": 9512 + }, + { + "epoch": 19.026, + "grad_norm": 0.8363938331604004, + "learning_rate": 2e-05, + "loss": 0.02124131, + "step": 9513 + }, + { + "epoch": 19.028, + "grad_norm": 2.5137112140655518, + "learning_rate": 2e-05, + "loss": 0.03363831, + "step": 9514 + }, + { + "epoch": 19.03, + "grad_norm": 1.983594536781311, + "learning_rate": 2e-05, + "loss": 0.04035079, + "step": 9515 + }, + { + "epoch": 19.032, + "grad_norm": 2.5143167972564697, + "learning_rate": 2e-05, + "loss": 0.05136511, + "step": 9516 + }, + { + "epoch": 19.034, + "grad_norm": 1.130037784576416, + "learning_rate": 2e-05, + "loss": 0.03966995, + "step": 9517 + }, + { + "epoch": 19.036, + "grad_norm": 1.764394760131836, + "learning_rate": 2e-05, + "loss": 0.04287095, + "step": 9518 + }, + { + "epoch": 19.038, + "grad_norm": 1.359204888343811, + "learning_rate": 2e-05, + "loss": 0.03334723, + "step": 9519 + }, + { + "epoch": 19.04, + "grad_norm": 1.1975325345993042, + "learning_rate": 2e-05, + "loss": 0.03555122, + "step": 9520 + }, + { + "epoch": 19.042, + "grad_norm": 0.9960451126098633, + "learning_rate": 2e-05, + "loss": 0.02776124, + "step": 9521 + }, + { + "epoch": 19.044, + "grad_norm": 1.2696477174758911, + "learning_rate": 2e-05, + "loss": 0.02704079, + "step": 9522 + }, + { + "epoch": 19.046, + "grad_norm": 1.127091646194458, + "learning_rate": 2e-05, + "loss": 0.03138428, + "step": 9523 + }, + { + "epoch": 19.048, + "grad_norm": 2.801011800765991, + "learning_rate": 2e-05, + "loss": 0.04420168, + "step": 9524 + }, + { + "epoch": 19.05, + "grad_norm": 1.672225832939148, + "learning_rate": 2e-05, + "loss": 0.03045794, + "step": 9525 + }, + { + "epoch": 19.052, + "grad_norm": 1.3001759052276611, + "learning_rate": 2e-05, + "loss": 0.03507664, + "step": 9526 + }, + { + "epoch": 19.054, + "grad_norm": 1.307578444480896, + "learning_rate": 2e-05, + "loss": 0.03051586, + "step": 9527 + }, + { + "epoch": 19.056, + "grad_norm": 2.060009479522705, + "learning_rate": 2e-05, + "loss": 0.04484456, + "step": 9528 + }, + { + "epoch": 19.058, + "grad_norm": 
1.9255805015563965, + "learning_rate": 2e-05, + "loss": 0.03917917, + "step": 9529 + }, + { + "epoch": 19.06, + "grad_norm": 0.9870061874389648, + "learning_rate": 2e-05, + "loss": 0.02856109, + "step": 9530 + }, + { + "epoch": 19.062, + "grad_norm": 1.398775577545166, + "learning_rate": 2e-05, + "loss": 0.04870712, + "step": 9531 + }, + { + "epoch": 19.064, + "grad_norm": 1.2352436780929565, + "learning_rate": 2e-05, + "loss": 0.04471589, + "step": 9532 + }, + { + "epoch": 19.066, + "grad_norm": 1.8325438499450684, + "learning_rate": 2e-05, + "loss": 0.04382673, + "step": 9533 + }, + { + "epoch": 19.068, + "grad_norm": 1.0861464738845825, + "learning_rate": 2e-05, + "loss": 0.02810282, + "step": 9534 + }, + { + "epoch": 19.07, + "grad_norm": 3.1042890548706055, + "learning_rate": 2e-05, + "loss": 0.04670685, + "step": 9535 + }, + { + "epoch": 19.072, + "grad_norm": 1.2983977794647217, + "learning_rate": 2e-05, + "loss": 0.03450656, + "step": 9536 + }, + { + "epoch": 19.074, + "grad_norm": 2.436805248260498, + "learning_rate": 2e-05, + "loss": 0.03273325, + "step": 9537 + }, + { + "epoch": 19.076, + "grad_norm": 1.8270798921585083, + "learning_rate": 2e-05, + "loss": 0.05142706, + "step": 9538 + }, + { + "epoch": 19.078, + "grad_norm": 1.7410194873809814, + "learning_rate": 2e-05, + "loss": 0.04839787, + "step": 9539 + }, + { + "epoch": 19.08, + "grad_norm": 0.7884871959686279, + "learning_rate": 2e-05, + "loss": 0.02119577, + "step": 9540 + }, + { + "epoch": 19.082, + "grad_norm": 1.4480359554290771, + "learning_rate": 2e-05, + "loss": 0.03697205, + "step": 9541 + }, + { + "epoch": 19.084, + "grad_norm": 0.9422207474708557, + "learning_rate": 2e-05, + "loss": 0.02769065, + "step": 9542 + }, + { + "epoch": 19.086, + "grad_norm": 0.791770339012146, + "learning_rate": 2e-05, + "loss": 0.01657294, + "step": 9543 + }, + { + "epoch": 19.088, + "grad_norm": 1.4777929782867432, + "learning_rate": 2e-05, + "loss": 0.04559475, + "step": 9544 + }, + { + "epoch": 19.09, + "grad_norm": 1.4669833183288574, + "learning_rate": 2e-05, + "loss": 0.03167301, + "step": 9545 + }, + { + "epoch": 19.092, + "grad_norm": 1.2533551454544067, + "learning_rate": 2e-05, + "loss": 0.02627946, + "step": 9546 + }, + { + "epoch": 19.094, + "grad_norm": 1.4709750413894653, + "learning_rate": 2e-05, + "loss": 0.04461734, + "step": 9547 + }, + { + "epoch": 19.096, + "grad_norm": 1.149090051651001, + "learning_rate": 2e-05, + "loss": 0.03871008, + "step": 9548 + }, + { + "epoch": 19.098, + "grad_norm": 1.1180267333984375, + "learning_rate": 2e-05, + "loss": 0.03474029, + "step": 9549 + }, + { + "epoch": 19.1, + "grad_norm": 1.3500217199325562, + "learning_rate": 2e-05, + "loss": 0.02843937, + "step": 9550 + }, + { + "epoch": 19.102, + "grad_norm": 1.7513254880905151, + "learning_rate": 2e-05, + "loss": 0.0540585, + "step": 9551 + }, + { + "epoch": 19.104, + "grad_norm": 1.781827688217163, + "learning_rate": 2e-05, + "loss": 0.04126269, + "step": 9552 + }, + { + "epoch": 19.106, + "grad_norm": 1.1269136667251587, + "learning_rate": 2e-05, + "loss": 0.03399577, + "step": 9553 + }, + { + "epoch": 19.108, + "grad_norm": 1.8535258769989014, + "learning_rate": 2e-05, + "loss": 0.05306114, + "step": 9554 + }, + { + "epoch": 19.11, + "grad_norm": 2.0714800357818604, + "learning_rate": 2e-05, + "loss": 0.0468117, + "step": 9555 + }, + { + "epoch": 19.112, + "grad_norm": 1.0497852563858032, + "learning_rate": 2e-05, + "loss": 0.02673732, + "step": 9556 + }, + { + "epoch": 19.114, + "grad_norm": 0.9615012407302856, + "learning_rate": 
2e-05, + "loss": 0.02658118, + "step": 9557 + }, + { + "epoch": 19.116, + "grad_norm": 3.7830493450164795, + "learning_rate": 2e-05, + "loss": 0.04481242, + "step": 9558 + }, + { + "epoch": 19.118, + "grad_norm": 1.9422591924667358, + "learning_rate": 2e-05, + "loss": 0.05550155, + "step": 9559 + }, + { + "epoch": 19.12, + "grad_norm": 1.294343113899231, + "learning_rate": 2e-05, + "loss": 0.03953799, + "step": 9560 + }, + { + "epoch": 19.122, + "grad_norm": 1.5340063571929932, + "learning_rate": 2e-05, + "loss": 0.03284139, + "step": 9561 + }, + { + "epoch": 19.124, + "grad_norm": 1.1982046365737915, + "learning_rate": 2e-05, + "loss": 0.04478461, + "step": 9562 + }, + { + "epoch": 19.126, + "grad_norm": 1.1274042129516602, + "learning_rate": 2e-05, + "loss": 0.03823815, + "step": 9563 + }, + { + "epoch": 19.128, + "grad_norm": 1.3324651718139648, + "learning_rate": 2e-05, + "loss": 0.04417039, + "step": 9564 + }, + { + "epoch": 19.13, + "grad_norm": 3.788226366043091, + "learning_rate": 2e-05, + "loss": 0.04432316, + "step": 9565 + }, + { + "epoch": 19.132, + "grad_norm": 1.0010766983032227, + "learning_rate": 2e-05, + "loss": 0.02808233, + "step": 9566 + }, + { + "epoch": 19.134, + "grad_norm": 1.5182632207870483, + "learning_rate": 2e-05, + "loss": 0.04026213, + "step": 9567 + }, + { + "epoch": 19.136, + "grad_norm": 1.1371148824691772, + "learning_rate": 2e-05, + "loss": 0.03047445, + "step": 9568 + }, + { + "epoch": 19.138, + "grad_norm": 1.003008246421814, + "learning_rate": 2e-05, + "loss": 0.03161223, + "step": 9569 + }, + { + "epoch": 19.14, + "grad_norm": 1.9005447626113892, + "learning_rate": 2e-05, + "loss": 0.04390565, + "step": 9570 + }, + { + "epoch": 19.142, + "grad_norm": 1.1964303255081177, + "learning_rate": 2e-05, + "loss": 0.03318737, + "step": 9571 + }, + { + "epoch": 19.144, + "grad_norm": 1.471183180809021, + "learning_rate": 2e-05, + "loss": 0.04245757, + "step": 9572 + }, + { + "epoch": 19.146, + "grad_norm": 1.0873215198516846, + "learning_rate": 2e-05, + "loss": 0.03344093, + "step": 9573 + }, + { + "epoch": 19.148, + "grad_norm": 1.1384050846099854, + "learning_rate": 2e-05, + "loss": 0.03804214, + "step": 9574 + }, + { + "epoch": 19.15, + "grad_norm": 1.8956702947616577, + "learning_rate": 2e-05, + "loss": 0.04579899, + "step": 9575 + }, + { + "epoch": 19.152, + "grad_norm": 0.9021958112716675, + "learning_rate": 2e-05, + "loss": 0.02364812, + "step": 9576 + }, + { + "epoch": 19.154, + "grad_norm": 0.9730253219604492, + "learning_rate": 2e-05, + "loss": 0.0266248, + "step": 9577 + }, + { + "epoch": 19.156, + "grad_norm": 1.2630407810211182, + "learning_rate": 2e-05, + "loss": 0.03151676, + "step": 9578 + }, + { + "epoch": 19.158, + "grad_norm": 1.0007412433624268, + "learning_rate": 2e-05, + "loss": 0.03561787, + "step": 9579 + }, + { + "epoch": 19.16, + "grad_norm": 1.62027907371521, + "learning_rate": 2e-05, + "loss": 0.05210748, + "step": 9580 + }, + { + "epoch": 19.162, + "grad_norm": 0.950829029083252, + "learning_rate": 2e-05, + "loss": 0.03114957, + "step": 9581 + }, + { + "epoch": 19.164, + "grad_norm": 1.3806723356246948, + "learning_rate": 2e-05, + "loss": 0.04665224, + "step": 9582 + }, + { + "epoch": 19.166, + "grad_norm": 4.24110221862793, + "learning_rate": 2e-05, + "loss": 0.03417417, + "step": 9583 + }, + { + "epoch": 19.168, + "grad_norm": 1.3316015005111694, + "learning_rate": 2e-05, + "loss": 0.04237635, + "step": 9584 + }, + { + "epoch": 19.17, + "grad_norm": 1.7277883291244507, + "learning_rate": 2e-05, + "loss": 0.03498271, + "step": 
9585 + }, + { + "epoch": 19.172, + "grad_norm": 1.2385425567626953, + "learning_rate": 2e-05, + "loss": 0.03356936, + "step": 9586 + }, + { + "epoch": 19.174, + "grad_norm": 0.8457931280136108, + "learning_rate": 2e-05, + "loss": 0.01994565, + "step": 9587 + }, + { + "epoch": 19.176, + "grad_norm": 1.2455838918685913, + "learning_rate": 2e-05, + "loss": 0.03502472, + "step": 9588 + }, + { + "epoch": 19.178, + "grad_norm": 1.9572054147720337, + "learning_rate": 2e-05, + "loss": 0.03634415, + "step": 9589 + }, + { + "epoch": 19.18, + "grad_norm": 1.2908679246902466, + "learning_rate": 2e-05, + "loss": 0.03044868, + "step": 9590 + }, + { + "epoch": 19.182, + "grad_norm": 2.5042431354522705, + "learning_rate": 2e-05, + "loss": 0.04156593, + "step": 9591 + }, + { + "epoch": 19.184, + "grad_norm": 1.0194517374038696, + "learning_rate": 2e-05, + "loss": 0.02526481, + "step": 9592 + }, + { + "epoch": 19.186, + "grad_norm": 1.0619885921478271, + "learning_rate": 2e-05, + "loss": 0.0298758, + "step": 9593 + }, + { + "epoch": 19.188, + "grad_norm": 1.263814926147461, + "learning_rate": 2e-05, + "loss": 0.03626692, + "step": 9594 + }, + { + "epoch": 19.19, + "grad_norm": 2.5439209938049316, + "learning_rate": 2e-05, + "loss": 0.05030162, + "step": 9595 + }, + { + "epoch": 19.192, + "grad_norm": 1.6095026731491089, + "learning_rate": 2e-05, + "loss": 0.04484971, + "step": 9596 + }, + { + "epoch": 19.194, + "grad_norm": 1.921481728553772, + "learning_rate": 2e-05, + "loss": 0.05408298, + "step": 9597 + }, + { + "epoch": 19.196, + "grad_norm": 2.8024349212646484, + "learning_rate": 2e-05, + "loss": 0.05455287, + "step": 9598 + }, + { + "epoch": 19.198, + "grad_norm": 0.9602892398834229, + "learning_rate": 2e-05, + "loss": 0.02859609, + "step": 9599 + }, + { + "epoch": 19.2, + "grad_norm": 1.6992692947387695, + "learning_rate": 2e-05, + "loss": 0.03378526, + "step": 9600 + }, + { + "epoch": 19.202, + "grad_norm": 2.3561947345733643, + "learning_rate": 2e-05, + "loss": 0.03298889, + "step": 9601 + }, + { + "epoch": 19.204, + "grad_norm": 1.4221720695495605, + "learning_rate": 2e-05, + "loss": 0.04421243, + "step": 9602 + }, + { + "epoch": 19.206, + "grad_norm": 1.0201853513717651, + "learning_rate": 2e-05, + "loss": 0.02438156, + "step": 9603 + }, + { + "epoch": 19.208, + "grad_norm": 0.9587017893791199, + "learning_rate": 2e-05, + "loss": 0.02957552, + "step": 9604 + }, + { + "epoch": 19.21, + "grad_norm": 1.138653039932251, + "learning_rate": 2e-05, + "loss": 0.02991896, + "step": 9605 + }, + { + "epoch": 19.212, + "grad_norm": 2.0702502727508545, + "learning_rate": 2e-05, + "loss": 0.03139401, + "step": 9606 + }, + { + "epoch": 19.214, + "grad_norm": 1.0454638004302979, + "learning_rate": 2e-05, + "loss": 0.02659562, + "step": 9607 + }, + { + "epoch": 19.216, + "grad_norm": 2.350715160369873, + "learning_rate": 2e-05, + "loss": 0.05148138, + "step": 9608 + }, + { + "epoch": 19.218, + "grad_norm": 0.7986869812011719, + "learning_rate": 2e-05, + "loss": 0.02145108, + "step": 9609 + }, + { + "epoch": 19.22, + "grad_norm": 1.9818875789642334, + "learning_rate": 2e-05, + "loss": 0.04348661, + "step": 9610 + }, + { + "epoch": 19.222, + "grad_norm": 1.72350013256073, + "learning_rate": 2e-05, + "loss": 0.03537084, + "step": 9611 + }, + { + "epoch": 19.224, + "grad_norm": 0.917288601398468, + "learning_rate": 2e-05, + "loss": 0.02850287, + "step": 9612 + }, + { + "epoch": 19.226, + "grad_norm": 2.157785177230835, + "learning_rate": 2e-05, + "loss": 0.04212007, + "step": 9613 + }, + { + "epoch": 19.228, + 
"grad_norm": 1.6661065816879272, + "learning_rate": 2e-05, + "loss": 0.04836546, + "step": 9614 + }, + { + "epoch": 19.23, + "grad_norm": 1.0794106721878052, + "learning_rate": 2e-05, + "loss": 0.0291114, + "step": 9615 + }, + { + "epoch": 19.232, + "grad_norm": 1.1897625923156738, + "learning_rate": 2e-05, + "loss": 0.04130687, + "step": 9616 + }, + { + "epoch": 19.234, + "grad_norm": 1.4216676950454712, + "learning_rate": 2e-05, + "loss": 0.03022526, + "step": 9617 + }, + { + "epoch": 19.236, + "grad_norm": 1.3475738763809204, + "learning_rate": 2e-05, + "loss": 0.04719062, + "step": 9618 + }, + { + "epoch": 19.238, + "grad_norm": 1.4195927381515503, + "learning_rate": 2e-05, + "loss": 0.03779185, + "step": 9619 + }, + { + "epoch": 19.24, + "grad_norm": 1.1476095914840698, + "learning_rate": 2e-05, + "loss": 0.0434536, + "step": 9620 + }, + { + "epoch": 19.242, + "grad_norm": 1.682695984840393, + "learning_rate": 2e-05, + "loss": 0.05591667, + "step": 9621 + }, + { + "epoch": 19.244, + "grad_norm": 1.1905351877212524, + "learning_rate": 2e-05, + "loss": 0.0391013, + "step": 9622 + }, + { + "epoch": 19.246, + "grad_norm": 1.3827733993530273, + "learning_rate": 2e-05, + "loss": 0.02527856, + "step": 9623 + }, + { + "epoch": 19.248, + "grad_norm": 1.235648512840271, + "learning_rate": 2e-05, + "loss": 0.03677778, + "step": 9624 + }, + { + "epoch": 19.25, + "grad_norm": 2.980347156524658, + "learning_rate": 2e-05, + "loss": 0.06639814, + "step": 9625 + }, + { + "epoch": 19.252, + "grad_norm": 1.079358458518982, + "learning_rate": 2e-05, + "loss": 0.0342607, + "step": 9626 + }, + { + "epoch": 19.254, + "grad_norm": 1.3070192337036133, + "learning_rate": 2e-05, + "loss": 0.03877965, + "step": 9627 + }, + { + "epoch": 19.256, + "grad_norm": 1.132627010345459, + "learning_rate": 2e-05, + "loss": 0.03127551, + "step": 9628 + }, + { + "epoch": 19.258, + "grad_norm": 1.1576032638549805, + "learning_rate": 2e-05, + "loss": 0.03139985, + "step": 9629 + }, + { + "epoch": 19.26, + "grad_norm": 1.2397043704986572, + "learning_rate": 2e-05, + "loss": 0.04228866, + "step": 9630 + }, + { + "epoch": 19.262, + "grad_norm": 1.0662872791290283, + "learning_rate": 2e-05, + "loss": 0.03128354, + "step": 9631 + }, + { + "epoch": 19.264, + "grad_norm": 3.369060754776001, + "learning_rate": 2e-05, + "loss": 0.03271961, + "step": 9632 + }, + { + "epoch": 19.266, + "grad_norm": 1.7384709119796753, + "learning_rate": 2e-05, + "loss": 0.04713779, + "step": 9633 + }, + { + "epoch": 19.268, + "grad_norm": 1.8644956350326538, + "learning_rate": 2e-05, + "loss": 0.03739468, + "step": 9634 + }, + { + "epoch": 19.27, + "grad_norm": 1.260280966758728, + "learning_rate": 2e-05, + "loss": 0.0264209, + "step": 9635 + }, + { + "epoch": 19.272, + "grad_norm": 1.2313249111175537, + "learning_rate": 2e-05, + "loss": 0.03959552, + "step": 9636 + }, + { + "epoch": 19.274, + "grad_norm": 1.3005576133728027, + "learning_rate": 2e-05, + "loss": 0.03051363, + "step": 9637 + }, + { + "epoch": 19.276, + "grad_norm": 1.6750763654708862, + "learning_rate": 2e-05, + "loss": 0.05048709, + "step": 9638 + }, + { + "epoch": 19.278, + "grad_norm": 1.6315701007843018, + "learning_rate": 2e-05, + "loss": 0.02604922, + "step": 9639 + }, + { + "epoch": 19.28, + "grad_norm": 1.7863506078720093, + "learning_rate": 2e-05, + "loss": 0.03610089, + "step": 9640 + }, + { + "epoch": 19.282, + "grad_norm": 1.3061033487319946, + "learning_rate": 2e-05, + "loss": 0.04164078, + "step": 9641 + }, + { + "epoch": 19.284, + "grad_norm": 1.4289368391036987, + 
"learning_rate": 2e-05, + "loss": 0.02338367, + "step": 9642 + }, + { + "epoch": 19.286, + "grad_norm": 1.3021084070205688, + "learning_rate": 2e-05, + "loss": 0.04391333, + "step": 9643 + }, + { + "epoch": 19.288, + "grad_norm": 1.178717851638794, + "learning_rate": 2e-05, + "loss": 0.03647342, + "step": 9644 + }, + { + "epoch": 19.29, + "grad_norm": 0.8449578881263733, + "learning_rate": 2e-05, + "loss": 0.02405243, + "step": 9645 + }, + { + "epoch": 19.292, + "grad_norm": 1.4764927625656128, + "learning_rate": 2e-05, + "loss": 0.05233892, + "step": 9646 + }, + { + "epoch": 19.294, + "grad_norm": 1.8029993772506714, + "learning_rate": 2e-05, + "loss": 0.03435803, + "step": 9647 + }, + { + "epoch": 19.296, + "grad_norm": 1.4311555624008179, + "learning_rate": 2e-05, + "loss": 0.03782001, + "step": 9648 + }, + { + "epoch": 19.298, + "grad_norm": 1.300187587738037, + "learning_rate": 2e-05, + "loss": 0.05648835, + "step": 9649 + }, + { + "epoch": 19.3, + "grad_norm": 1.4080634117126465, + "learning_rate": 2e-05, + "loss": 0.03893708, + "step": 9650 + }, + { + "epoch": 19.302, + "grad_norm": 2.201977014541626, + "learning_rate": 2e-05, + "loss": 0.04349912, + "step": 9651 + }, + { + "epoch": 19.304, + "grad_norm": 0.9720236659049988, + "learning_rate": 2e-05, + "loss": 0.02697369, + "step": 9652 + }, + { + "epoch": 19.306, + "grad_norm": 1.6441378593444824, + "learning_rate": 2e-05, + "loss": 0.05696252, + "step": 9653 + }, + { + "epoch": 19.308, + "grad_norm": 1.6382756233215332, + "learning_rate": 2e-05, + "loss": 0.03626692, + "step": 9654 + }, + { + "epoch": 19.31, + "grad_norm": 1.034172534942627, + "learning_rate": 2e-05, + "loss": 0.03107112, + "step": 9655 + }, + { + "epoch": 19.312, + "grad_norm": 1.183516263961792, + "learning_rate": 2e-05, + "loss": 0.03426999, + "step": 9656 + }, + { + "epoch": 19.314, + "grad_norm": 1.6998095512390137, + "learning_rate": 2e-05, + "loss": 0.03958786, + "step": 9657 + }, + { + "epoch": 19.316, + "grad_norm": 1.598720669746399, + "learning_rate": 2e-05, + "loss": 0.03835572, + "step": 9658 + }, + { + "epoch": 19.318, + "grad_norm": 1.1745725870132446, + "learning_rate": 2e-05, + "loss": 0.03925322, + "step": 9659 + }, + { + "epoch": 19.32, + "grad_norm": 1.0180588960647583, + "learning_rate": 2e-05, + "loss": 0.03550194, + "step": 9660 + }, + { + "epoch": 19.322, + "grad_norm": 0.9355264902114868, + "learning_rate": 2e-05, + "loss": 0.03325229, + "step": 9661 + }, + { + "epoch": 19.324, + "grad_norm": 1.2007991075515747, + "learning_rate": 2e-05, + "loss": 0.04493001, + "step": 9662 + }, + { + "epoch": 19.326, + "grad_norm": 0.86708664894104, + "learning_rate": 2e-05, + "loss": 0.02545637, + "step": 9663 + }, + { + "epoch": 19.328, + "grad_norm": 1.0623421669006348, + "learning_rate": 2e-05, + "loss": 0.03543958, + "step": 9664 + }, + { + "epoch": 19.33, + "grad_norm": 1.6305174827575684, + "learning_rate": 2e-05, + "loss": 0.04481956, + "step": 9665 + }, + { + "epoch": 19.332, + "grad_norm": 2.0141186714172363, + "learning_rate": 2e-05, + "loss": 0.04622241, + "step": 9666 + }, + { + "epoch": 19.334, + "grad_norm": 1.1043932437896729, + "learning_rate": 2e-05, + "loss": 0.03127149, + "step": 9667 + }, + { + "epoch": 19.336, + "grad_norm": 1.262362003326416, + "learning_rate": 2e-05, + "loss": 0.02693222, + "step": 9668 + }, + { + "epoch": 19.338, + "grad_norm": 2.6822168827056885, + "learning_rate": 2e-05, + "loss": 0.03670801, + "step": 9669 + }, + { + "epoch": 19.34, + "grad_norm": 1.137508749961853, + "learning_rate": 2e-05, + "loss": 
0.03024655, + "step": 9670 + }, + { + "epoch": 19.342, + "grad_norm": 1.265586018562317, + "learning_rate": 2e-05, + "loss": 0.03883039, + "step": 9671 + }, + { + "epoch": 19.344, + "grad_norm": 1.6028434038162231, + "learning_rate": 2e-05, + "loss": 0.04389777, + "step": 9672 + }, + { + "epoch": 19.346, + "grad_norm": 2.5654678344726562, + "learning_rate": 2e-05, + "loss": 0.049094, + "step": 9673 + }, + { + "epoch": 19.348, + "grad_norm": 1.247218370437622, + "learning_rate": 2e-05, + "loss": 0.04227669, + "step": 9674 + }, + { + "epoch": 19.35, + "grad_norm": 0.9476075768470764, + "learning_rate": 2e-05, + "loss": 0.02782707, + "step": 9675 + }, + { + "epoch": 19.352, + "grad_norm": 1.2325526475906372, + "learning_rate": 2e-05, + "loss": 0.02918253, + "step": 9676 + }, + { + "epoch": 19.354, + "grad_norm": 1.1856415271759033, + "learning_rate": 2e-05, + "loss": 0.0287297, + "step": 9677 + }, + { + "epoch": 19.356, + "grad_norm": 0.832682728767395, + "learning_rate": 2e-05, + "loss": 0.02484965, + "step": 9678 + }, + { + "epoch": 19.358, + "grad_norm": 2.037794828414917, + "learning_rate": 2e-05, + "loss": 0.0416809, + "step": 9679 + }, + { + "epoch": 19.36, + "grad_norm": 1.2113049030303955, + "learning_rate": 2e-05, + "loss": 0.03574871, + "step": 9680 + }, + { + "epoch": 19.362, + "grad_norm": 1.0858124494552612, + "learning_rate": 2e-05, + "loss": 0.03182752, + "step": 9681 + }, + { + "epoch": 19.364, + "grad_norm": 1.3244341611862183, + "learning_rate": 2e-05, + "loss": 0.0379288, + "step": 9682 + }, + { + "epoch": 19.366, + "grad_norm": 0.96357661485672, + "learning_rate": 2e-05, + "loss": 0.03285439, + "step": 9683 + }, + { + "epoch": 19.368, + "grad_norm": 0.9765337109565735, + "learning_rate": 2e-05, + "loss": 0.02620794, + "step": 9684 + }, + { + "epoch": 19.37, + "grad_norm": 0.8831155300140381, + "learning_rate": 2e-05, + "loss": 0.02626178, + "step": 9685 + }, + { + "epoch": 19.372, + "grad_norm": 1.1981676816940308, + "learning_rate": 2e-05, + "loss": 0.02839826, + "step": 9686 + }, + { + "epoch": 19.374, + "grad_norm": 3.4404029846191406, + "learning_rate": 2e-05, + "loss": 0.04103406, + "step": 9687 + }, + { + "epoch": 19.376, + "grad_norm": 1.0752460956573486, + "learning_rate": 2e-05, + "loss": 0.02952642, + "step": 9688 + }, + { + "epoch": 19.378, + "grad_norm": 1.3563824892044067, + "learning_rate": 2e-05, + "loss": 0.0425565, + "step": 9689 + }, + { + "epoch": 19.38, + "grad_norm": 1.359243631362915, + "learning_rate": 2e-05, + "loss": 0.02972773, + "step": 9690 + }, + { + "epoch": 19.382, + "grad_norm": 1.3436927795410156, + "learning_rate": 2e-05, + "loss": 0.04154352, + "step": 9691 + }, + { + "epoch": 19.384, + "grad_norm": 1.4160605669021606, + "learning_rate": 2e-05, + "loss": 0.04017557, + "step": 9692 + }, + { + "epoch": 19.386, + "grad_norm": 2.5221076011657715, + "learning_rate": 2e-05, + "loss": 0.04263007, + "step": 9693 + }, + { + "epoch": 19.388, + "grad_norm": 1.3207120895385742, + "learning_rate": 2e-05, + "loss": 0.04798321, + "step": 9694 + }, + { + "epoch": 19.39, + "grad_norm": 1.1256715059280396, + "learning_rate": 2e-05, + "loss": 0.03117888, + "step": 9695 + }, + { + "epoch": 19.392, + "grad_norm": 1.1344276666641235, + "learning_rate": 2e-05, + "loss": 0.03253318, + "step": 9696 + }, + { + "epoch": 19.394, + "grad_norm": 1.3823455572128296, + "learning_rate": 2e-05, + "loss": 0.03994993, + "step": 9697 + }, + { + "epoch": 19.396, + "grad_norm": 2.88213849067688, + "learning_rate": 2e-05, + "loss": 0.03269661, + "step": 9698 + }, + { + "epoch": 
19.398, + "grad_norm": 1.369133472442627, + "learning_rate": 2e-05, + "loss": 0.04676585, + "step": 9699 + }, + { + "epoch": 19.4, + "grad_norm": 1.3740583658218384, + "learning_rate": 2e-05, + "loss": 0.03353403, + "step": 9700 + }, + { + "epoch": 19.402, + "grad_norm": 1.107534646987915, + "learning_rate": 2e-05, + "loss": 0.03524677, + "step": 9701 + }, + { + "epoch": 19.404, + "grad_norm": 1.0297796726226807, + "learning_rate": 2e-05, + "loss": 0.03444082, + "step": 9702 + }, + { + "epoch": 19.406, + "grad_norm": 1.664433240890503, + "learning_rate": 2e-05, + "loss": 0.04659187, + "step": 9703 + }, + { + "epoch": 19.408, + "grad_norm": 1.3409240245819092, + "learning_rate": 2e-05, + "loss": 0.03735002, + "step": 9704 + }, + { + "epoch": 19.41, + "grad_norm": 2.016294479370117, + "learning_rate": 2e-05, + "loss": 0.03130295, + "step": 9705 + }, + { + "epoch": 19.412, + "grad_norm": 2.1538240909576416, + "learning_rate": 2e-05, + "loss": 0.05228285, + "step": 9706 + }, + { + "epoch": 19.414, + "grad_norm": 1.5620522499084473, + "learning_rate": 2e-05, + "loss": 0.03496569, + "step": 9707 + }, + { + "epoch": 19.416, + "grad_norm": 2.113233804702759, + "learning_rate": 2e-05, + "loss": 0.03931812, + "step": 9708 + }, + { + "epoch": 19.418, + "grad_norm": 1.4287680387496948, + "learning_rate": 2e-05, + "loss": 0.02812807, + "step": 9709 + }, + { + "epoch": 19.42, + "grad_norm": 1.4849001169204712, + "learning_rate": 2e-05, + "loss": 0.03003873, + "step": 9710 + }, + { + "epoch": 19.422, + "grad_norm": 2.0336673259735107, + "learning_rate": 2e-05, + "loss": 0.05090691, + "step": 9711 + }, + { + "epoch": 19.424, + "grad_norm": 1.0571931600570679, + "learning_rate": 2e-05, + "loss": 0.0383536, + "step": 9712 + }, + { + "epoch": 19.426, + "grad_norm": 0.924149751663208, + "learning_rate": 2e-05, + "loss": 0.02016873, + "step": 9713 + }, + { + "epoch": 19.428, + "grad_norm": 1.2180832624435425, + "learning_rate": 2e-05, + "loss": 0.02738358, + "step": 9714 + }, + { + "epoch": 19.43, + "grad_norm": 3.771353006362915, + "learning_rate": 2e-05, + "loss": 0.03492292, + "step": 9715 + }, + { + "epoch": 19.432, + "grad_norm": 0.9769635796546936, + "learning_rate": 2e-05, + "loss": 0.0286561, + "step": 9716 + }, + { + "epoch": 19.434, + "grad_norm": 1.5093841552734375, + "learning_rate": 2e-05, + "loss": 0.03438292, + "step": 9717 + }, + { + "epoch": 19.436, + "grad_norm": 1.6384273767471313, + "learning_rate": 2e-05, + "loss": 0.03842306, + "step": 9718 + }, + { + "epoch": 19.438, + "grad_norm": 1.3384544849395752, + "learning_rate": 2e-05, + "loss": 0.03926391, + "step": 9719 + }, + { + "epoch": 19.44, + "grad_norm": 1.1322165727615356, + "learning_rate": 2e-05, + "loss": 0.04284387, + "step": 9720 + }, + { + "epoch": 19.442, + "grad_norm": 1.9500755071640015, + "learning_rate": 2e-05, + "loss": 0.03185847, + "step": 9721 + }, + { + "epoch": 19.444, + "grad_norm": 2.0812883377075195, + "learning_rate": 2e-05, + "loss": 0.05557717, + "step": 9722 + }, + { + "epoch": 19.446, + "grad_norm": 0.8484706878662109, + "learning_rate": 2e-05, + "loss": 0.01811722, + "step": 9723 + }, + { + "epoch": 19.448, + "grad_norm": 1.0412821769714355, + "learning_rate": 2e-05, + "loss": 0.0315608, + "step": 9724 + }, + { + "epoch": 19.45, + "grad_norm": 1.6890451908111572, + "learning_rate": 2e-05, + "loss": 0.0433359, + "step": 9725 + }, + { + "epoch": 19.452, + "grad_norm": 1.1269593238830566, + "learning_rate": 2e-05, + "loss": 0.0354099, + "step": 9726 + }, + { + "epoch": 19.454, + "grad_norm": 1.0882933139801025, + 
"learning_rate": 2e-05, + "loss": 0.02488111, + "step": 9727 + }, + { + "epoch": 19.456, + "grad_norm": 2.212448835372925, + "learning_rate": 2e-05, + "loss": 0.0372538, + "step": 9728 + }, + { + "epoch": 19.458, + "grad_norm": 1.6880683898925781, + "learning_rate": 2e-05, + "loss": 0.03757115, + "step": 9729 + }, + { + "epoch": 19.46, + "grad_norm": 1.3081451654434204, + "learning_rate": 2e-05, + "loss": 0.04177973, + "step": 9730 + }, + { + "epoch": 19.462, + "grad_norm": 1.5396312475204468, + "learning_rate": 2e-05, + "loss": 0.0427272, + "step": 9731 + }, + { + "epoch": 19.464, + "grad_norm": 1.6483396291732788, + "learning_rate": 2e-05, + "loss": 0.04573702, + "step": 9732 + }, + { + "epoch": 19.466, + "grad_norm": 2.9675261974334717, + "learning_rate": 2e-05, + "loss": 0.05595337, + "step": 9733 + }, + { + "epoch": 19.468, + "grad_norm": 1.0395759344100952, + "learning_rate": 2e-05, + "loss": 0.03035801, + "step": 9734 + }, + { + "epoch": 19.47, + "grad_norm": 1.7170274257659912, + "learning_rate": 2e-05, + "loss": 0.04833669, + "step": 9735 + }, + { + "epoch": 19.472, + "grad_norm": 0.9310649633407593, + "learning_rate": 2e-05, + "loss": 0.03122351, + "step": 9736 + }, + { + "epoch": 19.474, + "grad_norm": 1.8986892700195312, + "learning_rate": 2e-05, + "loss": 0.0391847, + "step": 9737 + }, + { + "epoch": 19.476, + "grad_norm": 1.0801862478256226, + "learning_rate": 2e-05, + "loss": 0.04195695, + "step": 9738 + }, + { + "epoch": 19.478, + "grad_norm": 2.033712148666382, + "learning_rate": 2e-05, + "loss": 0.03731752, + "step": 9739 + }, + { + "epoch": 19.48, + "grad_norm": 1.0139687061309814, + "learning_rate": 2e-05, + "loss": 0.03789478, + "step": 9740 + }, + { + "epoch": 19.482, + "grad_norm": 1.770564317703247, + "learning_rate": 2e-05, + "loss": 0.04420698, + "step": 9741 + }, + { + "epoch": 19.484, + "grad_norm": 1.256616234779358, + "learning_rate": 2e-05, + "loss": 0.03891748, + "step": 9742 + }, + { + "epoch": 19.486, + "grad_norm": 1.4175989627838135, + "learning_rate": 2e-05, + "loss": 0.03451204, + "step": 9743 + }, + { + "epoch": 19.488, + "grad_norm": 1.5900920629501343, + "learning_rate": 2e-05, + "loss": 0.05277488, + "step": 9744 + }, + { + "epoch": 19.49, + "grad_norm": 1.8523023128509521, + "learning_rate": 2e-05, + "loss": 0.04177346, + "step": 9745 + }, + { + "epoch": 19.492, + "grad_norm": 1.2272021770477295, + "learning_rate": 2e-05, + "loss": 0.02621971, + "step": 9746 + }, + { + "epoch": 19.494, + "grad_norm": 2.3542540073394775, + "learning_rate": 2e-05, + "loss": 0.04458754, + "step": 9747 + }, + { + "epoch": 19.496, + "grad_norm": 1.0178302526474, + "learning_rate": 2e-05, + "loss": 0.02959592, + "step": 9748 + }, + { + "epoch": 19.498, + "grad_norm": 1.0469385385513306, + "learning_rate": 2e-05, + "loss": 0.03299112, + "step": 9749 + }, + { + "epoch": 19.5, + "grad_norm": 1.7686867713928223, + "learning_rate": 2e-05, + "loss": 0.03709473, + "step": 9750 + }, + { + "epoch": 19.502, + "grad_norm": 1.1636873483657837, + "learning_rate": 2e-05, + "loss": 0.0360143, + "step": 9751 + }, + { + "epoch": 19.504, + "grad_norm": 1.5657356977462769, + "learning_rate": 2e-05, + "loss": 0.02885246, + "step": 9752 + }, + { + "epoch": 19.506, + "grad_norm": 1.2220760583877563, + "learning_rate": 2e-05, + "loss": 0.03826221, + "step": 9753 + }, + { + "epoch": 19.508, + "grad_norm": 1.124636173248291, + "learning_rate": 2e-05, + "loss": 0.03203499, + "step": 9754 + }, + { + "epoch": 19.51, + "grad_norm": 1.681024193763733, + "learning_rate": 2e-05, + "loss": 0.03180341, 
+ "step": 9755 + }, + { + "epoch": 19.512, + "grad_norm": 2.359029531478882, + "learning_rate": 2e-05, + "loss": 0.04386077, + "step": 9756 + }, + { + "epoch": 19.514, + "grad_norm": 1.4641876220703125, + "learning_rate": 2e-05, + "loss": 0.03449754, + "step": 9757 + }, + { + "epoch": 19.516, + "grad_norm": 1.5766197443008423, + "learning_rate": 2e-05, + "loss": 0.02979608, + "step": 9758 + }, + { + "epoch": 19.518, + "grad_norm": 2.0075721740722656, + "learning_rate": 2e-05, + "loss": 0.0430277, + "step": 9759 + }, + { + "epoch": 19.52, + "grad_norm": 1.359182596206665, + "learning_rate": 2e-05, + "loss": 0.03539016, + "step": 9760 + }, + { + "epoch": 19.522, + "grad_norm": 1.0913418531417847, + "learning_rate": 2e-05, + "loss": 0.03243357, + "step": 9761 + }, + { + "epoch": 19.524, + "grad_norm": 1.3150811195373535, + "learning_rate": 2e-05, + "loss": 0.03395069, + "step": 9762 + }, + { + "epoch": 19.526, + "grad_norm": 1.6575753688812256, + "learning_rate": 2e-05, + "loss": 0.03828706, + "step": 9763 + }, + { + "epoch": 19.528, + "grad_norm": 1.700451374053955, + "learning_rate": 2e-05, + "loss": 0.05022026, + "step": 9764 + }, + { + "epoch": 19.53, + "grad_norm": 1.0677083730697632, + "learning_rate": 2e-05, + "loss": 0.03512496, + "step": 9765 + }, + { + "epoch": 19.532, + "grad_norm": 1.0207821130752563, + "learning_rate": 2e-05, + "loss": 0.03350664, + "step": 9766 + }, + { + "epoch": 19.534, + "grad_norm": 1.220189094543457, + "learning_rate": 2e-05, + "loss": 0.03028592, + "step": 9767 + }, + { + "epoch": 19.536, + "grad_norm": 1.0478119850158691, + "learning_rate": 2e-05, + "loss": 0.02295764, + "step": 9768 + }, + { + "epoch": 19.538, + "grad_norm": 1.4206011295318604, + "learning_rate": 2e-05, + "loss": 0.04312293, + "step": 9769 + }, + { + "epoch": 19.54, + "grad_norm": 2.566202163696289, + "learning_rate": 2e-05, + "loss": 0.04163282, + "step": 9770 + }, + { + "epoch": 19.542, + "grad_norm": 1.8683357238769531, + "learning_rate": 2e-05, + "loss": 0.02420815, + "step": 9771 + }, + { + "epoch": 19.544, + "grad_norm": 1.4970320463180542, + "learning_rate": 2e-05, + "loss": 0.04130397, + "step": 9772 + }, + { + "epoch": 19.546, + "grad_norm": 1.3153479099273682, + "learning_rate": 2e-05, + "loss": 0.04295473, + "step": 9773 + }, + { + "epoch": 19.548000000000002, + "grad_norm": 1.5751969814300537, + "learning_rate": 2e-05, + "loss": 0.05208491, + "step": 9774 + }, + { + "epoch": 19.55, + "grad_norm": 1.0567619800567627, + "learning_rate": 2e-05, + "loss": 0.02717445, + "step": 9775 + }, + { + "epoch": 19.552, + "grad_norm": 1.5886588096618652, + "learning_rate": 2e-05, + "loss": 0.05369675, + "step": 9776 + }, + { + "epoch": 19.554, + "grad_norm": 1.6979105472564697, + "learning_rate": 2e-05, + "loss": 0.04253804, + "step": 9777 + }, + { + "epoch": 19.556, + "grad_norm": 1.3951151371002197, + "learning_rate": 2e-05, + "loss": 0.04301263, + "step": 9778 + }, + { + "epoch": 19.558, + "grad_norm": 3.395491361618042, + "learning_rate": 2e-05, + "loss": 0.03041153, + "step": 9779 + }, + { + "epoch": 19.56, + "grad_norm": 1.3777649402618408, + "learning_rate": 2e-05, + "loss": 0.04786427, + "step": 9780 + }, + { + "epoch": 19.562, + "grad_norm": 1.122922420501709, + "learning_rate": 2e-05, + "loss": 0.03683935, + "step": 9781 + }, + { + "epoch": 19.564, + "grad_norm": 2.4130606651306152, + "learning_rate": 2e-05, + "loss": 0.04994403, + "step": 9782 + }, + { + "epoch": 19.566, + "grad_norm": 2.533428907394409, + "learning_rate": 2e-05, + "loss": 0.03640559, + "step": 9783 + }, + { + 
"epoch": 19.568, + "grad_norm": 1.6431173086166382, + "learning_rate": 2e-05, + "loss": 0.03542903, + "step": 9784 + }, + { + "epoch": 19.57, + "grad_norm": 0.8823850750923157, + "learning_rate": 2e-05, + "loss": 0.02428854, + "step": 9785 + }, + { + "epoch": 19.572, + "grad_norm": 1.7447413206100464, + "learning_rate": 2e-05, + "loss": 0.05290275, + "step": 9786 + }, + { + "epoch": 19.574, + "grad_norm": 1.2419053316116333, + "learning_rate": 2e-05, + "loss": 0.04086098, + "step": 9787 + }, + { + "epoch": 19.576, + "grad_norm": 2.450349807739258, + "learning_rate": 2e-05, + "loss": 0.0346169, + "step": 9788 + }, + { + "epoch": 19.578, + "grad_norm": 1.867328405380249, + "learning_rate": 2e-05, + "loss": 0.04954457, + "step": 9789 + }, + { + "epoch": 19.58, + "grad_norm": 1.358357310295105, + "learning_rate": 2e-05, + "loss": 0.04382439, + "step": 9790 + }, + { + "epoch": 19.582, + "grad_norm": 1.9822919368743896, + "learning_rate": 2e-05, + "loss": 0.03268323, + "step": 9791 + }, + { + "epoch": 19.584, + "grad_norm": 1.2838702201843262, + "learning_rate": 2e-05, + "loss": 0.02432247, + "step": 9792 + }, + { + "epoch": 19.586, + "grad_norm": 1.2836079597473145, + "learning_rate": 2e-05, + "loss": 0.03453143, + "step": 9793 + }, + { + "epoch": 19.588, + "grad_norm": 1.7768176794052124, + "learning_rate": 2e-05, + "loss": 0.03774252, + "step": 9794 + }, + { + "epoch": 19.59, + "grad_norm": 1.0157089233398438, + "learning_rate": 2e-05, + "loss": 0.03381804, + "step": 9795 + }, + { + "epoch": 19.592, + "grad_norm": 2.1426620483398438, + "learning_rate": 2e-05, + "loss": 0.04200416, + "step": 9796 + }, + { + "epoch": 19.594, + "grad_norm": 1.004624605178833, + "learning_rate": 2e-05, + "loss": 0.02966635, + "step": 9797 + }, + { + "epoch": 19.596, + "grad_norm": 1.0430629253387451, + "learning_rate": 2e-05, + "loss": 0.03210926, + "step": 9798 + }, + { + "epoch": 19.598, + "grad_norm": 0.9733067750930786, + "learning_rate": 2e-05, + "loss": 0.02974671, + "step": 9799 + }, + { + "epoch": 19.6, + "grad_norm": 2.1266605854034424, + "learning_rate": 2e-05, + "loss": 0.04483049, + "step": 9800 + }, + { + "epoch": 19.602, + "grad_norm": 1.2150872945785522, + "learning_rate": 2e-05, + "loss": 0.04423954, + "step": 9801 + }, + { + "epoch": 19.604, + "grad_norm": 1.1437227725982666, + "learning_rate": 2e-05, + "loss": 0.03814924, + "step": 9802 + }, + { + "epoch": 19.606, + "grad_norm": 1.234365701675415, + "learning_rate": 2e-05, + "loss": 0.04211161, + "step": 9803 + }, + { + "epoch": 19.608, + "grad_norm": 1.2904542684555054, + "learning_rate": 2e-05, + "loss": 0.039315, + "step": 9804 + }, + { + "epoch": 19.61, + "grad_norm": 1.0852253437042236, + "learning_rate": 2e-05, + "loss": 0.03015836, + "step": 9805 + }, + { + "epoch": 19.612, + "grad_norm": 2.3996999263763428, + "learning_rate": 2e-05, + "loss": 0.04053651, + "step": 9806 + }, + { + "epoch": 19.614, + "grad_norm": 1.0746995210647583, + "learning_rate": 2e-05, + "loss": 0.03122765, + "step": 9807 + }, + { + "epoch": 19.616, + "grad_norm": 1.0592286586761475, + "learning_rate": 2e-05, + "loss": 0.03569851, + "step": 9808 + }, + { + "epoch": 19.618, + "grad_norm": 1.2815927267074585, + "learning_rate": 2e-05, + "loss": 0.03232668, + "step": 9809 + }, + { + "epoch": 19.62, + "grad_norm": 1.2451739311218262, + "learning_rate": 2e-05, + "loss": 0.02683158, + "step": 9810 + }, + { + "epoch": 19.622, + "grad_norm": 0.9535860419273376, + "learning_rate": 2e-05, + "loss": 0.02343744, + "step": 9811 + }, + { + "epoch": 19.624, + "grad_norm": 
2.1150941848754883, + "learning_rate": 2e-05, + "loss": 0.03646648, + "step": 9812 + }, + { + "epoch": 19.626, + "grad_norm": 1.4279240369796753, + "learning_rate": 2e-05, + "loss": 0.03931186, + "step": 9813 + }, + { + "epoch": 19.628, + "grad_norm": 1.9593764543533325, + "learning_rate": 2e-05, + "loss": 0.04083652, + "step": 9814 + }, + { + "epoch": 19.63, + "grad_norm": 1.289267659187317, + "learning_rate": 2e-05, + "loss": 0.03806636, + "step": 9815 + }, + { + "epoch": 19.632, + "grad_norm": 2.0705997943878174, + "learning_rate": 2e-05, + "loss": 0.0471487, + "step": 9816 + }, + { + "epoch": 19.634, + "grad_norm": 1.577562928199768, + "learning_rate": 2e-05, + "loss": 0.02332294, + "step": 9817 + }, + { + "epoch": 19.636, + "grad_norm": 1.0667752027511597, + "learning_rate": 2e-05, + "loss": 0.03316651, + "step": 9818 + }, + { + "epoch": 19.638, + "grad_norm": 1.272627592086792, + "learning_rate": 2e-05, + "loss": 0.0328108, + "step": 9819 + }, + { + "epoch": 19.64, + "grad_norm": 1.1442145109176636, + "learning_rate": 2e-05, + "loss": 0.03424655, + "step": 9820 + }, + { + "epoch": 19.642, + "grad_norm": 0.977202832698822, + "learning_rate": 2e-05, + "loss": 0.02730237, + "step": 9821 + }, + { + "epoch": 19.644, + "grad_norm": 1.2419651746749878, + "learning_rate": 2e-05, + "loss": 0.03938533, + "step": 9822 + }, + { + "epoch": 19.646, + "grad_norm": 1.5671879053115845, + "learning_rate": 2e-05, + "loss": 0.03334674, + "step": 9823 + }, + { + "epoch": 19.648, + "grad_norm": 1.2008931636810303, + "learning_rate": 2e-05, + "loss": 0.03198599, + "step": 9824 + }, + { + "epoch": 19.65, + "grad_norm": 0.8605839610099792, + "learning_rate": 2e-05, + "loss": 0.02071075, + "step": 9825 + }, + { + "epoch": 19.652, + "grad_norm": 1.1936160326004028, + "learning_rate": 2e-05, + "loss": 0.041041, + "step": 9826 + }, + { + "epoch": 19.654, + "grad_norm": 0.7941514253616333, + "learning_rate": 2e-05, + "loss": 0.01732956, + "step": 9827 + }, + { + "epoch": 19.656, + "grad_norm": 0.9034144282341003, + "learning_rate": 2e-05, + "loss": 0.02284335, + "step": 9828 + }, + { + "epoch": 19.658, + "grad_norm": 2.9886972904205322, + "learning_rate": 2e-05, + "loss": 0.05936865, + "step": 9829 + }, + { + "epoch": 19.66, + "grad_norm": 1.7433563470840454, + "learning_rate": 2e-05, + "loss": 0.04660515, + "step": 9830 + }, + { + "epoch": 19.662, + "grad_norm": 1.4639521837234497, + "learning_rate": 2e-05, + "loss": 0.03722664, + "step": 9831 + }, + { + "epoch": 19.664, + "grad_norm": 1.6266924142837524, + "learning_rate": 2e-05, + "loss": 0.04071777, + "step": 9832 + }, + { + "epoch": 19.666, + "grad_norm": 1.52934730052948, + "learning_rate": 2e-05, + "loss": 0.03618839, + "step": 9833 + }, + { + "epoch": 19.668, + "grad_norm": 1.321384310722351, + "learning_rate": 2e-05, + "loss": 0.04987688, + "step": 9834 + }, + { + "epoch": 19.67, + "grad_norm": 1.1635421514511108, + "learning_rate": 2e-05, + "loss": 0.03412395, + "step": 9835 + }, + { + "epoch": 19.672, + "grad_norm": 2.087972402572632, + "learning_rate": 2e-05, + "loss": 0.02932746, + "step": 9836 + }, + { + "epoch": 19.674, + "grad_norm": 1.8116962909698486, + "learning_rate": 2e-05, + "loss": 0.05201422, + "step": 9837 + }, + { + "epoch": 19.676, + "grad_norm": 1.7317609786987305, + "learning_rate": 2e-05, + "loss": 0.03608788, + "step": 9838 + }, + { + "epoch": 19.678, + "grad_norm": 1.6292537450790405, + "learning_rate": 2e-05, + "loss": 0.03296152, + "step": 9839 + }, + { + "epoch": 19.68, + "grad_norm": 1.4448356628417969, + "learning_rate": 
2e-05, + "loss": 0.04338595, + "step": 9840 + }, + { + "epoch": 19.682, + "grad_norm": 1.1970990896224976, + "learning_rate": 2e-05, + "loss": 0.02565513, + "step": 9841 + }, + { + "epoch": 19.684, + "grad_norm": 1.4799765348434448, + "learning_rate": 2e-05, + "loss": 0.04556567, + "step": 9842 + }, + { + "epoch": 19.686, + "grad_norm": 1.1363059282302856, + "learning_rate": 2e-05, + "loss": 0.03603154, + "step": 9843 + }, + { + "epoch": 19.688, + "grad_norm": 2.854661703109741, + "learning_rate": 2e-05, + "loss": 0.05047087, + "step": 9844 + }, + { + "epoch": 19.69, + "grad_norm": 0.8889420032501221, + "learning_rate": 2e-05, + "loss": 0.02277106, + "step": 9845 + }, + { + "epoch": 19.692, + "grad_norm": 1.6678924560546875, + "learning_rate": 2e-05, + "loss": 0.03258391, + "step": 9846 + }, + { + "epoch": 19.694, + "grad_norm": 3.123685598373413, + "learning_rate": 2e-05, + "loss": 0.03615994, + "step": 9847 + }, + { + "epoch": 19.696, + "grad_norm": 1.1917728185653687, + "learning_rate": 2e-05, + "loss": 0.0365363, + "step": 9848 + }, + { + "epoch": 19.698, + "grad_norm": 1.2014756202697754, + "learning_rate": 2e-05, + "loss": 0.03089717, + "step": 9849 + }, + { + "epoch": 19.7, + "grad_norm": 2.1376969814300537, + "learning_rate": 2e-05, + "loss": 0.04704689, + "step": 9850 + }, + { + "epoch": 19.701999999999998, + "grad_norm": 1.4255774021148682, + "learning_rate": 2e-05, + "loss": 0.02486222, + "step": 9851 + }, + { + "epoch": 19.704, + "grad_norm": 1.1981037855148315, + "learning_rate": 2e-05, + "loss": 0.04301386, + "step": 9852 + }, + { + "epoch": 19.706, + "grad_norm": 1.5258287191390991, + "learning_rate": 2e-05, + "loss": 0.05485792, + "step": 9853 + }, + { + "epoch": 19.708, + "grad_norm": 1.6070016622543335, + "learning_rate": 2e-05, + "loss": 0.03380585, + "step": 9854 + }, + { + "epoch": 19.71, + "grad_norm": 1.0304168462753296, + "learning_rate": 2e-05, + "loss": 0.02755873, + "step": 9855 + }, + { + "epoch": 19.712, + "grad_norm": 2.085618257522583, + "learning_rate": 2e-05, + "loss": 0.04198353, + "step": 9856 + }, + { + "epoch": 19.714, + "grad_norm": 1.2231016159057617, + "learning_rate": 2e-05, + "loss": 0.03908198, + "step": 9857 + }, + { + "epoch": 19.716, + "grad_norm": 2.0941994190216064, + "learning_rate": 2e-05, + "loss": 0.04153078, + "step": 9858 + }, + { + "epoch": 19.718, + "grad_norm": 1.3140674829483032, + "learning_rate": 2e-05, + "loss": 0.03461843, + "step": 9859 + }, + { + "epoch": 19.72, + "grad_norm": 2.5073509216308594, + "learning_rate": 2e-05, + "loss": 0.05611764, + "step": 9860 + }, + { + "epoch": 19.722, + "grad_norm": 1.2403875589370728, + "learning_rate": 2e-05, + "loss": 0.04040193, + "step": 9861 + }, + { + "epoch": 19.724, + "grad_norm": 1.2236120700836182, + "learning_rate": 2e-05, + "loss": 0.04124648, + "step": 9862 + }, + { + "epoch": 19.726, + "grad_norm": 1.410003662109375, + "learning_rate": 2e-05, + "loss": 0.0384649, + "step": 9863 + }, + { + "epoch": 19.728, + "grad_norm": 1.1842060089111328, + "learning_rate": 2e-05, + "loss": 0.03703492, + "step": 9864 + }, + { + "epoch": 19.73, + "grad_norm": 1.596753478050232, + "learning_rate": 2e-05, + "loss": 0.03340092, + "step": 9865 + }, + { + "epoch": 19.732, + "grad_norm": 1.6975880861282349, + "learning_rate": 2e-05, + "loss": 0.04212178, + "step": 9866 + }, + { + "epoch": 19.734, + "grad_norm": 2.210099458694458, + "learning_rate": 2e-05, + "loss": 0.03261279, + "step": 9867 + }, + { + "epoch": 19.736, + "grad_norm": 1.1055867671966553, + "learning_rate": 2e-05, + "loss": 0.03092555, 
+ "step": 9868 + }, + { + "epoch": 19.738, + "grad_norm": 1.1590442657470703, + "learning_rate": 2e-05, + "loss": 0.02638486, + "step": 9869 + }, + { + "epoch": 19.74, + "grad_norm": 1.1977529525756836, + "learning_rate": 2e-05, + "loss": 0.03209082, + "step": 9870 + }, + { + "epoch": 19.742, + "grad_norm": 1.3903778791427612, + "learning_rate": 2e-05, + "loss": 0.03176203, + "step": 9871 + }, + { + "epoch": 19.744, + "grad_norm": 1.7422199249267578, + "learning_rate": 2e-05, + "loss": 0.03671364, + "step": 9872 + }, + { + "epoch": 19.746, + "grad_norm": 1.0997118949890137, + "learning_rate": 2e-05, + "loss": 0.03464361, + "step": 9873 + }, + { + "epoch": 19.748, + "grad_norm": 1.4066072702407837, + "learning_rate": 2e-05, + "loss": 0.03441656, + "step": 9874 + }, + { + "epoch": 19.75, + "grad_norm": 1.0262517929077148, + "learning_rate": 2e-05, + "loss": 0.03322373, + "step": 9875 + }, + { + "epoch": 19.752, + "grad_norm": 1.3730692863464355, + "learning_rate": 2e-05, + "loss": 0.04439168, + "step": 9876 + }, + { + "epoch": 19.754, + "grad_norm": 1.2978415489196777, + "learning_rate": 2e-05, + "loss": 0.04093964, + "step": 9877 + }, + { + "epoch": 19.756, + "grad_norm": 1.0929715633392334, + "learning_rate": 2e-05, + "loss": 0.0319971, + "step": 9878 + }, + { + "epoch": 19.758, + "grad_norm": 1.8838789463043213, + "learning_rate": 2e-05, + "loss": 0.0498081, + "step": 9879 + }, + { + "epoch": 19.76, + "grad_norm": 1.343527913093567, + "learning_rate": 2e-05, + "loss": 0.03659369, + "step": 9880 + }, + { + "epoch": 19.762, + "grad_norm": 2.9458236694335938, + "learning_rate": 2e-05, + "loss": 0.05308362, + "step": 9881 + }, + { + "epoch": 19.764, + "grad_norm": 1.1302599906921387, + "learning_rate": 2e-05, + "loss": 0.03225252, + "step": 9882 + }, + { + "epoch": 19.766, + "grad_norm": 1.005309820175171, + "learning_rate": 2e-05, + "loss": 0.02554948, + "step": 9883 + }, + { + "epoch": 19.768, + "grad_norm": 1.148569107055664, + "learning_rate": 2e-05, + "loss": 0.0364852, + "step": 9884 + }, + { + "epoch": 19.77, + "grad_norm": 1.1067266464233398, + "learning_rate": 2e-05, + "loss": 0.03569014, + "step": 9885 + }, + { + "epoch": 19.772, + "grad_norm": 1.2811511754989624, + "learning_rate": 2e-05, + "loss": 0.02505126, + "step": 9886 + }, + { + "epoch": 19.774, + "grad_norm": 1.8366814851760864, + "learning_rate": 2e-05, + "loss": 0.04801732, + "step": 9887 + }, + { + "epoch": 19.776, + "grad_norm": 1.0430152416229248, + "learning_rate": 2e-05, + "loss": 0.02715734, + "step": 9888 + }, + { + "epoch": 19.778, + "grad_norm": 1.6979142427444458, + "learning_rate": 2e-05, + "loss": 0.02943612, + "step": 9889 + }, + { + "epoch": 19.78, + "grad_norm": 2.1606314182281494, + "learning_rate": 2e-05, + "loss": 0.05470781, + "step": 9890 + }, + { + "epoch": 19.782, + "grad_norm": 1.1862667798995972, + "learning_rate": 2e-05, + "loss": 0.0372127, + "step": 9891 + }, + { + "epoch": 19.784, + "grad_norm": 1.5462552309036255, + "learning_rate": 2e-05, + "loss": 0.04436889, + "step": 9892 + }, + { + "epoch": 19.786, + "grad_norm": 1.1272681951522827, + "learning_rate": 2e-05, + "loss": 0.02969773, + "step": 9893 + }, + { + "epoch": 19.788, + "grad_norm": 1.8199870586395264, + "learning_rate": 2e-05, + "loss": 0.04257311, + "step": 9894 + }, + { + "epoch": 19.79, + "grad_norm": 1.1364978551864624, + "learning_rate": 2e-05, + "loss": 0.03960226, + "step": 9895 + }, + { + "epoch": 19.792, + "grad_norm": 5.6248908042907715, + "learning_rate": 2e-05, + "loss": 0.05272412, + "step": 9896 + }, + { + "epoch": 
19.794, + "grad_norm": 1.9296268224716187, + "learning_rate": 2e-05, + "loss": 0.0427717, + "step": 9897 + }, + { + "epoch": 19.796, + "grad_norm": 1.5250190496444702, + "learning_rate": 2e-05, + "loss": 0.04301464, + "step": 9898 + }, + { + "epoch": 19.798000000000002, + "grad_norm": 3.7405781745910645, + "learning_rate": 2e-05, + "loss": 0.03769503, + "step": 9899 + }, + { + "epoch": 19.8, + "grad_norm": 1.0620787143707275, + "learning_rate": 2e-05, + "loss": 0.02631702, + "step": 9900 + }, + { + "epoch": 19.802, + "grad_norm": 1.9227813482284546, + "learning_rate": 2e-05, + "loss": 0.03936617, + "step": 9901 + }, + { + "epoch": 19.804, + "grad_norm": 1.4326848983764648, + "learning_rate": 2e-05, + "loss": 0.03811381, + "step": 9902 + }, + { + "epoch": 19.806, + "grad_norm": 0.9656258821487427, + "learning_rate": 2e-05, + "loss": 0.02461326, + "step": 9903 + }, + { + "epoch": 19.808, + "grad_norm": 1.468076229095459, + "learning_rate": 2e-05, + "loss": 0.04942961, + "step": 9904 + }, + { + "epoch": 19.81, + "grad_norm": 1.0729457139968872, + "learning_rate": 2e-05, + "loss": 0.03775813, + "step": 9905 + }, + { + "epoch": 19.812, + "grad_norm": 2.371782064437866, + "learning_rate": 2e-05, + "loss": 0.04209847, + "step": 9906 + }, + { + "epoch": 19.814, + "grad_norm": 1.1211527585983276, + "learning_rate": 2e-05, + "loss": 0.03618193, + "step": 9907 + }, + { + "epoch": 19.816, + "grad_norm": 1.2822222709655762, + "learning_rate": 2e-05, + "loss": 0.02676849, + "step": 9908 + }, + { + "epoch": 19.818, + "grad_norm": 1.2275272607803345, + "learning_rate": 2e-05, + "loss": 0.03364244, + "step": 9909 + }, + { + "epoch": 19.82, + "grad_norm": 1.4807791709899902, + "learning_rate": 2e-05, + "loss": 0.03341034, + "step": 9910 + }, + { + "epoch": 19.822, + "grad_norm": 0.8294656872749329, + "learning_rate": 2e-05, + "loss": 0.02359098, + "step": 9911 + }, + { + "epoch": 19.824, + "grad_norm": 1.1592644453048706, + "learning_rate": 2e-05, + "loss": 0.0353507, + "step": 9912 + }, + { + "epoch": 19.826, + "grad_norm": 1.078616976737976, + "learning_rate": 2e-05, + "loss": 0.02933605, + "step": 9913 + }, + { + "epoch": 19.828, + "grad_norm": 1.74703848361969, + "learning_rate": 2e-05, + "loss": 0.03395237, + "step": 9914 + }, + { + "epoch": 19.83, + "grad_norm": 0.9906914830207825, + "learning_rate": 2e-05, + "loss": 0.03834459, + "step": 9915 + }, + { + "epoch": 19.832, + "grad_norm": 1.5938894748687744, + "learning_rate": 2e-05, + "loss": 0.04390221, + "step": 9916 + }, + { + "epoch": 19.834, + "grad_norm": 0.8304893374443054, + "learning_rate": 2e-05, + "loss": 0.02339101, + "step": 9917 + }, + { + "epoch": 19.836, + "grad_norm": 2.1542248725891113, + "learning_rate": 2e-05, + "loss": 0.04835535, + "step": 9918 + }, + { + "epoch": 19.838, + "grad_norm": 1.7245725393295288, + "learning_rate": 2e-05, + "loss": 0.04382911, + "step": 9919 + }, + { + "epoch": 19.84, + "grad_norm": 1.0681260824203491, + "learning_rate": 2e-05, + "loss": 0.0333902, + "step": 9920 + }, + { + "epoch": 19.842, + "grad_norm": 1.5578216314315796, + "learning_rate": 2e-05, + "loss": 0.03641013, + "step": 9921 + }, + { + "epoch": 19.844, + "grad_norm": 0.9164597392082214, + "learning_rate": 2e-05, + "loss": 0.02537297, + "step": 9922 + }, + { + "epoch": 19.846, + "grad_norm": 2.094348192214966, + "learning_rate": 2e-05, + "loss": 0.04775488, + "step": 9923 + }, + { + "epoch": 19.848, + "grad_norm": 2.1640682220458984, + "learning_rate": 2e-05, + "loss": 0.03262997, + "step": 9924 + }, + { + "epoch": 19.85, + "grad_norm": 
1.3162791728973389, + "learning_rate": 2e-05, + "loss": 0.03451164, + "step": 9925 + }, + { + "epoch": 19.852, + "grad_norm": 1.047416090965271, + "learning_rate": 2e-05, + "loss": 0.02686568, + "step": 9926 + }, + { + "epoch": 19.854, + "grad_norm": 1.4626446962356567, + "learning_rate": 2e-05, + "loss": 0.05131056, + "step": 9927 + }, + { + "epoch": 19.856, + "grad_norm": 1.1286470890045166, + "learning_rate": 2e-05, + "loss": 0.03642206, + "step": 9928 + }, + { + "epoch": 19.858, + "grad_norm": 1.7644599676132202, + "learning_rate": 2e-05, + "loss": 0.04343008, + "step": 9929 + }, + { + "epoch": 19.86, + "grad_norm": 0.8793272972106934, + "learning_rate": 2e-05, + "loss": 0.02633665, + "step": 9930 + }, + { + "epoch": 19.862, + "grad_norm": 1.3826559782028198, + "learning_rate": 2e-05, + "loss": 0.03667331, + "step": 9931 + }, + { + "epoch": 19.864, + "grad_norm": 1.5260701179504395, + "learning_rate": 2e-05, + "loss": 0.03625732, + "step": 9932 + }, + { + "epoch": 19.866, + "grad_norm": 1.5434454679489136, + "learning_rate": 2e-05, + "loss": 0.04190221, + "step": 9933 + }, + { + "epoch": 19.868, + "grad_norm": 1.4295403957366943, + "learning_rate": 2e-05, + "loss": 0.03398123, + "step": 9934 + }, + { + "epoch": 19.87, + "grad_norm": 1.359604835510254, + "learning_rate": 2e-05, + "loss": 0.04142676, + "step": 9935 + }, + { + "epoch": 19.872, + "grad_norm": 2.0198137760162354, + "learning_rate": 2e-05, + "loss": 0.05195599, + "step": 9936 + }, + { + "epoch": 19.874, + "grad_norm": 0.9484997987747192, + "learning_rate": 2e-05, + "loss": 0.0277728, + "step": 9937 + }, + { + "epoch": 19.876, + "grad_norm": 1.3963159322738647, + "learning_rate": 2e-05, + "loss": 0.05602345, + "step": 9938 + }, + { + "epoch": 19.878, + "grad_norm": 1.419621229171753, + "learning_rate": 2e-05, + "loss": 0.02756134, + "step": 9939 + }, + { + "epoch": 19.88, + "grad_norm": 2.4479565620422363, + "learning_rate": 2e-05, + "loss": 0.0412005, + "step": 9940 + }, + { + "epoch": 19.882, + "grad_norm": 1.392535924911499, + "learning_rate": 2e-05, + "loss": 0.03324911, + "step": 9941 + }, + { + "epoch": 19.884, + "grad_norm": 0.9907611608505249, + "learning_rate": 2e-05, + "loss": 0.03080793, + "step": 9942 + }, + { + "epoch": 19.886, + "grad_norm": 1.6985739469528198, + "learning_rate": 2e-05, + "loss": 0.04235946, + "step": 9943 + }, + { + "epoch": 19.888, + "grad_norm": 1.201528787612915, + "learning_rate": 2e-05, + "loss": 0.02874477, + "step": 9944 + }, + { + "epoch": 19.89, + "grad_norm": 1.272756814956665, + "learning_rate": 2e-05, + "loss": 0.03114101, + "step": 9945 + }, + { + "epoch": 19.892, + "grad_norm": 2.536895513534546, + "learning_rate": 2e-05, + "loss": 0.04412911, + "step": 9946 + }, + { + "epoch": 19.894, + "grad_norm": 1.345763087272644, + "learning_rate": 2e-05, + "loss": 0.03955846, + "step": 9947 + }, + { + "epoch": 19.896, + "grad_norm": 1.703199863433838, + "learning_rate": 2e-05, + "loss": 0.04491033, + "step": 9948 + }, + { + "epoch": 19.898, + "grad_norm": 2.1777453422546387, + "learning_rate": 2e-05, + "loss": 0.0519994, + "step": 9949 + }, + { + "epoch": 19.9, + "grad_norm": 1.9769400358200073, + "learning_rate": 2e-05, + "loss": 0.04786195, + "step": 9950 + }, + { + "epoch": 19.902, + "grad_norm": 0.9934007525444031, + "learning_rate": 2e-05, + "loss": 0.02457796, + "step": 9951 + }, + { + "epoch": 19.904, + "grad_norm": 1.0393855571746826, + "learning_rate": 2e-05, + "loss": 0.03693616, + "step": 9952 + }, + { + "epoch": 19.906, + "grad_norm": 1.4611011743545532, + "learning_rate": 
2e-05, + "loss": 0.03181311, + "step": 9953 + }, + { + "epoch": 19.908, + "grad_norm": 1.9323896169662476, + "learning_rate": 2e-05, + "loss": 0.04420885, + "step": 9954 + }, + { + "epoch": 19.91, + "grad_norm": 1.2806192636489868, + "learning_rate": 2e-05, + "loss": 0.0216569, + "step": 9955 + }, + { + "epoch": 19.912, + "grad_norm": 2.6190237998962402, + "learning_rate": 2e-05, + "loss": 0.04326738, + "step": 9956 + }, + { + "epoch": 19.914, + "grad_norm": 1.5687636137008667, + "learning_rate": 2e-05, + "loss": 0.03183849, + "step": 9957 + }, + { + "epoch": 19.916, + "grad_norm": 1.1878702640533447, + "learning_rate": 2e-05, + "loss": 0.03523634, + "step": 9958 + }, + { + "epoch": 19.918, + "grad_norm": 1.3091981410980225, + "learning_rate": 2e-05, + "loss": 0.03142114, + "step": 9959 + }, + { + "epoch": 19.92, + "grad_norm": 2.1375269889831543, + "learning_rate": 2e-05, + "loss": 0.04402493, + "step": 9960 + }, + { + "epoch": 19.922, + "grad_norm": 1.4389981031417847, + "learning_rate": 2e-05, + "loss": 0.03660287, + "step": 9961 + }, + { + "epoch": 19.924, + "grad_norm": 1.4258970022201538, + "learning_rate": 2e-05, + "loss": 0.03248418, + "step": 9962 + }, + { + "epoch": 19.926, + "grad_norm": 1.94060480594635, + "learning_rate": 2e-05, + "loss": 0.05550197, + "step": 9963 + }, + { + "epoch": 19.928, + "grad_norm": 2.0639617443084717, + "learning_rate": 2e-05, + "loss": 0.04555191, + "step": 9964 + }, + { + "epoch": 19.93, + "grad_norm": 1.4543213844299316, + "learning_rate": 2e-05, + "loss": 0.02980587, + "step": 9965 + }, + { + "epoch": 19.932, + "grad_norm": 1.0913479328155518, + "learning_rate": 2e-05, + "loss": 0.03366387, + "step": 9966 + }, + { + "epoch": 19.934, + "grad_norm": 1.459293246269226, + "learning_rate": 2e-05, + "loss": 0.03716648, + "step": 9967 + }, + { + "epoch": 19.936, + "grad_norm": 1.5800535678863525, + "learning_rate": 2e-05, + "loss": 0.04051695, + "step": 9968 + }, + { + "epoch": 19.938, + "grad_norm": 1.6184855699539185, + "learning_rate": 2e-05, + "loss": 0.0413382, + "step": 9969 + }, + { + "epoch": 19.94, + "grad_norm": 1.287768840789795, + "learning_rate": 2e-05, + "loss": 0.02909556, + "step": 9970 + }, + { + "epoch": 19.942, + "grad_norm": 2.3062894344329834, + "learning_rate": 2e-05, + "loss": 0.04965471, + "step": 9971 + }, + { + "epoch": 19.944, + "grad_norm": 1.639174222946167, + "learning_rate": 2e-05, + "loss": 0.03276775, + "step": 9972 + }, + { + "epoch": 19.946, + "grad_norm": 1.1336901187896729, + "learning_rate": 2e-05, + "loss": 0.03398212, + "step": 9973 + }, + { + "epoch": 19.948, + "grad_norm": 1.7506204843521118, + "learning_rate": 2e-05, + "loss": 0.05709363, + "step": 9974 + }, + { + "epoch": 19.95, + "grad_norm": 1.0985453128814697, + "learning_rate": 2e-05, + "loss": 0.03749266, + "step": 9975 + }, + { + "epoch": 19.951999999999998, + "grad_norm": 1.533744215965271, + "learning_rate": 2e-05, + "loss": 0.04838315, + "step": 9976 + }, + { + "epoch": 19.954, + "grad_norm": 1.545249104499817, + "learning_rate": 2e-05, + "loss": 0.03769622, + "step": 9977 + }, + { + "epoch": 19.956, + "grad_norm": 1.0320427417755127, + "learning_rate": 2e-05, + "loss": 0.02421537, + "step": 9978 + }, + { + "epoch": 19.958, + "grad_norm": 1.2649873495101929, + "learning_rate": 2e-05, + "loss": 0.03971934, + "step": 9979 + }, + { + "epoch": 19.96, + "grad_norm": 1.2545596361160278, + "learning_rate": 2e-05, + "loss": 0.04188591, + "step": 9980 + }, + { + "epoch": 19.962, + "grad_norm": 1.0163325071334839, + "learning_rate": 2e-05, + "loss": 0.02098066, 
+ "step": 9981 + }, + { + "epoch": 19.964, + "grad_norm": 0.8341543674468994, + "learning_rate": 2e-05, + "loss": 0.01667216, + "step": 9982 + }, + { + "epoch": 19.966, + "grad_norm": 1.1888387203216553, + "learning_rate": 2e-05, + "loss": 0.0333435, + "step": 9983 + }, + { + "epoch": 19.968, + "grad_norm": 1.6849191188812256, + "learning_rate": 2e-05, + "loss": 0.04021031, + "step": 9984 + }, + { + "epoch": 19.97, + "grad_norm": 1.1306986808776855, + "learning_rate": 2e-05, + "loss": 0.03524192, + "step": 9985 + }, + { + "epoch": 19.972, + "grad_norm": 1.2705979347229004, + "learning_rate": 2e-05, + "loss": 0.045315, + "step": 9986 + }, + { + "epoch": 19.974, + "grad_norm": 3.2684991359710693, + "learning_rate": 2e-05, + "loss": 0.05361638, + "step": 9987 + }, + { + "epoch": 19.976, + "grad_norm": 2.3001787662506104, + "learning_rate": 2e-05, + "loss": 0.0321941, + "step": 9988 + }, + { + "epoch": 19.978, + "grad_norm": 1.2926708459854126, + "learning_rate": 2e-05, + "loss": 0.03842488, + "step": 9989 + }, + { + "epoch": 19.98, + "grad_norm": 1.6358253955841064, + "learning_rate": 2e-05, + "loss": 0.04522712, + "step": 9990 + }, + { + "epoch": 19.982, + "grad_norm": 1.0329595804214478, + "learning_rate": 2e-05, + "loss": 0.02501506, + "step": 9991 + }, + { + "epoch": 19.984, + "grad_norm": 1.3090020418167114, + "learning_rate": 2e-05, + "loss": 0.03367433, + "step": 9992 + }, + { + "epoch": 19.986, + "grad_norm": 1.0016107559204102, + "learning_rate": 2e-05, + "loss": 0.02978981, + "step": 9993 + }, + { + "epoch": 19.988, + "grad_norm": 1.311272382736206, + "learning_rate": 2e-05, + "loss": 0.04261601, + "step": 9994 + }, + { + "epoch": 19.99, + "grad_norm": 0.9918799996376038, + "learning_rate": 2e-05, + "loss": 0.02678192, + "step": 9995 + }, + { + "epoch": 19.992, + "grad_norm": 1.2438549995422363, + "learning_rate": 2e-05, + "loss": 0.04806288, + "step": 9996 + }, + { + "epoch": 19.994, + "grad_norm": 1.1533282995224, + "learning_rate": 2e-05, + "loss": 0.03929444, + "step": 9997 + }, + { + "epoch": 19.996, + "grad_norm": 1.9555519819259644, + "learning_rate": 2e-05, + "loss": 0.04684892, + "step": 9998 + }, + { + "epoch": 19.998, + "grad_norm": 1.2001152038574219, + "learning_rate": 2e-05, + "loss": 0.02799315, + "step": 9999 + }, + { + "epoch": 20.0, + "grad_norm": 1.3295609951019287, + "learning_rate": 2e-05, + "loss": 0.04259039, + "step": 10000 + }, + { + "epoch": 20.0, + "eval_performance": { + "AngleClassification_1": 0.99, + "AngleClassification_2": 1.0, + "AngleClassification_3": 0.9740518962075848, + "Equal_1": 0.996, + "Equal_2": 0.9820359281437125, + "Equal_3": 0.9520958083832335, + "LineComparison_1": 1.0, + "LineComparison_2": 0.998003992015968, + "LineComparison_3": 0.9940119760479041, + "Parallel_1": 0.9879759519038076, + "Parallel_2": 0.9939879759519038, + "Parallel_3": 0.99, + "Perpendicular_1": 0.998, + "Perpendicular_2": 0.986, + "Perpendicular_3": 0.7975951903807615, + "PointLiesOnCircle_1": 1.0, + "PointLiesOnCircle_2": 0.998, + "PointLiesOnCircle_3": 0.994, + "PointLiesOnLine_1": 0.9979959919839679, + "PointLiesOnLine_2": 0.9919839679358717, + "PointLiesOnLine_3": 0.9800399201596807 + }, + "eval_runtime": 319.7877, + "eval_samples_per_second": 32.834, + "eval_steps_per_second": 0.657, + "step": 10000 + }, + { + "epoch": 20.002, + "grad_norm": 1.157810926437378, + "learning_rate": 2e-05, + "loss": 0.02725174, + "step": 10001 + }, + { + "epoch": 20.004, + "grad_norm": 2.2255258560180664, + "learning_rate": 2e-05, + "loss": 0.04750681, + "step": 10002 + }, + { + 
"epoch": 20.006, + "grad_norm": 0.977530300617218, + "learning_rate": 2e-05, + "loss": 0.03366182, + "step": 10003 + }, + { + "epoch": 20.008, + "grad_norm": 2.368126392364502, + "learning_rate": 2e-05, + "loss": 0.03093569, + "step": 10004 + }, + { + "epoch": 20.01, + "grad_norm": 1.1914955377578735, + "learning_rate": 2e-05, + "loss": 0.04483454, + "step": 10005 + }, + { + "epoch": 20.012, + "grad_norm": 1.2124160528182983, + "learning_rate": 2e-05, + "loss": 0.03203357, + "step": 10006 + }, + { + "epoch": 20.014, + "grad_norm": 2.622993230819702, + "learning_rate": 2e-05, + "loss": 0.0496589, + "step": 10007 + }, + { + "epoch": 20.016, + "grad_norm": 1.2373409271240234, + "learning_rate": 2e-05, + "loss": 0.04058324, + "step": 10008 + }, + { + "epoch": 20.018, + "grad_norm": 1.6478288173675537, + "learning_rate": 2e-05, + "loss": 0.04132287, + "step": 10009 + }, + { + "epoch": 20.02, + "grad_norm": 1.0980284214019775, + "learning_rate": 2e-05, + "loss": 0.03450535, + "step": 10010 + }, + { + "epoch": 20.022, + "grad_norm": 0.9690496921539307, + "learning_rate": 2e-05, + "loss": 0.02480958, + "step": 10011 + }, + { + "epoch": 20.024, + "grad_norm": 3.7256979942321777, + "learning_rate": 2e-05, + "loss": 0.03763371, + "step": 10012 + }, + { + "epoch": 20.026, + "grad_norm": 0.9773795008659363, + "learning_rate": 2e-05, + "loss": 0.02543466, + "step": 10013 + }, + { + "epoch": 20.028, + "grad_norm": 1.7975311279296875, + "learning_rate": 2e-05, + "loss": 0.0339796, + "step": 10014 + }, + { + "epoch": 20.03, + "grad_norm": 1.9980261325836182, + "learning_rate": 2e-05, + "loss": 0.03168187, + "step": 10015 + }, + { + "epoch": 20.032, + "grad_norm": 1.4407600164413452, + "learning_rate": 2e-05, + "loss": 0.05185389, + "step": 10016 + }, + { + "epoch": 20.034, + "grad_norm": 1.0228748321533203, + "learning_rate": 2e-05, + "loss": 0.02575734, + "step": 10017 + }, + { + "epoch": 20.036, + "grad_norm": 1.1408131122589111, + "learning_rate": 2e-05, + "loss": 0.03414143, + "step": 10018 + }, + { + "epoch": 20.038, + "grad_norm": 1.2371625900268555, + "learning_rate": 2e-05, + "loss": 0.03162018, + "step": 10019 + }, + { + "epoch": 20.04, + "grad_norm": 0.982409656047821, + "learning_rate": 2e-05, + "loss": 0.0248183, + "step": 10020 + }, + { + "epoch": 20.042, + "grad_norm": 1.5073639154434204, + "learning_rate": 2e-05, + "loss": 0.04884845, + "step": 10021 + }, + { + "epoch": 20.044, + "grad_norm": 1.28328537940979, + "learning_rate": 2e-05, + "loss": 0.04167043, + "step": 10022 + }, + { + "epoch": 20.046, + "grad_norm": 1.0359996557235718, + "learning_rate": 2e-05, + "loss": 0.0376031, + "step": 10023 + }, + { + "epoch": 20.048, + "grad_norm": 1.150727391242981, + "learning_rate": 2e-05, + "loss": 0.02745938, + "step": 10024 + }, + { + "epoch": 20.05, + "grad_norm": 1.7619673013687134, + "learning_rate": 2e-05, + "loss": 0.04664058, + "step": 10025 + }, + { + "epoch": 20.052, + "grad_norm": 1.849218726158142, + "learning_rate": 2e-05, + "loss": 0.0393936, + "step": 10026 + }, + { + "epoch": 20.054, + "grad_norm": 1.425082802772522, + "learning_rate": 2e-05, + "loss": 0.03768519, + "step": 10027 + }, + { + "epoch": 20.056, + "grad_norm": 2.417065143585205, + "learning_rate": 2e-05, + "loss": 0.04438198, + "step": 10028 + }, + { + "epoch": 20.058, + "grad_norm": 1.103333592414856, + "learning_rate": 2e-05, + "loss": 0.0304841, + "step": 10029 + }, + { + "epoch": 20.06, + "grad_norm": 1.0776103734970093, + "learning_rate": 2e-05, + "loss": 0.03893217, + "step": 10030 + }, + { + "epoch": 20.062, + 
"grad_norm": 1.4815536737442017, + "learning_rate": 2e-05, + "loss": 0.05196786, + "step": 10031 + }, + { + "epoch": 20.064, + "grad_norm": 1.3739426136016846, + "learning_rate": 2e-05, + "loss": 0.03645986, + "step": 10032 + }, + { + "epoch": 20.066, + "grad_norm": 1.1692328453063965, + "learning_rate": 2e-05, + "loss": 0.03630241, + "step": 10033 + }, + { + "epoch": 20.068, + "grad_norm": 1.0185339450836182, + "learning_rate": 2e-05, + "loss": 0.0344325, + "step": 10034 + }, + { + "epoch": 20.07, + "grad_norm": 0.9241943359375, + "learning_rate": 2e-05, + "loss": 0.02990804, + "step": 10035 + }, + { + "epoch": 20.072, + "grad_norm": 2.4085822105407715, + "learning_rate": 2e-05, + "loss": 0.04084285, + "step": 10036 + }, + { + "epoch": 20.074, + "grad_norm": 1.6510875225067139, + "learning_rate": 2e-05, + "loss": 0.03414775, + "step": 10037 + }, + { + "epoch": 20.076, + "grad_norm": 1.0024648904800415, + "learning_rate": 2e-05, + "loss": 0.02915655, + "step": 10038 + }, + { + "epoch": 20.078, + "grad_norm": 1.1941813230514526, + "learning_rate": 2e-05, + "loss": 0.03444272, + "step": 10039 + }, + { + "epoch": 20.08, + "grad_norm": 1.5588897466659546, + "learning_rate": 2e-05, + "loss": 0.04292718, + "step": 10040 + }, + { + "epoch": 20.082, + "grad_norm": 1.1992404460906982, + "learning_rate": 2e-05, + "loss": 0.03661583, + "step": 10041 + }, + { + "epoch": 20.084, + "grad_norm": 1.6118459701538086, + "learning_rate": 2e-05, + "loss": 0.03280291, + "step": 10042 + }, + { + "epoch": 20.086, + "grad_norm": 1.184962272644043, + "learning_rate": 2e-05, + "loss": 0.03350543, + "step": 10043 + }, + { + "epoch": 20.088, + "grad_norm": 1.2698707580566406, + "learning_rate": 2e-05, + "loss": 0.03131055, + "step": 10044 + }, + { + "epoch": 20.09, + "grad_norm": 1.2608489990234375, + "learning_rate": 2e-05, + "loss": 0.04219927, + "step": 10045 + }, + { + "epoch": 20.092, + "grad_norm": 1.3150699138641357, + "learning_rate": 2e-05, + "loss": 0.04263498, + "step": 10046 + }, + { + "epoch": 20.094, + "grad_norm": 0.7644698619842529, + "learning_rate": 2e-05, + "loss": 0.01624379, + "step": 10047 + }, + { + "epoch": 20.096, + "grad_norm": 1.202438235282898, + "learning_rate": 2e-05, + "loss": 0.04117297, + "step": 10048 + }, + { + "epoch": 20.098, + "grad_norm": 1.1358537673950195, + "learning_rate": 2e-05, + "loss": 0.0320413, + "step": 10049 + }, + { + "epoch": 20.1, + "grad_norm": 1.1852686405181885, + "learning_rate": 2e-05, + "loss": 0.0407727, + "step": 10050 + }, + { + "epoch": 20.102, + "grad_norm": 1.051586389541626, + "learning_rate": 2e-05, + "loss": 0.03409174, + "step": 10051 + }, + { + "epoch": 20.104, + "grad_norm": 0.8840859532356262, + "learning_rate": 2e-05, + "loss": 0.02449731, + "step": 10052 + }, + { + "epoch": 20.106, + "grad_norm": 0.8056784868240356, + "learning_rate": 2e-05, + "loss": 0.02186309, + "step": 10053 + }, + { + "epoch": 20.108, + "grad_norm": 1.1816960573196411, + "learning_rate": 2e-05, + "loss": 0.03838727, + "step": 10054 + }, + { + "epoch": 20.11, + "grad_norm": 1.3361520767211914, + "learning_rate": 2e-05, + "loss": 0.046068, + "step": 10055 + }, + { + "epoch": 20.112, + "grad_norm": 1.7416718006134033, + "learning_rate": 2e-05, + "loss": 0.02897919, + "step": 10056 + }, + { + "epoch": 20.114, + "grad_norm": 0.8946969509124756, + "learning_rate": 2e-05, + "loss": 0.02574443, + "step": 10057 + }, + { + "epoch": 20.116, + "grad_norm": 0.9283491969108582, + "learning_rate": 2e-05, + "loss": 0.02700006, + "step": 10058 + }, + { + "epoch": 20.118, + "grad_norm": 
1.316556692123413, + "learning_rate": 2e-05, + "loss": 0.03163679, + "step": 10059 + }, + { + "epoch": 20.12, + "grad_norm": 1.7087239027023315, + "learning_rate": 2e-05, + "loss": 0.05993263, + "step": 10060 + }, + { + "epoch": 20.122, + "grad_norm": 1.1801674365997314, + "learning_rate": 2e-05, + "loss": 0.03400156, + "step": 10061 + }, + { + "epoch": 20.124, + "grad_norm": 1.5751726627349854, + "learning_rate": 2e-05, + "loss": 0.03422957, + "step": 10062 + }, + { + "epoch": 20.126, + "grad_norm": 0.9739691019058228, + "learning_rate": 2e-05, + "loss": 0.02737168, + "step": 10063 + }, + { + "epoch": 20.128, + "grad_norm": 1.8991624116897583, + "learning_rate": 2e-05, + "loss": 0.05057021, + "step": 10064 + }, + { + "epoch": 20.13, + "grad_norm": 1.7493529319763184, + "learning_rate": 2e-05, + "loss": 0.03247079, + "step": 10065 + }, + { + "epoch": 20.132, + "grad_norm": 0.9996808767318726, + "learning_rate": 2e-05, + "loss": 0.03386039, + "step": 10066 + }, + { + "epoch": 20.134, + "grad_norm": 1.3784937858581543, + "learning_rate": 2e-05, + "loss": 0.04235096, + "step": 10067 + }, + { + "epoch": 20.136, + "grad_norm": 0.9882324934005737, + "learning_rate": 2e-05, + "loss": 0.03374637, + "step": 10068 + }, + { + "epoch": 20.138, + "grad_norm": 1.1563467979431152, + "learning_rate": 2e-05, + "loss": 0.03350766, + "step": 10069 + }, + { + "epoch": 20.14, + "grad_norm": 1.2875149250030518, + "learning_rate": 2e-05, + "loss": 0.03958068, + "step": 10070 + }, + { + "epoch": 20.142, + "grad_norm": 1.2082184553146362, + "learning_rate": 2e-05, + "loss": 0.03617564, + "step": 10071 + }, + { + "epoch": 20.144, + "grad_norm": 1.5155640840530396, + "learning_rate": 2e-05, + "loss": 0.03431886, + "step": 10072 + }, + { + "epoch": 20.146, + "grad_norm": 1.241598129272461, + "learning_rate": 2e-05, + "loss": 0.05081156, + "step": 10073 + }, + { + "epoch": 20.148, + "grad_norm": 1.3844859600067139, + "learning_rate": 2e-05, + "loss": 0.04880771, + "step": 10074 + }, + { + "epoch": 20.15, + "grad_norm": 1.1351920366287231, + "learning_rate": 2e-05, + "loss": 0.04456853, + "step": 10075 + }, + { + "epoch": 20.152, + "grad_norm": 1.7876904010772705, + "learning_rate": 2e-05, + "loss": 0.04733547, + "step": 10076 + }, + { + "epoch": 20.154, + "grad_norm": 1.676299810409546, + "learning_rate": 2e-05, + "loss": 0.04053468, + "step": 10077 + }, + { + "epoch": 20.156, + "grad_norm": 0.9991481304168701, + "learning_rate": 2e-05, + "loss": 0.03337016, + "step": 10078 + }, + { + "epoch": 20.158, + "grad_norm": 1.0327903032302856, + "learning_rate": 2e-05, + "loss": 0.03195174, + "step": 10079 + }, + { + "epoch": 20.16, + "grad_norm": 1.0681126117706299, + "learning_rate": 2e-05, + "loss": 0.03106238, + "step": 10080 + }, + { + "epoch": 20.162, + "grad_norm": 2.461606979370117, + "learning_rate": 2e-05, + "loss": 0.04076942, + "step": 10081 + }, + { + "epoch": 20.164, + "grad_norm": 1.4891592264175415, + "learning_rate": 2e-05, + "loss": 0.0378016, + "step": 10082 + }, + { + "epoch": 20.166, + "grad_norm": 2.7966361045837402, + "learning_rate": 2e-05, + "loss": 0.04116115, + "step": 10083 + }, + { + "epoch": 20.168, + "grad_norm": 1.0875968933105469, + "learning_rate": 2e-05, + "loss": 0.03374017, + "step": 10084 + }, + { + "epoch": 20.17, + "grad_norm": 1.6224273443222046, + "learning_rate": 2e-05, + "loss": 0.04455825, + "step": 10085 + }, + { + "epoch": 20.172, + "grad_norm": 1.2282185554504395, + "learning_rate": 2e-05, + "loss": 0.03824051, + "step": 10086 + }, + { + "epoch": 20.174, + "grad_norm": 
1.5995073318481445, + "learning_rate": 2e-05, + "loss": 0.03549742, + "step": 10087 + }, + { + "epoch": 20.176, + "grad_norm": 1.2183595895767212, + "learning_rate": 2e-05, + "loss": 0.03944673, + "step": 10088 + }, + { + "epoch": 20.178, + "grad_norm": 1.4532480239868164, + "learning_rate": 2e-05, + "loss": 0.03099249, + "step": 10089 + }, + { + "epoch": 20.18, + "grad_norm": 1.6714351177215576, + "learning_rate": 2e-05, + "loss": 0.05370768, + "step": 10090 + }, + { + "epoch": 20.182, + "grad_norm": 1.2066564559936523, + "learning_rate": 2e-05, + "loss": 0.03310648, + "step": 10091 + }, + { + "epoch": 20.184, + "grad_norm": 1.9016071557998657, + "learning_rate": 2e-05, + "loss": 0.06958458, + "step": 10092 + }, + { + "epoch": 20.186, + "grad_norm": 1.0576119422912598, + "learning_rate": 2e-05, + "loss": 0.03138105, + "step": 10093 + }, + { + "epoch": 20.188, + "grad_norm": 1.632576823234558, + "learning_rate": 2e-05, + "loss": 0.05661856, + "step": 10094 + }, + { + "epoch": 20.19, + "grad_norm": 1.3058475255966187, + "learning_rate": 2e-05, + "loss": 0.02934664, + "step": 10095 + }, + { + "epoch": 20.192, + "grad_norm": 1.5141475200653076, + "learning_rate": 2e-05, + "loss": 0.04773264, + "step": 10096 + }, + { + "epoch": 20.194, + "grad_norm": 1.2892210483551025, + "learning_rate": 2e-05, + "loss": 0.0409392, + "step": 10097 + }, + { + "epoch": 20.196, + "grad_norm": 1.0145940780639648, + "learning_rate": 2e-05, + "loss": 0.03348306, + "step": 10098 + }, + { + "epoch": 20.198, + "grad_norm": 1.3150386810302734, + "learning_rate": 2e-05, + "loss": 0.04228361, + "step": 10099 + }, + { + "epoch": 20.2, + "grad_norm": 1.4747895002365112, + "learning_rate": 2e-05, + "loss": 0.04473066, + "step": 10100 + }, + { + "epoch": 20.202, + "grad_norm": 1.3429152965545654, + "learning_rate": 2e-05, + "loss": 0.03698963, + "step": 10101 + }, + { + "epoch": 20.204, + "grad_norm": 1.5666682720184326, + "learning_rate": 2e-05, + "loss": 0.03814546, + "step": 10102 + }, + { + "epoch": 20.206, + "grad_norm": 1.1224846839904785, + "learning_rate": 2e-05, + "loss": 0.03632937, + "step": 10103 + }, + { + "epoch": 20.208, + "grad_norm": 1.2551809549331665, + "learning_rate": 2e-05, + "loss": 0.05948218, + "step": 10104 + }, + { + "epoch": 20.21, + "grad_norm": 1.0920709371566772, + "learning_rate": 2e-05, + "loss": 0.03180977, + "step": 10105 + }, + { + "epoch": 20.212, + "grad_norm": 1.1358767747879028, + "learning_rate": 2e-05, + "loss": 0.05127998, + "step": 10106 + }, + { + "epoch": 20.214, + "grad_norm": 1.1630680561065674, + "learning_rate": 2e-05, + "loss": 0.03373503, + "step": 10107 + }, + { + "epoch": 20.216, + "grad_norm": 1.0587674379348755, + "learning_rate": 2e-05, + "loss": 0.03114433, + "step": 10108 + }, + { + "epoch": 20.218, + "grad_norm": 2.387716054916382, + "learning_rate": 2e-05, + "loss": 0.05491668, + "step": 10109 + }, + { + "epoch": 20.22, + "grad_norm": 1.344147801399231, + "learning_rate": 2e-05, + "loss": 0.03537369, + "step": 10110 + }, + { + "epoch": 20.222, + "grad_norm": 1.4900367259979248, + "learning_rate": 2e-05, + "loss": 0.04168044, + "step": 10111 + }, + { + "epoch": 20.224, + "grad_norm": 1.3198118209838867, + "learning_rate": 2e-05, + "loss": 0.03236744, + "step": 10112 + }, + { + "epoch": 20.226, + "grad_norm": 1.05833101272583, + "learning_rate": 2e-05, + "loss": 0.03506115, + "step": 10113 + }, + { + "epoch": 20.228, + "grad_norm": 3.1193032264709473, + "learning_rate": 2e-05, + "loss": 0.06916459, + "step": 10114 + }, + { + "epoch": 20.23, + "grad_norm": 
0.9400885105133057, + "learning_rate": 2e-05, + "loss": 0.02644516, + "step": 10115 + }, + { + "epoch": 20.232, + "grad_norm": 1.744911551475525, + "learning_rate": 2e-05, + "loss": 0.03334912, + "step": 10116 + }, + { + "epoch": 20.234, + "grad_norm": 1.1233267784118652, + "learning_rate": 2e-05, + "loss": 0.02539495, + "step": 10117 + }, + { + "epoch": 20.236, + "grad_norm": 0.8864102959632874, + "learning_rate": 2e-05, + "loss": 0.02736823, + "step": 10118 + }, + { + "epoch": 20.238, + "grad_norm": 0.9077542424201965, + "learning_rate": 2e-05, + "loss": 0.02917507, + "step": 10119 + }, + { + "epoch": 20.24, + "grad_norm": 1.7056787014007568, + "learning_rate": 2e-05, + "loss": 0.04501066, + "step": 10120 + }, + { + "epoch": 20.242, + "grad_norm": 1.0653247833251953, + "learning_rate": 2e-05, + "loss": 0.03258765, + "step": 10121 + }, + { + "epoch": 20.244, + "grad_norm": 1.5916606187820435, + "learning_rate": 2e-05, + "loss": 0.03004246, + "step": 10122 + }, + { + "epoch": 20.246, + "grad_norm": 0.992498517036438, + "learning_rate": 2e-05, + "loss": 0.03313632, + "step": 10123 + }, + { + "epoch": 20.248, + "grad_norm": 1.408166527748108, + "learning_rate": 2e-05, + "loss": 0.03982632, + "step": 10124 + }, + { + "epoch": 20.25, + "grad_norm": 2.0703201293945312, + "learning_rate": 2e-05, + "loss": 0.03419951, + "step": 10125 + }, + { + "epoch": 20.252, + "grad_norm": 2.027787446975708, + "learning_rate": 2e-05, + "loss": 0.04078956, + "step": 10126 + }, + { + "epoch": 20.254, + "grad_norm": 1.7033048868179321, + "learning_rate": 2e-05, + "loss": 0.04830636, + "step": 10127 + }, + { + "epoch": 20.256, + "grad_norm": 1.3836185932159424, + "learning_rate": 2e-05, + "loss": 0.05766873, + "step": 10128 + }, + { + "epoch": 20.258, + "grad_norm": 0.874599039554596, + "learning_rate": 2e-05, + "loss": 0.02754955, + "step": 10129 + }, + { + "epoch": 20.26, + "grad_norm": 1.280648946762085, + "learning_rate": 2e-05, + "loss": 0.03249458, + "step": 10130 + }, + { + "epoch": 20.262, + "grad_norm": 3.8084380626678467, + "learning_rate": 2e-05, + "loss": 0.03238885, + "step": 10131 + }, + { + "epoch": 20.264, + "grad_norm": 1.1068285703659058, + "learning_rate": 2e-05, + "loss": 0.03581792, + "step": 10132 + }, + { + "epoch": 20.266, + "grad_norm": 1.4236106872558594, + "learning_rate": 2e-05, + "loss": 0.03960142, + "step": 10133 + }, + { + "epoch": 20.268, + "grad_norm": 1.674912452697754, + "learning_rate": 2e-05, + "loss": 0.04210643, + "step": 10134 + }, + { + "epoch": 20.27, + "grad_norm": 1.6190119981765747, + "learning_rate": 2e-05, + "loss": 0.03533464, + "step": 10135 + }, + { + "epoch": 20.272, + "grad_norm": 1.0870835781097412, + "learning_rate": 2e-05, + "loss": 0.03944698, + "step": 10136 + }, + { + "epoch": 20.274, + "grad_norm": 1.3776017427444458, + "learning_rate": 2e-05, + "loss": 0.03751071, + "step": 10137 + }, + { + "epoch": 20.276, + "grad_norm": 1.1693474054336548, + "learning_rate": 2e-05, + "loss": 0.03089845, + "step": 10138 + }, + { + "epoch": 20.278, + "grad_norm": 1.5422534942626953, + "learning_rate": 2e-05, + "loss": 0.0364728, + "step": 10139 + }, + { + "epoch": 20.28, + "grad_norm": 1.130757212638855, + "learning_rate": 2e-05, + "loss": 0.03968713, + "step": 10140 + }, + { + "epoch": 20.282, + "grad_norm": 1.601479172706604, + "learning_rate": 2e-05, + "loss": 0.0454113, + "step": 10141 + }, + { + "epoch": 20.284, + "grad_norm": 1.0961538553237915, + "learning_rate": 2e-05, + "loss": 0.03847424, + "step": 10142 + }, + { + "epoch": 20.286, + "grad_norm": 
1.3003602027893066, + "learning_rate": 2e-05, + "loss": 0.03098981, + "step": 10143 + }, + { + "epoch": 20.288, + "grad_norm": 1.9535208940505981, + "learning_rate": 2e-05, + "loss": 0.03620074, + "step": 10144 + }, + { + "epoch": 20.29, + "grad_norm": 1.0742915868759155, + "learning_rate": 2e-05, + "loss": 0.03768191, + "step": 10145 + }, + { + "epoch": 20.292, + "grad_norm": 0.8740436434745789, + "learning_rate": 2e-05, + "loss": 0.0226828, + "step": 10146 + }, + { + "epoch": 20.294, + "grad_norm": 0.9478847980499268, + "learning_rate": 2e-05, + "loss": 0.03354997, + "step": 10147 + }, + { + "epoch": 20.296, + "grad_norm": 1.195736289024353, + "learning_rate": 2e-05, + "loss": 0.03266739, + "step": 10148 + }, + { + "epoch": 20.298, + "grad_norm": 1.1402510404586792, + "learning_rate": 2e-05, + "loss": 0.03164797, + "step": 10149 + }, + { + "epoch": 20.3, + "grad_norm": 0.9935315847396851, + "learning_rate": 2e-05, + "loss": 0.0263091, + "step": 10150 + }, + { + "epoch": 20.302, + "grad_norm": 1.2046669721603394, + "learning_rate": 2e-05, + "loss": 0.03377987, + "step": 10151 + }, + { + "epoch": 20.304, + "grad_norm": 1.2782913446426392, + "learning_rate": 2e-05, + "loss": 0.03555286, + "step": 10152 + }, + { + "epoch": 20.306, + "grad_norm": 1.155943512916565, + "learning_rate": 2e-05, + "loss": 0.03053265, + "step": 10153 + }, + { + "epoch": 20.308, + "grad_norm": 1.3335613012313843, + "learning_rate": 2e-05, + "loss": 0.02797567, + "step": 10154 + }, + { + "epoch": 20.31, + "grad_norm": 1.3252575397491455, + "learning_rate": 2e-05, + "loss": 0.04167391, + "step": 10155 + }, + { + "epoch": 20.312, + "grad_norm": 1.257401704788208, + "learning_rate": 2e-05, + "loss": 0.04301036, + "step": 10156 + }, + { + "epoch": 20.314, + "grad_norm": 1.2897112369537354, + "learning_rate": 2e-05, + "loss": 0.02559963, + "step": 10157 + }, + { + "epoch": 20.316, + "grad_norm": 1.0978411436080933, + "learning_rate": 2e-05, + "loss": 0.03113418, + "step": 10158 + }, + { + "epoch": 20.318, + "grad_norm": 0.9940638542175293, + "learning_rate": 2e-05, + "loss": 0.02741957, + "step": 10159 + }, + { + "epoch": 20.32, + "grad_norm": 1.1258987188339233, + "learning_rate": 2e-05, + "loss": 0.03458284, + "step": 10160 + }, + { + "epoch": 20.322, + "grad_norm": 1.3880150318145752, + "learning_rate": 2e-05, + "loss": 0.02557427, + "step": 10161 + }, + { + "epoch": 20.324, + "grad_norm": 0.89255690574646, + "learning_rate": 2e-05, + "loss": 0.02698753, + "step": 10162 + }, + { + "epoch": 20.326, + "grad_norm": 0.9891429543495178, + "learning_rate": 2e-05, + "loss": 0.03858247, + "step": 10163 + }, + { + "epoch": 20.328, + "grad_norm": 3.275198459625244, + "learning_rate": 2e-05, + "loss": 0.04408592, + "step": 10164 + }, + { + "epoch": 20.33, + "grad_norm": 2.566772699356079, + "learning_rate": 2e-05, + "loss": 0.04836412, + "step": 10165 + }, + { + "epoch": 20.332, + "grad_norm": 1.2465749979019165, + "learning_rate": 2e-05, + "loss": 0.03609654, + "step": 10166 + }, + { + "epoch": 20.334, + "grad_norm": 0.8518936634063721, + "learning_rate": 2e-05, + "loss": 0.02846691, + "step": 10167 + }, + { + "epoch": 20.336, + "grad_norm": 0.8058186769485474, + "learning_rate": 2e-05, + "loss": 0.02809364, + "step": 10168 + }, + { + "epoch": 20.338, + "grad_norm": 1.2480473518371582, + "learning_rate": 2e-05, + "loss": 0.03842864, + "step": 10169 + }, + { + "epoch": 20.34, + "grad_norm": 1.6874932050704956, + "learning_rate": 2e-05, + "loss": 0.02990905, + "step": 10170 + }, + { + "epoch": 20.342, + "grad_norm": 
1.2645959854125977, + "learning_rate": 2e-05, + "loss": 0.02560621, + "step": 10171 + }, + { + "epoch": 20.344, + "grad_norm": 1.209860920906067, + "learning_rate": 2e-05, + "loss": 0.04289609, + "step": 10172 + }, + { + "epoch": 20.346, + "grad_norm": 0.9277303814888, + "learning_rate": 2e-05, + "loss": 0.02414119, + "step": 10173 + }, + { + "epoch": 20.348, + "grad_norm": 1.4776111841201782, + "learning_rate": 2e-05, + "loss": 0.03888512, + "step": 10174 + }, + { + "epoch": 20.35, + "grad_norm": 10.23653507232666, + "learning_rate": 2e-05, + "loss": 0.04151426, + "step": 10175 + }, + { + "epoch": 20.352, + "grad_norm": 1.4618253707885742, + "learning_rate": 2e-05, + "loss": 0.04169954, + "step": 10176 + }, + { + "epoch": 20.354, + "grad_norm": 1.517190933227539, + "learning_rate": 2e-05, + "loss": 0.03129986, + "step": 10177 + }, + { + "epoch": 20.356, + "grad_norm": 1.265041470527649, + "learning_rate": 2e-05, + "loss": 0.04625013, + "step": 10178 + }, + { + "epoch": 20.358, + "grad_norm": 0.9929906725883484, + "learning_rate": 2e-05, + "loss": 0.03318879, + "step": 10179 + }, + { + "epoch": 20.36, + "grad_norm": 1.1843352317810059, + "learning_rate": 2e-05, + "loss": 0.02937111, + "step": 10180 + }, + { + "epoch": 20.362, + "grad_norm": 3.79577374458313, + "learning_rate": 2e-05, + "loss": 0.04123313, + "step": 10181 + }, + { + "epoch": 20.364, + "grad_norm": 1.2206071615219116, + "learning_rate": 2e-05, + "loss": 0.03874102, + "step": 10182 + }, + { + "epoch": 20.366, + "grad_norm": 1.6392779350280762, + "learning_rate": 2e-05, + "loss": 0.03447268, + "step": 10183 + }, + { + "epoch": 20.368, + "grad_norm": 1.166060447692871, + "learning_rate": 2e-05, + "loss": 0.04190687, + "step": 10184 + }, + { + "epoch": 20.37, + "grad_norm": 1.3748960494995117, + "learning_rate": 2e-05, + "loss": 0.05042437, + "step": 10185 + }, + { + "epoch": 20.372, + "grad_norm": 1.016833782196045, + "learning_rate": 2e-05, + "loss": 0.0323528, + "step": 10186 + }, + { + "epoch": 20.374, + "grad_norm": 1.248270869255066, + "learning_rate": 2e-05, + "loss": 0.03355975, + "step": 10187 + }, + { + "epoch": 20.376, + "grad_norm": 2.965949058532715, + "learning_rate": 2e-05, + "loss": 0.03507435, + "step": 10188 + }, + { + "epoch": 20.378, + "grad_norm": 0.9657095074653625, + "learning_rate": 2e-05, + "loss": 0.03347619, + "step": 10189 + }, + { + "epoch": 20.38, + "grad_norm": 1.3561325073242188, + "learning_rate": 2e-05, + "loss": 0.03161833, + "step": 10190 + }, + { + "epoch": 20.382, + "grad_norm": 1.271353006362915, + "learning_rate": 2e-05, + "loss": 0.04460255, + "step": 10191 + }, + { + "epoch": 20.384, + "grad_norm": 1.1262338161468506, + "learning_rate": 2e-05, + "loss": 0.03606068, + "step": 10192 + }, + { + "epoch": 20.386, + "grad_norm": 1.0850212574005127, + "learning_rate": 2e-05, + "loss": 0.03550104, + "step": 10193 + }, + { + "epoch": 20.388, + "grad_norm": 1.1879736185073853, + "learning_rate": 2e-05, + "loss": 0.03706096, + "step": 10194 + }, + { + "epoch": 20.39, + "grad_norm": 0.8170276284217834, + "learning_rate": 2e-05, + "loss": 0.02326233, + "step": 10195 + }, + { + "epoch": 20.392, + "grad_norm": 1.2504932880401611, + "learning_rate": 2e-05, + "loss": 0.0375908, + "step": 10196 + }, + { + "epoch": 20.394, + "grad_norm": 2.0169646739959717, + "learning_rate": 2e-05, + "loss": 0.03000656, + "step": 10197 + }, + { + "epoch": 20.396, + "grad_norm": 1.223647952079773, + "learning_rate": 2e-05, + "loss": 0.03923958, + "step": 10198 + }, + { + "epoch": 20.398, + "grad_norm": 1.207067847251892, + 
"learning_rate": 2e-05, + "loss": 0.03308685, + "step": 10199 + }, + { + "epoch": 20.4, + "grad_norm": 1.9212299585342407, + "learning_rate": 2e-05, + "loss": 0.03529272, + "step": 10200 + }, + { + "epoch": 20.402, + "grad_norm": 2.4769113063812256, + "learning_rate": 2e-05, + "loss": 0.03606252, + "step": 10201 + }, + { + "epoch": 20.404, + "grad_norm": 1.2097878456115723, + "learning_rate": 2e-05, + "loss": 0.03837938, + "step": 10202 + }, + { + "epoch": 20.406, + "grad_norm": 1.7725907564163208, + "learning_rate": 2e-05, + "loss": 0.03187569, + "step": 10203 + }, + { + "epoch": 20.408, + "grad_norm": 1.7797393798828125, + "learning_rate": 2e-05, + "loss": 0.04215318, + "step": 10204 + }, + { + "epoch": 20.41, + "grad_norm": 1.4638700485229492, + "learning_rate": 2e-05, + "loss": 0.04161576, + "step": 10205 + }, + { + "epoch": 20.412, + "grad_norm": 1.297315001487732, + "learning_rate": 2e-05, + "loss": 0.03515932, + "step": 10206 + }, + { + "epoch": 20.414, + "grad_norm": 1.8025538921356201, + "learning_rate": 2e-05, + "loss": 0.03362041, + "step": 10207 + }, + { + "epoch": 20.416, + "grad_norm": 1.6176317930221558, + "learning_rate": 2e-05, + "loss": 0.040805, + "step": 10208 + }, + { + "epoch": 20.418, + "grad_norm": 0.9706657528877258, + "learning_rate": 2e-05, + "loss": 0.02987682, + "step": 10209 + }, + { + "epoch": 20.42, + "grad_norm": 1.238422155380249, + "learning_rate": 2e-05, + "loss": 0.03280898, + "step": 10210 + }, + { + "epoch": 20.422, + "grad_norm": 1.4637218713760376, + "learning_rate": 2e-05, + "loss": 0.03068679, + "step": 10211 + }, + { + "epoch": 20.424, + "grad_norm": 2.2752456665039062, + "learning_rate": 2e-05, + "loss": 0.04319573, + "step": 10212 + }, + { + "epoch": 20.426, + "grad_norm": 0.9194338917732239, + "learning_rate": 2e-05, + "loss": 0.03249656, + "step": 10213 + }, + { + "epoch": 20.428, + "grad_norm": 1.2353630065917969, + "learning_rate": 2e-05, + "loss": 0.02999087, + "step": 10214 + }, + { + "epoch": 20.43, + "grad_norm": 1.2627156972885132, + "learning_rate": 2e-05, + "loss": 0.04242412, + "step": 10215 + }, + { + "epoch": 20.432, + "grad_norm": 1.1771798133850098, + "learning_rate": 2e-05, + "loss": 0.04588514, + "step": 10216 + }, + { + "epoch": 20.434, + "grad_norm": 1.0947905778884888, + "learning_rate": 2e-05, + "loss": 0.03065883, + "step": 10217 + }, + { + "epoch": 20.436, + "grad_norm": 1.1154054403305054, + "learning_rate": 2e-05, + "loss": 0.02652307, + "step": 10218 + }, + { + "epoch": 20.438, + "grad_norm": 1.1405092477798462, + "learning_rate": 2e-05, + "loss": 0.01361668, + "step": 10219 + }, + { + "epoch": 20.44, + "grad_norm": 1.1158807277679443, + "learning_rate": 2e-05, + "loss": 0.02150419, + "step": 10220 + }, + { + "epoch": 20.442, + "grad_norm": 1.2909730672836304, + "learning_rate": 2e-05, + "loss": 0.03332622, + "step": 10221 + }, + { + "epoch": 20.444, + "grad_norm": 1.8945097923278809, + "learning_rate": 2e-05, + "loss": 0.05206165, + "step": 10222 + }, + { + "epoch": 20.446, + "grad_norm": 1.1864949464797974, + "learning_rate": 2e-05, + "loss": 0.02879405, + "step": 10223 + }, + { + "epoch": 20.448, + "grad_norm": 1.5208016633987427, + "learning_rate": 2e-05, + "loss": 0.04240531, + "step": 10224 + }, + { + "epoch": 20.45, + "grad_norm": 1.8025298118591309, + "learning_rate": 2e-05, + "loss": 0.04271449, + "step": 10225 + }, + { + "epoch": 20.452, + "grad_norm": 1.7837144136428833, + "learning_rate": 2e-05, + "loss": 0.0441253, + "step": 10226 + }, + { + "epoch": 20.454, + "grad_norm": 1.146601915359497, + 
"learning_rate": 2e-05, + "loss": 0.03485624, + "step": 10227 + }, + { + "epoch": 20.456, + "grad_norm": 0.8954287171363831, + "learning_rate": 2e-05, + "loss": 0.02285667, + "step": 10228 + }, + { + "epoch": 20.458, + "grad_norm": 2.9269511699676514, + "learning_rate": 2e-05, + "loss": 0.03113767, + "step": 10229 + }, + { + "epoch": 20.46, + "grad_norm": 0.9090030193328857, + "learning_rate": 2e-05, + "loss": 0.02447719, + "step": 10230 + }, + { + "epoch": 20.462, + "grad_norm": 1.004408836364746, + "learning_rate": 2e-05, + "loss": 0.03344005, + "step": 10231 + }, + { + "epoch": 20.464, + "grad_norm": 1.294810175895691, + "learning_rate": 2e-05, + "loss": 0.02832571, + "step": 10232 + }, + { + "epoch": 20.466, + "grad_norm": 1.3520246744155884, + "learning_rate": 2e-05, + "loss": 0.05982265, + "step": 10233 + }, + { + "epoch": 20.468, + "grad_norm": 1.4492565393447876, + "learning_rate": 2e-05, + "loss": 0.04218536, + "step": 10234 + }, + { + "epoch": 20.47, + "grad_norm": 2.6897168159484863, + "learning_rate": 2e-05, + "loss": 0.04501551, + "step": 10235 + }, + { + "epoch": 20.472, + "grad_norm": 1.8297394514083862, + "learning_rate": 2e-05, + "loss": 0.03225635, + "step": 10236 + }, + { + "epoch": 20.474, + "grad_norm": 1.2002131938934326, + "learning_rate": 2e-05, + "loss": 0.03257868, + "step": 10237 + }, + { + "epoch": 20.476, + "grad_norm": 1.413238525390625, + "learning_rate": 2e-05, + "loss": 0.04716073, + "step": 10238 + }, + { + "epoch": 20.478, + "grad_norm": 1.317274808883667, + "learning_rate": 2e-05, + "loss": 0.04736836, + "step": 10239 + }, + { + "epoch": 20.48, + "grad_norm": 1.25471031665802, + "learning_rate": 2e-05, + "loss": 0.03034903, + "step": 10240 + }, + { + "epoch": 20.482, + "grad_norm": 0.9913167357444763, + "learning_rate": 2e-05, + "loss": 0.02995886, + "step": 10241 + }, + { + "epoch": 20.484, + "grad_norm": 1.0410956144332886, + "learning_rate": 2e-05, + "loss": 0.03229685, + "step": 10242 + }, + { + "epoch": 20.486, + "grad_norm": 1.0938940048217773, + "learning_rate": 2e-05, + "loss": 0.02566743, + "step": 10243 + }, + { + "epoch": 20.488, + "grad_norm": 1.5702996253967285, + "learning_rate": 2e-05, + "loss": 0.03801944, + "step": 10244 + }, + { + "epoch": 20.49, + "grad_norm": 1.0842231512069702, + "learning_rate": 2e-05, + "loss": 0.03229133, + "step": 10245 + }, + { + "epoch": 20.492, + "grad_norm": 4.291932106018066, + "learning_rate": 2e-05, + "loss": 0.03782872, + "step": 10246 + }, + { + "epoch": 20.494, + "grad_norm": 1.3879879713058472, + "learning_rate": 2e-05, + "loss": 0.05402232, + "step": 10247 + }, + { + "epoch": 20.496, + "grad_norm": 1.004504919052124, + "learning_rate": 2e-05, + "loss": 0.03432404, + "step": 10248 + }, + { + "epoch": 20.498, + "grad_norm": 0.948047399520874, + "learning_rate": 2e-05, + "loss": 0.02306466, + "step": 10249 + }, + { + "epoch": 20.5, + "grad_norm": 1.270635724067688, + "learning_rate": 2e-05, + "loss": 0.03679778, + "step": 10250 + }, + { + "epoch": 20.502, + "grad_norm": 1.9827309846878052, + "learning_rate": 2e-05, + "loss": 0.04181923, + "step": 10251 + }, + { + "epoch": 20.504, + "grad_norm": 1.788630485534668, + "learning_rate": 2e-05, + "loss": 0.03067524, + "step": 10252 + }, + { + "epoch": 20.506, + "grad_norm": 1.5751676559448242, + "learning_rate": 2e-05, + "loss": 0.04409302, + "step": 10253 + }, + { + "epoch": 20.508, + "grad_norm": 1.0470032691955566, + "learning_rate": 2e-05, + "loss": 0.02919145, + "step": 10254 + }, + { + "epoch": 20.51, + "grad_norm": 1.5532153844833374, + "learning_rate": 
2e-05, + "loss": 0.0342193, + "step": 10255 + }, + { + "epoch": 20.512, + "grad_norm": 1.2492958307266235, + "learning_rate": 2e-05, + "loss": 0.03348667, + "step": 10256 + }, + { + "epoch": 20.514, + "grad_norm": 1.015598177909851, + "learning_rate": 2e-05, + "loss": 0.02103408, + "step": 10257 + }, + { + "epoch": 20.516, + "grad_norm": 1.0364876985549927, + "learning_rate": 2e-05, + "loss": 0.03669195, + "step": 10258 + }, + { + "epoch": 20.518, + "grad_norm": 1.7574011087417603, + "learning_rate": 2e-05, + "loss": 0.02889457, + "step": 10259 + }, + { + "epoch": 20.52, + "grad_norm": 2.7406435012817383, + "learning_rate": 2e-05, + "loss": 0.03528633, + "step": 10260 + }, + { + "epoch": 20.522, + "grad_norm": 1.9497075080871582, + "learning_rate": 2e-05, + "loss": 0.03676779, + "step": 10261 + }, + { + "epoch": 20.524, + "grad_norm": 1.2621787786483765, + "learning_rate": 2e-05, + "loss": 0.03754564, + "step": 10262 + }, + { + "epoch": 20.526, + "grad_norm": 1.5117218494415283, + "learning_rate": 2e-05, + "loss": 0.03292262, + "step": 10263 + }, + { + "epoch": 20.528, + "grad_norm": 0.9753475785255432, + "learning_rate": 2e-05, + "loss": 0.02898685, + "step": 10264 + }, + { + "epoch": 20.53, + "grad_norm": 1.2703803777694702, + "learning_rate": 2e-05, + "loss": 0.03447834, + "step": 10265 + }, + { + "epoch": 20.532, + "grad_norm": 1.037750244140625, + "learning_rate": 2e-05, + "loss": 0.02343212, + "step": 10266 + }, + { + "epoch": 20.534, + "grad_norm": 1.8119739294052124, + "learning_rate": 2e-05, + "loss": 0.0345272, + "step": 10267 + }, + { + "epoch": 20.536, + "grad_norm": 0.9879088401794434, + "learning_rate": 2e-05, + "loss": 0.03324077, + "step": 10268 + }, + { + "epoch": 20.538, + "grad_norm": 1.1989351511001587, + "learning_rate": 2e-05, + "loss": 0.04151801, + "step": 10269 + }, + { + "epoch": 20.54, + "grad_norm": 1.0397365093231201, + "learning_rate": 2e-05, + "loss": 0.03603576, + "step": 10270 + }, + { + "epoch": 20.542, + "grad_norm": 3.7377114295959473, + "learning_rate": 2e-05, + "loss": 0.03584925, + "step": 10271 + }, + { + "epoch": 20.544, + "grad_norm": 0.9478249549865723, + "learning_rate": 2e-05, + "loss": 0.02031682, + "step": 10272 + }, + { + "epoch": 20.546, + "grad_norm": 1.2334407567977905, + "learning_rate": 2e-05, + "loss": 0.04748309, + "step": 10273 + }, + { + "epoch": 20.548000000000002, + "grad_norm": 2.86969256401062, + "learning_rate": 2e-05, + "loss": 0.04576512, + "step": 10274 + }, + { + "epoch": 20.55, + "grad_norm": 1.3003041744232178, + "learning_rate": 2e-05, + "loss": 0.03236845, + "step": 10275 + }, + { + "epoch": 20.552, + "grad_norm": 2.296776533126831, + "learning_rate": 2e-05, + "loss": 0.04013178, + "step": 10276 + }, + { + "epoch": 20.554, + "grad_norm": 1.155997395515442, + "learning_rate": 2e-05, + "loss": 0.03783529, + "step": 10277 + }, + { + "epoch": 20.556, + "grad_norm": 1.140368938446045, + "learning_rate": 2e-05, + "loss": 0.03774282, + "step": 10278 + }, + { + "epoch": 20.558, + "grad_norm": 1.4872280359268188, + "learning_rate": 2e-05, + "loss": 0.03347996, + "step": 10279 + }, + { + "epoch": 20.56, + "grad_norm": 2.4217143058776855, + "learning_rate": 2e-05, + "loss": 0.0531043, + "step": 10280 + }, + { + "epoch": 20.562, + "grad_norm": 1.4787334203720093, + "learning_rate": 2e-05, + "loss": 0.03938647, + "step": 10281 + }, + { + "epoch": 20.564, + "grad_norm": 1.2541735172271729, + "learning_rate": 2e-05, + "loss": 0.03302208, + "step": 10282 + }, + { + "epoch": 20.566, + "grad_norm": 1.1605701446533203, + "learning_rate": 
2e-05, + "loss": 0.03414555, + "step": 10283 + }, + { + "epoch": 20.568, + "grad_norm": 2.4995267391204834, + "learning_rate": 2e-05, + "loss": 0.03342891, + "step": 10284 + }, + { + "epoch": 20.57, + "grad_norm": 1.6872836351394653, + "learning_rate": 2e-05, + "loss": 0.0238953, + "step": 10285 + }, + { + "epoch": 20.572, + "grad_norm": 1.2707072496414185, + "learning_rate": 2e-05, + "loss": 0.03610934, + "step": 10286 + }, + { + "epoch": 20.574, + "grad_norm": 1.908408761024475, + "learning_rate": 2e-05, + "loss": 0.02757995, + "step": 10287 + }, + { + "epoch": 20.576, + "grad_norm": 0.9678316712379456, + "learning_rate": 2e-05, + "loss": 0.02639127, + "step": 10288 + }, + { + "epoch": 20.578, + "grad_norm": 1.0835262537002563, + "learning_rate": 2e-05, + "loss": 0.03382458, + "step": 10289 + }, + { + "epoch": 20.58, + "grad_norm": 1.21681547164917, + "learning_rate": 2e-05, + "loss": 0.02967013, + "step": 10290 + }, + { + "epoch": 20.582, + "grad_norm": 1.723157286643982, + "learning_rate": 2e-05, + "loss": 0.03698352, + "step": 10291 + }, + { + "epoch": 20.584, + "grad_norm": 1.0680166482925415, + "learning_rate": 2e-05, + "loss": 0.02927051, + "step": 10292 + }, + { + "epoch": 20.586, + "grad_norm": 0.9486793279647827, + "learning_rate": 2e-05, + "loss": 0.0270779, + "step": 10293 + }, + { + "epoch": 20.588, + "grad_norm": 2.921025037765503, + "learning_rate": 2e-05, + "loss": 0.04973118, + "step": 10294 + }, + { + "epoch": 20.59, + "grad_norm": 1.2367773056030273, + "learning_rate": 2e-05, + "loss": 0.04252058, + "step": 10295 + }, + { + "epoch": 20.592, + "grad_norm": 1.1579737663269043, + "learning_rate": 2e-05, + "loss": 0.03316213, + "step": 10296 + }, + { + "epoch": 20.594, + "grad_norm": 2.2136647701263428, + "learning_rate": 2e-05, + "loss": 0.04563174, + "step": 10297 + }, + { + "epoch": 20.596, + "grad_norm": 1.8893747329711914, + "learning_rate": 2e-05, + "loss": 0.03102884, + "step": 10298 + }, + { + "epoch": 20.598, + "grad_norm": 2.0692083835601807, + "learning_rate": 2e-05, + "loss": 0.05330367, + "step": 10299 + }, + { + "epoch": 20.6, + "grad_norm": 0.9594127535820007, + "learning_rate": 2e-05, + "loss": 0.02779751, + "step": 10300 + }, + { + "epoch": 20.602, + "grad_norm": 2.0204217433929443, + "learning_rate": 2e-05, + "loss": 0.05240842, + "step": 10301 + }, + { + "epoch": 20.604, + "grad_norm": 1.9438049793243408, + "learning_rate": 2e-05, + "loss": 0.0536239, + "step": 10302 + }, + { + "epoch": 20.606, + "grad_norm": 1.0696160793304443, + "learning_rate": 2e-05, + "loss": 0.02945369, + "step": 10303 + }, + { + "epoch": 20.608, + "grad_norm": 1.0507266521453857, + "learning_rate": 2e-05, + "loss": 0.0354307, + "step": 10304 + }, + { + "epoch": 20.61, + "grad_norm": 1.37172269821167, + "learning_rate": 2e-05, + "loss": 0.04699483, + "step": 10305 + }, + { + "epoch": 20.612, + "grad_norm": 1.324902057647705, + "learning_rate": 2e-05, + "loss": 0.02831972, + "step": 10306 + }, + { + "epoch": 20.614, + "grad_norm": 1.9394724369049072, + "learning_rate": 2e-05, + "loss": 0.03686791, + "step": 10307 + }, + { + "epoch": 20.616, + "grad_norm": 1.221810221672058, + "learning_rate": 2e-05, + "loss": 0.04077676, + "step": 10308 + }, + { + "epoch": 20.618, + "grad_norm": 1.411195993423462, + "learning_rate": 2e-05, + "loss": 0.02411193, + "step": 10309 + }, + { + "epoch": 20.62, + "grad_norm": 1.5662122964859009, + "learning_rate": 2e-05, + "loss": 0.04722676, + "step": 10310 + }, + { + "epoch": 20.622, + "grad_norm": 0.8571231365203857, + "learning_rate": 2e-05, + "loss": 
0.03123869, + "step": 10311 + }, + { + "epoch": 20.624, + "grad_norm": 0.9552671313285828, + "learning_rate": 2e-05, + "loss": 0.02956947, + "step": 10312 + }, + { + "epoch": 20.626, + "grad_norm": 1.1970704793930054, + "learning_rate": 2e-05, + "loss": 0.03908085, + "step": 10313 + }, + { + "epoch": 20.628, + "grad_norm": 1.5940030813217163, + "learning_rate": 2e-05, + "loss": 0.03680157, + "step": 10314 + }, + { + "epoch": 20.63, + "grad_norm": 1.1858786344528198, + "learning_rate": 2e-05, + "loss": 0.0258833, + "step": 10315 + }, + { + "epoch": 20.632, + "grad_norm": 1.1859554052352905, + "learning_rate": 2e-05, + "loss": 0.0370981, + "step": 10316 + }, + { + "epoch": 20.634, + "grad_norm": 0.9557812809944153, + "learning_rate": 2e-05, + "loss": 0.03161231, + "step": 10317 + }, + { + "epoch": 20.636, + "grad_norm": 1.269998550415039, + "learning_rate": 2e-05, + "loss": 0.03466748, + "step": 10318 + }, + { + "epoch": 20.638, + "grad_norm": 1.5743147134780884, + "learning_rate": 2e-05, + "loss": 0.03005154, + "step": 10319 + }, + { + "epoch": 20.64, + "grad_norm": 1.3234727382659912, + "learning_rate": 2e-05, + "loss": 0.03330294, + "step": 10320 + }, + { + "epoch": 20.642, + "grad_norm": 1.6464325189590454, + "learning_rate": 2e-05, + "loss": 0.04935025, + "step": 10321 + }, + { + "epoch": 20.644, + "grad_norm": 1.5875946283340454, + "learning_rate": 2e-05, + "loss": 0.03549227, + "step": 10322 + }, + { + "epoch": 20.646, + "grad_norm": 1.4373325109481812, + "learning_rate": 2e-05, + "loss": 0.03930869, + "step": 10323 + }, + { + "epoch": 20.648, + "grad_norm": 0.8817594647407532, + "learning_rate": 2e-05, + "loss": 0.0271128, + "step": 10324 + }, + { + "epoch": 20.65, + "grad_norm": 1.6336750984191895, + "learning_rate": 2e-05, + "loss": 0.03263681, + "step": 10325 + }, + { + "epoch": 20.652, + "grad_norm": 1.517414927482605, + "learning_rate": 2e-05, + "loss": 0.04617567, + "step": 10326 + }, + { + "epoch": 20.654, + "grad_norm": 4.604348182678223, + "learning_rate": 2e-05, + "loss": 0.02881363, + "step": 10327 + }, + { + "epoch": 20.656, + "grad_norm": 1.4577995538711548, + "learning_rate": 2e-05, + "loss": 0.03543629, + "step": 10328 + }, + { + "epoch": 20.658, + "grad_norm": 0.9887872338294983, + "learning_rate": 2e-05, + "loss": 0.02208835, + "step": 10329 + }, + { + "epoch": 20.66, + "grad_norm": 1.5389630794525146, + "learning_rate": 2e-05, + "loss": 0.02897654, + "step": 10330 + }, + { + "epoch": 20.662, + "grad_norm": 1.308394432067871, + "learning_rate": 2e-05, + "loss": 0.03866008, + "step": 10331 + }, + { + "epoch": 20.664, + "grad_norm": 0.8264133334159851, + "learning_rate": 2e-05, + "loss": 0.02847798, + "step": 10332 + }, + { + "epoch": 20.666, + "grad_norm": 1.7936815023422241, + "learning_rate": 2e-05, + "loss": 0.03791363, + "step": 10333 + }, + { + "epoch": 20.668, + "grad_norm": 1.3060663938522339, + "learning_rate": 2e-05, + "loss": 0.03301517, + "step": 10334 + }, + { + "epoch": 20.67, + "grad_norm": 1.709163784980774, + "learning_rate": 2e-05, + "loss": 0.03455461, + "step": 10335 + }, + { + "epoch": 20.672, + "grad_norm": 1.8301975727081299, + "learning_rate": 2e-05, + "loss": 0.02920924, + "step": 10336 + }, + { + "epoch": 20.674, + "grad_norm": 1.4292941093444824, + "learning_rate": 2e-05, + "loss": 0.03628965, + "step": 10337 + }, + { + "epoch": 20.676, + "grad_norm": 1.2578301429748535, + "learning_rate": 2e-05, + "loss": 0.02768565, + "step": 10338 + }, + { + "epoch": 20.678, + "grad_norm": 1.4743505716323853, + "learning_rate": 2e-05, + "loss": 0.03460295, 
+ "step": 10339 + }, + { + "epoch": 20.68, + "grad_norm": 0.9008990526199341, + "learning_rate": 2e-05, + "loss": 0.02599203, + "step": 10340 + }, + { + "epoch": 20.682, + "grad_norm": 2.693477153778076, + "learning_rate": 2e-05, + "loss": 0.02892848, + "step": 10341 + }, + { + "epoch": 20.684, + "grad_norm": 1.2961777448654175, + "learning_rate": 2e-05, + "loss": 0.03386326, + "step": 10342 + }, + { + "epoch": 20.686, + "grad_norm": 1.1865119934082031, + "learning_rate": 2e-05, + "loss": 0.02271389, + "step": 10343 + }, + { + "epoch": 20.688, + "grad_norm": 1.4448591470718384, + "learning_rate": 2e-05, + "loss": 0.03761415, + "step": 10344 + }, + { + "epoch": 20.69, + "grad_norm": 1.2941032648086548, + "learning_rate": 2e-05, + "loss": 0.04060592, + "step": 10345 + }, + { + "epoch": 20.692, + "grad_norm": 1.4364266395568848, + "learning_rate": 2e-05, + "loss": 0.0526364, + "step": 10346 + }, + { + "epoch": 20.694, + "grad_norm": 1.228283405303955, + "learning_rate": 2e-05, + "loss": 0.03359667, + "step": 10347 + }, + { + "epoch": 20.696, + "grad_norm": 1.265254259109497, + "learning_rate": 2e-05, + "loss": 0.04058044, + "step": 10348 + }, + { + "epoch": 20.698, + "grad_norm": 1.298516035079956, + "learning_rate": 2e-05, + "loss": 0.04035057, + "step": 10349 + }, + { + "epoch": 20.7, + "grad_norm": 1.272953748703003, + "learning_rate": 2e-05, + "loss": 0.03609696, + "step": 10350 + }, + { + "epoch": 20.701999999999998, + "grad_norm": 1.0742324590682983, + "learning_rate": 2e-05, + "loss": 0.03620451, + "step": 10351 + }, + { + "epoch": 20.704, + "grad_norm": 1.4301596879959106, + "learning_rate": 2e-05, + "loss": 0.04321437, + "step": 10352 + }, + { + "epoch": 20.706, + "grad_norm": 1.9156485795974731, + "learning_rate": 2e-05, + "loss": 0.03270895, + "step": 10353 + }, + { + "epoch": 20.708, + "grad_norm": 1.2552930116653442, + "learning_rate": 2e-05, + "loss": 0.02602671, + "step": 10354 + }, + { + "epoch": 20.71, + "grad_norm": 1.193481683731079, + "learning_rate": 2e-05, + "loss": 0.03754304, + "step": 10355 + }, + { + "epoch": 20.712, + "grad_norm": 1.4162143468856812, + "learning_rate": 2e-05, + "loss": 0.04919789, + "step": 10356 + }, + { + "epoch": 20.714, + "grad_norm": 1.1660418510437012, + "learning_rate": 2e-05, + "loss": 0.03052988, + "step": 10357 + }, + { + "epoch": 20.716, + "grad_norm": 0.9906470775604248, + "learning_rate": 2e-05, + "loss": 0.0268365, + "step": 10358 + }, + { + "epoch": 20.718, + "grad_norm": 1.7477552890777588, + "learning_rate": 2e-05, + "loss": 0.05105488, + "step": 10359 + }, + { + "epoch": 20.72, + "grad_norm": 1.1876134872436523, + "learning_rate": 2e-05, + "loss": 0.02859118, + "step": 10360 + }, + { + "epoch": 20.722, + "grad_norm": 1.3102295398712158, + "learning_rate": 2e-05, + "loss": 0.03058487, + "step": 10361 + }, + { + "epoch": 20.724, + "grad_norm": 1.7961902618408203, + "learning_rate": 2e-05, + "loss": 0.04488046, + "step": 10362 + }, + { + "epoch": 20.726, + "grad_norm": 1.058826208114624, + "learning_rate": 2e-05, + "loss": 0.03246313, + "step": 10363 + }, + { + "epoch": 20.728, + "grad_norm": 1.0731538534164429, + "learning_rate": 2e-05, + "loss": 0.02816553, + "step": 10364 + }, + { + "epoch": 20.73, + "grad_norm": 2.4187989234924316, + "learning_rate": 2e-05, + "loss": 0.04112451, + "step": 10365 + }, + { + "epoch": 20.732, + "grad_norm": 1.7762290239334106, + "learning_rate": 2e-05, + "loss": 0.04738872, + "step": 10366 + }, + { + "epoch": 20.734, + "grad_norm": 1.1634255647659302, + "learning_rate": 2e-05, + "loss": 0.03303593, + 
"step": 10367 + }, + { + "epoch": 20.736, + "grad_norm": 1.1194519996643066, + "learning_rate": 2e-05, + "loss": 0.03090698, + "step": 10368 + }, + { + "epoch": 20.738, + "grad_norm": 1.9328291416168213, + "learning_rate": 2e-05, + "loss": 0.03827959, + "step": 10369 + }, + { + "epoch": 20.74, + "grad_norm": 0.9472059607505798, + "learning_rate": 2e-05, + "loss": 0.02610377, + "step": 10370 + }, + { + "epoch": 20.742, + "grad_norm": 1.0899560451507568, + "learning_rate": 2e-05, + "loss": 0.04154932, + "step": 10371 + }, + { + "epoch": 20.744, + "grad_norm": 1.071056604385376, + "learning_rate": 2e-05, + "loss": 0.02690003, + "step": 10372 + }, + { + "epoch": 20.746, + "grad_norm": 1.2740556001663208, + "learning_rate": 2e-05, + "loss": 0.03140601, + "step": 10373 + }, + { + "epoch": 20.748, + "grad_norm": 1.2829887866973877, + "learning_rate": 2e-05, + "loss": 0.0347194, + "step": 10374 + }, + { + "epoch": 20.75, + "grad_norm": 0.9616041779518127, + "learning_rate": 2e-05, + "loss": 0.03521022, + "step": 10375 + }, + { + "epoch": 20.752, + "grad_norm": 1.2607543468475342, + "learning_rate": 2e-05, + "loss": 0.03321379, + "step": 10376 + }, + { + "epoch": 20.754, + "grad_norm": 1.4216960668563843, + "learning_rate": 2e-05, + "loss": 0.03590911, + "step": 10377 + }, + { + "epoch": 20.756, + "grad_norm": 1.5020769834518433, + "learning_rate": 2e-05, + "loss": 0.04069631, + "step": 10378 + }, + { + "epoch": 20.758, + "grad_norm": 1.044640302658081, + "learning_rate": 2e-05, + "loss": 0.02541364, + "step": 10379 + }, + { + "epoch": 20.76, + "grad_norm": 1.2528002262115479, + "learning_rate": 2e-05, + "loss": 0.03949961, + "step": 10380 + }, + { + "epoch": 20.762, + "grad_norm": 1.23416006565094, + "learning_rate": 2e-05, + "loss": 0.02047839, + "step": 10381 + }, + { + "epoch": 20.764, + "grad_norm": 0.9270185232162476, + "learning_rate": 2e-05, + "loss": 0.03086235, + "step": 10382 + }, + { + "epoch": 20.766, + "grad_norm": 1.3459302186965942, + "learning_rate": 2e-05, + "loss": 0.04397811, + "step": 10383 + }, + { + "epoch": 20.768, + "grad_norm": 1.8959941864013672, + "learning_rate": 2e-05, + "loss": 0.02847361, + "step": 10384 + }, + { + "epoch": 20.77, + "grad_norm": 1.0249385833740234, + "learning_rate": 2e-05, + "loss": 0.02996654, + "step": 10385 + }, + { + "epoch": 20.772, + "grad_norm": 1.1461448669433594, + "learning_rate": 2e-05, + "loss": 0.04238908, + "step": 10386 + }, + { + "epoch": 20.774, + "grad_norm": 0.896342933177948, + "learning_rate": 2e-05, + "loss": 0.02898027, + "step": 10387 + }, + { + "epoch": 20.776, + "grad_norm": 1.2837777137756348, + "learning_rate": 2e-05, + "loss": 0.03371213, + "step": 10388 + }, + { + "epoch": 20.778, + "grad_norm": 1.4851652383804321, + "learning_rate": 2e-05, + "loss": 0.04785133, + "step": 10389 + }, + { + "epoch": 20.78, + "grad_norm": 1.1204262971878052, + "learning_rate": 2e-05, + "loss": 0.03940021, + "step": 10390 + }, + { + "epoch": 20.782, + "grad_norm": 1.6720068454742432, + "learning_rate": 2e-05, + "loss": 0.03579511, + "step": 10391 + }, + { + "epoch": 20.784, + "grad_norm": 0.9738611578941345, + "learning_rate": 2e-05, + "loss": 0.0249313, + "step": 10392 + }, + { + "epoch": 20.786, + "grad_norm": 1.8201998472213745, + "learning_rate": 2e-05, + "loss": 0.048961, + "step": 10393 + }, + { + "epoch": 20.788, + "grad_norm": 1.2513413429260254, + "learning_rate": 2e-05, + "loss": 0.02755917, + "step": 10394 + }, + { + "epoch": 20.79, + "grad_norm": 1.3315002918243408, + "learning_rate": 2e-05, + "loss": 0.04382675, + "step": 10395 
+ }, + { + "epoch": 20.792, + "grad_norm": 1.3850572109222412, + "learning_rate": 2e-05, + "loss": 0.04453446, + "step": 10396 + }, + { + "epoch": 20.794, + "grad_norm": 1.5482556819915771, + "learning_rate": 2e-05, + "loss": 0.04031963, + "step": 10397 + }, + { + "epoch": 20.796, + "grad_norm": 1.8607449531555176, + "learning_rate": 2e-05, + "loss": 0.05087737, + "step": 10398 + }, + { + "epoch": 20.798000000000002, + "grad_norm": 1.6714609861373901, + "learning_rate": 2e-05, + "loss": 0.04965856, + "step": 10399 + }, + { + "epoch": 20.8, + "grad_norm": 1.9360687732696533, + "learning_rate": 2e-05, + "loss": 0.04098749, + "step": 10400 + }, + { + "epoch": 20.802, + "grad_norm": 0.939067542552948, + "learning_rate": 2e-05, + "loss": 0.02232962, + "step": 10401 + }, + { + "epoch": 20.804, + "grad_norm": 1.1615712642669678, + "learning_rate": 2e-05, + "loss": 0.03574735, + "step": 10402 + }, + { + "epoch": 20.806, + "grad_norm": 1.2097338438034058, + "learning_rate": 2e-05, + "loss": 0.03018945, + "step": 10403 + }, + { + "epoch": 20.808, + "grad_norm": 1.0195422172546387, + "learning_rate": 2e-05, + "loss": 0.03126357, + "step": 10404 + }, + { + "epoch": 20.81, + "grad_norm": 1.9747889041900635, + "learning_rate": 2e-05, + "loss": 0.04224984, + "step": 10405 + }, + { + "epoch": 20.812, + "grad_norm": 1.3474302291870117, + "learning_rate": 2e-05, + "loss": 0.04074406, + "step": 10406 + }, + { + "epoch": 20.814, + "grad_norm": 1.5049179792404175, + "learning_rate": 2e-05, + "loss": 0.04507799, + "step": 10407 + }, + { + "epoch": 20.816, + "grad_norm": 1.5321961641311646, + "learning_rate": 2e-05, + "loss": 0.04021778, + "step": 10408 + }, + { + "epoch": 20.818, + "grad_norm": 0.8743321895599365, + "learning_rate": 2e-05, + "loss": 0.0247119, + "step": 10409 + }, + { + "epoch": 20.82, + "grad_norm": 1.6426585912704468, + "learning_rate": 2e-05, + "loss": 0.04337841, + "step": 10410 + }, + { + "epoch": 20.822, + "grad_norm": 2.048464059829712, + "learning_rate": 2e-05, + "loss": 0.04811566, + "step": 10411 + }, + { + "epoch": 20.824, + "grad_norm": 1.0955586433410645, + "learning_rate": 2e-05, + "loss": 0.02928637, + "step": 10412 + }, + { + "epoch": 20.826, + "grad_norm": 3.2851905822753906, + "learning_rate": 2e-05, + "loss": 0.03792008, + "step": 10413 + }, + { + "epoch": 20.828, + "grad_norm": 1.1193740367889404, + "learning_rate": 2e-05, + "loss": 0.03241657, + "step": 10414 + }, + { + "epoch": 20.83, + "grad_norm": 2.217038631439209, + "learning_rate": 2e-05, + "loss": 0.04027008, + "step": 10415 + }, + { + "epoch": 20.832, + "grad_norm": 1.1971979141235352, + "learning_rate": 2e-05, + "loss": 0.0313881, + "step": 10416 + }, + { + "epoch": 20.834, + "grad_norm": 1.2874393463134766, + "learning_rate": 2e-05, + "loss": 0.03874963, + "step": 10417 + }, + { + "epoch": 20.836, + "grad_norm": 1.7825161218643188, + "learning_rate": 2e-05, + "loss": 0.0345277, + "step": 10418 + }, + { + "epoch": 20.838, + "grad_norm": 2.392350196838379, + "learning_rate": 2e-05, + "loss": 0.05566229, + "step": 10419 + }, + { + "epoch": 20.84, + "grad_norm": 1.6109018325805664, + "learning_rate": 2e-05, + "loss": 0.04091034, + "step": 10420 + }, + { + "epoch": 20.842, + "grad_norm": 1.0898905992507935, + "learning_rate": 2e-05, + "loss": 0.03517729, + "step": 10421 + }, + { + "epoch": 20.844, + "grad_norm": 1.2754164934158325, + "learning_rate": 2e-05, + "loss": 0.03659736, + "step": 10422 + }, + { + "epoch": 20.846, + "grad_norm": 1.520416259765625, + "learning_rate": 2e-05, + "loss": 0.03716481, + "step": 10423 
+ }, + { + "epoch": 20.848, + "grad_norm": 2.5298376083374023, + "learning_rate": 2e-05, + "loss": 0.04783663, + "step": 10424 + }, + { + "epoch": 20.85, + "grad_norm": 1.6789484024047852, + "learning_rate": 2e-05, + "loss": 0.03685528, + "step": 10425 + }, + { + "epoch": 20.852, + "grad_norm": 1.5136722326278687, + "learning_rate": 2e-05, + "loss": 0.03236245, + "step": 10426 + }, + { + "epoch": 20.854, + "grad_norm": 0.9097597002983093, + "learning_rate": 2e-05, + "loss": 0.02598057, + "step": 10427 + }, + { + "epoch": 20.856, + "grad_norm": 1.2258105278015137, + "learning_rate": 2e-05, + "loss": 0.02003474, + "step": 10428 + }, + { + "epoch": 20.858, + "grad_norm": 1.2743656635284424, + "learning_rate": 2e-05, + "loss": 0.04647747, + "step": 10429 + }, + { + "epoch": 20.86, + "grad_norm": 1.199223279953003, + "learning_rate": 2e-05, + "loss": 0.03247061, + "step": 10430 + }, + { + "epoch": 20.862, + "grad_norm": 1.2215161323547363, + "learning_rate": 2e-05, + "loss": 0.03422792, + "step": 10431 + }, + { + "epoch": 20.864, + "grad_norm": 1.02229642868042, + "learning_rate": 2e-05, + "loss": 0.02587418, + "step": 10432 + }, + { + "epoch": 20.866, + "grad_norm": 1.7659459114074707, + "learning_rate": 2e-05, + "loss": 0.03618574, + "step": 10433 + }, + { + "epoch": 20.868, + "grad_norm": 1.2492812871932983, + "learning_rate": 2e-05, + "loss": 0.03435018, + "step": 10434 + }, + { + "epoch": 20.87, + "grad_norm": 1.9097106456756592, + "learning_rate": 2e-05, + "loss": 0.05099519, + "step": 10435 + }, + { + "epoch": 20.872, + "grad_norm": 2.3891992568969727, + "learning_rate": 2e-05, + "loss": 0.03493572, + "step": 10436 + }, + { + "epoch": 20.874, + "grad_norm": 6.847945690155029, + "learning_rate": 2e-05, + "loss": 0.04543907, + "step": 10437 + }, + { + "epoch": 20.876, + "grad_norm": 0.8078843951225281, + "learning_rate": 2e-05, + "loss": 0.01693185, + "step": 10438 + }, + { + "epoch": 20.878, + "grad_norm": 1.1145824193954468, + "learning_rate": 2e-05, + "loss": 0.03014626, + "step": 10439 + }, + { + "epoch": 20.88, + "grad_norm": 1.4058823585510254, + "learning_rate": 2e-05, + "loss": 0.03669397, + "step": 10440 + }, + { + "epoch": 20.882, + "grad_norm": 1.749841570854187, + "learning_rate": 2e-05, + "loss": 0.03852821, + "step": 10441 + }, + { + "epoch": 20.884, + "grad_norm": 1.228166937828064, + "learning_rate": 2e-05, + "loss": 0.04459646, + "step": 10442 + }, + { + "epoch": 20.886, + "grad_norm": 1.1540074348449707, + "learning_rate": 2e-05, + "loss": 0.02661585, + "step": 10443 + }, + { + "epoch": 20.888, + "grad_norm": 2.69791841506958, + "learning_rate": 2e-05, + "loss": 0.03573243, + "step": 10444 + }, + { + "epoch": 20.89, + "grad_norm": 1.062644124031067, + "learning_rate": 2e-05, + "loss": 0.02639816, + "step": 10445 + }, + { + "epoch": 20.892, + "grad_norm": 1.1434491872787476, + "learning_rate": 2e-05, + "loss": 0.03068436, + "step": 10446 + }, + { + "epoch": 20.894, + "grad_norm": 1.0184530019760132, + "learning_rate": 2e-05, + "loss": 0.02732141, + "step": 10447 + }, + { + "epoch": 20.896, + "grad_norm": 1.670699954032898, + "learning_rate": 2e-05, + "loss": 0.02273174, + "step": 10448 + }, + { + "epoch": 20.898, + "grad_norm": 1.2134953737258911, + "learning_rate": 2e-05, + "loss": 0.03250109, + "step": 10449 + }, + { + "epoch": 20.9, + "grad_norm": 1.2033954858779907, + "learning_rate": 2e-05, + "loss": 0.04112671, + "step": 10450 + }, + { + "epoch": 20.902, + "grad_norm": 1.2063027620315552, + "learning_rate": 2e-05, + "loss": 0.03256904, + "step": 10451 + }, + { + 
"epoch": 20.904, + "grad_norm": 1.2613540887832642, + "learning_rate": 2e-05, + "loss": 0.02923998, + "step": 10452 + }, + { + "epoch": 20.906, + "grad_norm": 1.8170700073242188, + "learning_rate": 2e-05, + "loss": 0.04404435, + "step": 10453 + }, + { + "epoch": 20.908, + "grad_norm": 2.074605703353882, + "learning_rate": 2e-05, + "loss": 0.06511399, + "step": 10454 + }, + { + "epoch": 20.91, + "grad_norm": 1.1869606971740723, + "learning_rate": 2e-05, + "loss": 0.03975774, + "step": 10455 + }, + { + "epoch": 20.912, + "grad_norm": 1.0519496202468872, + "learning_rate": 2e-05, + "loss": 0.03061169, + "step": 10456 + }, + { + "epoch": 20.914, + "grad_norm": 1.1407067775726318, + "learning_rate": 2e-05, + "loss": 0.02600056, + "step": 10457 + }, + { + "epoch": 20.916, + "grad_norm": 1.2726540565490723, + "learning_rate": 2e-05, + "loss": 0.03579668, + "step": 10458 + }, + { + "epoch": 20.918, + "grad_norm": 1.0577654838562012, + "learning_rate": 2e-05, + "loss": 0.0273657, + "step": 10459 + }, + { + "epoch": 20.92, + "grad_norm": 3.370119333267212, + "learning_rate": 2e-05, + "loss": 0.04482879, + "step": 10460 + }, + { + "epoch": 20.922, + "grad_norm": 0.9270197153091431, + "learning_rate": 2e-05, + "loss": 0.02642347, + "step": 10461 + }, + { + "epoch": 20.924, + "grad_norm": 2.1031179428100586, + "learning_rate": 2e-05, + "loss": 0.03457503, + "step": 10462 + }, + { + "epoch": 20.926, + "grad_norm": 1.47755765914917, + "learning_rate": 2e-05, + "loss": 0.04129061, + "step": 10463 + }, + { + "epoch": 20.928, + "grad_norm": 1.221456527709961, + "learning_rate": 2e-05, + "loss": 0.0394924, + "step": 10464 + }, + { + "epoch": 20.93, + "grad_norm": 1.5422698259353638, + "learning_rate": 2e-05, + "loss": 0.04629602, + "step": 10465 + }, + { + "epoch": 20.932, + "grad_norm": 1.2598649263381958, + "learning_rate": 2e-05, + "loss": 0.03605943, + "step": 10466 + }, + { + "epoch": 20.934, + "grad_norm": 1.0877323150634766, + "learning_rate": 2e-05, + "loss": 0.03485626, + "step": 10467 + }, + { + "epoch": 20.936, + "grad_norm": 0.8516746163368225, + "learning_rate": 2e-05, + "loss": 0.01967622, + "step": 10468 + }, + { + "epoch": 20.938, + "grad_norm": 1.01011061668396, + "learning_rate": 2e-05, + "loss": 0.0255489, + "step": 10469 + }, + { + "epoch": 20.94, + "grad_norm": 1.1266335248947144, + "learning_rate": 2e-05, + "loss": 0.02532486, + "step": 10470 + }, + { + "epoch": 20.942, + "grad_norm": 1.5323753356933594, + "learning_rate": 2e-05, + "loss": 0.03835236, + "step": 10471 + }, + { + "epoch": 20.944, + "grad_norm": 1.58487069606781, + "learning_rate": 2e-05, + "loss": 0.03736504, + "step": 10472 + }, + { + "epoch": 20.946, + "grad_norm": 1.2376452684402466, + "learning_rate": 2e-05, + "loss": 0.03332981, + "step": 10473 + }, + { + "epoch": 20.948, + "grad_norm": 1.3151975870132446, + "learning_rate": 2e-05, + "loss": 0.03374566, + "step": 10474 + }, + { + "epoch": 20.95, + "grad_norm": 1.410962700843811, + "learning_rate": 2e-05, + "loss": 0.04390511, + "step": 10475 + }, + { + "epoch": 20.951999999999998, + "grad_norm": 3.6582796573638916, + "learning_rate": 2e-05, + "loss": 0.04068881, + "step": 10476 + }, + { + "epoch": 20.954, + "grad_norm": 1.183980941772461, + "learning_rate": 2e-05, + "loss": 0.03768382, + "step": 10477 + }, + { + "epoch": 20.956, + "grad_norm": 1.9583686590194702, + "learning_rate": 2e-05, + "loss": 0.04036102, + "step": 10478 + }, + { + "epoch": 20.958, + "grad_norm": 1.1289726495742798, + "learning_rate": 2e-05, + "loss": 0.02689121, + "step": 10479 + }, + { + 
"epoch": 20.96, + "grad_norm": 1.1652100086212158, + "learning_rate": 2e-05, + "loss": 0.03687986, + "step": 10480 + }, + { + "epoch": 20.962, + "grad_norm": 1.4179235696792603, + "learning_rate": 2e-05, + "loss": 0.03052982, + "step": 10481 + }, + { + "epoch": 20.964, + "grad_norm": 0.9714958071708679, + "learning_rate": 2e-05, + "loss": 0.02744197, + "step": 10482 + }, + { + "epoch": 20.966, + "grad_norm": 1.1505253314971924, + "learning_rate": 2e-05, + "loss": 0.03075511, + "step": 10483 + }, + { + "epoch": 20.968, + "grad_norm": 1.3006511926651, + "learning_rate": 2e-05, + "loss": 0.04219224, + "step": 10484 + }, + { + "epoch": 20.97, + "grad_norm": 1.2317531108856201, + "learning_rate": 2e-05, + "loss": 0.0362198, + "step": 10485 + }, + { + "epoch": 20.972, + "grad_norm": 1.2480534315109253, + "learning_rate": 2e-05, + "loss": 0.04598153, + "step": 10486 + }, + { + "epoch": 20.974, + "grad_norm": 1.9260978698730469, + "learning_rate": 2e-05, + "loss": 0.05104748, + "step": 10487 + }, + { + "epoch": 20.976, + "grad_norm": 1.0552128553390503, + "learning_rate": 2e-05, + "loss": 0.02821925, + "step": 10488 + }, + { + "epoch": 20.978, + "grad_norm": 1.0819281339645386, + "learning_rate": 2e-05, + "loss": 0.03735295, + "step": 10489 + }, + { + "epoch": 20.98, + "grad_norm": 0.9959445595741272, + "learning_rate": 2e-05, + "loss": 0.03469016, + "step": 10490 + }, + { + "epoch": 20.982, + "grad_norm": 1.0732587575912476, + "learning_rate": 2e-05, + "loss": 0.02674491, + "step": 10491 + }, + { + "epoch": 20.984, + "grad_norm": 1.1409895420074463, + "learning_rate": 2e-05, + "loss": 0.02786115, + "step": 10492 + }, + { + "epoch": 20.986, + "grad_norm": 0.8526608943939209, + "learning_rate": 2e-05, + "loss": 0.02098497, + "step": 10493 + }, + { + "epoch": 20.988, + "grad_norm": 1.0358623266220093, + "learning_rate": 2e-05, + "loss": 0.02189691, + "step": 10494 + }, + { + "epoch": 20.99, + "grad_norm": 1.4046947956085205, + "learning_rate": 2e-05, + "loss": 0.0338182, + "step": 10495 + }, + { + "epoch": 20.992, + "grad_norm": 1.1783146858215332, + "learning_rate": 2e-05, + "loss": 0.03898209, + "step": 10496 + }, + { + "epoch": 20.994, + "grad_norm": 0.9742079973220825, + "learning_rate": 2e-05, + "loss": 0.02756248, + "step": 10497 + }, + { + "epoch": 20.996, + "grad_norm": 2.197392463684082, + "learning_rate": 2e-05, + "loss": 0.04526626, + "step": 10498 + }, + { + "epoch": 20.998, + "grad_norm": 1.057904601097107, + "learning_rate": 2e-05, + "loss": 0.04241625, + "step": 10499 + }, + { + "epoch": 21.0, + "grad_norm": 1.4337189197540283, + "learning_rate": 2e-05, + "loss": 0.03345023, + "step": 10500 + }, + { + "epoch": 21.0, + "eval_performance": { + "AngleClassification_1": 0.998, + "AngleClassification_2": 1.0, + "AngleClassification_3": 0.9740518962075848, + "Equal_1": 0.998, + "Equal_2": 0.9820359281437125, + "Equal_3": 0.9680638722554891, + "LineComparison_1": 1.0, + "LineComparison_2": 1.0, + "LineComparison_3": 0.998003992015968, + "Parallel_1": 0.9879759519038076, + "Parallel_2": 0.9899799599198397, + "Parallel_3": 0.986, + "Perpendicular_1": 0.998, + "Perpendicular_2": 0.984, + "Perpendicular_3": 0.8186372745490982, + "PointLiesOnCircle_1": 0.9979959919839679, + "PointLiesOnCircle_2": 0.994, + "PointLiesOnCircle_3": 0.988, + "PointLiesOnLine_1": 0.9959919839679359, + "PointLiesOnLine_2": 0.9899799599198397, + "PointLiesOnLine_3": 0.9840319361277445 + }, + "eval_runtime": 319.6633, + "eval_samples_per_second": 32.847, + "eval_steps_per_second": 0.657, + "step": 10500 + }, + { + 
"epoch": 21.002, + "grad_norm": 1.4172569513320923, + "learning_rate": 2e-05, + "loss": 0.04958025, + "step": 10501 + }, + { + "epoch": 21.004, + "grad_norm": 1.2641007900238037, + "learning_rate": 2e-05, + "loss": 0.04138844, + "step": 10502 + }, + { + "epoch": 21.006, + "grad_norm": 1.5297917127609253, + "learning_rate": 2e-05, + "loss": 0.05362614, + "step": 10503 + }, + { + "epoch": 21.008, + "grad_norm": 1.4040725231170654, + "learning_rate": 2e-05, + "loss": 0.02881721, + "step": 10504 + }, + { + "epoch": 21.01, + "grad_norm": 2.7434298992156982, + "learning_rate": 2e-05, + "loss": 0.05710435, + "step": 10505 + }, + { + "epoch": 21.012, + "grad_norm": 0.9089720845222473, + "learning_rate": 2e-05, + "loss": 0.02749285, + "step": 10506 + }, + { + "epoch": 21.014, + "grad_norm": 1.093634009361267, + "learning_rate": 2e-05, + "loss": 0.02694854, + "step": 10507 + }, + { + "epoch": 21.016, + "grad_norm": 0.8842881917953491, + "learning_rate": 2e-05, + "loss": 0.02866191, + "step": 10508 + }, + { + "epoch": 21.018, + "grad_norm": 1.1293853521347046, + "learning_rate": 2e-05, + "loss": 0.03138885, + "step": 10509 + }, + { + "epoch": 21.02, + "grad_norm": 2.5073678493499756, + "learning_rate": 2e-05, + "loss": 0.036281, + "step": 10510 + }, + { + "epoch": 21.022, + "grad_norm": 0.8775820136070251, + "learning_rate": 2e-05, + "loss": 0.0302633, + "step": 10511 + }, + { + "epoch": 21.024, + "grad_norm": 1.844066858291626, + "learning_rate": 2e-05, + "loss": 0.05106075, + "step": 10512 + }, + { + "epoch": 21.026, + "grad_norm": 1.2659056186676025, + "learning_rate": 2e-05, + "loss": 0.03020029, + "step": 10513 + }, + { + "epoch": 21.028, + "grad_norm": 1.132651686668396, + "learning_rate": 2e-05, + "loss": 0.02988936, + "step": 10514 + }, + { + "epoch": 21.03, + "grad_norm": 0.9240502119064331, + "learning_rate": 2e-05, + "loss": 0.02960332, + "step": 10515 + }, + { + "epoch": 21.032, + "grad_norm": 3.4578001499176025, + "learning_rate": 2e-05, + "loss": 0.03723548, + "step": 10516 + }, + { + "epoch": 21.034, + "grad_norm": 1.327440619468689, + "learning_rate": 2e-05, + "loss": 0.04209972, + "step": 10517 + }, + { + "epoch": 21.036, + "grad_norm": 0.7104504704475403, + "learning_rate": 2e-05, + "loss": 0.01660552, + "step": 10518 + }, + { + "epoch": 21.038, + "grad_norm": 1.9451689720153809, + "learning_rate": 2e-05, + "loss": 0.04870243, + "step": 10519 + }, + { + "epoch": 21.04, + "grad_norm": 1.5693484544754028, + "learning_rate": 2e-05, + "loss": 0.05037589, + "step": 10520 + }, + { + "epoch": 21.042, + "grad_norm": 1.0665810108184814, + "learning_rate": 2e-05, + "loss": 0.03087834, + "step": 10521 + }, + { + "epoch": 21.044, + "grad_norm": 1.309026837348938, + "learning_rate": 2e-05, + "loss": 0.03562339, + "step": 10522 + }, + { + "epoch": 21.046, + "grad_norm": 1.5604883432388306, + "learning_rate": 2e-05, + "loss": 0.044502, + "step": 10523 + }, + { + "epoch": 21.048, + "grad_norm": 0.9193902015686035, + "learning_rate": 2e-05, + "loss": 0.02742101, + "step": 10524 + }, + { + "epoch": 21.05, + "grad_norm": 1.5470359325408936, + "learning_rate": 2e-05, + "loss": 0.03719456, + "step": 10525 + }, + { + "epoch": 21.052, + "grad_norm": 0.9354279637336731, + "learning_rate": 2e-05, + "loss": 0.0307904, + "step": 10526 + }, + { + "epoch": 21.054, + "grad_norm": 0.9876614212989807, + "learning_rate": 2e-05, + "loss": 0.03531288, + "step": 10527 + }, + { + "epoch": 21.056, + "grad_norm": 1.2255337238311768, + "learning_rate": 2e-05, + "loss": 0.03369109, + "step": 10528 + }, + { + "epoch": 
21.058, + "grad_norm": 1.4609296321868896, + "learning_rate": 2e-05, + "loss": 0.04581051, + "step": 10529 + }, + { + "epoch": 21.06, + "grad_norm": 1.1844290494918823, + "learning_rate": 2e-05, + "loss": 0.02935641, + "step": 10530 + }, + { + "epoch": 21.062, + "grad_norm": 1.3473209142684937, + "learning_rate": 2e-05, + "loss": 0.03874001, + "step": 10531 + }, + { + "epoch": 21.064, + "grad_norm": 1.2861710786819458, + "learning_rate": 2e-05, + "loss": 0.03785869, + "step": 10532 + }, + { + "epoch": 21.066, + "grad_norm": 1.2455552816390991, + "learning_rate": 2e-05, + "loss": 0.02974376, + "step": 10533 + }, + { + "epoch": 21.068, + "grad_norm": 1.0268170833587646, + "learning_rate": 2e-05, + "loss": 0.0280002, + "step": 10534 + }, + { + "epoch": 21.07, + "grad_norm": 2.149333953857422, + "learning_rate": 2e-05, + "loss": 0.0475785, + "step": 10535 + }, + { + "epoch": 21.072, + "grad_norm": 0.8871065378189087, + "learning_rate": 2e-05, + "loss": 0.02665732, + "step": 10536 + }, + { + "epoch": 21.074, + "grad_norm": 1.2057782411575317, + "learning_rate": 2e-05, + "loss": 0.04148353, + "step": 10537 + }, + { + "epoch": 21.076, + "grad_norm": 2.8635036945343018, + "learning_rate": 2e-05, + "loss": 0.04504614, + "step": 10538 + }, + { + "epoch": 21.078, + "grad_norm": 1.0987458229064941, + "learning_rate": 2e-05, + "loss": 0.02471746, + "step": 10539 + }, + { + "epoch": 21.08, + "grad_norm": 1.5803258419036865, + "learning_rate": 2e-05, + "loss": 0.04167273, + "step": 10540 + }, + { + "epoch": 21.082, + "grad_norm": 0.9525123834609985, + "learning_rate": 2e-05, + "loss": 0.02323587, + "step": 10541 + }, + { + "epoch": 21.084, + "grad_norm": 1.051931619644165, + "learning_rate": 2e-05, + "loss": 0.03668146, + "step": 10542 + }, + { + "epoch": 21.086, + "grad_norm": 1.020364761352539, + "learning_rate": 2e-05, + "loss": 0.03155644, + "step": 10543 + }, + { + "epoch": 21.088, + "grad_norm": 1.9535802602767944, + "learning_rate": 2e-05, + "loss": 0.04253891, + "step": 10544 + }, + { + "epoch": 21.09, + "grad_norm": 1.563684105873108, + "learning_rate": 2e-05, + "loss": 0.03040604, + "step": 10545 + }, + { + "epoch": 21.092, + "grad_norm": 1.5148497819900513, + "learning_rate": 2e-05, + "loss": 0.04719204, + "step": 10546 + }, + { + "epoch": 21.094, + "grad_norm": 1.0597929954528809, + "learning_rate": 2e-05, + "loss": 0.03394167, + "step": 10547 + }, + { + "epoch": 21.096, + "grad_norm": 0.9946009516716003, + "learning_rate": 2e-05, + "loss": 0.02729562, + "step": 10548 + }, + { + "epoch": 21.098, + "grad_norm": 1.0386313199996948, + "learning_rate": 2e-05, + "loss": 0.02707431, + "step": 10549 + }, + { + "epoch": 21.1, + "grad_norm": 1.930969476699829, + "learning_rate": 2e-05, + "loss": 0.03724827, + "step": 10550 + }, + { + "epoch": 21.102, + "grad_norm": 0.8346301913261414, + "learning_rate": 2e-05, + "loss": 0.02298054, + "step": 10551 + }, + { + "epoch": 21.104, + "grad_norm": 1.1870561838150024, + "learning_rate": 2e-05, + "loss": 0.03454594, + "step": 10552 + }, + { + "epoch": 21.106, + "grad_norm": 1.3278943300247192, + "learning_rate": 2e-05, + "loss": 0.03413796, + "step": 10553 + }, + { + "epoch": 21.108, + "grad_norm": 0.9806317090988159, + "learning_rate": 2e-05, + "loss": 0.03050506, + "step": 10554 + }, + { + "epoch": 21.11, + "grad_norm": 1.6823358535766602, + "learning_rate": 2e-05, + "loss": 0.04869309, + "step": 10555 + }, + { + "epoch": 21.112, + "grad_norm": 0.9092576503753662, + "learning_rate": 2e-05, + "loss": 0.02638655, + "step": 10556 + }, + { + "epoch": 21.114, + 
"grad_norm": 1.2272701263427734, + "learning_rate": 2e-05, + "loss": 0.03748032, + "step": 10557 + }, + { + "epoch": 21.116, + "grad_norm": 1.1986057758331299, + "learning_rate": 2e-05, + "loss": 0.02931621, + "step": 10558 + }, + { + "epoch": 21.118, + "grad_norm": 1.966601014137268, + "learning_rate": 2e-05, + "loss": 0.04338966, + "step": 10559 + }, + { + "epoch": 21.12, + "grad_norm": 2.9418132305145264, + "learning_rate": 2e-05, + "loss": 0.05720629, + "step": 10560 + }, + { + "epoch": 21.122, + "grad_norm": 1.3184374570846558, + "learning_rate": 2e-05, + "loss": 0.03051766, + "step": 10561 + }, + { + "epoch": 21.124, + "grad_norm": 1.2324519157409668, + "learning_rate": 2e-05, + "loss": 0.04240867, + "step": 10562 + }, + { + "epoch": 21.126, + "grad_norm": 1.0354188680648804, + "learning_rate": 2e-05, + "loss": 0.03184131, + "step": 10563 + }, + { + "epoch": 21.128, + "grad_norm": 1.3084588050842285, + "learning_rate": 2e-05, + "loss": 0.04317082, + "step": 10564 + }, + { + "epoch": 21.13, + "grad_norm": 1.621947169303894, + "learning_rate": 2e-05, + "loss": 0.04106213, + "step": 10565 + }, + { + "epoch": 21.132, + "grad_norm": 2.01584529876709, + "learning_rate": 2e-05, + "loss": 0.04236465, + "step": 10566 + }, + { + "epoch": 21.134, + "grad_norm": 1.481381893157959, + "learning_rate": 2e-05, + "loss": 0.02457852, + "step": 10567 + }, + { + "epoch": 21.136, + "grad_norm": 1.3118445873260498, + "learning_rate": 2e-05, + "loss": 0.04216508, + "step": 10568 + }, + { + "epoch": 21.138, + "grad_norm": 1.3927664756774902, + "learning_rate": 2e-05, + "loss": 0.04639062, + "step": 10569 + }, + { + "epoch": 21.14, + "grad_norm": 1.2951942682266235, + "learning_rate": 2e-05, + "loss": 0.03051125, + "step": 10570 + }, + { + "epoch": 21.142, + "grad_norm": 3.7828381061553955, + "learning_rate": 2e-05, + "loss": 0.03892427, + "step": 10571 + }, + { + "epoch": 21.144, + "grad_norm": 1.8915671110153198, + "learning_rate": 2e-05, + "loss": 0.03912808, + "step": 10572 + }, + { + "epoch": 21.146, + "grad_norm": 1.7816556692123413, + "learning_rate": 2e-05, + "loss": 0.03675058, + "step": 10573 + }, + { + "epoch": 21.148, + "grad_norm": 2.443760395050049, + "learning_rate": 2e-05, + "loss": 0.03171434, + "step": 10574 + }, + { + "epoch": 21.15, + "grad_norm": 1.0887539386749268, + "learning_rate": 2e-05, + "loss": 0.03385027, + "step": 10575 + }, + { + "epoch": 21.152, + "grad_norm": 1.2952345609664917, + "learning_rate": 2e-05, + "loss": 0.04195005, + "step": 10576 + }, + { + "epoch": 21.154, + "grad_norm": 1.1885733604431152, + "learning_rate": 2e-05, + "loss": 0.02599936, + "step": 10577 + }, + { + "epoch": 21.156, + "grad_norm": 1.1241782903671265, + "learning_rate": 2e-05, + "loss": 0.03315149, + "step": 10578 + }, + { + "epoch": 21.158, + "grad_norm": 1.3803486824035645, + "learning_rate": 2e-05, + "loss": 0.03571745, + "step": 10579 + }, + { + "epoch": 21.16, + "grad_norm": 1.2156238555908203, + "learning_rate": 2e-05, + "loss": 0.04436778, + "step": 10580 + }, + { + "epoch": 21.162, + "grad_norm": 1.586046576499939, + "learning_rate": 2e-05, + "loss": 0.04987106, + "step": 10581 + }, + { + "epoch": 21.164, + "grad_norm": 1.261434555053711, + "learning_rate": 2e-05, + "loss": 0.03705215, + "step": 10582 + }, + { + "epoch": 21.166, + "grad_norm": 0.886199414730072, + "learning_rate": 2e-05, + "loss": 0.02416335, + "step": 10583 + }, + { + "epoch": 21.168, + "grad_norm": 1.2906107902526855, + "learning_rate": 2e-05, + "loss": 0.04419378, + "step": 10584 + }, + { + "epoch": 21.17, + "grad_norm": 
1.1108649969100952, + "learning_rate": 2e-05, + "loss": 0.03079815, + "step": 10585 + }, + { + "epoch": 21.172, + "grad_norm": 2.070798635482788, + "learning_rate": 2e-05, + "loss": 0.0433712, + "step": 10586 + }, + { + "epoch": 21.174, + "grad_norm": 1.1178715229034424, + "learning_rate": 2e-05, + "loss": 0.03338214, + "step": 10587 + }, + { + "epoch": 21.176, + "grad_norm": 1.2780183553695679, + "learning_rate": 2e-05, + "loss": 0.04754562, + "step": 10588 + }, + { + "epoch": 21.178, + "grad_norm": 1.154023289680481, + "learning_rate": 2e-05, + "loss": 0.03899136, + "step": 10589 + }, + { + "epoch": 21.18, + "grad_norm": 0.9963793158531189, + "learning_rate": 2e-05, + "loss": 0.02338295, + "step": 10590 + }, + { + "epoch": 21.182, + "grad_norm": 1.49871027469635, + "learning_rate": 2e-05, + "loss": 0.04136465, + "step": 10591 + }, + { + "epoch": 21.184, + "grad_norm": 1.006020426750183, + "learning_rate": 2e-05, + "loss": 0.02428103, + "step": 10592 + }, + { + "epoch": 21.186, + "grad_norm": 1.1841108798980713, + "learning_rate": 2e-05, + "loss": 0.03439215, + "step": 10593 + }, + { + "epoch": 21.188, + "grad_norm": 1.2510030269622803, + "learning_rate": 2e-05, + "loss": 0.03146035, + "step": 10594 + }, + { + "epoch": 21.19, + "grad_norm": 0.7315729260444641, + "learning_rate": 2e-05, + "loss": 0.01793394, + "step": 10595 + }, + { + "epoch": 21.192, + "grad_norm": 1.2097729444503784, + "learning_rate": 2e-05, + "loss": 0.03397623, + "step": 10596 + }, + { + "epoch": 21.194, + "grad_norm": 1.0274741649627686, + "learning_rate": 2e-05, + "loss": 0.02825071, + "step": 10597 + }, + { + "epoch": 21.196, + "grad_norm": 0.9847630262374878, + "learning_rate": 2e-05, + "loss": 0.03145049, + "step": 10598 + }, + { + "epoch": 21.198, + "grad_norm": 1.3303278684616089, + "learning_rate": 2e-05, + "loss": 0.04809329, + "step": 10599 + }, + { + "epoch": 21.2, + "grad_norm": 1.0973892211914062, + "learning_rate": 2e-05, + "loss": 0.03610747, + "step": 10600 + }, + { + "epoch": 21.202, + "grad_norm": 1.8867298364639282, + "learning_rate": 2e-05, + "loss": 0.04427704, + "step": 10601 + }, + { + "epoch": 21.204, + "grad_norm": 1.922042965888977, + "learning_rate": 2e-05, + "loss": 0.0379353, + "step": 10602 + }, + { + "epoch": 21.206, + "grad_norm": 1.1012451648712158, + "learning_rate": 2e-05, + "loss": 0.03171213, + "step": 10603 + }, + { + "epoch": 21.208, + "grad_norm": 1.0954500436782837, + "learning_rate": 2e-05, + "loss": 0.02412405, + "step": 10604 + }, + { + "epoch": 21.21, + "grad_norm": 1.3452584743499756, + "learning_rate": 2e-05, + "loss": 0.0347417, + "step": 10605 + }, + { + "epoch": 21.212, + "grad_norm": 1.3375858068466187, + "learning_rate": 2e-05, + "loss": 0.03575499, + "step": 10606 + }, + { + "epoch": 21.214, + "grad_norm": 1.139094352722168, + "learning_rate": 2e-05, + "loss": 0.0376694, + "step": 10607 + }, + { + "epoch": 21.216, + "grad_norm": 1.5566028356552124, + "learning_rate": 2e-05, + "loss": 0.03238266, + "step": 10608 + }, + { + "epoch": 21.218, + "grad_norm": 1.9407174587249756, + "learning_rate": 2e-05, + "loss": 0.04087084, + "step": 10609 + }, + { + "epoch": 21.22, + "grad_norm": 5.404819011688232, + "learning_rate": 2e-05, + "loss": 0.04306193, + "step": 10610 + }, + { + "epoch": 21.222, + "grad_norm": 1.1310769319534302, + "learning_rate": 2e-05, + "loss": 0.03149523, + "step": 10611 + }, + { + "epoch": 21.224, + "grad_norm": 2.3743486404418945, + "learning_rate": 2e-05, + "loss": 0.0347545, + "step": 10612 + }, + { + "epoch": 21.226, + "grad_norm": 
1.9811265468597412, + "learning_rate": 2e-05, + "loss": 0.04471551, + "step": 10613 + }, + { + "epoch": 21.228, + "grad_norm": 1.034940242767334, + "learning_rate": 2e-05, + "loss": 0.02823063, + "step": 10614 + }, + { + "epoch": 21.23, + "grad_norm": 1.0800728797912598, + "learning_rate": 2e-05, + "loss": 0.04498807, + "step": 10615 + }, + { + "epoch": 21.232, + "grad_norm": 1.2216445207595825, + "learning_rate": 2e-05, + "loss": 0.03705755, + "step": 10616 + }, + { + "epoch": 21.234, + "grad_norm": 1.5470830202102661, + "learning_rate": 2e-05, + "loss": 0.03225365, + "step": 10617 + }, + { + "epoch": 21.236, + "grad_norm": 2.094444990158081, + "learning_rate": 2e-05, + "loss": 0.04915155, + "step": 10618 + }, + { + "epoch": 21.238, + "grad_norm": 1.5187581777572632, + "learning_rate": 2e-05, + "loss": 0.04350504, + "step": 10619 + }, + { + "epoch": 21.24, + "grad_norm": 1.3666698932647705, + "learning_rate": 2e-05, + "loss": 0.05134146, + "step": 10620 + }, + { + "epoch": 21.242, + "grad_norm": 0.9730545282363892, + "learning_rate": 2e-05, + "loss": 0.0246509, + "step": 10621 + }, + { + "epoch": 21.244, + "grad_norm": 1.3245638608932495, + "learning_rate": 2e-05, + "loss": 0.03410356, + "step": 10622 + }, + { + "epoch": 21.246, + "grad_norm": 1.0870567560195923, + "learning_rate": 2e-05, + "loss": 0.03783543, + "step": 10623 + }, + { + "epoch": 21.248, + "grad_norm": 1.4121298789978027, + "learning_rate": 2e-05, + "loss": 0.0571961, + "step": 10624 + }, + { + "epoch": 21.25, + "grad_norm": 1.5266447067260742, + "learning_rate": 2e-05, + "loss": 0.04245551, + "step": 10625 + }, + { + "epoch": 21.252, + "grad_norm": 1.5656712055206299, + "learning_rate": 2e-05, + "loss": 0.04033789, + "step": 10626 + }, + { + "epoch": 21.254, + "grad_norm": 1.1030497550964355, + "learning_rate": 2e-05, + "loss": 0.03589434, + "step": 10627 + }, + { + "epoch": 21.256, + "grad_norm": 1.4850778579711914, + "learning_rate": 2e-05, + "loss": 0.03722943, + "step": 10628 + }, + { + "epoch": 21.258, + "grad_norm": 1.0993595123291016, + "learning_rate": 2e-05, + "loss": 0.02807938, + "step": 10629 + }, + { + "epoch": 21.26, + "grad_norm": 1.2334455251693726, + "learning_rate": 2e-05, + "loss": 0.03748228, + "step": 10630 + }, + { + "epoch": 21.262, + "grad_norm": 1.5018088817596436, + "learning_rate": 2e-05, + "loss": 0.04651701, + "step": 10631 + }, + { + "epoch": 21.264, + "grad_norm": 1.5770760774612427, + "learning_rate": 2e-05, + "loss": 0.03737814, + "step": 10632 + }, + { + "epoch": 21.266, + "grad_norm": 1.3698883056640625, + "learning_rate": 2e-05, + "loss": 0.03702818, + "step": 10633 + }, + { + "epoch": 21.268, + "grad_norm": 1.316947340965271, + "learning_rate": 2e-05, + "loss": 0.0374598, + "step": 10634 + }, + { + "epoch": 21.27, + "grad_norm": 1.1345027685165405, + "learning_rate": 2e-05, + "loss": 0.03323581, + "step": 10635 + }, + { + "epoch": 21.272, + "grad_norm": 1.6421146392822266, + "learning_rate": 2e-05, + "loss": 0.03895211, + "step": 10636 + }, + { + "epoch": 21.274, + "grad_norm": 0.812617301940918, + "learning_rate": 2e-05, + "loss": 0.02046642, + "step": 10637 + }, + { + "epoch": 21.276, + "grad_norm": 1.6946412324905396, + "learning_rate": 2e-05, + "loss": 0.03659395, + "step": 10638 + }, + { + "epoch": 21.278, + "grad_norm": 2.288081645965576, + "learning_rate": 2e-05, + "loss": 0.05296513, + "step": 10639 + }, + { + "epoch": 21.28, + "grad_norm": 1.681496500968933, + "learning_rate": 2e-05, + "loss": 0.03126854, + "step": 10640 + }, + { + "epoch": 21.282, + "grad_norm": 
3.044177532196045, + "learning_rate": 2e-05, + "loss": 0.0360253, + "step": 10641 + }, + { + "epoch": 21.284, + "grad_norm": 1.5990822315216064, + "learning_rate": 2e-05, + "loss": 0.03093606, + "step": 10642 + }, + { + "epoch": 21.286, + "grad_norm": 1.1280889511108398, + "learning_rate": 2e-05, + "loss": 0.0305376, + "step": 10643 + }, + { + "epoch": 21.288, + "grad_norm": 0.7883005738258362, + "learning_rate": 2e-05, + "loss": 0.02066907, + "step": 10644 + }, + { + "epoch": 21.29, + "grad_norm": 0.9310852885246277, + "learning_rate": 2e-05, + "loss": 0.02677006, + "step": 10645 + }, + { + "epoch": 21.292, + "grad_norm": 1.0146135091781616, + "learning_rate": 2e-05, + "loss": 0.03050572, + "step": 10646 + }, + { + "epoch": 21.294, + "grad_norm": 1.7839759588241577, + "learning_rate": 2e-05, + "loss": 0.03637395, + "step": 10647 + }, + { + "epoch": 21.296, + "grad_norm": 1.1365673542022705, + "learning_rate": 2e-05, + "loss": 0.03653955, + "step": 10648 + }, + { + "epoch": 21.298, + "grad_norm": 1.2921481132507324, + "learning_rate": 2e-05, + "loss": 0.04410624, + "step": 10649 + }, + { + "epoch": 21.3, + "grad_norm": 1.155529499053955, + "learning_rate": 2e-05, + "loss": 0.03705712, + "step": 10650 + }, + { + "epoch": 21.302, + "grad_norm": 1.44944167137146, + "learning_rate": 2e-05, + "loss": 0.04845714, + "step": 10651 + }, + { + "epoch": 21.304, + "grad_norm": 1.276952862739563, + "learning_rate": 2e-05, + "loss": 0.04431798, + "step": 10652 + }, + { + "epoch": 21.306, + "grad_norm": 1.0401476621627808, + "learning_rate": 2e-05, + "loss": 0.02794194, + "step": 10653 + }, + { + "epoch": 21.308, + "grad_norm": 1.4874932765960693, + "learning_rate": 2e-05, + "loss": 0.04844416, + "step": 10654 + }, + { + "epoch": 21.31, + "grad_norm": 1.2578763961791992, + "learning_rate": 2e-05, + "loss": 0.02141695, + "step": 10655 + }, + { + "epoch": 21.312, + "grad_norm": 1.0979886054992676, + "learning_rate": 2e-05, + "loss": 0.03747806, + "step": 10656 + }, + { + "epoch": 21.314, + "grad_norm": 1.5013819932937622, + "learning_rate": 2e-05, + "loss": 0.03525694, + "step": 10657 + }, + { + "epoch": 21.316, + "grad_norm": 2.042410373687744, + "learning_rate": 2e-05, + "loss": 0.04237705, + "step": 10658 + }, + { + "epoch": 21.318, + "grad_norm": 0.8237473964691162, + "learning_rate": 2e-05, + "loss": 0.02557302, + "step": 10659 + }, + { + "epoch": 21.32, + "grad_norm": 1.2069071531295776, + "learning_rate": 2e-05, + "loss": 0.03492039, + "step": 10660 + }, + { + "epoch": 21.322, + "grad_norm": 2.445582866668701, + "learning_rate": 2e-05, + "loss": 0.04500083, + "step": 10661 + }, + { + "epoch": 21.324, + "grad_norm": 1.7285526990890503, + "learning_rate": 2e-05, + "loss": 0.0372141, + "step": 10662 + }, + { + "epoch": 21.326, + "grad_norm": 1.2738267183303833, + "learning_rate": 2e-05, + "loss": 0.04208554, + "step": 10663 + }, + { + "epoch": 21.328, + "grad_norm": 1.3018332719802856, + "learning_rate": 2e-05, + "loss": 0.02245201, + "step": 10664 + }, + { + "epoch": 21.33, + "grad_norm": 1.3849666118621826, + "learning_rate": 2e-05, + "loss": 0.04083522, + "step": 10665 + }, + { + "epoch": 21.332, + "grad_norm": 1.0650460720062256, + "learning_rate": 2e-05, + "loss": 0.03396599, + "step": 10666 + }, + { + "epoch": 21.334, + "grad_norm": 1.0239198207855225, + "learning_rate": 2e-05, + "loss": 0.02632653, + "step": 10667 + }, + { + "epoch": 21.336, + "grad_norm": 1.1510930061340332, + "learning_rate": 2e-05, + "loss": 0.03786356, + "step": 10668 + }, + { + "epoch": 21.338, + "grad_norm": 
1.134979486465454, + "learning_rate": 2e-05, + "loss": 0.03500597, + "step": 10669 + }, + { + "epoch": 21.34, + "grad_norm": 1.7041795253753662, + "learning_rate": 2e-05, + "loss": 0.04310299, + "step": 10670 + }, + { + "epoch": 21.342, + "grad_norm": 1.080082893371582, + "learning_rate": 2e-05, + "loss": 0.02889851, + "step": 10671 + }, + { + "epoch": 21.344, + "grad_norm": 1.2668462991714478, + "learning_rate": 2e-05, + "loss": 0.06208187, + "step": 10672 + }, + { + "epoch": 21.346, + "grad_norm": 0.9316691160202026, + "learning_rate": 2e-05, + "loss": 0.02679442, + "step": 10673 + }, + { + "epoch": 21.348, + "grad_norm": 1.3193854093551636, + "learning_rate": 2e-05, + "loss": 0.03310707, + "step": 10674 + }, + { + "epoch": 21.35, + "grad_norm": 1.5377140045166016, + "learning_rate": 2e-05, + "loss": 0.03435023, + "step": 10675 + }, + { + "epoch": 21.352, + "grad_norm": 1.0346852540969849, + "learning_rate": 2e-05, + "loss": 0.02915202, + "step": 10676 + }, + { + "epoch": 21.354, + "grad_norm": 1.0699576139450073, + "learning_rate": 2e-05, + "loss": 0.03234834, + "step": 10677 + }, + { + "epoch": 21.356, + "grad_norm": 4.232156753540039, + "learning_rate": 2e-05, + "loss": 0.04847697, + "step": 10678 + }, + { + "epoch": 21.358, + "grad_norm": 0.8950043320655823, + "learning_rate": 2e-05, + "loss": 0.02786786, + "step": 10679 + }, + { + "epoch": 21.36, + "grad_norm": 1.1412599086761475, + "learning_rate": 2e-05, + "loss": 0.02532648, + "step": 10680 + }, + { + "epoch": 21.362, + "grad_norm": 1.9713459014892578, + "learning_rate": 2e-05, + "loss": 0.04944416, + "step": 10681 + }, + { + "epoch": 21.364, + "grad_norm": 1.2647299766540527, + "learning_rate": 2e-05, + "loss": 0.04223265, + "step": 10682 + }, + { + "epoch": 21.366, + "grad_norm": 2.940858840942383, + "learning_rate": 2e-05, + "loss": 0.03530517, + "step": 10683 + }, + { + "epoch": 21.368, + "grad_norm": 0.9534826874732971, + "learning_rate": 2e-05, + "loss": 0.02479281, + "step": 10684 + }, + { + "epoch": 21.37, + "grad_norm": 1.3853439092636108, + "learning_rate": 2e-05, + "loss": 0.03295173, + "step": 10685 + }, + { + "epoch": 21.372, + "grad_norm": 1.2453714609146118, + "learning_rate": 2e-05, + "loss": 0.02070951, + "step": 10686 + }, + { + "epoch": 21.374, + "grad_norm": 1.1567037105560303, + "learning_rate": 2e-05, + "loss": 0.03087223, + "step": 10687 + }, + { + "epoch": 21.376, + "grad_norm": 0.9362542629241943, + "learning_rate": 2e-05, + "loss": 0.01926921, + "step": 10688 + }, + { + "epoch": 21.378, + "grad_norm": 0.9018503427505493, + "learning_rate": 2e-05, + "loss": 0.02120415, + "step": 10689 + }, + { + "epoch": 21.38, + "grad_norm": 1.236549735069275, + "learning_rate": 2e-05, + "loss": 0.04043602, + "step": 10690 + }, + { + "epoch": 21.382, + "grad_norm": 1.1474121809005737, + "learning_rate": 2e-05, + "loss": 0.03565197, + "step": 10691 + }, + { + "epoch": 21.384, + "grad_norm": 1.0585695505142212, + "learning_rate": 2e-05, + "loss": 0.03018987, + "step": 10692 + }, + { + "epoch": 21.386, + "grad_norm": 1.0328543186187744, + "learning_rate": 2e-05, + "loss": 0.03397262, + "step": 10693 + }, + { + "epoch": 21.388, + "grad_norm": 1.446506381034851, + "learning_rate": 2e-05, + "loss": 0.02783697, + "step": 10694 + }, + { + "epoch": 21.39, + "grad_norm": 1.6255260705947876, + "learning_rate": 2e-05, + "loss": 0.03847382, + "step": 10695 + }, + { + "epoch": 21.392, + "grad_norm": 1.0737837553024292, + "learning_rate": 2e-05, + "loss": 0.02603565, + "step": 10696 + }, + { + "epoch": 21.394, + "grad_norm": 
1.392006754875183, + "learning_rate": 2e-05, + "loss": 0.03012003, + "step": 10697 + }, + { + "epoch": 21.396, + "grad_norm": 2.5857255458831787, + "learning_rate": 2e-05, + "loss": 0.03107409, + "step": 10698 + }, + { + "epoch": 21.398, + "grad_norm": 1.2561590671539307, + "learning_rate": 2e-05, + "loss": 0.03176247, + "step": 10699 + }, + { + "epoch": 21.4, + "grad_norm": 1.0437184572219849, + "learning_rate": 2e-05, + "loss": 0.03001492, + "step": 10700 + }, + { + "epoch": 21.402, + "grad_norm": 1.9724184274673462, + "learning_rate": 2e-05, + "loss": 0.02830181, + "step": 10701 + }, + { + "epoch": 21.404, + "grad_norm": 1.181110143661499, + "learning_rate": 2e-05, + "loss": 0.0300355, + "step": 10702 + }, + { + "epoch": 21.406, + "grad_norm": 2.5362696647644043, + "learning_rate": 2e-05, + "loss": 0.04556922, + "step": 10703 + }, + { + "epoch": 21.408, + "grad_norm": 1.9228975772857666, + "learning_rate": 2e-05, + "loss": 0.0374077, + "step": 10704 + }, + { + "epoch": 21.41, + "grad_norm": 1.4400721788406372, + "learning_rate": 2e-05, + "loss": 0.0367013, + "step": 10705 + }, + { + "epoch": 21.412, + "grad_norm": 1.2807708978652954, + "learning_rate": 2e-05, + "loss": 0.045549, + "step": 10706 + }, + { + "epoch": 21.414, + "grad_norm": 2.0344345569610596, + "learning_rate": 2e-05, + "loss": 0.04431529, + "step": 10707 + }, + { + "epoch": 21.416, + "grad_norm": 3.9442005157470703, + "learning_rate": 2e-05, + "loss": 0.04201176, + "step": 10708 + }, + { + "epoch": 21.418, + "grad_norm": 0.9622934460639954, + "learning_rate": 2e-05, + "loss": 0.0284987, + "step": 10709 + }, + { + "epoch": 21.42, + "grad_norm": 1.0971020460128784, + "learning_rate": 2e-05, + "loss": 0.036263, + "step": 10710 + }, + { + "epoch": 21.422, + "grad_norm": 0.9987945556640625, + "learning_rate": 2e-05, + "loss": 0.02871657, + "step": 10711 + }, + { + "epoch": 21.424, + "grad_norm": 0.8595823645591736, + "learning_rate": 2e-05, + "loss": 0.02600442, + "step": 10712 + }, + { + "epoch": 21.426, + "grad_norm": 0.8434016108512878, + "learning_rate": 2e-05, + "loss": 0.02094415, + "step": 10713 + }, + { + "epoch": 21.428, + "grad_norm": 1.313782811164856, + "learning_rate": 2e-05, + "loss": 0.04309317, + "step": 10714 + }, + { + "epoch": 21.43, + "grad_norm": 1.0133094787597656, + "learning_rate": 2e-05, + "loss": 0.03012585, + "step": 10715 + }, + { + "epoch": 21.432, + "grad_norm": 1.157460331916809, + "learning_rate": 2e-05, + "loss": 0.03752935, + "step": 10716 + }, + { + "epoch": 21.434, + "grad_norm": 1.142626166343689, + "learning_rate": 2e-05, + "loss": 0.02614125, + "step": 10717 + }, + { + "epoch": 21.436, + "grad_norm": 1.3068962097167969, + "learning_rate": 2e-05, + "loss": 0.04685502, + "step": 10718 + }, + { + "epoch": 21.438, + "grad_norm": 1.4647998809814453, + "learning_rate": 2e-05, + "loss": 0.04809128, + "step": 10719 + }, + { + "epoch": 21.44, + "grad_norm": 0.9506215453147888, + "learning_rate": 2e-05, + "loss": 0.02731987, + "step": 10720 + }, + { + "epoch": 21.442, + "grad_norm": 1.687732458114624, + "learning_rate": 2e-05, + "loss": 0.0411804, + "step": 10721 + }, + { + "epoch": 21.444, + "grad_norm": 1.8168407678604126, + "learning_rate": 2e-05, + "loss": 0.04425385, + "step": 10722 + }, + { + "epoch": 21.446, + "grad_norm": 1.2137858867645264, + "learning_rate": 2e-05, + "loss": 0.03381842, + "step": 10723 + }, + { + "epoch": 21.448, + "grad_norm": 1.0868464708328247, + "learning_rate": 2e-05, + "loss": 0.0301272, + "step": 10724 + }, + { + "epoch": 21.45, + "grad_norm": 2.293076753616333, + 
"learning_rate": 2e-05, + "loss": 0.05272197, + "step": 10725 + }, + { + "epoch": 21.452, + "grad_norm": 1.2076009511947632, + "learning_rate": 2e-05, + "loss": 0.04387645, + "step": 10726 + }, + { + "epoch": 21.454, + "grad_norm": 1.757698655128479, + "learning_rate": 2e-05, + "loss": 0.02590645, + "step": 10727 + }, + { + "epoch": 21.456, + "grad_norm": 1.1657187938690186, + "learning_rate": 2e-05, + "loss": 0.03529991, + "step": 10728 + }, + { + "epoch": 21.458, + "grad_norm": 1.177914023399353, + "learning_rate": 2e-05, + "loss": 0.03198649, + "step": 10729 + }, + { + "epoch": 21.46, + "grad_norm": 2.1333088874816895, + "learning_rate": 2e-05, + "loss": 0.05930211, + "step": 10730 + }, + { + "epoch": 21.462, + "grad_norm": 1.0726081132888794, + "learning_rate": 2e-05, + "loss": 0.02519402, + "step": 10731 + }, + { + "epoch": 21.464, + "grad_norm": 1.8570704460144043, + "learning_rate": 2e-05, + "loss": 0.04541021, + "step": 10732 + }, + { + "epoch": 21.466, + "grad_norm": 1.4362432956695557, + "learning_rate": 2e-05, + "loss": 0.03639318, + "step": 10733 + }, + { + "epoch": 21.468, + "grad_norm": 1.3196659088134766, + "learning_rate": 2e-05, + "loss": 0.03501175, + "step": 10734 + }, + { + "epoch": 21.47, + "grad_norm": 1.788077712059021, + "learning_rate": 2e-05, + "loss": 0.04476755, + "step": 10735 + }, + { + "epoch": 21.472, + "grad_norm": 2.1154396533966064, + "learning_rate": 2e-05, + "loss": 0.04601542, + "step": 10736 + }, + { + "epoch": 21.474, + "grad_norm": 0.9764267206192017, + "learning_rate": 2e-05, + "loss": 0.03156991, + "step": 10737 + }, + { + "epoch": 21.476, + "grad_norm": 0.862126886844635, + "learning_rate": 2e-05, + "loss": 0.02627644, + "step": 10738 + }, + { + "epoch": 21.478, + "grad_norm": 2.1665496826171875, + "learning_rate": 2e-05, + "loss": 0.03621161, + "step": 10739 + }, + { + "epoch": 21.48, + "grad_norm": 0.9720172882080078, + "learning_rate": 2e-05, + "loss": 0.02424107, + "step": 10740 + }, + { + "epoch": 21.482, + "grad_norm": 1.1352647542953491, + "learning_rate": 2e-05, + "loss": 0.0395835, + "step": 10741 + }, + { + "epoch": 21.484, + "grad_norm": 1.2119773626327515, + "learning_rate": 2e-05, + "loss": 0.03099651, + "step": 10742 + }, + { + "epoch": 21.486, + "grad_norm": 1.091362476348877, + "learning_rate": 2e-05, + "loss": 0.03427457, + "step": 10743 + }, + { + "epoch": 21.488, + "grad_norm": 1.5212750434875488, + "learning_rate": 2e-05, + "loss": 0.03149851, + "step": 10744 + }, + { + "epoch": 21.49, + "grad_norm": 2.698265552520752, + "learning_rate": 2e-05, + "loss": 0.03275203, + "step": 10745 + }, + { + "epoch": 21.492, + "grad_norm": 0.9697499871253967, + "learning_rate": 2e-05, + "loss": 0.02803354, + "step": 10746 + }, + { + "epoch": 21.494, + "grad_norm": 1.4407355785369873, + "learning_rate": 2e-05, + "loss": 0.04102314, + "step": 10747 + }, + { + "epoch": 21.496, + "grad_norm": 0.9825798869132996, + "learning_rate": 2e-05, + "loss": 0.02902812, + "step": 10748 + }, + { + "epoch": 21.498, + "grad_norm": 0.9173822999000549, + "learning_rate": 2e-05, + "loss": 0.02705132, + "step": 10749 + }, + { + "epoch": 21.5, + "grad_norm": 1.5008710622787476, + "learning_rate": 2e-05, + "loss": 0.03890024, + "step": 10750 + }, + { + "epoch": 21.502, + "grad_norm": 2.323859214782715, + "learning_rate": 2e-05, + "loss": 0.03871877, + "step": 10751 + }, + { + "epoch": 21.504, + "grad_norm": 0.9519139528274536, + "learning_rate": 2e-05, + "loss": 0.02205279, + "step": 10752 + }, + { + "epoch": 21.506, + "grad_norm": 1.150467038154602, + 
"learning_rate": 2e-05, + "loss": 0.03497121, + "step": 10753 + }, + { + "epoch": 21.508, + "grad_norm": 1.5614557266235352, + "learning_rate": 2e-05, + "loss": 0.03725933, + "step": 10754 + }, + { + "epoch": 21.51, + "grad_norm": 0.8480432629585266, + "learning_rate": 2e-05, + "loss": 0.02463213, + "step": 10755 + }, + { + "epoch": 21.512, + "grad_norm": 1.0168507099151611, + "learning_rate": 2e-05, + "loss": 0.03348152, + "step": 10756 + }, + { + "epoch": 21.514, + "grad_norm": 1.9600995779037476, + "learning_rate": 2e-05, + "loss": 0.04116196, + "step": 10757 + }, + { + "epoch": 21.516, + "grad_norm": 2.4078519344329834, + "learning_rate": 2e-05, + "loss": 0.05193283, + "step": 10758 + }, + { + "epoch": 21.518, + "grad_norm": 1.9011753797531128, + "learning_rate": 2e-05, + "loss": 0.04391605, + "step": 10759 + }, + { + "epoch": 21.52, + "grad_norm": 1.0992064476013184, + "learning_rate": 2e-05, + "loss": 0.03505319, + "step": 10760 + }, + { + "epoch": 21.522, + "grad_norm": 1.1341238021850586, + "learning_rate": 2e-05, + "loss": 0.04327236, + "step": 10761 + }, + { + "epoch": 21.524, + "grad_norm": 0.8820840716362, + "learning_rate": 2e-05, + "loss": 0.02557, + "step": 10762 + }, + { + "epoch": 21.526, + "grad_norm": 1.7110241651535034, + "learning_rate": 2e-05, + "loss": 0.02929918, + "step": 10763 + }, + { + "epoch": 21.528, + "grad_norm": 2.175755262374878, + "learning_rate": 2e-05, + "loss": 0.04092379, + "step": 10764 + }, + { + "epoch": 21.53, + "grad_norm": 0.9964827299118042, + "learning_rate": 2e-05, + "loss": 0.02928205, + "step": 10765 + }, + { + "epoch": 21.532, + "grad_norm": 1.0591028928756714, + "learning_rate": 2e-05, + "loss": 0.03802252, + "step": 10766 + }, + { + "epoch": 21.534, + "grad_norm": 1.0037879943847656, + "learning_rate": 2e-05, + "loss": 0.02184133, + "step": 10767 + }, + { + "epoch": 21.536, + "grad_norm": 1.0294376611709595, + "learning_rate": 2e-05, + "loss": 0.03275803, + "step": 10768 + }, + { + "epoch": 21.538, + "grad_norm": 1.2927500009536743, + "learning_rate": 2e-05, + "loss": 0.05052555, + "step": 10769 + }, + { + "epoch": 21.54, + "grad_norm": 1.269002079963684, + "learning_rate": 2e-05, + "loss": 0.04041898, + "step": 10770 + }, + { + "epoch": 21.542, + "grad_norm": 1.3406201601028442, + "learning_rate": 2e-05, + "loss": 0.0340461, + "step": 10771 + }, + { + "epoch": 21.544, + "grad_norm": 1.2689399719238281, + "learning_rate": 2e-05, + "loss": 0.02636706, + "step": 10772 + }, + { + "epoch": 21.546, + "grad_norm": 1.7005540132522583, + "learning_rate": 2e-05, + "loss": 0.03229015, + "step": 10773 + }, + { + "epoch": 21.548000000000002, + "grad_norm": 1.281423568725586, + "learning_rate": 2e-05, + "loss": 0.03840701, + "step": 10774 + }, + { + "epoch": 21.55, + "grad_norm": 1.9321891069412231, + "learning_rate": 2e-05, + "loss": 0.03856434, + "step": 10775 + }, + { + "epoch": 21.552, + "grad_norm": 2.3012235164642334, + "learning_rate": 2e-05, + "loss": 0.04851568, + "step": 10776 + }, + { + "epoch": 21.554, + "grad_norm": 0.8743276000022888, + "learning_rate": 2e-05, + "loss": 0.0196914, + "step": 10777 + }, + { + "epoch": 21.556, + "grad_norm": 1.0524275302886963, + "learning_rate": 2e-05, + "loss": 0.038031, + "step": 10778 + }, + { + "epoch": 21.558, + "grad_norm": 1.160980463027954, + "learning_rate": 2e-05, + "loss": 0.04083832, + "step": 10779 + }, + { + "epoch": 21.56, + "grad_norm": 3.0021450519561768, + "learning_rate": 2e-05, + "loss": 0.03252212, + "step": 10780 + }, + { + "epoch": 21.562, + "grad_norm": 2.0682501792907715, + 
"learning_rate": 2e-05, + "loss": 0.03913051, + "step": 10781 + }, + { + "epoch": 21.564, + "grad_norm": 1.703480839729309, + "learning_rate": 2e-05, + "loss": 0.04668419, + "step": 10782 + }, + { + "epoch": 21.566, + "grad_norm": 1.4658693075180054, + "learning_rate": 2e-05, + "loss": 0.03443877, + "step": 10783 + }, + { + "epoch": 21.568, + "grad_norm": 1.1543418169021606, + "learning_rate": 2e-05, + "loss": 0.02747395, + "step": 10784 + }, + { + "epoch": 21.57, + "grad_norm": 3.2979228496551514, + "learning_rate": 2e-05, + "loss": 0.05042007, + "step": 10785 + }, + { + "epoch": 21.572, + "grad_norm": 1.8336671590805054, + "learning_rate": 2e-05, + "loss": 0.02918666, + "step": 10786 + }, + { + "epoch": 21.574, + "grad_norm": 1.3803110122680664, + "learning_rate": 2e-05, + "loss": 0.03612125, + "step": 10787 + }, + { + "epoch": 21.576, + "grad_norm": 1.217238426208496, + "learning_rate": 2e-05, + "loss": 0.03045898, + "step": 10788 + }, + { + "epoch": 21.578, + "grad_norm": 0.8724953532218933, + "learning_rate": 2e-05, + "loss": 0.02683663, + "step": 10789 + }, + { + "epoch": 21.58, + "grad_norm": 1.0224673748016357, + "learning_rate": 2e-05, + "loss": 0.03132909, + "step": 10790 + }, + { + "epoch": 21.582, + "grad_norm": 1.0978913307189941, + "learning_rate": 2e-05, + "loss": 0.03923561, + "step": 10791 + }, + { + "epoch": 21.584, + "grad_norm": 1.336134672164917, + "learning_rate": 2e-05, + "loss": 0.03779566, + "step": 10792 + }, + { + "epoch": 21.586, + "grad_norm": 2.5332863330841064, + "learning_rate": 2e-05, + "loss": 0.04706411, + "step": 10793 + }, + { + "epoch": 21.588, + "grad_norm": 0.9782661199569702, + "learning_rate": 2e-05, + "loss": 0.02816963, + "step": 10794 + }, + { + "epoch": 21.59, + "grad_norm": 0.963988184928894, + "learning_rate": 2e-05, + "loss": 0.03094437, + "step": 10795 + }, + { + "epoch": 21.592, + "grad_norm": 2.6186649799346924, + "learning_rate": 2e-05, + "loss": 0.02780627, + "step": 10796 + }, + { + "epoch": 21.594, + "grad_norm": 1.1740295886993408, + "learning_rate": 2e-05, + "loss": 0.03014258, + "step": 10797 + }, + { + "epoch": 21.596, + "grad_norm": 1.2682572603225708, + "learning_rate": 2e-05, + "loss": 0.03773146, + "step": 10798 + }, + { + "epoch": 21.598, + "grad_norm": 1.0841542482376099, + "learning_rate": 2e-05, + "loss": 0.02988249, + "step": 10799 + }, + { + "epoch": 21.6, + "grad_norm": 1.3508737087249756, + "learning_rate": 2e-05, + "loss": 0.0372295, + "step": 10800 + }, + { + "epoch": 21.602, + "grad_norm": 1.229504942893982, + "learning_rate": 2e-05, + "loss": 0.03721376, + "step": 10801 + }, + { + "epoch": 21.604, + "grad_norm": 1.0140235424041748, + "learning_rate": 2e-05, + "loss": 0.02006028, + "step": 10802 + }, + { + "epoch": 21.606, + "grad_norm": 2.93935489654541, + "learning_rate": 2e-05, + "loss": 0.04623206, + "step": 10803 + }, + { + "epoch": 21.608, + "grad_norm": 1.6117465496063232, + "learning_rate": 2e-05, + "loss": 0.02327195, + "step": 10804 + }, + { + "epoch": 21.61, + "grad_norm": 1.7175722122192383, + "learning_rate": 2e-05, + "loss": 0.03024486, + "step": 10805 + }, + { + "epoch": 21.612, + "grad_norm": 2.5179755687713623, + "learning_rate": 2e-05, + "loss": 0.03515901, + "step": 10806 + }, + { + "epoch": 21.614, + "grad_norm": 1.4241502285003662, + "learning_rate": 2e-05, + "loss": 0.03859766, + "step": 10807 + }, + { + "epoch": 21.616, + "grad_norm": 2.0011253356933594, + "learning_rate": 2e-05, + "loss": 0.02655087, + "step": 10808 + }, + { + "epoch": 21.618, + "grad_norm": 1.9893662929534912, + 
"learning_rate": 2e-05, + "loss": 0.03004796, + "step": 10809 + }, + { + "epoch": 21.62, + "grad_norm": 1.3463841676712036, + "learning_rate": 2e-05, + "loss": 0.02972662, + "step": 10810 + }, + { + "epoch": 21.622, + "grad_norm": 1.1787477731704712, + "learning_rate": 2e-05, + "loss": 0.0288261, + "step": 10811 + }, + { + "epoch": 21.624, + "grad_norm": 1.237833857536316, + "learning_rate": 2e-05, + "loss": 0.03204871, + "step": 10812 + }, + { + "epoch": 21.626, + "grad_norm": 3.5159149169921875, + "learning_rate": 2e-05, + "loss": 0.04790648, + "step": 10813 + }, + { + "epoch": 21.628, + "grad_norm": 0.845460057258606, + "learning_rate": 2e-05, + "loss": 0.02701693, + "step": 10814 + }, + { + "epoch": 21.63, + "grad_norm": 1.7467832565307617, + "learning_rate": 2e-05, + "loss": 0.02598082, + "step": 10815 + }, + { + "epoch": 21.632, + "grad_norm": 2.1213173866271973, + "learning_rate": 2e-05, + "loss": 0.04864348, + "step": 10816 + }, + { + "epoch": 21.634, + "grad_norm": 1.6620383262634277, + "learning_rate": 2e-05, + "loss": 0.05002079, + "step": 10817 + }, + { + "epoch": 21.636, + "grad_norm": 1.2031606435775757, + "learning_rate": 2e-05, + "loss": 0.02948435, + "step": 10818 + }, + { + "epoch": 21.638, + "grad_norm": 3.743027687072754, + "learning_rate": 2e-05, + "loss": 0.05274049, + "step": 10819 + }, + { + "epoch": 21.64, + "grad_norm": 2.317620038986206, + "learning_rate": 2e-05, + "loss": 0.03110951, + "step": 10820 + }, + { + "epoch": 21.642, + "grad_norm": 1.2688599824905396, + "learning_rate": 2e-05, + "loss": 0.03901623, + "step": 10821 + }, + { + "epoch": 21.644, + "grad_norm": 1.1904910802841187, + "learning_rate": 2e-05, + "loss": 0.03599124, + "step": 10822 + }, + { + "epoch": 21.646, + "grad_norm": 1.423413634300232, + "learning_rate": 2e-05, + "loss": 0.04643358, + "step": 10823 + }, + { + "epoch": 21.648, + "grad_norm": 1.1683127880096436, + "learning_rate": 2e-05, + "loss": 0.03427366, + "step": 10824 + }, + { + "epoch": 21.65, + "grad_norm": 0.9204855561256409, + "learning_rate": 2e-05, + "loss": 0.03122223, + "step": 10825 + }, + { + "epoch": 21.652, + "grad_norm": 1.1113882064819336, + "learning_rate": 2e-05, + "loss": 0.0381987, + "step": 10826 + }, + { + "epoch": 21.654, + "grad_norm": 1.2636103630065918, + "learning_rate": 2e-05, + "loss": 0.03268664, + "step": 10827 + }, + { + "epoch": 21.656, + "grad_norm": 0.9222037196159363, + "learning_rate": 2e-05, + "loss": 0.03147195, + "step": 10828 + }, + { + "epoch": 21.658, + "grad_norm": 1.5140608549118042, + "learning_rate": 2e-05, + "loss": 0.03292232, + "step": 10829 + }, + { + "epoch": 21.66, + "grad_norm": 1.063954472541809, + "learning_rate": 2e-05, + "loss": 0.03509793, + "step": 10830 + }, + { + "epoch": 21.662, + "grad_norm": 1.110836148262024, + "learning_rate": 2e-05, + "loss": 0.03393824, + "step": 10831 + }, + { + "epoch": 21.664, + "grad_norm": 0.9153269529342651, + "learning_rate": 2e-05, + "loss": 0.02850673, + "step": 10832 + }, + { + "epoch": 21.666, + "grad_norm": 1.3513257503509521, + "learning_rate": 2e-05, + "loss": 0.03400831, + "step": 10833 + }, + { + "epoch": 21.668, + "grad_norm": 0.828287661075592, + "learning_rate": 2e-05, + "loss": 0.02091564, + "step": 10834 + }, + { + "epoch": 21.67, + "grad_norm": 1.5664379596710205, + "learning_rate": 2e-05, + "loss": 0.03538562, + "step": 10835 + }, + { + "epoch": 21.672, + "grad_norm": 2.088440179824829, + "learning_rate": 2e-05, + "loss": 0.03902008, + "step": 10836 + }, + { + "epoch": 21.674, + "grad_norm": 0.9757646322250366, + 
"learning_rate": 2e-05, + "loss": 0.03331909, + "step": 10837 + }, + { + "epoch": 21.676, + "grad_norm": 1.3544541597366333, + "learning_rate": 2e-05, + "loss": 0.03964958, + "step": 10838 + }, + { + "epoch": 21.678, + "grad_norm": 1.8273234367370605, + "learning_rate": 2e-05, + "loss": 0.03560327, + "step": 10839 + }, + { + "epoch": 21.68, + "grad_norm": 0.8851908445358276, + "learning_rate": 2e-05, + "loss": 0.02724171, + "step": 10840 + }, + { + "epoch": 21.682, + "grad_norm": 1.3723584413528442, + "learning_rate": 2e-05, + "loss": 0.03260377, + "step": 10841 + }, + { + "epoch": 21.684, + "grad_norm": 1.7293577194213867, + "learning_rate": 2e-05, + "loss": 0.04087131, + "step": 10842 + }, + { + "epoch": 21.686, + "grad_norm": 1.4376215934753418, + "learning_rate": 2e-05, + "loss": 0.03094893, + "step": 10843 + }, + { + "epoch": 21.688, + "grad_norm": 2.080545663833618, + "learning_rate": 2e-05, + "loss": 0.04254334, + "step": 10844 + }, + { + "epoch": 21.69, + "grad_norm": 1.221475601196289, + "learning_rate": 2e-05, + "loss": 0.042444, + "step": 10845 + }, + { + "epoch": 21.692, + "grad_norm": 1.0191986560821533, + "learning_rate": 2e-05, + "loss": 0.03012396, + "step": 10846 + }, + { + "epoch": 21.694, + "grad_norm": 1.5199073553085327, + "learning_rate": 2e-05, + "loss": 0.04867528, + "step": 10847 + }, + { + "epoch": 21.696, + "grad_norm": 1.0538051128387451, + "learning_rate": 2e-05, + "loss": 0.02168122, + "step": 10848 + }, + { + "epoch": 21.698, + "grad_norm": 1.2353911399841309, + "learning_rate": 2e-05, + "loss": 0.04300802, + "step": 10849 + }, + { + "epoch": 21.7, + "grad_norm": 1.1052453517913818, + "learning_rate": 2e-05, + "loss": 0.01482047, + "step": 10850 + }, + { + "epoch": 21.701999999999998, + "grad_norm": 1.7022722959518433, + "learning_rate": 2e-05, + "loss": 0.04935133, + "step": 10851 + }, + { + "epoch": 21.704, + "grad_norm": 1.097477674484253, + "learning_rate": 2e-05, + "loss": 0.03660508, + "step": 10852 + }, + { + "epoch": 21.706, + "grad_norm": 2.1557564735412598, + "learning_rate": 2e-05, + "loss": 0.03016577, + "step": 10853 + }, + { + "epoch": 21.708, + "grad_norm": 2.0594379901885986, + "learning_rate": 2e-05, + "loss": 0.02772827, + "step": 10854 + }, + { + "epoch": 21.71, + "grad_norm": 1.2176257371902466, + "learning_rate": 2e-05, + "loss": 0.04331783, + "step": 10855 + }, + { + "epoch": 21.712, + "grad_norm": 1.0477436780929565, + "learning_rate": 2e-05, + "loss": 0.03022527, + "step": 10856 + }, + { + "epoch": 21.714, + "grad_norm": 0.903574526309967, + "learning_rate": 2e-05, + "loss": 0.02697276, + "step": 10857 + }, + { + "epoch": 21.716, + "grad_norm": 1.0498381853103638, + "learning_rate": 2e-05, + "loss": 0.03695409, + "step": 10858 + }, + { + "epoch": 21.718, + "grad_norm": 1.2936769723892212, + "learning_rate": 2e-05, + "loss": 0.02531984, + "step": 10859 + }, + { + "epoch": 21.72, + "grad_norm": 1.3947609663009644, + "learning_rate": 2e-05, + "loss": 0.04383749, + "step": 10860 + }, + { + "epoch": 21.722, + "grad_norm": 1.824804425239563, + "learning_rate": 2e-05, + "loss": 0.05057254, + "step": 10861 + }, + { + "epoch": 21.724, + "grad_norm": 0.9313336610794067, + "learning_rate": 2e-05, + "loss": 0.02528257, + "step": 10862 + }, + { + "epoch": 21.726, + "grad_norm": 1.0199240446090698, + "learning_rate": 2e-05, + "loss": 0.02904977, + "step": 10863 + }, + { + "epoch": 21.728, + "grad_norm": 1.1817435026168823, + "learning_rate": 2e-05, + "loss": 0.04280265, + "step": 10864 + }, + { + "epoch": 21.73, + "grad_norm": 1.3964251279830933, + 
"learning_rate": 2e-05, + "loss": 0.03219439, + "step": 10865 + }, + { + "epoch": 21.732, + "grad_norm": 1.4096508026123047, + "learning_rate": 2e-05, + "loss": 0.03554904, + "step": 10866 + }, + { + "epoch": 21.734, + "grad_norm": 1.8143075704574585, + "learning_rate": 2e-05, + "loss": 0.03615258, + "step": 10867 + }, + { + "epoch": 21.736, + "grad_norm": 0.7444595694541931, + "learning_rate": 2e-05, + "loss": 0.01764659, + "step": 10868 + }, + { + "epoch": 21.738, + "grad_norm": 1.0963743925094604, + "learning_rate": 2e-05, + "loss": 0.03289765, + "step": 10869 + }, + { + "epoch": 21.74, + "grad_norm": 1.7623260021209717, + "learning_rate": 2e-05, + "loss": 0.03723545, + "step": 10870 + }, + { + "epoch": 21.742, + "grad_norm": 1.765259861946106, + "learning_rate": 2e-05, + "loss": 0.03898823, + "step": 10871 + }, + { + "epoch": 21.744, + "grad_norm": 1.039548635482788, + "learning_rate": 2e-05, + "loss": 0.02444161, + "step": 10872 + }, + { + "epoch": 21.746, + "grad_norm": 1.2688932418823242, + "learning_rate": 2e-05, + "loss": 0.0317545, + "step": 10873 + }, + { + "epoch": 21.748, + "grad_norm": 1.3791906833648682, + "learning_rate": 2e-05, + "loss": 0.03312441, + "step": 10874 + }, + { + "epoch": 21.75, + "grad_norm": 0.9484188556671143, + "learning_rate": 2e-05, + "loss": 0.0266212, + "step": 10875 + }, + { + "epoch": 21.752, + "grad_norm": 0.8790581226348877, + "learning_rate": 2e-05, + "loss": 0.0297839, + "step": 10876 + }, + { + "epoch": 21.754, + "grad_norm": 1.943943738937378, + "learning_rate": 2e-05, + "loss": 0.04509932, + "step": 10877 + }, + { + "epoch": 21.756, + "grad_norm": 1.0038764476776123, + "learning_rate": 2e-05, + "loss": 0.03279506, + "step": 10878 + }, + { + "epoch": 21.758, + "grad_norm": 1.6507289409637451, + "learning_rate": 2e-05, + "loss": 0.04125448, + "step": 10879 + }, + { + "epoch": 21.76, + "grad_norm": 1.6259219646453857, + "learning_rate": 2e-05, + "loss": 0.04601936, + "step": 10880 + }, + { + "epoch": 21.762, + "grad_norm": 1.209537386894226, + "learning_rate": 2e-05, + "loss": 0.03919362, + "step": 10881 + }, + { + "epoch": 21.764, + "grad_norm": 1.0259144306182861, + "learning_rate": 2e-05, + "loss": 0.03256726, + "step": 10882 + }, + { + "epoch": 21.766, + "grad_norm": 1.5114238262176514, + "learning_rate": 2e-05, + "loss": 0.04673877, + "step": 10883 + }, + { + "epoch": 21.768, + "grad_norm": 1.0645627975463867, + "learning_rate": 2e-05, + "loss": 0.03577953, + "step": 10884 + }, + { + "epoch": 21.77, + "grad_norm": 2.1890196800231934, + "learning_rate": 2e-05, + "loss": 0.0552833, + "step": 10885 + }, + { + "epoch": 21.772, + "grad_norm": 1.5644853115081787, + "learning_rate": 2e-05, + "loss": 0.05350343, + "step": 10886 + }, + { + "epoch": 21.774, + "grad_norm": 0.8159758448600769, + "learning_rate": 2e-05, + "loss": 0.0221364, + "step": 10887 + }, + { + "epoch": 21.776, + "grad_norm": 1.310706615447998, + "learning_rate": 2e-05, + "loss": 0.02867308, + "step": 10888 + }, + { + "epoch": 21.778, + "grad_norm": 1.3240361213684082, + "learning_rate": 2e-05, + "loss": 0.04336674, + "step": 10889 + }, + { + "epoch": 21.78, + "grad_norm": 1.7382214069366455, + "learning_rate": 2e-05, + "loss": 0.04056187, + "step": 10890 + }, + { + "epoch": 21.782, + "grad_norm": 1.1201266050338745, + "learning_rate": 2e-05, + "loss": 0.0285767, + "step": 10891 + }, + { + "epoch": 21.784, + "grad_norm": 1.0340373516082764, + "learning_rate": 2e-05, + "loss": 0.0394372, + "step": 10892 + }, + { + "epoch": 21.786, + "grad_norm": 1.3631528615951538, + 
"learning_rate": 2e-05, + "loss": 0.04386802, + "step": 10893 + }, + { + "epoch": 21.788, + "grad_norm": 1.7087925672531128, + "learning_rate": 2e-05, + "loss": 0.05740018, + "step": 10894 + }, + { + "epoch": 21.79, + "grad_norm": 1.2946619987487793, + "learning_rate": 2e-05, + "loss": 0.04351215, + "step": 10895 + }, + { + "epoch": 21.792, + "grad_norm": 1.9575139284133911, + "learning_rate": 2e-05, + "loss": 0.04059194, + "step": 10896 + }, + { + "epoch": 21.794, + "grad_norm": 1.6252760887145996, + "learning_rate": 2e-05, + "loss": 0.03451142, + "step": 10897 + }, + { + "epoch": 21.796, + "grad_norm": 1.5331840515136719, + "learning_rate": 2e-05, + "loss": 0.03251875, + "step": 10898 + }, + { + "epoch": 21.798000000000002, + "grad_norm": 1.0864592790603638, + "learning_rate": 2e-05, + "loss": 0.03452892, + "step": 10899 + }, + { + "epoch": 21.8, + "grad_norm": 0.8250343799591064, + "learning_rate": 2e-05, + "loss": 0.02099156, + "step": 10900 + }, + { + "epoch": 21.802, + "grad_norm": 1.166359782218933, + "learning_rate": 2e-05, + "loss": 0.04394403, + "step": 10901 + }, + { + "epoch": 21.804, + "grad_norm": 1.5257127285003662, + "learning_rate": 2e-05, + "loss": 0.03211277, + "step": 10902 + }, + { + "epoch": 21.806, + "grad_norm": 1.0145310163497925, + "learning_rate": 2e-05, + "loss": 0.02621819, + "step": 10903 + }, + { + "epoch": 21.808, + "grad_norm": 2.11439847946167, + "learning_rate": 2e-05, + "loss": 0.04509841, + "step": 10904 + }, + { + "epoch": 21.81, + "grad_norm": 1.0672552585601807, + "learning_rate": 2e-05, + "loss": 0.03072799, + "step": 10905 + }, + { + "epoch": 21.812, + "grad_norm": 1.1568739414215088, + "learning_rate": 2e-05, + "loss": 0.04023495, + "step": 10906 + }, + { + "epoch": 21.814, + "grad_norm": 1.9787969589233398, + "learning_rate": 2e-05, + "loss": 0.04461158, + "step": 10907 + }, + { + "epoch": 21.816, + "grad_norm": 1.268618106842041, + "learning_rate": 2e-05, + "loss": 0.04039661, + "step": 10908 + }, + { + "epoch": 21.818, + "grad_norm": 1.030121088027954, + "learning_rate": 2e-05, + "loss": 0.03256563, + "step": 10909 + }, + { + "epoch": 21.82, + "grad_norm": 1.3051402568817139, + "learning_rate": 2e-05, + "loss": 0.03315262, + "step": 10910 + }, + { + "epoch": 21.822, + "grad_norm": 1.599260687828064, + "learning_rate": 2e-05, + "loss": 0.05178364, + "step": 10911 + }, + { + "epoch": 21.824, + "grad_norm": 1.3491737842559814, + "learning_rate": 2e-05, + "loss": 0.03285315, + "step": 10912 + }, + { + "epoch": 21.826, + "grad_norm": 0.9857081770896912, + "learning_rate": 2e-05, + "loss": 0.02820598, + "step": 10913 + }, + { + "epoch": 21.828, + "grad_norm": 1.2068244218826294, + "learning_rate": 2e-05, + "loss": 0.02220411, + "step": 10914 + }, + { + "epoch": 21.83, + "grad_norm": 1.127752661705017, + "learning_rate": 2e-05, + "loss": 0.03646821, + "step": 10915 + }, + { + "epoch": 21.832, + "grad_norm": 1.1618894338607788, + "learning_rate": 2e-05, + "loss": 0.03601151, + "step": 10916 + }, + { + "epoch": 21.834, + "grad_norm": 2.112398624420166, + "learning_rate": 2e-05, + "loss": 0.03499029, + "step": 10917 + }, + { + "epoch": 21.836, + "grad_norm": 1.555198073387146, + "learning_rate": 2e-05, + "loss": 0.04573829, + "step": 10918 + }, + { + "epoch": 21.838, + "grad_norm": 0.9329383373260498, + "learning_rate": 2e-05, + "loss": 0.03361698, + "step": 10919 + }, + { + "epoch": 21.84, + "grad_norm": 1.7770673036575317, + "learning_rate": 2e-05, + "loss": 0.03091755, + "step": 10920 + }, + { + "epoch": 21.842, + "grad_norm": 1.157621145248413, + 
"learning_rate": 2e-05, + "loss": 0.03168878, + "step": 10921 + }, + { + "epoch": 21.844, + "grad_norm": 1.2026602029800415, + "learning_rate": 2e-05, + "loss": 0.02261108, + "step": 10922 + }, + { + "epoch": 21.846, + "grad_norm": 1.4362621307373047, + "learning_rate": 2e-05, + "loss": 0.02808312, + "step": 10923 + }, + { + "epoch": 21.848, + "grad_norm": 1.054166555404663, + "learning_rate": 2e-05, + "loss": 0.03865466, + "step": 10924 + }, + { + "epoch": 21.85, + "grad_norm": 1.0605528354644775, + "learning_rate": 2e-05, + "loss": 0.03541258, + "step": 10925 + }, + { + "epoch": 21.852, + "grad_norm": 1.4870340824127197, + "learning_rate": 2e-05, + "loss": 0.04259035, + "step": 10926 + }, + { + "epoch": 21.854, + "grad_norm": 1.6365232467651367, + "learning_rate": 2e-05, + "loss": 0.03734278, + "step": 10927 + }, + { + "epoch": 21.856, + "grad_norm": 0.7869289517402649, + "learning_rate": 2e-05, + "loss": 0.02219597, + "step": 10928 + }, + { + "epoch": 21.858, + "grad_norm": 1.1174979209899902, + "learning_rate": 2e-05, + "loss": 0.03317832, + "step": 10929 + }, + { + "epoch": 21.86, + "grad_norm": 1.7156922817230225, + "learning_rate": 2e-05, + "loss": 0.03578085, + "step": 10930 + }, + { + "epoch": 21.862, + "grad_norm": 1.5693000555038452, + "learning_rate": 2e-05, + "loss": 0.03383234, + "step": 10931 + }, + { + "epoch": 21.864, + "grad_norm": 2.124469041824341, + "learning_rate": 2e-05, + "loss": 0.029794, + "step": 10932 + }, + { + "epoch": 21.866, + "grad_norm": 1.151176929473877, + "learning_rate": 2e-05, + "loss": 0.04017279, + "step": 10933 + }, + { + "epoch": 21.868, + "grad_norm": 0.8996794819831848, + "learning_rate": 2e-05, + "loss": 0.02869819, + "step": 10934 + }, + { + "epoch": 21.87, + "grad_norm": 0.9696109294891357, + "learning_rate": 2e-05, + "loss": 0.032364, + "step": 10935 + }, + { + "epoch": 21.872, + "grad_norm": 1.4245364665985107, + "learning_rate": 2e-05, + "loss": 0.03265813, + "step": 10936 + }, + { + "epoch": 21.874, + "grad_norm": 1.5670595169067383, + "learning_rate": 2e-05, + "loss": 0.04186102, + "step": 10937 + }, + { + "epoch": 21.876, + "grad_norm": 1.4556249380111694, + "learning_rate": 2e-05, + "loss": 0.04338999, + "step": 10938 + }, + { + "epoch": 21.878, + "grad_norm": 1.3597520589828491, + "learning_rate": 2e-05, + "loss": 0.03609984, + "step": 10939 + }, + { + "epoch": 21.88, + "grad_norm": 1.7528231143951416, + "learning_rate": 2e-05, + "loss": 0.03558186, + "step": 10940 + }, + { + "epoch": 21.882, + "grad_norm": 1.0914018154144287, + "learning_rate": 2e-05, + "loss": 0.0334373, + "step": 10941 + }, + { + "epoch": 21.884, + "grad_norm": 1.0560919046401978, + "learning_rate": 2e-05, + "loss": 0.02701947, + "step": 10942 + }, + { + "epoch": 21.886, + "grad_norm": 1.0461790561676025, + "learning_rate": 2e-05, + "loss": 0.02477691, + "step": 10943 + }, + { + "epoch": 21.888, + "grad_norm": 1.0176409482955933, + "learning_rate": 2e-05, + "loss": 0.02632477, + "step": 10944 + }, + { + "epoch": 21.89, + "grad_norm": 1.188032627105713, + "learning_rate": 2e-05, + "loss": 0.03136832, + "step": 10945 + }, + { + "epoch": 21.892, + "grad_norm": 1.8192259073257446, + "learning_rate": 2e-05, + "loss": 0.03940712, + "step": 10946 + }, + { + "epoch": 21.894, + "grad_norm": 1.1689777374267578, + "learning_rate": 2e-05, + "loss": 0.04128121, + "step": 10947 + }, + { + "epoch": 21.896, + "grad_norm": 0.9614786505699158, + "learning_rate": 2e-05, + "loss": 0.03097624, + "step": 10948 + }, + { + "epoch": 21.898, + "grad_norm": 2.1277706623077393, + 
"learning_rate": 2e-05, + "loss": 0.04416885, + "step": 10949 + }, + { + "epoch": 21.9, + "grad_norm": 1.0847748517990112, + "learning_rate": 2e-05, + "loss": 0.047647, + "step": 10950 + }, + { + "epoch": 21.902, + "grad_norm": 1.4761464595794678, + "learning_rate": 2e-05, + "loss": 0.03700073, + "step": 10951 + }, + { + "epoch": 21.904, + "grad_norm": 1.335006833076477, + "learning_rate": 2e-05, + "loss": 0.02501204, + "step": 10952 + }, + { + "epoch": 21.906, + "grad_norm": 1.1310573816299438, + "learning_rate": 2e-05, + "loss": 0.03214348, + "step": 10953 + }, + { + "epoch": 21.908, + "grad_norm": 2.758718490600586, + "learning_rate": 2e-05, + "loss": 0.0428556, + "step": 10954 + }, + { + "epoch": 21.91, + "grad_norm": 1.922041654586792, + "learning_rate": 2e-05, + "loss": 0.04771857, + "step": 10955 + }, + { + "epoch": 21.912, + "grad_norm": 1.7065603733062744, + "learning_rate": 2e-05, + "loss": 0.0471417, + "step": 10956 + }, + { + "epoch": 21.914, + "grad_norm": 1.5573147535324097, + "learning_rate": 2e-05, + "loss": 0.03855333, + "step": 10957 + }, + { + "epoch": 21.916, + "grad_norm": 1.1030230522155762, + "learning_rate": 2e-05, + "loss": 0.03686146, + "step": 10958 + }, + { + "epoch": 21.918, + "grad_norm": 1.0365067720413208, + "learning_rate": 2e-05, + "loss": 0.02389233, + "step": 10959 + }, + { + "epoch": 21.92, + "grad_norm": 1.0167555809020996, + "learning_rate": 2e-05, + "loss": 0.03511351, + "step": 10960 + }, + { + "epoch": 21.922, + "grad_norm": 1.0623586177825928, + "learning_rate": 2e-05, + "loss": 0.0339056, + "step": 10961 + }, + { + "epoch": 21.924, + "grad_norm": 2.31209397315979, + "learning_rate": 2e-05, + "loss": 0.04047135, + "step": 10962 + }, + { + "epoch": 21.926, + "grad_norm": 1.1770862340927124, + "learning_rate": 2e-05, + "loss": 0.04121822, + "step": 10963 + }, + { + "epoch": 21.928, + "grad_norm": 0.8528270125389099, + "learning_rate": 2e-05, + "loss": 0.02435702, + "step": 10964 + }, + { + "epoch": 21.93, + "grad_norm": 1.0807077884674072, + "learning_rate": 2e-05, + "loss": 0.03478127, + "step": 10965 + }, + { + "epoch": 21.932, + "grad_norm": 1.1846781969070435, + "learning_rate": 2e-05, + "loss": 0.04354676, + "step": 10966 + }, + { + "epoch": 21.934, + "grad_norm": 1.334719181060791, + "learning_rate": 2e-05, + "loss": 0.02577735, + "step": 10967 + }, + { + "epoch": 21.936, + "grad_norm": 1.0082858800888062, + "learning_rate": 2e-05, + "loss": 0.0299839, + "step": 10968 + }, + { + "epoch": 21.938, + "grad_norm": 1.5717273950576782, + "learning_rate": 2e-05, + "loss": 0.02820227, + "step": 10969 + }, + { + "epoch": 21.94, + "grad_norm": 1.0491491556167603, + "learning_rate": 2e-05, + "loss": 0.02673104, + "step": 10970 + }, + { + "epoch": 21.942, + "grad_norm": 1.0635792016983032, + "learning_rate": 2e-05, + "loss": 0.02951121, + "step": 10971 + }, + { + "epoch": 21.944, + "grad_norm": 1.3966928720474243, + "learning_rate": 2e-05, + "loss": 0.04850317, + "step": 10972 + }, + { + "epoch": 21.946, + "grad_norm": 1.1007291078567505, + "learning_rate": 2e-05, + "loss": 0.02945408, + "step": 10973 + }, + { + "epoch": 21.948, + "grad_norm": 0.8906104564666748, + "learning_rate": 2e-05, + "loss": 0.03067725, + "step": 10974 + }, + { + "epoch": 21.95, + "grad_norm": 1.0863364934921265, + "learning_rate": 2e-05, + "loss": 0.02871832, + "step": 10975 + }, + { + "epoch": 21.951999999999998, + "grad_norm": 1.6591525077819824, + "learning_rate": 2e-05, + "loss": 0.04460109, + "step": 10976 + }, + { + "epoch": 21.954, + "grad_norm": 1.431624174118042, + 
"learning_rate": 2e-05, + "loss": 0.03387917, + "step": 10977 + }, + { + "epoch": 21.956, + "grad_norm": 1.215907335281372, + "learning_rate": 2e-05, + "loss": 0.05533048, + "step": 10978 + }, + { + "epoch": 21.958, + "grad_norm": 1.0936074256896973, + "learning_rate": 2e-05, + "loss": 0.02711234, + "step": 10979 + }, + { + "epoch": 21.96, + "grad_norm": 1.123211145401001, + "learning_rate": 2e-05, + "loss": 0.04708305, + "step": 10980 + }, + { + "epoch": 21.962, + "grad_norm": 1.28691828250885, + "learning_rate": 2e-05, + "loss": 0.03268384, + "step": 10981 + }, + { + "epoch": 21.964, + "grad_norm": 1.013840913772583, + "learning_rate": 2e-05, + "loss": 0.03659943, + "step": 10982 + }, + { + "epoch": 21.966, + "grad_norm": 1.2497045993804932, + "learning_rate": 2e-05, + "loss": 0.05112303, + "step": 10983 + }, + { + "epoch": 21.968, + "grad_norm": 0.9134369492530823, + "learning_rate": 2e-05, + "loss": 0.02654456, + "step": 10984 + }, + { + "epoch": 21.97, + "grad_norm": 1.2738003730773926, + "learning_rate": 2e-05, + "loss": 0.03234825, + "step": 10985 + }, + { + "epoch": 21.972, + "grad_norm": 0.8131256699562073, + "learning_rate": 2e-05, + "loss": 0.0200008, + "step": 10986 + }, + { + "epoch": 21.974, + "grad_norm": 1.4570480585098267, + "learning_rate": 2e-05, + "loss": 0.02533815, + "step": 10987 + }, + { + "epoch": 21.976, + "grad_norm": 0.9933975338935852, + "learning_rate": 2e-05, + "loss": 0.03682224, + "step": 10988 + }, + { + "epoch": 21.978, + "grad_norm": 2.5660200119018555, + "learning_rate": 2e-05, + "loss": 0.04893076, + "step": 10989 + }, + { + "epoch": 21.98, + "grad_norm": 1.247011661529541, + "learning_rate": 2e-05, + "loss": 0.03728838, + "step": 10990 + }, + { + "epoch": 21.982, + "grad_norm": 2.838491201400757, + "learning_rate": 2e-05, + "loss": 0.04673835, + "step": 10991 + }, + { + "epoch": 21.984, + "grad_norm": 1.4233250617980957, + "learning_rate": 2e-05, + "loss": 0.04645887, + "step": 10992 + }, + { + "epoch": 21.986, + "grad_norm": 1.8115314245224, + "learning_rate": 2e-05, + "loss": 0.03267002, + "step": 10993 + }, + { + "epoch": 21.988, + "grad_norm": 0.8721919655799866, + "learning_rate": 2e-05, + "loss": 0.03281524, + "step": 10994 + }, + { + "epoch": 21.99, + "grad_norm": 1.70564866065979, + "learning_rate": 2e-05, + "loss": 0.04003553, + "step": 10995 + }, + { + "epoch": 21.992, + "grad_norm": 0.9615834951400757, + "learning_rate": 2e-05, + "loss": 0.03332902, + "step": 10996 + }, + { + "epoch": 21.994, + "grad_norm": 1.3987194299697876, + "learning_rate": 2e-05, + "loss": 0.03447945, + "step": 10997 + }, + { + "epoch": 21.996, + "grad_norm": 0.9507851600646973, + "learning_rate": 2e-05, + "loss": 0.02853581, + "step": 10998 + }, + { + "epoch": 21.998, + "grad_norm": 0.9905051589012146, + "learning_rate": 2e-05, + "loss": 0.02947031, + "step": 10999 + }, + { + "epoch": 22.0, + "grad_norm": 1.9134247303009033, + "learning_rate": 2e-05, + "loss": 0.03030916, + "step": 11000 + }, + { + "epoch": 22.0, + "eval_performance": { + "AngleClassification_1": 1.0, + "AngleClassification_2": 1.0, + "AngleClassification_3": 0.9760479041916168, + "Equal_1": 0.998, + "Equal_2": 0.9800399201596807, + "Equal_3": 0.9740518962075848, + "LineComparison_1": 1.0, + "LineComparison_2": 1.0, + "LineComparison_3": 0.9960079840319361, + "Parallel_1": 0.9919839679358717, + "Parallel_2": 0.9939879759519038, + "Parallel_3": 0.994, + "Perpendicular_1": 1.0, + "Perpendicular_2": 0.988, + "Perpendicular_3": 0.8296593186372746, + "PointLiesOnCircle_1": 0.9979959919839679, + 
"PointLiesOnCircle_2": 0.9956666666666667, + "PointLiesOnCircle_3": 0.988, + "PointLiesOnLine_1": 0.9979959919839679, + "PointLiesOnLine_2": 0.9939879759519038, + "PointLiesOnLine_3": 0.9800399201596807 + }, + "eval_runtime": 319.6303, + "eval_samples_per_second": 32.85, + "eval_steps_per_second": 0.657, + "step": 11000 + }, + { + "epoch": 22.002, + "grad_norm": 1.5013846158981323, + "learning_rate": 2e-05, + "loss": 0.04865558, + "step": 11001 + }, + { + "epoch": 22.004, + "grad_norm": 0.9657471179962158, + "learning_rate": 2e-05, + "loss": 0.03410444, + "step": 11002 + }, + { + "epoch": 22.006, + "grad_norm": 1.055569052696228, + "learning_rate": 2e-05, + "loss": 0.03405615, + "step": 11003 + }, + { + "epoch": 22.008, + "grad_norm": 1.843996524810791, + "learning_rate": 2e-05, + "loss": 0.05212854, + "step": 11004 + }, + { + "epoch": 22.01, + "grad_norm": 1.2965750694274902, + "learning_rate": 2e-05, + "loss": 0.04010913, + "step": 11005 + }, + { + "epoch": 22.012, + "grad_norm": 0.9425175189971924, + "learning_rate": 2e-05, + "loss": 0.02818494, + "step": 11006 + }, + { + "epoch": 22.014, + "grad_norm": 1.5576436519622803, + "learning_rate": 2e-05, + "loss": 0.05102572, + "step": 11007 + }, + { + "epoch": 22.016, + "grad_norm": 0.9978459477424622, + "learning_rate": 2e-05, + "loss": 0.03394839, + "step": 11008 + }, + { + "epoch": 22.018, + "grad_norm": 1.1211631298065186, + "learning_rate": 2e-05, + "loss": 0.04935118, + "step": 11009 + }, + { + "epoch": 22.02, + "grad_norm": 2.0036747455596924, + "learning_rate": 2e-05, + "loss": 0.04285323, + "step": 11010 + }, + { + "epoch": 22.022, + "grad_norm": 1.1582741737365723, + "learning_rate": 2e-05, + "loss": 0.04079919, + "step": 11011 + }, + { + "epoch": 22.024, + "grad_norm": 1.5606409311294556, + "learning_rate": 2e-05, + "loss": 0.03285683, + "step": 11012 + }, + { + "epoch": 22.026, + "grad_norm": 2.8261466026306152, + "learning_rate": 2e-05, + "loss": 0.04419506, + "step": 11013 + }, + { + "epoch": 22.028, + "grad_norm": 1.250126838684082, + "learning_rate": 2e-05, + "loss": 0.03550693, + "step": 11014 + }, + { + "epoch": 22.03, + "grad_norm": 2.362985372543335, + "learning_rate": 2e-05, + "loss": 0.03660453, + "step": 11015 + }, + { + "epoch": 22.032, + "grad_norm": 1.1419157981872559, + "learning_rate": 2e-05, + "loss": 0.03367894, + "step": 11016 + }, + { + "epoch": 22.034, + "grad_norm": 1.2348449230194092, + "learning_rate": 2e-05, + "loss": 0.03856073, + "step": 11017 + }, + { + "epoch": 22.036, + "grad_norm": 0.8628196120262146, + "learning_rate": 2e-05, + "loss": 0.02549962, + "step": 11018 + }, + { + "epoch": 22.038, + "grad_norm": 1.2292640209197998, + "learning_rate": 2e-05, + "loss": 0.03300656, + "step": 11019 + }, + { + "epoch": 22.04, + "grad_norm": 1.549729824066162, + "learning_rate": 2e-05, + "loss": 0.03245661, + "step": 11020 + }, + { + "epoch": 22.042, + "grad_norm": 1.387235164642334, + "learning_rate": 2e-05, + "loss": 0.03017138, + "step": 11021 + }, + { + "epoch": 22.044, + "grad_norm": 1.231819987297058, + "learning_rate": 2e-05, + "loss": 0.03833802, + "step": 11022 + }, + { + "epoch": 22.046, + "grad_norm": 0.9583532214164734, + "learning_rate": 2e-05, + "loss": 0.0348784, + "step": 11023 + }, + { + "epoch": 22.048, + "grad_norm": 2.0372843742370605, + "learning_rate": 2e-05, + "loss": 0.0497199, + "step": 11024 + }, + { + "epoch": 22.05, + "grad_norm": 1.5443356037139893, + "learning_rate": 2e-05, + "loss": 0.04319365, + "step": 11025 + }, + { + "epoch": 22.052, + "grad_norm": 0.9617211818695068, + 
"learning_rate": 2e-05, + "loss": 0.02863034, + "step": 11026 + }, + { + "epoch": 22.054, + "grad_norm": 1.008378028869629, + "learning_rate": 2e-05, + "loss": 0.03430276, + "step": 11027 + }, + { + "epoch": 22.056, + "grad_norm": 0.8539184927940369, + "learning_rate": 2e-05, + "loss": 0.02441457, + "step": 11028 + }, + { + "epoch": 22.058, + "grad_norm": 0.9425522685050964, + "learning_rate": 2e-05, + "loss": 0.03636635, + "step": 11029 + }, + { + "epoch": 22.06, + "grad_norm": 1.6154694557189941, + "learning_rate": 2e-05, + "loss": 0.03873938, + "step": 11030 + }, + { + "epoch": 22.062, + "grad_norm": 0.9321594834327698, + "learning_rate": 2e-05, + "loss": 0.02615494, + "step": 11031 + }, + { + "epoch": 22.064, + "grad_norm": 1.0639960765838623, + "learning_rate": 2e-05, + "loss": 0.03128612, + "step": 11032 + }, + { + "epoch": 22.066, + "grad_norm": 2.2421159744262695, + "learning_rate": 2e-05, + "loss": 0.04077518, + "step": 11033 + }, + { + "epoch": 22.068, + "grad_norm": 1.059031367301941, + "learning_rate": 2e-05, + "loss": 0.03726078, + "step": 11034 + }, + { + "epoch": 22.07, + "grad_norm": 1.1438366174697876, + "learning_rate": 2e-05, + "loss": 0.03756997, + "step": 11035 + }, + { + "epoch": 22.072, + "grad_norm": 1.1501072645187378, + "learning_rate": 2e-05, + "loss": 0.02969356, + "step": 11036 + }, + { + "epoch": 22.074, + "grad_norm": 1.339308738708496, + "learning_rate": 2e-05, + "loss": 0.02925887, + "step": 11037 + }, + { + "epoch": 22.076, + "grad_norm": 1.7447677850723267, + "learning_rate": 2e-05, + "loss": 0.04287693, + "step": 11038 + }, + { + "epoch": 22.078, + "grad_norm": 1.0596781969070435, + "learning_rate": 2e-05, + "loss": 0.02210912, + "step": 11039 + }, + { + "epoch": 22.08, + "grad_norm": 1.0472910404205322, + "learning_rate": 2e-05, + "loss": 0.03132582, + "step": 11040 + }, + { + "epoch": 22.082, + "grad_norm": 1.045034408569336, + "learning_rate": 2e-05, + "loss": 0.03018015, + "step": 11041 + }, + { + "epoch": 22.084, + "grad_norm": 0.9737959504127502, + "learning_rate": 2e-05, + "loss": 0.0249765, + "step": 11042 + }, + { + "epoch": 22.086, + "grad_norm": 1.3689839839935303, + "learning_rate": 2e-05, + "loss": 0.03519817, + "step": 11043 + }, + { + "epoch": 22.088, + "grad_norm": 1.2619421482086182, + "learning_rate": 2e-05, + "loss": 0.03652804, + "step": 11044 + }, + { + "epoch": 22.09, + "grad_norm": 1.0860421657562256, + "learning_rate": 2e-05, + "loss": 0.02551399, + "step": 11045 + }, + { + "epoch": 22.092, + "grad_norm": 0.9912066459655762, + "learning_rate": 2e-05, + "loss": 0.03179378, + "step": 11046 + }, + { + "epoch": 22.094, + "grad_norm": 1.0879795551300049, + "learning_rate": 2e-05, + "loss": 0.03158774, + "step": 11047 + }, + { + "epoch": 22.096, + "grad_norm": 0.8615609407424927, + "learning_rate": 2e-05, + "loss": 0.02629388, + "step": 11048 + }, + { + "epoch": 22.098, + "grad_norm": 1.7215999364852905, + "learning_rate": 2e-05, + "loss": 0.03771915, + "step": 11049 + }, + { + "epoch": 22.1, + "grad_norm": 0.9128824472427368, + "learning_rate": 2e-05, + "loss": 0.02178533, + "step": 11050 + }, + { + "epoch": 22.102, + "grad_norm": 1.2859452962875366, + "learning_rate": 2e-05, + "loss": 0.04280837, + "step": 11051 + }, + { + "epoch": 22.104, + "grad_norm": 1.0833191871643066, + "learning_rate": 2e-05, + "loss": 0.03400262, + "step": 11052 + }, + { + "epoch": 22.106, + "grad_norm": 0.8177392482757568, + "learning_rate": 2e-05, + "loss": 0.02097586, + "step": 11053 + }, + { + "epoch": 22.108, + "grad_norm": 3.141331195831299, + 
"learning_rate": 2e-05, + "loss": 0.02519385, + "step": 11054 + }, + { + "epoch": 22.11, + "grad_norm": 1.0663577318191528, + "learning_rate": 2e-05, + "loss": 0.03236621, + "step": 11055 + }, + { + "epoch": 22.112, + "grad_norm": 3.876330614089966, + "learning_rate": 2e-05, + "loss": 0.05157751, + "step": 11056 + }, + { + "epoch": 22.114, + "grad_norm": 1.0990570783615112, + "learning_rate": 2e-05, + "loss": 0.03127418, + "step": 11057 + }, + { + "epoch": 22.116, + "grad_norm": 2.1724092960357666, + "learning_rate": 2e-05, + "loss": 0.05966609, + "step": 11058 + }, + { + "epoch": 22.118, + "grad_norm": 2.2459487915039062, + "learning_rate": 2e-05, + "loss": 0.04710309, + "step": 11059 + }, + { + "epoch": 22.12, + "grad_norm": 1.9107338190078735, + "learning_rate": 2e-05, + "loss": 0.04102355, + "step": 11060 + }, + { + "epoch": 22.122, + "grad_norm": 1.542749285697937, + "learning_rate": 2e-05, + "loss": 0.04050046, + "step": 11061 + }, + { + "epoch": 22.124, + "grad_norm": 1.8505321741104126, + "learning_rate": 2e-05, + "loss": 0.04142891, + "step": 11062 + }, + { + "epoch": 22.126, + "grad_norm": 1.4331146478652954, + "learning_rate": 2e-05, + "loss": 0.03895901, + "step": 11063 + }, + { + "epoch": 22.128, + "grad_norm": 1.0269427299499512, + "learning_rate": 2e-05, + "loss": 0.03234866, + "step": 11064 + }, + { + "epoch": 22.13, + "grad_norm": 1.1322760581970215, + "learning_rate": 2e-05, + "loss": 0.03056391, + "step": 11065 + }, + { + "epoch": 22.132, + "grad_norm": 0.8751665949821472, + "learning_rate": 2e-05, + "loss": 0.01894474, + "step": 11066 + }, + { + "epoch": 22.134, + "grad_norm": 1.1677522659301758, + "learning_rate": 2e-05, + "loss": 0.03264951, + "step": 11067 + }, + { + "epoch": 22.136, + "grad_norm": 1.8770594596862793, + "learning_rate": 2e-05, + "loss": 0.04727862, + "step": 11068 + }, + { + "epoch": 22.138, + "grad_norm": 0.9931758642196655, + "learning_rate": 2e-05, + "loss": 0.02625104, + "step": 11069 + }, + { + "epoch": 22.14, + "grad_norm": 1.8161505460739136, + "learning_rate": 2e-05, + "loss": 0.0335126, + "step": 11070 + }, + { + "epoch": 22.142, + "grad_norm": 1.2563273906707764, + "learning_rate": 2e-05, + "loss": 0.03935399, + "step": 11071 + }, + { + "epoch": 22.144, + "grad_norm": 1.6449320316314697, + "learning_rate": 2e-05, + "loss": 0.03845852, + "step": 11072 + }, + { + "epoch": 22.146, + "grad_norm": 1.3035930395126343, + "learning_rate": 2e-05, + "loss": 0.03262253, + "step": 11073 + }, + { + "epoch": 22.148, + "grad_norm": 1.5313411951065063, + "learning_rate": 2e-05, + "loss": 0.03509212, + "step": 11074 + }, + { + "epoch": 22.15, + "grad_norm": 2.219724178314209, + "learning_rate": 2e-05, + "loss": 0.05217472, + "step": 11075 + }, + { + "epoch": 22.152, + "grad_norm": 1.2089931964874268, + "learning_rate": 2e-05, + "loss": 0.03936118, + "step": 11076 + }, + { + "epoch": 22.154, + "grad_norm": 1.1982935667037964, + "learning_rate": 2e-05, + "loss": 0.03750388, + "step": 11077 + }, + { + "epoch": 22.156, + "grad_norm": 1.3776034116744995, + "learning_rate": 2e-05, + "loss": 0.02474811, + "step": 11078 + }, + { + "epoch": 22.158, + "grad_norm": 1.5565482378005981, + "learning_rate": 2e-05, + "loss": 0.04205162, + "step": 11079 + }, + { + "epoch": 22.16, + "grad_norm": 1.5331981182098389, + "learning_rate": 2e-05, + "loss": 0.05201468, + "step": 11080 + }, + { + "epoch": 22.162, + "grad_norm": 1.260038137435913, + "learning_rate": 2e-05, + "loss": 0.03021825, + "step": 11081 + }, + { + "epoch": 22.164, + "grad_norm": 1.128806471824646, + 
"learning_rate": 2e-05, + "loss": 0.03233521, + "step": 11082 + }, + { + "epoch": 22.166, + "grad_norm": 1.0538668632507324, + "learning_rate": 2e-05, + "loss": 0.02831938, + "step": 11083 + }, + { + "epoch": 22.168, + "grad_norm": 1.506290078163147, + "learning_rate": 2e-05, + "loss": 0.03654964, + "step": 11084 + }, + { + "epoch": 22.17, + "grad_norm": 1.4972366094589233, + "learning_rate": 2e-05, + "loss": 0.04231321, + "step": 11085 + }, + { + "epoch": 22.172, + "grad_norm": 1.2762951850891113, + "learning_rate": 2e-05, + "loss": 0.04460282, + "step": 11086 + }, + { + "epoch": 22.174, + "grad_norm": 1.0688859224319458, + "learning_rate": 2e-05, + "loss": 0.03068903, + "step": 11087 + }, + { + "epoch": 22.176, + "grad_norm": 1.194064974784851, + "learning_rate": 2e-05, + "loss": 0.03391276, + "step": 11088 + }, + { + "epoch": 22.178, + "grad_norm": 0.7239233255386353, + "learning_rate": 2e-05, + "loss": 0.01693652, + "step": 11089 + }, + { + "epoch": 22.18, + "grad_norm": 1.0344641208648682, + "learning_rate": 2e-05, + "loss": 0.03364387, + "step": 11090 + }, + { + "epoch": 22.182, + "grad_norm": 1.1779170036315918, + "learning_rate": 2e-05, + "loss": 0.03767635, + "step": 11091 + }, + { + "epoch": 22.184, + "grad_norm": 1.9103333950042725, + "learning_rate": 2e-05, + "loss": 0.04368252, + "step": 11092 + }, + { + "epoch": 22.186, + "grad_norm": 1.6527727842330933, + "learning_rate": 2e-05, + "loss": 0.03112076, + "step": 11093 + }, + { + "epoch": 22.188, + "grad_norm": 0.8650185465812683, + "learning_rate": 2e-05, + "loss": 0.0230502, + "step": 11094 + }, + { + "epoch": 22.19, + "grad_norm": 2.3402581214904785, + "learning_rate": 2e-05, + "loss": 0.05151432, + "step": 11095 + }, + { + "epoch": 22.192, + "grad_norm": 1.087644100189209, + "learning_rate": 2e-05, + "loss": 0.02510383, + "step": 11096 + }, + { + "epoch": 22.194, + "grad_norm": 1.062873125076294, + "learning_rate": 2e-05, + "loss": 0.02815459, + "step": 11097 + }, + { + "epoch": 22.196, + "grad_norm": 1.5720903873443604, + "learning_rate": 2e-05, + "loss": 0.04108073, + "step": 11098 + }, + { + "epoch": 22.198, + "grad_norm": 1.1922218799591064, + "learning_rate": 2e-05, + "loss": 0.03966674, + "step": 11099 + }, + { + "epoch": 22.2, + "grad_norm": 1.7009609937667847, + "learning_rate": 2e-05, + "loss": 0.03648136, + "step": 11100 + }, + { + "epoch": 22.202, + "grad_norm": 1.3058127164840698, + "learning_rate": 2e-05, + "loss": 0.030958, + "step": 11101 + }, + { + "epoch": 22.204, + "grad_norm": 1.2523670196533203, + "learning_rate": 2e-05, + "loss": 0.03727802, + "step": 11102 + }, + { + "epoch": 22.206, + "grad_norm": 2.1313955783843994, + "learning_rate": 2e-05, + "loss": 0.03327603, + "step": 11103 + }, + { + "epoch": 22.208, + "grad_norm": 1.0386605262756348, + "learning_rate": 2e-05, + "loss": 0.03811013, + "step": 11104 + }, + { + "epoch": 22.21, + "grad_norm": 0.9282265305519104, + "learning_rate": 2e-05, + "loss": 0.02790239, + "step": 11105 + }, + { + "epoch": 22.212, + "grad_norm": 1.176169991493225, + "learning_rate": 2e-05, + "loss": 0.03345915, + "step": 11106 + }, + { + "epoch": 22.214, + "grad_norm": 1.745169758796692, + "learning_rate": 2e-05, + "loss": 0.04287583, + "step": 11107 + }, + { + "epoch": 22.216, + "grad_norm": 1.2296968698501587, + "learning_rate": 2e-05, + "loss": 0.04217129, + "step": 11108 + }, + { + "epoch": 22.218, + "grad_norm": 1.1114269495010376, + "learning_rate": 2e-05, + "loss": 0.04333063, + "step": 11109 + }, + { + "epoch": 22.22, + "grad_norm": 1.1733880043029785, + 
"learning_rate": 2e-05, + "loss": 0.03284467, + "step": 11110 + }, + { + "epoch": 22.222, + "grad_norm": 1.0831984281539917, + "learning_rate": 2e-05, + "loss": 0.03438616, + "step": 11111 + }, + { + "epoch": 22.224, + "grad_norm": 1.352116346359253, + "learning_rate": 2e-05, + "loss": 0.04775696, + "step": 11112 + }, + { + "epoch": 22.226, + "grad_norm": 1.0606354475021362, + "learning_rate": 2e-05, + "loss": 0.02460128, + "step": 11113 + }, + { + "epoch": 22.228, + "grad_norm": 1.1735827922821045, + "learning_rate": 2e-05, + "loss": 0.03722972, + "step": 11114 + }, + { + "epoch": 22.23, + "grad_norm": 0.9459177851676941, + "learning_rate": 2e-05, + "loss": 0.03487729, + "step": 11115 + }, + { + "epoch": 22.232, + "grad_norm": 1.0781097412109375, + "learning_rate": 2e-05, + "loss": 0.02943964, + "step": 11116 + }, + { + "epoch": 22.234, + "grad_norm": 1.2076375484466553, + "learning_rate": 2e-05, + "loss": 0.03915054, + "step": 11117 + }, + { + "epoch": 22.236, + "grad_norm": 0.8902519941329956, + "learning_rate": 2e-05, + "loss": 0.02953203, + "step": 11118 + }, + { + "epoch": 22.238, + "grad_norm": 1.423532247543335, + "learning_rate": 2e-05, + "loss": 0.03786097, + "step": 11119 + }, + { + "epoch": 22.24, + "grad_norm": 1.068610668182373, + "learning_rate": 2e-05, + "loss": 0.0284631, + "step": 11120 + }, + { + "epoch": 22.242, + "grad_norm": 1.3024731874465942, + "learning_rate": 2e-05, + "loss": 0.0295128, + "step": 11121 + }, + { + "epoch": 22.244, + "grad_norm": 0.8875964283943176, + "learning_rate": 2e-05, + "loss": 0.02831073, + "step": 11122 + }, + { + "epoch": 22.246, + "grad_norm": 1.04351806640625, + "learning_rate": 2e-05, + "loss": 0.03519507, + "step": 11123 + }, + { + "epoch": 22.248, + "grad_norm": 0.969316303730011, + "learning_rate": 2e-05, + "loss": 0.02787546, + "step": 11124 + }, + { + "epoch": 22.25, + "grad_norm": 1.1152201890945435, + "learning_rate": 2e-05, + "loss": 0.03151672, + "step": 11125 + }, + { + "epoch": 22.252, + "grad_norm": 1.1251158714294434, + "learning_rate": 2e-05, + "loss": 0.03841231, + "step": 11126 + }, + { + "epoch": 22.254, + "grad_norm": 1.6095986366271973, + "learning_rate": 2e-05, + "loss": 0.03633893, + "step": 11127 + }, + { + "epoch": 22.256, + "grad_norm": 1.3410676717758179, + "learning_rate": 2e-05, + "loss": 0.0304535, + "step": 11128 + }, + { + "epoch": 22.258, + "grad_norm": 1.5073579549789429, + "learning_rate": 2e-05, + "loss": 0.04205672, + "step": 11129 + }, + { + "epoch": 22.26, + "grad_norm": 1.2273343801498413, + "learning_rate": 2e-05, + "loss": 0.02800684, + "step": 11130 + }, + { + "epoch": 22.262, + "grad_norm": 1.2454593181610107, + "learning_rate": 2e-05, + "loss": 0.03668667, + "step": 11131 + }, + { + "epoch": 22.264, + "grad_norm": 0.9008411169052124, + "learning_rate": 2e-05, + "loss": 0.02018595, + "step": 11132 + }, + { + "epoch": 22.266, + "grad_norm": 1.4407562017440796, + "learning_rate": 2e-05, + "loss": 0.02953599, + "step": 11133 + }, + { + "epoch": 22.268, + "grad_norm": 1.4390043020248413, + "learning_rate": 2e-05, + "loss": 0.02695351, + "step": 11134 + }, + { + "epoch": 22.27, + "grad_norm": 1.291393756866455, + "learning_rate": 2e-05, + "loss": 0.03049627, + "step": 11135 + }, + { + "epoch": 22.272, + "grad_norm": 1.229018211364746, + "learning_rate": 2e-05, + "loss": 0.03298253, + "step": 11136 + }, + { + "epoch": 22.274, + "grad_norm": 2.3607177734375, + "learning_rate": 2e-05, + "loss": 0.03460927, + "step": 11137 + }, + { + "epoch": 22.276, + "grad_norm": 4.348043918609619, + "learning_rate": 
2e-05, + "loss": 0.04996082, + "step": 11138 + }, + { + "epoch": 22.278, + "grad_norm": 1.7652407884597778, + "learning_rate": 2e-05, + "loss": 0.02721683, + "step": 11139 + }, + { + "epoch": 22.28, + "grad_norm": 1.593738317489624, + "learning_rate": 2e-05, + "loss": 0.03514788, + "step": 11140 + }, + { + "epoch": 22.282, + "grad_norm": 1.2631458044052124, + "learning_rate": 2e-05, + "loss": 0.03659892, + "step": 11141 + }, + { + "epoch": 22.284, + "grad_norm": 1.642995834350586, + "learning_rate": 2e-05, + "loss": 0.03794821, + "step": 11142 + }, + { + "epoch": 22.286, + "grad_norm": 1.9356567859649658, + "learning_rate": 2e-05, + "loss": 0.03757293, + "step": 11143 + }, + { + "epoch": 22.288, + "grad_norm": 0.9092358946800232, + "learning_rate": 2e-05, + "loss": 0.02223909, + "step": 11144 + }, + { + "epoch": 22.29, + "grad_norm": 3.2695510387420654, + "learning_rate": 2e-05, + "loss": 0.03481711, + "step": 11145 + }, + { + "epoch": 22.292, + "grad_norm": 0.9674291610717773, + "learning_rate": 2e-05, + "loss": 0.02906576, + "step": 11146 + }, + { + "epoch": 22.294, + "grad_norm": 6.927521705627441, + "learning_rate": 2e-05, + "loss": 0.05944148, + "step": 11147 + }, + { + "epoch": 22.296, + "grad_norm": 1.8510019779205322, + "learning_rate": 2e-05, + "loss": 0.04159633, + "step": 11148 + }, + { + "epoch": 22.298, + "grad_norm": 1.190011739730835, + "learning_rate": 2e-05, + "loss": 0.03650186, + "step": 11149 + }, + { + "epoch": 22.3, + "grad_norm": 0.9493358731269836, + "learning_rate": 2e-05, + "loss": 0.03242558, + "step": 11150 + }, + { + "epoch": 22.302, + "grad_norm": 3.193427324295044, + "learning_rate": 2e-05, + "loss": 0.04184601, + "step": 11151 + }, + { + "epoch": 22.304, + "grad_norm": 1.372183918952942, + "learning_rate": 2e-05, + "loss": 0.03488796, + "step": 11152 + }, + { + "epoch": 22.306, + "grad_norm": 2.2507684230804443, + "learning_rate": 2e-05, + "loss": 0.04868758, + "step": 11153 + }, + { + "epoch": 22.308, + "grad_norm": 1.2131235599517822, + "learning_rate": 2e-05, + "loss": 0.02608864, + "step": 11154 + }, + { + "epoch": 22.31, + "grad_norm": 1.179132103919983, + "learning_rate": 2e-05, + "loss": 0.03008761, + "step": 11155 + }, + { + "epoch": 22.312, + "grad_norm": 1.2756638526916504, + "learning_rate": 2e-05, + "loss": 0.03774351, + "step": 11156 + }, + { + "epoch": 22.314, + "grad_norm": 1.5968365669250488, + "learning_rate": 2e-05, + "loss": 0.0491349, + "step": 11157 + }, + { + "epoch": 22.316, + "grad_norm": 2.220818519592285, + "learning_rate": 2e-05, + "loss": 0.02767688, + "step": 11158 + }, + { + "epoch": 22.318, + "grad_norm": 0.9472852349281311, + "learning_rate": 2e-05, + "loss": 0.02426755, + "step": 11159 + }, + { + "epoch": 22.32, + "grad_norm": 2.503251552581787, + "learning_rate": 2e-05, + "loss": 0.0384932, + "step": 11160 + }, + { + "epoch": 22.322, + "grad_norm": 3.294299602508545, + "learning_rate": 2e-05, + "loss": 0.06644698, + "step": 11161 + }, + { + "epoch": 22.324, + "grad_norm": 1.2162548303604126, + "learning_rate": 2e-05, + "loss": 0.03608144, + "step": 11162 + }, + { + "epoch": 22.326, + "grad_norm": 1.3543113470077515, + "learning_rate": 2e-05, + "loss": 0.03112835, + "step": 11163 + }, + { + "epoch": 22.328, + "grad_norm": 1.1519101858139038, + "learning_rate": 2e-05, + "loss": 0.02835603, + "step": 11164 + }, + { + "epoch": 22.33, + "grad_norm": 0.905073344707489, + "learning_rate": 2e-05, + "loss": 0.03140076, + "step": 11165 + }, + { + "epoch": 22.332, + "grad_norm": 1.0447617769241333, + "learning_rate": 2e-05, + "loss": 
0.02909181, + "step": 11166 + }, + { + "epoch": 22.334, + "grad_norm": 1.8674046993255615, + "learning_rate": 2e-05, + "loss": 0.0350634, + "step": 11167 + }, + { + "epoch": 22.336, + "grad_norm": 1.658823013305664, + "learning_rate": 2e-05, + "loss": 0.04215661, + "step": 11168 + }, + { + "epoch": 22.338, + "grad_norm": 0.9590745568275452, + "learning_rate": 2e-05, + "loss": 0.03000915, + "step": 11169 + }, + { + "epoch": 22.34, + "grad_norm": 1.0610401630401611, + "learning_rate": 2e-05, + "loss": 0.03618003, + "step": 11170 + }, + { + "epoch": 22.342, + "grad_norm": 1.1848183870315552, + "learning_rate": 2e-05, + "loss": 0.03476301, + "step": 11171 + }, + { + "epoch": 22.344, + "grad_norm": 1.2776687145233154, + "learning_rate": 2e-05, + "loss": 0.03656887, + "step": 11172 + }, + { + "epoch": 22.346, + "grad_norm": 1.9115201234817505, + "learning_rate": 2e-05, + "loss": 0.04781888, + "step": 11173 + }, + { + "epoch": 22.348, + "grad_norm": 2.2884371280670166, + "learning_rate": 2e-05, + "loss": 0.03745548, + "step": 11174 + }, + { + "epoch": 22.35, + "grad_norm": 0.9647842049598694, + "learning_rate": 2e-05, + "loss": 0.02903546, + "step": 11175 + }, + { + "epoch": 22.352, + "grad_norm": 1.4671611785888672, + "learning_rate": 2e-05, + "loss": 0.05149737, + "step": 11176 + }, + { + "epoch": 22.354, + "grad_norm": 1.0586042404174805, + "learning_rate": 2e-05, + "loss": 0.0410578, + "step": 11177 + }, + { + "epoch": 22.356, + "grad_norm": 1.2867225408554077, + "learning_rate": 2e-05, + "loss": 0.04615509, + "step": 11178 + }, + { + "epoch": 22.358, + "grad_norm": 1.0811021327972412, + "learning_rate": 2e-05, + "loss": 0.03351441, + "step": 11179 + }, + { + "epoch": 22.36, + "grad_norm": 2.024754762649536, + "learning_rate": 2e-05, + "loss": 0.04169374, + "step": 11180 + }, + { + "epoch": 22.362, + "grad_norm": 1.5441582202911377, + "learning_rate": 2e-05, + "loss": 0.03294868, + "step": 11181 + }, + { + "epoch": 22.364, + "grad_norm": 1.9213157892227173, + "learning_rate": 2e-05, + "loss": 0.04192325, + "step": 11182 + }, + { + "epoch": 22.366, + "grad_norm": 1.0890470743179321, + "learning_rate": 2e-05, + "loss": 0.03932363, + "step": 11183 + }, + { + "epoch": 22.368, + "grad_norm": 1.6594054698944092, + "learning_rate": 2e-05, + "loss": 0.04021504, + "step": 11184 + }, + { + "epoch": 22.37, + "grad_norm": 1.811708927154541, + "learning_rate": 2e-05, + "loss": 0.05453177, + "step": 11185 + }, + { + "epoch": 22.372, + "grad_norm": 1.0302735567092896, + "learning_rate": 2e-05, + "loss": 0.03242699, + "step": 11186 + }, + { + "epoch": 22.374, + "grad_norm": 0.9932673573493958, + "learning_rate": 2e-05, + "loss": 0.02922393, + "step": 11187 + }, + { + "epoch": 22.376, + "grad_norm": 1.15402352809906, + "learning_rate": 2e-05, + "loss": 0.03236395, + "step": 11188 + }, + { + "epoch": 22.378, + "grad_norm": 0.9961234927177429, + "learning_rate": 2e-05, + "loss": 0.03148089, + "step": 11189 + }, + { + "epoch": 22.38, + "grad_norm": 1.1807105541229248, + "learning_rate": 2e-05, + "loss": 0.03198289, + "step": 11190 + }, + { + "epoch": 22.382, + "grad_norm": 1.0972753763198853, + "learning_rate": 2e-05, + "loss": 0.03932333, + "step": 11191 + }, + { + "epoch": 22.384, + "grad_norm": 1.3407927751541138, + "learning_rate": 2e-05, + "loss": 0.02748748, + "step": 11192 + }, + { + "epoch": 22.386, + "grad_norm": 1.1336009502410889, + "learning_rate": 2e-05, + "loss": 0.03101401, + "step": 11193 + }, + { + "epoch": 22.388, + "grad_norm": 0.8580100536346436, + "learning_rate": 2e-05, + "loss": 
0.01859001, + "step": 11194 + }, + { + "epoch": 22.39, + "grad_norm": 1.9567729234695435, + "learning_rate": 2e-05, + "loss": 0.03903724, + "step": 11195 + }, + { + "epoch": 22.392, + "grad_norm": 1.1536086797714233, + "learning_rate": 2e-05, + "loss": 0.03743483, + "step": 11196 + }, + { + "epoch": 22.394, + "grad_norm": 1.7297052145004272, + "learning_rate": 2e-05, + "loss": 0.04060826, + "step": 11197 + }, + { + "epoch": 22.396, + "grad_norm": 1.1482161283493042, + "learning_rate": 2e-05, + "loss": 0.04031848, + "step": 11198 + }, + { + "epoch": 22.398, + "grad_norm": 1.4597668647766113, + "learning_rate": 2e-05, + "loss": 0.04166424, + "step": 11199 + }, + { + "epoch": 22.4, + "grad_norm": 1.0787426233291626, + "learning_rate": 2e-05, + "loss": 0.0363807, + "step": 11200 + }, + { + "epoch": 22.402, + "grad_norm": 2.6459505558013916, + "learning_rate": 2e-05, + "loss": 0.03180039, + "step": 11201 + }, + { + "epoch": 22.404, + "grad_norm": 1.28476881980896, + "learning_rate": 2e-05, + "loss": 0.03239727, + "step": 11202 + }, + { + "epoch": 22.406, + "grad_norm": 1.4840121269226074, + "learning_rate": 2e-05, + "loss": 0.02138729, + "step": 11203 + }, + { + "epoch": 22.408, + "grad_norm": 1.6134271621704102, + "learning_rate": 2e-05, + "loss": 0.05612726, + "step": 11204 + }, + { + "epoch": 22.41, + "grad_norm": 1.0505448579788208, + "learning_rate": 2e-05, + "loss": 0.02743955, + "step": 11205 + }, + { + "epoch": 22.412, + "grad_norm": 1.2720935344696045, + "learning_rate": 2e-05, + "loss": 0.04470626, + "step": 11206 + }, + { + "epoch": 22.414, + "grad_norm": 1.216604232788086, + "learning_rate": 2e-05, + "loss": 0.03580245, + "step": 11207 + }, + { + "epoch": 22.416, + "grad_norm": 0.9434407949447632, + "learning_rate": 2e-05, + "loss": 0.02070105, + "step": 11208 + }, + { + "epoch": 22.418, + "grad_norm": 1.0345122814178467, + "learning_rate": 2e-05, + "loss": 0.0290225, + "step": 11209 + }, + { + "epoch": 22.42, + "grad_norm": 1.173581600189209, + "learning_rate": 2e-05, + "loss": 0.03839279, + "step": 11210 + }, + { + "epoch": 22.422, + "grad_norm": 1.084951400756836, + "learning_rate": 2e-05, + "loss": 0.03221672, + "step": 11211 + }, + { + "epoch": 22.424, + "grad_norm": 1.0095467567443848, + "learning_rate": 2e-05, + "loss": 0.02846614, + "step": 11212 + }, + { + "epoch": 22.426, + "grad_norm": 1.254839539527893, + "learning_rate": 2e-05, + "loss": 0.03883304, + "step": 11213 + }, + { + "epoch": 22.428, + "grad_norm": 2.123760461807251, + "learning_rate": 2e-05, + "loss": 0.03277697, + "step": 11214 + }, + { + "epoch": 22.43, + "grad_norm": 1.1430293321609497, + "learning_rate": 2e-05, + "loss": 0.02167347, + "step": 11215 + }, + { + "epoch": 22.432, + "grad_norm": 1.242515206336975, + "learning_rate": 2e-05, + "loss": 0.0301038, + "step": 11216 + }, + { + "epoch": 22.434, + "grad_norm": 1.651579737663269, + "learning_rate": 2e-05, + "loss": 0.03766736, + "step": 11217 + }, + { + "epoch": 22.436, + "grad_norm": 0.8291965126991272, + "learning_rate": 2e-05, + "loss": 0.02428487, + "step": 11218 + }, + { + "epoch": 22.438, + "grad_norm": 0.8632979393005371, + "learning_rate": 2e-05, + "loss": 0.0235811, + "step": 11219 + }, + { + "epoch": 22.44, + "grad_norm": 1.366252064704895, + "learning_rate": 2e-05, + "loss": 0.04416512, + "step": 11220 + }, + { + "epoch": 22.442, + "grad_norm": 0.796024739742279, + "learning_rate": 2e-05, + "loss": 0.0193195, + "step": 11221 + }, + { + "epoch": 22.444, + "grad_norm": 1.679572582244873, + "learning_rate": 2e-05, + "loss": 0.04569577, + "step": 
11222 + }, + { + "epoch": 22.446, + "grad_norm": 1.6671518087387085, + "learning_rate": 2e-05, + "loss": 0.03679072, + "step": 11223 + }, + { + "epoch": 22.448, + "grad_norm": 1.1129558086395264, + "learning_rate": 2e-05, + "loss": 0.03054724, + "step": 11224 + }, + { + "epoch": 22.45, + "grad_norm": 5.476849555969238, + "learning_rate": 2e-05, + "loss": 0.03754855, + "step": 11225 + }, + { + "epoch": 22.452, + "grad_norm": 1.0009487867355347, + "learning_rate": 2e-05, + "loss": 0.03189157, + "step": 11226 + }, + { + "epoch": 22.454, + "grad_norm": 1.420681118965149, + "learning_rate": 2e-05, + "loss": 0.0333651, + "step": 11227 + }, + { + "epoch": 22.456, + "grad_norm": 1.3200653791427612, + "learning_rate": 2e-05, + "loss": 0.0283935, + "step": 11228 + }, + { + "epoch": 22.458, + "grad_norm": 2.4781174659729004, + "learning_rate": 2e-05, + "loss": 0.0337758, + "step": 11229 + }, + { + "epoch": 22.46, + "grad_norm": 1.2006012201309204, + "learning_rate": 2e-05, + "loss": 0.04407885, + "step": 11230 + }, + { + "epoch": 22.462, + "grad_norm": 0.8424386978149414, + "learning_rate": 2e-05, + "loss": 0.02036972, + "step": 11231 + }, + { + "epoch": 22.464, + "grad_norm": 1.0086331367492676, + "learning_rate": 2e-05, + "loss": 0.02813908, + "step": 11232 + }, + { + "epoch": 22.466, + "grad_norm": 1.6784151792526245, + "learning_rate": 2e-05, + "loss": 0.05713883, + "step": 11233 + }, + { + "epoch": 22.468, + "grad_norm": 1.6650753021240234, + "learning_rate": 2e-05, + "loss": 0.0437688, + "step": 11234 + }, + { + "epoch": 22.47, + "grad_norm": 1.033752679824829, + "learning_rate": 2e-05, + "loss": 0.02289294, + "step": 11235 + }, + { + "epoch": 22.472, + "grad_norm": 1.9619020223617554, + "learning_rate": 2e-05, + "loss": 0.03555152, + "step": 11236 + }, + { + "epoch": 22.474, + "grad_norm": 0.9467611908912659, + "learning_rate": 2e-05, + "loss": 0.02932581, + "step": 11237 + }, + { + "epoch": 22.476, + "grad_norm": 1.2341922521591187, + "learning_rate": 2e-05, + "loss": 0.0385951, + "step": 11238 + }, + { + "epoch": 22.478, + "grad_norm": 1.1940884590148926, + "learning_rate": 2e-05, + "loss": 0.03371501, + "step": 11239 + }, + { + "epoch": 22.48, + "grad_norm": 1.9683741331100464, + "learning_rate": 2e-05, + "loss": 0.02918104, + "step": 11240 + }, + { + "epoch": 22.482, + "grad_norm": 1.0450471639633179, + "learning_rate": 2e-05, + "loss": 0.02672218, + "step": 11241 + }, + { + "epoch": 22.484, + "grad_norm": 2.0168466567993164, + "learning_rate": 2e-05, + "loss": 0.03515015, + "step": 11242 + }, + { + "epoch": 22.486, + "grad_norm": 1.5114184617996216, + "learning_rate": 2e-05, + "loss": 0.04158841, + "step": 11243 + }, + { + "epoch": 22.488, + "grad_norm": 1.5208250284194946, + "learning_rate": 2e-05, + "loss": 0.05399328, + "step": 11244 + }, + { + "epoch": 22.49, + "grad_norm": 1.0174354314804077, + "learning_rate": 2e-05, + "loss": 0.02212554, + "step": 11245 + }, + { + "epoch": 22.492, + "grad_norm": 1.1009485721588135, + "learning_rate": 2e-05, + "loss": 0.03259212, + "step": 11246 + }, + { + "epoch": 22.494, + "grad_norm": 1.2722481489181519, + "learning_rate": 2e-05, + "loss": 0.03970824, + "step": 11247 + }, + { + "epoch": 22.496, + "grad_norm": 1.4732874631881714, + "learning_rate": 2e-05, + "loss": 0.03239143, + "step": 11248 + }, + { + "epoch": 22.498, + "grad_norm": 1.8055660724639893, + "learning_rate": 2e-05, + "loss": 0.04277326, + "step": 11249 + }, + { + "epoch": 22.5, + "grad_norm": 1.5957167148590088, + "learning_rate": 2e-05, + "loss": 0.04040106, + "step": 11250 + }, + 
{ + "epoch": 22.502, + "grad_norm": 1.0191160440444946, + "learning_rate": 2e-05, + "loss": 0.03819025, + "step": 11251 + }, + { + "epoch": 22.504, + "grad_norm": 0.8751825094223022, + "learning_rate": 2e-05, + "loss": 0.02078329, + "step": 11252 + }, + { + "epoch": 22.506, + "grad_norm": 0.9703699350357056, + "learning_rate": 2e-05, + "loss": 0.02371472, + "step": 11253 + }, + { + "epoch": 22.508, + "grad_norm": 1.4609352350234985, + "learning_rate": 2e-05, + "loss": 0.04331794, + "step": 11254 + }, + { + "epoch": 22.51, + "grad_norm": 0.7537633180618286, + "learning_rate": 2e-05, + "loss": 0.02106013, + "step": 11255 + }, + { + "epoch": 22.512, + "grad_norm": 1.1532191038131714, + "learning_rate": 2e-05, + "loss": 0.02696463, + "step": 11256 + }, + { + "epoch": 22.514, + "grad_norm": 1.2470663785934448, + "learning_rate": 2e-05, + "loss": 0.03676067, + "step": 11257 + }, + { + "epoch": 22.516, + "grad_norm": 1.034314751625061, + "learning_rate": 2e-05, + "loss": 0.03010433, + "step": 11258 + }, + { + "epoch": 22.518, + "grad_norm": 1.0208652019500732, + "learning_rate": 2e-05, + "loss": 0.0345837, + "step": 11259 + }, + { + "epoch": 22.52, + "grad_norm": 1.0119693279266357, + "learning_rate": 2e-05, + "loss": 0.02655143, + "step": 11260 + }, + { + "epoch": 22.522, + "grad_norm": 1.8333549499511719, + "learning_rate": 2e-05, + "loss": 0.04552222, + "step": 11261 + }, + { + "epoch": 22.524, + "grad_norm": 1.5406705141067505, + "learning_rate": 2e-05, + "loss": 0.04645106, + "step": 11262 + }, + { + "epoch": 22.526, + "grad_norm": 0.7743955850601196, + "learning_rate": 2e-05, + "loss": 0.02030542, + "step": 11263 + }, + { + "epoch": 22.528, + "grad_norm": 1.0403064489364624, + "learning_rate": 2e-05, + "loss": 0.02845855, + "step": 11264 + }, + { + "epoch": 22.53, + "grad_norm": 1.3076633214950562, + "learning_rate": 2e-05, + "loss": 0.04241259, + "step": 11265 + }, + { + "epoch": 22.532, + "grad_norm": 1.454092025756836, + "learning_rate": 2e-05, + "loss": 0.04193448, + "step": 11266 + }, + { + "epoch": 22.534, + "grad_norm": 1.0106709003448486, + "learning_rate": 2e-05, + "loss": 0.03059579, + "step": 11267 + }, + { + "epoch": 22.536, + "grad_norm": 1.4397797584533691, + "learning_rate": 2e-05, + "loss": 0.03595772, + "step": 11268 + }, + { + "epoch": 22.538, + "grad_norm": 1.0913630723953247, + "learning_rate": 2e-05, + "loss": 0.02504985, + "step": 11269 + }, + { + "epoch": 22.54, + "grad_norm": 1.1001238822937012, + "learning_rate": 2e-05, + "loss": 0.03245225, + "step": 11270 + }, + { + "epoch": 22.542, + "grad_norm": 1.6088941097259521, + "learning_rate": 2e-05, + "loss": 0.05301979, + "step": 11271 + }, + { + "epoch": 22.544, + "grad_norm": 1.3868272304534912, + "learning_rate": 2e-05, + "loss": 0.04549325, + "step": 11272 + }, + { + "epoch": 22.546, + "grad_norm": 1.3786357641220093, + "learning_rate": 2e-05, + "loss": 0.03595918, + "step": 11273 + }, + { + "epoch": 22.548000000000002, + "grad_norm": 0.9569694995880127, + "learning_rate": 2e-05, + "loss": 0.03288563, + "step": 11274 + }, + { + "epoch": 22.55, + "grad_norm": 1.6313834190368652, + "learning_rate": 2e-05, + "loss": 0.0272957, + "step": 11275 + }, + { + "epoch": 22.552, + "grad_norm": 1.3030887842178345, + "learning_rate": 2e-05, + "loss": 0.03881227, + "step": 11276 + }, + { + "epoch": 22.554, + "grad_norm": 1.4454890489578247, + "learning_rate": 2e-05, + "loss": 0.03911527, + "step": 11277 + }, + { + "epoch": 22.556, + "grad_norm": 1.0935059785842896, + "learning_rate": 2e-05, + "loss": 0.02929432, + "step": 11278 + 
}, + { + "epoch": 22.558, + "grad_norm": 1.0467413663864136, + "learning_rate": 2e-05, + "loss": 0.03879336, + "step": 11279 + }, + { + "epoch": 22.56, + "grad_norm": 1.3743126392364502, + "learning_rate": 2e-05, + "loss": 0.03689353, + "step": 11280 + }, + { + "epoch": 22.562, + "grad_norm": 0.9728202819824219, + "learning_rate": 2e-05, + "loss": 0.02886998, + "step": 11281 + }, + { + "epoch": 22.564, + "grad_norm": 1.2006583213806152, + "learning_rate": 2e-05, + "loss": 0.03440745, + "step": 11282 + }, + { + "epoch": 22.566, + "grad_norm": 1.1317431926727295, + "learning_rate": 2e-05, + "loss": 0.03354093, + "step": 11283 + }, + { + "epoch": 22.568, + "grad_norm": 1.0374213457107544, + "learning_rate": 2e-05, + "loss": 0.0318547, + "step": 11284 + }, + { + "epoch": 22.57, + "grad_norm": 1.3933595418930054, + "learning_rate": 2e-05, + "loss": 0.03949068, + "step": 11285 + }, + { + "epoch": 22.572, + "grad_norm": 1.9650115966796875, + "learning_rate": 2e-05, + "loss": 0.0620589, + "step": 11286 + }, + { + "epoch": 22.574, + "grad_norm": 1.1582067012786865, + "learning_rate": 2e-05, + "loss": 0.03287086, + "step": 11287 + }, + { + "epoch": 22.576, + "grad_norm": 1.4262588024139404, + "learning_rate": 2e-05, + "loss": 0.0474546, + "step": 11288 + }, + { + "epoch": 22.578, + "grad_norm": 1.8528952598571777, + "learning_rate": 2e-05, + "loss": 0.03344478, + "step": 11289 + }, + { + "epoch": 22.58, + "grad_norm": 2.4040780067443848, + "learning_rate": 2e-05, + "loss": 0.04099388, + "step": 11290 + }, + { + "epoch": 22.582, + "grad_norm": 1.1463083028793335, + "learning_rate": 2e-05, + "loss": 0.02880567, + "step": 11291 + }, + { + "epoch": 22.584, + "grad_norm": 1.1943498849868774, + "learning_rate": 2e-05, + "loss": 0.03802372, + "step": 11292 + }, + { + "epoch": 22.586, + "grad_norm": 0.8871561288833618, + "learning_rate": 2e-05, + "loss": 0.02874197, + "step": 11293 + }, + { + "epoch": 22.588, + "grad_norm": 1.1465365886688232, + "learning_rate": 2e-05, + "loss": 0.0340536, + "step": 11294 + }, + { + "epoch": 22.59, + "grad_norm": 0.9088796973228455, + "learning_rate": 2e-05, + "loss": 0.0175512, + "step": 11295 + }, + { + "epoch": 22.592, + "grad_norm": 1.8269764184951782, + "learning_rate": 2e-05, + "loss": 0.04000066, + "step": 11296 + }, + { + "epoch": 22.594, + "grad_norm": 0.9784151315689087, + "learning_rate": 2e-05, + "loss": 0.02507006, + "step": 11297 + }, + { + "epoch": 22.596, + "grad_norm": 1.2063928842544556, + "learning_rate": 2e-05, + "loss": 0.04113289, + "step": 11298 + }, + { + "epoch": 22.598, + "grad_norm": 1.7086107730865479, + "learning_rate": 2e-05, + "loss": 0.03416719, + "step": 11299 + }, + { + "epoch": 22.6, + "grad_norm": 0.8789715766906738, + "learning_rate": 2e-05, + "loss": 0.02553638, + "step": 11300 + }, + { + "epoch": 22.602, + "grad_norm": 1.5890235900878906, + "learning_rate": 2e-05, + "loss": 0.03483776, + "step": 11301 + }, + { + "epoch": 22.604, + "grad_norm": 1.4625310897827148, + "learning_rate": 2e-05, + "loss": 0.04801542, + "step": 11302 + }, + { + "epoch": 22.606, + "grad_norm": 1.0777349472045898, + "learning_rate": 2e-05, + "loss": 0.03591266, + "step": 11303 + }, + { + "epoch": 22.608, + "grad_norm": 0.9330995082855225, + "learning_rate": 2e-05, + "loss": 0.02881388, + "step": 11304 + }, + { + "epoch": 22.61, + "grad_norm": 1.3750163316726685, + "learning_rate": 2e-05, + "loss": 0.03394581, + "step": 11305 + }, + { + "epoch": 22.612, + "grad_norm": 1.35263192653656, + "learning_rate": 2e-05, + "loss": 0.03068662, + "step": 11306 + }, + { + 
"epoch": 22.614, + "grad_norm": 1.291716456413269, + "learning_rate": 2e-05, + "loss": 0.0435909, + "step": 11307 + }, + { + "epoch": 22.616, + "grad_norm": 1.3781776428222656, + "learning_rate": 2e-05, + "loss": 0.03991321, + "step": 11308 + }, + { + "epoch": 22.618, + "grad_norm": 1.09850013256073, + "learning_rate": 2e-05, + "loss": 0.04068404, + "step": 11309 + }, + { + "epoch": 22.62, + "grad_norm": 1.2705893516540527, + "learning_rate": 2e-05, + "loss": 0.04384391, + "step": 11310 + }, + { + "epoch": 22.622, + "grad_norm": 1.6616660356521606, + "learning_rate": 2e-05, + "loss": 0.02786833, + "step": 11311 + }, + { + "epoch": 22.624, + "grad_norm": 1.111756443977356, + "learning_rate": 2e-05, + "loss": 0.02686176, + "step": 11312 + }, + { + "epoch": 22.626, + "grad_norm": 1.2425123453140259, + "learning_rate": 2e-05, + "loss": 0.04210493, + "step": 11313 + }, + { + "epoch": 22.628, + "grad_norm": 1.0276856422424316, + "learning_rate": 2e-05, + "loss": 0.03203695, + "step": 11314 + }, + { + "epoch": 22.63, + "grad_norm": 1.0379737615585327, + "learning_rate": 2e-05, + "loss": 0.03296281, + "step": 11315 + }, + { + "epoch": 22.632, + "grad_norm": 0.9909905195236206, + "learning_rate": 2e-05, + "loss": 0.02666735, + "step": 11316 + }, + { + "epoch": 22.634, + "grad_norm": 1.0876964330673218, + "learning_rate": 2e-05, + "loss": 0.03161968, + "step": 11317 + }, + { + "epoch": 22.636, + "grad_norm": 1.085287094116211, + "learning_rate": 2e-05, + "loss": 0.02732705, + "step": 11318 + }, + { + "epoch": 22.638, + "grad_norm": 2.1646599769592285, + "learning_rate": 2e-05, + "loss": 0.02609478, + "step": 11319 + }, + { + "epoch": 22.64, + "grad_norm": 0.967041552066803, + "learning_rate": 2e-05, + "loss": 0.02320741, + "step": 11320 + }, + { + "epoch": 22.642, + "grad_norm": 1.5888314247131348, + "learning_rate": 2e-05, + "loss": 0.05063655, + "step": 11321 + }, + { + "epoch": 22.644, + "grad_norm": 0.9226511120796204, + "learning_rate": 2e-05, + "loss": 0.03058358, + "step": 11322 + }, + { + "epoch": 22.646, + "grad_norm": 1.701810598373413, + "learning_rate": 2e-05, + "loss": 0.02915188, + "step": 11323 + }, + { + "epoch": 22.648, + "grad_norm": 1.497314453125, + "learning_rate": 2e-05, + "loss": 0.04008635, + "step": 11324 + }, + { + "epoch": 22.65, + "grad_norm": 1.3982007503509521, + "learning_rate": 2e-05, + "loss": 0.026863, + "step": 11325 + }, + { + "epoch": 22.652, + "grad_norm": 1.4211523532867432, + "learning_rate": 2e-05, + "loss": 0.04486757, + "step": 11326 + }, + { + "epoch": 22.654, + "grad_norm": 1.1829328536987305, + "learning_rate": 2e-05, + "loss": 0.02879838, + "step": 11327 + }, + { + "epoch": 22.656, + "grad_norm": 1.7771282196044922, + "learning_rate": 2e-05, + "loss": 0.03814802, + "step": 11328 + }, + { + "epoch": 22.658, + "grad_norm": 2.1755659580230713, + "learning_rate": 2e-05, + "loss": 0.03225455, + "step": 11329 + }, + { + "epoch": 22.66, + "grad_norm": 1.3810690641403198, + "learning_rate": 2e-05, + "loss": 0.03953212, + "step": 11330 + }, + { + "epoch": 22.662, + "grad_norm": 1.7033722400665283, + "learning_rate": 2e-05, + "loss": 0.04444302, + "step": 11331 + }, + { + "epoch": 22.664, + "grad_norm": 2.3877339363098145, + "learning_rate": 2e-05, + "loss": 0.04218495, + "step": 11332 + }, + { + "epoch": 22.666, + "grad_norm": 1.8948878049850464, + "learning_rate": 2e-05, + "loss": 0.05076663, + "step": 11333 + }, + { + "epoch": 22.668, + "grad_norm": 1.181063175201416, + "learning_rate": 2e-05, + "loss": 0.02133227, + "step": 11334 + }, + { + "epoch": 22.67, + 
"grad_norm": 2.0228304862976074, + "learning_rate": 2e-05, + "loss": 0.04120112, + "step": 11335 + }, + { + "epoch": 22.672, + "grad_norm": 1.0375754833221436, + "learning_rate": 2e-05, + "loss": 0.03073654, + "step": 11336 + }, + { + "epoch": 22.674, + "grad_norm": 2.258004665374756, + "learning_rate": 2e-05, + "loss": 0.05255279, + "step": 11337 + }, + { + "epoch": 22.676, + "grad_norm": 1.752378225326538, + "learning_rate": 2e-05, + "loss": 0.05190774, + "step": 11338 + }, + { + "epoch": 22.678, + "grad_norm": 2.363682746887207, + "learning_rate": 2e-05, + "loss": 0.04572538, + "step": 11339 + }, + { + "epoch": 22.68, + "grad_norm": 1.088084101676941, + "learning_rate": 2e-05, + "loss": 0.03764492, + "step": 11340 + }, + { + "epoch": 22.682, + "grad_norm": 0.8578435182571411, + "learning_rate": 2e-05, + "loss": 0.02391421, + "step": 11341 + }, + { + "epoch": 22.684, + "grad_norm": 1.1827582120895386, + "learning_rate": 2e-05, + "loss": 0.03264995, + "step": 11342 + }, + { + "epoch": 22.686, + "grad_norm": 1.3935420513153076, + "learning_rate": 2e-05, + "loss": 0.03195801, + "step": 11343 + }, + { + "epoch": 22.688, + "grad_norm": 0.9729677438735962, + "learning_rate": 2e-05, + "loss": 0.02463444, + "step": 11344 + }, + { + "epoch": 22.69, + "grad_norm": 3.6322379112243652, + "learning_rate": 2e-05, + "loss": 0.04378824, + "step": 11345 + }, + { + "epoch": 22.692, + "grad_norm": 1.002524971961975, + "learning_rate": 2e-05, + "loss": 0.02371977, + "step": 11346 + }, + { + "epoch": 22.694, + "grad_norm": 1.1274267435073853, + "learning_rate": 2e-05, + "loss": 0.02771427, + "step": 11347 + }, + { + "epoch": 22.696, + "grad_norm": 0.9226782321929932, + "learning_rate": 2e-05, + "loss": 0.0263548, + "step": 11348 + }, + { + "epoch": 22.698, + "grad_norm": 1.6948686838150024, + "learning_rate": 2e-05, + "loss": 0.03164578, + "step": 11349 + }, + { + "epoch": 22.7, + "grad_norm": 1.1010253429412842, + "learning_rate": 2e-05, + "loss": 0.03652514, + "step": 11350 + }, + { + "epoch": 22.701999999999998, + "grad_norm": 1.2103954553604126, + "learning_rate": 2e-05, + "loss": 0.03659312, + "step": 11351 + }, + { + "epoch": 22.704, + "grad_norm": 1.1126821041107178, + "learning_rate": 2e-05, + "loss": 0.02752934, + "step": 11352 + }, + { + "epoch": 22.706, + "grad_norm": 0.9469572305679321, + "learning_rate": 2e-05, + "loss": 0.02585907, + "step": 11353 + }, + { + "epoch": 22.708, + "grad_norm": 1.8642154932022095, + "learning_rate": 2e-05, + "loss": 0.04325049, + "step": 11354 + }, + { + "epoch": 22.71, + "grad_norm": 1.1645115613937378, + "learning_rate": 2e-05, + "loss": 0.04005027, + "step": 11355 + }, + { + "epoch": 22.712, + "grad_norm": 1.520301342010498, + "learning_rate": 2e-05, + "loss": 0.0399612, + "step": 11356 + }, + { + "epoch": 22.714, + "grad_norm": 1.235013723373413, + "learning_rate": 2e-05, + "loss": 0.03438204, + "step": 11357 + }, + { + "epoch": 22.716, + "grad_norm": 0.9834000468254089, + "learning_rate": 2e-05, + "loss": 0.02958825, + "step": 11358 + }, + { + "epoch": 22.718, + "grad_norm": 0.9529434442520142, + "learning_rate": 2e-05, + "loss": 0.02942239, + "step": 11359 + }, + { + "epoch": 22.72, + "grad_norm": 1.0147749185562134, + "learning_rate": 2e-05, + "loss": 0.02894795, + "step": 11360 + }, + { + "epoch": 22.722, + "grad_norm": 2.125749111175537, + "learning_rate": 2e-05, + "loss": 0.02771318, + "step": 11361 + }, + { + "epoch": 22.724, + "grad_norm": 1.1008570194244385, + "learning_rate": 2e-05, + "loss": 0.0344195, + "step": 11362 + }, + { + "epoch": 22.726, + 
"grad_norm": 2.6095540523529053, + "learning_rate": 2e-05, + "loss": 0.04947619, + "step": 11363 + }, + { + "epoch": 22.728, + "grad_norm": 1.228231430053711, + "learning_rate": 2e-05, + "loss": 0.03951849, + "step": 11364 + }, + { + "epoch": 22.73, + "grad_norm": 3.1176652908325195, + "learning_rate": 2e-05, + "loss": 0.03356297, + "step": 11365 + }, + { + "epoch": 22.732, + "grad_norm": 1.1963483095169067, + "learning_rate": 2e-05, + "loss": 0.02985599, + "step": 11366 + }, + { + "epoch": 22.734, + "grad_norm": 1.533469319343567, + "learning_rate": 2e-05, + "loss": 0.03753369, + "step": 11367 + }, + { + "epoch": 22.736, + "grad_norm": 0.9125386476516724, + "learning_rate": 2e-05, + "loss": 0.03140819, + "step": 11368 + }, + { + "epoch": 22.738, + "grad_norm": 1.361525058746338, + "learning_rate": 2e-05, + "loss": 0.03976889, + "step": 11369 + }, + { + "epoch": 22.74, + "grad_norm": 1.0508217811584473, + "learning_rate": 2e-05, + "loss": 0.03463693, + "step": 11370 + }, + { + "epoch": 22.742, + "grad_norm": 1.6200356483459473, + "learning_rate": 2e-05, + "loss": 0.04815769, + "step": 11371 + }, + { + "epoch": 22.744, + "grad_norm": 1.0508980751037598, + "learning_rate": 2e-05, + "loss": 0.04034566, + "step": 11372 + }, + { + "epoch": 22.746, + "grad_norm": 1.024295687675476, + "learning_rate": 2e-05, + "loss": 0.03280937, + "step": 11373 + }, + { + "epoch": 22.748, + "grad_norm": 1.413459062576294, + "learning_rate": 2e-05, + "loss": 0.04194808, + "step": 11374 + }, + { + "epoch": 22.75, + "grad_norm": 1.0012433528900146, + "learning_rate": 2e-05, + "loss": 0.02298179, + "step": 11375 + }, + { + "epoch": 22.752, + "grad_norm": 2.7343661785125732, + "learning_rate": 2e-05, + "loss": 0.04880745, + "step": 11376 + }, + { + "epoch": 22.754, + "grad_norm": 1.2125567197799683, + "learning_rate": 2e-05, + "loss": 0.0418051, + "step": 11377 + }, + { + "epoch": 22.756, + "grad_norm": 1.162410855293274, + "learning_rate": 2e-05, + "loss": 0.02900402, + "step": 11378 + }, + { + "epoch": 22.758, + "grad_norm": 0.9885851740837097, + "learning_rate": 2e-05, + "loss": 0.02954095, + "step": 11379 + }, + { + "epoch": 22.76, + "grad_norm": 1.2678016424179077, + "learning_rate": 2e-05, + "loss": 0.04300134, + "step": 11380 + }, + { + "epoch": 22.762, + "grad_norm": 1.4912469387054443, + "learning_rate": 2e-05, + "loss": 0.03353298, + "step": 11381 + }, + { + "epoch": 22.764, + "grad_norm": 1.0253992080688477, + "learning_rate": 2e-05, + "loss": 0.02656168, + "step": 11382 + }, + { + "epoch": 22.766, + "grad_norm": 0.8745793104171753, + "learning_rate": 2e-05, + "loss": 0.02277788, + "step": 11383 + }, + { + "epoch": 22.768, + "grad_norm": 1.3656665086746216, + "learning_rate": 2e-05, + "loss": 0.03005461, + "step": 11384 + }, + { + "epoch": 22.77, + "grad_norm": 1.9831993579864502, + "learning_rate": 2e-05, + "loss": 0.04527285, + "step": 11385 + }, + { + "epoch": 22.772, + "grad_norm": 1.8033963441848755, + "learning_rate": 2e-05, + "loss": 0.05182678, + "step": 11386 + }, + { + "epoch": 22.774, + "grad_norm": 1.5675684213638306, + "learning_rate": 2e-05, + "loss": 0.03885451, + "step": 11387 + }, + { + "epoch": 22.776, + "grad_norm": 2.476839065551758, + "learning_rate": 2e-05, + "loss": 0.05386474, + "step": 11388 + }, + { + "epoch": 22.778, + "grad_norm": 0.9441741108894348, + "learning_rate": 2e-05, + "loss": 0.02855249, + "step": 11389 + }, + { + "epoch": 22.78, + "grad_norm": 1.4872889518737793, + "learning_rate": 2e-05, + "loss": 0.04495686, + "step": 11390 + }, + { + "epoch": 22.782, + "grad_norm": 
2.2526416778564453, + "learning_rate": 2e-05, + "loss": 0.04384479, + "step": 11391 + }, + { + "epoch": 22.784, + "grad_norm": 1.2508896589279175, + "learning_rate": 2e-05, + "loss": 0.03676173, + "step": 11392 + }, + { + "epoch": 22.786, + "grad_norm": 1.5125991106033325, + "learning_rate": 2e-05, + "loss": 0.04031794, + "step": 11393 + }, + { + "epoch": 22.788, + "grad_norm": 0.8906210660934448, + "learning_rate": 2e-05, + "loss": 0.02660401, + "step": 11394 + }, + { + "epoch": 22.79, + "grad_norm": 3.060696601867676, + "learning_rate": 2e-05, + "loss": 0.04931217, + "step": 11395 + }, + { + "epoch": 22.792, + "grad_norm": 1.2167766094207764, + "learning_rate": 2e-05, + "loss": 0.0397851, + "step": 11396 + }, + { + "epoch": 22.794, + "grad_norm": 1.559085726737976, + "learning_rate": 2e-05, + "loss": 0.04445662, + "step": 11397 + }, + { + "epoch": 22.796, + "grad_norm": 2.269927978515625, + "learning_rate": 2e-05, + "loss": 0.05920038, + "step": 11398 + }, + { + "epoch": 22.798000000000002, + "grad_norm": 1.1354275941848755, + "learning_rate": 2e-05, + "loss": 0.03149124, + "step": 11399 + }, + { + "epoch": 22.8, + "grad_norm": 1.1973793506622314, + "learning_rate": 2e-05, + "loss": 0.03659835, + "step": 11400 + }, + { + "epoch": 22.802, + "grad_norm": 1.178956151008606, + "learning_rate": 2e-05, + "loss": 0.03625498, + "step": 11401 + }, + { + "epoch": 22.804, + "grad_norm": 0.8932549357414246, + "learning_rate": 2e-05, + "loss": 0.02956768, + "step": 11402 + }, + { + "epoch": 22.806, + "grad_norm": 1.0559618473052979, + "learning_rate": 2e-05, + "loss": 0.03430596, + "step": 11403 + }, + { + "epoch": 22.808, + "grad_norm": 2.337083578109741, + "learning_rate": 2e-05, + "loss": 0.04307479, + "step": 11404 + }, + { + "epoch": 22.81, + "grad_norm": 1.271933913230896, + "learning_rate": 2e-05, + "loss": 0.03103825, + "step": 11405 + }, + { + "epoch": 22.812, + "grad_norm": 1.2939378023147583, + "learning_rate": 2e-05, + "loss": 0.04132862, + "step": 11406 + }, + { + "epoch": 22.814, + "grad_norm": 1.2300240993499756, + "learning_rate": 2e-05, + "loss": 0.03786105, + "step": 11407 + }, + { + "epoch": 22.816, + "grad_norm": 1.1687867641448975, + "learning_rate": 2e-05, + "loss": 0.04594323, + "step": 11408 + }, + { + "epoch": 22.818, + "grad_norm": 1.2083779573440552, + "learning_rate": 2e-05, + "loss": 0.04165336, + "step": 11409 + }, + { + "epoch": 22.82, + "grad_norm": 2.3479809761047363, + "learning_rate": 2e-05, + "loss": 0.0412714, + "step": 11410 + }, + { + "epoch": 22.822, + "grad_norm": 1.5195822715759277, + "learning_rate": 2e-05, + "loss": 0.02944362, + "step": 11411 + }, + { + "epoch": 22.824, + "grad_norm": 1.2599931955337524, + "learning_rate": 2e-05, + "loss": 0.039657, + "step": 11412 + }, + { + "epoch": 22.826, + "grad_norm": 1.5169256925582886, + "learning_rate": 2e-05, + "loss": 0.0313296, + "step": 11413 + }, + { + "epoch": 22.828, + "grad_norm": 1.3896204233169556, + "learning_rate": 2e-05, + "loss": 0.041568, + "step": 11414 + }, + { + "epoch": 22.83, + "grad_norm": 1.3151553869247437, + "learning_rate": 2e-05, + "loss": 0.03976612, + "step": 11415 + }, + { + "epoch": 22.832, + "grad_norm": 1.2248992919921875, + "learning_rate": 2e-05, + "loss": 0.032742, + "step": 11416 + }, + { + "epoch": 22.834, + "grad_norm": 1.6058671474456787, + "learning_rate": 2e-05, + "loss": 0.04475835, + "step": 11417 + }, + { + "epoch": 22.836, + "grad_norm": 1.131249189376831, + "learning_rate": 2e-05, + "loss": 0.03879447, + "step": 11418 + }, + { + "epoch": 22.838, + "grad_norm": 
1.2209913730621338, + "learning_rate": 2e-05, + "loss": 0.03018184, + "step": 11419 + }, + { + "epoch": 22.84, + "grad_norm": 1.2565336227416992, + "learning_rate": 2e-05, + "loss": 0.03886155, + "step": 11420 + }, + { + "epoch": 22.842, + "grad_norm": 1.2150452136993408, + "learning_rate": 2e-05, + "loss": 0.03672185, + "step": 11421 + }, + { + "epoch": 22.844, + "grad_norm": 1.4771347045898438, + "learning_rate": 2e-05, + "loss": 0.02761836, + "step": 11422 + }, + { + "epoch": 22.846, + "grad_norm": 2.192431926727295, + "learning_rate": 2e-05, + "loss": 0.0543457, + "step": 11423 + }, + { + "epoch": 22.848, + "grad_norm": 1.1383287906646729, + "learning_rate": 2e-05, + "loss": 0.03061396, + "step": 11424 + }, + { + "epoch": 22.85, + "grad_norm": 2.1792497634887695, + "learning_rate": 2e-05, + "loss": 0.03490089, + "step": 11425 + }, + { + "epoch": 22.852, + "grad_norm": 1.2106961011886597, + "learning_rate": 2e-05, + "loss": 0.03852124, + "step": 11426 + }, + { + "epoch": 22.854, + "grad_norm": 4.948497772216797, + "learning_rate": 2e-05, + "loss": 0.02499816, + "step": 11427 + }, + { + "epoch": 22.856, + "grad_norm": 1.3616306781768799, + "learning_rate": 2e-05, + "loss": 0.04259917, + "step": 11428 + }, + { + "epoch": 22.858, + "grad_norm": 1.405735969543457, + "learning_rate": 2e-05, + "loss": 0.03377235, + "step": 11429 + }, + { + "epoch": 22.86, + "grad_norm": 1.426137924194336, + "learning_rate": 2e-05, + "loss": 0.04095915, + "step": 11430 + }, + { + "epoch": 22.862, + "grad_norm": 1.1575813293457031, + "learning_rate": 2e-05, + "loss": 0.03657633, + "step": 11431 + }, + { + "epoch": 22.864, + "grad_norm": 1.048122525215149, + "learning_rate": 2e-05, + "loss": 0.03505576, + "step": 11432 + }, + { + "epoch": 22.866, + "grad_norm": 1.589388132095337, + "learning_rate": 2e-05, + "loss": 0.02945594, + "step": 11433 + }, + { + "epoch": 22.868, + "grad_norm": 1.7383077144622803, + "learning_rate": 2e-05, + "loss": 0.03888205, + "step": 11434 + }, + { + "epoch": 22.87, + "grad_norm": 1.2866278886795044, + "learning_rate": 2e-05, + "loss": 0.02404101, + "step": 11435 + }, + { + "epoch": 22.872, + "grad_norm": 1.8006558418273926, + "learning_rate": 2e-05, + "loss": 0.02283414, + "step": 11436 + }, + { + "epoch": 22.874, + "grad_norm": 0.7845028638839722, + "learning_rate": 2e-05, + "loss": 0.01730304, + "step": 11437 + }, + { + "epoch": 22.876, + "grad_norm": 1.3924261331558228, + "learning_rate": 2e-05, + "loss": 0.03165732, + "step": 11438 + }, + { + "epoch": 22.878, + "grad_norm": 1.0454896688461304, + "learning_rate": 2e-05, + "loss": 0.03067488, + "step": 11439 + }, + { + "epoch": 22.88, + "grad_norm": 1.8691692352294922, + "learning_rate": 2e-05, + "loss": 0.03286079, + "step": 11440 + }, + { + "epoch": 22.882, + "grad_norm": 1.0586732625961304, + "learning_rate": 2e-05, + "loss": 0.02909458, + "step": 11441 + }, + { + "epoch": 22.884, + "grad_norm": 1.525641679763794, + "learning_rate": 2e-05, + "loss": 0.03261862, + "step": 11442 + }, + { + "epoch": 22.886, + "grad_norm": 1.164318323135376, + "learning_rate": 2e-05, + "loss": 0.02912915, + "step": 11443 + }, + { + "epoch": 22.888, + "grad_norm": 1.5031243562698364, + "learning_rate": 2e-05, + "loss": 0.03876949, + "step": 11444 + }, + { + "epoch": 22.89, + "grad_norm": 1.0397024154663086, + "learning_rate": 2e-05, + "loss": 0.02776971, + "step": 11445 + }, + { + "epoch": 22.892, + "grad_norm": 1.2293657064437866, + "learning_rate": 2e-05, + "loss": 0.04357108, + "step": 11446 + }, + { + "epoch": 22.894, + "grad_norm": 
1.8432687520980835, + "learning_rate": 2e-05, + "loss": 0.02928544, + "step": 11447 + }, + { + "epoch": 22.896, + "grad_norm": 1.107912540435791, + "learning_rate": 2e-05, + "loss": 0.02876203, + "step": 11448 + }, + { + "epoch": 22.898, + "grad_norm": 1.5207486152648926, + "learning_rate": 2e-05, + "loss": 0.04288258, + "step": 11449 + }, + { + "epoch": 22.9, + "grad_norm": 1.0726830959320068, + "learning_rate": 2e-05, + "loss": 0.03220513, + "step": 11450 + }, + { + "epoch": 22.902, + "grad_norm": 1.3904377222061157, + "learning_rate": 2e-05, + "loss": 0.03931183, + "step": 11451 + }, + { + "epoch": 22.904, + "grad_norm": 1.601686954498291, + "learning_rate": 2e-05, + "loss": 0.04329685, + "step": 11452 + }, + { + "epoch": 22.906, + "grad_norm": 1.7275676727294922, + "learning_rate": 2e-05, + "loss": 0.03662112, + "step": 11453 + }, + { + "epoch": 22.908, + "grad_norm": 0.9452254176139832, + "learning_rate": 2e-05, + "loss": 0.03591129, + "step": 11454 + }, + { + "epoch": 22.91, + "grad_norm": 0.9925097823143005, + "learning_rate": 2e-05, + "loss": 0.02755562, + "step": 11455 + }, + { + "epoch": 22.912, + "grad_norm": 1.9532277584075928, + "learning_rate": 2e-05, + "loss": 0.03555189, + "step": 11456 + }, + { + "epoch": 22.914, + "grad_norm": 0.8371837139129639, + "learning_rate": 2e-05, + "loss": 0.02261299, + "step": 11457 + }, + { + "epoch": 22.916, + "grad_norm": 2.2544491291046143, + "learning_rate": 2e-05, + "loss": 0.05361641, + "step": 11458 + }, + { + "epoch": 22.918, + "grad_norm": 2.455197334289551, + "learning_rate": 2e-05, + "loss": 0.04926509, + "step": 11459 + }, + { + "epoch": 22.92, + "grad_norm": 1.0468469858169556, + "learning_rate": 2e-05, + "loss": 0.03623208, + "step": 11460 + }, + { + "epoch": 22.922, + "grad_norm": 1.6605795621871948, + "learning_rate": 2e-05, + "loss": 0.03582647, + "step": 11461 + }, + { + "epoch": 22.924, + "grad_norm": 1.6512527465820312, + "learning_rate": 2e-05, + "loss": 0.03864128, + "step": 11462 + }, + { + "epoch": 22.926, + "grad_norm": 1.015683889389038, + "learning_rate": 2e-05, + "loss": 0.02952475, + "step": 11463 + }, + { + "epoch": 22.928, + "grad_norm": 2.490291118621826, + "learning_rate": 2e-05, + "loss": 0.04171167, + "step": 11464 + }, + { + "epoch": 22.93, + "grad_norm": 1.092077612876892, + "learning_rate": 2e-05, + "loss": 0.03160276, + "step": 11465 + }, + { + "epoch": 22.932, + "grad_norm": 1.0306111574172974, + "learning_rate": 2e-05, + "loss": 0.02688064, + "step": 11466 + }, + { + "epoch": 22.934, + "grad_norm": 1.20967435836792, + "learning_rate": 2e-05, + "loss": 0.03792361, + "step": 11467 + }, + { + "epoch": 22.936, + "grad_norm": 1.6543128490447998, + "learning_rate": 2e-05, + "loss": 0.0479852, + "step": 11468 + }, + { + "epoch": 22.938, + "grad_norm": 1.193781852722168, + "learning_rate": 2e-05, + "loss": 0.03420633, + "step": 11469 + }, + { + "epoch": 22.94, + "grad_norm": 1.764358639717102, + "learning_rate": 2e-05, + "loss": 0.03625751, + "step": 11470 + }, + { + "epoch": 22.942, + "grad_norm": 1.6757545471191406, + "learning_rate": 2e-05, + "loss": 0.03107898, + "step": 11471 + }, + { + "epoch": 22.944, + "grad_norm": 0.9526821970939636, + "learning_rate": 2e-05, + "loss": 0.03141288, + "step": 11472 + }, + { + "epoch": 22.946, + "grad_norm": 1.6071827411651611, + "learning_rate": 2e-05, + "loss": 0.03736177, + "step": 11473 + }, + { + "epoch": 22.948, + "grad_norm": 1.1540484428405762, + "learning_rate": 2e-05, + "loss": 0.03194441, + "step": 11474 + }, + { + "epoch": 22.95, + "grad_norm": 
1.1837899684906006, + "learning_rate": 2e-05, + "loss": 0.0288153, + "step": 11475 + }, + { + "epoch": 22.951999999999998, + "grad_norm": 1.1613291501998901, + "learning_rate": 2e-05, + "loss": 0.03203413, + "step": 11476 + }, + { + "epoch": 22.954, + "grad_norm": 2.0555005073547363, + "learning_rate": 2e-05, + "loss": 0.04438991, + "step": 11477 + }, + { + "epoch": 22.956, + "grad_norm": 1.4944390058517456, + "learning_rate": 2e-05, + "loss": 0.05310993, + "step": 11478 + }, + { + "epoch": 22.958, + "grad_norm": 1.4034525156021118, + "learning_rate": 2e-05, + "loss": 0.0385785, + "step": 11479 + }, + { + "epoch": 22.96, + "grad_norm": 2.2362515926361084, + "learning_rate": 2e-05, + "loss": 0.03343879, + "step": 11480 + }, + { + "epoch": 22.962, + "grad_norm": 1.9142746925354004, + "learning_rate": 2e-05, + "loss": 0.04086325, + "step": 11481 + }, + { + "epoch": 22.964, + "grad_norm": 3.3769659996032715, + "learning_rate": 2e-05, + "loss": 0.03347282, + "step": 11482 + }, + { + "epoch": 22.966, + "grad_norm": 1.3966118097305298, + "learning_rate": 2e-05, + "loss": 0.02449038, + "step": 11483 + }, + { + "epoch": 22.968, + "grad_norm": 1.6454405784606934, + "learning_rate": 2e-05, + "loss": 0.03337567, + "step": 11484 + }, + { + "epoch": 22.97, + "grad_norm": 1.2182114124298096, + "learning_rate": 2e-05, + "loss": 0.02583343, + "step": 11485 + }, + { + "epoch": 22.972, + "grad_norm": 2.4437437057495117, + "learning_rate": 2e-05, + "loss": 0.04662333, + "step": 11486 + }, + { + "epoch": 22.974, + "grad_norm": 1.3717036247253418, + "learning_rate": 2e-05, + "loss": 0.04354615, + "step": 11487 + }, + { + "epoch": 22.976, + "grad_norm": 1.0817804336547852, + "learning_rate": 2e-05, + "loss": 0.03474058, + "step": 11488 + }, + { + "epoch": 22.978, + "grad_norm": 1.8078384399414062, + "learning_rate": 2e-05, + "loss": 0.04659226, + "step": 11489 + }, + { + "epoch": 22.98, + "grad_norm": 1.3917157649993896, + "learning_rate": 2e-05, + "loss": 0.04078533, + "step": 11490 + }, + { + "epoch": 22.982, + "grad_norm": 0.8220876455307007, + "learning_rate": 2e-05, + "loss": 0.02258881, + "step": 11491 + }, + { + "epoch": 22.984, + "grad_norm": 1.2913342714309692, + "learning_rate": 2e-05, + "loss": 0.04604527, + "step": 11492 + }, + { + "epoch": 22.986, + "grad_norm": 1.271330714225769, + "learning_rate": 2e-05, + "loss": 0.03667264, + "step": 11493 + }, + { + "epoch": 22.988, + "grad_norm": 1.3295248746871948, + "learning_rate": 2e-05, + "loss": 0.03829803, + "step": 11494 + }, + { + "epoch": 22.99, + "grad_norm": 1.3785157203674316, + "learning_rate": 2e-05, + "loss": 0.04370276, + "step": 11495 + }, + { + "epoch": 22.992, + "grad_norm": 1.7691086530685425, + "learning_rate": 2e-05, + "loss": 0.04192688, + "step": 11496 + }, + { + "epoch": 22.994, + "grad_norm": 0.933940052986145, + "learning_rate": 2e-05, + "loss": 0.01630483, + "step": 11497 + }, + { + "epoch": 22.996, + "grad_norm": 1.4123674631118774, + "learning_rate": 2e-05, + "loss": 0.03609384, + "step": 11498 + }, + { + "epoch": 22.998, + "grad_norm": 1.3186585903167725, + "learning_rate": 2e-05, + "loss": 0.04399614, + "step": 11499 + }, + { + "epoch": 23.0, + "grad_norm": 0.9407261610031128, + "learning_rate": 2e-05, + "loss": 0.02819065, + "step": 11500 + }, + { + "epoch": 23.0, + "eval_performance": { + "AngleClassification_1": 1.0, + "AngleClassification_2": 1.0, + "AngleClassification_3": 0.9780439121756487, + "Equal_1": 0.998, + "Equal_2": 0.9780439121756487, + "Equal_3": 0.9760479041916168, + "LineComparison_1": 1.0, + 
"LineComparison_2": 1.0, + "LineComparison_3": 1.0, + "Parallel_1": 0.9899799599198397, + "Parallel_2": 0.9919839679358717, + "Parallel_3": 0.994, + "Perpendicular_1": 0.998, + "Perpendicular_2": 0.992, + "Perpendicular_3": 0.8386773547094188, + "PointLiesOnCircle_1": 0.9979959919839679, + "PointLiesOnCircle_2": 0.9956666666666667, + "PointLiesOnCircle_3": 0.9896666666666667, + "PointLiesOnLine_1": 0.9959919839679359, + "PointLiesOnLine_2": 0.9939879759519038, + "PointLiesOnLine_3": 0.9800399201596807 + }, + "eval_runtime": 319.8554, + "eval_samples_per_second": 32.827, + "eval_steps_per_second": 0.657, + "step": 11500 + }, + { + "epoch": 23.002, + "grad_norm": 1.2540278434753418, + "learning_rate": 2e-05, + "loss": 0.05130012, + "step": 11501 + }, + { + "epoch": 23.004, + "grad_norm": 1.3764632940292358, + "learning_rate": 2e-05, + "loss": 0.04294942, + "step": 11502 + }, + { + "epoch": 23.006, + "grad_norm": 1.6750069856643677, + "learning_rate": 2e-05, + "loss": 0.05229911, + "step": 11503 + }, + { + "epoch": 23.008, + "grad_norm": 1.1598528623580933, + "learning_rate": 2e-05, + "loss": 0.04085714, + "step": 11504 + }, + { + "epoch": 23.01, + "grad_norm": 1.1853110790252686, + "learning_rate": 2e-05, + "loss": 0.03996443, + "step": 11505 + }, + { + "epoch": 23.012, + "grad_norm": 1.4552375078201294, + "learning_rate": 2e-05, + "loss": 0.05250798, + "step": 11506 + }, + { + "epoch": 23.014, + "grad_norm": 1.4703574180603027, + "learning_rate": 2e-05, + "loss": 0.05003911, + "step": 11507 + }, + { + "epoch": 23.016, + "grad_norm": 1.011425256729126, + "learning_rate": 2e-05, + "loss": 0.03787163, + "step": 11508 + }, + { + "epoch": 23.018, + "grad_norm": 0.9965170621871948, + "learning_rate": 2e-05, + "loss": 0.04001056, + "step": 11509 + }, + { + "epoch": 23.02, + "grad_norm": 1.442737102508545, + "learning_rate": 2e-05, + "loss": 0.03976019, + "step": 11510 + }, + { + "epoch": 23.022, + "grad_norm": 1.067285418510437, + "learning_rate": 2e-05, + "loss": 0.03311403, + "step": 11511 + }, + { + "epoch": 23.024, + "grad_norm": 1.3310978412628174, + "learning_rate": 2e-05, + "loss": 0.05192403, + "step": 11512 + }, + { + "epoch": 23.026, + "grad_norm": 2.071756601333618, + "learning_rate": 2e-05, + "loss": 0.04093051, + "step": 11513 + }, + { + "epoch": 23.028, + "grad_norm": 1.8485080003738403, + "learning_rate": 2e-05, + "loss": 0.04342106, + "step": 11514 + }, + { + "epoch": 23.03, + "grad_norm": 1.2285382747650146, + "learning_rate": 2e-05, + "loss": 0.03801531, + "step": 11515 + }, + { + "epoch": 23.032, + "grad_norm": 1.2549402713775635, + "learning_rate": 2e-05, + "loss": 0.04710529, + "step": 11516 + }, + { + "epoch": 23.034, + "grad_norm": 1.2575291395187378, + "learning_rate": 2e-05, + "loss": 0.04867338, + "step": 11517 + }, + { + "epoch": 23.036, + "grad_norm": 1.4146618843078613, + "learning_rate": 2e-05, + "loss": 0.04894332, + "step": 11518 + }, + { + "epoch": 23.038, + "grad_norm": 1.2756978273391724, + "learning_rate": 2e-05, + "loss": 0.0424, + "step": 11519 + }, + { + "epoch": 23.04, + "grad_norm": 1.3770201206207275, + "learning_rate": 2e-05, + "loss": 0.05650711, + "step": 11520 + }, + { + "epoch": 23.042, + "grad_norm": 1.5191999673843384, + "learning_rate": 2e-05, + "loss": 0.05803034, + "step": 11521 + }, + { + "epoch": 23.044, + "grad_norm": 1.6032277345657349, + "learning_rate": 2e-05, + "loss": 0.04611324, + "step": 11522 + }, + { + "epoch": 23.046, + "grad_norm": 1.0860769748687744, + "learning_rate": 2e-05, + "loss": 0.03498707, + "step": 11523 + }, + { + 
"epoch": 23.048, + "grad_norm": 1.0449998378753662, + "learning_rate": 2e-05, + "loss": 0.03648846, + "step": 11524 + }, + { + "epoch": 23.05, + "grad_norm": 2.3625895977020264, + "learning_rate": 2e-05, + "loss": 0.03183797, + "step": 11525 + }, + { + "epoch": 23.052, + "grad_norm": 1.3209874629974365, + "learning_rate": 2e-05, + "loss": 0.0523079, + "step": 11526 + }, + { + "epoch": 23.054, + "grad_norm": 0.8777402639389038, + "learning_rate": 2e-05, + "loss": 0.02545576, + "step": 11527 + }, + { + "epoch": 23.056, + "grad_norm": 1.1120661497116089, + "learning_rate": 2e-05, + "loss": 0.0336577, + "step": 11528 + }, + { + "epoch": 23.058, + "grad_norm": 1.3430697917938232, + "learning_rate": 2e-05, + "loss": 0.03209985, + "step": 11529 + }, + { + "epoch": 23.06, + "grad_norm": 1.346922755241394, + "learning_rate": 2e-05, + "loss": 0.02798453, + "step": 11530 + }, + { + "epoch": 23.062, + "grad_norm": 1.8189756870269775, + "learning_rate": 2e-05, + "loss": 0.05564386, + "step": 11531 + }, + { + "epoch": 23.064, + "grad_norm": 3.2581892013549805, + "learning_rate": 2e-05, + "loss": 0.04301608, + "step": 11532 + }, + { + "epoch": 23.066, + "grad_norm": 1.3685656785964966, + "learning_rate": 2e-05, + "loss": 0.03543826, + "step": 11533 + }, + { + "epoch": 23.068, + "grad_norm": 1.6216603517532349, + "learning_rate": 2e-05, + "loss": 0.04750851, + "step": 11534 + }, + { + "epoch": 23.07, + "grad_norm": 2.4956471920013428, + "learning_rate": 2e-05, + "loss": 0.05645571, + "step": 11535 + }, + { + "epoch": 23.072, + "grad_norm": 3.800372362136841, + "learning_rate": 2e-05, + "loss": 0.03642789, + "step": 11536 + }, + { + "epoch": 23.074, + "grad_norm": 1.135061502456665, + "learning_rate": 2e-05, + "loss": 0.03842633, + "step": 11537 + }, + { + "epoch": 23.076, + "grad_norm": 1.3326822519302368, + "learning_rate": 2e-05, + "loss": 0.05297691, + "step": 11538 + }, + { + "epoch": 23.078, + "grad_norm": 1.7269304990768433, + "learning_rate": 2e-05, + "loss": 0.04963746, + "step": 11539 + }, + { + "epoch": 23.08, + "grad_norm": 1.300743818283081, + "learning_rate": 2e-05, + "loss": 0.04434649, + "step": 11540 + }, + { + "epoch": 23.082, + "grad_norm": 2.736264705657959, + "learning_rate": 2e-05, + "loss": 0.05283449, + "step": 11541 + }, + { + "epoch": 23.084, + "grad_norm": 1.9172672033309937, + "learning_rate": 2e-05, + "loss": 0.05265498, + "step": 11542 + }, + { + "epoch": 23.086, + "grad_norm": 1.1808491945266724, + "learning_rate": 2e-05, + "loss": 0.0372861, + "step": 11543 + }, + { + "epoch": 23.088, + "grad_norm": 1.2032355070114136, + "learning_rate": 2e-05, + "loss": 0.04825965, + "step": 11544 + }, + { + "epoch": 23.09, + "grad_norm": 1.292203426361084, + "learning_rate": 2e-05, + "loss": 0.04288067, + "step": 11545 + }, + { + "epoch": 23.092, + "grad_norm": 1.23992919921875, + "learning_rate": 2e-05, + "loss": 0.03847829, + "step": 11546 + }, + { + "epoch": 23.094, + "grad_norm": 1.6928577423095703, + "learning_rate": 2e-05, + "loss": 0.05047772, + "step": 11547 + }, + { + "epoch": 23.096, + "grad_norm": 1.2044388055801392, + "learning_rate": 2e-05, + "loss": 0.04760785, + "step": 11548 + }, + { + "epoch": 23.098, + "grad_norm": 4.208912372589111, + "learning_rate": 2e-05, + "loss": 0.04748537, + "step": 11549 + }, + { + "epoch": 23.1, + "grad_norm": 1.1794403791427612, + "learning_rate": 2e-05, + "loss": 0.03520359, + "step": 11550 + }, + { + "epoch": 23.102, + "grad_norm": 2.432137966156006, + "learning_rate": 2e-05, + "loss": 0.05768427, + "step": 11551 + }, + { + "epoch": 23.104, + 
"grad_norm": 1.1416656970977783, + "learning_rate": 2e-05, + "loss": 0.04104606, + "step": 11552 + }, + { + "epoch": 23.106, + "grad_norm": 0.929355263710022, + "learning_rate": 2e-05, + "loss": 0.0206061, + "step": 11553 + }, + { + "epoch": 23.108, + "grad_norm": 1.0629310607910156, + "learning_rate": 2e-05, + "loss": 0.02847815, + "step": 11554 + }, + { + "epoch": 23.11, + "grad_norm": 2.062800168991089, + "learning_rate": 2e-05, + "loss": 0.043769, + "step": 11555 + }, + { + "epoch": 23.112, + "grad_norm": 1.8286502361297607, + "learning_rate": 2e-05, + "loss": 0.05828995, + "step": 11556 + }, + { + "epoch": 23.114, + "grad_norm": 1.8750455379486084, + "learning_rate": 2e-05, + "loss": 0.04257144, + "step": 11557 + }, + { + "epoch": 23.116, + "grad_norm": 1.3888318538665771, + "learning_rate": 2e-05, + "loss": 0.04705369, + "step": 11558 + }, + { + "epoch": 23.118, + "grad_norm": 1.7153780460357666, + "learning_rate": 2e-05, + "loss": 0.0691988, + "step": 11559 + }, + { + "epoch": 23.12, + "grad_norm": 1.4313924312591553, + "learning_rate": 2e-05, + "loss": 0.04791881, + "step": 11560 + }, + { + "epoch": 23.122, + "grad_norm": 1.0603506565093994, + "learning_rate": 2e-05, + "loss": 0.0366033, + "step": 11561 + }, + { + "epoch": 23.124, + "grad_norm": 1.405933141708374, + "learning_rate": 2e-05, + "loss": 0.03930571, + "step": 11562 + }, + { + "epoch": 23.126, + "grad_norm": 1.920100212097168, + "learning_rate": 2e-05, + "loss": 0.04094848, + "step": 11563 + }, + { + "epoch": 23.128, + "grad_norm": 5.326123237609863, + "learning_rate": 2e-05, + "loss": 0.06398241, + "step": 11564 + }, + { + "epoch": 23.13, + "grad_norm": 1.9219155311584473, + "learning_rate": 2e-05, + "loss": 0.02961898, + "step": 11565 + }, + { + "epoch": 23.132, + "grad_norm": 2.914210081100464, + "learning_rate": 2e-05, + "loss": 0.04696817, + "step": 11566 + }, + { + "epoch": 23.134, + "grad_norm": 0.8635774254798889, + "learning_rate": 2e-05, + "loss": 0.02733534, + "step": 11567 + }, + { + "epoch": 23.136, + "grad_norm": 1.496962308883667, + "learning_rate": 2e-05, + "loss": 0.03688932, + "step": 11568 + }, + { + "epoch": 23.138, + "grad_norm": 2.043313980102539, + "learning_rate": 2e-05, + "loss": 0.03681432, + "step": 11569 + }, + { + "epoch": 23.14, + "grad_norm": 1.5973297357559204, + "learning_rate": 2e-05, + "loss": 0.04360689, + "step": 11570 + }, + { + "epoch": 23.142, + "grad_norm": 1.3972605466842651, + "learning_rate": 2e-05, + "loss": 0.04391249, + "step": 11571 + }, + { + "epoch": 23.144, + "grad_norm": 1.3387508392333984, + "learning_rate": 2e-05, + "loss": 0.04608027, + "step": 11572 + }, + { + "epoch": 23.146, + "grad_norm": 2.011808395385742, + "learning_rate": 2e-05, + "loss": 0.05625965, + "step": 11573 + }, + { + "epoch": 23.148, + "grad_norm": 1.5013598203659058, + "learning_rate": 2e-05, + "loss": 0.05085777, + "step": 11574 + }, + { + "epoch": 23.15, + "grad_norm": 2.186159133911133, + "learning_rate": 2e-05, + "loss": 0.04711618, + "step": 11575 + }, + { + "epoch": 23.152, + "grad_norm": 2.2853493690490723, + "learning_rate": 2e-05, + "loss": 0.0342935, + "step": 11576 + }, + { + "epoch": 23.154, + "grad_norm": 1.3616294860839844, + "learning_rate": 2e-05, + "loss": 0.04885361, + "step": 11577 + }, + { + "epoch": 23.156, + "grad_norm": 1.2394838333129883, + "learning_rate": 2e-05, + "loss": 0.03285236, + "step": 11578 + }, + { + "epoch": 23.158, + "grad_norm": 1.339077353477478, + "learning_rate": 2e-05, + "loss": 0.02606569, + "step": 11579 + }, + { + "epoch": 23.16, + "grad_norm": 
0.8776892423629761, + "learning_rate": 2e-05, + "loss": 0.02158548, + "step": 11580 + }, + { + "epoch": 23.162, + "grad_norm": 3.6260955333709717, + "learning_rate": 2e-05, + "loss": 0.05039684, + "step": 11581 + }, + { + "epoch": 23.164, + "grad_norm": 1.242028832435608, + "learning_rate": 2e-05, + "loss": 0.04099006, + "step": 11582 + }, + { + "epoch": 23.166, + "grad_norm": 1.870340347290039, + "learning_rate": 2e-05, + "loss": 0.05608212, + "step": 11583 + }, + { + "epoch": 23.168, + "grad_norm": 1.346118688583374, + "learning_rate": 2e-05, + "loss": 0.03667539, + "step": 11584 + }, + { + "epoch": 23.17, + "grad_norm": 1.3202615976333618, + "learning_rate": 2e-05, + "loss": 0.06091684, + "step": 11585 + }, + { + "epoch": 23.172, + "grad_norm": 1.252225399017334, + "learning_rate": 2e-05, + "loss": 0.0603791, + "step": 11586 + }, + { + "epoch": 23.174, + "grad_norm": 1.584975242614746, + "learning_rate": 2e-05, + "loss": 0.04847322, + "step": 11587 + }, + { + "epoch": 23.176, + "grad_norm": 1.3901718854904175, + "learning_rate": 2e-05, + "loss": 0.03669205, + "step": 11588 + }, + { + "epoch": 23.178, + "grad_norm": 1.0474369525909424, + "learning_rate": 2e-05, + "loss": 0.04350015, + "step": 11589 + }, + { + "epoch": 23.18, + "grad_norm": 1.0090280771255493, + "learning_rate": 2e-05, + "loss": 0.03717218, + "step": 11590 + }, + { + "epoch": 23.182, + "grad_norm": 1.004239797592163, + "learning_rate": 2e-05, + "loss": 0.02906239, + "step": 11591 + }, + { + "epoch": 23.184, + "grad_norm": 1.3345508575439453, + "learning_rate": 2e-05, + "loss": 0.03678144, + "step": 11592 + }, + { + "epoch": 23.186, + "grad_norm": 1.837895393371582, + "learning_rate": 2e-05, + "loss": 0.03756856, + "step": 11593 + }, + { + "epoch": 23.188, + "grad_norm": 1.3044272661209106, + "learning_rate": 2e-05, + "loss": 0.04445122, + "step": 11594 + }, + { + "epoch": 23.19, + "grad_norm": 1.1923726797103882, + "learning_rate": 2e-05, + "loss": 0.03589686, + "step": 11595 + }, + { + "epoch": 23.192, + "grad_norm": 1.0984914302825928, + "learning_rate": 2e-05, + "loss": 0.04005678, + "step": 11596 + }, + { + "epoch": 23.194, + "grad_norm": 1.199487566947937, + "learning_rate": 2e-05, + "loss": 0.03045943, + "step": 11597 + }, + { + "epoch": 23.196, + "grad_norm": 1.032716989517212, + "learning_rate": 2e-05, + "loss": 0.03927022, + "step": 11598 + }, + { + "epoch": 23.198, + "grad_norm": 2.102476119995117, + "learning_rate": 2e-05, + "loss": 0.06727884, + "step": 11599 + }, + { + "epoch": 23.2, + "grad_norm": 1.172778844833374, + "learning_rate": 2e-05, + "loss": 0.05053917, + "step": 11600 + }, + { + "epoch": 23.202, + "grad_norm": 1.736275315284729, + "learning_rate": 2e-05, + "loss": 0.03909831, + "step": 11601 + }, + { + "epoch": 23.204, + "grad_norm": 1.0708051919937134, + "learning_rate": 2e-05, + "loss": 0.03059747, + "step": 11602 + }, + { + "epoch": 23.206, + "grad_norm": 1.1244726181030273, + "learning_rate": 2e-05, + "loss": 0.04023635, + "step": 11603 + }, + { + "epoch": 23.208, + "grad_norm": 0.9452853202819824, + "learning_rate": 2e-05, + "loss": 0.03030682, + "step": 11604 + }, + { + "epoch": 23.21, + "grad_norm": 2.1545941829681396, + "learning_rate": 2e-05, + "loss": 0.04214806, + "step": 11605 + }, + { + "epoch": 23.212, + "grad_norm": 1.2645689249038696, + "learning_rate": 2e-05, + "loss": 0.03720896, + "step": 11606 + }, + { + "epoch": 23.214, + "grad_norm": 1.2697169780731201, + "learning_rate": 2e-05, + "loss": 0.04333124, + "step": 11607 + }, + { + "epoch": 23.216, + "grad_norm": 
1.4149627685546875, + "learning_rate": 2e-05, + "loss": 0.05068656, + "step": 11608 + }, + { + "epoch": 23.218, + "grad_norm": 2.079878568649292, + "learning_rate": 2e-05, + "loss": 0.06022862, + "step": 11609 + }, + { + "epoch": 23.22, + "grad_norm": 1.221092700958252, + "learning_rate": 2e-05, + "loss": 0.02790497, + "step": 11610 + }, + { + "epoch": 23.222, + "grad_norm": 0.9675205945968628, + "learning_rate": 2e-05, + "loss": 0.03455825, + "step": 11611 + }, + { + "epoch": 23.224, + "grad_norm": 0.8989473581314087, + "learning_rate": 2e-05, + "loss": 0.02922, + "step": 11612 + }, + { + "epoch": 23.226, + "grad_norm": 1.088036060333252, + "learning_rate": 2e-05, + "loss": 0.05001824, + "step": 11613 + }, + { + "epoch": 23.228, + "grad_norm": 1.4129053354263306, + "learning_rate": 2e-05, + "loss": 0.04396573, + "step": 11614 + }, + { + "epoch": 23.23, + "grad_norm": 1.193305253982544, + "learning_rate": 2e-05, + "loss": 0.03746897, + "step": 11615 + }, + { + "epoch": 23.232, + "grad_norm": 1.3615189790725708, + "learning_rate": 2e-05, + "loss": 0.03328097, + "step": 11616 + }, + { + "epoch": 23.234, + "grad_norm": 3.6548335552215576, + "learning_rate": 2e-05, + "loss": 0.0415346, + "step": 11617 + }, + { + "epoch": 23.236, + "grad_norm": 1.218424677848816, + "learning_rate": 2e-05, + "loss": 0.02998128, + "step": 11618 + }, + { + "epoch": 23.238, + "grad_norm": 1.3885092735290527, + "learning_rate": 2e-05, + "loss": 0.04785194, + "step": 11619 + }, + { + "epoch": 23.24, + "grad_norm": 2.74619722366333, + "learning_rate": 2e-05, + "loss": 0.04847107, + "step": 11620 + }, + { + "epoch": 23.242, + "grad_norm": 1.482759952545166, + "learning_rate": 2e-05, + "loss": 0.04352796, + "step": 11621 + }, + { + "epoch": 23.244, + "grad_norm": 1.3058183193206787, + "learning_rate": 2e-05, + "loss": 0.0443982, + "step": 11622 + }, + { + "epoch": 23.246, + "grad_norm": 1.1419992446899414, + "learning_rate": 2e-05, + "loss": 0.03190926, + "step": 11623 + }, + { + "epoch": 23.248, + "grad_norm": 3.0687108039855957, + "learning_rate": 2e-05, + "loss": 0.04174676, + "step": 11624 + }, + { + "epoch": 23.25, + "grad_norm": 1.1151496171951294, + "learning_rate": 2e-05, + "loss": 0.0575142, + "step": 11625 + }, + { + "epoch": 23.252, + "grad_norm": 1.7354116439819336, + "learning_rate": 2e-05, + "loss": 0.03637656, + "step": 11626 + }, + { + "epoch": 23.254, + "grad_norm": 1.7158818244934082, + "learning_rate": 2e-05, + "loss": 0.04089367, + "step": 11627 + }, + { + "epoch": 23.256, + "grad_norm": 1.027208685874939, + "learning_rate": 2e-05, + "loss": 0.04252267, + "step": 11628 + }, + { + "epoch": 23.258, + "grad_norm": 1.3836479187011719, + "learning_rate": 2e-05, + "loss": 0.04314929, + "step": 11629 + }, + { + "epoch": 23.26, + "grad_norm": 1.6362491846084595, + "learning_rate": 2e-05, + "loss": 0.04100268, + "step": 11630 + }, + { + "epoch": 23.262, + "grad_norm": 1.1033443212509155, + "learning_rate": 2e-05, + "loss": 0.04861282, + "step": 11631 + }, + { + "epoch": 23.264, + "grad_norm": 1.354112148284912, + "learning_rate": 2e-05, + "loss": 0.03558677, + "step": 11632 + }, + { + "epoch": 23.266, + "grad_norm": 2.211174726486206, + "learning_rate": 2e-05, + "loss": 0.05863594, + "step": 11633 + }, + { + "epoch": 23.268, + "grad_norm": 1.1978131532669067, + "learning_rate": 2e-05, + "loss": 0.02527563, + "step": 11634 + }, + { + "epoch": 23.27, + "grad_norm": 1.079159140586853, + "learning_rate": 2e-05, + "loss": 0.04221547, + "step": 11635 + }, + { + "epoch": 23.272, + "grad_norm": 2.7079126834869385, + 
"learning_rate": 2e-05, + "loss": 0.04663559, + "step": 11636 + }, + { + "epoch": 23.274, + "grad_norm": 1.3492555618286133, + "learning_rate": 2e-05, + "loss": 0.05419156, + "step": 11637 + }, + { + "epoch": 23.276, + "grad_norm": 2.4996025562286377, + "learning_rate": 2e-05, + "loss": 0.03772485, + "step": 11638 + }, + { + "epoch": 23.278, + "grad_norm": 2.083409070968628, + "learning_rate": 2e-05, + "loss": 0.06437519, + "step": 11639 + }, + { + "epoch": 23.28, + "grad_norm": 1.45590341091156, + "learning_rate": 2e-05, + "loss": 0.03550784, + "step": 11640 + }, + { + "epoch": 23.282, + "grad_norm": 1.1977746486663818, + "learning_rate": 2e-05, + "loss": 0.03832368, + "step": 11641 + }, + { + "epoch": 23.284, + "grad_norm": 1.926700234413147, + "learning_rate": 2e-05, + "loss": 0.06392578, + "step": 11642 + }, + { + "epoch": 23.286, + "grad_norm": 2.1961700916290283, + "learning_rate": 2e-05, + "loss": 0.03601138, + "step": 11643 + }, + { + "epoch": 23.288, + "grad_norm": 1.1713889837265015, + "learning_rate": 2e-05, + "loss": 0.03601219, + "step": 11644 + }, + { + "epoch": 23.29, + "grad_norm": 1.6894010305404663, + "learning_rate": 2e-05, + "loss": 0.0445708, + "step": 11645 + }, + { + "epoch": 23.292, + "grad_norm": 1.3147658109664917, + "learning_rate": 2e-05, + "loss": 0.0355294, + "step": 11646 + }, + { + "epoch": 23.294, + "grad_norm": 1.2042319774627686, + "learning_rate": 2e-05, + "loss": 0.04006883, + "step": 11647 + }, + { + "epoch": 23.296, + "grad_norm": 1.1680289506912231, + "learning_rate": 2e-05, + "loss": 0.04503386, + "step": 11648 + }, + { + "epoch": 23.298, + "grad_norm": 1.135467529296875, + "learning_rate": 2e-05, + "loss": 0.0402963, + "step": 11649 + }, + { + "epoch": 23.3, + "grad_norm": 1.505689024925232, + "learning_rate": 2e-05, + "loss": 0.03736635, + "step": 11650 + }, + { + "epoch": 23.302, + "grad_norm": 2.4752161502838135, + "learning_rate": 2e-05, + "loss": 0.04134458, + "step": 11651 + }, + { + "epoch": 23.304, + "grad_norm": 1.4017146825790405, + "learning_rate": 2e-05, + "loss": 0.04087723, + "step": 11652 + }, + { + "epoch": 23.306, + "grad_norm": 1.2549480199813843, + "learning_rate": 2e-05, + "loss": 0.04560962, + "step": 11653 + }, + { + "epoch": 23.308, + "grad_norm": 1.496455192565918, + "learning_rate": 2e-05, + "loss": 0.04883281, + "step": 11654 + }, + { + "epoch": 23.31, + "grad_norm": 1.4462777376174927, + "learning_rate": 2e-05, + "loss": 0.04287733, + "step": 11655 + }, + { + "epoch": 23.312, + "grad_norm": 1.8287996053695679, + "learning_rate": 2e-05, + "loss": 0.03942668, + "step": 11656 + }, + { + "epoch": 23.314, + "grad_norm": 1.9385639429092407, + "learning_rate": 2e-05, + "loss": 0.06809842, + "step": 11657 + }, + { + "epoch": 23.316, + "grad_norm": 0.842710018157959, + "learning_rate": 2e-05, + "loss": 0.02134952, + "step": 11658 + }, + { + "epoch": 23.318, + "grad_norm": 2.0979163646698, + "learning_rate": 2e-05, + "loss": 0.05558499, + "step": 11659 + }, + { + "epoch": 23.32, + "grad_norm": 1.2247816324234009, + "learning_rate": 2e-05, + "loss": 0.04178263, + "step": 11660 + }, + { + "epoch": 23.322, + "grad_norm": 1.5208022594451904, + "learning_rate": 2e-05, + "loss": 0.04280531, + "step": 11661 + }, + { + "epoch": 23.324, + "grad_norm": 2.318500280380249, + "learning_rate": 2e-05, + "loss": 0.06325538, + "step": 11662 + }, + { + "epoch": 23.326, + "grad_norm": 1.6465061902999878, + "learning_rate": 2e-05, + "loss": 0.05929951, + "step": 11663 + }, + { + "epoch": 23.328, + "grad_norm": 1.3455055952072144, + "learning_rate": 
2e-05, + "loss": 0.03735639, + "step": 11664 + }, + { + "epoch": 23.33, + "grad_norm": 1.0528393983840942, + "learning_rate": 2e-05, + "loss": 0.03686676, + "step": 11665 + }, + { + "epoch": 23.332, + "grad_norm": 2.8287322521209717, + "learning_rate": 2e-05, + "loss": 0.04164189, + "step": 11666 + }, + { + "epoch": 23.334, + "grad_norm": 1.6893528699874878, + "learning_rate": 2e-05, + "loss": 0.04093581, + "step": 11667 + }, + { + "epoch": 23.336, + "grad_norm": 1.6616655588150024, + "learning_rate": 2e-05, + "loss": 0.03945962, + "step": 11668 + }, + { + "epoch": 23.338, + "grad_norm": 1.084054946899414, + "learning_rate": 2e-05, + "loss": 0.04473053, + "step": 11669 + }, + { + "epoch": 23.34, + "grad_norm": 1.4993929862976074, + "learning_rate": 2e-05, + "loss": 0.03447219, + "step": 11670 + }, + { + "epoch": 23.342, + "grad_norm": 1.050879955291748, + "learning_rate": 2e-05, + "loss": 0.03411791, + "step": 11671 + }, + { + "epoch": 23.344, + "grad_norm": 1.2374294996261597, + "learning_rate": 2e-05, + "loss": 0.0351023, + "step": 11672 + }, + { + "epoch": 23.346, + "grad_norm": 1.3457894325256348, + "learning_rate": 2e-05, + "loss": 0.04368747, + "step": 11673 + }, + { + "epoch": 23.348, + "grad_norm": 0.8786593675613403, + "learning_rate": 2e-05, + "loss": 0.03530385, + "step": 11674 + }, + { + "epoch": 23.35, + "grad_norm": 1.326228380203247, + "learning_rate": 2e-05, + "loss": 0.04968689, + "step": 11675 + }, + { + "epoch": 23.352, + "grad_norm": 0.9884204864501953, + "learning_rate": 2e-05, + "loss": 0.03281524, + "step": 11676 + }, + { + "epoch": 23.354, + "grad_norm": 1.3991303443908691, + "learning_rate": 2e-05, + "loss": 0.04180757, + "step": 11677 + }, + { + "epoch": 23.356, + "grad_norm": 6.225818157196045, + "learning_rate": 2e-05, + "loss": 0.03679702, + "step": 11678 + }, + { + "epoch": 23.358, + "grad_norm": 0.8489519953727722, + "learning_rate": 2e-05, + "loss": 0.02504526, + "step": 11679 + }, + { + "epoch": 23.36, + "grad_norm": 1.34859299659729, + "learning_rate": 2e-05, + "loss": 0.05301417, + "step": 11680 + }, + { + "epoch": 23.362, + "grad_norm": 2.322361469268799, + "learning_rate": 2e-05, + "loss": 0.0461299, + "step": 11681 + }, + { + "epoch": 23.364, + "grad_norm": 1.9422720670700073, + "learning_rate": 2e-05, + "loss": 0.04207643, + "step": 11682 + }, + { + "epoch": 23.366, + "grad_norm": 2.020577907562256, + "learning_rate": 2e-05, + "loss": 0.04851538, + "step": 11683 + }, + { + "epoch": 23.368, + "grad_norm": 1.196955919265747, + "learning_rate": 2e-05, + "loss": 0.04109028, + "step": 11684 + }, + { + "epoch": 23.37, + "grad_norm": 2.016350507736206, + "learning_rate": 2e-05, + "loss": 0.04552692, + "step": 11685 + }, + { + "epoch": 23.372, + "grad_norm": 1.13778555393219, + "learning_rate": 2e-05, + "loss": 0.02917241, + "step": 11686 + }, + { + "epoch": 23.374, + "grad_norm": 1.7037264108657837, + "learning_rate": 2e-05, + "loss": 0.05261172, + "step": 11687 + }, + { + "epoch": 23.376, + "grad_norm": 6.611384868621826, + "learning_rate": 2e-05, + "loss": 0.03373696, + "step": 11688 + }, + { + "epoch": 23.378, + "grad_norm": 1.5197391510009766, + "learning_rate": 2e-05, + "loss": 0.05965453, + "step": 11689 + }, + { + "epoch": 23.38, + "grad_norm": 1.1589999198913574, + "learning_rate": 2e-05, + "loss": 0.02925105, + "step": 11690 + }, + { + "epoch": 23.382, + "grad_norm": 1.4183608293533325, + "learning_rate": 2e-05, + "loss": 0.03728583, + "step": 11691 + }, + { + "epoch": 23.384, + "grad_norm": 1.9492428302764893, + "learning_rate": 2e-05, + "loss": 
0.04366478, + "step": 11692 + }, + { + "epoch": 23.386, + "grad_norm": 1.5098069906234741, + "learning_rate": 2e-05, + "loss": 0.04676771, + "step": 11693 + }, + { + "epoch": 23.388, + "grad_norm": 2.645620107650757, + "learning_rate": 2e-05, + "loss": 0.03401638, + "step": 11694 + }, + { + "epoch": 23.39, + "grad_norm": 1.2716730833053589, + "learning_rate": 2e-05, + "loss": 0.03060681, + "step": 11695 + }, + { + "epoch": 23.392, + "grad_norm": 1.4287192821502686, + "learning_rate": 2e-05, + "loss": 0.04606835, + "step": 11696 + }, + { + "epoch": 23.394, + "grad_norm": 2.0073654651641846, + "learning_rate": 2e-05, + "loss": 0.03770929, + "step": 11697 + }, + { + "epoch": 23.396, + "grad_norm": 1.0809135437011719, + "learning_rate": 2e-05, + "loss": 0.04094427, + "step": 11698 + }, + { + "epoch": 23.398, + "grad_norm": 1.341443419456482, + "learning_rate": 2e-05, + "loss": 0.03574315, + "step": 11699 + }, + { + "epoch": 23.4, + "grad_norm": 1.4604313373565674, + "learning_rate": 2e-05, + "loss": 0.04919671, + "step": 11700 + }, + { + "epoch": 23.402, + "grad_norm": 1.32045578956604, + "learning_rate": 2e-05, + "loss": 0.04904388, + "step": 11701 + }, + { + "epoch": 23.404, + "grad_norm": 1.5795214176177979, + "learning_rate": 2e-05, + "loss": 0.04413631, + "step": 11702 + }, + { + "epoch": 23.406, + "grad_norm": 1.566278338432312, + "learning_rate": 2e-05, + "loss": 0.04938511, + "step": 11703 + }, + { + "epoch": 23.408, + "grad_norm": 1.6116101741790771, + "learning_rate": 2e-05, + "loss": 0.03976731, + "step": 11704 + }, + { + "epoch": 23.41, + "grad_norm": 1.1294550895690918, + "learning_rate": 2e-05, + "loss": 0.04623659, + "step": 11705 + }, + { + "epoch": 23.412, + "grad_norm": 2.9337821006774902, + "learning_rate": 2e-05, + "loss": 0.05919053, + "step": 11706 + }, + { + "epoch": 23.414, + "grad_norm": 1.2668352127075195, + "learning_rate": 2e-05, + "loss": 0.04299584, + "step": 11707 + }, + { + "epoch": 23.416, + "grad_norm": 1.6017807722091675, + "learning_rate": 2e-05, + "loss": 0.04490201, + "step": 11708 + }, + { + "epoch": 23.418, + "grad_norm": 1.9470270872116089, + "learning_rate": 2e-05, + "loss": 0.04873323, + "step": 11709 + }, + { + "epoch": 23.42, + "grad_norm": 0.8678619265556335, + "learning_rate": 2e-05, + "loss": 0.01984765, + "step": 11710 + }, + { + "epoch": 23.422, + "grad_norm": 1.015408992767334, + "learning_rate": 2e-05, + "loss": 0.02890258, + "step": 11711 + }, + { + "epoch": 23.424, + "grad_norm": 1.9212663173675537, + "learning_rate": 2e-05, + "loss": 0.05686199, + "step": 11712 + }, + { + "epoch": 23.426, + "grad_norm": 1.5647587776184082, + "learning_rate": 2e-05, + "loss": 0.04229132, + "step": 11713 + }, + { + "epoch": 23.428, + "grad_norm": 1.157758116722107, + "learning_rate": 2e-05, + "loss": 0.05381518, + "step": 11714 + }, + { + "epoch": 23.43, + "grad_norm": 2.0962116718292236, + "learning_rate": 2e-05, + "loss": 0.04939672, + "step": 11715 + }, + { + "epoch": 23.432, + "grad_norm": 1.778320074081421, + "learning_rate": 2e-05, + "loss": 0.03275887, + "step": 11716 + }, + { + "epoch": 23.434, + "grad_norm": 1.3979294300079346, + "learning_rate": 2e-05, + "loss": 0.04589324, + "step": 11717 + }, + { + "epoch": 23.436, + "grad_norm": 1.1280796527862549, + "learning_rate": 2e-05, + "loss": 0.04539976, + "step": 11718 + }, + { + "epoch": 23.438, + "grad_norm": 2.4845614433288574, + "learning_rate": 2e-05, + "loss": 0.04491439, + "step": 11719 + }, + { + "epoch": 23.44, + "grad_norm": 1.0096144676208496, + "learning_rate": 2e-05, + "loss": 0.03461198, + 
"step": 11720 + }, + { + "epoch": 23.442, + "grad_norm": 2.4556872844696045, + "learning_rate": 2e-05, + "loss": 0.05000499, + "step": 11721 + }, + { + "epoch": 23.444, + "grad_norm": 5.186877727508545, + "learning_rate": 2e-05, + "loss": 0.05488808, + "step": 11722 + }, + { + "epoch": 23.446, + "grad_norm": 1.524732232093811, + "learning_rate": 2e-05, + "loss": 0.03662727, + "step": 11723 + }, + { + "epoch": 23.448, + "grad_norm": 2.764235734939575, + "learning_rate": 2e-05, + "loss": 0.04267842, + "step": 11724 + }, + { + "epoch": 23.45, + "grad_norm": 3.6539905071258545, + "learning_rate": 2e-05, + "loss": 0.05041397, + "step": 11725 + }, + { + "epoch": 23.452, + "grad_norm": 2.682654619216919, + "learning_rate": 2e-05, + "loss": 0.0332579, + "step": 11726 + }, + { + "epoch": 23.454, + "grad_norm": 1.5499696731567383, + "learning_rate": 2e-05, + "loss": 0.03875547, + "step": 11727 + }, + { + "epoch": 23.456, + "grad_norm": 2.318291664123535, + "learning_rate": 2e-05, + "loss": 0.02854845, + "step": 11728 + }, + { + "epoch": 23.458, + "grad_norm": 1.7998384237289429, + "learning_rate": 2e-05, + "loss": 0.05344633, + "step": 11729 + }, + { + "epoch": 23.46, + "grad_norm": 0.9075918197631836, + "learning_rate": 2e-05, + "loss": 0.02585726, + "step": 11730 + }, + { + "epoch": 23.462, + "grad_norm": 1.5107016563415527, + "learning_rate": 2e-05, + "loss": 0.05805733, + "step": 11731 + }, + { + "epoch": 23.464, + "grad_norm": 2.0827245712280273, + "learning_rate": 2e-05, + "loss": 0.04989189, + "step": 11732 + }, + { + "epoch": 23.466, + "grad_norm": 2.3105974197387695, + "learning_rate": 2e-05, + "loss": 0.06807564, + "step": 11733 + }, + { + "epoch": 23.468, + "grad_norm": 1.2695666551589966, + "learning_rate": 2e-05, + "loss": 0.03619879, + "step": 11734 + }, + { + "epoch": 23.47, + "grad_norm": 1.2384021282196045, + "learning_rate": 2e-05, + "loss": 0.02629324, + "step": 11735 + }, + { + "epoch": 23.472, + "grad_norm": 1.6881636381149292, + "learning_rate": 2e-05, + "loss": 0.04189314, + "step": 11736 + }, + { + "epoch": 23.474, + "grad_norm": 2.447591543197632, + "learning_rate": 2e-05, + "loss": 0.04387872, + "step": 11737 + }, + { + "epoch": 23.476, + "grad_norm": 0.8882629871368408, + "learning_rate": 2e-05, + "loss": 0.02441452, + "step": 11738 + }, + { + "epoch": 23.478, + "grad_norm": 1.4331190586090088, + "learning_rate": 2e-05, + "loss": 0.03524774, + "step": 11739 + }, + { + "epoch": 23.48, + "grad_norm": 2.20993709564209, + "learning_rate": 2e-05, + "loss": 0.05033822, + "step": 11740 + }, + { + "epoch": 23.482, + "grad_norm": 1.4944759607315063, + "learning_rate": 2e-05, + "loss": 0.05018061, + "step": 11741 + }, + { + "epoch": 23.484, + "grad_norm": 2.6251118183135986, + "learning_rate": 2e-05, + "loss": 0.04994038, + "step": 11742 + }, + { + "epoch": 23.486, + "grad_norm": 1.345861792564392, + "learning_rate": 2e-05, + "loss": 0.044821, + "step": 11743 + }, + { + "epoch": 23.488, + "grad_norm": 1.239540934562683, + "learning_rate": 2e-05, + "loss": 0.04019895, + "step": 11744 + }, + { + "epoch": 23.49, + "grad_norm": 1.280549168586731, + "learning_rate": 2e-05, + "loss": 0.03652037, + "step": 11745 + }, + { + "epoch": 23.492, + "grad_norm": 1.523679256439209, + "learning_rate": 2e-05, + "loss": 0.04372611, + "step": 11746 + }, + { + "epoch": 23.494, + "grad_norm": 1.49849271774292, + "learning_rate": 2e-05, + "loss": 0.03744507, + "step": 11747 + }, + { + "epoch": 23.496, + "grad_norm": 1.3029149770736694, + "learning_rate": 2e-05, + "loss": 0.04071214, + "step": 11748 + }, + 
{ + "epoch": 23.498, + "grad_norm": 2.212434768676758, + "learning_rate": 2e-05, + "loss": 0.06702287, + "step": 11749 + }, + { + "epoch": 23.5, + "grad_norm": 1.800525188446045, + "learning_rate": 2e-05, + "loss": 0.04466784, + "step": 11750 + }, + { + "epoch": 23.502, + "grad_norm": 1.8852155208587646, + "learning_rate": 2e-05, + "loss": 0.06043893, + "step": 11751 + }, + { + "epoch": 23.504, + "grad_norm": 1.5796104669570923, + "learning_rate": 2e-05, + "loss": 0.03296822, + "step": 11752 + }, + { + "epoch": 23.506, + "grad_norm": 1.2062172889709473, + "learning_rate": 2e-05, + "loss": 0.03532603, + "step": 11753 + }, + { + "epoch": 23.508, + "grad_norm": 2.296785831451416, + "learning_rate": 2e-05, + "loss": 0.06512903, + "step": 11754 + }, + { + "epoch": 23.51, + "grad_norm": 1.458593487739563, + "learning_rate": 2e-05, + "loss": 0.06009974, + "step": 11755 + }, + { + "epoch": 23.512, + "grad_norm": 1.1910109519958496, + "learning_rate": 2e-05, + "loss": 0.03358722, + "step": 11756 + }, + { + "epoch": 23.514, + "grad_norm": 1.1627408266067505, + "learning_rate": 2e-05, + "loss": 0.03376558, + "step": 11757 + }, + { + "epoch": 23.516, + "grad_norm": 2.2633957862854004, + "learning_rate": 2e-05, + "loss": 0.06547912, + "step": 11758 + }, + { + "epoch": 23.518, + "grad_norm": 0.9992890357971191, + "learning_rate": 2e-05, + "loss": 0.02558643, + "step": 11759 + }, + { + "epoch": 23.52, + "grad_norm": 2.1989448070526123, + "learning_rate": 2e-05, + "loss": 0.04395504, + "step": 11760 + }, + { + "epoch": 23.522, + "grad_norm": 2.1435000896453857, + "learning_rate": 2e-05, + "loss": 0.04401811, + "step": 11761 + }, + { + "epoch": 23.524, + "grad_norm": 1.109850287437439, + "learning_rate": 2e-05, + "loss": 0.02487316, + "step": 11762 + }, + { + "epoch": 23.526, + "grad_norm": 2.0983777046203613, + "learning_rate": 2e-05, + "loss": 0.04313958, + "step": 11763 + }, + { + "epoch": 23.528, + "grad_norm": 1.1474552154541016, + "learning_rate": 2e-05, + "loss": 0.04640581, + "step": 11764 + }, + { + "epoch": 23.53, + "grad_norm": 2.8319504261016846, + "learning_rate": 2e-05, + "loss": 0.05273167, + "step": 11765 + }, + { + "epoch": 23.532, + "grad_norm": 1.2033793926239014, + "learning_rate": 2e-05, + "loss": 0.0447002, + "step": 11766 + }, + { + "epoch": 23.534, + "grad_norm": 0.9693114757537842, + "learning_rate": 2e-05, + "loss": 0.02837565, + "step": 11767 + }, + { + "epoch": 23.536, + "grad_norm": 0.9325622916221619, + "learning_rate": 2e-05, + "loss": 0.03289568, + "step": 11768 + }, + { + "epoch": 23.538, + "grad_norm": 1.8937511444091797, + "learning_rate": 2e-05, + "loss": 0.05601943, + "step": 11769 + }, + { + "epoch": 23.54, + "grad_norm": 1.1438976526260376, + "learning_rate": 2e-05, + "loss": 0.04349836, + "step": 11770 + }, + { + "epoch": 23.542, + "grad_norm": 1.7353458404541016, + "learning_rate": 2e-05, + "loss": 0.06614293, + "step": 11771 + }, + { + "epoch": 23.544, + "grad_norm": 1.3022950887680054, + "learning_rate": 2e-05, + "loss": 0.03682633, + "step": 11772 + }, + { + "epoch": 23.546, + "grad_norm": 2.2560153007507324, + "learning_rate": 2e-05, + "loss": 0.03180346, + "step": 11773 + }, + { + "epoch": 23.548000000000002, + "grad_norm": 2.171947479248047, + "learning_rate": 2e-05, + "loss": 0.0739259, + "step": 11774 + }, + { + "epoch": 23.55, + "grad_norm": 1.3080508708953857, + "learning_rate": 2e-05, + "loss": 0.0323229, + "step": 11775 + }, + { + "epoch": 23.552, + "grad_norm": 1.3092063665390015, + "learning_rate": 2e-05, + "loss": 0.04459471, + "step": 11776 + }, + { 
+ "epoch": 23.554, + "grad_norm": 1.0528229475021362, + "learning_rate": 2e-05, + "loss": 0.02914536, + "step": 11777 + }, + { + "epoch": 23.556, + "grad_norm": 1.155535340309143, + "learning_rate": 2e-05, + "loss": 0.03187998, + "step": 11778 + }, + { + "epoch": 23.558, + "grad_norm": 1.5524402856826782, + "learning_rate": 2e-05, + "loss": 0.04663787, + "step": 11779 + }, + { + "epoch": 23.56, + "grad_norm": 1.0907926559448242, + "learning_rate": 2e-05, + "loss": 0.04237938, + "step": 11780 + }, + { + "epoch": 23.562, + "grad_norm": 1.2452819347381592, + "learning_rate": 2e-05, + "loss": 0.04205526, + "step": 11781 + }, + { + "epoch": 23.564, + "grad_norm": 1.1769381761550903, + "learning_rate": 2e-05, + "loss": 0.03974335, + "step": 11782 + }, + { + "epoch": 23.566, + "grad_norm": 1.1212342977523804, + "learning_rate": 2e-05, + "loss": 0.04549305, + "step": 11783 + }, + { + "epoch": 23.568, + "grad_norm": 1.3117882013320923, + "learning_rate": 2e-05, + "loss": 0.03518543, + "step": 11784 + }, + { + "epoch": 23.57, + "grad_norm": 2.5933187007904053, + "learning_rate": 2e-05, + "loss": 0.05680026, + "step": 11785 + }, + { + "epoch": 23.572, + "grad_norm": 1.0752009153366089, + "learning_rate": 2e-05, + "loss": 0.03501371, + "step": 11786 + }, + { + "epoch": 23.574, + "grad_norm": 1.2819617986679077, + "learning_rate": 2e-05, + "loss": 0.04427963, + "step": 11787 + }, + { + "epoch": 23.576, + "grad_norm": 1.0761966705322266, + "learning_rate": 2e-05, + "loss": 0.03001852, + "step": 11788 + }, + { + "epoch": 23.578, + "grad_norm": 1.9416499137878418, + "learning_rate": 2e-05, + "loss": 0.05700012, + "step": 11789 + }, + { + "epoch": 23.58, + "grad_norm": 2.196974992752075, + "learning_rate": 2e-05, + "loss": 0.06811506, + "step": 11790 + }, + { + "epoch": 23.582, + "grad_norm": 1.458816409111023, + "learning_rate": 2e-05, + "loss": 0.03885439, + "step": 11791 + }, + { + "epoch": 23.584, + "grad_norm": 1.5310564041137695, + "learning_rate": 2e-05, + "loss": 0.04649789, + "step": 11792 + }, + { + "epoch": 23.586, + "grad_norm": 1.3450838327407837, + "learning_rate": 2e-05, + "loss": 0.05772156, + "step": 11793 + }, + { + "epoch": 23.588, + "grad_norm": 1.8807865381240845, + "learning_rate": 2e-05, + "loss": 0.04831642, + "step": 11794 + }, + { + "epoch": 23.59, + "grad_norm": 1.4770256280899048, + "learning_rate": 2e-05, + "loss": 0.05228359, + "step": 11795 + }, + { + "epoch": 23.592, + "grad_norm": 1.5434561967849731, + "learning_rate": 2e-05, + "loss": 0.04642072, + "step": 11796 + }, + { + "epoch": 23.594, + "grad_norm": 1.5032665729522705, + "learning_rate": 2e-05, + "loss": 0.04318558, + "step": 11797 + }, + { + "epoch": 23.596, + "grad_norm": 1.2868092060089111, + "learning_rate": 2e-05, + "loss": 0.05032202, + "step": 11798 + }, + { + "epoch": 23.598, + "grad_norm": 2.0231192111968994, + "learning_rate": 2e-05, + "loss": 0.0559545, + "step": 11799 + }, + { + "epoch": 23.6, + "grad_norm": 1.333901047706604, + "learning_rate": 2e-05, + "loss": 0.04533472, + "step": 11800 + }, + { + "epoch": 23.602, + "grad_norm": 1.2813880443572998, + "learning_rate": 2e-05, + "loss": 0.04703806, + "step": 11801 + }, + { + "epoch": 23.604, + "grad_norm": 2.0209999084472656, + "learning_rate": 2e-05, + "loss": 0.03075261, + "step": 11802 + }, + { + "epoch": 23.606, + "grad_norm": 0.8158363103866577, + "learning_rate": 2e-05, + "loss": 0.02488266, + "step": 11803 + }, + { + "epoch": 23.608, + "grad_norm": 1.2968827486038208, + "learning_rate": 2e-05, + "loss": 0.04825204, + "step": 11804 + }, + { + 
"epoch": 23.61, + "grad_norm": 1.1694176197052002, + "learning_rate": 2e-05, + "loss": 0.04675373, + "step": 11805 + }, + { + "epoch": 23.612, + "grad_norm": 2.4291164875030518, + "learning_rate": 2e-05, + "loss": 0.03782834, + "step": 11806 + }, + { + "epoch": 23.614, + "grad_norm": 3.2317073345184326, + "learning_rate": 2e-05, + "loss": 0.05336016, + "step": 11807 + }, + { + "epoch": 23.616, + "grad_norm": 2.4341442584991455, + "learning_rate": 2e-05, + "loss": 0.04534258, + "step": 11808 + }, + { + "epoch": 23.618, + "grad_norm": 1.3405523300170898, + "learning_rate": 2e-05, + "loss": 0.03542194, + "step": 11809 + }, + { + "epoch": 23.62, + "grad_norm": 0.952492356300354, + "learning_rate": 2e-05, + "loss": 0.02990831, + "step": 11810 + }, + { + "epoch": 23.622, + "grad_norm": 1.8398486375808716, + "learning_rate": 2e-05, + "loss": 0.04314672, + "step": 11811 + }, + { + "epoch": 23.624, + "grad_norm": 1.486546277999878, + "learning_rate": 2e-05, + "loss": 0.04819717, + "step": 11812 + }, + { + "epoch": 23.626, + "grad_norm": 1.3383139371871948, + "learning_rate": 2e-05, + "loss": 0.03825889, + "step": 11813 + }, + { + "epoch": 23.628, + "grad_norm": 1.57940673828125, + "learning_rate": 2e-05, + "loss": 0.03851825, + "step": 11814 + }, + { + "epoch": 23.63, + "grad_norm": 1.3672587871551514, + "learning_rate": 2e-05, + "loss": 0.04236236, + "step": 11815 + }, + { + "epoch": 23.632, + "grad_norm": 1.9322770833969116, + "learning_rate": 2e-05, + "loss": 0.04957227, + "step": 11816 + }, + { + "epoch": 23.634, + "grad_norm": 2.318655014038086, + "learning_rate": 2e-05, + "loss": 0.05477686, + "step": 11817 + }, + { + "epoch": 23.636, + "grad_norm": 1.3054866790771484, + "learning_rate": 2e-05, + "loss": 0.0492695, + "step": 11818 + }, + { + "epoch": 23.638, + "grad_norm": 1.7329834699630737, + "learning_rate": 2e-05, + "loss": 0.03611884, + "step": 11819 + }, + { + "epoch": 23.64, + "grad_norm": 1.0600239038467407, + "learning_rate": 2e-05, + "loss": 0.0398752, + "step": 11820 + }, + { + "epoch": 23.642, + "grad_norm": 1.5419100522994995, + "learning_rate": 2e-05, + "loss": 0.05457453, + "step": 11821 + }, + { + "epoch": 23.644, + "grad_norm": 1.1670844554901123, + "learning_rate": 2e-05, + "loss": 0.04318011, + "step": 11822 + }, + { + "epoch": 23.646, + "grad_norm": 1.8038170337677002, + "learning_rate": 2e-05, + "loss": 0.05024581, + "step": 11823 + }, + { + "epoch": 23.648, + "grad_norm": 1.4596983194351196, + "learning_rate": 2e-05, + "loss": 0.04731473, + "step": 11824 + }, + { + "epoch": 23.65, + "grad_norm": 2.0060338973999023, + "learning_rate": 2e-05, + "loss": 0.03213666, + "step": 11825 + }, + { + "epoch": 23.652, + "grad_norm": 2.789665699005127, + "learning_rate": 2e-05, + "loss": 0.05821307, + "step": 11826 + }, + { + "epoch": 23.654, + "grad_norm": 3.190103054046631, + "learning_rate": 2e-05, + "loss": 0.04068895, + "step": 11827 + }, + { + "epoch": 23.656, + "grad_norm": 1.2811437845230103, + "learning_rate": 2e-05, + "loss": 0.05212625, + "step": 11828 + }, + { + "epoch": 23.658, + "grad_norm": 1.1203440427780151, + "learning_rate": 2e-05, + "loss": 0.04808802, + "step": 11829 + }, + { + "epoch": 23.66, + "grad_norm": 2.3472464084625244, + "learning_rate": 2e-05, + "loss": 0.0419308, + "step": 11830 + }, + { + "epoch": 23.662, + "grad_norm": 1.1911481618881226, + "learning_rate": 2e-05, + "loss": 0.05066958, + "step": 11831 + }, + { + "epoch": 23.664, + "grad_norm": 1.5024276971817017, + "learning_rate": 2e-05, + "loss": 0.03530101, + "step": 11832 + }, + { + "epoch": 
23.666, + "grad_norm": 2.032147169113159, + "learning_rate": 2e-05, + "loss": 0.04991383, + "step": 11833 + }, + { + "epoch": 23.668, + "grad_norm": 1.2729007005691528, + "learning_rate": 2e-05, + "loss": 0.04908589, + "step": 11834 + }, + { + "epoch": 23.67, + "grad_norm": 7.958946228027344, + "learning_rate": 2e-05, + "loss": 0.06328648, + "step": 11835 + }, + { + "epoch": 23.672, + "grad_norm": 1.1186507940292358, + "learning_rate": 2e-05, + "loss": 0.03300332, + "step": 11836 + }, + { + "epoch": 23.674, + "grad_norm": 1.4105454683303833, + "learning_rate": 2e-05, + "loss": 0.04340991, + "step": 11837 + }, + { + "epoch": 23.676, + "grad_norm": 1.504833698272705, + "learning_rate": 2e-05, + "loss": 0.05525521, + "step": 11838 + }, + { + "epoch": 23.678, + "grad_norm": 1.2396059036254883, + "learning_rate": 2e-05, + "loss": 0.04817509, + "step": 11839 + }, + { + "epoch": 23.68, + "grad_norm": 1.5307303667068481, + "learning_rate": 2e-05, + "loss": 0.02923251, + "step": 11840 + }, + { + "epoch": 23.682, + "grad_norm": 1.6105175018310547, + "learning_rate": 2e-05, + "loss": 0.03141434, + "step": 11841 + }, + { + "epoch": 23.684, + "grad_norm": 1.293770670890808, + "learning_rate": 2e-05, + "loss": 0.05276821, + "step": 11842 + }, + { + "epoch": 23.686, + "grad_norm": 1.8566535711288452, + "learning_rate": 2e-05, + "loss": 0.04680733, + "step": 11843 + }, + { + "epoch": 23.688, + "grad_norm": 1.2725869417190552, + "learning_rate": 2e-05, + "loss": 0.03304363, + "step": 11844 + }, + { + "epoch": 23.69, + "grad_norm": 1.192263126373291, + "learning_rate": 2e-05, + "loss": 0.03634157, + "step": 11845 + }, + { + "epoch": 23.692, + "grad_norm": 3.0075485706329346, + "learning_rate": 2e-05, + "loss": 0.04553998, + "step": 11846 + }, + { + "epoch": 23.694, + "grad_norm": 2.9852254390716553, + "learning_rate": 2e-05, + "loss": 0.06526848, + "step": 11847 + }, + { + "epoch": 23.696, + "grad_norm": 1.9898120164871216, + "learning_rate": 2e-05, + "loss": 0.03190294, + "step": 11848 + }, + { + "epoch": 23.698, + "grad_norm": 1.2109845876693726, + "learning_rate": 2e-05, + "loss": 0.04095478, + "step": 11849 + }, + { + "epoch": 23.7, + "grad_norm": 3.1448001861572266, + "learning_rate": 2e-05, + "loss": 0.0624724, + "step": 11850 + }, + { + "epoch": 23.701999999999998, + "grad_norm": 1.063796043395996, + "learning_rate": 2e-05, + "loss": 0.04226255, + "step": 11851 + }, + { + "epoch": 23.704, + "grad_norm": 1.0157510042190552, + "learning_rate": 2e-05, + "loss": 0.04364079, + "step": 11852 + }, + { + "epoch": 23.706, + "grad_norm": 0.9760161638259888, + "learning_rate": 2e-05, + "loss": 0.02745278, + "step": 11853 + }, + { + "epoch": 23.708, + "grad_norm": 1.4634634256362915, + "learning_rate": 2e-05, + "loss": 0.04308833, + "step": 11854 + }, + { + "epoch": 23.71, + "grad_norm": 1.4211335182189941, + "learning_rate": 2e-05, + "loss": 0.03839317, + "step": 11855 + }, + { + "epoch": 23.712, + "grad_norm": 1.0064549446105957, + "learning_rate": 2e-05, + "loss": 0.03243163, + "step": 11856 + }, + { + "epoch": 23.714, + "grad_norm": 1.0889908075332642, + "learning_rate": 2e-05, + "loss": 0.03015575, + "step": 11857 + }, + { + "epoch": 23.716, + "grad_norm": 1.499775767326355, + "learning_rate": 2e-05, + "loss": 0.04761718, + "step": 11858 + }, + { + "epoch": 23.718, + "grad_norm": 1.0862691402435303, + "learning_rate": 2e-05, + "loss": 0.03753242, + "step": 11859 + }, + { + "epoch": 23.72, + "grad_norm": 0.9295043349266052, + "learning_rate": 2e-05, + "loss": 0.02882018, + "step": 11860 + }, + { + "epoch": 
23.722, + "grad_norm": 1.0936144590377808, + "learning_rate": 2e-05, + "loss": 0.03536011, + "step": 11861 + }, + { + "epoch": 23.724, + "grad_norm": 0.9443338513374329, + "learning_rate": 2e-05, + "loss": 0.03331787, + "step": 11862 + }, + { + "epoch": 23.726, + "grad_norm": 1.0032241344451904, + "learning_rate": 2e-05, + "loss": 0.02793972, + "step": 11863 + }, + { + "epoch": 23.728, + "grad_norm": 1.677073359489441, + "learning_rate": 2e-05, + "loss": 0.04015389, + "step": 11864 + }, + { + "epoch": 23.73, + "grad_norm": 2.5019454956054688, + "learning_rate": 2e-05, + "loss": 0.04113224, + "step": 11865 + }, + { + "epoch": 23.732, + "grad_norm": 1.2569458484649658, + "learning_rate": 2e-05, + "loss": 0.04976218, + "step": 11866 + }, + { + "epoch": 23.734, + "grad_norm": 1.3353568315505981, + "learning_rate": 2e-05, + "loss": 0.0452405, + "step": 11867 + }, + { + "epoch": 23.736, + "grad_norm": 1.0816160440444946, + "learning_rate": 2e-05, + "loss": 0.03934583, + "step": 11868 + }, + { + "epoch": 23.738, + "grad_norm": 2.60617733001709, + "learning_rate": 2e-05, + "loss": 0.04733625, + "step": 11869 + }, + { + "epoch": 23.74, + "grad_norm": 1.3264927864074707, + "learning_rate": 2e-05, + "loss": 0.04373094, + "step": 11870 + }, + { + "epoch": 23.742, + "grad_norm": 1.8265389204025269, + "learning_rate": 2e-05, + "loss": 0.04801701, + "step": 11871 + }, + { + "epoch": 23.744, + "grad_norm": 1.5918618440628052, + "learning_rate": 2e-05, + "loss": 0.03365224, + "step": 11872 + }, + { + "epoch": 23.746, + "grad_norm": 1.1717815399169922, + "learning_rate": 2e-05, + "loss": 0.04041979, + "step": 11873 + }, + { + "epoch": 23.748, + "grad_norm": 1.1238340139389038, + "learning_rate": 2e-05, + "loss": 0.03989327, + "step": 11874 + }, + { + "epoch": 23.75, + "grad_norm": 3.7857089042663574, + "learning_rate": 2e-05, + "loss": 0.04945645, + "step": 11875 + }, + { + "epoch": 23.752, + "grad_norm": 3.371209144592285, + "learning_rate": 2e-05, + "loss": 0.0476009, + "step": 11876 + }, + { + "epoch": 23.754, + "grad_norm": 1.3550797700881958, + "learning_rate": 2e-05, + "loss": 0.04116515, + "step": 11877 + }, + { + "epoch": 23.756, + "grad_norm": 1.1104875802993774, + "learning_rate": 2e-05, + "loss": 0.03957765, + "step": 11878 + }, + { + "epoch": 23.758, + "grad_norm": 1.2116479873657227, + "learning_rate": 2e-05, + "loss": 0.03302076, + "step": 11879 + }, + { + "epoch": 23.76, + "grad_norm": 1.565588355064392, + "learning_rate": 2e-05, + "loss": 0.05508092, + "step": 11880 + }, + { + "epoch": 23.762, + "grad_norm": 6.225358486175537, + "learning_rate": 2e-05, + "loss": 0.0478447, + "step": 11881 + }, + { + "epoch": 23.764, + "grad_norm": 1.688665747642517, + "learning_rate": 2e-05, + "loss": 0.0437132, + "step": 11882 + }, + { + "epoch": 23.766, + "grad_norm": 1.2847251892089844, + "learning_rate": 2e-05, + "loss": 0.04984112, + "step": 11883 + }, + { + "epoch": 23.768, + "grad_norm": 1.4837887287139893, + "learning_rate": 2e-05, + "loss": 0.03870174, + "step": 11884 + }, + { + "epoch": 23.77, + "grad_norm": 1.2898259162902832, + "learning_rate": 2e-05, + "loss": 0.0534673, + "step": 11885 + }, + { + "epoch": 23.772, + "grad_norm": 1.0326087474822998, + "learning_rate": 2e-05, + "loss": 0.0354575, + "step": 11886 + }, + { + "epoch": 23.774, + "grad_norm": 1.8046265840530396, + "learning_rate": 2e-05, + "loss": 0.0476476, + "step": 11887 + }, + { + "epoch": 23.776, + "grad_norm": 1.4603756666183472, + "learning_rate": 2e-05, + "loss": 0.04588185, + "step": 11888 + }, + { + "epoch": 23.778, + 
"grad_norm": 1.1678380966186523, + "learning_rate": 2e-05, + "loss": 0.04390156, + "step": 11889 + }, + { + "epoch": 23.78, + "grad_norm": 1.5542722940444946, + "learning_rate": 2e-05, + "loss": 0.04794715, + "step": 11890 + }, + { + "epoch": 23.782, + "grad_norm": 1.2043498754501343, + "learning_rate": 2e-05, + "loss": 0.03155842, + "step": 11891 + }, + { + "epoch": 23.784, + "grad_norm": 1.1357460021972656, + "learning_rate": 2e-05, + "loss": 0.04152321, + "step": 11892 + }, + { + "epoch": 23.786, + "grad_norm": 1.1794549226760864, + "learning_rate": 2e-05, + "loss": 0.04586139, + "step": 11893 + }, + { + "epoch": 23.788, + "grad_norm": 0.9895995855331421, + "learning_rate": 2e-05, + "loss": 0.0327672, + "step": 11894 + }, + { + "epoch": 23.79, + "grad_norm": 1.7362169027328491, + "learning_rate": 2e-05, + "loss": 0.05948992, + "step": 11895 + }, + { + "epoch": 23.792, + "grad_norm": 0.9421995282173157, + "learning_rate": 2e-05, + "loss": 0.02367941, + "step": 11896 + }, + { + "epoch": 23.794, + "grad_norm": 2.629747152328491, + "learning_rate": 2e-05, + "loss": 0.04058916, + "step": 11897 + }, + { + "epoch": 23.796, + "grad_norm": 1.0108635425567627, + "learning_rate": 2e-05, + "loss": 0.02985597, + "step": 11898 + }, + { + "epoch": 23.798000000000002, + "grad_norm": 3.040156126022339, + "learning_rate": 2e-05, + "loss": 0.0340285, + "step": 11899 + }, + { + "epoch": 23.8, + "grad_norm": 1.1744736433029175, + "learning_rate": 2e-05, + "loss": 0.04568225, + "step": 11900 + }, + { + "epoch": 23.802, + "grad_norm": 2.0291764736175537, + "learning_rate": 2e-05, + "loss": 0.05318277, + "step": 11901 + }, + { + "epoch": 23.804, + "grad_norm": 1.3046629428863525, + "learning_rate": 2e-05, + "loss": 0.05274621, + "step": 11902 + }, + { + "epoch": 23.806, + "grad_norm": 2.1963841915130615, + "learning_rate": 2e-05, + "loss": 0.04720778, + "step": 11903 + }, + { + "epoch": 23.808, + "grad_norm": 2.655916213989258, + "learning_rate": 2e-05, + "loss": 0.0420041, + "step": 11904 + }, + { + "epoch": 23.81, + "grad_norm": 1.176939845085144, + "learning_rate": 2e-05, + "loss": 0.04580169, + "step": 11905 + }, + { + "epoch": 23.812, + "grad_norm": 1.0048329830169678, + "learning_rate": 2e-05, + "loss": 0.03731166, + "step": 11906 + }, + { + "epoch": 23.814, + "grad_norm": 1.414959192276001, + "learning_rate": 2e-05, + "loss": 0.06279035, + "step": 11907 + }, + { + "epoch": 23.816, + "grad_norm": 1.7334407567977905, + "learning_rate": 2e-05, + "loss": 0.03839599, + "step": 11908 + }, + { + "epoch": 23.818, + "grad_norm": 1.2090424299240112, + "learning_rate": 2e-05, + "loss": 0.03566598, + "step": 11909 + }, + { + "epoch": 23.82, + "grad_norm": 1.0905237197875977, + "learning_rate": 2e-05, + "loss": 0.04220146, + "step": 11910 + }, + { + "epoch": 23.822, + "grad_norm": 2.646314859390259, + "learning_rate": 2e-05, + "loss": 0.06015548, + "step": 11911 + }, + { + "epoch": 23.824, + "grad_norm": 1.4104152917861938, + "learning_rate": 2e-05, + "loss": 0.03940243, + "step": 11912 + }, + { + "epoch": 23.826, + "grad_norm": 2.7767820358276367, + "learning_rate": 2e-05, + "loss": 0.04243702, + "step": 11913 + }, + { + "epoch": 23.828, + "grad_norm": 1.3988125324249268, + "learning_rate": 2e-05, + "loss": 0.03724855, + "step": 11914 + }, + { + "epoch": 23.83, + "grad_norm": 1.0905022621154785, + "learning_rate": 2e-05, + "loss": 0.02765215, + "step": 11915 + }, + { + "epoch": 23.832, + "grad_norm": 1.3085315227508545, + "learning_rate": 2e-05, + "loss": 0.02965268, + "step": 11916 + }, + { + "epoch": 23.834, + 
"grad_norm": 2.0017828941345215, + "learning_rate": 2e-05, + "loss": 0.05234018, + "step": 11917 + }, + { + "epoch": 23.836, + "grad_norm": 1.6834447383880615, + "learning_rate": 2e-05, + "loss": 0.04721446, + "step": 11918 + }, + { + "epoch": 23.838, + "grad_norm": 1.4597904682159424, + "learning_rate": 2e-05, + "loss": 0.05151546, + "step": 11919 + }, + { + "epoch": 23.84, + "grad_norm": 2.2075035572052, + "learning_rate": 2e-05, + "loss": 0.04451163, + "step": 11920 + }, + { + "epoch": 23.842, + "grad_norm": 1.4260821342468262, + "learning_rate": 2e-05, + "loss": 0.03413969, + "step": 11921 + }, + { + "epoch": 23.844, + "grad_norm": 1.2742419242858887, + "learning_rate": 2e-05, + "loss": 0.0354282, + "step": 11922 + }, + { + "epoch": 23.846, + "grad_norm": 1.3900338411331177, + "learning_rate": 2e-05, + "loss": 0.03880811, + "step": 11923 + }, + { + "epoch": 23.848, + "grad_norm": 1.640749216079712, + "learning_rate": 2e-05, + "loss": 0.03321562, + "step": 11924 + }, + { + "epoch": 23.85, + "grad_norm": 1.2059239149093628, + "learning_rate": 2e-05, + "loss": 0.03351336, + "step": 11925 + }, + { + "epoch": 23.852, + "grad_norm": 1.6385819911956787, + "learning_rate": 2e-05, + "loss": 0.03568233, + "step": 11926 + }, + { + "epoch": 23.854, + "grad_norm": 1.6820650100708008, + "learning_rate": 2e-05, + "loss": 0.04155786, + "step": 11927 + }, + { + "epoch": 23.856, + "grad_norm": 1.2074123620986938, + "learning_rate": 2e-05, + "loss": 0.04882422, + "step": 11928 + }, + { + "epoch": 23.858, + "grad_norm": 1.130673885345459, + "learning_rate": 2e-05, + "loss": 0.03696358, + "step": 11929 + }, + { + "epoch": 23.86, + "grad_norm": 1.832159161567688, + "learning_rate": 2e-05, + "loss": 0.02932361, + "step": 11930 + }, + { + "epoch": 23.862, + "grad_norm": 1.9174375534057617, + "learning_rate": 2e-05, + "loss": 0.05652925, + "step": 11931 + }, + { + "epoch": 23.864, + "grad_norm": 1.8890222311019897, + "learning_rate": 2e-05, + "loss": 0.03878344, + "step": 11932 + }, + { + "epoch": 23.866, + "grad_norm": 1.596886157989502, + "learning_rate": 2e-05, + "loss": 0.047834, + "step": 11933 + }, + { + "epoch": 23.868, + "grad_norm": 1.2127768993377686, + "learning_rate": 2e-05, + "loss": 0.04310586, + "step": 11934 + }, + { + "epoch": 23.87, + "grad_norm": 1.5679130554199219, + "learning_rate": 2e-05, + "loss": 0.05463172, + "step": 11935 + }, + { + "epoch": 23.872, + "grad_norm": 2.985722303390503, + "learning_rate": 2e-05, + "loss": 0.05015947, + "step": 11936 + }, + { + "epoch": 23.874, + "grad_norm": 1.1504042148590088, + "learning_rate": 2e-05, + "loss": 0.04618395, + "step": 11937 + }, + { + "epoch": 23.876, + "grad_norm": 1.0759702920913696, + "learning_rate": 2e-05, + "loss": 0.03113378, + "step": 11938 + }, + { + "epoch": 23.878, + "grad_norm": 0.9902017116546631, + "learning_rate": 2e-05, + "loss": 0.03239877, + "step": 11939 + }, + { + "epoch": 23.88, + "grad_norm": 2.066293954849243, + "learning_rate": 2e-05, + "loss": 0.0501477, + "step": 11940 + }, + { + "epoch": 23.882, + "grad_norm": 1.0674852132797241, + "learning_rate": 2e-05, + "loss": 0.02988044, + "step": 11941 + }, + { + "epoch": 23.884, + "grad_norm": 1.0986425876617432, + "learning_rate": 2e-05, + "loss": 0.03874698, + "step": 11942 + }, + { + "epoch": 23.886, + "grad_norm": 2.511347770690918, + "learning_rate": 2e-05, + "loss": 0.04582223, + "step": 11943 + }, + { + "epoch": 23.888, + "grad_norm": 1.30561101436615, + "learning_rate": 2e-05, + "loss": 0.04915758, + "step": 11944 + }, + { + "epoch": 23.89, + "grad_norm": 
1.0842196941375732, + "learning_rate": 2e-05, + "loss": 0.03920197, + "step": 11945 + }, + { + "epoch": 23.892, + "grad_norm": 2.458435535430908, + "learning_rate": 2e-05, + "loss": 0.06143365, + "step": 11946 + }, + { + "epoch": 23.894, + "grad_norm": 1.3508713245391846, + "learning_rate": 2e-05, + "loss": 0.04859567, + "step": 11947 + }, + { + "epoch": 23.896, + "grad_norm": 1.2235263586044312, + "learning_rate": 2e-05, + "loss": 0.04123156, + "step": 11948 + }, + { + "epoch": 23.898, + "grad_norm": 1.3440524339675903, + "learning_rate": 2e-05, + "loss": 0.05537596, + "step": 11949 + }, + { + "epoch": 23.9, + "grad_norm": 1.618463158607483, + "learning_rate": 2e-05, + "loss": 0.05320452, + "step": 11950 + }, + { + "epoch": 23.902, + "grad_norm": 1.1047896146774292, + "learning_rate": 2e-05, + "loss": 0.03042508, + "step": 11951 + }, + { + "epoch": 23.904, + "grad_norm": 1.4379069805145264, + "learning_rate": 2e-05, + "loss": 0.03938862, + "step": 11952 + }, + { + "epoch": 23.906, + "grad_norm": 1.2280552387237549, + "learning_rate": 2e-05, + "loss": 0.03531722, + "step": 11953 + }, + { + "epoch": 23.908, + "grad_norm": 1.7554948329925537, + "learning_rate": 2e-05, + "loss": 0.0457234, + "step": 11954 + }, + { + "epoch": 23.91, + "grad_norm": 1.4118913412094116, + "learning_rate": 2e-05, + "loss": 0.04296067, + "step": 11955 + }, + { + "epoch": 23.912, + "grad_norm": 1.3764146566390991, + "learning_rate": 2e-05, + "loss": 0.04469619, + "step": 11956 + }, + { + "epoch": 23.914, + "grad_norm": 1.2633867263793945, + "learning_rate": 2e-05, + "loss": 0.03698818, + "step": 11957 + }, + { + "epoch": 23.916, + "grad_norm": 0.9480093717575073, + "learning_rate": 2e-05, + "loss": 0.02930729, + "step": 11958 + }, + { + "epoch": 23.918, + "grad_norm": 1.1288198232650757, + "learning_rate": 2e-05, + "loss": 0.04000931, + "step": 11959 + }, + { + "epoch": 23.92, + "grad_norm": 1.1547532081604004, + "learning_rate": 2e-05, + "loss": 0.04611331, + "step": 11960 + }, + { + "epoch": 23.922, + "grad_norm": 1.3261470794677734, + "learning_rate": 2e-05, + "loss": 0.05121126, + "step": 11961 + }, + { + "epoch": 23.924, + "grad_norm": 1.592149019241333, + "learning_rate": 2e-05, + "loss": 0.03919889, + "step": 11962 + }, + { + "epoch": 23.926, + "grad_norm": 1.4399209022521973, + "learning_rate": 2e-05, + "loss": 0.0399067, + "step": 11963 + }, + { + "epoch": 23.928, + "grad_norm": 2.2152695655822754, + "learning_rate": 2e-05, + "loss": 0.05575511, + "step": 11964 + }, + { + "epoch": 23.93, + "grad_norm": 1.7007336616516113, + "learning_rate": 2e-05, + "loss": 0.04545318, + "step": 11965 + }, + { + "epoch": 23.932, + "grad_norm": 1.149708867073059, + "learning_rate": 2e-05, + "loss": 0.04423167, + "step": 11966 + }, + { + "epoch": 23.934, + "grad_norm": 1.4653066396713257, + "learning_rate": 2e-05, + "loss": 0.03144412, + "step": 11967 + }, + { + "epoch": 23.936, + "grad_norm": 1.416612982749939, + "learning_rate": 2e-05, + "loss": 0.0496158, + "step": 11968 + }, + { + "epoch": 23.938, + "grad_norm": 3.0709216594696045, + "learning_rate": 2e-05, + "loss": 0.03679985, + "step": 11969 + }, + { + "epoch": 23.94, + "grad_norm": 1.1610190868377686, + "learning_rate": 2e-05, + "loss": 0.02437357, + "step": 11970 + }, + { + "epoch": 23.942, + "grad_norm": 1.3058899641036987, + "learning_rate": 2e-05, + "loss": 0.03944077, + "step": 11971 + }, + { + "epoch": 23.944, + "grad_norm": 1.279998779296875, + "learning_rate": 2e-05, + "loss": 0.03772661, + "step": 11972 + }, + { + "epoch": 23.946, + "grad_norm": 
3.4212002754211426, + "learning_rate": 2e-05, + "loss": 0.06490073, + "step": 11973 + }, + { + "epoch": 23.948, + "grad_norm": 1.8969491720199585, + "learning_rate": 2e-05, + "loss": 0.05955943, + "step": 11974 + }, + { + "epoch": 23.95, + "grad_norm": 1.3523305654525757, + "learning_rate": 2e-05, + "loss": 0.04351056, + "step": 11975 + }, + { + "epoch": 23.951999999999998, + "grad_norm": 2.597954273223877, + "learning_rate": 2e-05, + "loss": 0.06529416, + "step": 11976 + }, + { + "epoch": 23.954, + "grad_norm": 1.3287073373794556, + "learning_rate": 2e-05, + "loss": 0.02537181, + "step": 11977 + }, + { + "epoch": 23.956, + "grad_norm": 1.4415262937545776, + "learning_rate": 2e-05, + "loss": 0.04575014, + "step": 11978 + }, + { + "epoch": 23.958, + "grad_norm": 2.070765256881714, + "learning_rate": 2e-05, + "loss": 0.04959387, + "step": 11979 + }, + { + "epoch": 23.96, + "grad_norm": 1.198920726776123, + "learning_rate": 2e-05, + "loss": 0.04554899, + "step": 11980 + }, + { + "epoch": 23.962, + "grad_norm": 1.5052765607833862, + "learning_rate": 2e-05, + "loss": 0.0560311, + "step": 11981 + }, + { + "epoch": 23.964, + "grad_norm": 1.083158016204834, + "learning_rate": 2e-05, + "loss": 0.03436514, + "step": 11982 + }, + { + "epoch": 23.966, + "grad_norm": 1.2290107011795044, + "learning_rate": 2e-05, + "loss": 0.02736303, + "step": 11983 + }, + { + "epoch": 23.968, + "grad_norm": 2.753286361694336, + "learning_rate": 2e-05, + "loss": 0.05253476, + "step": 11984 + }, + { + "epoch": 23.97, + "grad_norm": 1.206758975982666, + "learning_rate": 2e-05, + "loss": 0.02033206, + "step": 11985 + }, + { + "epoch": 23.972, + "grad_norm": 2.5052928924560547, + "learning_rate": 2e-05, + "loss": 0.05282067, + "step": 11986 + }, + { + "epoch": 23.974, + "grad_norm": 1.4807002544403076, + "learning_rate": 2e-05, + "loss": 0.04118428, + "step": 11987 + }, + { + "epoch": 23.976, + "grad_norm": 0.930181622505188, + "learning_rate": 2e-05, + "loss": 0.0332891, + "step": 11988 + }, + { + "epoch": 23.978, + "grad_norm": 1.6639894247055054, + "learning_rate": 2e-05, + "loss": 0.04549947, + "step": 11989 + }, + { + "epoch": 23.98, + "grad_norm": 1.3116426467895508, + "learning_rate": 2e-05, + "loss": 0.03784752, + "step": 11990 + }, + { + "epoch": 23.982, + "grad_norm": 0.9380161166191101, + "learning_rate": 2e-05, + "loss": 0.02113626, + "step": 11991 + }, + { + "epoch": 23.984, + "grad_norm": 1.9898308515548706, + "learning_rate": 2e-05, + "loss": 0.05268819, + "step": 11992 + }, + { + "epoch": 23.986, + "grad_norm": 0.75873202085495, + "learning_rate": 2e-05, + "loss": 0.01814926, + "step": 11993 + }, + { + "epoch": 23.988, + "grad_norm": 1.49936842918396, + "learning_rate": 2e-05, + "loss": 0.03649104, + "step": 11994 + }, + { + "epoch": 23.99, + "grad_norm": 1.114337682723999, + "learning_rate": 2e-05, + "loss": 0.04498353, + "step": 11995 + }, + { + "epoch": 23.992, + "grad_norm": 1.0039514303207397, + "learning_rate": 2e-05, + "loss": 0.03449545, + "step": 11996 + }, + { + "epoch": 23.994, + "grad_norm": 0.9935739040374756, + "learning_rate": 2e-05, + "loss": 0.03035527, + "step": 11997 + }, + { + "epoch": 23.996, + "grad_norm": 1.7863175868988037, + "learning_rate": 2e-05, + "loss": 0.044944, + "step": 11998 + }, + { + "epoch": 23.998, + "grad_norm": 2.193424701690674, + "learning_rate": 2e-05, + "loss": 0.03365751, + "step": 11999 + }, + { + "epoch": 24.0, + "grad_norm": 1.198980450630188, + "learning_rate": 2e-05, + "loss": 0.02690567, + "step": 12000 + }, + { + "epoch": 24.0, + "eval_performance": { + 
"AngleClassification_1": 0.978, + "AngleClassification_2": 0.998, + "AngleClassification_3": 0.9700598802395209, + "Equal_1": 0.996, + "Equal_2": 0.9800399201596807, + "Equal_3": 0.9800399201596807, + "LineComparison_1": 1.0, + "LineComparison_2": 1.0, + "LineComparison_3": 0.998003992015968, + "Parallel_1": 0.9899799599198397, + "Parallel_2": 0.9959919839679359, + "Parallel_3": 0.992, + "Perpendicular_1": 1.0, + "Perpendicular_2": 0.988, + "Perpendicular_3": 0.8697394789579158, + "PointLiesOnCircle_1": 1.0, + "PointLiesOnCircle_2": 0.9976666666666667, + "PointLiesOnCircle_3": 0.986, + "PointLiesOnLine_1": 0.9959919839679359, + "PointLiesOnLine_2": 0.9919839679358717, + "PointLiesOnLine_3": 0.9940119760479041 + }, + "eval_runtime": 320.967, + "eval_samples_per_second": 32.714, + "eval_steps_per_second": 0.654, + "step": 12000 + }, + { + "epoch": 24.002, + "grad_norm": 1.232427716255188, + "learning_rate": 2e-05, + "loss": 0.0302996, + "step": 12001 + }, + { + "epoch": 24.004, + "grad_norm": 1.1565972566604614, + "learning_rate": 2e-05, + "loss": 0.0323018, + "step": 12002 + }, + { + "epoch": 24.006, + "grad_norm": 2.3447890281677246, + "learning_rate": 2e-05, + "loss": 0.04620555, + "step": 12003 + }, + { + "epoch": 24.008, + "grad_norm": 1.4546306133270264, + "learning_rate": 2e-05, + "loss": 0.04248685, + "step": 12004 + }, + { + "epoch": 24.01, + "grad_norm": 2.750286340713501, + "learning_rate": 2e-05, + "loss": 0.05949475, + "step": 12005 + }, + { + "epoch": 24.012, + "grad_norm": 2.553252935409546, + "learning_rate": 2e-05, + "loss": 0.05698572, + "step": 12006 + }, + { + "epoch": 24.014, + "grad_norm": 1.3687764406204224, + "learning_rate": 2e-05, + "loss": 0.05525066, + "step": 12007 + }, + { + "epoch": 24.016, + "grad_norm": 1.277368187904358, + "learning_rate": 2e-05, + "loss": 0.05651337, + "step": 12008 + }, + { + "epoch": 24.018, + "grad_norm": 1.1808255910873413, + "learning_rate": 2e-05, + "loss": 0.04304896, + "step": 12009 + }, + { + "epoch": 24.02, + "grad_norm": 1.2423754930496216, + "learning_rate": 2e-05, + "loss": 0.04889252, + "step": 12010 + }, + { + "epoch": 24.022, + "grad_norm": 2.5050649642944336, + "learning_rate": 2e-05, + "loss": 0.05278215, + "step": 12011 + }, + { + "epoch": 24.024, + "grad_norm": 2.068812131881714, + "learning_rate": 2e-05, + "loss": 0.04776769, + "step": 12012 + }, + { + "epoch": 24.026, + "grad_norm": 1.2705119848251343, + "learning_rate": 2e-05, + "loss": 0.06349517, + "step": 12013 + }, + { + "epoch": 24.028, + "grad_norm": 1.3783538341522217, + "learning_rate": 2e-05, + "loss": 0.04796961, + "step": 12014 + }, + { + "epoch": 24.03, + "grad_norm": 1.4065525531768799, + "learning_rate": 2e-05, + "loss": 0.05514029, + "step": 12015 + }, + { + "epoch": 24.032, + "grad_norm": 2.559398889541626, + "learning_rate": 2e-05, + "loss": 0.04297279, + "step": 12016 + }, + { + "epoch": 24.034, + "grad_norm": 1.3823760747909546, + "learning_rate": 2e-05, + "loss": 0.05167028, + "step": 12017 + }, + { + "epoch": 24.036, + "grad_norm": 0.9577856659889221, + "learning_rate": 2e-05, + "loss": 0.02910194, + "step": 12018 + }, + { + "epoch": 24.038, + "grad_norm": 1.0815929174423218, + "learning_rate": 2e-05, + "loss": 0.03755962, + "step": 12019 + }, + { + "epoch": 24.04, + "grad_norm": 1.190253734588623, + "learning_rate": 2e-05, + "loss": 0.03743257, + "step": 12020 + }, + { + "epoch": 24.042, + "grad_norm": 2.5249247550964355, + "learning_rate": 2e-05, + "loss": 0.05092564, + "step": 12021 + }, + { + "epoch": 24.044, + "grad_norm": 1.0962779521942139, 
+ "learning_rate": 2e-05, + "loss": 0.04152154, + "step": 12022 + }, + { + "epoch": 24.046, + "grad_norm": 1.5275741815567017, + "learning_rate": 2e-05, + "loss": 0.04184947, + "step": 12023 + }, + { + "epoch": 24.048, + "grad_norm": 1.0383355617523193, + "learning_rate": 2e-05, + "loss": 0.03165521, + "step": 12024 + }, + { + "epoch": 24.05, + "grad_norm": 1.5432151556015015, + "learning_rate": 2e-05, + "loss": 0.05171941, + "step": 12025 + }, + { + "epoch": 24.052, + "grad_norm": 1.249215841293335, + "learning_rate": 2e-05, + "loss": 0.04950016, + "step": 12026 + }, + { + "epoch": 24.054, + "grad_norm": 1.309938907623291, + "learning_rate": 2e-05, + "loss": 0.04960479, + "step": 12027 + }, + { + "epoch": 24.056, + "grad_norm": 0.9974091053009033, + "learning_rate": 2e-05, + "loss": 0.03317707, + "step": 12028 + }, + { + "epoch": 24.058, + "grad_norm": 1.5228265523910522, + "learning_rate": 2e-05, + "loss": 0.04856505, + "step": 12029 + }, + { + "epoch": 24.06, + "grad_norm": 1.8886607885360718, + "learning_rate": 2e-05, + "loss": 0.05328999, + "step": 12030 + }, + { + "epoch": 24.062, + "grad_norm": 1.02639639377594, + "learning_rate": 2e-05, + "loss": 0.02694909, + "step": 12031 + }, + { + "epoch": 24.064, + "grad_norm": 1.0701245069503784, + "learning_rate": 2e-05, + "loss": 0.04255382, + "step": 12032 + }, + { + "epoch": 24.066, + "grad_norm": 1.3096833229064941, + "learning_rate": 2e-05, + "loss": 0.04552224, + "step": 12033 + }, + { + "epoch": 24.068, + "grad_norm": 1.0245039463043213, + "learning_rate": 2e-05, + "loss": 0.03121547, + "step": 12034 + }, + { + "epoch": 24.07, + "grad_norm": 2.104153871536255, + "learning_rate": 2e-05, + "loss": 0.05049267, + "step": 12035 + }, + { + "epoch": 24.072, + "grad_norm": 0.9794881343841553, + "learning_rate": 2e-05, + "loss": 0.04013614, + "step": 12036 + }, + { + "epoch": 24.074, + "grad_norm": 1.3545254468917847, + "learning_rate": 2e-05, + "loss": 0.04678908, + "step": 12037 + }, + { + "epoch": 24.076, + "grad_norm": 1.7231847047805786, + "learning_rate": 2e-05, + "loss": 0.05604774, + "step": 12038 + }, + { + "epoch": 24.078, + "grad_norm": 1.0859684944152832, + "learning_rate": 2e-05, + "loss": 0.02874695, + "step": 12039 + }, + { + "epoch": 24.08, + "grad_norm": 1.4772136211395264, + "learning_rate": 2e-05, + "loss": 0.04590698, + "step": 12040 + }, + { + "epoch": 24.082, + "grad_norm": 1.1300466060638428, + "learning_rate": 2e-05, + "loss": 0.03713751, + "step": 12041 + }, + { + "epoch": 24.084, + "grad_norm": 1.4452584981918335, + "learning_rate": 2e-05, + "loss": 0.04535814, + "step": 12042 + }, + { + "epoch": 24.086, + "grad_norm": 1.703972578048706, + "learning_rate": 2e-05, + "loss": 0.05336364, + "step": 12043 + }, + { + "epoch": 24.088, + "grad_norm": 1.5008318424224854, + "learning_rate": 2e-05, + "loss": 0.0530287, + "step": 12044 + }, + { + "epoch": 24.09, + "grad_norm": 1.0992947816848755, + "learning_rate": 2e-05, + "loss": 0.05150533, + "step": 12045 + }, + { + "epoch": 24.092, + "grad_norm": 1.1373698711395264, + "learning_rate": 2e-05, + "loss": 0.04210654, + "step": 12046 + }, + { + "epoch": 24.094, + "grad_norm": 1.3362079858779907, + "learning_rate": 2e-05, + "loss": 0.04046137, + "step": 12047 + }, + { + "epoch": 24.096, + "grad_norm": 2.110414743423462, + "learning_rate": 2e-05, + "loss": 0.05383376, + "step": 12048 + }, + { + "epoch": 24.098, + "grad_norm": 2.22654128074646, + "learning_rate": 2e-05, + "loss": 0.04562001, + "step": 12049 + }, + { + "epoch": 24.1, + "grad_norm": 1.5892013311386108, + 
"learning_rate": 2e-05, + "loss": 0.04545607, + "step": 12050 + }, + { + "epoch": 24.102, + "grad_norm": 1.7397476434707642, + "learning_rate": 2e-05, + "loss": 0.05527458, + "step": 12051 + }, + { + "epoch": 24.104, + "grad_norm": 1.3350776433944702, + "learning_rate": 2e-05, + "loss": 0.04359471, + "step": 12052 + }, + { + "epoch": 24.106, + "grad_norm": 1.240904450416565, + "learning_rate": 2e-05, + "loss": 0.04825516, + "step": 12053 + }, + { + "epoch": 24.108, + "grad_norm": 1.1014329195022583, + "learning_rate": 2e-05, + "loss": 0.0354472, + "step": 12054 + }, + { + "epoch": 24.11, + "grad_norm": 1.3946998119354248, + "learning_rate": 2e-05, + "loss": 0.04494639, + "step": 12055 + }, + { + "epoch": 24.112, + "grad_norm": 1.7110241651535034, + "learning_rate": 2e-05, + "loss": 0.05996033, + "step": 12056 + }, + { + "epoch": 24.114, + "grad_norm": 1.3246296644210815, + "learning_rate": 2e-05, + "loss": 0.0507584, + "step": 12057 + }, + { + "epoch": 24.116, + "grad_norm": 1.423532247543335, + "learning_rate": 2e-05, + "loss": 0.04155703, + "step": 12058 + }, + { + "epoch": 24.118, + "grad_norm": 1.6252057552337646, + "learning_rate": 2e-05, + "loss": 0.05566594, + "step": 12059 + }, + { + "epoch": 24.12, + "grad_norm": 1.978572964668274, + "learning_rate": 2e-05, + "loss": 0.06226615, + "step": 12060 + }, + { + "epoch": 24.122, + "grad_norm": 1.053499460220337, + "learning_rate": 2e-05, + "loss": 0.03881479, + "step": 12061 + }, + { + "epoch": 24.124, + "grad_norm": 1.3819252252578735, + "learning_rate": 2e-05, + "loss": 0.05036075, + "step": 12062 + }, + { + "epoch": 24.126, + "grad_norm": 1.7073538303375244, + "learning_rate": 2e-05, + "loss": 0.05705138, + "step": 12063 + }, + { + "epoch": 24.128, + "grad_norm": 1.606673002243042, + "learning_rate": 2e-05, + "loss": 0.04539119, + "step": 12064 + }, + { + "epoch": 24.13, + "grad_norm": 1.0602710247039795, + "learning_rate": 2e-05, + "loss": 0.04797367, + "step": 12065 + }, + { + "epoch": 24.132, + "grad_norm": 1.3240618705749512, + "learning_rate": 2e-05, + "loss": 0.05344753, + "step": 12066 + }, + { + "epoch": 24.134, + "grad_norm": 1.652435541152954, + "learning_rate": 2e-05, + "loss": 0.03491877, + "step": 12067 + }, + { + "epoch": 24.136, + "grad_norm": 2.6440341472625732, + "learning_rate": 2e-05, + "loss": 0.05096871, + "step": 12068 + }, + { + "epoch": 24.138, + "grad_norm": 1.0971581935882568, + "learning_rate": 2e-05, + "loss": 0.04188203, + "step": 12069 + }, + { + "epoch": 24.14, + "grad_norm": 0.9165200591087341, + "learning_rate": 2e-05, + "loss": 0.03046097, + "step": 12070 + }, + { + "epoch": 24.142, + "grad_norm": 2.975809097290039, + "learning_rate": 2e-05, + "loss": 0.05606359, + "step": 12071 + }, + { + "epoch": 24.144, + "grad_norm": 1.127575397491455, + "learning_rate": 2e-05, + "loss": 0.0311743, + "step": 12072 + }, + { + "epoch": 24.146, + "grad_norm": 1.337193489074707, + "learning_rate": 2e-05, + "loss": 0.03485664, + "step": 12073 + }, + { + "epoch": 24.148, + "grad_norm": 2.253185510635376, + "learning_rate": 2e-05, + "loss": 0.04143036, + "step": 12074 + }, + { + "epoch": 24.15, + "grad_norm": 2.4968769550323486, + "learning_rate": 2e-05, + "loss": 0.0586257, + "step": 12075 + }, + { + "epoch": 24.152, + "grad_norm": 1.3548872470855713, + "learning_rate": 2e-05, + "loss": 0.04275392, + "step": 12076 + }, + { + "epoch": 24.154, + "grad_norm": 1.05968177318573, + "learning_rate": 2e-05, + "loss": 0.03807945, + "step": 12077 + }, + { + "epoch": 24.156, + "grad_norm": 1.2706948518753052, + "learning_rate": 
2e-05, + "loss": 0.05273522, + "step": 12078 + }, + { + "epoch": 24.158, + "grad_norm": 1.2641117572784424, + "learning_rate": 2e-05, + "loss": 0.05271117, + "step": 12079 + }, + { + "epoch": 24.16, + "grad_norm": 1.2767136096954346, + "learning_rate": 2e-05, + "loss": 0.03717405, + "step": 12080 + }, + { + "epoch": 24.162, + "grad_norm": 1.8129770755767822, + "learning_rate": 2e-05, + "loss": 0.05442118, + "step": 12081 + }, + { + "epoch": 24.164, + "grad_norm": 1.6097712516784668, + "learning_rate": 2e-05, + "loss": 0.02836596, + "step": 12082 + }, + { + "epoch": 24.166, + "grad_norm": 1.6396865844726562, + "learning_rate": 2e-05, + "loss": 0.0487038, + "step": 12083 + }, + { + "epoch": 24.168, + "grad_norm": 5.180088520050049, + "learning_rate": 2e-05, + "loss": 0.07057432, + "step": 12084 + }, + { + "epoch": 24.17, + "grad_norm": 3.231645345687866, + "learning_rate": 2e-05, + "loss": 0.0679914, + "step": 12085 + }, + { + "epoch": 24.172, + "grad_norm": 2.2453713417053223, + "learning_rate": 2e-05, + "loss": 0.0494958, + "step": 12086 + }, + { + "epoch": 24.174, + "grad_norm": 1.019257664680481, + "learning_rate": 2e-05, + "loss": 0.03262502, + "step": 12087 + }, + { + "epoch": 24.176, + "grad_norm": 1.8194106817245483, + "learning_rate": 2e-05, + "loss": 0.04817811, + "step": 12088 + }, + { + "epoch": 24.178, + "grad_norm": 1.1825400590896606, + "learning_rate": 2e-05, + "loss": 0.04405992, + "step": 12089 + }, + { + "epoch": 24.18, + "grad_norm": 0.9297054409980774, + "learning_rate": 2e-05, + "loss": 0.03029897, + "step": 12090 + }, + { + "epoch": 24.182, + "grad_norm": 1.7777191400527954, + "learning_rate": 2e-05, + "loss": 0.04615546, + "step": 12091 + }, + { + "epoch": 24.184, + "grad_norm": 3.235630750656128, + "learning_rate": 2e-05, + "loss": 0.05811172, + "step": 12092 + }, + { + "epoch": 24.186, + "grad_norm": 1.1768110990524292, + "learning_rate": 2e-05, + "loss": 0.03787137, + "step": 12093 + }, + { + "epoch": 24.188, + "grad_norm": 4.949007034301758, + "learning_rate": 2e-05, + "loss": 0.03465776, + "step": 12094 + }, + { + "epoch": 24.19, + "grad_norm": 1.4985238313674927, + "learning_rate": 2e-05, + "loss": 0.04126935, + "step": 12095 + }, + { + "epoch": 24.192, + "grad_norm": 2.3714089393615723, + "learning_rate": 2e-05, + "loss": 0.03277493, + "step": 12096 + }, + { + "epoch": 24.194, + "grad_norm": 1.5087124109268188, + "learning_rate": 2e-05, + "loss": 0.05334678, + "step": 12097 + }, + { + "epoch": 24.196, + "grad_norm": 1.2616825103759766, + "learning_rate": 2e-05, + "loss": 0.04551134, + "step": 12098 + }, + { + "epoch": 24.198, + "grad_norm": 1.158451795578003, + "learning_rate": 2e-05, + "loss": 0.04630031, + "step": 12099 + }, + { + "epoch": 24.2, + "grad_norm": 1.520171880722046, + "learning_rate": 2e-05, + "loss": 0.03448167, + "step": 12100 + }, + { + "epoch": 24.202, + "grad_norm": 5.375433921813965, + "learning_rate": 2e-05, + "loss": 0.05557346, + "step": 12101 + }, + { + "epoch": 24.204, + "grad_norm": 2.1798291206359863, + "learning_rate": 2e-05, + "loss": 0.05976859, + "step": 12102 + }, + { + "epoch": 24.206, + "grad_norm": 1.5132516622543335, + "learning_rate": 2e-05, + "loss": 0.04973812, + "step": 12103 + }, + { + "epoch": 24.208, + "grad_norm": 1.4551432132720947, + "learning_rate": 2e-05, + "loss": 0.03925546, + "step": 12104 + }, + { + "epoch": 24.21, + "grad_norm": 1.4387096166610718, + "learning_rate": 2e-05, + "loss": 0.04110787, + "step": 12105 + }, + { + "epoch": 24.212, + "grad_norm": 0.9817237854003906, + "learning_rate": 2e-05, + "loss": 
0.043275, + "step": 12106 + }, + { + "epoch": 24.214, + "grad_norm": 0.899992048740387, + "learning_rate": 2e-05, + "loss": 0.02760095, + "step": 12107 + }, + { + "epoch": 24.216, + "grad_norm": 1.9981087446212769, + "learning_rate": 2e-05, + "loss": 0.08154525, + "step": 12108 + }, + { + "epoch": 24.218, + "grad_norm": 1.6760551929473877, + "learning_rate": 2e-05, + "loss": 0.05679325, + "step": 12109 + }, + { + "epoch": 24.22, + "grad_norm": 1.3508455753326416, + "learning_rate": 2e-05, + "loss": 0.05091976, + "step": 12110 + }, + { + "epoch": 24.222, + "grad_norm": 1.8912901878356934, + "learning_rate": 2e-05, + "loss": 0.06227458, + "step": 12111 + }, + { + "epoch": 24.224, + "grad_norm": 1.3273926973342896, + "learning_rate": 2e-05, + "loss": 0.05052402, + "step": 12112 + }, + { + "epoch": 24.226, + "grad_norm": 1.2464828491210938, + "learning_rate": 2e-05, + "loss": 0.05788217, + "step": 12113 + }, + { + "epoch": 24.228, + "grad_norm": 2.049316883087158, + "learning_rate": 2e-05, + "loss": 0.06217295, + "step": 12114 + }, + { + "epoch": 24.23, + "grad_norm": 1.428179144859314, + "learning_rate": 2e-05, + "loss": 0.05017955, + "step": 12115 + }, + { + "epoch": 24.232, + "grad_norm": 1.767197847366333, + "learning_rate": 2e-05, + "loss": 0.05077636, + "step": 12116 + }, + { + "epoch": 24.234, + "grad_norm": 1.231481671333313, + "learning_rate": 2e-05, + "loss": 0.05873109, + "step": 12117 + }, + { + "epoch": 24.236, + "grad_norm": 1.6732200384140015, + "learning_rate": 2e-05, + "loss": 0.04653981, + "step": 12118 + }, + { + "epoch": 24.238, + "grad_norm": 1.0597010850906372, + "learning_rate": 2e-05, + "loss": 0.03581889, + "step": 12119 + }, + { + "epoch": 24.24, + "grad_norm": 1.2121529579162598, + "learning_rate": 2e-05, + "loss": 0.04930365, + "step": 12120 + }, + { + "epoch": 24.242, + "grad_norm": 1.3998339176177979, + "learning_rate": 2e-05, + "loss": 0.03907991, + "step": 12121 + }, + { + "epoch": 24.244, + "grad_norm": 1.2382348775863647, + "learning_rate": 2e-05, + "loss": 0.05211905, + "step": 12122 + }, + { + "epoch": 24.246, + "grad_norm": 1.1277509927749634, + "learning_rate": 2e-05, + "loss": 0.03834852, + "step": 12123 + }, + { + "epoch": 24.248, + "grad_norm": 1.2743759155273438, + "learning_rate": 2e-05, + "loss": 0.0476411, + "step": 12124 + }, + { + "epoch": 24.25, + "grad_norm": 1.2436480522155762, + "learning_rate": 2e-05, + "loss": 0.0411014, + "step": 12125 + }, + { + "epoch": 24.252, + "grad_norm": 1.9588987827301025, + "learning_rate": 2e-05, + "loss": 0.06562616, + "step": 12126 + }, + { + "epoch": 24.254, + "grad_norm": 1.318472146987915, + "learning_rate": 2e-05, + "loss": 0.04503114, + "step": 12127 + }, + { + "epoch": 24.256, + "grad_norm": 1.3602678775787354, + "learning_rate": 2e-05, + "loss": 0.02978888, + "step": 12128 + }, + { + "epoch": 24.258, + "grad_norm": 1.2801728248596191, + "learning_rate": 2e-05, + "loss": 0.03404569, + "step": 12129 + }, + { + "epoch": 24.26, + "grad_norm": 1.8510991334915161, + "learning_rate": 2e-05, + "loss": 0.041315, + "step": 12130 + }, + { + "epoch": 24.262, + "grad_norm": 1.1254820823669434, + "learning_rate": 2e-05, + "loss": 0.04042017, + "step": 12131 + }, + { + "epoch": 24.264, + "grad_norm": 1.4146277904510498, + "learning_rate": 2e-05, + "loss": 0.04251551, + "step": 12132 + }, + { + "epoch": 24.266, + "grad_norm": 1.4054226875305176, + "learning_rate": 2e-05, + "loss": 0.03718261, + "step": 12133 + }, + { + "epoch": 24.268, + "grad_norm": 1.2703016996383667, + "learning_rate": 2e-05, + "loss": 0.0467224, + 
"step": 12134 + }, + { + "epoch": 24.27, + "grad_norm": 1.6173110008239746, + "learning_rate": 2e-05, + "loss": 0.05684683, + "step": 12135 + }, + { + "epoch": 24.272, + "grad_norm": 2.035446882247925, + "learning_rate": 2e-05, + "loss": 0.04400192, + "step": 12136 + }, + { + "epoch": 24.274, + "grad_norm": 1.105391502380371, + "learning_rate": 2e-05, + "loss": 0.04434478, + "step": 12137 + }, + { + "epoch": 24.276, + "grad_norm": 1.2330455780029297, + "learning_rate": 2e-05, + "loss": 0.05438062, + "step": 12138 + }, + { + "epoch": 24.278, + "grad_norm": 1.1647205352783203, + "learning_rate": 2e-05, + "loss": 0.03314302, + "step": 12139 + }, + { + "epoch": 24.28, + "grad_norm": 1.1791774034500122, + "learning_rate": 2e-05, + "loss": 0.05466758, + "step": 12140 + }, + { + "epoch": 24.282, + "grad_norm": 1.4208977222442627, + "learning_rate": 2e-05, + "loss": 0.03269932, + "step": 12141 + }, + { + "epoch": 24.284, + "grad_norm": 1.1843621730804443, + "learning_rate": 2e-05, + "loss": 0.04443068, + "step": 12142 + }, + { + "epoch": 24.286, + "grad_norm": 1.522578239440918, + "learning_rate": 2e-05, + "loss": 0.06013615, + "step": 12143 + }, + { + "epoch": 24.288, + "grad_norm": 1.4927306175231934, + "learning_rate": 2e-05, + "loss": 0.04813029, + "step": 12144 + }, + { + "epoch": 24.29, + "grad_norm": 1.2117818593978882, + "learning_rate": 2e-05, + "loss": 0.03942649, + "step": 12145 + }, + { + "epoch": 24.292, + "grad_norm": 1.0251680612564087, + "learning_rate": 2e-05, + "loss": 0.03021972, + "step": 12146 + }, + { + "epoch": 24.294, + "grad_norm": 1.3790630102157593, + "learning_rate": 2e-05, + "loss": 0.05024281, + "step": 12147 + }, + { + "epoch": 24.296, + "grad_norm": 1.5437334775924683, + "learning_rate": 2e-05, + "loss": 0.03962575, + "step": 12148 + }, + { + "epoch": 24.298, + "grad_norm": 1.204317569732666, + "learning_rate": 2e-05, + "loss": 0.03893788, + "step": 12149 + }, + { + "epoch": 24.3, + "grad_norm": 1.1474257707595825, + "learning_rate": 2e-05, + "loss": 0.04675932, + "step": 12150 + }, + { + "epoch": 24.302, + "grad_norm": 0.9916406869888306, + "learning_rate": 2e-05, + "loss": 0.03100633, + "step": 12151 + }, + { + "epoch": 24.304, + "grad_norm": 1.9832628965377808, + "learning_rate": 2e-05, + "loss": 0.05169064, + "step": 12152 + }, + { + "epoch": 24.306, + "grad_norm": 1.811354398727417, + "learning_rate": 2e-05, + "loss": 0.05321472, + "step": 12153 + }, + { + "epoch": 24.308, + "grad_norm": 1.277738094329834, + "learning_rate": 2e-05, + "loss": 0.04673781, + "step": 12154 + }, + { + "epoch": 24.31, + "grad_norm": 1.3241530656814575, + "learning_rate": 2e-05, + "loss": 0.04513844, + "step": 12155 + }, + { + "epoch": 24.312, + "grad_norm": 2.2016355991363525, + "learning_rate": 2e-05, + "loss": 0.03816313, + "step": 12156 + }, + { + "epoch": 24.314, + "grad_norm": 1.972180962562561, + "learning_rate": 2e-05, + "loss": 0.04108272, + "step": 12157 + }, + { + "epoch": 24.316, + "grad_norm": 1.1562265157699585, + "learning_rate": 2e-05, + "loss": 0.04686077, + "step": 12158 + }, + { + "epoch": 24.318, + "grad_norm": 1.2996026277542114, + "learning_rate": 2e-05, + "loss": 0.04213938, + "step": 12159 + }, + { + "epoch": 24.32, + "grad_norm": 2.028224468231201, + "learning_rate": 2e-05, + "loss": 0.05159696, + "step": 12160 + }, + { + "epoch": 24.322, + "grad_norm": 1.2881364822387695, + "learning_rate": 2e-05, + "loss": 0.04292353, + "step": 12161 + }, + { + "epoch": 24.324, + "grad_norm": 1.4620164632797241, + "learning_rate": 2e-05, + "loss": 0.04024596, + "step": 12162 
+ }, + { + "epoch": 24.326, + "grad_norm": 1.6173001527786255, + "learning_rate": 2e-05, + "loss": 0.04233917, + "step": 12163 + }, + { + "epoch": 24.328, + "grad_norm": 1.7425227165222168, + "learning_rate": 2e-05, + "loss": 0.03686478, + "step": 12164 + }, + { + "epoch": 24.33, + "grad_norm": 1.112245798110962, + "learning_rate": 2e-05, + "loss": 0.04378101, + "step": 12165 + }, + { + "epoch": 24.332, + "grad_norm": 2.1608293056488037, + "learning_rate": 2e-05, + "loss": 0.04491881, + "step": 12166 + }, + { + "epoch": 24.334, + "grad_norm": 1.1910284757614136, + "learning_rate": 2e-05, + "loss": 0.03951531, + "step": 12167 + }, + { + "epoch": 24.336, + "grad_norm": 1.9994529485702515, + "learning_rate": 2e-05, + "loss": 0.04223923, + "step": 12168 + }, + { + "epoch": 24.338, + "grad_norm": 1.0348660945892334, + "learning_rate": 2e-05, + "loss": 0.03475592, + "step": 12169 + }, + { + "epoch": 24.34, + "grad_norm": 3.0013294219970703, + "learning_rate": 2e-05, + "loss": 0.06690484, + "step": 12170 + }, + { + "epoch": 24.342, + "grad_norm": 1.6031739711761475, + "learning_rate": 2e-05, + "loss": 0.04361062, + "step": 12171 + }, + { + "epoch": 24.344, + "grad_norm": 2.190584182739258, + "learning_rate": 2e-05, + "loss": 0.04059751, + "step": 12172 + }, + { + "epoch": 24.346, + "grad_norm": 1.5847196578979492, + "learning_rate": 2e-05, + "loss": 0.03874936, + "step": 12173 + }, + { + "epoch": 24.348, + "grad_norm": 2.4007716178894043, + "learning_rate": 2e-05, + "loss": 0.04349686, + "step": 12174 + }, + { + "epoch": 24.35, + "grad_norm": 3.5807485580444336, + "learning_rate": 2e-05, + "loss": 0.04612588, + "step": 12175 + }, + { + "epoch": 24.352, + "grad_norm": 1.5115255117416382, + "learning_rate": 2e-05, + "loss": 0.05002192, + "step": 12176 + }, + { + "epoch": 24.354, + "grad_norm": 1.3225903511047363, + "learning_rate": 2e-05, + "loss": 0.04832003, + "step": 12177 + }, + { + "epoch": 24.356, + "grad_norm": 1.0971873998641968, + "learning_rate": 2e-05, + "loss": 0.038155, + "step": 12178 + }, + { + "epoch": 24.358, + "grad_norm": 0.9766896367073059, + "learning_rate": 2e-05, + "loss": 0.0364832, + "step": 12179 + }, + { + "epoch": 24.36, + "grad_norm": 2.5646822452545166, + "learning_rate": 2e-05, + "loss": 0.04737934, + "step": 12180 + }, + { + "epoch": 24.362, + "grad_norm": 1.9678560495376587, + "learning_rate": 2e-05, + "loss": 0.03837503, + "step": 12181 + }, + { + "epoch": 24.364, + "grad_norm": 1.550510048866272, + "learning_rate": 2e-05, + "loss": 0.05680725, + "step": 12182 + }, + { + "epoch": 24.366, + "grad_norm": 1.2245664596557617, + "learning_rate": 2e-05, + "loss": 0.04190246, + "step": 12183 + }, + { + "epoch": 24.368, + "grad_norm": 1.0684999227523804, + "learning_rate": 2e-05, + "loss": 0.04685017, + "step": 12184 + }, + { + "epoch": 24.37, + "grad_norm": 1.253795862197876, + "learning_rate": 2e-05, + "loss": 0.05009492, + "step": 12185 + }, + { + "epoch": 24.372, + "grad_norm": 1.1181055307388306, + "learning_rate": 2e-05, + "loss": 0.03659803, + "step": 12186 + }, + { + "epoch": 24.374, + "grad_norm": 2.1159071922302246, + "learning_rate": 2e-05, + "loss": 0.04460203, + "step": 12187 + }, + { + "epoch": 24.376, + "grad_norm": 1.5088142156600952, + "learning_rate": 2e-05, + "loss": 0.05556968, + "step": 12188 + }, + { + "epoch": 24.378, + "grad_norm": 2.524197578430176, + "learning_rate": 2e-05, + "loss": 0.05749575, + "step": 12189 + }, + { + "epoch": 24.38, + "grad_norm": 1.4844391345977783, + "learning_rate": 2e-05, + "loss": 0.04623716, + "step": 12190 + }, + { + 
"epoch": 24.382, + "grad_norm": 1.9306976795196533, + "learning_rate": 2e-05, + "loss": 0.04222405, + "step": 12191 + }, + { + "epoch": 24.384, + "grad_norm": 1.4491407871246338, + "learning_rate": 2e-05, + "loss": 0.03950711, + "step": 12192 + }, + { + "epoch": 24.386, + "grad_norm": 3.8366308212280273, + "learning_rate": 2e-05, + "loss": 0.04016727, + "step": 12193 + }, + { + "epoch": 24.388, + "grad_norm": 2.6945362091064453, + "learning_rate": 2e-05, + "loss": 0.05264809, + "step": 12194 + }, + { + "epoch": 24.39, + "grad_norm": 1.028458833694458, + "learning_rate": 2e-05, + "loss": 0.02927229, + "step": 12195 + }, + { + "epoch": 24.392, + "grad_norm": 1.1725099086761475, + "learning_rate": 2e-05, + "loss": 0.04898581, + "step": 12196 + }, + { + "epoch": 24.394, + "grad_norm": 1.7470937967300415, + "learning_rate": 2e-05, + "loss": 0.03954092, + "step": 12197 + }, + { + "epoch": 24.396, + "grad_norm": 2.421860456466675, + "learning_rate": 2e-05, + "loss": 0.04920001, + "step": 12198 + }, + { + "epoch": 24.398, + "grad_norm": 1.3461695909500122, + "learning_rate": 2e-05, + "loss": 0.04510684, + "step": 12199 + }, + { + "epoch": 24.4, + "grad_norm": 1.7747408151626587, + "learning_rate": 2e-05, + "loss": 0.05545141, + "step": 12200 + }, + { + "epoch": 24.402, + "grad_norm": 1.768431544303894, + "learning_rate": 2e-05, + "loss": 0.03842364, + "step": 12201 + }, + { + "epoch": 24.404, + "grad_norm": 1.4380724430084229, + "learning_rate": 2e-05, + "loss": 0.03090051, + "step": 12202 + }, + { + "epoch": 24.406, + "grad_norm": 1.3252431154251099, + "learning_rate": 2e-05, + "loss": 0.04328402, + "step": 12203 + }, + { + "epoch": 24.408, + "grad_norm": 1.5969432592391968, + "learning_rate": 2e-05, + "loss": 0.04301886, + "step": 12204 + }, + { + "epoch": 24.41, + "grad_norm": 1.354119062423706, + "learning_rate": 2e-05, + "loss": 0.03353261, + "step": 12205 + }, + { + "epoch": 24.412, + "grad_norm": 1.791136622428894, + "learning_rate": 2e-05, + "loss": 0.06121269, + "step": 12206 + }, + { + "epoch": 24.414, + "grad_norm": 3.968132495880127, + "learning_rate": 2e-05, + "loss": 0.04851215, + "step": 12207 + }, + { + "epoch": 24.416, + "grad_norm": 1.5197120904922485, + "learning_rate": 2e-05, + "loss": 0.04968302, + "step": 12208 + }, + { + "epoch": 24.418, + "grad_norm": 2.272183418273926, + "learning_rate": 2e-05, + "loss": 0.04752434, + "step": 12209 + }, + { + "epoch": 24.42, + "grad_norm": 2.935992479324341, + "learning_rate": 2e-05, + "loss": 0.05976545, + "step": 12210 + }, + { + "epoch": 24.422, + "grad_norm": 1.0861903429031372, + "learning_rate": 2e-05, + "loss": 0.04118664, + "step": 12211 + }, + { + "epoch": 24.424, + "grad_norm": 1.050560474395752, + "learning_rate": 2e-05, + "loss": 0.03079762, + "step": 12212 + }, + { + "epoch": 24.426, + "grad_norm": 2.5487446784973145, + "learning_rate": 2e-05, + "loss": 0.04272437, + "step": 12213 + }, + { + "epoch": 24.428, + "grad_norm": 1.3722039461135864, + "learning_rate": 2e-05, + "loss": 0.0525198, + "step": 12214 + }, + { + "epoch": 24.43, + "grad_norm": 1.176689863204956, + "learning_rate": 2e-05, + "loss": 0.0387351, + "step": 12215 + }, + { + "epoch": 24.432, + "grad_norm": 1.4874681234359741, + "learning_rate": 2e-05, + "loss": 0.05370562, + "step": 12216 + }, + { + "epoch": 24.434, + "grad_norm": 1.266204595565796, + "learning_rate": 2e-05, + "loss": 0.04619331, + "step": 12217 + }, + { + "epoch": 24.436, + "grad_norm": 1.2195080518722534, + "learning_rate": 2e-05, + "loss": 0.03758291, + "step": 12218 + }, + { + "epoch": 24.438, 
+ "grad_norm": 2.4579455852508545, + "learning_rate": 2e-05, + "loss": 0.04582806, + "step": 12219 + }, + { + "epoch": 24.44, + "grad_norm": 1.3026827573776245, + "learning_rate": 2e-05, + "loss": 0.04520482, + "step": 12220 + }, + { + "epoch": 24.442, + "grad_norm": 1.2211519479751587, + "learning_rate": 2e-05, + "loss": 0.0372187, + "step": 12221 + }, + { + "epoch": 24.444, + "grad_norm": 1.1291041374206543, + "learning_rate": 2e-05, + "loss": 0.03202847, + "step": 12222 + }, + { + "epoch": 24.446, + "grad_norm": 1.0536681413650513, + "learning_rate": 2e-05, + "loss": 0.03556249, + "step": 12223 + }, + { + "epoch": 24.448, + "grad_norm": 1.5793570280075073, + "learning_rate": 2e-05, + "loss": 0.04041308, + "step": 12224 + }, + { + "epoch": 24.45, + "grad_norm": 1.9870901107788086, + "learning_rate": 2e-05, + "loss": 0.04744115, + "step": 12225 + }, + { + "epoch": 24.452, + "grad_norm": 1.1192922592163086, + "learning_rate": 2e-05, + "loss": 0.03771604, + "step": 12226 + }, + { + "epoch": 24.454, + "grad_norm": 1.5271708965301514, + "learning_rate": 2e-05, + "loss": 0.04509822, + "step": 12227 + }, + { + "epoch": 24.456, + "grad_norm": 1.09177565574646, + "learning_rate": 2e-05, + "loss": 0.03703531, + "step": 12228 + }, + { + "epoch": 24.458, + "grad_norm": 1.141800880432129, + "learning_rate": 2e-05, + "loss": 0.03585979, + "step": 12229 + }, + { + "epoch": 24.46, + "grad_norm": 0.9765757322311401, + "learning_rate": 2e-05, + "loss": 0.03288185, + "step": 12230 + }, + { + "epoch": 24.462, + "grad_norm": 1.7331185340881348, + "learning_rate": 2e-05, + "loss": 0.06162242, + "step": 12231 + }, + { + "epoch": 24.464, + "grad_norm": 1.0068011283874512, + "learning_rate": 2e-05, + "loss": 0.03150559, + "step": 12232 + }, + { + "epoch": 24.466, + "grad_norm": 2.214324951171875, + "learning_rate": 2e-05, + "loss": 0.05624455, + "step": 12233 + }, + { + "epoch": 24.468, + "grad_norm": 1.3637458086013794, + "learning_rate": 2e-05, + "loss": 0.04643622, + "step": 12234 + }, + { + "epoch": 24.47, + "grad_norm": 1.8071335554122925, + "learning_rate": 2e-05, + "loss": 0.05237973, + "step": 12235 + }, + { + "epoch": 24.472, + "grad_norm": 1.4858529567718506, + "learning_rate": 2e-05, + "loss": 0.04988906, + "step": 12236 + }, + { + "epoch": 24.474, + "grad_norm": 2.615488052368164, + "learning_rate": 2e-05, + "loss": 0.05420715, + "step": 12237 + }, + { + "epoch": 24.476, + "grad_norm": 1.3015847206115723, + "learning_rate": 2e-05, + "loss": 0.04124864, + "step": 12238 + }, + { + "epoch": 24.478, + "grad_norm": 1.670394778251648, + "learning_rate": 2e-05, + "loss": 0.05579087, + "step": 12239 + }, + { + "epoch": 24.48, + "grad_norm": 0.9430084228515625, + "learning_rate": 2e-05, + "loss": 0.03753918, + "step": 12240 + }, + { + "epoch": 24.482, + "grad_norm": 1.3326176404953003, + "learning_rate": 2e-05, + "loss": 0.05490822, + "step": 12241 + }, + { + "epoch": 24.484, + "grad_norm": 1.2883130311965942, + "learning_rate": 2e-05, + "loss": 0.0533482, + "step": 12242 + }, + { + "epoch": 24.486, + "grad_norm": 1.2082029581069946, + "learning_rate": 2e-05, + "loss": 0.04799801, + "step": 12243 + }, + { + "epoch": 24.488, + "grad_norm": 1.6594882011413574, + "learning_rate": 2e-05, + "loss": 0.04107316, + "step": 12244 + }, + { + "epoch": 24.49, + "grad_norm": 1.1790004968643188, + "learning_rate": 2e-05, + "loss": 0.03800457, + "step": 12245 + }, + { + "epoch": 24.492, + "grad_norm": 1.3230494260787964, + "learning_rate": 2e-05, + "loss": 0.05228403, + "step": 12246 + }, + { + "epoch": 24.494, + 
"grad_norm": 2.5752084255218506, + "learning_rate": 2e-05, + "loss": 0.04225325, + "step": 12247 + }, + { + "epoch": 24.496, + "grad_norm": 1.1051968336105347, + "learning_rate": 2e-05, + "loss": 0.03597275, + "step": 12248 + }, + { + "epoch": 24.498, + "grad_norm": 1.0105959177017212, + "learning_rate": 2e-05, + "loss": 0.03709402, + "step": 12249 + }, + { + "epoch": 24.5, + "grad_norm": 1.417772650718689, + "learning_rate": 2e-05, + "loss": 0.05351973, + "step": 12250 + }, + { + "epoch": 24.502, + "grad_norm": 1.1224205493927002, + "learning_rate": 2e-05, + "loss": 0.03034319, + "step": 12251 + }, + { + "epoch": 24.504, + "grad_norm": 1.2055881023406982, + "learning_rate": 2e-05, + "loss": 0.04971508, + "step": 12252 + }, + { + "epoch": 24.506, + "grad_norm": 2.210259199142456, + "learning_rate": 2e-05, + "loss": 0.05114106, + "step": 12253 + }, + { + "epoch": 24.508, + "grad_norm": 1.4939137697219849, + "learning_rate": 2e-05, + "loss": 0.05452298, + "step": 12254 + }, + { + "epoch": 24.51, + "grad_norm": 0.9842581748962402, + "learning_rate": 2e-05, + "loss": 0.03696226, + "step": 12255 + }, + { + "epoch": 24.512, + "grad_norm": 1.253783941268921, + "learning_rate": 2e-05, + "loss": 0.03539809, + "step": 12256 + }, + { + "epoch": 24.514, + "grad_norm": 3.1072165966033936, + "learning_rate": 2e-05, + "loss": 0.05696538, + "step": 12257 + }, + { + "epoch": 24.516, + "grad_norm": 1.1666399240493774, + "learning_rate": 2e-05, + "loss": 0.04510186, + "step": 12258 + }, + { + "epoch": 24.518, + "grad_norm": 1.8516592979431152, + "learning_rate": 2e-05, + "loss": 0.06507455, + "step": 12259 + }, + { + "epoch": 24.52, + "grad_norm": 2.5476531982421875, + "learning_rate": 2e-05, + "loss": 0.04386877, + "step": 12260 + }, + { + "epoch": 24.522, + "grad_norm": 1.530977487564087, + "learning_rate": 2e-05, + "loss": 0.05524298, + "step": 12261 + }, + { + "epoch": 24.524, + "grad_norm": 0.8967766761779785, + "learning_rate": 2e-05, + "loss": 0.02332542, + "step": 12262 + }, + { + "epoch": 24.526, + "grad_norm": 1.7736743688583374, + "learning_rate": 2e-05, + "loss": 0.05415568, + "step": 12263 + }, + { + "epoch": 24.528, + "grad_norm": 1.1440763473510742, + "learning_rate": 2e-05, + "loss": 0.0214408, + "step": 12264 + }, + { + "epoch": 24.53, + "grad_norm": 1.4621922969818115, + "learning_rate": 2e-05, + "loss": 0.04975047, + "step": 12265 + }, + { + "epoch": 24.532, + "grad_norm": 2.4430623054504395, + "learning_rate": 2e-05, + "loss": 0.0487452, + "step": 12266 + }, + { + "epoch": 24.534, + "grad_norm": 1.67803955078125, + "learning_rate": 2e-05, + "loss": 0.046294, + "step": 12267 + }, + { + "epoch": 24.536, + "grad_norm": 3.9234485626220703, + "learning_rate": 2e-05, + "loss": 0.05363451, + "step": 12268 + }, + { + "epoch": 24.538, + "grad_norm": 1.0378974676132202, + "learning_rate": 2e-05, + "loss": 0.03485204, + "step": 12269 + }, + { + "epoch": 24.54, + "grad_norm": 2.17879581451416, + "learning_rate": 2e-05, + "loss": 0.05576217, + "step": 12270 + }, + { + "epoch": 24.542, + "grad_norm": 1.3864517211914062, + "learning_rate": 2e-05, + "loss": 0.0414337, + "step": 12271 + }, + { + "epoch": 24.544, + "grad_norm": 1.147458553314209, + "learning_rate": 2e-05, + "loss": 0.03207621, + "step": 12272 + }, + { + "epoch": 24.546, + "grad_norm": 1.8686847686767578, + "learning_rate": 2e-05, + "loss": 0.06861442, + "step": 12273 + }, + { + "epoch": 24.548000000000002, + "grad_norm": 1.162184715270996, + "learning_rate": 2e-05, + "loss": 0.03643592, + "step": 12274 + }, + { + "epoch": 24.55, + 
"grad_norm": 2.051259994506836, + "learning_rate": 2e-05, + "loss": 0.04812557, + "step": 12275 + }, + { + "epoch": 24.552, + "grad_norm": 3.596480131149292, + "learning_rate": 2e-05, + "loss": 0.04335639, + "step": 12276 + }, + { + "epoch": 24.554, + "grad_norm": 1.473477840423584, + "learning_rate": 2e-05, + "loss": 0.05357858, + "step": 12277 + }, + { + "epoch": 24.556, + "grad_norm": 1.5587390661239624, + "learning_rate": 2e-05, + "loss": 0.04699494, + "step": 12278 + }, + { + "epoch": 24.558, + "grad_norm": 1.1098556518554688, + "learning_rate": 2e-05, + "loss": 0.04609701, + "step": 12279 + }, + { + "epoch": 24.56, + "grad_norm": 2.259882688522339, + "learning_rate": 2e-05, + "loss": 0.05510319, + "step": 12280 + }, + { + "epoch": 24.562, + "grad_norm": 1.759789228439331, + "learning_rate": 2e-05, + "loss": 0.04140832, + "step": 12281 + }, + { + "epoch": 24.564, + "grad_norm": 1.605728268623352, + "learning_rate": 2e-05, + "loss": 0.04575901, + "step": 12282 + }, + { + "epoch": 24.566, + "grad_norm": 1.5841501951217651, + "learning_rate": 2e-05, + "loss": 0.06220622, + "step": 12283 + }, + { + "epoch": 24.568, + "grad_norm": 1.8580799102783203, + "learning_rate": 2e-05, + "loss": 0.0317122, + "step": 12284 + }, + { + "epoch": 24.57, + "grad_norm": 1.2249244451522827, + "learning_rate": 2e-05, + "loss": 0.04554923, + "step": 12285 + }, + { + "epoch": 24.572, + "grad_norm": 1.4103236198425293, + "learning_rate": 2e-05, + "loss": 0.05672751, + "step": 12286 + }, + { + "epoch": 24.574, + "grad_norm": 1.2100307941436768, + "learning_rate": 2e-05, + "loss": 0.04418559, + "step": 12287 + }, + { + "epoch": 24.576, + "grad_norm": 1.7110328674316406, + "learning_rate": 2e-05, + "loss": 0.05983176, + "step": 12288 + }, + { + "epoch": 24.578, + "grad_norm": 4.228270053863525, + "learning_rate": 2e-05, + "loss": 0.05921287, + "step": 12289 + }, + { + "epoch": 24.58, + "grad_norm": 1.1426968574523926, + "learning_rate": 2e-05, + "loss": 0.04294404, + "step": 12290 + }, + { + "epoch": 24.582, + "grad_norm": 2.854487419128418, + "learning_rate": 2e-05, + "loss": 0.04818879, + "step": 12291 + }, + { + "epoch": 24.584, + "grad_norm": 1.464268445968628, + "learning_rate": 2e-05, + "loss": 0.05259855, + "step": 12292 + }, + { + "epoch": 24.586, + "grad_norm": 1.6875348091125488, + "learning_rate": 2e-05, + "loss": 0.04386838, + "step": 12293 + }, + { + "epoch": 24.588, + "grad_norm": 1.284960150718689, + "learning_rate": 2e-05, + "loss": 0.03831192, + "step": 12294 + }, + { + "epoch": 24.59, + "grad_norm": 1.7952815294265747, + "learning_rate": 2e-05, + "loss": 0.05706906, + "step": 12295 + }, + { + "epoch": 24.592, + "grad_norm": 1.6755213737487793, + "learning_rate": 2e-05, + "loss": 0.05718254, + "step": 12296 + }, + { + "epoch": 24.594, + "grad_norm": 1.6490305662155151, + "learning_rate": 2e-05, + "loss": 0.04554712, + "step": 12297 + }, + { + "epoch": 24.596, + "grad_norm": 1.4170808792114258, + "learning_rate": 2e-05, + "loss": 0.04948672, + "step": 12298 + }, + { + "epoch": 24.598, + "grad_norm": 0.9543237686157227, + "learning_rate": 2e-05, + "loss": 0.03450447, + "step": 12299 + }, + { + "epoch": 24.6, + "grad_norm": 1.3354976177215576, + "learning_rate": 2e-05, + "loss": 0.04121848, + "step": 12300 + }, + { + "epoch": 24.602, + "grad_norm": 1.322101354598999, + "learning_rate": 2e-05, + "loss": 0.03823018, + "step": 12301 + }, + { + "epoch": 24.604, + "grad_norm": 3.014385938644409, + "learning_rate": 2e-05, + "loss": 0.04420658, + "step": 12302 + }, + { + "epoch": 24.606, + "grad_norm": 
1.7275763750076294, + "learning_rate": 2e-05, + "loss": 0.05624251, + "step": 12303 + }, + { + "epoch": 24.608, + "grad_norm": 1.091782569885254, + "learning_rate": 2e-05, + "loss": 0.0310542, + "step": 12304 + }, + { + "epoch": 24.61, + "grad_norm": 1.6847692728042603, + "learning_rate": 2e-05, + "loss": 0.04774918, + "step": 12305 + }, + { + "epoch": 24.612, + "grad_norm": 1.1882266998291016, + "learning_rate": 2e-05, + "loss": 0.04614477, + "step": 12306 + }, + { + "epoch": 24.614, + "grad_norm": 2.3512465953826904, + "learning_rate": 2e-05, + "loss": 0.07600546, + "step": 12307 + }, + { + "epoch": 24.616, + "grad_norm": 1.3159834146499634, + "learning_rate": 2e-05, + "loss": 0.05892436, + "step": 12308 + }, + { + "epoch": 24.618, + "grad_norm": 1.2765941619873047, + "learning_rate": 2e-05, + "loss": 0.03476145, + "step": 12309 + }, + { + "epoch": 24.62, + "grad_norm": 1.3784600496292114, + "learning_rate": 2e-05, + "loss": 0.03707298, + "step": 12310 + }, + { + "epoch": 24.622, + "grad_norm": 1.248801350593567, + "learning_rate": 2e-05, + "loss": 0.04117713, + "step": 12311 + }, + { + "epoch": 24.624, + "grad_norm": 1.257441759109497, + "learning_rate": 2e-05, + "loss": 0.03224612, + "step": 12312 + }, + { + "epoch": 24.626, + "grad_norm": 1.3691023588180542, + "learning_rate": 2e-05, + "loss": 0.04922335, + "step": 12313 + }, + { + "epoch": 24.628, + "grad_norm": 1.3157551288604736, + "learning_rate": 2e-05, + "loss": 0.03670745, + "step": 12314 + }, + { + "epoch": 24.63, + "grad_norm": 1.2883113622665405, + "learning_rate": 2e-05, + "loss": 0.03211619, + "step": 12315 + }, + { + "epoch": 24.632, + "grad_norm": 1.0646746158599854, + "learning_rate": 2e-05, + "loss": 0.03154656, + "step": 12316 + }, + { + "epoch": 24.634, + "grad_norm": 2.252239227294922, + "learning_rate": 2e-05, + "loss": 0.0534423, + "step": 12317 + }, + { + "epoch": 24.636, + "grad_norm": 1.5512782335281372, + "learning_rate": 2e-05, + "loss": 0.05046772, + "step": 12318 + }, + { + "epoch": 24.638, + "grad_norm": 1.1680291891098022, + "learning_rate": 2e-05, + "loss": 0.04530792, + "step": 12319 + }, + { + "epoch": 24.64, + "grad_norm": 1.2998766899108887, + "learning_rate": 2e-05, + "loss": 0.0484796, + "step": 12320 + }, + { + "epoch": 24.642, + "grad_norm": 2.491387128829956, + "learning_rate": 2e-05, + "loss": 0.04955789, + "step": 12321 + }, + { + "epoch": 24.644, + "grad_norm": 2.7813923358917236, + "learning_rate": 2e-05, + "loss": 0.05465161, + "step": 12322 + }, + { + "epoch": 24.646, + "grad_norm": 3.847205400466919, + "learning_rate": 2e-05, + "loss": 0.07493284, + "step": 12323 + }, + { + "epoch": 24.648, + "grad_norm": 1.364022135734558, + "learning_rate": 2e-05, + "loss": 0.03194257, + "step": 12324 + }, + { + "epoch": 24.65, + "grad_norm": 1.3578834533691406, + "learning_rate": 2e-05, + "loss": 0.05537183, + "step": 12325 + }, + { + "epoch": 24.652, + "grad_norm": 1.1270604133605957, + "learning_rate": 2e-05, + "loss": 0.04368369, + "step": 12326 + }, + { + "epoch": 24.654, + "grad_norm": 2.827782154083252, + "learning_rate": 2e-05, + "loss": 0.05406433, + "step": 12327 + }, + { + "epoch": 24.656, + "grad_norm": 1.6303904056549072, + "learning_rate": 2e-05, + "loss": 0.05844283, + "step": 12328 + }, + { + "epoch": 24.658, + "grad_norm": 1.267575740814209, + "learning_rate": 2e-05, + "loss": 0.04553982, + "step": 12329 + }, + { + "epoch": 24.66, + "grad_norm": 2.1354129314422607, + "learning_rate": 2e-05, + "loss": 0.05353948, + "step": 12330 + }, + { + "epoch": 24.662, + "grad_norm": 
3.700606346130371, + "learning_rate": 2e-05, + "loss": 0.03866877, + "step": 12331 + }, + { + "epoch": 24.664, + "grad_norm": 3.4093546867370605, + "learning_rate": 2e-05, + "loss": 0.03720096, + "step": 12332 + }, + { + "epoch": 24.666, + "grad_norm": 1.756603479385376, + "learning_rate": 2e-05, + "loss": 0.04626716, + "step": 12333 + }, + { + "epoch": 24.668, + "grad_norm": 2.8935391902923584, + "learning_rate": 2e-05, + "loss": 0.04296909, + "step": 12334 + }, + { + "epoch": 24.67, + "grad_norm": 2.631392002105713, + "learning_rate": 2e-05, + "loss": 0.05087268, + "step": 12335 + }, + { + "epoch": 24.672, + "grad_norm": 1.9597891569137573, + "learning_rate": 2e-05, + "loss": 0.04416256, + "step": 12336 + }, + { + "epoch": 24.674, + "grad_norm": 1.3087323904037476, + "learning_rate": 2e-05, + "loss": 0.05481458, + "step": 12337 + }, + { + "epoch": 24.676, + "grad_norm": 172.79251098632812, + "learning_rate": 2e-05, + "loss": 0.04823511, + "step": 12338 + }, + { + "epoch": 24.678, + "grad_norm": 1.6188311576843262, + "learning_rate": 2e-05, + "loss": 0.05058477, + "step": 12339 + }, + { + "epoch": 24.68, + "grad_norm": 1.4070014953613281, + "learning_rate": 2e-05, + "loss": 0.05267171, + "step": 12340 + }, + { + "epoch": 24.682, + "grad_norm": 1.056060552597046, + "learning_rate": 2e-05, + "loss": 0.04122014, + "step": 12341 + }, + { + "epoch": 24.684, + "grad_norm": 1.0840667486190796, + "learning_rate": 2e-05, + "loss": 0.03836139, + "step": 12342 + }, + { + "epoch": 24.686, + "grad_norm": 1.6224464178085327, + "learning_rate": 2e-05, + "loss": 0.03195419, + "step": 12343 + }, + { + "epoch": 24.688, + "grad_norm": 1.4936470985412598, + "learning_rate": 2e-05, + "loss": 0.07893459, + "step": 12344 + }, + { + "epoch": 24.69, + "grad_norm": 1.8073782920837402, + "learning_rate": 2e-05, + "loss": 0.05727108, + "step": 12345 + }, + { + "epoch": 24.692, + "grad_norm": 1.218421220779419, + "learning_rate": 2e-05, + "loss": 0.046979, + "step": 12346 + }, + { + "epoch": 24.694, + "grad_norm": 1.725235939025879, + "learning_rate": 2e-05, + "loss": 0.03699161, + "step": 12347 + }, + { + "epoch": 24.696, + "grad_norm": 2.4789557456970215, + "learning_rate": 2e-05, + "loss": 0.04929878, + "step": 12348 + }, + { + "epoch": 24.698, + "grad_norm": 3.441601276397705, + "learning_rate": 2e-05, + "loss": 0.0512137, + "step": 12349 + }, + { + "epoch": 24.7, + "grad_norm": 1.1820622682571411, + "learning_rate": 2e-05, + "loss": 0.0380182, + "step": 12350 + }, + { + "epoch": 24.701999999999998, + "grad_norm": 1.0114235877990723, + "learning_rate": 2e-05, + "loss": 0.03822229, + "step": 12351 + }, + { + "epoch": 24.704, + "grad_norm": 1.1000703573226929, + "learning_rate": 2e-05, + "loss": 0.03765395, + "step": 12352 + }, + { + "epoch": 24.706, + "grad_norm": 1.2947152853012085, + "learning_rate": 2e-05, + "loss": 0.03386806, + "step": 12353 + }, + { + "epoch": 24.708, + "grad_norm": 1.8047212362289429, + "learning_rate": 2e-05, + "loss": 0.05379967, + "step": 12354 + }, + { + "epoch": 24.71, + "grad_norm": 2.945786476135254, + "learning_rate": 2e-05, + "loss": 0.05634, + "step": 12355 + }, + { + "epoch": 24.712, + "grad_norm": 2.3145387172698975, + "learning_rate": 2e-05, + "loss": 0.06896135, + "step": 12356 + }, + { + "epoch": 24.714, + "grad_norm": 1.7058087587356567, + "learning_rate": 2e-05, + "loss": 0.04002954, + "step": 12357 + }, + { + "epoch": 24.716, + "grad_norm": 1.0332450866699219, + "learning_rate": 2e-05, + "loss": 0.04152671, + "step": 12358 + }, + { + "epoch": 24.718, + "grad_norm": 
1.0253223180770874, + "learning_rate": 2e-05, + "loss": 0.05022538, + "step": 12359 + }, + { + "epoch": 24.72, + "grad_norm": 1.4460958242416382, + "learning_rate": 2e-05, + "loss": 0.04726547, + "step": 12360 + }, + { + "epoch": 24.722, + "grad_norm": 2.94338321685791, + "learning_rate": 2e-05, + "loss": 0.04982202, + "step": 12361 + }, + { + "epoch": 24.724, + "grad_norm": 1.2132338285446167, + "learning_rate": 2e-05, + "loss": 0.0424381, + "step": 12362 + }, + { + "epoch": 24.726, + "grad_norm": 1.2509801387786865, + "learning_rate": 2e-05, + "loss": 0.03625492, + "step": 12363 + }, + { + "epoch": 24.728, + "grad_norm": 1.001670241355896, + "learning_rate": 2e-05, + "loss": 0.03844631, + "step": 12364 + }, + { + "epoch": 24.73, + "grad_norm": 1.2094303369522095, + "learning_rate": 2e-05, + "loss": 0.05520956, + "step": 12365 + }, + { + "epoch": 24.732, + "grad_norm": 1.0332337617874146, + "learning_rate": 2e-05, + "loss": 0.03330713, + "step": 12366 + }, + { + "epoch": 24.734, + "grad_norm": 1.8160288333892822, + "learning_rate": 2e-05, + "loss": 0.06366383, + "step": 12367 + }, + { + "epoch": 24.736, + "grad_norm": 1.0962200164794922, + "learning_rate": 2e-05, + "loss": 0.03358031, + "step": 12368 + }, + { + "epoch": 24.738, + "grad_norm": 1.2256057262420654, + "learning_rate": 2e-05, + "loss": 0.04108488, + "step": 12369 + }, + { + "epoch": 24.74, + "grad_norm": 2.2090930938720703, + "learning_rate": 2e-05, + "loss": 0.06522129, + "step": 12370 + }, + { + "epoch": 24.742, + "grad_norm": 1.1580830812454224, + "learning_rate": 2e-05, + "loss": 0.03839488, + "step": 12371 + }, + { + "epoch": 24.744, + "grad_norm": 1.1396980285644531, + "learning_rate": 2e-05, + "loss": 0.04421246, + "step": 12372 + }, + { + "epoch": 24.746, + "grad_norm": 1.0483407974243164, + "learning_rate": 2e-05, + "loss": 0.03586071, + "step": 12373 + }, + { + "epoch": 24.748, + "grad_norm": 1.1575627326965332, + "learning_rate": 2e-05, + "loss": 0.02652693, + "step": 12374 + }, + { + "epoch": 24.75, + "grad_norm": 1.2578696012496948, + "learning_rate": 2e-05, + "loss": 0.04251358, + "step": 12375 + }, + { + "epoch": 24.752, + "grad_norm": 1.2931718826293945, + "learning_rate": 2e-05, + "loss": 0.04332025, + "step": 12376 + }, + { + "epoch": 24.754, + "grad_norm": 1.2202832698822021, + "learning_rate": 2e-05, + "loss": 0.0368543, + "step": 12377 + }, + { + "epoch": 24.756, + "grad_norm": 2.3942205905914307, + "learning_rate": 2e-05, + "loss": 0.05159555, + "step": 12378 + }, + { + "epoch": 24.758, + "grad_norm": 1.4237806797027588, + "learning_rate": 2e-05, + "loss": 0.04190379, + "step": 12379 + }, + { + "epoch": 24.76, + "grad_norm": 2.9148285388946533, + "learning_rate": 2e-05, + "loss": 0.0515301, + "step": 12380 + }, + { + "epoch": 24.762, + "grad_norm": 1.355830192565918, + "learning_rate": 2e-05, + "loss": 0.04525499, + "step": 12381 + }, + { + "epoch": 24.764, + "grad_norm": 1.7303012609481812, + "learning_rate": 2e-05, + "loss": 0.05111047, + "step": 12382 + }, + { + "epoch": 24.766, + "grad_norm": 1.722062110900879, + "learning_rate": 2e-05, + "loss": 0.05258113, + "step": 12383 + }, + { + "epoch": 24.768, + "grad_norm": 0.8899957537651062, + "learning_rate": 2e-05, + "loss": 0.02432646, + "step": 12384 + }, + { + "epoch": 24.77, + "grad_norm": 1.543811559677124, + "learning_rate": 2e-05, + "loss": 0.04550616, + "step": 12385 + }, + { + "epoch": 24.772, + "grad_norm": 1.6278526782989502, + "learning_rate": 2e-05, + "loss": 0.05577083, + "step": 12386 + }, + { + "epoch": 24.774, + "grad_norm": 
1.1185393333435059, + "learning_rate": 2e-05, + "loss": 0.0319613, + "step": 12387 + }, + { + "epoch": 24.776, + "grad_norm": 1.4521170854568481, + "learning_rate": 2e-05, + "loss": 0.05207948, + "step": 12388 + }, + { + "epoch": 24.778, + "grad_norm": 1.3518644571304321, + "learning_rate": 2e-05, + "loss": 0.04421254, + "step": 12389 + }, + { + "epoch": 24.78, + "grad_norm": 1.6394786834716797, + "learning_rate": 2e-05, + "loss": 0.05696079, + "step": 12390 + }, + { + "epoch": 24.782, + "grad_norm": 1.0339347124099731, + "learning_rate": 2e-05, + "loss": 0.03561187, + "step": 12391 + }, + { + "epoch": 24.784, + "grad_norm": 2.129795789718628, + "learning_rate": 2e-05, + "loss": 0.04448505, + "step": 12392 + }, + { + "epoch": 24.786, + "grad_norm": 1.024999976158142, + "learning_rate": 2e-05, + "loss": 0.05013253, + "step": 12393 + }, + { + "epoch": 24.788, + "grad_norm": 1.413780927658081, + "learning_rate": 2e-05, + "loss": 0.03977506, + "step": 12394 + }, + { + "epoch": 24.79, + "grad_norm": 1.4434248208999634, + "learning_rate": 2e-05, + "loss": 0.05382534, + "step": 12395 + }, + { + "epoch": 24.792, + "grad_norm": 1.4509685039520264, + "learning_rate": 2e-05, + "loss": 0.04992918, + "step": 12396 + }, + { + "epoch": 24.794, + "grad_norm": 1.47963547706604, + "learning_rate": 2e-05, + "loss": 0.0391894, + "step": 12397 + }, + { + "epoch": 24.796, + "grad_norm": 1.1163206100463867, + "learning_rate": 2e-05, + "loss": 0.03056799, + "step": 12398 + }, + { + "epoch": 24.798000000000002, + "grad_norm": 1.147055745124817, + "learning_rate": 2e-05, + "loss": 0.03964867, + "step": 12399 + }, + { + "epoch": 24.8, + "grad_norm": 0.9283363819122314, + "learning_rate": 2e-05, + "loss": 0.02905277, + "step": 12400 + }, + { + "epoch": 24.802, + "grad_norm": 0.967431902885437, + "learning_rate": 2e-05, + "loss": 0.03618809, + "step": 12401 + }, + { + "epoch": 24.804, + "grad_norm": 1.2811535596847534, + "learning_rate": 2e-05, + "loss": 0.04123492, + "step": 12402 + }, + { + "epoch": 24.806, + "grad_norm": 0.7825071215629578, + "learning_rate": 2e-05, + "loss": 0.02419277, + "step": 12403 + }, + { + "epoch": 24.808, + "grad_norm": 3.262420654296875, + "learning_rate": 2e-05, + "loss": 0.04532979, + "step": 12404 + }, + { + "epoch": 24.81, + "grad_norm": 3.3228957653045654, + "learning_rate": 2e-05, + "loss": 0.04625534, + "step": 12405 + }, + { + "epoch": 24.812, + "grad_norm": 2.1002676486968994, + "learning_rate": 2e-05, + "loss": 0.04704674, + "step": 12406 + }, + { + "epoch": 24.814, + "grad_norm": 2.0813817977905273, + "learning_rate": 2e-05, + "loss": 0.06468686, + "step": 12407 + }, + { + "epoch": 24.816, + "grad_norm": 1.4478918313980103, + "learning_rate": 2e-05, + "loss": 0.04282294, + "step": 12408 + }, + { + "epoch": 24.818, + "grad_norm": 1.2476028203964233, + "learning_rate": 2e-05, + "loss": 0.04784694, + "step": 12409 + }, + { + "epoch": 24.82, + "grad_norm": 1.4642516374588013, + "learning_rate": 2e-05, + "loss": 0.05254901, + "step": 12410 + }, + { + "epoch": 24.822, + "grad_norm": 0.8719230890274048, + "learning_rate": 2e-05, + "loss": 0.0320285, + "step": 12411 + }, + { + "epoch": 24.824, + "grad_norm": 3.0026919841766357, + "learning_rate": 2e-05, + "loss": 0.05441289, + "step": 12412 + }, + { + "epoch": 24.826, + "grad_norm": 3.024451732635498, + "learning_rate": 2e-05, + "loss": 0.04576012, + "step": 12413 + }, + { + "epoch": 24.828, + "grad_norm": 1.2607303857803345, + "learning_rate": 2e-05, + "loss": 0.05559196, + "step": 12414 + }, + { + "epoch": 24.83, + "grad_norm": 
1.9243049621582031, + "learning_rate": 2e-05, + "loss": 0.04724313, + "step": 12415 + }, + { + "epoch": 24.832, + "grad_norm": 1.1702837944030762, + "learning_rate": 2e-05, + "loss": 0.04239753, + "step": 12416 + }, + { + "epoch": 24.834, + "grad_norm": 1.2738158702850342, + "learning_rate": 2e-05, + "loss": 0.04667337, + "step": 12417 + }, + { + "epoch": 24.836, + "grad_norm": 1.4549773931503296, + "learning_rate": 2e-05, + "loss": 0.03520815, + "step": 12418 + }, + { + "epoch": 24.838, + "grad_norm": 1.89911687374115, + "learning_rate": 2e-05, + "loss": 0.05533872, + "step": 12419 + }, + { + "epoch": 24.84, + "grad_norm": 1.3373122215270996, + "learning_rate": 2e-05, + "loss": 0.04861745, + "step": 12420 + }, + { + "epoch": 24.842, + "grad_norm": 1.1704626083374023, + "learning_rate": 2e-05, + "loss": 0.04655242, + "step": 12421 + }, + { + "epoch": 24.844, + "grad_norm": 1.7334611415863037, + "learning_rate": 2e-05, + "loss": 0.04279828, + "step": 12422 + }, + { + "epoch": 24.846, + "grad_norm": 1.5228956937789917, + "learning_rate": 2e-05, + "loss": 0.06574546, + "step": 12423 + }, + { + "epoch": 24.848, + "grad_norm": 1.5824440717697144, + "learning_rate": 2e-05, + "loss": 0.03519925, + "step": 12424 + }, + { + "epoch": 24.85, + "grad_norm": 1.4289437532424927, + "learning_rate": 2e-05, + "loss": 0.03780733, + "step": 12425 + }, + { + "epoch": 24.852, + "grad_norm": 1.498123288154602, + "learning_rate": 2e-05, + "loss": 0.03797191, + "step": 12426 + }, + { + "epoch": 24.854, + "grad_norm": 1.006430745124817, + "learning_rate": 2e-05, + "loss": 0.03351758, + "step": 12427 + }, + { + "epoch": 24.856, + "grad_norm": 1.0861419439315796, + "learning_rate": 2e-05, + "loss": 0.0464669, + "step": 12428 + }, + { + "epoch": 24.858, + "grad_norm": 1.2548224925994873, + "learning_rate": 2e-05, + "loss": 0.02788034, + "step": 12429 + }, + { + "epoch": 24.86, + "grad_norm": 1.3924577236175537, + "learning_rate": 2e-05, + "loss": 0.02896036, + "step": 12430 + }, + { + "epoch": 24.862, + "grad_norm": 2.9513254165649414, + "learning_rate": 2e-05, + "loss": 0.05202112, + "step": 12431 + }, + { + "epoch": 24.864, + "grad_norm": 1.3894107341766357, + "learning_rate": 2e-05, + "loss": 0.05267946, + "step": 12432 + }, + { + "epoch": 24.866, + "grad_norm": 1.1392052173614502, + "learning_rate": 2e-05, + "loss": 0.04014817, + "step": 12433 + }, + { + "epoch": 24.868, + "grad_norm": 1.3228307962417603, + "learning_rate": 2e-05, + "loss": 0.03538731, + "step": 12434 + }, + { + "epoch": 24.87, + "grad_norm": 1.11819589138031, + "learning_rate": 2e-05, + "loss": 0.03386147, + "step": 12435 + }, + { + "epoch": 24.872, + "grad_norm": 1.2249038219451904, + "learning_rate": 2e-05, + "loss": 0.0489039, + "step": 12436 + }, + { + "epoch": 24.874, + "grad_norm": 1.4754010438919067, + "learning_rate": 2e-05, + "loss": 0.04698762, + "step": 12437 + }, + { + "epoch": 24.876, + "grad_norm": 3.302642583847046, + "learning_rate": 2e-05, + "loss": 0.05646276, + "step": 12438 + }, + { + "epoch": 24.878, + "grad_norm": 1.419040560722351, + "learning_rate": 2e-05, + "loss": 0.04880114, + "step": 12439 + }, + { + "epoch": 24.88, + "grad_norm": 1.1326826810836792, + "learning_rate": 2e-05, + "loss": 0.04119021, + "step": 12440 + }, + { + "epoch": 24.882, + "grad_norm": 1.429632306098938, + "learning_rate": 2e-05, + "loss": 0.0510744, + "step": 12441 + }, + { + "epoch": 24.884, + "grad_norm": 2.021416425704956, + "learning_rate": 2e-05, + "loss": 0.05398323, + "step": 12442 + }, + { + "epoch": 24.886, + "grad_norm": 
1.2925662994384766, + "learning_rate": 2e-05, + "loss": 0.0487259, + "step": 12443 + }, + { + "epoch": 24.888, + "grad_norm": 1.3188891410827637, + "learning_rate": 2e-05, + "loss": 0.04359906, + "step": 12444 + }, + { + "epoch": 24.89, + "grad_norm": 1.8213289976119995, + "learning_rate": 2e-05, + "loss": 0.05419963, + "step": 12445 + }, + { + "epoch": 24.892, + "grad_norm": 1.8820993900299072, + "learning_rate": 2e-05, + "loss": 0.04406717, + "step": 12446 + }, + { + "epoch": 24.894, + "grad_norm": 1.1095833778381348, + "learning_rate": 2e-05, + "loss": 0.04703937, + "step": 12447 + }, + { + "epoch": 24.896, + "grad_norm": 1.145403265953064, + "learning_rate": 2e-05, + "loss": 0.04098237, + "step": 12448 + }, + { + "epoch": 24.898, + "grad_norm": 4.939516067504883, + "learning_rate": 2e-05, + "loss": 0.04888795, + "step": 12449 + }, + { + "epoch": 24.9, + "grad_norm": 1.2902284860610962, + "learning_rate": 2e-05, + "loss": 0.04742273, + "step": 12450 + }, + { + "epoch": 24.902, + "grad_norm": 1.15613853931427, + "learning_rate": 2e-05, + "loss": 0.02914495, + "step": 12451 + }, + { + "epoch": 24.904, + "grad_norm": 2.327373504638672, + "learning_rate": 2e-05, + "loss": 0.05553031, + "step": 12452 + }, + { + "epoch": 24.906, + "grad_norm": 1.4978739023208618, + "learning_rate": 2e-05, + "loss": 0.03483084, + "step": 12453 + }, + { + "epoch": 24.908, + "grad_norm": 1.7437061071395874, + "learning_rate": 2e-05, + "loss": 0.05667341, + "step": 12454 + }, + { + "epoch": 24.91, + "grad_norm": 2.045104742050171, + "learning_rate": 2e-05, + "loss": 0.0504937, + "step": 12455 + }, + { + "epoch": 24.912, + "grad_norm": 2.127146005630493, + "learning_rate": 2e-05, + "loss": 0.04464479, + "step": 12456 + }, + { + "epoch": 24.914, + "grad_norm": 1.9251153469085693, + "learning_rate": 2e-05, + "loss": 0.04323905, + "step": 12457 + }, + { + "epoch": 24.916, + "grad_norm": 1.2294851541519165, + "learning_rate": 2e-05, + "loss": 0.03710897, + "step": 12458 + }, + { + "epoch": 24.918, + "grad_norm": 1.0860822200775146, + "learning_rate": 2e-05, + "loss": 0.04871416, + "step": 12459 + }, + { + "epoch": 24.92, + "grad_norm": 1.0837758779525757, + "learning_rate": 2e-05, + "loss": 0.03727125, + "step": 12460 + }, + { + "epoch": 24.922, + "grad_norm": 1.4478741884231567, + "learning_rate": 2e-05, + "loss": 0.04571364, + "step": 12461 + }, + { + "epoch": 24.924, + "grad_norm": 1.5994067192077637, + "learning_rate": 2e-05, + "loss": 0.04567996, + "step": 12462 + }, + { + "epoch": 24.926, + "grad_norm": 2.3954379558563232, + "learning_rate": 2e-05, + "loss": 0.05547336, + "step": 12463 + }, + { + "epoch": 24.928, + "grad_norm": 2.130188465118408, + "learning_rate": 2e-05, + "loss": 0.0510396, + "step": 12464 + }, + { + "epoch": 24.93, + "grad_norm": 19.31425666809082, + "learning_rate": 2e-05, + "loss": 0.09202991, + "step": 12465 + }, + { + "epoch": 24.932, + "grad_norm": 1.2676620483398438, + "learning_rate": 2e-05, + "loss": 0.03964383, + "step": 12466 + }, + { + "epoch": 24.934, + "grad_norm": 1.5841865539550781, + "learning_rate": 2e-05, + "loss": 0.03884972, + "step": 12467 + }, + { + "epoch": 24.936, + "grad_norm": 1.8690932989120483, + "learning_rate": 2e-05, + "loss": 0.05343563, + "step": 12468 + }, + { + "epoch": 24.938, + "grad_norm": 1.262275218963623, + "learning_rate": 2e-05, + "loss": 0.03402144, + "step": 12469 + }, + { + "epoch": 24.94, + "grad_norm": 1.1100060939788818, + "learning_rate": 2e-05, + "loss": 0.04514051, + "step": 12470 + }, + { + "epoch": 24.942, + "grad_norm": 
1.4891170263290405, + "learning_rate": 2e-05, + "loss": 0.04965903, + "step": 12471 + }, + { + "epoch": 24.944, + "grad_norm": 1.1756887435913086, + "learning_rate": 2e-05, + "loss": 0.03555219, + "step": 12472 + }, + { + "epoch": 24.946, + "grad_norm": 1.38705575466156, + "learning_rate": 2e-05, + "loss": 0.05496265, + "step": 12473 + }, + { + "epoch": 24.948, + "grad_norm": 1.8380342721939087, + "learning_rate": 2e-05, + "loss": 0.05392865, + "step": 12474 + }, + { + "epoch": 24.95, + "grad_norm": 0.9538768529891968, + "learning_rate": 2e-05, + "loss": 0.03763831, + "step": 12475 + }, + { + "epoch": 24.951999999999998, + "grad_norm": 1.4392766952514648, + "learning_rate": 2e-05, + "loss": 0.03219794, + "step": 12476 + }, + { + "epoch": 24.954, + "grad_norm": 1.1344821453094482, + "learning_rate": 2e-05, + "loss": 0.04084168, + "step": 12477 + }, + { + "epoch": 24.956, + "grad_norm": 1.4016164541244507, + "learning_rate": 2e-05, + "loss": 0.04202775, + "step": 12478 + }, + { + "epoch": 24.958, + "grad_norm": 1.3038735389709473, + "learning_rate": 2e-05, + "loss": 0.05586255, + "step": 12479 + }, + { + "epoch": 24.96, + "grad_norm": 1.2745096683502197, + "learning_rate": 2e-05, + "loss": 0.04563636, + "step": 12480 + }, + { + "epoch": 24.962, + "grad_norm": 1.8629027605056763, + "learning_rate": 2e-05, + "loss": 0.04409952, + "step": 12481 + }, + { + "epoch": 24.964, + "grad_norm": 1.1554667949676514, + "learning_rate": 2e-05, + "loss": 0.04890275, + "step": 12482 + }, + { + "epoch": 24.966, + "grad_norm": 1.329375147819519, + "learning_rate": 2e-05, + "loss": 0.04882819, + "step": 12483 + }, + { + "epoch": 24.968, + "grad_norm": 1.0821406841278076, + "learning_rate": 2e-05, + "loss": 0.03480864, + "step": 12484 + }, + { + "epoch": 24.97, + "grad_norm": 1.5325900316238403, + "learning_rate": 2e-05, + "loss": 0.04224129, + "step": 12485 + }, + { + "epoch": 24.972, + "grad_norm": 1.1319386959075928, + "learning_rate": 2e-05, + "loss": 0.03097108, + "step": 12486 + }, + { + "epoch": 24.974, + "grad_norm": 1.6647253036499023, + "learning_rate": 2e-05, + "loss": 0.05010362, + "step": 12487 + }, + { + "epoch": 24.976, + "grad_norm": 1.5213617086410522, + "learning_rate": 2e-05, + "loss": 0.05155212, + "step": 12488 + }, + { + "epoch": 24.978, + "grad_norm": 1.181519865989685, + "learning_rate": 2e-05, + "loss": 0.05764193, + "step": 12489 + }, + { + "epoch": 24.98, + "grad_norm": 1.0516437292099, + "learning_rate": 2e-05, + "loss": 0.04471372, + "step": 12490 + }, + { + "epoch": 24.982, + "grad_norm": 1.2531055212020874, + "learning_rate": 2e-05, + "loss": 0.04816005, + "step": 12491 + }, + { + "epoch": 24.984, + "grad_norm": 2.094572067260742, + "learning_rate": 2e-05, + "loss": 0.0658077, + "step": 12492 + }, + { + "epoch": 24.986, + "grad_norm": 1.2252720594406128, + "learning_rate": 2e-05, + "loss": 0.04415631, + "step": 12493 + }, + { + "epoch": 24.988, + "grad_norm": 1.2434792518615723, + "learning_rate": 2e-05, + "loss": 0.04903311, + "step": 12494 + }, + { + "epoch": 24.99, + "grad_norm": 0.994875967502594, + "learning_rate": 2e-05, + "loss": 0.03526482, + "step": 12495 + }, + { + "epoch": 24.992, + "grad_norm": 1.4973106384277344, + "learning_rate": 2e-05, + "loss": 0.03749626, + "step": 12496 + }, + { + "epoch": 24.994, + "grad_norm": 0.9715427160263062, + "learning_rate": 2e-05, + "loss": 0.0366944, + "step": 12497 + }, + { + "epoch": 24.996, + "grad_norm": 1.1738646030426025, + "learning_rate": 2e-05, + "loss": 0.03677404, + "step": 12498 + }, + { + "epoch": 24.998, + "grad_norm": 
3.087301731109619, + "learning_rate": 2e-05, + "loss": 0.06307375, + "step": 12499 + }, + { + "epoch": 25.0, + "grad_norm": 1.783982276916504, + "learning_rate": 2e-05, + "loss": 0.04853023, + "step": 12500 + }, + { + "epoch": 25.0, + "eval_performance": { + "AngleClassification_1": 1.0, + "AngleClassification_2": 1.0, + "AngleClassification_3": 0.9720558882235529, + "Equal_1": 0.996, + "Equal_2": 0.9840319361277445, + "Equal_3": 0.9840319361277445, + "LineComparison_1": 1.0, + "LineComparison_2": 1.0, + "LineComparison_3": 0.998003992015968, + "Parallel_1": 0.9899799599198397, + "Parallel_2": 0.9959919839679359, + "Parallel_3": 0.988, + "Perpendicular_1": 0.992, + "Perpendicular_2": 0.984, + "Perpendicular_3": 0.8697394789579158, + "PointLiesOnCircle_1": 0.9959919839679359, + "PointLiesOnCircle_2": 0.9916666666666667, + "PointLiesOnCircle_3": 0.9912000000000001, + "PointLiesOnLine_1": 0.9959919839679359, + "PointLiesOnLine_2": 0.9959919839679359, + "PointLiesOnLine_3": 0.9800399201596807 + }, + "eval_runtime": 320.1532, + "eval_samples_per_second": 32.797, + "eval_steps_per_second": 0.656, + "step": 12500 + }, + { + "epoch": 25.002, + "grad_norm": 2.553316354751587, + "learning_rate": 2e-05, + "loss": 0.04658042, + "step": 12501 + }, + { + "epoch": 25.004, + "grad_norm": 1.2298393249511719, + "learning_rate": 2e-05, + "loss": 0.04537143, + "step": 12502 + }, + { + "epoch": 25.006, + "grad_norm": 1.3164474964141846, + "learning_rate": 2e-05, + "loss": 0.06407714, + "step": 12503 + }, + { + "epoch": 25.008, + "grad_norm": 1.3401498794555664, + "learning_rate": 2e-05, + "loss": 0.04077845, + "step": 12504 + }, + { + "epoch": 25.01, + "grad_norm": 1.0891218185424805, + "learning_rate": 2e-05, + "loss": 0.02724287, + "step": 12505 + }, + { + "epoch": 25.012, + "grad_norm": 1.2074103355407715, + "learning_rate": 2e-05, + "loss": 0.05306327, + "step": 12506 + }, + { + "epoch": 25.014, + "grad_norm": 1.1338789463043213, + "learning_rate": 2e-05, + "loss": 0.04252898, + "step": 12507 + }, + { + "epoch": 25.016, + "grad_norm": 1.2618156671524048, + "learning_rate": 2e-05, + "loss": 0.0498926, + "step": 12508 + }, + { + "epoch": 25.018, + "grad_norm": 1.14939546585083, + "learning_rate": 2e-05, + "loss": 0.04680283, + "step": 12509 + }, + { + "epoch": 25.02, + "grad_norm": 1.1815592050552368, + "learning_rate": 2e-05, + "loss": 0.04917777, + "step": 12510 + }, + { + "epoch": 25.022, + "grad_norm": 1.619907259941101, + "learning_rate": 2e-05, + "loss": 0.05033459, + "step": 12511 + }, + { + "epoch": 25.024, + "grad_norm": 1.0121499300003052, + "learning_rate": 2e-05, + "loss": 0.03902668, + "step": 12512 + }, + { + "epoch": 25.026, + "grad_norm": 1.4635061025619507, + "learning_rate": 2e-05, + "loss": 0.03216758, + "step": 12513 + }, + { + "epoch": 25.028, + "grad_norm": 1.818674087524414, + "learning_rate": 2e-05, + "loss": 0.05529295, + "step": 12514 + }, + { + "epoch": 25.03, + "grad_norm": 1.5501385927200317, + "learning_rate": 2e-05, + "loss": 0.05194931, + "step": 12515 + }, + { + "epoch": 25.032, + "grad_norm": 1.735026240348816, + "learning_rate": 2e-05, + "loss": 0.0563397, + "step": 12516 + }, + { + "epoch": 25.034, + "grad_norm": 1.9267089366912842, + "learning_rate": 2e-05, + "loss": 0.05115256, + "step": 12517 + }, + { + "epoch": 25.036, + "grad_norm": 1.591173768043518, + "learning_rate": 2e-05, + "loss": 0.04865324, + "step": 12518 + }, + { + "epoch": 25.038, + "grad_norm": 1.233425259590149, + "learning_rate": 2e-05, + "loss": 0.04186363, + "step": 12519 + }, + { + "epoch": 25.04, + 
"grad_norm": 1.3910762071609497, + "learning_rate": 2e-05, + "loss": 0.05607095, + "step": 12520 + }, + { + "epoch": 25.042, + "grad_norm": 1.911817193031311, + "learning_rate": 2e-05, + "loss": 0.04746141, + "step": 12521 + }, + { + "epoch": 25.044, + "grad_norm": 1.5743073225021362, + "learning_rate": 2e-05, + "loss": 0.0310831, + "step": 12522 + }, + { + "epoch": 25.046, + "grad_norm": 2.0004522800445557, + "learning_rate": 2e-05, + "loss": 0.05432073, + "step": 12523 + }, + { + "epoch": 25.048, + "grad_norm": 1.304870367050171, + "learning_rate": 2e-05, + "loss": 0.05191491, + "step": 12524 + }, + { + "epoch": 25.05, + "grad_norm": 1.6975371837615967, + "learning_rate": 2e-05, + "loss": 0.04251885, + "step": 12525 + }, + { + "epoch": 25.052, + "grad_norm": 1.4703913927078247, + "learning_rate": 2e-05, + "loss": 0.04141282, + "step": 12526 + }, + { + "epoch": 25.054, + "grad_norm": 1.6704292297363281, + "learning_rate": 2e-05, + "loss": 0.05951826, + "step": 12527 + }, + { + "epoch": 25.056, + "grad_norm": 1.4327316284179688, + "learning_rate": 2e-05, + "loss": 0.05041571, + "step": 12528 + }, + { + "epoch": 25.058, + "grad_norm": 2.054896116256714, + "learning_rate": 2e-05, + "loss": 0.05156599, + "step": 12529 + }, + { + "epoch": 25.06, + "grad_norm": 1.0578651428222656, + "learning_rate": 2e-05, + "loss": 0.03640224, + "step": 12530 + }, + { + "epoch": 25.062, + "grad_norm": 1.2330236434936523, + "learning_rate": 2e-05, + "loss": 0.03764113, + "step": 12531 + }, + { + "epoch": 25.064, + "grad_norm": 1.5884158611297607, + "learning_rate": 2e-05, + "loss": 0.04582855, + "step": 12532 + }, + { + "epoch": 25.066, + "grad_norm": 1.3398418426513672, + "learning_rate": 2e-05, + "loss": 0.04462974, + "step": 12533 + }, + { + "epoch": 25.068, + "grad_norm": 1.4082545042037964, + "learning_rate": 2e-05, + "loss": 0.03973638, + "step": 12534 + }, + { + "epoch": 25.07, + "grad_norm": 3.2086148262023926, + "learning_rate": 2e-05, + "loss": 0.0822015, + "step": 12535 + }, + { + "epoch": 25.072, + "grad_norm": 1.2738802433013916, + "learning_rate": 2e-05, + "loss": 0.0383213, + "step": 12536 + }, + { + "epoch": 25.074, + "grad_norm": 1.359592318534851, + "learning_rate": 2e-05, + "loss": 0.04718591, + "step": 12537 + }, + { + "epoch": 25.076, + "grad_norm": 2.490873336791992, + "learning_rate": 2e-05, + "loss": 0.04099283, + "step": 12538 + }, + { + "epoch": 25.078, + "grad_norm": 1.3400729894638062, + "learning_rate": 2e-05, + "loss": 0.05246966, + "step": 12539 + }, + { + "epoch": 25.08, + "grad_norm": 1.2105579376220703, + "learning_rate": 2e-05, + "loss": 0.04603817, + "step": 12540 + }, + { + "epoch": 25.082, + "grad_norm": 1.2426912784576416, + "learning_rate": 2e-05, + "loss": 0.04203958, + "step": 12541 + }, + { + "epoch": 25.084, + "grad_norm": 1.232663869857788, + "learning_rate": 2e-05, + "loss": 0.03772744, + "step": 12542 + }, + { + "epoch": 25.086, + "grad_norm": 1.6726223230361938, + "learning_rate": 2e-05, + "loss": 0.04957581, + "step": 12543 + }, + { + "epoch": 25.088, + "grad_norm": 1.9126125574111938, + "learning_rate": 2e-05, + "loss": 0.05544265, + "step": 12544 + }, + { + "epoch": 25.09, + "grad_norm": 1.0108952522277832, + "learning_rate": 2e-05, + "loss": 0.02635121, + "step": 12545 + }, + { + "epoch": 25.092, + "grad_norm": 1.3411767482757568, + "learning_rate": 2e-05, + "loss": 0.03911927, + "step": 12546 + }, + { + "epoch": 25.094, + "grad_norm": 2.594507932662964, + "learning_rate": 2e-05, + "loss": 0.05281266, + "step": 12547 + }, + { + "epoch": 25.096, + "grad_norm": 
1.0433748960494995, + "learning_rate": 2e-05, + "loss": 0.04605139, + "step": 12548 + }, + { + "epoch": 25.098, + "grad_norm": 1.9768903255462646, + "learning_rate": 2e-05, + "loss": 0.0549066, + "step": 12549 + }, + { + "epoch": 25.1, + "grad_norm": 2.1046414375305176, + "learning_rate": 2e-05, + "loss": 0.05329197, + "step": 12550 + }, + { + "epoch": 25.102, + "grad_norm": 1.1665661334991455, + "learning_rate": 2e-05, + "loss": 0.04764657, + "step": 12551 + }, + { + "epoch": 25.104, + "grad_norm": 2.242833137512207, + "learning_rate": 2e-05, + "loss": 0.04974656, + "step": 12552 + }, + { + "epoch": 25.106, + "grad_norm": 1.416796326637268, + "learning_rate": 2e-05, + "loss": 0.04945759, + "step": 12553 + }, + { + "epoch": 25.108, + "grad_norm": 1.1873366832733154, + "learning_rate": 2e-05, + "loss": 0.05103811, + "step": 12554 + }, + { + "epoch": 25.11, + "grad_norm": 1.926827311515808, + "learning_rate": 2e-05, + "loss": 0.05267503, + "step": 12555 + }, + { + "epoch": 25.112, + "grad_norm": 0.9494261741638184, + "learning_rate": 2e-05, + "loss": 0.03307255, + "step": 12556 + }, + { + "epoch": 25.114, + "grad_norm": 1.8937228918075562, + "learning_rate": 2e-05, + "loss": 0.06592927, + "step": 12557 + }, + { + "epoch": 25.116, + "grad_norm": 2.0769922733306885, + "learning_rate": 2e-05, + "loss": 0.05289575, + "step": 12558 + }, + { + "epoch": 25.118, + "grad_norm": 1.1254668235778809, + "learning_rate": 2e-05, + "loss": 0.03823429, + "step": 12559 + }, + { + "epoch": 25.12, + "grad_norm": 2.4549336433410645, + "learning_rate": 2e-05, + "loss": 0.04544316, + "step": 12560 + }, + { + "epoch": 25.122, + "grad_norm": 1.7756479978561401, + "learning_rate": 2e-05, + "loss": 0.0518619, + "step": 12561 + }, + { + "epoch": 25.124, + "grad_norm": 1.4617094993591309, + "learning_rate": 2e-05, + "loss": 0.05234993, + "step": 12562 + }, + { + "epoch": 25.126, + "grad_norm": 1.5612094402313232, + "learning_rate": 2e-05, + "loss": 0.04799661, + "step": 12563 + }, + { + "epoch": 25.128, + "grad_norm": 1.2895623445510864, + "learning_rate": 2e-05, + "loss": 0.04298138, + "step": 12564 + }, + { + "epoch": 25.13, + "grad_norm": 1.085653305053711, + "learning_rate": 2e-05, + "loss": 0.03644668, + "step": 12565 + }, + { + "epoch": 25.132, + "grad_norm": 2.4465861320495605, + "learning_rate": 2e-05, + "loss": 0.04523169, + "step": 12566 + }, + { + "epoch": 25.134, + "grad_norm": 1.476048231124878, + "learning_rate": 2e-05, + "loss": 0.02938263, + "step": 12567 + }, + { + "epoch": 25.136, + "grad_norm": 0.9077489376068115, + "learning_rate": 2e-05, + "loss": 0.02951773, + "step": 12568 + }, + { + "epoch": 25.138, + "grad_norm": 1.348293662071228, + "learning_rate": 2e-05, + "loss": 0.04014814, + "step": 12569 + }, + { + "epoch": 25.14, + "grad_norm": 1.2255805730819702, + "learning_rate": 2e-05, + "loss": 0.04978126, + "step": 12570 + }, + { + "epoch": 25.142, + "grad_norm": 1.4020949602127075, + "learning_rate": 2e-05, + "loss": 0.05750665, + "step": 12571 + }, + { + "epoch": 25.144, + "grad_norm": 1.526517629623413, + "learning_rate": 2e-05, + "loss": 0.04812055, + "step": 12572 + }, + { + "epoch": 25.146, + "grad_norm": 1.3054735660552979, + "learning_rate": 2e-05, + "loss": 0.04948972, + "step": 12573 + }, + { + "epoch": 25.148, + "grad_norm": 0.968127429485321, + "learning_rate": 2e-05, + "loss": 0.03364644, + "step": 12574 + }, + { + "epoch": 25.15, + "grad_norm": 2.5585010051727295, + "learning_rate": 2e-05, + "loss": 0.06127685, + "step": 12575 + }, + { + "epoch": 25.152, + "grad_norm": 
1.267968773841858, + "learning_rate": 2e-05, + "loss": 0.05282283, + "step": 12576 + }, + { + "epoch": 25.154, + "grad_norm": 1.3801593780517578, + "learning_rate": 2e-05, + "loss": 0.0542292, + "step": 12577 + }, + { + "epoch": 25.156, + "grad_norm": 1.1124210357666016, + "learning_rate": 2e-05, + "loss": 0.03990389, + "step": 12578 + }, + { + "epoch": 25.158, + "grad_norm": 1.1204739809036255, + "learning_rate": 2e-05, + "loss": 0.04882549, + "step": 12579 + }, + { + "epoch": 25.16, + "grad_norm": 1.5411254167556763, + "learning_rate": 2e-05, + "loss": 0.03279787, + "step": 12580 + }, + { + "epoch": 25.162, + "grad_norm": 1.0646896362304688, + "learning_rate": 2e-05, + "loss": 0.04206897, + "step": 12581 + }, + { + "epoch": 25.164, + "grad_norm": 1.4947314262390137, + "learning_rate": 2e-05, + "loss": 0.04689228, + "step": 12582 + }, + { + "epoch": 25.166, + "grad_norm": 1.3432666063308716, + "learning_rate": 2e-05, + "loss": 0.03807723, + "step": 12583 + }, + { + "epoch": 25.168, + "grad_norm": 1.3646643161773682, + "learning_rate": 2e-05, + "loss": 0.05141376, + "step": 12584 + }, + { + "epoch": 25.17, + "grad_norm": 2.0790557861328125, + "learning_rate": 2e-05, + "loss": 0.04800526, + "step": 12585 + }, + { + "epoch": 25.172, + "grad_norm": 1.5363324880599976, + "learning_rate": 2e-05, + "loss": 0.03496822, + "step": 12586 + }, + { + "epoch": 25.174, + "grad_norm": 1.1385252475738525, + "learning_rate": 2e-05, + "loss": 0.04525664, + "step": 12587 + }, + { + "epoch": 25.176, + "grad_norm": 1.0282803773880005, + "learning_rate": 2e-05, + "loss": 0.03413811, + "step": 12588 + }, + { + "epoch": 25.178, + "grad_norm": 1.563729166984558, + "learning_rate": 2e-05, + "loss": 0.04510281, + "step": 12589 + }, + { + "epoch": 25.18, + "grad_norm": 2.2484776973724365, + "learning_rate": 2e-05, + "loss": 0.05236363, + "step": 12590 + }, + { + "epoch": 25.182, + "grad_norm": 1.737161636352539, + "learning_rate": 2e-05, + "loss": 0.04524446, + "step": 12591 + }, + { + "epoch": 25.184, + "grad_norm": 1.867302656173706, + "learning_rate": 2e-05, + "loss": 0.06257963, + "step": 12592 + }, + { + "epoch": 25.186, + "grad_norm": 1.426841139793396, + "learning_rate": 2e-05, + "loss": 0.04176682, + "step": 12593 + }, + { + "epoch": 25.188, + "grad_norm": 1.0782243013381958, + "learning_rate": 2e-05, + "loss": 0.03285508, + "step": 12594 + }, + { + "epoch": 25.19, + "grad_norm": 3.013762950897217, + "learning_rate": 2e-05, + "loss": 0.05704542, + "step": 12595 + }, + { + "epoch": 25.192, + "grad_norm": 1.8257044553756714, + "learning_rate": 2e-05, + "loss": 0.04196394, + "step": 12596 + }, + { + "epoch": 25.194, + "grad_norm": 0.9650020599365234, + "learning_rate": 2e-05, + "loss": 0.03296274, + "step": 12597 + }, + { + "epoch": 25.196, + "grad_norm": 2.078765630722046, + "learning_rate": 2e-05, + "loss": 0.04554818, + "step": 12598 + }, + { + "epoch": 25.198, + "grad_norm": 1.3087246417999268, + "learning_rate": 2e-05, + "loss": 0.05761782, + "step": 12599 + }, + { + "epoch": 25.2, + "grad_norm": 1.3766916990280151, + "learning_rate": 2e-05, + "loss": 0.052761, + "step": 12600 + }, + { + "epoch": 25.202, + "grad_norm": 1.2516851425170898, + "learning_rate": 2e-05, + "loss": 0.04158796, + "step": 12601 + }, + { + "epoch": 25.204, + "grad_norm": 1.0545592308044434, + "learning_rate": 2e-05, + "loss": 0.03375325, + "step": 12602 + }, + { + "epoch": 25.206, + "grad_norm": 1.732971429824829, + "learning_rate": 2e-05, + "loss": 0.05470074, + "step": 12603 + }, + { + "epoch": 25.208, + "grad_norm": 
1.1150429248809814, + "learning_rate": 2e-05, + "loss": 0.03985225, + "step": 12604 + }, + { + "epoch": 25.21, + "grad_norm": 1.1678208112716675, + "learning_rate": 2e-05, + "loss": 0.04104099, + "step": 12605 + }, + { + "epoch": 25.212, + "grad_norm": 1.085423469543457, + "learning_rate": 2e-05, + "loss": 0.04031212, + "step": 12606 + }, + { + "epoch": 25.214, + "grad_norm": 1.413877248764038, + "learning_rate": 2e-05, + "loss": 0.04183791, + "step": 12607 + }, + { + "epoch": 25.216, + "grad_norm": 1.270094871520996, + "learning_rate": 2e-05, + "loss": 0.03035654, + "step": 12608 + }, + { + "epoch": 25.218, + "grad_norm": 1.045606017112732, + "learning_rate": 2e-05, + "loss": 0.03988522, + "step": 12609 + }, + { + "epoch": 25.22, + "grad_norm": 1.19292151927948, + "learning_rate": 2e-05, + "loss": 0.03305714, + "step": 12610 + }, + { + "epoch": 25.222, + "grad_norm": 1.377384901046753, + "learning_rate": 2e-05, + "loss": 0.0412189, + "step": 12611 + }, + { + "epoch": 25.224, + "grad_norm": 1.903836727142334, + "learning_rate": 2e-05, + "loss": 0.05559286, + "step": 12612 + }, + { + "epoch": 25.226, + "grad_norm": 1.2150182723999023, + "learning_rate": 2e-05, + "loss": 0.04249961, + "step": 12613 + }, + { + "epoch": 25.228, + "grad_norm": 1.5361416339874268, + "learning_rate": 2e-05, + "loss": 0.05546279, + "step": 12614 + }, + { + "epoch": 25.23, + "grad_norm": 1.1674758195877075, + "learning_rate": 2e-05, + "loss": 0.03651671, + "step": 12615 + }, + { + "epoch": 25.232, + "grad_norm": 1.5208848714828491, + "learning_rate": 2e-05, + "loss": 0.06328288, + "step": 12616 + }, + { + "epoch": 25.234, + "grad_norm": 1.5309934616088867, + "learning_rate": 2e-05, + "loss": 0.06355253, + "step": 12617 + }, + { + "epoch": 25.236, + "grad_norm": 1.5100221633911133, + "learning_rate": 2e-05, + "loss": 0.04278588, + "step": 12618 + }, + { + "epoch": 25.238, + "grad_norm": 1.191089153289795, + "learning_rate": 2e-05, + "loss": 0.04086279, + "step": 12619 + }, + { + "epoch": 25.24, + "grad_norm": 1.7226134538650513, + "learning_rate": 2e-05, + "loss": 0.04553945, + "step": 12620 + }, + { + "epoch": 25.242, + "grad_norm": 1.6444710493087769, + "learning_rate": 2e-05, + "loss": 0.04359776, + "step": 12621 + }, + { + "epoch": 25.244, + "grad_norm": 1.3565912246704102, + "learning_rate": 2e-05, + "loss": 0.04153623, + "step": 12622 + }, + { + "epoch": 25.246, + "grad_norm": 1.295622706413269, + "learning_rate": 2e-05, + "loss": 0.03719122, + "step": 12623 + }, + { + "epoch": 25.248, + "grad_norm": 1.560507893562317, + "learning_rate": 2e-05, + "loss": 0.04558239, + "step": 12624 + }, + { + "epoch": 25.25, + "grad_norm": 1.1936217546463013, + "learning_rate": 2e-05, + "loss": 0.04132167, + "step": 12625 + }, + { + "epoch": 25.252, + "grad_norm": 1.165188193321228, + "learning_rate": 2e-05, + "loss": 0.04114609, + "step": 12626 + }, + { + "epoch": 25.254, + "grad_norm": 1.5809382200241089, + "learning_rate": 2e-05, + "loss": 0.04351167, + "step": 12627 + }, + { + "epoch": 25.256, + "grad_norm": 1.449571132659912, + "learning_rate": 2e-05, + "loss": 0.0306358, + "step": 12628 + }, + { + "epoch": 25.258, + "grad_norm": 1.3308436870574951, + "learning_rate": 2e-05, + "loss": 0.02577632, + "step": 12629 + }, + { + "epoch": 25.26, + "grad_norm": 1.2263076305389404, + "learning_rate": 2e-05, + "loss": 0.03553704, + "step": 12630 + }, + { + "epoch": 25.262, + "grad_norm": 1.204790711402893, + "learning_rate": 2e-05, + "loss": 0.04875295, + "step": 12631 + }, + { + "epoch": 25.264, + "grad_norm": 1.111572504043579, + 
"learning_rate": 2e-05, + "loss": 0.04481274, + "step": 12632 + }, + { + "epoch": 25.266, + "grad_norm": 1.6431896686553955, + "learning_rate": 2e-05, + "loss": 0.05574667, + "step": 12633 + }, + { + "epoch": 25.268, + "grad_norm": 1.5125669240951538, + "learning_rate": 2e-05, + "loss": 0.05090974, + "step": 12634 + }, + { + "epoch": 25.27, + "grad_norm": 6.2000932693481445, + "learning_rate": 2e-05, + "loss": 0.04442751, + "step": 12635 + }, + { + "epoch": 25.272, + "grad_norm": 4.573184967041016, + "learning_rate": 2e-05, + "loss": 0.06365525, + "step": 12636 + }, + { + "epoch": 25.274, + "grad_norm": 2.154564142227173, + "learning_rate": 2e-05, + "loss": 0.03896628, + "step": 12637 + }, + { + "epoch": 25.276, + "grad_norm": 1.2963447570800781, + "learning_rate": 2e-05, + "loss": 0.04430124, + "step": 12638 + }, + { + "epoch": 25.278, + "grad_norm": 1.1166437864303589, + "learning_rate": 2e-05, + "loss": 0.03765798, + "step": 12639 + }, + { + "epoch": 25.28, + "grad_norm": 1.1233835220336914, + "learning_rate": 2e-05, + "loss": 0.03647223, + "step": 12640 + }, + { + "epoch": 25.282, + "grad_norm": 1.2153040170669556, + "learning_rate": 2e-05, + "loss": 0.04259994, + "step": 12641 + }, + { + "epoch": 25.284, + "grad_norm": 1.6373004913330078, + "learning_rate": 2e-05, + "loss": 0.05044476, + "step": 12642 + }, + { + "epoch": 25.286, + "grad_norm": 1.4357736110687256, + "learning_rate": 2e-05, + "loss": 0.04274999, + "step": 12643 + }, + { + "epoch": 25.288, + "grad_norm": 1.0987167358398438, + "learning_rate": 2e-05, + "loss": 0.04832689, + "step": 12644 + }, + { + "epoch": 25.29, + "grad_norm": 1.4695364236831665, + "learning_rate": 2e-05, + "loss": 0.04650634, + "step": 12645 + }, + { + "epoch": 25.292, + "grad_norm": 1.8129997253417969, + "learning_rate": 2e-05, + "loss": 0.03666451, + "step": 12646 + }, + { + "epoch": 25.294, + "grad_norm": 1.2179253101348877, + "learning_rate": 2e-05, + "loss": 0.03910381, + "step": 12647 + }, + { + "epoch": 25.296, + "grad_norm": 1.5008174180984497, + "learning_rate": 2e-05, + "loss": 0.05692586, + "step": 12648 + }, + { + "epoch": 25.298, + "grad_norm": 0.9341026544570923, + "learning_rate": 2e-05, + "loss": 0.03112362, + "step": 12649 + }, + { + "epoch": 25.3, + "grad_norm": 0.8844679594039917, + "learning_rate": 2e-05, + "loss": 0.02362539, + "step": 12650 + }, + { + "epoch": 25.302, + "grad_norm": 1.7629514932632446, + "learning_rate": 2e-05, + "loss": 0.03517685, + "step": 12651 + }, + { + "epoch": 25.304, + "grad_norm": 1.303821325302124, + "learning_rate": 2e-05, + "loss": 0.05173323, + "step": 12652 + }, + { + "epoch": 25.306, + "grad_norm": 1.4266308546066284, + "learning_rate": 2e-05, + "loss": 0.05540767, + "step": 12653 + }, + { + "epoch": 25.308, + "grad_norm": 1.2281945943832397, + "learning_rate": 2e-05, + "loss": 0.04114369, + "step": 12654 + }, + { + "epoch": 25.31, + "grad_norm": 1.3700491189956665, + "learning_rate": 2e-05, + "loss": 0.03522223, + "step": 12655 + }, + { + "epoch": 25.312, + "grad_norm": 1.426082968711853, + "learning_rate": 2e-05, + "loss": 0.06648382, + "step": 12656 + }, + { + "epoch": 25.314, + "grad_norm": 1.3388099670410156, + "learning_rate": 2e-05, + "loss": 0.05874391, + "step": 12657 + }, + { + "epoch": 25.316, + "grad_norm": 0.9311839938163757, + "learning_rate": 2e-05, + "loss": 0.0290327, + "step": 12658 + }, + { + "epoch": 25.318, + "grad_norm": 1.222132682800293, + "learning_rate": 2e-05, + "loss": 0.03191571, + "step": 12659 + }, + { + "epoch": 25.32, + "grad_norm": 1.1202843189239502, + 
"learning_rate": 2e-05, + "loss": 0.03595486, + "step": 12660 + }, + { + "epoch": 25.322, + "grad_norm": 1.4704622030258179, + "learning_rate": 2e-05, + "loss": 0.04287569, + "step": 12661 + }, + { + "epoch": 25.324, + "grad_norm": 0.9908611178398132, + "learning_rate": 2e-05, + "loss": 0.03052938, + "step": 12662 + }, + { + "epoch": 25.326, + "grad_norm": 1.0778824090957642, + "learning_rate": 2e-05, + "loss": 0.04454803, + "step": 12663 + }, + { + "epoch": 25.328, + "grad_norm": 1.4507533311843872, + "learning_rate": 2e-05, + "loss": 0.04375216, + "step": 12664 + }, + { + "epoch": 25.33, + "grad_norm": 1.0799167156219482, + "learning_rate": 2e-05, + "loss": 0.03894107, + "step": 12665 + }, + { + "epoch": 25.332, + "grad_norm": 1.2972630262374878, + "learning_rate": 2e-05, + "loss": 0.03249099, + "step": 12666 + }, + { + "epoch": 25.334, + "grad_norm": 1.2298357486724854, + "learning_rate": 2e-05, + "loss": 0.04411597, + "step": 12667 + }, + { + "epoch": 25.336, + "grad_norm": 1.2464599609375, + "learning_rate": 2e-05, + "loss": 0.03922657, + "step": 12668 + }, + { + "epoch": 25.338, + "grad_norm": 1.6811728477478027, + "learning_rate": 2e-05, + "loss": 0.05457025, + "step": 12669 + }, + { + "epoch": 25.34, + "grad_norm": 1.4204384088516235, + "learning_rate": 2e-05, + "loss": 0.03159765, + "step": 12670 + }, + { + "epoch": 25.342, + "grad_norm": 1.5988712310791016, + "learning_rate": 2e-05, + "loss": 0.04662495, + "step": 12671 + }, + { + "epoch": 25.344, + "grad_norm": 1.4310500621795654, + "learning_rate": 2e-05, + "loss": 0.05521808, + "step": 12672 + }, + { + "epoch": 25.346, + "grad_norm": 1.4796041250228882, + "learning_rate": 2e-05, + "loss": 0.05029026, + "step": 12673 + }, + { + "epoch": 25.348, + "grad_norm": 1.120280385017395, + "learning_rate": 2e-05, + "loss": 0.05406159, + "step": 12674 + }, + { + "epoch": 25.35, + "grad_norm": 1.5472899675369263, + "learning_rate": 2e-05, + "loss": 0.04434624, + "step": 12675 + }, + { + "epoch": 25.352, + "grad_norm": 1.5988304615020752, + "learning_rate": 2e-05, + "loss": 0.03512241, + "step": 12676 + }, + { + "epoch": 25.354, + "grad_norm": 2.166592836380005, + "learning_rate": 2e-05, + "loss": 0.05324619, + "step": 12677 + }, + { + "epoch": 25.356, + "grad_norm": 1.23325514793396, + "learning_rate": 2e-05, + "loss": 0.0458478, + "step": 12678 + }, + { + "epoch": 25.358, + "grad_norm": 1.305748462677002, + "learning_rate": 2e-05, + "loss": 0.04038317, + "step": 12679 + }, + { + "epoch": 25.36, + "grad_norm": 1.2116440534591675, + "learning_rate": 2e-05, + "loss": 0.04997266, + "step": 12680 + }, + { + "epoch": 25.362, + "grad_norm": 1.0483074188232422, + "learning_rate": 2e-05, + "loss": 0.03716281, + "step": 12681 + }, + { + "epoch": 25.364, + "grad_norm": 1.1586565971374512, + "learning_rate": 2e-05, + "loss": 0.03767997, + "step": 12682 + }, + { + "epoch": 25.366, + "grad_norm": 1.693536400794983, + "learning_rate": 2e-05, + "loss": 0.05087979, + "step": 12683 + }, + { + "epoch": 25.368, + "grad_norm": 1.4441686868667603, + "learning_rate": 2e-05, + "loss": 0.05642378, + "step": 12684 + }, + { + "epoch": 25.37, + "grad_norm": 1.18898344039917, + "learning_rate": 2e-05, + "loss": 0.04010397, + "step": 12685 + }, + { + "epoch": 25.372, + "grad_norm": 1.593220591545105, + "learning_rate": 2e-05, + "loss": 0.0599516, + "step": 12686 + }, + { + "epoch": 25.374, + "grad_norm": 2.1640501022338867, + "learning_rate": 2e-05, + "loss": 0.04657248, + "step": 12687 + }, + { + "epoch": 25.376, + "grad_norm": 1.3017292022705078, + "learning_rate": 
2e-05, + "loss": 0.05840718, + "step": 12688 + }, + { + "epoch": 25.378, + "grad_norm": 1.1755383014678955, + "learning_rate": 2e-05, + "loss": 0.04789887, + "step": 12689 + }, + { + "epoch": 25.38, + "grad_norm": 1.9385019540786743, + "learning_rate": 2e-05, + "loss": 0.05613485, + "step": 12690 + }, + { + "epoch": 25.382, + "grad_norm": 1.2281224727630615, + "learning_rate": 2e-05, + "loss": 0.03748925, + "step": 12691 + }, + { + "epoch": 25.384, + "grad_norm": 1.4565155506134033, + "learning_rate": 2e-05, + "loss": 0.0405966, + "step": 12692 + }, + { + "epoch": 25.386, + "grad_norm": 2.12028169631958, + "learning_rate": 2e-05, + "loss": 0.04600952, + "step": 12693 + }, + { + "epoch": 25.388, + "grad_norm": 1.0148862600326538, + "learning_rate": 2e-05, + "loss": 0.02967937, + "step": 12694 + }, + { + "epoch": 25.39, + "grad_norm": 1.3207978010177612, + "learning_rate": 2e-05, + "loss": 0.06044834, + "step": 12695 + }, + { + "epoch": 25.392, + "grad_norm": 1.3644589185714722, + "learning_rate": 2e-05, + "loss": 0.03912506, + "step": 12696 + }, + { + "epoch": 25.394, + "grad_norm": 1.5914479494094849, + "learning_rate": 2e-05, + "loss": 0.0491058, + "step": 12697 + }, + { + "epoch": 25.396, + "grad_norm": 1.4540284872055054, + "learning_rate": 2e-05, + "loss": 0.04001951, + "step": 12698 + }, + { + "epoch": 25.398, + "grad_norm": 1.3614214658737183, + "learning_rate": 2e-05, + "loss": 0.04715081, + "step": 12699 + }, + { + "epoch": 25.4, + "grad_norm": 1.1477137804031372, + "learning_rate": 2e-05, + "loss": 0.0351803, + "step": 12700 + }, + { + "epoch": 25.402, + "grad_norm": 1.382109522819519, + "learning_rate": 2e-05, + "loss": 0.04908609, + "step": 12701 + }, + { + "epoch": 25.404, + "grad_norm": 1.372443675994873, + "learning_rate": 2e-05, + "loss": 0.06265668, + "step": 12702 + }, + { + "epoch": 25.406, + "grad_norm": 1.0587457418441772, + "learning_rate": 2e-05, + "loss": 0.04351508, + "step": 12703 + }, + { + "epoch": 25.408, + "grad_norm": 1.4110264778137207, + "learning_rate": 2e-05, + "loss": 0.05135802, + "step": 12704 + }, + { + "epoch": 25.41, + "grad_norm": 2.1139957904815674, + "learning_rate": 2e-05, + "loss": 0.03126623, + "step": 12705 + }, + { + "epoch": 25.412, + "grad_norm": 1.8020521402359009, + "learning_rate": 2e-05, + "loss": 0.04075658, + "step": 12706 + }, + { + "epoch": 25.414, + "grad_norm": 1.4906790256500244, + "learning_rate": 2e-05, + "loss": 0.03323084, + "step": 12707 + }, + { + "epoch": 25.416, + "grad_norm": 1.2605034112930298, + "learning_rate": 2e-05, + "loss": 0.04425685, + "step": 12708 + }, + { + "epoch": 25.418, + "grad_norm": 1.4229034185409546, + "learning_rate": 2e-05, + "loss": 0.06664809, + "step": 12709 + }, + { + "epoch": 25.42, + "grad_norm": 3.019768476486206, + "learning_rate": 2e-05, + "loss": 0.06653355, + "step": 12710 + }, + { + "epoch": 25.422, + "grad_norm": 1.1232490539550781, + "learning_rate": 2e-05, + "loss": 0.03858437, + "step": 12711 + }, + { + "epoch": 25.424, + "grad_norm": 1.0489226579666138, + "learning_rate": 2e-05, + "loss": 0.03429198, + "step": 12712 + }, + { + "epoch": 25.426, + "grad_norm": 2.398514747619629, + "learning_rate": 2e-05, + "loss": 0.07073388, + "step": 12713 + }, + { + "epoch": 25.428, + "grad_norm": 1.2204501628875732, + "learning_rate": 2e-05, + "loss": 0.04312594, + "step": 12714 + }, + { + "epoch": 25.43, + "grad_norm": 0.9238330721855164, + "learning_rate": 2e-05, + "loss": 0.03127215, + "step": 12715 + }, + { + "epoch": 25.432, + "grad_norm": 1.1325103044509888, + "learning_rate": 2e-05, + 
"loss": 0.04861875, + "step": 12716 + }, + { + "epoch": 25.434, + "grad_norm": 1.4087941646575928, + "learning_rate": 2e-05, + "loss": 0.04097962, + "step": 12717 + }, + { + "epoch": 25.436, + "grad_norm": 2.770249366760254, + "learning_rate": 2e-05, + "loss": 0.05767106, + "step": 12718 + }, + { + "epoch": 25.438, + "grad_norm": 1.5272942781448364, + "learning_rate": 2e-05, + "loss": 0.05232708, + "step": 12719 + }, + { + "epoch": 25.44, + "grad_norm": 1.8735677003860474, + "learning_rate": 2e-05, + "loss": 0.04702976, + "step": 12720 + }, + { + "epoch": 25.442, + "grad_norm": 1.5433956384658813, + "learning_rate": 2e-05, + "loss": 0.04802687, + "step": 12721 + }, + { + "epoch": 25.444, + "grad_norm": 1.2358498573303223, + "learning_rate": 2e-05, + "loss": 0.03744403, + "step": 12722 + }, + { + "epoch": 25.446, + "grad_norm": 1.9896537065505981, + "learning_rate": 2e-05, + "loss": 0.0489389, + "step": 12723 + }, + { + "epoch": 25.448, + "grad_norm": 1.2103878259658813, + "learning_rate": 2e-05, + "loss": 0.04206013, + "step": 12724 + }, + { + "epoch": 25.45, + "grad_norm": 1.4859042167663574, + "learning_rate": 2e-05, + "loss": 0.04817963, + "step": 12725 + }, + { + "epoch": 25.452, + "grad_norm": 1.0707011222839355, + "learning_rate": 2e-05, + "loss": 0.02487176, + "step": 12726 + }, + { + "epoch": 25.454, + "grad_norm": 0.8648288249969482, + "learning_rate": 2e-05, + "loss": 0.02900369, + "step": 12727 + }, + { + "epoch": 25.456, + "grad_norm": 1.057659387588501, + "learning_rate": 2e-05, + "loss": 0.03823693, + "step": 12728 + }, + { + "epoch": 25.458, + "grad_norm": 1.477248191833496, + "learning_rate": 2e-05, + "loss": 0.05023545, + "step": 12729 + }, + { + "epoch": 25.46, + "grad_norm": 1.2823160886764526, + "learning_rate": 2e-05, + "loss": 0.05811198, + "step": 12730 + }, + { + "epoch": 25.462, + "grad_norm": 1.3287951946258545, + "learning_rate": 2e-05, + "loss": 0.05152711, + "step": 12731 + }, + { + "epoch": 25.464, + "grad_norm": 1.0755188465118408, + "learning_rate": 2e-05, + "loss": 0.04283263, + "step": 12732 + }, + { + "epoch": 25.466, + "grad_norm": 1.084681510925293, + "learning_rate": 2e-05, + "loss": 0.03596375, + "step": 12733 + }, + { + "epoch": 25.468, + "grad_norm": 1.1877176761627197, + "learning_rate": 2e-05, + "loss": 0.04653904, + "step": 12734 + }, + { + "epoch": 25.47, + "grad_norm": 1.7374459505081177, + "learning_rate": 2e-05, + "loss": 0.03502556, + "step": 12735 + }, + { + "epoch": 25.472, + "grad_norm": 1.5863116979599, + "learning_rate": 2e-05, + "loss": 0.03652181, + "step": 12736 + }, + { + "epoch": 25.474, + "grad_norm": 1.1849809885025024, + "learning_rate": 2e-05, + "loss": 0.0374795, + "step": 12737 + }, + { + "epoch": 25.476, + "grad_norm": 1.6925612688064575, + "learning_rate": 2e-05, + "loss": 0.05677975, + "step": 12738 + }, + { + "epoch": 25.478, + "grad_norm": 1.0018976926803589, + "learning_rate": 2e-05, + "loss": 0.035368, + "step": 12739 + }, + { + "epoch": 25.48, + "grad_norm": 1.0738310813903809, + "learning_rate": 2e-05, + "loss": 0.04165709, + "step": 12740 + }, + { + "epoch": 25.482, + "grad_norm": 1.0751115083694458, + "learning_rate": 2e-05, + "loss": 0.04024215, + "step": 12741 + }, + { + "epoch": 25.484, + "grad_norm": 1.2548881769180298, + "learning_rate": 2e-05, + "loss": 0.06063127, + "step": 12742 + }, + { + "epoch": 25.486, + "grad_norm": 2.8296592235565186, + "learning_rate": 2e-05, + "loss": 0.04534397, + "step": 12743 + }, + { + "epoch": 25.488, + "grad_norm": 2.0588326454162598, + "learning_rate": 2e-05, + "loss": 
0.07510805, + "step": 12744 + }, + { + "epoch": 25.49, + "grad_norm": 2.919177293777466, + "learning_rate": 2e-05, + "loss": 0.05336414, + "step": 12745 + }, + { + "epoch": 25.492, + "grad_norm": 1.1437351703643799, + "learning_rate": 2e-05, + "loss": 0.04661782, + "step": 12746 + }, + { + "epoch": 25.494, + "grad_norm": 1.2010008096694946, + "learning_rate": 2e-05, + "loss": 0.04713929, + "step": 12747 + }, + { + "epoch": 25.496, + "grad_norm": 1.2396228313446045, + "learning_rate": 2e-05, + "loss": 0.03676428, + "step": 12748 + }, + { + "epoch": 25.498, + "grad_norm": 1.1913061141967773, + "learning_rate": 2e-05, + "loss": 0.03432823, + "step": 12749 + }, + { + "epoch": 25.5, + "grad_norm": 1.0427284240722656, + "learning_rate": 2e-05, + "loss": 0.04304377, + "step": 12750 + }, + { + "epoch": 25.502, + "grad_norm": 0.9250820279121399, + "learning_rate": 2e-05, + "loss": 0.02991773, + "step": 12751 + }, + { + "epoch": 25.504, + "grad_norm": 1.7029353380203247, + "learning_rate": 2e-05, + "loss": 0.05027733, + "step": 12752 + }, + { + "epoch": 25.506, + "grad_norm": 1.2598884105682373, + "learning_rate": 2e-05, + "loss": 0.03872483, + "step": 12753 + }, + { + "epoch": 25.508, + "grad_norm": 1.1312626600265503, + "learning_rate": 2e-05, + "loss": 0.03387032, + "step": 12754 + }, + { + "epoch": 25.51, + "grad_norm": 1.4454662799835205, + "learning_rate": 2e-05, + "loss": 0.04541782, + "step": 12755 + }, + { + "epoch": 25.512, + "grad_norm": 1.2431281805038452, + "learning_rate": 2e-05, + "loss": 0.04072841, + "step": 12756 + }, + { + "epoch": 25.514, + "grad_norm": 1.0991272926330566, + "learning_rate": 2e-05, + "loss": 0.03918246, + "step": 12757 + }, + { + "epoch": 25.516, + "grad_norm": 1.9460620880126953, + "learning_rate": 2e-05, + "loss": 0.06519485, + "step": 12758 + }, + { + "epoch": 25.518, + "grad_norm": 1.238317608833313, + "learning_rate": 2e-05, + "loss": 0.05222324, + "step": 12759 + }, + { + "epoch": 25.52, + "grad_norm": 1.1859232187271118, + "learning_rate": 2e-05, + "loss": 0.03913081, + "step": 12760 + }, + { + "epoch": 25.522, + "grad_norm": 1.1361500024795532, + "learning_rate": 2e-05, + "loss": 0.05126172, + "step": 12761 + }, + { + "epoch": 25.524, + "grad_norm": 1.657873511314392, + "learning_rate": 2e-05, + "loss": 0.06139502, + "step": 12762 + }, + { + "epoch": 25.526, + "grad_norm": 0.9964461326599121, + "learning_rate": 2e-05, + "loss": 0.03220826, + "step": 12763 + }, + { + "epoch": 25.528, + "grad_norm": 1.958337426185608, + "learning_rate": 2e-05, + "loss": 0.03383143, + "step": 12764 + }, + { + "epoch": 25.53, + "grad_norm": 1.3577609062194824, + "learning_rate": 2e-05, + "loss": 0.04257483, + "step": 12765 + }, + { + "epoch": 25.532, + "grad_norm": 1.1528269052505493, + "learning_rate": 2e-05, + "loss": 0.04727619, + "step": 12766 + }, + { + "epoch": 25.534, + "grad_norm": 1.3304978609085083, + "learning_rate": 2e-05, + "loss": 0.04373534, + "step": 12767 + }, + { + "epoch": 25.536, + "grad_norm": 2.185859441757202, + "learning_rate": 2e-05, + "loss": 0.06034337, + "step": 12768 + }, + { + "epoch": 25.538, + "grad_norm": 1.6654530763626099, + "learning_rate": 2e-05, + "loss": 0.05115644, + "step": 12769 + }, + { + "epoch": 25.54, + "grad_norm": 1.3865916728973389, + "learning_rate": 2e-05, + "loss": 0.05147178, + "step": 12770 + }, + { + "epoch": 25.542, + "grad_norm": 1.5472357273101807, + "learning_rate": 2e-05, + "loss": 0.04129698, + "step": 12771 + }, + { + "epoch": 25.544, + "grad_norm": 2.8680312633514404, + "learning_rate": 2e-05, + "loss": 
0.05492282, + "step": 12772 + }, + { + "epoch": 25.546, + "grad_norm": 1.9399982690811157, + "learning_rate": 2e-05, + "loss": 0.04655978, + "step": 12773 + }, + { + "epoch": 25.548000000000002, + "grad_norm": 1.1271389722824097, + "learning_rate": 2e-05, + "loss": 0.04596433, + "step": 12774 + }, + { + "epoch": 25.55, + "grad_norm": 1.3790403604507446, + "learning_rate": 2e-05, + "loss": 0.04871649, + "step": 12775 + }, + { + "epoch": 25.552, + "grad_norm": 1.8597426414489746, + "learning_rate": 2e-05, + "loss": 0.04528391, + "step": 12776 + }, + { + "epoch": 25.554, + "grad_norm": 1.6001335382461548, + "learning_rate": 2e-05, + "loss": 0.04731301, + "step": 12777 + }, + { + "epoch": 25.556, + "grad_norm": 1.0347784757614136, + "learning_rate": 2e-05, + "loss": 0.03880426, + "step": 12778 + }, + { + "epoch": 25.558, + "grad_norm": 1.1842137575149536, + "learning_rate": 2e-05, + "loss": 0.04206136, + "step": 12779 + }, + { + "epoch": 25.56, + "grad_norm": 1.1110947132110596, + "learning_rate": 2e-05, + "loss": 0.02964817, + "step": 12780 + }, + { + "epoch": 25.562, + "grad_norm": 1.3468905687332153, + "learning_rate": 2e-05, + "loss": 0.05464962, + "step": 12781 + }, + { + "epoch": 25.564, + "grad_norm": 1.660844326019287, + "learning_rate": 2e-05, + "loss": 0.05474025, + "step": 12782 + }, + { + "epoch": 25.566, + "grad_norm": 0.958869993686676, + "learning_rate": 2e-05, + "loss": 0.03442763, + "step": 12783 + }, + { + "epoch": 25.568, + "grad_norm": 1.2681729793548584, + "learning_rate": 2e-05, + "loss": 0.05398714, + "step": 12784 + }, + { + "epoch": 25.57, + "grad_norm": 1.6511400938034058, + "learning_rate": 2e-05, + "loss": 0.04100592, + "step": 12785 + }, + { + "epoch": 25.572, + "grad_norm": 1.7444071769714355, + "learning_rate": 2e-05, + "loss": 0.05146802, + "step": 12786 + }, + { + "epoch": 25.574, + "grad_norm": 1.5246992111206055, + "learning_rate": 2e-05, + "loss": 0.04002138, + "step": 12787 + }, + { + "epoch": 25.576, + "grad_norm": 1.2669793367385864, + "learning_rate": 2e-05, + "loss": 0.03664404, + "step": 12788 + }, + { + "epoch": 25.578, + "grad_norm": 1.2276067733764648, + "learning_rate": 2e-05, + "loss": 0.04909092, + "step": 12789 + }, + { + "epoch": 25.58, + "grad_norm": 1.8915936946868896, + "learning_rate": 2e-05, + "loss": 0.04786887, + "step": 12790 + }, + { + "epoch": 25.582, + "grad_norm": 1.526929259300232, + "learning_rate": 2e-05, + "loss": 0.0330558, + "step": 12791 + }, + { + "epoch": 25.584, + "grad_norm": 1.7936866283416748, + "learning_rate": 2e-05, + "loss": 0.04382351, + "step": 12792 + }, + { + "epoch": 25.586, + "grad_norm": 2.3560688495635986, + "learning_rate": 2e-05, + "loss": 0.03439542, + "step": 12793 + }, + { + "epoch": 25.588, + "grad_norm": 1.3485525846481323, + "learning_rate": 2e-05, + "loss": 0.03818742, + "step": 12794 + }, + { + "epoch": 25.59, + "grad_norm": 1.1782861948013306, + "learning_rate": 2e-05, + "loss": 0.03852155, + "step": 12795 + }, + { + "epoch": 25.592, + "grad_norm": 1.5356895923614502, + "learning_rate": 2e-05, + "loss": 0.05399141, + "step": 12796 + }, + { + "epoch": 25.594, + "grad_norm": 1.246084451675415, + "learning_rate": 2e-05, + "loss": 0.04823676, + "step": 12797 + }, + { + "epoch": 25.596, + "grad_norm": 1.3950222730636597, + "learning_rate": 2e-05, + "loss": 0.02780427, + "step": 12798 + }, + { + "epoch": 25.598, + "grad_norm": 3.066758632659912, + "learning_rate": 2e-05, + "loss": 0.05022492, + "step": 12799 + }, + { + "epoch": 25.6, + "grad_norm": 0.9001642465591431, + "learning_rate": 2e-05, + "loss": 
0.02825335, + "step": 12800 + }, + { + "epoch": 25.602, + "grad_norm": 1.1426812410354614, + "learning_rate": 2e-05, + "loss": 0.02332602, + "step": 12801 + }, + { + "epoch": 25.604, + "grad_norm": 1.341883897781372, + "learning_rate": 2e-05, + "loss": 0.05295313, + "step": 12802 + }, + { + "epoch": 25.606, + "grad_norm": 1.4535542726516724, + "learning_rate": 2e-05, + "loss": 0.04531492, + "step": 12803 + }, + { + "epoch": 25.608, + "grad_norm": 1.222536563873291, + "learning_rate": 2e-05, + "loss": 0.03721589, + "step": 12804 + }, + { + "epoch": 25.61, + "grad_norm": 1.7234156131744385, + "learning_rate": 2e-05, + "loss": 0.05661245, + "step": 12805 + }, + { + "epoch": 25.612, + "grad_norm": 1.7372621297836304, + "learning_rate": 2e-05, + "loss": 0.03441635, + "step": 12806 + }, + { + "epoch": 25.614, + "grad_norm": 1.6260429620742798, + "learning_rate": 2e-05, + "loss": 0.0488297, + "step": 12807 + }, + { + "epoch": 25.616, + "grad_norm": 1.0251811742782593, + "learning_rate": 2e-05, + "loss": 0.03384798, + "step": 12808 + }, + { + "epoch": 25.618, + "grad_norm": 1.0292855501174927, + "learning_rate": 2e-05, + "loss": 0.04441176, + "step": 12809 + }, + { + "epoch": 25.62, + "grad_norm": 0.9246246218681335, + "learning_rate": 2e-05, + "loss": 0.0311376, + "step": 12810 + }, + { + "epoch": 25.622, + "grad_norm": 1.1222032308578491, + "learning_rate": 2e-05, + "loss": 0.03349977, + "step": 12811 + }, + { + "epoch": 25.624, + "grad_norm": 1.044348955154419, + "learning_rate": 2e-05, + "loss": 0.03562807, + "step": 12812 + }, + { + "epoch": 25.626, + "grad_norm": 1.314139723777771, + "learning_rate": 2e-05, + "loss": 0.048839, + "step": 12813 + }, + { + "epoch": 25.628, + "grad_norm": 1.1304693222045898, + "learning_rate": 2e-05, + "loss": 0.05354474, + "step": 12814 + }, + { + "epoch": 25.63, + "grad_norm": 1.2466613054275513, + "learning_rate": 2e-05, + "loss": 0.0536761, + "step": 12815 + }, + { + "epoch": 25.632, + "grad_norm": 1.0759629011154175, + "learning_rate": 2e-05, + "loss": 0.04039822, + "step": 12816 + }, + { + "epoch": 25.634, + "grad_norm": 1.232729196548462, + "learning_rate": 2e-05, + "loss": 0.04778222, + "step": 12817 + }, + { + "epoch": 25.636, + "grad_norm": 1.3825780153274536, + "learning_rate": 2e-05, + "loss": 0.0383257, + "step": 12818 + }, + { + "epoch": 25.638, + "grad_norm": 1.105268120765686, + "learning_rate": 2e-05, + "loss": 0.03146917, + "step": 12819 + }, + { + "epoch": 25.64, + "grad_norm": 3.4076695442199707, + "learning_rate": 2e-05, + "loss": 0.04861237, + "step": 12820 + }, + { + "epoch": 25.642, + "grad_norm": 1.1343084573745728, + "learning_rate": 2e-05, + "loss": 0.0362022, + "step": 12821 + }, + { + "epoch": 25.644, + "grad_norm": 1.5663788318634033, + "learning_rate": 2e-05, + "loss": 0.04831579, + "step": 12822 + }, + { + "epoch": 25.646, + "grad_norm": 1.799911618232727, + "learning_rate": 2e-05, + "loss": 0.05474582, + "step": 12823 + }, + { + "epoch": 25.648, + "grad_norm": 1.5941003561019897, + "learning_rate": 2e-05, + "loss": 0.04637301, + "step": 12824 + }, + { + "epoch": 25.65, + "grad_norm": 1.7795392274856567, + "learning_rate": 2e-05, + "loss": 0.0298002, + "step": 12825 + }, + { + "epoch": 25.652, + "grad_norm": 1.5927716493606567, + "learning_rate": 2e-05, + "loss": 0.0472445, + "step": 12826 + }, + { + "epoch": 25.654, + "grad_norm": 1.3922079801559448, + "learning_rate": 2e-05, + "loss": 0.04148681, + "step": 12827 + }, + { + "epoch": 25.656, + "grad_norm": 1.3033424615859985, + "learning_rate": 2e-05, + "loss": 0.03751112, + 
"step": 12828 + }, + { + "epoch": 25.658, + "grad_norm": 1.3551427125930786, + "learning_rate": 2e-05, + "loss": 0.04426542, + "step": 12829 + }, + { + "epoch": 25.66, + "grad_norm": 4.914473056793213, + "learning_rate": 2e-05, + "loss": 0.05875074, + "step": 12830 + }, + { + "epoch": 25.662, + "grad_norm": 1.2864055633544922, + "learning_rate": 2e-05, + "loss": 0.05747083, + "step": 12831 + }, + { + "epoch": 25.664, + "grad_norm": 1.245685338973999, + "learning_rate": 2e-05, + "loss": 0.04856413, + "step": 12832 + }, + { + "epoch": 25.666, + "grad_norm": 1.2700167894363403, + "learning_rate": 2e-05, + "loss": 0.04627567, + "step": 12833 + }, + { + "epoch": 25.668, + "grad_norm": 2.3879594802856445, + "learning_rate": 2e-05, + "loss": 0.04573566, + "step": 12834 + }, + { + "epoch": 25.67, + "grad_norm": 1.0853418111801147, + "learning_rate": 2e-05, + "loss": 0.03547914, + "step": 12835 + }, + { + "epoch": 25.672, + "grad_norm": 2.4202558994293213, + "learning_rate": 2e-05, + "loss": 0.05616173, + "step": 12836 + }, + { + "epoch": 25.674, + "grad_norm": 1.213026523590088, + "learning_rate": 2e-05, + "loss": 0.04792723, + "step": 12837 + }, + { + "epoch": 25.676, + "grad_norm": 1.4499080181121826, + "learning_rate": 2e-05, + "loss": 0.04635759, + "step": 12838 + }, + { + "epoch": 25.678, + "grad_norm": 1.0613125562667847, + "learning_rate": 2e-05, + "loss": 0.04478652, + "step": 12839 + }, + { + "epoch": 25.68, + "grad_norm": 0.9953880906105042, + "learning_rate": 2e-05, + "loss": 0.02956217, + "step": 12840 + }, + { + "epoch": 25.682, + "grad_norm": 2.4529874324798584, + "learning_rate": 2e-05, + "loss": 0.04293549, + "step": 12841 + }, + { + "epoch": 25.684, + "grad_norm": 1.058303952217102, + "learning_rate": 2e-05, + "loss": 0.03956451, + "step": 12842 + }, + { + "epoch": 25.686, + "grad_norm": 1.2368803024291992, + "learning_rate": 2e-05, + "loss": 0.05040938, + "step": 12843 + }, + { + "epoch": 25.688, + "grad_norm": 1.3737293481826782, + "learning_rate": 2e-05, + "loss": 0.05072392, + "step": 12844 + }, + { + "epoch": 25.69, + "grad_norm": 1.0648189783096313, + "learning_rate": 2e-05, + "loss": 0.03040619, + "step": 12845 + }, + { + "epoch": 25.692, + "grad_norm": 1.3716273307800293, + "learning_rate": 2e-05, + "loss": 0.03951315, + "step": 12846 + }, + { + "epoch": 25.694, + "grad_norm": 1.4560929536819458, + "learning_rate": 2e-05, + "loss": 0.0416421, + "step": 12847 + }, + { + "epoch": 25.696, + "grad_norm": 1.9304611682891846, + "learning_rate": 2e-05, + "loss": 0.0425549, + "step": 12848 + }, + { + "epoch": 25.698, + "grad_norm": 0.9284382462501526, + "learning_rate": 2e-05, + "loss": 0.03185133, + "step": 12849 + }, + { + "epoch": 25.7, + "grad_norm": 1.3706117868423462, + "learning_rate": 2e-05, + "loss": 0.04668346, + "step": 12850 + }, + { + "epoch": 25.701999999999998, + "grad_norm": 1.7373970746994019, + "learning_rate": 2e-05, + "loss": 0.03499283, + "step": 12851 + }, + { + "epoch": 25.704, + "grad_norm": 2.239835262298584, + "learning_rate": 2e-05, + "loss": 0.0479632, + "step": 12852 + }, + { + "epoch": 25.706, + "grad_norm": 1.1869913339614868, + "learning_rate": 2e-05, + "loss": 0.04254553, + "step": 12853 + }, + { + "epoch": 25.708, + "grad_norm": 1.1868373155593872, + "learning_rate": 2e-05, + "loss": 0.03624144, + "step": 12854 + }, + { + "epoch": 25.71, + "grad_norm": 1.2735049724578857, + "learning_rate": 2e-05, + "loss": 0.04621403, + "step": 12855 + }, + { + "epoch": 25.712, + "grad_norm": 2.1693756580352783, + "learning_rate": 2e-05, + "loss": 0.05169474, + 
"step": 12856 + }, + { + "epoch": 25.714, + "grad_norm": 1.0680606365203857, + "learning_rate": 2e-05, + "loss": 0.03689266, + "step": 12857 + }, + { + "epoch": 25.716, + "grad_norm": 1.085115671157837, + "learning_rate": 2e-05, + "loss": 0.04505826, + "step": 12858 + }, + { + "epoch": 25.718, + "grad_norm": 1.4041411876678467, + "learning_rate": 2e-05, + "loss": 0.05596658, + "step": 12859 + }, + { + "epoch": 25.72, + "grad_norm": 1.3453253507614136, + "learning_rate": 2e-05, + "loss": 0.0477687, + "step": 12860 + }, + { + "epoch": 25.722, + "grad_norm": 1.0043227672576904, + "learning_rate": 2e-05, + "loss": 0.03752232, + "step": 12861 + }, + { + "epoch": 25.724, + "grad_norm": 1.8186486959457397, + "learning_rate": 2e-05, + "loss": 0.04446795, + "step": 12862 + }, + { + "epoch": 25.726, + "grad_norm": 1.1549947261810303, + "learning_rate": 2e-05, + "loss": 0.03548076, + "step": 12863 + }, + { + "epoch": 25.728, + "grad_norm": 1.8598767518997192, + "learning_rate": 2e-05, + "loss": 0.05279416, + "step": 12864 + }, + { + "epoch": 25.73, + "grad_norm": 1.2044051885604858, + "learning_rate": 2e-05, + "loss": 0.04757049, + "step": 12865 + }, + { + "epoch": 25.732, + "grad_norm": 1.5500402450561523, + "learning_rate": 2e-05, + "loss": 0.04289635, + "step": 12866 + }, + { + "epoch": 25.734, + "grad_norm": 1.505210041999817, + "learning_rate": 2e-05, + "loss": 0.03871378, + "step": 12867 + }, + { + "epoch": 25.736, + "grad_norm": 1.5146814584732056, + "learning_rate": 2e-05, + "loss": 0.06822012, + "step": 12868 + }, + { + "epoch": 25.738, + "grad_norm": 1.9045121669769287, + "learning_rate": 2e-05, + "loss": 0.04598961, + "step": 12869 + }, + { + "epoch": 25.74, + "grad_norm": 1.5291447639465332, + "learning_rate": 2e-05, + "loss": 0.04768593, + "step": 12870 + }, + { + "epoch": 25.742, + "grad_norm": 1.6933094263076782, + "learning_rate": 2e-05, + "loss": 0.03961466, + "step": 12871 + }, + { + "epoch": 25.744, + "grad_norm": 1.7180720567703247, + "learning_rate": 2e-05, + "loss": 0.04465307, + "step": 12872 + }, + { + "epoch": 25.746, + "grad_norm": 1.5562777519226074, + "learning_rate": 2e-05, + "loss": 0.04760274, + "step": 12873 + }, + { + "epoch": 25.748, + "grad_norm": 1.8690643310546875, + "learning_rate": 2e-05, + "loss": 0.0456716, + "step": 12874 + }, + { + "epoch": 25.75, + "grad_norm": 1.6642539501190186, + "learning_rate": 2e-05, + "loss": 0.0361044, + "step": 12875 + }, + { + "epoch": 25.752, + "grad_norm": 1.0987035036087036, + "learning_rate": 2e-05, + "loss": 0.0358355, + "step": 12876 + }, + { + "epoch": 25.754, + "grad_norm": 1.276738166809082, + "learning_rate": 2e-05, + "loss": 0.04325083, + "step": 12877 + }, + { + "epoch": 25.756, + "grad_norm": 1.4052026271820068, + "learning_rate": 2e-05, + "loss": 0.05564051, + "step": 12878 + }, + { + "epoch": 25.758, + "grad_norm": 1.1691828966140747, + "learning_rate": 2e-05, + "loss": 0.0309645, + "step": 12879 + }, + { + "epoch": 25.76, + "grad_norm": 1.5935523509979248, + "learning_rate": 2e-05, + "loss": 0.04389683, + "step": 12880 + }, + { + "epoch": 25.762, + "grad_norm": 1.4796446561813354, + "learning_rate": 2e-05, + "loss": 0.05674142, + "step": 12881 + }, + { + "epoch": 25.764, + "grad_norm": 1.473631501197815, + "learning_rate": 2e-05, + "loss": 0.05766422, + "step": 12882 + }, + { + "epoch": 25.766, + "grad_norm": 1.003163456916809, + "learning_rate": 2e-05, + "loss": 0.03590136, + "step": 12883 + }, + { + "epoch": 25.768, + "grad_norm": 1.1867930889129639, + "learning_rate": 2e-05, + "loss": 0.04083221, + "step": 12884 
+ }, + { + "epoch": 25.77, + "grad_norm": 1.342621922492981, + "learning_rate": 2e-05, + "loss": 0.04845904, + "step": 12885 + }, + { + "epoch": 25.772, + "grad_norm": 1.7419025897979736, + "learning_rate": 2e-05, + "loss": 0.04856285, + "step": 12886 + }, + { + "epoch": 25.774, + "grad_norm": 0.9666280150413513, + "learning_rate": 2e-05, + "loss": 0.03569352, + "step": 12887 + }, + { + "epoch": 25.776, + "grad_norm": 1.4296828508377075, + "learning_rate": 2e-05, + "loss": 0.03583002, + "step": 12888 + }, + { + "epoch": 25.778, + "grad_norm": 1.1190946102142334, + "learning_rate": 2e-05, + "loss": 0.02983772, + "step": 12889 + }, + { + "epoch": 25.78, + "grad_norm": 1.2515982389450073, + "learning_rate": 2e-05, + "loss": 0.04836629, + "step": 12890 + }, + { + "epoch": 25.782, + "grad_norm": 2.553123712539673, + "learning_rate": 2e-05, + "loss": 0.05472127, + "step": 12891 + }, + { + "epoch": 25.784, + "grad_norm": 1.407144546508789, + "learning_rate": 2e-05, + "loss": 0.04308041, + "step": 12892 + }, + { + "epoch": 25.786, + "grad_norm": 1.1658834218978882, + "learning_rate": 2e-05, + "loss": 0.03924808, + "step": 12893 + }, + { + "epoch": 25.788, + "grad_norm": 1.917145848274231, + "learning_rate": 2e-05, + "loss": 0.06007392, + "step": 12894 + }, + { + "epoch": 25.79, + "grad_norm": 2.5494346618652344, + "learning_rate": 2e-05, + "loss": 0.04623754, + "step": 12895 + }, + { + "epoch": 25.792, + "grad_norm": 1.3239128589630127, + "learning_rate": 2e-05, + "loss": 0.04573008, + "step": 12896 + }, + { + "epoch": 25.794, + "grad_norm": 1.2541325092315674, + "learning_rate": 2e-05, + "loss": 0.03188618, + "step": 12897 + }, + { + "epoch": 25.796, + "grad_norm": 1.2667912244796753, + "learning_rate": 2e-05, + "loss": 0.04757137, + "step": 12898 + }, + { + "epoch": 25.798000000000002, + "grad_norm": 1.1675081253051758, + "learning_rate": 2e-05, + "loss": 0.03582035, + "step": 12899 + }, + { + "epoch": 25.8, + "grad_norm": 1.2599587440490723, + "learning_rate": 2e-05, + "loss": 0.05033941, + "step": 12900 + }, + { + "epoch": 25.802, + "grad_norm": 1.4526861906051636, + "learning_rate": 2e-05, + "loss": 0.02551962, + "step": 12901 + }, + { + "epoch": 25.804, + "grad_norm": 2.03424334526062, + "learning_rate": 2e-05, + "loss": 0.04258246, + "step": 12902 + }, + { + "epoch": 25.806, + "grad_norm": 1.5461796522140503, + "learning_rate": 2e-05, + "loss": 0.05401737, + "step": 12903 + }, + { + "epoch": 25.808, + "grad_norm": 1.3652410507202148, + "learning_rate": 2e-05, + "loss": 0.04861759, + "step": 12904 + }, + { + "epoch": 25.81, + "grad_norm": 2.5918078422546387, + "learning_rate": 2e-05, + "loss": 0.04921463, + "step": 12905 + }, + { + "epoch": 25.812, + "grad_norm": 1.6058486700057983, + "learning_rate": 2e-05, + "loss": 0.03778245, + "step": 12906 + }, + { + "epoch": 25.814, + "grad_norm": 1.3766907453536987, + "learning_rate": 2e-05, + "loss": 0.03459726, + "step": 12907 + }, + { + "epoch": 25.816, + "grad_norm": 1.255175232887268, + "learning_rate": 2e-05, + "loss": 0.04270238, + "step": 12908 + }, + { + "epoch": 25.818, + "grad_norm": 1.1050204038619995, + "learning_rate": 2e-05, + "loss": 0.03457598, + "step": 12909 + }, + { + "epoch": 25.82, + "grad_norm": 1.7525054216384888, + "learning_rate": 2e-05, + "loss": 0.04051763, + "step": 12910 + }, + { + "epoch": 25.822, + "grad_norm": 3.4903576374053955, + "learning_rate": 2e-05, + "loss": 0.04965971, + "step": 12911 + }, + { + "epoch": 25.824, + "grad_norm": 1.1599464416503906, + "learning_rate": 2e-05, + "loss": 0.03711332, + "step": 12912 
+ }, + { + "epoch": 25.826, + "grad_norm": 3.031643867492676, + "learning_rate": 2e-05, + "loss": 0.05846217, + "step": 12913 + }, + { + "epoch": 25.828, + "grad_norm": 1.1548397541046143, + "learning_rate": 2e-05, + "loss": 0.03609941, + "step": 12914 + }, + { + "epoch": 25.83, + "grad_norm": 6.252331733703613, + "learning_rate": 2e-05, + "loss": 0.04764732, + "step": 12915 + }, + { + "epoch": 25.832, + "grad_norm": 1.6452500820159912, + "learning_rate": 2e-05, + "loss": 0.05718385, + "step": 12916 + }, + { + "epoch": 25.834, + "grad_norm": 1.1862694025039673, + "learning_rate": 2e-05, + "loss": 0.06088445, + "step": 12917 + }, + { + "epoch": 25.836, + "grad_norm": 1.5176570415496826, + "learning_rate": 2e-05, + "loss": 0.03762556, + "step": 12918 + }, + { + "epoch": 25.838, + "grad_norm": 1.6177486181259155, + "learning_rate": 2e-05, + "loss": 0.05010308, + "step": 12919 + }, + { + "epoch": 25.84, + "grad_norm": 3.6465678215026855, + "learning_rate": 2e-05, + "loss": 0.04568088, + "step": 12920 + }, + { + "epoch": 25.842, + "grad_norm": 1.225893497467041, + "learning_rate": 2e-05, + "loss": 0.04407287, + "step": 12921 + }, + { + "epoch": 25.844, + "grad_norm": 1.419485092163086, + "learning_rate": 2e-05, + "loss": 0.05671504, + "step": 12922 + }, + { + "epoch": 25.846, + "grad_norm": 0.9688834547996521, + "learning_rate": 2e-05, + "loss": 0.0258717, + "step": 12923 + }, + { + "epoch": 25.848, + "grad_norm": 1.4948381185531616, + "learning_rate": 2e-05, + "loss": 0.05070211, + "step": 12924 + }, + { + "epoch": 25.85, + "grad_norm": 1.48432195186615, + "learning_rate": 2e-05, + "loss": 0.03078519, + "step": 12925 + }, + { + "epoch": 25.852, + "grad_norm": 1.1909409761428833, + "learning_rate": 2e-05, + "loss": 0.04312345, + "step": 12926 + }, + { + "epoch": 25.854, + "grad_norm": 1.25083327293396, + "learning_rate": 2e-05, + "loss": 0.04205637, + "step": 12927 + }, + { + "epoch": 25.856, + "grad_norm": 1.1452685594558716, + "learning_rate": 2e-05, + "loss": 0.04423122, + "step": 12928 + }, + { + "epoch": 25.858, + "grad_norm": 1.53263258934021, + "learning_rate": 2e-05, + "loss": 0.04331882, + "step": 12929 + }, + { + "epoch": 25.86, + "grad_norm": 1.451874852180481, + "learning_rate": 2e-05, + "loss": 0.04954249, + "step": 12930 + }, + { + "epoch": 25.862, + "grad_norm": 1.3785700798034668, + "learning_rate": 2e-05, + "loss": 0.04673723, + "step": 12931 + }, + { + "epoch": 25.864, + "grad_norm": 1.375231146812439, + "learning_rate": 2e-05, + "loss": 0.0459612, + "step": 12932 + }, + { + "epoch": 25.866, + "grad_norm": 1.2162593603134155, + "learning_rate": 2e-05, + "loss": 0.04416041, + "step": 12933 + }, + { + "epoch": 25.868, + "grad_norm": 1.1460282802581787, + "learning_rate": 2e-05, + "loss": 0.0397279, + "step": 12934 + }, + { + "epoch": 25.87, + "grad_norm": 1.4855611324310303, + "learning_rate": 2e-05, + "loss": 0.04811959, + "step": 12935 + }, + { + "epoch": 25.872, + "grad_norm": 1.6014434099197388, + "learning_rate": 2e-05, + "loss": 0.05218223, + "step": 12936 + }, + { + "epoch": 25.874, + "grad_norm": 1.3930706977844238, + "learning_rate": 2e-05, + "loss": 0.05569389, + "step": 12937 + }, + { + "epoch": 25.876, + "grad_norm": 1.5376465320587158, + "learning_rate": 2e-05, + "loss": 0.05116809, + "step": 12938 + }, + { + "epoch": 25.878, + "grad_norm": 1.3046684265136719, + "learning_rate": 2e-05, + "loss": 0.05488844, + "step": 12939 + }, + { + "epoch": 25.88, + "grad_norm": 1.029398798942566, + "learning_rate": 2e-05, + "loss": 0.03596478, + "step": 12940 + }, + { + "epoch": 
25.882, + "grad_norm": 1.2957504987716675, + "learning_rate": 2e-05, + "loss": 0.03197503, + "step": 12941 + }, + { + "epoch": 25.884, + "grad_norm": 1.1034080982208252, + "learning_rate": 2e-05, + "loss": 0.04575316, + "step": 12942 + }, + { + "epoch": 25.886, + "grad_norm": 1.4886722564697266, + "learning_rate": 2e-05, + "loss": 0.04241584, + "step": 12943 + }, + { + "epoch": 25.888, + "grad_norm": 1.763895869255066, + "learning_rate": 2e-05, + "loss": 0.04630455, + "step": 12944 + }, + { + "epoch": 25.89, + "grad_norm": 1.2453800439834595, + "learning_rate": 2e-05, + "loss": 0.04511211, + "step": 12945 + }, + { + "epoch": 25.892, + "grad_norm": 1.3455971479415894, + "learning_rate": 2e-05, + "loss": 0.03952543, + "step": 12946 + }, + { + "epoch": 25.894, + "grad_norm": 1.4724493026733398, + "learning_rate": 2e-05, + "loss": 0.06532818, + "step": 12947 + }, + { + "epoch": 25.896, + "grad_norm": 1.60633385181427, + "learning_rate": 2e-05, + "loss": 0.06741303, + "step": 12948 + }, + { + "epoch": 25.898, + "grad_norm": 1.3577247858047485, + "learning_rate": 2e-05, + "loss": 0.04642791, + "step": 12949 + }, + { + "epoch": 25.9, + "grad_norm": 0.9978728890419006, + "learning_rate": 2e-05, + "loss": 0.0259608, + "step": 12950 + }, + { + "epoch": 25.902, + "grad_norm": 1.5392905473709106, + "learning_rate": 2e-05, + "loss": 0.03973429, + "step": 12951 + }, + { + "epoch": 25.904, + "grad_norm": 1.0129889249801636, + "learning_rate": 2e-05, + "loss": 0.02280146, + "step": 12952 + }, + { + "epoch": 25.906, + "grad_norm": 2.133425712585449, + "learning_rate": 2e-05, + "loss": 0.04928362, + "step": 12953 + }, + { + "epoch": 25.908, + "grad_norm": 1.3658127784729004, + "learning_rate": 2e-05, + "loss": 0.0481901, + "step": 12954 + }, + { + "epoch": 25.91, + "grad_norm": 2.754051446914673, + "learning_rate": 2e-05, + "loss": 0.05194226, + "step": 12955 + }, + { + "epoch": 25.912, + "grad_norm": 1.568504810333252, + "learning_rate": 2e-05, + "loss": 0.04438355, + "step": 12956 + }, + { + "epoch": 25.914, + "grad_norm": 1.2474275827407837, + "learning_rate": 2e-05, + "loss": 0.0374986, + "step": 12957 + }, + { + "epoch": 25.916, + "grad_norm": 1.8854233026504517, + "learning_rate": 2e-05, + "loss": 0.05303825, + "step": 12958 + }, + { + "epoch": 25.918, + "grad_norm": 1.0603793859481812, + "learning_rate": 2e-05, + "loss": 0.04348066, + "step": 12959 + }, + { + "epoch": 25.92, + "grad_norm": 1.2751173973083496, + "learning_rate": 2e-05, + "loss": 0.04942772, + "step": 12960 + }, + { + "epoch": 25.922, + "grad_norm": 1.5532885789871216, + "learning_rate": 2e-05, + "loss": 0.04615701, + "step": 12961 + }, + { + "epoch": 25.924, + "grad_norm": 1.4249638319015503, + "learning_rate": 2e-05, + "loss": 0.04614986, + "step": 12962 + }, + { + "epoch": 25.926, + "grad_norm": 1.7213596105575562, + "learning_rate": 2e-05, + "loss": 0.04523218, + "step": 12963 + }, + { + "epoch": 25.928, + "grad_norm": 1.564770221710205, + "learning_rate": 2e-05, + "loss": 0.06634088, + "step": 12964 + }, + { + "epoch": 25.93, + "grad_norm": 2.3734843730926514, + "learning_rate": 2e-05, + "loss": 0.06825857, + "step": 12965 + }, + { + "epoch": 25.932, + "grad_norm": 0.9273267388343811, + "learning_rate": 2e-05, + "loss": 0.02456025, + "step": 12966 + }, + { + "epoch": 25.934, + "grad_norm": 2.232722759246826, + "learning_rate": 2e-05, + "loss": 0.05355635, + "step": 12967 + }, + { + "epoch": 25.936, + "grad_norm": 3.6321330070495605, + "learning_rate": 2e-05, + "loss": 0.05499596, + "step": 12968 + }, + { + "epoch": 25.938, + 
"grad_norm": 1.379671335220337, + "learning_rate": 2e-05, + "loss": 0.05246424, + "step": 12969 + }, + { + "epoch": 25.94, + "grad_norm": 1.2768114805221558, + "learning_rate": 2e-05, + "loss": 0.03318752, + "step": 12970 + }, + { + "epoch": 25.942, + "grad_norm": 1.4348204135894775, + "learning_rate": 2e-05, + "loss": 0.04715006, + "step": 12971 + }, + { + "epoch": 25.944, + "grad_norm": 1.4383244514465332, + "learning_rate": 2e-05, + "loss": 0.05146648, + "step": 12972 + }, + { + "epoch": 25.946, + "grad_norm": 2.579630136489868, + "learning_rate": 2e-05, + "loss": 0.05426867, + "step": 12973 + }, + { + "epoch": 25.948, + "grad_norm": 1.199095606803894, + "learning_rate": 2e-05, + "loss": 0.04479141, + "step": 12974 + }, + { + "epoch": 25.95, + "grad_norm": 1.3972845077514648, + "learning_rate": 2e-05, + "loss": 0.0324418, + "step": 12975 + }, + { + "epoch": 25.951999999999998, + "grad_norm": 1.0197676420211792, + "learning_rate": 2e-05, + "loss": 0.03971727, + "step": 12976 + }, + { + "epoch": 25.954, + "grad_norm": 1.1759026050567627, + "learning_rate": 2e-05, + "loss": 0.03519825, + "step": 12977 + }, + { + "epoch": 25.956, + "grad_norm": 1.27419912815094, + "learning_rate": 2e-05, + "loss": 0.04272161, + "step": 12978 + }, + { + "epoch": 25.958, + "grad_norm": 1.3605846166610718, + "learning_rate": 2e-05, + "loss": 0.05668283, + "step": 12979 + }, + { + "epoch": 25.96, + "grad_norm": 2.3534092903137207, + "learning_rate": 2e-05, + "loss": 0.04117073, + "step": 12980 + }, + { + "epoch": 25.962, + "grad_norm": 1.1537200212478638, + "learning_rate": 2e-05, + "loss": 0.06345319, + "step": 12981 + }, + { + "epoch": 25.964, + "grad_norm": 2.1213245391845703, + "learning_rate": 2e-05, + "loss": 0.04125203, + "step": 12982 + }, + { + "epoch": 25.966, + "grad_norm": 1.309527039527893, + "learning_rate": 2e-05, + "loss": 0.04417682, + "step": 12983 + }, + { + "epoch": 25.968, + "grad_norm": 1.206042766571045, + "learning_rate": 2e-05, + "loss": 0.04790517, + "step": 12984 + }, + { + "epoch": 25.97, + "grad_norm": 1.2381349802017212, + "learning_rate": 2e-05, + "loss": 0.03844728, + "step": 12985 + }, + { + "epoch": 25.972, + "grad_norm": 1.0183463096618652, + "learning_rate": 2e-05, + "loss": 0.04048822, + "step": 12986 + }, + { + "epoch": 25.974, + "grad_norm": 1.5077916383743286, + "learning_rate": 2e-05, + "loss": 0.03975806, + "step": 12987 + }, + { + "epoch": 25.976, + "grad_norm": 1.284839391708374, + "learning_rate": 2e-05, + "loss": 0.03966205, + "step": 12988 + }, + { + "epoch": 25.978, + "grad_norm": 2.6741936206817627, + "learning_rate": 2e-05, + "loss": 0.06823039, + "step": 12989 + }, + { + "epoch": 25.98, + "grad_norm": 1.2821533679962158, + "learning_rate": 2e-05, + "loss": 0.02755376, + "step": 12990 + }, + { + "epoch": 25.982, + "grad_norm": 1.1977559328079224, + "learning_rate": 2e-05, + "loss": 0.05067346, + "step": 12991 + }, + { + "epoch": 25.984, + "grad_norm": 1.267210602760315, + "learning_rate": 2e-05, + "loss": 0.05051599, + "step": 12992 + }, + { + "epoch": 25.986, + "grad_norm": 1.6138875484466553, + "learning_rate": 2e-05, + "loss": 0.05310912, + "step": 12993 + }, + { + "epoch": 25.988, + "grad_norm": 1.2645289897918701, + "learning_rate": 2e-05, + "loss": 0.05147942, + "step": 12994 + }, + { + "epoch": 25.99, + "grad_norm": 1.5932942628860474, + "learning_rate": 2e-05, + "loss": 0.02298946, + "step": 12995 + }, + { + "epoch": 25.992, + "grad_norm": 1.1337193250656128, + "learning_rate": 2e-05, + "loss": 0.02549054, + "step": 12996 + }, + { + "epoch": 25.994, + 
"grad_norm": 1.2463297843933105, + "learning_rate": 2e-05, + "loss": 0.04182723, + "step": 12997 + }, + { + "epoch": 25.996, + "grad_norm": 2.0521812438964844, + "learning_rate": 2e-05, + "loss": 0.04027394, + "step": 12998 + }, + { + "epoch": 25.998, + "grad_norm": 1.0944292545318604, + "learning_rate": 2e-05, + "loss": 0.05282188, + "step": 12999 + }, + { + "epoch": 26.0, + "grad_norm": 0.9855789542198181, + "learning_rate": 2e-05, + "loss": 0.03689377, + "step": 13000 + }, + { + "epoch": 26.0, + "eval_performance": { + "AngleClassification_1": 1.0, + "AngleClassification_2": 1.0, + "AngleClassification_3": 0.9780439121756487, + "Equal_1": 0.996, + "Equal_2": 0.9840319361277445, + "Equal_3": 0.9800399201596807, + "LineComparison_1": 1.0, + "LineComparison_2": 1.0, + "LineComparison_3": 0.9960079840319361, + "Parallel_1": 0.9879759519038076, + "Parallel_2": 0.9939879759519038, + "Parallel_3": 0.992, + "Perpendicular_1": 0.99, + "Perpendicular_2": 0.982, + "Perpendicular_3": 0.8717434869739479, + "PointLiesOnCircle_1": 0.9996659986639947, + "PointLiesOnCircle_2": 0.9996666666666667, + "PointLiesOnCircle_3": 0.9936, + "PointLiesOnLine_1": 0.9979959919839679, + "PointLiesOnLine_2": 0.9959919839679359, + "PointLiesOnLine_3": 0.9700598802395209 + }, + "eval_runtime": 319.7653, + "eval_samples_per_second": 32.837, + "eval_steps_per_second": 0.657, + "step": 13000 + }, + { + "epoch": 26.002, + "grad_norm": 1.2228409051895142, + "learning_rate": 2e-05, + "loss": 0.03640506, + "step": 13001 + }, + { + "epoch": 26.004, + "grad_norm": 2.15970778465271, + "learning_rate": 2e-05, + "loss": 0.05572034, + "step": 13002 + }, + { + "epoch": 26.006, + "grad_norm": 1.3950461149215698, + "learning_rate": 2e-05, + "loss": 0.04152773, + "step": 13003 + }, + { + "epoch": 26.008, + "grad_norm": 1.6164307594299316, + "learning_rate": 2e-05, + "loss": 0.04839995, + "step": 13004 + }, + { + "epoch": 26.01, + "grad_norm": 1.7224260568618774, + "learning_rate": 2e-05, + "loss": 0.04432855, + "step": 13005 + }, + { + "epoch": 26.012, + "grad_norm": 1.3991963863372803, + "learning_rate": 2e-05, + "loss": 0.05045583, + "step": 13006 + }, + { + "epoch": 26.014, + "grad_norm": 1.8230713605880737, + "learning_rate": 2e-05, + "loss": 0.06015655, + "step": 13007 + }, + { + "epoch": 26.016, + "grad_norm": 1.7791680097579956, + "learning_rate": 2e-05, + "loss": 0.0451679, + "step": 13008 + }, + { + "epoch": 26.018, + "grad_norm": 1.7463334798812866, + "learning_rate": 2e-05, + "loss": 0.05355278, + "step": 13009 + }, + { + "epoch": 26.02, + "grad_norm": 1.9259138107299805, + "learning_rate": 2e-05, + "loss": 0.05147279, + "step": 13010 + }, + { + "epoch": 26.022, + "grad_norm": 1.498757243156433, + "learning_rate": 2e-05, + "loss": 0.05272644, + "step": 13011 + }, + { + "epoch": 26.024, + "grad_norm": 2.2757580280303955, + "learning_rate": 2e-05, + "loss": 0.0499465, + "step": 13012 + }, + { + "epoch": 26.026, + "grad_norm": 2.213714122772217, + "learning_rate": 2e-05, + "loss": 0.04541406, + "step": 13013 + }, + { + "epoch": 26.028, + "grad_norm": 1.893241286277771, + "learning_rate": 2e-05, + "loss": 0.05529698, + "step": 13014 + }, + { + "epoch": 26.03, + "grad_norm": 1.8718706369400024, + "learning_rate": 2e-05, + "loss": 0.04376329, + "step": 13015 + }, + { + "epoch": 26.032, + "grad_norm": 1.3967310190200806, + "learning_rate": 2e-05, + "loss": 0.03513469, + "step": 13016 + }, + { + "epoch": 26.034, + "grad_norm": 1.8421751260757446, + "learning_rate": 2e-05, + "loss": 0.04883641, + "step": 13017 + }, + { + "epoch": 
26.036, + "grad_norm": 1.9495335817337036, + "learning_rate": 2e-05, + "loss": 0.03737961, + "step": 13018 + }, + { + "epoch": 26.038, + "grad_norm": 1.0064893960952759, + "learning_rate": 2e-05, + "loss": 0.04025814, + "step": 13019 + }, + { + "epoch": 26.04, + "grad_norm": 1.1454533338546753, + "learning_rate": 2e-05, + "loss": 0.05090432, + "step": 13020 + }, + { + "epoch": 26.042, + "grad_norm": 1.2419127225875854, + "learning_rate": 2e-05, + "loss": 0.04127611, + "step": 13021 + }, + { + "epoch": 26.044, + "grad_norm": 1.4765992164611816, + "learning_rate": 2e-05, + "loss": 0.053638, + "step": 13022 + }, + { + "epoch": 26.046, + "grad_norm": 1.3810056447982788, + "learning_rate": 2e-05, + "loss": 0.02833214, + "step": 13023 + }, + { + "epoch": 26.048, + "grad_norm": 1.1815146207809448, + "learning_rate": 2e-05, + "loss": 0.0438899, + "step": 13024 + }, + { + "epoch": 26.05, + "grad_norm": 1.4837486743927002, + "learning_rate": 2e-05, + "loss": 0.04868677, + "step": 13025 + }, + { + "epoch": 26.052, + "grad_norm": 1.6083544492721558, + "learning_rate": 2e-05, + "loss": 0.0421802, + "step": 13026 + }, + { + "epoch": 26.054, + "grad_norm": 1.2895910739898682, + "learning_rate": 2e-05, + "loss": 0.04041936, + "step": 13027 + }, + { + "epoch": 26.056, + "grad_norm": 1.0532166957855225, + "learning_rate": 2e-05, + "loss": 0.03752539, + "step": 13028 + }, + { + "epoch": 26.058, + "grad_norm": 1.7177013158798218, + "learning_rate": 2e-05, + "loss": 0.04090296, + "step": 13029 + }, + { + "epoch": 26.06, + "grad_norm": 1.4352203607559204, + "learning_rate": 2e-05, + "loss": 0.06325287, + "step": 13030 + }, + { + "epoch": 26.062, + "grad_norm": 1.2944906949996948, + "learning_rate": 2e-05, + "loss": 0.03913593, + "step": 13031 + }, + { + "epoch": 26.064, + "grad_norm": 1.1036475896835327, + "learning_rate": 2e-05, + "loss": 0.03017885, + "step": 13032 + }, + { + "epoch": 26.066, + "grad_norm": 1.4907349348068237, + "learning_rate": 2e-05, + "loss": 0.06610879, + "step": 13033 + }, + { + "epoch": 26.068, + "grad_norm": 2.620415210723877, + "learning_rate": 2e-05, + "loss": 0.06170364, + "step": 13034 + }, + { + "epoch": 26.07, + "grad_norm": 1.2906463146209717, + "learning_rate": 2e-05, + "loss": 0.05566068, + "step": 13035 + }, + { + "epoch": 26.072, + "grad_norm": 1.5933210849761963, + "learning_rate": 2e-05, + "loss": 0.06452239, + "step": 13036 + }, + { + "epoch": 26.074, + "grad_norm": 1.2673341035842896, + "learning_rate": 2e-05, + "loss": 0.04142977, + "step": 13037 + }, + { + "epoch": 26.076, + "grad_norm": 1.4276865720748901, + "learning_rate": 2e-05, + "loss": 0.04855426, + "step": 13038 + }, + { + "epoch": 26.078, + "grad_norm": 1.5951037406921387, + "learning_rate": 2e-05, + "loss": 0.04814789, + "step": 13039 + }, + { + "epoch": 26.08, + "grad_norm": 1.3216607570648193, + "learning_rate": 2e-05, + "loss": 0.04974344, + "step": 13040 + }, + { + "epoch": 26.082, + "grad_norm": 1.4452524185180664, + "learning_rate": 2e-05, + "loss": 0.04898542, + "step": 13041 + }, + { + "epoch": 26.084, + "grad_norm": 1.5695279836654663, + "learning_rate": 2e-05, + "loss": 0.05378542, + "step": 13042 + }, + { + "epoch": 26.086, + "grad_norm": 2.6487820148468018, + "learning_rate": 2e-05, + "loss": 0.05485958, + "step": 13043 + }, + { + "epoch": 26.088, + "grad_norm": 1.1026345491409302, + "learning_rate": 2e-05, + "loss": 0.03617822, + "step": 13044 + }, + { + "epoch": 26.09, + "grad_norm": 2.1571977138519287, + "learning_rate": 2e-05, + "loss": 0.05426963, + "step": 13045 + }, + { + "epoch": 26.092, + 
"grad_norm": 1.3975282907485962, + "learning_rate": 2e-05, + "loss": 0.05105602, + "step": 13046 + }, + { + "epoch": 26.094, + "grad_norm": 1.2141252756118774, + "learning_rate": 2e-05, + "loss": 0.05173802, + "step": 13047 + }, + { + "epoch": 26.096, + "grad_norm": 2.8250155448913574, + "learning_rate": 2e-05, + "loss": 0.06396338, + "step": 13048 + }, + { + "epoch": 26.098, + "grad_norm": 1.162996530532837, + "learning_rate": 2e-05, + "loss": 0.05161465, + "step": 13049 + }, + { + "epoch": 26.1, + "grad_norm": 1.0605719089508057, + "learning_rate": 2e-05, + "loss": 0.03604612, + "step": 13050 + }, + { + "epoch": 26.102, + "grad_norm": 1.1841809749603271, + "learning_rate": 2e-05, + "loss": 0.04377447, + "step": 13051 + }, + { + "epoch": 26.104, + "grad_norm": 1.2295196056365967, + "learning_rate": 2e-05, + "loss": 0.05042927, + "step": 13052 + }, + { + "epoch": 26.106, + "grad_norm": 1.04031503200531, + "learning_rate": 2e-05, + "loss": 0.02950481, + "step": 13053 + }, + { + "epoch": 26.108, + "grad_norm": 1.689014196395874, + "learning_rate": 2e-05, + "loss": 0.05845881, + "step": 13054 + }, + { + "epoch": 26.11, + "grad_norm": 1.123939871788025, + "learning_rate": 2e-05, + "loss": 0.04595611, + "step": 13055 + }, + { + "epoch": 26.112, + "grad_norm": 2.1183228492736816, + "learning_rate": 2e-05, + "loss": 0.03145133, + "step": 13056 + }, + { + "epoch": 26.114, + "grad_norm": 1.138588786125183, + "learning_rate": 2e-05, + "loss": 0.04393515, + "step": 13057 + }, + { + "epoch": 26.116, + "grad_norm": 1.1965045928955078, + "learning_rate": 2e-05, + "loss": 0.05045221, + "step": 13058 + }, + { + "epoch": 26.118, + "grad_norm": 1.3790438175201416, + "learning_rate": 2e-05, + "loss": 0.05804875, + "step": 13059 + }, + { + "epoch": 26.12, + "grad_norm": 1.0586308240890503, + "learning_rate": 2e-05, + "loss": 0.03835727, + "step": 13060 + }, + { + "epoch": 26.122, + "grad_norm": 0.9596161842346191, + "learning_rate": 2e-05, + "loss": 0.03627136, + "step": 13061 + }, + { + "epoch": 26.124, + "grad_norm": 1.7609083652496338, + "learning_rate": 2e-05, + "loss": 0.04719493, + "step": 13062 + }, + { + "epoch": 26.126, + "grad_norm": 1.9145840406417847, + "learning_rate": 2e-05, + "loss": 0.05495848, + "step": 13063 + }, + { + "epoch": 26.128, + "grad_norm": 3.6397500038146973, + "learning_rate": 2e-05, + "loss": 0.06928319, + "step": 13064 + }, + { + "epoch": 26.13, + "grad_norm": 1.7477335929870605, + "learning_rate": 2e-05, + "loss": 0.06259966, + "step": 13065 + }, + { + "epoch": 26.132, + "grad_norm": 1.4324450492858887, + "learning_rate": 2e-05, + "loss": 0.05550072, + "step": 13066 + }, + { + "epoch": 26.134, + "grad_norm": 1.6972732543945312, + "learning_rate": 2e-05, + "loss": 0.03786982, + "step": 13067 + }, + { + "epoch": 26.136, + "grad_norm": 2.0623652935028076, + "learning_rate": 2e-05, + "loss": 0.0455045, + "step": 13068 + }, + { + "epoch": 26.138, + "grad_norm": 1.056091547012329, + "learning_rate": 2e-05, + "loss": 0.03710656, + "step": 13069 + }, + { + "epoch": 26.14, + "grad_norm": 1.2790498733520508, + "learning_rate": 2e-05, + "loss": 0.0524893, + "step": 13070 + }, + { + "epoch": 26.142, + "grad_norm": 1.587206244468689, + "learning_rate": 2e-05, + "loss": 0.04856799, + "step": 13071 + }, + { + "epoch": 26.144, + "grad_norm": 1.0538597106933594, + "learning_rate": 2e-05, + "loss": 0.03798814, + "step": 13072 + }, + { + "epoch": 26.146, + "grad_norm": 1.031635046005249, + "learning_rate": 2e-05, + "loss": 0.03815435, + "step": 13073 + }, + { + "epoch": 26.148, + "grad_norm": 
1.4059712886810303, + "learning_rate": 2e-05, + "loss": 0.0483502, + "step": 13074 + }, + { + "epoch": 26.15, + "grad_norm": 0.9544400572776794, + "learning_rate": 2e-05, + "loss": 0.02810543, + "step": 13075 + }, + { + "epoch": 26.152, + "grad_norm": 1.027392029762268, + "learning_rate": 2e-05, + "loss": 0.03287507, + "step": 13076 + }, + { + "epoch": 26.154, + "grad_norm": 1.084064245223999, + "learning_rate": 2e-05, + "loss": 0.03592873, + "step": 13077 + }, + { + "epoch": 26.156, + "grad_norm": 1.2711600065231323, + "learning_rate": 2e-05, + "loss": 0.04065077, + "step": 13078 + }, + { + "epoch": 26.158, + "grad_norm": 1.0948227643966675, + "learning_rate": 2e-05, + "loss": 0.03384892, + "step": 13079 + }, + { + "epoch": 26.16, + "grad_norm": 1.3854420185089111, + "learning_rate": 2e-05, + "loss": 0.04718272, + "step": 13080 + }, + { + "epoch": 26.162, + "grad_norm": 1.2978075742721558, + "learning_rate": 2e-05, + "loss": 0.0398887, + "step": 13081 + }, + { + "epoch": 26.164, + "grad_norm": 1.2315889596939087, + "learning_rate": 2e-05, + "loss": 0.03722582, + "step": 13082 + }, + { + "epoch": 26.166, + "grad_norm": 1.9231492280960083, + "learning_rate": 2e-05, + "loss": 0.05172444, + "step": 13083 + }, + { + "epoch": 26.168, + "grad_norm": 1.1194469928741455, + "learning_rate": 2e-05, + "loss": 0.03749369, + "step": 13084 + }, + { + "epoch": 26.17, + "grad_norm": 1.6649547815322876, + "learning_rate": 2e-05, + "loss": 0.05333608, + "step": 13085 + }, + { + "epoch": 26.172, + "grad_norm": 2.011101245880127, + "learning_rate": 2e-05, + "loss": 0.04135909, + "step": 13086 + }, + { + "epoch": 26.174, + "grad_norm": 1.6736245155334473, + "learning_rate": 2e-05, + "loss": 0.043147, + "step": 13087 + }, + { + "epoch": 26.176, + "grad_norm": 1.3968009948730469, + "learning_rate": 2e-05, + "loss": 0.04125686, + "step": 13088 + }, + { + "epoch": 26.178, + "grad_norm": 1.05874764919281, + "learning_rate": 2e-05, + "loss": 0.03640444, + "step": 13089 + }, + { + "epoch": 26.18, + "grad_norm": 1.14805006980896, + "learning_rate": 2e-05, + "loss": 0.0389007, + "step": 13090 + }, + { + "epoch": 26.182, + "grad_norm": 1.106278419494629, + "learning_rate": 2e-05, + "loss": 0.04055985, + "step": 13091 + }, + { + "epoch": 26.184, + "grad_norm": 2.2514469623565674, + "learning_rate": 2e-05, + "loss": 0.04158842, + "step": 13092 + }, + { + "epoch": 26.186, + "grad_norm": 1.33916437625885, + "learning_rate": 2e-05, + "loss": 0.04522097, + "step": 13093 + }, + { + "epoch": 26.188, + "grad_norm": 2.1648190021514893, + "learning_rate": 2e-05, + "loss": 0.04654589, + "step": 13094 + }, + { + "epoch": 26.19, + "grad_norm": 1.1959937810897827, + "learning_rate": 2e-05, + "loss": 0.04704513, + "step": 13095 + }, + { + "epoch": 26.192, + "grad_norm": 1.733241081237793, + "learning_rate": 2e-05, + "loss": 0.05815672, + "step": 13096 + }, + { + "epoch": 26.194, + "grad_norm": 0.8504005670547485, + "learning_rate": 2e-05, + "loss": 0.02225319, + "step": 13097 + }, + { + "epoch": 26.196, + "grad_norm": 2.0521740913391113, + "learning_rate": 2e-05, + "loss": 0.05633967, + "step": 13098 + }, + { + "epoch": 26.198, + "grad_norm": 1.087705373764038, + "learning_rate": 2e-05, + "loss": 0.04992017, + "step": 13099 + }, + { + "epoch": 26.2, + "grad_norm": 1.121282696723938, + "learning_rate": 2e-05, + "loss": 0.03436579, + "step": 13100 + }, + { + "epoch": 26.202, + "grad_norm": 1.6202627420425415, + "learning_rate": 2e-05, + "loss": 0.04655264, + "step": 13101 + }, + { + "epoch": 26.204, + "grad_norm": 1.7769203186035156, + 
"learning_rate": 2e-05, + "loss": 0.05040466, + "step": 13102 + }, + { + "epoch": 26.206, + "grad_norm": 1.3449256420135498, + "learning_rate": 2e-05, + "loss": 0.0527843, + "step": 13103 + }, + { + "epoch": 26.208, + "grad_norm": 2.069218397140503, + "learning_rate": 2e-05, + "loss": 0.05726243, + "step": 13104 + }, + { + "epoch": 26.21, + "grad_norm": 1.4746137857437134, + "learning_rate": 2e-05, + "loss": 0.06106601, + "step": 13105 + }, + { + "epoch": 26.212, + "grad_norm": 1.4619065523147583, + "learning_rate": 2e-05, + "loss": 0.03955628, + "step": 13106 + }, + { + "epoch": 26.214, + "grad_norm": 0.9846628308296204, + "learning_rate": 2e-05, + "loss": 0.03640333, + "step": 13107 + }, + { + "epoch": 26.216, + "grad_norm": 1.4153640270233154, + "learning_rate": 2e-05, + "loss": 0.04586072, + "step": 13108 + }, + { + "epoch": 26.218, + "grad_norm": 1.6593786478042603, + "learning_rate": 2e-05, + "loss": 0.04747347, + "step": 13109 + }, + { + "epoch": 26.22, + "grad_norm": 1.1394518613815308, + "learning_rate": 2e-05, + "loss": 0.04001589, + "step": 13110 + }, + { + "epoch": 26.222, + "grad_norm": 1.1682400703430176, + "learning_rate": 2e-05, + "loss": 0.05425475, + "step": 13111 + }, + { + "epoch": 26.224, + "grad_norm": 1.2645248174667358, + "learning_rate": 2e-05, + "loss": 0.03757692, + "step": 13112 + }, + { + "epoch": 26.226, + "grad_norm": 1.0900676250457764, + "learning_rate": 2e-05, + "loss": 0.03077891, + "step": 13113 + }, + { + "epoch": 26.228, + "grad_norm": 1.570024013519287, + "learning_rate": 2e-05, + "loss": 0.04349901, + "step": 13114 + }, + { + "epoch": 26.23, + "grad_norm": 1.2463085651397705, + "learning_rate": 2e-05, + "loss": 0.05144939, + "step": 13115 + }, + { + "epoch": 26.232, + "grad_norm": 2.3603127002716064, + "learning_rate": 2e-05, + "loss": 0.05303069, + "step": 13116 + }, + { + "epoch": 26.234, + "grad_norm": 1.4877861738204956, + "learning_rate": 2e-05, + "loss": 0.04663004, + "step": 13117 + }, + { + "epoch": 26.236, + "grad_norm": 2.002016544342041, + "learning_rate": 2e-05, + "loss": 0.02264161, + "step": 13118 + }, + { + "epoch": 26.238, + "grad_norm": 1.2714250087738037, + "learning_rate": 2e-05, + "loss": 0.05852698, + "step": 13119 + }, + { + "epoch": 26.24, + "grad_norm": 1.6923631429672241, + "learning_rate": 2e-05, + "loss": 0.03670363, + "step": 13120 + }, + { + "epoch": 26.242, + "grad_norm": 2.9363479614257812, + "learning_rate": 2e-05, + "loss": 0.05860969, + "step": 13121 + }, + { + "epoch": 26.244, + "grad_norm": 0.9841324090957642, + "learning_rate": 2e-05, + "loss": 0.03121854, + "step": 13122 + }, + { + "epoch": 26.246, + "grad_norm": 1.421994686126709, + "learning_rate": 2e-05, + "loss": 0.04863666, + "step": 13123 + }, + { + "epoch": 26.248, + "grad_norm": 1.1231731176376343, + "learning_rate": 2e-05, + "loss": 0.04469264, + "step": 13124 + }, + { + "epoch": 26.25, + "grad_norm": 1.8403480052947998, + "learning_rate": 2e-05, + "loss": 0.0488335, + "step": 13125 + }, + { + "epoch": 26.252, + "grad_norm": 2.583953619003296, + "learning_rate": 2e-05, + "loss": 0.04319778, + "step": 13126 + }, + { + "epoch": 26.254, + "grad_norm": 9.854668617248535, + "learning_rate": 2e-05, + "loss": 0.08909784, + "step": 13127 + }, + { + "epoch": 26.256, + "grad_norm": 52.158538818359375, + "learning_rate": 2e-05, + "loss": 0.10964724, + "step": 13128 + }, + { + "epoch": 26.258, + "grad_norm": 1.0165956020355225, + "learning_rate": 2e-05, + "loss": 0.0289845, + "step": 13129 + }, + { + "epoch": 26.26, + "grad_norm": 18.144968032836914, + 
"learning_rate": 2e-05, + "loss": 0.25045502, + "step": 13130 + }, + { + "epoch": 26.262, + "grad_norm": 4.379713535308838, + "learning_rate": 2e-05, + "loss": 0.03416047, + "step": 13131 + }, + { + "epoch": 26.264, + "grad_norm": 25.38080596923828, + "learning_rate": 2e-05, + "loss": 0.12389896, + "step": 13132 + }, + { + "epoch": 26.266, + "grad_norm": 1.384544014930725, + "learning_rate": 2e-05, + "loss": 0.04459827, + "step": 13133 + }, + { + "epoch": 26.268, + "grad_norm": 1.378836750984192, + "learning_rate": 2e-05, + "loss": 0.04477947, + "step": 13134 + }, + { + "epoch": 26.27, + "grad_norm": 2.1596052646636963, + "learning_rate": 2e-05, + "loss": 0.05260035, + "step": 13135 + }, + { + "epoch": 26.272, + "grad_norm": 1.0437805652618408, + "learning_rate": 2e-05, + "loss": 0.03832313, + "step": 13136 + }, + { + "epoch": 26.274, + "grad_norm": 1.2373123168945312, + "learning_rate": 2e-05, + "loss": 0.04758352, + "step": 13137 + }, + { + "epoch": 26.276, + "grad_norm": 1.18471097946167, + "learning_rate": 2e-05, + "loss": 0.0513614, + "step": 13138 + }, + { + "epoch": 26.278, + "grad_norm": 1.364261507987976, + "learning_rate": 2e-05, + "loss": 0.03552637, + "step": 13139 + }, + { + "epoch": 26.28, + "grad_norm": 1.291282296180725, + "learning_rate": 2e-05, + "loss": 0.04767456, + "step": 13140 + }, + { + "epoch": 26.282, + "grad_norm": 2.0756161212921143, + "learning_rate": 2e-05, + "loss": 0.05183939, + "step": 13141 + }, + { + "epoch": 26.284, + "grad_norm": 0.7976016402244568, + "learning_rate": 2e-05, + "loss": 0.02524626, + "step": 13142 + }, + { + "epoch": 26.286, + "grad_norm": 1.0466339588165283, + "learning_rate": 2e-05, + "loss": 0.05251211, + "step": 13143 + }, + { + "epoch": 26.288, + "grad_norm": 1.6141102313995361, + "learning_rate": 2e-05, + "loss": 0.02955076, + "step": 13144 + }, + { + "epoch": 26.29, + "grad_norm": 1.5093724727630615, + "learning_rate": 2e-05, + "loss": 0.05642843, + "step": 13145 + }, + { + "epoch": 26.292, + "grad_norm": 1.431347131729126, + "learning_rate": 2e-05, + "loss": 0.04590548, + "step": 13146 + }, + { + "epoch": 26.294, + "grad_norm": 1.239392876625061, + "learning_rate": 2e-05, + "loss": 0.04617111, + "step": 13147 + }, + { + "epoch": 26.296, + "grad_norm": 1.1694923639297485, + "learning_rate": 2e-05, + "loss": 0.03655789, + "step": 13148 + }, + { + "epoch": 26.298, + "grad_norm": 1.7410540580749512, + "learning_rate": 2e-05, + "loss": 0.05551188, + "step": 13149 + }, + { + "epoch": 26.3, + "grad_norm": 1.2234365940093994, + "learning_rate": 2e-05, + "loss": 0.03679156, + "step": 13150 + }, + { + "epoch": 26.302, + "grad_norm": 0.9119114875793457, + "learning_rate": 2e-05, + "loss": 0.02980712, + "step": 13151 + }, + { + "epoch": 26.304, + "grad_norm": 2.018742561340332, + "learning_rate": 2e-05, + "loss": 0.05444905, + "step": 13152 + }, + { + "epoch": 26.306, + "grad_norm": 1.7591094970703125, + "learning_rate": 2e-05, + "loss": 0.06758714, + "step": 13153 + }, + { + "epoch": 26.308, + "grad_norm": 1.300772786140442, + "learning_rate": 2e-05, + "loss": 0.0449839, + "step": 13154 + }, + { + "epoch": 26.31, + "grad_norm": 1.3861204385757446, + "learning_rate": 2e-05, + "loss": 0.04811238, + "step": 13155 + }, + { + "epoch": 26.312, + "grad_norm": 1.2385214567184448, + "learning_rate": 2e-05, + "loss": 0.03959964, + "step": 13156 + }, + { + "epoch": 26.314, + "grad_norm": 1.3340177536010742, + "learning_rate": 2e-05, + "loss": 0.04750549, + "step": 13157 + }, + { + "epoch": 26.316, + "grad_norm": 2.2569944858551025, + "learning_rate": 
2e-05, + "loss": 0.05605095, + "step": 13158 + }, + { + "epoch": 26.318, + "grad_norm": 2.1105170249938965, + "learning_rate": 2e-05, + "loss": 0.05581967, + "step": 13159 + }, + { + "epoch": 26.32, + "grad_norm": 1.4911856651306152, + "learning_rate": 2e-05, + "loss": 0.06944424, + "step": 13160 + }, + { + "epoch": 26.322, + "grad_norm": 0.8688299655914307, + "learning_rate": 2e-05, + "loss": 0.01928959, + "step": 13161 + }, + { + "epoch": 26.324, + "grad_norm": 1.162561297416687, + "learning_rate": 2e-05, + "loss": 0.03705761, + "step": 13162 + }, + { + "epoch": 26.326, + "grad_norm": 1.117928385734558, + "learning_rate": 2e-05, + "loss": 0.04116834, + "step": 13163 + }, + { + "epoch": 26.328, + "grad_norm": 3.4923346042633057, + "learning_rate": 2e-05, + "loss": 0.04038919, + "step": 13164 + }, + { + "epoch": 26.33, + "grad_norm": 1.4581316709518433, + "learning_rate": 2e-05, + "loss": 0.03603051, + "step": 13165 + }, + { + "epoch": 26.332, + "grad_norm": 1.2061631679534912, + "learning_rate": 2e-05, + "loss": 0.03987415, + "step": 13166 + }, + { + "epoch": 26.334, + "grad_norm": 2.4524641036987305, + "learning_rate": 2e-05, + "loss": 0.06676086, + "step": 13167 + }, + { + "epoch": 26.336, + "grad_norm": 1.0566414594650269, + "learning_rate": 2e-05, + "loss": 0.04725492, + "step": 13168 + }, + { + "epoch": 26.338, + "grad_norm": 0.9778428077697754, + "learning_rate": 2e-05, + "loss": 0.03469347, + "step": 13169 + }, + { + "epoch": 26.34, + "grad_norm": 1.4649147987365723, + "learning_rate": 2e-05, + "loss": 0.04087276, + "step": 13170 + }, + { + "epoch": 26.342, + "grad_norm": 1.4261488914489746, + "learning_rate": 2e-05, + "loss": 0.04543655, + "step": 13171 + }, + { + "epoch": 26.344, + "grad_norm": 1.2056576013565063, + "learning_rate": 2e-05, + "loss": 0.04065976, + "step": 13172 + }, + { + "epoch": 26.346, + "grad_norm": 1.3527207374572754, + "learning_rate": 2e-05, + "loss": 0.05951834, + "step": 13173 + }, + { + "epoch": 26.348, + "grad_norm": 1.4392766952514648, + "learning_rate": 2e-05, + "loss": 0.04477938, + "step": 13174 + }, + { + "epoch": 26.35, + "grad_norm": 1.0581060647964478, + "learning_rate": 2e-05, + "loss": 0.04285652, + "step": 13175 + }, + { + "epoch": 26.352, + "grad_norm": 1.3287581205368042, + "learning_rate": 2e-05, + "loss": 0.05369517, + "step": 13176 + }, + { + "epoch": 26.354, + "grad_norm": 1.4318236112594604, + "learning_rate": 2e-05, + "loss": 0.03889168, + "step": 13177 + }, + { + "epoch": 26.356, + "grad_norm": 4.168168544769287, + "learning_rate": 2e-05, + "loss": 0.04014147, + "step": 13178 + }, + { + "epoch": 26.358, + "grad_norm": 2.5664639472961426, + "learning_rate": 2e-05, + "loss": 0.0509958, + "step": 13179 + }, + { + "epoch": 26.36, + "grad_norm": 1.6791688203811646, + "learning_rate": 2e-05, + "loss": 0.0571983, + "step": 13180 + }, + { + "epoch": 26.362, + "grad_norm": 1.597809076309204, + "learning_rate": 2e-05, + "loss": 0.04761202, + "step": 13181 + }, + { + "epoch": 26.364, + "grad_norm": 1.0649417638778687, + "learning_rate": 2e-05, + "loss": 0.03972602, + "step": 13182 + }, + { + "epoch": 26.366, + "grad_norm": 0.9360977411270142, + "learning_rate": 2e-05, + "loss": 0.03613855, + "step": 13183 + }, + { + "epoch": 26.368, + "grad_norm": 0.9736573696136475, + "learning_rate": 2e-05, + "loss": 0.03216821, + "step": 13184 + }, + { + "epoch": 26.37, + "grad_norm": 1.6165627241134644, + "learning_rate": 2e-05, + "loss": 0.05862963, + "step": 13185 + }, + { + "epoch": 26.372, + "grad_norm": 1.3186026811599731, + "learning_rate": 2e-05, + 
"loss": 0.04542302, + "step": 13186 + }, + { + "epoch": 26.374, + "grad_norm": 1.6366772651672363, + "learning_rate": 2e-05, + "loss": 0.04529631, + "step": 13187 + }, + { + "epoch": 26.376, + "grad_norm": 1.0712040662765503, + "learning_rate": 2e-05, + "loss": 0.03381637, + "step": 13188 + }, + { + "epoch": 26.378, + "grad_norm": 1.1308493614196777, + "learning_rate": 2e-05, + "loss": 0.04229905, + "step": 13189 + }, + { + "epoch": 26.38, + "grad_norm": 1.3662320375442505, + "learning_rate": 2e-05, + "loss": 0.05177809, + "step": 13190 + }, + { + "epoch": 26.382, + "grad_norm": 1.9889822006225586, + "learning_rate": 2e-05, + "loss": 0.07416862, + "step": 13191 + }, + { + "epoch": 26.384, + "grad_norm": 1.464272379875183, + "learning_rate": 2e-05, + "loss": 0.04299095, + "step": 13192 + }, + { + "epoch": 26.386, + "grad_norm": 1.1341749429702759, + "learning_rate": 2e-05, + "loss": 0.03406891, + "step": 13193 + }, + { + "epoch": 26.388, + "grad_norm": 1.3196316957473755, + "learning_rate": 2e-05, + "loss": 0.05596439, + "step": 13194 + }, + { + "epoch": 26.39, + "grad_norm": 1.1319140195846558, + "learning_rate": 2e-05, + "loss": 0.03612085, + "step": 13195 + }, + { + "epoch": 26.392, + "grad_norm": 0.9723718762397766, + "learning_rate": 2e-05, + "loss": 0.03945912, + "step": 13196 + }, + { + "epoch": 26.394, + "grad_norm": 0.8996081352233887, + "learning_rate": 2e-05, + "loss": 0.03478167, + "step": 13197 + }, + { + "epoch": 26.396, + "grad_norm": 1.8404150009155273, + "learning_rate": 2e-05, + "loss": 0.04371741, + "step": 13198 + }, + { + "epoch": 26.398, + "grad_norm": 2.319556713104248, + "learning_rate": 2e-05, + "loss": 0.05231041, + "step": 13199 + }, + { + "epoch": 26.4, + "grad_norm": 2.335930824279785, + "learning_rate": 2e-05, + "loss": 0.0324939, + "step": 13200 + }, + { + "epoch": 26.402, + "grad_norm": 1.6622105836868286, + "learning_rate": 2e-05, + "loss": 0.02796814, + "step": 13201 + }, + { + "epoch": 26.404, + "grad_norm": 2.356870651245117, + "learning_rate": 2e-05, + "loss": 0.04440472, + "step": 13202 + }, + { + "epoch": 26.406, + "grad_norm": 1.4844690561294556, + "learning_rate": 2e-05, + "loss": 0.05918201, + "step": 13203 + }, + { + "epoch": 26.408, + "grad_norm": 1.3682526350021362, + "learning_rate": 2e-05, + "loss": 0.064179, + "step": 13204 + }, + { + "epoch": 26.41, + "grad_norm": 1.0957056283950806, + "learning_rate": 2e-05, + "loss": 0.046661, + "step": 13205 + }, + { + "epoch": 26.412, + "grad_norm": 1.2934621572494507, + "learning_rate": 2e-05, + "loss": 0.04822002, + "step": 13206 + }, + { + "epoch": 26.414, + "grad_norm": 1.6465322971343994, + "learning_rate": 2e-05, + "loss": 0.05369683, + "step": 13207 + }, + { + "epoch": 26.416, + "grad_norm": 1.0616811513900757, + "learning_rate": 2e-05, + "loss": 0.04072348, + "step": 13208 + }, + { + "epoch": 26.418, + "grad_norm": 4.23490047454834, + "learning_rate": 2e-05, + "loss": 0.05251167, + "step": 13209 + }, + { + "epoch": 26.42, + "grad_norm": 1.170454978942871, + "learning_rate": 2e-05, + "loss": 0.03898567, + "step": 13210 + }, + { + "epoch": 26.422, + "grad_norm": 1.7174979448318481, + "learning_rate": 2e-05, + "loss": 0.04406247, + "step": 13211 + }, + { + "epoch": 26.424, + "grad_norm": 1.1600885391235352, + "learning_rate": 2e-05, + "loss": 0.05435538, + "step": 13212 + }, + { + "epoch": 26.426, + "grad_norm": 2.2355101108551025, + "learning_rate": 2e-05, + "loss": 0.05025771, + "step": 13213 + }, + { + "epoch": 26.428, + "grad_norm": 1.322678804397583, + "learning_rate": 2e-05, + "loss": 
0.04307056, + "step": 13214 + }, + { + "epoch": 26.43, + "grad_norm": 1.323901891708374, + "learning_rate": 2e-05, + "loss": 0.04015271, + "step": 13215 + }, + { + "epoch": 26.432, + "grad_norm": 1.8236982822418213, + "learning_rate": 2e-05, + "loss": 0.04085752, + "step": 13216 + }, + { + "epoch": 26.434, + "grad_norm": 1.3549261093139648, + "learning_rate": 2e-05, + "loss": 0.04854722, + "step": 13217 + }, + { + "epoch": 26.436, + "grad_norm": 1.7481671571731567, + "learning_rate": 2e-05, + "loss": 0.04112818, + "step": 13218 + }, + { + "epoch": 26.438, + "grad_norm": 1.3064723014831543, + "learning_rate": 2e-05, + "loss": 0.05516483, + "step": 13219 + }, + { + "epoch": 26.44, + "grad_norm": 3.367008686065674, + "learning_rate": 2e-05, + "loss": 0.05630918, + "step": 13220 + }, + { + "epoch": 26.442, + "grad_norm": 1.8773056268692017, + "learning_rate": 2e-05, + "loss": 0.06338409, + "step": 13221 + }, + { + "epoch": 26.444, + "grad_norm": 1.168461561203003, + "learning_rate": 2e-05, + "loss": 0.04579566, + "step": 13222 + }, + { + "epoch": 26.446, + "grad_norm": 1.4216398000717163, + "learning_rate": 2e-05, + "loss": 0.03986656, + "step": 13223 + }, + { + "epoch": 26.448, + "grad_norm": 0.9925347566604614, + "learning_rate": 2e-05, + "loss": 0.03974314, + "step": 13224 + }, + { + "epoch": 26.45, + "grad_norm": 1.569463849067688, + "learning_rate": 2e-05, + "loss": 0.0388345, + "step": 13225 + }, + { + "epoch": 26.452, + "grad_norm": 1.6125260591506958, + "learning_rate": 2e-05, + "loss": 0.0357227, + "step": 13226 + }, + { + "epoch": 26.454, + "grad_norm": 1.2239407300949097, + "learning_rate": 2e-05, + "loss": 0.04946052, + "step": 13227 + }, + { + "epoch": 26.456, + "grad_norm": 0.8565706014633179, + "learning_rate": 2e-05, + "loss": 0.03200175, + "step": 13228 + }, + { + "epoch": 26.458, + "grad_norm": 1.0081415176391602, + "learning_rate": 2e-05, + "loss": 0.0341515, + "step": 13229 + }, + { + "epoch": 26.46, + "grad_norm": 1.0593496561050415, + "learning_rate": 2e-05, + "loss": 0.03936413, + "step": 13230 + }, + { + "epoch": 26.462, + "grad_norm": 1.1382449865341187, + "learning_rate": 2e-05, + "loss": 0.02872925, + "step": 13231 + }, + { + "epoch": 26.464, + "grad_norm": 1.4055805206298828, + "learning_rate": 2e-05, + "loss": 0.03323968, + "step": 13232 + }, + { + "epoch": 26.466, + "grad_norm": 1.23487389087677, + "learning_rate": 2e-05, + "loss": 0.05444343, + "step": 13233 + }, + { + "epoch": 26.468, + "grad_norm": 1.100627064704895, + "learning_rate": 2e-05, + "loss": 0.04639729, + "step": 13234 + }, + { + "epoch": 26.47, + "grad_norm": 1.6120901107788086, + "learning_rate": 2e-05, + "loss": 0.05141643, + "step": 13235 + }, + { + "epoch": 26.472, + "grad_norm": 1.5615583658218384, + "learning_rate": 2e-05, + "loss": 0.04910383, + "step": 13236 + }, + { + "epoch": 26.474, + "grad_norm": 1.7524781227111816, + "learning_rate": 2e-05, + "loss": 0.0517026, + "step": 13237 + }, + { + "epoch": 26.476, + "grad_norm": 1.4189718961715698, + "learning_rate": 2e-05, + "loss": 0.03197357, + "step": 13238 + }, + { + "epoch": 26.478, + "grad_norm": 1.2002609968185425, + "learning_rate": 2e-05, + "loss": 0.04137202, + "step": 13239 + }, + { + "epoch": 26.48, + "grad_norm": 1.5820086002349854, + "learning_rate": 2e-05, + "loss": 0.05161813, + "step": 13240 + }, + { + "epoch": 26.482, + "grad_norm": 2.00372052192688, + "learning_rate": 2e-05, + "loss": 0.05241256, + "step": 13241 + }, + { + "epoch": 26.484, + "grad_norm": 1.582104206085205, + "learning_rate": 2e-05, + "loss": 0.05075792, + 
"step": 13242 + }, + { + "epoch": 26.486, + "grad_norm": 1.3096644878387451, + "learning_rate": 2e-05, + "loss": 0.04678864, + "step": 13243 + }, + { + "epoch": 26.488, + "grad_norm": 1.2743966579437256, + "learning_rate": 2e-05, + "loss": 0.05511482, + "step": 13244 + }, + { + "epoch": 26.49, + "grad_norm": 0.9095728397369385, + "learning_rate": 2e-05, + "loss": 0.0362159, + "step": 13245 + }, + { + "epoch": 26.492, + "grad_norm": 2.057610511779785, + "learning_rate": 2e-05, + "loss": 0.04670215, + "step": 13246 + }, + { + "epoch": 26.494, + "grad_norm": 1.7796850204467773, + "learning_rate": 2e-05, + "loss": 0.0543985, + "step": 13247 + }, + { + "epoch": 26.496, + "grad_norm": 2.0312533378601074, + "learning_rate": 2e-05, + "loss": 0.03435779, + "step": 13248 + }, + { + "epoch": 26.498, + "grad_norm": 0.8784269690513611, + "learning_rate": 2e-05, + "loss": 0.02505475, + "step": 13249 + }, + { + "epoch": 26.5, + "grad_norm": 1.3337388038635254, + "learning_rate": 2e-05, + "loss": 0.03425723, + "step": 13250 + }, + { + "epoch": 26.502, + "grad_norm": 1.1322087049484253, + "learning_rate": 2e-05, + "loss": 0.0380166, + "step": 13251 + }, + { + "epoch": 26.504, + "grad_norm": 1.4790630340576172, + "learning_rate": 2e-05, + "loss": 0.05530652, + "step": 13252 + }, + { + "epoch": 26.506, + "grad_norm": 1.203283429145813, + "learning_rate": 2e-05, + "loss": 0.04369481, + "step": 13253 + }, + { + "epoch": 26.508, + "grad_norm": 0.9502495527267456, + "learning_rate": 2e-05, + "loss": 0.03597327, + "step": 13254 + }, + { + "epoch": 26.51, + "grad_norm": 1.1296265125274658, + "learning_rate": 2e-05, + "loss": 0.03838349, + "step": 13255 + }, + { + "epoch": 26.512, + "grad_norm": 1.4536200761795044, + "learning_rate": 2e-05, + "loss": 0.06651304, + "step": 13256 + }, + { + "epoch": 26.514, + "grad_norm": 1.022311806678772, + "learning_rate": 2e-05, + "loss": 0.03070169, + "step": 13257 + }, + { + "epoch": 26.516, + "grad_norm": 1.7230098247528076, + "learning_rate": 2e-05, + "loss": 0.05207474, + "step": 13258 + }, + { + "epoch": 26.518, + "grad_norm": 1.0804290771484375, + "learning_rate": 2e-05, + "loss": 0.03778263, + "step": 13259 + }, + { + "epoch": 26.52, + "grad_norm": 0.8944292068481445, + "learning_rate": 2e-05, + "loss": 0.02704319, + "step": 13260 + }, + { + "epoch": 26.522, + "grad_norm": 1.2151002883911133, + "learning_rate": 2e-05, + "loss": 0.03234539, + "step": 13261 + }, + { + "epoch": 26.524, + "grad_norm": 1.4448111057281494, + "learning_rate": 2e-05, + "loss": 0.04379795, + "step": 13262 + }, + { + "epoch": 26.526, + "grad_norm": 3.976080894470215, + "learning_rate": 2e-05, + "loss": 0.04685978, + "step": 13263 + }, + { + "epoch": 26.528, + "grad_norm": 1.7629016637802124, + "learning_rate": 2e-05, + "loss": 0.0644666, + "step": 13264 + }, + { + "epoch": 26.53, + "grad_norm": 1.4677865505218506, + "learning_rate": 2e-05, + "loss": 0.0506825, + "step": 13265 + }, + { + "epoch": 26.532, + "grad_norm": 0.9562329649925232, + "learning_rate": 2e-05, + "loss": 0.03811152, + "step": 13266 + }, + { + "epoch": 26.534, + "grad_norm": 2.1469244956970215, + "learning_rate": 2e-05, + "loss": 0.04386814, + "step": 13267 + }, + { + "epoch": 26.536, + "grad_norm": 1.172168254852295, + "learning_rate": 2e-05, + "loss": 0.03443911, + "step": 13268 + }, + { + "epoch": 26.538, + "grad_norm": 1.5751445293426514, + "learning_rate": 2e-05, + "loss": 0.05732818, + "step": 13269 + }, + { + "epoch": 26.54, + "grad_norm": 0.9219292402267456, + "learning_rate": 2e-05, + "loss": 0.02910371, + "step": 13270 + 
}, + { + "epoch": 26.542, + "grad_norm": 1.7685173749923706, + "learning_rate": 2e-05, + "loss": 0.05187953, + "step": 13271 + }, + { + "epoch": 26.544, + "grad_norm": 1.689509630203247, + "learning_rate": 2e-05, + "loss": 0.05508012, + "step": 13272 + }, + { + "epoch": 26.546, + "grad_norm": 1.1748220920562744, + "learning_rate": 2e-05, + "loss": 0.03818913, + "step": 13273 + }, + { + "epoch": 26.548000000000002, + "grad_norm": 2.35014009475708, + "learning_rate": 2e-05, + "loss": 0.05988345, + "step": 13274 + }, + { + "epoch": 26.55, + "grad_norm": 0.9859991669654846, + "learning_rate": 2e-05, + "loss": 0.02704637, + "step": 13275 + }, + { + "epoch": 26.552, + "grad_norm": 1.1411195993423462, + "learning_rate": 2e-05, + "loss": 0.03679326, + "step": 13276 + }, + { + "epoch": 26.554, + "grad_norm": 2.2478442192077637, + "learning_rate": 2e-05, + "loss": 0.05884918, + "step": 13277 + }, + { + "epoch": 26.556, + "grad_norm": 1.2798776626586914, + "learning_rate": 2e-05, + "loss": 0.03827868, + "step": 13278 + }, + { + "epoch": 26.558, + "grad_norm": 1.3902195692062378, + "learning_rate": 2e-05, + "loss": 0.04380446, + "step": 13279 + }, + { + "epoch": 26.56, + "grad_norm": 1.3358991146087646, + "learning_rate": 2e-05, + "loss": 0.04080527, + "step": 13280 + }, + { + "epoch": 26.562, + "grad_norm": 1.769484043121338, + "learning_rate": 2e-05, + "loss": 0.06003635, + "step": 13281 + }, + { + "epoch": 26.564, + "grad_norm": 1.0746612548828125, + "learning_rate": 2e-05, + "loss": 0.02794596, + "step": 13282 + }, + { + "epoch": 26.566, + "grad_norm": 1.3699100017547607, + "learning_rate": 2e-05, + "loss": 0.03206124, + "step": 13283 + }, + { + "epoch": 26.568, + "grad_norm": 1.0902633666992188, + "learning_rate": 2e-05, + "loss": 0.0326319, + "step": 13284 + }, + { + "epoch": 26.57, + "grad_norm": 1.6095848083496094, + "learning_rate": 2e-05, + "loss": 0.05489502, + "step": 13285 + }, + { + "epoch": 26.572, + "grad_norm": 1.2423216104507446, + "learning_rate": 2e-05, + "loss": 0.03849361, + "step": 13286 + }, + { + "epoch": 26.574, + "grad_norm": 1.2105095386505127, + "learning_rate": 2e-05, + "loss": 0.0360062, + "step": 13287 + }, + { + "epoch": 26.576, + "grad_norm": 1.267501950263977, + "learning_rate": 2e-05, + "loss": 0.04801185, + "step": 13288 + }, + { + "epoch": 26.578, + "grad_norm": 2.0049567222595215, + "learning_rate": 2e-05, + "loss": 0.03750632, + "step": 13289 + }, + { + "epoch": 26.58, + "grad_norm": 2.5399057865142822, + "learning_rate": 2e-05, + "loss": 0.0422065, + "step": 13290 + }, + { + "epoch": 26.582, + "grad_norm": 1.4443379640579224, + "learning_rate": 2e-05, + "loss": 0.03480319, + "step": 13291 + }, + { + "epoch": 26.584, + "grad_norm": 1.135719656944275, + "learning_rate": 2e-05, + "loss": 0.03749589, + "step": 13292 + }, + { + "epoch": 26.586, + "grad_norm": 1.231929063796997, + "learning_rate": 2e-05, + "loss": 0.04628847, + "step": 13293 + }, + { + "epoch": 26.588, + "grad_norm": 1.4225656986236572, + "learning_rate": 2e-05, + "loss": 0.05674511, + "step": 13294 + }, + { + "epoch": 26.59, + "grad_norm": 1.0678924322128296, + "learning_rate": 2e-05, + "loss": 0.03472822, + "step": 13295 + }, + { + "epoch": 26.592, + "grad_norm": 1.3472518920898438, + "learning_rate": 2e-05, + "loss": 0.04637545, + "step": 13296 + }, + { + "epoch": 26.594, + "grad_norm": 1.0952340364456177, + "learning_rate": 2e-05, + "loss": 0.0380681, + "step": 13297 + }, + { + "epoch": 26.596, + "grad_norm": 2.9363811016082764, + "learning_rate": 2e-05, + "loss": 0.05129644, + "step": 13298 + }, 
+ { + "epoch": 26.598, + "grad_norm": 1.1145724058151245, + "learning_rate": 2e-05, + "loss": 0.02599895, + "step": 13299 + }, + { + "epoch": 26.6, + "grad_norm": 0.9582138061523438, + "learning_rate": 2e-05, + "loss": 0.03655213, + "step": 13300 + }, + { + "epoch": 26.602, + "grad_norm": 0.9235168099403381, + "learning_rate": 2e-05, + "loss": 0.03182671, + "step": 13301 + }, + { + "epoch": 26.604, + "grad_norm": 1.1051247119903564, + "learning_rate": 2e-05, + "loss": 0.03536615, + "step": 13302 + }, + { + "epoch": 26.606, + "grad_norm": 1.4593842029571533, + "learning_rate": 2e-05, + "loss": 0.06312197, + "step": 13303 + }, + { + "epoch": 26.608, + "grad_norm": 1.04954993724823, + "learning_rate": 2e-05, + "loss": 0.04040704, + "step": 13304 + }, + { + "epoch": 26.61, + "grad_norm": 1.3381813764572144, + "learning_rate": 2e-05, + "loss": 0.04579471, + "step": 13305 + }, + { + "epoch": 26.612, + "grad_norm": 1.8087036609649658, + "learning_rate": 2e-05, + "loss": 0.05931551, + "step": 13306 + }, + { + "epoch": 26.614, + "grad_norm": 1.8034483194351196, + "learning_rate": 2e-05, + "loss": 0.05207924, + "step": 13307 + }, + { + "epoch": 26.616, + "grad_norm": 1.1185029745101929, + "learning_rate": 2e-05, + "loss": 0.03802615, + "step": 13308 + }, + { + "epoch": 26.618, + "grad_norm": 2.3116369247436523, + "learning_rate": 2e-05, + "loss": 0.05495331, + "step": 13309 + }, + { + "epoch": 26.62, + "grad_norm": 1.7399510145187378, + "learning_rate": 2e-05, + "loss": 0.04346514, + "step": 13310 + }, + { + "epoch": 26.622, + "grad_norm": 1.2430211305618286, + "learning_rate": 2e-05, + "loss": 0.03976776, + "step": 13311 + }, + { + "epoch": 26.624, + "grad_norm": 1.2265691757202148, + "learning_rate": 2e-05, + "loss": 0.03571905, + "step": 13312 + }, + { + "epoch": 26.626, + "grad_norm": 3.572852611541748, + "learning_rate": 2e-05, + "loss": 0.06186254, + "step": 13313 + }, + { + "epoch": 26.628, + "grad_norm": 1.096924066543579, + "learning_rate": 2e-05, + "loss": 0.03906501, + "step": 13314 + }, + { + "epoch": 26.63, + "grad_norm": 2.9369330406188965, + "learning_rate": 2e-05, + "loss": 0.05661511, + "step": 13315 + }, + { + "epoch": 26.632, + "grad_norm": 1.467205286026001, + "learning_rate": 2e-05, + "loss": 0.02951013, + "step": 13316 + }, + { + "epoch": 26.634, + "grad_norm": 1.185747742652893, + "learning_rate": 2e-05, + "loss": 0.03714722, + "step": 13317 + }, + { + "epoch": 26.636, + "grad_norm": 2.264068126678467, + "learning_rate": 2e-05, + "loss": 0.0546186, + "step": 13318 + }, + { + "epoch": 26.638, + "grad_norm": 1.3490537405014038, + "learning_rate": 2e-05, + "loss": 0.03904513, + "step": 13319 + }, + { + "epoch": 26.64, + "grad_norm": 2.2553672790527344, + "learning_rate": 2e-05, + "loss": 0.04899627, + "step": 13320 + }, + { + "epoch": 26.642, + "grad_norm": 1.3507367372512817, + "learning_rate": 2e-05, + "loss": 0.04580309, + "step": 13321 + }, + { + "epoch": 26.644, + "grad_norm": 1.3584458827972412, + "learning_rate": 2e-05, + "loss": 0.04628047, + "step": 13322 + }, + { + "epoch": 26.646, + "grad_norm": 1.6204533576965332, + "learning_rate": 2e-05, + "loss": 0.05554104, + "step": 13323 + }, + { + "epoch": 26.648, + "grad_norm": 2.109013319015503, + "learning_rate": 2e-05, + "loss": 0.05139774, + "step": 13324 + }, + { + "epoch": 26.65, + "grad_norm": 1.177474021911621, + "learning_rate": 2e-05, + "loss": 0.04520677, + "step": 13325 + }, + { + "epoch": 26.652, + "grad_norm": 1.265397071838379, + "learning_rate": 2e-05, + "loss": 0.0448864, + "step": 13326 + }, + { + "epoch": 
26.654, + "grad_norm": 0.9993850588798523, + "learning_rate": 2e-05, + "loss": 0.04312583, + "step": 13327 + }, + { + "epoch": 26.656, + "grad_norm": 2.175631523132324, + "learning_rate": 2e-05, + "loss": 0.04794764, + "step": 13328 + }, + { + "epoch": 26.658, + "grad_norm": 1.602054476737976, + "learning_rate": 2e-05, + "loss": 0.06128681, + "step": 13329 + }, + { + "epoch": 26.66, + "grad_norm": 1.4241081476211548, + "learning_rate": 2e-05, + "loss": 0.04915302, + "step": 13330 + }, + { + "epoch": 26.662, + "grad_norm": 1.6712620258331299, + "learning_rate": 2e-05, + "loss": 0.0512425, + "step": 13331 + }, + { + "epoch": 26.664, + "grad_norm": 2.4270451068878174, + "learning_rate": 2e-05, + "loss": 0.07092887, + "step": 13332 + }, + { + "epoch": 26.666, + "grad_norm": 1.007644772529602, + "learning_rate": 2e-05, + "loss": 0.03094992, + "step": 13333 + }, + { + "epoch": 26.668, + "grad_norm": 1.4786583185195923, + "learning_rate": 2e-05, + "loss": 0.05066321, + "step": 13334 + }, + { + "epoch": 26.67, + "grad_norm": 1.4783406257629395, + "learning_rate": 2e-05, + "loss": 0.04284883, + "step": 13335 + }, + { + "epoch": 26.672, + "grad_norm": 2.875722885131836, + "learning_rate": 2e-05, + "loss": 0.05276472, + "step": 13336 + }, + { + "epoch": 26.674, + "grad_norm": 1.0497190952301025, + "learning_rate": 2e-05, + "loss": 0.03627689, + "step": 13337 + }, + { + "epoch": 26.676, + "grad_norm": 1.1127156019210815, + "learning_rate": 2e-05, + "loss": 0.03644489, + "step": 13338 + }, + { + "epoch": 26.678, + "grad_norm": 1.5005398988723755, + "learning_rate": 2e-05, + "loss": 0.0399859, + "step": 13339 + }, + { + "epoch": 26.68, + "grad_norm": 1.4886964559555054, + "learning_rate": 2e-05, + "loss": 0.05113001, + "step": 13340 + }, + { + "epoch": 26.682, + "grad_norm": 1.1941289901733398, + "learning_rate": 2e-05, + "loss": 0.04594672, + "step": 13341 + }, + { + "epoch": 26.684, + "grad_norm": 1.4600493907928467, + "learning_rate": 2e-05, + "loss": 0.05219441, + "step": 13342 + }, + { + "epoch": 26.686, + "grad_norm": 1.2766631841659546, + "learning_rate": 2e-05, + "loss": 0.05881347, + "step": 13343 + }, + { + "epoch": 26.688, + "grad_norm": 1.2152045965194702, + "learning_rate": 2e-05, + "loss": 0.02941299, + "step": 13344 + }, + { + "epoch": 26.69, + "grad_norm": 1.2594579458236694, + "learning_rate": 2e-05, + "loss": 0.04704573, + "step": 13345 + }, + { + "epoch": 26.692, + "grad_norm": 1.4651315212249756, + "learning_rate": 2e-05, + "loss": 0.04764215, + "step": 13346 + }, + { + "epoch": 26.694, + "grad_norm": 1.5274326801300049, + "learning_rate": 2e-05, + "loss": 0.06374133, + "step": 13347 + }, + { + "epoch": 26.696, + "grad_norm": 1.9418085813522339, + "learning_rate": 2e-05, + "loss": 0.06519874, + "step": 13348 + }, + { + "epoch": 26.698, + "grad_norm": 1.6129246950149536, + "learning_rate": 2e-05, + "loss": 0.03720499, + "step": 13349 + }, + { + "epoch": 26.7, + "grad_norm": 0.7866122126579285, + "learning_rate": 2e-05, + "loss": 0.02468162, + "step": 13350 + }, + { + "epoch": 26.701999999999998, + "grad_norm": 1.3163105249404907, + "learning_rate": 2e-05, + "loss": 0.05013556, + "step": 13351 + }, + { + "epoch": 26.704, + "grad_norm": 1.4515641927719116, + "learning_rate": 2e-05, + "loss": 0.04763116, + "step": 13352 + }, + { + "epoch": 26.706, + "grad_norm": 1.3559627532958984, + "learning_rate": 2e-05, + "loss": 0.04021575, + "step": 13353 + }, + { + "epoch": 26.708, + "grad_norm": 2.714416742324829, + "learning_rate": 2e-05, + "loss": 0.05345739, + "step": 13354 + }, + { + "epoch": 
26.71, + "grad_norm": 2.2046430110931396, + "learning_rate": 2e-05, + "loss": 0.04232349, + "step": 13355 + }, + { + "epoch": 26.712, + "grad_norm": 1.0346970558166504, + "learning_rate": 2e-05, + "loss": 0.03661943, + "step": 13356 + }, + { + "epoch": 26.714, + "grad_norm": 1.037308692932129, + "learning_rate": 2e-05, + "loss": 0.0360332, + "step": 13357 + }, + { + "epoch": 26.716, + "grad_norm": 1.399788498878479, + "learning_rate": 2e-05, + "loss": 0.04450074, + "step": 13358 + }, + { + "epoch": 26.718, + "grad_norm": 1.1871050596237183, + "learning_rate": 2e-05, + "loss": 0.04489849, + "step": 13359 + }, + { + "epoch": 26.72, + "grad_norm": 1.4322409629821777, + "learning_rate": 2e-05, + "loss": 0.06084087, + "step": 13360 + }, + { + "epoch": 26.722, + "grad_norm": 1.2092519998550415, + "learning_rate": 2e-05, + "loss": 0.04185452, + "step": 13361 + }, + { + "epoch": 26.724, + "grad_norm": 1.5179123878479004, + "learning_rate": 2e-05, + "loss": 0.05089298, + "step": 13362 + }, + { + "epoch": 26.726, + "grad_norm": 1.2479588985443115, + "learning_rate": 2e-05, + "loss": 0.04334253, + "step": 13363 + }, + { + "epoch": 26.728, + "grad_norm": 1.2945444583892822, + "learning_rate": 2e-05, + "loss": 0.03954721, + "step": 13364 + }, + { + "epoch": 26.73, + "grad_norm": 1.0343196392059326, + "learning_rate": 2e-05, + "loss": 0.03208121, + "step": 13365 + }, + { + "epoch": 26.732, + "grad_norm": 1.5581246614456177, + "learning_rate": 2e-05, + "loss": 0.05093454, + "step": 13366 + }, + { + "epoch": 26.734, + "grad_norm": 1.2444391250610352, + "learning_rate": 2e-05, + "loss": 0.04247945, + "step": 13367 + }, + { + "epoch": 26.736, + "grad_norm": 0.9851118326187134, + "learning_rate": 2e-05, + "loss": 0.03192877, + "step": 13368 + }, + { + "epoch": 26.738, + "grad_norm": 1.6827157735824585, + "learning_rate": 2e-05, + "loss": 0.04289787, + "step": 13369 + }, + { + "epoch": 26.74, + "grad_norm": 1.6345899105072021, + "learning_rate": 2e-05, + "loss": 0.05687879, + "step": 13370 + }, + { + "epoch": 26.742, + "grad_norm": 1.2550289630889893, + "learning_rate": 2e-05, + "loss": 0.04343875, + "step": 13371 + }, + { + "epoch": 26.744, + "grad_norm": 1.3297144174575806, + "learning_rate": 2e-05, + "loss": 0.03608532, + "step": 13372 + }, + { + "epoch": 26.746, + "grad_norm": 1.9059828519821167, + "learning_rate": 2e-05, + "loss": 0.06531657, + "step": 13373 + }, + { + "epoch": 26.748, + "grad_norm": 1.4299732446670532, + "learning_rate": 2e-05, + "loss": 0.05242453, + "step": 13374 + }, + { + "epoch": 26.75, + "grad_norm": 1.1574034690856934, + "learning_rate": 2e-05, + "loss": 0.0402981, + "step": 13375 + }, + { + "epoch": 26.752, + "grad_norm": 2.168186902999878, + "learning_rate": 2e-05, + "loss": 0.052399, + "step": 13376 + }, + { + "epoch": 26.754, + "grad_norm": 1.0010058879852295, + "learning_rate": 2e-05, + "loss": 0.03435899, + "step": 13377 + }, + { + "epoch": 26.756, + "grad_norm": 1.2347971200942993, + "learning_rate": 2e-05, + "loss": 0.04020776, + "step": 13378 + }, + { + "epoch": 26.758, + "grad_norm": 1.1826847791671753, + "learning_rate": 2e-05, + "loss": 0.03945916, + "step": 13379 + }, + { + "epoch": 26.76, + "grad_norm": 1.6033371686935425, + "learning_rate": 2e-05, + "loss": 0.0463616, + "step": 13380 + }, + { + "epoch": 26.762, + "grad_norm": 1.1534563302993774, + "learning_rate": 2e-05, + "loss": 0.04242963, + "step": 13381 + }, + { + "epoch": 26.764, + "grad_norm": 1.3048988580703735, + "learning_rate": 2e-05, + "loss": 0.05592864, + "step": 13382 + }, + { + "epoch": 26.766, + 
"grad_norm": 1.2142225503921509, + "learning_rate": 2e-05, + "loss": 0.04550473, + "step": 13383 + }, + { + "epoch": 26.768, + "grad_norm": 1.946568489074707, + "learning_rate": 2e-05, + "loss": 0.06550965, + "step": 13384 + }, + { + "epoch": 26.77, + "grad_norm": 1.2773605585098267, + "learning_rate": 2e-05, + "loss": 0.06554952, + "step": 13385 + }, + { + "epoch": 26.772, + "grad_norm": 1.2231202125549316, + "learning_rate": 2e-05, + "loss": 0.03776004, + "step": 13386 + }, + { + "epoch": 26.774, + "grad_norm": 1.1051039695739746, + "learning_rate": 2e-05, + "loss": 0.02784972, + "step": 13387 + }, + { + "epoch": 26.776, + "grad_norm": 1.1587811708450317, + "learning_rate": 2e-05, + "loss": 0.03560489, + "step": 13388 + }, + { + "epoch": 26.778, + "grad_norm": 1.0926035642623901, + "learning_rate": 2e-05, + "loss": 0.03061144, + "step": 13389 + }, + { + "epoch": 26.78, + "grad_norm": 0.9534873962402344, + "learning_rate": 2e-05, + "loss": 0.03757609, + "step": 13390 + }, + { + "epoch": 26.782, + "grad_norm": 1.3637419939041138, + "learning_rate": 2e-05, + "loss": 0.04305605, + "step": 13391 + }, + { + "epoch": 26.784, + "grad_norm": 1.3764069080352783, + "learning_rate": 2e-05, + "loss": 0.0500427, + "step": 13392 + }, + { + "epoch": 26.786, + "grad_norm": 1.036009430885315, + "learning_rate": 2e-05, + "loss": 0.04027718, + "step": 13393 + }, + { + "epoch": 26.788, + "grad_norm": 1.7560526132583618, + "learning_rate": 2e-05, + "loss": 0.06162121, + "step": 13394 + }, + { + "epoch": 26.79, + "grad_norm": 1.822432279586792, + "learning_rate": 2e-05, + "loss": 0.04765713, + "step": 13395 + }, + { + "epoch": 26.792, + "grad_norm": 1.488826036453247, + "learning_rate": 2e-05, + "loss": 0.03322754, + "step": 13396 + }, + { + "epoch": 26.794, + "grad_norm": 1.4484844207763672, + "learning_rate": 2e-05, + "loss": 0.04716588, + "step": 13397 + }, + { + "epoch": 26.796, + "grad_norm": 1.1175645589828491, + "learning_rate": 2e-05, + "loss": 0.03686436, + "step": 13398 + }, + { + "epoch": 26.798000000000002, + "grad_norm": 1.6677278280258179, + "learning_rate": 2e-05, + "loss": 0.03573879, + "step": 13399 + }, + { + "epoch": 26.8, + "grad_norm": 1.8486576080322266, + "learning_rate": 2e-05, + "loss": 0.05908503, + "step": 13400 + }, + { + "epoch": 26.802, + "grad_norm": 2.1419644355773926, + "learning_rate": 2e-05, + "loss": 0.04133147, + "step": 13401 + }, + { + "epoch": 26.804, + "grad_norm": 1.5237059593200684, + "learning_rate": 2e-05, + "loss": 0.04093695, + "step": 13402 + }, + { + "epoch": 26.806, + "grad_norm": 0.9631497263908386, + "learning_rate": 2e-05, + "loss": 0.03032106, + "step": 13403 + }, + { + "epoch": 26.808, + "grad_norm": 1.6903769969940186, + "learning_rate": 2e-05, + "loss": 0.03851802, + "step": 13404 + }, + { + "epoch": 26.81, + "grad_norm": 2.2639379501342773, + "learning_rate": 2e-05, + "loss": 0.0339236, + "step": 13405 + }, + { + "epoch": 26.812, + "grad_norm": 1.8234732151031494, + "learning_rate": 2e-05, + "loss": 0.04727671, + "step": 13406 + }, + { + "epoch": 26.814, + "grad_norm": 1.174176573753357, + "learning_rate": 2e-05, + "loss": 0.04155207, + "step": 13407 + }, + { + "epoch": 26.816, + "grad_norm": 1.388486385345459, + "learning_rate": 2e-05, + "loss": 0.03067862, + "step": 13408 + }, + { + "epoch": 26.818, + "grad_norm": 1.2704886198043823, + "learning_rate": 2e-05, + "loss": 0.02589917, + "step": 13409 + }, + { + "epoch": 26.82, + "grad_norm": 1.151698350906372, + "learning_rate": 2e-05, + "loss": 0.05481917, + "step": 13410 + }, + { + "epoch": 26.822, + 
"grad_norm": 1.2374764680862427, + "learning_rate": 2e-05, + "loss": 0.04253447, + "step": 13411 + }, + { + "epoch": 26.824, + "grad_norm": 1.3624358177185059, + "learning_rate": 2e-05, + "loss": 0.04338268, + "step": 13412 + }, + { + "epoch": 26.826, + "grad_norm": 1.1399511098861694, + "learning_rate": 2e-05, + "loss": 0.04491112, + "step": 13413 + }, + { + "epoch": 26.828, + "grad_norm": 1.8094592094421387, + "learning_rate": 2e-05, + "loss": 0.04950622, + "step": 13414 + }, + { + "epoch": 26.83, + "grad_norm": 1.5777703523635864, + "learning_rate": 2e-05, + "loss": 0.04463634, + "step": 13415 + }, + { + "epoch": 26.832, + "grad_norm": 1.1766401529312134, + "learning_rate": 2e-05, + "loss": 0.04658622, + "step": 13416 + }, + { + "epoch": 26.834, + "grad_norm": 1.3368674516677856, + "learning_rate": 2e-05, + "loss": 0.04229524, + "step": 13417 + }, + { + "epoch": 26.836, + "grad_norm": 1.3476362228393555, + "learning_rate": 2e-05, + "loss": 0.03923023, + "step": 13418 + }, + { + "epoch": 26.838, + "grad_norm": 1.4750795364379883, + "learning_rate": 2e-05, + "loss": 0.06064068, + "step": 13419 + }, + { + "epoch": 26.84, + "grad_norm": 1.1680598258972168, + "learning_rate": 2e-05, + "loss": 0.04480653, + "step": 13420 + }, + { + "epoch": 26.842, + "grad_norm": 2.2910873889923096, + "learning_rate": 2e-05, + "loss": 0.06019218, + "step": 13421 + }, + { + "epoch": 26.844, + "grad_norm": 1.3044073581695557, + "learning_rate": 2e-05, + "loss": 0.06455293, + "step": 13422 + }, + { + "epoch": 26.846, + "grad_norm": 1.560138463973999, + "learning_rate": 2e-05, + "loss": 0.06546596, + "step": 13423 + }, + { + "epoch": 26.848, + "grad_norm": 1.6893231868743896, + "learning_rate": 2e-05, + "loss": 0.0381937, + "step": 13424 + }, + { + "epoch": 26.85, + "grad_norm": 1.344321846961975, + "learning_rate": 2e-05, + "loss": 0.05524303, + "step": 13425 + }, + { + "epoch": 26.852, + "grad_norm": 1.0075470209121704, + "learning_rate": 2e-05, + "loss": 0.03442932, + "step": 13426 + }, + { + "epoch": 26.854, + "grad_norm": 1.832734227180481, + "learning_rate": 2e-05, + "loss": 0.06405198, + "step": 13427 + }, + { + "epoch": 26.856, + "grad_norm": 0.9961326122283936, + "learning_rate": 2e-05, + "loss": 0.02738804, + "step": 13428 + }, + { + "epoch": 26.858, + "grad_norm": 0.9834625124931335, + "learning_rate": 2e-05, + "loss": 0.03841186, + "step": 13429 + }, + { + "epoch": 26.86, + "grad_norm": 1.1662216186523438, + "learning_rate": 2e-05, + "loss": 0.04747712, + "step": 13430 + }, + { + "epoch": 26.862, + "grad_norm": 1.8224124908447266, + "learning_rate": 2e-05, + "loss": 0.038079, + "step": 13431 + }, + { + "epoch": 26.864, + "grad_norm": 1.2939532995224, + "learning_rate": 2e-05, + "loss": 0.04183429, + "step": 13432 + }, + { + "epoch": 26.866, + "grad_norm": 1.1002147197723389, + "learning_rate": 2e-05, + "loss": 0.03624891, + "step": 13433 + }, + { + "epoch": 26.868, + "grad_norm": 1.4251444339752197, + "learning_rate": 2e-05, + "loss": 0.05983438, + "step": 13434 + }, + { + "epoch": 26.87, + "grad_norm": 1.7216922044754028, + "learning_rate": 2e-05, + "loss": 0.09802464, + "step": 13435 + }, + { + "epoch": 26.872, + "grad_norm": 1.3873647451400757, + "learning_rate": 2e-05, + "loss": 0.04079521, + "step": 13436 + }, + { + "epoch": 26.874, + "grad_norm": 1.0737560987472534, + "learning_rate": 2e-05, + "loss": 0.04044875, + "step": 13437 + }, + { + "epoch": 26.876, + "grad_norm": 1.148348331451416, + "learning_rate": 2e-05, + "loss": 0.04982531, + "step": 13438 + }, + { + "epoch": 26.878, + "grad_norm": 
1.1479979753494263, + "learning_rate": 2e-05, + "loss": 0.05051372, + "step": 13439 + }, + { + "epoch": 26.88, + "grad_norm": 1.727099061012268, + "learning_rate": 2e-05, + "loss": 0.04605647, + "step": 13440 + }, + { + "epoch": 26.882, + "grad_norm": 1.8774471282958984, + "learning_rate": 2e-05, + "loss": 0.04778759, + "step": 13441 + }, + { + "epoch": 26.884, + "grad_norm": 1.4502348899841309, + "learning_rate": 2e-05, + "loss": 0.03749793, + "step": 13442 + }, + { + "epoch": 26.886, + "grad_norm": 2.354149103164673, + "learning_rate": 2e-05, + "loss": 0.05970661, + "step": 13443 + }, + { + "epoch": 26.888, + "grad_norm": 1.1341004371643066, + "learning_rate": 2e-05, + "loss": 0.04527812, + "step": 13444 + }, + { + "epoch": 26.89, + "grad_norm": 1.6915284395217896, + "learning_rate": 2e-05, + "loss": 0.04350384, + "step": 13445 + }, + { + "epoch": 26.892, + "grad_norm": 1.5579638481140137, + "learning_rate": 2e-05, + "loss": 0.0307783, + "step": 13446 + }, + { + "epoch": 26.894, + "grad_norm": 1.3309636116027832, + "learning_rate": 2e-05, + "loss": 0.0582713, + "step": 13447 + }, + { + "epoch": 26.896, + "grad_norm": 1.2728379964828491, + "learning_rate": 2e-05, + "loss": 0.04255494, + "step": 13448 + }, + { + "epoch": 26.898, + "grad_norm": 1.1662172079086304, + "learning_rate": 2e-05, + "loss": 0.03874854, + "step": 13449 + }, + { + "epoch": 26.9, + "grad_norm": 2.72029447555542, + "learning_rate": 2e-05, + "loss": 0.0405199, + "step": 13450 + }, + { + "epoch": 26.902, + "grad_norm": 1.25313401222229, + "learning_rate": 2e-05, + "loss": 0.04713894, + "step": 13451 + }, + { + "epoch": 26.904, + "grad_norm": 1.2333053350448608, + "learning_rate": 2e-05, + "loss": 0.03969055, + "step": 13452 + }, + { + "epoch": 26.906, + "grad_norm": 1.149762511253357, + "learning_rate": 2e-05, + "loss": 0.03785865, + "step": 13453 + }, + { + "epoch": 26.908, + "grad_norm": 0.8945139646530151, + "learning_rate": 2e-05, + "loss": 0.03088951, + "step": 13454 + }, + { + "epoch": 26.91, + "grad_norm": 1.6503828763961792, + "learning_rate": 2e-05, + "loss": 0.03936726, + "step": 13455 + }, + { + "epoch": 26.912, + "grad_norm": 1.1440142393112183, + "learning_rate": 2e-05, + "loss": 0.04367042, + "step": 13456 + }, + { + "epoch": 26.914, + "grad_norm": 1.2169008255004883, + "learning_rate": 2e-05, + "loss": 0.04358334, + "step": 13457 + }, + { + "epoch": 26.916, + "grad_norm": 0.9487123489379883, + "learning_rate": 2e-05, + "loss": 0.02613301, + "step": 13458 + }, + { + "epoch": 26.918, + "grad_norm": 1.0501741170883179, + "learning_rate": 2e-05, + "loss": 0.03885425, + "step": 13459 + }, + { + "epoch": 26.92, + "grad_norm": 1.3245856761932373, + "learning_rate": 2e-05, + "loss": 0.03801511, + "step": 13460 + }, + { + "epoch": 26.922, + "grad_norm": 1.1823177337646484, + "learning_rate": 2e-05, + "loss": 0.04091566, + "step": 13461 + }, + { + "epoch": 26.924, + "grad_norm": 1.5307384729385376, + "learning_rate": 2e-05, + "loss": 0.04803543, + "step": 13462 + }, + { + "epoch": 26.926, + "grad_norm": 1.2377997636795044, + "learning_rate": 2e-05, + "loss": 0.04242143, + "step": 13463 + }, + { + "epoch": 26.928, + "grad_norm": 1.0577019453048706, + "learning_rate": 2e-05, + "loss": 0.04687814, + "step": 13464 + }, + { + "epoch": 26.93, + "grad_norm": 1.1497957706451416, + "learning_rate": 2e-05, + "loss": 0.03771542, + "step": 13465 + }, + { + "epoch": 26.932, + "grad_norm": 1.1796302795410156, + "learning_rate": 2e-05, + "loss": 0.04946107, + "step": 13466 + }, + { + "epoch": 26.934, + "grad_norm": 
1.092475175857544, + "learning_rate": 2e-05, + "loss": 0.03278045, + "step": 13467 + }, + { + "epoch": 26.936, + "grad_norm": 1.85981285572052, + "learning_rate": 2e-05, + "loss": 0.04644259, + "step": 13468 + }, + { + "epoch": 26.938, + "grad_norm": 1.8343428373336792, + "learning_rate": 2e-05, + "loss": 0.05847539, + "step": 13469 + }, + { + "epoch": 26.94, + "grad_norm": 2.2233643531799316, + "learning_rate": 2e-05, + "loss": 0.05392168, + "step": 13470 + }, + { + "epoch": 26.942, + "grad_norm": 2.9804506301879883, + "learning_rate": 2e-05, + "loss": 0.05396551, + "step": 13471 + }, + { + "epoch": 26.944, + "grad_norm": 0.8914803862571716, + "learning_rate": 2e-05, + "loss": 0.02593549, + "step": 13472 + }, + { + "epoch": 26.946, + "grad_norm": 1.0326448678970337, + "learning_rate": 2e-05, + "loss": 0.03847109, + "step": 13473 + }, + { + "epoch": 26.948, + "grad_norm": 1.935031533241272, + "learning_rate": 2e-05, + "loss": 0.0416937, + "step": 13474 + }, + { + "epoch": 26.95, + "grad_norm": 1.1312302350997925, + "learning_rate": 2e-05, + "loss": 0.04824629, + "step": 13475 + }, + { + "epoch": 26.951999999999998, + "grad_norm": 1.6324800252914429, + "learning_rate": 2e-05, + "loss": 0.04897019, + "step": 13476 + }, + { + "epoch": 26.954, + "grad_norm": 1.1452162265777588, + "learning_rate": 2e-05, + "loss": 0.03952339, + "step": 13477 + }, + { + "epoch": 26.956, + "grad_norm": 1.284656047821045, + "learning_rate": 2e-05, + "loss": 0.03469507, + "step": 13478 + }, + { + "epoch": 26.958, + "grad_norm": 1.433944582939148, + "learning_rate": 2e-05, + "loss": 0.03940383, + "step": 13479 + }, + { + "epoch": 26.96, + "grad_norm": 0.9404771327972412, + "learning_rate": 2e-05, + "loss": 0.03143159, + "step": 13480 + }, + { + "epoch": 26.962, + "grad_norm": 1.0665420293807983, + "learning_rate": 2e-05, + "loss": 0.03889358, + "step": 13481 + }, + { + "epoch": 26.964, + "grad_norm": 1.0098894834518433, + "learning_rate": 2e-05, + "loss": 0.03096025, + "step": 13482 + }, + { + "epoch": 26.966, + "grad_norm": 1.2511407136917114, + "learning_rate": 2e-05, + "loss": 0.05308896, + "step": 13483 + }, + { + "epoch": 26.968, + "grad_norm": 1.9953383207321167, + "learning_rate": 2e-05, + "loss": 0.03955588, + "step": 13484 + }, + { + "epoch": 26.97, + "grad_norm": 1.377296805381775, + "learning_rate": 2e-05, + "loss": 0.03692595, + "step": 13485 + }, + { + "epoch": 26.972, + "grad_norm": 1.0224653482437134, + "learning_rate": 2e-05, + "loss": 0.03503982, + "step": 13486 + }, + { + "epoch": 26.974, + "grad_norm": 1.0944488048553467, + "learning_rate": 2e-05, + "loss": 0.03338355, + "step": 13487 + }, + { + "epoch": 26.976, + "grad_norm": 1.0018296241760254, + "learning_rate": 2e-05, + "loss": 0.02892867, + "step": 13488 + }, + { + "epoch": 26.978, + "grad_norm": 1.019363522529602, + "learning_rate": 2e-05, + "loss": 0.03823832, + "step": 13489 + }, + { + "epoch": 26.98, + "grad_norm": 1.2241965532302856, + "learning_rate": 2e-05, + "loss": 0.04636896, + "step": 13490 + }, + { + "epoch": 26.982, + "grad_norm": 1.5487970113754272, + "learning_rate": 2e-05, + "loss": 0.05474398, + "step": 13491 + }, + { + "epoch": 26.984, + "grad_norm": 0.9554542899131775, + "learning_rate": 2e-05, + "loss": 0.03364764, + "step": 13492 + }, + { + "epoch": 26.986, + "grad_norm": 2.8217360973358154, + "learning_rate": 2e-05, + "loss": 0.03697766, + "step": 13493 + }, + { + "epoch": 26.988, + "grad_norm": 1.6074271202087402, + "learning_rate": 2e-05, + "loss": 0.04182863, + "step": 13494 + }, + { + "epoch": 26.99, + "grad_norm": 
1.3544893264770508, + "learning_rate": 2e-05, + "loss": 0.04771724, + "step": 13495 + }, + { + "epoch": 26.992, + "grad_norm": 0.9438838362693787, + "learning_rate": 2e-05, + "loss": 0.03837147, + "step": 13496 + }, + { + "epoch": 26.994, + "grad_norm": 1.067697525024414, + "learning_rate": 2e-05, + "loss": 0.03082527, + "step": 13497 + }, + { + "epoch": 26.996, + "grad_norm": 3.1041669845581055, + "learning_rate": 2e-05, + "loss": 0.05515031, + "step": 13498 + }, + { + "epoch": 26.998, + "grad_norm": 1.1457984447479248, + "learning_rate": 2e-05, + "loss": 0.04233011, + "step": 13499 + }, + { + "epoch": 27.0, + "grad_norm": 1.2711799144744873, + "learning_rate": 2e-05, + "loss": 0.04493605, + "step": 13500 + }, + { + "epoch": 27.0, + "eval_performance": { + "AngleClassification_1": 0.998, + "AngleClassification_2": 1.0, + "AngleClassification_3": 0.9740518962075848, + "Equal_1": 0.998, + "Equal_2": 0.9860279441117764, + "Equal_3": 0.9920159680638723, + "LineComparison_1": 1.0, + "LineComparison_2": 0.998003992015968, + "LineComparison_3": 0.998003992015968, + "Parallel_1": 0.9939879759519038, + "Parallel_2": 0.9939879759519038, + "Parallel_3": 0.992, + "Perpendicular_1": 0.998, + "Perpendicular_2": 0.99, + "Perpendicular_3": 0.8977955911823647, + "PointLiesOnCircle_1": 1.0, + "PointLiesOnCircle_2": 0.9996666666666667, + "PointLiesOnCircle_3": 0.986, + "PointLiesOnLine_1": 0.9959919839679359, + "PointLiesOnLine_2": 0.9939879759519038, + "PointLiesOnLine_3": 0.9880239520958084 + }, + "eval_runtime": 319.7304, + "eval_samples_per_second": 32.84, + "eval_steps_per_second": 0.657, + "step": 13500 + }, + { + "epoch": 27.002, + "grad_norm": 1.1527739763259888, + "learning_rate": 2e-05, + "loss": 0.04508916, + "step": 13501 + }, + { + "epoch": 27.004, + "grad_norm": 1.562675952911377, + "learning_rate": 2e-05, + "loss": 0.04754071, + "step": 13502 + }, + { + "epoch": 27.006, + "grad_norm": 1.1957426071166992, + "learning_rate": 2e-05, + "loss": 0.03879406, + "step": 13503 + }, + { + "epoch": 27.008, + "grad_norm": 1.1542894840240479, + "learning_rate": 2e-05, + "loss": 0.0369472, + "step": 13504 + }, + { + "epoch": 27.01, + "grad_norm": 0.9654834866523743, + "learning_rate": 2e-05, + "loss": 0.03338242, + "step": 13505 + }, + { + "epoch": 27.012, + "grad_norm": 0.94996577501297, + "learning_rate": 2e-05, + "loss": 0.03290822, + "step": 13506 + }, + { + "epoch": 27.014, + "grad_norm": 1.7157975435256958, + "learning_rate": 2e-05, + "loss": 0.0513361, + "step": 13507 + }, + { + "epoch": 27.016, + "grad_norm": 1.248037576675415, + "learning_rate": 2e-05, + "loss": 0.03775504, + "step": 13508 + }, + { + "epoch": 27.018, + "grad_norm": 0.9875592589378357, + "learning_rate": 2e-05, + "loss": 0.03280458, + "step": 13509 + }, + { + "epoch": 27.02, + "grad_norm": 1.2043684720993042, + "learning_rate": 2e-05, + "loss": 0.03919297, + "step": 13510 + }, + { + "epoch": 27.022, + "grad_norm": 1.1797631978988647, + "learning_rate": 2e-05, + "loss": 0.03209981, + "step": 13511 + }, + { + "epoch": 27.024, + "grad_norm": 0.9786536693572998, + "learning_rate": 2e-05, + "loss": 0.03343519, + "step": 13512 + }, + { + "epoch": 27.026, + "grad_norm": 1.6166154146194458, + "learning_rate": 2e-05, + "loss": 0.04526688, + "step": 13513 + }, + { + "epoch": 27.028, + "grad_norm": 1.9152218103408813, + "learning_rate": 2e-05, + "loss": 0.04358869, + "step": 13514 + }, + { + "epoch": 27.03, + "grad_norm": 1.1101301908493042, + "learning_rate": 2e-05, + "loss": 0.02884656, + "step": 13515 + }, + { + "epoch": 27.032, + 
"grad_norm": 1.529909610748291, + "learning_rate": 2e-05, + "loss": 0.03502784, + "step": 13516 + }, + { + "epoch": 27.034, + "grad_norm": 0.97792649269104, + "learning_rate": 2e-05, + "loss": 0.02850207, + "step": 13517 + }, + { + "epoch": 27.036, + "grad_norm": 2.824786901473999, + "learning_rate": 2e-05, + "loss": 0.04891523, + "step": 13518 + }, + { + "epoch": 27.038, + "grad_norm": 1.4314439296722412, + "learning_rate": 2e-05, + "loss": 0.03873865, + "step": 13519 + }, + { + "epoch": 27.04, + "grad_norm": 1.3454880714416504, + "learning_rate": 2e-05, + "loss": 0.050864, + "step": 13520 + }, + { + "epoch": 27.042, + "grad_norm": 1.2566922903060913, + "learning_rate": 2e-05, + "loss": 0.05498883, + "step": 13521 + }, + { + "epoch": 27.044, + "grad_norm": 1.0700196027755737, + "learning_rate": 2e-05, + "loss": 0.03465045, + "step": 13522 + }, + { + "epoch": 27.046, + "grad_norm": 1.479763388633728, + "learning_rate": 2e-05, + "loss": 0.04723434, + "step": 13523 + }, + { + "epoch": 27.048, + "grad_norm": 1.4551644325256348, + "learning_rate": 2e-05, + "loss": 0.04056372, + "step": 13524 + }, + { + "epoch": 27.05, + "grad_norm": 1.3529044389724731, + "learning_rate": 2e-05, + "loss": 0.06563016, + "step": 13525 + }, + { + "epoch": 27.052, + "grad_norm": 1.122623085975647, + "learning_rate": 2e-05, + "loss": 0.04986753, + "step": 13526 + }, + { + "epoch": 27.054, + "grad_norm": 1.1177496910095215, + "learning_rate": 2e-05, + "loss": 0.04144656, + "step": 13527 + }, + { + "epoch": 27.056, + "grad_norm": 1.2102779150009155, + "learning_rate": 2e-05, + "loss": 0.04852476, + "step": 13528 + }, + { + "epoch": 27.058, + "grad_norm": 1.0115301609039307, + "learning_rate": 2e-05, + "loss": 0.03474714, + "step": 13529 + }, + { + "epoch": 27.06, + "grad_norm": 0.9496335983276367, + "learning_rate": 2e-05, + "loss": 0.03019882, + "step": 13530 + }, + { + "epoch": 27.062, + "grad_norm": 1.04624342918396, + "learning_rate": 2e-05, + "loss": 0.03820858, + "step": 13531 + }, + { + "epoch": 27.064, + "grad_norm": 1.3676940202713013, + "learning_rate": 2e-05, + "loss": 0.05121147, + "step": 13532 + }, + { + "epoch": 27.066, + "grad_norm": 1.2893519401550293, + "learning_rate": 2e-05, + "loss": 0.04120003, + "step": 13533 + }, + { + "epoch": 27.068, + "grad_norm": 3.120879650115967, + "learning_rate": 2e-05, + "loss": 0.05591538, + "step": 13534 + }, + { + "epoch": 27.07, + "grad_norm": 1.1986747980117798, + "learning_rate": 2e-05, + "loss": 0.04751917, + "step": 13535 + }, + { + "epoch": 27.072, + "grad_norm": 1.3522542715072632, + "learning_rate": 2e-05, + "loss": 0.04812311, + "step": 13536 + }, + { + "epoch": 27.074, + "grad_norm": 0.9945651292800903, + "learning_rate": 2e-05, + "loss": 0.04262705, + "step": 13537 + }, + { + "epoch": 27.076, + "grad_norm": 1.059237003326416, + "learning_rate": 2e-05, + "loss": 0.04054239, + "step": 13538 + }, + { + "epoch": 27.078, + "grad_norm": 2.535274028778076, + "learning_rate": 2e-05, + "loss": 0.0487668, + "step": 13539 + }, + { + "epoch": 27.08, + "grad_norm": 1.2254159450531006, + "learning_rate": 2e-05, + "loss": 0.04526926, + "step": 13540 + }, + { + "epoch": 27.082, + "grad_norm": 1.0619149208068848, + "learning_rate": 2e-05, + "loss": 0.03487138, + "step": 13541 + }, + { + "epoch": 27.084, + "grad_norm": 1.4296939373016357, + "learning_rate": 2e-05, + "loss": 0.05496912, + "step": 13542 + }, + { + "epoch": 27.086, + "grad_norm": 1.0750129222869873, + "learning_rate": 2e-05, + "loss": 0.04620029, + "step": 13543 + }, + { + "epoch": 27.088, + "grad_norm": 
1.0724533796310425, + "learning_rate": 2e-05, + "loss": 0.0384186, + "step": 13544 + }, + { + "epoch": 27.09, + "grad_norm": 1.2280149459838867, + "learning_rate": 2e-05, + "loss": 0.04641897, + "step": 13545 + }, + { + "epoch": 27.092, + "grad_norm": 1.78925359249115, + "learning_rate": 2e-05, + "loss": 0.0408336, + "step": 13546 + }, + { + "epoch": 27.094, + "grad_norm": 1.6435742378234863, + "learning_rate": 2e-05, + "loss": 0.04276124, + "step": 13547 + }, + { + "epoch": 27.096, + "grad_norm": 1.4462822675704956, + "learning_rate": 2e-05, + "loss": 0.05522976, + "step": 13548 + }, + { + "epoch": 27.098, + "grad_norm": 1.1338396072387695, + "learning_rate": 2e-05, + "loss": 0.03534421, + "step": 13549 + }, + { + "epoch": 27.1, + "grad_norm": 1.2627896070480347, + "learning_rate": 2e-05, + "loss": 0.04895552, + "step": 13550 + }, + { + "epoch": 27.102, + "grad_norm": 1.2807172536849976, + "learning_rate": 2e-05, + "loss": 0.03973666, + "step": 13551 + }, + { + "epoch": 27.104, + "grad_norm": 1.1761025190353394, + "learning_rate": 2e-05, + "loss": 0.04104013, + "step": 13552 + }, + { + "epoch": 27.106, + "grad_norm": 2.8090035915374756, + "learning_rate": 2e-05, + "loss": 0.05685973, + "step": 13553 + }, + { + "epoch": 27.108, + "grad_norm": 1.8607357740402222, + "learning_rate": 2e-05, + "loss": 0.0470711, + "step": 13554 + }, + { + "epoch": 27.11, + "grad_norm": 1.0800814628601074, + "learning_rate": 2e-05, + "loss": 0.03861955, + "step": 13555 + }, + { + "epoch": 27.112, + "grad_norm": 2.145892858505249, + "learning_rate": 2e-05, + "loss": 0.05473135, + "step": 13556 + }, + { + "epoch": 27.114, + "grad_norm": 0.7809432148933411, + "learning_rate": 2e-05, + "loss": 0.02051185, + "step": 13557 + }, + { + "epoch": 27.116, + "grad_norm": 1.3810927867889404, + "learning_rate": 2e-05, + "loss": 0.05468324, + "step": 13558 + }, + { + "epoch": 27.118, + "grad_norm": 1.2400935888290405, + "learning_rate": 2e-05, + "loss": 0.03758835, + "step": 13559 + }, + { + "epoch": 27.12, + "grad_norm": 2.447911262512207, + "learning_rate": 2e-05, + "loss": 0.07153429, + "step": 13560 + }, + { + "epoch": 27.122, + "grad_norm": 1.063595175743103, + "learning_rate": 2e-05, + "loss": 0.04638623, + "step": 13561 + }, + { + "epoch": 27.124, + "grad_norm": 1.1095471382141113, + "learning_rate": 2e-05, + "loss": 0.03623333, + "step": 13562 + }, + { + "epoch": 27.126, + "grad_norm": 2.209012031555176, + "learning_rate": 2e-05, + "loss": 0.03872228, + "step": 13563 + }, + { + "epoch": 27.128, + "grad_norm": 1.9592890739440918, + "learning_rate": 2e-05, + "loss": 0.03570852, + "step": 13564 + }, + { + "epoch": 27.13, + "grad_norm": 1.3131749629974365, + "learning_rate": 2e-05, + "loss": 0.04663508, + "step": 13565 + }, + { + "epoch": 27.132, + "grad_norm": 1.461511492729187, + "learning_rate": 2e-05, + "loss": 0.04620817, + "step": 13566 + }, + { + "epoch": 27.134, + "grad_norm": 2.396193504333496, + "learning_rate": 2e-05, + "loss": 0.05443357, + "step": 13567 + }, + { + "epoch": 27.136, + "grad_norm": 1.4050869941711426, + "learning_rate": 2e-05, + "loss": 0.06443222, + "step": 13568 + }, + { + "epoch": 27.138, + "grad_norm": 2.008094072341919, + "learning_rate": 2e-05, + "loss": 0.05765811, + "step": 13569 + }, + { + "epoch": 27.14, + "grad_norm": 2.7625555992126465, + "learning_rate": 2e-05, + "loss": 0.07556622, + "step": 13570 + }, + { + "epoch": 27.142, + "grad_norm": 0.9222900867462158, + "learning_rate": 2e-05, + "loss": 0.02512233, + "step": 13571 + }, + { + "epoch": 27.144, + "grad_norm": 
1.1549010276794434, + "learning_rate": 2e-05, + "loss": 0.03712163, + "step": 13572 + }, + { + "epoch": 27.146, + "grad_norm": 2.7713279724121094, + "learning_rate": 2e-05, + "loss": 0.04769222, + "step": 13573 + }, + { + "epoch": 27.148, + "grad_norm": 1.5160826444625854, + "learning_rate": 2e-05, + "loss": 0.04316017, + "step": 13574 + }, + { + "epoch": 27.15, + "grad_norm": 1.5735561847686768, + "learning_rate": 2e-05, + "loss": 0.05453978, + "step": 13575 + }, + { + "epoch": 27.152, + "grad_norm": 1.5878405570983887, + "learning_rate": 2e-05, + "loss": 0.0406678, + "step": 13576 + }, + { + "epoch": 27.154, + "grad_norm": 1.9090240001678467, + "learning_rate": 2e-05, + "loss": 0.04792298, + "step": 13577 + }, + { + "epoch": 27.156, + "grad_norm": 1.24784255027771, + "learning_rate": 2e-05, + "loss": 0.04576039, + "step": 13578 + }, + { + "epoch": 27.158, + "grad_norm": 1.1896952390670776, + "learning_rate": 2e-05, + "loss": 0.03995226, + "step": 13579 + }, + { + "epoch": 27.16, + "grad_norm": 1.1981700658798218, + "learning_rate": 2e-05, + "loss": 0.04131437, + "step": 13580 + }, + { + "epoch": 27.162, + "grad_norm": 1.583319067955017, + "learning_rate": 2e-05, + "loss": 0.03563969, + "step": 13581 + }, + { + "epoch": 27.164, + "grad_norm": 1.8085306882858276, + "learning_rate": 2e-05, + "loss": 0.04181949, + "step": 13582 + }, + { + "epoch": 27.166, + "grad_norm": 1.1413896083831787, + "learning_rate": 2e-05, + "loss": 0.04053574, + "step": 13583 + }, + { + "epoch": 27.168, + "grad_norm": 1.0028431415557861, + "learning_rate": 2e-05, + "loss": 0.04165848, + "step": 13584 + }, + { + "epoch": 27.17, + "grad_norm": 1.02912175655365, + "learning_rate": 2e-05, + "loss": 0.03777147, + "step": 13585 + }, + { + "epoch": 27.172, + "grad_norm": 1.0012383460998535, + "learning_rate": 2e-05, + "loss": 0.03452305, + "step": 13586 + }, + { + "epoch": 27.174, + "grad_norm": 1.588924765586853, + "learning_rate": 2e-05, + "loss": 0.06215196, + "step": 13587 + }, + { + "epoch": 27.176, + "grad_norm": 1.2847086191177368, + "learning_rate": 2e-05, + "loss": 0.0568499, + "step": 13588 + }, + { + "epoch": 27.178, + "grad_norm": 0.8857622146606445, + "learning_rate": 2e-05, + "loss": 0.03210337, + "step": 13589 + }, + { + "epoch": 27.18, + "grad_norm": 1.2372381687164307, + "learning_rate": 2e-05, + "loss": 0.03922543, + "step": 13590 + }, + { + "epoch": 27.182, + "grad_norm": 1.127703070640564, + "learning_rate": 2e-05, + "loss": 0.04630746, + "step": 13591 + }, + { + "epoch": 27.184, + "grad_norm": 1.2580335140228271, + "learning_rate": 2e-05, + "loss": 0.0422937, + "step": 13592 + }, + { + "epoch": 27.186, + "grad_norm": 1.6288437843322754, + "learning_rate": 2e-05, + "loss": 0.04852737, + "step": 13593 + }, + { + "epoch": 27.188, + "grad_norm": 1.931805968284607, + "learning_rate": 2e-05, + "loss": 0.05083385, + "step": 13594 + }, + { + "epoch": 27.19, + "grad_norm": 1.3048754930496216, + "learning_rate": 2e-05, + "loss": 0.05102141, + "step": 13595 + }, + { + "epoch": 27.192, + "grad_norm": 1.8813409805297852, + "learning_rate": 2e-05, + "loss": 0.06094917, + "step": 13596 + }, + { + "epoch": 27.194, + "grad_norm": 1.7660962343215942, + "learning_rate": 2e-05, + "loss": 0.04968596, + "step": 13597 + }, + { + "epoch": 27.196, + "grad_norm": 1.2359485626220703, + "learning_rate": 2e-05, + "loss": 0.05053048, + "step": 13598 + }, + { + "epoch": 27.198, + "grad_norm": 1.4421732425689697, + "learning_rate": 2e-05, + "loss": 0.03922078, + "step": 13599 + }, + { + "epoch": 27.2, + "grad_norm": 
1.9196407794952393, + "learning_rate": 2e-05, + "loss": 0.04884792, + "step": 13600 + }, + { + "epoch": 27.202, + "grad_norm": 1.4350956678390503, + "learning_rate": 2e-05, + "loss": 0.05208459, + "step": 13601 + }, + { + "epoch": 27.204, + "grad_norm": 1.878467321395874, + "learning_rate": 2e-05, + "loss": 0.06585848, + "step": 13602 + }, + { + "epoch": 27.206, + "grad_norm": 2.1743922233581543, + "learning_rate": 2e-05, + "loss": 0.0544947, + "step": 13603 + }, + { + "epoch": 27.208, + "grad_norm": 1.2076090574264526, + "learning_rate": 2e-05, + "loss": 0.03394687, + "step": 13604 + }, + { + "epoch": 27.21, + "grad_norm": 2.1118581295013428, + "learning_rate": 2e-05, + "loss": 0.06034087, + "step": 13605 + }, + { + "epoch": 27.212, + "grad_norm": 0.812614381313324, + "learning_rate": 2e-05, + "loss": 0.03094057, + "step": 13606 + }, + { + "epoch": 27.214, + "grad_norm": 1.090105652809143, + "learning_rate": 2e-05, + "loss": 0.05721441, + "step": 13607 + }, + { + "epoch": 27.216, + "grad_norm": 1.6390467882156372, + "learning_rate": 2e-05, + "loss": 0.04660071, + "step": 13608 + }, + { + "epoch": 27.218, + "grad_norm": 2.038808584213257, + "learning_rate": 2e-05, + "loss": 0.05760818, + "step": 13609 + }, + { + "epoch": 27.22, + "grad_norm": 1.293735146522522, + "learning_rate": 2e-05, + "loss": 0.03360115, + "step": 13610 + }, + { + "epoch": 27.222, + "grad_norm": 1.192157506942749, + "learning_rate": 2e-05, + "loss": 0.03546806, + "step": 13611 + }, + { + "epoch": 27.224, + "grad_norm": 1.3775697946548462, + "learning_rate": 2e-05, + "loss": 0.0483359, + "step": 13612 + }, + { + "epoch": 27.226, + "grad_norm": 1.3804492950439453, + "learning_rate": 2e-05, + "loss": 0.05687822, + "step": 13613 + }, + { + "epoch": 27.228, + "grad_norm": 1.8396795988082886, + "learning_rate": 2e-05, + "loss": 0.04156945, + "step": 13614 + }, + { + "epoch": 27.23, + "grad_norm": 1.3716535568237305, + "learning_rate": 2e-05, + "loss": 0.05421146, + "step": 13615 + }, + { + "epoch": 27.232, + "grad_norm": 1.2649599313735962, + "learning_rate": 2e-05, + "loss": 0.04328211, + "step": 13616 + }, + { + "epoch": 27.234, + "grad_norm": 1.3094875812530518, + "learning_rate": 2e-05, + "loss": 0.04095453, + "step": 13617 + }, + { + "epoch": 27.236, + "grad_norm": 1.3521350622177124, + "learning_rate": 2e-05, + "loss": 0.0435282, + "step": 13618 + }, + { + "epoch": 27.238, + "grad_norm": 0.901831865310669, + "learning_rate": 2e-05, + "loss": 0.02451799, + "step": 13619 + }, + { + "epoch": 27.24, + "grad_norm": 1.344462275505066, + "learning_rate": 2e-05, + "loss": 0.06785841, + "step": 13620 + }, + { + "epoch": 27.242, + "grad_norm": 1.497972846031189, + "learning_rate": 2e-05, + "loss": 0.04425344, + "step": 13621 + }, + { + "epoch": 27.244, + "grad_norm": 0.9255624413490295, + "learning_rate": 2e-05, + "loss": 0.02206445, + "step": 13622 + }, + { + "epoch": 27.246, + "grad_norm": 0.988325834274292, + "learning_rate": 2e-05, + "loss": 0.02141338, + "step": 13623 + }, + { + "epoch": 27.248, + "grad_norm": 1.9612500667572021, + "learning_rate": 2e-05, + "loss": 0.03873473, + "step": 13624 + }, + { + "epoch": 27.25, + "grad_norm": 2.3645355701446533, + "learning_rate": 2e-05, + "loss": 0.0586706, + "step": 13625 + }, + { + "epoch": 27.252, + "grad_norm": 1.6442853212356567, + "learning_rate": 2e-05, + "loss": 0.04403327, + "step": 13626 + }, + { + "epoch": 27.254, + "grad_norm": 1.2499282360076904, + "learning_rate": 2e-05, + "loss": 0.05442981, + "step": 13627 + }, + { + "epoch": 27.256, + "grad_norm": 
1.122299313545227, + "learning_rate": 2e-05, + "loss": 0.04064956, + "step": 13628 + }, + { + "epoch": 27.258, + "grad_norm": 1.537174105644226, + "learning_rate": 2e-05, + "loss": 0.04302373, + "step": 13629 + }, + { + "epoch": 27.26, + "grad_norm": 1.0611103773117065, + "learning_rate": 2e-05, + "loss": 0.03115848, + "step": 13630 + }, + { + "epoch": 27.262, + "grad_norm": 1.2895267009735107, + "learning_rate": 2e-05, + "loss": 0.03978854, + "step": 13631 + }, + { + "epoch": 27.264, + "grad_norm": 1.8378465175628662, + "learning_rate": 2e-05, + "loss": 0.06035455, + "step": 13632 + }, + { + "epoch": 27.266, + "grad_norm": 0.9278236031532288, + "learning_rate": 2e-05, + "loss": 0.0275078, + "step": 13633 + }, + { + "epoch": 27.268, + "grad_norm": 1.2331749200820923, + "learning_rate": 2e-05, + "loss": 0.03504308, + "step": 13634 + }, + { + "epoch": 27.27, + "grad_norm": 1.629676103591919, + "learning_rate": 2e-05, + "loss": 0.0618648, + "step": 13635 + }, + { + "epoch": 27.272, + "grad_norm": 1.405290961265564, + "learning_rate": 2e-05, + "loss": 0.0505509, + "step": 13636 + }, + { + "epoch": 27.274, + "grad_norm": 1.1765658855438232, + "learning_rate": 2e-05, + "loss": 0.05041191, + "step": 13637 + }, + { + "epoch": 27.276, + "grad_norm": 1.735039472579956, + "learning_rate": 2e-05, + "loss": 0.02883138, + "step": 13638 + }, + { + "epoch": 27.278, + "grad_norm": 0.9482174515724182, + "learning_rate": 2e-05, + "loss": 0.02577251, + "step": 13639 + }, + { + "epoch": 27.28, + "grad_norm": 1.1373050212860107, + "learning_rate": 2e-05, + "loss": 0.04171196, + "step": 13640 + }, + { + "epoch": 27.282, + "grad_norm": 2.016035795211792, + "learning_rate": 2e-05, + "loss": 0.0531563, + "step": 13641 + }, + { + "epoch": 27.284, + "grad_norm": 1.1706429719924927, + "learning_rate": 2e-05, + "loss": 0.04091615, + "step": 13642 + }, + { + "epoch": 27.286, + "grad_norm": 0.9347245693206787, + "learning_rate": 2e-05, + "loss": 0.03005055, + "step": 13643 + }, + { + "epoch": 27.288, + "grad_norm": 1.119093418121338, + "learning_rate": 2e-05, + "loss": 0.05457556, + "step": 13644 + }, + { + "epoch": 27.29, + "grad_norm": 1.3505531549453735, + "learning_rate": 2e-05, + "loss": 0.03677881, + "step": 13645 + }, + { + "epoch": 27.292, + "grad_norm": 1.270559549331665, + "learning_rate": 2e-05, + "loss": 0.03553203, + "step": 13646 + }, + { + "epoch": 27.294, + "grad_norm": 1.016276240348816, + "learning_rate": 2e-05, + "loss": 0.02860792, + "step": 13647 + }, + { + "epoch": 27.296, + "grad_norm": 2.6578431129455566, + "learning_rate": 2e-05, + "loss": 0.0371431, + "step": 13648 + }, + { + "epoch": 27.298, + "grad_norm": 1.0674734115600586, + "learning_rate": 2e-05, + "loss": 0.03743986, + "step": 13649 + }, + { + "epoch": 27.3, + "grad_norm": 0.987488865852356, + "learning_rate": 2e-05, + "loss": 0.02790395, + "step": 13650 + }, + { + "epoch": 27.302, + "grad_norm": 1.2899256944656372, + "learning_rate": 2e-05, + "loss": 0.05075889, + "step": 13651 + }, + { + "epoch": 27.304, + "grad_norm": 1.9835212230682373, + "learning_rate": 2e-05, + "loss": 0.05801808, + "step": 13652 + }, + { + "epoch": 27.306, + "grad_norm": 1.7673009634017944, + "learning_rate": 2e-05, + "loss": 0.05642558, + "step": 13653 + }, + { + "epoch": 27.308, + "grad_norm": 1.229543924331665, + "learning_rate": 2e-05, + "loss": 0.04363228, + "step": 13654 + }, + { + "epoch": 27.31, + "grad_norm": 1.8748433589935303, + "learning_rate": 2e-05, + "loss": 0.04515584, + "step": 13655 + }, + { + "epoch": 27.312, + "grad_norm": 1.0442911386489868, + 
"learning_rate": 2e-05, + "loss": 0.03930074, + "step": 13656 + }, + { + "epoch": 27.314, + "grad_norm": 1.443469524383545, + "learning_rate": 2e-05, + "loss": 0.04800137, + "step": 13657 + }, + { + "epoch": 27.316, + "grad_norm": 1.3523346185684204, + "learning_rate": 2e-05, + "loss": 0.04009256, + "step": 13658 + }, + { + "epoch": 27.318, + "grad_norm": 1.8447299003601074, + "learning_rate": 2e-05, + "loss": 0.0515473, + "step": 13659 + }, + { + "epoch": 27.32, + "grad_norm": 1.3202327489852905, + "learning_rate": 2e-05, + "loss": 0.05036566, + "step": 13660 + }, + { + "epoch": 27.322, + "grad_norm": 1.1027467250823975, + "learning_rate": 2e-05, + "loss": 0.04486959, + "step": 13661 + }, + { + "epoch": 27.324, + "grad_norm": 1.2013741731643677, + "learning_rate": 2e-05, + "loss": 0.04390001, + "step": 13662 + }, + { + "epoch": 27.326, + "grad_norm": 1.7813587188720703, + "learning_rate": 2e-05, + "loss": 0.03983445, + "step": 13663 + }, + { + "epoch": 27.328, + "grad_norm": 2.0535483360290527, + "learning_rate": 2e-05, + "loss": 0.0440939, + "step": 13664 + }, + { + "epoch": 27.33, + "grad_norm": 1.0619609355926514, + "learning_rate": 2e-05, + "loss": 0.03448323, + "step": 13665 + }, + { + "epoch": 27.332, + "grad_norm": 1.0306423902511597, + "learning_rate": 2e-05, + "loss": 0.03753592, + "step": 13666 + }, + { + "epoch": 27.334, + "grad_norm": 1.2383522987365723, + "learning_rate": 2e-05, + "loss": 0.03938786, + "step": 13667 + }, + { + "epoch": 27.336, + "grad_norm": 1.9096163511276245, + "learning_rate": 2e-05, + "loss": 0.07150873, + "step": 13668 + }, + { + "epoch": 27.338, + "grad_norm": 1.201596975326538, + "learning_rate": 2e-05, + "loss": 0.03716744, + "step": 13669 + }, + { + "epoch": 27.34, + "grad_norm": 1.5398945808410645, + "learning_rate": 2e-05, + "loss": 0.0461993, + "step": 13670 + }, + { + "epoch": 27.342, + "grad_norm": 1.073887586593628, + "learning_rate": 2e-05, + "loss": 0.0333119, + "step": 13671 + }, + { + "epoch": 27.344, + "grad_norm": 1.5942431688308716, + "learning_rate": 2e-05, + "loss": 0.0529358, + "step": 13672 + }, + { + "epoch": 27.346, + "grad_norm": 1.545664668083191, + "learning_rate": 2e-05, + "loss": 0.03542736, + "step": 13673 + }, + { + "epoch": 27.348, + "grad_norm": 1.5381520986557007, + "learning_rate": 2e-05, + "loss": 0.03843318, + "step": 13674 + }, + { + "epoch": 27.35, + "grad_norm": 1.3689863681793213, + "learning_rate": 2e-05, + "loss": 0.02982055, + "step": 13675 + }, + { + "epoch": 27.352, + "grad_norm": 1.2728350162506104, + "learning_rate": 2e-05, + "loss": 0.05316496, + "step": 13676 + }, + { + "epoch": 27.354, + "grad_norm": 0.9394140839576721, + "learning_rate": 2e-05, + "loss": 0.02646692, + "step": 13677 + }, + { + "epoch": 27.356, + "grad_norm": 0.9711006879806519, + "learning_rate": 2e-05, + "loss": 0.02917571, + "step": 13678 + }, + { + "epoch": 27.358, + "grad_norm": 1.5109331607818604, + "learning_rate": 2e-05, + "loss": 0.03243531, + "step": 13679 + }, + { + "epoch": 27.36, + "grad_norm": 1.113895297050476, + "learning_rate": 2e-05, + "loss": 0.03930811, + "step": 13680 + }, + { + "epoch": 27.362, + "grad_norm": 1.5176002979278564, + "learning_rate": 2e-05, + "loss": 0.06130125, + "step": 13681 + }, + { + "epoch": 27.364, + "grad_norm": 0.8235756754875183, + "learning_rate": 2e-05, + "loss": 0.02695323, + "step": 13682 + }, + { + "epoch": 27.366, + "grad_norm": 1.0173813104629517, + "learning_rate": 2e-05, + "loss": 0.03705809, + "step": 13683 + }, + { + "epoch": 27.368, + "grad_norm": 1.7335045337677002, + 
"learning_rate": 2e-05, + "loss": 0.04375722, + "step": 13684 + }, + { + "epoch": 27.37, + "grad_norm": 1.392207145690918, + "learning_rate": 2e-05, + "loss": 0.03899303, + "step": 13685 + }, + { + "epoch": 27.372, + "grad_norm": 1.5254584550857544, + "learning_rate": 2e-05, + "loss": 0.05612728, + "step": 13686 + }, + { + "epoch": 27.374, + "grad_norm": 1.4681589603424072, + "learning_rate": 2e-05, + "loss": 0.05056222, + "step": 13687 + }, + { + "epoch": 27.376, + "grad_norm": 1.21063232421875, + "learning_rate": 2e-05, + "loss": 0.03048548, + "step": 13688 + }, + { + "epoch": 27.378, + "grad_norm": 1.43213951587677, + "learning_rate": 2e-05, + "loss": 0.06458762, + "step": 13689 + }, + { + "epoch": 27.38, + "grad_norm": 2.742194652557373, + "learning_rate": 2e-05, + "loss": 0.05234894, + "step": 13690 + }, + { + "epoch": 27.382, + "grad_norm": 2.0506792068481445, + "learning_rate": 2e-05, + "loss": 0.05463105, + "step": 13691 + }, + { + "epoch": 27.384, + "grad_norm": 1.5875942707061768, + "learning_rate": 2e-05, + "loss": 0.03925296, + "step": 13692 + }, + { + "epoch": 27.386, + "grad_norm": 1.5243009328842163, + "learning_rate": 2e-05, + "loss": 0.05017775, + "step": 13693 + }, + { + "epoch": 27.388, + "grad_norm": 1.4063950777053833, + "learning_rate": 2e-05, + "loss": 0.05697461, + "step": 13694 + }, + { + "epoch": 27.39, + "grad_norm": 2.2699193954467773, + "learning_rate": 2e-05, + "loss": 0.03670984, + "step": 13695 + }, + { + "epoch": 27.392, + "grad_norm": 2.1173882484436035, + "learning_rate": 2e-05, + "loss": 0.05186939, + "step": 13696 + }, + { + "epoch": 27.394, + "grad_norm": 1.2842679023742676, + "learning_rate": 2e-05, + "loss": 0.06256726, + "step": 13697 + }, + { + "epoch": 27.396, + "grad_norm": 2.2587108612060547, + "learning_rate": 2e-05, + "loss": 0.06481649, + "step": 13698 + }, + { + "epoch": 27.398, + "grad_norm": 1.8947147130966187, + "learning_rate": 2e-05, + "loss": 0.04356439, + "step": 13699 + }, + { + "epoch": 27.4, + "grad_norm": 0.9812667369842529, + "learning_rate": 2e-05, + "loss": 0.02878316, + "step": 13700 + }, + { + "epoch": 27.402, + "grad_norm": 1.1468307971954346, + "learning_rate": 2e-05, + "loss": 0.04130252, + "step": 13701 + }, + { + "epoch": 27.404, + "grad_norm": 1.0459030866622925, + "learning_rate": 2e-05, + "loss": 0.03767227, + "step": 13702 + }, + { + "epoch": 27.406, + "grad_norm": 1.448286771774292, + "learning_rate": 2e-05, + "loss": 0.04471764, + "step": 13703 + }, + { + "epoch": 27.408, + "grad_norm": 1.1528844833374023, + "learning_rate": 2e-05, + "loss": 0.02849596, + "step": 13704 + }, + { + "epoch": 27.41, + "grad_norm": 1.6160606145858765, + "learning_rate": 2e-05, + "loss": 0.02551612, + "step": 13705 + }, + { + "epoch": 27.412, + "grad_norm": 2.2915830612182617, + "learning_rate": 2e-05, + "loss": 0.05899898, + "step": 13706 + }, + { + "epoch": 27.414, + "grad_norm": 1.510726809501648, + "learning_rate": 2e-05, + "loss": 0.05045645, + "step": 13707 + }, + { + "epoch": 27.416, + "grad_norm": 3.0780930519104004, + "learning_rate": 2e-05, + "loss": 0.06752353, + "step": 13708 + }, + { + "epoch": 27.418, + "grad_norm": 1.264102816581726, + "learning_rate": 2e-05, + "loss": 0.04427083, + "step": 13709 + }, + { + "epoch": 27.42, + "grad_norm": 1.073390007019043, + "learning_rate": 2e-05, + "loss": 0.04122963, + "step": 13710 + }, + { + "epoch": 27.422, + "grad_norm": 1.1260555982589722, + "learning_rate": 2e-05, + "loss": 0.04339466, + "step": 13711 + }, + { + "epoch": 27.424, + "grad_norm": 1.236524224281311, + "learning_rate": 
2e-05, + "loss": 0.03498268, + "step": 13712 + }, + { + "epoch": 27.426, + "grad_norm": 1.594119906425476, + "learning_rate": 2e-05, + "loss": 0.04375029, + "step": 13713 + }, + { + "epoch": 27.428, + "grad_norm": 1.2735551595687866, + "learning_rate": 2e-05, + "loss": 0.03774681, + "step": 13714 + }, + { + "epoch": 27.43, + "grad_norm": 1.059618592262268, + "learning_rate": 2e-05, + "loss": 0.03130772, + "step": 13715 + }, + { + "epoch": 27.432, + "grad_norm": 1.5274853706359863, + "learning_rate": 2e-05, + "loss": 0.05236381, + "step": 13716 + }, + { + "epoch": 27.434, + "grad_norm": 0.9794763326644897, + "learning_rate": 2e-05, + "loss": 0.03129626, + "step": 13717 + }, + { + "epoch": 27.436, + "grad_norm": 1.2296133041381836, + "learning_rate": 2e-05, + "loss": 0.04068862, + "step": 13718 + }, + { + "epoch": 27.438, + "grad_norm": 1.5292551517486572, + "learning_rate": 2e-05, + "loss": 0.04712692, + "step": 13719 + }, + { + "epoch": 27.44, + "grad_norm": 1.157141089439392, + "learning_rate": 2e-05, + "loss": 0.05303669, + "step": 13720 + }, + { + "epoch": 27.442, + "grad_norm": 1.2794381380081177, + "learning_rate": 2e-05, + "loss": 0.05657168, + "step": 13721 + }, + { + "epoch": 27.444, + "grad_norm": 1.1883527040481567, + "learning_rate": 2e-05, + "loss": 0.05889457, + "step": 13722 + }, + { + "epoch": 27.446, + "grad_norm": 1.260737419128418, + "learning_rate": 2e-05, + "loss": 0.04855951, + "step": 13723 + }, + { + "epoch": 27.448, + "grad_norm": 1.1919201612472534, + "learning_rate": 2e-05, + "loss": 0.02818074, + "step": 13724 + }, + { + "epoch": 27.45, + "grad_norm": 1.3503472805023193, + "learning_rate": 2e-05, + "loss": 0.05925683, + "step": 13725 + }, + { + "epoch": 27.452, + "grad_norm": 1.102662205696106, + "learning_rate": 2e-05, + "loss": 0.03601834, + "step": 13726 + }, + { + "epoch": 27.454, + "grad_norm": 0.9547258019447327, + "learning_rate": 2e-05, + "loss": 0.0299718, + "step": 13727 + }, + { + "epoch": 27.456, + "grad_norm": 1.2688747644424438, + "learning_rate": 2e-05, + "loss": 0.04194263, + "step": 13728 + }, + { + "epoch": 27.458, + "grad_norm": 2.936462640762329, + "learning_rate": 2e-05, + "loss": 0.0673967, + "step": 13729 + }, + { + "epoch": 27.46, + "grad_norm": 1.7205671072006226, + "learning_rate": 2e-05, + "loss": 0.06203447, + "step": 13730 + }, + { + "epoch": 27.462, + "grad_norm": 1.4902185201644897, + "learning_rate": 2e-05, + "loss": 0.04702317, + "step": 13731 + }, + { + "epoch": 27.464, + "grad_norm": 1.2047202587127686, + "learning_rate": 2e-05, + "loss": 0.04467279, + "step": 13732 + }, + { + "epoch": 27.466, + "grad_norm": 1.2293957471847534, + "learning_rate": 2e-05, + "loss": 0.03134837, + "step": 13733 + }, + { + "epoch": 27.468, + "grad_norm": 1.4293403625488281, + "learning_rate": 2e-05, + "loss": 0.03927672, + "step": 13734 + }, + { + "epoch": 27.47, + "grad_norm": 1.2355231046676636, + "learning_rate": 2e-05, + "loss": 0.03786084, + "step": 13735 + }, + { + "epoch": 27.472, + "grad_norm": 1.4935061931610107, + "learning_rate": 2e-05, + "loss": 0.05582435, + "step": 13736 + }, + { + "epoch": 27.474, + "grad_norm": 1.0883551836013794, + "learning_rate": 2e-05, + "loss": 0.05210774, + "step": 13737 + }, + { + "epoch": 27.476, + "grad_norm": 1.4402879476547241, + "learning_rate": 2e-05, + "loss": 0.0516119, + "step": 13738 + }, + { + "epoch": 27.478, + "grad_norm": 1.3633337020874023, + "learning_rate": 2e-05, + "loss": 0.05501125, + "step": 13739 + }, + { + "epoch": 27.48, + "grad_norm": 1.403083324432373, + "learning_rate": 2e-05, + 
"loss": 0.07498935, + "step": 13740 + }, + { + "epoch": 27.482, + "grad_norm": 1.22272527217865, + "learning_rate": 2e-05, + "loss": 0.04192954, + "step": 13741 + }, + { + "epoch": 27.484, + "grad_norm": 0.9646196961402893, + "learning_rate": 2e-05, + "loss": 0.03863492, + "step": 13742 + }, + { + "epoch": 27.486, + "grad_norm": 1.4368653297424316, + "learning_rate": 2e-05, + "loss": 0.05678754, + "step": 13743 + }, + { + "epoch": 27.488, + "grad_norm": 1.0323776006698608, + "learning_rate": 2e-05, + "loss": 0.03164582, + "step": 13744 + }, + { + "epoch": 27.49, + "grad_norm": 1.335216760635376, + "learning_rate": 2e-05, + "loss": 0.04490231, + "step": 13745 + }, + { + "epoch": 27.492, + "grad_norm": 1.462903618812561, + "learning_rate": 2e-05, + "loss": 0.05864259, + "step": 13746 + }, + { + "epoch": 27.494, + "grad_norm": 1.1962298154830933, + "learning_rate": 2e-05, + "loss": 0.03484888, + "step": 13747 + }, + { + "epoch": 27.496, + "grad_norm": 1.8432676792144775, + "learning_rate": 2e-05, + "loss": 0.04844757, + "step": 13748 + }, + { + "epoch": 27.498, + "grad_norm": 1.9054256677627563, + "learning_rate": 2e-05, + "loss": 0.03610344, + "step": 13749 + }, + { + "epoch": 27.5, + "grad_norm": 1.8628199100494385, + "learning_rate": 2e-05, + "loss": 0.03124732, + "step": 13750 + }, + { + "epoch": 27.502, + "grad_norm": 1.300698161125183, + "learning_rate": 2e-05, + "loss": 0.03852872, + "step": 13751 + }, + { + "epoch": 27.504, + "grad_norm": 1.4470487833023071, + "learning_rate": 2e-05, + "loss": 0.03881678, + "step": 13752 + }, + { + "epoch": 27.506, + "grad_norm": 1.4798719882965088, + "learning_rate": 2e-05, + "loss": 0.0504625, + "step": 13753 + }, + { + "epoch": 27.508, + "grad_norm": 0.9879271388053894, + "learning_rate": 2e-05, + "loss": 0.0390942, + "step": 13754 + }, + { + "epoch": 27.51, + "grad_norm": 1.9289140701293945, + "learning_rate": 2e-05, + "loss": 0.04204384, + "step": 13755 + }, + { + "epoch": 27.512, + "grad_norm": 2.8271074295043945, + "learning_rate": 2e-05, + "loss": 0.0529681, + "step": 13756 + }, + { + "epoch": 27.514, + "grad_norm": 4.042783260345459, + "learning_rate": 2e-05, + "loss": 0.04220014, + "step": 13757 + }, + { + "epoch": 27.516, + "grad_norm": 1.2601898908615112, + "learning_rate": 2e-05, + "loss": 0.03472359, + "step": 13758 + }, + { + "epoch": 27.518, + "grad_norm": 1.2109252214431763, + "learning_rate": 2e-05, + "loss": 0.04545831, + "step": 13759 + }, + { + "epoch": 27.52, + "grad_norm": 1.168678641319275, + "learning_rate": 2e-05, + "loss": 0.03518002, + "step": 13760 + }, + { + "epoch": 27.522, + "grad_norm": 1.7536158561706543, + "learning_rate": 2e-05, + "loss": 0.05130105, + "step": 13761 + }, + { + "epoch": 27.524, + "grad_norm": 1.5836191177368164, + "learning_rate": 2e-05, + "loss": 0.03332604, + "step": 13762 + }, + { + "epoch": 27.526, + "grad_norm": 2.656456232070923, + "learning_rate": 2e-05, + "loss": 0.05432518, + "step": 13763 + }, + { + "epoch": 27.528, + "grad_norm": 2.7838237285614014, + "learning_rate": 2e-05, + "loss": 0.03519111, + "step": 13764 + }, + { + "epoch": 27.53, + "grad_norm": 2.392075777053833, + "learning_rate": 2e-05, + "loss": 0.04325394, + "step": 13765 + }, + { + "epoch": 27.532, + "grad_norm": 1.0115920305252075, + "learning_rate": 2e-05, + "loss": 0.03398691, + "step": 13766 + }, + { + "epoch": 27.534, + "grad_norm": 1.0626614093780518, + "learning_rate": 2e-05, + "loss": 0.03680531, + "step": 13767 + }, + { + "epoch": 27.536, + "grad_norm": 1.1814494132995605, + "learning_rate": 2e-05, + "loss": 
0.0340856, + "step": 13768 + }, + { + "epoch": 27.538, + "grad_norm": 1.5384715795516968, + "learning_rate": 2e-05, + "loss": 0.04477349, + "step": 13769 + }, + { + "epoch": 27.54, + "grad_norm": 1.392616868019104, + "learning_rate": 2e-05, + "loss": 0.03679165, + "step": 13770 + }, + { + "epoch": 27.542, + "grad_norm": 2.3772451877593994, + "learning_rate": 2e-05, + "loss": 0.04677385, + "step": 13771 + }, + { + "epoch": 27.544, + "grad_norm": 2.6153879165649414, + "learning_rate": 2e-05, + "loss": 0.05070829, + "step": 13772 + }, + { + "epoch": 27.546, + "grad_norm": 1.1042912006378174, + "learning_rate": 2e-05, + "loss": 0.03895772, + "step": 13773 + }, + { + "epoch": 27.548000000000002, + "grad_norm": 2.243035316467285, + "learning_rate": 2e-05, + "loss": 0.04948647, + "step": 13774 + }, + { + "epoch": 27.55, + "grad_norm": 1.1408405303955078, + "learning_rate": 2e-05, + "loss": 0.03680218, + "step": 13775 + }, + { + "epoch": 27.552, + "grad_norm": 1.123650074005127, + "learning_rate": 2e-05, + "loss": 0.04027671, + "step": 13776 + }, + { + "epoch": 27.554, + "grad_norm": 1.9606479406356812, + "learning_rate": 2e-05, + "loss": 0.05564614, + "step": 13777 + }, + { + "epoch": 27.556, + "grad_norm": 1.310119867324829, + "learning_rate": 2e-05, + "loss": 0.03973632, + "step": 13778 + }, + { + "epoch": 27.558, + "grad_norm": 1.5927740335464478, + "learning_rate": 2e-05, + "loss": 0.04547199, + "step": 13779 + }, + { + "epoch": 27.56, + "grad_norm": 1.2601191997528076, + "learning_rate": 2e-05, + "loss": 0.05317003, + "step": 13780 + }, + { + "epoch": 27.562, + "grad_norm": 1.647443175315857, + "learning_rate": 2e-05, + "loss": 0.0479614, + "step": 13781 + }, + { + "epoch": 27.564, + "grad_norm": 1.0511468648910522, + "learning_rate": 2e-05, + "loss": 0.03439099, + "step": 13782 + }, + { + "epoch": 27.566, + "grad_norm": 1.2322078943252563, + "learning_rate": 2e-05, + "loss": 0.04574615, + "step": 13783 + }, + { + "epoch": 27.568, + "grad_norm": 1.0778727531433105, + "learning_rate": 2e-05, + "loss": 0.03921001, + "step": 13784 + }, + { + "epoch": 27.57, + "grad_norm": 1.158125877380371, + "learning_rate": 2e-05, + "loss": 0.0469446, + "step": 13785 + }, + { + "epoch": 27.572, + "grad_norm": 1.4670352935791016, + "learning_rate": 2e-05, + "loss": 0.04167302, + "step": 13786 + }, + { + "epoch": 27.574, + "grad_norm": 2.666126251220703, + "learning_rate": 2e-05, + "loss": 0.05114643, + "step": 13787 + }, + { + "epoch": 27.576, + "grad_norm": 1.1133148670196533, + "learning_rate": 2e-05, + "loss": 0.03147897, + "step": 13788 + }, + { + "epoch": 27.578, + "grad_norm": 1.703502893447876, + "learning_rate": 2e-05, + "loss": 0.03852395, + "step": 13789 + }, + { + "epoch": 27.58, + "grad_norm": 1.3457422256469727, + "learning_rate": 2e-05, + "loss": 0.0443656, + "step": 13790 + }, + { + "epoch": 27.582, + "grad_norm": 1.0474896430969238, + "learning_rate": 2e-05, + "loss": 0.02956739, + "step": 13791 + }, + { + "epoch": 27.584, + "grad_norm": 1.2296174764633179, + "learning_rate": 2e-05, + "loss": 0.03405485, + "step": 13792 + }, + { + "epoch": 27.586, + "grad_norm": 0.9992415308952332, + "learning_rate": 2e-05, + "loss": 0.02881895, + "step": 13793 + }, + { + "epoch": 27.588, + "grad_norm": 1.370805263519287, + "learning_rate": 2e-05, + "loss": 0.0559724, + "step": 13794 + }, + { + "epoch": 27.59, + "grad_norm": 2.4789819717407227, + "learning_rate": 2e-05, + "loss": 0.04826446, + "step": 13795 + }, + { + "epoch": 27.592, + "grad_norm": 2.2076499462127686, + "learning_rate": 2e-05, + "loss": 
0.05645463, + "step": 13796 + }, + { + "epoch": 27.594, + "grad_norm": 1.5076375007629395, + "learning_rate": 2e-05, + "loss": 0.04553786, + "step": 13797 + }, + { + "epoch": 27.596, + "grad_norm": 1.0695239305496216, + "learning_rate": 2e-05, + "loss": 0.0458618, + "step": 13798 + }, + { + "epoch": 27.598, + "grad_norm": 1.2451701164245605, + "learning_rate": 2e-05, + "loss": 0.04357987, + "step": 13799 + }, + { + "epoch": 27.6, + "grad_norm": 1.1719753742218018, + "learning_rate": 2e-05, + "loss": 0.05280523, + "step": 13800 + }, + { + "epoch": 27.602, + "grad_norm": 1.9771161079406738, + "learning_rate": 2e-05, + "loss": 0.0490317, + "step": 13801 + }, + { + "epoch": 27.604, + "grad_norm": 1.2304954528808594, + "learning_rate": 2e-05, + "loss": 0.04574135, + "step": 13802 + }, + { + "epoch": 27.606, + "grad_norm": 1.237014651298523, + "learning_rate": 2e-05, + "loss": 0.03605729, + "step": 13803 + }, + { + "epoch": 27.608, + "grad_norm": 3.3194026947021484, + "learning_rate": 2e-05, + "loss": 0.05870149, + "step": 13804 + }, + { + "epoch": 27.61, + "grad_norm": 1.4011256694793701, + "learning_rate": 2e-05, + "loss": 0.03893548, + "step": 13805 + }, + { + "epoch": 27.612, + "grad_norm": 1.604568362236023, + "learning_rate": 2e-05, + "loss": 0.0501047, + "step": 13806 + }, + { + "epoch": 27.614, + "grad_norm": 1.0760208368301392, + "learning_rate": 2e-05, + "loss": 0.03730212, + "step": 13807 + }, + { + "epoch": 27.616, + "grad_norm": 1.54534912109375, + "learning_rate": 2e-05, + "loss": 0.05426643, + "step": 13808 + }, + { + "epoch": 27.618, + "grad_norm": 1.1083625555038452, + "learning_rate": 2e-05, + "loss": 0.03855055, + "step": 13809 + }, + { + "epoch": 27.62, + "grad_norm": 1.462074875831604, + "learning_rate": 2e-05, + "loss": 0.04273522, + "step": 13810 + }, + { + "epoch": 27.622, + "grad_norm": 2.044818878173828, + "learning_rate": 2e-05, + "loss": 0.06233047, + "step": 13811 + }, + { + "epoch": 27.624, + "grad_norm": 1.4091565608978271, + "learning_rate": 2e-05, + "loss": 0.04076818, + "step": 13812 + }, + { + "epoch": 27.626, + "grad_norm": 1.1645406484603882, + "learning_rate": 2e-05, + "loss": 0.0466489, + "step": 13813 + }, + { + "epoch": 27.628, + "grad_norm": 1.4516359567642212, + "learning_rate": 2e-05, + "loss": 0.04340706, + "step": 13814 + }, + { + "epoch": 27.63, + "grad_norm": 1.0174524784088135, + "learning_rate": 2e-05, + "loss": 0.02986944, + "step": 13815 + }, + { + "epoch": 27.632, + "grad_norm": 1.4557236433029175, + "learning_rate": 2e-05, + "loss": 0.04444972, + "step": 13816 + }, + { + "epoch": 27.634, + "grad_norm": 1.2507081031799316, + "learning_rate": 2e-05, + "loss": 0.0360889, + "step": 13817 + }, + { + "epoch": 27.636, + "grad_norm": 2.815284013748169, + "learning_rate": 2e-05, + "loss": 0.04525167, + "step": 13818 + }, + { + "epoch": 27.638, + "grad_norm": 0.9910860061645508, + "learning_rate": 2e-05, + "loss": 0.03063339, + "step": 13819 + }, + { + "epoch": 27.64, + "grad_norm": 2.006213426589966, + "learning_rate": 2e-05, + "loss": 0.04587614, + "step": 13820 + }, + { + "epoch": 27.642, + "grad_norm": 1.6014063358306885, + "learning_rate": 2e-05, + "loss": 0.05192234, + "step": 13821 + }, + { + "epoch": 27.644, + "grad_norm": 1.4815598726272583, + "learning_rate": 2e-05, + "loss": 0.05399399, + "step": 13822 + }, + { + "epoch": 27.646, + "grad_norm": 1.2255171537399292, + "learning_rate": 2e-05, + "loss": 0.04327129, + "step": 13823 + }, + { + "epoch": 27.648, + "grad_norm": 1.0979926586151123, + "learning_rate": 2e-05, + "loss": 0.03967486, + 
"step": 13824 + }, + { + "epoch": 27.65, + "grad_norm": 1.2818236351013184, + "learning_rate": 2e-05, + "loss": 0.05102768, + "step": 13825 + }, + { + "epoch": 27.652, + "grad_norm": 1.1726670265197754, + "learning_rate": 2e-05, + "loss": 0.04293338, + "step": 13826 + }, + { + "epoch": 27.654, + "grad_norm": 1.7466474771499634, + "learning_rate": 2e-05, + "loss": 0.05429044, + "step": 13827 + }, + { + "epoch": 27.656, + "grad_norm": 1.266982078552246, + "learning_rate": 2e-05, + "loss": 0.05010791, + "step": 13828 + }, + { + "epoch": 27.658, + "grad_norm": 1.0520493984222412, + "learning_rate": 2e-05, + "loss": 0.0411777, + "step": 13829 + }, + { + "epoch": 27.66, + "grad_norm": 1.2513072490692139, + "learning_rate": 2e-05, + "loss": 0.0348096, + "step": 13830 + }, + { + "epoch": 27.662, + "grad_norm": 0.8475068211555481, + "learning_rate": 2e-05, + "loss": 0.02205401, + "step": 13831 + }, + { + "epoch": 27.664, + "grad_norm": 1.2510443925857544, + "learning_rate": 2e-05, + "loss": 0.04548548, + "step": 13832 + }, + { + "epoch": 27.666, + "grad_norm": 1.2824064493179321, + "learning_rate": 2e-05, + "loss": 0.03442299, + "step": 13833 + }, + { + "epoch": 27.668, + "grad_norm": 1.1054766178131104, + "learning_rate": 2e-05, + "loss": 0.04257656, + "step": 13834 + }, + { + "epoch": 27.67, + "grad_norm": 1.853498935699463, + "learning_rate": 2e-05, + "loss": 0.05419736, + "step": 13835 + }, + { + "epoch": 27.672, + "grad_norm": 4.41054630279541, + "learning_rate": 2e-05, + "loss": 0.05251537, + "step": 13836 + }, + { + "epoch": 27.674, + "grad_norm": 1.2487452030181885, + "learning_rate": 2e-05, + "loss": 0.04494422, + "step": 13837 + }, + { + "epoch": 27.676, + "grad_norm": 1.9383426904678345, + "learning_rate": 2e-05, + "loss": 0.05443368, + "step": 13838 + }, + { + "epoch": 27.678, + "grad_norm": 1.7748433351516724, + "learning_rate": 2e-05, + "loss": 0.04427961, + "step": 13839 + }, + { + "epoch": 27.68, + "grad_norm": 0.8062875866889954, + "learning_rate": 2e-05, + "loss": 0.02282286, + "step": 13840 + }, + { + "epoch": 27.682, + "grad_norm": 2.103724479675293, + "learning_rate": 2e-05, + "loss": 0.05308349, + "step": 13841 + }, + { + "epoch": 27.684, + "grad_norm": 1.1036003828048706, + "learning_rate": 2e-05, + "loss": 0.0406925, + "step": 13842 + }, + { + "epoch": 27.686, + "grad_norm": 1.1960515975952148, + "learning_rate": 2e-05, + "loss": 0.04112234, + "step": 13843 + }, + { + "epoch": 27.688, + "grad_norm": 1.1153143644332886, + "learning_rate": 2e-05, + "loss": 0.02981465, + "step": 13844 + }, + { + "epoch": 27.69, + "grad_norm": 1.2569992542266846, + "learning_rate": 2e-05, + "loss": 0.0435634, + "step": 13845 + }, + { + "epoch": 27.692, + "grad_norm": 1.384782314300537, + "learning_rate": 2e-05, + "loss": 0.03404148, + "step": 13846 + }, + { + "epoch": 27.694, + "grad_norm": 1.6214118003845215, + "learning_rate": 2e-05, + "loss": 0.04686133, + "step": 13847 + }, + { + "epoch": 27.696, + "grad_norm": 1.5570800304412842, + "learning_rate": 2e-05, + "loss": 0.05804433, + "step": 13848 + }, + { + "epoch": 27.698, + "grad_norm": 1.5227315425872803, + "learning_rate": 2e-05, + "loss": 0.04167802, + "step": 13849 + }, + { + "epoch": 27.7, + "grad_norm": 1.578632116317749, + "learning_rate": 2e-05, + "loss": 0.04537342, + "step": 13850 + }, + { + "epoch": 27.701999999999998, + "grad_norm": 1.34732186794281, + "learning_rate": 2e-05, + "loss": 0.05119549, + "step": 13851 + }, + { + "epoch": 27.704, + "grad_norm": 2.011436939239502, + "learning_rate": 2e-05, + "loss": 0.08780254, + "step": 
13852 + }, + { + "epoch": 27.706, + "grad_norm": 1.0843994617462158, + "learning_rate": 2e-05, + "loss": 0.03971418, + "step": 13853 + }, + { + "epoch": 27.708, + "grad_norm": 1.1688135862350464, + "learning_rate": 2e-05, + "loss": 0.04056138, + "step": 13854 + }, + { + "epoch": 27.71, + "grad_norm": 0.9493677616119385, + "learning_rate": 2e-05, + "loss": 0.03187769, + "step": 13855 + }, + { + "epoch": 27.712, + "grad_norm": 1.0012264251708984, + "learning_rate": 2e-05, + "loss": 0.04027501, + "step": 13856 + }, + { + "epoch": 27.714, + "grad_norm": 0.8952470421791077, + "learning_rate": 2e-05, + "loss": 0.03563171, + "step": 13857 + }, + { + "epoch": 27.716, + "grad_norm": 1.1415109634399414, + "learning_rate": 2e-05, + "loss": 0.03367986, + "step": 13858 + }, + { + "epoch": 27.718, + "grad_norm": 1.085681676864624, + "learning_rate": 2e-05, + "loss": 0.04318651, + "step": 13859 + }, + { + "epoch": 27.72, + "grad_norm": 1.0478479862213135, + "learning_rate": 2e-05, + "loss": 0.03802703, + "step": 13860 + }, + { + "epoch": 27.722, + "grad_norm": 1.5198477506637573, + "learning_rate": 2e-05, + "loss": 0.04132574, + "step": 13861 + }, + { + "epoch": 27.724, + "grad_norm": 1.0840024948120117, + "learning_rate": 2e-05, + "loss": 0.04059497, + "step": 13862 + }, + { + "epoch": 27.726, + "grad_norm": 1.027240514755249, + "learning_rate": 2e-05, + "loss": 0.03974399, + "step": 13863 + }, + { + "epoch": 27.728, + "grad_norm": 1.053519368171692, + "learning_rate": 2e-05, + "loss": 0.03394579, + "step": 13864 + }, + { + "epoch": 27.73, + "grad_norm": 1.8327863216400146, + "learning_rate": 2e-05, + "loss": 0.03920068, + "step": 13865 + }, + { + "epoch": 27.732, + "grad_norm": 2.2196757793426514, + "learning_rate": 2e-05, + "loss": 0.04846443, + "step": 13866 + }, + { + "epoch": 27.734, + "grad_norm": 1.4028538465499878, + "learning_rate": 2e-05, + "loss": 0.04624466, + "step": 13867 + }, + { + "epoch": 27.736, + "grad_norm": 1.165058970451355, + "learning_rate": 2e-05, + "loss": 0.0364709, + "step": 13868 + }, + { + "epoch": 27.738, + "grad_norm": 4.092937469482422, + "learning_rate": 2e-05, + "loss": 0.05190163, + "step": 13869 + }, + { + "epoch": 27.74, + "grad_norm": 1.328935146331787, + "learning_rate": 2e-05, + "loss": 0.05347368, + "step": 13870 + }, + { + "epoch": 27.742, + "grad_norm": 2.0258467197418213, + "learning_rate": 2e-05, + "loss": 0.03681408, + "step": 13871 + }, + { + "epoch": 27.744, + "grad_norm": 1.4326660633087158, + "learning_rate": 2e-05, + "loss": 0.06321809, + "step": 13872 + }, + { + "epoch": 27.746, + "grad_norm": 1.131027340888977, + "learning_rate": 2e-05, + "loss": 0.0508743, + "step": 13873 + }, + { + "epoch": 27.748, + "grad_norm": 1.18997061252594, + "learning_rate": 2e-05, + "loss": 0.03275201, + "step": 13874 + }, + { + "epoch": 27.75, + "grad_norm": 2.4015719890594482, + "learning_rate": 2e-05, + "loss": 0.03576087, + "step": 13875 + }, + { + "epoch": 27.752, + "grad_norm": 2.24355149269104, + "learning_rate": 2e-05, + "loss": 0.05075097, + "step": 13876 + }, + { + "epoch": 27.754, + "grad_norm": 1.1765543222427368, + "learning_rate": 2e-05, + "loss": 0.04541855, + "step": 13877 + }, + { + "epoch": 27.756, + "grad_norm": 1.5028316974639893, + "learning_rate": 2e-05, + "loss": 0.05536481, + "step": 13878 + }, + { + "epoch": 27.758, + "grad_norm": 1.1628483533859253, + "learning_rate": 2e-05, + "loss": 0.04130168, + "step": 13879 + }, + { + "epoch": 27.76, + "grad_norm": 1.752406120300293, + "learning_rate": 2e-05, + "loss": 0.05474552, + "step": 13880 + }, + { + 
"epoch": 27.762, + "grad_norm": 1.0870063304901123, + "learning_rate": 2e-05, + "loss": 0.04335282, + "step": 13881 + }, + { + "epoch": 27.764, + "grad_norm": 2.0513229370117188, + "learning_rate": 2e-05, + "loss": 0.03874574, + "step": 13882 + }, + { + "epoch": 27.766, + "grad_norm": 1.2223379611968994, + "learning_rate": 2e-05, + "loss": 0.05046418, + "step": 13883 + }, + { + "epoch": 27.768, + "grad_norm": 1.059180498123169, + "learning_rate": 2e-05, + "loss": 0.03355616, + "step": 13884 + }, + { + "epoch": 27.77, + "grad_norm": 1.3236563205718994, + "learning_rate": 2e-05, + "loss": 0.0412216, + "step": 13885 + }, + { + "epoch": 27.772, + "grad_norm": 1.560232162475586, + "learning_rate": 2e-05, + "loss": 0.03983791, + "step": 13886 + }, + { + "epoch": 27.774, + "grad_norm": 1.3728487491607666, + "learning_rate": 2e-05, + "loss": 0.04270254, + "step": 13887 + }, + { + "epoch": 27.776, + "grad_norm": 1.7650214433670044, + "learning_rate": 2e-05, + "loss": 0.03499018, + "step": 13888 + }, + { + "epoch": 27.778, + "grad_norm": 1.0585721731185913, + "learning_rate": 2e-05, + "loss": 0.03514803, + "step": 13889 + }, + { + "epoch": 27.78, + "grad_norm": 1.339477300643921, + "learning_rate": 2e-05, + "loss": 0.05417306, + "step": 13890 + }, + { + "epoch": 27.782, + "grad_norm": 1.4370014667510986, + "learning_rate": 2e-05, + "loss": 0.05690622, + "step": 13891 + }, + { + "epoch": 27.784, + "grad_norm": 1.5644543170928955, + "learning_rate": 2e-05, + "loss": 0.04291898, + "step": 13892 + }, + { + "epoch": 27.786, + "grad_norm": 1.0327696800231934, + "learning_rate": 2e-05, + "loss": 0.03720927, + "step": 13893 + }, + { + "epoch": 27.788, + "grad_norm": 1.0730547904968262, + "learning_rate": 2e-05, + "loss": 0.0368498, + "step": 13894 + }, + { + "epoch": 27.79, + "grad_norm": 1.1899945735931396, + "learning_rate": 2e-05, + "loss": 0.04839388, + "step": 13895 + }, + { + "epoch": 27.792, + "grad_norm": 1.9402941465377808, + "learning_rate": 2e-05, + "loss": 0.05492554, + "step": 13896 + }, + { + "epoch": 27.794, + "grad_norm": 1.4402939081192017, + "learning_rate": 2e-05, + "loss": 0.05447104, + "step": 13897 + }, + { + "epoch": 27.796, + "grad_norm": 2.334462881088257, + "learning_rate": 2e-05, + "loss": 0.05614328, + "step": 13898 + }, + { + "epoch": 27.798000000000002, + "grad_norm": 1.5610952377319336, + "learning_rate": 2e-05, + "loss": 0.05484813, + "step": 13899 + }, + { + "epoch": 27.8, + "grad_norm": 2.178652763366699, + "learning_rate": 2e-05, + "loss": 0.06258977, + "step": 13900 + }, + { + "epoch": 27.802, + "grad_norm": 1.075310468673706, + "learning_rate": 2e-05, + "loss": 0.03375444, + "step": 13901 + }, + { + "epoch": 27.804, + "grad_norm": 1.107140064239502, + "learning_rate": 2e-05, + "loss": 0.03990953, + "step": 13902 + }, + { + "epoch": 27.806, + "grad_norm": 1.2043249607086182, + "learning_rate": 2e-05, + "loss": 0.05589031, + "step": 13903 + }, + { + "epoch": 27.808, + "grad_norm": 3.252065658569336, + "learning_rate": 2e-05, + "loss": 0.04488713, + "step": 13904 + }, + { + "epoch": 27.81, + "grad_norm": 1.4825037717819214, + "learning_rate": 2e-05, + "loss": 0.04118239, + "step": 13905 + }, + { + "epoch": 27.812, + "grad_norm": 1.1587483882904053, + "learning_rate": 2e-05, + "loss": 0.03698997, + "step": 13906 + }, + { + "epoch": 27.814, + "grad_norm": 1.0998228788375854, + "learning_rate": 2e-05, + "loss": 0.03384409, + "step": 13907 + }, + { + "epoch": 27.816, + "grad_norm": 1.2784723043441772, + "learning_rate": 2e-05, + "loss": 0.04363027, + "step": 13908 + }, + { + 
"epoch": 27.818, + "grad_norm": 1.6805578470230103, + "learning_rate": 2e-05, + "loss": 0.05093968, + "step": 13909 + }, + { + "epoch": 27.82, + "grad_norm": 4.108209609985352, + "learning_rate": 2e-05, + "loss": 0.04699673, + "step": 13910 + }, + { + "epoch": 27.822, + "grad_norm": 1.0773557424545288, + "learning_rate": 2e-05, + "loss": 0.03354843, + "step": 13911 + }, + { + "epoch": 27.824, + "grad_norm": 2.1529176235198975, + "learning_rate": 2e-05, + "loss": 0.05770917, + "step": 13912 + }, + { + "epoch": 27.826, + "grad_norm": 1.2121992111206055, + "learning_rate": 2e-05, + "loss": 0.04613957, + "step": 13913 + }, + { + "epoch": 27.828, + "grad_norm": 1.3433780670166016, + "learning_rate": 2e-05, + "loss": 0.06019507, + "step": 13914 + }, + { + "epoch": 27.83, + "grad_norm": 1.4100244045257568, + "learning_rate": 2e-05, + "loss": 0.04316228, + "step": 13915 + }, + { + "epoch": 27.832, + "grad_norm": 2.21895694732666, + "learning_rate": 2e-05, + "loss": 0.04332146, + "step": 13916 + }, + { + "epoch": 27.834, + "grad_norm": 1.132088303565979, + "learning_rate": 2e-05, + "loss": 0.03310034, + "step": 13917 + }, + { + "epoch": 27.836, + "grad_norm": 0.9991299510002136, + "learning_rate": 2e-05, + "loss": 0.03843431, + "step": 13918 + }, + { + "epoch": 27.838, + "grad_norm": 1.3979414701461792, + "learning_rate": 2e-05, + "loss": 0.04863296, + "step": 13919 + }, + { + "epoch": 27.84, + "grad_norm": 1.2485167980194092, + "learning_rate": 2e-05, + "loss": 0.03016427, + "step": 13920 + }, + { + "epoch": 27.842, + "grad_norm": 2.4594881534576416, + "learning_rate": 2e-05, + "loss": 0.0523196, + "step": 13921 + }, + { + "epoch": 27.844, + "grad_norm": 1.169227957725525, + "learning_rate": 2e-05, + "loss": 0.04624384, + "step": 13922 + }, + { + "epoch": 27.846, + "grad_norm": 1.015640377998352, + "learning_rate": 2e-05, + "loss": 0.03382251, + "step": 13923 + }, + { + "epoch": 27.848, + "grad_norm": 1.3113818168640137, + "learning_rate": 2e-05, + "loss": 0.05841094, + "step": 13924 + }, + { + "epoch": 27.85, + "grad_norm": 1.174110770225525, + "learning_rate": 2e-05, + "loss": 0.03625581, + "step": 13925 + }, + { + "epoch": 27.852, + "grad_norm": 1.187264084815979, + "learning_rate": 2e-05, + "loss": 0.04166123, + "step": 13926 + }, + { + "epoch": 27.854, + "grad_norm": 1.0158615112304688, + "learning_rate": 2e-05, + "loss": 0.0311013, + "step": 13927 + }, + { + "epoch": 27.856, + "grad_norm": 1.158115267753601, + "learning_rate": 2e-05, + "loss": 0.03714382, + "step": 13928 + }, + { + "epoch": 27.858, + "grad_norm": 1.1665765047073364, + "learning_rate": 2e-05, + "loss": 0.04791439, + "step": 13929 + }, + { + "epoch": 27.86, + "grad_norm": 1.0975072383880615, + "learning_rate": 2e-05, + "loss": 0.05305796, + "step": 13930 + }, + { + "epoch": 27.862, + "grad_norm": 1.2020257711410522, + "learning_rate": 2e-05, + "loss": 0.03673447, + "step": 13931 + }, + { + "epoch": 27.864, + "grad_norm": 1.7321451902389526, + "learning_rate": 2e-05, + "loss": 0.03766854, + "step": 13932 + }, + { + "epoch": 27.866, + "grad_norm": 1.0557488203048706, + "learning_rate": 2e-05, + "loss": 0.03491737, + "step": 13933 + }, + { + "epoch": 27.868, + "grad_norm": 1.1876567602157593, + "learning_rate": 2e-05, + "loss": 0.02960133, + "step": 13934 + }, + { + "epoch": 27.87, + "grad_norm": 1.1305729150772095, + "learning_rate": 2e-05, + "loss": 0.02485805, + "step": 13935 + }, + { + "epoch": 27.872, + "grad_norm": 1.0962995290756226, + "learning_rate": 2e-05, + "loss": 0.05305071, + "step": 13936 + }, + { + "epoch": 
27.874, + "grad_norm": 1.6864302158355713, + "learning_rate": 2e-05, + "loss": 0.05625251, + "step": 13937 + }, + { + "epoch": 27.876, + "grad_norm": 1.1173137426376343, + "learning_rate": 2e-05, + "loss": 0.04091568, + "step": 13938 + }, + { + "epoch": 27.878, + "grad_norm": 1.3211218118667603, + "learning_rate": 2e-05, + "loss": 0.05462737, + "step": 13939 + }, + { + "epoch": 27.88, + "grad_norm": 2.1169698238372803, + "learning_rate": 2e-05, + "loss": 0.07160946, + "step": 13940 + }, + { + "epoch": 27.882, + "grad_norm": 2.254796028137207, + "learning_rate": 2e-05, + "loss": 0.05478616, + "step": 13941 + }, + { + "epoch": 27.884, + "grad_norm": 1.0552836656570435, + "learning_rate": 2e-05, + "loss": 0.0460465, + "step": 13942 + }, + { + "epoch": 27.886, + "grad_norm": 1.2649129629135132, + "learning_rate": 2e-05, + "loss": 0.05034371, + "step": 13943 + }, + { + "epoch": 27.888, + "grad_norm": 1.4517078399658203, + "learning_rate": 2e-05, + "loss": 0.05885756, + "step": 13944 + }, + { + "epoch": 27.89, + "grad_norm": 1.3323506116867065, + "learning_rate": 2e-05, + "loss": 0.03592032, + "step": 13945 + }, + { + "epoch": 27.892, + "grad_norm": 1.004231333732605, + "learning_rate": 2e-05, + "loss": 0.03064884, + "step": 13946 + }, + { + "epoch": 27.894, + "grad_norm": 3.4080281257629395, + "learning_rate": 2e-05, + "loss": 0.04444969, + "step": 13947 + }, + { + "epoch": 27.896, + "grad_norm": 1.0597387552261353, + "learning_rate": 2e-05, + "loss": 0.05533614, + "step": 13948 + }, + { + "epoch": 27.898, + "grad_norm": 3.37524151802063, + "learning_rate": 2e-05, + "loss": 0.03681152, + "step": 13949 + }, + { + "epoch": 27.9, + "grad_norm": 1.2154083251953125, + "learning_rate": 2e-05, + "loss": 0.04092389, + "step": 13950 + }, + { + "epoch": 27.902, + "grad_norm": 1.1490893363952637, + "learning_rate": 2e-05, + "loss": 0.04752996, + "step": 13951 + }, + { + "epoch": 27.904, + "grad_norm": 2.7081265449523926, + "learning_rate": 2e-05, + "loss": 0.07245349, + "step": 13952 + }, + { + "epoch": 27.906, + "grad_norm": 0.9910997748374939, + "learning_rate": 2e-05, + "loss": 0.03678518, + "step": 13953 + }, + { + "epoch": 27.908, + "grad_norm": 1.374592900276184, + "learning_rate": 2e-05, + "loss": 0.05117041, + "step": 13954 + }, + { + "epoch": 27.91, + "grad_norm": 1.7255548238754272, + "learning_rate": 2e-05, + "loss": 0.04001204, + "step": 13955 + }, + { + "epoch": 27.912, + "grad_norm": 1.196602463722229, + "learning_rate": 2e-05, + "loss": 0.02974688, + "step": 13956 + }, + { + "epoch": 27.914, + "grad_norm": 1.4762085676193237, + "learning_rate": 2e-05, + "loss": 0.04770838, + "step": 13957 + }, + { + "epoch": 27.916, + "grad_norm": 1.137497067451477, + "learning_rate": 2e-05, + "loss": 0.04189302, + "step": 13958 + }, + { + "epoch": 27.918, + "grad_norm": 1.2305675745010376, + "learning_rate": 2e-05, + "loss": 0.03438773, + "step": 13959 + }, + { + "epoch": 27.92, + "grad_norm": 1.2307065725326538, + "learning_rate": 2e-05, + "loss": 0.0470115, + "step": 13960 + }, + { + "epoch": 27.922, + "grad_norm": 1.3657495975494385, + "learning_rate": 2e-05, + "loss": 0.04345737, + "step": 13961 + }, + { + "epoch": 27.924, + "grad_norm": 1.3622334003448486, + "learning_rate": 2e-05, + "loss": 0.04275557, + "step": 13962 + }, + { + "epoch": 27.926, + "grad_norm": 1.8857320547103882, + "learning_rate": 2e-05, + "loss": 0.04838826, + "step": 13963 + }, + { + "epoch": 27.928, + "grad_norm": 1.1448822021484375, + "learning_rate": 2e-05, + "loss": 0.04292282, + "step": 13964 + }, + { + "epoch": 27.93, + 
"grad_norm": 1.3071253299713135, + "learning_rate": 2e-05, + "loss": 0.04572289, + "step": 13965 + }, + { + "epoch": 27.932, + "grad_norm": 1.3378077745437622, + "learning_rate": 2e-05, + "loss": 0.04263856, + "step": 13966 + }, + { + "epoch": 27.934, + "grad_norm": 1.6963642835617065, + "learning_rate": 2e-05, + "loss": 0.05341676, + "step": 13967 + }, + { + "epoch": 27.936, + "grad_norm": 3.9375112056732178, + "learning_rate": 2e-05, + "loss": 0.05422161, + "step": 13968 + }, + { + "epoch": 27.938, + "grad_norm": 1.3592469692230225, + "learning_rate": 2e-05, + "loss": 0.06483057, + "step": 13969 + }, + { + "epoch": 27.94, + "grad_norm": 1.460677981376648, + "learning_rate": 2e-05, + "loss": 0.05142547, + "step": 13970 + }, + { + "epoch": 27.942, + "grad_norm": 1.2273732423782349, + "learning_rate": 2e-05, + "loss": 0.03599922, + "step": 13971 + }, + { + "epoch": 27.944, + "grad_norm": 1.2938038110733032, + "learning_rate": 2e-05, + "loss": 0.04321476, + "step": 13972 + }, + { + "epoch": 27.946, + "grad_norm": 1.007976770401001, + "learning_rate": 2e-05, + "loss": 0.04248318, + "step": 13973 + }, + { + "epoch": 27.948, + "grad_norm": 2.465160846710205, + "learning_rate": 2e-05, + "loss": 0.04337021, + "step": 13974 + }, + { + "epoch": 27.95, + "grad_norm": 3.6507928371429443, + "learning_rate": 2e-05, + "loss": 0.06409168, + "step": 13975 + }, + { + "epoch": 27.951999999999998, + "grad_norm": 1.0644190311431885, + "learning_rate": 2e-05, + "loss": 0.04959762, + "step": 13976 + }, + { + "epoch": 27.954, + "grad_norm": 1.2026413679122925, + "learning_rate": 2e-05, + "loss": 0.04680315, + "step": 13977 + }, + { + "epoch": 27.956, + "grad_norm": 0.87662672996521, + "learning_rate": 2e-05, + "loss": 0.03997586, + "step": 13978 + }, + { + "epoch": 27.958, + "grad_norm": 1.1587470769882202, + "learning_rate": 2e-05, + "loss": 0.0403148, + "step": 13979 + }, + { + "epoch": 27.96, + "grad_norm": 1.5906647443771362, + "learning_rate": 2e-05, + "loss": 0.05821125, + "step": 13980 + }, + { + "epoch": 27.962, + "grad_norm": 2.6488704681396484, + "learning_rate": 2e-05, + "loss": 0.05198992, + "step": 13981 + }, + { + "epoch": 27.964, + "grad_norm": 1.068997859954834, + "learning_rate": 2e-05, + "loss": 0.02909309, + "step": 13982 + }, + { + "epoch": 27.966, + "grad_norm": 1.0853064060211182, + "learning_rate": 2e-05, + "loss": 0.03459357, + "step": 13983 + }, + { + "epoch": 27.968, + "grad_norm": 1.227805733680725, + "learning_rate": 2e-05, + "loss": 0.04584693, + "step": 13984 + }, + { + "epoch": 27.97, + "grad_norm": 1.665924072265625, + "learning_rate": 2e-05, + "loss": 0.04428935, + "step": 13985 + }, + { + "epoch": 27.972, + "grad_norm": 1.8217002153396606, + "learning_rate": 2e-05, + "loss": 0.04603599, + "step": 13986 + }, + { + "epoch": 27.974, + "grad_norm": 1.1141926050186157, + "learning_rate": 2e-05, + "loss": 0.03780417, + "step": 13987 + }, + { + "epoch": 27.976, + "grad_norm": 1.0562472343444824, + "learning_rate": 2e-05, + "loss": 0.04129979, + "step": 13988 + }, + { + "epoch": 27.978, + "grad_norm": 1.1564449071884155, + "learning_rate": 2e-05, + "loss": 0.05021477, + "step": 13989 + }, + { + "epoch": 27.98, + "grad_norm": 1.0061001777648926, + "learning_rate": 2e-05, + "loss": 0.03430205, + "step": 13990 + }, + { + "epoch": 27.982, + "grad_norm": 3.191288709640503, + "learning_rate": 2e-05, + "loss": 0.04336466, + "step": 13991 + }, + { + "epoch": 27.984, + "grad_norm": 1.3708455562591553, + "learning_rate": 2e-05, + "loss": 0.03357625, + "step": 13992 + }, + { + "epoch": 27.986, + 
"grad_norm": 1.410815954208374, + "learning_rate": 2e-05, + "loss": 0.03761749, + "step": 13993 + }, + { + "epoch": 27.988, + "grad_norm": 1.0445529222488403, + "learning_rate": 2e-05, + "loss": 0.03375229, + "step": 13994 + }, + { + "epoch": 27.99, + "grad_norm": 1.9780248403549194, + "learning_rate": 2e-05, + "loss": 0.05313461, + "step": 13995 + }, + { + "epoch": 27.992, + "grad_norm": 1.082139492034912, + "learning_rate": 2e-05, + "loss": 0.04322237, + "step": 13996 + }, + { + "epoch": 27.994, + "grad_norm": 1.701316475868225, + "learning_rate": 2e-05, + "loss": 0.04349414, + "step": 13997 + }, + { + "epoch": 27.996, + "grad_norm": 1.5152575969696045, + "learning_rate": 2e-05, + "loss": 0.04271465, + "step": 13998 + }, + { + "epoch": 27.998, + "grad_norm": 1.3154436349868774, + "learning_rate": 2e-05, + "loss": 0.04630833, + "step": 13999 + }, + { + "epoch": 28.0, + "grad_norm": 1.4491382837295532, + "learning_rate": 2e-05, + "loss": 0.05390036, + "step": 14000 + }, + { + "epoch": 28.0, + "eval_performance": { + "AngleClassification_1": 1.0, + "AngleClassification_2": 1.0, + "AngleClassification_3": 0.9880239520958084, + "Equal_1": 0.998, + "Equal_2": 0.9820359281437125, + "Equal_3": 0.9840319361277445, + "LineComparison_1": 1.0, + "LineComparison_2": 1.0, + "LineComparison_3": 0.9940119760479041, + "Parallel_1": 0.9939879759519038, + "Parallel_2": 0.9979959919839679, + "Parallel_3": 0.994, + "Perpendicular_1": 1.0, + "Perpendicular_2": 0.988, + "Perpendicular_3": 0.8677354709418837, + "PointLiesOnCircle_1": 1.0, + "PointLiesOnCircle_2": 0.998, + "PointLiesOnCircle_3": 0.9876666666666667, + "PointLiesOnLine_1": 0.9979959919839679, + "PointLiesOnLine_2": 0.9959919839679359, + "PointLiesOnLine_3": 0.9860279441117764 + }, + "eval_runtime": 319.0556, + "eval_samples_per_second": 32.91, + "eval_steps_per_second": 0.658, + "step": 14000 + }, + { + "epoch": 28.002, + "grad_norm": 1.4470635652542114, + "learning_rate": 2e-05, + "loss": 0.03747822, + "step": 14001 + }, + { + "epoch": 28.004, + "grad_norm": 1.0949195623397827, + "learning_rate": 2e-05, + "loss": 0.04181771, + "step": 14002 + }, + { + "epoch": 28.006, + "grad_norm": 1.7039893865585327, + "learning_rate": 2e-05, + "loss": 0.04626131, + "step": 14003 + }, + { + "epoch": 28.008, + "grad_norm": 2.289400100708008, + "learning_rate": 2e-05, + "loss": 0.05722143, + "step": 14004 + }, + { + "epoch": 28.01, + "grad_norm": 1.7248917818069458, + "learning_rate": 2e-05, + "loss": 0.04162604, + "step": 14005 + }, + { + "epoch": 28.012, + "grad_norm": 1.235735297203064, + "learning_rate": 2e-05, + "loss": 0.05281135, + "step": 14006 + }, + { + "epoch": 28.014, + "grad_norm": 1.0774388313293457, + "learning_rate": 2e-05, + "loss": 0.03179125, + "step": 14007 + }, + { + "epoch": 28.016, + "grad_norm": 1.8190655708312988, + "learning_rate": 2e-05, + "loss": 0.04784362, + "step": 14008 + }, + { + "epoch": 28.018, + "grad_norm": 1.644675612449646, + "learning_rate": 2e-05, + "loss": 0.04483693, + "step": 14009 + }, + { + "epoch": 28.02, + "grad_norm": 1.27074134349823, + "learning_rate": 2e-05, + "loss": 0.05767696, + "step": 14010 + }, + { + "epoch": 28.022, + "grad_norm": 1.3586097955703735, + "learning_rate": 2e-05, + "loss": 0.03288449, + "step": 14011 + }, + { + "epoch": 28.024, + "grad_norm": 0.9760823845863342, + "learning_rate": 2e-05, + "loss": 0.03223011, + "step": 14012 + }, + { + "epoch": 28.026, + "grad_norm": 1.9947422742843628, + "learning_rate": 2e-05, + "loss": 0.06774095, + "step": 14013 + }, + { + "epoch": 28.028, + "grad_norm": 
1.3853563070297241, + "learning_rate": 2e-05, + "loss": 0.03108654, + "step": 14014 + }, + { + "epoch": 28.03, + "grad_norm": 1.166358470916748, + "learning_rate": 2e-05, + "loss": 0.04371829, + "step": 14015 + }, + { + "epoch": 28.032, + "grad_norm": 1.3257215023040771, + "learning_rate": 2e-05, + "loss": 0.05647626, + "step": 14016 + }, + { + "epoch": 28.034, + "grad_norm": 1.8312715291976929, + "learning_rate": 2e-05, + "loss": 0.04174164, + "step": 14017 + }, + { + "epoch": 28.036, + "grad_norm": 1.1713063716888428, + "learning_rate": 2e-05, + "loss": 0.04272055, + "step": 14018 + }, + { + "epoch": 28.038, + "grad_norm": 1.022841453552246, + "learning_rate": 2e-05, + "loss": 0.04106878, + "step": 14019 + }, + { + "epoch": 28.04, + "grad_norm": 1.123375654220581, + "learning_rate": 2e-05, + "loss": 0.04132652, + "step": 14020 + }, + { + "epoch": 28.042, + "grad_norm": 0.9225612878799438, + "learning_rate": 2e-05, + "loss": 0.03926476, + "step": 14021 + }, + { + "epoch": 28.044, + "grad_norm": 1.3327795267105103, + "learning_rate": 2e-05, + "loss": 0.04138635, + "step": 14022 + }, + { + "epoch": 28.046, + "grad_norm": 1.0123158693313599, + "learning_rate": 2e-05, + "loss": 0.03201216, + "step": 14023 + }, + { + "epoch": 28.048, + "grad_norm": 1.0678902864456177, + "learning_rate": 2e-05, + "loss": 0.03800803, + "step": 14024 + }, + { + "epoch": 28.05, + "grad_norm": 1.6009563207626343, + "learning_rate": 2e-05, + "loss": 0.05565003, + "step": 14025 + }, + { + "epoch": 28.052, + "grad_norm": 1.7109339237213135, + "learning_rate": 2e-05, + "loss": 0.03978261, + "step": 14026 + }, + { + "epoch": 28.054, + "grad_norm": 0.9139453172683716, + "learning_rate": 2e-05, + "loss": 0.02852735, + "step": 14027 + }, + { + "epoch": 28.056, + "grad_norm": 1.2070707082748413, + "learning_rate": 2e-05, + "loss": 0.04159069, + "step": 14028 + }, + { + "epoch": 28.058, + "grad_norm": 1.1406387090682983, + "learning_rate": 2e-05, + "loss": 0.03497103, + "step": 14029 + }, + { + "epoch": 28.06, + "grad_norm": 1.5811642408370972, + "learning_rate": 2e-05, + "loss": 0.04424217, + "step": 14030 + }, + { + "epoch": 28.062, + "grad_norm": 2.008183479309082, + "learning_rate": 2e-05, + "loss": 0.0522799, + "step": 14031 + }, + { + "epoch": 28.064, + "grad_norm": 1.3083295822143555, + "learning_rate": 2e-05, + "loss": 0.04245078, + "step": 14032 + }, + { + "epoch": 28.066, + "grad_norm": 1.156947135925293, + "learning_rate": 2e-05, + "loss": 0.04440115, + "step": 14033 + }, + { + "epoch": 28.068, + "grad_norm": 1.2095609903335571, + "learning_rate": 2e-05, + "loss": 0.0388485, + "step": 14034 + }, + { + "epoch": 28.07, + "grad_norm": 1.2254390716552734, + "learning_rate": 2e-05, + "loss": 0.03347994, + "step": 14035 + }, + { + "epoch": 28.072, + "grad_norm": 1.1406832933425903, + "learning_rate": 2e-05, + "loss": 0.04151241, + "step": 14036 + }, + { + "epoch": 28.074, + "grad_norm": 1.8342012166976929, + "learning_rate": 2e-05, + "loss": 0.0339776, + "step": 14037 + }, + { + "epoch": 28.076, + "grad_norm": 1.5849714279174805, + "learning_rate": 2e-05, + "loss": 0.04472189, + "step": 14038 + }, + { + "epoch": 28.078, + "grad_norm": 1.7151453495025635, + "learning_rate": 2e-05, + "loss": 0.04204217, + "step": 14039 + }, + { + "epoch": 28.08, + "grad_norm": 1.2614927291870117, + "learning_rate": 2e-05, + "loss": 0.04573628, + "step": 14040 + }, + { + "epoch": 28.082, + "grad_norm": 0.9816139936447144, + "learning_rate": 2e-05, + "loss": 0.04488077, + "step": 14041 + }, + { + "epoch": 28.084, + "grad_norm": 
1.832466959953308, + "learning_rate": 2e-05, + "loss": 0.03606717, + "step": 14042 + }, + { + "epoch": 28.086, + "grad_norm": 1.1039230823516846, + "learning_rate": 2e-05, + "loss": 0.03455424, + "step": 14043 + }, + { + "epoch": 28.088, + "grad_norm": 1.3150477409362793, + "learning_rate": 2e-05, + "loss": 0.03564423, + "step": 14044 + }, + { + "epoch": 28.09, + "grad_norm": 1.4359805583953857, + "learning_rate": 2e-05, + "loss": 0.04819302, + "step": 14045 + }, + { + "epoch": 28.092, + "grad_norm": 1.6399556398391724, + "learning_rate": 2e-05, + "loss": 0.06461568, + "step": 14046 + }, + { + "epoch": 28.094, + "grad_norm": 1.261181116104126, + "learning_rate": 2e-05, + "loss": 0.03727288, + "step": 14047 + }, + { + "epoch": 28.096, + "grad_norm": 1.3191063404083252, + "learning_rate": 2e-05, + "loss": 0.05051003, + "step": 14048 + }, + { + "epoch": 28.098, + "grad_norm": 2.9893617630004883, + "learning_rate": 2e-05, + "loss": 0.04233899, + "step": 14049 + }, + { + "epoch": 28.1, + "grad_norm": 1.1125555038452148, + "learning_rate": 2e-05, + "loss": 0.03035634, + "step": 14050 + }, + { + "epoch": 28.102, + "grad_norm": 1.1988699436187744, + "learning_rate": 2e-05, + "loss": 0.0434362, + "step": 14051 + }, + { + "epoch": 28.104, + "grad_norm": 1.1385446786880493, + "learning_rate": 2e-05, + "loss": 0.04272396, + "step": 14052 + }, + { + "epoch": 28.106, + "grad_norm": 1.5576671361923218, + "learning_rate": 2e-05, + "loss": 0.04542271, + "step": 14053 + }, + { + "epoch": 28.108, + "grad_norm": 0.9911758303642273, + "learning_rate": 2e-05, + "loss": 0.02662791, + "step": 14054 + }, + { + "epoch": 28.11, + "grad_norm": 1.0654523372650146, + "learning_rate": 2e-05, + "loss": 0.02663458, + "step": 14055 + }, + { + "epoch": 28.112, + "grad_norm": 2.447866916656494, + "learning_rate": 2e-05, + "loss": 0.06156901, + "step": 14056 + }, + { + "epoch": 28.114, + "grad_norm": 1.5394794940948486, + "learning_rate": 2e-05, + "loss": 0.03734413, + "step": 14057 + }, + { + "epoch": 28.116, + "grad_norm": 1.2493864297866821, + "learning_rate": 2e-05, + "loss": 0.04325924, + "step": 14058 + }, + { + "epoch": 28.118, + "grad_norm": 1.0674172639846802, + "learning_rate": 2e-05, + "loss": 0.03373833, + "step": 14059 + }, + { + "epoch": 28.12, + "grad_norm": 1.6935789585113525, + "learning_rate": 2e-05, + "loss": 0.06536911, + "step": 14060 + }, + { + "epoch": 28.122, + "grad_norm": 1.346823811531067, + "learning_rate": 2e-05, + "loss": 0.03599031, + "step": 14061 + }, + { + "epoch": 28.124, + "grad_norm": 1.2584784030914307, + "learning_rate": 2e-05, + "loss": 0.04426415, + "step": 14062 + }, + { + "epoch": 28.126, + "grad_norm": 1.5686087608337402, + "learning_rate": 2e-05, + "loss": 0.03532685, + "step": 14063 + }, + { + "epoch": 28.128, + "grad_norm": 1.0472480058670044, + "learning_rate": 2e-05, + "loss": 0.02852079, + "step": 14064 + }, + { + "epoch": 28.13, + "grad_norm": 1.4226044416427612, + "learning_rate": 2e-05, + "loss": 0.05019859, + "step": 14065 + }, + { + "epoch": 28.132, + "grad_norm": 1.4346106052398682, + "learning_rate": 2e-05, + "loss": 0.05388649, + "step": 14066 + }, + { + "epoch": 28.134, + "grad_norm": 2.281315803527832, + "learning_rate": 2e-05, + "loss": 0.0545974, + "step": 14067 + }, + { + "epoch": 28.136, + "grad_norm": 1.2159454822540283, + "learning_rate": 2e-05, + "loss": 0.03459051, + "step": 14068 + }, + { + "epoch": 28.138, + "grad_norm": 1.2951035499572754, + "learning_rate": 2e-05, + "loss": 0.0595167, + "step": 14069 + }, + { + "epoch": 28.14, + "grad_norm": 
1.1814157962799072, + "learning_rate": 2e-05, + "loss": 0.03543475, + "step": 14070 + }, + { + "epoch": 28.142, + "grad_norm": 1.0047551393508911, + "learning_rate": 2e-05, + "loss": 0.03344669, + "step": 14071 + }, + { + "epoch": 28.144, + "grad_norm": 0.9211133718490601, + "learning_rate": 2e-05, + "loss": 0.03054952, + "step": 14072 + }, + { + "epoch": 28.146, + "grad_norm": 0.922265887260437, + "learning_rate": 2e-05, + "loss": 0.03004513, + "step": 14073 + }, + { + "epoch": 28.148, + "grad_norm": 1.5452007055282593, + "learning_rate": 2e-05, + "loss": 0.0379502, + "step": 14074 + }, + { + "epoch": 28.15, + "grad_norm": 1.1936745643615723, + "learning_rate": 2e-05, + "loss": 0.0383325, + "step": 14075 + }, + { + "epoch": 28.152, + "grad_norm": 0.9541253447532654, + "learning_rate": 2e-05, + "loss": 0.03261286, + "step": 14076 + }, + { + "epoch": 28.154, + "grad_norm": 1.3197306394577026, + "learning_rate": 2e-05, + "loss": 0.05566575, + "step": 14077 + }, + { + "epoch": 28.156, + "grad_norm": 1.5139483213424683, + "learning_rate": 2e-05, + "loss": 0.04211396, + "step": 14078 + }, + { + "epoch": 28.158, + "grad_norm": 2.144613742828369, + "learning_rate": 2e-05, + "loss": 0.05478022, + "step": 14079 + }, + { + "epoch": 28.16, + "grad_norm": 1.2877733707427979, + "learning_rate": 2e-05, + "loss": 0.04739144, + "step": 14080 + }, + { + "epoch": 28.162, + "grad_norm": 0.9685249924659729, + "learning_rate": 2e-05, + "loss": 0.02751518, + "step": 14081 + }, + { + "epoch": 28.164, + "grad_norm": 1.1108578443527222, + "learning_rate": 2e-05, + "loss": 0.04293613, + "step": 14082 + }, + { + "epoch": 28.166, + "grad_norm": 0.9321027994155884, + "learning_rate": 2e-05, + "loss": 0.03453288, + "step": 14083 + }, + { + "epoch": 28.168, + "grad_norm": 1.8604676723480225, + "learning_rate": 2e-05, + "loss": 0.03813556, + "step": 14084 + }, + { + "epoch": 28.17, + "grad_norm": 1.3579093217849731, + "learning_rate": 2e-05, + "loss": 0.03601374, + "step": 14085 + }, + { + "epoch": 28.172, + "grad_norm": 1.3795456886291504, + "learning_rate": 2e-05, + "loss": 0.05737968, + "step": 14086 + }, + { + "epoch": 28.174, + "grad_norm": 1.3870776891708374, + "learning_rate": 2e-05, + "loss": 0.03918634, + "step": 14087 + }, + { + "epoch": 28.176, + "grad_norm": 1.5775760412216187, + "learning_rate": 2e-05, + "loss": 0.04204413, + "step": 14088 + }, + { + "epoch": 28.178, + "grad_norm": 1.3750708103179932, + "learning_rate": 2e-05, + "loss": 0.03528126, + "step": 14089 + }, + { + "epoch": 28.18, + "grad_norm": 1.1607511043548584, + "learning_rate": 2e-05, + "loss": 0.0513554, + "step": 14090 + }, + { + "epoch": 28.182, + "grad_norm": 1.7518668174743652, + "learning_rate": 2e-05, + "loss": 0.05958167, + "step": 14091 + }, + { + "epoch": 28.184, + "grad_norm": 0.8187534213066101, + "learning_rate": 2e-05, + "loss": 0.02375086, + "step": 14092 + }, + { + "epoch": 28.186, + "grad_norm": 1.1586307287216187, + "learning_rate": 2e-05, + "loss": 0.03455898, + "step": 14093 + }, + { + "epoch": 28.188, + "grad_norm": 2.6694910526275635, + "learning_rate": 2e-05, + "loss": 0.04185054, + "step": 14094 + }, + { + "epoch": 28.19, + "grad_norm": 1.0022153854370117, + "learning_rate": 2e-05, + "loss": 0.03717811, + "step": 14095 + }, + { + "epoch": 28.192, + "grad_norm": 1.133339762687683, + "learning_rate": 2e-05, + "loss": 0.02898293, + "step": 14096 + }, + { + "epoch": 28.194, + "grad_norm": 0.9910487532615662, + "learning_rate": 2e-05, + "loss": 0.02521181, + "step": 14097 + }, + { + "epoch": 28.196, + "grad_norm": 
1.2302407026290894, + "learning_rate": 2e-05, + "loss": 0.04486918, + "step": 14098 + }, + { + "epoch": 28.198, + "grad_norm": 1.398140788078308, + "learning_rate": 2e-05, + "loss": 0.02154693, + "step": 14099 + }, + { + "epoch": 28.2, + "grad_norm": 2.0596702098846436, + "learning_rate": 2e-05, + "loss": 0.03849518, + "step": 14100 + }, + { + "epoch": 28.202, + "grad_norm": 1.3504750728607178, + "learning_rate": 2e-05, + "loss": 0.0484315, + "step": 14101 + }, + { + "epoch": 28.204, + "grad_norm": 1.0688234567642212, + "learning_rate": 2e-05, + "loss": 0.0351585, + "step": 14102 + }, + { + "epoch": 28.206, + "grad_norm": 1.4983340501785278, + "learning_rate": 2e-05, + "loss": 0.04929278, + "step": 14103 + }, + { + "epoch": 28.208, + "grad_norm": 1.1455981731414795, + "learning_rate": 2e-05, + "loss": 0.04318264, + "step": 14104 + }, + { + "epoch": 28.21, + "grad_norm": 1.3201831579208374, + "learning_rate": 2e-05, + "loss": 0.03962881, + "step": 14105 + }, + { + "epoch": 28.212, + "grad_norm": 1.7087925672531128, + "learning_rate": 2e-05, + "loss": 0.05958664, + "step": 14106 + }, + { + "epoch": 28.214, + "grad_norm": 1.799774169921875, + "learning_rate": 2e-05, + "loss": 0.04615483, + "step": 14107 + }, + { + "epoch": 28.216, + "grad_norm": 2.0174641609191895, + "learning_rate": 2e-05, + "loss": 0.03510554, + "step": 14108 + }, + { + "epoch": 28.218, + "grad_norm": 1.7066216468811035, + "learning_rate": 2e-05, + "loss": 0.04433943, + "step": 14109 + }, + { + "epoch": 28.22, + "grad_norm": 1.1175912618637085, + "learning_rate": 2e-05, + "loss": 0.04259259, + "step": 14110 + }, + { + "epoch": 28.222, + "grad_norm": 1.3678927421569824, + "learning_rate": 2e-05, + "loss": 0.0313809, + "step": 14111 + }, + { + "epoch": 28.224, + "grad_norm": 1.330997109413147, + "learning_rate": 2e-05, + "loss": 0.03561566, + "step": 14112 + }, + { + "epoch": 28.226, + "grad_norm": 1.0542383193969727, + "learning_rate": 2e-05, + "loss": 0.03285637, + "step": 14113 + }, + { + "epoch": 28.228, + "grad_norm": 1.6406633853912354, + "learning_rate": 2e-05, + "loss": 0.03652732, + "step": 14114 + }, + { + "epoch": 28.23, + "grad_norm": 1.4692353010177612, + "learning_rate": 2e-05, + "loss": 0.04664152, + "step": 14115 + }, + { + "epoch": 28.232, + "grad_norm": 3.2253143787384033, + "learning_rate": 2e-05, + "loss": 0.052161, + "step": 14116 + }, + { + "epoch": 28.234, + "grad_norm": 1.0887095928192139, + "learning_rate": 2e-05, + "loss": 0.03422359, + "step": 14117 + }, + { + "epoch": 28.236, + "grad_norm": 1.4389132261276245, + "learning_rate": 2e-05, + "loss": 0.0335054, + "step": 14118 + }, + { + "epoch": 28.238, + "grad_norm": 1.1777735948562622, + "learning_rate": 2e-05, + "loss": 0.0480193, + "step": 14119 + }, + { + "epoch": 28.24, + "grad_norm": 1.2068471908569336, + "learning_rate": 2e-05, + "loss": 0.03690059, + "step": 14120 + }, + { + "epoch": 28.242, + "grad_norm": 0.9290897846221924, + "learning_rate": 2e-05, + "loss": 0.03334764, + "step": 14121 + }, + { + "epoch": 28.244, + "grad_norm": 1.0152655839920044, + "learning_rate": 2e-05, + "loss": 0.04105952, + "step": 14122 + }, + { + "epoch": 28.246, + "grad_norm": 1.3126530647277832, + "learning_rate": 2e-05, + "loss": 0.03845873, + "step": 14123 + }, + { + "epoch": 28.248, + "grad_norm": 1.0583008527755737, + "learning_rate": 2e-05, + "loss": 0.02750574, + "step": 14124 + }, + { + "epoch": 28.25, + "grad_norm": 1.3489996194839478, + "learning_rate": 2e-05, + "loss": 0.03019518, + "step": 14125 + }, + { + "epoch": 28.252, + "grad_norm": 
1.9256123304367065, + "learning_rate": 2e-05, + "loss": 0.0618353, + "step": 14126 + }, + { + "epoch": 28.254, + "grad_norm": 1.2744817733764648, + "learning_rate": 2e-05, + "loss": 0.05729286, + "step": 14127 + }, + { + "epoch": 28.256, + "grad_norm": 1.2284598350524902, + "learning_rate": 2e-05, + "loss": 0.0571045, + "step": 14128 + }, + { + "epoch": 28.258, + "grad_norm": 1.3543322086334229, + "learning_rate": 2e-05, + "loss": 0.04329966, + "step": 14129 + }, + { + "epoch": 28.26, + "grad_norm": 1.3076565265655518, + "learning_rate": 2e-05, + "loss": 0.04858845, + "step": 14130 + }, + { + "epoch": 28.262, + "grad_norm": 1.8700453042984009, + "learning_rate": 2e-05, + "loss": 0.06022802, + "step": 14131 + }, + { + "epoch": 28.264, + "grad_norm": 1.6746340990066528, + "learning_rate": 2e-05, + "loss": 0.05681061, + "step": 14132 + }, + { + "epoch": 28.266, + "grad_norm": 1.4257715940475464, + "learning_rate": 2e-05, + "loss": 0.04958214, + "step": 14133 + }, + { + "epoch": 28.268, + "grad_norm": 1.2631217241287231, + "learning_rate": 2e-05, + "loss": 0.03757473, + "step": 14134 + }, + { + "epoch": 28.27, + "grad_norm": 1.4865784645080566, + "learning_rate": 2e-05, + "loss": 0.0356946, + "step": 14135 + }, + { + "epoch": 28.272, + "grad_norm": 1.2895032167434692, + "learning_rate": 2e-05, + "loss": 0.03698609, + "step": 14136 + }, + { + "epoch": 28.274, + "grad_norm": 1.0424330234527588, + "learning_rate": 2e-05, + "loss": 0.03331292, + "step": 14137 + }, + { + "epoch": 28.276, + "grad_norm": 0.9681985378265381, + "learning_rate": 2e-05, + "loss": 0.02686777, + "step": 14138 + }, + { + "epoch": 28.278, + "grad_norm": 1.1451689004898071, + "learning_rate": 2e-05, + "loss": 0.04804129, + "step": 14139 + }, + { + "epoch": 28.28, + "grad_norm": 1.2859004735946655, + "learning_rate": 2e-05, + "loss": 0.03937485, + "step": 14140 + }, + { + "epoch": 28.282, + "grad_norm": 1.9779037237167358, + "learning_rate": 2e-05, + "loss": 0.04771383, + "step": 14141 + }, + { + "epoch": 28.284, + "grad_norm": 1.6972112655639648, + "learning_rate": 2e-05, + "loss": 0.04609143, + "step": 14142 + }, + { + "epoch": 28.286, + "grad_norm": 1.5024956464767456, + "learning_rate": 2e-05, + "loss": 0.04176302, + "step": 14143 + }, + { + "epoch": 28.288, + "grad_norm": 2.7473392486572266, + "learning_rate": 2e-05, + "loss": 0.06857879, + "step": 14144 + }, + { + "epoch": 28.29, + "grad_norm": 1.476321816444397, + "learning_rate": 2e-05, + "loss": 0.04816295, + "step": 14145 + }, + { + "epoch": 28.292, + "grad_norm": 1.9630745649337769, + "learning_rate": 2e-05, + "loss": 0.04113162, + "step": 14146 + }, + { + "epoch": 28.294, + "grad_norm": 1.5762404203414917, + "learning_rate": 2e-05, + "loss": 0.04521403, + "step": 14147 + }, + { + "epoch": 28.296, + "grad_norm": 1.8791242837905884, + "learning_rate": 2e-05, + "loss": 0.05574446, + "step": 14148 + }, + { + "epoch": 28.298, + "grad_norm": 1.3232239484786987, + "learning_rate": 2e-05, + "loss": 0.04592151, + "step": 14149 + }, + { + "epoch": 28.3, + "grad_norm": 1.0860222578048706, + "learning_rate": 2e-05, + "loss": 0.03201285, + "step": 14150 + }, + { + "epoch": 28.302, + "grad_norm": 1.1450574398040771, + "learning_rate": 2e-05, + "loss": 0.0431738, + "step": 14151 + }, + { + "epoch": 28.304, + "grad_norm": 1.2073911428451538, + "learning_rate": 2e-05, + "loss": 0.04106464, + "step": 14152 + }, + { + "epoch": 28.306, + "grad_norm": 2.2250092029571533, + "learning_rate": 2e-05, + "loss": 0.03272331, + "step": 14153 + }, + { + "epoch": 28.308, + "grad_norm": 
1.2592679262161255, + "learning_rate": 2e-05, + "loss": 0.03767652, + "step": 14154 + }, + { + "epoch": 28.31, + "grad_norm": 1.481679081916809, + "learning_rate": 2e-05, + "loss": 0.04322655, + "step": 14155 + }, + { + "epoch": 28.312, + "grad_norm": 1.8476687669754028, + "learning_rate": 2e-05, + "loss": 0.02506692, + "step": 14156 + }, + { + "epoch": 28.314, + "grad_norm": 2.533881664276123, + "learning_rate": 2e-05, + "loss": 0.05798201, + "step": 14157 + }, + { + "epoch": 28.316, + "grad_norm": 1.16340172290802, + "learning_rate": 2e-05, + "loss": 0.04741621, + "step": 14158 + }, + { + "epoch": 28.318, + "grad_norm": 1.0567313432693481, + "learning_rate": 2e-05, + "loss": 0.03934213, + "step": 14159 + }, + { + "epoch": 28.32, + "grad_norm": 1.8486943244934082, + "learning_rate": 2e-05, + "loss": 0.04713926, + "step": 14160 + }, + { + "epoch": 28.322, + "grad_norm": 1.489852786064148, + "learning_rate": 2e-05, + "loss": 0.0400935, + "step": 14161 + }, + { + "epoch": 28.324, + "grad_norm": 3.173983097076416, + "learning_rate": 2e-05, + "loss": 0.05666588, + "step": 14162 + }, + { + "epoch": 28.326, + "grad_norm": 1.2710679769515991, + "learning_rate": 2e-05, + "loss": 0.04324594, + "step": 14163 + }, + { + "epoch": 28.328, + "grad_norm": 2.056770086288452, + "learning_rate": 2e-05, + "loss": 0.0498774, + "step": 14164 + }, + { + "epoch": 28.33, + "grad_norm": 1.194877028465271, + "learning_rate": 2e-05, + "loss": 0.033945, + "step": 14165 + }, + { + "epoch": 28.332, + "grad_norm": 1.5242279767990112, + "learning_rate": 2e-05, + "loss": 0.04730607, + "step": 14166 + }, + { + "epoch": 28.334, + "grad_norm": 1.2096294164657593, + "learning_rate": 2e-05, + "loss": 0.04421799, + "step": 14167 + }, + { + "epoch": 28.336, + "grad_norm": 2.0177338123321533, + "learning_rate": 2e-05, + "loss": 0.05057753, + "step": 14168 + }, + { + "epoch": 28.338, + "grad_norm": 1.4078340530395508, + "learning_rate": 2e-05, + "loss": 0.04942624, + "step": 14169 + }, + { + "epoch": 28.34, + "grad_norm": 2.0044713020324707, + "learning_rate": 2e-05, + "loss": 0.05860762, + "step": 14170 + }, + { + "epoch": 28.342, + "grad_norm": 1.4696686267852783, + "learning_rate": 2e-05, + "loss": 0.05207686, + "step": 14171 + }, + { + "epoch": 28.344, + "grad_norm": 1.4194014072418213, + "learning_rate": 2e-05, + "loss": 0.03678628, + "step": 14172 + }, + { + "epoch": 28.346, + "grad_norm": 1.3729615211486816, + "learning_rate": 2e-05, + "loss": 0.07338053, + "step": 14173 + }, + { + "epoch": 28.348, + "grad_norm": 1.4968407154083252, + "learning_rate": 2e-05, + "loss": 0.05206795, + "step": 14174 + }, + { + "epoch": 28.35, + "grad_norm": 0.9872764348983765, + "learning_rate": 2e-05, + "loss": 0.03436659, + "step": 14175 + }, + { + "epoch": 28.352, + "grad_norm": 1.0700467824935913, + "learning_rate": 2e-05, + "loss": 0.02983767, + "step": 14176 + }, + { + "epoch": 28.354, + "grad_norm": 1.9751871824264526, + "learning_rate": 2e-05, + "loss": 0.04803991, + "step": 14177 + }, + { + "epoch": 28.356, + "grad_norm": 1.7464139461517334, + "learning_rate": 2e-05, + "loss": 0.03985727, + "step": 14178 + }, + { + "epoch": 28.358, + "grad_norm": 1.93489670753479, + "learning_rate": 2e-05, + "loss": 0.03951417, + "step": 14179 + }, + { + "epoch": 28.36, + "grad_norm": 1.8252264261245728, + "learning_rate": 2e-05, + "loss": 0.05627768, + "step": 14180 + }, + { + "epoch": 28.362, + "grad_norm": 1.957201361656189, + "learning_rate": 2e-05, + "loss": 0.03644243, + "step": 14181 + }, + { + "epoch": 28.364, + "grad_norm": 1.244791865348816, 
+ "learning_rate": 2e-05, + "loss": 0.05136299, + "step": 14182 + }, + { + "epoch": 28.366, + "grad_norm": 1.722070336341858, + "learning_rate": 2e-05, + "loss": 0.05756142, + "step": 14183 + }, + { + "epoch": 28.368, + "grad_norm": 1.8835808038711548, + "learning_rate": 2e-05, + "loss": 0.05007068, + "step": 14184 + }, + { + "epoch": 28.37, + "grad_norm": 1.2348777055740356, + "learning_rate": 2e-05, + "loss": 0.03819226, + "step": 14185 + }, + { + "epoch": 28.372, + "grad_norm": 1.257193684577942, + "learning_rate": 2e-05, + "loss": 0.05539931, + "step": 14186 + }, + { + "epoch": 28.374, + "grad_norm": 2.638650894165039, + "learning_rate": 2e-05, + "loss": 0.03770338, + "step": 14187 + }, + { + "epoch": 28.376, + "grad_norm": 1.6504069566726685, + "learning_rate": 2e-05, + "loss": 0.05300485, + "step": 14188 + }, + { + "epoch": 28.378, + "grad_norm": 1.1879500150680542, + "learning_rate": 2e-05, + "loss": 0.04724121, + "step": 14189 + }, + { + "epoch": 28.38, + "grad_norm": 1.1239627599716187, + "learning_rate": 2e-05, + "loss": 0.04006843, + "step": 14190 + }, + { + "epoch": 28.382, + "grad_norm": 1.3962266445159912, + "learning_rate": 2e-05, + "loss": 0.04130918, + "step": 14191 + }, + { + "epoch": 28.384, + "grad_norm": 1.0262913703918457, + "learning_rate": 2e-05, + "loss": 0.03978799, + "step": 14192 + }, + { + "epoch": 28.386, + "grad_norm": 1.3596129417419434, + "learning_rate": 2e-05, + "loss": 0.06046655, + "step": 14193 + }, + { + "epoch": 28.388, + "grad_norm": 1.1571170091629028, + "learning_rate": 2e-05, + "loss": 0.05023224, + "step": 14194 + }, + { + "epoch": 28.39, + "grad_norm": 1.3431873321533203, + "learning_rate": 2e-05, + "loss": 0.05262809, + "step": 14195 + }, + { + "epoch": 28.392, + "grad_norm": 2.0367238521575928, + "learning_rate": 2e-05, + "loss": 0.04556445, + "step": 14196 + }, + { + "epoch": 28.394, + "grad_norm": 1.3604267835617065, + "learning_rate": 2e-05, + "loss": 0.04495424, + "step": 14197 + }, + { + "epoch": 28.396, + "grad_norm": 1.0701876878738403, + "learning_rate": 2e-05, + "loss": 0.03906007, + "step": 14198 + }, + { + "epoch": 28.398, + "grad_norm": 1.1933096647262573, + "learning_rate": 2e-05, + "loss": 0.05860349, + "step": 14199 + }, + { + "epoch": 28.4, + "grad_norm": 1.3504351377487183, + "learning_rate": 2e-05, + "loss": 0.04702557, + "step": 14200 + }, + { + "epoch": 28.402, + "grad_norm": 1.2822675704956055, + "learning_rate": 2e-05, + "loss": 0.03257849, + "step": 14201 + }, + { + "epoch": 28.404, + "grad_norm": 1.6527397632598877, + "learning_rate": 2e-05, + "loss": 0.05858765, + "step": 14202 + }, + { + "epoch": 28.406, + "grad_norm": 1.0486624240875244, + "learning_rate": 2e-05, + "loss": 0.03177363, + "step": 14203 + }, + { + "epoch": 28.408, + "grad_norm": 1.128495216369629, + "learning_rate": 2e-05, + "loss": 0.04799465, + "step": 14204 + }, + { + "epoch": 28.41, + "grad_norm": 1.0047136545181274, + "learning_rate": 2e-05, + "loss": 0.03385249, + "step": 14205 + }, + { + "epoch": 28.412, + "grad_norm": 1.4099255800247192, + "learning_rate": 2e-05, + "loss": 0.0562316, + "step": 14206 + }, + { + "epoch": 28.414, + "grad_norm": 2.193967580795288, + "learning_rate": 2e-05, + "loss": 0.07079425, + "step": 14207 + }, + { + "epoch": 28.416, + "grad_norm": 1.1035521030426025, + "learning_rate": 2e-05, + "loss": 0.04856515, + "step": 14208 + }, + { + "epoch": 28.418, + "grad_norm": 1.315811038017273, + "learning_rate": 2e-05, + "loss": 0.03866019, + "step": 14209 + }, + { + "epoch": 28.42, + "grad_norm": 1.1560747623443604, + 
"learning_rate": 2e-05, + "loss": 0.04573543, + "step": 14210 + }, + { + "epoch": 28.422, + "grad_norm": 1.3213006258010864, + "learning_rate": 2e-05, + "loss": 0.04729943, + "step": 14211 + }, + { + "epoch": 28.424, + "grad_norm": 1.467768907546997, + "learning_rate": 2e-05, + "loss": 0.0497423, + "step": 14212 + }, + { + "epoch": 28.426, + "grad_norm": 1.0412020683288574, + "learning_rate": 2e-05, + "loss": 0.03104374, + "step": 14213 + }, + { + "epoch": 28.428, + "grad_norm": 1.289223074913025, + "learning_rate": 2e-05, + "loss": 0.04878268, + "step": 14214 + }, + { + "epoch": 28.43, + "grad_norm": 1.0868279933929443, + "learning_rate": 2e-05, + "loss": 0.04009105, + "step": 14215 + }, + { + "epoch": 28.432, + "grad_norm": 1.506838321685791, + "learning_rate": 2e-05, + "loss": 0.04779705, + "step": 14216 + }, + { + "epoch": 28.434, + "grad_norm": 1.3598910570144653, + "learning_rate": 2e-05, + "loss": 0.0399574, + "step": 14217 + }, + { + "epoch": 28.436, + "grad_norm": 1.165126085281372, + "learning_rate": 2e-05, + "loss": 0.03589812, + "step": 14218 + }, + { + "epoch": 28.438, + "grad_norm": 1.403108835220337, + "learning_rate": 2e-05, + "loss": 0.03950584, + "step": 14219 + }, + { + "epoch": 28.44, + "grad_norm": 0.9602160453796387, + "learning_rate": 2e-05, + "loss": 0.0343546, + "step": 14220 + }, + { + "epoch": 28.442, + "grad_norm": 1.707650065422058, + "learning_rate": 2e-05, + "loss": 0.04669677, + "step": 14221 + }, + { + "epoch": 28.444, + "grad_norm": 1.5494041442871094, + "learning_rate": 2e-05, + "loss": 0.05189268, + "step": 14222 + }, + { + "epoch": 28.446, + "grad_norm": 1.4422967433929443, + "learning_rate": 2e-05, + "loss": 0.04455651, + "step": 14223 + }, + { + "epoch": 28.448, + "grad_norm": 2.1481945514678955, + "learning_rate": 2e-05, + "loss": 0.04172315, + "step": 14224 + }, + { + "epoch": 28.45, + "grad_norm": 1.8955481052398682, + "learning_rate": 2e-05, + "loss": 0.05096827, + "step": 14225 + }, + { + "epoch": 28.452, + "grad_norm": 1.4757720232009888, + "learning_rate": 2e-05, + "loss": 0.0427974, + "step": 14226 + }, + { + "epoch": 28.454, + "grad_norm": 3.5340352058410645, + "learning_rate": 2e-05, + "loss": 0.03696814, + "step": 14227 + }, + { + "epoch": 28.456, + "grad_norm": 1.2981595993041992, + "learning_rate": 2e-05, + "loss": 0.04960819, + "step": 14228 + }, + { + "epoch": 28.458, + "grad_norm": 1.3688567876815796, + "learning_rate": 2e-05, + "loss": 0.04346091, + "step": 14229 + }, + { + "epoch": 28.46, + "grad_norm": 1.0863968133926392, + "learning_rate": 2e-05, + "loss": 0.04432026, + "step": 14230 + }, + { + "epoch": 28.462, + "grad_norm": 1.1809697151184082, + "learning_rate": 2e-05, + "loss": 0.04062311, + "step": 14231 + }, + { + "epoch": 28.464, + "grad_norm": 1.087384581565857, + "learning_rate": 2e-05, + "loss": 0.04801937, + "step": 14232 + }, + { + "epoch": 28.466, + "grad_norm": 3.0253777503967285, + "learning_rate": 2e-05, + "loss": 0.06829751, + "step": 14233 + }, + { + "epoch": 28.468, + "grad_norm": 0.9512907862663269, + "learning_rate": 2e-05, + "loss": 0.02574901, + "step": 14234 + }, + { + "epoch": 28.47, + "grad_norm": 1.2561564445495605, + "learning_rate": 2e-05, + "loss": 0.03712705, + "step": 14235 + }, + { + "epoch": 28.472, + "grad_norm": 1.114723801612854, + "learning_rate": 2e-05, + "loss": 0.04053148, + "step": 14236 + }, + { + "epoch": 28.474, + "grad_norm": 0.9256406426429749, + "learning_rate": 2e-05, + "loss": 0.0282863, + "step": 14237 + }, + { + "epoch": 28.476, + "grad_norm": 1.0956227779388428, + "learning_rate": 
2e-05, + "loss": 0.03660049, + "step": 14238 + }, + { + "epoch": 28.478, + "grad_norm": 1.1929676532745361, + "learning_rate": 2e-05, + "loss": 0.03722402, + "step": 14239 + }, + { + "epoch": 28.48, + "grad_norm": 1.0275921821594238, + "learning_rate": 2e-05, + "loss": 0.03313165, + "step": 14240 + }, + { + "epoch": 28.482, + "grad_norm": 1.4167778491973877, + "learning_rate": 2e-05, + "loss": 0.04980793, + "step": 14241 + }, + { + "epoch": 28.484, + "grad_norm": 2.8569796085357666, + "learning_rate": 2e-05, + "loss": 0.04864322, + "step": 14242 + }, + { + "epoch": 28.486, + "grad_norm": 1.0566939115524292, + "learning_rate": 2e-05, + "loss": 0.03854397, + "step": 14243 + }, + { + "epoch": 28.488, + "grad_norm": 1.8737938404083252, + "learning_rate": 2e-05, + "loss": 0.06766672, + "step": 14244 + }, + { + "epoch": 28.49, + "grad_norm": 1.1267757415771484, + "learning_rate": 2e-05, + "loss": 0.03949556, + "step": 14245 + }, + { + "epoch": 28.492, + "grad_norm": 1.5439599752426147, + "learning_rate": 2e-05, + "loss": 0.03746489, + "step": 14246 + }, + { + "epoch": 28.494, + "grad_norm": 1.10368812084198, + "learning_rate": 2e-05, + "loss": 0.04418413, + "step": 14247 + }, + { + "epoch": 28.496, + "grad_norm": 1.8851401805877686, + "learning_rate": 2e-05, + "loss": 0.05599113, + "step": 14248 + }, + { + "epoch": 28.498, + "grad_norm": 1.218051552772522, + "learning_rate": 2e-05, + "loss": 0.0527468, + "step": 14249 + }, + { + "epoch": 28.5, + "grad_norm": 2.2494425773620605, + "learning_rate": 2e-05, + "loss": 0.05196273, + "step": 14250 + }, + { + "epoch": 28.502, + "grad_norm": 1.1959189176559448, + "learning_rate": 2e-05, + "loss": 0.04170246, + "step": 14251 + }, + { + "epoch": 28.504, + "grad_norm": 1.4841954708099365, + "learning_rate": 2e-05, + "loss": 0.04115407, + "step": 14252 + }, + { + "epoch": 28.506, + "grad_norm": 1.4579923152923584, + "learning_rate": 2e-05, + "loss": 0.06711036, + "step": 14253 + }, + { + "epoch": 28.508, + "grad_norm": 1.3409069776535034, + "learning_rate": 2e-05, + "loss": 0.04993474, + "step": 14254 + }, + { + "epoch": 28.51, + "grad_norm": 0.9282925128936768, + "learning_rate": 2e-05, + "loss": 0.02424309, + "step": 14255 + }, + { + "epoch": 28.512, + "grad_norm": 1.0717816352844238, + "learning_rate": 2e-05, + "loss": 0.03973263, + "step": 14256 + }, + { + "epoch": 28.514, + "grad_norm": 1.3361554145812988, + "learning_rate": 2e-05, + "loss": 0.03955629, + "step": 14257 + }, + { + "epoch": 28.516, + "grad_norm": 2.1933910846710205, + "learning_rate": 2e-05, + "loss": 0.05748781, + "step": 14258 + }, + { + "epoch": 28.518, + "grad_norm": 1.1817597150802612, + "learning_rate": 2e-05, + "loss": 0.04961357, + "step": 14259 + }, + { + "epoch": 28.52, + "grad_norm": 1.298127293586731, + "learning_rate": 2e-05, + "loss": 0.04155416, + "step": 14260 + }, + { + "epoch": 28.522, + "grad_norm": 1.060938835144043, + "learning_rate": 2e-05, + "loss": 0.04268425, + "step": 14261 + }, + { + "epoch": 28.524, + "grad_norm": 1.8386595249176025, + "learning_rate": 2e-05, + "loss": 0.04054004, + "step": 14262 + }, + { + "epoch": 28.526, + "grad_norm": 1.0489224195480347, + "learning_rate": 2e-05, + "loss": 0.0311989, + "step": 14263 + }, + { + "epoch": 28.528, + "grad_norm": 1.3312833309173584, + "learning_rate": 2e-05, + "loss": 0.04081573, + "step": 14264 + }, + { + "epoch": 28.53, + "grad_norm": 1.7386471033096313, + "learning_rate": 2e-05, + "loss": 0.05501184, + "step": 14265 + }, + { + "epoch": 28.532, + "grad_norm": 1.1394075155258179, + "learning_rate": 2e-05, + 
"loss": 0.0496092, + "step": 14266 + }, + { + "epoch": 28.534, + "grad_norm": 1.0800431966781616, + "learning_rate": 2e-05, + "loss": 0.04388725, + "step": 14267 + }, + { + "epoch": 28.536, + "grad_norm": 1.365820288658142, + "learning_rate": 2e-05, + "loss": 0.04674472, + "step": 14268 + }, + { + "epoch": 28.538, + "grad_norm": 1.6938581466674805, + "learning_rate": 2e-05, + "loss": 0.03524645, + "step": 14269 + }, + { + "epoch": 28.54, + "grad_norm": 1.0656059980392456, + "learning_rate": 2e-05, + "loss": 0.0350618, + "step": 14270 + }, + { + "epoch": 28.542, + "grad_norm": 1.300699234008789, + "learning_rate": 2e-05, + "loss": 0.0439024, + "step": 14271 + }, + { + "epoch": 28.544, + "grad_norm": 1.0407557487487793, + "learning_rate": 2e-05, + "loss": 0.03287885, + "step": 14272 + }, + { + "epoch": 28.546, + "grad_norm": 1.203810214996338, + "learning_rate": 2e-05, + "loss": 0.03666135, + "step": 14273 + }, + { + "epoch": 28.548000000000002, + "grad_norm": 1.3447908163070679, + "learning_rate": 2e-05, + "loss": 0.03037151, + "step": 14274 + }, + { + "epoch": 28.55, + "grad_norm": 1.6330796480178833, + "learning_rate": 2e-05, + "loss": 0.04389308, + "step": 14275 + }, + { + "epoch": 28.552, + "grad_norm": 1.453428030014038, + "learning_rate": 2e-05, + "loss": 0.05481421, + "step": 14276 + }, + { + "epoch": 28.554, + "grad_norm": 1.3805347681045532, + "learning_rate": 2e-05, + "loss": 0.05831959, + "step": 14277 + }, + { + "epoch": 28.556, + "grad_norm": 1.1240483522415161, + "learning_rate": 2e-05, + "loss": 0.03230035, + "step": 14278 + }, + { + "epoch": 28.558, + "grad_norm": 1.5682629346847534, + "learning_rate": 2e-05, + "loss": 0.04137525, + "step": 14279 + }, + { + "epoch": 28.56, + "grad_norm": 1.067922830581665, + "learning_rate": 2e-05, + "loss": 0.03095679, + "step": 14280 + }, + { + "epoch": 28.562, + "grad_norm": 1.3047091960906982, + "learning_rate": 2e-05, + "loss": 0.040992, + "step": 14281 + }, + { + "epoch": 28.564, + "grad_norm": 1.1715583801269531, + "learning_rate": 2e-05, + "loss": 0.04538528, + "step": 14282 + }, + { + "epoch": 28.566, + "grad_norm": 2.710317373275757, + "learning_rate": 2e-05, + "loss": 0.04703784, + "step": 14283 + }, + { + "epoch": 28.568, + "grad_norm": 1.1046556234359741, + "learning_rate": 2e-05, + "loss": 0.03596055, + "step": 14284 + }, + { + "epoch": 28.57, + "grad_norm": 1.5328203439712524, + "learning_rate": 2e-05, + "loss": 0.05959145, + "step": 14285 + }, + { + "epoch": 28.572, + "grad_norm": 1.1067321300506592, + "learning_rate": 2e-05, + "loss": 0.03717593, + "step": 14286 + }, + { + "epoch": 28.574, + "grad_norm": 1.5921152830123901, + "learning_rate": 2e-05, + "loss": 0.04818038, + "step": 14287 + }, + { + "epoch": 28.576, + "grad_norm": 1.2906737327575684, + "learning_rate": 2e-05, + "loss": 0.04694307, + "step": 14288 + }, + { + "epoch": 28.578, + "grad_norm": 1.4389523267745972, + "learning_rate": 2e-05, + "loss": 0.06396471, + "step": 14289 + }, + { + "epoch": 28.58, + "grad_norm": 1.250801920890808, + "learning_rate": 2e-05, + "loss": 0.03378974, + "step": 14290 + }, + { + "epoch": 28.582, + "grad_norm": 1.8675917387008667, + "learning_rate": 2e-05, + "loss": 0.04210771, + "step": 14291 + }, + { + "epoch": 28.584, + "grad_norm": 1.2386926412582397, + "learning_rate": 2e-05, + "loss": 0.04732635, + "step": 14292 + }, + { + "epoch": 28.586, + "grad_norm": 2.474187135696411, + "learning_rate": 2e-05, + "loss": 0.06141835, + "step": 14293 + }, + { + "epoch": 28.588, + "grad_norm": 1.7406890392303467, + "learning_rate": 2e-05, + 
"loss": 0.0405802, + "step": 14294 + }, + { + "epoch": 28.59, + "grad_norm": 1.0976345539093018, + "learning_rate": 2e-05, + "loss": 0.03914689, + "step": 14295 + }, + { + "epoch": 28.592, + "grad_norm": 1.8601065874099731, + "learning_rate": 2e-05, + "loss": 0.04376506, + "step": 14296 + }, + { + "epoch": 28.594, + "grad_norm": 1.8947181701660156, + "learning_rate": 2e-05, + "loss": 0.05417305, + "step": 14297 + }, + { + "epoch": 28.596, + "grad_norm": 1.3904882669448853, + "learning_rate": 2e-05, + "loss": 0.05922655, + "step": 14298 + }, + { + "epoch": 28.598, + "grad_norm": 1.1092592477798462, + "learning_rate": 2e-05, + "loss": 0.04238461, + "step": 14299 + }, + { + "epoch": 28.6, + "grad_norm": 1.1075598001480103, + "learning_rate": 2e-05, + "loss": 0.03969569, + "step": 14300 + }, + { + "epoch": 28.602, + "grad_norm": 1.2379860877990723, + "learning_rate": 2e-05, + "loss": 0.05977491, + "step": 14301 + }, + { + "epoch": 28.604, + "grad_norm": 2.523383378982544, + "learning_rate": 2e-05, + "loss": 0.04559822, + "step": 14302 + }, + { + "epoch": 28.606, + "grad_norm": 1.390464186668396, + "learning_rate": 2e-05, + "loss": 0.03601509, + "step": 14303 + }, + { + "epoch": 28.608, + "grad_norm": 1.1424009799957275, + "learning_rate": 2e-05, + "loss": 0.04128938, + "step": 14304 + }, + { + "epoch": 28.61, + "grad_norm": 1.5949323177337646, + "learning_rate": 2e-05, + "loss": 0.03449958, + "step": 14305 + }, + { + "epoch": 28.612, + "grad_norm": 1.0766521692276, + "learning_rate": 2e-05, + "loss": 0.03662252, + "step": 14306 + }, + { + "epoch": 28.614, + "grad_norm": 1.684144377708435, + "learning_rate": 2e-05, + "loss": 0.03993898, + "step": 14307 + }, + { + "epoch": 28.616, + "grad_norm": 1.2283536195755005, + "learning_rate": 2e-05, + "loss": 0.0440466, + "step": 14308 + }, + { + "epoch": 28.618, + "grad_norm": 1.1252257823944092, + "learning_rate": 2e-05, + "loss": 0.03805107, + "step": 14309 + }, + { + "epoch": 28.62, + "grad_norm": 2.2137584686279297, + "learning_rate": 2e-05, + "loss": 0.04531762, + "step": 14310 + }, + { + "epoch": 28.622, + "grad_norm": 1.8632186651229858, + "learning_rate": 2e-05, + "loss": 0.04363943, + "step": 14311 + }, + { + "epoch": 28.624, + "grad_norm": 2.1854779720306396, + "learning_rate": 2e-05, + "loss": 0.04152116, + "step": 14312 + }, + { + "epoch": 28.626, + "grad_norm": 1.199179768562317, + "learning_rate": 2e-05, + "loss": 0.04882842, + "step": 14313 + }, + { + "epoch": 28.628, + "grad_norm": 1.9288642406463623, + "learning_rate": 2e-05, + "loss": 0.06294123, + "step": 14314 + }, + { + "epoch": 28.63, + "grad_norm": 0.9891992807388306, + "learning_rate": 2e-05, + "loss": 0.03864823, + "step": 14315 + }, + { + "epoch": 28.632, + "grad_norm": 1.1384472846984863, + "learning_rate": 2e-05, + "loss": 0.03697848, + "step": 14316 + }, + { + "epoch": 28.634, + "grad_norm": 1.3403377532958984, + "learning_rate": 2e-05, + "loss": 0.0520224, + "step": 14317 + }, + { + "epoch": 28.636, + "grad_norm": 1.8841863870620728, + "learning_rate": 2e-05, + "loss": 0.03481198, + "step": 14318 + }, + { + "epoch": 28.638, + "grad_norm": 2.253284215927124, + "learning_rate": 2e-05, + "loss": 0.05113673, + "step": 14319 + }, + { + "epoch": 28.64, + "grad_norm": 1.678731918334961, + "learning_rate": 2e-05, + "loss": 0.03881251, + "step": 14320 + }, + { + "epoch": 28.642, + "grad_norm": 1.419081211090088, + "learning_rate": 2e-05, + "loss": 0.03930322, + "step": 14321 + }, + { + "epoch": 28.644, + "grad_norm": 2.305083990097046, + "learning_rate": 2e-05, + "loss": 0.03415557, 
+ "step": 14322 + }, + { + "epoch": 28.646, + "grad_norm": 1.7453691959381104, + "learning_rate": 2e-05, + "loss": 0.05402776, + "step": 14323 + }, + { + "epoch": 28.648, + "grad_norm": 1.6866177320480347, + "learning_rate": 2e-05, + "loss": 0.05624064, + "step": 14324 + }, + { + "epoch": 28.65, + "grad_norm": 2.8119149208068848, + "learning_rate": 2e-05, + "loss": 0.03970397, + "step": 14325 + }, + { + "epoch": 28.652, + "grad_norm": 1.4768586158752441, + "learning_rate": 2e-05, + "loss": 0.04976168, + "step": 14326 + }, + { + "epoch": 28.654, + "grad_norm": 1.2367441654205322, + "learning_rate": 2e-05, + "loss": 0.04001617, + "step": 14327 + }, + { + "epoch": 28.656, + "grad_norm": 1.3339909315109253, + "learning_rate": 2e-05, + "loss": 0.0409479, + "step": 14328 + }, + { + "epoch": 28.658, + "grad_norm": 1.3718701601028442, + "learning_rate": 2e-05, + "loss": 0.06367579, + "step": 14329 + }, + { + "epoch": 28.66, + "grad_norm": 1.323080062866211, + "learning_rate": 2e-05, + "loss": 0.0345336, + "step": 14330 + }, + { + "epoch": 28.662, + "grad_norm": 0.998365044593811, + "learning_rate": 2e-05, + "loss": 0.03076694, + "step": 14331 + }, + { + "epoch": 28.664, + "grad_norm": 1.3956735134124756, + "learning_rate": 2e-05, + "loss": 0.04270991, + "step": 14332 + }, + { + "epoch": 28.666, + "grad_norm": 1.0146839618682861, + "learning_rate": 2e-05, + "loss": 0.03931056, + "step": 14333 + }, + { + "epoch": 28.668, + "grad_norm": 1.7695027589797974, + "learning_rate": 2e-05, + "loss": 0.0385569, + "step": 14334 + }, + { + "epoch": 28.67, + "grad_norm": 1.3834205865859985, + "learning_rate": 2e-05, + "loss": 0.05658727, + "step": 14335 + }, + { + "epoch": 28.672, + "grad_norm": 1.1219671964645386, + "learning_rate": 2e-05, + "loss": 0.03792579, + "step": 14336 + }, + { + "epoch": 28.674, + "grad_norm": 1.3171169757843018, + "learning_rate": 2e-05, + "loss": 0.03911427, + "step": 14337 + }, + { + "epoch": 28.676, + "grad_norm": 1.2529704570770264, + "learning_rate": 2e-05, + "loss": 0.05144939, + "step": 14338 + }, + { + "epoch": 28.678, + "grad_norm": 0.909761905670166, + "learning_rate": 2e-05, + "loss": 0.02666837, + "step": 14339 + }, + { + "epoch": 28.68, + "grad_norm": 1.0001680850982666, + "learning_rate": 2e-05, + "loss": 0.03465949, + "step": 14340 + }, + { + "epoch": 28.682, + "grad_norm": 1.1715612411499023, + "learning_rate": 2e-05, + "loss": 0.03337407, + "step": 14341 + }, + { + "epoch": 28.684, + "grad_norm": 1.56963050365448, + "learning_rate": 2e-05, + "loss": 0.04966843, + "step": 14342 + }, + { + "epoch": 28.686, + "grad_norm": 1.6177550554275513, + "learning_rate": 2e-05, + "loss": 0.06656225, + "step": 14343 + }, + { + "epoch": 28.688, + "grad_norm": 1.37739896774292, + "learning_rate": 2e-05, + "loss": 0.04131253, + "step": 14344 + }, + { + "epoch": 28.69, + "grad_norm": 1.133405327796936, + "learning_rate": 2e-05, + "loss": 0.04604138, + "step": 14345 + }, + { + "epoch": 28.692, + "grad_norm": 1.2516900300979614, + "learning_rate": 2e-05, + "loss": 0.04709205, + "step": 14346 + }, + { + "epoch": 28.694, + "grad_norm": 1.7897838354110718, + "learning_rate": 2e-05, + "loss": 0.04589042, + "step": 14347 + }, + { + "epoch": 28.696, + "grad_norm": 0.9966497421264648, + "learning_rate": 2e-05, + "loss": 0.0285507, + "step": 14348 + }, + { + "epoch": 28.698, + "grad_norm": 1.126671314239502, + "learning_rate": 2e-05, + "loss": 0.04037992, + "step": 14349 + }, + { + "epoch": 28.7, + "grad_norm": 1.076209306716919, + "learning_rate": 2e-05, + "loss": 0.04209936, + "step": 14350 + 
}, + { + "epoch": 28.701999999999998, + "grad_norm": 1.2333250045776367, + "learning_rate": 2e-05, + "loss": 0.04853529, + "step": 14351 + }, + { + "epoch": 28.704, + "grad_norm": 1.4717018604278564, + "learning_rate": 2e-05, + "loss": 0.04341109, + "step": 14352 + }, + { + "epoch": 28.706, + "grad_norm": 1.9151333570480347, + "learning_rate": 2e-05, + "loss": 0.04162496, + "step": 14353 + }, + { + "epoch": 28.708, + "grad_norm": 1.7289716005325317, + "learning_rate": 2e-05, + "loss": 0.04978495, + "step": 14354 + }, + { + "epoch": 28.71, + "grad_norm": 1.104732632637024, + "learning_rate": 2e-05, + "loss": 0.04172861, + "step": 14355 + }, + { + "epoch": 28.712, + "grad_norm": 1.1524678468704224, + "learning_rate": 2e-05, + "loss": 0.03477337, + "step": 14356 + }, + { + "epoch": 28.714, + "grad_norm": 1.5342556238174438, + "learning_rate": 2e-05, + "loss": 0.04556607, + "step": 14357 + }, + { + "epoch": 28.716, + "grad_norm": 1.1712950468063354, + "learning_rate": 2e-05, + "loss": 0.04763829, + "step": 14358 + }, + { + "epoch": 28.718, + "grad_norm": 1.091592788696289, + "learning_rate": 2e-05, + "loss": 0.03833639, + "step": 14359 + }, + { + "epoch": 28.72, + "grad_norm": 0.793989360332489, + "learning_rate": 2e-05, + "loss": 0.025282, + "step": 14360 + }, + { + "epoch": 28.722, + "grad_norm": 1.4406577348709106, + "learning_rate": 2e-05, + "loss": 0.03637164, + "step": 14361 + }, + { + "epoch": 28.724, + "grad_norm": 1.957209825515747, + "learning_rate": 2e-05, + "loss": 0.03893009, + "step": 14362 + }, + { + "epoch": 28.726, + "grad_norm": 1.1229599714279175, + "learning_rate": 2e-05, + "loss": 0.03854847, + "step": 14363 + }, + { + "epoch": 28.728, + "grad_norm": 1.9506926536560059, + "learning_rate": 2e-05, + "loss": 0.03666751, + "step": 14364 + }, + { + "epoch": 28.73, + "grad_norm": 1.435692310333252, + "learning_rate": 2e-05, + "loss": 0.03390378, + "step": 14365 + }, + { + "epoch": 28.732, + "grad_norm": 0.955740749835968, + "learning_rate": 2e-05, + "loss": 0.02934472, + "step": 14366 + }, + { + "epoch": 28.734, + "grad_norm": 1.1466535329818726, + "learning_rate": 2e-05, + "loss": 0.05017526, + "step": 14367 + }, + { + "epoch": 28.736, + "grad_norm": 1.218614101409912, + "learning_rate": 2e-05, + "loss": 0.05039631, + "step": 14368 + }, + { + "epoch": 28.738, + "grad_norm": 1.093510389328003, + "learning_rate": 2e-05, + "loss": 0.03176686, + "step": 14369 + }, + { + "epoch": 28.74, + "grad_norm": 1.144060492515564, + "learning_rate": 2e-05, + "loss": 0.04076057, + "step": 14370 + }, + { + "epoch": 28.742, + "grad_norm": 1.5863828659057617, + "learning_rate": 2e-05, + "loss": 0.04815026, + "step": 14371 + }, + { + "epoch": 28.744, + "grad_norm": 0.9931338429450989, + "learning_rate": 2e-05, + "loss": 0.03469016, + "step": 14372 + }, + { + "epoch": 28.746, + "grad_norm": 1.6943480968475342, + "learning_rate": 2e-05, + "loss": 0.05469589, + "step": 14373 + }, + { + "epoch": 28.748, + "grad_norm": 1.2340564727783203, + "learning_rate": 2e-05, + "loss": 0.04292573, + "step": 14374 + }, + { + "epoch": 28.75, + "grad_norm": 1.6473150253295898, + "learning_rate": 2e-05, + "loss": 0.06950572, + "step": 14375 + }, + { + "epoch": 28.752, + "grad_norm": 1.900185465812683, + "learning_rate": 2e-05, + "loss": 0.05023978, + "step": 14376 + }, + { + "epoch": 28.754, + "grad_norm": 1.381726622581482, + "learning_rate": 2e-05, + "loss": 0.05448072, + "step": 14377 + }, + { + "epoch": 28.756, + "grad_norm": 1.0618399381637573, + "learning_rate": 2e-05, + "loss": 0.03553692, + "step": 14378 + }, + 
{ + "epoch": 28.758, + "grad_norm": 1.158182978630066, + "learning_rate": 2e-05, + "loss": 0.04157444, + "step": 14379 + }, + { + "epoch": 28.76, + "grad_norm": 1.0218384265899658, + "learning_rate": 2e-05, + "loss": 0.03774838, + "step": 14380 + }, + { + "epoch": 28.762, + "grad_norm": 1.4672837257385254, + "learning_rate": 2e-05, + "loss": 0.04652436, + "step": 14381 + }, + { + "epoch": 28.764, + "grad_norm": 2.9982128143310547, + "learning_rate": 2e-05, + "loss": 0.05198572, + "step": 14382 + }, + { + "epoch": 28.766, + "grad_norm": 1.5844800472259521, + "learning_rate": 2e-05, + "loss": 0.04013401, + "step": 14383 + }, + { + "epoch": 28.768, + "grad_norm": 2.2007713317871094, + "learning_rate": 2e-05, + "loss": 0.0491824, + "step": 14384 + }, + { + "epoch": 28.77, + "grad_norm": 1.0713263750076294, + "learning_rate": 2e-05, + "loss": 0.03637439, + "step": 14385 + }, + { + "epoch": 28.772, + "grad_norm": 2.0838675498962402, + "learning_rate": 2e-05, + "loss": 0.04036938, + "step": 14386 + }, + { + "epoch": 28.774, + "grad_norm": 1.051755428314209, + "learning_rate": 2e-05, + "loss": 0.03982811, + "step": 14387 + }, + { + "epoch": 28.776, + "grad_norm": 1.3116508722305298, + "learning_rate": 2e-05, + "loss": 0.04132722, + "step": 14388 + }, + { + "epoch": 28.778, + "grad_norm": 1.0713746547698975, + "learning_rate": 2e-05, + "loss": 0.0432993, + "step": 14389 + }, + { + "epoch": 28.78, + "grad_norm": 0.9972354173660278, + "learning_rate": 2e-05, + "loss": 0.04672929, + "step": 14390 + }, + { + "epoch": 28.782, + "grad_norm": 0.9369496703147888, + "learning_rate": 2e-05, + "loss": 0.02907495, + "step": 14391 + }, + { + "epoch": 28.784, + "grad_norm": 1.1444287300109863, + "learning_rate": 2e-05, + "loss": 0.03748661, + "step": 14392 + }, + { + "epoch": 28.786, + "grad_norm": 1.1183111667633057, + "learning_rate": 2e-05, + "loss": 0.02960247, + "step": 14393 + }, + { + "epoch": 28.788, + "grad_norm": 1.0258089303970337, + "learning_rate": 2e-05, + "loss": 0.03007736, + "step": 14394 + }, + { + "epoch": 28.79, + "grad_norm": 0.9938691258430481, + "learning_rate": 2e-05, + "loss": 0.03597631, + "step": 14395 + }, + { + "epoch": 28.792, + "grad_norm": 1.1507741212844849, + "learning_rate": 2e-05, + "loss": 0.04170668, + "step": 14396 + }, + { + "epoch": 28.794, + "grad_norm": 3.4133851528167725, + "learning_rate": 2e-05, + "loss": 0.04272288, + "step": 14397 + }, + { + "epoch": 28.796, + "grad_norm": 1.7486724853515625, + "learning_rate": 2e-05, + "loss": 0.06500375, + "step": 14398 + }, + { + "epoch": 28.798000000000002, + "grad_norm": 1.4064818620681763, + "learning_rate": 2e-05, + "loss": 0.04955719, + "step": 14399 + }, + { + "epoch": 28.8, + "grad_norm": 1.6619688272476196, + "learning_rate": 2e-05, + "loss": 0.04951192, + "step": 14400 + }, + { + "epoch": 28.802, + "grad_norm": 1.1675795316696167, + "learning_rate": 2e-05, + "loss": 0.037157, + "step": 14401 + }, + { + "epoch": 28.804, + "grad_norm": 1.6489543914794922, + "learning_rate": 2e-05, + "loss": 0.04647279, + "step": 14402 + }, + { + "epoch": 28.806, + "grad_norm": 1.7396496534347534, + "learning_rate": 2e-05, + "loss": 0.04493985, + "step": 14403 + }, + { + "epoch": 28.808, + "grad_norm": 1.156243920326233, + "learning_rate": 2e-05, + "loss": 0.04273091, + "step": 14404 + }, + { + "epoch": 28.81, + "grad_norm": 1.0005199909210205, + "learning_rate": 2e-05, + "loss": 0.03280544, + "step": 14405 + }, + { + "epoch": 28.812, + "grad_norm": 3.4301085472106934, + "learning_rate": 2e-05, + "loss": 0.03465569, + "step": 14406 + }, + 
{ + "epoch": 28.814, + "grad_norm": 1.50563383102417, + "learning_rate": 2e-05, + "loss": 0.07086978, + "step": 14407 + }, + { + "epoch": 28.816, + "grad_norm": 1.5838433504104614, + "learning_rate": 2e-05, + "loss": 0.03891544, + "step": 14408 + }, + { + "epoch": 28.818, + "grad_norm": 1.1893781423568726, + "learning_rate": 2e-05, + "loss": 0.04945404, + "step": 14409 + }, + { + "epoch": 28.82, + "grad_norm": 0.8621784448623657, + "learning_rate": 2e-05, + "loss": 0.03163891, + "step": 14410 + }, + { + "epoch": 28.822, + "grad_norm": 1.1594724655151367, + "learning_rate": 2e-05, + "loss": 0.05155197, + "step": 14411 + }, + { + "epoch": 28.824, + "grad_norm": 1.443181037902832, + "learning_rate": 2e-05, + "loss": 0.05630941, + "step": 14412 + }, + { + "epoch": 28.826, + "grad_norm": 1.5907959938049316, + "learning_rate": 2e-05, + "loss": 0.04138833, + "step": 14413 + }, + { + "epoch": 28.828, + "grad_norm": 1.6216257810592651, + "learning_rate": 2e-05, + "loss": 0.04543457, + "step": 14414 + }, + { + "epoch": 28.83, + "grad_norm": 1.4624403715133667, + "learning_rate": 2e-05, + "loss": 0.05380312, + "step": 14415 + }, + { + "epoch": 28.832, + "grad_norm": 1.0963571071624756, + "learning_rate": 2e-05, + "loss": 0.04702737, + "step": 14416 + }, + { + "epoch": 28.834, + "grad_norm": 1.7015491724014282, + "learning_rate": 2e-05, + "loss": 0.03770491, + "step": 14417 + }, + { + "epoch": 28.836, + "grad_norm": 1.20378839969635, + "learning_rate": 2e-05, + "loss": 0.04444302, + "step": 14418 + }, + { + "epoch": 28.838, + "grad_norm": 1.72189199924469, + "learning_rate": 2e-05, + "loss": 0.05816022, + "step": 14419 + }, + { + "epoch": 28.84, + "grad_norm": 1.1528189182281494, + "learning_rate": 2e-05, + "loss": 0.03968474, + "step": 14420 + }, + { + "epoch": 28.842, + "grad_norm": 1.2541671991348267, + "learning_rate": 2e-05, + "loss": 0.03849109, + "step": 14421 + }, + { + "epoch": 28.844, + "grad_norm": 2.8099963665008545, + "learning_rate": 2e-05, + "loss": 0.05084788, + "step": 14422 + }, + { + "epoch": 28.846, + "grad_norm": 0.9781857132911682, + "learning_rate": 2e-05, + "loss": 0.0285262, + "step": 14423 + }, + { + "epoch": 28.848, + "grad_norm": 1.4903124570846558, + "learning_rate": 2e-05, + "loss": 0.05180159, + "step": 14424 + }, + { + "epoch": 28.85, + "grad_norm": 1.7416338920593262, + "learning_rate": 2e-05, + "loss": 0.03428745, + "step": 14425 + }, + { + "epoch": 28.852, + "grad_norm": 1.521485686302185, + "learning_rate": 2e-05, + "loss": 0.04813501, + "step": 14426 + }, + { + "epoch": 28.854, + "grad_norm": 1.2684760093688965, + "learning_rate": 2e-05, + "loss": 0.04741008, + "step": 14427 + }, + { + "epoch": 28.856, + "grad_norm": 1.1217941045761108, + "learning_rate": 2e-05, + "loss": 0.04172241, + "step": 14428 + }, + { + "epoch": 28.858, + "grad_norm": 1.3092353343963623, + "learning_rate": 2e-05, + "loss": 0.04935054, + "step": 14429 + }, + { + "epoch": 28.86, + "grad_norm": 1.0041587352752686, + "learning_rate": 2e-05, + "loss": 0.03718605, + "step": 14430 + }, + { + "epoch": 28.862, + "grad_norm": 2.0443413257598877, + "learning_rate": 2e-05, + "loss": 0.04975209, + "step": 14431 + }, + { + "epoch": 28.864, + "grad_norm": 0.8965648412704468, + "learning_rate": 2e-05, + "loss": 0.03016404, + "step": 14432 + }, + { + "epoch": 28.866, + "grad_norm": 1.8447829484939575, + "learning_rate": 2e-05, + "loss": 0.05125307, + "step": 14433 + }, + { + "epoch": 28.868, + "grad_norm": 1.2474303245544434, + "learning_rate": 2e-05, + "loss": 0.04687266, + "step": 14434 + }, + { + "epoch": 
28.87, + "grad_norm": 1.745761752128601, + "learning_rate": 2e-05, + "loss": 0.07663966, + "step": 14435 + }, + { + "epoch": 28.872, + "grad_norm": 1.3358268737792969, + "learning_rate": 2e-05, + "loss": 0.04147369, + "step": 14436 + }, + { + "epoch": 28.874, + "grad_norm": 1.2408902645111084, + "learning_rate": 2e-05, + "loss": 0.04460159, + "step": 14437 + }, + { + "epoch": 28.876, + "grad_norm": 1.9124995470046997, + "learning_rate": 2e-05, + "loss": 0.05355459, + "step": 14438 + }, + { + "epoch": 28.878, + "grad_norm": 1.334394931793213, + "learning_rate": 2e-05, + "loss": 0.04897958, + "step": 14439 + }, + { + "epoch": 28.88, + "grad_norm": 1.8889309167861938, + "learning_rate": 2e-05, + "loss": 0.04751597, + "step": 14440 + }, + { + "epoch": 28.882, + "grad_norm": 1.1469106674194336, + "learning_rate": 2e-05, + "loss": 0.04521649, + "step": 14441 + }, + { + "epoch": 28.884, + "grad_norm": 1.0258030891418457, + "learning_rate": 2e-05, + "loss": 0.04085048, + "step": 14442 + }, + { + "epoch": 28.886, + "grad_norm": 2.1602070331573486, + "learning_rate": 2e-05, + "loss": 0.05456547, + "step": 14443 + }, + { + "epoch": 28.888, + "grad_norm": 1.524979591369629, + "learning_rate": 2e-05, + "loss": 0.04410193, + "step": 14444 + }, + { + "epoch": 28.89, + "grad_norm": 1.0402649641036987, + "learning_rate": 2e-05, + "loss": 0.03135321, + "step": 14445 + }, + { + "epoch": 28.892, + "grad_norm": 1.7973295450210571, + "learning_rate": 2e-05, + "loss": 0.04395194, + "step": 14446 + }, + { + "epoch": 28.894, + "grad_norm": 1.0734200477600098, + "learning_rate": 2e-05, + "loss": 0.0351134, + "step": 14447 + }, + { + "epoch": 28.896, + "grad_norm": 1.271427035331726, + "learning_rate": 2e-05, + "loss": 0.0497748, + "step": 14448 + }, + { + "epoch": 28.898, + "grad_norm": 0.9508517384529114, + "learning_rate": 2e-05, + "loss": 0.03418116, + "step": 14449 + }, + { + "epoch": 28.9, + "grad_norm": 2.287792921066284, + "learning_rate": 2e-05, + "loss": 0.04537106, + "step": 14450 + }, + { + "epoch": 28.902, + "grad_norm": 2.9237358570098877, + "learning_rate": 2e-05, + "loss": 0.06133288, + "step": 14451 + }, + { + "epoch": 28.904, + "grad_norm": 1.5036847591400146, + "learning_rate": 2e-05, + "loss": 0.04737253, + "step": 14452 + }, + { + "epoch": 28.906, + "grad_norm": 1.2241439819335938, + "learning_rate": 2e-05, + "loss": 0.03472542, + "step": 14453 + }, + { + "epoch": 28.908, + "grad_norm": 1.3582462072372437, + "learning_rate": 2e-05, + "loss": 0.05366043, + "step": 14454 + }, + { + "epoch": 28.91, + "grad_norm": 1.204209327697754, + "learning_rate": 2e-05, + "loss": 0.04971019, + "step": 14455 + }, + { + "epoch": 28.912, + "grad_norm": 1.7010318040847778, + "learning_rate": 2e-05, + "loss": 0.05780742, + "step": 14456 + }, + { + "epoch": 28.914, + "grad_norm": 1.0974135398864746, + "learning_rate": 2e-05, + "loss": 0.03613953, + "step": 14457 + }, + { + "epoch": 28.916, + "grad_norm": 1.595192551612854, + "learning_rate": 2e-05, + "loss": 0.04265611, + "step": 14458 + }, + { + "epoch": 28.918, + "grad_norm": 1.5996463298797607, + "learning_rate": 2e-05, + "loss": 0.03614064, + "step": 14459 + }, + { + "epoch": 28.92, + "grad_norm": 0.9934185743331909, + "learning_rate": 2e-05, + "loss": 0.03452392, + "step": 14460 + }, + { + "epoch": 28.922, + "grad_norm": 1.1480003595352173, + "learning_rate": 2e-05, + "loss": 0.03976715, + "step": 14461 + }, + { + "epoch": 28.924, + "grad_norm": 0.7937719225883484, + "learning_rate": 2e-05, + "loss": 0.03106033, + "step": 14462 + }, + { + "epoch": 28.926, + 
"grad_norm": 0.9472975134849548, + "learning_rate": 2e-05, + "loss": 0.03592245, + "step": 14463 + }, + { + "epoch": 28.928, + "grad_norm": 1.2322276830673218, + "learning_rate": 2e-05, + "loss": 0.06219757, + "step": 14464 + }, + { + "epoch": 28.93, + "grad_norm": 1.1841628551483154, + "learning_rate": 2e-05, + "loss": 0.02757031, + "step": 14465 + }, + { + "epoch": 28.932, + "grad_norm": 2.2316744327545166, + "learning_rate": 2e-05, + "loss": 0.05290478, + "step": 14466 + }, + { + "epoch": 28.934, + "grad_norm": 1.1691445112228394, + "learning_rate": 2e-05, + "loss": 0.04423434, + "step": 14467 + }, + { + "epoch": 28.936, + "grad_norm": 1.2703580856323242, + "learning_rate": 2e-05, + "loss": 0.04534604, + "step": 14468 + }, + { + "epoch": 28.938, + "grad_norm": 1.8662946224212646, + "learning_rate": 2e-05, + "loss": 0.0505904, + "step": 14469 + }, + { + "epoch": 28.94, + "grad_norm": 1.984939455986023, + "learning_rate": 2e-05, + "loss": 0.04816742, + "step": 14470 + }, + { + "epoch": 28.942, + "grad_norm": 1.6010085344314575, + "learning_rate": 2e-05, + "loss": 0.03411138, + "step": 14471 + }, + { + "epoch": 28.944, + "grad_norm": 1.3114997148513794, + "learning_rate": 2e-05, + "loss": 0.03839431, + "step": 14472 + }, + { + "epoch": 28.946, + "grad_norm": 1.9743845462799072, + "learning_rate": 2e-05, + "loss": 0.06387973, + "step": 14473 + }, + { + "epoch": 28.948, + "grad_norm": 3.784247398376465, + "learning_rate": 2e-05, + "loss": 0.05783255, + "step": 14474 + }, + { + "epoch": 28.95, + "grad_norm": 1.5012797117233276, + "learning_rate": 2e-05, + "loss": 0.04049198, + "step": 14475 + }, + { + "epoch": 28.951999999999998, + "grad_norm": 0.9062674641609192, + "learning_rate": 2e-05, + "loss": 0.0387709, + "step": 14476 + }, + { + "epoch": 28.954, + "grad_norm": 1.216447353363037, + "learning_rate": 2e-05, + "loss": 0.03773759, + "step": 14477 + }, + { + "epoch": 28.956, + "grad_norm": 2.346182346343994, + "learning_rate": 2e-05, + "loss": 0.04593071, + "step": 14478 + }, + { + "epoch": 28.958, + "grad_norm": 2.298583984375, + "learning_rate": 2e-05, + "loss": 0.05919803, + "step": 14479 + }, + { + "epoch": 28.96, + "grad_norm": 0.9582846164703369, + "learning_rate": 2e-05, + "loss": 0.02871491, + "step": 14480 + }, + { + "epoch": 28.962, + "grad_norm": 0.7866615056991577, + "learning_rate": 2e-05, + "loss": 0.02259299, + "step": 14481 + }, + { + "epoch": 28.964, + "grad_norm": 1.4925915002822876, + "learning_rate": 2e-05, + "loss": 0.04916498, + "step": 14482 + }, + { + "epoch": 28.966, + "grad_norm": 1.4005680084228516, + "learning_rate": 2e-05, + "loss": 0.05168829, + "step": 14483 + }, + { + "epoch": 28.968, + "grad_norm": 4.166713237762451, + "learning_rate": 2e-05, + "loss": 0.04969958, + "step": 14484 + }, + { + "epoch": 28.97, + "grad_norm": 1.1008797883987427, + "learning_rate": 2e-05, + "loss": 0.03930159, + "step": 14485 + }, + { + "epoch": 28.972, + "grad_norm": 1.0620421171188354, + "learning_rate": 2e-05, + "loss": 0.03718957, + "step": 14486 + }, + { + "epoch": 28.974, + "grad_norm": 1.1578391790390015, + "learning_rate": 2e-05, + "loss": 0.04585633, + "step": 14487 + }, + { + "epoch": 28.976, + "grad_norm": 1.48446524143219, + "learning_rate": 2e-05, + "loss": 0.0247959, + "step": 14488 + }, + { + "epoch": 28.978, + "grad_norm": 1.0976674556732178, + "learning_rate": 2e-05, + "loss": 0.03197834, + "step": 14489 + }, + { + "epoch": 28.98, + "grad_norm": 1.139172077178955, + "learning_rate": 2e-05, + "loss": 0.04646052, + "step": 14490 + }, + { + "epoch": 28.982, + 
"grad_norm": 1.9785137176513672, + "learning_rate": 2e-05, + "loss": 0.05109739, + "step": 14491 + }, + { + "epoch": 28.984, + "grad_norm": 1.3987001180648804, + "learning_rate": 2e-05, + "loss": 0.0440822, + "step": 14492 + }, + { + "epoch": 28.986, + "grad_norm": 1.220787763595581, + "learning_rate": 2e-05, + "loss": 0.03669748, + "step": 14493 + }, + { + "epoch": 28.988, + "grad_norm": 2.083451986312866, + "learning_rate": 2e-05, + "loss": 0.05152371, + "step": 14494 + }, + { + "epoch": 28.99, + "grad_norm": 1.1273852586746216, + "learning_rate": 2e-05, + "loss": 0.03859162, + "step": 14495 + }, + { + "epoch": 28.992, + "grad_norm": 1.344372034072876, + "learning_rate": 2e-05, + "loss": 0.03838318, + "step": 14496 + }, + { + "epoch": 28.994, + "grad_norm": 1.728652000427246, + "learning_rate": 2e-05, + "loss": 0.03777511, + "step": 14497 + }, + { + "epoch": 28.996, + "grad_norm": 3.5054938793182373, + "learning_rate": 2e-05, + "loss": 0.04964066, + "step": 14498 + }, + { + "epoch": 28.998, + "grad_norm": 1.501190185546875, + "learning_rate": 2e-05, + "loss": 0.06090293, + "step": 14499 + }, + { + "epoch": 29.0, + "grad_norm": 1.227286696434021, + "learning_rate": 2e-05, + "loss": 0.04943068, + "step": 14500 + }, + { + "epoch": 29.0, + "eval_performance": { + "AngleClassification_1": 1.0, + "AngleClassification_2": 1.0, + "AngleClassification_3": 0.9780439121756487, + "Equal_1": 0.998, + "Equal_2": 0.9780439121756487, + "Equal_3": 0.9880239520958084, + "LineComparison_1": 0.998, + "LineComparison_2": 1.0, + "LineComparison_3": 0.9960079840319361, + "Parallel_1": 0.9959919839679359, + "Parallel_2": 0.9979959919839679, + "Parallel_3": 0.994, + "Perpendicular_1": 0.998, + "Perpendicular_2": 0.984, + "Perpendicular_3": 0.8897795591182365, + "PointLiesOnCircle_1": 0.9979959919839679, + "PointLiesOnCircle_2": 0.9936666666666667, + "PointLiesOnCircle_3": 0.9876, + "PointLiesOnLine_1": 0.9899799599198397, + "PointLiesOnLine_2": 0.9959919839679359, + "PointLiesOnLine_3": 0.9820359281437125 + }, + "eval_runtime": 320.129, + "eval_samples_per_second": 32.799, + "eval_steps_per_second": 0.656, + "step": 14500 + }, + { + "epoch": 29.002, + "grad_norm": 1.3614428043365479, + "learning_rate": 2e-05, + "loss": 0.04904348, + "step": 14501 + }, + { + "epoch": 29.004, + "grad_norm": 0.990894079208374, + "learning_rate": 2e-05, + "loss": 0.03972761, + "step": 14502 + }, + { + "epoch": 29.006, + "grad_norm": 1.5582256317138672, + "learning_rate": 2e-05, + "loss": 0.04734577, + "step": 14503 + }, + { + "epoch": 29.008, + "grad_norm": 1.598989725112915, + "learning_rate": 2e-05, + "loss": 0.04445969, + "step": 14504 + }, + { + "epoch": 29.01, + "grad_norm": 1.4012812376022339, + "learning_rate": 2e-05, + "loss": 0.05144128, + "step": 14505 + }, + { + "epoch": 29.012, + "grad_norm": 1.633912444114685, + "learning_rate": 2e-05, + "loss": 0.04580737, + "step": 14506 + }, + { + "epoch": 29.014, + "grad_norm": 1.3634629249572754, + "learning_rate": 2e-05, + "loss": 0.03885691, + "step": 14507 + }, + { + "epoch": 29.016, + "grad_norm": 1.5107965469360352, + "learning_rate": 2e-05, + "loss": 0.05544426, + "step": 14508 + }, + { + "epoch": 29.018, + "grad_norm": 1.50559401512146, + "learning_rate": 2e-05, + "loss": 0.03723505, + "step": 14509 + }, + { + "epoch": 29.02, + "grad_norm": 1.2116549015045166, + "learning_rate": 2e-05, + "loss": 0.06099574, + "step": 14510 + }, + { + "epoch": 29.022, + "grad_norm": 1.4817736148834229, + "learning_rate": 2e-05, + "loss": 0.04306397, + "step": 14511 + }, + { + "epoch": 29.024, 
+ "grad_norm": 1.6792289018630981, + "learning_rate": 2e-05, + "loss": 0.0461472, + "step": 14512 + }, + { + "epoch": 29.026, + "grad_norm": 1.1383700370788574, + "learning_rate": 2e-05, + "loss": 0.0400393, + "step": 14513 + }, + { + "epoch": 29.028, + "grad_norm": 1.6050753593444824, + "learning_rate": 2e-05, + "loss": 0.0388446, + "step": 14514 + }, + { + "epoch": 29.03, + "grad_norm": 2.475426435470581, + "learning_rate": 2e-05, + "loss": 0.04789076, + "step": 14515 + }, + { + "epoch": 29.032, + "grad_norm": 1.7298914194107056, + "learning_rate": 2e-05, + "loss": 0.04865304, + "step": 14516 + }, + { + "epoch": 29.034, + "grad_norm": 1.0967127084732056, + "learning_rate": 2e-05, + "loss": 0.02864125, + "step": 14517 + }, + { + "epoch": 29.036, + "grad_norm": 2.11590838432312, + "learning_rate": 2e-05, + "loss": 0.03580888, + "step": 14518 + }, + { + "epoch": 29.038, + "grad_norm": 1.135114073753357, + "learning_rate": 2e-05, + "loss": 0.04573719, + "step": 14519 + }, + { + "epoch": 29.04, + "grad_norm": 1.1727830171585083, + "learning_rate": 2e-05, + "loss": 0.04438477, + "step": 14520 + }, + { + "epoch": 29.042, + "grad_norm": 1.3196028470993042, + "learning_rate": 2e-05, + "loss": 0.04897299, + "step": 14521 + }, + { + "epoch": 29.044, + "grad_norm": 1.5784796476364136, + "learning_rate": 2e-05, + "loss": 0.06415123, + "step": 14522 + }, + { + "epoch": 29.046, + "grad_norm": 1.6042543649673462, + "learning_rate": 2e-05, + "loss": 0.03841857, + "step": 14523 + }, + { + "epoch": 29.048, + "grad_norm": 1.3283536434173584, + "learning_rate": 2e-05, + "loss": 0.06226228, + "step": 14524 + }, + { + "epoch": 29.05, + "grad_norm": 2.3464431762695312, + "learning_rate": 2e-05, + "loss": 0.05047053, + "step": 14525 + }, + { + "epoch": 29.052, + "grad_norm": 1.370872139930725, + "learning_rate": 2e-05, + "loss": 0.03550132, + "step": 14526 + }, + { + "epoch": 29.054, + "grad_norm": 1.3238565921783447, + "learning_rate": 2e-05, + "loss": 0.0407903, + "step": 14527 + }, + { + "epoch": 29.056, + "grad_norm": 1.16240656375885, + "learning_rate": 2e-05, + "loss": 0.03744307, + "step": 14528 + }, + { + "epoch": 29.058, + "grad_norm": 1.2564165592193604, + "learning_rate": 2e-05, + "loss": 0.03232381, + "step": 14529 + }, + { + "epoch": 29.06, + "grad_norm": 1.1828256845474243, + "learning_rate": 2e-05, + "loss": 0.04936057, + "step": 14530 + }, + { + "epoch": 29.062, + "grad_norm": 1.9137167930603027, + "learning_rate": 2e-05, + "loss": 0.05838963, + "step": 14531 + }, + { + "epoch": 29.064, + "grad_norm": 1.2713327407836914, + "learning_rate": 2e-05, + "loss": 0.04713174, + "step": 14532 + }, + { + "epoch": 29.066, + "grad_norm": 1.812070369720459, + "learning_rate": 2e-05, + "loss": 0.0491786, + "step": 14533 + }, + { + "epoch": 29.068, + "grad_norm": 1.9103559255599976, + "learning_rate": 2e-05, + "loss": 0.04707557, + "step": 14534 + }, + { + "epoch": 29.07, + "grad_norm": 1.211613655090332, + "learning_rate": 2e-05, + "loss": 0.0457197, + "step": 14535 + }, + { + "epoch": 29.072, + "grad_norm": 1.562606692314148, + "learning_rate": 2e-05, + "loss": 0.05934884, + "step": 14536 + }, + { + "epoch": 29.074, + "grad_norm": 1.6958134174346924, + "learning_rate": 2e-05, + "loss": 0.04679252, + "step": 14537 + }, + { + "epoch": 29.076, + "grad_norm": 1.0809810161590576, + "learning_rate": 2e-05, + "loss": 0.04069981, + "step": 14538 + }, + { + "epoch": 29.078, + "grad_norm": 1.0606014728546143, + "learning_rate": 2e-05, + "loss": 0.03034795, + "step": 14539 + }, + { + "epoch": 29.08, + "grad_norm": 
1.3464914560317993, + "learning_rate": 2e-05, + "loss": 0.03704887, + "step": 14540 + }, + { + "epoch": 29.082, + "grad_norm": 2.9524738788604736, + "learning_rate": 2e-05, + "loss": 0.06308039, + "step": 14541 + }, + { + "epoch": 29.084, + "grad_norm": 1.7321432828903198, + "learning_rate": 2e-05, + "loss": 0.06781643, + "step": 14542 + }, + { + "epoch": 29.086, + "grad_norm": 1.9871220588684082, + "learning_rate": 2e-05, + "loss": 0.0448276, + "step": 14543 + }, + { + "epoch": 29.088, + "grad_norm": 1.0521864891052246, + "learning_rate": 2e-05, + "loss": 0.04277543, + "step": 14544 + }, + { + "epoch": 29.09, + "grad_norm": 1.6437976360321045, + "learning_rate": 2e-05, + "loss": 0.05797031, + "step": 14545 + }, + { + "epoch": 29.092, + "grad_norm": 1.883757472038269, + "learning_rate": 2e-05, + "loss": 0.03330542, + "step": 14546 + }, + { + "epoch": 29.094, + "grad_norm": 1.5868852138519287, + "learning_rate": 2e-05, + "loss": 0.05670642, + "step": 14547 + }, + { + "epoch": 29.096, + "grad_norm": 1.4247984886169434, + "learning_rate": 2e-05, + "loss": 0.04870617, + "step": 14548 + }, + { + "epoch": 29.098, + "grad_norm": 1.7102012634277344, + "learning_rate": 2e-05, + "loss": 0.0426238, + "step": 14549 + }, + { + "epoch": 29.1, + "grad_norm": 1.3921092748641968, + "learning_rate": 2e-05, + "loss": 0.06138652, + "step": 14550 + }, + { + "epoch": 29.102, + "grad_norm": 1.0594866275787354, + "learning_rate": 2e-05, + "loss": 0.0388628, + "step": 14551 + }, + { + "epoch": 29.104, + "grad_norm": 1.6638845205307007, + "learning_rate": 2e-05, + "loss": 0.04031256, + "step": 14552 + }, + { + "epoch": 29.106, + "grad_norm": 1.3251597881317139, + "learning_rate": 2e-05, + "loss": 0.0368821, + "step": 14553 + }, + { + "epoch": 29.108, + "grad_norm": 1.3504347801208496, + "learning_rate": 2e-05, + "loss": 0.04997741, + "step": 14554 + }, + { + "epoch": 29.11, + "grad_norm": 1.2452493906021118, + "learning_rate": 2e-05, + "loss": 0.04492478, + "step": 14555 + }, + { + "epoch": 29.112, + "grad_norm": 1.7179110050201416, + "learning_rate": 2e-05, + "loss": 0.06608713, + "step": 14556 + }, + { + "epoch": 29.114, + "grad_norm": 0.9290695190429688, + "learning_rate": 2e-05, + "loss": 0.02252526, + "step": 14557 + }, + { + "epoch": 29.116, + "grad_norm": 1.4510340690612793, + "learning_rate": 2e-05, + "loss": 0.03936088, + "step": 14558 + }, + { + "epoch": 29.118, + "grad_norm": 1.056706428527832, + "learning_rate": 2e-05, + "loss": 0.04137318, + "step": 14559 + }, + { + "epoch": 29.12, + "grad_norm": 1.5099283456802368, + "learning_rate": 2e-05, + "loss": 0.05786295, + "step": 14560 + }, + { + "epoch": 29.122, + "grad_norm": 1.028210997581482, + "learning_rate": 2e-05, + "loss": 0.03324694, + "step": 14561 + }, + { + "epoch": 29.124, + "grad_norm": 1.200790524482727, + "learning_rate": 2e-05, + "loss": 0.0418604, + "step": 14562 + }, + { + "epoch": 29.126, + "grad_norm": 1.2720760107040405, + "learning_rate": 2e-05, + "loss": 0.05241677, + "step": 14563 + }, + { + "epoch": 29.128, + "grad_norm": 1.2256773710250854, + "learning_rate": 2e-05, + "loss": 0.05769563, + "step": 14564 + }, + { + "epoch": 29.13, + "grad_norm": 1.125728964805603, + "learning_rate": 2e-05, + "loss": 0.03989631, + "step": 14565 + }, + { + "epoch": 29.132, + "grad_norm": 3.5522632598876953, + "learning_rate": 2e-05, + "loss": 0.04670805, + "step": 14566 + }, + { + "epoch": 29.134, + "grad_norm": 1.2549570798873901, + "learning_rate": 2e-05, + "loss": 0.03625032, + "step": 14567 + }, + { + "epoch": 29.136, + "grad_norm": 
1.3453484773635864, + "learning_rate": 2e-05, + "loss": 0.04612465, + "step": 14568 + }, + { + "epoch": 29.138, + "grad_norm": 1.3071088790893555, + "learning_rate": 2e-05, + "loss": 0.04188219, + "step": 14569 + }, + { + "epoch": 29.14, + "grad_norm": 1.373300313949585, + "learning_rate": 2e-05, + "loss": 0.05468735, + "step": 14570 + }, + { + "epoch": 29.142, + "grad_norm": 1.1056849956512451, + "learning_rate": 2e-05, + "loss": 0.04801034, + "step": 14571 + }, + { + "epoch": 29.144, + "grad_norm": 2.6124298572540283, + "learning_rate": 2e-05, + "loss": 0.06286603, + "step": 14572 + }, + { + "epoch": 29.146, + "grad_norm": 1.090798020362854, + "learning_rate": 2e-05, + "loss": 0.04604357, + "step": 14573 + }, + { + "epoch": 29.148, + "grad_norm": 1.4949955940246582, + "learning_rate": 2e-05, + "loss": 0.05045891, + "step": 14574 + }, + { + "epoch": 29.15, + "grad_norm": 1.3205533027648926, + "learning_rate": 2e-05, + "loss": 0.04018179, + "step": 14575 + }, + { + "epoch": 29.152, + "grad_norm": 1.686370611190796, + "learning_rate": 2e-05, + "loss": 0.03889347, + "step": 14576 + }, + { + "epoch": 29.154, + "grad_norm": 0.989204466342926, + "learning_rate": 2e-05, + "loss": 0.04520692, + "step": 14577 + }, + { + "epoch": 29.156, + "grad_norm": 1.28013014793396, + "learning_rate": 2e-05, + "loss": 0.04326824, + "step": 14578 + }, + { + "epoch": 29.158, + "grad_norm": 1.1764912605285645, + "learning_rate": 2e-05, + "loss": 0.03642344, + "step": 14579 + }, + { + "epoch": 29.16, + "grad_norm": 1.8132061958312988, + "learning_rate": 2e-05, + "loss": 0.05279174, + "step": 14580 + }, + { + "epoch": 29.162, + "grad_norm": 1.478899359703064, + "learning_rate": 2e-05, + "loss": 0.05297806, + "step": 14581 + }, + { + "epoch": 29.164, + "grad_norm": 3.247129201889038, + "learning_rate": 2e-05, + "loss": 0.05896338, + "step": 14582 + }, + { + "epoch": 29.166, + "grad_norm": 1.3242168426513672, + "learning_rate": 2e-05, + "loss": 0.03942503, + "step": 14583 + }, + { + "epoch": 29.168, + "grad_norm": 1.1935416460037231, + "learning_rate": 2e-05, + "loss": 0.04639933, + "step": 14584 + }, + { + "epoch": 29.17, + "grad_norm": 1.0874605178833008, + "learning_rate": 2e-05, + "loss": 0.04434571, + "step": 14585 + }, + { + "epoch": 29.172, + "grad_norm": 1.1143393516540527, + "learning_rate": 2e-05, + "loss": 0.0469871, + "step": 14586 + }, + { + "epoch": 29.174, + "grad_norm": 1.423553228378296, + "learning_rate": 2e-05, + "loss": 0.03586338, + "step": 14587 + }, + { + "epoch": 29.176, + "grad_norm": 1.2574028968811035, + "learning_rate": 2e-05, + "loss": 0.038145, + "step": 14588 + }, + { + "epoch": 29.178, + "grad_norm": 1.1540359258651733, + "learning_rate": 2e-05, + "loss": 0.04100233, + "step": 14589 + }, + { + "epoch": 29.18, + "grad_norm": 1.3354766368865967, + "learning_rate": 2e-05, + "loss": 0.05904789, + "step": 14590 + }, + { + "epoch": 29.182, + "grad_norm": 1.163336992263794, + "learning_rate": 2e-05, + "loss": 0.04909441, + "step": 14591 + }, + { + "epoch": 29.184, + "grad_norm": 2.428375482559204, + "learning_rate": 2e-05, + "loss": 0.04465162, + "step": 14592 + }, + { + "epoch": 29.186, + "grad_norm": 1.6806772947311401, + "learning_rate": 2e-05, + "loss": 0.04538292, + "step": 14593 + }, + { + "epoch": 29.188, + "grad_norm": 1.1672495603561401, + "learning_rate": 2e-05, + "loss": 0.04777298, + "step": 14594 + }, + { + "epoch": 29.19, + "grad_norm": 1.0995149612426758, + "learning_rate": 2e-05, + "loss": 0.03148938, + "step": 14595 + }, + { + "epoch": 29.192, + "grad_norm": 
1.1681532859802246, + "learning_rate": 2e-05, + "loss": 0.03608632, + "step": 14596 + }, + { + "epoch": 29.194, + "grad_norm": 1.3820486068725586, + "learning_rate": 2e-05, + "loss": 0.05060082, + "step": 14597 + }, + { + "epoch": 29.196, + "grad_norm": 1.0916459560394287, + "learning_rate": 2e-05, + "loss": 0.03177829, + "step": 14598 + }, + { + "epoch": 29.198, + "grad_norm": 1.2117968797683716, + "learning_rate": 2e-05, + "loss": 0.04310794, + "step": 14599 + }, + { + "epoch": 29.2, + "grad_norm": 1.3815139532089233, + "learning_rate": 2e-05, + "loss": 0.03904974, + "step": 14600 + }, + { + "epoch": 29.202, + "grad_norm": 1.455543875694275, + "learning_rate": 2e-05, + "loss": 0.04168963, + "step": 14601 + }, + { + "epoch": 29.204, + "grad_norm": 1.2078819274902344, + "learning_rate": 2e-05, + "loss": 0.05466375, + "step": 14602 + }, + { + "epoch": 29.206, + "grad_norm": 1.8423599004745483, + "learning_rate": 2e-05, + "loss": 0.03679687, + "step": 14603 + }, + { + "epoch": 29.208, + "grad_norm": 1.3103402853012085, + "learning_rate": 2e-05, + "loss": 0.0345687, + "step": 14604 + }, + { + "epoch": 29.21, + "grad_norm": 1.3979569673538208, + "learning_rate": 2e-05, + "loss": 0.04111661, + "step": 14605 + }, + { + "epoch": 29.212, + "grad_norm": 1.2820682525634766, + "learning_rate": 2e-05, + "loss": 0.05244845, + "step": 14606 + }, + { + "epoch": 29.214, + "grad_norm": 1.8376924991607666, + "learning_rate": 2e-05, + "loss": 0.06727045, + "step": 14607 + }, + { + "epoch": 29.216, + "grad_norm": 1.4911775588989258, + "learning_rate": 2e-05, + "loss": 0.03376488, + "step": 14608 + }, + { + "epoch": 29.218, + "grad_norm": 2.5098586082458496, + "learning_rate": 2e-05, + "loss": 0.07967003, + "step": 14609 + }, + { + "epoch": 29.22, + "grad_norm": 1.4665964841842651, + "learning_rate": 2e-05, + "loss": 0.05623496, + "step": 14610 + }, + { + "epoch": 29.222, + "grad_norm": 1.1030839681625366, + "learning_rate": 2e-05, + "loss": 0.04080631, + "step": 14611 + }, + { + "epoch": 29.224, + "grad_norm": 1.169342041015625, + "learning_rate": 2e-05, + "loss": 0.0512477, + "step": 14612 + }, + { + "epoch": 29.226, + "grad_norm": 5.032285690307617, + "learning_rate": 2e-05, + "loss": 0.04048228, + "step": 14613 + }, + { + "epoch": 29.228, + "grad_norm": 1.2166979312896729, + "learning_rate": 2e-05, + "loss": 0.04632211, + "step": 14614 + }, + { + "epoch": 29.23, + "grad_norm": 1.1534336805343628, + "learning_rate": 2e-05, + "loss": 0.03913743, + "step": 14615 + }, + { + "epoch": 29.232, + "grad_norm": 1.1760746240615845, + "learning_rate": 2e-05, + "loss": 0.04341678, + "step": 14616 + }, + { + "epoch": 29.234, + "grad_norm": 1.4027602672576904, + "learning_rate": 2e-05, + "loss": 0.04686597, + "step": 14617 + }, + { + "epoch": 29.236, + "grad_norm": 3.7930917739868164, + "learning_rate": 2e-05, + "loss": 0.05493877, + "step": 14618 + }, + { + "epoch": 29.238, + "grad_norm": 1.149735689163208, + "learning_rate": 2e-05, + "loss": 0.0425221, + "step": 14619 + }, + { + "epoch": 29.24, + "grad_norm": 1.6718671321868896, + "learning_rate": 2e-05, + "loss": 0.05708656, + "step": 14620 + }, + { + "epoch": 29.242, + "grad_norm": 1.2668569087982178, + "learning_rate": 2e-05, + "loss": 0.04664732, + "step": 14621 + }, + { + "epoch": 29.244, + "grad_norm": 1.033226490020752, + "learning_rate": 2e-05, + "loss": 0.02460607, + "step": 14622 + }, + { + "epoch": 29.246, + "grad_norm": 1.172166109085083, + "learning_rate": 2e-05, + "loss": 0.04102369, + "step": 14623 + }, + { + "epoch": 29.248, + "grad_norm": 
1.5639601945877075, + "learning_rate": 2e-05, + "loss": 0.0346932, + "step": 14624 + }, + { + "epoch": 29.25, + "grad_norm": 1.2138320207595825, + "learning_rate": 2e-05, + "loss": 0.05353147, + "step": 14625 + }, + { + "epoch": 29.252, + "grad_norm": 1.075577974319458, + "learning_rate": 2e-05, + "loss": 0.04629251, + "step": 14626 + }, + { + "epoch": 29.254, + "grad_norm": 4.6314473152160645, + "learning_rate": 2e-05, + "loss": 0.05750819, + "step": 14627 + }, + { + "epoch": 29.256, + "grad_norm": 1.6789414882659912, + "learning_rate": 2e-05, + "loss": 0.04619577, + "step": 14628 + }, + { + "epoch": 29.258, + "grad_norm": 10.09363079071045, + "learning_rate": 2e-05, + "loss": 0.06505147, + "step": 14629 + }, + { + "epoch": 29.26, + "grad_norm": 1.3053208589553833, + "learning_rate": 2e-05, + "loss": 0.03836732, + "step": 14630 + }, + { + "epoch": 29.262, + "grad_norm": 1.1734479665756226, + "learning_rate": 2e-05, + "loss": 0.04292649, + "step": 14631 + }, + { + "epoch": 29.264, + "grad_norm": 1.0191097259521484, + "learning_rate": 2e-05, + "loss": 0.0362784, + "step": 14632 + }, + { + "epoch": 29.266, + "grad_norm": 1.4284632205963135, + "learning_rate": 2e-05, + "loss": 0.04708409, + "step": 14633 + }, + { + "epoch": 29.268, + "grad_norm": 1.857406735420227, + "learning_rate": 2e-05, + "loss": 0.04922803, + "step": 14634 + }, + { + "epoch": 29.27, + "grad_norm": 1.4661109447479248, + "learning_rate": 2e-05, + "loss": 0.032971, + "step": 14635 + }, + { + "epoch": 29.272, + "grad_norm": 1.6468689441680908, + "learning_rate": 2e-05, + "loss": 0.04467516, + "step": 14636 + }, + { + "epoch": 29.274, + "grad_norm": 1.1976274251937866, + "learning_rate": 2e-05, + "loss": 0.05661123, + "step": 14637 + }, + { + "epoch": 29.276, + "grad_norm": 1.0101951360702515, + "learning_rate": 2e-05, + "loss": 0.04431897, + "step": 14638 + }, + { + "epoch": 29.278, + "grad_norm": 1.9550938606262207, + "learning_rate": 2e-05, + "loss": 0.03428712, + "step": 14639 + }, + { + "epoch": 29.28, + "grad_norm": 1.263190746307373, + "learning_rate": 2e-05, + "loss": 0.05213517, + "step": 14640 + }, + { + "epoch": 29.282, + "grad_norm": 1.0882905721664429, + "learning_rate": 2e-05, + "loss": 0.04379845, + "step": 14641 + }, + { + "epoch": 29.284, + "grad_norm": 1.6553674936294556, + "learning_rate": 2e-05, + "loss": 0.04459314, + "step": 14642 + }, + { + "epoch": 29.286, + "grad_norm": 1.1553642749786377, + "learning_rate": 2e-05, + "loss": 0.04111927, + "step": 14643 + }, + { + "epoch": 29.288, + "grad_norm": 1.7052907943725586, + "learning_rate": 2e-05, + "loss": 0.04706088, + "step": 14644 + }, + { + "epoch": 29.29, + "grad_norm": 1.7893346548080444, + "learning_rate": 2e-05, + "loss": 0.04122625, + "step": 14645 + }, + { + "epoch": 29.292, + "grad_norm": 0.8917121887207031, + "learning_rate": 2e-05, + "loss": 0.02282002, + "step": 14646 + }, + { + "epoch": 29.294, + "grad_norm": 1.3831899166107178, + "learning_rate": 2e-05, + "loss": 0.06464884, + "step": 14647 + }, + { + "epoch": 29.296, + "grad_norm": 1.0708141326904297, + "learning_rate": 2e-05, + "loss": 0.03723243, + "step": 14648 + }, + { + "epoch": 29.298, + "grad_norm": 2.3124451637268066, + "learning_rate": 2e-05, + "loss": 0.03636015, + "step": 14649 + }, + { + "epoch": 29.3, + "grad_norm": 1.150187373161316, + "learning_rate": 2e-05, + "loss": 0.03666411, + "step": 14650 + }, + { + "epoch": 29.302, + "grad_norm": 1.4192458391189575, + "learning_rate": 2e-05, + "loss": 0.05449123, + "step": 14651 + }, + { + "epoch": 29.304, + "grad_norm": 
1.0729455947875977, + "learning_rate": 2e-05, + "loss": 0.04042206, + "step": 14652 + }, + { + "epoch": 29.306, + "grad_norm": 1.1908464431762695, + "learning_rate": 2e-05, + "loss": 0.0445194, + "step": 14653 + }, + { + "epoch": 29.308, + "grad_norm": 1.1397109031677246, + "learning_rate": 2e-05, + "loss": 0.03711201, + "step": 14654 + }, + { + "epoch": 29.31, + "grad_norm": 1.309540867805481, + "learning_rate": 2e-05, + "loss": 0.04084215, + "step": 14655 + }, + { + "epoch": 29.312, + "grad_norm": 1.0415390729904175, + "learning_rate": 2e-05, + "loss": 0.03293938, + "step": 14656 + }, + { + "epoch": 29.314, + "grad_norm": 1.683875560760498, + "learning_rate": 2e-05, + "loss": 0.04933864, + "step": 14657 + }, + { + "epoch": 29.316, + "grad_norm": 1.3526222705841064, + "learning_rate": 2e-05, + "loss": 0.02989796, + "step": 14658 + }, + { + "epoch": 29.318, + "grad_norm": 1.5054032802581787, + "learning_rate": 2e-05, + "loss": 0.04449115, + "step": 14659 + }, + { + "epoch": 29.32, + "grad_norm": 1.2268929481506348, + "learning_rate": 2e-05, + "loss": 0.0320545, + "step": 14660 + }, + { + "epoch": 29.322, + "grad_norm": 1.3516592979431152, + "learning_rate": 2e-05, + "loss": 0.05182027, + "step": 14661 + }, + { + "epoch": 29.324, + "grad_norm": 1.1587803363800049, + "learning_rate": 2e-05, + "loss": 0.04185852, + "step": 14662 + }, + { + "epoch": 29.326, + "grad_norm": 1.048889398574829, + "learning_rate": 2e-05, + "loss": 0.02531143, + "step": 14663 + }, + { + "epoch": 29.328, + "grad_norm": 2.0608630180358887, + "learning_rate": 2e-05, + "loss": 0.04467651, + "step": 14664 + }, + { + "epoch": 29.33, + "grad_norm": 1.0219707489013672, + "learning_rate": 2e-05, + "loss": 0.02736703, + "step": 14665 + }, + { + "epoch": 29.332, + "grad_norm": 1.091489553451538, + "learning_rate": 2e-05, + "loss": 0.04337668, + "step": 14666 + }, + { + "epoch": 29.334, + "grad_norm": 1.2775390148162842, + "learning_rate": 2e-05, + "loss": 0.04817453, + "step": 14667 + }, + { + "epoch": 29.336, + "grad_norm": 1.760574221611023, + "learning_rate": 2e-05, + "loss": 0.04524562, + "step": 14668 + }, + { + "epoch": 29.338, + "grad_norm": 1.5247159004211426, + "learning_rate": 2e-05, + "loss": 0.04375861, + "step": 14669 + }, + { + "epoch": 29.34, + "grad_norm": 1.308775544166565, + "learning_rate": 2e-05, + "loss": 0.04865634, + "step": 14670 + }, + { + "epoch": 29.342, + "grad_norm": 1.2305173873901367, + "learning_rate": 2e-05, + "loss": 0.04637342, + "step": 14671 + }, + { + "epoch": 29.344, + "grad_norm": 1.3184024095535278, + "learning_rate": 2e-05, + "loss": 0.048801, + "step": 14672 + }, + { + "epoch": 29.346, + "grad_norm": 1.0286909341812134, + "learning_rate": 2e-05, + "loss": 0.03713179, + "step": 14673 + }, + { + "epoch": 29.348, + "grad_norm": 1.2571288347244263, + "learning_rate": 2e-05, + "loss": 0.03584104, + "step": 14674 + }, + { + "epoch": 29.35, + "grad_norm": 0.9635476469993591, + "learning_rate": 2e-05, + "loss": 0.03240035, + "step": 14675 + }, + { + "epoch": 29.352, + "grad_norm": 0.9458169341087341, + "learning_rate": 2e-05, + "loss": 0.03441823, + "step": 14676 + }, + { + "epoch": 29.354, + "grad_norm": 1.2421998977661133, + "learning_rate": 2e-05, + "loss": 0.02801424, + "step": 14677 + }, + { + "epoch": 29.356, + "grad_norm": 1.1887518167495728, + "learning_rate": 2e-05, + "loss": 0.03601447, + "step": 14678 + }, + { + "epoch": 29.358, + "grad_norm": 1.2935701608657837, + "learning_rate": 2e-05, + "loss": 0.05169864, + "step": 14679 + }, + { + "epoch": 29.36, + "grad_norm": 
1.5155844688415527, + "learning_rate": 2e-05, + "loss": 0.05170265, + "step": 14680 + }, + { + "epoch": 29.362, + "grad_norm": 1.0864388942718506, + "learning_rate": 2e-05, + "loss": 0.03917462, + "step": 14681 + }, + { + "epoch": 29.364, + "grad_norm": 3.3979506492614746, + "learning_rate": 2e-05, + "loss": 0.06590277, + "step": 14682 + }, + { + "epoch": 29.366, + "grad_norm": 1.263260006904602, + "learning_rate": 2e-05, + "loss": 0.03736039, + "step": 14683 + }, + { + "epoch": 29.368, + "grad_norm": 1.6203467845916748, + "learning_rate": 2e-05, + "loss": 0.03045352, + "step": 14684 + }, + { + "epoch": 29.37, + "grad_norm": 1.185299038887024, + "learning_rate": 2e-05, + "loss": 0.0308273, + "step": 14685 + }, + { + "epoch": 29.372, + "grad_norm": 1.065706491470337, + "learning_rate": 2e-05, + "loss": 0.04142375, + "step": 14686 + }, + { + "epoch": 29.374, + "grad_norm": 1.071706771850586, + "learning_rate": 2e-05, + "loss": 0.02857183, + "step": 14687 + }, + { + "epoch": 29.376, + "grad_norm": 1.0515819787979126, + "learning_rate": 2e-05, + "loss": 0.03293607, + "step": 14688 + }, + { + "epoch": 29.378, + "grad_norm": 1.2863487005233765, + "learning_rate": 2e-05, + "loss": 0.06486448, + "step": 14689 + }, + { + "epoch": 29.38, + "grad_norm": 1.3283601999282837, + "learning_rate": 2e-05, + "loss": 0.0511212, + "step": 14690 + }, + { + "epoch": 29.382, + "grad_norm": 1.3435486555099487, + "learning_rate": 2e-05, + "loss": 0.04754411, + "step": 14691 + }, + { + "epoch": 29.384, + "grad_norm": 2.4031832218170166, + "learning_rate": 2e-05, + "loss": 0.02932245, + "step": 14692 + }, + { + "epoch": 29.386, + "grad_norm": 1.0233304500579834, + "learning_rate": 2e-05, + "loss": 0.02972196, + "step": 14693 + }, + { + "epoch": 29.388, + "grad_norm": 0.9224061369895935, + "learning_rate": 2e-05, + "loss": 0.03122431, + "step": 14694 + }, + { + "epoch": 29.39, + "grad_norm": 3.187535524368286, + "learning_rate": 2e-05, + "loss": 0.06037506, + "step": 14695 + }, + { + "epoch": 29.392, + "grad_norm": 2.3195505142211914, + "learning_rate": 2e-05, + "loss": 0.04076795, + "step": 14696 + }, + { + "epoch": 29.394, + "grad_norm": 1.0622916221618652, + "learning_rate": 2e-05, + "loss": 0.02912345, + "step": 14697 + }, + { + "epoch": 29.396, + "grad_norm": 1.1375831365585327, + "learning_rate": 2e-05, + "loss": 0.03395087, + "step": 14698 + }, + { + "epoch": 29.398, + "grad_norm": 4.46896505355835, + "learning_rate": 2e-05, + "loss": 0.08969009, + "step": 14699 + }, + { + "epoch": 29.4, + "grad_norm": 1.382957100868225, + "learning_rate": 2e-05, + "loss": 0.05825138, + "step": 14700 + }, + { + "epoch": 29.402, + "grad_norm": 3.7034122943878174, + "learning_rate": 2e-05, + "loss": 0.05984267, + "step": 14701 + }, + { + "epoch": 29.404, + "grad_norm": 1.288468837738037, + "learning_rate": 2e-05, + "loss": 0.04220777, + "step": 14702 + }, + { + "epoch": 29.406, + "grad_norm": 5.697216510772705, + "learning_rate": 2e-05, + "loss": 0.04966062, + "step": 14703 + }, + { + "epoch": 29.408, + "grad_norm": 1.3597912788391113, + "learning_rate": 2e-05, + "loss": 0.06009267, + "step": 14704 + }, + { + "epoch": 29.41, + "grad_norm": 1.123845100402832, + "learning_rate": 2e-05, + "loss": 0.03696072, + "step": 14705 + }, + { + "epoch": 29.412, + "grad_norm": 1.5596554279327393, + "learning_rate": 2e-05, + "loss": 0.03732103, + "step": 14706 + }, + { + "epoch": 29.414, + "grad_norm": 1.2256417274475098, + "learning_rate": 2e-05, + "loss": 0.0455781, + "step": 14707 + }, + { + "epoch": 29.416, + "grad_norm": 
1.2588534355163574, + "learning_rate": 2e-05, + "loss": 0.04248727, + "step": 14708 + }, + { + "epoch": 29.418, + "grad_norm": 1.5599313974380493, + "learning_rate": 2e-05, + "loss": 0.05124461, + "step": 14709 + }, + { + "epoch": 29.42, + "grad_norm": 1.3829172849655151, + "learning_rate": 2e-05, + "loss": 0.05284266, + "step": 14710 + }, + { + "epoch": 29.422, + "grad_norm": 0.9758954644203186, + "learning_rate": 2e-05, + "loss": 0.03507173, + "step": 14711 + }, + { + "epoch": 29.424, + "grad_norm": 1.019920825958252, + "learning_rate": 2e-05, + "loss": 0.03660026, + "step": 14712 + }, + { + "epoch": 29.426, + "grad_norm": 1.6652642488479614, + "learning_rate": 2e-05, + "loss": 0.05362531, + "step": 14713 + }, + { + "epoch": 29.428, + "grad_norm": 3.9348857402801514, + "learning_rate": 2e-05, + "loss": 0.03445033, + "step": 14714 + }, + { + "epoch": 29.43, + "grad_norm": 0.9290486574172974, + "learning_rate": 2e-05, + "loss": 0.03240933, + "step": 14715 + }, + { + "epoch": 29.432, + "grad_norm": 1.4232569932937622, + "learning_rate": 2e-05, + "loss": 0.05291854, + "step": 14716 + }, + { + "epoch": 29.434, + "grad_norm": 1.1019871234893799, + "learning_rate": 2e-05, + "loss": 0.04499703, + "step": 14717 + }, + { + "epoch": 29.436, + "grad_norm": 1.0482898950576782, + "learning_rate": 2e-05, + "loss": 0.02875671, + "step": 14718 + }, + { + "epoch": 29.438, + "grad_norm": 2.0963454246520996, + "learning_rate": 2e-05, + "loss": 0.06372029, + "step": 14719 + }, + { + "epoch": 29.44, + "grad_norm": 1.4427788257598877, + "learning_rate": 2e-05, + "loss": 0.05994175, + "step": 14720 + }, + { + "epoch": 29.442, + "grad_norm": 1.1246081590652466, + "learning_rate": 2e-05, + "loss": 0.05035602, + "step": 14721 + }, + { + "epoch": 29.444, + "grad_norm": 1.080545425415039, + "learning_rate": 2e-05, + "loss": 0.03505052, + "step": 14722 + }, + { + "epoch": 29.446, + "grad_norm": 2.5756614208221436, + "learning_rate": 2e-05, + "loss": 0.0641964, + "step": 14723 + }, + { + "epoch": 29.448, + "grad_norm": 0.9659985899925232, + "learning_rate": 2e-05, + "loss": 0.0326273, + "step": 14724 + }, + { + "epoch": 29.45, + "grad_norm": 1.0880937576293945, + "learning_rate": 2e-05, + "loss": 0.03114274, + "step": 14725 + }, + { + "epoch": 29.452, + "grad_norm": 1.5530476570129395, + "learning_rate": 2e-05, + "loss": 0.0458639, + "step": 14726 + }, + { + "epoch": 29.454, + "grad_norm": 1.9776489734649658, + "learning_rate": 2e-05, + "loss": 0.04284504, + "step": 14727 + }, + { + "epoch": 29.456, + "grad_norm": 2.1341958045959473, + "learning_rate": 2e-05, + "loss": 0.06103675, + "step": 14728 + }, + { + "epoch": 29.458, + "grad_norm": 2.14361834526062, + "learning_rate": 2e-05, + "loss": 0.04144895, + "step": 14729 + }, + { + "epoch": 29.46, + "grad_norm": 1.2067841291427612, + "learning_rate": 2e-05, + "loss": 0.03508757, + "step": 14730 + }, + { + "epoch": 29.462, + "grad_norm": 1.6224901676177979, + "learning_rate": 2e-05, + "loss": 0.03946466, + "step": 14731 + }, + { + "epoch": 29.464, + "grad_norm": 1.643559455871582, + "learning_rate": 2e-05, + "loss": 0.04336884, + "step": 14732 + }, + { + "epoch": 29.466, + "grad_norm": 1.0944159030914307, + "learning_rate": 2e-05, + "loss": 0.05785472, + "step": 14733 + }, + { + "epoch": 29.468, + "grad_norm": 1.0807979106903076, + "learning_rate": 2e-05, + "loss": 0.03531633, + "step": 14734 + }, + { + "epoch": 29.47, + "grad_norm": 2.1468169689178467, + "learning_rate": 2e-05, + "loss": 0.05086451, + "step": 14735 + }, + { + "epoch": 29.472, + "grad_norm": 
1.5246763229370117, + "learning_rate": 2e-05, + "loss": 0.05114653, + "step": 14736 + }, + { + "epoch": 29.474, + "grad_norm": 1.0691903829574585, + "learning_rate": 2e-05, + "loss": 0.04326432, + "step": 14737 + }, + { + "epoch": 29.476, + "grad_norm": 1.1668561697006226, + "learning_rate": 2e-05, + "loss": 0.05376833, + "step": 14738 + }, + { + "epoch": 29.478, + "grad_norm": 1.191372275352478, + "learning_rate": 2e-05, + "loss": 0.04350607, + "step": 14739 + }, + { + "epoch": 29.48, + "grad_norm": 1.1428472995758057, + "learning_rate": 2e-05, + "loss": 0.03237382, + "step": 14740 + }, + { + "epoch": 29.482, + "grad_norm": 1.0473662614822388, + "learning_rate": 2e-05, + "loss": 0.03938694, + "step": 14741 + }, + { + "epoch": 29.484, + "grad_norm": 0.9359759092330933, + "learning_rate": 2e-05, + "loss": 0.03335633, + "step": 14742 + }, + { + "epoch": 29.486, + "grad_norm": 1.2112761735916138, + "learning_rate": 2e-05, + "loss": 0.03153354, + "step": 14743 + }, + { + "epoch": 29.488, + "grad_norm": 0.8638390302658081, + "learning_rate": 2e-05, + "loss": 0.0252349, + "step": 14744 + }, + { + "epoch": 29.49, + "grad_norm": 0.9504496455192566, + "learning_rate": 2e-05, + "loss": 0.03327549, + "step": 14745 + }, + { + "epoch": 29.492, + "grad_norm": 4.146841526031494, + "learning_rate": 2e-05, + "loss": 0.04628399, + "step": 14746 + }, + { + "epoch": 29.494, + "grad_norm": 2.1307597160339355, + "learning_rate": 2e-05, + "loss": 0.06677426, + "step": 14747 + }, + { + "epoch": 29.496, + "grad_norm": 1.4499460458755493, + "learning_rate": 2e-05, + "loss": 0.04317267, + "step": 14748 + }, + { + "epoch": 29.498, + "grad_norm": 1.7764891386032104, + "learning_rate": 2e-05, + "loss": 0.06008489, + "step": 14749 + }, + { + "epoch": 29.5, + "grad_norm": 1.4432235956192017, + "learning_rate": 2e-05, + "loss": 0.06923801, + "step": 14750 + }, + { + "epoch": 29.502, + "grad_norm": 1.4468677043914795, + "learning_rate": 2e-05, + "loss": 0.05008958, + "step": 14751 + }, + { + "epoch": 29.504, + "grad_norm": 1.3012151718139648, + "learning_rate": 2e-05, + "loss": 0.04738281, + "step": 14752 + }, + { + "epoch": 29.506, + "grad_norm": 1.1001543998718262, + "learning_rate": 2e-05, + "loss": 0.04851674, + "step": 14753 + }, + { + "epoch": 29.508, + "grad_norm": 1.2774382829666138, + "learning_rate": 2e-05, + "loss": 0.03910155, + "step": 14754 + }, + { + "epoch": 29.51, + "grad_norm": 1.2547918558120728, + "learning_rate": 2e-05, + "loss": 0.05241001, + "step": 14755 + }, + { + "epoch": 29.512, + "grad_norm": 1.3092374801635742, + "learning_rate": 2e-05, + "loss": 0.04025935, + "step": 14756 + }, + { + "epoch": 29.514, + "grad_norm": 1.3792351484298706, + "learning_rate": 2e-05, + "loss": 0.0517666, + "step": 14757 + }, + { + "epoch": 29.516, + "grad_norm": 0.9093297719955444, + "learning_rate": 2e-05, + "loss": 0.03267594, + "step": 14758 + }, + { + "epoch": 29.518, + "grad_norm": 1.0705640316009521, + "learning_rate": 2e-05, + "loss": 0.04189052, + "step": 14759 + }, + { + "epoch": 29.52, + "grad_norm": 1.0387095212936401, + "learning_rate": 2e-05, + "loss": 0.04280265, + "step": 14760 + }, + { + "epoch": 29.522, + "grad_norm": 1.2232365608215332, + "learning_rate": 2e-05, + "loss": 0.04506699, + "step": 14761 + }, + { + "epoch": 29.524, + "grad_norm": 1.1054472923278809, + "learning_rate": 2e-05, + "loss": 0.04134972, + "step": 14762 + }, + { + "epoch": 29.526, + "grad_norm": 1.1075634956359863, + "learning_rate": 2e-05, + "loss": 0.03896327, + "step": 14763 + }, + { + "epoch": 29.528, + "grad_norm": 
1.2247326374053955, + "learning_rate": 2e-05, + "loss": 0.0459715, + "step": 14764 + }, + { + "epoch": 29.53, + "grad_norm": 1.2403117418289185, + "learning_rate": 2e-05, + "loss": 0.03991577, + "step": 14765 + }, + { + "epoch": 29.532, + "grad_norm": 1.0780881643295288, + "learning_rate": 2e-05, + "loss": 0.03780124, + "step": 14766 + }, + { + "epoch": 29.534, + "grad_norm": 1.5894378423690796, + "learning_rate": 2e-05, + "loss": 0.03415346, + "step": 14767 + }, + { + "epoch": 29.536, + "grad_norm": 0.9309705495834351, + "learning_rate": 2e-05, + "loss": 0.02937978, + "step": 14768 + }, + { + "epoch": 29.538, + "grad_norm": 1.135497808456421, + "learning_rate": 2e-05, + "loss": 0.03995682, + "step": 14769 + }, + { + "epoch": 29.54, + "grad_norm": 1.4162516593933105, + "learning_rate": 2e-05, + "loss": 0.05143316, + "step": 14770 + }, + { + "epoch": 29.542, + "grad_norm": 1.3331693410873413, + "learning_rate": 2e-05, + "loss": 0.04119586, + "step": 14771 + }, + { + "epoch": 29.544, + "grad_norm": 1.2744102478027344, + "learning_rate": 2e-05, + "loss": 0.06154334, + "step": 14772 + }, + { + "epoch": 29.546, + "grad_norm": 1.3395334482192993, + "learning_rate": 2e-05, + "loss": 0.03931785, + "step": 14773 + }, + { + "epoch": 29.548000000000002, + "grad_norm": 1.2095366716384888, + "learning_rate": 2e-05, + "loss": 0.04690979, + "step": 14774 + }, + { + "epoch": 29.55, + "grad_norm": 1.7333354949951172, + "learning_rate": 2e-05, + "loss": 0.0398626, + "step": 14775 + }, + { + "epoch": 29.552, + "grad_norm": 1.404498815536499, + "learning_rate": 2e-05, + "loss": 0.03682255, + "step": 14776 + }, + { + "epoch": 29.554, + "grad_norm": 1.3003278970718384, + "learning_rate": 2e-05, + "loss": 0.03933616, + "step": 14777 + }, + { + "epoch": 29.556, + "grad_norm": 1.013600468635559, + "learning_rate": 2e-05, + "loss": 0.04233592, + "step": 14778 + }, + { + "epoch": 29.558, + "grad_norm": 0.9388518333435059, + "learning_rate": 2e-05, + "loss": 0.03290233, + "step": 14779 + }, + { + "epoch": 29.56, + "grad_norm": 1.076974868774414, + "learning_rate": 2e-05, + "loss": 0.03113963, + "step": 14780 + }, + { + "epoch": 29.562, + "grad_norm": 1.5698806047439575, + "learning_rate": 2e-05, + "loss": 0.03970818, + "step": 14781 + }, + { + "epoch": 29.564, + "grad_norm": 1.5527507066726685, + "learning_rate": 2e-05, + "loss": 0.05457158, + "step": 14782 + }, + { + "epoch": 29.566, + "grad_norm": 1.105551838874817, + "learning_rate": 2e-05, + "loss": 0.0364199, + "step": 14783 + }, + { + "epoch": 29.568, + "grad_norm": 1.2447288036346436, + "learning_rate": 2e-05, + "loss": 0.05043525, + "step": 14784 + }, + { + "epoch": 29.57, + "grad_norm": 1.6889266967773438, + "learning_rate": 2e-05, + "loss": 0.03643173, + "step": 14785 + }, + { + "epoch": 29.572, + "grad_norm": 1.1132603883743286, + "learning_rate": 2e-05, + "loss": 0.03821333, + "step": 14786 + }, + { + "epoch": 29.574, + "grad_norm": 1.628547191619873, + "learning_rate": 2e-05, + "loss": 0.02059213, + "step": 14787 + }, + { + "epoch": 29.576, + "grad_norm": 1.4750348329544067, + "learning_rate": 2e-05, + "loss": 0.05588658, + "step": 14788 + }, + { + "epoch": 29.578, + "grad_norm": 1.142136812210083, + "learning_rate": 2e-05, + "loss": 0.04343769, + "step": 14789 + }, + { + "epoch": 29.58, + "grad_norm": 2.035980463027954, + "learning_rate": 2e-05, + "loss": 0.04574387, + "step": 14790 + }, + { + "epoch": 29.582, + "grad_norm": 0.9724331498146057, + "learning_rate": 2e-05, + "loss": 0.0316183, + "step": 14791 + }, + { + "epoch": 29.584, + "grad_norm": 
1.0555124282836914, + "learning_rate": 2e-05, + "loss": 0.03854711, + "step": 14792 + }, + { + "epoch": 29.586, + "grad_norm": 2.9631097316741943, + "learning_rate": 2e-05, + "loss": 0.07389696, + "step": 14793 + }, + { + "epoch": 29.588, + "grad_norm": 1.2569674253463745, + "learning_rate": 2e-05, + "loss": 0.04960248, + "step": 14794 + }, + { + "epoch": 29.59, + "grad_norm": 0.8959588408470154, + "learning_rate": 2e-05, + "loss": 0.02651092, + "step": 14795 + }, + { + "epoch": 29.592, + "grad_norm": 1.4212428331375122, + "learning_rate": 2e-05, + "loss": 0.05283296, + "step": 14796 + }, + { + "epoch": 29.594, + "grad_norm": 1.2432961463928223, + "learning_rate": 2e-05, + "loss": 0.03414245, + "step": 14797 + }, + { + "epoch": 29.596, + "grad_norm": 1.3017022609710693, + "learning_rate": 2e-05, + "loss": 0.04572171, + "step": 14798 + }, + { + "epoch": 29.598, + "grad_norm": 1.7550230026245117, + "learning_rate": 2e-05, + "loss": 0.04602732, + "step": 14799 + }, + { + "epoch": 29.6, + "grad_norm": 0.9701183438301086, + "learning_rate": 2e-05, + "loss": 0.02774266, + "step": 14800 + }, + { + "epoch": 29.602, + "grad_norm": 1.1324889659881592, + "learning_rate": 2e-05, + "loss": 0.03797245, + "step": 14801 + }, + { + "epoch": 29.604, + "grad_norm": 1.1381210088729858, + "learning_rate": 2e-05, + "loss": 0.043383, + "step": 14802 + }, + { + "epoch": 29.606, + "grad_norm": 1.3297442197799683, + "learning_rate": 2e-05, + "loss": 0.04047207, + "step": 14803 + }, + { + "epoch": 29.608, + "grad_norm": 1.389540433883667, + "learning_rate": 2e-05, + "loss": 0.04826028, + "step": 14804 + }, + { + "epoch": 29.61, + "grad_norm": 2.0742807388305664, + "learning_rate": 2e-05, + "loss": 0.0447638, + "step": 14805 + }, + { + "epoch": 29.612, + "grad_norm": 3.014904260635376, + "learning_rate": 2e-05, + "loss": 0.04129062, + "step": 14806 + }, + { + "epoch": 29.614, + "grad_norm": 1.2650154829025269, + "learning_rate": 2e-05, + "loss": 0.03646675, + "step": 14807 + }, + { + "epoch": 29.616, + "grad_norm": 1.6480337381362915, + "learning_rate": 2e-05, + "loss": 0.03977471, + "step": 14808 + }, + { + "epoch": 29.618, + "grad_norm": 1.184668779373169, + "learning_rate": 2e-05, + "loss": 0.05217928, + "step": 14809 + }, + { + "epoch": 29.62, + "grad_norm": 1.2116751670837402, + "learning_rate": 2e-05, + "loss": 0.03607161, + "step": 14810 + }, + { + "epoch": 29.622, + "grad_norm": 1.508227825164795, + "learning_rate": 2e-05, + "loss": 0.04605871, + "step": 14811 + }, + { + "epoch": 29.624, + "grad_norm": 1.2774351835250854, + "learning_rate": 2e-05, + "loss": 0.04796334, + "step": 14812 + }, + { + "epoch": 29.626, + "grad_norm": 1.0448981523513794, + "learning_rate": 2e-05, + "loss": 0.03082025, + "step": 14813 + }, + { + "epoch": 29.628, + "grad_norm": 1.2237423658370972, + "learning_rate": 2e-05, + "loss": 0.04101267, + "step": 14814 + }, + { + "epoch": 29.63, + "grad_norm": 1.5655183792114258, + "learning_rate": 2e-05, + "loss": 0.04582419, + "step": 14815 + }, + { + "epoch": 29.632, + "grad_norm": 2.1846954822540283, + "learning_rate": 2e-05, + "loss": 0.05257653, + "step": 14816 + }, + { + "epoch": 29.634, + "grad_norm": 1.238049030303955, + "learning_rate": 2e-05, + "loss": 0.03405455, + "step": 14817 + }, + { + "epoch": 29.636, + "grad_norm": 1.6897296905517578, + "learning_rate": 2e-05, + "loss": 0.01916685, + "step": 14818 + }, + { + "epoch": 29.638, + "grad_norm": 0.9575730562210083, + "learning_rate": 2e-05, + "loss": 0.03765364, + "step": 14819 + }, + { + "epoch": 29.64, + "grad_norm": 
2.1810710430145264, + "learning_rate": 2e-05, + "loss": 0.05722524, + "step": 14820 + }, + { + "epoch": 29.642, + "grad_norm": 0.9834027886390686, + "learning_rate": 2e-05, + "loss": 0.04266867, + "step": 14821 + }, + { + "epoch": 29.644, + "grad_norm": 1.2944148778915405, + "learning_rate": 2e-05, + "loss": 0.05544176, + "step": 14822 + }, + { + "epoch": 29.646, + "grad_norm": 1.9650918245315552, + "learning_rate": 2e-05, + "loss": 0.02375646, + "step": 14823 + }, + { + "epoch": 29.648, + "grad_norm": 0.9951708912849426, + "learning_rate": 2e-05, + "loss": 0.02884598, + "step": 14824 + }, + { + "epoch": 29.65, + "grad_norm": 1.0523395538330078, + "learning_rate": 2e-05, + "loss": 0.03761973, + "step": 14825 + }, + { + "epoch": 29.652, + "grad_norm": 1.0838866233825684, + "learning_rate": 2e-05, + "loss": 0.03728612, + "step": 14826 + }, + { + "epoch": 29.654, + "grad_norm": 1.0605547428131104, + "learning_rate": 2e-05, + "loss": 0.03837313, + "step": 14827 + }, + { + "epoch": 29.656, + "grad_norm": 1.1938754320144653, + "learning_rate": 2e-05, + "loss": 0.05729951, + "step": 14828 + }, + { + "epoch": 29.658, + "grad_norm": 2.144594430923462, + "learning_rate": 2e-05, + "loss": 0.04445962, + "step": 14829 + }, + { + "epoch": 29.66, + "grad_norm": 1.8156507015228271, + "learning_rate": 2e-05, + "loss": 0.03740517, + "step": 14830 + }, + { + "epoch": 29.662, + "grad_norm": 1.244408130645752, + "learning_rate": 2e-05, + "loss": 0.04781131, + "step": 14831 + }, + { + "epoch": 29.664, + "grad_norm": 1.2653872966766357, + "learning_rate": 2e-05, + "loss": 0.0585079, + "step": 14832 + }, + { + "epoch": 29.666, + "grad_norm": 1.3794729709625244, + "learning_rate": 2e-05, + "loss": 0.03325066, + "step": 14833 + }, + { + "epoch": 29.668, + "grad_norm": 1.5158426761627197, + "learning_rate": 2e-05, + "loss": 0.04454071, + "step": 14834 + }, + { + "epoch": 29.67, + "grad_norm": 0.9856611490249634, + "learning_rate": 2e-05, + "loss": 0.03513188, + "step": 14835 + }, + { + "epoch": 29.672, + "grad_norm": 1.0112903118133545, + "learning_rate": 2e-05, + "loss": 0.03400346, + "step": 14836 + }, + { + "epoch": 29.674, + "grad_norm": 2.110278606414795, + "learning_rate": 2e-05, + "loss": 0.06449761, + "step": 14837 + }, + { + "epoch": 29.676, + "grad_norm": 1.4624208211898804, + "learning_rate": 2e-05, + "loss": 0.05108786, + "step": 14838 + }, + { + "epoch": 29.678, + "grad_norm": 1.6022717952728271, + "learning_rate": 2e-05, + "loss": 0.04408568, + "step": 14839 + }, + { + "epoch": 29.68, + "grad_norm": 1.1528983116149902, + "learning_rate": 2e-05, + "loss": 0.04578086, + "step": 14840 + }, + { + "epoch": 29.682, + "grad_norm": 1.0715017318725586, + "learning_rate": 2e-05, + "loss": 0.04369386, + "step": 14841 + }, + { + "epoch": 29.684, + "grad_norm": 1.3998444080352783, + "learning_rate": 2e-05, + "loss": 0.04720485, + "step": 14842 + }, + { + "epoch": 29.686, + "grad_norm": 1.2437803745269775, + "learning_rate": 2e-05, + "loss": 0.04152617, + "step": 14843 + }, + { + "epoch": 29.688, + "grad_norm": 1.2567319869995117, + "learning_rate": 2e-05, + "loss": 0.03946881, + "step": 14844 + }, + { + "epoch": 29.69, + "grad_norm": 1.7165799140930176, + "learning_rate": 2e-05, + "loss": 0.03655791, + "step": 14845 + }, + { + "epoch": 29.692, + "grad_norm": 0.9617201685905457, + "learning_rate": 2e-05, + "loss": 0.03272104, + "step": 14846 + }, + { + "epoch": 29.694, + "grad_norm": 1.2149977684020996, + "learning_rate": 2e-05, + "loss": 0.04295772, + "step": 14847 + }, + { + "epoch": 29.696, + "grad_norm": 
1.5055172443389893, + "learning_rate": 2e-05, + "loss": 0.04826812, + "step": 14848 + }, + { + "epoch": 29.698, + "grad_norm": 1.1299159526824951, + "learning_rate": 2e-05, + "loss": 0.04282779, + "step": 14849 + }, + { + "epoch": 29.7, + "grad_norm": 1.5118989944458008, + "learning_rate": 2e-05, + "loss": 0.04391037, + "step": 14850 + }, + { + "epoch": 29.701999999999998, + "grad_norm": 2.338747501373291, + "learning_rate": 2e-05, + "loss": 0.05273699, + "step": 14851 + }, + { + "epoch": 29.704, + "grad_norm": 1.2019315958023071, + "learning_rate": 2e-05, + "loss": 0.04617017, + "step": 14852 + }, + { + "epoch": 29.706, + "grad_norm": 1.618383765220642, + "learning_rate": 2e-05, + "loss": 0.06011579, + "step": 14853 + }, + { + "epoch": 29.708, + "grad_norm": 1.8187501430511475, + "learning_rate": 2e-05, + "loss": 0.03678617, + "step": 14854 + }, + { + "epoch": 29.71, + "grad_norm": 1.143147587776184, + "learning_rate": 2e-05, + "loss": 0.04157409, + "step": 14855 + }, + { + "epoch": 29.712, + "grad_norm": 1.034891128540039, + "learning_rate": 2e-05, + "loss": 0.04029275, + "step": 14856 + }, + { + "epoch": 29.714, + "grad_norm": 1.3997042179107666, + "learning_rate": 2e-05, + "loss": 0.0383551, + "step": 14857 + }, + { + "epoch": 29.716, + "grad_norm": 1.0622479915618896, + "learning_rate": 2e-05, + "loss": 0.04367114, + "step": 14858 + }, + { + "epoch": 29.718, + "grad_norm": 1.824373722076416, + "learning_rate": 2e-05, + "loss": 0.04785391, + "step": 14859 + }, + { + "epoch": 29.72, + "grad_norm": 0.980838418006897, + "learning_rate": 2e-05, + "loss": 0.03948919, + "step": 14860 + }, + { + "epoch": 29.722, + "grad_norm": 1.3569544553756714, + "learning_rate": 2e-05, + "loss": 0.05038366, + "step": 14861 + }, + { + "epoch": 29.724, + "grad_norm": 1.304292917251587, + "learning_rate": 2e-05, + "loss": 0.03696183, + "step": 14862 + }, + { + "epoch": 29.726, + "grad_norm": 3.2446513175964355, + "learning_rate": 2e-05, + "loss": 0.05859087, + "step": 14863 + }, + { + "epoch": 29.728, + "grad_norm": 1.621229648590088, + "learning_rate": 2e-05, + "loss": 0.02783653, + "step": 14864 + }, + { + "epoch": 29.73, + "grad_norm": 1.2278355360031128, + "learning_rate": 2e-05, + "loss": 0.05444655, + "step": 14865 + }, + { + "epoch": 29.732, + "grad_norm": 1.2174080610275269, + "learning_rate": 2e-05, + "loss": 0.05196868, + "step": 14866 + }, + { + "epoch": 29.734, + "grad_norm": 1.6062108278274536, + "learning_rate": 2e-05, + "loss": 0.04332883, + "step": 14867 + }, + { + "epoch": 29.736, + "grad_norm": 1.161248803138733, + "learning_rate": 2e-05, + "loss": 0.03697128, + "step": 14868 + }, + { + "epoch": 29.738, + "grad_norm": 1.2859642505645752, + "learning_rate": 2e-05, + "loss": 0.05505038, + "step": 14869 + }, + { + "epoch": 29.74, + "grad_norm": 1.4202659130096436, + "learning_rate": 2e-05, + "loss": 0.0477881, + "step": 14870 + }, + { + "epoch": 29.742, + "grad_norm": 1.2499854564666748, + "learning_rate": 2e-05, + "loss": 0.05374869, + "step": 14871 + }, + { + "epoch": 29.744, + "grad_norm": 0.955565333366394, + "learning_rate": 2e-05, + "loss": 0.02939542, + "step": 14872 + }, + { + "epoch": 29.746, + "grad_norm": 1.3554538488388062, + "learning_rate": 2e-05, + "loss": 0.04614054, + "step": 14873 + }, + { + "epoch": 29.748, + "grad_norm": 1.089884638786316, + "learning_rate": 2e-05, + "loss": 0.03633293, + "step": 14874 + }, + { + "epoch": 29.75, + "grad_norm": 1.1395641565322876, + "learning_rate": 2e-05, + "loss": 0.04034775, + "step": 14875 + }, + { + "epoch": 29.752, + "grad_norm": 
1.3981385231018066, + "learning_rate": 2e-05, + "loss": 0.04628996, + "step": 14876 + }, + { + "epoch": 29.754, + "grad_norm": 1.7084894180297852, + "learning_rate": 2e-05, + "loss": 0.04334879, + "step": 14877 + }, + { + "epoch": 29.756, + "grad_norm": 2.442875385284424, + "learning_rate": 2e-05, + "loss": 0.05451498, + "step": 14878 + }, + { + "epoch": 29.758, + "grad_norm": 1.4088022708892822, + "learning_rate": 2e-05, + "loss": 0.05375902, + "step": 14879 + }, + { + "epoch": 29.76, + "grad_norm": 1.7990258932113647, + "learning_rate": 2e-05, + "loss": 0.04616355, + "step": 14880 + }, + { + "epoch": 29.762, + "grad_norm": 2.0218417644500732, + "learning_rate": 2e-05, + "loss": 0.03352433, + "step": 14881 + }, + { + "epoch": 29.764, + "grad_norm": 1.8284379243850708, + "learning_rate": 2e-05, + "loss": 0.04442716, + "step": 14882 + }, + { + "epoch": 29.766, + "grad_norm": 1.8842039108276367, + "learning_rate": 2e-05, + "loss": 0.03985734, + "step": 14883 + }, + { + "epoch": 29.768, + "grad_norm": 1.5910454988479614, + "learning_rate": 2e-05, + "loss": 0.0417336, + "step": 14884 + }, + { + "epoch": 29.77, + "grad_norm": 5.6194987297058105, + "learning_rate": 2e-05, + "loss": 0.06214419, + "step": 14885 + }, + { + "epoch": 29.772, + "grad_norm": 1.221018671989441, + "learning_rate": 2e-05, + "loss": 0.05359631, + "step": 14886 + }, + { + "epoch": 29.774, + "grad_norm": 1.4850738048553467, + "learning_rate": 2e-05, + "loss": 0.04186449, + "step": 14887 + }, + { + "epoch": 29.776, + "grad_norm": 2.0336577892303467, + "learning_rate": 2e-05, + "loss": 0.03588335, + "step": 14888 + }, + { + "epoch": 29.778, + "grad_norm": 1.0733425617218018, + "learning_rate": 2e-05, + "loss": 0.04026532, + "step": 14889 + }, + { + "epoch": 29.78, + "grad_norm": 1.098156452178955, + "learning_rate": 2e-05, + "loss": 0.04264045, + "step": 14890 + }, + { + "epoch": 29.782, + "grad_norm": 1.4435909986495972, + "learning_rate": 2e-05, + "loss": 0.03248915, + "step": 14891 + }, + { + "epoch": 29.784, + "grad_norm": 3.9169139862060547, + "learning_rate": 2e-05, + "loss": 0.05105972, + "step": 14892 + }, + { + "epoch": 29.786, + "grad_norm": 1.969159722328186, + "learning_rate": 2e-05, + "loss": 0.05551322, + "step": 14893 + }, + { + "epoch": 29.788, + "grad_norm": 1.2635542154312134, + "learning_rate": 2e-05, + "loss": 0.04648653, + "step": 14894 + }, + { + "epoch": 29.79, + "grad_norm": 2.3477652072906494, + "learning_rate": 2e-05, + "loss": 0.06277344, + "step": 14895 + }, + { + "epoch": 29.792, + "grad_norm": 1.0149587392807007, + "learning_rate": 2e-05, + "loss": 0.03186602, + "step": 14896 + }, + { + "epoch": 29.794, + "grad_norm": 1.1373287439346313, + "learning_rate": 2e-05, + "loss": 0.03521444, + "step": 14897 + }, + { + "epoch": 29.796, + "grad_norm": 1.9701231718063354, + "learning_rate": 2e-05, + "loss": 0.05275909, + "step": 14898 + }, + { + "epoch": 29.798000000000002, + "grad_norm": 1.1814894676208496, + "learning_rate": 2e-05, + "loss": 0.0368986, + "step": 14899 + }, + { + "epoch": 29.8, + "grad_norm": 1.1683019399642944, + "learning_rate": 2e-05, + "loss": 0.04178146, + "step": 14900 + }, + { + "epoch": 29.802, + "grad_norm": 1.2779021263122559, + "learning_rate": 2e-05, + "loss": 0.03463981, + "step": 14901 + }, + { + "epoch": 29.804, + "grad_norm": 1.0893248319625854, + "learning_rate": 2e-05, + "loss": 0.03910547, + "step": 14902 + }, + { + "epoch": 29.806, + "grad_norm": 1.4731847047805786, + "learning_rate": 2e-05, + "loss": 0.04429988, + "step": 14903 + }, + { + "epoch": 29.808, + 
"grad_norm": 1.3464443683624268, + "learning_rate": 2e-05, + "loss": 0.04595333, + "step": 14904 + }, + { + "epoch": 29.81, + "grad_norm": 1.2633438110351562, + "learning_rate": 2e-05, + "loss": 0.03501693, + "step": 14905 + }, + { + "epoch": 29.812, + "grad_norm": 1.1309336423873901, + "learning_rate": 2e-05, + "loss": 0.04051533, + "step": 14906 + }, + { + "epoch": 29.814, + "grad_norm": 1.007876992225647, + "learning_rate": 2e-05, + "loss": 0.03409252, + "step": 14907 + }, + { + "epoch": 29.816, + "grad_norm": 1.6379777193069458, + "learning_rate": 2e-05, + "loss": 0.05974958, + "step": 14908 + }, + { + "epoch": 29.818, + "grad_norm": 1.3505051136016846, + "learning_rate": 2e-05, + "loss": 0.05870251, + "step": 14909 + }, + { + "epoch": 29.82, + "grad_norm": 1.0954749584197998, + "learning_rate": 2e-05, + "loss": 0.03586072, + "step": 14910 + }, + { + "epoch": 29.822, + "grad_norm": 0.8500576615333557, + "learning_rate": 2e-05, + "loss": 0.02894812, + "step": 14911 + }, + { + "epoch": 29.824, + "grad_norm": 0.9059650897979736, + "learning_rate": 2e-05, + "loss": 0.03347109, + "step": 14912 + }, + { + "epoch": 29.826, + "grad_norm": 2.9803061485290527, + "learning_rate": 2e-05, + "loss": 0.05787177, + "step": 14913 + }, + { + "epoch": 29.828, + "grad_norm": 1.2712470293045044, + "learning_rate": 2e-05, + "loss": 0.03648849, + "step": 14914 + }, + { + "epoch": 29.83, + "grad_norm": 1.3802344799041748, + "learning_rate": 2e-05, + "loss": 0.04678344, + "step": 14915 + }, + { + "epoch": 29.832, + "grad_norm": 0.9832471609115601, + "learning_rate": 2e-05, + "loss": 0.03517216, + "step": 14916 + }, + { + "epoch": 29.834, + "grad_norm": 1.1615490913391113, + "learning_rate": 2e-05, + "loss": 0.04364904, + "step": 14917 + }, + { + "epoch": 29.836, + "grad_norm": 1.3471040725708008, + "learning_rate": 2e-05, + "loss": 0.0301399, + "step": 14918 + }, + { + "epoch": 29.838, + "grad_norm": 1.2438772916793823, + "learning_rate": 2e-05, + "loss": 0.0445204, + "step": 14919 + }, + { + "epoch": 29.84, + "grad_norm": 1.1942741870880127, + "learning_rate": 2e-05, + "loss": 0.03932862, + "step": 14920 + }, + { + "epoch": 29.842, + "grad_norm": 1.0131380558013916, + "learning_rate": 2e-05, + "loss": 0.04467336, + "step": 14921 + }, + { + "epoch": 29.844, + "grad_norm": 1.1072560548782349, + "learning_rate": 2e-05, + "loss": 0.03384002, + "step": 14922 + }, + { + "epoch": 29.846, + "grad_norm": 1.2888267040252686, + "learning_rate": 2e-05, + "loss": 0.03786197, + "step": 14923 + }, + { + "epoch": 29.848, + "grad_norm": 1.1689302921295166, + "learning_rate": 2e-05, + "loss": 0.05054848, + "step": 14924 + }, + { + "epoch": 29.85, + "grad_norm": 1.3626478910446167, + "learning_rate": 2e-05, + "loss": 0.04308386, + "step": 14925 + }, + { + "epoch": 29.852, + "grad_norm": 1.0472064018249512, + "learning_rate": 2e-05, + "loss": 0.033345, + "step": 14926 + }, + { + "epoch": 29.854, + "grad_norm": 1.0115253925323486, + "learning_rate": 2e-05, + "loss": 0.03919667, + "step": 14927 + }, + { + "epoch": 29.856, + "grad_norm": 1.8129942417144775, + "learning_rate": 2e-05, + "loss": 0.04338409, + "step": 14928 + }, + { + "epoch": 29.858, + "grad_norm": 1.012441873550415, + "learning_rate": 2e-05, + "loss": 0.03030182, + "step": 14929 + }, + { + "epoch": 29.86, + "grad_norm": 1.0548601150512695, + "learning_rate": 2e-05, + "loss": 0.03725137, + "step": 14930 + }, + { + "epoch": 29.862, + "grad_norm": 1.027254581451416, + "learning_rate": 2e-05, + "loss": 0.04611764, + "step": 14931 + }, + { + "epoch": 29.864, + 
"grad_norm": 1.1189601421356201, + "learning_rate": 2e-05, + "loss": 0.03380036, + "step": 14932 + }, + { + "epoch": 29.866, + "grad_norm": 1.33846116065979, + "learning_rate": 2e-05, + "loss": 0.06069037, + "step": 14933 + }, + { + "epoch": 29.868, + "grad_norm": 1.096492052078247, + "learning_rate": 2e-05, + "loss": 0.04014472, + "step": 14934 + }, + { + "epoch": 29.87, + "grad_norm": 1.174947738647461, + "learning_rate": 2e-05, + "loss": 0.02727459, + "step": 14935 + }, + { + "epoch": 29.872, + "grad_norm": 1.087896466255188, + "learning_rate": 2e-05, + "loss": 0.04823601, + "step": 14936 + }, + { + "epoch": 29.874, + "grad_norm": 1.0803396701812744, + "learning_rate": 2e-05, + "loss": 0.04184086, + "step": 14937 + }, + { + "epoch": 29.876, + "grad_norm": 1.0180484056472778, + "learning_rate": 2e-05, + "loss": 0.02937475, + "step": 14938 + }, + { + "epoch": 29.878, + "grad_norm": 1.099775791168213, + "learning_rate": 2e-05, + "loss": 0.04277577, + "step": 14939 + }, + { + "epoch": 29.88, + "grad_norm": 1.3840006589889526, + "learning_rate": 2e-05, + "loss": 0.04482201, + "step": 14940 + }, + { + "epoch": 29.882, + "grad_norm": 1.119278907775879, + "learning_rate": 2e-05, + "loss": 0.04266882, + "step": 14941 + }, + { + "epoch": 29.884, + "grad_norm": 1.1960852146148682, + "learning_rate": 2e-05, + "loss": 0.03658342, + "step": 14942 + }, + { + "epoch": 29.886, + "grad_norm": 1.3054516315460205, + "learning_rate": 2e-05, + "loss": 0.04391373, + "step": 14943 + }, + { + "epoch": 29.888, + "grad_norm": 2.7474465370178223, + "learning_rate": 2e-05, + "loss": 0.04351743, + "step": 14944 + }, + { + "epoch": 29.89, + "grad_norm": 1.2123734951019287, + "learning_rate": 2e-05, + "loss": 0.03769352, + "step": 14945 + }, + { + "epoch": 29.892, + "grad_norm": 1.325861930847168, + "learning_rate": 2e-05, + "loss": 0.05362713, + "step": 14946 + }, + { + "epoch": 29.894, + "grad_norm": 1.4618078470230103, + "learning_rate": 2e-05, + "loss": 0.04493023, + "step": 14947 + }, + { + "epoch": 29.896, + "grad_norm": 1.041840672492981, + "learning_rate": 2e-05, + "loss": 0.03452935, + "step": 14948 + }, + { + "epoch": 29.898, + "grad_norm": 1.356163740158081, + "learning_rate": 2e-05, + "loss": 0.05633589, + "step": 14949 + }, + { + "epoch": 29.9, + "grad_norm": 1.409123420715332, + "learning_rate": 2e-05, + "loss": 0.04535218, + "step": 14950 + }, + { + "epoch": 29.902, + "grad_norm": 1.5181453227996826, + "learning_rate": 2e-05, + "loss": 0.03477838, + "step": 14951 + }, + { + "epoch": 29.904, + "grad_norm": 1.1335440874099731, + "learning_rate": 2e-05, + "loss": 0.05675086, + "step": 14952 + }, + { + "epoch": 29.906, + "grad_norm": 2.706404209136963, + "learning_rate": 2e-05, + "loss": 0.04841974, + "step": 14953 + }, + { + "epoch": 29.908, + "grad_norm": 1.2822026014328003, + "learning_rate": 2e-05, + "loss": 0.0460251, + "step": 14954 + }, + { + "epoch": 29.91, + "grad_norm": 1.2433853149414062, + "learning_rate": 2e-05, + "loss": 0.04511236, + "step": 14955 + }, + { + "epoch": 29.912, + "grad_norm": 1.2542903423309326, + "learning_rate": 2e-05, + "loss": 0.04461327, + "step": 14956 + }, + { + "epoch": 29.914, + "grad_norm": 1.5348360538482666, + "learning_rate": 2e-05, + "loss": 0.06257273, + "step": 14957 + }, + { + "epoch": 29.916, + "grad_norm": 1.2867058515548706, + "learning_rate": 2e-05, + "loss": 0.03613833, + "step": 14958 + }, + { + "epoch": 29.918, + "grad_norm": 1.8032394647598267, + "learning_rate": 2e-05, + "loss": 0.06370901, + "step": 14959 + }, + { + "epoch": 29.92, + "grad_norm": 
2.311445951461792, + "learning_rate": 2e-05, + "loss": 0.03839797, + "step": 14960 + }, + { + "epoch": 29.922, + "grad_norm": 1.3040590286254883, + "learning_rate": 2e-05, + "loss": 0.02956476, + "step": 14961 + }, + { + "epoch": 29.924, + "grad_norm": 1.2339740991592407, + "learning_rate": 2e-05, + "loss": 0.03441235, + "step": 14962 + }, + { + "epoch": 29.926, + "grad_norm": 1.8125553131103516, + "learning_rate": 2e-05, + "loss": 0.04188786, + "step": 14963 + }, + { + "epoch": 29.928, + "grad_norm": 1.2054352760314941, + "learning_rate": 2e-05, + "loss": 0.02116534, + "step": 14964 + }, + { + "epoch": 29.93, + "grad_norm": 1.1890215873718262, + "learning_rate": 2e-05, + "loss": 0.04022015, + "step": 14965 + }, + { + "epoch": 29.932, + "grad_norm": 1.709370493888855, + "learning_rate": 2e-05, + "loss": 0.04972868, + "step": 14966 + }, + { + "epoch": 29.934, + "grad_norm": 1.2925797700881958, + "learning_rate": 2e-05, + "loss": 0.0480578, + "step": 14967 + }, + { + "epoch": 29.936, + "grad_norm": 1.549068808555603, + "learning_rate": 2e-05, + "loss": 0.05139837, + "step": 14968 + }, + { + "epoch": 29.938, + "grad_norm": 1.3146717548370361, + "learning_rate": 2e-05, + "loss": 0.03840823, + "step": 14969 + }, + { + "epoch": 29.94, + "grad_norm": 1.0093461275100708, + "learning_rate": 2e-05, + "loss": 0.03652712, + "step": 14970 + }, + { + "epoch": 29.942, + "grad_norm": 3.679654121398926, + "learning_rate": 2e-05, + "loss": 0.06581252, + "step": 14971 + }, + { + "epoch": 29.944, + "grad_norm": 1.36691415309906, + "learning_rate": 2e-05, + "loss": 0.04719871, + "step": 14972 + }, + { + "epoch": 29.946, + "grad_norm": 1.2385770082473755, + "learning_rate": 2e-05, + "loss": 0.04421876, + "step": 14973 + }, + { + "epoch": 29.948, + "grad_norm": 1.2146538496017456, + "learning_rate": 2e-05, + "loss": 0.03504267, + "step": 14974 + }, + { + "epoch": 29.95, + "grad_norm": 1.7719730138778687, + "learning_rate": 2e-05, + "loss": 0.04804779, + "step": 14975 + }, + { + "epoch": 29.951999999999998, + "grad_norm": 1.596618890762329, + "learning_rate": 2e-05, + "loss": 0.06619457, + "step": 14976 + }, + { + "epoch": 29.954, + "grad_norm": 0.9984351396560669, + "learning_rate": 2e-05, + "loss": 0.02848834, + "step": 14977 + }, + { + "epoch": 29.956, + "grad_norm": 1.1433227062225342, + "learning_rate": 2e-05, + "loss": 0.0418468, + "step": 14978 + }, + { + "epoch": 29.958, + "grad_norm": 1.499070644378662, + "learning_rate": 2e-05, + "loss": 0.05216137, + "step": 14979 + }, + { + "epoch": 29.96, + "grad_norm": 1.0211125612258911, + "learning_rate": 2e-05, + "loss": 0.03615819, + "step": 14980 + }, + { + "epoch": 29.962, + "grad_norm": 3.567038059234619, + "learning_rate": 2e-05, + "loss": 0.05577373, + "step": 14981 + }, + { + "epoch": 29.964, + "grad_norm": 1.994671106338501, + "learning_rate": 2e-05, + "loss": 0.04374481, + "step": 14982 + }, + { + "epoch": 29.966, + "grad_norm": 1.1184746026992798, + "learning_rate": 2e-05, + "loss": 0.04952614, + "step": 14983 + }, + { + "epoch": 29.968, + "grad_norm": 1.5312135219573975, + "learning_rate": 2e-05, + "loss": 0.04638387, + "step": 14984 + }, + { + "epoch": 29.97, + "grad_norm": 1.1531622409820557, + "learning_rate": 2e-05, + "loss": 0.04479223, + "step": 14985 + }, + { + "epoch": 29.972, + "grad_norm": 1.111600637435913, + "learning_rate": 2e-05, + "loss": 0.03504475, + "step": 14986 + }, + { + "epoch": 29.974, + "grad_norm": 1.4171292781829834, + "learning_rate": 2e-05, + "loss": 0.03749974, + "step": 14987 + }, + { + "epoch": 29.976, + "grad_norm": 
0.8947274684906006, + "learning_rate": 2e-05, + "loss": 0.02938852, + "step": 14988 + }, + { + "epoch": 29.978, + "grad_norm": 1.0345733165740967, + "learning_rate": 2e-05, + "loss": 0.04883969, + "step": 14989 + }, + { + "epoch": 29.98, + "grad_norm": 1.0411769151687622, + "learning_rate": 2e-05, + "loss": 0.04379828, + "step": 14990 + }, + { + "epoch": 29.982, + "grad_norm": 1.4568036794662476, + "learning_rate": 2e-05, + "loss": 0.04191837, + "step": 14991 + }, + { + "epoch": 29.984, + "grad_norm": 1.2673814296722412, + "learning_rate": 2e-05, + "loss": 0.04016875, + "step": 14992 + }, + { + "epoch": 29.986, + "grad_norm": 0.9755784869194031, + "learning_rate": 2e-05, + "loss": 0.03788521, + "step": 14993 + }, + { + "epoch": 29.988, + "grad_norm": 1.2146108150482178, + "learning_rate": 2e-05, + "loss": 0.0330191, + "step": 14994 + }, + { + "epoch": 29.99, + "grad_norm": 1.6079703569412231, + "learning_rate": 2e-05, + "loss": 0.05264592, + "step": 14995 + }, + { + "epoch": 29.992, + "grad_norm": 1.3531326055526733, + "learning_rate": 2e-05, + "loss": 0.05724774, + "step": 14996 + }, + { + "epoch": 29.994, + "grad_norm": 1.4556341171264648, + "learning_rate": 2e-05, + "loss": 0.06128872, + "step": 14997 + }, + { + "epoch": 29.996, + "grad_norm": 1.7786065340042114, + "learning_rate": 2e-05, + "loss": 0.06668452, + "step": 14998 + }, + { + "epoch": 29.998, + "grad_norm": 1.122294306755066, + "learning_rate": 2e-05, + "loss": 0.04705849, + "step": 14999 + }, + { + "epoch": 30.0, + "grad_norm": 1.9331656694412231, + "learning_rate": 2e-05, + "loss": 0.0316266, + "step": 15000 + }, + { + "epoch": 30.0, + "eval_performance": { + "AngleClassification_1": 0.998, + "AngleClassification_2": 1.0, + "AngleClassification_3": 0.9820359281437125, + "Equal_1": 0.996, + "Equal_2": 0.9820359281437125, + "Equal_3": 0.9780439121756487, + "LineComparison_1": 1.0, + "LineComparison_2": 1.0, + "LineComparison_3": 0.9960079840319361, + "Parallel_1": 0.9919839679358717, + "Parallel_2": 0.9939879759519038, + "Parallel_3": 0.994, + "Perpendicular_1": 1.0, + "Perpendicular_2": 0.988, + "Perpendicular_3": 0.8937875751503006, + "PointLiesOnCircle_1": 1.0, + "PointLiesOnCircle_2": 0.998, + "PointLiesOnCircle_3": 0.9916666666666667, + "PointLiesOnLine_1": 0.9939879759519038, + "PointLiesOnLine_2": 0.9939879759519038, + "PointLiesOnLine_3": 0.9900199600798403 + }, + "eval_runtime": 320.3735, + "eval_samples_per_second": 32.774, + "eval_steps_per_second": 0.655, + "step": 15000 + }, + { + "epoch": 30.002, + "grad_norm": 1.4232646226882935, + "learning_rate": 2e-05, + "loss": 0.04287136, + "step": 15001 + }, + { + "epoch": 30.004, + "grad_norm": 1.111973524093628, + "learning_rate": 2e-05, + "loss": 0.04549501, + "step": 15002 + }, + { + "epoch": 30.006, + "grad_norm": 1.4063668251037598, + "learning_rate": 2e-05, + "loss": 0.04815795, + "step": 15003 + }, + { + "epoch": 30.008, + "grad_norm": 1.0374184846878052, + "learning_rate": 2e-05, + "loss": 0.03240955, + "step": 15004 + }, + { + "epoch": 30.01, + "grad_norm": 2.511753797531128, + "learning_rate": 2e-05, + "loss": 0.05327015, + "step": 15005 + }, + { + "epoch": 30.012, + "grad_norm": 1.5259581804275513, + "learning_rate": 2e-05, + "loss": 0.04226237, + "step": 15006 + }, + { + "epoch": 30.014, + "grad_norm": 2.2420709133148193, + "learning_rate": 2e-05, + "loss": 0.05237993, + "step": 15007 + }, + { + "epoch": 30.016, + "grad_norm": 1.380776047706604, + "learning_rate": 2e-05, + "loss": 0.04655606, + "step": 15008 + }, + { + "epoch": 30.018, + "grad_norm": 
1.1870553493499756, + "learning_rate": 2e-05, + "loss": 0.03608441, + "step": 15009 + }, + { + "epoch": 30.02, + "grad_norm": 1.869041085243225, + "learning_rate": 2e-05, + "loss": 0.05452606, + "step": 15010 + }, + { + "epoch": 30.022, + "grad_norm": 1.5327285528182983, + "learning_rate": 2e-05, + "loss": 0.07322414, + "step": 15011 + }, + { + "epoch": 30.024, + "grad_norm": 1.1738957166671753, + "learning_rate": 2e-05, + "loss": 0.04997585, + "step": 15012 + }, + { + "epoch": 30.026, + "grad_norm": 1.7185429334640503, + "learning_rate": 2e-05, + "loss": 0.0466965, + "step": 15013 + }, + { + "epoch": 30.028, + "grad_norm": 1.3917158842086792, + "learning_rate": 2e-05, + "loss": 0.04702974, + "step": 15014 + }, + { + "epoch": 30.03, + "grad_norm": 1.652574896812439, + "learning_rate": 2e-05, + "loss": 0.03737196, + "step": 15015 + }, + { + "epoch": 30.032, + "grad_norm": 1.2738018035888672, + "learning_rate": 2e-05, + "loss": 0.04564106, + "step": 15016 + }, + { + "epoch": 30.034, + "grad_norm": 1.3657252788543701, + "learning_rate": 2e-05, + "loss": 0.0442913, + "step": 15017 + }, + { + "epoch": 30.036, + "grad_norm": 1.1747430562973022, + "learning_rate": 2e-05, + "loss": 0.02756719, + "step": 15018 + }, + { + "epoch": 30.038, + "grad_norm": 1.0821881294250488, + "learning_rate": 2e-05, + "loss": 0.0346719, + "step": 15019 + }, + { + "epoch": 30.04, + "grad_norm": 1.230070948600769, + "learning_rate": 2e-05, + "loss": 0.03856944, + "step": 15020 + }, + { + "epoch": 30.042, + "grad_norm": 1.540521502494812, + "learning_rate": 2e-05, + "loss": 0.04057005, + "step": 15021 + }, + { + "epoch": 30.044, + "grad_norm": 2.8572275638580322, + "learning_rate": 2e-05, + "loss": 0.04592321, + "step": 15022 + }, + { + "epoch": 30.046, + "grad_norm": 0.957338809967041, + "learning_rate": 2e-05, + "loss": 0.03693869, + "step": 15023 + }, + { + "epoch": 30.048, + "grad_norm": 1.0699844360351562, + "learning_rate": 2e-05, + "loss": 0.03753718, + "step": 15024 + }, + { + "epoch": 30.05, + "grad_norm": 2.457496404647827, + "learning_rate": 2e-05, + "loss": 0.04840863, + "step": 15025 + }, + { + "epoch": 30.052, + "grad_norm": 1.2310270071029663, + "learning_rate": 2e-05, + "loss": 0.04352465, + "step": 15026 + }, + { + "epoch": 30.054, + "grad_norm": 1.2693308591842651, + "learning_rate": 2e-05, + "loss": 0.04948821, + "step": 15027 + }, + { + "epoch": 30.056, + "grad_norm": 1.4048819541931152, + "learning_rate": 2e-05, + "loss": 0.0399585, + "step": 15028 + }, + { + "epoch": 30.058, + "grad_norm": 2.0493316650390625, + "learning_rate": 2e-05, + "loss": 0.05211222, + "step": 15029 + }, + { + "epoch": 30.06, + "grad_norm": 3.7002038955688477, + "learning_rate": 2e-05, + "loss": 0.05875326, + "step": 15030 + }, + { + "epoch": 30.062, + "grad_norm": 1.7289574146270752, + "learning_rate": 2e-05, + "loss": 0.04627757, + "step": 15031 + }, + { + "epoch": 30.064, + "grad_norm": 1.0221132040023804, + "learning_rate": 2e-05, + "loss": 0.03416388, + "step": 15032 + }, + { + "epoch": 30.066, + "grad_norm": 1.4639520645141602, + "learning_rate": 2e-05, + "loss": 0.04067467, + "step": 15033 + }, + { + "epoch": 30.068, + "grad_norm": 3.5001463890075684, + "learning_rate": 2e-05, + "loss": 0.04698951, + "step": 15034 + }, + { + "epoch": 30.07, + "grad_norm": 2.3693861961364746, + "learning_rate": 2e-05, + "loss": 0.04970219, + "step": 15035 + }, + { + "epoch": 30.072, + "grad_norm": 1.3238275051116943, + "learning_rate": 2e-05, + "loss": 0.05959756, + "step": 15036 + }, + { + "epoch": 30.074, + "grad_norm": 
1.5840702056884766, + "learning_rate": 2e-05, + "loss": 0.04837484, + "step": 15037 + }, + { + "epoch": 30.076, + "grad_norm": 0.9318821430206299, + "learning_rate": 2e-05, + "loss": 0.03147685, + "step": 15038 + }, + { + "epoch": 30.078, + "grad_norm": 1.0989964008331299, + "learning_rate": 2e-05, + "loss": 0.03799355, + "step": 15039 + }, + { + "epoch": 30.08, + "grad_norm": 1.4380488395690918, + "learning_rate": 2e-05, + "loss": 0.04152, + "step": 15040 + }, + { + "epoch": 30.082, + "grad_norm": 1.2408324480056763, + "learning_rate": 2e-05, + "loss": 0.04909746, + "step": 15041 + }, + { + "epoch": 30.084, + "grad_norm": 1.1136013269424438, + "learning_rate": 2e-05, + "loss": 0.03348158, + "step": 15042 + }, + { + "epoch": 30.086, + "grad_norm": 1.2548508644104004, + "learning_rate": 2e-05, + "loss": 0.0451351, + "step": 15043 + }, + { + "epoch": 30.088, + "grad_norm": 1.0726171731948853, + "learning_rate": 2e-05, + "loss": 0.04324069, + "step": 15044 + }, + { + "epoch": 30.09, + "grad_norm": 1.0724010467529297, + "learning_rate": 2e-05, + "loss": 0.0484529, + "step": 15045 + }, + { + "epoch": 30.092, + "grad_norm": 1.228132963180542, + "learning_rate": 2e-05, + "loss": 0.03065848, + "step": 15046 + }, + { + "epoch": 30.094, + "grad_norm": 1.099954605102539, + "learning_rate": 2e-05, + "loss": 0.04499891, + "step": 15047 + }, + { + "epoch": 30.096, + "grad_norm": 1.0415964126586914, + "learning_rate": 2e-05, + "loss": 0.03373061, + "step": 15048 + }, + { + "epoch": 30.098, + "grad_norm": 1.237966775894165, + "learning_rate": 2e-05, + "loss": 0.04612883, + "step": 15049 + }, + { + "epoch": 30.1, + "grad_norm": 0.8815692067146301, + "learning_rate": 2e-05, + "loss": 0.02756138, + "step": 15050 + }, + { + "epoch": 30.102, + "grad_norm": 1.3700236082077026, + "learning_rate": 2e-05, + "loss": 0.06041703, + "step": 15051 + }, + { + "epoch": 30.104, + "grad_norm": 1.2398154735565186, + "learning_rate": 2e-05, + "loss": 0.03576667, + "step": 15052 + }, + { + "epoch": 30.106, + "grad_norm": 1.1791821718215942, + "learning_rate": 2e-05, + "loss": 0.05038423, + "step": 15053 + }, + { + "epoch": 30.108, + "grad_norm": 2.0784451961517334, + "learning_rate": 2e-05, + "loss": 0.05495892, + "step": 15054 + }, + { + "epoch": 30.11, + "grad_norm": 2.1504173278808594, + "learning_rate": 2e-05, + "loss": 0.03890841, + "step": 15055 + }, + { + "epoch": 30.112, + "grad_norm": 3.112828254699707, + "learning_rate": 2e-05, + "loss": 0.05653836, + "step": 15056 + }, + { + "epoch": 30.114, + "grad_norm": 1.1712428331375122, + "learning_rate": 2e-05, + "loss": 0.05188262, + "step": 15057 + }, + { + "epoch": 30.116, + "grad_norm": 1.5940415859222412, + "learning_rate": 2e-05, + "loss": 0.05263701, + "step": 15058 + }, + { + "epoch": 30.118, + "grad_norm": 1.2304260730743408, + "learning_rate": 2e-05, + "loss": 0.04616508, + "step": 15059 + }, + { + "epoch": 30.12, + "grad_norm": 3.7707254886627197, + "learning_rate": 2e-05, + "loss": 0.05711006, + "step": 15060 + }, + { + "epoch": 30.122, + "grad_norm": 1.8504637479782104, + "learning_rate": 2e-05, + "loss": 0.04005916, + "step": 15061 + }, + { + "epoch": 30.124, + "grad_norm": 1.3818079233169556, + "learning_rate": 2e-05, + "loss": 0.0316132, + "step": 15062 + }, + { + "epoch": 30.126, + "grad_norm": 1.582114577293396, + "learning_rate": 2e-05, + "loss": 0.05220442, + "step": 15063 + }, + { + "epoch": 30.128, + "grad_norm": 1.2166645526885986, + "learning_rate": 2e-05, + "loss": 0.03328218, + "step": 15064 + }, + { + "epoch": 30.13, + "grad_norm": 
1.2545546293258667, + "learning_rate": 2e-05, + "loss": 0.03880144, + "step": 15065 + }, + { + "epoch": 30.132, + "grad_norm": 1.1685189008712769, + "learning_rate": 2e-05, + "loss": 0.0422597, + "step": 15066 + }, + { + "epoch": 30.134, + "grad_norm": 1.2280715703964233, + "learning_rate": 2e-05, + "loss": 0.03680715, + "step": 15067 + }, + { + "epoch": 30.136, + "grad_norm": 1.2604179382324219, + "learning_rate": 2e-05, + "loss": 0.0577843, + "step": 15068 + }, + { + "epoch": 30.138, + "grad_norm": 1.333001971244812, + "learning_rate": 2e-05, + "loss": 0.03825287, + "step": 15069 + }, + { + "epoch": 30.14, + "grad_norm": 1.515370488166809, + "learning_rate": 2e-05, + "loss": 0.03778506, + "step": 15070 + }, + { + "epoch": 30.142, + "grad_norm": 1.265008568763733, + "learning_rate": 2e-05, + "loss": 0.03106946, + "step": 15071 + }, + { + "epoch": 30.144, + "grad_norm": 2.2077817916870117, + "learning_rate": 2e-05, + "loss": 0.03389125, + "step": 15072 + }, + { + "epoch": 30.146, + "grad_norm": 1.4839131832122803, + "learning_rate": 2e-05, + "loss": 0.0673124, + "step": 15073 + }, + { + "epoch": 30.148, + "grad_norm": 1.1836014986038208, + "learning_rate": 2e-05, + "loss": 0.0335351, + "step": 15074 + }, + { + "epoch": 30.15, + "grad_norm": 1.3027890920639038, + "learning_rate": 2e-05, + "loss": 0.04611604, + "step": 15075 + }, + { + "epoch": 30.152, + "grad_norm": 1.2303056716918945, + "learning_rate": 2e-05, + "loss": 0.04055518, + "step": 15076 + }, + { + "epoch": 30.154, + "grad_norm": 1.6350833177566528, + "learning_rate": 2e-05, + "loss": 0.0367165, + "step": 15077 + }, + { + "epoch": 30.156, + "grad_norm": 0.9786274433135986, + "learning_rate": 2e-05, + "loss": 0.03549477, + "step": 15078 + }, + { + "epoch": 30.158, + "grad_norm": 1.5880675315856934, + "learning_rate": 2e-05, + "loss": 0.04486579, + "step": 15079 + }, + { + "epoch": 30.16, + "grad_norm": 1.2543953657150269, + "learning_rate": 2e-05, + "loss": 0.04821232, + "step": 15080 + }, + { + "epoch": 30.162, + "grad_norm": 2.140390396118164, + "learning_rate": 2e-05, + "loss": 0.04136558, + "step": 15081 + }, + { + "epoch": 30.164, + "grad_norm": 1.0265830755233765, + "learning_rate": 2e-05, + "loss": 0.03865357, + "step": 15082 + }, + { + "epoch": 30.166, + "grad_norm": 1.3790358304977417, + "learning_rate": 2e-05, + "loss": 0.06286014, + "step": 15083 + }, + { + "epoch": 30.168, + "grad_norm": 2.187826633453369, + "learning_rate": 2e-05, + "loss": 0.06071652, + "step": 15084 + }, + { + "epoch": 30.17, + "grad_norm": 1.165565848350525, + "learning_rate": 2e-05, + "loss": 0.04597174, + "step": 15085 + }, + { + "epoch": 30.172, + "grad_norm": 1.5375369787216187, + "learning_rate": 2e-05, + "loss": 0.03429234, + "step": 15086 + }, + { + "epoch": 30.174, + "grad_norm": 2.7333414554595947, + "learning_rate": 2e-05, + "loss": 0.05888695, + "step": 15087 + }, + { + "epoch": 30.176, + "grad_norm": 1.0312190055847168, + "learning_rate": 2e-05, + "loss": 0.0352921, + "step": 15088 + }, + { + "epoch": 30.178, + "grad_norm": 4.89245080947876, + "learning_rate": 2e-05, + "loss": 0.0566744, + "step": 15089 + }, + { + "epoch": 30.18, + "grad_norm": 1.7868283987045288, + "learning_rate": 2e-05, + "loss": 0.05283628, + "step": 15090 + }, + { + "epoch": 30.182, + "grad_norm": 2.932136297225952, + "learning_rate": 2e-05, + "loss": 0.04512861, + "step": 15091 + }, + { + "epoch": 30.184, + "grad_norm": 1.8231194019317627, + "learning_rate": 2e-05, + "loss": 0.04028752, + "step": 15092 + }, + { + "epoch": 30.186, + "grad_norm": 1.5239124298095703, 
+ "learning_rate": 2e-05, + "loss": 0.03271829, + "step": 15093 + }, + { + "epoch": 30.188, + "grad_norm": 1.2162010669708252, + "learning_rate": 2e-05, + "loss": 0.03735817, + "step": 15094 + }, + { + "epoch": 30.19, + "grad_norm": 1.240540623664856, + "learning_rate": 2e-05, + "loss": 0.04491368, + "step": 15095 + }, + { + "epoch": 30.192, + "grad_norm": 1.1648021936416626, + "learning_rate": 2e-05, + "loss": 0.04308267, + "step": 15096 + }, + { + "epoch": 30.194, + "grad_norm": 1.0710548162460327, + "learning_rate": 2e-05, + "loss": 0.04492667, + "step": 15097 + }, + { + "epoch": 30.196, + "grad_norm": 1.1542491912841797, + "learning_rate": 2e-05, + "loss": 0.0378947, + "step": 15098 + }, + { + "epoch": 30.198, + "grad_norm": 1.8039984703063965, + "learning_rate": 2e-05, + "loss": 0.05451577, + "step": 15099 + }, + { + "epoch": 30.2, + "grad_norm": 1.4430378675460815, + "learning_rate": 2e-05, + "loss": 0.05195063, + "step": 15100 + }, + { + "epoch": 30.202, + "grad_norm": 1.3164018392562866, + "learning_rate": 2e-05, + "loss": 0.03912523, + "step": 15101 + }, + { + "epoch": 30.204, + "grad_norm": 1.669808030128479, + "learning_rate": 2e-05, + "loss": 0.04372003, + "step": 15102 + }, + { + "epoch": 30.206, + "grad_norm": 1.5180182456970215, + "learning_rate": 2e-05, + "loss": 0.03866254, + "step": 15103 + }, + { + "epoch": 30.208, + "grad_norm": 1.6664462089538574, + "learning_rate": 2e-05, + "loss": 0.0531385, + "step": 15104 + }, + { + "epoch": 30.21, + "grad_norm": 1.4806171655654907, + "learning_rate": 2e-05, + "loss": 0.05282668, + "step": 15105 + }, + { + "epoch": 30.212, + "grad_norm": 1.128859281539917, + "learning_rate": 2e-05, + "loss": 0.03547608, + "step": 15106 + }, + { + "epoch": 30.214, + "grad_norm": 1.117397427558899, + "learning_rate": 2e-05, + "loss": 0.03773435, + "step": 15107 + }, + { + "epoch": 30.216, + "grad_norm": 1.2309256792068481, + "learning_rate": 2e-05, + "loss": 0.04536531, + "step": 15108 + }, + { + "epoch": 30.218, + "grad_norm": 1.3051683902740479, + "learning_rate": 2e-05, + "loss": 0.04438914, + "step": 15109 + }, + { + "epoch": 30.22, + "grad_norm": 1.4824970960617065, + "learning_rate": 2e-05, + "loss": 0.05524845, + "step": 15110 + }, + { + "epoch": 30.222, + "grad_norm": 1.3169000148773193, + "learning_rate": 2e-05, + "loss": 0.04102128, + "step": 15111 + }, + { + "epoch": 30.224, + "grad_norm": 1.2333604097366333, + "learning_rate": 2e-05, + "loss": 0.06657355, + "step": 15112 + }, + { + "epoch": 30.226, + "grad_norm": 0.9591508507728577, + "learning_rate": 2e-05, + "loss": 0.02981432, + "step": 15113 + }, + { + "epoch": 30.228, + "grad_norm": 1.213059425354004, + "learning_rate": 2e-05, + "loss": 0.0342233, + "step": 15114 + }, + { + "epoch": 30.23, + "grad_norm": 1.1020963191986084, + "learning_rate": 2e-05, + "loss": 0.02755513, + "step": 15115 + }, + { + "epoch": 30.232, + "grad_norm": 1.0923129320144653, + "learning_rate": 2e-05, + "loss": 0.04014743, + "step": 15116 + }, + { + "epoch": 30.234, + "grad_norm": 1.1201938390731812, + "learning_rate": 2e-05, + "loss": 0.05072346, + "step": 15117 + }, + { + "epoch": 30.236, + "grad_norm": 3.7093849182128906, + "learning_rate": 2e-05, + "loss": 0.06261256, + "step": 15118 + }, + { + "epoch": 30.238, + "grad_norm": 1.1667890548706055, + "learning_rate": 2e-05, + "loss": 0.04364653, + "step": 15119 + }, + { + "epoch": 30.24, + "grad_norm": 1.1062977313995361, + "learning_rate": 2e-05, + "loss": 0.05012048, + "step": 15120 + }, + { + "epoch": 30.242, + "grad_norm": 1.3190714120864868, + 
"learning_rate": 2e-05, + "loss": 0.05197113, + "step": 15121 + }, + { + "epoch": 30.244, + "grad_norm": 1.458884596824646, + "learning_rate": 2e-05, + "loss": 0.04996847, + "step": 15122 + }, + { + "epoch": 30.246, + "grad_norm": 1.366876244544983, + "learning_rate": 2e-05, + "loss": 0.04006827, + "step": 15123 + }, + { + "epoch": 30.248, + "grad_norm": 1.0993216037750244, + "learning_rate": 2e-05, + "loss": 0.03217662, + "step": 15124 + }, + { + "epoch": 30.25, + "grad_norm": 0.9922396540641785, + "learning_rate": 2e-05, + "loss": 0.04653015, + "step": 15125 + }, + { + "epoch": 30.252, + "grad_norm": 1.0631383657455444, + "learning_rate": 2e-05, + "loss": 0.03681334, + "step": 15126 + }, + { + "epoch": 30.254, + "grad_norm": 1.0309144258499146, + "learning_rate": 2e-05, + "loss": 0.04099189, + "step": 15127 + }, + { + "epoch": 30.256, + "grad_norm": 1.1436161994934082, + "learning_rate": 2e-05, + "loss": 0.04641513, + "step": 15128 + }, + { + "epoch": 30.258, + "grad_norm": 1.134342074394226, + "learning_rate": 2e-05, + "loss": 0.03863002, + "step": 15129 + }, + { + "epoch": 30.26, + "grad_norm": 1.5008431673049927, + "learning_rate": 2e-05, + "loss": 0.04123659, + "step": 15130 + }, + { + "epoch": 30.262, + "grad_norm": 1.044177770614624, + "learning_rate": 2e-05, + "loss": 0.03907245, + "step": 15131 + }, + { + "epoch": 30.264, + "grad_norm": 0.9937227964401245, + "learning_rate": 2e-05, + "loss": 0.03749677, + "step": 15132 + }, + { + "epoch": 30.266, + "grad_norm": 3.122323989868164, + "learning_rate": 2e-05, + "loss": 0.05537373, + "step": 15133 + }, + { + "epoch": 30.268, + "grad_norm": 1.1054846048355103, + "learning_rate": 2e-05, + "loss": 0.04427914, + "step": 15134 + }, + { + "epoch": 30.27, + "grad_norm": 1.256409764289856, + "learning_rate": 2e-05, + "loss": 0.05179529, + "step": 15135 + }, + { + "epoch": 30.272, + "grad_norm": 1.4733872413635254, + "learning_rate": 2e-05, + "loss": 0.04531301, + "step": 15136 + }, + { + "epoch": 30.274, + "grad_norm": 1.2605996131896973, + "learning_rate": 2e-05, + "loss": 0.0307104, + "step": 15137 + }, + { + "epoch": 30.276, + "grad_norm": 1.0388493537902832, + "learning_rate": 2e-05, + "loss": 0.04817986, + "step": 15138 + }, + { + "epoch": 30.278, + "grad_norm": 1.6238524913787842, + "learning_rate": 2e-05, + "loss": 0.05085561, + "step": 15139 + }, + { + "epoch": 30.28, + "grad_norm": 1.096824288368225, + "learning_rate": 2e-05, + "loss": 0.04942642, + "step": 15140 + }, + { + "epoch": 30.282, + "grad_norm": 1.2300045490264893, + "learning_rate": 2e-05, + "loss": 0.0502276, + "step": 15141 + }, + { + "epoch": 30.284, + "grad_norm": 1.2435781955718994, + "learning_rate": 2e-05, + "loss": 0.04534131, + "step": 15142 + }, + { + "epoch": 30.286, + "grad_norm": 1.386216163635254, + "learning_rate": 2e-05, + "loss": 0.04588864, + "step": 15143 + }, + { + "epoch": 30.288, + "grad_norm": 1.2593432664871216, + "learning_rate": 2e-05, + "loss": 0.04650512, + "step": 15144 + }, + { + "epoch": 30.29, + "grad_norm": 1.6124224662780762, + "learning_rate": 2e-05, + "loss": 0.04883378, + "step": 15145 + }, + { + "epoch": 30.292, + "grad_norm": 2.2996997833251953, + "learning_rate": 2e-05, + "loss": 0.04866761, + "step": 15146 + }, + { + "epoch": 30.294, + "grad_norm": 1.7783503532409668, + "learning_rate": 2e-05, + "loss": 0.06645172, + "step": 15147 + }, + { + "epoch": 30.296, + "grad_norm": 1.2972900867462158, + "learning_rate": 2e-05, + "loss": 0.05338973, + "step": 15148 + }, + { + "epoch": 30.298, + "grad_norm": 2.96677827835083, + 
"learning_rate": 2e-05, + "loss": 0.05938792, + "step": 15149 + }, + { + "epoch": 30.3, + "grad_norm": 1.1180599927902222, + "learning_rate": 2e-05, + "loss": 0.03344629, + "step": 15150 + }, + { + "epoch": 30.302, + "grad_norm": 1.7869832515716553, + "learning_rate": 2e-05, + "loss": 0.05831323, + "step": 15151 + }, + { + "epoch": 30.304, + "grad_norm": 1.225154161453247, + "learning_rate": 2e-05, + "loss": 0.05661523, + "step": 15152 + }, + { + "epoch": 30.306, + "grad_norm": 2.486720561981201, + "learning_rate": 2e-05, + "loss": 0.05136902, + "step": 15153 + }, + { + "epoch": 30.308, + "grad_norm": 1.1094615459442139, + "learning_rate": 2e-05, + "loss": 0.04413821, + "step": 15154 + }, + { + "epoch": 30.31, + "grad_norm": 2.087968349456787, + "learning_rate": 2e-05, + "loss": 0.05737863, + "step": 15155 + }, + { + "epoch": 30.312, + "grad_norm": 1.0131031274795532, + "learning_rate": 2e-05, + "loss": 0.04438207, + "step": 15156 + }, + { + "epoch": 30.314, + "grad_norm": 1.278663992881775, + "learning_rate": 2e-05, + "loss": 0.04451298, + "step": 15157 + }, + { + "epoch": 30.316, + "grad_norm": 1.0880504846572876, + "learning_rate": 2e-05, + "loss": 0.04140671, + "step": 15158 + }, + { + "epoch": 30.318, + "grad_norm": 1.6311753988265991, + "learning_rate": 2e-05, + "loss": 0.04358608, + "step": 15159 + }, + { + "epoch": 30.32, + "grad_norm": 1.3990247249603271, + "learning_rate": 2e-05, + "loss": 0.05173497, + "step": 15160 + }, + { + "epoch": 30.322, + "grad_norm": 1.4861396551132202, + "learning_rate": 2e-05, + "loss": 0.03629741, + "step": 15161 + }, + { + "epoch": 30.324, + "grad_norm": 1.2476096153259277, + "learning_rate": 2e-05, + "loss": 0.03799465, + "step": 15162 + }, + { + "epoch": 30.326, + "grad_norm": 1.2157728672027588, + "learning_rate": 2e-05, + "loss": 0.06220795, + "step": 15163 + }, + { + "epoch": 30.328, + "grad_norm": 1.5057649612426758, + "learning_rate": 2e-05, + "loss": 0.04527867, + "step": 15164 + }, + { + "epoch": 30.33, + "grad_norm": 1.7932603359222412, + "learning_rate": 2e-05, + "loss": 0.05007517, + "step": 15165 + }, + { + "epoch": 30.332, + "grad_norm": 1.5305033922195435, + "learning_rate": 2e-05, + "loss": 0.03679098, + "step": 15166 + }, + { + "epoch": 30.334, + "grad_norm": 1.2285985946655273, + "learning_rate": 2e-05, + "loss": 0.0510888, + "step": 15167 + }, + { + "epoch": 30.336, + "grad_norm": 1.0248448848724365, + "learning_rate": 2e-05, + "loss": 0.04463315, + "step": 15168 + }, + { + "epoch": 30.338, + "grad_norm": 1.1598585844039917, + "learning_rate": 2e-05, + "loss": 0.04191558, + "step": 15169 + }, + { + "epoch": 30.34, + "grad_norm": 1.415334939956665, + "learning_rate": 2e-05, + "loss": 0.04389124, + "step": 15170 + }, + { + "epoch": 30.342, + "grad_norm": 1.105470895767212, + "learning_rate": 2e-05, + "loss": 0.04412735, + "step": 15171 + }, + { + "epoch": 30.344, + "grad_norm": 2.4653127193450928, + "learning_rate": 2e-05, + "loss": 0.04922763, + "step": 15172 + }, + { + "epoch": 30.346, + "grad_norm": 0.9219580292701721, + "learning_rate": 2e-05, + "loss": 0.02929104, + "step": 15173 + }, + { + "epoch": 30.348, + "grad_norm": 1.086243987083435, + "learning_rate": 2e-05, + "loss": 0.04635917, + "step": 15174 + }, + { + "epoch": 30.35, + "grad_norm": 0.9130674004554749, + "learning_rate": 2e-05, + "loss": 0.031245, + "step": 15175 + }, + { + "epoch": 30.352, + "grad_norm": 0.9588483572006226, + "learning_rate": 2e-05, + "loss": 0.03446349, + "step": 15176 + }, + { + "epoch": 30.354, + "grad_norm": 1.1762733459472656, + 
"learning_rate": 2e-05, + "loss": 0.04649212, + "step": 15177 + }, + { + "epoch": 30.356, + "grad_norm": 1.3569097518920898, + "learning_rate": 2e-05, + "loss": 0.05233772, + "step": 15178 + }, + { + "epoch": 30.358, + "grad_norm": 1.1183359622955322, + "learning_rate": 2e-05, + "loss": 0.02675498, + "step": 15179 + }, + { + "epoch": 30.36, + "grad_norm": 1.455556869506836, + "learning_rate": 2e-05, + "loss": 0.05393392, + "step": 15180 + }, + { + "epoch": 30.362, + "grad_norm": 1.3345918655395508, + "learning_rate": 2e-05, + "loss": 0.04711598, + "step": 15181 + }, + { + "epoch": 30.364, + "grad_norm": 1.4557232856750488, + "learning_rate": 2e-05, + "loss": 0.05467194, + "step": 15182 + }, + { + "epoch": 30.366, + "grad_norm": 1.2925028800964355, + "learning_rate": 2e-05, + "loss": 0.04946, + "step": 15183 + }, + { + "epoch": 30.368, + "grad_norm": 1.2487906217575073, + "learning_rate": 2e-05, + "loss": 0.04537457, + "step": 15184 + }, + { + "epoch": 30.37, + "grad_norm": 1.4377646446228027, + "learning_rate": 2e-05, + "loss": 0.05969262, + "step": 15185 + }, + { + "epoch": 30.372, + "grad_norm": 1.0396651029586792, + "learning_rate": 2e-05, + "loss": 0.03547468, + "step": 15186 + }, + { + "epoch": 30.374, + "grad_norm": 1.273114562034607, + "learning_rate": 2e-05, + "loss": 0.04310979, + "step": 15187 + }, + { + "epoch": 30.376, + "grad_norm": 0.977163553237915, + "learning_rate": 2e-05, + "loss": 0.03705889, + "step": 15188 + }, + { + "epoch": 30.378, + "grad_norm": 2.641759157180786, + "learning_rate": 2e-05, + "loss": 0.05824747, + "step": 15189 + }, + { + "epoch": 30.38, + "grad_norm": 2.5414299964904785, + "learning_rate": 2e-05, + "loss": 0.04271651, + "step": 15190 + }, + { + "epoch": 30.382, + "grad_norm": 1.6944334506988525, + "learning_rate": 2e-05, + "loss": 0.0411639, + "step": 15191 + }, + { + "epoch": 30.384, + "grad_norm": 1.1185425519943237, + "learning_rate": 2e-05, + "loss": 0.03539154, + "step": 15192 + }, + { + "epoch": 30.386, + "grad_norm": 2.133718252182007, + "learning_rate": 2e-05, + "loss": 0.03833131, + "step": 15193 + }, + { + "epoch": 30.388, + "grad_norm": 1.6777894496917725, + "learning_rate": 2e-05, + "loss": 0.04095278, + "step": 15194 + }, + { + "epoch": 30.39, + "grad_norm": 2.4663455486297607, + "learning_rate": 2e-05, + "loss": 0.03233901, + "step": 15195 + }, + { + "epoch": 30.392, + "grad_norm": 1.5365792512893677, + "learning_rate": 2e-05, + "loss": 0.05167238, + "step": 15196 + }, + { + "epoch": 30.394, + "grad_norm": 1.1537460088729858, + "learning_rate": 2e-05, + "loss": 0.03557686, + "step": 15197 + }, + { + "epoch": 30.396, + "grad_norm": 0.9411190152168274, + "learning_rate": 2e-05, + "loss": 0.02952364, + "step": 15198 + }, + { + "epoch": 30.398, + "grad_norm": 1.2708302736282349, + "learning_rate": 2e-05, + "loss": 0.05088828, + "step": 15199 + }, + { + "epoch": 30.4, + "grad_norm": 2.0306994915008545, + "learning_rate": 2e-05, + "loss": 0.04890837, + "step": 15200 + }, + { + "epoch": 30.402, + "grad_norm": 1.7756644487380981, + "learning_rate": 2e-05, + "loss": 0.04039244, + "step": 15201 + }, + { + "epoch": 30.404, + "grad_norm": 1.4186558723449707, + "learning_rate": 2e-05, + "loss": 0.05239846, + "step": 15202 + }, + { + "epoch": 30.406, + "grad_norm": 1.0431525707244873, + "learning_rate": 2e-05, + "loss": 0.03030356, + "step": 15203 + }, + { + "epoch": 30.408, + "grad_norm": 2.5661303997039795, + "learning_rate": 2e-05, + "loss": 0.06191105, + "step": 15204 + }, + { + "epoch": 30.41, + "grad_norm": 1.5152759552001953, + 
"learning_rate": 2e-05, + "loss": 0.0515747, + "step": 15205 + }, + { + "epoch": 30.412, + "grad_norm": 1.1445180177688599, + "learning_rate": 2e-05, + "loss": 0.04189182, + "step": 15206 + }, + { + "epoch": 30.414, + "grad_norm": 1.1220874786376953, + "learning_rate": 2e-05, + "loss": 0.04300126, + "step": 15207 + }, + { + "epoch": 30.416, + "grad_norm": 1.1378350257873535, + "learning_rate": 2e-05, + "loss": 0.03469217, + "step": 15208 + }, + { + "epoch": 30.418, + "grad_norm": 1.337202548980713, + "learning_rate": 2e-05, + "loss": 0.03686753, + "step": 15209 + }, + { + "epoch": 30.42, + "grad_norm": 0.9815614223480225, + "learning_rate": 2e-05, + "loss": 0.03325251, + "step": 15210 + }, + { + "epoch": 30.422, + "grad_norm": 0.8998761773109436, + "learning_rate": 2e-05, + "loss": 0.03149875, + "step": 15211 + }, + { + "epoch": 30.424, + "grad_norm": 1.0622773170471191, + "learning_rate": 2e-05, + "loss": 0.04623077, + "step": 15212 + }, + { + "epoch": 30.426, + "grad_norm": 2.55696177482605, + "learning_rate": 2e-05, + "loss": 0.05916242, + "step": 15213 + }, + { + "epoch": 30.428, + "grad_norm": 1.0451836585998535, + "learning_rate": 2e-05, + "loss": 0.03193265, + "step": 15214 + }, + { + "epoch": 30.43, + "grad_norm": 1.0104472637176514, + "learning_rate": 2e-05, + "loss": 0.04023196, + "step": 15215 + }, + { + "epoch": 30.432, + "grad_norm": 1.5846662521362305, + "learning_rate": 2e-05, + "loss": 0.04536471, + "step": 15216 + }, + { + "epoch": 30.434, + "grad_norm": 1.2485827207565308, + "learning_rate": 2e-05, + "loss": 0.05067682, + "step": 15217 + }, + { + "epoch": 30.436, + "grad_norm": 2.142920732498169, + "learning_rate": 2e-05, + "loss": 0.04851421, + "step": 15218 + }, + { + "epoch": 30.438, + "grad_norm": 2.670254945755005, + "learning_rate": 2e-05, + "loss": 0.03117582, + "step": 15219 + }, + { + "epoch": 30.44, + "grad_norm": 2.0113894939422607, + "learning_rate": 2e-05, + "loss": 0.04463819, + "step": 15220 + }, + { + "epoch": 30.442, + "grad_norm": 1.347212791442871, + "learning_rate": 2e-05, + "loss": 0.03855886, + "step": 15221 + }, + { + "epoch": 30.444, + "grad_norm": 1.3231719732284546, + "learning_rate": 2e-05, + "loss": 0.04579235, + "step": 15222 + }, + { + "epoch": 30.446, + "grad_norm": 1.3619619607925415, + "learning_rate": 2e-05, + "loss": 0.05623931, + "step": 15223 + }, + { + "epoch": 30.448, + "grad_norm": 1.569140076637268, + "learning_rate": 2e-05, + "loss": 0.05256468, + "step": 15224 + }, + { + "epoch": 30.45, + "grad_norm": 1.5590763092041016, + "learning_rate": 2e-05, + "loss": 0.04745616, + "step": 15225 + }, + { + "epoch": 30.452, + "grad_norm": 1.8052546977996826, + "learning_rate": 2e-05, + "loss": 0.03997829, + "step": 15226 + }, + { + "epoch": 30.454, + "grad_norm": 1.4032646417617798, + "learning_rate": 2e-05, + "loss": 0.03643226, + "step": 15227 + }, + { + "epoch": 30.456, + "grad_norm": 1.6574431657791138, + "learning_rate": 2e-05, + "loss": 0.04806088, + "step": 15228 + }, + { + "epoch": 30.458, + "grad_norm": 1.1438560485839844, + "learning_rate": 2e-05, + "loss": 0.04555596, + "step": 15229 + }, + { + "epoch": 30.46, + "grad_norm": 1.6412994861602783, + "learning_rate": 2e-05, + "loss": 0.05041576, + "step": 15230 + }, + { + "epoch": 30.462, + "grad_norm": 1.7027461528778076, + "learning_rate": 2e-05, + "loss": 0.04613438, + "step": 15231 + }, + { + "epoch": 30.464, + "grad_norm": 1.8641504049301147, + "learning_rate": 2e-05, + "loss": 0.05400945, + "step": 15232 + }, + { + "epoch": 30.466, + "grad_norm": 1.0201693773269653, + 
"learning_rate": 2e-05, + "loss": 0.03487154, + "step": 15233 + }, + { + "epoch": 30.468, + "grad_norm": 3.329221248626709, + "learning_rate": 2e-05, + "loss": 0.0438869, + "step": 15234 + }, + { + "epoch": 30.47, + "grad_norm": 2.2483973503112793, + "learning_rate": 2e-05, + "loss": 0.05888098, + "step": 15235 + }, + { + "epoch": 30.472, + "grad_norm": 1.1053797006607056, + "learning_rate": 2e-05, + "loss": 0.02992825, + "step": 15236 + }, + { + "epoch": 30.474, + "grad_norm": 1.3227661848068237, + "learning_rate": 2e-05, + "loss": 0.0443338, + "step": 15237 + }, + { + "epoch": 30.476, + "grad_norm": 1.077270269393921, + "learning_rate": 2e-05, + "loss": 0.03753895, + "step": 15238 + }, + { + "epoch": 30.478, + "grad_norm": 1.0654442310333252, + "learning_rate": 2e-05, + "loss": 0.04142526, + "step": 15239 + }, + { + "epoch": 30.48, + "grad_norm": 1.0802520513534546, + "learning_rate": 2e-05, + "loss": 0.04611638, + "step": 15240 + }, + { + "epoch": 30.482, + "grad_norm": 1.2629036903381348, + "learning_rate": 2e-05, + "loss": 0.06196934, + "step": 15241 + }, + { + "epoch": 30.484, + "grad_norm": 1.756333589553833, + "learning_rate": 2e-05, + "loss": 0.04763553, + "step": 15242 + }, + { + "epoch": 30.486, + "grad_norm": 1.0675008296966553, + "learning_rate": 2e-05, + "loss": 0.03430801, + "step": 15243 + }, + { + "epoch": 30.488, + "grad_norm": 5.384986400604248, + "learning_rate": 2e-05, + "loss": 0.03727914, + "step": 15244 + }, + { + "epoch": 30.49, + "grad_norm": 1.504477620124817, + "learning_rate": 2e-05, + "loss": 0.05895821, + "step": 15245 + }, + { + "epoch": 30.492, + "grad_norm": 1.3034749031066895, + "learning_rate": 2e-05, + "loss": 0.04594578, + "step": 15246 + }, + { + "epoch": 30.494, + "grad_norm": 1.38698148727417, + "learning_rate": 2e-05, + "loss": 0.05439015, + "step": 15247 + }, + { + "epoch": 30.496, + "grad_norm": 1.0708712339401245, + "learning_rate": 2e-05, + "loss": 0.03840674, + "step": 15248 + }, + { + "epoch": 30.498, + "grad_norm": 1.4558812379837036, + "learning_rate": 2e-05, + "loss": 0.02542023, + "step": 15249 + }, + { + "epoch": 30.5, + "grad_norm": 1.7556490898132324, + "learning_rate": 2e-05, + "loss": 0.0409022, + "step": 15250 + }, + { + "epoch": 30.502, + "grad_norm": 1.2140228748321533, + "learning_rate": 2e-05, + "loss": 0.03575923, + "step": 15251 + }, + { + "epoch": 30.504, + "grad_norm": 0.9344707131385803, + "learning_rate": 2e-05, + "loss": 0.03167684, + "step": 15252 + }, + { + "epoch": 30.506, + "grad_norm": 1.3499372005462646, + "learning_rate": 2e-05, + "loss": 0.03214483, + "step": 15253 + }, + { + "epoch": 30.508, + "grad_norm": 1.1017028093338013, + "learning_rate": 2e-05, + "loss": 0.02525232, + "step": 15254 + }, + { + "epoch": 30.51, + "grad_norm": 1.4885841608047485, + "learning_rate": 2e-05, + "loss": 0.06516996, + "step": 15255 + }, + { + "epoch": 30.512, + "grad_norm": 1.4931526184082031, + "learning_rate": 2e-05, + "loss": 0.06183759, + "step": 15256 + }, + { + "epoch": 30.514, + "grad_norm": 0.9158805012702942, + "learning_rate": 2e-05, + "loss": 0.03264806, + "step": 15257 + }, + { + "epoch": 30.516, + "grad_norm": 1.7295507192611694, + "learning_rate": 2e-05, + "loss": 0.0437915, + "step": 15258 + }, + { + "epoch": 30.518, + "grad_norm": 1.2573812007904053, + "learning_rate": 2e-05, + "loss": 0.04718456, + "step": 15259 + }, + { + "epoch": 30.52, + "grad_norm": 1.111760139465332, + "learning_rate": 2e-05, + "loss": 0.03778823, + "step": 15260 + }, + { + "epoch": 30.522, + "grad_norm": 1.3841087818145752, + "learning_rate": 
2e-05, + "loss": 0.05139244, + "step": 15261 + }, + { + "epoch": 30.524, + "grad_norm": 1.2217211723327637, + "learning_rate": 2e-05, + "loss": 0.05407543, + "step": 15262 + }, + { + "epoch": 30.526, + "grad_norm": 1.9204769134521484, + "learning_rate": 2e-05, + "loss": 0.05501651, + "step": 15263 + }, + { + "epoch": 30.528, + "grad_norm": 1.0646864175796509, + "learning_rate": 2e-05, + "loss": 0.04272152, + "step": 15264 + }, + { + "epoch": 30.53, + "grad_norm": 1.138542652130127, + "learning_rate": 2e-05, + "loss": 0.04351484, + "step": 15265 + }, + { + "epoch": 30.532, + "grad_norm": 1.1147321462631226, + "learning_rate": 2e-05, + "loss": 0.03230875, + "step": 15266 + }, + { + "epoch": 30.534, + "grad_norm": 1.3464140892028809, + "learning_rate": 2e-05, + "loss": 0.05336832, + "step": 15267 + }, + { + "epoch": 30.536, + "grad_norm": 1.3282221555709839, + "learning_rate": 2e-05, + "loss": 0.06214736, + "step": 15268 + }, + { + "epoch": 30.538, + "grad_norm": 1.1814717054367065, + "learning_rate": 2e-05, + "loss": 0.03814064, + "step": 15269 + }, + { + "epoch": 30.54, + "grad_norm": 1.167283535003662, + "learning_rate": 2e-05, + "loss": 0.04938902, + "step": 15270 + }, + { + "epoch": 30.542, + "grad_norm": 1.5443036556243896, + "learning_rate": 2e-05, + "loss": 0.04786984, + "step": 15271 + }, + { + "epoch": 30.544, + "grad_norm": 1.666343331336975, + "learning_rate": 2e-05, + "loss": 0.05091058, + "step": 15272 + }, + { + "epoch": 30.546, + "grad_norm": 1.4363032579421997, + "learning_rate": 2e-05, + "loss": 0.04783217, + "step": 15273 + }, + { + "epoch": 30.548000000000002, + "grad_norm": 1.348711371421814, + "learning_rate": 2e-05, + "loss": 0.05052876, + "step": 15274 + }, + { + "epoch": 30.55, + "grad_norm": 1.2497211694717407, + "learning_rate": 2e-05, + "loss": 0.03438094, + "step": 15275 + }, + { + "epoch": 30.552, + "grad_norm": 1.3024879693984985, + "learning_rate": 2e-05, + "loss": 0.04458308, + "step": 15276 + }, + { + "epoch": 30.554, + "grad_norm": 1.4879077672958374, + "learning_rate": 2e-05, + "loss": 0.03748794, + "step": 15277 + }, + { + "epoch": 30.556, + "grad_norm": 1.565683126449585, + "learning_rate": 2e-05, + "loss": 0.03531917, + "step": 15278 + }, + { + "epoch": 30.558, + "grad_norm": 1.226104974746704, + "learning_rate": 2e-05, + "loss": 0.04298712, + "step": 15279 + }, + { + "epoch": 30.56, + "grad_norm": 1.5387117862701416, + "learning_rate": 2e-05, + "loss": 0.04004962, + "step": 15280 + }, + { + "epoch": 30.562, + "grad_norm": 1.0032356977462769, + "learning_rate": 2e-05, + "loss": 0.03250517, + "step": 15281 + }, + { + "epoch": 30.564, + "grad_norm": 1.0234278440475464, + "learning_rate": 2e-05, + "loss": 0.04238243, + "step": 15282 + }, + { + "epoch": 30.566, + "grad_norm": 1.151049256324768, + "learning_rate": 2e-05, + "loss": 0.04576783, + "step": 15283 + }, + { + "epoch": 30.568, + "grad_norm": 2.4301161766052246, + "learning_rate": 2e-05, + "loss": 0.04964666, + "step": 15284 + }, + { + "epoch": 30.57, + "grad_norm": 1.798929214477539, + "learning_rate": 2e-05, + "loss": 0.04637855, + "step": 15285 + }, + { + "epoch": 30.572, + "grad_norm": 1.205482006072998, + "learning_rate": 2e-05, + "loss": 0.04132317, + "step": 15286 + }, + { + "epoch": 30.574, + "grad_norm": 1.730236530303955, + "learning_rate": 2e-05, + "loss": 0.04239135, + "step": 15287 + }, + { + "epoch": 30.576, + "grad_norm": 1.3888801336288452, + "learning_rate": 2e-05, + "loss": 0.04025697, + "step": 15288 + }, + { + "epoch": 30.578, + "grad_norm": 1.2676804065704346, + "learning_rate": 
2e-05, + "loss": 0.05485225, + "step": 15289 + }, + { + "epoch": 30.58, + "grad_norm": 1.2869915962219238, + "learning_rate": 2e-05, + "loss": 0.04968848, + "step": 15290 + }, + { + "epoch": 30.582, + "grad_norm": 1.0613614320755005, + "learning_rate": 2e-05, + "loss": 0.03498927, + "step": 15291 + }, + { + "epoch": 30.584, + "grad_norm": 1.000826120376587, + "learning_rate": 2e-05, + "loss": 0.03525396, + "step": 15292 + }, + { + "epoch": 30.586, + "grad_norm": 1.5731277465820312, + "learning_rate": 2e-05, + "loss": 0.05101633, + "step": 15293 + }, + { + "epoch": 30.588, + "grad_norm": 1.1912803649902344, + "learning_rate": 2e-05, + "loss": 0.04636155, + "step": 15294 + }, + { + "epoch": 30.59, + "grad_norm": 1.1083427667617798, + "learning_rate": 2e-05, + "loss": 0.04011728, + "step": 15295 + }, + { + "epoch": 30.592, + "grad_norm": 0.8691169023513794, + "learning_rate": 2e-05, + "loss": 0.02712849, + "step": 15296 + }, + { + "epoch": 30.594, + "grad_norm": 1.5057884454727173, + "learning_rate": 2e-05, + "loss": 0.05116381, + "step": 15297 + }, + { + "epoch": 30.596, + "grad_norm": 1.8393473625183105, + "learning_rate": 2e-05, + "loss": 0.05369818, + "step": 15298 + }, + { + "epoch": 30.598, + "grad_norm": 1.4146645069122314, + "learning_rate": 2e-05, + "loss": 0.04586646, + "step": 15299 + }, + { + "epoch": 30.6, + "grad_norm": 1.0296281576156616, + "learning_rate": 2e-05, + "loss": 0.02862342, + "step": 15300 + }, + { + "epoch": 30.602, + "grad_norm": 1.4884545803070068, + "learning_rate": 2e-05, + "loss": 0.03266924, + "step": 15301 + }, + { + "epoch": 30.604, + "grad_norm": 1.6892744302749634, + "learning_rate": 2e-05, + "loss": 0.03972404, + "step": 15302 + }, + { + "epoch": 30.606, + "grad_norm": 1.5136027336120605, + "learning_rate": 2e-05, + "loss": 0.02378543, + "step": 15303 + }, + { + "epoch": 30.608, + "grad_norm": 1.1681137084960938, + "learning_rate": 2e-05, + "loss": 0.0225147, + "step": 15304 + }, + { + "epoch": 30.61, + "grad_norm": 1.0413873195648193, + "learning_rate": 2e-05, + "loss": 0.03786266, + "step": 15305 + }, + { + "epoch": 30.612, + "grad_norm": 1.3476159572601318, + "learning_rate": 2e-05, + "loss": 0.0479761, + "step": 15306 + }, + { + "epoch": 30.614, + "grad_norm": 1.4096819162368774, + "learning_rate": 2e-05, + "loss": 0.04315577, + "step": 15307 + }, + { + "epoch": 30.616, + "grad_norm": 1.1657134294509888, + "learning_rate": 2e-05, + "loss": 0.03629996, + "step": 15308 + }, + { + "epoch": 30.618, + "grad_norm": 1.0118683576583862, + "learning_rate": 2e-05, + "loss": 0.03799508, + "step": 15309 + }, + { + "epoch": 30.62, + "grad_norm": 1.3761227130889893, + "learning_rate": 2e-05, + "loss": 0.04091384, + "step": 15310 + }, + { + "epoch": 30.622, + "grad_norm": 1.3315359354019165, + "learning_rate": 2e-05, + "loss": 0.05451095, + "step": 15311 + }, + { + "epoch": 30.624, + "grad_norm": 1.0484113693237305, + "learning_rate": 2e-05, + "loss": 0.04014854, + "step": 15312 + }, + { + "epoch": 30.626, + "grad_norm": 1.0485554933547974, + "learning_rate": 2e-05, + "loss": 0.03734627, + "step": 15313 + }, + { + "epoch": 30.628, + "grad_norm": 1.7621971368789673, + "learning_rate": 2e-05, + "loss": 0.06587996, + "step": 15314 + }, + { + "epoch": 30.63, + "grad_norm": 1.1702368259429932, + "learning_rate": 2e-05, + "loss": 0.03241003, + "step": 15315 + }, + { + "epoch": 30.632, + "grad_norm": 0.9568459987640381, + "learning_rate": 2e-05, + "loss": 0.03606553, + "step": 15316 + }, + { + "epoch": 30.634, + "grad_norm": 1.383159875869751, + "learning_rate": 2e-05, + 
"loss": 0.05273655, + "step": 15317 + }, + { + "epoch": 30.636, + "grad_norm": 1.2365126609802246, + "learning_rate": 2e-05, + "loss": 0.03494096, + "step": 15318 + }, + { + "epoch": 30.638, + "grad_norm": 1.046418309211731, + "learning_rate": 2e-05, + "loss": 0.03035589, + "step": 15319 + }, + { + "epoch": 30.64, + "grad_norm": 1.7063677310943604, + "learning_rate": 2e-05, + "loss": 0.04815275, + "step": 15320 + }, + { + "epoch": 30.642, + "grad_norm": 0.9858196973800659, + "learning_rate": 2e-05, + "loss": 0.0392507, + "step": 15321 + }, + { + "epoch": 30.644, + "grad_norm": 2.5675480365753174, + "learning_rate": 2e-05, + "loss": 0.06889924, + "step": 15322 + }, + { + "epoch": 30.646, + "grad_norm": 1.2175625562667847, + "learning_rate": 2e-05, + "loss": 0.04431954, + "step": 15323 + }, + { + "epoch": 30.648, + "grad_norm": 1.0321712493896484, + "learning_rate": 2e-05, + "loss": 0.03850232, + "step": 15324 + }, + { + "epoch": 30.65, + "grad_norm": 1.097364068031311, + "learning_rate": 2e-05, + "loss": 0.04117165, + "step": 15325 + }, + { + "epoch": 30.652, + "grad_norm": 1.0748999118804932, + "learning_rate": 2e-05, + "loss": 0.03478822, + "step": 15326 + }, + { + "epoch": 30.654, + "grad_norm": 3.8500382900238037, + "learning_rate": 2e-05, + "loss": 0.03891071, + "step": 15327 + }, + { + "epoch": 30.656, + "grad_norm": 1.0356502532958984, + "learning_rate": 2e-05, + "loss": 0.04003483, + "step": 15328 + }, + { + "epoch": 30.658, + "grad_norm": 1.2226482629776, + "learning_rate": 2e-05, + "loss": 0.04361072, + "step": 15329 + }, + { + "epoch": 30.66, + "grad_norm": 3.1957709789276123, + "learning_rate": 2e-05, + "loss": 0.04747942, + "step": 15330 + }, + { + "epoch": 30.662, + "grad_norm": 1.8828949928283691, + "learning_rate": 2e-05, + "loss": 0.05194776, + "step": 15331 + }, + { + "epoch": 30.664, + "grad_norm": 1.5473778247833252, + "learning_rate": 2e-05, + "loss": 0.05460526, + "step": 15332 + }, + { + "epoch": 30.666, + "grad_norm": 1.3898783922195435, + "learning_rate": 2e-05, + "loss": 0.05540655, + "step": 15333 + }, + { + "epoch": 30.668, + "grad_norm": 0.9823468923568726, + "learning_rate": 2e-05, + "loss": 0.02938699, + "step": 15334 + }, + { + "epoch": 30.67, + "grad_norm": 2.376797676086426, + "learning_rate": 2e-05, + "loss": 0.03299019, + "step": 15335 + }, + { + "epoch": 30.672, + "grad_norm": 1.2253479957580566, + "learning_rate": 2e-05, + "loss": 0.03981312, + "step": 15336 + }, + { + "epoch": 30.674, + "grad_norm": 1.2062132358551025, + "learning_rate": 2e-05, + "loss": 0.03478318, + "step": 15337 + }, + { + "epoch": 30.676, + "grad_norm": 0.9848882555961609, + "learning_rate": 2e-05, + "loss": 0.03393116, + "step": 15338 + }, + { + "epoch": 30.678, + "grad_norm": 1.059923768043518, + "learning_rate": 2e-05, + "loss": 0.03325968, + "step": 15339 + }, + { + "epoch": 30.68, + "grad_norm": 1.6200644969940186, + "learning_rate": 2e-05, + "loss": 0.03570171, + "step": 15340 + }, + { + "epoch": 30.682, + "grad_norm": 1.2216757535934448, + "learning_rate": 2e-05, + "loss": 0.04476602, + "step": 15341 + }, + { + "epoch": 30.684, + "grad_norm": 2.0105185508728027, + "learning_rate": 2e-05, + "loss": 0.04382947, + "step": 15342 + }, + { + "epoch": 30.686, + "grad_norm": 1.8764110803604126, + "learning_rate": 2e-05, + "loss": 0.05112007, + "step": 15343 + }, + { + "epoch": 30.688, + "grad_norm": 1.9776445627212524, + "learning_rate": 2e-05, + "loss": 0.05051792, + "step": 15344 + }, + { + "epoch": 30.69, + "grad_norm": 1.2636760473251343, + "learning_rate": 2e-05, + "loss": 
0.03654606, + "step": 15345 + }, + { + "epoch": 30.692, + "grad_norm": 1.58008873462677, + "learning_rate": 2e-05, + "loss": 0.05276753, + "step": 15346 + }, + { + "epoch": 30.694, + "grad_norm": 1.1315622329711914, + "learning_rate": 2e-05, + "loss": 0.04263277, + "step": 15347 + }, + { + "epoch": 30.696, + "grad_norm": 0.9804048538208008, + "learning_rate": 2e-05, + "loss": 0.03775415, + "step": 15348 + }, + { + "epoch": 30.698, + "grad_norm": 1.381704568862915, + "learning_rate": 2e-05, + "loss": 0.03425687, + "step": 15349 + }, + { + "epoch": 30.7, + "grad_norm": 0.9259615540504456, + "learning_rate": 2e-05, + "loss": 0.02952418, + "step": 15350 + }, + { + "epoch": 30.701999999999998, + "grad_norm": 1.5775662660598755, + "learning_rate": 2e-05, + "loss": 0.05005368, + "step": 15351 + }, + { + "epoch": 30.704, + "grad_norm": 1.3794008493423462, + "learning_rate": 2e-05, + "loss": 0.0453692, + "step": 15352 + }, + { + "epoch": 30.706, + "grad_norm": 1.189494013786316, + "learning_rate": 2e-05, + "loss": 0.04057778, + "step": 15353 + }, + { + "epoch": 30.708, + "grad_norm": 0.9436183571815491, + "learning_rate": 2e-05, + "loss": 0.02952637, + "step": 15354 + }, + { + "epoch": 30.71, + "grad_norm": 1.8463188409805298, + "learning_rate": 2e-05, + "loss": 0.05099573, + "step": 15355 + }, + { + "epoch": 30.712, + "grad_norm": 1.0513994693756104, + "learning_rate": 2e-05, + "loss": 0.03942338, + "step": 15356 + }, + { + "epoch": 30.714, + "grad_norm": 1.306980013847351, + "learning_rate": 2e-05, + "loss": 0.04591175, + "step": 15357 + }, + { + "epoch": 30.716, + "grad_norm": 1.1594239473342896, + "learning_rate": 2e-05, + "loss": 0.03216893, + "step": 15358 + }, + { + "epoch": 30.718, + "grad_norm": 1.6709867715835571, + "learning_rate": 2e-05, + "loss": 0.04788967, + "step": 15359 + }, + { + "epoch": 30.72, + "grad_norm": 1.4016337394714355, + "learning_rate": 2e-05, + "loss": 0.04345062, + "step": 15360 + }, + { + "epoch": 30.722, + "grad_norm": 1.594774603843689, + "learning_rate": 2e-05, + "loss": 0.04838118, + "step": 15361 + }, + { + "epoch": 30.724, + "grad_norm": 1.3870517015457153, + "learning_rate": 2e-05, + "loss": 0.04288274, + "step": 15362 + }, + { + "epoch": 30.726, + "grad_norm": 1.472120761871338, + "learning_rate": 2e-05, + "loss": 0.05352866, + "step": 15363 + }, + { + "epoch": 30.728, + "grad_norm": 1.9960273504257202, + "learning_rate": 2e-05, + "loss": 0.0549426, + "step": 15364 + }, + { + "epoch": 30.73, + "grad_norm": 1.4268264770507812, + "learning_rate": 2e-05, + "loss": 0.04215187, + "step": 15365 + }, + { + "epoch": 30.732, + "grad_norm": 1.359204649925232, + "learning_rate": 2e-05, + "loss": 0.03503023, + "step": 15366 + }, + { + "epoch": 30.734, + "grad_norm": 1.4836820363998413, + "learning_rate": 2e-05, + "loss": 0.04756457, + "step": 15367 + }, + { + "epoch": 30.736, + "grad_norm": 1.3566566705703735, + "learning_rate": 2e-05, + "loss": 0.04132615, + "step": 15368 + }, + { + "epoch": 30.738, + "grad_norm": 1.0889496803283691, + "learning_rate": 2e-05, + "loss": 0.03694168, + "step": 15369 + }, + { + "epoch": 30.74, + "grad_norm": 1.9702776670455933, + "learning_rate": 2e-05, + "loss": 0.04029308, + "step": 15370 + }, + { + "epoch": 30.742, + "grad_norm": 1.7063497304916382, + "learning_rate": 2e-05, + "loss": 0.04850524, + "step": 15371 + }, + { + "epoch": 30.744, + "grad_norm": 1.1768672466278076, + "learning_rate": 2e-05, + "loss": 0.04457154, + "step": 15372 + }, + { + "epoch": 30.746, + "grad_norm": 1.506422758102417, + "learning_rate": 2e-05, + "loss": 
0.05803453, + "step": 15373 + }, + { + "epoch": 30.748, + "grad_norm": 1.2613164186477661, + "learning_rate": 2e-05, + "loss": 0.0555273, + "step": 15374 + }, + { + "epoch": 30.75, + "grad_norm": 1.302585482597351, + "learning_rate": 2e-05, + "loss": 0.03637126, + "step": 15375 + }, + { + "epoch": 30.752, + "grad_norm": 1.2151802778244019, + "learning_rate": 2e-05, + "loss": 0.04023366, + "step": 15376 + }, + { + "epoch": 30.754, + "grad_norm": 1.4890795946121216, + "learning_rate": 2e-05, + "loss": 0.04400902, + "step": 15377 + }, + { + "epoch": 30.756, + "grad_norm": 1.8800874948501587, + "learning_rate": 2e-05, + "loss": 0.04399022, + "step": 15378 + }, + { + "epoch": 30.758, + "grad_norm": 1.0342926979064941, + "learning_rate": 2e-05, + "loss": 0.04493139, + "step": 15379 + }, + { + "epoch": 30.76, + "grad_norm": 2.302096366882324, + "learning_rate": 2e-05, + "loss": 0.0552254, + "step": 15380 + }, + { + "epoch": 30.762, + "grad_norm": 0.9423291087150574, + "learning_rate": 2e-05, + "loss": 0.03075608, + "step": 15381 + }, + { + "epoch": 30.764, + "grad_norm": 1.7802855968475342, + "learning_rate": 2e-05, + "loss": 0.0369587, + "step": 15382 + }, + { + "epoch": 30.766, + "grad_norm": 1.1834336519241333, + "learning_rate": 2e-05, + "loss": 0.04712176, + "step": 15383 + }, + { + "epoch": 30.768, + "grad_norm": 1.2314602136611938, + "learning_rate": 2e-05, + "loss": 0.04508484, + "step": 15384 + }, + { + "epoch": 30.77, + "grad_norm": 1.5756516456604004, + "learning_rate": 2e-05, + "loss": 0.04346953, + "step": 15385 + }, + { + "epoch": 30.772, + "grad_norm": 3.2680647373199463, + "learning_rate": 2e-05, + "loss": 0.03386783, + "step": 15386 + }, + { + "epoch": 30.774, + "grad_norm": 1.0917185544967651, + "learning_rate": 2e-05, + "loss": 0.04377945, + "step": 15387 + }, + { + "epoch": 30.776, + "grad_norm": 1.5100263357162476, + "learning_rate": 2e-05, + "loss": 0.0553541, + "step": 15388 + }, + { + "epoch": 30.778, + "grad_norm": 1.24834406375885, + "learning_rate": 2e-05, + "loss": 0.03370894, + "step": 15389 + }, + { + "epoch": 30.78, + "grad_norm": 1.9447808265686035, + "learning_rate": 2e-05, + "loss": 0.05794008, + "step": 15390 + }, + { + "epoch": 30.782, + "grad_norm": 1.1977484226226807, + "learning_rate": 2e-05, + "loss": 0.04914448, + "step": 15391 + }, + { + "epoch": 30.784, + "grad_norm": 2.5817272663116455, + "learning_rate": 2e-05, + "loss": 0.04607561, + "step": 15392 + }, + { + "epoch": 30.786, + "grad_norm": 0.9399213790893555, + "learning_rate": 2e-05, + "loss": 0.03578041, + "step": 15393 + }, + { + "epoch": 30.788, + "grad_norm": 1.2921048402786255, + "learning_rate": 2e-05, + "loss": 0.03382245, + "step": 15394 + }, + { + "epoch": 30.79, + "grad_norm": 1.1487122774124146, + "learning_rate": 2e-05, + "loss": 0.04007035, + "step": 15395 + }, + { + "epoch": 30.792, + "grad_norm": 1.0508596897125244, + "learning_rate": 2e-05, + "loss": 0.03421424, + "step": 15396 + }, + { + "epoch": 30.794, + "grad_norm": 0.864069938659668, + "learning_rate": 2e-05, + "loss": 0.02603904, + "step": 15397 + }, + { + "epoch": 30.796, + "grad_norm": 1.4718056917190552, + "learning_rate": 2e-05, + "loss": 0.038355, + "step": 15398 + }, + { + "epoch": 30.798000000000002, + "grad_norm": 1.2328131198883057, + "learning_rate": 2e-05, + "loss": 0.04491291, + "step": 15399 + }, + { + "epoch": 30.8, + "grad_norm": 1.186107873916626, + "learning_rate": 2e-05, + "loss": 0.05053133, + "step": 15400 + }, + { + "epoch": 30.802, + "grad_norm": 1.0685789585113525, + "learning_rate": 2e-05, + "loss": 
0.04427924, + "step": 15401 + }, + { + "epoch": 30.804, + "grad_norm": 1.1698602437973022, + "learning_rate": 2e-05, + "loss": 0.03796928, + "step": 15402 + }, + { + "epoch": 30.806, + "grad_norm": 1.2792614698410034, + "learning_rate": 2e-05, + "loss": 0.0548584, + "step": 15403 + }, + { + "epoch": 30.808, + "grad_norm": 1.533371090888977, + "learning_rate": 2e-05, + "loss": 0.04547575, + "step": 15404 + }, + { + "epoch": 30.81, + "grad_norm": 1.5394388437271118, + "learning_rate": 2e-05, + "loss": 0.05188145, + "step": 15405 + }, + { + "epoch": 30.812, + "grad_norm": 1.0687896013259888, + "learning_rate": 2e-05, + "loss": 0.04358595, + "step": 15406 + }, + { + "epoch": 30.814, + "grad_norm": 1.0643762350082397, + "learning_rate": 2e-05, + "loss": 0.03647963, + "step": 15407 + }, + { + "epoch": 30.816, + "grad_norm": 1.1489731073379517, + "learning_rate": 2e-05, + "loss": 0.05470124, + "step": 15408 + }, + { + "epoch": 30.818, + "grad_norm": 1.8806666135787964, + "learning_rate": 2e-05, + "loss": 0.05590469, + "step": 15409 + }, + { + "epoch": 30.82, + "grad_norm": 1.0280253887176514, + "learning_rate": 2e-05, + "loss": 0.04052454, + "step": 15410 + }, + { + "epoch": 30.822, + "grad_norm": 1.6329935789108276, + "learning_rate": 2e-05, + "loss": 0.04793561, + "step": 15411 + }, + { + "epoch": 30.824, + "grad_norm": 1.4511553049087524, + "learning_rate": 2e-05, + "loss": 0.03960864, + "step": 15412 + }, + { + "epoch": 30.826, + "grad_norm": 1.6550666093826294, + "learning_rate": 2e-05, + "loss": 0.04335929, + "step": 15413 + }, + { + "epoch": 30.828, + "grad_norm": 1.3845441341400146, + "learning_rate": 2e-05, + "loss": 0.03868318, + "step": 15414 + }, + { + "epoch": 30.83, + "grad_norm": 1.1391074657440186, + "learning_rate": 2e-05, + "loss": 0.0478869, + "step": 15415 + }, + { + "epoch": 30.832, + "grad_norm": 1.278219223022461, + "learning_rate": 2e-05, + "loss": 0.04307889, + "step": 15416 + }, + { + "epoch": 30.834, + "grad_norm": 1.268723726272583, + "learning_rate": 2e-05, + "loss": 0.03802036, + "step": 15417 + }, + { + "epoch": 30.836, + "grad_norm": 1.159140706062317, + "learning_rate": 2e-05, + "loss": 0.03451194, + "step": 15418 + }, + { + "epoch": 30.838, + "grad_norm": 1.1894630193710327, + "learning_rate": 2e-05, + "loss": 0.0473679, + "step": 15419 + }, + { + "epoch": 30.84, + "grad_norm": 1.262392520904541, + "learning_rate": 2e-05, + "loss": 0.03899877, + "step": 15420 + }, + { + "epoch": 30.842, + "grad_norm": 1.446376085281372, + "learning_rate": 2e-05, + "loss": 0.03620021, + "step": 15421 + }, + { + "epoch": 30.844, + "grad_norm": 1.4437825679779053, + "learning_rate": 2e-05, + "loss": 0.04326116, + "step": 15422 + }, + { + "epoch": 30.846, + "grad_norm": 1.1866358518600464, + "learning_rate": 2e-05, + "loss": 0.03965632, + "step": 15423 + }, + { + "epoch": 30.848, + "grad_norm": 1.379371166229248, + "learning_rate": 2e-05, + "loss": 0.04506073, + "step": 15424 + }, + { + "epoch": 30.85, + "grad_norm": 2.1914074420928955, + "learning_rate": 2e-05, + "loss": 0.05980007, + "step": 15425 + }, + { + "epoch": 30.852, + "grad_norm": 1.2148512601852417, + "learning_rate": 2e-05, + "loss": 0.04434382, + "step": 15426 + }, + { + "epoch": 30.854, + "grad_norm": 1.0785166025161743, + "learning_rate": 2e-05, + "loss": 0.03507778, + "step": 15427 + }, + { + "epoch": 30.856, + "grad_norm": 1.0878171920776367, + "learning_rate": 2e-05, + "loss": 0.03332923, + "step": 15428 + }, + { + "epoch": 30.858, + "grad_norm": 1.4415518045425415, + "learning_rate": 2e-05, + "loss": 0.05356176, + 
"step": 15429 + }, + { + "epoch": 30.86, + "grad_norm": 1.4083247184753418, + "learning_rate": 2e-05, + "loss": 0.05534295, + "step": 15430 + }, + { + "epoch": 30.862, + "grad_norm": 1.1404030323028564, + "learning_rate": 2e-05, + "loss": 0.0465, + "step": 15431 + }, + { + "epoch": 30.864, + "grad_norm": 1.4626610279083252, + "learning_rate": 2e-05, + "loss": 0.03837757, + "step": 15432 + }, + { + "epoch": 30.866, + "grad_norm": 1.0799211263656616, + "learning_rate": 2e-05, + "loss": 0.04735259, + "step": 15433 + }, + { + "epoch": 30.868, + "grad_norm": 1.3441239595413208, + "learning_rate": 2e-05, + "loss": 0.0439587, + "step": 15434 + }, + { + "epoch": 30.87, + "grad_norm": 1.341903567314148, + "learning_rate": 2e-05, + "loss": 0.04261306, + "step": 15435 + }, + { + "epoch": 30.872, + "grad_norm": 2.014629602432251, + "learning_rate": 2e-05, + "loss": 0.04418968, + "step": 15436 + }, + { + "epoch": 30.874, + "grad_norm": 1.4627952575683594, + "learning_rate": 2e-05, + "loss": 0.04787583, + "step": 15437 + }, + { + "epoch": 30.876, + "grad_norm": 1.0740972757339478, + "learning_rate": 2e-05, + "loss": 0.03408062, + "step": 15438 + }, + { + "epoch": 30.878, + "grad_norm": 1.458024024963379, + "learning_rate": 2e-05, + "loss": 0.05833004, + "step": 15439 + }, + { + "epoch": 30.88, + "grad_norm": 1.0955424308776855, + "learning_rate": 2e-05, + "loss": 0.04713885, + "step": 15440 + }, + { + "epoch": 30.882, + "grad_norm": 1.673150658607483, + "learning_rate": 2e-05, + "loss": 0.05364336, + "step": 15441 + }, + { + "epoch": 30.884, + "grad_norm": 1.1266365051269531, + "learning_rate": 2e-05, + "loss": 0.04248474, + "step": 15442 + }, + { + "epoch": 30.886, + "grad_norm": 1.5739974975585938, + "learning_rate": 2e-05, + "loss": 0.03701893, + "step": 15443 + }, + { + "epoch": 30.888, + "grad_norm": 1.4596102237701416, + "learning_rate": 2e-05, + "loss": 0.047738, + "step": 15444 + }, + { + "epoch": 30.89, + "grad_norm": 1.0531946420669556, + "learning_rate": 2e-05, + "loss": 0.04053745, + "step": 15445 + }, + { + "epoch": 30.892, + "grad_norm": 0.9864213466644287, + "learning_rate": 2e-05, + "loss": 0.03761032, + "step": 15446 + }, + { + "epoch": 30.894, + "grad_norm": 2.1464390754699707, + "learning_rate": 2e-05, + "loss": 0.05027218, + "step": 15447 + }, + { + "epoch": 30.896, + "grad_norm": 1.2808313369750977, + "learning_rate": 2e-05, + "loss": 0.04857701, + "step": 15448 + }, + { + "epoch": 30.898, + "grad_norm": 1.7590348720550537, + "learning_rate": 2e-05, + "loss": 0.05534477, + "step": 15449 + }, + { + "epoch": 30.9, + "grad_norm": 1.0882501602172852, + "learning_rate": 2e-05, + "loss": 0.04137859, + "step": 15450 + }, + { + "epoch": 30.902, + "grad_norm": 0.946066677570343, + "learning_rate": 2e-05, + "loss": 0.03223543, + "step": 15451 + }, + { + "epoch": 30.904, + "grad_norm": 0.9430605173110962, + "learning_rate": 2e-05, + "loss": 0.0311843, + "step": 15452 + }, + { + "epoch": 30.906, + "grad_norm": 1.3041696548461914, + "learning_rate": 2e-05, + "loss": 0.05397451, + "step": 15453 + }, + { + "epoch": 30.908, + "grad_norm": 1.3684122562408447, + "learning_rate": 2e-05, + "loss": 0.05539594, + "step": 15454 + }, + { + "epoch": 30.91, + "grad_norm": 2.258601665496826, + "learning_rate": 2e-05, + "loss": 0.05372835, + "step": 15455 + }, + { + "epoch": 30.912, + "grad_norm": 2.0608880519866943, + "learning_rate": 2e-05, + "loss": 0.03330045, + "step": 15456 + }, + { + "epoch": 30.914, + "grad_norm": 1.1213666200637817, + "learning_rate": 2e-05, + "loss": 0.04753671, + "step": 15457 + }, 
+ { + "epoch": 30.916, + "grad_norm": 0.942790150642395, + "learning_rate": 2e-05, + "loss": 0.03259289, + "step": 15458 + }, + { + "epoch": 30.918, + "grad_norm": 1.2493982315063477, + "learning_rate": 2e-05, + "loss": 0.04098676, + "step": 15459 + }, + { + "epoch": 30.92, + "grad_norm": 1.1307854652404785, + "learning_rate": 2e-05, + "loss": 0.04249147, + "step": 15460 + }, + { + "epoch": 30.922, + "grad_norm": 1.5322867631912231, + "learning_rate": 2e-05, + "loss": 0.03617536, + "step": 15461 + }, + { + "epoch": 30.924, + "grad_norm": 1.1813342571258545, + "learning_rate": 2e-05, + "loss": 0.05639866, + "step": 15462 + }, + { + "epoch": 30.926, + "grad_norm": 0.8674895763397217, + "learning_rate": 2e-05, + "loss": 0.03328463, + "step": 15463 + }, + { + "epoch": 30.928, + "grad_norm": 1.5556342601776123, + "learning_rate": 2e-05, + "loss": 0.05676714, + "step": 15464 + }, + { + "epoch": 30.93, + "grad_norm": 1.0045814514160156, + "learning_rate": 2e-05, + "loss": 0.03216821, + "step": 15465 + }, + { + "epoch": 30.932, + "grad_norm": 1.0363186597824097, + "learning_rate": 2e-05, + "loss": 0.04014078, + "step": 15466 + }, + { + "epoch": 30.934, + "grad_norm": 1.386189341545105, + "learning_rate": 2e-05, + "loss": 0.05402887, + "step": 15467 + }, + { + "epoch": 30.936, + "grad_norm": 1.137304663658142, + "learning_rate": 2e-05, + "loss": 0.03464006, + "step": 15468 + }, + { + "epoch": 30.938, + "grad_norm": 1.3049153089523315, + "learning_rate": 2e-05, + "loss": 0.04342077, + "step": 15469 + }, + { + "epoch": 30.94, + "grad_norm": 1.1227730512619019, + "learning_rate": 2e-05, + "loss": 0.04238366, + "step": 15470 + }, + { + "epoch": 30.942, + "grad_norm": 1.2162871360778809, + "learning_rate": 2e-05, + "loss": 0.06160378, + "step": 15471 + }, + { + "epoch": 30.944, + "grad_norm": 0.8703532218933105, + "learning_rate": 2e-05, + "loss": 0.02413613, + "step": 15472 + }, + { + "epoch": 30.946, + "grad_norm": 1.9870779514312744, + "learning_rate": 2e-05, + "loss": 0.03381436, + "step": 15473 + }, + { + "epoch": 30.948, + "grad_norm": 1.9186214208602905, + "learning_rate": 2e-05, + "loss": 0.05125932, + "step": 15474 + }, + { + "epoch": 30.95, + "grad_norm": 1.7328497171401978, + "learning_rate": 2e-05, + "loss": 0.0443011, + "step": 15475 + }, + { + "epoch": 30.951999999999998, + "grad_norm": 0.9649899005889893, + "learning_rate": 2e-05, + "loss": 0.03079892, + "step": 15476 + }, + { + "epoch": 30.954, + "grad_norm": 0.9868589043617249, + "learning_rate": 2e-05, + "loss": 0.02492999, + "step": 15477 + }, + { + "epoch": 30.956, + "grad_norm": 1.2447783946990967, + "learning_rate": 2e-05, + "loss": 0.05273892, + "step": 15478 + }, + { + "epoch": 30.958, + "grad_norm": 1.2232327461242676, + "learning_rate": 2e-05, + "loss": 0.04913422, + "step": 15479 + }, + { + "epoch": 30.96, + "grad_norm": 1.6843918561935425, + "learning_rate": 2e-05, + "loss": 0.05371039, + "step": 15480 + }, + { + "epoch": 30.962, + "grad_norm": 0.8725624680519104, + "learning_rate": 2e-05, + "loss": 0.02640246, + "step": 15481 + }, + { + "epoch": 30.964, + "grad_norm": 1.2975506782531738, + "learning_rate": 2e-05, + "loss": 0.05456071, + "step": 15482 + }, + { + "epoch": 30.966, + "grad_norm": 1.7329095602035522, + "learning_rate": 2e-05, + "loss": 0.04484017, + "step": 15483 + }, + { + "epoch": 30.968, + "grad_norm": 1.4026141166687012, + "learning_rate": 2e-05, + "loss": 0.05021109, + "step": 15484 + }, + { + "epoch": 30.97, + "grad_norm": 1.046205997467041, + "learning_rate": 2e-05, + "loss": 0.03913869, + "step": 15485 + 
}, + { + "epoch": 30.972, + "grad_norm": 3.3396363258361816, + "learning_rate": 2e-05, + "loss": 0.04359189, + "step": 15486 + }, + { + "epoch": 30.974, + "grad_norm": 1.369861125946045, + "learning_rate": 2e-05, + "loss": 0.04546359, + "step": 15487 + }, + { + "epoch": 30.976, + "grad_norm": 1.4066284894943237, + "learning_rate": 2e-05, + "loss": 0.03458145, + "step": 15488 + }, + { + "epoch": 30.978, + "grad_norm": 1.479104995727539, + "learning_rate": 2e-05, + "loss": 0.04172707, + "step": 15489 + }, + { + "epoch": 30.98, + "grad_norm": 1.0808502435684204, + "learning_rate": 2e-05, + "loss": 0.0374383, + "step": 15490 + }, + { + "epoch": 30.982, + "grad_norm": 1.153609275817871, + "learning_rate": 2e-05, + "loss": 0.04683513, + "step": 15491 + }, + { + "epoch": 30.984, + "grad_norm": 1.3860514163970947, + "learning_rate": 2e-05, + "loss": 0.04752363, + "step": 15492 + }, + { + "epoch": 30.986, + "grad_norm": 1.005586862564087, + "learning_rate": 2e-05, + "loss": 0.03461302, + "step": 15493 + }, + { + "epoch": 30.988, + "grad_norm": 2.63100266456604, + "learning_rate": 2e-05, + "loss": 0.05241647, + "step": 15494 + }, + { + "epoch": 30.99, + "grad_norm": 1.4951649904251099, + "learning_rate": 2e-05, + "loss": 0.03878006, + "step": 15495 + }, + { + "epoch": 30.992, + "grad_norm": 1.208921194076538, + "learning_rate": 2e-05, + "loss": 0.04567692, + "step": 15496 + }, + { + "epoch": 30.994, + "grad_norm": 1.2812520265579224, + "learning_rate": 2e-05, + "loss": 0.03168442, + "step": 15497 + }, + { + "epoch": 30.996, + "grad_norm": 1.3722220659255981, + "learning_rate": 2e-05, + "loss": 0.04763746, + "step": 15498 + }, + { + "epoch": 30.998, + "grad_norm": 1.4294661283493042, + "learning_rate": 2e-05, + "loss": 0.05221847, + "step": 15499 + }, + { + "epoch": 31.0, + "grad_norm": 1.2685956954956055, + "learning_rate": 2e-05, + "loss": 0.05318395, + "step": 15500 + }, + { + "epoch": 31.0, + "eval_performance": { + "AngleClassification_1": 0.994, + "AngleClassification_2": 0.998, + "AngleClassification_3": 0.9780439121756487, + "Equal_1": 1.0, + "Equal_2": 0.9880239520958084, + "Equal_3": 0.9860279441117764, + "LineComparison_1": 1.0, + "LineComparison_2": 1.0, + "LineComparison_3": 0.998003992015968, + "Parallel_1": 0.9899799599198397, + "Parallel_2": 0.9919839679358717, + "Parallel_3": 0.994, + "Perpendicular_1": 0.998, + "Perpendicular_2": 0.992, + "Perpendicular_3": 0.8957915831663327, + "PointLiesOnCircle_1": 1.0, + "PointLiesOnCircle_2": 0.9976666666666667, + "PointLiesOnCircle_3": 0.9936, + "PointLiesOnLine_1": 0.9979959919839679, + "PointLiesOnLine_2": 0.9959919839679359, + "PointLiesOnLine_3": 0.9880239520958084 + }, + "eval_runtime": 319.7306, + "eval_samples_per_second": 32.84, + "eval_steps_per_second": 0.657, + "step": 15500 + }, + { + "epoch": 31.002, + "grad_norm": 1.2148770093917847, + "learning_rate": 2e-05, + "loss": 0.04367323, + "step": 15501 + }, + { + "epoch": 31.004, + "grad_norm": 1.2555484771728516, + "learning_rate": 2e-05, + "loss": 0.04165998, + "step": 15502 + }, + { + "epoch": 31.006, + "grad_norm": 1.7054599523544312, + "learning_rate": 2e-05, + "loss": 0.04306837, + "step": 15503 + }, + { + "epoch": 31.008, + "grad_norm": 1.1247222423553467, + "learning_rate": 2e-05, + "loss": 0.04493858, + "step": 15504 + }, + { + "epoch": 31.01, + "grad_norm": 1.21216881275177, + "learning_rate": 2e-05, + "loss": 0.04912875, + "step": 15505 + }, + { + "epoch": 31.012, + "grad_norm": 1.0440160036087036, + "learning_rate": 2e-05, + "loss": 0.03388001, + "step": 15506 + }, + { + 
"epoch": 31.014, + "grad_norm": 2.1243808269500732, + "learning_rate": 2e-05, + "loss": 0.04466413, + "step": 15507 + }, + { + "epoch": 31.016, + "grad_norm": 1.024837851524353, + "learning_rate": 2e-05, + "loss": 0.03612889, + "step": 15508 + }, + { + "epoch": 31.018, + "grad_norm": 1.3261427879333496, + "learning_rate": 2e-05, + "loss": 0.04339204, + "step": 15509 + }, + { + "epoch": 31.02, + "grad_norm": 1.1927242279052734, + "learning_rate": 2e-05, + "loss": 0.04285469, + "step": 15510 + }, + { + "epoch": 31.022, + "grad_norm": 1.6944687366485596, + "learning_rate": 2e-05, + "loss": 0.06444372, + "step": 15511 + }, + { + "epoch": 31.024, + "grad_norm": 1.8218998908996582, + "learning_rate": 2e-05, + "loss": 0.04665653, + "step": 15512 + }, + { + "epoch": 31.026, + "grad_norm": 0.961631715297699, + "learning_rate": 2e-05, + "loss": 0.02488809, + "step": 15513 + }, + { + "epoch": 31.028, + "grad_norm": 1.3068662881851196, + "learning_rate": 2e-05, + "loss": 0.04824544, + "step": 15514 + }, + { + "epoch": 31.03, + "grad_norm": 1.7906019687652588, + "learning_rate": 2e-05, + "loss": 0.03604405, + "step": 15515 + }, + { + "epoch": 31.032, + "grad_norm": 0.9942117929458618, + "learning_rate": 2e-05, + "loss": 0.04298211, + "step": 15516 + }, + { + "epoch": 31.034, + "grad_norm": 1.2944165468215942, + "learning_rate": 2e-05, + "loss": 0.04983748, + "step": 15517 + }, + { + "epoch": 31.036, + "grad_norm": 1.3310195207595825, + "learning_rate": 2e-05, + "loss": 0.04212454, + "step": 15518 + }, + { + "epoch": 31.038, + "grad_norm": 1.0146511793136597, + "learning_rate": 2e-05, + "loss": 0.03470632, + "step": 15519 + }, + { + "epoch": 31.04, + "grad_norm": 1.0683945417404175, + "learning_rate": 2e-05, + "loss": 0.03555411, + "step": 15520 + }, + { + "epoch": 31.042, + "grad_norm": 0.9860395789146423, + "learning_rate": 2e-05, + "loss": 0.03534026, + "step": 15521 + }, + { + "epoch": 31.044, + "grad_norm": 1.2321219444274902, + "learning_rate": 2e-05, + "loss": 0.04802042, + "step": 15522 + }, + { + "epoch": 31.046, + "grad_norm": 0.9068757891654968, + "learning_rate": 2e-05, + "loss": 0.03739195, + "step": 15523 + }, + { + "epoch": 31.048, + "grad_norm": 1.0807992219924927, + "learning_rate": 2e-05, + "loss": 0.03552995, + "step": 15524 + }, + { + "epoch": 31.05, + "grad_norm": 1.2086913585662842, + "learning_rate": 2e-05, + "loss": 0.03402312, + "step": 15525 + }, + { + "epoch": 31.052, + "grad_norm": 1.1114925146102905, + "learning_rate": 2e-05, + "loss": 0.04508002, + "step": 15526 + }, + { + "epoch": 31.054, + "grad_norm": 1.053460717201233, + "learning_rate": 2e-05, + "loss": 0.03332749, + "step": 15527 + }, + { + "epoch": 31.056, + "grad_norm": 1.5291039943695068, + "learning_rate": 2e-05, + "loss": 0.07005154, + "step": 15528 + }, + { + "epoch": 31.058, + "grad_norm": 1.3969502449035645, + "learning_rate": 2e-05, + "loss": 0.03875972, + "step": 15529 + }, + { + "epoch": 31.06, + "grad_norm": 1.258709192276001, + "learning_rate": 2e-05, + "loss": 0.0648275, + "step": 15530 + }, + { + "epoch": 31.062, + "grad_norm": 2.1577484607696533, + "learning_rate": 2e-05, + "loss": 0.04727962, + "step": 15531 + }, + { + "epoch": 31.064, + "grad_norm": 1.453369379043579, + "learning_rate": 2e-05, + "loss": 0.03450019, + "step": 15532 + }, + { + "epoch": 31.066, + "grad_norm": 1.3698201179504395, + "learning_rate": 2e-05, + "loss": 0.0564566, + "step": 15533 + }, + { + "epoch": 31.068, + "grad_norm": 1.4147790670394897, + "learning_rate": 2e-05, + "loss": 0.06183496, + "step": 15534 + }, + { + "epoch": 
31.07, + "grad_norm": 0.8963314294815063, + "learning_rate": 2e-05, + "loss": 0.0314716, + "step": 15535 + }, + { + "epoch": 31.072, + "grad_norm": 1.7133666276931763, + "learning_rate": 2e-05, + "loss": 0.03604105, + "step": 15536 + }, + { + "epoch": 31.074, + "grad_norm": 0.9274123907089233, + "learning_rate": 2e-05, + "loss": 0.03644059, + "step": 15537 + }, + { + "epoch": 31.076, + "grad_norm": 0.9932688474655151, + "learning_rate": 2e-05, + "loss": 0.03320706, + "step": 15538 + }, + { + "epoch": 31.078, + "grad_norm": 1.139175534248352, + "learning_rate": 2e-05, + "loss": 0.04096094, + "step": 15539 + }, + { + "epoch": 31.08, + "grad_norm": 2.054170846939087, + "learning_rate": 2e-05, + "loss": 0.03290722, + "step": 15540 + }, + { + "epoch": 31.082, + "grad_norm": 1.0771855115890503, + "learning_rate": 2e-05, + "loss": 0.04088487, + "step": 15541 + }, + { + "epoch": 31.084, + "grad_norm": 4.079372406005859, + "learning_rate": 2e-05, + "loss": 0.06007933, + "step": 15542 + }, + { + "epoch": 31.086, + "grad_norm": 4.406796932220459, + "learning_rate": 2e-05, + "loss": 0.0371773, + "step": 15543 + }, + { + "epoch": 31.088, + "grad_norm": 1.232641339302063, + "learning_rate": 2e-05, + "loss": 0.04272714, + "step": 15544 + }, + { + "epoch": 31.09, + "grad_norm": 1.339889407157898, + "learning_rate": 2e-05, + "loss": 0.03682033, + "step": 15545 + }, + { + "epoch": 31.092, + "grad_norm": 1.6685773134231567, + "learning_rate": 2e-05, + "loss": 0.04497613, + "step": 15546 + }, + { + "epoch": 31.094, + "grad_norm": 1.1808186769485474, + "learning_rate": 2e-05, + "loss": 0.04455563, + "step": 15547 + }, + { + "epoch": 31.096, + "grad_norm": 1.0447876453399658, + "learning_rate": 2e-05, + "loss": 0.03678391, + "step": 15548 + }, + { + "epoch": 31.098, + "grad_norm": 1.3275498151779175, + "learning_rate": 2e-05, + "loss": 0.05487559, + "step": 15549 + }, + { + "epoch": 31.1, + "grad_norm": 1.0034887790679932, + "learning_rate": 2e-05, + "loss": 0.03993573, + "step": 15550 + }, + { + "epoch": 31.102, + "grad_norm": 1.2984068393707275, + "learning_rate": 2e-05, + "loss": 0.03917623, + "step": 15551 + }, + { + "epoch": 31.104, + "grad_norm": 1.0606168508529663, + "learning_rate": 2e-05, + "loss": 0.03269617, + "step": 15552 + }, + { + "epoch": 31.106, + "grad_norm": 1.4763702154159546, + "learning_rate": 2e-05, + "loss": 0.05329809, + "step": 15553 + }, + { + "epoch": 31.108, + "grad_norm": 1.3631935119628906, + "learning_rate": 2e-05, + "loss": 0.03446741, + "step": 15554 + }, + { + "epoch": 31.11, + "grad_norm": 1.3570765256881714, + "learning_rate": 2e-05, + "loss": 0.04257222, + "step": 15555 + }, + { + "epoch": 31.112, + "grad_norm": 1.1822189092636108, + "learning_rate": 2e-05, + "loss": 0.04207947, + "step": 15556 + }, + { + "epoch": 31.114, + "grad_norm": 1.150335431098938, + "learning_rate": 2e-05, + "loss": 0.04551108, + "step": 15557 + }, + { + "epoch": 31.116, + "grad_norm": 1.3339632749557495, + "learning_rate": 2e-05, + "loss": 0.05508456, + "step": 15558 + }, + { + "epoch": 31.118, + "grad_norm": 2.0711183547973633, + "learning_rate": 2e-05, + "loss": 0.0530576, + "step": 15559 + }, + { + "epoch": 31.12, + "grad_norm": 1.3315399885177612, + "learning_rate": 2e-05, + "loss": 0.04907047, + "step": 15560 + }, + { + "epoch": 31.122, + "grad_norm": 1.204527497291565, + "learning_rate": 2e-05, + "loss": 0.04079658, + "step": 15561 + }, + { + "epoch": 31.124, + "grad_norm": 2.65455961227417, + "learning_rate": 2e-05, + "loss": 0.05661328, + "step": 15562 + }, + { + "epoch": 31.126, + 
"grad_norm": 1.3059120178222656, + "learning_rate": 2e-05, + "loss": 0.03965393, + "step": 15563 + }, + { + "epoch": 31.128, + "grad_norm": 1.079121708869934, + "learning_rate": 2e-05, + "loss": 0.04260961, + "step": 15564 + }, + { + "epoch": 31.13, + "grad_norm": 2.0246469974517822, + "learning_rate": 2e-05, + "loss": 0.03333722, + "step": 15565 + }, + { + "epoch": 31.132, + "grad_norm": 1.0588722229003906, + "learning_rate": 2e-05, + "loss": 0.04730598, + "step": 15566 + }, + { + "epoch": 31.134, + "grad_norm": 1.0964921712875366, + "learning_rate": 2e-05, + "loss": 0.0434188, + "step": 15567 + }, + { + "epoch": 31.136, + "grad_norm": 1.1378031969070435, + "learning_rate": 2e-05, + "loss": 0.04260813, + "step": 15568 + }, + { + "epoch": 31.138, + "grad_norm": 3.1608223915100098, + "learning_rate": 2e-05, + "loss": 0.0434106, + "step": 15569 + }, + { + "epoch": 31.14, + "grad_norm": 1.2680219411849976, + "learning_rate": 2e-05, + "loss": 0.04677624, + "step": 15570 + }, + { + "epoch": 31.142, + "grad_norm": 1.1608775854110718, + "learning_rate": 2e-05, + "loss": 0.04532572, + "step": 15571 + }, + { + "epoch": 31.144, + "grad_norm": 1.3475708961486816, + "learning_rate": 2e-05, + "loss": 0.05460385, + "step": 15572 + }, + { + "epoch": 31.146, + "grad_norm": 1.1137781143188477, + "learning_rate": 2e-05, + "loss": 0.04142278, + "step": 15573 + }, + { + "epoch": 31.148, + "grad_norm": 1.082829475402832, + "learning_rate": 2e-05, + "loss": 0.04784383, + "step": 15574 + }, + { + "epoch": 31.15, + "grad_norm": 1.2136229276657104, + "learning_rate": 2e-05, + "loss": 0.03027528, + "step": 15575 + }, + { + "epoch": 31.152, + "grad_norm": 1.0264850854873657, + "learning_rate": 2e-05, + "loss": 0.04279248, + "step": 15576 + }, + { + "epoch": 31.154, + "grad_norm": 1.5535433292388916, + "learning_rate": 2e-05, + "loss": 0.05268865, + "step": 15577 + }, + { + "epoch": 31.156, + "grad_norm": 1.4060524702072144, + "learning_rate": 2e-05, + "loss": 0.0343322, + "step": 15578 + }, + { + "epoch": 31.158, + "grad_norm": 0.6968619227409363, + "learning_rate": 2e-05, + "loss": 0.01679928, + "step": 15579 + }, + { + "epoch": 31.16, + "grad_norm": 1.15040123462677, + "learning_rate": 2e-05, + "loss": 0.04167127, + "step": 15580 + }, + { + "epoch": 31.162, + "grad_norm": 1.524133563041687, + "learning_rate": 2e-05, + "loss": 0.03878558, + "step": 15581 + }, + { + "epoch": 31.164, + "grad_norm": 2.028996706008911, + "learning_rate": 2e-05, + "loss": 0.05743674, + "step": 15582 + }, + { + "epoch": 31.166, + "grad_norm": 1.5730241537094116, + "learning_rate": 2e-05, + "loss": 0.04803181, + "step": 15583 + }, + { + "epoch": 31.168, + "grad_norm": 1.2136539220809937, + "learning_rate": 2e-05, + "loss": 0.03908694, + "step": 15584 + }, + { + "epoch": 31.17, + "grad_norm": 2.451467514038086, + "learning_rate": 2e-05, + "loss": 0.05021393, + "step": 15585 + }, + { + "epoch": 31.172, + "grad_norm": 0.967592179775238, + "learning_rate": 2e-05, + "loss": 0.02962617, + "step": 15586 + }, + { + "epoch": 31.174, + "grad_norm": 1.205581784248352, + "learning_rate": 2e-05, + "loss": 0.04359256, + "step": 15587 + }, + { + "epoch": 31.176, + "grad_norm": 1.8534575700759888, + "learning_rate": 2e-05, + "loss": 0.05359968, + "step": 15588 + }, + { + "epoch": 31.178, + "grad_norm": 1.4265109300613403, + "learning_rate": 2e-05, + "loss": 0.0480662, + "step": 15589 + }, + { + "epoch": 31.18, + "grad_norm": 2.661834716796875, + "learning_rate": 2e-05, + "loss": 0.05001631, + "step": 15590 + }, + { + "epoch": 31.182, + "grad_norm": 
2.1135103702545166, + "learning_rate": 2e-05, + "loss": 0.03104893, + "step": 15591 + }, + { + "epoch": 31.184, + "grad_norm": 1.0201138257980347, + "learning_rate": 2e-05, + "loss": 0.03780366, + "step": 15592 + }, + { + "epoch": 31.186, + "grad_norm": 1.1135469675064087, + "learning_rate": 2e-05, + "loss": 0.03954279, + "step": 15593 + }, + { + "epoch": 31.188, + "grad_norm": 1.0150232315063477, + "learning_rate": 2e-05, + "loss": 0.0309415, + "step": 15594 + }, + { + "epoch": 31.19, + "grad_norm": 1.0710225105285645, + "learning_rate": 2e-05, + "loss": 0.03865776, + "step": 15595 + }, + { + "epoch": 31.192, + "grad_norm": 1.226999044418335, + "learning_rate": 2e-05, + "loss": 0.04560003, + "step": 15596 + }, + { + "epoch": 31.194, + "grad_norm": 1.3833179473876953, + "learning_rate": 2e-05, + "loss": 0.05527067, + "step": 15597 + }, + { + "epoch": 31.196, + "grad_norm": 1.1716560125350952, + "learning_rate": 2e-05, + "loss": 0.03517221, + "step": 15598 + }, + { + "epoch": 31.198, + "grad_norm": 1.7883951663970947, + "learning_rate": 2e-05, + "loss": 0.04532555, + "step": 15599 + }, + { + "epoch": 31.2, + "grad_norm": 1.13947594165802, + "learning_rate": 2e-05, + "loss": 0.03804951, + "step": 15600 + }, + { + "epoch": 31.202, + "grad_norm": 1.1460981369018555, + "learning_rate": 2e-05, + "loss": 0.04975809, + "step": 15601 + }, + { + "epoch": 31.204, + "grad_norm": 2.1435766220092773, + "learning_rate": 2e-05, + "loss": 0.04334519, + "step": 15602 + }, + { + "epoch": 31.206, + "grad_norm": 2.5278499126434326, + "learning_rate": 2e-05, + "loss": 0.04724212, + "step": 15603 + }, + { + "epoch": 31.208, + "grad_norm": 3.0959975719451904, + "learning_rate": 2e-05, + "loss": 0.04795504, + "step": 15604 + }, + { + "epoch": 31.21, + "grad_norm": 1.3681577444076538, + "learning_rate": 2e-05, + "loss": 0.04222552, + "step": 15605 + }, + { + "epoch": 31.212, + "grad_norm": 2.0547592639923096, + "learning_rate": 2e-05, + "loss": 0.04761481, + "step": 15606 + }, + { + "epoch": 31.214, + "grad_norm": 1.327380657196045, + "learning_rate": 2e-05, + "loss": 0.05030813, + "step": 15607 + }, + { + "epoch": 31.216, + "grad_norm": 0.9326903223991394, + "learning_rate": 2e-05, + "loss": 0.02635142, + "step": 15608 + }, + { + "epoch": 31.218, + "grad_norm": 2.4155571460723877, + "learning_rate": 2e-05, + "loss": 0.04080499, + "step": 15609 + }, + { + "epoch": 31.22, + "grad_norm": 0.9260361194610596, + "learning_rate": 2e-05, + "loss": 0.03045768, + "step": 15610 + }, + { + "epoch": 31.222, + "grad_norm": 1.0131243467330933, + "learning_rate": 2e-05, + "loss": 0.03150607, + "step": 15611 + }, + { + "epoch": 31.224, + "grad_norm": 1.1739399433135986, + "learning_rate": 2e-05, + "loss": 0.03871925, + "step": 15612 + }, + { + "epoch": 31.226, + "grad_norm": 1.072050929069519, + "learning_rate": 2e-05, + "loss": 0.04814139, + "step": 15613 + }, + { + "epoch": 31.228, + "grad_norm": 2.805246591567993, + "learning_rate": 2e-05, + "loss": 0.0489997, + "step": 15614 + }, + { + "epoch": 31.23, + "grad_norm": 2.114262580871582, + "learning_rate": 2e-05, + "loss": 0.06214041, + "step": 15615 + }, + { + "epoch": 31.232, + "grad_norm": 1.275655746459961, + "learning_rate": 2e-05, + "loss": 0.03479639, + "step": 15616 + }, + { + "epoch": 31.234, + "grad_norm": 1.3052101135253906, + "learning_rate": 2e-05, + "loss": 0.04745651, + "step": 15617 + }, + { + "epoch": 31.236, + "grad_norm": 2.008206605911255, + "learning_rate": 2e-05, + "loss": 0.02935056, + "step": 15618 + }, + { + "epoch": 31.238, + "grad_norm": 
1.4074678421020508, + "learning_rate": 2e-05, + "loss": 0.05427198, + "step": 15619 + }, + { + "epoch": 31.24, + "grad_norm": 1.0969489812850952, + "learning_rate": 2e-05, + "loss": 0.04732572, + "step": 15620 + }, + { + "epoch": 31.242, + "grad_norm": 1.1318331956863403, + "learning_rate": 2e-05, + "loss": 0.04238062, + "step": 15621 + }, + { + "epoch": 31.244, + "grad_norm": 1.1431554555892944, + "learning_rate": 2e-05, + "loss": 0.05024539, + "step": 15622 + }, + { + "epoch": 31.246, + "grad_norm": 1.2575552463531494, + "learning_rate": 2e-05, + "loss": 0.03069566, + "step": 15623 + }, + { + "epoch": 31.248, + "grad_norm": 1.2473465204238892, + "learning_rate": 2e-05, + "loss": 0.04475783, + "step": 15624 + }, + { + "epoch": 31.25, + "grad_norm": 1.2875679731369019, + "learning_rate": 2e-05, + "loss": 0.03152344, + "step": 15625 + }, + { + "epoch": 31.252, + "grad_norm": 1.624121069908142, + "learning_rate": 2e-05, + "loss": 0.06142828, + "step": 15626 + }, + { + "epoch": 31.254, + "grad_norm": 1.1332637071609497, + "learning_rate": 2e-05, + "loss": 0.03317603, + "step": 15627 + }, + { + "epoch": 31.256, + "grad_norm": 1.1676865816116333, + "learning_rate": 2e-05, + "loss": 0.04019779, + "step": 15628 + }, + { + "epoch": 31.258, + "grad_norm": 2.188838005065918, + "learning_rate": 2e-05, + "loss": 0.03890157, + "step": 15629 + }, + { + "epoch": 31.26, + "grad_norm": 1.019037127494812, + "learning_rate": 2e-05, + "loss": 0.03228922, + "step": 15630 + }, + { + "epoch": 31.262, + "grad_norm": 1.123388409614563, + "learning_rate": 2e-05, + "loss": 0.03372766, + "step": 15631 + }, + { + "epoch": 31.264, + "grad_norm": 3.509737491607666, + "learning_rate": 2e-05, + "loss": 0.04830921, + "step": 15632 + }, + { + "epoch": 31.266, + "grad_norm": 1.3756506443023682, + "learning_rate": 2e-05, + "loss": 0.0438694, + "step": 15633 + }, + { + "epoch": 31.268, + "grad_norm": 1.2744717597961426, + "learning_rate": 2e-05, + "loss": 0.03653997, + "step": 15634 + }, + { + "epoch": 31.27, + "grad_norm": 1.5209195613861084, + "learning_rate": 2e-05, + "loss": 0.04338511, + "step": 15635 + }, + { + "epoch": 31.272, + "grad_norm": 2.2282023429870605, + "learning_rate": 2e-05, + "loss": 0.04440949, + "step": 15636 + }, + { + "epoch": 31.274, + "grad_norm": 1.1879875659942627, + "learning_rate": 2e-05, + "loss": 0.02991764, + "step": 15637 + }, + { + "epoch": 31.276, + "grad_norm": 0.8613758683204651, + "learning_rate": 2e-05, + "loss": 0.02830381, + "step": 15638 + }, + { + "epoch": 31.278, + "grad_norm": 1.0136206150054932, + "learning_rate": 2e-05, + "loss": 0.03036819, + "step": 15639 + }, + { + "epoch": 31.28, + "grad_norm": 1.2615082263946533, + "learning_rate": 2e-05, + "loss": 0.04580786, + "step": 15640 + }, + { + "epoch": 31.282, + "grad_norm": 1.1112679243087769, + "learning_rate": 2e-05, + "loss": 0.04442069, + "step": 15641 + }, + { + "epoch": 31.284, + "grad_norm": 1.0508791208267212, + "learning_rate": 2e-05, + "loss": 0.04491913, + "step": 15642 + }, + { + "epoch": 31.286, + "grad_norm": 1.1440688371658325, + "learning_rate": 2e-05, + "loss": 0.04976808, + "step": 15643 + }, + { + "epoch": 31.288, + "grad_norm": 1.3392294645309448, + "learning_rate": 2e-05, + "loss": 0.04487741, + "step": 15644 + }, + { + "epoch": 31.29, + "grad_norm": 1.3118019104003906, + "learning_rate": 2e-05, + "loss": 0.04272094, + "step": 15645 + }, + { + "epoch": 31.292, + "grad_norm": 1.3110008239746094, + "learning_rate": 2e-05, + "loss": 0.04961982, + "step": 15646 + }, + { + "epoch": 31.294, + "grad_norm": 
1.3569951057434082, + "learning_rate": 2e-05, + "loss": 0.05010619, + "step": 15647 + }, + { + "epoch": 31.296, + "grad_norm": 1.1815541982650757, + "learning_rate": 2e-05, + "loss": 0.03432131, + "step": 15648 + }, + { + "epoch": 31.298, + "grad_norm": 1.3439204692840576, + "learning_rate": 2e-05, + "loss": 0.06246966, + "step": 15649 + }, + { + "epoch": 31.3, + "grad_norm": 1.0219476222991943, + "learning_rate": 2e-05, + "loss": 0.02712579, + "step": 15650 + }, + { + "epoch": 31.302, + "grad_norm": 0.9462128281593323, + "learning_rate": 2e-05, + "loss": 0.03200362, + "step": 15651 + }, + { + "epoch": 31.304, + "grad_norm": 1.5896122455596924, + "learning_rate": 2e-05, + "loss": 0.05370201, + "step": 15652 + }, + { + "epoch": 31.306, + "grad_norm": 2.8889029026031494, + "learning_rate": 2e-05, + "loss": 0.03914107, + "step": 15653 + }, + { + "epoch": 31.308, + "grad_norm": 1.5959099531173706, + "learning_rate": 2e-05, + "loss": 0.05540708, + "step": 15654 + }, + { + "epoch": 31.31, + "grad_norm": 1.9818872213363647, + "learning_rate": 2e-05, + "loss": 0.05058894, + "step": 15655 + }, + { + "epoch": 31.312, + "grad_norm": 1.4036006927490234, + "learning_rate": 2e-05, + "loss": 0.05569137, + "step": 15656 + }, + { + "epoch": 31.314, + "grad_norm": 1.2897917032241821, + "learning_rate": 2e-05, + "loss": 0.04362039, + "step": 15657 + }, + { + "epoch": 31.316, + "grad_norm": 1.1039726734161377, + "learning_rate": 2e-05, + "loss": 0.03560334, + "step": 15658 + }, + { + "epoch": 31.318, + "grad_norm": 1.2764880657196045, + "learning_rate": 2e-05, + "loss": 0.04904519, + "step": 15659 + }, + { + "epoch": 31.32, + "grad_norm": 2.2581369876861572, + "learning_rate": 2e-05, + "loss": 0.06880252, + "step": 15660 + }, + { + "epoch": 31.322, + "grad_norm": 1.424851655960083, + "learning_rate": 2e-05, + "loss": 0.03886406, + "step": 15661 + }, + { + "epoch": 31.324, + "grad_norm": 0.9566469192504883, + "learning_rate": 2e-05, + "loss": 0.03768406, + "step": 15662 + }, + { + "epoch": 31.326, + "grad_norm": 1.1624170541763306, + "learning_rate": 2e-05, + "loss": 0.04413099, + "step": 15663 + }, + { + "epoch": 31.328, + "grad_norm": 1.2362680435180664, + "learning_rate": 2e-05, + "loss": 0.03142476, + "step": 15664 + }, + { + "epoch": 31.33, + "grad_norm": 1.912945032119751, + "learning_rate": 2e-05, + "loss": 0.05409247, + "step": 15665 + }, + { + "epoch": 31.332, + "grad_norm": 2.0025062561035156, + "learning_rate": 2e-05, + "loss": 0.05454734, + "step": 15666 + }, + { + "epoch": 31.334, + "grad_norm": 1.087705373764038, + "learning_rate": 2e-05, + "loss": 0.04066493, + "step": 15667 + }, + { + "epoch": 31.336, + "grad_norm": 1.9127389192581177, + "learning_rate": 2e-05, + "loss": 0.05669509, + "step": 15668 + }, + { + "epoch": 31.338, + "grad_norm": 1.421940803527832, + "learning_rate": 2e-05, + "loss": 0.05013736, + "step": 15669 + }, + { + "epoch": 31.34, + "grad_norm": 2.1149110794067383, + "learning_rate": 2e-05, + "loss": 0.05951951, + "step": 15670 + }, + { + "epoch": 31.342, + "grad_norm": 2.23716402053833, + "learning_rate": 2e-05, + "loss": 0.04989032, + "step": 15671 + }, + { + "epoch": 31.344, + "grad_norm": 1.1148024797439575, + "learning_rate": 2e-05, + "loss": 0.03459584, + "step": 15672 + }, + { + "epoch": 31.346, + "grad_norm": 1.1864360570907593, + "learning_rate": 2e-05, + "loss": 0.04163406, + "step": 15673 + }, + { + "epoch": 31.348, + "grad_norm": 1.1121900081634521, + "learning_rate": 2e-05, + "loss": 0.04536529, + "step": 15674 + }, + { + "epoch": 31.35, + "grad_norm": 
1.37013840675354, + "learning_rate": 2e-05, + "loss": 0.03631928, + "step": 15675 + }, + { + "epoch": 31.352, + "grad_norm": 1.0761029720306396, + "learning_rate": 2e-05, + "loss": 0.04334534, + "step": 15676 + }, + { + "epoch": 31.354, + "grad_norm": 1.2131975889205933, + "learning_rate": 2e-05, + "loss": 0.03891152, + "step": 15677 + }, + { + "epoch": 31.356, + "grad_norm": 1.1239383220672607, + "learning_rate": 2e-05, + "loss": 0.03443652, + "step": 15678 + }, + { + "epoch": 31.358, + "grad_norm": 1.4421390295028687, + "learning_rate": 2e-05, + "loss": 0.03504834, + "step": 15679 + }, + { + "epoch": 31.36, + "grad_norm": 1.6474403142929077, + "learning_rate": 2e-05, + "loss": 0.06349871, + "step": 15680 + }, + { + "epoch": 31.362, + "grad_norm": 1.2030842304229736, + "learning_rate": 2e-05, + "loss": 0.03427202, + "step": 15681 + }, + { + "epoch": 31.364, + "grad_norm": 1.1239585876464844, + "learning_rate": 2e-05, + "loss": 0.03592511, + "step": 15682 + }, + { + "epoch": 31.366, + "grad_norm": 1.0693938732147217, + "learning_rate": 2e-05, + "loss": 0.03550804, + "step": 15683 + }, + { + "epoch": 31.368, + "grad_norm": 1.2990278005599976, + "learning_rate": 2e-05, + "loss": 0.04546881, + "step": 15684 + }, + { + "epoch": 31.37, + "grad_norm": 1.4115841388702393, + "learning_rate": 2e-05, + "loss": 0.04322497, + "step": 15685 + }, + { + "epoch": 31.372, + "grad_norm": 1.1592683792114258, + "learning_rate": 2e-05, + "loss": 0.04757714, + "step": 15686 + }, + { + "epoch": 31.374, + "grad_norm": 1.0848994255065918, + "learning_rate": 2e-05, + "loss": 0.04322045, + "step": 15687 + }, + { + "epoch": 31.376, + "grad_norm": 1.319326639175415, + "learning_rate": 2e-05, + "loss": 0.04263577, + "step": 15688 + }, + { + "epoch": 31.378, + "grad_norm": 1.180602788925171, + "learning_rate": 2e-05, + "loss": 0.04362357, + "step": 15689 + }, + { + "epoch": 31.38, + "grad_norm": 1.1107655763626099, + "learning_rate": 2e-05, + "loss": 0.05218142, + "step": 15690 + }, + { + "epoch": 31.382, + "grad_norm": 1.0870996713638306, + "learning_rate": 2e-05, + "loss": 0.03446791, + "step": 15691 + }, + { + "epoch": 31.384, + "grad_norm": 1.0287660360336304, + "learning_rate": 2e-05, + "loss": 0.03068786, + "step": 15692 + }, + { + "epoch": 31.386, + "grad_norm": 1.2678953409194946, + "learning_rate": 2e-05, + "loss": 0.02981563, + "step": 15693 + }, + { + "epoch": 31.388, + "grad_norm": 1.2370007038116455, + "learning_rate": 2e-05, + "loss": 0.04931886, + "step": 15694 + }, + { + "epoch": 31.39, + "grad_norm": 1.0795314311981201, + "learning_rate": 2e-05, + "loss": 0.03978341, + "step": 15695 + }, + { + "epoch": 31.392, + "grad_norm": 1.0488916635513306, + "learning_rate": 2e-05, + "loss": 0.04341751, + "step": 15696 + }, + { + "epoch": 31.394, + "grad_norm": 1.9883416891098022, + "learning_rate": 2e-05, + "loss": 0.04387866, + "step": 15697 + }, + { + "epoch": 31.396, + "grad_norm": 1.0542610883712769, + "learning_rate": 2e-05, + "loss": 0.04586036, + "step": 15698 + }, + { + "epoch": 31.398, + "grad_norm": 1.5984288454055786, + "learning_rate": 2e-05, + "loss": 0.04890535, + "step": 15699 + }, + { + "epoch": 31.4, + "grad_norm": 1.2893099784851074, + "learning_rate": 2e-05, + "loss": 0.0567109, + "step": 15700 + }, + { + "epoch": 31.402, + "grad_norm": 1.1580631732940674, + "learning_rate": 2e-05, + "loss": 0.04688855, + "step": 15701 + }, + { + "epoch": 31.404, + "grad_norm": 1.5734524726867676, + "learning_rate": 2e-05, + "loss": 0.06079507, + "step": 15702 + }, + { + "epoch": 31.406, + "grad_norm": 
1.2681607007980347, + "learning_rate": 2e-05, + "loss": 0.0585427, + "step": 15703 + }, + { + "epoch": 31.408, + "grad_norm": 1.4479280710220337, + "learning_rate": 2e-05, + "loss": 0.0500034, + "step": 15704 + }, + { + "epoch": 31.41, + "grad_norm": 1.5085829496383667, + "learning_rate": 2e-05, + "loss": 0.04053114, + "step": 15705 + }, + { + "epoch": 31.412, + "grad_norm": 1.2420114278793335, + "learning_rate": 2e-05, + "loss": 0.0312696, + "step": 15706 + }, + { + "epoch": 31.414, + "grad_norm": 1.4845657348632812, + "learning_rate": 2e-05, + "loss": 0.04520275, + "step": 15707 + }, + { + "epoch": 31.416, + "grad_norm": 1.0234241485595703, + "learning_rate": 2e-05, + "loss": 0.03995465, + "step": 15708 + }, + { + "epoch": 31.418, + "grad_norm": 0.9769031405448914, + "learning_rate": 2e-05, + "loss": 0.03126371, + "step": 15709 + }, + { + "epoch": 31.42, + "grad_norm": 1.2428449392318726, + "learning_rate": 2e-05, + "loss": 0.04003659, + "step": 15710 + }, + { + "epoch": 31.422, + "grad_norm": 1.1226387023925781, + "learning_rate": 2e-05, + "loss": 0.04901841, + "step": 15711 + }, + { + "epoch": 31.424, + "grad_norm": 0.9852843284606934, + "learning_rate": 2e-05, + "loss": 0.03026243, + "step": 15712 + }, + { + "epoch": 31.426, + "grad_norm": 1.0646196603775024, + "learning_rate": 2e-05, + "loss": 0.03818695, + "step": 15713 + }, + { + "epoch": 31.428, + "grad_norm": 1.188592553138733, + "learning_rate": 2e-05, + "loss": 0.03416793, + "step": 15714 + }, + { + "epoch": 31.43, + "grad_norm": 1.1710172891616821, + "learning_rate": 2e-05, + "loss": 0.03767353, + "step": 15715 + }, + { + "epoch": 31.432, + "grad_norm": 1.2032878398895264, + "learning_rate": 2e-05, + "loss": 0.04838914, + "step": 15716 + }, + { + "epoch": 31.434, + "grad_norm": 1.038709282875061, + "learning_rate": 2e-05, + "loss": 0.03334371, + "step": 15717 + }, + { + "epoch": 31.436, + "grad_norm": 1.0788524150848389, + "learning_rate": 2e-05, + "loss": 0.03386791, + "step": 15718 + }, + { + "epoch": 31.438, + "grad_norm": 1.163702368736267, + "learning_rate": 2e-05, + "loss": 0.0470679, + "step": 15719 + }, + { + "epoch": 31.44, + "grad_norm": 1.1703226566314697, + "learning_rate": 2e-05, + "loss": 0.04177292, + "step": 15720 + }, + { + "epoch": 31.442, + "grad_norm": 2.1952016353607178, + "learning_rate": 2e-05, + "loss": 0.03891946, + "step": 15721 + }, + { + "epoch": 31.444, + "grad_norm": 1.4332395792007446, + "learning_rate": 2e-05, + "loss": 0.05187111, + "step": 15722 + }, + { + "epoch": 31.446, + "grad_norm": 1.305293083190918, + "learning_rate": 2e-05, + "loss": 0.04548291, + "step": 15723 + }, + { + "epoch": 31.448, + "grad_norm": 1.0815423727035522, + "learning_rate": 2e-05, + "loss": 0.04857568, + "step": 15724 + }, + { + "epoch": 31.45, + "grad_norm": 1.2314715385437012, + "learning_rate": 2e-05, + "loss": 0.03415262, + "step": 15725 + }, + { + "epoch": 31.452, + "grad_norm": 1.8589730262756348, + "learning_rate": 2e-05, + "loss": 0.04466228, + "step": 15726 + }, + { + "epoch": 31.454, + "grad_norm": 1.4462237358093262, + "learning_rate": 2e-05, + "loss": 0.05154398, + "step": 15727 + }, + { + "epoch": 31.456, + "grad_norm": 1.0429617166519165, + "learning_rate": 2e-05, + "loss": 0.03432951, + "step": 15728 + }, + { + "epoch": 31.458, + "grad_norm": 1.2360962629318237, + "learning_rate": 2e-05, + "loss": 0.04059312, + "step": 15729 + }, + { + "epoch": 31.46, + "grad_norm": 1.1760252714157104, + "learning_rate": 2e-05, + "loss": 0.05076623, + "step": 15730 + }, + { + "epoch": 31.462, + "grad_norm": 
1.2189456224441528, + "learning_rate": 2e-05, + "loss": 0.05206157, + "step": 15731 + }, + { + "epoch": 31.464, + "grad_norm": 1.028828740119934, + "learning_rate": 2e-05, + "loss": 0.02622063, + "step": 15732 + }, + { + "epoch": 31.466, + "grad_norm": 1.3809655904769897, + "learning_rate": 2e-05, + "loss": 0.04626153, + "step": 15733 + }, + { + "epoch": 31.468, + "grad_norm": 1.0601438283920288, + "learning_rate": 2e-05, + "loss": 0.03184626, + "step": 15734 + }, + { + "epoch": 31.47, + "grad_norm": 1.7758816480636597, + "learning_rate": 2e-05, + "loss": 0.04071598, + "step": 15735 + }, + { + "epoch": 31.472, + "grad_norm": 1.2127716541290283, + "learning_rate": 2e-05, + "loss": 0.0404386, + "step": 15736 + }, + { + "epoch": 31.474, + "grad_norm": 1.1199473142623901, + "learning_rate": 2e-05, + "loss": 0.03128079, + "step": 15737 + }, + { + "epoch": 31.476, + "grad_norm": 0.9239538908004761, + "learning_rate": 2e-05, + "loss": 0.0214905, + "step": 15738 + }, + { + "epoch": 31.478, + "grad_norm": 1.0289355516433716, + "learning_rate": 2e-05, + "loss": 0.03084357, + "step": 15739 + }, + { + "epoch": 31.48, + "grad_norm": 1.7557429075241089, + "learning_rate": 2e-05, + "loss": 0.04279177, + "step": 15740 + }, + { + "epoch": 31.482, + "grad_norm": 2.431920289993286, + "learning_rate": 2e-05, + "loss": 0.04216561, + "step": 15741 + }, + { + "epoch": 31.484, + "grad_norm": 1.3500877618789673, + "learning_rate": 2e-05, + "loss": 0.04621994, + "step": 15742 + }, + { + "epoch": 31.486, + "grad_norm": 1.0111196041107178, + "learning_rate": 2e-05, + "loss": 0.02924443, + "step": 15743 + }, + { + "epoch": 31.488, + "grad_norm": 1.5376261472702026, + "learning_rate": 2e-05, + "loss": 0.05262484, + "step": 15744 + }, + { + "epoch": 31.49, + "grad_norm": 1.3182742595672607, + "learning_rate": 2e-05, + "loss": 0.0479708, + "step": 15745 + }, + { + "epoch": 31.492, + "grad_norm": 1.2966049909591675, + "learning_rate": 2e-05, + "loss": 0.03965274, + "step": 15746 + }, + { + "epoch": 31.494, + "grad_norm": 1.1283825635910034, + "learning_rate": 2e-05, + "loss": 0.0380425, + "step": 15747 + }, + { + "epoch": 31.496, + "grad_norm": 1.2638022899627686, + "learning_rate": 2e-05, + "loss": 0.0417271, + "step": 15748 + }, + { + "epoch": 31.498, + "grad_norm": 1.2083308696746826, + "learning_rate": 2e-05, + "loss": 0.04929405, + "step": 15749 + }, + { + "epoch": 31.5, + "grad_norm": 1.5783114433288574, + "learning_rate": 2e-05, + "loss": 0.04968805, + "step": 15750 + }, + { + "epoch": 31.502, + "grad_norm": 2.3429791927337646, + "learning_rate": 2e-05, + "loss": 0.0579337, + "step": 15751 + }, + { + "epoch": 31.504, + "grad_norm": 1.2848727703094482, + "learning_rate": 2e-05, + "loss": 0.03835206, + "step": 15752 + }, + { + "epoch": 31.506, + "grad_norm": 1.3516781330108643, + "learning_rate": 2e-05, + "loss": 0.04133689, + "step": 15753 + }, + { + "epoch": 31.508, + "grad_norm": 1.0922443866729736, + "learning_rate": 2e-05, + "loss": 0.03277455, + "step": 15754 + }, + { + "epoch": 31.51, + "grad_norm": 1.2325778007507324, + "learning_rate": 2e-05, + "loss": 0.04386805, + "step": 15755 + }, + { + "epoch": 31.512, + "grad_norm": 1.190375566482544, + "learning_rate": 2e-05, + "loss": 0.04679071, + "step": 15756 + }, + { + "epoch": 31.514, + "grad_norm": 1.1857788562774658, + "learning_rate": 2e-05, + "loss": 0.04052466, + "step": 15757 + }, + { + "epoch": 31.516, + "grad_norm": 1.4863660335540771, + "learning_rate": 2e-05, + "loss": 0.04734954, + "step": 15758 + }, + { + "epoch": 31.518, + "grad_norm": 
0.9761635661125183, + "learning_rate": 2e-05, + "loss": 0.04507826, + "step": 15759 + }, + { + "epoch": 31.52, + "grad_norm": 3.303678274154663, + "learning_rate": 2e-05, + "loss": 0.04152103, + "step": 15760 + }, + { + "epoch": 31.522, + "grad_norm": 1.3020141124725342, + "learning_rate": 2e-05, + "loss": 0.03344776, + "step": 15761 + }, + { + "epoch": 31.524, + "grad_norm": 1.2744166851043701, + "learning_rate": 2e-05, + "loss": 0.03905339, + "step": 15762 + }, + { + "epoch": 31.526, + "grad_norm": 1.3360401391983032, + "learning_rate": 2e-05, + "loss": 0.05134521, + "step": 15763 + }, + { + "epoch": 31.528, + "grad_norm": 1.5633838176727295, + "learning_rate": 2e-05, + "loss": 0.04629926, + "step": 15764 + }, + { + "epoch": 31.53, + "grad_norm": 1.144903540611267, + "learning_rate": 2e-05, + "loss": 0.04337474, + "step": 15765 + }, + { + "epoch": 31.532, + "grad_norm": 1.549691081047058, + "learning_rate": 2e-05, + "loss": 0.02903229, + "step": 15766 + }, + { + "epoch": 31.534, + "grad_norm": 1.6548807621002197, + "learning_rate": 2e-05, + "loss": 0.03670297, + "step": 15767 + }, + { + "epoch": 31.536, + "grad_norm": 1.8595404624938965, + "learning_rate": 2e-05, + "loss": 0.04848412, + "step": 15768 + }, + { + "epoch": 31.538, + "grad_norm": 0.9308550953865051, + "learning_rate": 2e-05, + "loss": 0.01970943, + "step": 15769 + }, + { + "epoch": 31.54, + "grad_norm": 1.6548179388046265, + "learning_rate": 2e-05, + "loss": 0.05121253, + "step": 15770 + }, + { + "epoch": 31.542, + "grad_norm": 1.1826523542404175, + "learning_rate": 2e-05, + "loss": 0.03165106, + "step": 15771 + }, + { + "epoch": 31.544, + "grad_norm": 2.287088394165039, + "learning_rate": 2e-05, + "loss": 0.05559523, + "step": 15772 + }, + { + "epoch": 31.546, + "grad_norm": 1.249127984046936, + "learning_rate": 2e-05, + "loss": 0.05151629, + "step": 15773 + }, + { + "epoch": 31.548000000000002, + "grad_norm": 2.0954861640930176, + "learning_rate": 2e-05, + "loss": 0.05453818, + "step": 15774 + }, + { + "epoch": 31.55, + "grad_norm": 1.1801806688308716, + "learning_rate": 2e-05, + "loss": 0.04041165, + "step": 15775 + }, + { + "epoch": 31.552, + "grad_norm": 1.1119970083236694, + "learning_rate": 2e-05, + "loss": 0.0460862, + "step": 15776 + }, + { + "epoch": 31.554, + "grad_norm": 2.1345860958099365, + "learning_rate": 2e-05, + "loss": 0.04225209, + "step": 15777 + }, + { + "epoch": 31.556, + "grad_norm": 1.1420177221298218, + "learning_rate": 2e-05, + "loss": 0.03541413, + "step": 15778 + }, + { + "epoch": 31.558, + "grad_norm": 1.9255207777023315, + "learning_rate": 2e-05, + "loss": 0.05924919, + "step": 15779 + }, + { + "epoch": 31.56, + "grad_norm": 3.971301317214966, + "learning_rate": 2e-05, + "loss": 0.05418454, + "step": 15780 + }, + { + "epoch": 31.562, + "grad_norm": 1.384949803352356, + "learning_rate": 2e-05, + "loss": 0.03826708, + "step": 15781 + }, + { + "epoch": 31.564, + "grad_norm": 1.1261556148529053, + "learning_rate": 2e-05, + "loss": 0.04444794, + "step": 15782 + }, + { + "epoch": 31.566, + "grad_norm": 1.696993112564087, + "learning_rate": 2e-05, + "loss": 0.03568056, + "step": 15783 + }, + { + "epoch": 31.568, + "grad_norm": 1.7849308252334595, + "learning_rate": 2e-05, + "loss": 0.03161497, + "step": 15784 + }, + { + "epoch": 31.57, + "grad_norm": 2.218967914581299, + "learning_rate": 2e-05, + "loss": 0.04789735, + "step": 15785 + }, + { + "epoch": 31.572, + "grad_norm": 1.280608892440796, + "learning_rate": 2e-05, + "loss": 0.04609762, + "step": 15786 + }, + { + "epoch": 31.574, + "grad_norm": 
1.1164113283157349, + "learning_rate": 2e-05, + "loss": 0.03556387, + "step": 15787 + }, + { + "epoch": 31.576, + "grad_norm": 1.5488619804382324, + "learning_rate": 2e-05, + "loss": 0.06319657, + "step": 15788 + }, + { + "epoch": 31.578, + "grad_norm": 1.0593352317810059, + "learning_rate": 2e-05, + "loss": 0.0368279, + "step": 15789 + }, + { + "epoch": 31.58, + "grad_norm": 1.139189600944519, + "learning_rate": 2e-05, + "loss": 0.03742014, + "step": 15790 + }, + { + "epoch": 31.582, + "grad_norm": 1.1916528940200806, + "learning_rate": 2e-05, + "loss": 0.05514635, + "step": 15791 + }, + { + "epoch": 31.584, + "grad_norm": 2.922518491744995, + "learning_rate": 2e-05, + "loss": 0.06346483, + "step": 15792 + }, + { + "epoch": 31.586, + "grad_norm": 1.1857612133026123, + "learning_rate": 2e-05, + "loss": 0.04470282, + "step": 15793 + }, + { + "epoch": 31.588, + "grad_norm": 1.5709048509597778, + "learning_rate": 2e-05, + "loss": 0.05266313, + "step": 15794 + }, + { + "epoch": 31.59, + "grad_norm": 1.2784346342086792, + "learning_rate": 2e-05, + "loss": 0.02789311, + "step": 15795 + }, + { + "epoch": 31.592, + "grad_norm": 1.4792814254760742, + "learning_rate": 2e-05, + "loss": 0.06371169, + "step": 15796 + }, + { + "epoch": 31.594, + "grad_norm": 1.1451568603515625, + "learning_rate": 2e-05, + "loss": 0.03802731, + "step": 15797 + }, + { + "epoch": 31.596, + "grad_norm": 1.5499827861785889, + "learning_rate": 2e-05, + "loss": 0.05359296, + "step": 15798 + }, + { + "epoch": 31.598, + "grad_norm": 1.11916983127594, + "learning_rate": 2e-05, + "loss": 0.05549221, + "step": 15799 + }, + { + "epoch": 31.6, + "grad_norm": 1.0967146158218384, + "learning_rate": 2e-05, + "loss": 0.03953513, + "step": 15800 + }, + { + "epoch": 31.602, + "grad_norm": 1.4866015911102295, + "learning_rate": 2e-05, + "loss": 0.0373404, + "step": 15801 + }, + { + "epoch": 31.604, + "grad_norm": 1.3071479797363281, + "learning_rate": 2e-05, + "loss": 0.03140751, + "step": 15802 + }, + { + "epoch": 31.606, + "grad_norm": 1.8923872709274292, + "learning_rate": 2e-05, + "loss": 0.03232081, + "step": 15803 + }, + { + "epoch": 31.608, + "grad_norm": 1.0222185850143433, + "learning_rate": 2e-05, + "loss": 0.03265133, + "step": 15804 + }, + { + "epoch": 31.61, + "grad_norm": 1.0511828660964966, + "learning_rate": 2e-05, + "loss": 0.02982992, + "step": 15805 + }, + { + "epoch": 31.612, + "grad_norm": 1.0626296997070312, + "learning_rate": 2e-05, + "loss": 0.03582488, + "step": 15806 + }, + { + "epoch": 31.614, + "grad_norm": 1.239322304725647, + "learning_rate": 2e-05, + "loss": 0.04029377, + "step": 15807 + }, + { + "epoch": 31.616, + "grad_norm": 1.2532638311386108, + "learning_rate": 2e-05, + "loss": 0.04826434, + "step": 15808 + }, + { + "epoch": 31.618, + "grad_norm": 0.907904326915741, + "learning_rate": 2e-05, + "loss": 0.032974, + "step": 15809 + }, + { + "epoch": 31.62, + "grad_norm": 1.3583287000656128, + "learning_rate": 2e-05, + "loss": 0.05751316, + "step": 15810 + }, + { + "epoch": 31.622, + "grad_norm": 0.8747350573539734, + "learning_rate": 2e-05, + "loss": 0.02985252, + "step": 15811 + }, + { + "epoch": 31.624, + "grad_norm": 1.2014013528823853, + "learning_rate": 2e-05, + "loss": 0.0308294, + "step": 15812 + }, + { + "epoch": 31.626, + "grad_norm": 1.563045620918274, + "learning_rate": 2e-05, + "loss": 0.0466816, + "step": 15813 + }, + { + "epoch": 31.628, + "grad_norm": 1.5340209007263184, + "learning_rate": 2e-05, + "loss": 0.04322568, + "step": 15814 + }, + { + "epoch": 31.63, + "grad_norm": 
1.1069697141647339, + "learning_rate": 2e-05, + "loss": 0.02941058, + "step": 15815 + }, + { + "epoch": 31.632, + "grad_norm": 1.6199052333831787, + "learning_rate": 2e-05, + "loss": 0.06167778, + "step": 15816 + }, + { + "epoch": 31.634, + "grad_norm": 0.9924496412277222, + "learning_rate": 2e-05, + "loss": 0.02608846, + "step": 15817 + }, + { + "epoch": 31.636, + "grad_norm": 1.212367296218872, + "learning_rate": 2e-05, + "loss": 0.05580743, + "step": 15818 + }, + { + "epoch": 31.638, + "grad_norm": 1.232508897781372, + "learning_rate": 2e-05, + "loss": 0.04425735, + "step": 15819 + }, + { + "epoch": 31.64, + "grad_norm": 2.6619043350219727, + "learning_rate": 2e-05, + "loss": 0.06107927, + "step": 15820 + }, + { + "epoch": 31.642, + "grad_norm": 1.5662249326705933, + "learning_rate": 2e-05, + "loss": 0.04231949, + "step": 15821 + }, + { + "epoch": 31.644, + "grad_norm": 1.066655158996582, + "learning_rate": 2e-05, + "loss": 0.03843967, + "step": 15822 + }, + { + "epoch": 31.646, + "grad_norm": 1.9721606969833374, + "learning_rate": 2e-05, + "loss": 0.05530218, + "step": 15823 + }, + { + "epoch": 31.648, + "grad_norm": 1.093684196472168, + "learning_rate": 2e-05, + "loss": 0.04849446, + "step": 15824 + }, + { + "epoch": 31.65, + "grad_norm": 1.434188961982727, + "learning_rate": 2e-05, + "loss": 0.05032559, + "step": 15825 + }, + { + "epoch": 31.652, + "grad_norm": 1.0273107290267944, + "learning_rate": 2e-05, + "loss": 0.03886499, + "step": 15826 + }, + { + "epoch": 31.654, + "grad_norm": 1.0134077072143555, + "learning_rate": 2e-05, + "loss": 0.0312471, + "step": 15827 + }, + { + "epoch": 31.656, + "grad_norm": 1.69369637966156, + "learning_rate": 2e-05, + "loss": 0.05736078, + "step": 15828 + }, + { + "epoch": 31.658, + "grad_norm": 3.8166019916534424, + "learning_rate": 2e-05, + "loss": 0.05662319, + "step": 15829 + }, + { + "epoch": 31.66, + "grad_norm": 1.4313768148422241, + "learning_rate": 2e-05, + "loss": 0.04016527, + "step": 15830 + }, + { + "epoch": 31.662, + "grad_norm": 1.6341408491134644, + "learning_rate": 2e-05, + "loss": 0.05576465, + "step": 15831 + }, + { + "epoch": 31.664, + "grad_norm": 1.6464592218399048, + "learning_rate": 2e-05, + "loss": 0.05058682, + "step": 15832 + }, + { + "epoch": 31.666, + "grad_norm": 1.6304293870925903, + "learning_rate": 2e-05, + "loss": 0.04562239, + "step": 15833 + }, + { + "epoch": 31.668, + "grad_norm": 1.1134459972381592, + "learning_rate": 2e-05, + "loss": 0.0356195, + "step": 15834 + }, + { + "epoch": 31.67, + "grad_norm": 0.9937390685081482, + "learning_rate": 2e-05, + "loss": 0.02685843, + "step": 15835 + }, + { + "epoch": 31.672, + "grad_norm": 1.066178321838379, + "learning_rate": 2e-05, + "loss": 0.03931656, + "step": 15836 + }, + { + "epoch": 31.674, + "grad_norm": 1.0088249444961548, + "learning_rate": 2e-05, + "loss": 0.04021916, + "step": 15837 + }, + { + "epoch": 31.676, + "grad_norm": 1.028534173965454, + "learning_rate": 2e-05, + "loss": 0.05189967, + "step": 15838 + }, + { + "epoch": 31.678, + "grad_norm": 4.298219203948975, + "learning_rate": 2e-05, + "loss": 0.03603112, + "step": 15839 + }, + { + "epoch": 31.68, + "grad_norm": 1.1019927263259888, + "learning_rate": 2e-05, + "loss": 0.03563758, + "step": 15840 + }, + { + "epoch": 31.682, + "grad_norm": 1.016705870628357, + "learning_rate": 2e-05, + "loss": 0.03160737, + "step": 15841 + }, + { + "epoch": 31.684, + "grad_norm": 1.2760132551193237, + "learning_rate": 2e-05, + "loss": 0.0512272, + "step": 15842 + }, + { + "epoch": 31.686, + "grad_norm": 
1.0675709247589111, + "learning_rate": 2e-05, + "loss": 0.03803336, + "step": 15843 + }, + { + "epoch": 31.688, + "grad_norm": 1.0932615995407104, + "learning_rate": 2e-05, + "loss": 0.03163125, + "step": 15844 + }, + { + "epoch": 31.69, + "grad_norm": 1.4388149976730347, + "learning_rate": 2e-05, + "loss": 0.0546472, + "step": 15845 + }, + { + "epoch": 31.692, + "grad_norm": 1.247949242591858, + "learning_rate": 2e-05, + "loss": 0.04308487, + "step": 15846 + }, + { + "epoch": 31.694, + "grad_norm": 1.0364240407943726, + "learning_rate": 2e-05, + "loss": 0.04179894, + "step": 15847 + }, + { + "epoch": 31.696, + "grad_norm": 1.3593528270721436, + "learning_rate": 2e-05, + "loss": 0.03463472, + "step": 15848 + }, + { + "epoch": 31.698, + "grad_norm": 3.9153060913085938, + "learning_rate": 2e-05, + "loss": 0.0593623, + "step": 15849 + }, + { + "epoch": 31.7, + "grad_norm": 1.224266529083252, + "learning_rate": 2e-05, + "loss": 0.06305296, + "step": 15850 + }, + { + "epoch": 31.701999999999998, + "grad_norm": 1.8977373838424683, + "learning_rate": 2e-05, + "loss": 0.04459673, + "step": 15851 + }, + { + "epoch": 31.704, + "grad_norm": 1.1238889694213867, + "learning_rate": 2e-05, + "loss": 0.03320381, + "step": 15852 + }, + { + "epoch": 31.706, + "grad_norm": 1.8452239036560059, + "learning_rate": 2e-05, + "loss": 0.03556684, + "step": 15853 + }, + { + "epoch": 31.708, + "grad_norm": 1.469277024269104, + "learning_rate": 2e-05, + "loss": 0.05553747, + "step": 15854 + }, + { + "epoch": 31.71, + "grad_norm": 1.2231842279434204, + "learning_rate": 2e-05, + "loss": 0.04826688, + "step": 15855 + }, + { + "epoch": 31.712, + "grad_norm": 0.8785945177078247, + "learning_rate": 2e-05, + "loss": 0.03162833, + "step": 15856 + }, + { + "epoch": 31.714, + "grad_norm": 1.2260061502456665, + "learning_rate": 2e-05, + "loss": 0.04555026, + "step": 15857 + }, + { + "epoch": 31.716, + "grad_norm": 1.4658849239349365, + "learning_rate": 2e-05, + "loss": 0.04575991, + "step": 15858 + }, + { + "epoch": 31.718, + "grad_norm": 1.1947789192199707, + "learning_rate": 2e-05, + "loss": 0.03831368, + "step": 15859 + }, + { + "epoch": 31.72, + "grad_norm": 2.118058443069458, + "learning_rate": 2e-05, + "loss": 0.05042726, + "step": 15860 + }, + { + "epoch": 31.722, + "grad_norm": 0.9737758040428162, + "learning_rate": 2e-05, + "loss": 0.03438409, + "step": 15861 + }, + { + "epoch": 31.724, + "grad_norm": 1.606362223625183, + "learning_rate": 2e-05, + "loss": 0.03443389, + "step": 15862 + }, + { + "epoch": 31.726, + "grad_norm": 1.9236388206481934, + "learning_rate": 2e-05, + "loss": 0.04065321, + "step": 15863 + }, + { + "epoch": 31.728, + "grad_norm": 1.1777132749557495, + "learning_rate": 2e-05, + "loss": 0.04068302, + "step": 15864 + }, + { + "epoch": 31.73, + "grad_norm": 1.356837272644043, + "learning_rate": 2e-05, + "loss": 0.04885522, + "step": 15865 + }, + { + "epoch": 31.732, + "grad_norm": 1.158024787902832, + "learning_rate": 2e-05, + "loss": 0.03375251, + "step": 15866 + }, + { + "epoch": 31.734, + "grad_norm": 1.4615778923034668, + "learning_rate": 2e-05, + "loss": 0.03782218, + "step": 15867 + }, + { + "epoch": 31.736, + "grad_norm": 1.936156153678894, + "learning_rate": 2e-05, + "loss": 0.05020786, + "step": 15868 + }, + { + "epoch": 31.738, + "grad_norm": 1.1056143045425415, + "learning_rate": 2e-05, + "loss": 0.03084729, + "step": 15869 + }, + { + "epoch": 31.74, + "grad_norm": 1.153363823890686, + "learning_rate": 2e-05, + "loss": 0.04279421, + "step": 15870 + }, + { + "epoch": 31.742, + "grad_norm": 
2.0926640033721924, + "learning_rate": 2e-05, + "loss": 0.05298992, + "step": 15871 + }, + { + "epoch": 31.744, + "grad_norm": 0.9748860597610474, + "learning_rate": 2e-05, + "loss": 0.02891666, + "step": 15872 + }, + { + "epoch": 31.746, + "grad_norm": 2.135936737060547, + "learning_rate": 2e-05, + "loss": 0.04711745, + "step": 15873 + }, + { + "epoch": 31.748, + "grad_norm": 1.9730417728424072, + "learning_rate": 2e-05, + "loss": 0.0426465, + "step": 15874 + }, + { + "epoch": 31.75, + "grad_norm": 2.211915969848633, + "learning_rate": 2e-05, + "loss": 0.0576695, + "step": 15875 + }, + { + "epoch": 31.752, + "grad_norm": 1.4895312786102295, + "learning_rate": 2e-05, + "loss": 0.03817018, + "step": 15876 + }, + { + "epoch": 31.754, + "grad_norm": 1.1091728210449219, + "learning_rate": 2e-05, + "loss": 0.03473945, + "step": 15877 + }, + { + "epoch": 31.756, + "grad_norm": 1.5757970809936523, + "learning_rate": 2e-05, + "loss": 0.05376294, + "step": 15878 + }, + { + "epoch": 31.758, + "grad_norm": 1.4097827672958374, + "learning_rate": 2e-05, + "loss": 0.0575452, + "step": 15879 + }, + { + "epoch": 31.76, + "grad_norm": 1.601607084274292, + "learning_rate": 2e-05, + "loss": 0.03805475, + "step": 15880 + }, + { + "epoch": 31.762, + "grad_norm": 1.084592580795288, + "learning_rate": 2e-05, + "loss": 0.03560298, + "step": 15881 + }, + { + "epoch": 31.764, + "grad_norm": 1.3301918506622314, + "learning_rate": 2e-05, + "loss": 0.05340309, + "step": 15882 + }, + { + "epoch": 31.766, + "grad_norm": 1.21677565574646, + "learning_rate": 2e-05, + "loss": 0.02901489, + "step": 15883 + }, + { + "epoch": 31.768, + "grad_norm": 1.5509135723114014, + "learning_rate": 2e-05, + "loss": 0.0422479, + "step": 15884 + }, + { + "epoch": 31.77, + "grad_norm": 2.1377058029174805, + "learning_rate": 2e-05, + "loss": 0.03911637, + "step": 15885 + }, + { + "epoch": 31.772, + "grad_norm": 0.8442297577857971, + "learning_rate": 2e-05, + "loss": 0.02588265, + "step": 15886 + }, + { + "epoch": 31.774, + "grad_norm": 1.5694454908370972, + "learning_rate": 2e-05, + "loss": 0.04644613, + "step": 15887 + }, + { + "epoch": 31.776, + "grad_norm": 1.1110693216323853, + "learning_rate": 2e-05, + "loss": 0.03558012, + "step": 15888 + }, + { + "epoch": 31.778, + "grad_norm": 1.0614808797836304, + "learning_rate": 2e-05, + "loss": 0.03117304, + "step": 15889 + }, + { + "epoch": 31.78, + "grad_norm": 1.6479281187057495, + "learning_rate": 2e-05, + "loss": 0.03382275, + "step": 15890 + }, + { + "epoch": 31.782, + "grad_norm": 3.055795192718506, + "learning_rate": 2e-05, + "loss": 0.02290535, + "step": 15891 + }, + { + "epoch": 31.784, + "grad_norm": 1.0296449661254883, + "learning_rate": 2e-05, + "loss": 0.04224445, + "step": 15892 + }, + { + "epoch": 31.786, + "grad_norm": 0.8810853958129883, + "learning_rate": 2e-05, + "loss": 0.03229618, + "step": 15893 + }, + { + "epoch": 31.788, + "grad_norm": 2.7027084827423096, + "learning_rate": 2e-05, + "loss": 0.05507965, + "step": 15894 + }, + { + "epoch": 31.79, + "grad_norm": 1.3966178894042969, + "learning_rate": 2e-05, + "loss": 0.03760487, + "step": 15895 + }, + { + "epoch": 31.792, + "grad_norm": 1.139143705368042, + "learning_rate": 2e-05, + "loss": 0.04034952, + "step": 15896 + }, + { + "epoch": 31.794, + "grad_norm": 1.1100138425827026, + "learning_rate": 2e-05, + "loss": 0.03606164, + "step": 15897 + }, + { + "epoch": 31.796, + "grad_norm": 1.0225300788879395, + "learning_rate": 2e-05, + "loss": 0.03408796, + "step": 15898 + }, + { + "epoch": 31.798000000000002, + "grad_norm": 
1.2668366432189941, + "learning_rate": 2e-05, + "loss": 0.05540526, + "step": 15899 + }, + { + "epoch": 31.8, + "grad_norm": 1.2877216339111328, + "learning_rate": 2e-05, + "loss": 0.05548783, + "step": 15900 + }, + { + "epoch": 31.802, + "grad_norm": 1.4362618923187256, + "learning_rate": 2e-05, + "loss": 0.0343156, + "step": 15901 + }, + { + "epoch": 31.804, + "grad_norm": 1.0215818881988525, + "learning_rate": 2e-05, + "loss": 0.03574908, + "step": 15902 + }, + { + "epoch": 31.806, + "grad_norm": 1.2409071922302246, + "learning_rate": 2e-05, + "loss": 0.04538013, + "step": 15903 + }, + { + "epoch": 31.808, + "grad_norm": 1.2912746667861938, + "learning_rate": 2e-05, + "loss": 0.0512158, + "step": 15904 + }, + { + "epoch": 31.81, + "grad_norm": 2.350969076156616, + "learning_rate": 2e-05, + "loss": 0.0492909, + "step": 15905 + }, + { + "epoch": 31.812, + "grad_norm": 1.299422264099121, + "learning_rate": 2e-05, + "loss": 0.05618146, + "step": 15906 + }, + { + "epoch": 31.814, + "grad_norm": 1.3270517587661743, + "learning_rate": 2e-05, + "loss": 0.03777631, + "step": 15907 + }, + { + "epoch": 31.816, + "grad_norm": 2.0297582149505615, + "learning_rate": 2e-05, + "loss": 0.05447558, + "step": 15908 + }, + { + "epoch": 31.818, + "grad_norm": 1.204034686088562, + "learning_rate": 2e-05, + "loss": 0.04712773, + "step": 15909 + }, + { + "epoch": 31.82, + "grad_norm": 1.8368512392044067, + "learning_rate": 2e-05, + "loss": 0.03878936, + "step": 15910 + }, + { + "epoch": 31.822, + "grad_norm": 1.3600157499313354, + "learning_rate": 2e-05, + "loss": 0.04160419, + "step": 15911 + }, + { + "epoch": 31.824, + "grad_norm": 0.9613247513771057, + "learning_rate": 2e-05, + "loss": 0.03210657, + "step": 15912 + }, + { + "epoch": 31.826, + "grad_norm": 1.2313897609710693, + "learning_rate": 2e-05, + "loss": 0.0382159, + "step": 15913 + }, + { + "epoch": 31.828, + "grad_norm": 1.0835821628570557, + "learning_rate": 2e-05, + "loss": 0.04776543, + "step": 15914 + }, + { + "epoch": 31.83, + "grad_norm": 1.1532397270202637, + "learning_rate": 2e-05, + "loss": 0.02791233, + "step": 15915 + }, + { + "epoch": 31.832, + "grad_norm": 1.6072558164596558, + "learning_rate": 2e-05, + "loss": 0.04267748, + "step": 15916 + }, + { + "epoch": 31.834, + "grad_norm": 1.2094991207122803, + "learning_rate": 2e-05, + "loss": 0.0439063, + "step": 15917 + }, + { + "epoch": 31.836, + "grad_norm": 1.9390913248062134, + "learning_rate": 2e-05, + "loss": 0.06620531, + "step": 15918 + }, + { + "epoch": 31.838, + "grad_norm": 2.274019479751587, + "learning_rate": 2e-05, + "loss": 0.04259209, + "step": 15919 + }, + { + "epoch": 31.84, + "grad_norm": 1.9428718090057373, + "learning_rate": 2e-05, + "loss": 0.04785924, + "step": 15920 + }, + { + "epoch": 31.842, + "grad_norm": 1.422423243522644, + "learning_rate": 2e-05, + "loss": 0.04497388, + "step": 15921 + }, + { + "epoch": 31.844, + "grad_norm": 1.053911805152893, + "learning_rate": 2e-05, + "loss": 0.03690197, + "step": 15922 + }, + { + "epoch": 31.846, + "grad_norm": 3.13558030128479, + "learning_rate": 2e-05, + "loss": 0.05500776, + "step": 15923 + }, + { + "epoch": 31.848, + "grad_norm": 1.275652527809143, + "learning_rate": 2e-05, + "loss": 0.04231162, + "step": 15924 + }, + { + "epoch": 31.85, + "grad_norm": 1.9165621995925903, + "learning_rate": 2e-05, + "loss": 0.0441243, + "step": 15925 + }, + { + "epoch": 31.852, + "grad_norm": 2.099443197250366, + "learning_rate": 2e-05, + "loss": 0.04373053, + "step": 15926 + }, + { + "epoch": 31.854, + "grad_norm": 1.221465826034546, + 
"learning_rate": 2e-05, + "loss": 0.04799281, + "step": 15927 + }, + { + "epoch": 31.856, + "grad_norm": 1.7947920560836792, + "learning_rate": 2e-05, + "loss": 0.04117496, + "step": 15928 + }, + { + "epoch": 31.858, + "grad_norm": 1.142938256263733, + "learning_rate": 2e-05, + "loss": 0.03635032, + "step": 15929 + }, + { + "epoch": 31.86, + "grad_norm": 1.0332616567611694, + "learning_rate": 2e-05, + "loss": 0.04042405, + "step": 15930 + }, + { + "epoch": 31.862, + "grad_norm": 1.3157538175582886, + "learning_rate": 2e-05, + "loss": 0.06223164, + "step": 15931 + }, + { + "epoch": 31.864, + "grad_norm": 1.207903265953064, + "learning_rate": 2e-05, + "loss": 0.05030291, + "step": 15932 + }, + { + "epoch": 31.866, + "grad_norm": 1.8377586603164673, + "learning_rate": 2e-05, + "loss": 0.05019252, + "step": 15933 + }, + { + "epoch": 31.868, + "grad_norm": 0.9512478709220886, + "learning_rate": 2e-05, + "loss": 0.03300637, + "step": 15934 + }, + { + "epoch": 31.87, + "grad_norm": 1.3328098058700562, + "learning_rate": 2e-05, + "loss": 0.04929174, + "step": 15935 + }, + { + "epoch": 31.872, + "grad_norm": 1.0890222787857056, + "learning_rate": 2e-05, + "loss": 0.04026979, + "step": 15936 + }, + { + "epoch": 31.874, + "grad_norm": 0.9871217012405396, + "learning_rate": 2e-05, + "loss": 0.03689782, + "step": 15937 + }, + { + "epoch": 31.876, + "grad_norm": 1.0311745405197144, + "learning_rate": 2e-05, + "loss": 0.03890628, + "step": 15938 + }, + { + "epoch": 31.878, + "grad_norm": 1.252621054649353, + "learning_rate": 2e-05, + "loss": 0.05022133, + "step": 15939 + }, + { + "epoch": 31.88, + "grad_norm": 0.8780187964439392, + "learning_rate": 2e-05, + "loss": 0.02962154, + "step": 15940 + }, + { + "epoch": 31.882, + "grad_norm": 1.3588930368423462, + "learning_rate": 2e-05, + "loss": 0.04873728, + "step": 15941 + }, + { + "epoch": 31.884, + "grad_norm": 1.4300963878631592, + "learning_rate": 2e-05, + "loss": 0.05331673, + "step": 15942 + }, + { + "epoch": 31.886, + "grad_norm": 1.7596718072891235, + "learning_rate": 2e-05, + "loss": 0.05031583, + "step": 15943 + }, + { + "epoch": 31.888, + "grad_norm": 1.7224795818328857, + "learning_rate": 2e-05, + "loss": 0.05422819, + "step": 15944 + }, + { + "epoch": 31.89, + "grad_norm": 1.7030835151672363, + "learning_rate": 2e-05, + "loss": 0.05407435, + "step": 15945 + }, + { + "epoch": 31.892, + "grad_norm": 0.9744284749031067, + "learning_rate": 2e-05, + "loss": 0.03288863, + "step": 15946 + }, + { + "epoch": 31.894, + "grad_norm": 1.9340544939041138, + "learning_rate": 2e-05, + "loss": 0.04774361, + "step": 15947 + }, + { + "epoch": 31.896, + "grad_norm": 1.3208523988723755, + "learning_rate": 2e-05, + "loss": 0.02824499, + "step": 15948 + }, + { + "epoch": 31.898, + "grad_norm": 1.0506129264831543, + "learning_rate": 2e-05, + "loss": 0.03759496, + "step": 15949 + }, + { + "epoch": 31.9, + "grad_norm": 1.000072717666626, + "learning_rate": 2e-05, + "loss": 0.03729441, + "step": 15950 + }, + { + "epoch": 31.902, + "grad_norm": 1.8239920139312744, + "learning_rate": 2e-05, + "loss": 0.0503429, + "step": 15951 + }, + { + "epoch": 31.904, + "grad_norm": 0.9123238921165466, + "learning_rate": 2e-05, + "loss": 0.04094257, + "step": 15952 + }, + { + "epoch": 31.906, + "grad_norm": 2.360563278198242, + "learning_rate": 2e-05, + "loss": 0.04889352, + "step": 15953 + }, + { + "epoch": 31.908, + "grad_norm": 1.2828713655471802, + "learning_rate": 2e-05, + "loss": 0.04768791, + "step": 15954 + }, + { + "epoch": 31.91, + "grad_norm": 1.2418571710586548, + 
"learning_rate": 2e-05, + "loss": 0.04087563, + "step": 15955 + }, + { + "epoch": 31.912, + "grad_norm": 2.2912075519561768, + "learning_rate": 2e-05, + "loss": 0.05685142, + "step": 15956 + }, + { + "epoch": 31.914, + "grad_norm": 0.9731912612915039, + "learning_rate": 2e-05, + "loss": 0.03105514, + "step": 15957 + }, + { + "epoch": 31.916, + "grad_norm": 1.03424072265625, + "learning_rate": 2e-05, + "loss": 0.0377789, + "step": 15958 + }, + { + "epoch": 31.918, + "grad_norm": 1.2611950635910034, + "learning_rate": 2e-05, + "loss": 0.04463454, + "step": 15959 + }, + { + "epoch": 31.92, + "grad_norm": 1.9684666395187378, + "learning_rate": 2e-05, + "loss": 0.05295739, + "step": 15960 + }, + { + "epoch": 31.922, + "grad_norm": 1.3407577276229858, + "learning_rate": 2e-05, + "loss": 0.04236686, + "step": 15961 + }, + { + "epoch": 31.924, + "grad_norm": 1.2318034172058105, + "learning_rate": 2e-05, + "loss": 0.03647027, + "step": 15962 + }, + { + "epoch": 31.926, + "grad_norm": 1.745802402496338, + "learning_rate": 2e-05, + "loss": 0.04532122, + "step": 15963 + }, + { + "epoch": 31.928, + "grad_norm": 1.087468147277832, + "learning_rate": 2e-05, + "loss": 0.04168069, + "step": 15964 + }, + { + "epoch": 31.93, + "grad_norm": 2.0053272247314453, + "learning_rate": 2e-05, + "loss": 0.03822828, + "step": 15965 + }, + { + "epoch": 31.932, + "grad_norm": 1.0389565229415894, + "learning_rate": 2e-05, + "loss": 0.03328662, + "step": 15966 + }, + { + "epoch": 31.934, + "grad_norm": 1.394301414489746, + "learning_rate": 2e-05, + "loss": 0.05171059, + "step": 15967 + }, + { + "epoch": 31.936, + "grad_norm": 1.4605191946029663, + "learning_rate": 2e-05, + "loss": 0.0633463, + "step": 15968 + }, + { + "epoch": 31.938, + "grad_norm": 1.1026382446289062, + "learning_rate": 2e-05, + "loss": 0.04084253, + "step": 15969 + }, + { + "epoch": 31.94, + "grad_norm": 1.5898246765136719, + "learning_rate": 2e-05, + "loss": 0.06907326, + "step": 15970 + }, + { + "epoch": 31.942, + "grad_norm": 1.0612562894821167, + "learning_rate": 2e-05, + "loss": 0.03758101, + "step": 15971 + }, + { + "epoch": 31.944, + "grad_norm": 1.5062930583953857, + "learning_rate": 2e-05, + "loss": 0.03778771, + "step": 15972 + }, + { + "epoch": 31.946, + "grad_norm": 1.8301286697387695, + "learning_rate": 2e-05, + "loss": 0.03591305, + "step": 15973 + }, + { + "epoch": 31.948, + "grad_norm": 1.2562285661697388, + "learning_rate": 2e-05, + "loss": 0.03053888, + "step": 15974 + }, + { + "epoch": 31.95, + "grad_norm": 1.2209889888763428, + "learning_rate": 2e-05, + "loss": 0.05101029, + "step": 15975 + }, + { + "epoch": 31.951999999999998, + "grad_norm": 1.7810455560684204, + "learning_rate": 2e-05, + "loss": 0.04453781, + "step": 15976 + }, + { + "epoch": 31.954, + "grad_norm": 1.6434965133666992, + "learning_rate": 2e-05, + "loss": 0.0371579, + "step": 15977 + }, + { + "epoch": 31.956, + "grad_norm": 1.207597255706787, + "learning_rate": 2e-05, + "loss": 0.03318027, + "step": 15978 + }, + { + "epoch": 31.958, + "grad_norm": 1.3328630924224854, + "learning_rate": 2e-05, + "loss": 0.05412392, + "step": 15979 + }, + { + "epoch": 31.96, + "grad_norm": 1.0309226512908936, + "learning_rate": 2e-05, + "loss": 0.03783122, + "step": 15980 + }, + { + "epoch": 31.962, + "grad_norm": 1.042348861694336, + "learning_rate": 2e-05, + "loss": 0.03998961, + "step": 15981 + }, + { + "epoch": 31.964, + "grad_norm": 1.2534490823745728, + "learning_rate": 2e-05, + "loss": 0.03557429, + "step": 15982 + }, + { + "epoch": 31.966, + "grad_norm": 2.982203722000122, + 
"learning_rate": 2e-05, + "loss": 0.06171292, + "step": 15983 + }, + { + "epoch": 31.968, + "grad_norm": 1.2755846977233887, + "learning_rate": 2e-05, + "loss": 0.03844975, + "step": 15984 + }, + { + "epoch": 31.97, + "grad_norm": 1.0132505893707275, + "learning_rate": 2e-05, + "loss": 0.03931327, + "step": 15985 + }, + { + "epoch": 31.972, + "grad_norm": 1.2777570486068726, + "learning_rate": 2e-05, + "loss": 0.04398004, + "step": 15986 + }, + { + "epoch": 31.974, + "grad_norm": 1.2927149534225464, + "learning_rate": 2e-05, + "loss": 0.05497951, + "step": 15987 + }, + { + "epoch": 31.976, + "grad_norm": 0.8288196921348572, + "learning_rate": 2e-05, + "loss": 0.02825504, + "step": 15988 + }, + { + "epoch": 31.978, + "grad_norm": 1.1850969791412354, + "learning_rate": 2e-05, + "loss": 0.03954066, + "step": 15989 + }, + { + "epoch": 31.98, + "grad_norm": 0.8509149551391602, + "learning_rate": 2e-05, + "loss": 0.02893381, + "step": 15990 + }, + { + "epoch": 31.982, + "grad_norm": 1.2214093208312988, + "learning_rate": 2e-05, + "loss": 0.04563006, + "step": 15991 + }, + { + "epoch": 31.984, + "grad_norm": 1.3428093194961548, + "learning_rate": 2e-05, + "loss": 0.05709552, + "step": 15992 + }, + { + "epoch": 31.986, + "grad_norm": 1.2619773149490356, + "learning_rate": 2e-05, + "loss": 0.04833028, + "step": 15993 + }, + { + "epoch": 31.988, + "grad_norm": 1.4030076265335083, + "learning_rate": 2e-05, + "loss": 0.04587878, + "step": 15994 + }, + { + "epoch": 31.99, + "grad_norm": 1.113511085510254, + "learning_rate": 2e-05, + "loss": 0.03980121, + "step": 15995 + }, + { + "epoch": 31.992, + "grad_norm": 1.087285041809082, + "learning_rate": 2e-05, + "loss": 0.04423406, + "step": 15996 + }, + { + "epoch": 31.994, + "grad_norm": 1.2081905603408813, + "learning_rate": 2e-05, + "loss": 0.03930626, + "step": 15997 + }, + { + "epoch": 31.996, + "grad_norm": 0.9925308227539062, + "learning_rate": 2e-05, + "loss": 0.03966329, + "step": 15998 + }, + { + "epoch": 31.998, + "grad_norm": 1.5078144073486328, + "learning_rate": 2e-05, + "loss": 0.06151321, + "step": 15999 + }, + { + "epoch": 32.0, + "grad_norm": 1.641327142715454, + "learning_rate": 2e-05, + "loss": 0.05479438, + "step": 16000 + }, + { + "epoch": 32.0, + "eval_performance": { + "AngleClassification_1": 0.992, + "AngleClassification_2": 0.998, + "AngleClassification_3": 0.9780439121756487, + "Equal_1": 0.998, + "Equal_2": 0.9780439121756487, + "Equal_3": 0.9880239520958084, + "LineComparison_1": 1.0, + "LineComparison_2": 1.0, + "LineComparison_3": 0.998003992015968, + "Parallel_1": 0.9939879759519038, + "Parallel_2": 0.9919839679358717, + "Parallel_3": 0.99, + "Perpendicular_1": 0.996, + "Perpendicular_2": 0.982, + "Perpendicular_3": 0.8767535070140281, + "PointLiesOnCircle_1": 0.9979959919839679, + "PointLiesOnCircle_2": 0.9956666666666667, + "PointLiesOnCircle_3": 0.9876, + "PointLiesOnLine_1": 0.9979959919839679, + "PointLiesOnLine_2": 0.9979959919839679, + "PointLiesOnLine_3": 0.9920159680638723 + }, + "eval_runtime": 320.2305, + "eval_samples_per_second": 32.789, + "eval_steps_per_second": 0.656, + "step": 16000 + }, + { + "epoch": 32.002, + "grad_norm": 1.0825588703155518, + "learning_rate": 2e-05, + "loss": 0.04043002, + "step": 16001 + }, + { + "epoch": 32.004, + "grad_norm": 1.116982102394104, + "learning_rate": 2e-05, + "loss": 0.0345992, + "step": 16002 + }, + { + "epoch": 32.006, + "grad_norm": 1.4669443368911743, + "learning_rate": 2e-05, + "loss": 0.05243903, + "step": 16003 + }, + { + "epoch": 32.008, + "grad_norm": 
1.504705548286438, + "learning_rate": 2e-05, + "loss": 0.04028647, + "step": 16004 + }, + { + "epoch": 32.01, + "grad_norm": 1.2776727676391602, + "learning_rate": 2e-05, + "loss": 0.03870967, + "step": 16005 + }, + { + "epoch": 32.012, + "grad_norm": 2.4159090518951416, + "learning_rate": 2e-05, + "loss": 0.04641645, + "step": 16006 + }, + { + "epoch": 32.014, + "grad_norm": 1.1506953239440918, + "learning_rate": 2e-05, + "loss": 0.03702936, + "step": 16007 + }, + { + "epoch": 32.016, + "grad_norm": 1.2919611930847168, + "learning_rate": 2e-05, + "loss": 0.05337862, + "step": 16008 + }, + { + "epoch": 32.018, + "grad_norm": 0.9426947832107544, + "learning_rate": 2e-05, + "loss": 0.02725463, + "step": 16009 + }, + { + "epoch": 32.02, + "grad_norm": 1.00312077999115, + "learning_rate": 2e-05, + "loss": 0.04260667, + "step": 16010 + }, + { + "epoch": 32.022, + "grad_norm": 1.1375038623809814, + "learning_rate": 2e-05, + "loss": 0.03781604, + "step": 16011 + }, + { + "epoch": 32.024, + "grad_norm": 1.3393661975860596, + "learning_rate": 2e-05, + "loss": 0.04303484, + "step": 16012 + }, + { + "epoch": 32.026, + "grad_norm": 1.1762498617172241, + "learning_rate": 2e-05, + "loss": 0.04548337, + "step": 16013 + }, + { + "epoch": 32.028, + "grad_norm": 1.3804402351379395, + "learning_rate": 2e-05, + "loss": 0.0544548, + "step": 16014 + }, + { + "epoch": 32.03, + "grad_norm": 1.5973312854766846, + "learning_rate": 2e-05, + "loss": 0.0446333, + "step": 16015 + }, + { + "epoch": 32.032, + "grad_norm": 1.1906533241271973, + "learning_rate": 2e-05, + "loss": 0.0428127, + "step": 16016 + }, + { + "epoch": 32.034, + "grad_norm": 1.367288589477539, + "learning_rate": 2e-05, + "loss": 0.04731005, + "step": 16017 + }, + { + "epoch": 32.036, + "grad_norm": 1.3971643447875977, + "learning_rate": 2e-05, + "loss": 0.062555, + "step": 16018 + }, + { + "epoch": 32.038, + "grad_norm": 1.3580145835876465, + "learning_rate": 2e-05, + "loss": 0.04744005, + "step": 16019 + }, + { + "epoch": 32.04, + "grad_norm": 1.9001810550689697, + "learning_rate": 2e-05, + "loss": 0.03246989, + "step": 16020 + }, + { + "epoch": 32.042, + "grad_norm": 1.1311832666397095, + "learning_rate": 2e-05, + "loss": 0.04354143, + "step": 16021 + }, + { + "epoch": 32.044, + "grad_norm": 1.2986774444580078, + "learning_rate": 2e-05, + "loss": 0.04035509, + "step": 16022 + }, + { + "epoch": 32.046, + "grad_norm": 1.015791654586792, + "learning_rate": 2e-05, + "loss": 0.03449727, + "step": 16023 + }, + { + "epoch": 32.048, + "grad_norm": 1.2315489053726196, + "learning_rate": 2e-05, + "loss": 0.04751674, + "step": 16024 + }, + { + "epoch": 32.05, + "grad_norm": 1.6999454498291016, + "learning_rate": 2e-05, + "loss": 0.04061841, + "step": 16025 + }, + { + "epoch": 32.052, + "grad_norm": 1.1011244058609009, + "learning_rate": 2e-05, + "loss": 0.04402358, + "step": 16026 + }, + { + "epoch": 32.054, + "grad_norm": 0.9123277068138123, + "learning_rate": 2e-05, + "loss": 0.03008776, + "step": 16027 + }, + { + "epoch": 32.056, + "grad_norm": 1.5880179405212402, + "learning_rate": 2e-05, + "loss": 0.05666232, + "step": 16028 + }, + { + "epoch": 32.058, + "grad_norm": 1.4386612176895142, + "learning_rate": 2e-05, + "loss": 0.04437473, + "step": 16029 + }, + { + "epoch": 32.06, + "grad_norm": 1.1378834247589111, + "learning_rate": 2e-05, + "loss": 0.04078399, + "step": 16030 + }, + { + "epoch": 32.062, + "grad_norm": 1.1456459760665894, + "learning_rate": 2e-05, + "loss": 0.05264393, + "step": 16031 + }, + { + "epoch": 32.064, + "grad_norm": 
1.2216919660568237, + "learning_rate": 2e-05, + "loss": 0.04822063, + "step": 16032 + }, + { + "epoch": 32.066, + "grad_norm": 1.7245386838912964, + "learning_rate": 2e-05, + "loss": 0.05039418, + "step": 16033 + }, + { + "epoch": 32.068, + "grad_norm": 1.1855204105377197, + "learning_rate": 2e-05, + "loss": 0.02989644, + "step": 16034 + }, + { + "epoch": 32.07, + "grad_norm": 1.0187755823135376, + "learning_rate": 2e-05, + "loss": 0.03046339, + "step": 16035 + }, + { + "epoch": 32.072, + "grad_norm": 1.0775607824325562, + "learning_rate": 2e-05, + "loss": 0.05659611, + "step": 16036 + }, + { + "epoch": 32.074, + "grad_norm": 1.234232783317566, + "learning_rate": 2e-05, + "loss": 0.0332603, + "step": 16037 + }, + { + "epoch": 32.076, + "grad_norm": 1.037575364112854, + "learning_rate": 2e-05, + "loss": 0.03690245, + "step": 16038 + }, + { + "epoch": 32.078, + "grad_norm": 1.1662631034851074, + "learning_rate": 2e-05, + "loss": 0.04369701, + "step": 16039 + }, + { + "epoch": 32.08, + "grad_norm": 1.172985553741455, + "learning_rate": 2e-05, + "loss": 0.03649969, + "step": 16040 + }, + { + "epoch": 32.082, + "grad_norm": 1.399276614189148, + "learning_rate": 2e-05, + "loss": 0.04782386, + "step": 16041 + }, + { + "epoch": 32.084, + "grad_norm": 1.089415431022644, + "learning_rate": 2e-05, + "loss": 0.0528977, + "step": 16042 + }, + { + "epoch": 32.086, + "grad_norm": 1.217584252357483, + "learning_rate": 2e-05, + "loss": 0.04556941, + "step": 16043 + }, + { + "epoch": 32.088, + "grad_norm": 1.1931408643722534, + "learning_rate": 2e-05, + "loss": 0.04738794, + "step": 16044 + }, + { + "epoch": 32.09, + "grad_norm": 1.0571660995483398, + "learning_rate": 2e-05, + "loss": 0.03878347, + "step": 16045 + }, + { + "epoch": 32.092, + "grad_norm": 1.4781181812286377, + "learning_rate": 2e-05, + "loss": 0.04194687, + "step": 16046 + }, + { + "epoch": 32.094, + "grad_norm": 4.250499248504639, + "learning_rate": 2e-05, + "loss": 0.03575663, + "step": 16047 + }, + { + "epoch": 32.096, + "grad_norm": 2.5705862045288086, + "learning_rate": 2e-05, + "loss": 0.04239442, + "step": 16048 + }, + { + "epoch": 32.098, + "grad_norm": 0.9903239607810974, + "learning_rate": 2e-05, + "loss": 0.04511203, + "step": 16049 + }, + { + "epoch": 32.1, + "grad_norm": 1.8635953664779663, + "learning_rate": 2e-05, + "loss": 0.05268551, + "step": 16050 + }, + { + "epoch": 32.102, + "grad_norm": 2.417490005493164, + "learning_rate": 2e-05, + "loss": 0.0534266, + "step": 16051 + }, + { + "epoch": 32.104, + "grad_norm": 1.4200366735458374, + "learning_rate": 2e-05, + "loss": 0.05390621, + "step": 16052 + }, + { + "epoch": 32.106, + "grad_norm": 1.796728491783142, + "learning_rate": 2e-05, + "loss": 0.05886037, + "step": 16053 + }, + { + "epoch": 32.108, + "grad_norm": 1.967190146446228, + "learning_rate": 2e-05, + "loss": 0.04227813, + "step": 16054 + }, + { + "epoch": 32.11, + "grad_norm": 1.7734386920928955, + "learning_rate": 2e-05, + "loss": 0.03885096, + "step": 16055 + }, + { + "epoch": 32.112, + "grad_norm": 1.0239137411117554, + "learning_rate": 2e-05, + "loss": 0.03256997, + "step": 16056 + }, + { + "epoch": 32.114, + "grad_norm": 1.1545599699020386, + "learning_rate": 2e-05, + "loss": 0.04887954, + "step": 16057 + }, + { + "epoch": 32.116, + "grad_norm": 2.557694673538208, + "learning_rate": 2e-05, + "loss": 0.04597983, + "step": 16058 + }, + { + "epoch": 32.118, + "grad_norm": 2.367849826812744, + "learning_rate": 2e-05, + "loss": 0.07052769, + "step": 16059 + }, + { + "epoch": 32.12, + "grad_norm": 1.2989052534103394, 
+ "learning_rate": 2e-05, + "loss": 0.05320881, + "step": 16060 + }, + { + "epoch": 32.122, + "grad_norm": 1.6988813877105713, + "learning_rate": 2e-05, + "loss": 0.05019864, + "step": 16061 + }, + { + "epoch": 32.124, + "grad_norm": 1.2246428728103638, + "learning_rate": 2e-05, + "loss": 0.03273758, + "step": 16062 + }, + { + "epoch": 32.126, + "grad_norm": 0.8918790221214294, + "learning_rate": 2e-05, + "loss": 0.02186277, + "step": 16063 + }, + { + "epoch": 32.128, + "grad_norm": 0.9243330359458923, + "learning_rate": 2e-05, + "loss": 0.02869822, + "step": 16064 + }, + { + "epoch": 32.13, + "grad_norm": 0.9982576370239258, + "learning_rate": 2e-05, + "loss": 0.03740531, + "step": 16065 + }, + { + "epoch": 32.132, + "grad_norm": 1.0650466680526733, + "learning_rate": 2e-05, + "loss": 0.03073342, + "step": 16066 + }, + { + "epoch": 32.134, + "grad_norm": 2.1163761615753174, + "learning_rate": 2e-05, + "loss": 0.03885228, + "step": 16067 + }, + { + "epoch": 32.136, + "grad_norm": 0.9434223771095276, + "learning_rate": 2e-05, + "loss": 0.03199326, + "step": 16068 + }, + { + "epoch": 32.138, + "grad_norm": 0.8779048919677734, + "learning_rate": 2e-05, + "loss": 0.02758794, + "step": 16069 + }, + { + "epoch": 32.14, + "grad_norm": 1.155031442642212, + "learning_rate": 2e-05, + "loss": 0.04074682, + "step": 16070 + }, + { + "epoch": 32.142, + "grad_norm": 1.140641450881958, + "learning_rate": 2e-05, + "loss": 0.04276387, + "step": 16071 + }, + { + "epoch": 32.144, + "grad_norm": 0.932158887386322, + "learning_rate": 2e-05, + "loss": 0.02548703, + "step": 16072 + }, + { + "epoch": 32.146, + "grad_norm": 1.3973239660263062, + "learning_rate": 2e-05, + "loss": 0.05437903, + "step": 16073 + }, + { + "epoch": 32.148, + "grad_norm": 1.164240837097168, + "learning_rate": 2e-05, + "loss": 0.03863488, + "step": 16074 + }, + { + "epoch": 32.15, + "grad_norm": 1.162251591682434, + "learning_rate": 2e-05, + "loss": 0.05012304, + "step": 16075 + }, + { + "epoch": 32.152, + "grad_norm": 1.5069862604141235, + "learning_rate": 2e-05, + "loss": 0.04389899, + "step": 16076 + }, + { + "epoch": 32.154, + "grad_norm": 1.4252487421035767, + "learning_rate": 2e-05, + "loss": 0.04979219, + "step": 16077 + }, + { + "epoch": 32.156, + "grad_norm": 1.2503894567489624, + "learning_rate": 2e-05, + "loss": 0.04105882, + "step": 16078 + }, + { + "epoch": 32.158, + "grad_norm": 1.1806787252426147, + "learning_rate": 2e-05, + "loss": 0.04621601, + "step": 16079 + }, + { + "epoch": 32.16, + "grad_norm": 1.0560612678527832, + "learning_rate": 2e-05, + "loss": 0.03700335, + "step": 16080 + }, + { + "epoch": 32.162, + "grad_norm": 1.3735400438308716, + "learning_rate": 2e-05, + "loss": 0.04111753, + "step": 16081 + }, + { + "epoch": 32.164, + "grad_norm": 1.053409457206726, + "learning_rate": 2e-05, + "loss": 0.0357594, + "step": 16082 + }, + { + "epoch": 32.166, + "grad_norm": 2.5766215324401855, + "learning_rate": 2e-05, + "loss": 0.05252399, + "step": 16083 + }, + { + "epoch": 32.168, + "grad_norm": 1.9363195896148682, + "learning_rate": 2e-05, + "loss": 0.05619735, + "step": 16084 + }, + { + "epoch": 32.17, + "grad_norm": 0.9756081104278564, + "learning_rate": 2e-05, + "loss": 0.03354892, + "step": 16085 + }, + { + "epoch": 32.172, + "grad_norm": 0.9970953464508057, + "learning_rate": 2e-05, + "loss": 0.03117803, + "step": 16086 + }, + { + "epoch": 32.174, + "grad_norm": 1.3192036151885986, + "learning_rate": 2e-05, + "loss": 0.04156514, + "step": 16087 + }, + { + "epoch": 32.176, + "grad_norm": 1.3369141817092896, + 
"learning_rate": 2e-05, + "loss": 0.04897435, + "step": 16088 + }, + { + "epoch": 32.178, + "grad_norm": 1.4819189310073853, + "learning_rate": 2e-05, + "loss": 0.04706377, + "step": 16089 + }, + { + "epoch": 32.18, + "grad_norm": 1.1336413621902466, + "learning_rate": 2e-05, + "loss": 0.03167891, + "step": 16090 + }, + { + "epoch": 32.182, + "grad_norm": 1.1428834199905396, + "learning_rate": 2e-05, + "loss": 0.02547898, + "step": 16091 + }, + { + "epoch": 32.184, + "grad_norm": 1.2145164012908936, + "learning_rate": 2e-05, + "loss": 0.03935249, + "step": 16092 + }, + { + "epoch": 32.186, + "grad_norm": 1.4651552438735962, + "learning_rate": 2e-05, + "loss": 0.03130937, + "step": 16093 + }, + { + "epoch": 32.188, + "grad_norm": 1.480681300163269, + "learning_rate": 2e-05, + "loss": 0.03741454, + "step": 16094 + }, + { + "epoch": 32.19, + "grad_norm": 2.146885633468628, + "learning_rate": 2e-05, + "loss": 0.05036818, + "step": 16095 + }, + { + "epoch": 32.192, + "grad_norm": 1.1545674800872803, + "learning_rate": 2e-05, + "loss": 0.03278602, + "step": 16096 + }, + { + "epoch": 32.194, + "grad_norm": 0.9393602013587952, + "learning_rate": 2e-05, + "loss": 0.03584001, + "step": 16097 + }, + { + "epoch": 32.196, + "grad_norm": 1.2177561521530151, + "learning_rate": 2e-05, + "loss": 0.03207495, + "step": 16098 + }, + { + "epoch": 32.198, + "grad_norm": 1.0664947032928467, + "learning_rate": 2e-05, + "loss": 0.03970977, + "step": 16099 + }, + { + "epoch": 32.2, + "grad_norm": 1.6877775192260742, + "learning_rate": 2e-05, + "loss": 0.04805655, + "step": 16100 + }, + { + "epoch": 32.202, + "grad_norm": 1.9446738958358765, + "learning_rate": 2e-05, + "loss": 0.06819682, + "step": 16101 + }, + { + "epoch": 32.204, + "grad_norm": 1.219976782798767, + "learning_rate": 2e-05, + "loss": 0.04143453, + "step": 16102 + }, + { + "epoch": 32.206, + "grad_norm": 1.203502893447876, + "learning_rate": 2e-05, + "loss": 0.03769158, + "step": 16103 + }, + { + "epoch": 32.208, + "grad_norm": 1.0326348543167114, + "learning_rate": 2e-05, + "loss": 0.02815155, + "step": 16104 + }, + { + "epoch": 32.21, + "grad_norm": 0.9759824872016907, + "learning_rate": 2e-05, + "loss": 0.03706375, + "step": 16105 + }, + { + "epoch": 32.212, + "grad_norm": 1.9715107679367065, + "learning_rate": 2e-05, + "loss": 0.0670663, + "step": 16106 + }, + { + "epoch": 32.214, + "grad_norm": 2.088665723800659, + "learning_rate": 2e-05, + "loss": 0.04498452, + "step": 16107 + }, + { + "epoch": 32.216, + "grad_norm": 1.9537535905838013, + "learning_rate": 2e-05, + "loss": 0.06290226, + "step": 16108 + }, + { + "epoch": 32.218, + "grad_norm": 1.1504062414169312, + "learning_rate": 2e-05, + "loss": 0.0422533, + "step": 16109 + }, + { + "epoch": 32.22, + "grad_norm": 1.2755309343338013, + "learning_rate": 2e-05, + "loss": 0.05198646, + "step": 16110 + }, + { + "epoch": 32.222, + "grad_norm": 1.7408345937728882, + "learning_rate": 2e-05, + "loss": 0.03385472, + "step": 16111 + }, + { + "epoch": 32.224, + "grad_norm": 1.2028616666793823, + "learning_rate": 2e-05, + "loss": 0.03718933, + "step": 16112 + }, + { + "epoch": 32.226, + "grad_norm": 1.2710516452789307, + "learning_rate": 2e-05, + "loss": 0.04598017, + "step": 16113 + }, + { + "epoch": 32.228, + "grad_norm": 1.0447970628738403, + "learning_rate": 2e-05, + "loss": 0.04060001, + "step": 16114 + }, + { + "epoch": 32.23, + "grad_norm": 1.267838478088379, + "learning_rate": 2e-05, + "loss": 0.06093229, + "step": 16115 + }, + { + "epoch": 32.232, + "grad_norm": 1.31565260887146, + 
"learning_rate": 2e-05, + "loss": 0.03904974, + "step": 16116 + }, + { + "epoch": 32.234, + "grad_norm": 1.2240655422210693, + "learning_rate": 2e-05, + "loss": 0.03763383, + "step": 16117 + }, + { + "epoch": 32.236, + "grad_norm": 0.9750972390174866, + "learning_rate": 2e-05, + "loss": 0.03341448, + "step": 16118 + }, + { + "epoch": 32.238, + "grad_norm": 1.330909013748169, + "learning_rate": 2e-05, + "loss": 0.04453826, + "step": 16119 + }, + { + "epoch": 32.24, + "grad_norm": 2.050285816192627, + "learning_rate": 2e-05, + "loss": 0.06148023, + "step": 16120 + }, + { + "epoch": 32.242, + "grad_norm": 1.2034765481948853, + "learning_rate": 2e-05, + "loss": 0.03787356, + "step": 16121 + }, + { + "epoch": 32.244, + "grad_norm": 2.573892831802368, + "learning_rate": 2e-05, + "loss": 0.0428374, + "step": 16122 + }, + { + "epoch": 32.246, + "grad_norm": 1.063158631324768, + "learning_rate": 2e-05, + "loss": 0.03356419, + "step": 16123 + }, + { + "epoch": 32.248, + "grad_norm": 1.1658393144607544, + "learning_rate": 2e-05, + "loss": 0.02797179, + "step": 16124 + }, + { + "epoch": 32.25, + "grad_norm": 1.3309680223464966, + "learning_rate": 2e-05, + "loss": 0.06812909, + "step": 16125 + }, + { + "epoch": 32.252, + "grad_norm": 0.8895295858383179, + "learning_rate": 2e-05, + "loss": 0.03186877, + "step": 16126 + }, + { + "epoch": 32.254, + "grad_norm": 1.42412269115448, + "learning_rate": 2e-05, + "loss": 0.05221164, + "step": 16127 + }, + { + "epoch": 32.256, + "grad_norm": 1.7970479726791382, + "learning_rate": 2e-05, + "loss": 0.04089088, + "step": 16128 + }, + { + "epoch": 32.258, + "grad_norm": 1.4141465425491333, + "learning_rate": 2e-05, + "loss": 0.03509438, + "step": 16129 + }, + { + "epoch": 32.26, + "grad_norm": 1.255109190940857, + "learning_rate": 2e-05, + "loss": 0.03642782, + "step": 16130 + }, + { + "epoch": 32.262, + "grad_norm": 2.4638772010803223, + "learning_rate": 2e-05, + "loss": 0.06095183, + "step": 16131 + }, + { + "epoch": 32.264, + "grad_norm": 1.527658224105835, + "learning_rate": 2e-05, + "loss": 0.03274827, + "step": 16132 + }, + { + "epoch": 32.266, + "grad_norm": 1.2536580562591553, + "learning_rate": 2e-05, + "loss": 0.0438434, + "step": 16133 + }, + { + "epoch": 32.268, + "grad_norm": 1.317441701889038, + "learning_rate": 2e-05, + "loss": 0.05264898, + "step": 16134 + }, + { + "epoch": 32.27, + "grad_norm": 1.0908536911010742, + "learning_rate": 2e-05, + "loss": 0.04722644, + "step": 16135 + }, + { + "epoch": 32.272, + "grad_norm": 1.3812543153762817, + "learning_rate": 2e-05, + "loss": 0.04213402, + "step": 16136 + }, + { + "epoch": 32.274, + "grad_norm": 1.2621177434921265, + "learning_rate": 2e-05, + "loss": 0.04359785, + "step": 16137 + }, + { + "epoch": 32.276, + "grad_norm": 1.1774916648864746, + "learning_rate": 2e-05, + "loss": 0.03530777, + "step": 16138 + }, + { + "epoch": 32.278, + "grad_norm": 1.1465319395065308, + "learning_rate": 2e-05, + "loss": 0.04779294, + "step": 16139 + }, + { + "epoch": 32.28, + "grad_norm": 1.1407089233398438, + "learning_rate": 2e-05, + "loss": 0.04244178, + "step": 16140 + }, + { + "epoch": 32.282, + "grad_norm": 1.1952414512634277, + "learning_rate": 2e-05, + "loss": 0.03308697, + "step": 16141 + }, + { + "epoch": 32.284, + "grad_norm": 1.1828652620315552, + "learning_rate": 2e-05, + "loss": 0.03380813, + "step": 16142 + }, + { + "epoch": 32.286, + "grad_norm": 2.2525599002838135, + "learning_rate": 2e-05, + "loss": 0.05679112, + "step": 16143 + }, + { + "epoch": 32.288, + "grad_norm": 2.428598642349243, + 
"learning_rate": 2e-05, + "loss": 0.05677919, + "step": 16144 + }, + { + "epoch": 32.29, + "grad_norm": 1.4035303592681885, + "learning_rate": 2e-05, + "loss": 0.0381352, + "step": 16145 + }, + { + "epoch": 32.292, + "grad_norm": 1.2325247526168823, + "learning_rate": 2e-05, + "loss": 0.04354863, + "step": 16146 + }, + { + "epoch": 32.294, + "grad_norm": 1.1236069202423096, + "learning_rate": 2e-05, + "loss": 0.04897562, + "step": 16147 + }, + { + "epoch": 32.296, + "grad_norm": 1.730275273323059, + "learning_rate": 2e-05, + "loss": 0.03983387, + "step": 16148 + }, + { + "epoch": 32.298, + "grad_norm": 1.732548713684082, + "learning_rate": 2e-05, + "loss": 0.04608201, + "step": 16149 + }, + { + "epoch": 32.3, + "grad_norm": 1.9877454042434692, + "learning_rate": 2e-05, + "loss": 0.03996104, + "step": 16150 + }, + { + "epoch": 32.302, + "grad_norm": 1.0003015995025635, + "learning_rate": 2e-05, + "loss": 0.03154821, + "step": 16151 + }, + { + "epoch": 32.304, + "grad_norm": 1.7395548820495605, + "learning_rate": 2e-05, + "loss": 0.05859068, + "step": 16152 + }, + { + "epoch": 32.306, + "grad_norm": 1.7625404596328735, + "learning_rate": 2e-05, + "loss": 0.04158505, + "step": 16153 + }, + { + "epoch": 32.308, + "grad_norm": 1.3035045862197876, + "learning_rate": 2e-05, + "loss": 0.03832395, + "step": 16154 + }, + { + "epoch": 32.31, + "grad_norm": 1.2695159912109375, + "learning_rate": 2e-05, + "loss": 0.05645255, + "step": 16155 + }, + { + "epoch": 32.312, + "grad_norm": 1.68500816822052, + "learning_rate": 2e-05, + "loss": 0.07209504, + "step": 16156 + }, + { + "epoch": 32.314, + "grad_norm": 1.1984082460403442, + "learning_rate": 2e-05, + "loss": 0.03992949, + "step": 16157 + }, + { + "epoch": 32.316, + "grad_norm": 1.199971079826355, + "learning_rate": 2e-05, + "loss": 0.04131328, + "step": 16158 + }, + { + "epoch": 32.318, + "grad_norm": 1.0048577785491943, + "learning_rate": 2e-05, + "loss": 0.03791015, + "step": 16159 + }, + { + "epoch": 32.32, + "grad_norm": 1.0711760520935059, + "learning_rate": 2e-05, + "loss": 0.03038563, + "step": 16160 + }, + { + "epoch": 32.322, + "grad_norm": 1.5091052055358887, + "learning_rate": 2e-05, + "loss": 0.05084156, + "step": 16161 + }, + { + "epoch": 32.324, + "grad_norm": 1.170743703842163, + "learning_rate": 2e-05, + "loss": 0.04108653, + "step": 16162 + }, + { + "epoch": 32.326, + "grad_norm": 0.9663565158843994, + "learning_rate": 2e-05, + "loss": 0.03507331, + "step": 16163 + }, + { + "epoch": 32.328, + "grad_norm": 1.7884368896484375, + "learning_rate": 2e-05, + "loss": 0.03116245, + "step": 16164 + }, + { + "epoch": 32.33, + "grad_norm": 1.2119756937026978, + "learning_rate": 2e-05, + "loss": 0.03635027, + "step": 16165 + }, + { + "epoch": 32.332, + "grad_norm": 1.2660174369812012, + "learning_rate": 2e-05, + "loss": 0.05102555, + "step": 16166 + }, + { + "epoch": 32.334, + "grad_norm": 1.6132479906082153, + "learning_rate": 2e-05, + "loss": 0.05014277, + "step": 16167 + }, + { + "epoch": 32.336, + "grad_norm": 1.722839117050171, + "learning_rate": 2e-05, + "loss": 0.04384566, + "step": 16168 + }, + { + "epoch": 32.338, + "grad_norm": 1.3902097940444946, + "learning_rate": 2e-05, + "loss": 0.03447413, + "step": 16169 + }, + { + "epoch": 32.34, + "grad_norm": 1.2110004425048828, + "learning_rate": 2e-05, + "loss": 0.03868734, + "step": 16170 + }, + { + "epoch": 32.342, + "grad_norm": 1.4563666582107544, + "learning_rate": 2e-05, + "loss": 0.04213499, + "step": 16171 + }, + { + "epoch": 32.344, + "grad_norm": 1.213379144668579, + 
"learning_rate": 2e-05, + "loss": 0.05638143, + "step": 16172 + }, + { + "epoch": 32.346, + "grad_norm": 1.09957754611969, + "learning_rate": 2e-05, + "loss": 0.04616127, + "step": 16173 + }, + { + "epoch": 32.348, + "grad_norm": 1.0122456550598145, + "learning_rate": 2e-05, + "loss": 0.0424768, + "step": 16174 + }, + { + "epoch": 32.35, + "grad_norm": 1.259447455406189, + "learning_rate": 2e-05, + "loss": 0.04152884, + "step": 16175 + }, + { + "epoch": 32.352, + "grad_norm": 1.1292533874511719, + "learning_rate": 2e-05, + "loss": 0.04704421, + "step": 16176 + }, + { + "epoch": 32.354, + "grad_norm": 1.455267071723938, + "learning_rate": 2e-05, + "loss": 0.05591056, + "step": 16177 + }, + { + "epoch": 32.356, + "grad_norm": 1.2393004894256592, + "learning_rate": 2e-05, + "loss": 0.03246272, + "step": 16178 + }, + { + "epoch": 32.358, + "grad_norm": 1.4194928407669067, + "learning_rate": 2e-05, + "loss": 0.04797504, + "step": 16179 + }, + { + "epoch": 32.36, + "grad_norm": 1.3719854354858398, + "learning_rate": 2e-05, + "loss": 0.05387818, + "step": 16180 + }, + { + "epoch": 32.362, + "grad_norm": 1.0265268087387085, + "learning_rate": 2e-05, + "loss": 0.03407123, + "step": 16181 + }, + { + "epoch": 32.364, + "grad_norm": 1.1385420560836792, + "learning_rate": 2e-05, + "loss": 0.04104249, + "step": 16182 + }, + { + "epoch": 32.366, + "grad_norm": 1.4606928825378418, + "learning_rate": 2e-05, + "loss": 0.05414707, + "step": 16183 + }, + { + "epoch": 32.368, + "grad_norm": 1.4488290548324585, + "learning_rate": 2e-05, + "loss": 0.05977223, + "step": 16184 + }, + { + "epoch": 32.37, + "grad_norm": 1.556361436843872, + "learning_rate": 2e-05, + "loss": 0.03753366, + "step": 16185 + }, + { + "epoch": 32.372, + "grad_norm": 1.0535379648208618, + "learning_rate": 2e-05, + "loss": 0.038872, + "step": 16186 + }, + { + "epoch": 32.374, + "grad_norm": 1.3477610349655151, + "learning_rate": 2e-05, + "loss": 0.04552489, + "step": 16187 + }, + { + "epoch": 32.376, + "grad_norm": 0.9643598198890686, + "learning_rate": 2e-05, + "loss": 0.03325019, + "step": 16188 + }, + { + "epoch": 32.378, + "grad_norm": 2.3846733570098877, + "learning_rate": 2e-05, + "loss": 0.03706906, + "step": 16189 + }, + { + "epoch": 32.38, + "grad_norm": 1.3666075468063354, + "learning_rate": 2e-05, + "loss": 0.05090994, + "step": 16190 + }, + { + "epoch": 32.382, + "grad_norm": 1.3230539560317993, + "learning_rate": 2e-05, + "loss": 0.04045279, + "step": 16191 + }, + { + "epoch": 32.384, + "grad_norm": 1.1574872732162476, + "learning_rate": 2e-05, + "loss": 0.043694, + "step": 16192 + }, + { + "epoch": 32.386, + "grad_norm": 1.11777663230896, + "learning_rate": 2e-05, + "loss": 0.05323738, + "step": 16193 + }, + { + "epoch": 32.388, + "grad_norm": 1.1913745403289795, + "learning_rate": 2e-05, + "loss": 0.04276652, + "step": 16194 + }, + { + "epoch": 32.39, + "grad_norm": 2.0520830154418945, + "learning_rate": 2e-05, + "loss": 0.05131931, + "step": 16195 + }, + { + "epoch": 32.392, + "grad_norm": 1.3622318506240845, + "learning_rate": 2e-05, + "loss": 0.03571715, + "step": 16196 + }, + { + "epoch": 32.394, + "grad_norm": 1.278665542602539, + "learning_rate": 2e-05, + "loss": 0.04110078, + "step": 16197 + }, + { + "epoch": 32.396, + "grad_norm": 1.3019253015518188, + "learning_rate": 2e-05, + "loss": 0.03151403, + "step": 16198 + }, + { + "epoch": 32.398, + "grad_norm": 1.0655170679092407, + "learning_rate": 2e-05, + "loss": 0.03590965, + "step": 16199 + }, + { + "epoch": 32.4, + "grad_norm": 1.0709350109100342, + "learning_rate": 
2e-05, + "loss": 0.03407409, + "step": 16200 + }, + { + "epoch": 32.402, + "grad_norm": 1.1004314422607422, + "learning_rate": 2e-05, + "loss": 0.04275486, + "step": 16201 + }, + { + "epoch": 32.404, + "grad_norm": 1.122451901435852, + "learning_rate": 2e-05, + "loss": 0.0433308, + "step": 16202 + }, + { + "epoch": 32.406, + "grad_norm": 1.286196231842041, + "learning_rate": 2e-05, + "loss": 0.05405517, + "step": 16203 + }, + { + "epoch": 32.408, + "grad_norm": 1.3067890405654907, + "learning_rate": 2e-05, + "loss": 0.04709282, + "step": 16204 + }, + { + "epoch": 32.41, + "grad_norm": 1.2169573307037354, + "learning_rate": 2e-05, + "loss": 0.04799424, + "step": 16205 + }, + { + "epoch": 32.412, + "grad_norm": 1.0351089239120483, + "learning_rate": 2e-05, + "loss": 0.02892509, + "step": 16206 + }, + { + "epoch": 32.414, + "grad_norm": 1.1305210590362549, + "learning_rate": 2e-05, + "loss": 0.03046542, + "step": 16207 + }, + { + "epoch": 32.416, + "grad_norm": 1.049100637435913, + "learning_rate": 2e-05, + "loss": 0.03685327, + "step": 16208 + }, + { + "epoch": 32.418, + "grad_norm": 2.5545873641967773, + "learning_rate": 2e-05, + "loss": 0.05403619, + "step": 16209 + }, + { + "epoch": 32.42, + "grad_norm": 1.5375778675079346, + "learning_rate": 2e-05, + "loss": 0.0551252, + "step": 16210 + }, + { + "epoch": 32.422, + "grad_norm": 1.355461597442627, + "learning_rate": 2e-05, + "loss": 0.03641314, + "step": 16211 + }, + { + "epoch": 32.424, + "grad_norm": 2.1974892616271973, + "learning_rate": 2e-05, + "loss": 0.0479245, + "step": 16212 + }, + { + "epoch": 32.426, + "grad_norm": 2.6640677452087402, + "learning_rate": 2e-05, + "loss": 0.05790597, + "step": 16213 + }, + { + "epoch": 32.428, + "grad_norm": 1.0314759016036987, + "learning_rate": 2e-05, + "loss": 0.02948291, + "step": 16214 + }, + { + "epoch": 32.43, + "grad_norm": 1.1263335943222046, + "learning_rate": 2e-05, + "loss": 0.03837535, + "step": 16215 + }, + { + "epoch": 32.432, + "grad_norm": 0.9716264605522156, + "learning_rate": 2e-05, + "loss": 0.03991567, + "step": 16216 + }, + { + "epoch": 32.434, + "grad_norm": 1.3745099306106567, + "learning_rate": 2e-05, + "loss": 0.05104405, + "step": 16217 + }, + { + "epoch": 32.436, + "grad_norm": 1.1306488513946533, + "learning_rate": 2e-05, + "loss": 0.04568043, + "step": 16218 + }, + { + "epoch": 32.438, + "grad_norm": 1.738258719444275, + "learning_rate": 2e-05, + "loss": 0.06466265, + "step": 16219 + }, + { + "epoch": 32.44, + "grad_norm": 1.6814087629318237, + "learning_rate": 2e-05, + "loss": 0.04136957, + "step": 16220 + }, + { + "epoch": 32.442, + "grad_norm": 1.1837775707244873, + "learning_rate": 2e-05, + "loss": 0.0486927, + "step": 16221 + }, + { + "epoch": 32.444, + "grad_norm": 1.257043719291687, + "learning_rate": 2e-05, + "loss": 0.04791939, + "step": 16222 + }, + { + "epoch": 32.446, + "grad_norm": 1.8833086490631104, + "learning_rate": 2e-05, + "loss": 0.03800988, + "step": 16223 + }, + { + "epoch": 32.448, + "grad_norm": 1.1962319612503052, + "learning_rate": 2e-05, + "loss": 0.03850331, + "step": 16224 + }, + { + "epoch": 32.45, + "grad_norm": 1.2660398483276367, + "learning_rate": 2e-05, + "loss": 0.03511299, + "step": 16225 + }, + { + "epoch": 32.452, + "grad_norm": 1.2324693202972412, + "learning_rate": 2e-05, + "loss": 0.04578947, + "step": 16226 + }, + { + "epoch": 32.454, + "grad_norm": 0.8969568014144897, + "learning_rate": 2e-05, + "loss": 0.03069787, + "step": 16227 + }, + { + "epoch": 32.456, + "grad_norm": 1.4285138845443726, + "learning_rate": 2e-05, + 
"loss": 0.06139054, + "step": 16228 + }, + { + "epoch": 32.458, + "grad_norm": 0.9827262163162231, + "learning_rate": 2e-05, + "loss": 0.03358797, + "step": 16229 + }, + { + "epoch": 32.46, + "grad_norm": 1.6161268949508667, + "learning_rate": 2e-05, + "loss": 0.05113133, + "step": 16230 + }, + { + "epoch": 32.462, + "grad_norm": 1.2342438697814941, + "learning_rate": 2e-05, + "loss": 0.04767845, + "step": 16231 + }, + { + "epoch": 32.464, + "grad_norm": 1.4637534618377686, + "learning_rate": 2e-05, + "loss": 0.05040576, + "step": 16232 + }, + { + "epoch": 32.466, + "grad_norm": 1.251790165901184, + "learning_rate": 2e-05, + "loss": 0.04769853, + "step": 16233 + }, + { + "epoch": 32.468, + "grad_norm": 2.195897102355957, + "learning_rate": 2e-05, + "loss": 0.04872566, + "step": 16234 + }, + { + "epoch": 32.47, + "grad_norm": 0.8366017937660217, + "learning_rate": 2e-05, + "loss": 0.03476872, + "step": 16235 + }, + { + "epoch": 32.472, + "grad_norm": 3.469074249267578, + "learning_rate": 2e-05, + "loss": 0.04331683, + "step": 16236 + }, + { + "epoch": 32.474, + "grad_norm": 2.101489782333374, + "learning_rate": 2e-05, + "loss": 0.06111233, + "step": 16237 + }, + { + "epoch": 32.476, + "grad_norm": 1.5878509283065796, + "learning_rate": 2e-05, + "loss": 0.04456938, + "step": 16238 + }, + { + "epoch": 32.478, + "grad_norm": 1.6882450580596924, + "learning_rate": 2e-05, + "loss": 0.06533901, + "step": 16239 + }, + { + "epoch": 32.48, + "grad_norm": 1.980782389640808, + "learning_rate": 2e-05, + "loss": 0.02859255, + "step": 16240 + }, + { + "epoch": 32.482, + "grad_norm": 2.289910316467285, + "learning_rate": 2e-05, + "loss": 0.04708671, + "step": 16241 + }, + { + "epoch": 32.484, + "grad_norm": 1.2156428098678589, + "learning_rate": 2e-05, + "loss": 0.06721392, + "step": 16242 + }, + { + "epoch": 32.486, + "grad_norm": 2.177699327468872, + "learning_rate": 2e-05, + "loss": 0.03834368, + "step": 16243 + }, + { + "epoch": 32.488, + "grad_norm": 1.3921353816986084, + "learning_rate": 2e-05, + "loss": 0.05225492, + "step": 16244 + }, + { + "epoch": 32.49, + "grad_norm": 1.0509675741195679, + "learning_rate": 2e-05, + "loss": 0.03782216, + "step": 16245 + }, + { + "epoch": 32.492, + "grad_norm": 0.8965580463409424, + "learning_rate": 2e-05, + "loss": 0.03118903, + "step": 16246 + }, + { + "epoch": 32.494, + "grad_norm": 1.2393367290496826, + "learning_rate": 2e-05, + "loss": 0.04286648, + "step": 16247 + }, + { + "epoch": 32.496, + "grad_norm": 4.540200233459473, + "learning_rate": 2e-05, + "loss": 0.05019096, + "step": 16248 + }, + { + "epoch": 32.498, + "grad_norm": 1.4414736032485962, + "learning_rate": 2e-05, + "loss": 0.03688266, + "step": 16249 + }, + { + "epoch": 32.5, + "grad_norm": 1.0771220922470093, + "learning_rate": 2e-05, + "loss": 0.04028372, + "step": 16250 + }, + { + "epoch": 32.502, + "grad_norm": 1.2341318130493164, + "learning_rate": 2e-05, + "loss": 0.04482314, + "step": 16251 + }, + { + "epoch": 32.504, + "grad_norm": 1.4283705949783325, + "learning_rate": 2e-05, + "loss": 0.04541865, + "step": 16252 + }, + { + "epoch": 32.506, + "grad_norm": 1.5509240627288818, + "learning_rate": 2e-05, + "loss": 0.06562129, + "step": 16253 + }, + { + "epoch": 32.508, + "grad_norm": 1.07283353805542, + "learning_rate": 2e-05, + "loss": 0.04550766, + "step": 16254 + }, + { + "epoch": 32.51, + "grad_norm": 1.1795142889022827, + "learning_rate": 2e-05, + "loss": 0.05442602, + "step": 16255 + }, + { + "epoch": 32.512, + "grad_norm": 1.1918917894363403, + "learning_rate": 2e-05, + "loss": 
0.04531888, + "step": 16256 + }, + { + "epoch": 32.514, + "grad_norm": 1.6722873449325562, + "learning_rate": 2e-05, + "loss": 0.04310684, + "step": 16257 + }, + { + "epoch": 32.516, + "grad_norm": 1.2727532386779785, + "learning_rate": 2e-05, + "loss": 0.04338937, + "step": 16258 + }, + { + "epoch": 32.518, + "grad_norm": 1.411271095275879, + "learning_rate": 2e-05, + "loss": 0.0337945, + "step": 16259 + }, + { + "epoch": 32.52, + "grad_norm": 1.1750949621200562, + "learning_rate": 2e-05, + "loss": 0.03457836, + "step": 16260 + }, + { + "epoch": 32.522, + "grad_norm": 3.289924144744873, + "learning_rate": 2e-05, + "loss": 0.04035544, + "step": 16261 + }, + { + "epoch": 32.524, + "grad_norm": 0.9971786737442017, + "learning_rate": 2e-05, + "loss": 0.03331866, + "step": 16262 + }, + { + "epoch": 32.526, + "grad_norm": 1.3523638248443604, + "learning_rate": 2e-05, + "loss": 0.04086154, + "step": 16263 + }, + { + "epoch": 32.528, + "grad_norm": 1.1040973663330078, + "learning_rate": 2e-05, + "loss": 0.0335994, + "step": 16264 + }, + { + "epoch": 32.53, + "grad_norm": 1.7474960088729858, + "learning_rate": 2e-05, + "loss": 0.04292202, + "step": 16265 + }, + { + "epoch": 32.532, + "grad_norm": 1.0948997735977173, + "learning_rate": 2e-05, + "loss": 0.03585258, + "step": 16266 + }, + { + "epoch": 32.534, + "grad_norm": 1.512582778930664, + "learning_rate": 2e-05, + "loss": 0.04289878, + "step": 16267 + }, + { + "epoch": 32.536, + "grad_norm": 1.9142391681671143, + "learning_rate": 2e-05, + "loss": 0.04722939, + "step": 16268 + }, + { + "epoch": 32.538, + "grad_norm": 1.0834648609161377, + "learning_rate": 2e-05, + "loss": 0.03546905, + "step": 16269 + }, + { + "epoch": 32.54, + "grad_norm": 1.8602557182312012, + "learning_rate": 2e-05, + "loss": 0.03079529, + "step": 16270 + }, + { + "epoch": 32.542, + "grad_norm": 1.2603952884674072, + "learning_rate": 2e-05, + "loss": 0.04889859, + "step": 16271 + }, + { + "epoch": 32.544, + "grad_norm": 0.9242783188819885, + "learning_rate": 2e-05, + "loss": 0.02692599, + "step": 16272 + }, + { + "epoch": 32.546, + "grad_norm": 1.140777349472046, + "learning_rate": 2e-05, + "loss": 0.04153073, + "step": 16273 + }, + { + "epoch": 32.548, + "grad_norm": 1.153734564781189, + "learning_rate": 2e-05, + "loss": 0.04307986, + "step": 16274 + }, + { + "epoch": 32.55, + "grad_norm": 1.1327687501907349, + "learning_rate": 2e-05, + "loss": 0.04815378, + "step": 16275 + }, + { + "epoch": 32.552, + "grad_norm": 1.2450306415557861, + "learning_rate": 2e-05, + "loss": 0.0305282, + "step": 16276 + }, + { + "epoch": 32.554, + "grad_norm": 1.2972357273101807, + "learning_rate": 2e-05, + "loss": 0.05288756, + "step": 16277 + }, + { + "epoch": 32.556, + "grad_norm": 1.880336880683899, + "learning_rate": 2e-05, + "loss": 0.02928368, + "step": 16278 + }, + { + "epoch": 32.558, + "grad_norm": 1.2293763160705566, + "learning_rate": 2e-05, + "loss": 0.03244419, + "step": 16279 + }, + { + "epoch": 32.56, + "grad_norm": 1.0479623079299927, + "learning_rate": 2e-05, + "loss": 0.03361559, + "step": 16280 + }, + { + "epoch": 32.562, + "grad_norm": 1.0732969045639038, + "learning_rate": 2e-05, + "loss": 0.0405861, + "step": 16281 + }, + { + "epoch": 32.564, + "grad_norm": 1.251217007637024, + "learning_rate": 2e-05, + "loss": 0.03988477, + "step": 16282 + }, + { + "epoch": 32.566, + "grad_norm": 1.0673967599868774, + "learning_rate": 2e-05, + "loss": 0.03976289, + "step": 16283 + }, + { + "epoch": 32.568, + "grad_norm": 1.1585348844528198, + "learning_rate": 2e-05, + "loss": 0.0360926, + 
"step": 16284 + }, + { + "epoch": 32.57, + "grad_norm": 1.051295518875122, + "learning_rate": 2e-05, + "loss": 0.03265924, + "step": 16285 + }, + { + "epoch": 32.572, + "grad_norm": 1.1959424018859863, + "learning_rate": 2e-05, + "loss": 0.03449826, + "step": 16286 + }, + { + "epoch": 32.574, + "grad_norm": 2.357743978500366, + "learning_rate": 2e-05, + "loss": 0.06374232, + "step": 16287 + }, + { + "epoch": 32.576, + "grad_norm": 1.252048373222351, + "learning_rate": 2e-05, + "loss": 0.04862808, + "step": 16288 + }, + { + "epoch": 32.578, + "grad_norm": 1.318522572517395, + "learning_rate": 2e-05, + "loss": 0.05090618, + "step": 16289 + }, + { + "epoch": 32.58, + "grad_norm": 1.414203405380249, + "learning_rate": 2e-05, + "loss": 0.03399177, + "step": 16290 + }, + { + "epoch": 32.582, + "grad_norm": 1.1485605239868164, + "learning_rate": 2e-05, + "loss": 0.03333712, + "step": 16291 + }, + { + "epoch": 32.584, + "grad_norm": 1.231711506843567, + "learning_rate": 2e-05, + "loss": 0.04081344, + "step": 16292 + }, + { + "epoch": 32.586, + "grad_norm": 1.4785326719284058, + "learning_rate": 2e-05, + "loss": 0.04804887, + "step": 16293 + }, + { + "epoch": 32.588, + "grad_norm": 1.113040566444397, + "learning_rate": 2e-05, + "loss": 0.04360687, + "step": 16294 + }, + { + "epoch": 32.59, + "grad_norm": 1.2656840085983276, + "learning_rate": 2e-05, + "loss": 0.03424615, + "step": 16295 + }, + { + "epoch": 32.592, + "grad_norm": 2.090235471725464, + "learning_rate": 2e-05, + "loss": 0.03176118, + "step": 16296 + }, + { + "epoch": 32.594, + "grad_norm": 1.2419369220733643, + "learning_rate": 2e-05, + "loss": 0.0560364, + "step": 16297 + }, + { + "epoch": 32.596, + "grad_norm": 1.8632920980453491, + "learning_rate": 2e-05, + "loss": 0.04524683, + "step": 16298 + }, + { + "epoch": 32.598, + "grad_norm": 1.5768665075302124, + "learning_rate": 2e-05, + "loss": 0.04588313, + "step": 16299 + }, + { + "epoch": 32.6, + "grad_norm": 2.376133918762207, + "learning_rate": 2e-05, + "loss": 0.06110063, + "step": 16300 + }, + { + "epoch": 32.602, + "grad_norm": 1.2238625288009644, + "learning_rate": 2e-05, + "loss": 0.04093513, + "step": 16301 + }, + { + "epoch": 32.604, + "grad_norm": 1.8445336818695068, + "learning_rate": 2e-05, + "loss": 0.03895678, + "step": 16302 + }, + { + "epoch": 32.606, + "grad_norm": 1.5113000869750977, + "learning_rate": 2e-05, + "loss": 0.04946197, + "step": 16303 + }, + { + "epoch": 32.608, + "grad_norm": 1.107370376586914, + "learning_rate": 2e-05, + "loss": 0.02631177, + "step": 16304 + }, + { + "epoch": 32.61, + "grad_norm": 1.319182276725769, + "learning_rate": 2e-05, + "loss": 0.03960687, + "step": 16305 + }, + { + "epoch": 32.612, + "grad_norm": 1.1253376007080078, + "learning_rate": 2e-05, + "loss": 0.03078615, + "step": 16306 + }, + { + "epoch": 32.614, + "grad_norm": 1.0863707065582275, + "learning_rate": 2e-05, + "loss": 0.03195829, + "step": 16307 + }, + { + "epoch": 32.616, + "grad_norm": 1.9176464080810547, + "learning_rate": 2e-05, + "loss": 0.04252298, + "step": 16308 + }, + { + "epoch": 32.618, + "grad_norm": 1.123355746269226, + "learning_rate": 2e-05, + "loss": 0.03697478, + "step": 16309 + }, + { + "epoch": 32.62, + "grad_norm": 3.388070583343506, + "learning_rate": 2e-05, + "loss": 0.06398418, + "step": 16310 + }, + { + "epoch": 32.622, + "grad_norm": 1.3932557106018066, + "learning_rate": 2e-05, + "loss": 0.05024934, + "step": 16311 + }, + { + "epoch": 32.624, + "grad_norm": 1.2493484020233154, + "learning_rate": 2e-05, + "loss": 0.04943375, + "step": 16312 + }, 
+ { + "epoch": 32.626, + "grad_norm": 1.1439610719680786, + "learning_rate": 2e-05, + "loss": 0.0385169, + "step": 16313 + }, + { + "epoch": 32.628, + "grad_norm": 2.4775614738464355, + "learning_rate": 2e-05, + "loss": 0.06101976, + "step": 16314 + }, + { + "epoch": 32.63, + "grad_norm": 2.373351573944092, + "learning_rate": 2e-05, + "loss": 0.04415338, + "step": 16315 + }, + { + "epoch": 32.632, + "grad_norm": 1.7569504976272583, + "learning_rate": 2e-05, + "loss": 0.04195081, + "step": 16316 + }, + { + "epoch": 32.634, + "grad_norm": 1.1292805671691895, + "learning_rate": 2e-05, + "loss": 0.035651, + "step": 16317 + }, + { + "epoch": 32.636, + "grad_norm": 1.251456618309021, + "learning_rate": 2e-05, + "loss": 0.04292186, + "step": 16318 + }, + { + "epoch": 32.638, + "grad_norm": 0.9629459977149963, + "learning_rate": 2e-05, + "loss": 0.02403329, + "step": 16319 + }, + { + "epoch": 32.64, + "grad_norm": 1.001502275466919, + "learning_rate": 2e-05, + "loss": 0.03040902, + "step": 16320 + }, + { + "epoch": 32.642, + "grad_norm": 1.3388837575912476, + "learning_rate": 2e-05, + "loss": 0.0465103, + "step": 16321 + }, + { + "epoch": 32.644, + "grad_norm": 1.0850396156311035, + "learning_rate": 2e-05, + "loss": 0.03716237, + "step": 16322 + }, + { + "epoch": 32.646, + "grad_norm": 3.381190776824951, + "learning_rate": 2e-05, + "loss": 0.06202536, + "step": 16323 + }, + { + "epoch": 32.648, + "grad_norm": 1.3530367612838745, + "learning_rate": 2e-05, + "loss": 0.03582821, + "step": 16324 + }, + { + "epoch": 32.65, + "grad_norm": 2.433734893798828, + "learning_rate": 2e-05, + "loss": 0.05068055, + "step": 16325 + }, + { + "epoch": 32.652, + "grad_norm": 1.7328464984893799, + "learning_rate": 2e-05, + "loss": 0.0590675, + "step": 16326 + }, + { + "epoch": 32.654, + "grad_norm": 1.1107189655303955, + "learning_rate": 2e-05, + "loss": 0.05116561, + "step": 16327 + }, + { + "epoch": 32.656, + "grad_norm": 1.2130955457687378, + "learning_rate": 2e-05, + "loss": 0.03328083, + "step": 16328 + }, + { + "epoch": 32.658, + "grad_norm": 1.7198522090911865, + "learning_rate": 2e-05, + "loss": 0.04785996, + "step": 16329 + }, + { + "epoch": 32.66, + "grad_norm": 1.5912532806396484, + "learning_rate": 2e-05, + "loss": 0.04132064, + "step": 16330 + }, + { + "epoch": 32.662, + "grad_norm": 1.930972933769226, + "learning_rate": 2e-05, + "loss": 0.03286422, + "step": 16331 + }, + { + "epoch": 32.664, + "grad_norm": 1.51142156124115, + "learning_rate": 2e-05, + "loss": 0.05630952, + "step": 16332 + }, + { + "epoch": 32.666, + "grad_norm": 1.2394816875457764, + "learning_rate": 2e-05, + "loss": 0.05155517, + "step": 16333 + }, + { + "epoch": 32.668, + "grad_norm": 2.041574716567993, + "learning_rate": 2e-05, + "loss": 0.04154414, + "step": 16334 + }, + { + "epoch": 32.67, + "grad_norm": 3.471482992172241, + "learning_rate": 2e-05, + "loss": 0.06497285, + "step": 16335 + }, + { + "epoch": 32.672, + "grad_norm": 1.1447969675064087, + "learning_rate": 2e-05, + "loss": 0.04535265, + "step": 16336 + }, + { + "epoch": 32.674, + "grad_norm": 1.678331732749939, + "learning_rate": 2e-05, + "loss": 0.0535218, + "step": 16337 + }, + { + "epoch": 32.676, + "grad_norm": 1.378172755241394, + "learning_rate": 2e-05, + "loss": 0.04844815, + "step": 16338 + }, + { + "epoch": 32.678, + "grad_norm": 0.9606616497039795, + "learning_rate": 2e-05, + "loss": 0.03191324, + "step": 16339 + }, + { + "epoch": 32.68, + "grad_norm": 1.0323848724365234, + "learning_rate": 2e-05, + "loss": 0.0283333, + "step": 16340 + }, + { + "epoch": 32.682, 
+ "grad_norm": 1.1071007251739502, + "learning_rate": 2e-05, + "loss": 0.03747611, + "step": 16341 + }, + { + "epoch": 32.684, + "grad_norm": 1.0263428688049316, + "learning_rate": 2e-05, + "loss": 0.03178697, + "step": 16342 + }, + { + "epoch": 32.686, + "grad_norm": 1.2423036098480225, + "learning_rate": 2e-05, + "loss": 0.03112799, + "step": 16343 + }, + { + "epoch": 32.688, + "grad_norm": 1.1887843608856201, + "learning_rate": 2e-05, + "loss": 0.04574025, + "step": 16344 + }, + { + "epoch": 32.69, + "grad_norm": 1.2458415031433105, + "learning_rate": 2e-05, + "loss": 0.04187585, + "step": 16345 + }, + { + "epoch": 32.692, + "grad_norm": 1.3959715366363525, + "learning_rate": 2e-05, + "loss": 0.0457012, + "step": 16346 + }, + { + "epoch": 32.694, + "grad_norm": 1.9177616834640503, + "learning_rate": 2e-05, + "loss": 0.03737821, + "step": 16347 + }, + { + "epoch": 32.696, + "grad_norm": 1.9234297275543213, + "learning_rate": 2e-05, + "loss": 0.05901012, + "step": 16348 + }, + { + "epoch": 32.698, + "grad_norm": 2.6709837913513184, + "learning_rate": 2e-05, + "loss": 0.05703682, + "step": 16349 + }, + { + "epoch": 32.7, + "grad_norm": 1.9380576610565186, + "learning_rate": 2e-05, + "loss": 0.05842331, + "step": 16350 + }, + { + "epoch": 32.702, + "grad_norm": 1.0618531703948975, + "learning_rate": 2e-05, + "loss": 0.03581022, + "step": 16351 + }, + { + "epoch": 32.704, + "grad_norm": 1.2853940725326538, + "learning_rate": 2e-05, + "loss": 0.04471534, + "step": 16352 + }, + { + "epoch": 32.706, + "grad_norm": 1.9387348890304565, + "learning_rate": 2e-05, + "loss": 0.04033763, + "step": 16353 + }, + { + "epoch": 32.708, + "grad_norm": 1.651381015777588, + "learning_rate": 2e-05, + "loss": 0.04997224, + "step": 16354 + }, + { + "epoch": 32.71, + "grad_norm": 1.1719062328338623, + "learning_rate": 2e-05, + "loss": 0.04294977, + "step": 16355 + }, + { + "epoch": 32.712, + "grad_norm": 1.1827785968780518, + "learning_rate": 2e-05, + "loss": 0.06157464, + "step": 16356 + }, + { + "epoch": 32.714, + "grad_norm": 2.0270729064941406, + "learning_rate": 2e-05, + "loss": 0.04811145, + "step": 16357 + }, + { + "epoch": 32.716, + "grad_norm": 2.081660032272339, + "learning_rate": 2e-05, + "loss": 0.03992213, + "step": 16358 + }, + { + "epoch": 32.718, + "grad_norm": 1.2889872789382935, + "learning_rate": 2e-05, + "loss": 0.05166517, + "step": 16359 + }, + { + "epoch": 32.72, + "grad_norm": 1.5523995161056519, + "learning_rate": 2e-05, + "loss": 0.04683542, + "step": 16360 + }, + { + "epoch": 32.722, + "grad_norm": 1.2523963451385498, + "learning_rate": 2e-05, + "loss": 0.03490978, + "step": 16361 + }, + { + "epoch": 32.724, + "grad_norm": 1.1765249967575073, + "learning_rate": 2e-05, + "loss": 0.04525149, + "step": 16362 + }, + { + "epoch": 32.726, + "grad_norm": 2.704094409942627, + "learning_rate": 2e-05, + "loss": 0.05494591, + "step": 16363 + }, + { + "epoch": 32.728, + "grad_norm": 1.1506249904632568, + "learning_rate": 2e-05, + "loss": 0.04181469, + "step": 16364 + }, + { + "epoch": 32.73, + "grad_norm": 1.0797075033187866, + "learning_rate": 2e-05, + "loss": 0.03218354, + "step": 16365 + }, + { + "epoch": 32.732, + "grad_norm": 1.3011102676391602, + "learning_rate": 2e-05, + "loss": 0.03808787, + "step": 16366 + }, + { + "epoch": 32.734, + "grad_norm": 1.300958514213562, + "learning_rate": 2e-05, + "loss": 0.0296345, + "step": 16367 + }, + { + "epoch": 32.736, + "grad_norm": 1.4178552627563477, + "learning_rate": 2e-05, + "loss": 0.05248306, + "step": 16368 + }, + { + "epoch": 32.738, + 
"grad_norm": 1.0817713737487793, + "learning_rate": 2e-05, + "loss": 0.03825011, + "step": 16369 + }, + { + "epoch": 32.74, + "grad_norm": 1.0695898532867432, + "learning_rate": 2e-05, + "loss": 0.04175761, + "step": 16370 + }, + { + "epoch": 32.742, + "grad_norm": 2.968357801437378, + "learning_rate": 2e-05, + "loss": 0.05305419, + "step": 16371 + }, + { + "epoch": 32.744, + "grad_norm": 1.0489188432693481, + "learning_rate": 2e-05, + "loss": 0.03766435, + "step": 16372 + }, + { + "epoch": 32.746, + "grad_norm": 0.9474694132804871, + "learning_rate": 2e-05, + "loss": 0.03348034, + "step": 16373 + }, + { + "epoch": 32.748, + "grad_norm": 0.9930300116539001, + "learning_rate": 2e-05, + "loss": 0.03745917, + "step": 16374 + }, + { + "epoch": 32.75, + "grad_norm": 1.0736403465270996, + "learning_rate": 2e-05, + "loss": 0.0384927, + "step": 16375 + }, + { + "epoch": 32.752, + "grad_norm": 1.2080556154251099, + "learning_rate": 2e-05, + "loss": 0.05006144, + "step": 16376 + }, + { + "epoch": 32.754, + "grad_norm": 0.9601371884346008, + "learning_rate": 2e-05, + "loss": 0.03716236, + "step": 16377 + }, + { + "epoch": 32.756, + "grad_norm": 1.128725528717041, + "learning_rate": 2e-05, + "loss": 0.04063039, + "step": 16378 + }, + { + "epoch": 32.758, + "grad_norm": 1.2183740139007568, + "learning_rate": 2e-05, + "loss": 0.05256324, + "step": 16379 + }, + { + "epoch": 32.76, + "grad_norm": 3.258612632751465, + "learning_rate": 2e-05, + "loss": 0.04376873, + "step": 16380 + }, + { + "epoch": 32.762, + "grad_norm": 1.1058506965637207, + "learning_rate": 2e-05, + "loss": 0.04281379, + "step": 16381 + }, + { + "epoch": 32.764, + "grad_norm": 1.1320183277130127, + "learning_rate": 2e-05, + "loss": 0.04657583, + "step": 16382 + }, + { + "epoch": 32.766, + "grad_norm": 1.4201374053955078, + "learning_rate": 2e-05, + "loss": 0.04276565, + "step": 16383 + }, + { + "epoch": 32.768, + "grad_norm": 1.0543874502182007, + "learning_rate": 2e-05, + "loss": 0.04018901, + "step": 16384 + }, + { + "epoch": 32.77, + "grad_norm": 1.4601715803146362, + "learning_rate": 2e-05, + "loss": 0.0541468, + "step": 16385 + }, + { + "epoch": 32.772, + "grad_norm": 2.787783145904541, + "learning_rate": 2e-05, + "loss": 0.0562762, + "step": 16386 + }, + { + "epoch": 32.774, + "grad_norm": 1.4297536611557007, + "learning_rate": 2e-05, + "loss": 0.04877428, + "step": 16387 + }, + { + "epoch": 32.776, + "grad_norm": 1.5178031921386719, + "learning_rate": 2e-05, + "loss": 0.03471943, + "step": 16388 + }, + { + "epoch": 32.778, + "grad_norm": 1.089928388595581, + "learning_rate": 2e-05, + "loss": 0.04103519, + "step": 16389 + }, + { + "epoch": 32.78, + "grad_norm": 0.9991024136543274, + "learning_rate": 2e-05, + "loss": 0.03855386, + "step": 16390 + }, + { + "epoch": 32.782, + "grad_norm": 1.0029306411743164, + "learning_rate": 2e-05, + "loss": 0.04183157, + "step": 16391 + }, + { + "epoch": 32.784, + "grad_norm": 1.2355519533157349, + "learning_rate": 2e-05, + "loss": 0.05071557, + "step": 16392 + }, + { + "epoch": 32.786, + "grad_norm": 0.9400306940078735, + "learning_rate": 2e-05, + "loss": 0.03195136, + "step": 16393 + }, + { + "epoch": 32.788, + "grad_norm": 1.4382003545761108, + "learning_rate": 2e-05, + "loss": 0.04054421, + "step": 16394 + }, + { + "epoch": 32.79, + "grad_norm": 1.1275018453598022, + "learning_rate": 2e-05, + "loss": 0.03660734, + "step": 16395 + }, + { + "epoch": 32.792, + "grad_norm": 1.1579735279083252, + "learning_rate": 2e-05, + "loss": 0.03553727, + "step": 16396 + }, + { + "epoch": 32.794, + "grad_norm": 
1.317656397819519, + "learning_rate": 2e-05, + "loss": 0.03424529, + "step": 16397 + }, + { + "epoch": 32.796, + "grad_norm": 2.1389896869659424, + "learning_rate": 2e-05, + "loss": 0.04343967, + "step": 16398 + }, + { + "epoch": 32.798, + "grad_norm": 1.9410218000411987, + "learning_rate": 2e-05, + "loss": 0.05163146, + "step": 16399 + }, + { + "epoch": 32.8, + "grad_norm": 1.146166205406189, + "learning_rate": 2e-05, + "loss": 0.03072646, + "step": 16400 + }, + { + "epoch": 32.802, + "grad_norm": 1.333733081817627, + "learning_rate": 2e-05, + "loss": 0.0516637, + "step": 16401 + }, + { + "epoch": 32.804, + "grad_norm": 1.0944757461547852, + "learning_rate": 2e-05, + "loss": 0.03826744, + "step": 16402 + }, + { + "epoch": 32.806, + "grad_norm": 1.241289734840393, + "learning_rate": 2e-05, + "loss": 0.04015851, + "step": 16403 + }, + { + "epoch": 32.808, + "grad_norm": 1.1392414569854736, + "learning_rate": 2e-05, + "loss": 0.04946699, + "step": 16404 + }, + { + "epoch": 32.81, + "grad_norm": 0.9604151844978333, + "learning_rate": 2e-05, + "loss": 0.02486083, + "step": 16405 + }, + { + "epoch": 32.812, + "grad_norm": 1.4133819341659546, + "learning_rate": 2e-05, + "loss": 0.03912335, + "step": 16406 + }, + { + "epoch": 32.814, + "grad_norm": 3.016547441482544, + "learning_rate": 2e-05, + "loss": 0.05131318, + "step": 16407 + }, + { + "epoch": 32.816, + "grad_norm": 1.6161396503448486, + "learning_rate": 2e-05, + "loss": 0.05032074, + "step": 16408 + }, + { + "epoch": 32.818, + "grad_norm": 1.1903536319732666, + "learning_rate": 2e-05, + "loss": 0.0486296, + "step": 16409 + }, + { + "epoch": 32.82, + "grad_norm": 1.2930749654769897, + "learning_rate": 2e-05, + "loss": 0.04178453, + "step": 16410 + }, + { + "epoch": 32.822, + "grad_norm": 0.9721649885177612, + "learning_rate": 2e-05, + "loss": 0.03203877, + "step": 16411 + }, + { + "epoch": 32.824, + "grad_norm": 1.4896728992462158, + "learning_rate": 2e-05, + "loss": 0.0417724, + "step": 16412 + }, + { + "epoch": 32.826, + "grad_norm": 0.9889609217643738, + "learning_rate": 2e-05, + "loss": 0.03722448, + "step": 16413 + }, + { + "epoch": 32.828, + "grad_norm": 1.1123636960983276, + "learning_rate": 2e-05, + "loss": 0.04222369, + "step": 16414 + }, + { + "epoch": 32.83, + "grad_norm": 1.0436445474624634, + "learning_rate": 2e-05, + "loss": 0.03794225, + "step": 16415 + }, + { + "epoch": 32.832, + "grad_norm": 1.930905818939209, + "learning_rate": 2e-05, + "loss": 0.04170159, + "step": 16416 + }, + { + "epoch": 32.834, + "grad_norm": 1.0021107196807861, + "learning_rate": 2e-05, + "loss": 0.03140467, + "step": 16417 + }, + { + "epoch": 32.836, + "grad_norm": 1.0246645212173462, + "learning_rate": 2e-05, + "loss": 0.03313623, + "step": 16418 + }, + { + "epoch": 32.838, + "grad_norm": 1.666858196258545, + "learning_rate": 2e-05, + "loss": 0.04175393, + "step": 16419 + }, + { + "epoch": 32.84, + "grad_norm": 1.1152204275131226, + "learning_rate": 2e-05, + "loss": 0.02505697, + "step": 16420 + }, + { + "epoch": 32.842, + "grad_norm": 0.992870032787323, + "learning_rate": 2e-05, + "loss": 0.03373595, + "step": 16421 + }, + { + "epoch": 32.844, + "grad_norm": 1.0322585105895996, + "learning_rate": 2e-05, + "loss": 0.03882974, + "step": 16422 + }, + { + "epoch": 32.846, + "grad_norm": 2.256075620651245, + "learning_rate": 2e-05, + "loss": 0.05817116, + "step": 16423 + }, + { + "epoch": 32.848, + "grad_norm": 5.627259254455566, + "learning_rate": 2e-05, + "loss": 0.06864762, + "step": 16424 + }, + { + "epoch": 32.85, + "grad_norm": 
1.0085660219192505, + "learning_rate": 2e-05, + "loss": 0.04046705, + "step": 16425 + }, + { + "epoch": 32.852, + "grad_norm": 1.0974931716918945, + "learning_rate": 2e-05, + "loss": 0.03791347, + "step": 16426 + }, + { + "epoch": 32.854, + "grad_norm": 1.2425390481948853, + "learning_rate": 2e-05, + "loss": 0.03167582, + "step": 16427 + }, + { + "epoch": 32.856, + "grad_norm": 1.2744793891906738, + "learning_rate": 2e-05, + "loss": 0.04943582, + "step": 16428 + }, + { + "epoch": 32.858, + "grad_norm": 0.9825481176376343, + "learning_rate": 2e-05, + "loss": 0.02305823, + "step": 16429 + }, + { + "epoch": 32.86, + "grad_norm": 1.1719574928283691, + "learning_rate": 2e-05, + "loss": 0.04331547, + "step": 16430 + }, + { + "epoch": 32.862, + "grad_norm": 1.6855008602142334, + "learning_rate": 2e-05, + "loss": 0.04401325, + "step": 16431 + }, + { + "epoch": 32.864, + "grad_norm": 1.4206801652908325, + "learning_rate": 2e-05, + "loss": 0.04856441, + "step": 16432 + }, + { + "epoch": 32.866, + "grad_norm": 1.7253879308700562, + "learning_rate": 2e-05, + "loss": 0.03633871, + "step": 16433 + }, + { + "epoch": 32.868, + "grad_norm": 1.3289132118225098, + "learning_rate": 2e-05, + "loss": 0.03001435, + "step": 16434 + }, + { + "epoch": 32.87, + "grad_norm": 1.1799901723861694, + "learning_rate": 2e-05, + "loss": 0.03081987, + "step": 16435 + }, + { + "epoch": 32.872, + "grad_norm": 1.9092236757278442, + "learning_rate": 2e-05, + "loss": 0.03654516, + "step": 16436 + }, + { + "epoch": 32.874, + "grad_norm": 1.1453834772109985, + "learning_rate": 2e-05, + "loss": 0.04329348, + "step": 16437 + }, + { + "epoch": 32.876, + "grad_norm": 1.3431400060653687, + "learning_rate": 2e-05, + "loss": 0.04679033, + "step": 16438 + }, + { + "epoch": 32.878, + "grad_norm": 1.5955402851104736, + "learning_rate": 2e-05, + "loss": 0.07649103, + "step": 16439 + }, + { + "epoch": 32.88, + "grad_norm": 1.485243797302246, + "learning_rate": 2e-05, + "loss": 0.03790029, + "step": 16440 + }, + { + "epoch": 32.882, + "grad_norm": 1.3634883165359497, + "learning_rate": 2e-05, + "loss": 0.06115822, + "step": 16441 + }, + { + "epoch": 32.884, + "grad_norm": 1.1459612846374512, + "learning_rate": 2e-05, + "loss": 0.04496354, + "step": 16442 + }, + { + "epoch": 32.886, + "grad_norm": 1.2098444700241089, + "learning_rate": 2e-05, + "loss": 0.03849549, + "step": 16443 + }, + { + "epoch": 32.888, + "grad_norm": 1.6543594598770142, + "learning_rate": 2e-05, + "loss": 0.05749209, + "step": 16444 + }, + { + "epoch": 32.89, + "grad_norm": 1.1916124820709229, + "learning_rate": 2e-05, + "loss": 0.02707171, + "step": 16445 + }, + { + "epoch": 32.892, + "grad_norm": 1.328437328338623, + "learning_rate": 2e-05, + "loss": 0.04334527, + "step": 16446 + }, + { + "epoch": 32.894, + "grad_norm": 1.472269058227539, + "learning_rate": 2e-05, + "loss": 0.02811809, + "step": 16447 + }, + { + "epoch": 32.896, + "grad_norm": 1.1660832166671753, + "learning_rate": 2e-05, + "loss": 0.04044122, + "step": 16448 + }, + { + "epoch": 32.898, + "grad_norm": 4.141280174255371, + "learning_rate": 2e-05, + "loss": 0.04939477, + "step": 16449 + }, + { + "epoch": 32.9, + "grad_norm": 1.0704823732376099, + "learning_rate": 2e-05, + "loss": 0.03728681, + "step": 16450 + }, + { + "epoch": 32.902, + "grad_norm": 1.1379095315933228, + "learning_rate": 2e-05, + "loss": 0.04447804, + "step": 16451 + }, + { + "epoch": 32.904, + "grad_norm": 1.5318787097930908, + "learning_rate": 2e-05, + "loss": 0.03792928, + "step": 16452 + }, + { + "epoch": 32.906, + "grad_norm": 
1.7072391510009766, + "learning_rate": 2e-05, + "loss": 0.03659452, + "step": 16453 + }, + { + "epoch": 32.908, + "grad_norm": 1.2457821369171143, + "learning_rate": 2e-05, + "loss": 0.03751954, + "step": 16454 + }, + { + "epoch": 32.91, + "grad_norm": 1.167049527168274, + "learning_rate": 2e-05, + "loss": 0.03698363, + "step": 16455 + }, + { + "epoch": 32.912, + "grad_norm": 1.0011647939682007, + "learning_rate": 2e-05, + "loss": 0.03130367, + "step": 16456 + }, + { + "epoch": 32.914, + "grad_norm": 0.9641328454017639, + "learning_rate": 2e-05, + "loss": 0.03359494, + "step": 16457 + }, + { + "epoch": 32.916, + "grad_norm": 1.1731847524642944, + "learning_rate": 2e-05, + "loss": 0.02458394, + "step": 16458 + }, + { + "epoch": 32.918, + "grad_norm": 1.1044111251831055, + "learning_rate": 2e-05, + "loss": 0.04152217, + "step": 16459 + }, + { + "epoch": 32.92, + "grad_norm": 1.6179026365280151, + "learning_rate": 2e-05, + "loss": 0.03974091, + "step": 16460 + }, + { + "epoch": 32.922, + "grad_norm": 1.6373225450515747, + "learning_rate": 2e-05, + "loss": 0.05506727, + "step": 16461 + }, + { + "epoch": 32.924, + "grad_norm": 1.1272046566009521, + "learning_rate": 2e-05, + "loss": 0.03284402, + "step": 16462 + }, + { + "epoch": 32.926, + "grad_norm": 1.5268757343292236, + "learning_rate": 2e-05, + "loss": 0.05714603, + "step": 16463 + }, + { + "epoch": 32.928, + "grad_norm": 1.1075844764709473, + "learning_rate": 2e-05, + "loss": 0.03451537, + "step": 16464 + }, + { + "epoch": 32.93, + "grad_norm": 1.25146484375, + "learning_rate": 2e-05, + "loss": 0.05552392, + "step": 16465 + }, + { + "epoch": 32.932, + "grad_norm": 1.9479658603668213, + "learning_rate": 2e-05, + "loss": 0.06264338, + "step": 16466 + }, + { + "epoch": 32.934, + "grad_norm": 1.2056105136871338, + "learning_rate": 2e-05, + "loss": 0.05005362, + "step": 16467 + }, + { + "epoch": 32.936, + "grad_norm": 1.3711949586868286, + "learning_rate": 2e-05, + "loss": 0.04337479, + "step": 16468 + }, + { + "epoch": 32.938, + "grad_norm": 1.2532182931900024, + "learning_rate": 2e-05, + "loss": 0.04488462, + "step": 16469 + }, + { + "epoch": 32.94, + "grad_norm": 1.350342035293579, + "learning_rate": 2e-05, + "loss": 0.04863258, + "step": 16470 + }, + { + "epoch": 32.942, + "grad_norm": 1.5152119398117065, + "learning_rate": 2e-05, + "loss": 0.03797042, + "step": 16471 + }, + { + "epoch": 32.944, + "grad_norm": 1.1705118417739868, + "learning_rate": 2e-05, + "loss": 0.04442852, + "step": 16472 + }, + { + "epoch": 32.946, + "grad_norm": 1.4079266786575317, + "learning_rate": 2e-05, + "loss": 0.0416984, + "step": 16473 + }, + { + "epoch": 32.948, + "grad_norm": 1.1473650932312012, + "learning_rate": 2e-05, + "loss": 0.0417373, + "step": 16474 + }, + { + "epoch": 32.95, + "grad_norm": 1.14018976688385, + "learning_rate": 2e-05, + "loss": 0.03788053, + "step": 16475 + }, + { + "epoch": 32.952, + "grad_norm": 1.0621168613433838, + "learning_rate": 2e-05, + "loss": 0.02839708, + "step": 16476 + }, + { + "epoch": 32.954, + "grad_norm": 1.2420809268951416, + "learning_rate": 2e-05, + "loss": 0.04673456, + "step": 16477 + }, + { + "epoch": 32.956, + "grad_norm": 1.4332877397537231, + "learning_rate": 2e-05, + "loss": 0.05584753, + "step": 16478 + }, + { + "epoch": 32.958, + "grad_norm": 0.9597718119621277, + "learning_rate": 2e-05, + "loss": 0.03618815, + "step": 16479 + }, + { + "epoch": 32.96, + "grad_norm": 1.8332748413085938, + "learning_rate": 2e-05, + "loss": 0.04130953, + "step": 16480 + }, + { + "epoch": 32.962, + "grad_norm": 
3.7896127700805664, + "learning_rate": 2e-05, + "loss": 0.048007, + "step": 16481 + }, + { + "epoch": 32.964, + "grad_norm": 1.2358813285827637, + "learning_rate": 2e-05, + "loss": 0.04505859, + "step": 16482 + }, + { + "epoch": 32.966, + "grad_norm": 1.2916022539138794, + "learning_rate": 2e-05, + "loss": 0.02319424, + "step": 16483 + }, + { + "epoch": 32.968, + "grad_norm": 1.1144905090332031, + "learning_rate": 2e-05, + "loss": 0.04152855, + "step": 16484 + }, + { + "epoch": 32.97, + "grad_norm": 1.2684904336929321, + "learning_rate": 2e-05, + "loss": 0.04438142, + "step": 16485 + }, + { + "epoch": 32.972, + "grad_norm": 2.002365827560425, + "learning_rate": 2e-05, + "loss": 0.06185231, + "step": 16486 + }, + { + "epoch": 32.974, + "grad_norm": 1.3114463090896606, + "learning_rate": 2e-05, + "loss": 0.0507057, + "step": 16487 + }, + { + "epoch": 32.976, + "grad_norm": 0.9986602663993835, + "learning_rate": 2e-05, + "loss": 0.04370682, + "step": 16488 + }, + { + "epoch": 32.978, + "grad_norm": 0.9797910451889038, + "learning_rate": 2e-05, + "loss": 0.03410993, + "step": 16489 + }, + { + "epoch": 32.98, + "grad_norm": 1.8880815505981445, + "learning_rate": 2e-05, + "loss": 0.050763, + "step": 16490 + }, + { + "epoch": 32.982, + "grad_norm": 1.241797685623169, + "learning_rate": 2e-05, + "loss": 0.05129294, + "step": 16491 + }, + { + "epoch": 32.984, + "grad_norm": 1.9726879596710205, + "learning_rate": 2e-05, + "loss": 0.06170304, + "step": 16492 + }, + { + "epoch": 32.986, + "grad_norm": 1.1492379903793335, + "learning_rate": 2e-05, + "loss": 0.05158865, + "step": 16493 + }, + { + "epoch": 32.988, + "grad_norm": 1.4059292078018188, + "learning_rate": 2e-05, + "loss": 0.04178857, + "step": 16494 + }, + { + "epoch": 32.99, + "grad_norm": 1.3059656620025635, + "learning_rate": 2e-05, + "loss": 0.03553785, + "step": 16495 + }, + { + "epoch": 32.992, + "grad_norm": 1.3720580339431763, + "learning_rate": 2e-05, + "loss": 0.04478185, + "step": 16496 + }, + { + "epoch": 32.994, + "grad_norm": 1.007232427597046, + "learning_rate": 2e-05, + "loss": 0.03115787, + "step": 16497 + }, + { + "epoch": 32.996, + "grad_norm": 1.1226294040679932, + "learning_rate": 2e-05, + "loss": 0.05025791, + "step": 16498 + }, + { + "epoch": 32.998, + "grad_norm": 1.0878342390060425, + "learning_rate": 2e-05, + "loss": 0.04733447, + "step": 16499 + }, + { + "epoch": 33.0, + "grad_norm": 1.013754963874817, + "learning_rate": 2e-05, + "loss": 0.03848787, + "step": 16500 + }, + { + "epoch": 33.0, + "eval_performance": { + "AngleClassification_1": 0.998, + "AngleClassification_2": 1.0, + "AngleClassification_3": 0.9720558882235529, + "Equal_1": 1.0, + "Equal_2": 0.9800399201596807, + "Equal_3": 0.9840319361277445, + "LineComparison_1": 1.0, + "LineComparison_2": 1.0, + "LineComparison_3": 0.9960079840319361, + "Parallel_1": 0.9859719438877755, + "Parallel_2": 0.9939879759519038, + "Parallel_3": 0.992, + "Perpendicular_1": 0.996, + "Perpendicular_2": 0.988, + "Perpendicular_3": 0.9138276553106213, + "PointLiesOnCircle_1": 1.0, + "PointLiesOnCircle_2": 0.9996666666666667, + "PointLiesOnCircle_3": 0.992, + "PointLiesOnLine_1": 0.9979959919839679, + "PointLiesOnLine_2": 0.9959919839679359, + "PointLiesOnLine_3": 0.9920159680638723 + }, + "eval_runtime": 320.8236, + "eval_samples_per_second": 32.728, + "eval_steps_per_second": 0.655, + "step": 16500 + }, + { + "epoch": 33.002, + "grad_norm": 1.0145152807235718, + "learning_rate": 2e-05, + "loss": 0.03352314, + "step": 16501 + }, + { + "epoch": 33.004, + "grad_norm": 
2.6528162956237793, + "learning_rate": 2e-05, + "loss": 0.04622073, + "step": 16502 + }, + { + "epoch": 33.006, + "grad_norm": 1.5733745098114014, + "learning_rate": 2e-05, + "loss": 0.04280098, + "step": 16503 + }, + { + "epoch": 33.008, + "grad_norm": 2.088067054748535, + "learning_rate": 2e-05, + "loss": 0.04422682, + "step": 16504 + }, + { + "epoch": 33.01, + "grad_norm": 0.7775627970695496, + "learning_rate": 2e-05, + "loss": 0.01831872, + "step": 16505 + }, + { + "epoch": 33.012, + "grad_norm": 0.9788329601287842, + "learning_rate": 2e-05, + "loss": 0.03603578, + "step": 16506 + }, + { + "epoch": 33.014, + "grad_norm": 1.1010586023330688, + "learning_rate": 2e-05, + "loss": 0.03720935, + "step": 16507 + }, + { + "epoch": 33.016, + "grad_norm": 1.1118940114974976, + "learning_rate": 2e-05, + "loss": 0.04310315, + "step": 16508 + }, + { + "epoch": 33.018, + "grad_norm": 1.3882803916931152, + "learning_rate": 2e-05, + "loss": 0.04679479, + "step": 16509 + }, + { + "epoch": 33.02, + "grad_norm": 1.1422741413116455, + "learning_rate": 2e-05, + "loss": 0.03853836, + "step": 16510 + }, + { + "epoch": 33.022, + "grad_norm": 0.9372391104698181, + "learning_rate": 2e-05, + "loss": 0.0216747, + "step": 16511 + }, + { + "epoch": 33.024, + "grad_norm": 1.2110828161239624, + "learning_rate": 2e-05, + "loss": 0.04518433, + "step": 16512 + }, + { + "epoch": 33.026, + "grad_norm": 1.14997136592865, + "learning_rate": 2e-05, + "loss": 0.04907893, + "step": 16513 + }, + { + "epoch": 33.028, + "grad_norm": 1.4301350116729736, + "learning_rate": 2e-05, + "loss": 0.05764885, + "step": 16514 + }, + { + "epoch": 33.03, + "grad_norm": 1.0221556425094604, + "learning_rate": 2e-05, + "loss": 0.04257384, + "step": 16515 + }, + { + "epoch": 33.032, + "grad_norm": 1.0143057107925415, + "learning_rate": 2e-05, + "loss": 0.03372315, + "step": 16516 + }, + { + "epoch": 33.034, + "grad_norm": 1.1732292175292969, + "learning_rate": 2e-05, + "loss": 0.04997346, + "step": 16517 + }, + { + "epoch": 33.036, + "grad_norm": 1.0122156143188477, + "learning_rate": 2e-05, + "loss": 0.03192022, + "step": 16518 + }, + { + "epoch": 33.038, + "grad_norm": 1.9777421951293945, + "learning_rate": 2e-05, + "loss": 0.03694803, + "step": 16519 + }, + { + "epoch": 33.04, + "grad_norm": 1.1994304656982422, + "learning_rate": 2e-05, + "loss": 0.05166928, + "step": 16520 + }, + { + "epoch": 33.042, + "grad_norm": 0.8912333250045776, + "learning_rate": 2e-05, + "loss": 0.03158681, + "step": 16521 + }, + { + "epoch": 33.044, + "grad_norm": 1.4108567237854004, + "learning_rate": 2e-05, + "loss": 0.03951647, + "step": 16522 + }, + { + "epoch": 33.046, + "grad_norm": 1.5426366329193115, + "learning_rate": 2e-05, + "loss": 0.05633399, + "step": 16523 + }, + { + "epoch": 33.048, + "grad_norm": 1.044606328010559, + "learning_rate": 2e-05, + "loss": 0.03324772, + "step": 16524 + }, + { + "epoch": 33.05, + "grad_norm": 2.6143667697906494, + "learning_rate": 2e-05, + "loss": 0.06454179, + "step": 16525 + }, + { + "epoch": 33.052, + "grad_norm": 1.6238820552825928, + "learning_rate": 2e-05, + "loss": 0.02863217, + "step": 16526 + }, + { + "epoch": 33.054, + "grad_norm": 1.1710536479949951, + "learning_rate": 2e-05, + "loss": 0.05588196, + "step": 16527 + }, + { + "epoch": 33.056, + "grad_norm": 1.5752817392349243, + "learning_rate": 2e-05, + "loss": 0.03416557, + "step": 16528 + }, + { + "epoch": 33.058, + "grad_norm": 1.6088008880615234, + "learning_rate": 2e-05, + "loss": 0.05284679, + "step": 16529 + }, + { + "epoch": 33.06, + "grad_norm": 
1.1662729978561401, + "learning_rate": 2e-05, + "loss": 0.03968053, + "step": 16530 + }, + { + "epoch": 33.062, + "grad_norm": 1.0963259935379028, + "learning_rate": 2e-05, + "loss": 0.04498053, + "step": 16531 + }, + { + "epoch": 33.064, + "grad_norm": 0.9817715287208557, + "learning_rate": 2e-05, + "loss": 0.03069879, + "step": 16532 + }, + { + "epoch": 33.066, + "grad_norm": 1.3074853420257568, + "learning_rate": 2e-05, + "loss": 0.06249923, + "step": 16533 + }, + { + "epoch": 33.068, + "grad_norm": 0.9941977262496948, + "learning_rate": 2e-05, + "loss": 0.03796024, + "step": 16534 + }, + { + "epoch": 33.07, + "grad_norm": 1.546793818473816, + "learning_rate": 2e-05, + "loss": 0.04255424, + "step": 16535 + }, + { + "epoch": 33.072, + "grad_norm": 1.2092461585998535, + "learning_rate": 2e-05, + "loss": 0.04925638, + "step": 16536 + }, + { + "epoch": 33.074, + "grad_norm": 1.3330050706863403, + "learning_rate": 2e-05, + "loss": 0.04654669, + "step": 16537 + }, + { + "epoch": 33.076, + "grad_norm": 1.539285659790039, + "learning_rate": 2e-05, + "loss": 0.04258667, + "step": 16538 + }, + { + "epoch": 33.078, + "grad_norm": 1.1344162225723267, + "learning_rate": 2e-05, + "loss": 0.03024994, + "step": 16539 + }, + { + "epoch": 33.08, + "grad_norm": 1.3139582872390747, + "learning_rate": 2e-05, + "loss": 0.06406873, + "step": 16540 + }, + { + "epoch": 33.082, + "grad_norm": 1.342620611190796, + "learning_rate": 2e-05, + "loss": 0.04298013, + "step": 16541 + }, + { + "epoch": 33.084, + "grad_norm": 1.1698533296585083, + "learning_rate": 2e-05, + "loss": 0.0356679, + "step": 16542 + }, + { + "epoch": 33.086, + "grad_norm": 1.3078049421310425, + "learning_rate": 2e-05, + "loss": 0.03047599, + "step": 16543 + }, + { + "epoch": 33.088, + "grad_norm": 1.1613730192184448, + "learning_rate": 2e-05, + "loss": 0.04012228, + "step": 16544 + }, + { + "epoch": 33.09, + "grad_norm": 1.5841779708862305, + "learning_rate": 2e-05, + "loss": 0.05739962, + "step": 16545 + }, + { + "epoch": 33.092, + "grad_norm": 1.175606608390808, + "learning_rate": 2e-05, + "loss": 0.0417013, + "step": 16546 + }, + { + "epoch": 33.094, + "grad_norm": 1.0316969156265259, + "learning_rate": 2e-05, + "loss": 0.02844536, + "step": 16547 + }, + { + "epoch": 33.096, + "grad_norm": 1.6557471752166748, + "learning_rate": 2e-05, + "loss": 0.04946776, + "step": 16548 + }, + { + "epoch": 33.098, + "grad_norm": 1.2517582178115845, + "learning_rate": 2e-05, + "loss": 0.03600208, + "step": 16549 + }, + { + "epoch": 33.1, + "grad_norm": 1.3024152517318726, + "learning_rate": 2e-05, + "loss": 0.0431541, + "step": 16550 + }, + { + "epoch": 33.102, + "grad_norm": 1.2932987213134766, + "learning_rate": 2e-05, + "loss": 0.04457521, + "step": 16551 + }, + { + "epoch": 33.104, + "grad_norm": 1.2561955451965332, + "learning_rate": 2e-05, + "loss": 0.05188843, + "step": 16552 + }, + { + "epoch": 33.106, + "grad_norm": 1.1888597011566162, + "learning_rate": 2e-05, + "loss": 0.03158218, + "step": 16553 + }, + { + "epoch": 33.108, + "grad_norm": 1.1802350282669067, + "learning_rate": 2e-05, + "loss": 0.04162861, + "step": 16554 + }, + { + "epoch": 33.11, + "grad_norm": 1.2707432508468628, + "learning_rate": 2e-05, + "loss": 0.0347266, + "step": 16555 + }, + { + "epoch": 33.112, + "grad_norm": 1.2336151599884033, + "learning_rate": 2e-05, + "loss": 0.03552887, + "step": 16556 + }, + { + "epoch": 33.114, + "grad_norm": 1.348934292793274, + "learning_rate": 2e-05, + "loss": 0.04676005, + "step": 16557 + }, + { + "epoch": 33.116, + "grad_norm": 
0.9855770468711853, + "learning_rate": 2e-05, + "loss": 0.02963222, + "step": 16558 + }, + { + "epoch": 33.118, + "grad_norm": 0.9198537468910217, + "learning_rate": 2e-05, + "loss": 0.03035306, + "step": 16559 + }, + { + "epoch": 33.12, + "grad_norm": 1.309364914894104, + "learning_rate": 2e-05, + "loss": 0.05264792, + "step": 16560 + }, + { + "epoch": 33.122, + "grad_norm": 1.5512858629226685, + "learning_rate": 2e-05, + "loss": 0.05176829, + "step": 16561 + }, + { + "epoch": 33.124, + "grad_norm": 1.2333922386169434, + "learning_rate": 2e-05, + "loss": 0.04524909, + "step": 16562 + }, + { + "epoch": 33.126, + "grad_norm": 1.252514362335205, + "learning_rate": 2e-05, + "loss": 0.0393767, + "step": 16563 + }, + { + "epoch": 33.128, + "grad_norm": 0.95073002576828, + "learning_rate": 2e-05, + "loss": 0.0251915, + "step": 16564 + }, + { + "epoch": 33.13, + "grad_norm": 2.1087992191314697, + "learning_rate": 2e-05, + "loss": 0.04168628, + "step": 16565 + }, + { + "epoch": 33.132, + "grad_norm": 1.073662519454956, + "learning_rate": 2e-05, + "loss": 0.03738192, + "step": 16566 + }, + { + "epoch": 33.134, + "grad_norm": 2.494189977645874, + "learning_rate": 2e-05, + "loss": 0.03791768, + "step": 16567 + }, + { + "epoch": 33.136, + "grad_norm": 2.3711354732513428, + "learning_rate": 2e-05, + "loss": 0.05274618, + "step": 16568 + }, + { + "epoch": 33.138, + "grad_norm": 1.1939685344696045, + "learning_rate": 2e-05, + "loss": 0.05177773, + "step": 16569 + }, + { + "epoch": 33.14, + "grad_norm": 1.0729763507843018, + "learning_rate": 2e-05, + "loss": 0.04366021, + "step": 16570 + }, + { + "epoch": 33.142, + "grad_norm": 1.9815735816955566, + "learning_rate": 2e-05, + "loss": 0.05311869, + "step": 16571 + }, + { + "epoch": 33.144, + "grad_norm": 2.0230648517608643, + "learning_rate": 2e-05, + "loss": 0.05889687, + "step": 16572 + }, + { + "epoch": 33.146, + "grad_norm": 0.9048420190811157, + "learning_rate": 2e-05, + "loss": 0.02796107, + "step": 16573 + }, + { + "epoch": 33.148, + "grad_norm": 1.2179429531097412, + "learning_rate": 2e-05, + "loss": 0.04486234, + "step": 16574 + }, + { + "epoch": 33.15, + "grad_norm": 1.1371427774429321, + "learning_rate": 2e-05, + "loss": 0.03180134, + "step": 16575 + }, + { + "epoch": 33.152, + "grad_norm": 1.2799369096755981, + "learning_rate": 2e-05, + "loss": 0.04838013, + "step": 16576 + }, + { + "epoch": 33.154, + "grad_norm": 1.4155060052871704, + "learning_rate": 2e-05, + "loss": 0.03349139, + "step": 16577 + }, + { + "epoch": 33.156, + "grad_norm": 1.009975790977478, + "learning_rate": 2e-05, + "loss": 0.02858535, + "step": 16578 + }, + { + "epoch": 33.158, + "grad_norm": 1.0067310333251953, + "learning_rate": 2e-05, + "loss": 0.02488973, + "step": 16579 + }, + { + "epoch": 33.16, + "grad_norm": 1.6880412101745605, + "learning_rate": 2e-05, + "loss": 0.04501157, + "step": 16580 + }, + { + "epoch": 33.162, + "grad_norm": 1.3325822353363037, + "learning_rate": 2e-05, + "loss": 0.0444175, + "step": 16581 + }, + { + "epoch": 33.164, + "grad_norm": 5.777720928192139, + "learning_rate": 2e-05, + "loss": 0.0530759, + "step": 16582 + }, + { + "epoch": 33.166, + "grad_norm": 2.4529478549957275, + "learning_rate": 2e-05, + "loss": 0.04354583, + "step": 16583 + }, + { + "epoch": 33.168, + "grad_norm": 1.3145490884780884, + "learning_rate": 2e-05, + "loss": 0.03570919, + "step": 16584 + }, + { + "epoch": 33.17, + "grad_norm": 1.2409156560897827, + "learning_rate": 2e-05, + "loss": 0.04724094, + "step": 16585 + }, + { + "epoch": 33.172, + "grad_norm": 
0.8115535974502563, + "learning_rate": 2e-05, + "loss": 0.03618132, + "step": 16586 + }, + { + "epoch": 33.174, + "grad_norm": 0.9847850799560547, + "learning_rate": 2e-05, + "loss": 0.03718961, + "step": 16587 + }, + { + "epoch": 33.176, + "grad_norm": 1.0901992321014404, + "learning_rate": 2e-05, + "loss": 0.03891124, + "step": 16588 + }, + { + "epoch": 33.178, + "grad_norm": 1.2606801986694336, + "learning_rate": 2e-05, + "loss": 0.04655527, + "step": 16589 + }, + { + "epoch": 33.18, + "grad_norm": 1.6437278985977173, + "learning_rate": 2e-05, + "loss": 0.04169775, + "step": 16590 + }, + { + "epoch": 33.182, + "grad_norm": 1.0829734802246094, + "learning_rate": 2e-05, + "loss": 0.03909619, + "step": 16591 + }, + { + "epoch": 33.184, + "grad_norm": 1.9588395357131958, + "learning_rate": 2e-05, + "loss": 0.05959759, + "step": 16592 + }, + { + "epoch": 33.186, + "grad_norm": 2.825683355331421, + "learning_rate": 2e-05, + "loss": 0.05063524, + "step": 16593 + }, + { + "epoch": 33.188, + "grad_norm": 1.0350728034973145, + "learning_rate": 2e-05, + "loss": 0.03315126, + "step": 16594 + }, + { + "epoch": 33.19, + "grad_norm": 2.2779905796051025, + "learning_rate": 2e-05, + "loss": 0.04792159, + "step": 16595 + }, + { + "epoch": 33.192, + "grad_norm": 1.0446611642837524, + "learning_rate": 2e-05, + "loss": 0.03796531, + "step": 16596 + }, + { + "epoch": 33.194, + "grad_norm": 1.1090408563613892, + "learning_rate": 2e-05, + "loss": 0.03750544, + "step": 16597 + }, + { + "epoch": 33.196, + "grad_norm": 1.0616990327835083, + "learning_rate": 2e-05, + "loss": 0.04366959, + "step": 16598 + }, + { + "epoch": 33.198, + "grad_norm": 1.0580826997756958, + "learning_rate": 2e-05, + "loss": 0.03785151, + "step": 16599 + }, + { + "epoch": 33.2, + "grad_norm": 1.757721185684204, + "learning_rate": 2e-05, + "loss": 0.04076345, + "step": 16600 + }, + { + "epoch": 33.202, + "grad_norm": 1.2572243213653564, + "learning_rate": 2e-05, + "loss": 0.04631349, + "step": 16601 + }, + { + "epoch": 33.204, + "grad_norm": 1.4636191129684448, + "learning_rate": 2e-05, + "loss": 0.04375532, + "step": 16602 + }, + { + "epoch": 33.206, + "grad_norm": 1.7812341451644897, + "learning_rate": 2e-05, + "loss": 0.03871331, + "step": 16603 + }, + { + "epoch": 33.208, + "grad_norm": 1.4072896242141724, + "learning_rate": 2e-05, + "loss": 0.05194937, + "step": 16604 + }, + { + "epoch": 33.21, + "grad_norm": 0.9747751951217651, + "learning_rate": 2e-05, + "loss": 0.02706435, + "step": 16605 + }, + { + "epoch": 33.212, + "grad_norm": 1.1808767318725586, + "learning_rate": 2e-05, + "loss": 0.03670924, + "step": 16606 + }, + { + "epoch": 33.214, + "grad_norm": 1.2573798894882202, + "learning_rate": 2e-05, + "loss": 0.03263208, + "step": 16607 + }, + { + "epoch": 33.216, + "grad_norm": 1.379146695137024, + "learning_rate": 2e-05, + "loss": 0.04529354, + "step": 16608 + }, + { + "epoch": 33.218, + "grad_norm": 1.177720308303833, + "learning_rate": 2e-05, + "loss": 0.03827986, + "step": 16609 + }, + { + "epoch": 33.22, + "grad_norm": 1.1560696363449097, + "learning_rate": 2e-05, + "loss": 0.04032604, + "step": 16610 + }, + { + "epoch": 33.222, + "grad_norm": 1.2086445093154907, + "learning_rate": 2e-05, + "loss": 0.04973095, + "step": 16611 + }, + { + "epoch": 33.224, + "grad_norm": 1.0575134754180908, + "learning_rate": 2e-05, + "loss": 0.03815421, + "step": 16612 + }, + { + "epoch": 33.226, + "grad_norm": 1.3156135082244873, + "learning_rate": 2e-05, + "loss": 0.0418507, + "step": 16613 + }, + { + "epoch": 33.228, + "grad_norm": 
1.3325525522232056, + "learning_rate": 2e-05, + "loss": 0.0365724, + "step": 16614 + }, + { + "epoch": 33.23, + "grad_norm": 1.4213200807571411, + "learning_rate": 2e-05, + "loss": 0.04568385, + "step": 16615 + }, + { + "epoch": 33.232, + "grad_norm": 1.8256237506866455, + "learning_rate": 2e-05, + "loss": 0.06385624, + "step": 16616 + }, + { + "epoch": 33.234, + "grad_norm": 2.2751426696777344, + "learning_rate": 2e-05, + "loss": 0.04987566, + "step": 16617 + }, + { + "epoch": 33.236, + "grad_norm": 0.9569184184074402, + "learning_rate": 2e-05, + "loss": 0.02542327, + "step": 16618 + }, + { + "epoch": 33.238, + "grad_norm": 1.0519928932189941, + "learning_rate": 2e-05, + "loss": 0.03581095, + "step": 16619 + }, + { + "epoch": 33.24, + "grad_norm": 1.1077507734298706, + "learning_rate": 2e-05, + "loss": 0.03992999, + "step": 16620 + }, + { + "epoch": 33.242, + "grad_norm": 1.6223715543746948, + "learning_rate": 2e-05, + "loss": 0.02992079, + "step": 16621 + }, + { + "epoch": 33.244, + "grad_norm": 1.5266954898834229, + "learning_rate": 2e-05, + "loss": 0.03266663, + "step": 16622 + }, + { + "epoch": 33.246, + "grad_norm": 1.642143964767456, + "learning_rate": 2e-05, + "loss": 0.05565649, + "step": 16623 + }, + { + "epoch": 33.248, + "grad_norm": 2.065688371658325, + "learning_rate": 2e-05, + "loss": 0.0459539, + "step": 16624 + }, + { + "epoch": 33.25, + "grad_norm": 1.1380778551101685, + "learning_rate": 2e-05, + "loss": 0.04274027, + "step": 16625 + }, + { + "epoch": 33.252, + "grad_norm": 1.23936927318573, + "learning_rate": 2e-05, + "loss": 0.05491698, + "step": 16626 + }, + { + "epoch": 33.254, + "grad_norm": 1.0805511474609375, + "learning_rate": 2e-05, + "loss": 0.04202371, + "step": 16627 + }, + { + "epoch": 33.256, + "grad_norm": 1.1573930978775024, + "learning_rate": 2e-05, + "loss": 0.04801682, + "step": 16628 + }, + { + "epoch": 33.258, + "grad_norm": 1.1957443952560425, + "learning_rate": 2e-05, + "loss": 0.02971683, + "step": 16629 + }, + { + "epoch": 33.26, + "grad_norm": 1.1596754789352417, + "learning_rate": 2e-05, + "loss": 0.03906877, + "step": 16630 + }, + { + "epoch": 33.262, + "grad_norm": 1.0122718811035156, + "learning_rate": 2e-05, + "loss": 0.0367621, + "step": 16631 + }, + { + "epoch": 33.264, + "grad_norm": 1.2783515453338623, + "learning_rate": 2e-05, + "loss": 0.06680563, + "step": 16632 + }, + { + "epoch": 33.266, + "grad_norm": 1.4091695547103882, + "learning_rate": 2e-05, + "loss": 0.05388515, + "step": 16633 + }, + { + "epoch": 33.268, + "grad_norm": 1.1046702861785889, + "learning_rate": 2e-05, + "loss": 0.04830401, + "step": 16634 + }, + { + "epoch": 33.27, + "grad_norm": 1.6029551029205322, + "learning_rate": 2e-05, + "loss": 0.04491821, + "step": 16635 + }, + { + "epoch": 33.272, + "grad_norm": 1.1932250261306763, + "learning_rate": 2e-05, + "loss": 0.04055645, + "step": 16636 + }, + { + "epoch": 33.274, + "grad_norm": 1.7215577363967896, + "learning_rate": 2e-05, + "loss": 0.04914788, + "step": 16637 + }, + { + "epoch": 33.276, + "grad_norm": 0.9664545059204102, + "learning_rate": 2e-05, + "loss": 0.04409215, + "step": 16638 + }, + { + "epoch": 33.278, + "grad_norm": 1.0649373531341553, + "learning_rate": 2e-05, + "loss": 0.04012241, + "step": 16639 + }, + { + "epoch": 33.28, + "grad_norm": 1.5425022840499878, + "learning_rate": 2e-05, + "loss": 0.0414533, + "step": 16640 + }, + { + "epoch": 33.282, + "grad_norm": 1.4415966272354126, + "learning_rate": 2e-05, + "loss": 0.05605222, + "step": 16641 + }, + { + "epoch": 33.284, + "grad_norm": 
1.9677499532699585, + "learning_rate": 2e-05, + "loss": 0.05646674, + "step": 16642 + }, + { + "epoch": 33.286, + "grad_norm": 1.0647591352462769, + "learning_rate": 2e-05, + "loss": 0.04393759, + "step": 16643 + }, + { + "epoch": 33.288, + "grad_norm": 0.9724460244178772, + "learning_rate": 2e-05, + "loss": 0.0201349, + "step": 16644 + }, + { + "epoch": 33.29, + "grad_norm": 1.3234846591949463, + "learning_rate": 2e-05, + "loss": 0.04536884, + "step": 16645 + }, + { + "epoch": 33.292, + "grad_norm": 1.315469741821289, + "learning_rate": 2e-05, + "loss": 0.03901994, + "step": 16646 + }, + { + "epoch": 33.294, + "grad_norm": 1.1740449666976929, + "learning_rate": 2e-05, + "loss": 0.04922675, + "step": 16647 + }, + { + "epoch": 33.296, + "grad_norm": 2.73299241065979, + "learning_rate": 2e-05, + "loss": 0.04908387, + "step": 16648 + }, + { + "epoch": 33.298, + "grad_norm": 1.0189504623413086, + "learning_rate": 2e-05, + "loss": 0.04484309, + "step": 16649 + }, + { + "epoch": 33.3, + "grad_norm": 2.3086318969726562, + "learning_rate": 2e-05, + "loss": 0.06764978, + "step": 16650 + }, + { + "epoch": 33.302, + "grad_norm": 0.8751531839370728, + "learning_rate": 2e-05, + "loss": 0.03591763, + "step": 16651 + }, + { + "epoch": 33.304, + "grad_norm": 1.2427144050598145, + "learning_rate": 2e-05, + "loss": 0.05099815, + "step": 16652 + }, + { + "epoch": 33.306, + "grad_norm": 1.8863247632980347, + "learning_rate": 2e-05, + "loss": 0.0360302, + "step": 16653 + }, + { + "epoch": 33.308, + "grad_norm": 1.1457570791244507, + "learning_rate": 2e-05, + "loss": 0.03447718, + "step": 16654 + }, + { + "epoch": 33.31, + "grad_norm": 1.273047685623169, + "learning_rate": 2e-05, + "loss": 0.03491887, + "step": 16655 + }, + { + "epoch": 33.312, + "grad_norm": 1.65647554397583, + "learning_rate": 2e-05, + "loss": 0.05843513, + "step": 16656 + }, + { + "epoch": 33.314, + "grad_norm": 1.1409825086593628, + "learning_rate": 2e-05, + "loss": 0.04555066, + "step": 16657 + }, + { + "epoch": 33.316, + "grad_norm": 3.0305325984954834, + "learning_rate": 2e-05, + "loss": 0.05683912, + "step": 16658 + }, + { + "epoch": 33.318, + "grad_norm": 2.7458958625793457, + "learning_rate": 2e-05, + "loss": 0.0510499, + "step": 16659 + }, + { + "epoch": 33.32, + "grad_norm": 2.914576768875122, + "learning_rate": 2e-05, + "loss": 0.04127868, + "step": 16660 + }, + { + "epoch": 33.322, + "grad_norm": 1.22411048412323, + "learning_rate": 2e-05, + "loss": 0.0514855, + "step": 16661 + }, + { + "epoch": 33.324, + "grad_norm": 1.5475612878799438, + "learning_rate": 2e-05, + "loss": 0.05710922, + "step": 16662 + }, + { + "epoch": 33.326, + "grad_norm": 1.0445101261138916, + "learning_rate": 2e-05, + "loss": 0.0416135, + "step": 16663 + }, + { + "epoch": 33.328, + "grad_norm": 1.168892502784729, + "learning_rate": 2e-05, + "loss": 0.02608304, + "step": 16664 + }, + { + "epoch": 33.33, + "grad_norm": 1.1032596826553345, + "learning_rate": 2e-05, + "loss": 0.03604908, + "step": 16665 + }, + { + "epoch": 33.332, + "grad_norm": 1.2727869749069214, + "learning_rate": 2e-05, + "loss": 0.06578045, + "step": 16666 + }, + { + "epoch": 33.334, + "grad_norm": 0.9599734544754028, + "learning_rate": 2e-05, + "loss": 0.03464708, + "step": 16667 + }, + { + "epoch": 33.336, + "grad_norm": 0.9870487451553345, + "learning_rate": 2e-05, + "loss": 0.04101057, + "step": 16668 + }, + { + "epoch": 33.338, + "grad_norm": 1.5237897634506226, + "learning_rate": 2e-05, + "loss": 0.06414343, + "step": 16669 + }, + { + "epoch": 33.34, + "grad_norm": 1.0481436252593994, 
+ "learning_rate": 2e-05, + "loss": 0.03079265, + "step": 16670 + }, + { + "epoch": 33.342, + "grad_norm": 1.1099966764450073, + "learning_rate": 2e-05, + "loss": 0.04527031, + "step": 16671 + }, + { + "epoch": 33.344, + "grad_norm": 1.3195089101791382, + "learning_rate": 2e-05, + "loss": 0.02745794, + "step": 16672 + }, + { + "epoch": 33.346, + "grad_norm": 0.8879894018173218, + "learning_rate": 2e-05, + "loss": 0.03119539, + "step": 16673 + }, + { + "epoch": 33.348, + "grad_norm": 1.2862367630004883, + "learning_rate": 2e-05, + "loss": 0.04255934, + "step": 16674 + }, + { + "epoch": 33.35, + "grad_norm": 1.123695731163025, + "learning_rate": 2e-05, + "loss": 0.04763044, + "step": 16675 + }, + { + "epoch": 33.352, + "grad_norm": 1.058069109916687, + "learning_rate": 2e-05, + "loss": 0.03856095, + "step": 16676 + }, + { + "epoch": 33.354, + "grad_norm": 1.1873466968536377, + "learning_rate": 2e-05, + "loss": 0.04556847, + "step": 16677 + }, + { + "epoch": 33.356, + "grad_norm": 1.5896145105361938, + "learning_rate": 2e-05, + "loss": 0.06114259, + "step": 16678 + }, + { + "epoch": 33.358, + "grad_norm": 1.1013940572738647, + "learning_rate": 2e-05, + "loss": 0.04544736, + "step": 16679 + }, + { + "epoch": 33.36, + "grad_norm": 1.1660057306289673, + "learning_rate": 2e-05, + "loss": 0.04236136, + "step": 16680 + }, + { + "epoch": 33.362, + "grad_norm": 1.0070010423660278, + "learning_rate": 2e-05, + "loss": 0.03095785, + "step": 16681 + }, + { + "epoch": 33.364, + "grad_norm": 1.2435543537139893, + "learning_rate": 2e-05, + "loss": 0.05413376, + "step": 16682 + }, + { + "epoch": 33.366, + "grad_norm": 1.076140284538269, + "learning_rate": 2e-05, + "loss": 0.0455126, + "step": 16683 + }, + { + "epoch": 33.368, + "grad_norm": 0.9917166829109192, + "learning_rate": 2e-05, + "loss": 0.03302848, + "step": 16684 + }, + { + "epoch": 33.37, + "grad_norm": 1.7563289403915405, + "learning_rate": 2e-05, + "loss": 0.03998546, + "step": 16685 + }, + { + "epoch": 33.372, + "grad_norm": 1.0916879177093506, + "learning_rate": 2e-05, + "loss": 0.03832256, + "step": 16686 + }, + { + "epoch": 33.374, + "grad_norm": 1.0437517166137695, + "learning_rate": 2e-05, + "loss": 0.04042347, + "step": 16687 + }, + { + "epoch": 33.376, + "grad_norm": 2.85295033454895, + "learning_rate": 2e-05, + "loss": 0.04512815, + "step": 16688 + }, + { + "epoch": 33.378, + "grad_norm": 1.3456438779830933, + "learning_rate": 2e-05, + "loss": 0.05866044, + "step": 16689 + }, + { + "epoch": 33.38, + "grad_norm": 1.2579126358032227, + "learning_rate": 2e-05, + "loss": 0.04194574, + "step": 16690 + }, + { + "epoch": 33.382, + "grad_norm": 1.2284680604934692, + "learning_rate": 2e-05, + "loss": 0.04293683, + "step": 16691 + }, + { + "epoch": 33.384, + "grad_norm": 1.0738353729248047, + "learning_rate": 2e-05, + "loss": 0.03847217, + "step": 16692 + }, + { + "epoch": 33.386, + "grad_norm": 0.9319986701011658, + "learning_rate": 2e-05, + "loss": 0.02663419, + "step": 16693 + }, + { + "epoch": 33.388, + "grad_norm": 2.2574198246002197, + "learning_rate": 2e-05, + "loss": 0.03809108, + "step": 16694 + }, + { + "epoch": 33.39, + "grad_norm": 1.3886204957962036, + "learning_rate": 2e-05, + "loss": 0.04164458, + "step": 16695 + }, + { + "epoch": 33.392, + "grad_norm": 1.673781156539917, + "learning_rate": 2e-05, + "loss": 0.04233087, + "step": 16696 + }, + { + "epoch": 33.394, + "grad_norm": 1.2593107223510742, + "learning_rate": 2e-05, + "loss": 0.04103605, + "step": 16697 + }, + { + "epoch": 33.396, + "grad_norm": 2.0994770526885986, + 
"learning_rate": 2e-05, + "loss": 0.0581164, + "step": 16698 + }, + { + "epoch": 33.398, + "grad_norm": 1.1525026559829712, + "learning_rate": 2e-05, + "loss": 0.0525818, + "step": 16699 + }, + { + "epoch": 33.4, + "grad_norm": 1.141378402709961, + "learning_rate": 2e-05, + "loss": 0.05826852, + "step": 16700 + }, + { + "epoch": 33.402, + "grad_norm": 1.3574016094207764, + "learning_rate": 2e-05, + "loss": 0.05125176, + "step": 16701 + }, + { + "epoch": 33.404, + "grad_norm": 1.2207424640655518, + "learning_rate": 2e-05, + "loss": 0.03801119, + "step": 16702 + }, + { + "epoch": 33.406, + "grad_norm": 1.1650470495224, + "learning_rate": 2e-05, + "loss": 0.04888697, + "step": 16703 + }, + { + "epoch": 33.408, + "grad_norm": 1.325939416885376, + "learning_rate": 2e-05, + "loss": 0.05456465, + "step": 16704 + }, + { + "epoch": 33.41, + "grad_norm": 1.076897382736206, + "learning_rate": 2e-05, + "loss": 0.04185375, + "step": 16705 + }, + { + "epoch": 33.412, + "grad_norm": 1.0933011770248413, + "learning_rate": 2e-05, + "loss": 0.03941853, + "step": 16706 + }, + { + "epoch": 33.414, + "grad_norm": 1.4302270412445068, + "learning_rate": 2e-05, + "loss": 0.04674981, + "step": 16707 + }, + { + "epoch": 33.416, + "grad_norm": 5.180481910705566, + "learning_rate": 2e-05, + "loss": 0.05447872, + "step": 16708 + }, + { + "epoch": 33.418, + "grad_norm": 0.997200071811676, + "learning_rate": 2e-05, + "loss": 0.04152042, + "step": 16709 + }, + { + "epoch": 33.42, + "grad_norm": 1.2832751274108887, + "learning_rate": 2e-05, + "loss": 0.05069866, + "step": 16710 + }, + { + "epoch": 33.422, + "grad_norm": 1.0193212032318115, + "learning_rate": 2e-05, + "loss": 0.0379979, + "step": 16711 + }, + { + "epoch": 33.424, + "grad_norm": 1.0952942371368408, + "learning_rate": 2e-05, + "loss": 0.03752263, + "step": 16712 + }, + { + "epoch": 33.426, + "grad_norm": 1.048114538192749, + "learning_rate": 2e-05, + "loss": 0.05350174, + "step": 16713 + }, + { + "epoch": 33.428, + "grad_norm": 1.142867088317871, + "learning_rate": 2e-05, + "loss": 0.04559736, + "step": 16714 + }, + { + "epoch": 33.43, + "grad_norm": 1.1532340049743652, + "learning_rate": 2e-05, + "loss": 0.03369208, + "step": 16715 + }, + { + "epoch": 33.432, + "grad_norm": 2.2827866077423096, + "learning_rate": 2e-05, + "loss": 0.04446258, + "step": 16716 + }, + { + "epoch": 33.434, + "grad_norm": 1.1983774900436401, + "learning_rate": 2e-05, + "loss": 0.04126913, + "step": 16717 + }, + { + "epoch": 33.436, + "grad_norm": 1.3406853675842285, + "learning_rate": 2e-05, + "loss": 0.03477804, + "step": 16718 + }, + { + "epoch": 33.438, + "grad_norm": 1.4163920879364014, + "learning_rate": 2e-05, + "loss": 0.05400594, + "step": 16719 + }, + { + "epoch": 33.44, + "grad_norm": 1.202052116394043, + "learning_rate": 2e-05, + "loss": 0.04666064, + "step": 16720 + }, + { + "epoch": 33.442, + "grad_norm": 1.020628809928894, + "learning_rate": 2e-05, + "loss": 0.03358646, + "step": 16721 + }, + { + "epoch": 33.444, + "grad_norm": 1.0128796100616455, + "learning_rate": 2e-05, + "loss": 0.03198342, + "step": 16722 + }, + { + "epoch": 33.446, + "grad_norm": 1.3667223453521729, + "learning_rate": 2e-05, + "loss": 0.04862178, + "step": 16723 + }, + { + "epoch": 33.448, + "grad_norm": 1.0447344779968262, + "learning_rate": 2e-05, + "loss": 0.0359532, + "step": 16724 + }, + { + "epoch": 33.45, + "grad_norm": 1.2238010168075562, + "learning_rate": 2e-05, + "loss": 0.03374971, + "step": 16725 + }, + { + "epoch": 33.452, + "grad_norm": 1.7347222566604614, + "learning_rate": 
2e-05, + "loss": 0.05317023, + "step": 16726 + }, + { + "epoch": 33.454, + "grad_norm": 1.2322536706924438, + "learning_rate": 2e-05, + "loss": 0.05196625, + "step": 16727 + }, + { + "epoch": 33.456, + "grad_norm": 1.0215041637420654, + "learning_rate": 2e-05, + "loss": 0.03156345, + "step": 16728 + }, + { + "epoch": 33.458, + "grad_norm": 1.5477778911590576, + "learning_rate": 2e-05, + "loss": 0.04214332, + "step": 16729 + }, + { + "epoch": 33.46, + "grad_norm": 1.1270982027053833, + "learning_rate": 2e-05, + "loss": 0.04706374, + "step": 16730 + }, + { + "epoch": 33.462, + "grad_norm": 1.2014801502227783, + "learning_rate": 2e-05, + "loss": 0.04483427, + "step": 16731 + }, + { + "epoch": 33.464, + "grad_norm": 1.4638102054595947, + "learning_rate": 2e-05, + "loss": 0.05931569, + "step": 16732 + }, + { + "epoch": 33.466, + "grad_norm": 1.084399700164795, + "learning_rate": 2e-05, + "loss": 0.04061939, + "step": 16733 + }, + { + "epoch": 33.468, + "grad_norm": 1.254996657371521, + "learning_rate": 2e-05, + "loss": 0.03688417, + "step": 16734 + }, + { + "epoch": 33.47, + "grad_norm": 1.2726435661315918, + "learning_rate": 2e-05, + "loss": 0.04665891, + "step": 16735 + }, + { + "epoch": 33.472, + "grad_norm": 1.0594907999038696, + "learning_rate": 2e-05, + "loss": 0.03953811, + "step": 16736 + }, + { + "epoch": 33.474, + "grad_norm": 1.8973904848098755, + "learning_rate": 2e-05, + "loss": 0.04582148, + "step": 16737 + }, + { + "epoch": 33.476, + "grad_norm": 1.1382102966308594, + "learning_rate": 2e-05, + "loss": 0.03187426, + "step": 16738 + }, + { + "epoch": 33.478, + "grad_norm": 1.8633939027786255, + "learning_rate": 2e-05, + "loss": 0.04606871, + "step": 16739 + }, + { + "epoch": 33.48, + "grad_norm": 1.2930206060409546, + "learning_rate": 2e-05, + "loss": 0.03444067, + "step": 16740 + }, + { + "epoch": 33.482, + "grad_norm": 1.696944236755371, + "learning_rate": 2e-05, + "loss": 0.04227548, + "step": 16741 + }, + { + "epoch": 33.484, + "grad_norm": 1.115965723991394, + "learning_rate": 2e-05, + "loss": 0.03508355, + "step": 16742 + }, + { + "epoch": 33.486, + "grad_norm": 1.2587809562683105, + "learning_rate": 2e-05, + "loss": 0.03677014, + "step": 16743 + }, + { + "epoch": 33.488, + "grad_norm": 1.0906723737716675, + "learning_rate": 2e-05, + "loss": 0.03932015, + "step": 16744 + }, + { + "epoch": 33.49, + "grad_norm": 1.193812370300293, + "learning_rate": 2e-05, + "loss": 0.04173867, + "step": 16745 + }, + { + "epoch": 33.492, + "grad_norm": 1.3376166820526123, + "learning_rate": 2e-05, + "loss": 0.03034848, + "step": 16746 + }, + { + "epoch": 33.494, + "grad_norm": 1.1713814735412598, + "learning_rate": 2e-05, + "loss": 0.04791263, + "step": 16747 + }, + { + "epoch": 33.496, + "grad_norm": 1.723981261253357, + "learning_rate": 2e-05, + "loss": 0.04253115, + "step": 16748 + }, + { + "epoch": 33.498, + "grad_norm": 1.1139684915542603, + "learning_rate": 2e-05, + "loss": 0.04166551, + "step": 16749 + }, + { + "epoch": 33.5, + "grad_norm": 1.292219877243042, + "learning_rate": 2e-05, + "loss": 0.03971953, + "step": 16750 + }, + { + "epoch": 33.502, + "grad_norm": 1.2598776817321777, + "learning_rate": 2e-05, + "loss": 0.04661015, + "step": 16751 + }, + { + "epoch": 33.504, + "grad_norm": 1.248761534690857, + "learning_rate": 2e-05, + "loss": 0.04709421, + "step": 16752 + }, + { + "epoch": 33.506, + "grad_norm": 1.290848970413208, + "learning_rate": 2e-05, + "loss": 0.04546386, + "step": 16753 + }, + { + "epoch": 33.508, + "grad_norm": 3.039777994155884, + "learning_rate": 2e-05, + 
"loss": 0.04197717, + "step": 16754 + }, + { + "epoch": 33.51, + "grad_norm": 1.0628236532211304, + "learning_rate": 2e-05, + "loss": 0.03273289, + "step": 16755 + }, + { + "epoch": 33.512, + "grad_norm": 1.1584326028823853, + "learning_rate": 2e-05, + "loss": 0.03990288, + "step": 16756 + }, + { + "epoch": 33.514, + "grad_norm": 1.013412356376648, + "learning_rate": 2e-05, + "loss": 0.02777958, + "step": 16757 + }, + { + "epoch": 33.516, + "grad_norm": 2.4592092037200928, + "learning_rate": 2e-05, + "loss": 0.03798764, + "step": 16758 + }, + { + "epoch": 33.518, + "grad_norm": 1.4188690185546875, + "learning_rate": 2e-05, + "loss": 0.04493807, + "step": 16759 + }, + { + "epoch": 33.52, + "grad_norm": 4.044872760772705, + "learning_rate": 2e-05, + "loss": 0.05675915, + "step": 16760 + }, + { + "epoch": 33.522, + "grad_norm": 1.4888228178024292, + "learning_rate": 2e-05, + "loss": 0.039264, + "step": 16761 + }, + { + "epoch": 33.524, + "grad_norm": 1.2201298475265503, + "learning_rate": 2e-05, + "loss": 0.04121239, + "step": 16762 + }, + { + "epoch": 33.526, + "grad_norm": 1.0518842935562134, + "learning_rate": 2e-05, + "loss": 0.03466121, + "step": 16763 + }, + { + "epoch": 33.528, + "grad_norm": 0.8494116067886353, + "learning_rate": 2e-05, + "loss": 0.02845479, + "step": 16764 + }, + { + "epoch": 33.53, + "grad_norm": 1.2486729621887207, + "learning_rate": 2e-05, + "loss": 0.0528804, + "step": 16765 + }, + { + "epoch": 33.532, + "grad_norm": 1.1569898128509521, + "learning_rate": 2e-05, + "loss": 0.03768142, + "step": 16766 + }, + { + "epoch": 33.534, + "grad_norm": 0.9712448120117188, + "learning_rate": 2e-05, + "loss": 0.03684617, + "step": 16767 + }, + { + "epoch": 33.536, + "grad_norm": 3.453179121017456, + "learning_rate": 2e-05, + "loss": 0.05391577, + "step": 16768 + }, + { + "epoch": 33.538, + "grad_norm": 1.2368735074996948, + "learning_rate": 2e-05, + "loss": 0.04700865, + "step": 16769 + }, + { + "epoch": 33.54, + "grad_norm": 1.745194435119629, + "learning_rate": 2e-05, + "loss": 0.04905911, + "step": 16770 + }, + { + "epoch": 33.542, + "grad_norm": 1.4005979299545288, + "learning_rate": 2e-05, + "loss": 0.05808295, + "step": 16771 + }, + { + "epoch": 33.544, + "grad_norm": 1.6781764030456543, + "learning_rate": 2e-05, + "loss": 0.03419307, + "step": 16772 + }, + { + "epoch": 33.546, + "grad_norm": 1.5262959003448486, + "learning_rate": 2e-05, + "loss": 0.04930171, + "step": 16773 + }, + { + "epoch": 33.548, + "grad_norm": 1.185598373413086, + "learning_rate": 2e-05, + "loss": 0.04440915, + "step": 16774 + }, + { + "epoch": 33.55, + "grad_norm": 1.0013930797576904, + "learning_rate": 2e-05, + "loss": 0.03913631, + "step": 16775 + }, + { + "epoch": 33.552, + "grad_norm": 1.5372309684753418, + "learning_rate": 2e-05, + "loss": 0.04706179, + "step": 16776 + }, + { + "epoch": 33.554, + "grad_norm": 1.3070640563964844, + "learning_rate": 2e-05, + "loss": 0.03147201, + "step": 16777 + }, + { + "epoch": 33.556, + "grad_norm": 1.387190818786621, + "learning_rate": 2e-05, + "loss": 0.04918057, + "step": 16778 + }, + { + "epoch": 33.558, + "grad_norm": 2.1223700046539307, + "learning_rate": 2e-05, + "loss": 0.05167126, + "step": 16779 + }, + { + "epoch": 33.56, + "grad_norm": 1.3010870218276978, + "learning_rate": 2e-05, + "loss": 0.04664216, + "step": 16780 + }, + { + "epoch": 33.562, + "grad_norm": 1.6655634641647339, + "learning_rate": 2e-05, + "loss": 0.04345709, + "step": 16781 + }, + { + "epoch": 33.564, + "grad_norm": 2.2487990856170654, + "learning_rate": 2e-05, + "loss": 
0.04758396, + "step": 16782 + }, + { + "epoch": 33.566, + "grad_norm": 2.5132153034210205, + "learning_rate": 2e-05, + "loss": 0.05307368, + "step": 16783 + }, + { + "epoch": 33.568, + "grad_norm": 2.023242235183716, + "learning_rate": 2e-05, + "loss": 0.03659308, + "step": 16784 + }, + { + "epoch": 33.57, + "grad_norm": 1.0173568725585938, + "learning_rate": 2e-05, + "loss": 0.03574786, + "step": 16785 + }, + { + "epoch": 33.572, + "grad_norm": 1.4924144744873047, + "learning_rate": 2e-05, + "loss": 0.04335709, + "step": 16786 + }, + { + "epoch": 33.574, + "grad_norm": 1.2708486318588257, + "learning_rate": 2e-05, + "loss": 0.04690626, + "step": 16787 + }, + { + "epoch": 33.576, + "grad_norm": 1.9800394773483276, + "learning_rate": 2e-05, + "loss": 0.04107783, + "step": 16788 + }, + { + "epoch": 33.578, + "grad_norm": 2.4189324378967285, + "learning_rate": 2e-05, + "loss": 0.05079643, + "step": 16789 + }, + { + "epoch": 33.58, + "grad_norm": 1.4563528299331665, + "learning_rate": 2e-05, + "loss": 0.04888935, + "step": 16790 + }, + { + "epoch": 33.582, + "grad_norm": 2.460385799407959, + "learning_rate": 2e-05, + "loss": 0.04483432, + "step": 16791 + }, + { + "epoch": 33.584, + "grad_norm": 2.3535897731781006, + "learning_rate": 2e-05, + "loss": 0.0520683, + "step": 16792 + }, + { + "epoch": 33.586, + "grad_norm": 1.497182846069336, + "learning_rate": 2e-05, + "loss": 0.04292556, + "step": 16793 + }, + { + "epoch": 33.588, + "grad_norm": 2.476083993911743, + "learning_rate": 2e-05, + "loss": 0.04846964, + "step": 16794 + }, + { + "epoch": 33.59, + "grad_norm": 1.223767876625061, + "learning_rate": 2e-05, + "loss": 0.06145135, + "step": 16795 + }, + { + "epoch": 33.592, + "grad_norm": 2.5422379970550537, + "learning_rate": 2e-05, + "loss": 0.05470592, + "step": 16796 + }, + { + "epoch": 33.594, + "grad_norm": 1.644066333770752, + "learning_rate": 2e-05, + "loss": 0.03853694, + "step": 16797 + }, + { + "epoch": 33.596, + "grad_norm": 1.5902693271636963, + "learning_rate": 2e-05, + "loss": 0.04749844, + "step": 16798 + }, + { + "epoch": 33.598, + "grad_norm": 0.8459369540214539, + "learning_rate": 2e-05, + "loss": 0.02365252, + "step": 16799 + }, + { + "epoch": 33.6, + "grad_norm": 0.9091578722000122, + "learning_rate": 2e-05, + "loss": 0.03125982, + "step": 16800 + }, + { + "epoch": 33.602, + "grad_norm": 1.5347368717193604, + "learning_rate": 2e-05, + "loss": 0.04600755, + "step": 16801 + }, + { + "epoch": 33.604, + "grad_norm": 1.4432686567306519, + "learning_rate": 2e-05, + "loss": 0.04101708, + "step": 16802 + }, + { + "epoch": 33.606, + "grad_norm": 1.1293833255767822, + "learning_rate": 2e-05, + "loss": 0.04084689, + "step": 16803 + }, + { + "epoch": 33.608, + "grad_norm": 1.2917184829711914, + "learning_rate": 2e-05, + "loss": 0.04250922, + "step": 16804 + }, + { + "epoch": 33.61, + "grad_norm": 3.9780828952789307, + "learning_rate": 2e-05, + "loss": 0.03426572, + "step": 16805 + }, + { + "epoch": 33.612, + "grad_norm": 2.3805651664733887, + "learning_rate": 2e-05, + "loss": 0.03994187, + "step": 16806 + }, + { + "epoch": 33.614, + "grad_norm": 1.2909883260726929, + "learning_rate": 2e-05, + "loss": 0.0498014, + "step": 16807 + }, + { + "epoch": 33.616, + "grad_norm": 1.0932729244232178, + "learning_rate": 2e-05, + "loss": 0.03009418, + "step": 16808 + }, + { + "epoch": 33.618, + "grad_norm": 1.3504739999771118, + "learning_rate": 2e-05, + "loss": 0.04348255, + "step": 16809 + }, + { + "epoch": 33.62, + "grad_norm": 1.2880327701568604, + "learning_rate": 2e-05, + "loss": 0.05897732, + 
"step": 16810 + }, + { + "epoch": 33.622, + "grad_norm": 1.2188899517059326, + "learning_rate": 2e-05, + "loss": 0.04643267, + "step": 16811 + }, + { + "epoch": 33.624, + "grad_norm": 1.2222504615783691, + "learning_rate": 2e-05, + "loss": 0.03556299, + "step": 16812 + }, + { + "epoch": 33.626, + "grad_norm": 1.506365418434143, + "learning_rate": 2e-05, + "loss": 0.03758072, + "step": 16813 + }, + { + "epoch": 33.628, + "grad_norm": 1.1334285736083984, + "learning_rate": 2e-05, + "loss": 0.03652769, + "step": 16814 + }, + { + "epoch": 33.63, + "grad_norm": 1.8486438989639282, + "learning_rate": 2e-05, + "loss": 0.03768915, + "step": 16815 + }, + { + "epoch": 33.632, + "grad_norm": 2.419917345046997, + "learning_rate": 2e-05, + "loss": 0.05534883, + "step": 16816 + }, + { + "epoch": 33.634, + "grad_norm": 1.1120336055755615, + "learning_rate": 2e-05, + "loss": 0.02930129, + "step": 16817 + }, + { + "epoch": 33.636, + "grad_norm": 0.9981685280799866, + "learning_rate": 2e-05, + "loss": 0.03070155, + "step": 16818 + }, + { + "epoch": 33.638, + "grad_norm": 1.1295273303985596, + "learning_rate": 2e-05, + "loss": 0.04213031, + "step": 16819 + }, + { + "epoch": 33.64, + "grad_norm": 3.216989278793335, + "learning_rate": 2e-05, + "loss": 0.03833716, + "step": 16820 + }, + { + "epoch": 33.642, + "grad_norm": 2.445704936981201, + "learning_rate": 2e-05, + "loss": 0.04210251, + "step": 16821 + }, + { + "epoch": 33.644, + "grad_norm": 1.0248154401779175, + "learning_rate": 2e-05, + "loss": 0.03077208, + "step": 16822 + }, + { + "epoch": 33.646, + "grad_norm": 1.1069806814193726, + "learning_rate": 2e-05, + "loss": 0.04635371, + "step": 16823 + }, + { + "epoch": 33.648, + "grad_norm": 2.114872455596924, + "learning_rate": 2e-05, + "loss": 0.06503963, + "step": 16824 + }, + { + "epoch": 33.65, + "grad_norm": 1.2636666297912598, + "learning_rate": 2e-05, + "loss": 0.02907618, + "step": 16825 + }, + { + "epoch": 33.652, + "grad_norm": 1.651583194732666, + "learning_rate": 2e-05, + "loss": 0.03353261, + "step": 16826 + }, + { + "epoch": 33.654, + "grad_norm": 1.2803378105163574, + "learning_rate": 2e-05, + "loss": 0.03870942, + "step": 16827 + }, + { + "epoch": 33.656, + "grad_norm": 1.2577418088912964, + "learning_rate": 2e-05, + "loss": 0.04237111, + "step": 16828 + }, + { + "epoch": 33.658, + "grad_norm": 1.1378142833709717, + "learning_rate": 2e-05, + "loss": 0.04286456, + "step": 16829 + }, + { + "epoch": 33.66, + "grad_norm": 2.4869840145111084, + "learning_rate": 2e-05, + "loss": 0.03604544, + "step": 16830 + }, + { + "epoch": 33.662, + "grad_norm": 1.4990627765655518, + "learning_rate": 2e-05, + "loss": 0.03898953, + "step": 16831 + }, + { + "epoch": 33.664, + "grad_norm": 1.4893847703933716, + "learning_rate": 2e-05, + "loss": 0.040364, + "step": 16832 + }, + { + "epoch": 33.666, + "grad_norm": 1.1118310689926147, + "learning_rate": 2e-05, + "loss": 0.03826702, + "step": 16833 + }, + { + "epoch": 33.668, + "grad_norm": 1.1041016578674316, + "learning_rate": 2e-05, + "loss": 0.04368906, + "step": 16834 + }, + { + "epoch": 33.67, + "grad_norm": 1.1376192569732666, + "learning_rate": 2e-05, + "loss": 0.04184592, + "step": 16835 + }, + { + "epoch": 33.672, + "grad_norm": 1.2078814506530762, + "learning_rate": 2e-05, + "loss": 0.04568657, + "step": 16836 + }, + { + "epoch": 33.674, + "grad_norm": 1.4587289094924927, + "learning_rate": 2e-05, + "loss": 0.04338841, + "step": 16837 + }, + { + "epoch": 33.676, + "grad_norm": 1.011198878288269, + "learning_rate": 2e-05, + "loss": 0.03405341, + "step": 
16838 + }, + { + "epoch": 33.678, + "grad_norm": 1.286076307296753, + "learning_rate": 2e-05, + "loss": 0.04624006, + "step": 16839 + }, + { + "epoch": 33.68, + "grad_norm": 1.1408072710037231, + "learning_rate": 2e-05, + "loss": 0.04370711, + "step": 16840 + }, + { + "epoch": 33.682, + "grad_norm": 1.0561197996139526, + "learning_rate": 2e-05, + "loss": 0.04226065, + "step": 16841 + }, + { + "epoch": 33.684, + "grad_norm": 1.5302330255508423, + "learning_rate": 2e-05, + "loss": 0.05760441, + "step": 16842 + }, + { + "epoch": 33.686, + "grad_norm": 1.3950103521347046, + "learning_rate": 2e-05, + "loss": 0.06257196, + "step": 16843 + }, + { + "epoch": 33.688, + "grad_norm": 1.3161698579788208, + "learning_rate": 2e-05, + "loss": 0.04578729, + "step": 16844 + }, + { + "epoch": 33.69, + "grad_norm": 1.1795623302459717, + "learning_rate": 2e-05, + "loss": 0.02204716, + "step": 16845 + }, + { + "epoch": 33.692, + "grad_norm": 1.0264464616775513, + "learning_rate": 2e-05, + "loss": 0.04933001, + "step": 16846 + }, + { + "epoch": 33.694, + "grad_norm": 1.3931221961975098, + "learning_rate": 2e-05, + "loss": 0.04392222, + "step": 16847 + }, + { + "epoch": 33.696, + "grad_norm": 0.9317388534545898, + "learning_rate": 2e-05, + "loss": 0.02685932, + "step": 16848 + }, + { + "epoch": 33.698, + "grad_norm": 0.9781988263130188, + "learning_rate": 2e-05, + "loss": 0.03584233, + "step": 16849 + }, + { + "epoch": 33.7, + "grad_norm": 1.2689793109893799, + "learning_rate": 2e-05, + "loss": 0.04835085, + "step": 16850 + }, + { + "epoch": 33.702, + "grad_norm": 1.4062492847442627, + "learning_rate": 2e-05, + "loss": 0.04714158, + "step": 16851 + }, + { + "epoch": 33.704, + "grad_norm": 3.2041304111480713, + "learning_rate": 2e-05, + "loss": 0.05374725, + "step": 16852 + }, + { + "epoch": 33.706, + "grad_norm": 1.2796287536621094, + "learning_rate": 2e-05, + "loss": 0.0468824, + "step": 16853 + }, + { + "epoch": 33.708, + "grad_norm": 2.208021640777588, + "learning_rate": 2e-05, + "loss": 0.05651149, + "step": 16854 + }, + { + "epoch": 33.71, + "grad_norm": 1.8455325365066528, + "learning_rate": 2e-05, + "loss": 0.0509971, + "step": 16855 + }, + { + "epoch": 33.712, + "grad_norm": 0.9580972790718079, + "learning_rate": 2e-05, + "loss": 0.02455459, + "step": 16856 + }, + { + "epoch": 33.714, + "grad_norm": 1.2142010927200317, + "learning_rate": 2e-05, + "loss": 0.03981315, + "step": 16857 + }, + { + "epoch": 33.716, + "grad_norm": 1.0216953754425049, + "learning_rate": 2e-05, + "loss": 0.0417695, + "step": 16858 + }, + { + "epoch": 33.718, + "grad_norm": 1.1976450681686401, + "learning_rate": 2e-05, + "loss": 0.02827924, + "step": 16859 + }, + { + "epoch": 33.72, + "grad_norm": 1.1605479717254639, + "learning_rate": 2e-05, + "loss": 0.04454171, + "step": 16860 + }, + { + "epoch": 33.722, + "grad_norm": 1.382978081703186, + "learning_rate": 2e-05, + "loss": 0.06117807, + "step": 16861 + }, + { + "epoch": 33.724, + "grad_norm": 0.8501904010772705, + "learning_rate": 2e-05, + "loss": 0.0254951, + "step": 16862 + }, + { + "epoch": 33.726, + "grad_norm": 1.1386749744415283, + "learning_rate": 2e-05, + "loss": 0.04194954, + "step": 16863 + }, + { + "epoch": 33.728, + "grad_norm": 0.990307092666626, + "learning_rate": 2e-05, + "loss": 0.03089491, + "step": 16864 + }, + { + "epoch": 33.73, + "grad_norm": 0.9442136287689209, + "learning_rate": 2e-05, + "loss": 0.03526897, + "step": 16865 + }, + { + "epoch": 33.732, + "grad_norm": 1.119691014289856, + "learning_rate": 2e-05, + "loss": 0.03854529, + "step": 16866 + }, + { 
+ "epoch": 33.734, + "grad_norm": 1.748223066329956, + "learning_rate": 2e-05, + "loss": 0.02689804, + "step": 16867 + }, + { + "epoch": 33.736, + "grad_norm": 1.6101315021514893, + "learning_rate": 2e-05, + "loss": 0.04781541, + "step": 16868 + }, + { + "epoch": 33.738, + "grad_norm": 1.0552995204925537, + "learning_rate": 2e-05, + "loss": 0.03037945, + "step": 16869 + }, + { + "epoch": 33.74, + "grad_norm": 1.2612851858139038, + "learning_rate": 2e-05, + "loss": 0.0377595, + "step": 16870 + }, + { + "epoch": 33.742, + "grad_norm": 0.9552090167999268, + "learning_rate": 2e-05, + "loss": 0.02962127, + "step": 16871 + }, + { + "epoch": 33.744, + "grad_norm": 0.9637928009033203, + "learning_rate": 2e-05, + "loss": 0.04046572, + "step": 16872 + }, + { + "epoch": 33.746, + "grad_norm": 1.0410312414169312, + "learning_rate": 2e-05, + "loss": 0.04488287, + "step": 16873 + }, + { + "epoch": 33.748, + "grad_norm": 1.2417147159576416, + "learning_rate": 2e-05, + "loss": 0.05209735, + "step": 16874 + }, + { + "epoch": 33.75, + "grad_norm": 1.3345032930374146, + "learning_rate": 2e-05, + "loss": 0.06243813, + "step": 16875 + }, + { + "epoch": 33.752, + "grad_norm": 0.8462825417518616, + "learning_rate": 2e-05, + "loss": 0.02463733, + "step": 16876 + }, + { + "epoch": 33.754, + "grad_norm": 1.9590753316879272, + "learning_rate": 2e-05, + "loss": 0.05277262, + "step": 16877 + }, + { + "epoch": 33.756, + "grad_norm": 1.141790747642517, + "learning_rate": 2e-05, + "loss": 0.04721228, + "step": 16878 + }, + { + "epoch": 33.758, + "grad_norm": 0.9426037669181824, + "learning_rate": 2e-05, + "loss": 0.03288198, + "step": 16879 + }, + { + "epoch": 33.76, + "grad_norm": 1.2627002000808716, + "learning_rate": 2e-05, + "loss": 0.05176467, + "step": 16880 + }, + { + "epoch": 33.762, + "grad_norm": 1.8264248371124268, + "learning_rate": 2e-05, + "loss": 0.04593317, + "step": 16881 + }, + { + "epoch": 33.764, + "grad_norm": 1.3055675029754639, + "learning_rate": 2e-05, + "loss": 0.06014679, + "step": 16882 + }, + { + "epoch": 33.766, + "grad_norm": 1.199167013168335, + "learning_rate": 2e-05, + "loss": 0.05326901, + "step": 16883 + }, + { + "epoch": 33.768, + "grad_norm": 1.399657964706421, + "learning_rate": 2e-05, + "loss": 0.03976363, + "step": 16884 + }, + { + "epoch": 33.77, + "grad_norm": 1.4370189905166626, + "learning_rate": 2e-05, + "loss": 0.04173758, + "step": 16885 + }, + { + "epoch": 33.772, + "grad_norm": 1.3217477798461914, + "learning_rate": 2e-05, + "loss": 0.04312097, + "step": 16886 + }, + { + "epoch": 33.774, + "grad_norm": 1.0405267477035522, + "learning_rate": 2e-05, + "loss": 0.02572107, + "step": 16887 + }, + { + "epoch": 33.776, + "grad_norm": 3.8962230682373047, + "learning_rate": 2e-05, + "loss": 0.05806974, + "step": 16888 + }, + { + "epoch": 33.778, + "grad_norm": 1.183304786682129, + "learning_rate": 2e-05, + "loss": 0.05114609, + "step": 16889 + }, + { + "epoch": 33.78, + "grad_norm": 1.0144455432891846, + "learning_rate": 2e-05, + "loss": 0.03402551, + "step": 16890 + }, + { + "epoch": 33.782, + "grad_norm": 0.912356436252594, + "learning_rate": 2e-05, + "loss": 0.03162487, + "step": 16891 + }, + { + "epoch": 33.784, + "grad_norm": 1.5084586143493652, + "learning_rate": 2e-05, + "loss": 0.04724879, + "step": 16892 + }, + { + "epoch": 33.786, + "grad_norm": 1.081552267074585, + "learning_rate": 2e-05, + "loss": 0.03258973, + "step": 16893 + }, + { + "epoch": 33.788, + "grad_norm": 1.0721511840820312, + "learning_rate": 2e-05, + "loss": 0.04917396, + "step": 16894 + }, + { + "epoch": 
33.79, + "grad_norm": 1.1885251998901367, + "learning_rate": 2e-05, + "loss": 0.04815122, + "step": 16895 + }, + { + "epoch": 33.792, + "grad_norm": 1.4012700319290161, + "learning_rate": 2e-05, + "loss": 0.04503307, + "step": 16896 + }, + { + "epoch": 33.794, + "grad_norm": 1.018221378326416, + "learning_rate": 2e-05, + "loss": 0.03371017, + "step": 16897 + }, + { + "epoch": 33.796, + "grad_norm": 1.1117615699768066, + "learning_rate": 2e-05, + "loss": 0.04455762, + "step": 16898 + }, + { + "epoch": 33.798, + "grad_norm": 2.338944435119629, + "learning_rate": 2e-05, + "loss": 0.05771154, + "step": 16899 + }, + { + "epoch": 33.8, + "grad_norm": 1.3427327871322632, + "learning_rate": 2e-05, + "loss": 0.04563061, + "step": 16900 + }, + { + "epoch": 33.802, + "grad_norm": 1.598863124847412, + "learning_rate": 2e-05, + "loss": 0.04085599, + "step": 16901 + }, + { + "epoch": 33.804, + "grad_norm": 4.519804000854492, + "learning_rate": 2e-05, + "loss": 0.05506884, + "step": 16902 + }, + { + "epoch": 33.806, + "grad_norm": 1.7649790048599243, + "learning_rate": 2e-05, + "loss": 0.04375284, + "step": 16903 + }, + { + "epoch": 33.808, + "grad_norm": 0.7932525873184204, + "learning_rate": 2e-05, + "loss": 0.02062046, + "step": 16904 + }, + { + "epoch": 33.81, + "grad_norm": 1.4744629859924316, + "learning_rate": 2e-05, + "loss": 0.0412587, + "step": 16905 + }, + { + "epoch": 33.812, + "grad_norm": 1.8036439418792725, + "learning_rate": 2e-05, + "loss": 0.04557544, + "step": 16906 + }, + { + "epoch": 33.814, + "grad_norm": 1.2862471342086792, + "learning_rate": 2e-05, + "loss": 0.06118719, + "step": 16907 + }, + { + "epoch": 33.816, + "grad_norm": 1.0730700492858887, + "learning_rate": 2e-05, + "loss": 0.03417275, + "step": 16908 + }, + { + "epoch": 33.818, + "grad_norm": 1.6941379308700562, + "learning_rate": 2e-05, + "loss": 0.06889927, + "step": 16909 + }, + { + "epoch": 33.82, + "grad_norm": 1.0613124370574951, + "learning_rate": 2e-05, + "loss": 0.04204726, + "step": 16910 + }, + { + "epoch": 33.822, + "grad_norm": 1.436385154724121, + "learning_rate": 2e-05, + "loss": 0.05231769, + "step": 16911 + }, + { + "epoch": 33.824, + "grad_norm": 1.1055042743682861, + "learning_rate": 2e-05, + "loss": 0.02975886, + "step": 16912 + }, + { + "epoch": 33.826, + "grad_norm": 1.3664203882217407, + "learning_rate": 2e-05, + "loss": 0.0504609, + "step": 16913 + }, + { + "epoch": 33.828, + "grad_norm": 1.1539993286132812, + "learning_rate": 2e-05, + "loss": 0.04137579, + "step": 16914 + }, + { + "epoch": 33.83, + "grad_norm": 1.266783595085144, + "learning_rate": 2e-05, + "loss": 0.04526065, + "step": 16915 + }, + { + "epoch": 33.832, + "grad_norm": 1.3044017553329468, + "learning_rate": 2e-05, + "loss": 0.04141079, + "step": 16916 + }, + { + "epoch": 33.834, + "grad_norm": 2.1756279468536377, + "learning_rate": 2e-05, + "loss": 0.04577321, + "step": 16917 + }, + { + "epoch": 33.836, + "grad_norm": 1.8856961727142334, + "learning_rate": 2e-05, + "loss": 0.04963861, + "step": 16918 + }, + { + "epoch": 33.838, + "grad_norm": 1.0676918029785156, + "learning_rate": 2e-05, + "loss": 0.04658003, + "step": 16919 + }, + { + "epoch": 33.84, + "grad_norm": 2.096513509750366, + "learning_rate": 2e-05, + "loss": 0.04904509, + "step": 16920 + }, + { + "epoch": 33.842, + "grad_norm": 1.2753912210464478, + "learning_rate": 2e-05, + "loss": 0.05007047, + "step": 16921 + }, + { + "epoch": 33.844, + "grad_norm": 0.9849660396575928, + "learning_rate": 2e-05, + "loss": 0.03128029, + "step": 16922 + }, + { + "epoch": 33.846, + 
"grad_norm": 1.4278568029403687, + "learning_rate": 2e-05, + "loss": 0.04545929, + "step": 16923 + }, + { + "epoch": 33.848, + "grad_norm": 1.4661980867385864, + "learning_rate": 2e-05, + "loss": 0.05411399, + "step": 16924 + }, + { + "epoch": 33.85, + "grad_norm": 1.2466408014297485, + "learning_rate": 2e-05, + "loss": 0.04471597, + "step": 16925 + }, + { + "epoch": 33.852, + "grad_norm": 0.9380550980567932, + "learning_rate": 2e-05, + "loss": 0.04102387, + "step": 16926 + }, + { + "epoch": 33.854, + "grad_norm": 1.1571276187896729, + "learning_rate": 2e-05, + "loss": 0.05654067, + "step": 16927 + }, + { + "epoch": 33.856, + "grad_norm": 0.9969253540039062, + "learning_rate": 2e-05, + "loss": 0.03924453, + "step": 16928 + }, + { + "epoch": 33.858, + "grad_norm": 1.004270076751709, + "learning_rate": 2e-05, + "loss": 0.0299517, + "step": 16929 + }, + { + "epoch": 33.86, + "grad_norm": 1.096602439880371, + "learning_rate": 2e-05, + "loss": 0.03355722, + "step": 16930 + }, + { + "epoch": 33.862, + "grad_norm": 1.0416635274887085, + "learning_rate": 2e-05, + "loss": 0.03369167, + "step": 16931 + }, + { + "epoch": 33.864, + "grad_norm": 1.327217698097229, + "learning_rate": 2e-05, + "loss": 0.05232648, + "step": 16932 + }, + { + "epoch": 33.866, + "grad_norm": 1.9164153337478638, + "learning_rate": 2e-05, + "loss": 0.05434879, + "step": 16933 + }, + { + "epoch": 33.868, + "grad_norm": 1.9355528354644775, + "learning_rate": 2e-05, + "loss": 0.04286207, + "step": 16934 + }, + { + "epoch": 33.87, + "grad_norm": 2.0409860610961914, + "learning_rate": 2e-05, + "loss": 0.03842381, + "step": 16935 + }, + { + "epoch": 33.872, + "grad_norm": 1.4562387466430664, + "learning_rate": 2e-05, + "loss": 0.03790183, + "step": 16936 + }, + { + "epoch": 33.874, + "grad_norm": 1.2432488203048706, + "learning_rate": 2e-05, + "loss": 0.04522638, + "step": 16937 + }, + { + "epoch": 33.876, + "grad_norm": 1.5313806533813477, + "learning_rate": 2e-05, + "loss": 0.04211116, + "step": 16938 + }, + { + "epoch": 33.878, + "grad_norm": 1.330566644668579, + "learning_rate": 2e-05, + "loss": 0.04907233, + "step": 16939 + }, + { + "epoch": 33.88, + "grad_norm": 1.2614879608154297, + "learning_rate": 2e-05, + "loss": 0.06006509, + "step": 16940 + }, + { + "epoch": 33.882, + "grad_norm": 0.8992356061935425, + "learning_rate": 2e-05, + "loss": 0.03267233, + "step": 16941 + }, + { + "epoch": 33.884, + "grad_norm": 1.0046213865280151, + "learning_rate": 2e-05, + "loss": 0.04911587, + "step": 16942 + }, + { + "epoch": 33.886, + "grad_norm": 1.811383605003357, + "learning_rate": 2e-05, + "loss": 0.05342248, + "step": 16943 + }, + { + "epoch": 33.888, + "grad_norm": 1.098055362701416, + "learning_rate": 2e-05, + "loss": 0.04004695, + "step": 16944 + }, + { + "epoch": 33.89, + "grad_norm": 1.5141690969467163, + "learning_rate": 2e-05, + "loss": 0.05056385, + "step": 16945 + }, + { + "epoch": 33.892, + "grad_norm": 1.2271699905395508, + "learning_rate": 2e-05, + "loss": 0.04924796, + "step": 16946 + }, + { + "epoch": 33.894, + "grad_norm": 1.2581502199172974, + "learning_rate": 2e-05, + "loss": 0.03863451, + "step": 16947 + }, + { + "epoch": 33.896, + "grad_norm": 0.9692806005477905, + "learning_rate": 2e-05, + "loss": 0.04194497, + "step": 16948 + }, + { + "epoch": 33.898, + "grad_norm": 1.4562419652938843, + "learning_rate": 2e-05, + "loss": 0.04685233, + "step": 16949 + }, + { + "epoch": 33.9, + "grad_norm": 1.32866632938385, + "learning_rate": 2e-05, + "loss": 0.05198713, + "step": 16950 + }, + { + "epoch": 33.902, + "grad_norm": 
1.1685373783111572, + "learning_rate": 2e-05, + "loss": 0.04833569, + "step": 16951 + }, + { + "epoch": 33.904, + "grad_norm": 1.1611336469650269, + "learning_rate": 2e-05, + "loss": 0.04151731, + "step": 16952 + }, + { + "epoch": 33.906, + "grad_norm": 1.1714519262313843, + "learning_rate": 2e-05, + "loss": 0.04272041, + "step": 16953 + }, + { + "epoch": 33.908, + "grad_norm": 1.6427088975906372, + "learning_rate": 2e-05, + "loss": 0.05739013, + "step": 16954 + }, + { + "epoch": 33.91, + "grad_norm": 1.11444890499115, + "learning_rate": 2e-05, + "loss": 0.03927352, + "step": 16955 + }, + { + "epoch": 33.912, + "grad_norm": 1.4082242250442505, + "learning_rate": 2e-05, + "loss": 0.06680731, + "step": 16956 + }, + { + "epoch": 33.914, + "grad_norm": 1.295926809310913, + "learning_rate": 2e-05, + "loss": 0.04506046, + "step": 16957 + }, + { + "epoch": 33.916, + "grad_norm": 0.9405683279037476, + "learning_rate": 2e-05, + "loss": 0.02393782, + "step": 16958 + }, + { + "epoch": 33.918, + "grad_norm": 1.58418607711792, + "learning_rate": 2e-05, + "loss": 0.04732141, + "step": 16959 + }, + { + "epoch": 33.92, + "grad_norm": 1.6658512353897095, + "learning_rate": 2e-05, + "loss": 0.05234197, + "step": 16960 + }, + { + "epoch": 33.922, + "grad_norm": 1.5868287086486816, + "learning_rate": 2e-05, + "loss": 0.05572698, + "step": 16961 + }, + { + "epoch": 33.924, + "grad_norm": 1.2670631408691406, + "learning_rate": 2e-05, + "loss": 0.03493927, + "step": 16962 + }, + { + "epoch": 33.926, + "grad_norm": 1.2648658752441406, + "learning_rate": 2e-05, + "loss": 0.04391746, + "step": 16963 + }, + { + "epoch": 33.928, + "grad_norm": 1.5117779970169067, + "learning_rate": 2e-05, + "loss": 0.04314294, + "step": 16964 + }, + { + "epoch": 33.93, + "grad_norm": 1.159271001815796, + "learning_rate": 2e-05, + "loss": 0.03362155, + "step": 16965 + }, + { + "epoch": 33.932, + "grad_norm": 1.2142740488052368, + "learning_rate": 2e-05, + "loss": 0.03751262, + "step": 16966 + }, + { + "epoch": 33.934, + "grad_norm": 1.2300341129302979, + "learning_rate": 2e-05, + "loss": 0.04216234, + "step": 16967 + }, + { + "epoch": 33.936, + "grad_norm": 1.0165789127349854, + "learning_rate": 2e-05, + "loss": 0.03385535, + "step": 16968 + }, + { + "epoch": 33.938, + "grad_norm": 1.1254189014434814, + "learning_rate": 2e-05, + "loss": 0.03804931, + "step": 16969 + }, + { + "epoch": 33.94, + "grad_norm": 1.0231702327728271, + "learning_rate": 2e-05, + "loss": 0.03211974, + "step": 16970 + }, + { + "epoch": 33.942, + "grad_norm": 1.1226533651351929, + "learning_rate": 2e-05, + "loss": 0.04243637, + "step": 16971 + }, + { + "epoch": 33.944, + "grad_norm": 1.0575497150421143, + "learning_rate": 2e-05, + "loss": 0.03891562, + "step": 16972 + }, + { + "epoch": 33.946, + "grad_norm": 1.3300598859786987, + "learning_rate": 2e-05, + "loss": 0.03813462, + "step": 16973 + }, + { + "epoch": 33.948, + "grad_norm": 1.1231666803359985, + "learning_rate": 2e-05, + "loss": 0.04401132, + "step": 16974 + }, + { + "epoch": 33.95, + "grad_norm": 1.0384533405303955, + "learning_rate": 2e-05, + "loss": 0.04073372, + "step": 16975 + }, + { + "epoch": 33.952, + "grad_norm": 0.9705667495727539, + "learning_rate": 2e-05, + "loss": 0.0333796, + "step": 16976 + }, + { + "epoch": 33.954, + "grad_norm": 3.3953044414520264, + "learning_rate": 2e-05, + "loss": 0.04748266, + "step": 16977 + }, + { + "epoch": 33.956, + "grad_norm": 1.0151921510696411, + "learning_rate": 2e-05, + "loss": 0.03253239, + "step": 16978 + }, + { + "epoch": 33.958, + "grad_norm": 
1.612838864326477, + "learning_rate": 2e-05, + "loss": 0.04560025, + "step": 16979 + }, + { + "epoch": 33.96, + "grad_norm": 1.2825559377670288, + "learning_rate": 2e-05, + "loss": 0.03292632, + "step": 16980 + }, + { + "epoch": 33.962, + "grad_norm": 1.1484404802322388, + "learning_rate": 2e-05, + "loss": 0.03267542, + "step": 16981 + }, + { + "epoch": 33.964, + "grad_norm": 1.3197839260101318, + "learning_rate": 2e-05, + "loss": 0.0385856, + "step": 16982 + }, + { + "epoch": 33.966, + "grad_norm": 1.66212797164917, + "learning_rate": 2e-05, + "loss": 0.03775371, + "step": 16983 + }, + { + "epoch": 33.968, + "grad_norm": 2.77333664894104, + "learning_rate": 2e-05, + "loss": 0.04835136, + "step": 16984 + }, + { + "epoch": 33.97, + "grad_norm": 1.2204259634017944, + "learning_rate": 2e-05, + "loss": 0.03903702, + "step": 16985 + }, + { + "epoch": 33.972, + "grad_norm": 1.9606164693832397, + "learning_rate": 2e-05, + "loss": 0.03565089, + "step": 16986 + }, + { + "epoch": 33.974, + "grad_norm": 1.029175043106079, + "learning_rate": 2e-05, + "loss": 0.03104983, + "step": 16987 + }, + { + "epoch": 33.976, + "grad_norm": 1.9764819145202637, + "learning_rate": 2e-05, + "loss": 0.04124163, + "step": 16988 + }, + { + "epoch": 33.978, + "grad_norm": 1.2919042110443115, + "learning_rate": 2e-05, + "loss": 0.04482358, + "step": 16989 + }, + { + "epoch": 33.98, + "grad_norm": 1.8748044967651367, + "learning_rate": 2e-05, + "loss": 0.05687953, + "step": 16990 + }, + { + "epoch": 33.982, + "grad_norm": 1.5990813970565796, + "learning_rate": 2e-05, + "loss": 0.06885971, + "step": 16991 + }, + { + "epoch": 33.984, + "grad_norm": 1.6569875478744507, + "learning_rate": 2e-05, + "loss": 0.02224008, + "step": 16992 + }, + { + "epoch": 33.986, + "grad_norm": 1.5059123039245605, + "learning_rate": 2e-05, + "loss": 0.03797079, + "step": 16993 + }, + { + "epoch": 33.988, + "grad_norm": 1.1320881843566895, + "learning_rate": 2e-05, + "loss": 0.04442799, + "step": 16994 + }, + { + "epoch": 33.99, + "grad_norm": 1.8858788013458252, + "learning_rate": 2e-05, + "loss": 0.05333455, + "step": 16995 + }, + { + "epoch": 33.992, + "grad_norm": 1.0770121812820435, + "learning_rate": 2e-05, + "loss": 0.04818095, + "step": 16996 + }, + { + "epoch": 33.994, + "grad_norm": 2.505625009536743, + "learning_rate": 2e-05, + "loss": 0.04461709, + "step": 16997 + }, + { + "epoch": 33.996, + "grad_norm": 1.5307080745697021, + "learning_rate": 2e-05, + "loss": 0.04006788, + "step": 16998 + }, + { + "epoch": 33.998, + "grad_norm": 1.1944080591201782, + "learning_rate": 2e-05, + "loss": 0.05010471, + "step": 16999 + }, + { + "epoch": 34.0, + "grad_norm": 1.0636547803878784, + "learning_rate": 2e-05, + "loss": 0.04512432, + "step": 17000 + }, + { + "epoch": 34.0, + "eval_performance": { + "AngleClassification_1": 1.0, + "AngleClassification_2": 1.0, + "AngleClassification_3": 0.9820359281437125, + "Equal_1": 1.0, + "Equal_2": 0.9860279441117764, + "Equal_3": 0.9860279441117764, + "LineComparison_1": 1.0, + "LineComparison_2": 1.0, + "LineComparison_3": 0.998003992015968, + "Parallel_1": 0.9919839679358717, + "Parallel_2": 0.9939879759519038, + "Parallel_3": 0.992, + "Perpendicular_1": 0.996, + "Perpendicular_2": 0.98, + "Perpendicular_3": 0.905811623246493, + "PointLiesOnCircle_1": 1.0, + "PointLiesOnCircle_2": 0.996, + "PointLiesOnCircle_3": 0.994, + "PointLiesOnLine_1": 0.9959919839679359, + "PointLiesOnLine_2": 0.9939879759519038, + "PointLiesOnLine_3": 0.9880239520958084 + }, + "eval_runtime": 319.634, + "eval_samples_per_second": 
32.85, + "eval_steps_per_second": 0.657, + "step": 17000 + }, + { + "epoch": 34.002, + "grad_norm": 1.0812188386917114, + "learning_rate": 2e-05, + "loss": 0.03332932, + "step": 17001 + }, + { + "epoch": 34.004, + "grad_norm": 1.6815991401672363, + "learning_rate": 2e-05, + "loss": 0.04507479, + "step": 17002 + }, + { + "epoch": 34.006, + "grad_norm": 1.3439853191375732, + "learning_rate": 2e-05, + "loss": 0.04140931, + "step": 17003 + }, + { + "epoch": 34.008, + "grad_norm": 1.2833200693130493, + "learning_rate": 2e-05, + "loss": 0.03846888, + "step": 17004 + }, + { + "epoch": 34.01, + "grad_norm": 1.3974056243896484, + "learning_rate": 2e-05, + "loss": 0.0499987, + "step": 17005 + }, + { + "epoch": 34.012, + "grad_norm": 2.1579604148864746, + "learning_rate": 2e-05, + "loss": 0.04521484, + "step": 17006 + }, + { + "epoch": 34.014, + "grad_norm": 1.3013967275619507, + "learning_rate": 2e-05, + "loss": 0.04818133, + "step": 17007 + }, + { + "epoch": 34.016, + "grad_norm": 1.740728497505188, + "learning_rate": 2e-05, + "loss": 0.04121596, + "step": 17008 + }, + { + "epoch": 34.018, + "grad_norm": 0.9428570866584778, + "learning_rate": 2e-05, + "loss": 0.03057436, + "step": 17009 + }, + { + "epoch": 34.02, + "grad_norm": 1.1139823198318481, + "learning_rate": 2e-05, + "loss": 0.04498542, + "step": 17010 + }, + { + "epoch": 34.022, + "grad_norm": 1.3437737226486206, + "learning_rate": 2e-05, + "loss": 0.04737519, + "step": 17011 + }, + { + "epoch": 34.024, + "grad_norm": 1.2312670946121216, + "learning_rate": 2e-05, + "loss": 0.04723905, + "step": 17012 + }, + { + "epoch": 34.026, + "grad_norm": 2.406672954559326, + "learning_rate": 2e-05, + "loss": 0.04161016, + "step": 17013 + }, + { + "epoch": 34.028, + "grad_norm": 1.13112211227417, + "learning_rate": 2e-05, + "loss": 0.05677346, + "step": 17014 + }, + { + "epoch": 34.03, + "grad_norm": 1.2599197626113892, + "learning_rate": 2e-05, + "loss": 0.05227666, + "step": 17015 + }, + { + "epoch": 34.032, + "grad_norm": 1.2989979982376099, + "learning_rate": 2e-05, + "loss": 0.0471041, + "step": 17016 + }, + { + "epoch": 34.034, + "grad_norm": 1.432118535041809, + "learning_rate": 2e-05, + "loss": 0.05415585, + "step": 17017 + }, + { + "epoch": 34.036, + "grad_norm": 1.2104634046554565, + "learning_rate": 2e-05, + "loss": 0.04434905, + "step": 17018 + }, + { + "epoch": 34.038, + "grad_norm": 1.2272220849990845, + "learning_rate": 2e-05, + "loss": 0.04309473, + "step": 17019 + }, + { + "epoch": 34.04, + "grad_norm": 1.2818056344985962, + "learning_rate": 2e-05, + "loss": 0.04750851, + "step": 17020 + }, + { + "epoch": 34.042, + "grad_norm": 1.795697808265686, + "learning_rate": 2e-05, + "loss": 0.04016794, + "step": 17021 + }, + { + "epoch": 34.044, + "grad_norm": 1.261646032333374, + "learning_rate": 2e-05, + "loss": 0.05218561, + "step": 17022 + }, + { + "epoch": 34.046, + "grad_norm": 1.2014031410217285, + "learning_rate": 2e-05, + "loss": 0.03123101, + "step": 17023 + }, + { + "epoch": 34.048, + "grad_norm": 1.3816989660263062, + "learning_rate": 2e-05, + "loss": 0.05157385, + "step": 17024 + }, + { + "epoch": 34.05, + "grad_norm": 0.9320183992385864, + "learning_rate": 2e-05, + "loss": 0.02841931, + "step": 17025 + }, + { + "epoch": 34.052, + "grad_norm": 1.5845396518707275, + "learning_rate": 2e-05, + "loss": 0.0396069, + "step": 17026 + }, + { + "epoch": 34.054, + "grad_norm": 1.7424157857894897, + "learning_rate": 2e-05, + "loss": 0.05160773, + "step": 17027 + }, + { + "epoch": 34.056, + "grad_norm": 1.3250648975372314, + "learning_rate": 
2e-05, + "loss": 0.03230084, + "step": 17028 + }, + { + "epoch": 34.058, + "grad_norm": 1.3948570489883423, + "learning_rate": 2e-05, + "loss": 0.04338755, + "step": 17029 + }, + { + "epoch": 34.06, + "grad_norm": 1.9198899269104004, + "learning_rate": 2e-05, + "loss": 0.03582556, + "step": 17030 + }, + { + "epoch": 34.062, + "grad_norm": 1.0664349794387817, + "learning_rate": 2e-05, + "loss": 0.02794036, + "step": 17031 + }, + { + "epoch": 34.064, + "grad_norm": 1.0144078731536865, + "learning_rate": 2e-05, + "loss": 0.03366199, + "step": 17032 + }, + { + "epoch": 34.066, + "grad_norm": 1.4609384536743164, + "learning_rate": 2e-05, + "loss": 0.0822296, + "step": 17033 + }, + { + "epoch": 34.068, + "grad_norm": 1.6045973300933838, + "learning_rate": 2e-05, + "loss": 0.04467433, + "step": 17034 + }, + { + "epoch": 34.07, + "grad_norm": 2.1055264472961426, + "learning_rate": 2e-05, + "loss": 0.04265523, + "step": 17035 + }, + { + "epoch": 34.072, + "grad_norm": 1.1740909814834595, + "learning_rate": 2e-05, + "loss": 0.04725765, + "step": 17036 + }, + { + "epoch": 34.074, + "grad_norm": 1.264272928237915, + "learning_rate": 2e-05, + "loss": 0.04407164, + "step": 17037 + }, + { + "epoch": 34.076, + "grad_norm": 1.4800697565078735, + "learning_rate": 2e-05, + "loss": 0.05361645, + "step": 17038 + }, + { + "epoch": 34.078, + "grad_norm": 0.9612439870834351, + "learning_rate": 2e-05, + "loss": 0.02787845, + "step": 17039 + }, + { + "epoch": 34.08, + "grad_norm": 1.2758971452713013, + "learning_rate": 2e-05, + "loss": 0.03535602, + "step": 17040 + }, + { + "epoch": 34.082, + "grad_norm": 1.4546493291854858, + "learning_rate": 2e-05, + "loss": 0.03896, + "step": 17041 + }, + { + "epoch": 34.084, + "grad_norm": 1.3334996700286865, + "learning_rate": 2e-05, + "loss": 0.0402444, + "step": 17042 + }, + { + "epoch": 34.086, + "grad_norm": 0.9739708304405212, + "learning_rate": 2e-05, + "loss": 0.03635871, + "step": 17043 + }, + { + "epoch": 34.088, + "grad_norm": 1.5088661909103394, + "learning_rate": 2e-05, + "loss": 0.03371816, + "step": 17044 + }, + { + "epoch": 34.09, + "grad_norm": 1.130311131477356, + "learning_rate": 2e-05, + "loss": 0.03255319, + "step": 17045 + }, + { + "epoch": 34.092, + "grad_norm": 1.0935840606689453, + "learning_rate": 2e-05, + "loss": 0.03417723, + "step": 17046 + }, + { + "epoch": 34.094, + "grad_norm": 1.748842716217041, + "learning_rate": 2e-05, + "loss": 0.04565499, + "step": 17047 + }, + { + "epoch": 34.096, + "grad_norm": 0.9785546064376831, + "learning_rate": 2e-05, + "loss": 0.02791492, + "step": 17048 + }, + { + "epoch": 34.098, + "grad_norm": 1.1637617349624634, + "learning_rate": 2e-05, + "loss": 0.0296043, + "step": 17049 + }, + { + "epoch": 34.1, + "grad_norm": 1.0762447118759155, + "learning_rate": 2e-05, + "loss": 0.02891765, + "step": 17050 + }, + { + "epoch": 34.102, + "grad_norm": 0.9392781257629395, + "learning_rate": 2e-05, + "loss": 0.02610712, + "step": 17051 + }, + { + "epoch": 34.104, + "grad_norm": 1.363243579864502, + "learning_rate": 2e-05, + "loss": 0.04912724, + "step": 17052 + }, + { + "epoch": 34.106, + "grad_norm": 1.1277014017105103, + "learning_rate": 2e-05, + "loss": 0.0407737, + "step": 17053 + }, + { + "epoch": 34.108, + "grad_norm": 1.3390322923660278, + "learning_rate": 2e-05, + "loss": 0.04672408, + "step": 17054 + }, + { + "epoch": 34.11, + "grad_norm": 1.0482418537139893, + "learning_rate": 2e-05, + "loss": 0.03395636, + "step": 17055 + }, + { + "epoch": 34.112, + "grad_norm": 1.1809817552566528, + "learning_rate": 2e-05, + "loss": 
0.04168408, + "step": 17056 + }, + { + "epoch": 34.114, + "grad_norm": 1.6997599601745605, + "learning_rate": 2e-05, + "loss": 0.07434915, + "step": 17057 + }, + { + "epoch": 34.116, + "grad_norm": 1.3499462604522705, + "learning_rate": 2e-05, + "loss": 0.04132972, + "step": 17058 + }, + { + "epoch": 34.118, + "grad_norm": 1.1026899814605713, + "learning_rate": 2e-05, + "loss": 0.04192564, + "step": 17059 + }, + { + "epoch": 34.12, + "grad_norm": 1.6073766946792603, + "learning_rate": 2e-05, + "loss": 0.02773944, + "step": 17060 + }, + { + "epoch": 34.122, + "grad_norm": 1.5061614513397217, + "learning_rate": 2e-05, + "loss": 0.04761346, + "step": 17061 + }, + { + "epoch": 34.124, + "grad_norm": 4.682290077209473, + "learning_rate": 2e-05, + "loss": 0.02792961, + "step": 17062 + }, + { + "epoch": 34.126, + "grad_norm": 2.404047966003418, + "learning_rate": 2e-05, + "loss": 0.03837546, + "step": 17063 + }, + { + "epoch": 34.128, + "grad_norm": 1.1497547626495361, + "learning_rate": 2e-05, + "loss": 0.05234189, + "step": 17064 + }, + { + "epoch": 34.13, + "grad_norm": 0.9120892286300659, + "learning_rate": 2e-05, + "loss": 0.02887437, + "step": 17065 + }, + { + "epoch": 34.132, + "grad_norm": 1.2333347797393799, + "learning_rate": 2e-05, + "loss": 0.04958087, + "step": 17066 + }, + { + "epoch": 34.134, + "grad_norm": 1.1911989450454712, + "learning_rate": 2e-05, + "loss": 0.03579342, + "step": 17067 + }, + { + "epoch": 34.136, + "grad_norm": 1.0716452598571777, + "learning_rate": 2e-05, + "loss": 0.04589354, + "step": 17068 + }, + { + "epoch": 34.138, + "grad_norm": 1.1978440284729004, + "learning_rate": 2e-05, + "loss": 0.04309279, + "step": 17069 + }, + { + "epoch": 34.14, + "grad_norm": 1.454819679260254, + "learning_rate": 2e-05, + "loss": 0.03111471, + "step": 17070 + }, + { + "epoch": 34.142, + "grad_norm": 1.3682692050933838, + "learning_rate": 2e-05, + "loss": 0.0418794, + "step": 17071 + }, + { + "epoch": 34.144, + "grad_norm": 1.9202009439468384, + "learning_rate": 2e-05, + "loss": 0.04135188, + "step": 17072 + }, + { + "epoch": 34.146, + "grad_norm": 1.1273765563964844, + "learning_rate": 2e-05, + "loss": 0.03442348, + "step": 17073 + }, + { + "epoch": 34.148, + "grad_norm": 1.3619593381881714, + "learning_rate": 2e-05, + "loss": 0.04825344, + "step": 17074 + }, + { + "epoch": 34.15, + "grad_norm": 1.4348422288894653, + "learning_rate": 2e-05, + "loss": 0.05776092, + "step": 17075 + }, + { + "epoch": 34.152, + "grad_norm": 0.9510125517845154, + "learning_rate": 2e-05, + "loss": 0.02918218, + "step": 17076 + }, + { + "epoch": 34.154, + "grad_norm": 1.260632872581482, + "learning_rate": 2e-05, + "loss": 0.04130819, + "step": 17077 + }, + { + "epoch": 34.156, + "grad_norm": 1.5947790145874023, + "learning_rate": 2e-05, + "loss": 0.05725566, + "step": 17078 + }, + { + "epoch": 34.158, + "grad_norm": 1.1542794704437256, + "learning_rate": 2e-05, + "loss": 0.04837495, + "step": 17079 + }, + { + "epoch": 34.16, + "grad_norm": 1.712775707244873, + "learning_rate": 2e-05, + "loss": 0.07433301, + "step": 17080 + }, + { + "epoch": 34.162, + "grad_norm": 1.1782052516937256, + "learning_rate": 2e-05, + "loss": 0.0498495, + "step": 17081 + }, + { + "epoch": 34.164, + "grad_norm": 1.077620267868042, + "learning_rate": 2e-05, + "loss": 0.040213, + "step": 17082 + }, + { + "epoch": 34.166, + "grad_norm": 1.2808398008346558, + "learning_rate": 2e-05, + "loss": 0.05944657, + "step": 17083 + }, + { + "epoch": 34.168, + "grad_norm": 1.1864197254180908, + "learning_rate": 2e-05, + "loss": 0.05283065, + 
"step": 17084 + }, + { + "epoch": 34.17, + "grad_norm": 1.0761622190475464, + "learning_rate": 2e-05, + "loss": 0.03802584, + "step": 17085 + }, + { + "epoch": 34.172, + "grad_norm": 1.062628149986267, + "learning_rate": 2e-05, + "loss": 0.04319198, + "step": 17086 + }, + { + "epoch": 34.174, + "grad_norm": 1.46955144405365, + "learning_rate": 2e-05, + "loss": 0.04133756, + "step": 17087 + }, + { + "epoch": 34.176, + "grad_norm": 0.9547522068023682, + "learning_rate": 2e-05, + "loss": 0.035832, + "step": 17088 + }, + { + "epoch": 34.178, + "grad_norm": 4.085507869720459, + "learning_rate": 2e-05, + "loss": 0.06185192, + "step": 17089 + }, + { + "epoch": 34.18, + "grad_norm": 2.0465495586395264, + "learning_rate": 2e-05, + "loss": 0.04510322, + "step": 17090 + }, + { + "epoch": 34.182, + "grad_norm": 1.657041311264038, + "learning_rate": 2e-05, + "loss": 0.04181718, + "step": 17091 + }, + { + "epoch": 34.184, + "grad_norm": 1.1236737966537476, + "learning_rate": 2e-05, + "loss": 0.0362424, + "step": 17092 + }, + { + "epoch": 34.186, + "grad_norm": 1.528167724609375, + "learning_rate": 2e-05, + "loss": 0.05361965, + "step": 17093 + }, + { + "epoch": 34.188, + "grad_norm": 0.9988680481910706, + "learning_rate": 2e-05, + "loss": 0.03881805, + "step": 17094 + }, + { + "epoch": 34.19, + "grad_norm": 2.2689661979675293, + "learning_rate": 2e-05, + "loss": 0.04616371, + "step": 17095 + }, + { + "epoch": 34.192, + "grad_norm": 1.069471836090088, + "learning_rate": 2e-05, + "loss": 0.0355745, + "step": 17096 + }, + { + "epoch": 34.194, + "grad_norm": 1.2013444900512695, + "learning_rate": 2e-05, + "loss": 0.04738539, + "step": 17097 + }, + { + "epoch": 34.196, + "grad_norm": 1.4872270822525024, + "learning_rate": 2e-05, + "loss": 0.04246665, + "step": 17098 + }, + { + "epoch": 34.198, + "grad_norm": 1.2898576259613037, + "learning_rate": 2e-05, + "loss": 0.03277687, + "step": 17099 + }, + { + "epoch": 34.2, + "grad_norm": 0.9258760213851929, + "learning_rate": 2e-05, + "loss": 0.02966371, + "step": 17100 + }, + { + "epoch": 34.202, + "grad_norm": 1.5347909927368164, + "learning_rate": 2e-05, + "loss": 0.03460214, + "step": 17101 + }, + { + "epoch": 34.204, + "grad_norm": 1.5886799097061157, + "learning_rate": 2e-05, + "loss": 0.04721684, + "step": 17102 + }, + { + "epoch": 34.206, + "grad_norm": 0.8684136271476746, + "learning_rate": 2e-05, + "loss": 0.03214806, + "step": 17103 + }, + { + "epoch": 34.208, + "grad_norm": 1.1119003295898438, + "learning_rate": 2e-05, + "loss": 0.04176969, + "step": 17104 + }, + { + "epoch": 34.21, + "grad_norm": 1.462074637413025, + "learning_rate": 2e-05, + "loss": 0.04227774, + "step": 17105 + }, + { + "epoch": 34.212, + "grad_norm": 1.4271862506866455, + "learning_rate": 2e-05, + "loss": 0.05438799, + "step": 17106 + }, + { + "epoch": 34.214, + "grad_norm": 1.3217233419418335, + "learning_rate": 2e-05, + "loss": 0.05445388, + "step": 17107 + }, + { + "epoch": 34.216, + "grad_norm": 0.9007492661476135, + "learning_rate": 2e-05, + "loss": 0.02505394, + "step": 17108 + }, + { + "epoch": 34.218, + "grad_norm": 1.0942732095718384, + "learning_rate": 2e-05, + "loss": 0.04544266, + "step": 17109 + }, + { + "epoch": 34.22, + "grad_norm": 1.4460761547088623, + "learning_rate": 2e-05, + "loss": 0.04229202, + "step": 17110 + }, + { + "epoch": 34.222, + "grad_norm": 1.192115306854248, + "learning_rate": 2e-05, + "loss": 0.05149196, + "step": 17111 + }, + { + "epoch": 34.224, + "grad_norm": 1.0671484470367432, + "learning_rate": 2e-05, + "loss": 0.03309963, + "step": 17112 + }, 
+ { + "epoch": 34.226, + "grad_norm": 1.1515556573867798, + "learning_rate": 2e-05, + "loss": 0.04172114, + "step": 17113 + }, + { + "epoch": 34.228, + "grad_norm": 1.0468469858169556, + "learning_rate": 2e-05, + "loss": 0.03374918, + "step": 17114 + }, + { + "epoch": 34.23, + "grad_norm": 1.1221612691879272, + "learning_rate": 2e-05, + "loss": 0.05307683, + "step": 17115 + }, + { + "epoch": 34.232, + "grad_norm": 1.047747015953064, + "learning_rate": 2e-05, + "loss": 0.03866299, + "step": 17116 + }, + { + "epoch": 34.234, + "grad_norm": 0.9740225076675415, + "learning_rate": 2e-05, + "loss": 0.02800692, + "step": 17117 + }, + { + "epoch": 34.236, + "grad_norm": 1.0027884244918823, + "learning_rate": 2e-05, + "loss": 0.01938191, + "step": 17118 + }, + { + "epoch": 34.238, + "grad_norm": 1.1319332122802734, + "learning_rate": 2e-05, + "loss": 0.03879067, + "step": 17119 + }, + { + "epoch": 34.24, + "grad_norm": 1.2376909255981445, + "learning_rate": 2e-05, + "loss": 0.04197739, + "step": 17120 + }, + { + "epoch": 34.242, + "grad_norm": 1.7739689350128174, + "learning_rate": 2e-05, + "loss": 0.0463138, + "step": 17121 + }, + { + "epoch": 34.244, + "grad_norm": 1.1264393329620361, + "learning_rate": 2e-05, + "loss": 0.03141479, + "step": 17122 + }, + { + "epoch": 34.246, + "grad_norm": 1.261618733406067, + "learning_rate": 2e-05, + "loss": 0.03376903, + "step": 17123 + }, + { + "epoch": 34.248, + "grad_norm": 1.123366355895996, + "learning_rate": 2e-05, + "loss": 0.04086906, + "step": 17124 + }, + { + "epoch": 34.25, + "grad_norm": 2.098238229751587, + "learning_rate": 2e-05, + "loss": 0.03711101, + "step": 17125 + }, + { + "epoch": 34.252, + "grad_norm": 1.1595600843429565, + "learning_rate": 2e-05, + "loss": 0.04579591, + "step": 17126 + }, + { + "epoch": 34.254, + "grad_norm": 1.2296733856201172, + "learning_rate": 2e-05, + "loss": 0.05098995, + "step": 17127 + }, + { + "epoch": 34.256, + "grad_norm": 1.07576322555542, + "learning_rate": 2e-05, + "loss": 0.04090093, + "step": 17128 + }, + { + "epoch": 34.258, + "grad_norm": 1.4022080898284912, + "learning_rate": 2e-05, + "loss": 0.05891519, + "step": 17129 + }, + { + "epoch": 34.26, + "grad_norm": 1.2884039878845215, + "learning_rate": 2e-05, + "loss": 0.06018764, + "step": 17130 + }, + { + "epoch": 34.262, + "grad_norm": 1.513996958732605, + "learning_rate": 2e-05, + "loss": 0.05563644, + "step": 17131 + }, + { + "epoch": 34.264, + "grad_norm": 1.0170012712478638, + "learning_rate": 2e-05, + "loss": 0.0308474, + "step": 17132 + }, + { + "epoch": 34.266, + "grad_norm": 1.7678961753845215, + "learning_rate": 2e-05, + "loss": 0.03012414, + "step": 17133 + }, + { + "epoch": 34.268, + "grad_norm": 1.930487036705017, + "learning_rate": 2e-05, + "loss": 0.03921436, + "step": 17134 + }, + { + "epoch": 34.27, + "grad_norm": 1.1381620168685913, + "learning_rate": 2e-05, + "loss": 0.04433856, + "step": 17135 + }, + { + "epoch": 34.272, + "grad_norm": 1.3920421600341797, + "learning_rate": 2e-05, + "loss": 0.04576306, + "step": 17136 + }, + { + "epoch": 34.274, + "grad_norm": 1.132322907447815, + "learning_rate": 2e-05, + "loss": 0.04681329, + "step": 17137 + }, + { + "epoch": 34.276, + "grad_norm": 1.0427021980285645, + "learning_rate": 2e-05, + "loss": 0.03912016, + "step": 17138 + }, + { + "epoch": 34.278, + "grad_norm": 1.3596690893173218, + "learning_rate": 2e-05, + "loss": 0.0395805, + "step": 17139 + }, + { + "epoch": 34.28, + "grad_norm": 1.1399190425872803, + "learning_rate": 2e-05, + "loss": 0.03435064, + "step": 17140 + }, + { + "epoch": 
34.282, + "grad_norm": 1.2819249629974365, + "learning_rate": 2e-05, + "loss": 0.0416284, + "step": 17141 + }, + { + "epoch": 34.284, + "grad_norm": 1.2883572578430176, + "learning_rate": 2e-05, + "loss": 0.02817941, + "step": 17142 + }, + { + "epoch": 34.286, + "grad_norm": 1.449006199836731, + "learning_rate": 2e-05, + "loss": 0.05150205, + "step": 17143 + }, + { + "epoch": 34.288, + "grad_norm": 1.3080250024795532, + "learning_rate": 2e-05, + "loss": 0.03051752, + "step": 17144 + }, + { + "epoch": 34.29, + "grad_norm": 1.1870465278625488, + "learning_rate": 2e-05, + "loss": 0.04249543, + "step": 17145 + }, + { + "epoch": 34.292, + "grad_norm": 1.2269293069839478, + "learning_rate": 2e-05, + "loss": 0.0272982, + "step": 17146 + }, + { + "epoch": 34.294, + "grad_norm": 1.2062244415283203, + "learning_rate": 2e-05, + "loss": 0.04380896, + "step": 17147 + }, + { + "epoch": 34.296, + "grad_norm": 1.1690454483032227, + "learning_rate": 2e-05, + "loss": 0.04224586, + "step": 17148 + }, + { + "epoch": 34.298, + "grad_norm": 1.076251745223999, + "learning_rate": 2e-05, + "loss": 0.04731111, + "step": 17149 + }, + { + "epoch": 34.3, + "grad_norm": 0.9125476479530334, + "learning_rate": 2e-05, + "loss": 0.03293589, + "step": 17150 + }, + { + "epoch": 34.302, + "grad_norm": 1.1172863245010376, + "learning_rate": 2e-05, + "loss": 0.04434405, + "step": 17151 + }, + { + "epoch": 34.304, + "grad_norm": 1.4537452459335327, + "learning_rate": 2e-05, + "loss": 0.05699692, + "step": 17152 + }, + { + "epoch": 34.306, + "grad_norm": 1.2859165668487549, + "learning_rate": 2e-05, + "loss": 0.05222432, + "step": 17153 + }, + { + "epoch": 34.308, + "grad_norm": 1.1391727924346924, + "learning_rate": 2e-05, + "loss": 0.04175641, + "step": 17154 + }, + { + "epoch": 34.31, + "grad_norm": 1.3534083366394043, + "learning_rate": 2e-05, + "loss": 0.04102936, + "step": 17155 + }, + { + "epoch": 34.312, + "grad_norm": 2.2053699493408203, + "learning_rate": 2e-05, + "loss": 0.04537173, + "step": 17156 + }, + { + "epoch": 34.314, + "grad_norm": 1.1330358982086182, + "learning_rate": 2e-05, + "loss": 0.05229243, + "step": 17157 + }, + { + "epoch": 34.316, + "grad_norm": 1.8947052955627441, + "learning_rate": 2e-05, + "loss": 0.02641325, + "step": 17158 + }, + { + "epoch": 34.318, + "grad_norm": 1.3450164794921875, + "learning_rate": 2e-05, + "loss": 0.04416771, + "step": 17159 + }, + { + "epoch": 34.32, + "grad_norm": 2.277355909347534, + "learning_rate": 2e-05, + "loss": 0.03164542, + "step": 17160 + }, + { + "epoch": 34.322, + "grad_norm": 1.2452149391174316, + "learning_rate": 2e-05, + "loss": 0.03742987, + "step": 17161 + }, + { + "epoch": 34.324, + "grad_norm": 1.3950529098510742, + "learning_rate": 2e-05, + "loss": 0.04363611, + "step": 17162 + }, + { + "epoch": 34.326, + "grad_norm": 1.2250491380691528, + "learning_rate": 2e-05, + "loss": 0.04496596, + "step": 17163 + }, + { + "epoch": 34.328, + "grad_norm": 1.1004265546798706, + "learning_rate": 2e-05, + "loss": 0.04556531, + "step": 17164 + }, + { + "epoch": 34.33, + "grad_norm": 1.185380458831787, + "learning_rate": 2e-05, + "loss": 0.0397131, + "step": 17165 + }, + { + "epoch": 34.332, + "grad_norm": 1.1757383346557617, + "learning_rate": 2e-05, + "loss": 0.04187442, + "step": 17166 + }, + { + "epoch": 34.334, + "grad_norm": 1.0646100044250488, + "learning_rate": 2e-05, + "loss": 0.04621556, + "step": 17167 + }, + { + "epoch": 34.336, + "grad_norm": 1.1299614906311035, + "learning_rate": 2e-05, + "loss": 0.05131941, + "step": 17168 + }, + { + "epoch": 34.338, + 
"grad_norm": 1.2977094650268555, + "learning_rate": 2e-05, + "loss": 0.04573498, + "step": 17169 + }, + { + "epoch": 34.34, + "grad_norm": 0.9921538829803467, + "learning_rate": 2e-05, + "loss": 0.03972767, + "step": 17170 + }, + { + "epoch": 34.342, + "grad_norm": 1.0485543012619019, + "learning_rate": 2e-05, + "loss": 0.03102573, + "step": 17171 + }, + { + "epoch": 34.344, + "grad_norm": 3.1085667610168457, + "learning_rate": 2e-05, + "loss": 0.05401374, + "step": 17172 + }, + { + "epoch": 34.346, + "grad_norm": 1.159866213798523, + "learning_rate": 2e-05, + "loss": 0.03029288, + "step": 17173 + }, + { + "epoch": 34.348, + "grad_norm": 1.4898662567138672, + "learning_rate": 2e-05, + "loss": 0.037357, + "step": 17174 + }, + { + "epoch": 34.35, + "grad_norm": 1.475274920463562, + "learning_rate": 2e-05, + "loss": 0.06047164, + "step": 17175 + }, + { + "epoch": 34.352, + "grad_norm": 1.2670931816101074, + "learning_rate": 2e-05, + "loss": 0.04098585, + "step": 17176 + }, + { + "epoch": 34.354, + "grad_norm": 1.1112204790115356, + "learning_rate": 2e-05, + "loss": 0.04078077, + "step": 17177 + }, + { + "epoch": 34.356, + "grad_norm": 1.1330214738845825, + "learning_rate": 2e-05, + "loss": 0.04091879, + "step": 17178 + }, + { + "epoch": 34.358, + "grad_norm": 1.0863505601882935, + "learning_rate": 2e-05, + "loss": 0.04044025, + "step": 17179 + }, + { + "epoch": 34.36, + "grad_norm": 1.4287241697311401, + "learning_rate": 2e-05, + "loss": 0.05269104, + "step": 17180 + }, + { + "epoch": 34.362, + "grad_norm": 1.0420600175857544, + "learning_rate": 2e-05, + "loss": 0.03304556, + "step": 17181 + }, + { + "epoch": 34.364, + "grad_norm": 1.1292872428894043, + "learning_rate": 2e-05, + "loss": 0.04075188, + "step": 17182 + }, + { + "epoch": 34.366, + "grad_norm": 1.257199764251709, + "learning_rate": 2e-05, + "loss": 0.06179626, + "step": 17183 + }, + { + "epoch": 34.368, + "grad_norm": 1.5898911952972412, + "learning_rate": 2e-05, + "loss": 0.06544568, + "step": 17184 + }, + { + "epoch": 34.37, + "grad_norm": 1.1866689920425415, + "learning_rate": 2e-05, + "loss": 0.03154187, + "step": 17185 + }, + { + "epoch": 34.372, + "grad_norm": 1.1588656902313232, + "learning_rate": 2e-05, + "loss": 0.04455046, + "step": 17186 + }, + { + "epoch": 34.374, + "grad_norm": 1.8778082132339478, + "learning_rate": 2e-05, + "loss": 0.05956361, + "step": 17187 + }, + { + "epoch": 34.376, + "grad_norm": 1.0748698711395264, + "learning_rate": 2e-05, + "loss": 0.02959104, + "step": 17188 + }, + { + "epoch": 34.378, + "grad_norm": 1.2204198837280273, + "learning_rate": 2e-05, + "loss": 0.0491489, + "step": 17189 + }, + { + "epoch": 34.38, + "grad_norm": 1.1138110160827637, + "learning_rate": 2e-05, + "loss": 0.04551839, + "step": 17190 + }, + { + "epoch": 34.382, + "grad_norm": 1.057921051979065, + "learning_rate": 2e-05, + "loss": 0.04800026, + "step": 17191 + }, + { + "epoch": 34.384, + "grad_norm": 0.97761470079422, + "learning_rate": 2e-05, + "loss": 0.03815104, + "step": 17192 + }, + { + "epoch": 34.386, + "grad_norm": 1.2259156703948975, + "learning_rate": 2e-05, + "loss": 0.03457344, + "step": 17193 + }, + { + "epoch": 34.388, + "grad_norm": 1.6335809230804443, + "learning_rate": 2e-05, + "loss": 0.05803145, + "step": 17194 + }, + { + "epoch": 34.39, + "grad_norm": 1.134580135345459, + "learning_rate": 2e-05, + "loss": 0.03817577, + "step": 17195 + }, + { + "epoch": 34.392, + "grad_norm": 1.2042521238327026, + "learning_rate": 2e-05, + "loss": 0.04776943, + "step": 17196 + }, + { + "epoch": 34.394, + "grad_norm": 
0.9557961225509644, + "learning_rate": 2e-05, + "loss": 0.02998234, + "step": 17197 + }, + { + "epoch": 34.396, + "grad_norm": 1.669502854347229, + "learning_rate": 2e-05, + "loss": 0.05116883, + "step": 17198 + }, + { + "epoch": 34.398, + "grad_norm": 1.938193917274475, + "learning_rate": 2e-05, + "loss": 0.04275124, + "step": 17199 + }, + { + "epoch": 34.4, + "grad_norm": 0.95672208070755, + "learning_rate": 2e-05, + "loss": 0.02722719, + "step": 17200 + }, + { + "epoch": 34.402, + "grad_norm": 1.6504557132720947, + "learning_rate": 2e-05, + "loss": 0.04932757, + "step": 17201 + }, + { + "epoch": 34.404, + "grad_norm": 1.3142409324645996, + "learning_rate": 2e-05, + "loss": 0.05078411, + "step": 17202 + }, + { + "epoch": 34.406, + "grad_norm": 1.8775172233581543, + "learning_rate": 2e-05, + "loss": 0.0415501, + "step": 17203 + }, + { + "epoch": 34.408, + "grad_norm": 1.108086347579956, + "learning_rate": 2e-05, + "loss": 0.03562623, + "step": 17204 + }, + { + "epoch": 34.41, + "grad_norm": 1.1936184167861938, + "learning_rate": 2e-05, + "loss": 0.06731561, + "step": 17205 + }, + { + "epoch": 34.412, + "grad_norm": 1.4935089349746704, + "learning_rate": 2e-05, + "loss": 0.04526928, + "step": 17206 + }, + { + "epoch": 34.414, + "grad_norm": 0.9472814798355103, + "learning_rate": 2e-05, + "loss": 0.03531752, + "step": 17207 + }, + { + "epoch": 34.416, + "grad_norm": 1.1096245050430298, + "learning_rate": 2e-05, + "loss": 0.03912827, + "step": 17208 + }, + { + "epoch": 34.418, + "grad_norm": 1.2029078006744385, + "learning_rate": 2e-05, + "loss": 0.05240467, + "step": 17209 + }, + { + "epoch": 34.42, + "grad_norm": 1.08064603805542, + "learning_rate": 2e-05, + "loss": 0.04623947, + "step": 17210 + }, + { + "epoch": 34.422, + "grad_norm": 1.3892536163330078, + "learning_rate": 2e-05, + "loss": 0.04560886, + "step": 17211 + }, + { + "epoch": 34.424, + "grad_norm": 1.2040435075759888, + "learning_rate": 2e-05, + "loss": 0.03983107, + "step": 17212 + }, + { + "epoch": 34.426, + "grad_norm": 1.0492370128631592, + "learning_rate": 2e-05, + "loss": 0.03636064, + "step": 17213 + }, + { + "epoch": 34.428, + "grad_norm": 1.1657490730285645, + "learning_rate": 2e-05, + "loss": 0.04331504, + "step": 17214 + }, + { + "epoch": 34.43, + "grad_norm": 1.2659703493118286, + "learning_rate": 2e-05, + "loss": 0.05003392, + "step": 17215 + }, + { + "epoch": 34.432, + "grad_norm": 1.3494330644607544, + "learning_rate": 2e-05, + "loss": 0.02932413, + "step": 17216 + }, + { + "epoch": 34.434, + "grad_norm": 1.175642490386963, + "learning_rate": 2e-05, + "loss": 0.04383037, + "step": 17217 + }, + { + "epoch": 34.436, + "grad_norm": 1.0933257341384888, + "learning_rate": 2e-05, + "loss": 0.03921406, + "step": 17218 + }, + { + "epoch": 34.438, + "grad_norm": 1.1617385149002075, + "learning_rate": 2e-05, + "loss": 0.04727744, + "step": 17219 + }, + { + "epoch": 34.44, + "grad_norm": 1.1515668630599976, + "learning_rate": 2e-05, + "loss": 0.03612133, + "step": 17220 + }, + { + "epoch": 34.442, + "grad_norm": 1.1153593063354492, + "learning_rate": 2e-05, + "loss": 0.03581006, + "step": 17221 + }, + { + "epoch": 34.444, + "grad_norm": 1.1779838800430298, + "learning_rate": 2e-05, + "loss": 0.03837469, + "step": 17222 + }, + { + "epoch": 34.446, + "grad_norm": 1.257264256477356, + "learning_rate": 2e-05, + "loss": 0.04428399, + "step": 17223 + }, + { + "epoch": 34.448, + "grad_norm": 1.1216603517532349, + "learning_rate": 2e-05, + "loss": 0.03815703, + "step": 17224 + }, + { + "epoch": 34.45, + "grad_norm": 
1.513350248336792, + "learning_rate": 2e-05, + "loss": 0.05262036, + "step": 17225 + }, + { + "epoch": 34.452, + "grad_norm": 1.668152093887329, + "learning_rate": 2e-05, + "loss": 0.04921922, + "step": 17226 + }, + { + "epoch": 34.454, + "grad_norm": 1.259291648864746, + "learning_rate": 2e-05, + "loss": 0.0468533, + "step": 17227 + }, + { + "epoch": 34.456, + "grad_norm": 2.0547330379486084, + "learning_rate": 2e-05, + "loss": 0.04505831, + "step": 17228 + }, + { + "epoch": 34.458, + "grad_norm": 1.0175834894180298, + "learning_rate": 2e-05, + "loss": 0.03889389, + "step": 17229 + }, + { + "epoch": 34.46, + "grad_norm": 1.205269694328308, + "learning_rate": 2e-05, + "loss": 0.05405382, + "step": 17230 + }, + { + "epoch": 34.462, + "grad_norm": 1.1954044103622437, + "learning_rate": 2e-05, + "loss": 0.06115945, + "step": 17231 + }, + { + "epoch": 34.464, + "grad_norm": 1.5559508800506592, + "learning_rate": 2e-05, + "loss": 0.04823997, + "step": 17232 + }, + { + "epoch": 34.466, + "grad_norm": 1.123988151550293, + "learning_rate": 2e-05, + "loss": 0.04282939, + "step": 17233 + }, + { + "epoch": 34.468, + "grad_norm": 1.5351160764694214, + "learning_rate": 2e-05, + "loss": 0.04434042, + "step": 17234 + }, + { + "epoch": 34.47, + "grad_norm": 1.1878026723861694, + "learning_rate": 2e-05, + "loss": 0.04576051, + "step": 17235 + }, + { + "epoch": 34.472, + "grad_norm": 1.456591248512268, + "learning_rate": 2e-05, + "loss": 0.04699751, + "step": 17236 + }, + { + "epoch": 34.474, + "grad_norm": 1.0269306898117065, + "learning_rate": 2e-05, + "loss": 0.04063164, + "step": 17237 + }, + { + "epoch": 34.476, + "grad_norm": 1.0819087028503418, + "learning_rate": 2e-05, + "loss": 0.04128548, + "step": 17238 + }, + { + "epoch": 34.478, + "grad_norm": 1.8286333084106445, + "learning_rate": 2e-05, + "loss": 0.04910714, + "step": 17239 + }, + { + "epoch": 34.48, + "grad_norm": 3.0177881717681885, + "learning_rate": 2e-05, + "loss": 0.05242274, + "step": 17240 + }, + { + "epoch": 34.482, + "grad_norm": 1.391753911972046, + "learning_rate": 2e-05, + "loss": 0.04662409, + "step": 17241 + }, + { + "epoch": 34.484, + "grad_norm": 1.5044915676116943, + "learning_rate": 2e-05, + "loss": 0.03636048, + "step": 17242 + }, + { + "epoch": 34.486, + "grad_norm": 1.38352370262146, + "learning_rate": 2e-05, + "loss": 0.04130649, + "step": 17243 + }, + { + "epoch": 34.488, + "grad_norm": 1.027741551399231, + "learning_rate": 2e-05, + "loss": 0.04034343, + "step": 17244 + }, + { + "epoch": 34.49, + "grad_norm": 1.7311643362045288, + "learning_rate": 2e-05, + "loss": 0.03252632, + "step": 17245 + }, + { + "epoch": 34.492, + "grad_norm": 0.9958137273788452, + "learning_rate": 2e-05, + "loss": 0.03432687, + "step": 17246 + }, + { + "epoch": 34.494, + "grad_norm": 0.9350914359092712, + "learning_rate": 2e-05, + "loss": 0.04081878, + "step": 17247 + }, + { + "epoch": 34.496, + "grad_norm": 1.0058718919754028, + "learning_rate": 2e-05, + "loss": 0.03128472, + "step": 17248 + }, + { + "epoch": 34.498, + "grad_norm": 1.1736693382263184, + "learning_rate": 2e-05, + "loss": 0.0507132, + "step": 17249 + }, + { + "epoch": 34.5, + "grad_norm": 2.9509620666503906, + "learning_rate": 2e-05, + "loss": 0.04513638, + "step": 17250 + }, + { + "epoch": 34.502, + "grad_norm": 1.1163856983184814, + "learning_rate": 2e-05, + "loss": 0.02961075, + "step": 17251 + }, + { + "epoch": 34.504, + "grad_norm": 1.2255582809448242, + "learning_rate": 2e-05, + "loss": 0.05628613, + "step": 17252 + }, + { + "epoch": 34.506, + "grad_norm": 
1.1547157764434814, + "learning_rate": 2e-05, + "loss": 0.04185841, + "step": 17253 + }, + { + "epoch": 34.508, + "grad_norm": 1.0533969402313232, + "learning_rate": 2e-05, + "loss": 0.02807963, + "step": 17254 + }, + { + "epoch": 34.51, + "grad_norm": 1.1036581993103027, + "learning_rate": 2e-05, + "loss": 0.04609573, + "step": 17255 + }, + { + "epoch": 34.512, + "grad_norm": 1.2565536499023438, + "learning_rate": 2e-05, + "loss": 0.0547157, + "step": 17256 + }, + { + "epoch": 34.514, + "grad_norm": 1.1017217636108398, + "learning_rate": 2e-05, + "loss": 0.05001441, + "step": 17257 + }, + { + "epoch": 34.516, + "grad_norm": 1.0183624029159546, + "learning_rate": 2e-05, + "loss": 0.03306765, + "step": 17258 + }, + { + "epoch": 34.518, + "grad_norm": 1.5634461641311646, + "learning_rate": 2e-05, + "loss": 0.05358353, + "step": 17259 + }, + { + "epoch": 34.52, + "grad_norm": 0.8560883402824402, + "learning_rate": 2e-05, + "loss": 0.02674874, + "step": 17260 + }, + { + "epoch": 34.522, + "grad_norm": 1.7791016101837158, + "learning_rate": 2e-05, + "loss": 0.05959473, + "step": 17261 + }, + { + "epoch": 34.524, + "grad_norm": 1.059211015701294, + "learning_rate": 2e-05, + "loss": 0.04623821, + "step": 17262 + }, + { + "epoch": 34.526, + "grad_norm": 2.1442856788635254, + "learning_rate": 2e-05, + "loss": 0.03975558, + "step": 17263 + }, + { + "epoch": 34.528, + "grad_norm": 1.1825506687164307, + "learning_rate": 2e-05, + "loss": 0.04681484, + "step": 17264 + }, + { + "epoch": 34.53, + "grad_norm": 1.1355949640274048, + "learning_rate": 2e-05, + "loss": 0.04300448, + "step": 17265 + }, + { + "epoch": 34.532, + "grad_norm": 1.151884913444519, + "learning_rate": 2e-05, + "loss": 0.05281693, + "step": 17266 + }, + { + "epoch": 34.534, + "grad_norm": 1.244160532951355, + "learning_rate": 2e-05, + "loss": 0.05270495, + "step": 17267 + }, + { + "epoch": 34.536, + "grad_norm": 1.332995057106018, + "learning_rate": 2e-05, + "loss": 0.04768827, + "step": 17268 + }, + { + "epoch": 34.538, + "grad_norm": 1.1446064710617065, + "learning_rate": 2e-05, + "loss": 0.03133503, + "step": 17269 + }, + { + "epoch": 34.54, + "grad_norm": 1.1863194704055786, + "learning_rate": 2e-05, + "loss": 0.05849977, + "step": 17270 + }, + { + "epoch": 34.542, + "grad_norm": 1.2308465242385864, + "learning_rate": 2e-05, + "loss": 0.06080481, + "step": 17271 + }, + { + "epoch": 34.544, + "grad_norm": 2.3571889400482178, + "learning_rate": 2e-05, + "loss": 0.05764741, + "step": 17272 + }, + { + "epoch": 34.546, + "grad_norm": 0.713600218296051, + "learning_rate": 2e-05, + "loss": 0.0190373, + "step": 17273 + }, + { + "epoch": 34.548, + "grad_norm": 3.5225939750671387, + "learning_rate": 2e-05, + "loss": 0.05001704, + "step": 17274 + }, + { + "epoch": 34.55, + "grad_norm": 2.5222723484039307, + "learning_rate": 2e-05, + "loss": 0.03726556, + "step": 17275 + }, + { + "epoch": 34.552, + "grad_norm": 1.0584102869033813, + "learning_rate": 2e-05, + "loss": 0.04141295, + "step": 17276 + }, + { + "epoch": 34.554, + "grad_norm": 1.0614216327667236, + "learning_rate": 2e-05, + "loss": 0.04041729, + "step": 17277 + }, + { + "epoch": 34.556, + "grad_norm": 0.8729689121246338, + "learning_rate": 2e-05, + "loss": 0.03110387, + "step": 17278 + }, + { + "epoch": 34.558, + "grad_norm": 1.013295292854309, + "learning_rate": 2e-05, + "loss": 0.03849518, + "step": 17279 + }, + { + "epoch": 34.56, + "grad_norm": 1.2050495147705078, + "learning_rate": 2e-05, + "loss": 0.04337199, + "step": 17280 + }, + { + "epoch": 34.562, + "grad_norm": 
1.1820077896118164, + "learning_rate": 2e-05, + "loss": 0.04693332, + "step": 17281 + }, + { + "epoch": 34.564, + "grad_norm": 1.015336036682129, + "learning_rate": 2e-05, + "loss": 0.0314662, + "step": 17282 + }, + { + "epoch": 34.566, + "grad_norm": 1.1241815090179443, + "learning_rate": 2e-05, + "loss": 0.04584487, + "step": 17283 + }, + { + "epoch": 34.568, + "grad_norm": 1.4243446588516235, + "learning_rate": 2e-05, + "loss": 0.04570295, + "step": 17284 + }, + { + "epoch": 34.57, + "grad_norm": 2.7489190101623535, + "learning_rate": 2e-05, + "loss": 0.04503506, + "step": 17285 + }, + { + "epoch": 34.572, + "grad_norm": 1.1095118522644043, + "learning_rate": 2e-05, + "loss": 0.04129759, + "step": 17286 + }, + { + "epoch": 34.574, + "grad_norm": 1.1222656965255737, + "learning_rate": 2e-05, + "loss": 0.04352672, + "step": 17287 + }, + { + "epoch": 34.576, + "grad_norm": 1.050182580947876, + "learning_rate": 2e-05, + "loss": 0.04432563, + "step": 17288 + }, + { + "epoch": 34.578, + "grad_norm": 1.1200228929519653, + "learning_rate": 2e-05, + "loss": 0.03937971, + "step": 17289 + }, + { + "epoch": 34.58, + "grad_norm": 1.0767937898635864, + "learning_rate": 2e-05, + "loss": 0.04535668, + "step": 17290 + }, + { + "epoch": 34.582, + "grad_norm": 1.556679129600525, + "learning_rate": 2e-05, + "loss": 0.05495925, + "step": 17291 + }, + { + "epoch": 34.584, + "grad_norm": 0.9800344109535217, + "learning_rate": 2e-05, + "loss": 0.03302306, + "step": 17292 + }, + { + "epoch": 34.586, + "grad_norm": 1.1676750183105469, + "learning_rate": 2e-05, + "loss": 0.04421997, + "step": 17293 + }, + { + "epoch": 34.588, + "grad_norm": 1.9742448329925537, + "learning_rate": 2e-05, + "loss": 0.05324573, + "step": 17294 + }, + { + "epoch": 34.59, + "grad_norm": 1.217177152633667, + "learning_rate": 2e-05, + "loss": 0.03502582, + "step": 17295 + }, + { + "epoch": 34.592, + "grad_norm": 1.535559058189392, + "learning_rate": 2e-05, + "loss": 0.05487555, + "step": 17296 + }, + { + "epoch": 34.594, + "grad_norm": 1.073294758796692, + "learning_rate": 2e-05, + "loss": 0.04047548, + "step": 17297 + }, + { + "epoch": 34.596, + "grad_norm": 1.468457579612732, + "learning_rate": 2e-05, + "loss": 0.03949284, + "step": 17298 + }, + { + "epoch": 34.598, + "grad_norm": 1.1987510919570923, + "learning_rate": 2e-05, + "loss": 0.038359, + "step": 17299 + }, + { + "epoch": 34.6, + "grad_norm": 1.8533692359924316, + "learning_rate": 2e-05, + "loss": 0.04139744, + "step": 17300 + }, + { + "epoch": 34.602, + "grad_norm": 0.9457276463508606, + "learning_rate": 2e-05, + "loss": 0.03080427, + "step": 17301 + }, + { + "epoch": 34.604, + "grad_norm": 1.3518832921981812, + "learning_rate": 2e-05, + "loss": 0.04552808, + "step": 17302 + }, + { + "epoch": 34.606, + "grad_norm": 1.7731199264526367, + "learning_rate": 2e-05, + "loss": 0.03314227, + "step": 17303 + }, + { + "epoch": 34.608, + "grad_norm": 1.6330634355545044, + "learning_rate": 2e-05, + "loss": 0.04139177, + "step": 17304 + }, + { + "epoch": 34.61, + "grad_norm": 1.111138939857483, + "learning_rate": 2e-05, + "loss": 0.03105775, + "step": 17305 + }, + { + "epoch": 34.612, + "grad_norm": 1.2260620594024658, + "learning_rate": 2e-05, + "loss": 0.0478155, + "step": 17306 + }, + { + "epoch": 34.614, + "grad_norm": 1.1791749000549316, + "learning_rate": 2e-05, + "loss": 0.04237508, + "step": 17307 + }, + { + "epoch": 34.616, + "grad_norm": 0.9272366166114807, + "learning_rate": 2e-05, + "loss": 0.03472099, + "step": 17308 + }, + { + "epoch": 34.618, + "grad_norm": 
1.0389100313186646, + "learning_rate": 2e-05, + "loss": 0.03994408, + "step": 17309 + }, + { + "epoch": 34.62, + "grad_norm": 1.1838860511779785, + "learning_rate": 2e-05, + "loss": 0.03435428, + "step": 17310 + }, + { + "epoch": 34.622, + "grad_norm": 1.240273356437683, + "learning_rate": 2e-05, + "loss": 0.04923018, + "step": 17311 + }, + { + "epoch": 34.624, + "grad_norm": 1.0485180616378784, + "learning_rate": 2e-05, + "loss": 0.02882761, + "step": 17312 + }, + { + "epoch": 34.626, + "grad_norm": 1.092466950416565, + "learning_rate": 2e-05, + "loss": 0.02998949, + "step": 17313 + }, + { + "epoch": 34.628, + "grad_norm": 2.4525039196014404, + "learning_rate": 2e-05, + "loss": 0.05779274, + "step": 17314 + }, + { + "epoch": 34.63, + "grad_norm": 1.1219778060913086, + "learning_rate": 2e-05, + "loss": 0.03857387, + "step": 17315 + }, + { + "epoch": 34.632, + "grad_norm": 1.2241005897521973, + "learning_rate": 2e-05, + "loss": 0.04050406, + "step": 17316 + }, + { + "epoch": 34.634, + "grad_norm": 1.1506431102752686, + "learning_rate": 2e-05, + "loss": 0.04586707, + "step": 17317 + }, + { + "epoch": 34.636, + "grad_norm": 1.113982915878296, + "learning_rate": 2e-05, + "loss": 0.04473947, + "step": 17318 + }, + { + "epoch": 34.638, + "grad_norm": 1.2651467323303223, + "learning_rate": 2e-05, + "loss": 0.04464104, + "step": 17319 + }, + { + "epoch": 34.64, + "grad_norm": 1.0130887031555176, + "learning_rate": 2e-05, + "loss": 0.03748289, + "step": 17320 + }, + { + "epoch": 34.642, + "grad_norm": 1.977115273475647, + "learning_rate": 2e-05, + "loss": 0.0529108, + "step": 17321 + }, + { + "epoch": 34.644, + "grad_norm": 1.0311118364334106, + "learning_rate": 2e-05, + "loss": 0.05024704, + "step": 17322 + }, + { + "epoch": 34.646, + "grad_norm": 1.7532882690429688, + "learning_rate": 2e-05, + "loss": 0.05759249, + "step": 17323 + }, + { + "epoch": 34.648, + "grad_norm": 2.038602828979492, + "learning_rate": 2e-05, + "loss": 0.03318835, + "step": 17324 + }, + { + "epoch": 34.65, + "grad_norm": 1.232222557067871, + "learning_rate": 2e-05, + "loss": 0.04337482, + "step": 17325 + }, + { + "epoch": 34.652, + "grad_norm": 1.3680845499038696, + "learning_rate": 2e-05, + "loss": 0.02841615, + "step": 17326 + }, + { + "epoch": 34.654, + "grad_norm": 1.8066096305847168, + "learning_rate": 2e-05, + "loss": 0.03865731, + "step": 17327 + }, + { + "epoch": 34.656, + "grad_norm": 1.0195194482803345, + "learning_rate": 2e-05, + "loss": 0.03887212, + "step": 17328 + }, + { + "epoch": 34.658, + "grad_norm": 1.2496850490570068, + "learning_rate": 2e-05, + "loss": 0.04003082, + "step": 17329 + }, + { + "epoch": 34.66, + "grad_norm": 1.1898289918899536, + "learning_rate": 2e-05, + "loss": 0.03792207, + "step": 17330 + }, + { + "epoch": 34.662, + "grad_norm": 2.7598717212677, + "learning_rate": 2e-05, + "loss": 0.04439123, + "step": 17331 + }, + { + "epoch": 34.664, + "grad_norm": 1.5496615171432495, + "learning_rate": 2e-05, + "loss": 0.05002048, + "step": 17332 + }, + { + "epoch": 34.666, + "grad_norm": 1.36111581325531, + "learning_rate": 2e-05, + "loss": 0.03431807, + "step": 17333 + }, + { + "epoch": 34.668, + "grad_norm": 1.1194837093353271, + "learning_rate": 2e-05, + "loss": 0.04525601, + "step": 17334 + }, + { + "epoch": 34.67, + "grad_norm": 2.9787824153900146, + "learning_rate": 2e-05, + "loss": 0.05164247, + "step": 17335 + }, + { + "epoch": 34.672, + "grad_norm": 1.0257699489593506, + "learning_rate": 2e-05, + "loss": 0.03524974, + "step": 17336 + }, + { + "epoch": 34.674, + "grad_norm": 
1.438684105873108, + "learning_rate": 2e-05, + "loss": 0.03349183, + "step": 17337 + }, + { + "epoch": 34.676, + "grad_norm": 1.544499397277832, + "learning_rate": 2e-05, + "loss": 0.04704629, + "step": 17338 + }, + { + "epoch": 34.678, + "grad_norm": 1.3472787141799927, + "learning_rate": 2e-05, + "loss": 0.04309006, + "step": 17339 + }, + { + "epoch": 34.68, + "grad_norm": 1.8793655633926392, + "learning_rate": 2e-05, + "loss": 0.05061891, + "step": 17340 + }, + { + "epoch": 34.682, + "grad_norm": 1.2611889839172363, + "learning_rate": 2e-05, + "loss": 0.05567399, + "step": 17341 + }, + { + "epoch": 34.684, + "grad_norm": 1.5432864427566528, + "learning_rate": 2e-05, + "loss": 0.03108479, + "step": 17342 + }, + { + "epoch": 34.686, + "grad_norm": 1.2350441217422485, + "learning_rate": 2e-05, + "loss": 0.03987078, + "step": 17343 + }, + { + "epoch": 34.688, + "grad_norm": 1.0708898305892944, + "learning_rate": 2e-05, + "loss": 0.03587945, + "step": 17344 + }, + { + "epoch": 34.69, + "grad_norm": 1.1190636157989502, + "learning_rate": 2e-05, + "loss": 0.04188859, + "step": 17345 + }, + { + "epoch": 34.692, + "grad_norm": 1.3909366130828857, + "learning_rate": 2e-05, + "loss": 0.0505821, + "step": 17346 + }, + { + "epoch": 34.694, + "grad_norm": 1.1136740446090698, + "learning_rate": 2e-05, + "loss": 0.04561438, + "step": 17347 + }, + { + "epoch": 34.696, + "grad_norm": 2.012559413909912, + "learning_rate": 2e-05, + "loss": 0.04886289, + "step": 17348 + }, + { + "epoch": 34.698, + "grad_norm": 1.2638075351715088, + "learning_rate": 2e-05, + "loss": 0.04957847, + "step": 17349 + }, + { + "epoch": 34.7, + "grad_norm": 1.1630687713623047, + "learning_rate": 2e-05, + "loss": 0.04921006, + "step": 17350 + }, + { + "epoch": 34.702, + "grad_norm": 1.2960374355316162, + "learning_rate": 2e-05, + "loss": 0.03544647, + "step": 17351 + }, + { + "epoch": 34.704, + "grad_norm": 1.0430259704589844, + "learning_rate": 2e-05, + "loss": 0.04330702, + "step": 17352 + }, + { + "epoch": 34.706, + "grad_norm": 1.0777888298034668, + "learning_rate": 2e-05, + "loss": 0.03540637, + "step": 17353 + }, + { + "epoch": 34.708, + "grad_norm": 1.176332712173462, + "learning_rate": 2e-05, + "loss": 0.03709304, + "step": 17354 + }, + { + "epoch": 34.71, + "grad_norm": 1.5132179260253906, + "learning_rate": 2e-05, + "loss": 0.05928625, + "step": 17355 + }, + { + "epoch": 34.712, + "grad_norm": 1.0462723970413208, + "learning_rate": 2e-05, + "loss": 0.04137703, + "step": 17356 + }, + { + "epoch": 34.714, + "grad_norm": 1.0485808849334717, + "learning_rate": 2e-05, + "loss": 0.04173344, + "step": 17357 + }, + { + "epoch": 34.716, + "grad_norm": 0.9149652719497681, + "learning_rate": 2e-05, + "loss": 0.03566572, + "step": 17358 + }, + { + "epoch": 34.718, + "grad_norm": 2.1620941162109375, + "learning_rate": 2e-05, + "loss": 0.05247457, + "step": 17359 + }, + { + "epoch": 34.72, + "grad_norm": 1.1164774894714355, + "learning_rate": 2e-05, + "loss": 0.04975937, + "step": 17360 + }, + { + "epoch": 34.722, + "grad_norm": 1.0158329010009766, + "learning_rate": 2e-05, + "loss": 0.04216751, + "step": 17361 + }, + { + "epoch": 34.724, + "grad_norm": 1.7261289358139038, + "learning_rate": 2e-05, + "loss": 0.06181233, + "step": 17362 + }, + { + "epoch": 34.726, + "grad_norm": 0.7953335046768188, + "learning_rate": 2e-05, + "loss": 0.02627176, + "step": 17363 + }, + { + "epoch": 34.728, + "grad_norm": 1.0371417999267578, + "learning_rate": 2e-05, + "loss": 0.03395245, + "step": 17364 + }, + { + "epoch": 34.73, + "grad_norm": 
1.1978789567947388, + "learning_rate": 2e-05, + "loss": 0.03813355, + "step": 17365 + }, + { + "epoch": 34.732, + "grad_norm": 1.0728747844696045, + "learning_rate": 2e-05, + "loss": 0.0317129, + "step": 17366 + }, + { + "epoch": 34.734, + "grad_norm": 1.3377137184143066, + "learning_rate": 2e-05, + "loss": 0.04749308, + "step": 17367 + }, + { + "epoch": 34.736, + "grad_norm": 1.5253896713256836, + "learning_rate": 2e-05, + "loss": 0.04517586, + "step": 17368 + }, + { + "epoch": 34.738, + "grad_norm": 1.125253677368164, + "learning_rate": 2e-05, + "loss": 0.04429846, + "step": 17369 + }, + { + "epoch": 34.74, + "grad_norm": 1.279826283454895, + "learning_rate": 2e-05, + "loss": 0.04692722, + "step": 17370 + }, + { + "epoch": 34.742, + "grad_norm": 1.15207040309906, + "learning_rate": 2e-05, + "loss": 0.03784764, + "step": 17371 + }, + { + "epoch": 34.744, + "grad_norm": 1.4558398723602295, + "learning_rate": 2e-05, + "loss": 0.05303413, + "step": 17372 + }, + { + "epoch": 34.746, + "grad_norm": 2.0725667476654053, + "learning_rate": 2e-05, + "loss": 0.03780486, + "step": 17373 + }, + { + "epoch": 34.748, + "grad_norm": 1.101965069770813, + "learning_rate": 2e-05, + "loss": 0.0325265, + "step": 17374 + }, + { + "epoch": 34.75, + "grad_norm": 1.2319386005401611, + "learning_rate": 2e-05, + "loss": 0.05028831, + "step": 17375 + }, + { + "epoch": 34.752, + "grad_norm": 0.9592969417572021, + "learning_rate": 2e-05, + "loss": 0.03598889, + "step": 17376 + }, + { + "epoch": 34.754, + "grad_norm": 1.2435072660446167, + "learning_rate": 2e-05, + "loss": 0.04904407, + "step": 17377 + }, + { + "epoch": 34.756, + "grad_norm": 1.0948567390441895, + "learning_rate": 2e-05, + "loss": 0.03987746, + "step": 17378 + }, + { + "epoch": 34.758, + "grad_norm": 1.072523593902588, + "learning_rate": 2e-05, + "loss": 0.04469089, + "step": 17379 + }, + { + "epoch": 34.76, + "grad_norm": 4.93184757232666, + "learning_rate": 2e-05, + "loss": 0.05690203, + "step": 17380 + }, + { + "epoch": 34.762, + "grad_norm": 1.764129638671875, + "learning_rate": 2e-05, + "loss": 0.06511163, + "step": 17381 + }, + { + "epoch": 34.764, + "grad_norm": 4.052314758300781, + "learning_rate": 2e-05, + "loss": 0.05666729, + "step": 17382 + }, + { + "epoch": 34.766, + "grad_norm": 1.1972640752792358, + "learning_rate": 2e-05, + "loss": 0.04556664, + "step": 17383 + }, + { + "epoch": 34.768, + "grad_norm": 1.0919245481491089, + "learning_rate": 2e-05, + "loss": 0.03124575, + "step": 17384 + }, + { + "epoch": 34.77, + "grad_norm": 1.072306513786316, + "learning_rate": 2e-05, + "loss": 0.03128473, + "step": 17385 + }, + { + "epoch": 34.772, + "grad_norm": 1.295246958732605, + "learning_rate": 2e-05, + "loss": 0.05659188, + "step": 17386 + }, + { + "epoch": 34.774, + "grad_norm": 0.5764150619506836, + "learning_rate": 2e-05, + "loss": 0.01222921, + "step": 17387 + }, + { + "epoch": 34.776, + "grad_norm": 2.142376661300659, + "learning_rate": 2e-05, + "loss": 0.05650788, + "step": 17388 + }, + { + "epoch": 34.778, + "grad_norm": 1.199610710144043, + "learning_rate": 2e-05, + "loss": 0.03026199, + "step": 17389 + }, + { + "epoch": 34.78, + "grad_norm": 4.657124996185303, + "learning_rate": 2e-05, + "loss": 0.05358446, + "step": 17390 + }, + { + "epoch": 34.782, + "grad_norm": 1.857397198677063, + "learning_rate": 2e-05, + "loss": 0.04083439, + "step": 17391 + }, + { + "epoch": 34.784, + "grad_norm": 1.1742832660675049, + "learning_rate": 2e-05, + "loss": 0.04639963, + "step": 17392 + }, + { + "epoch": 34.786, + "grad_norm": 1.3055615425109863, + 
"learning_rate": 2e-05, + "loss": 0.03700022, + "step": 17393 + }, + { + "epoch": 34.788, + "grad_norm": 1.1897599697113037, + "learning_rate": 2e-05, + "loss": 0.03541516, + "step": 17394 + }, + { + "epoch": 34.79, + "grad_norm": 0.9236932992935181, + "learning_rate": 2e-05, + "loss": 0.02803895, + "step": 17395 + }, + { + "epoch": 34.792, + "grad_norm": 1.2352205514907837, + "learning_rate": 2e-05, + "loss": 0.05203598, + "step": 17396 + }, + { + "epoch": 34.794, + "grad_norm": 1.137605905532837, + "learning_rate": 2e-05, + "loss": 0.04387431, + "step": 17397 + }, + { + "epoch": 34.796, + "grad_norm": 1.4612951278686523, + "learning_rate": 2e-05, + "loss": 0.03690141, + "step": 17398 + }, + { + "epoch": 34.798, + "grad_norm": 1.1920640468597412, + "learning_rate": 2e-05, + "loss": 0.03032911, + "step": 17399 + }, + { + "epoch": 34.8, + "grad_norm": 1.4039093255996704, + "learning_rate": 2e-05, + "loss": 0.04082662, + "step": 17400 + }, + { + "epoch": 34.802, + "grad_norm": 1.3465620279312134, + "learning_rate": 2e-05, + "loss": 0.04480909, + "step": 17401 + }, + { + "epoch": 34.804, + "grad_norm": 1.015446424484253, + "learning_rate": 2e-05, + "loss": 0.04655547, + "step": 17402 + }, + { + "epoch": 34.806, + "grad_norm": 1.3519433736801147, + "learning_rate": 2e-05, + "loss": 0.0522088, + "step": 17403 + }, + { + "epoch": 34.808, + "grad_norm": 1.9576835632324219, + "learning_rate": 2e-05, + "loss": 0.04513229, + "step": 17404 + }, + { + "epoch": 34.81, + "grad_norm": 1.3243753910064697, + "learning_rate": 2e-05, + "loss": 0.0440766, + "step": 17405 + }, + { + "epoch": 34.812, + "grad_norm": 2.486659526824951, + "learning_rate": 2e-05, + "loss": 0.04024377, + "step": 17406 + }, + { + "epoch": 34.814, + "grad_norm": 1.11258065700531, + "learning_rate": 2e-05, + "loss": 0.03342846, + "step": 17407 + }, + { + "epoch": 34.816, + "grad_norm": 1.34394109249115, + "learning_rate": 2e-05, + "loss": 0.0359684, + "step": 17408 + }, + { + "epoch": 34.818, + "grad_norm": 1.8080177307128906, + "learning_rate": 2e-05, + "loss": 0.04266899, + "step": 17409 + }, + { + "epoch": 34.82, + "grad_norm": 0.9578447937965393, + "learning_rate": 2e-05, + "loss": 0.02569779, + "step": 17410 + }, + { + "epoch": 34.822, + "grad_norm": 1.6380552053451538, + "learning_rate": 2e-05, + "loss": 0.04872344, + "step": 17411 + }, + { + "epoch": 34.824, + "grad_norm": 1.7012923955917358, + "learning_rate": 2e-05, + "loss": 0.03796242, + "step": 17412 + }, + { + "epoch": 34.826, + "grad_norm": 1.7097322940826416, + "learning_rate": 2e-05, + "loss": 0.04473326, + "step": 17413 + }, + { + "epoch": 34.828, + "grad_norm": 0.8996273279190063, + "learning_rate": 2e-05, + "loss": 0.02824127, + "step": 17414 + }, + { + "epoch": 34.83, + "grad_norm": 1.1458615064620972, + "learning_rate": 2e-05, + "loss": 0.04492954, + "step": 17415 + }, + { + "epoch": 34.832, + "grad_norm": 1.754016637802124, + "learning_rate": 2e-05, + "loss": 0.04475306, + "step": 17416 + }, + { + "epoch": 34.834, + "grad_norm": 1.4149738550186157, + "learning_rate": 2e-05, + "loss": 0.05934685, + "step": 17417 + }, + { + "epoch": 34.836, + "grad_norm": 1.1486657857894897, + "learning_rate": 2e-05, + "loss": 0.03290381, + "step": 17418 + }, + { + "epoch": 34.838, + "grad_norm": 1.1294008493423462, + "learning_rate": 2e-05, + "loss": 0.0412181, + "step": 17419 + }, + { + "epoch": 34.84, + "grad_norm": 1.3856641054153442, + "learning_rate": 2e-05, + "loss": 0.04381644, + "step": 17420 + }, + { + "epoch": 34.842, + "grad_norm": 1.5705233812332153, + "learning_rate": 
2e-05, + "loss": 0.0567019, + "step": 17421 + }, + { + "epoch": 34.844, + "grad_norm": 2.1528782844543457, + "learning_rate": 2e-05, + "loss": 0.03446703, + "step": 17422 + }, + { + "epoch": 34.846, + "grad_norm": 3.2811214923858643, + "learning_rate": 2e-05, + "loss": 0.05094865, + "step": 17423 + }, + { + "epoch": 34.848, + "grad_norm": 1.390089988708496, + "learning_rate": 2e-05, + "loss": 0.06304034, + "step": 17424 + }, + { + "epoch": 34.85, + "grad_norm": 1.1673579216003418, + "learning_rate": 2e-05, + "loss": 0.03815352, + "step": 17425 + }, + { + "epoch": 34.852, + "grad_norm": 1.155975103378296, + "learning_rate": 2e-05, + "loss": 0.04023435, + "step": 17426 + }, + { + "epoch": 34.854, + "grad_norm": 2.51444673538208, + "learning_rate": 2e-05, + "loss": 0.03916804, + "step": 17427 + }, + { + "epoch": 34.856, + "grad_norm": 4.942361354827881, + "learning_rate": 2e-05, + "loss": 0.06820999, + "step": 17428 + }, + { + "epoch": 34.858, + "grad_norm": 1.0035266876220703, + "learning_rate": 2e-05, + "loss": 0.03401071, + "step": 17429 + }, + { + "epoch": 34.86, + "grad_norm": 1.0705126523971558, + "learning_rate": 2e-05, + "loss": 0.02997341, + "step": 17430 + }, + { + "epoch": 34.862, + "grad_norm": 1.0698009729385376, + "learning_rate": 2e-05, + "loss": 0.03294614, + "step": 17431 + }, + { + "epoch": 34.864, + "grad_norm": 1.062578558921814, + "learning_rate": 2e-05, + "loss": 0.04528739, + "step": 17432 + }, + { + "epoch": 34.866, + "grad_norm": 1.1648660898208618, + "learning_rate": 2e-05, + "loss": 0.04851899, + "step": 17433 + }, + { + "epoch": 34.868, + "grad_norm": 1.1196273565292358, + "learning_rate": 2e-05, + "loss": 0.04238444, + "step": 17434 + }, + { + "epoch": 34.87, + "grad_norm": 1.043180227279663, + "learning_rate": 2e-05, + "loss": 0.04557565, + "step": 17435 + }, + { + "epoch": 34.872, + "grad_norm": 1.079825520515442, + "learning_rate": 2e-05, + "loss": 0.04296175, + "step": 17436 + }, + { + "epoch": 34.874, + "grad_norm": 1.0005967617034912, + "learning_rate": 2e-05, + "loss": 0.02957804, + "step": 17437 + }, + { + "epoch": 34.876, + "grad_norm": 1.1389814615249634, + "learning_rate": 2e-05, + "loss": 0.0373849, + "step": 17438 + }, + { + "epoch": 34.878, + "grad_norm": 1.3328295946121216, + "learning_rate": 2e-05, + "loss": 0.03476042, + "step": 17439 + }, + { + "epoch": 34.88, + "grad_norm": 1.4265692234039307, + "learning_rate": 2e-05, + "loss": 0.05438258, + "step": 17440 + }, + { + "epoch": 34.882, + "grad_norm": 1.3445887565612793, + "learning_rate": 2e-05, + "loss": 0.04738908, + "step": 17441 + }, + { + "epoch": 34.884, + "grad_norm": 1.100516676902771, + "learning_rate": 2e-05, + "loss": 0.03541394, + "step": 17442 + }, + { + "epoch": 34.886, + "grad_norm": 1.2599114179611206, + "learning_rate": 2e-05, + "loss": 0.05563427, + "step": 17443 + }, + { + "epoch": 34.888, + "grad_norm": 1.3111646175384521, + "learning_rate": 2e-05, + "loss": 0.04711374, + "step": 17444 + }, + { + "epoch": 34.89, + "grad_norm": 1.0831414461135864, + "learning_rate": 2e-05, + "loss": 0.03449087, + "step": 17445 + }, + { + "epoch": 34.892, + "grad_norm": 1.390146255493164, + "learning_rate": 2e-05, + "loss": 0.04913954, + "step": 17446 + }, + { + "epoch": 34.894, + "grad_norm": 1.4582270383834839, + "learning_rate": 2e-05, + "loss": 0.04977434, + "step": 17447 + }, + { + "epoch": 34.896, + "grad_norm": 1.040456771850586, + "learning_rate": 2e-05, + "loss": 0.04064398, + "step": 17448 + }, + { + "epoch": 34.898, + "grad_norm": 1.125290036201477, + "learning_rate": 2e-05, + "loss": 
0.0368836, + "step": 17449 + }, + { + "epoch": 34.9, + "grad_norm": 1.6109248399734497, + "learning_rate": 2e-05, + "loss": 0.05745726, + "step": 17450 + }, + { + "epoch": 34.902, + "grad_norm": 1.6481939554214478, + "learning_rate": 2e-05, + "loss": 0.05567736, + "step": 17451 + }, + { + "epoch": 34.904, + "grad_norm": 1.1878528594970703, + "learning_rate": 2e-05, + "loss": 0.03877228, + "step": 17452 + }, + { + "epoch": 34.906, + "grad_norm": 1.0018279552459717, + "learning_rate": 2e-05, + "loss": 0.03138274, + "step": 17453 + }, + { + "epoch": 34.908, + "grad_norm": 1.4977102279663086, + "learning_rate": 2e-05, + "loss": 0.03562863, + "step": 17454 + }, + { + "epoch": 34.91, + "grad_norm": 1.0117096900939941, + "learning_rate": 2e-05, + "loss": 0.03509045, + "step": 17455 + }, + { + "epoch": 34.912, + "grad_norm": 1.3096954822540283, + "learning_rate": 2e-05, + "loss": 0.04645215, + "step": 17456 + }, + { + "epoch": 34.914, + "grad_norm": 1.7684214115142822, + "learning_rate": 2e-05, + "loss": 0.04929952, + "step": 17457 + }, + { + "epoch": 34.916, + "grad_norm": 1.0269994735717773, + "learning_rate": 2e-05, + "loss": 0.03455571, + "step": 17458 + }, + { + "epoch": 34.918, + "grad_norm": 0.9055759310722351, + "learning_rate": 2e-05, + "loss": 0.03060112, + "step": 17459 + }, + { + "epoch": 34.92, + "grad_norm": 1.3847931623458862, + "learning_rate": 2e-05, + "loss": 0.05905629, + "step": 17460 + }, + { + "epoch": 34.922, + "grad_norm": 0.8554072380065918, + "learning_rate": 2e-05, + "loss": 0.02603616, + "step": 17461 + }, + { + "epoch": 34.924, + "grad_norm": 1.3800654411315918, + "learning_rate": 2e-05, + "loss": 0.04450605, + "step": 17462 + }, + { + "epoch": 34.926, + "grad_norm": 1.0499212741851807, + "learning_rate": 2e-05, + "loss": 0.03139212, + "step": 17463 + }, + { + "epoch": 34.928, + "grad_norm": 1.5442363023757935, + "learning_rate": 2e-05, + "loss": 0.05026272, + "step": 17464 + }, + { + "epoch": 34.93, + "grad_norm": 1.0467393398284912, + "learning_rate": 2e-05, + "loss": 0.03508221, + "step": 17465 + }, + { + "epoch": 34.932, + "grad_norm": 1.9836903810501099, + "learning_rate": 2e-05, + "loss": 0.03904892, + "step": 17466 + }, + { + "epoch": 34.934, + "grad_norm": 1.667801022529602, + "learning_rate": 2e-05, + "loss": 0.06595255, + "step": 17467 + }, + { + "epoch": 34.936, + "grad_norm": 1.248084545135498, + "learning_rate": 2e-05, + "loss": 0.04658974, + "step": 17468 + }, + { + "epoch": 34.938, + "grad_norm": 1.1718586683273315, + "learning_rate": 2e-05, + "loss": 0.04607401, + "step": 17469 + }, + { + "epoch": 34.94, + "grad_norm": 1.562265396118164, + "learning_rate": 2e-05, + "loss": 0.05189687, + "step": 17470 + }, + { + "epoch": 34.942, + "grad_norm": 2.5033798217773438, + "learning_rate": 2e-05, + "loss": 0.04024173, + "step": 17471 + }, + { + "epoch": 34.944, + "grad_norm": 1.3833789825439453, + "learning_rate": 2e-05, + "loss": 0.04375815, + "step": 17472 + }, + { + "epoch": 34.946, + "grad_norm": 1.2791060209274292, + "learning_rate": 2e-05, + "loss": 0.03404561, + "step": 17473 + }, + { + "epoch": 34.948, + "grad_norm": 4.338647365570068, + "learning_rate": 2e-05, + "loss": 0.05873897, + "step": 17474 + }, + { + "epoch": 34.95, + "grad_norm": 1.4956046342849731, + "learning_rate": 2e-05, + "loss": 0.04620642, + "step": 17475 + }, + { + "epoch": 34.952, + "grad_norm": 1.1326979398727417, + "learning_rate": 2e-05, + "loss": 0.04122002, + "step": 17476 + }, + { + "epoch": 34.954, + "grad_norm": 1.0940099954605103, + "learning_rate": 2e-05, + "loss": 
0.04148469, + "step": 17477 + }, + { + "epoch": 34.956, + "grad_norm": 1.1113457679748535, + "learning_rate": 2e-05, + "loss": 0.04638319, + "step": 17478 + }, + { + "epoch": 34.958, + "grad_norm": 1.088361144065857, + "learning_rate": 2e-05, + "loss": 0.04051323, + "step": 17479 + }, + { + "epoch": 34.96, + "grad_norm": 1.1193358898162842, + "learning_rate": 2e-05, + "loss": 0.04485134, + "step": 17480 + }, + { + "epoch": 34.962, + "grad_norm": 1.2051522731781006, + "learning_rate": 2e-05, + "loss": 0.03799698, + "step": 17481 + }, + { + "epoch": 34.964, + "grad_norm": 1.1342461109161377, + "learning_rate": 2e-05, + "loss": 0.03318293, + "step": 17482 + }, + { + "epoch": 34.966, + "grad_norm": 1.2009114027023315, + "learning_rate": 2e-05, + "loss": 0.04339875, + "step": 17483 + }, + { + "epoch": 34.968, + "grad_norm": 1.1767265796661377, + "learning_rate": 2e-05, + "loss": 0.04513696, + "step": 17484 + }, + { + "epoch": 34.97, + "grad_norm": 1.503244400024414, + "learning_rate": 2e-05, + "loss": 0.07590361, + "step": 17485 + }, + { + "epoch": 34.972, + "grad_norm": 0.9458696246147156, + "learning_rate": 2e-05, + "loss": 0.03492884, + "step": 17486 + }, + { + "epoch": 34.974, + "grad_norm": 1.2209609746932983, + "learning_rate": 2e-05, + "loss": 0.02998404, + "step": 17487 + }, + { + "epoch": 34.976, + "grad_norm": 1.2666972875595093, + "learning_rate": 2e-05, + "loss": 0.05165854, + "step": 17488 + }, + { + "epoch": 34.978, + "grad_norm": 1.2078595161437988, + "learning_rate": 2e-05, + "loss": 0.04767955, + "step": 17489 + }, + { + "epoch": 34.98, + "grad_norm": 1.205334186553955, + "learning_rate": 2e-05, + "loss": 0.05208175, + "step": 17490 + }, + { + "epoch": 34.982, + "grad_norm": 1.0179728269577026, + "learning_rate": 2e-05, + "loss": 0.03114939, + "step": 17491 + }, + { + "epoch": 34.984, + "grad_norm": 1.8329960107803345, + "learning_rate": 2e-05, + "loss": 0.06110509, + "step": 17492 + }, + { + "epoch": 34.986, + "grad_norm": 1.045487880706787, + "learning_rate": 2e-05, + "loss": 0.02562892, + "step": 17493 + }, + { + "epoch": 34.988, + "grad_norm": 1.160578966140747, + "learning_rate": 2e-05, + "loss": 0.04529508, + "step": 17494 + }, + { + "epoch": 34.99, + "grad_norm": 1.1257965564727783, + "learning_rate": 2e-05, + "loss": 0.04581528, + "step": 17495 + }, + { + "epoch": 34.992, + "grad_norm": 1.1430251598358154, + "learning_rate": 2e-05, + "loss": 0.05036756, + "step": 17496 + }, + { + "epoch": 34.994, + "grad_norm": 1.6295039653778076, + "learning_rate": 2e-05, + "loss": 0.04688197, + "step": 17497 + }, + { + "epoch": 34.996, + "grad_norm": 1.4103097915649414, + "learning_rate": 2e-05, + "loss": 0.05346009, + "step": 17498 + }, + { + "epoch": 34.998, + "grad_norm": 1.2611072063446045, + "learning_rate": 2e-05, + "loss": 0.0430942, + "step": 17499 + }, + { + "epoch": 35.0, + "grad_norm": 2.2868216037750244, + "learning_rate": 2e-05, + "loss": 0.04770452, + "step": 17500 + }, + { + "epoch": 35.0, + "eval_performance": { + "AngleClassification_1": 0.998, + "AngleClassification_2": 1.0, + "AngleClassification_3": 0.9820359281437125, + "Equal_1": 1.0, + "Equal_2": 0.9780439121756487, + "Equal_3": 0.9880239520958084, + "LineComparison_1": 1.0, + "LineComparison_2": 1.0, + "LineComparison_3": 0.998003992015968, + "Parallel_1": 0.9939879759519038, + "Parallel_2": 0.9959919839679359, + "Parallel_3": 0.994, + "Perpendicular_1": 0.998, + "Perpendicular_2": 0.996, + "Perpendicular_3": 0.9038076152304609, + "PointLiesOnCircle_1": 1.0, + "PointLiesOnCircle_2": 0.996, + 
"PointLiesOnCircle_3": 0.988, + "PointLiesOnLine_1": 0.9959919839679359, + "PointLiesOnLine_2": 0.9919839679358717, + "PointLiesOnLine_3": 0.9900199600798403 + }, + "eval_runtime": 319.8873, + "eval_samples_per_second": 32.824, + "eval_steps_per_second": 0.656, + "step": 17500 + }, + { + "epoch": 35.002, + "grad_norm": 1.9222464561462402, + "learning_rate": 2e-05, + "loss": 0.03432529, + "step": 17501 + }, + { + "epoch": 35.004, + "grad_norm": 1.4019160270690918, + "learning_rate": 2e-05, + "loss": 0.03604925, + "step": 17502 + }, + { + "epoch": 35.006, + "grad_norm": 0.9827688336372375, + "learning_rate": 2e-05, + "loss": 0.02998075, + "step": 17503 + }, + { + "epoch": 35.008, + "grad_norm": 1.1721317768096924, + "learning_rate": 2e-05, + "loss": 0.04828912, + "step": 17504 + }, + { + "epoch": 35.01, + "grad_norm": 1.120781421661377, + "learning_rate": 2e-05, + "loss": 0.04639394, + "step": 17505 + }, + { + "epoch": 35.012, + "grad_norm": 1.3322588205337524, + "learning_rate": 2e-05, + "loss": 0.03849679, + "step": 17506 + }, + { + "epoch": 35.014, + "grad_norm": 1.1528602838516235, + "learning_rate": 2e-05, + "loss": 0.04227344, + "step": 17507 + }, + { + "epoch": 35.016, + "grad_norm": 1.0254783630371094, + "learning_rate": 2e-05, + "loss": 0.02713573, + "step": 17508 + }, + { + "epoch": 35.018, + "grad_norm": 2.194639205932617, + "learning_rate": 2e-05, + "loss": 0.06066288, + "step": 17509 + }, + { + "epoch": 35.02, + "grad_norm": 1.1950559616088867, + "learning_rate": 2e-05, + "loss": 0.05654139, + "step": 17510 + }, + { + "epoch": 35.022, + "grad_norm": 1.4963222742080688, + "learning_rate": 2e-05, + "loss": 0.03999151, + "step": 17511 + }, + { + "epoch": 35.024, + "grad_norm": 1.4710837602615356, + "learning_rate": 2e-05, + "loss": 0.05531586, + "step": 17512 + }, + { + "epoch": 35.026, + "grad_norm": 1.7270408868789673, + "learning_rate": 2e-05, + "loss": 0.04574881, + "step": 17513 + }, + { + "epoch": 35.028, + "grad_norm": 1.1254507303237915, + "learning_rate": 2e-05, + "loss": 0.04714286, + "step": 17514 + }, + { + "epoch": 35.03, + "grad_norm": 1.2321765422821045, + "learning_rate": 2e-05, + "loss": 0.0381449, + "step": 17515 + }, + { + "epoch": 35.032, + "grad_norm": 1.3832772970199585, + "learning_rate": 2e-05, + "loss": 0.04403721, + "step": 17516 + }, + { + "epoch": 35.034, + "grad_norm": 2.5237765312194824, + "learning_rate": 2e-05, + "loss": 0.0705086, + "step": 17517 + }, + { + "epoch": 35.036, + "grad_norm": 1.3510403633117676, + "learning_rate": 2e-05, + "loss": 0.04805104, + "step": 17518 + }, + { + "epoch": 35.038, + "grad_norm": 1.1089110374450684, + "learning_rate": 2e-05, + "loss": 0.04748667, + "step": 17519 + }, + { + "epoch": 35.04, + "grad_norm": 1.0043740272521973, + "learning_rate": 2e-05, + "loss": 0.03443906, + "step": 17520 + }, + { + "epoch": 35.042, + "grad_norm": 1.2505918741226196, + "learning_rate": 2e-05, + "loss": 0.0434991, + "step": 17521 + }, + { + "epoch": 35.044, + "grad_norm": 1.0381025075912476, + "learning_rate": 2e-05, + "loss": 0.04269955, + "step": 17522 + }, + { + "epoch": 35.046, + "grad_norm": 1.2612941265106201, + "learning_rate": 2e-05, + "loss": 0.04119605, + "step": 17523 + }, + { + "epoch": 35.048, + "grad_norm": 2.3623406887054443, + "learning_rate": 2e-05, + "loss": 0.03740188, + "step": 17524 + }, + { + "epoch": 35.05, + "grad_norm": 1.2907884120941162, + "learning_rate": 2e-05, + "loss": 0.04235482, + "step": 17525 + }, + { + "epoch": 35.052, + "grad_norm": 0.83847975730896, + "learning_rate": 2e-05, + "loss": 0.02778682, + 
"step": 17526 + }, + { + "epoch": 35.054, + "grad_norm": 1.2352306842803955, + "learning_rate": 2e-05, + "loss": 0.04302741, + "step": 17527 + }, + { + "epoch": 35.056, + "grad_norm": 1.0656414031982422, + "learning_rate": 2e-05, + "loss": 0.04405284, + "step": 17528 + }, + { + "epoch": 35.058, + "grad_norm": 2.7260079383850098, + "learning_rate": 2e-05, + "loss": 0.04868072, + "step": 17529 + }, + { + "epoch": 35.06, + "grad_norm": 0.9766820073127747, + "learning_rate": 2e-05, + "loss": 0.03807481, + "step": 17530 + }, + { + "epoch": 35.062, + "grad_norm": 0.8948190808296204, + "learning_rate": 2e-05, + "loss": 0.02815506, + "step": 17531 + }, + { + "epoch": 35.064, + "grad_norm": 2.2773566246032715, + "learning_rate": 2e-05, + "loss": 0.05238456, + "step": 17532 + }, + { + "epoch": 35.066, + "grad_norm": 1.1301003694534302, + "learning_rate": 2e-05, + "loss": 0.04259311, + "step": 17533 + }, + { + "epoch": 35.068, + "grad_norm": 1.2741830348968506, + "learning_rate": 2e-05, + "loss": 0.06291425, + "step": 17534 + }, + { + "epoch": 35.07, + "grad_norm": 1.1298288106918335, + "learning_rate": 2e-05, + "loss": 0.04579066, + "step": 17535 + }, + { + "epoch": 35.072, + "grad_norm": 1.111412525177002, + "learning_rate": 2e-05, + "loss": 0.02930526, + "step": 17536 + }, + { + "epoch": 35.074, + "grad_norm": 1.3507663011550903, + "learning_rate": 2e-05, + "loss": 0.06096983, + "step": 17537 + }, + { + "epoch": 35.076, + "grad_norm": 1.3331928253173828, + "learning_rate": 2e-05, + "loss": 0.06763585, + "step": 17538 + }, + { + "epoch": 35.078, + "grad_norm": 1.7155866622924805, + "learning_rate": 2e-05, + "loss": 0.04312644, + "step": 17539 + }, + { + "epoch": 35.08, + "grad_norm": 1.359623908996582, + "learning_rate": 2e-05, + "loss": 0.04029921, + "step": 17540 + }, + { + "epoch": 35.082, + "grad_norm": 1.0476104021072388, + "learning_rate": 2e-05, + "loss": 0.04592532, + "step": 17541 + }, + { + "epoch": 35.084, + "grad_norm": 2.1303536891937256, + "learning_rate": 2e-05, + "loss": 0.03890026, + "step": 17542 + }, + { + "epoch": 35.086, + "grad_norm": 1.1645375490188599, + "learning_rate": 2e-05, + "loss": 0.04217415, + "step": 17543 + }, + { + "epoch": 35.088, + "grad_norm": 1.1522724628448486, + "learning_rate": 2e-05, + "loss": 0.05060149, + "step": 17544 + }, + { + "epoch": 35.09, + "grad_norm": 0.8973413705825806, + "learning_rate": 2e-05, + "loss": 0.02097614, + "step": 17545 + }, + { + "epoch": 35.092, + "grad_norm": 1.3803879022598267, + "learning_rate": 2e-05, + "loss": 0.04214935, + "step": 17546 + }, + { + "epoch": 35.094, + "grad_norm": 1.174539566040039, + "learning_rate": 2e-05, + "loss": 0.04683707, + "step": 17547 + }, + { + "epoch": 35.096, + "grad_norm": 1.838232398033142, + "learning_rate": 2e-05, + "loss": 0.04326537, + "step": 17548 + }, + { + "epoch": 35.098, + "grad_norm": 1.5689642429351807, + "learning_rate": 2e-05, + "loss": 0.04072879, + "step": 17549 + }, + { + "epoch": 35.1, + "grad_norm": 1.0995928049087524, + "learning_rate": 2e-05, + "loss": 0.03880709, + "step": 17550 + }, + { + "epoch": 35.102, + "grad_norm": 0.8378002047538757, + "learning_rate": 2e-05, + "loss": 0.02281972, + "step": 17551 + }, + { + "epoch": 35.104, + "grad_norm": 1.1497136354446411, + "learning_rate": 2e-05, + "loss": 0.04842537, + "step": 17552 + }, + { + "epoch": 35.106, + "grad_norm": 1.2610912322998047, + "learning_rate": 2e-05, + "loss": 0.04262506, + "step": 17553 + }, + { + "epoch": 35.108, + "grad_norm": 0.969350278377533, + "learning_rate": 2e-05, + "loss": 0.03585976, + "step": 
17554 + }, + { + "epoch": 35.11, + "grad_norm": 1.0584502220153809, + "learning_rate": 2e-05, + "loss": 0.03664526, + "step": 17555 + }, + { + "epoch": 35.112, + "grad_norm": 1.3232938051223755, + "learning_rate": 2e-05, + "loss": 0.05472728, + "step": 17556 + }, + { + "epoch": 35.114, + "grad_norm": 1.1075985431671143, + "learning_rate": 2e-05, + "loss": 0.05077582, + "step": 17557 + }, + { + "epoch": 35.116, + "grad_norm": 1.2517019510269165, + "learning_rate": 2e-05, + "loss": 0.05858276, + "step": 17558 + }, + { + "epoch": 35.118, + "grad_norm": 1.2317686080932617, + "learning_rate": 2e-05, + "loss": 0.04953536, + "step": 17559 + }, + { + "epoch": 35.12, + "grad_norm": 2.1341912746429443, + "learning_rate": 2e-05, + "loss": 0.03007166, + "step": 17560 + }, + { + "epoch": 35.122, + "grad_norm": 2.465643882751465, + "learning_rate": 2e-05, + "loss": 0.03830947, + "step": 17561 + }, + { + "epoch": 35.124, + "grad_norm": 1.5876415967941284, + "learning_rate": 2e-05, + "loss": 0.03763131, + "step": 17562 + }, + { + "epoch": 35.126, + "grad_norm": 1.1403216123580933, + "learning_rate": 2e-05, + "loss": 0.04078696, + "step": 17563 + }, + { + "epoch": 35.128, + "grad_norm": 1.7711175680160522, + "learning_rate": 2e-05, + "loss": 0.05040234, + "step": 17564 + }, + { + "epoch": 35.13, + "grad_norm": 1.334191918373108, + "learning_rate": 2e-05, + "loss": 0.04977368, + "step": 17565 + }, + { + "epoch": 35.132, + "grad_norm": 1.9622347354888916, + "learning_rate": 2e-05, + "loss": 0.04543035, + "step": 17566 + }, + { + "epoch": 35.134, + "grad_norm": 1.4173260927200317, + "learning_rate": 2e-05, + "loss": 0.05295032, + "step": 17567 + }, + { + "epoch": 35.136, + "grad_norm": 1.4834239482879639, + "learning_rate": 2e-05, + "loss": 0.04917359, + "step": 17568 + }, + { + "epoch": 35.138, + "grad_norm": 1.31399405002594, + "learning_rate": 2e-05, + "loss": 0.03308884, + "step": 17569 + }, + { + "epoch": 35.14, + "grad_norm": 1.307672142982483, + "learning_rate": 2e-05, + "loss": 0.04067755, + "step": 17570 + }, + { + "epoch": 35.142, + "grad_norm": 1.3411407470703125, + "learning_rate": 2e-05, + "loss": 0.041333, + "step": 17571 + }, + { + "epoch": 35.144, + "grad_norm": 5.57957649230957, + "learning_rate": 2e-05, + "loss": 0.04797975, + "step": 17572 + }, + { + "epoch": 35.146, + "grad_norm": 1.3772554397583008, + "learning_rate": 2e-05, + "loss": 0.05607633, + "step": 17573 + }, + { + "epoch": 35.148, + "grad_norm": 2.2462568283081055, + "learning_rate": 2e-05, + "loss": 0.06492513, + "step": 17574 + }, + { + "epoch": 35.15, + "grad_norm": 1.0662626028060913, + "learning_rate": 2e-05, + "loss": 0.05135501, + "step": 17575 + }, + { + "epoch": 35.152, + "grad_norm": 1.4305570125579834, + "learning_rate": 2e-05, + "loss": 0.04058772, + "step": 17576 + }, + { + "epoch": 35.154, + "grad_norm": 1.031005620956421, + "learning_rate": 2e-05, + "loss": 0.0381646, + "step": 17577 + }, + { + "epoch": 35.156, + "grad_norm": 1.168686866760254, + "learning_rate": 2e-05, + "loss": 0.04385469, + "step": 17578 + }, + { + "epoch": 35.158, + "grad_norm": 0.9006215333938599, + "learning_rate": 2e-05, + "loss": 0.03294571, + "step": 17579 + }, + { + "epoch": 35.16, + "grad_norm": 1.1258612871170044, + "learning_rate": 2e-05, + "loss": 0.05022587, + "step": 17580 + }, + { + "epoch": 35.162, + "grad_norm": 1.2660801410675049, + "learning_rate": 2e-05, + "loss": 0.06481211, + "step": 17581 + }, + { + "epoch": 35.164, + "grad_norm": 1.8085393905639648, + "learning_rate": 2e-05, + "loss": 0.03045495, + "step": 17582 + }, + { + 
"epoch": 35.166, + "grad_norm": 1.5225801467895508, + "learning_rate": 2e-05, + "loss": 0.04185986, + "step": 17583 + }, + { + "epoch": 35.168, + "grad_norm": 1.3285081386566162, + "learning_rate": 2e-05, + "loss": 0.03883301, + "step": 17584 + }, + { + "epoch": 35.17, + "grad_norm": 0.9945167899131775, + "learning_rate": 2e-05, + "loss": 0.03533852, + "step": 17585 + }, + { + "epoch": 35.172, + "grad_norm": 1.0777450799942017, + "learning_rate": 2e-05, + "loss": 0.03911363, + "step": 17586 + }, + { + "epoch": 35.174, + "grad_norm": 0.8671603202819824, + "learning_rate": 2e-05, + "loss": 0.02011818, + "step": 17587 + }, + { + "epoch": 35.176, + "grad_norm": 1.2364228963851929, + "learning_rate": 2e-05, + "loss": 0.04776933, + "step": 17588 + }, + { + "epoch": 35.178, + "grad_norm": 1.3007594347000122, + "learning_rate": 2e-05, + "loss": 0.04221889, + "step": 17589 + }, + { + "epoch": 35.18, + "grad_norm": 1.1037037372589111, + "learning_rate": 2e-05, + "loss": 0.03446237, + "step": 17590 + }, + { + "epoch": 35.182, + "grad_norm": 1.2064841985702515, + "learning_rate": 2e-05, + "loss": 0.04089108, + "step": 17591 + }, + { + "epoch": 35.184, + "grad_norm": 0.9847924113273621, + "learning_rate": 2e-05, + "loss": 0.0352126, + "step": 17592 + }, + { + "epoch": 35.186, + "grad_norm": 1.4962389469146729, + "learning_rate": 2e-05, + "loss": 0.06021068, + "step": 17593 + }, + { + "epoch": 35.188, + "grad_norm": 1.4527535438537598, + "learning_rate": 2e-05, + "loss": 0.0581253, + "step": 17594 + }, + { + "epoch": 35.19, + "grad_norm": 1.0838546752929688, + "learning_rate": 2e-05, + "loss": 0.03708592, + "step": 17595 + }, + { + "epoch": 35.192, + "grad_norm": 1.216435432434082, + "learning_rate": 2e-05, + "loss": 0.0478345, + "step": 17596 + }, + { + "epoch": 35.194, + "grad_norm": 3.266355514526367, + "learning_rate": 2e-05, + "loss": 0.04139251, + "step": 17597 + }, + { + "epoch": 35.196, + "grad_norm": 2.6940810680389404, + "learning_rate": 2e-05, + "loss": 0.04337883, + "step": 17598 + }, + { + "epoch": 35.198, + "grad_norm": 1.02515709400177, + "learning_rate": 2e-05, + "loss": 0.0304161, + "step": 17599 + }, + { + "epoch": 35.2, + "grad_norm": 1.0762580633163452, + "learning_rate": 2e-05, + "loss": 0.04152739, + "step": 17600 + }, + { + "epoch": 35.202, + "grad_norm": 2.462306499481201, + "learning_rate": 2e-05, + "loss": 0.03642452, + "step": 17601 + }, + { + "epoch": 35.204, + "grad_norm": 1.2067070007324219, + "learning_rate": 2e-05, + "loss": 0.04705513, + "step": 17602 + }, + { + "epoch": 35.206, + "grad_norm": 1.4548958539962769, + "learning_rate": 2e-05, + "loss": 0.06413062, + "step": 17603 + }, + { + "epoch": 35.208, + "grad_norm": 1.3760452270507812, + "learning_rate": 2e-05, + "loss": 0.05110206, + "step": 17604 + }, + { + "epoch": 35.21, + "grad_norm": 1.0767890214920044, + "learning_rate": 2e-05, + "loss": 0.03728911, + "step": 17605 + }, + { + "epoch": 35.212, + "grad_norm": 1.1798242330551147, + "learning_rate": 2e-05, + "loss": 0.04494874, + "step": 17606 + }, + { + "epoch": 35.214, + "grad_norm": 1.1159026622772217, + "learning_rate": 2e-05, + "loss": 0.04022665, + "step": 17607 + }, + { + "epoch": 35.216, + "grad_norm": 1.239780306816101, + "learning_rate": 2e-05, + "loss": 0.04985084, + "step": 17608 + }, + { + "epoch": 35.218, + "grad_norm": 1.381771445274353, + "learning_rate": 2e-05, + "loss": 0.02961168, + "step": 17609 + }, + { + "epoch": 35.22, + "grad_norm": 1.4819316864013672, + "learning_rate": 2e-05, + "loss": 0.04220682, + "step": 17610 + }, + { + "epoch": 35.222, 
+ "grad_norm": 1.0734423398971558, + "learning_rate": 2e-05, + "loss": 0.03761606, + "step": 17611 + }, + { + "epoch": 35.224, + "grad_norm": 1.9286874532699585, + "learning_rate": 2e-05, + "loss": 0.04495767, + "step": 17612 + }, + { + "epoch": 35.226, + "grad_norm": 1.3085813522338867, + "learning_rate": 2e-05, + "loss": 0.04169934, + "step": 17613 + }, + { + "epoch": 35.228, + "grad_norm": 1.250126838684082, + "learning_rate": 2e-05, + "loss": 0.04465639, + "step": 17614 + }, + { + "epoch": 35.23, + "grad_norm": 1.335493803024292, + "learning_rate": 2e-05, + "loss": 0.04012146, + "step": 17615 + }, + { + "epoch": 35.232, + "grad_norm": 1.5321751832962036, + "learning_rate": 2e-05, + "loss": 0.06640027, + "step": 17616 + }, + { + "epoch": 35.234, + "grad_norm": 2.0908942222595215, + "learning_rate": 2e-05, + "loss": 0.05824264, + "step": 17617 + }, + { + "epoch": 35.236, + "grad_norm": 1.1407338380813599, + "learning_rate": 2e-05, + "loss": 0.04133571, + "step": 17618 + }, + { + "epoch": 35.238, + "grad_norm": 1.3628578186035156, + "learning_rate": 2e-05, + "loss": 0.03877375, + "step": 17619 + }, + { + "epoch": 35.24, + "grad_norm": 3.9366886615753174, + "learning_rate": 2e-05, + "loss": 0.0478052, + "step": 17620 + }, + { + "epoch": 35.242, + "grad_norm": 2.441206216812134, + "learning_rate": 2e-05, + "loss": 0.03485952, + "step": 17621 + }, + { + "epoch": 35.244, + "grad_norm": 2.0725955963134766, + "learning_rate": 2e-05, + "loss": 0.03543587, + "step": 17622 + }, + { + "epoch": 35.246, + "grad_norm": 1.0343211889266968, + "learning_rate": 2e-05, + "loss": 0.03453851, + "step": 17623 + }, + { + "epoch": 35.248, + "grad_norm": 0.9420909881591797, + "learning_rate": 2e-05, + "loss": 0.02860404, + "step": 17624 + }, + { + "epoch": 35.25, + "grad_norm": 1.3072288036346436, + "learning_rate": 2e-05, + "loss": 0.04773654, + "step": 17625 + }, + { + "epoch": 35.252, + "grad_norm": 1.3463304042816162, + "learning_rate": 2e-05, + "loss": 0.03698765, + "step": 17626 + }, + { + "epoch": 35.254, + "grad_norm": 2.281853199005127, + "learning_rate": 2e-05, + "loss": 0.05275848, + "step": 17627 + }, + { + "epoch": 35.256, + "grad_norm": 1.1776528358459473, + "learning_rate": 2e-05, + "loss": 0.02095211, + "step": 17628 + }, + { + "epoch": 35.258, + "grad_norm": 1.3505492210388184, + "learning_rate": 2e-05, + "loss": 0.04562983, + "step": 17629 + }, + { + "epoch": 35.26, + "grad_norm": 1.349131464958191, + "learning_rate": 2e-05, + "loss": 0.03608799, + "step": 17630 + }, + { + "epoch": 35.262, + "grad_norm": 1.218159794807434, + "learning_rate": 2e-05, + "loss": 0.0397279, + "step": 17631 + }, + { + "epoch": 35.264, + "grad_norm": 2.076681137084961, + "learning_rate": 2e-05, + "loss": 0.06574233, + "step": 17632 + }, + { + "epoch": 35.266, + "grad_norm": 1.0747252702713013, + "learning_rate": 2e-05, + "loss": 0.04180413, + "step": 17633 + }, + { + "epoch": 35.268, + "grad_norm": 1.5515995025634766, + "learning_rate": 2e-05, + "loss": 0.05342175, + "step": 17634 + }, + { + "epoch": 35.27, + "grad_norm": 1.4581546783447266, + "learning_rate": 2e-05, + "loss": 0.06602959, + "step": 17635 + }, + { + "epoch": 35.272, + "grad_norm": 3.7796173095703125, + "learning_rate": 2e-05, + "loss": 0.0564354, + "step": 17636 + }, + { + "epoch": 35.274, + "grad_norm": 3.0336711406707764, + "learning_rate": 2e-05, + "loss": 0.05401599, + "step": 17637 + }, + { + "epoch": 35.276, + "grad_norm": 1.2968264818191528, + "learning_rate": 2e-05, + "loss": 0.0414682, + "step": 17638 + }, + { + "epoch": 35.278, + "grad_norm": 
2.132004737854004, + "learning_rate": 2e-05, + "loss": 0.0513651, + "step": 17639 + }, + { + "epoch": 35.28, + "grad_norm": 2.880918025970459, + "learning_rate": 2e-05, + "loss": 0.04456554, + "step": 17640 + }, + { + "epoch": 35.282, + "grad_norm": 1.0972630977630615, + "learning_rate": 2e-05, + "loss": 0.04144361, + "step": 17641 + }, + { + "epoch": 35.284, + "grad_norm": 1.0943173170089722, + "learning_rate": 2e-05, + "loss": 0.04478058, + "step": 17642 + }, + { + "epoch": 35.286, + "grad_norm": 1.1014378070831299, + "learning_rate": 2e-05, + "loss": 0.04401636, + "step": 17643 + }, + { + "epoch": 35.288, + "grad_norm": 1.2285698652267456, + "learning_rate": 2e-05, + "loss": 0.04485837, + "step": 17644 + }, + { + "epoch": 35.29, + "grad_norm": 1.2702783346176147, + "learning_rate": 2e-05, + "loss": 0.0508531, + "step": 17645 + }, + { + "epoch": 35.292, + "grad_norm": 1.179574728012085, + "learning_rate": 2e-05, + "loss": 0.03975598, + "step": 17646 + }, + { + "epoch": 35.294, + "grad_norm": 0.9671401381492615, + "learning_rate": 2e-05, + "loss": 0.03313099, + "step": 17647 + }, + { + "epoch": 35.296, + "grad_norm": 1.1078392267227173, + "learning_rate": 2e-05, + "loss": 0.03759495, + "step": 17648 + }, + { + "epoch": 35.298, + "grad_norm": 1.5418148040771484, + "learning_rate": 2e-05, + "loss": 0.0561055, + "step": 17649 + }, + { + "epoch": 35.3, + "grad_norm": 1.1559892892837524, + "learning_rate": 2e-05, + "loss": 0.04232976, + "step": 17650 + }, + { + "epoch": 35.302, + "grad_norm": 1.9095799922943115, + "learning_rate": 2e-05, + "loss": 0.05348632, + "step": 17651 + }, + { + "epoch": 35.304, + "grad_norm": 0.9116816520690918, + "learning_rate": 2e-05, + "loss": 0.03719225, + "step": 17652 + }, + { + "epoch": 35.306, + "grad_norm": 1.5335626602172852, + "learning_rate": 2e-05, + "loss": 0.0475245, + "step": 17653 + }, + { + "epoch": 35.308, + "grad_norm": 1.8686447143554688, + "learning_rate": 2e-05, + "loss": 0.0369437, + "step": 17654 + }, + { + "epoch": 35.31, + "grad_norm": 1.4260104894638062, + "learning_rate": 2e-05, + "loss": 0.0563136, + "step": 17655 + }, + { + "epoch": 35.312, + "grad_norm": 1.3176708221435547, + "learning_rate": 2e-05, + "loss": 0.04525885, + "step": 17656 + }, + { + "epoch": 35.314, + "grad_norm": 1.062448263168335, + "learning_rate": 2e-05, + "loss": 0.03579092, + "step": 17657 + }, + { + "epoch": 35.316, + "grad_norm": 0.8730036020278931, + "learning_rate": 2e-05, + "loss": 0.02629447, + "step": 17658 + }, + { + "epoch": 35.318, + "grad_norm": 1.2714184522628784, + "learning_rate": 2e-05, + "loss": 0.03522924, + "step": 17659 + }, + { + "epoch": 35.32, + "grad_norm": 1.3497344255447388, + "learning_rate": 2e-05, + "loss": 0.0463984, + "step": 17660 + }, + { + "epoch": 35.322, + "grad_norm": 2.49761962890625, + "learning_rate": 2e-05, + "loss": 0.0529076, + "step": 17661 + }, + { + "epoch": 35.324, + "grad_norm": 1.4788845777511597, + "learning_rate": 2e-05, + "loss": 0.04073741, + "step": 17662 + }, + { + "epoch": 35.326, + "grad_norm": 1.1707643270492554, + "learning_rate": 2e-05, + "loss": 0.03425131, + "step": 17663 + }, + { + "epoch": 35.328, + "grad_norm": 0.9647262692451477, + "learning_rate": 2e-05, + "loss": 0.02807866, + "step": 17664 + }, + { + "epoch": 35.33, + "grad_norm": 1.2062174081802368, + "learning_rate": 2e-05, + "loss": 0.06155042, + "step": 17665 + }, + { + "epoch": 35.332, + "grad_norm": 0.9606382250785828, + "learning_rate": 2e-05, + "loss": 0.03627588, + "step": 17666 + }, + { + "epoch": 35.334, + "grad_norm": 1.414086937904358, 
+ "learning_rate": 2e-05, + "loss": 0.04845129, + "step": 17667 + }, + { + "epoch": 35.336, + "grad_norm": 1.6463829278945923, + "learning_rate": 2e-05, + "loss": 0.03860418, + "step": 17668 + }, + { + "epoch": 35.338, + "grad_norm": 1.1206401586532593, + "learning_rate": 2e-05, + "loss": 0.04083535, + "step": 17669 + }, + { + "epoch": 35.34, + "grad_norm": 2.738128662109375, + "learning_rate": 2e-05, + "loss": 0.06861308, + "step": 17670 + }, + { + "epoch": 35.342, + "grad_norm": 2.6043827533721924, + "learning_rate": 2e-05, + "loss": 0.05943043, + "step": 17671 + }, + { + "epoch": 35.344, + "grad_norm": 1.0954211950302124, + "learning_rate": 2e-05, + "loss": 0.04462023, + "step": 17672 + }, + { + "epoch": 35.346, + "grad_norm": 1.1352951526641846, + "learning_rate": 2e-05, + "loss": 0.05026999, + "step": 17673 + }, + { + "epoch": 35.348, + "grad_norm": 1.3933370113372803, + "learning_rate": 2e-05, + "loss": 0.03066497, + "step": 17674 + }, + { + "epoch": 35.35, + "grad_norm": 1.0098553895950317, + "learning_rate": 2e-05, + "loss": 0.04665726, + "step": 17675 + }, + { + "epoch": 35.352, + "grad_norm": 1.050323486328125, + "learning_rate": 2e-05, + "loss": 0.04057711, + "step": 17676 + }, + { + "epoch": 35.354, + "grad_norm": 1.5547417402267456, + "learning_rate": 2e-05, + "loss": 0.04004311, + "step": 17677 + }, + { + "epoch": 35.356, + "grad_norm": 0.8295509219169617, + "learning_rate": 2e-05, + "loss": 0.02311123, + "step": 17678 + }, + { + "epoch": 35.358, + "grad_norm": 1.6722297668457031, + "learning_rate": 2e-05, + "loss": 0.04569897, + "step": 17679 + }, + { + "epoch": 35.36, + "grad_norm": 1.2508385181427002, + "learning_rate": 2e-05, + "loss": 0.04461116, + "step": 17680 + }, + { + "epoch": 35.362, + "grad_norm": 1.20958411693573, + "learning_rate": 2e-05, + "loss": 0.04479233, + "step": 17681 + }, + { + "epoch": 35.364, + "grad_norm": 1.1637946367263794, + "learning_rate": 2e-05, + "loss": 0.04216972, + "step": 17682 + }, + { + "epoch": 35.366, + "grad_norm": 1.2432118654251099, + "learning_rate": 2e-05, + "loss": 0.0468252, + "step": 17683 + }, + { + "epoch": 35.368, + "grad_norm": 1.830465316772461, + "learning_rate": 2e-05, + "loss": 0.04752547, + "step": 17684 + }, + { + "epoch": 35.37, + "grad_norm": 1.0570487976074219, + "learning_rate": 2e-05, + "loss": 0.03741091, + "step": 17685 + }, + { + "epoch": 35.372, + "grad_norm": 2.6473779678344727, + "learning_rate": 2e-05, + "loss": 0.04489774, + "step": 17686 + }, + { + "epoch": 35.374, + "grad_norm": 1.851365327835083, + "learning_rate": 2e-05, + "loss": 0.04287185, + "step": 17687 + }, + { + "epoch": 35.376, + "grad_norm": 0.9998928308486938, + "learning_rate": 2e-05, + "loss": 0.02766431, + "step": 17688 + }, + { + "epoch": 35.378, + "grad_norm": 0.9591315388679504, + "learning_rate": 2e-05, + "loss": 0.04038401, + "step": 17689 + }, + { + "epoch": 35.38, + "grad_norm": 1.2864335775375366, + "learning_rate": 2e-05, + "loss": 0.03989007, + "step": 17690 + }, + { + "epoch": 35.382, + "grad_norm": 0.9475865364074707, + "learning_rate": 2e-05, + "loss": 0.02444749, + "step": 17691 + }, + { + "epoch": 35.384, + "grad_norm": 1.568764090538025, + "learning_rate": 2e-05, + "loss": 0.0419982, + "step": 17692 + }, + { + "epoch": 35.386, + "grad_norm": 1.4672602415084839, + "learning_rate": 2e-05, + "loss": 0.04082417, + "step": 17693 + }, + { + "epoch": 35.388, + "grad_norm": 1.2463346719741821, + "learning_rate": 2e-05, + "loss": 0.04095122, + "step": 17694 + }, + { + "epoch": 35.39, + "grad_norm": 0.9424423575401306, + 
"learning_rate": 2e-05, + "loss": 0.03527466, + "step": 17695 + }, + { + "epoch": 35.392, + "grad_norm": 1.1378228664398193, + "learning_rate": 2e-05, + "loss": 0.04225166, + "step": 17696 + }, + { + "epoch": 35.394, + "grad_norm": 0.9545497298240662, + "learning_rate": 2e-05, + "loss": 0.0355546, + "step": 17697 + }, + { + "epoch": 35.396, + "grad_norm": 1.1322404146194458, + "learning_rate": 2e-05, + "loss": 0.05043054, + "step": 17698 + }, + { + "epoch": 35.398, + "grad_norm": 1.0658605098724365, + "learning_rate": 2e-05, + "loss": 0.04704443, + "step": 17699 + }, + { + "epoch": 35.4, + "grad_norm": 1.517007827758789, + "learning_rate": 2e-05, + "loss": 0.03810771, + "step": 17700 + }, + { + "epoch": 35.402, + "grad_norm": 1.1072192192077637, + "learning_rate": 2e-05, + "loss": 0.04016921, + "step": 17701 + }, + { + "epoch": 35.404, + "grad_norm": 2.109288215637207, + "learning_rate": 2e-05, + "loss": 0.04785345, + "step": 17702 + }, + { + "epoch": 35.406, + "grad_norm": 2.865124464035034, + "learning_rate": 2e-05, + "loss": 0.04972965, + "step": 17703 + }, + { + "epoch": 35.408, + "grad_norm": 1.0339571237564087, + "learning_rate": 2e-05, + "loss": 0.03648417, + "step": 17704 + }, + { + "epoch": 35.41, + "grad_norm": 1.8117121458053589, + "learning_rate": 2e-05, + "loss": 0.04509541, + "step": 17705 + }, + { + "epoch": 35.412, + "grad_norm": 1.2131115198135376, + "learning_rate": 2e-05, + "loss": 0.04989497, + "step": 17706 + }, + { + "epoch": 35.414, + "grad_norm": 1.2343026399612427, + "learning_rate": 2e-05, + "loss": 0.04979831, + "step": 17707 + }, + { + "epoch": 35.416, + "grad_norm": 1.052886724472046, + "learning_rate": 2e-05, + "loss": 0.03326161, + "step": 17708 + }, + { + "epoch": 35.418, + "grad_norm": 1.1077262163162231, + "learning_rate": 2e-05, + "loss": 0.0355419, + "step": 17709 + }, + { + "epoch": 35.42, + "grad_norm": 1.4632622003555298, + "learning_rate": 2e-05, + "loss": 0.047338, + "step": 17710 + }, + { + "epoch": 35.422, + "grad_norm": 1.1542631387710571, + "learning_rate": 2e-05, + "loss": 0.04234635, + "step": 17711 + }, + { + "epoch": 35.424, + "grad_norm": 4.269802570343018, + "learning_rate": 2e-05, + "loss": 0.03470967, + "step": 17712 + }, + { + "epoch": 35.426, + "grad_norm": 1.3135454654693604, + "learning_rate": 2e-05, + "loss": 0.04146645, + "step": 17713 + }, + { + "epoch": 35.428, + "grad_norm": 1.0634958744049072, + "learning_rate": 2e-05, + "loss": 0.02393471, + "step": 17714 + }, + { + "epoch": 35.43, + "grad_norm": 1.000059723854065, + "learning_rate": 2e-05, + "loss": 0.0369064, + "step": 17715 + }, + { + "epoch": 35.432, + "grad_norm": 2.9598135948181152, + "learning_rate": 2e-05, + "loss": 0.04118183, + "step": 17716 + }, + { + "epoch": 35.434, + "grad_norm": 1.133852481842041, + "learning_rate": 2e-05, + "loss": 0.04435404, + "step": 17717 + }, + { + "epoch": 35.436, + "grad_norm": 0.9735626578330994, + "learning_rate": 2e-05, + "loss": 0.03352756, + "step": 17718 + }, + { + "epoch": 35.438, + "grad_norm": 1.356118083000183, + "learning_rate": 2e-05, + "loss": 0.05865297, + "step": 17719 + }, + { + "epoch": 35.44, + "grad_norm": 0.9971575736999512, + "learning_rate": 2e-05, + "loss": 0.02424793, + "step": 17720 + }, + { + "epoch": 35.442, + "grad_norm": 1.2533522844314575, + "learning_rate": 2e-05, + "loss": 0.03869835, + "step": 17721 + }, + { + "epoch": 35.444, + "grad_norm": 0.8965350985527039, + "learning_rate": 2e-05, + "loss": 0.03387743, + "step": 17722 + }, + { + "epoch": 35.446, + "grad_norm": 1.1389036178588867, + "learning_rate": 
2e-05, + "loss": 0.04247169, + "step": 17723 + }, + { + "epoch": 35.448, + "grad_norm": 1.369778037071228, + "learning_rate": 2e-05, + "loss": 0.06438439, + "step": 17724 + }, + { + "epoch": 35.45, + "grad_norm": 1.1697176694869995, + "learning_rate": 2e-05, + "loss": 0.04246277, + "step": 17725 + }, + { + "epoch": 35.452, + "grad_norm": 1.5296801328659058, + "learning_rate": 2e-05, + "loss": 0.03574028, + "step": 17726 + }, + { + "epoch": 35.454, + "grad_norm": 1.0308504104614258, + "learning_rate": 2e-05, + "loss": 0.03716336, + "step": 17727 + }, + { + "epoch": 35.456, + "grad_norm": 1.0190324783325195, + "learning_rate": 2e-05, + "loss": 0.03794012, + "step": 17728 + }, + { + "epoch": 35.458, + "grad_norm": 1.2319554090499878, + "learning_rate": 2e-05, + "loss": 0.04770632, + "step": 17729 + }, + { + "epoch": 35.46, + "grad_norm": 1.7517224550247192, + "learning_rate": 2e-05, + "loss": 0.04705173, + "step": 17730 + }, + { + "epoch": 35.462, + "grad_norm": 1.5621715784072876, + "learning_rate": 2e-05, + "loss": 0.04366981, + "step": 17731 + }, + { + "epoch": 35.464, + "grad_norm": 1.3193645477294922, + "learning_rate": 2e-05, + "loss": 0.05270861, + "step": 17732 + }, + { + "epoch": 35.466, + "grad_norm": 0.9536468982696533, + "learning_rate": 2e-05, + "loss": 0.0323831, + "step": 17733 + }, + { + "epoch": 35.468, + "grad_norm": 1.2698876857757568, + "learning_rate": 2e-05, + "loss": 0.04317931, + "step": 17734 + }, + { + "epoch": 35.47, + "grad_norm": 1.2431633472442627, + "learning_rate": 2e-05, + "loss": 0.05195896, + "step": 17735 + }, + { + "epoch": 35.472, + "grad_norm": 1.0701299905776978, + "learning_rate": 2e-05, + "loss": 0.03547895, + "step": 17736 + }, + { + "epoch": 35.474, + "grad_norm": 1.2134490013122559, + "learning_rate": 2e-05, + "loss": 0.05419344, + "step": 17737 + }, + { + "epoch": 35.476, + "grad_norm": 1.2833257913589478, + "learning_rate": 2e-05, + "loss": 0.03962324, + "step": 17738 + }, + { + "epoch": 35.478, + "grad_norm": 1.1532343626022339, + "learning_rate": 2e-05, + "loss": 0.03333162, + "step": 17739 + }, + { + "epoch": 35.48, + "grad_norm": 1.1357736587524414, + "learning_rate": 2e-05, + "loss": 0.03868002, + "step": 17740 + }, + { + "epoch": 35.482, + "grad_norm": 1.2286957502365112, + "learning_rate": 2e-05, + "loss": 0.04705796, + "step": 17741 + }, + { + "epoch": 35.484, + "grad_norm": 1.0505681037902832, + "learning_rate": 2e-05, + "loss": 0.04804849, + "step": 17742 + }, + { + "epoch": 35.486, + "grad_norm": 1.327806830406189, + "learning_rate": 2e-05, + "loss": 0.04823949, + "step": 17743 + }, + { + "epoch": 35.488, + "grad_norm": 1.412100076675415, + "learning_rate": 2e-05, + "loss": 0.06005654, + "step": 17744 + }, + { + "epoch": 35.49, + "grad_norm": 1.1800956726074219, + "learning_rate": 2e-05, + "loss": 0.04362959, + "step": 17745 + }, + { + "epoch": 35.492, + "grad_norm": 1.503348708152771, + "learning_rate": 2e-05, + "loss": 0.03750347, + "step": 17746 + }, + { + "epoch": 35.494, + "grad_norm": 1.1118701696395874, + "learning_rate": 2e-05, + "loss": 0.03852141, + "step": 17747 + }, + { + "epoch": 35.496, + "grad_norm": 1.1212671995162964, + "learning_rate": 2e-05, + "loss": 0.0386279, + "step": 17748 + }, + { + "epoch": 35.498, + "grad_norm": 1.625443935394287, + "learning_rate": 2e-05, + "loss": 0.0379408, + "step": 17749 + }, + { + "epoch": 35.5, + "grad_norm": 1.2306393384933472, + "learning_rate": 2e-05, + "loss": 0.04350326, + "step": 17750 + }, + { + "epoch": 35.502, + "grad_norm": 2.8825604915618896, + "learning_rate": 2e-05, + 
"loss": 0.05902184, + "step": 17751 + }, + { + "epoch": 35.504, + "grad_norm": 0.9984104037284851, + "learning_rate": 2e-05, + "loss": 0.0298137, + "step": 17752 + }, + { + "epoch": 35.506, + "grad_norm": 1.1372625827789307, + "learning_rate": 2e-05, + "loss": 0.04307988, + "step": 17753 + }, + { + "epoch": 35.508, + "grad_norm": 1.264777660369873, + "learning_rate": 2e-05, + "loss": 0.05577639, + "step": 17754 + }, + { + "epoch": 35.51, + "grad_norm": 1.6673481464385986, + "learning_rate": 2e-05, + "loss": 0.03963385, + "step": 17755 + }, + { + "epoch": 35.512, + "grad_norm": 1.1486952304840088, + "learning_rate": 2e-05, + "loss": 0.0351126, + "step": 17756 + }, + { + "epoch": 35.514, + "grad_norm": 1.608984351158142, + "learning_rate": 2e-05, + "loss": 0.04021899, + "step": 17757 + }, + { + "epoch": 35.516, + "grad_norm": 1.772530436515808, + "learning_rate": 2e-05, + "loss": 0.05870032, + "step": 17758 + }, + { + "epoch": 35.518, + "grad_norm": 2.1825215816497803, + "learning_rate": 2e-05, + "loss": 0.03407493, + "step": 17759 + }, + { + "epoch": 35.52, + "grad_norm": 1.3578035831451416, + "learning_rate": 2e-05, + "loss": 0.04622669, + "step": 17760 + }, + { + "epoch": 35.522, + "grad_norm": 0.9831930994987488, + "learning_rate": 2e-05, + "loss": 0.0285987, + "step": 17761 + }, + { + "epoch": 35.524, + "grad_norm": 1.3356895446777344, + "learning_rate": 2e-05, + "loss": 0.04365962, + "step": 17762 + }, + { + "epoch": 35.526, + "grad_norm": 1.0443134307861328, + "learning_rate": 2e-05, + "loss": 0.03748023, + "step": 17763 + }, + { + "epoch": 35.528, + "grad_norm": 1.1682651042938232, + "learning_rate": 2e-05, + "loss": 0.04070065, + "step": 17764 + }, + { + "epoch": 35.53, + "grad_norm": 1.5059261322021484, + "learning_rate": 2e-05, + "loss": 0.03936179, + "step": 17765 + }, + { + "epoch": 35.532, + "grad_norm": 1.2437570095062256, + "learning_rate": 2e-05, + "loss": 0.05362586, + "step": 17766 + }, + { + "epoch": 35.534, + "grad_norm": 1.5657322406768799, + "learning_rate": 2e-05, + "loss": 0.04938463, + "step": 17767 + }, + { + "epoch": 35.536, + "grad_norm": 0.9113253355026245, + "learning_rate": 2e-05, + "loss": 0.02991488, + "step": 17768 + }, + { + "epoch": 35.538, + "grad_norm": 1.499922752380371, + "learning_rate": 2e-05, + "loss": 0.04125605, + "step": 17769 + }, + { + "epoch": 35.54, + "grad_norm": 1.880131721496582, + "learning_rate": 2e-05, + "loss": 0.04844569, + "step": 17770 + }, + { + "epoch": 35.542, + "grad_norm": 1.1775963306427002, + "learning_rate": 2e-05, + "loss": 0.0403573, + "step": 17771 + }, + { + "epoch": 35.544, + "grad_norm": 1.082275629043579, + "learning_rate": 2e-05, + "loss": 0.05027766, + "step": 17772 + }, + { + "epoch": 35.546, + "grad_norm": 1.3758782148361206, + "learning_rate": 2e-05, + "loss": 0.03608634, + "step": 17773 + }, + { + "epoch": 35.548, + "grad_norm": 1.6104998588562012, + "learning_rate": 2e-05, + "loss": 0.05908882, + "step": 17774 + }, + { + "epoch": 35.55, + "grad_norm": 1.236854076385498, + "learning_rate": 2e-05, + "loss": 0.0350708, + "step": 17775 + }, + { + "epoch": 35.552, + "grad_norm": 1.2313014268875122, + "learning_rate": 2e-05, + "loss": 0.0381471, + "step": 17776 + }, + { + "epoch": 35.554, + "grad_norm": 1.328316569328308, + "learning_rate": 2e-05, + "loss": 0.03309543, + "step": 17777 + }, + { + "epoch": 35.556, + "grad_norm": 0.9129838943481445, + "learning_rate": 2e-05, + "loss": 0.02543531, + "step": 17778 + }, + { + "epoch": 35.558, + "grad_norm": 1.7649102210998535, + "learning_rate": 2e-05, + "loss": 
0.06082066, + "step": 17779 + }, + { + "epoch": 35.56, + "grad_norm": 1.1966280937194824, + "learning_rate": 2e-05, + "loss": 0.03694362, + "step": 17780 + }, + { + "epoch": 35.562, + "grad_norm": 1.4627177715301514, + "learning_rate": 2e-05, + "loss": 0.05721138, + "step": 17781 + }, + { + "epoch": 35.564, + "grad_norm": 1.157109260559082, + "learning_rate": 2e-05, + "loss": 0.03740972, + "step": 17782 + }, + { + "epoch": 35.566, + "grad_norm": 1.4257314205169678, + "learning_rate": 2e-05, + "loss": 0.03248692, + "step": 17783 + }, + { + "epoch": 35.568, + "grad_norm": 0.8779945969581604, + "learning_rate": 2e-05, + "loss": 0.02795019, + "step": 17784 + }, + { + "epoch": 35.57, + "grad_norm": 1.033923625946045, + "learning_rate": 2e-05, + "loss": 0.0340129, + "step": 17785 + }, + { + "epoch": 35.572, + "grad_norm": 1.221801996231079, + "learning_rate": 2e-05, + "loss": 0.04395582, + "step": 17786 + }, + { + "epoch": 35.574, + "grad_norm": 1.7551618814468384, + "learning_rate": 2e-05, + "loss": 0.03690699, + "step": 17787 + }, + { + "epoch": 35.576, + "grad_norm": 1.3354934453964233, + "learning_rate": 2e-05, + "loss": 0.05585198, + "step": 17788 + }, + { + "epoch": 35.578, + "grad_norm": 1.180930495262146, + "learning_rate": 2e-05, + "loss": 0.04893908, + "step": 17789 + }, + { + "epoch": 35.58, + "grad_norm": 1.4613933563232422, + "learning_rate": 2e-05, + "loss": 0.02561338, + "step": 17790 + }, + { + "epoch": 35.582, + "grad_norm": 1.3072844743728638, + "learning_rate": 2e-05, + "loss": 0.05350181, + "step": 17791 + }, + { + "epoch": 35.584, + "grad_norm": 1.5525765419006348, + "learning_rate": 2e-05, + "loss": 0.05785905, + "step": 17792 + }, + { + "epoch": 35.586, + "grad_norm": 1.8124005794525146, + "learning_rate": 2e-05, + "loss": 0.04928161, + "step": 17793 + }, + { + "epoch": 35.588, + "grad_norm": 1.0615509748458862, + "learning_rate": 2e-05, + "loss": 0.04574425, + "step": 17794 + }, + { + "epoch": 35.59, + "grad_norm": 1.153123140335083, + "learning_rate": 2e-05, + "loss": 0.04651198, + "step": 17795 + }, + { + "epoch": 35.592, + "grad_norm": 1.0932104587554932, + "learning_rate": 2e-05, + "loss": 0.03670096, + "step": 17796 + }, + { + "epoch": 35.594, + "grad_norm": 1.3459787368774414, + "learning_rate": 2e-05, + "loss": 0.04722689, + "step": 17797 + }, + { + "epoch": 35.596, + "grad_norm": 1.6134040355682373, + "learning_rate": 2e-05, + "loss": 0.05787089, + "step": 17798 + }, + { + "epoch": 35.598, + "grad_norm": 0.8614208698272705, + "learning_rate": 2e-05, + "loss": 0.02445607, + "step": 17799 + }, + { + "epoch": 35.6, + "grad_norm": 1.3574453592300415, + "learning_rate": 2e-05, + "loss": 0.05048421, + "step": 17800 + }, + { + "epoch": 35.602, + "grad_norm": 1.5967652797698975, + "learning_rate": 2e-05, + "loss": 0.05926722, + "step": 17801 + }, + { + "epoch": 35.604, + "grad_norm": 1.3235586881637573, + "learning_rate": 2e-05, + "loss": 0.04076, + "step": 17802 + }, + { + "epoch": 35.606, + "grad_norm": 1.4626275300979614, + "learning_rate": 2e-05, + "loss": 0.03682541, + "step": 17803 + }, + { + "epoch": 35.608, + "grad_norm": 1.6847271919250488, + "learning_rate": 2e-05, + "loss": 0.0393777, + "step": 17804 + }, + { + "epoch": 35.61, + "grad_norm": 2.216646194458008, + "learning_rate": 2e-05, + "loss": 0.03593418, + "step": 17805 + }, + { + "epoch": 35.612, + "grad_norm": 1.109135627746582, + "learning_rate": 2e-05, + "loss": 0.03311303, + "step": 17806 + }, + { + "epoch": 35.614, + "grad_norm": 1.2525417804718018, + "learning_rate": 2e-05, + "loss": 0.05682575, + 
"step": 17807 + }, + { + "epoch": 35.616, + "grad_norm": 1.1396535634994507, + "learning_rate": 2e-05, + "loss": 0.0337383, + "step": 17808 + }, + { + "epoch": 35.618, + "grad_norm": 0.8801586031913757, + "learning_rate": 2e-05, + "loss": 0.02758865, + "step": 17809 + }, + { + "epoch": 35.62, + "grad_norm": 1.4734177589416504, + "learning_rate": 2e-05, + "loss": 0.04279312, + "step": 17810 + }, + { + "epoch": 35.622, + "grad_norm": 1.2697399854660034, + "learning_rate": 2e-05, + "loss": 0.03729413, + "step": 17811 + }, + { + "epoch": 35.624, + "grad_norm": 1.2622113227844238, + "learning_rate": 2e-05, + "loss": 0.05834858, + "step": 17812 + }, + { + "epoch": 35.626, + "grad_norm": 1.3223472833633423, + "learning_rate": 2e-05, + "loss": 0.04107845, + "step": 17813 + }, + { + "epoch": 35.628, + "grad_norm": 1.3146884441375732, + "learning_rate": 2e-05, + "loss": 0.05463982, + "step": 17814 + }, + { + "epoch": 35.63, + "grad_norm": 0.9225746393203735, + "learning_rate": 2e-05, + "loss": 0.02599794, + "step": 17815 + }, + { + "epoch": 35.632, + "grad_norm": 1.4230635166168213, + "learning_rate": 2e-05, + "loss": 0.04019064, + "step": 17816 + }, + { + "epoch": 35.634, + "grad_norm": 1.6114380359649658, + "learning_rate": 2e-05, + "loss": 0.05018803, + "step": 17817 + }, + { + "epoch": 35.636, + "grad_norm": 1.1191251277923584, + "learning_rate": 2e-05, + "loss": 0.03190347, + "step": 17818 + }, + { + "epoch": 35.638, + "grad_norm": 2.0187177658081055, + "learning_rate": 2e-05, + "loss": 0.04886641, + "step": 17819 + }, + { + "epoch": 35.64, + "grad_norm": 1.0409353971481323, + "learning_rate": 2e-05, + "loss": 0.03332942, + "step": 17820 + }, + { + "epoch": 35.642, + "grad_norm": 1.3523070812225342, + "learning_rate": 2e-05, + "loss": 0.05481984, + "step": 17821 + }, + { + "epoch": 35.644, + "grad_norm": 0.8652260899543762, + "learning_rate": 2e-05, + "loss": 0.02689623, + "step": 17822 + }, + { + "epoch": 35.646, + "grad_norm": 1.0217618942260742, + "learning_rate": 2e-05, + "loss": 0.03837987, + "step": 17823 + }, + { + "epoch": 35.648, + "grad_norm": 2.860708713531494, + "learning_rate": 2e-05, + "loss": 0.0455092, + "step": 17824 + }, + { + "epoch": 35.65, + "grad_norm": 1.2912145853042603, + "learning_rate": 2e-05, + "loss": 0.04535623, + "step": 17825 + }, + { + "epoch": 35.652, + "grad_norm": 1.1695078611373901, + "learning_rate": 2e-05, + "loss": 0.047709, + "step": 17826 + }, + { + "epoch": 35.654, + "grad_norm": 1.5502129793167114, + "learning_rate": 2e-05, + "loss": 0.05648194, + "step": 17827 + }, + { + "epoch": 35.656, + "grad_norm": 0.9700777530670166, + "learning_rate": 2e-05, + "loss": 0.02404735, + "step": 17828 + }, + { + "epoch": 35.658, + "grad_norm": 1.0643705129623413, + "learning_rate": 2e-05, + "loss": 0.02897286, + "step": 17829 + }, + { + "epoch": 35.66, + "grad_norm": 1.0671511888504028, + "learning_rate": 2e-05, + "loss": 0.03070662, + "step": 17830 + }, + { + "epoch": 35.662, + "grad_norm": 1.3201302289962769, + "learning_rate": 2e-05, + "loss": 0.04533841, + "step": 17831 + }, + { + "epoch": 35.664, + "grad_norm": 1.1541919708251953, + "learning_rate": 2e-05, + "loss": 0.044394, + "step": 17832 + }, + { + "epoch": 35.666, + "grad_norm": 1.3019088506698608, + "learning_rate": 2e-05, + "loss": 0.06686329, + "step": 17833 + }, + { + "epoch": 35.668, + "grad_norm": 1.0324286222457886, + "learning_rate": 2e-05, + "loss": 0.04079415, + "step": 17834 + }, + { + "epoch": 35.67, + "grad_norm": 1.0423561334609985, + "learning_rate": 2e-05, + "loss": 0.03136035, + "step": 
17835 + }, + { + "epoch": 35.672, + "grad_norm": 1.6549469232559204, + "learning_rate": 2e-05, + "loss": 0.0495429, + "step": 17836 + }, + { + "epoch": 35.674, + "grad_norm": 3.071321725845337, + "learning_rate": 2e-05, + "loss": 0.04680189, + "step": 17837 + }, + { + "epoch": 35.676, + "grad_norm": 1.4656696319580078, + "learning_rate": 2e-05, + "loss": 0.04357903, + "step": 17838 + }, + { + "epoch": 35.678, + "grad_norm": 1.0006139278411865, + "learning_rate": 2e-05, + "loss": 0.02649287, + "step": 17839 + }, + { + "epoch": 35.68, + "grad_norm": 0.8368288278579712, + "learning_rate": 2e-05, + "loss": 0.02872172, + "step": 17840 + }, + { + "epoch": 35.682, + "grad_norm": 1.1067603826522827, + "learning_rate": 2e-05, + "loss": 0.04076967, + "step": 17841 + }, + { + "epoch": 35.684, + "grad_norm": 1.4088644981384277, + "learning_rate": 2e-05, + "loss": 0.04598125, + "step": 17842 + }, + { + "epoch": 35.686, + "grad_norm": 1.1588618755340576, + "learning_rate": 2e-05, + "loss": 0.03723881, + "step": 17843 + }, + { + "epoch": 35.688, + "grad_norm": 1.1325336694717407, + "learning_rate": 2e-05, + "loss": 0.04327301, + "step": 17844 + }, + { + "epoch": 35.69, + "grad_norm": 2.3693675994873047, + "learning_rate": 2e-05, + "loss": 0.05009006, + "step": 17845 + }, + { + "epoch": 35.692, + "grad_norm": 1.2514508962631226, + "learning_rate": 2e-05, + "loss": 0.03267618, + "step": 17846 + }, + { + "epoch": 35.694, + "grad_norm": 1.3177813291549683, + "learning_rate": 2e-05, + "loss": 0.02680693, + "step": 17847 + }, + { + "epoch": 35.696, + "grad_norm": 1.2522759437561035, + "learning_rate": 2e-05, + "loss": 0.04521048, + "step": 17848 + }, + { + "epoch": 35.698, + "grad_norm": 1.2396931648254395, + "learning_rate": 2e-05, + "loss": 0.04618759, + "step": 17849 + }, + { + "epoch": 35.7, + "grad_norm": 1.2118010520935059, + "learning_rate": 2e-05, + "loss": 0.03588796, + "step": 17850 + }, + { + "epoch": 35.702, + "grad_norm": 1.1289559602737427, + "learning_rate": 2e-05, + "loss": 0.04039676, + "step": 17851 + }, + { + "epoch": 35.704, + "grad_norm": 2.2850546836853027, + "learning_rate": 2e-05, + "loss": 0.04581477, + "step": 17852 + }, + { + "epoch": 35.706, + "grad_norm": 1.0945602655410767, + "learning_rate": 2e-05, + "loss": 0.02884125, + "step": 17853 + }, + { + "epoch": 35.708, + "grad_norm": 1.218193769454956, + "learning_rate": 2e-05, + "loss": 0.03631239, + "step": 17854 + }, + { + "epoch": 35.71, + "grad_norm": 1.4844825267791748, + "learning_rate": 2e-05, + "loss": 0.02633794, + "step": 17855 + }, + { + "epoch": 35.712, + "grad_norm": 2.8186700344085693, + "learning_rate": 2e-05, + "loss": 0.05136289, + "step": 17856 + }, + { + "epoch": 35.714, + "grad_norm": 1.0413309335708618, + "learning_rate": 2e-05, + "loss": 0.02841013, + "step": 17857 + }, + { + "epoch": 35.716, + "grad_norm": 1.4248929023742676, + "learning_rate": 2e-05, + "loss": 0.04046667, + "step": 17858 + }, + { + "epoch": 35.718, + "grad_norm": 1.086219310760498, + "learning_rate": 2e-05, + "loss": 0.03643753, + "step": 17859 + }, + { + "epoch": 35.72, + "grad_norm": 1.0413047075271606, + "learning_rate": 2e-05, + "loss": 0.03506773, + "step": 17860 + }, + { + "epoch": 35.722, + "grad_norm": 1.138818383216858, + "learning_rate": 2e-05, + "loss": 0.04383482, + "step": 17861 + }, + { + "epoch": 35.724, + "grad_norm": 1.0784032344818115, + "learning_rate": 2e-05, + "loss": 0.04035743, + "step": 17862 + }, + { + "epoch": 35.726, + "grad_norm": 1.1928517818450928, + "learning_rate": 2e-05, + "loss": 0.04039065, + "step": 17863 + 
}, + { + "epoch": 35.728, + "grad_norm": 1.3450567722320557, + "learning_rate": 2e-05, + "loss": 0.051686, + "step": 17864 + }, + { + "epoch": 35.73, + "grad_norm": 1.3145865201950073, + "learning_rate": 2e-05, + "loss": 0.05856508, + "step": 17865 + }, + { + "epoch": 35.732, + "grad_norm": 1.7338427305221558, + "learning_rate": 2e-05, + "loss": 0.05495846, + "step": 17866 + }, + { + "epoch": 35.734, + "grad_norm": 1.2222492694854736, + "learning_rate": 2e-05, + "loss": 0.04681749, + "step": 17867 + }, + { + "epoch": 35.736, + "grad_norm": 2.382176399230957, + "learning_rate": 2e-05, + "loss": 0.0592317, + "step": 17868 + }, + { + "epoch": 35.738, + "grad_norm": 1.3851975202560425, + "learning_rate": 2e-05, + "loss": 0.05830447, + "step": 17869 + }, + { + "epoch": 35.74, + "grad_norm": 1.8277791738510132, + "learning_rate": 2e-05, + "loss": 0.04027114, + "step": 17870 + }, + { + "epoch": 35.742, + "grad_norm": 1.3364810943603516, + "learning_rate": 2e-05, + "loss": 0.04788211, + "step": 17871 + }, + { + "epoch": 35.744, + "grad_norm": 1.4539287090301514, + "learning_rate": 2e-05, + "loss": 0.05281156, + "step": 17872 + }, + { + "epoch": 35.746, + "grad_norm": 2.4535436630249023, + "learning_rate": 2e-05, + "loss": 0.05732207, + "step": 17873 + }, + { + "epoch": 35.748, + "grad_norm": 1.3156609535217285, + "learning_rate": 2e-05, + "loss": 0.04180748, + "step": 17874 + }, + { + "epoch": 35.75, + "grad_norm": 1.083983063697815, + "learning_rate": 2e-05, + "loss": 0.02724944, + "step": 17875 + }, + { + "epoch": 35.752, + "grad_norm": 1.2590404748916626, + "learning_rate": 2e-05, + "loss": 0.04488721, + "step": 17876 + }, + { + "epoch": 35.754, + "grad_norm": 1.1767133474349976, + "learning_rate": 2e-05, + "loss": 0.03508144, + "step": 17877 + }, + { + "epoch": 35.756, + "grad_norm": 0.9333146810531616, + "learning_rate": 2e-05, + "loss": 0.03762346, + "step": 17878 + }, + { + "epoch": 35.758, + "grad_norm": 3.9581522941589355, + "learning_rate": 2e-05, + "loss": 0.05363543, + "step": 17879 + }, + { + "epoch": 35.76, + "grad_norm": 1.9999139308929443, + "learning_rate": 2e-05, + "loss": 0.05396073, + "step": 17880 + }, + { + "epoch": 35.762, + "grad_norm": 1.12847900390625, + "learning_rate": 2e-05, + "loss": 0.04188031, + "step": 17881 + }, + { + "epoch": 35.764, + "grad_norm": 1.221652626991272, + "learning_rate": 2e-05, + "loss": 0.04514348, + "step": 17882 + }, + { + "epoch": 35.766, + "grad_norm": 1.2314302921295166, + "learning_rate": 2e-05, + "loss": 0.04036722, + "step": 17883 + }, + { + "epoch": 35.768, + "grad_norm": 1.4413890838623047, + "learning_rate": 2e-05, + "loss": 0.05193814, + "step": 17884 + }, + { + "epoch": 35.77, + "grad_norm": 1.3951367139816284, + "learning_rate": 2e-05, + "loss": 0.03265073, + "step": 17885 + }, + { + "epoch": 35.772, + "grad_norm": 1.0324435234069824, + "learning_rate": 2e-05, + "loss": 0.03620376, + "step": 17886 + }, + { + "epoch": 35.774, + "grad_norm": 1.1713546514511108, + "learning_rate": 2e-05, + "loss": 0.04896335, + "step": 17887 + }, + { + "epoch": 35.776, + "grad_norm": 0.991287350654602, + "learning_rate": 2e-05, + "loss": 0.03548204, + "step": 17888 + }, + { + "epoch": 35.778, + "grad_norm": 2.7516236305236816, + "learning_rate": 2e-05, + "loss": 0.06211228, + "step": 17889 + }, + { + "epoch": 35.78, + "grad_norm": 2.7214620113372803, + "learning_rate": 2e-05, + "loss": 0.05013406, + "step": 17890 + }, + { + "epoch": 35.782, + "grad_norm": 1.233435034751892, + "learning_rate": 2e-05, + "loss": 0.04874311, + "step": 17891 + }, + { + 
"epoch": 35.784, + "grad_norm": 1.7725192308425903, + "learning_rate": 2e-05, + "loss": 0.04005573, + "step": 17892 + }, + { + "epoch": 35.786, + "grad_norm": 1.3703975677490234, + "learning_rate": 2e-05, + "loss": 0.04984409, + "step": 17893 + }, + { + "epoch": 35.788, + "grad_norm": 1.3456453084945679, + "learning_rate": 2e-05, + "loss": 0.04753321, + "step": 17894 + }, + { + "epoch": 35.79, + "grad_norm": 1.2621350288391113, + "learning_rate": 2e-05, + "loss": 0.05199986, + "step": 17895 + }, + { + "epoch": 35.792, + "grad_norm": 1.3152614831924438, + "learning_rate": 2e-05, + "loss": 0.03647928, + "step": 17896 + }, + { + "epoch": 35.794, + "grad_norm": 0.9398272633552551, + "learning_rate": 2e-05, + "loss": 0.02891801, + "step": 17897 + }, + { + "epoch": 35.796, + "grad_norm": 1.0658025741577148, + "learning_rate": 2e-05, + "loss": 0.03435162, + "step": 17898 + }, + { + "epoch": 35.798, + "grad_norm": 1.157807469367981, + "learning_rate": 2e-05, + "loss": 0.03711173, + "step": 17899 + }, + { + "epoch": 35.8, + "grad_norm": 2.451808214187622, + "learning_rate": 2e-05, + "loss": 0.03681424, + "step": 17900 + }, + { + "epoch": 35.802, + "grad_norm": 1.0727334022521973, + "learning_rate": 2e-05, + "loss": 0.04209089, + "step": 17901 + }, + { + "epoch": 35.804, + "grad_norm": 1.314285159111023, + "learning_rate": 2e-05, + "loss": 0.0380992, + "step": 17902 + }, + { + "epoch": 35.806, + "grad_norm": 1.1336785554885864, + "learning_rate": 2e-05, + "loss": 0.03314412, + "step": 17903 + }, + { + "epoch": 35.808, + "grad_norm": 1.7368667125701904, + "learning_rate": 2e-05, + "loss": 0.0345618, + "step": 17904 + }, + { + "epoch": 35.81, + "grad_norm": 2.1174581050872803, + "learning_rate": 2e-05, + "loss": 0.06510151, + "step": 17905 + }, + { + "epoch": 35.812, + "grad_norm": 1.6650149822235107, + "learning_rate": 2e-05, + "loss": 0.04526421, + "step": 17906 + }, + { + "epoch": 35.814, + "grad_norm": 1.2398277521133423, + "learning_rate": 2e-05, + "loss": 0.037236, + "step": 17907 + }, + { + "epoch": 35.816, + "grad_norm": 1.1168713569641113, + "learning_rate": 2e-05, + "loss": 0.03919902, + "step": 17908 + }, + { + "epoch": 35.818, + "grad_norm": 1.6926127672195435, + "learning_rate": 2e-05, + "loss": 0.05567209, + "step": 17909 + }, + { + "epoch": 35.82, + "grad_norm": 0.9734522104263306, + "learning_rate": 2e-05, + "loss": 0.0383275, + "step": 17910 + }, + { + "epoch": 35.822, + "grad_norm": 1.0680774450302124, + "learning_rate": 2e-05, + "loss": 0.04609864, + "step": 17911 + }, + { + "epoch": 35.824, + "grad_norm": 1.3214465379714966, + "learning_rate": 2e-05, + "loss": 0.04425301, + "step": 17912 + }, + { + "epoch": 35.826, + "grad_norm": 1.187393307685852, + "learning_rate": 2e-05, + "loss": 0.04773477, + "step": 17913 + }, + { + "epoch": 35.828, + "grad_norm": 2.846890926361084, + "learning_rate": 2e-05, + "loss": 0.04056527, + "step": 17914 + }, + { + "epoch": 35.83, + "grad_norm": 1.2541725635528564, + "learning_rate": 2e-05, + "loss": 0.04198845, + "step": 17915 + }, + { + "epoch": 35.832, + "grad_norm": 1.2594901323318481, + "learning_rate": 2e-05, + "loss": 0.04509019, + "step": 17916 + }, + { + "epoch": 35.834, + "grad_norm": 1.6290315389633179, + "learning_rate": 2e-05, + "loss": 0.04840211, + "step": 17917 + }, + { + "epoch": 35.836, + "grad_norm": 1.9645651578903198, + "learning_rate": 2e-05, + "loss": 0.04485439, + "step": 17918 + }, + { + "epoch": 35.838, + "grad_norm": 1.1047340631484985, + "learning_rate": 2e-05, + "loss": 0.04456278, + "step": 17919 + }, + { + "epoch": 
35.84, + "grad_norm": 1.4236819744110107, + "learning_rate": 2e-05, + "loss": 0.05413448, + "step": 17920 + }, + { + "epoch": 35.842, + "grad_norm": 1.3188161849975586, + "learning_rate": 2e-05, + "loss": 0.05367437, + "step": 17921 + }, + { + "epoch": 35.844, + "grad_norm": 1.1377904415130615, + "learning_rate": 2e-05, + "loss": 0.04965197, + "step": 17922 + }, + { + "epoch": 35.846, + "grad_norm": 2.1205976009368896, + "learning_rate": 2e-05, + "loss": 0.05599382, + "step": 17923 + }, + { + "epoch": 35.848, + "grad_norm": 1.2316977977752686, + "learning_rate": 2e-05, + "loss": 0.04126142, + "step": 17924 + }, + { + "epoch": 35.85, + "grad_norm": 1.2489413022994995, + "learning_rate": 2e-05, + "loss": 0.04298563, + "step": 17925 + }, + { + "epoch": 35.852, + "grad_norm": 1.4723291397094727, + "learning_rate": 2e-05, + "loss": 0.04364316, + "step": 17926 + }, + { + "epoch": 35.854, + "grad_norm": 0.9845232963562012, + "learning_rate": 2e-05, + "loss": 0.03328026, + "step": 17927 + }, + { + "epoch": 35.856, + "grad_norm": 1.11769700050354, + "learning_rate": 2e-05, + "loss": 0.03830117, + "step": 17928 + }, + { + "epoch": 35.858, + "grad_norm": 1.0789967775344849, + "learning_rate": 2e-05, + "loss": 0.03304796, + "step": 17929 + }, + { + "epoch": 35.86, + "grad_norm": 1.4628854990005493, + "learning_rate": 2e-05, + "loss": 0.04162867, + "step": 17930 + }, + { + "epoch": 35.862, + "grad_norm": 1.1505697965621948, + "learning_rate": 2e-05, + "loss": 0.04003832, + "step": 17931 + }, + { + "epoch": 35.864, + "grad_norm": 0.9786707758903503, + "learning_rate": 2e-05, + "loss": 0.04291322, + "step": 17932 + }, + { + "epoch": 35.866, + "grad_norm": 1.514482021331787, + "learning_rate": 2e-05, + "loss": 0.04682028, + "step": 17933 + }, + { + "epoch": 35.868, + "grad_norm": 1.150888442993164, + "learning_rate": 2e-05, + "loss": 0.04786021, + "step": 17934 + }, + { + "epoch": 35.87, + "grad_norm": 1.2002613544464111, + "learning_rate": 2e-05, + "loss": 0.0451806, + "step": 17935 + }, + { + "epoch": 35.872, + "grad_norm": 1.3272571563720703, + "learning_rate": 2e-05, + "loss": 0.05705176, + "step": 17936 + }, + { + "epoch": 35.874, + "grad_norm": 1.1266093254089355, + "learning_rate": 2e-05, + "loss": 0.03312952, + "step": 17937 + }, + { + "epoch": 35.876, + "grad_norm": 1.2619445323944092, + "learning_rate": 2e-05, + "loss": 0.04542191, + "step": 17938 + }, + { + "epoch": 35.878, + "grad_norm": 1.2090903520584106, + "learning_rate": 2e-05, + "loss": 0.04657109, + "step": 17939 + }, + { + "epoch": 35.88, + "grad_norm": 1.0083225965499878, + "learning_rate": 2e-05, + "loss": 0.03867437, + "step": 17940 + }, + { + "epoch": 35.882, + "grad_norm": 1.0939233303070068, + "learning_rate": 2e-05, + "loss": 0.02842722, + "step": 17941 + }, + { + "epoch": 35.884, + "grad_norm": 1.0067133903503418, + "learning_rate": 2e-05, + "loss": 0.03263469, + "step": 17942 + }, + { + "epoch": 35.886, + "grad_norm": 1.1299545764923096, + "learning_rate": 2e-05, + "loss": 0.04446633, + "step": 17943 + }, + { + "epoch": 35.888, + "grad_norm": 1.0273183584213257, + "learning_rate": 2e-05, + "loss": 0.05045258, + "step": 17944 + }, + { + "epoch": 35.89, + "grad_norm": 1.154341697692871, + "learning_rate": 2e-05, + "loss": 0.03366087, + "step": 17945 + }, + { + "epoch": 35.892, + "grad_norm": 1.1345313787460327, + "learning_rate": 2e-05, + "loss": 0.03205415, + "step": 17946 + }, + { + "epoch": 35.894, + "grad_norm": 1.1819157600402832, + "learning_rate": 2e-05, + "loss": 0.04350836, + "step": 17947 + }, + { + "epoch": 35.896, + 
"grad_norm": 1.7022780179977417, + "learning_rate": 2e-05, + "loss": 0.048735, + "step": 17948 + }, + { + "epoch": 35.898, + "grad_norm": 0.8520524501800537, + "learning_rate": 2e-05, + "loss": 0.02894574, + "step": 17949 + }, + { + "epoch": 35.9, + "grad_norm": 1.0947223901748657, + "learning_rate": 2e-05, + "loss": 0.03767444, + "step": 17950 + }, + { + "epoch": 35.902, + "grad_norm": 2.9594931602478027, + "learning_rate": 2e-05, + "loss": 0.0309197, + "step": 17951 + }, + { + "epoch": 35.904, + "grad_norm": 1.3062098026275635, + "learning_rate": 2e-05, + "loss": 0.04968993, + "step": 17952 + }, + { + "epoch": 35.906, + "grad_norm": 1.0888330936431885, + "learning_rate": 2e-05, + "loss": 0.04075143, + "step": 17953 + }, + { + "epoch": 35.908, + "grad_norm": 1.0041322708129883, + "learning_rate": 2e-05, + "loss": 0.03644698, + "step": 17954 + }, + { + "epoch": 35.91, + "grad_norm": 1.0307637453079224, + "learning_rate": 2e-05, + "loss": 0.04232953, + "step": 17955 + }, + { + "epoch": 35.912, + "grad_norm": 1.2270115613937378, + "learning_rate": 2e-05, + "loss": 0.04430905, + "step": 17956 + }, + { + "epoch": 35.914, + "grad_norm": 1.2956207990646362, + "learning_rate": 2e-05, + "loss": 0.06014199, + "step": 17957 + }, + { + "epoch": 35.916, + "grad_norm": 1.5076323747634888, + "learning_rate": 2e-05, + "loss": 0.0593874, + "step": 17958 + }, + { + "epoch": 35.918, + "grad_norm": 1.1515308618545532, + "learning_rate": 2e-05, + "loss": 0.04382184, + "step": 17959 + }, + { + "epoch": 35.92, + "grad_norm": 1.1946406364440918, + "learning_rate": 2e-05, + "loss": 0.05360639, + "step": 17960 + }, + { + "epoch": 35.922, + "grad_norm": 1.332234501838684, + "learning_rate": 2e-05, + "loss": 0.03495939, + "step": 17961 + }, + { + "epoch": 35.924, + "grad_norm": 1.5882817506790161, + "learning_rate": 2e-05, + "loss": 0.02272969, + "step": 17962 + }, + { + "epoch": 35.926, + "grad_norm": 1.1679266691207886, + "learning_rate": 2e-05, + "loss": 0.05053775, + "step": 17963 + }, + { + "epoch": 35.928, + "grad_norm": 1.0632550716400146, + "learning_rate": 2e-05, + "loss": 0.04534806, + "step": 17964 + }, + { + "epoch": 35.93, + "grad_norm": 1.6096125841140747, + "learning_rate": 2e-05, + "loss": 0.0572957, + "step": 17965 + }, + { + "epoch": 35.932, + "grad_norm": 1.2726202011108398, + "learning_rate": 2e-05, + "loss": 0.04721672, + "step": 17966 + }, + { + "epoch": 35.934, + "grad_norm": 1.637152075767517, + "learning_rate": 2e-05, + "loss": 0.03014551, + "step": 17967 + }, + { + "epoch": 35.936, + "grad_norm": 1.2941720485687256, + "learning_rate": 2e-05, + "loss": 0.04506875, + "step": 17968 + }, + { + "epoch": 35.938, + "grad_norm": 1.165555477142334, + "learning_rate": 2e-05, + "loss": 0.04141459, + "step": 17969 + }, + { + "epoch": 35.94, + "grad_norm": 1.2198277711868286, + "learning_rate": 2e-05, + "loss": 0.04705646, + "step": 17970 + }, + { + "epoch": 35.942, + "grad_norm": 1.2274084091186523, + "learning_rate": 2e-05, + "loss": 0.04948511, + "step": 17971 + }, + { + "epoch": 35.944, + "grad_norm": 1.474108338356018, + "learning_rate": 2e-05, + "loss": 0.04234532, + "step": 17972 + }, + { + "epoch": 35.946, + "grad_norm": 1.1247254610061646, + "learning_rate": 2e-05, + "loss": 0.04175204, + "step": 17973 + }, + { + "epoch": 35.948, + "grad_norm": 1.2691006660461426, + "learning_rate": 2e-05, + "loss": 0.05016943, + "step": 17974 + }, + { + "epoch": 35.95, + "grad_norm": 1.319199800491333, + "learning_rate": 2e-05, + "loss": 0.05450393, + "step": 17975 + }, + { + "epoch": 35.952, + "grad_norm": 
1.6631337404251099, + "learning_rate": 2e-05, + "loss": 0.05106764, + "step": 17976 + }, + { + "epoch": 35.954, + "grad_norm": 1.2125028371810913, + "learning_rate": 2e-05, + "loss": 0.05636699, + "step": 17977 + }, + { + "epoch": 35.956, + "grad_norm": 1.0986932516098022, + "learning_rate": 2e-05, + "loss": 0.04804093, + "step": 17978 + }, + { + "epoch": 35.958, + "grad_norm": 2.018481731414795, + "learning_rate": 2e-05, + "loss": 0.04954034, + "step": 17979 + }, + { + "epoch": 35.96, + "grad_norm": 1.1213892698287964, + "learning_rate": 2e-05, + "loss": 0.03687691, + "step": 17980 + }, + { + "epoch": 35.962, + "grad_norm": 1.2770687341690063, + "learning_rate": 2e-05, + "loss": 0.04670438, + "step": 17981 + }, + { + "epoch": 35.964, + "grad_norm": 1.8598698377609253, + "learning_rate": 2e-05, + "loss": 0.04108175, + "step": 17982 + }, + { + "epoch": 35.966, + "grad_norm": 0.9518316388130188, + "learning_rate": 2e-05, + "loss": 0.03023814, + "step": 17983 + }, + { + "epoch": 35.968, + "grad_norm": 1.3764249086380005, + "learning_rate": 2e-05, + "loss": 0.03840397, + "step": 17984 + }, + { + "epoch": 35.97, + "grad_norm": 0.9351934194564819, + "learning_rate": 2e-05, + "loss": 0.03416009, + "step": 17985 + }, + { + "epoch": 35.972, + "grad_norm": 1.1933887004852295, + "learning_rate": 2e-05, + "loss": 0.03491713, + "step": 17986 + }, + { + "epoch": 35.974, + "grad_norm": 1.3929616212844849, + "learning_rate": 2e-05, + "loss": 0.05122756, + "step": 17987 + }, + { + "epoch": 35.976, + "grad_norm": 1.3909257650375366, + "learning_rate": 2e-05, + "loss": 0.04380251, + "step": 17988 + }, + { + "epoch": 35.978, + "grad_norm": 1.3311553001403809, + "learning_rate": 2e-05, + "loss": 0.04476759, + "step": 17989 + }, + { + "epoch": 35.98, + "grad_norm": 1.3036085367202759, + "learning_rate": 2e-05, + "loss": 0.05047145, + "step": 17990 + }, + { + "epoch": 35.982, + "grad_norm": 1.5605629682540894, + "learning_rate": 2e-05, + "loss": 0.04389017, + "step": 17991 + }, + { + "epoch": 35.984, + "grad_norm": 1.2548414468765259, + "learning_rate": 2e-05, + "loss": 0.03170571, + "step": 17992 + }, + { + "epoch": 35.986, + "grad_norm": 1.2417798042297363, + "learning_rate": 2e-05, + "loss": 0.04844061, + "step": 17993 + }, + { + "epoch": 35.988, + "grad_norm": 1.1286331415176392, + "learning_rate": 2e-05, + "loss": 0.04077397, + "step": 17994 + }, + { + "epoch": 35.99, + "grad_norm": 1.581292748451233, + "learning_rate": 2e-05, + "loss": 0.03007511, + "step": 17995 + }, + { + "epoch": 35.992, + "grad_norm": 1.491782784461975, + "learning_rate": 2e-05, + "loss": 0.03442695, + "step": 17996 + }, + { + "epoch": 35.994, + "grad_norm": 1.134804368019104, + "learning_rate": 2e-05, + "loss": 0.03684225, + "step": 17997 + }, + { + "epoch": 35.996, + "grad_norm": 2.2671315670013428, + "learning_rate": 2e-05, + "loss": 0.04276287, + "step": 17998 + }, + { + "epoch": 35.998, + "grad_norm": 1.1745227575302124, + "learning_rate": 2e-05, + "loss": 0.047242, + "step": 17999 + }, + { + "epoch": 36.0, + "grad_norm": 1.1241718530654907, + "learning_rate": 2e-05, + "loss": 0.0284595, + "step": 18000 + }, + { + "epoch": 36.0, + "eval_performance": { + "AngleClassification_1": 1.0, + "AngleClassification_2": 1.0, + "AngleClassification_3": 0.9820359281437125, + "Equal_1": 0.998, + "Equal_2": 0.9800399201596807, + "Equal_3": 0.9900199600798403, + "LineComparison_1": 1.0, + "LineComparison_2": 1.0, + "LineComparison_3": 0.998003992015968, + "Parallel_1": 0.9959919839679359, + "Parallel_2": 0.9939879759519038, + "Parallel_3": 
0.994, + "Perpendicular_1": 0.994, + "Perpendicular_2": 0.994, + "Perpendicular_3": 0.8837675350701403, + "PointLiesOnCircle_1": 1.0, + "PointLiesOnCircle_2": 1.0, + "PointLiesOnCircle_3": 0.996, + "PointLiesOnLine_1": 0.9979959919839679, + "PointLiesOnLine_2": 0.9959919839679359, + "PointLiesOnLine_3": 0.9880239520958084 + }, + "eval_runtime": 319.9643, + "eval_samples_per_second": 32.816, + "eval_steps_per_second": 0.656, + "step": 18000 + }, + { + "epoch": 36.002, + "grad_norm": 1.679438829421997, + "learning_rate": 2e-05, + "loss": 0.06195679, + "step": 18001 + }, + { + "epoch": 36.004, + "grad_norm": 1.7376255989074707, + "learning_rate": 2e-05, + "loss": 0.06136629, + "step": 18002 + }, + { + "epoch": 36.006, + "grad_norm": 1.0917325019836426, + "learning_rate": 2e-05, + "loss": 0.04106084, + "step": 18003 + }, + { + "epoch": 36.008, + "grad_norm": 1.2398790121078491, + "learning_rate": 2e-05, + "loss": 0.05541268, + "step": 18004 + }, + { + "epoch": 36.01, + "grad_norm": 1.5430097579956055, + "learning_rate": 2e-05, + "loss": 0.04105628, + "step": 18005 + }, + { + "epoch": 36.012, + "grad_norm": 1.0548675060272217, + "learning_rate": 2e-05, + "loss": 0.03720319, + "step": 18006 + }, + { + "epoch": 36.014, + "grad_norm": 1.1223161220550537, + "learning_rate": 2e-05, + "loss": 0.03772051, + "step": 18007 + }, + { + "epoch": 36.016, + "grad_norm": 1.4329164028167725, + "learning_rate": 2e-05, + "loss": 0.04604128, + "step": 18008 + }, + { + "epoch": 36.018, + "grad_norm": 2.9451615810394287, + "learning_rate": 2e-05, + "loss": 0.0591033, + "step": 18009 + }, + { + "epoch": 36.02, + "grad_norm": 1.7627770900726318, + "learning_rate": 2e-05, + "loss": 0.05250013, + "step": 18010 + }, + { + "epoch": 36.022, + "grad_norm": 0.9771580696105957, + "learning_rate": 2e-05, + "loss": 0.04982607, + "step": 18011 + }, + { + "epoch": 36.024, + "grad_norm": 1.4142186641693115, + "learning_rate": 2e-05, + "loss": 0.05801858, + "step": 18012 + }, + { + "epoch": 36.026, + "grad_norm": 0.9408489465713501, + "learning_rate": 2e-05, + "loss": 0.0294435, + "step": 18013 + }, + { + "epoch": 36.028, + "grad_norm": 1.266186237335205, + "learning_rate": 2e-05, + "loss": 0.04173048, + "step": 18014 + }, + { + "epoch": 36.03, + "grad_norm": 1.2469063997268677, + "learning_rate": 2e-05, + "loss": 0.05035538, + "step": 18015 + }, + { + "epoch": 36.032, + "grad_norm": 0.9710808396339417, + "learning_rate": 2e-05, + "loss": 0.03061551, + "step": 18016 + }, + { + "epoch": 36.034, + "grad_norm": 1.1993094682693481, + "learning_rate": 2e-05, + "loss": 0.0386519, + "step": 18017 + }, + { + "epoch": 36.036, + "grad_norm": 1.0367350578308105, + "learning_rate": 2e-05, + "loss": 0.04080002, + "step": 18018 + }, + { + "epoch": 36.038, + "grad_norm": 1.0484521389007568, + "learning_rate": 2e-05, + "loss": 0.03632751, + "step": 18019 + }, + { + "epoch": 36.04, + "grad_norm": 1.096776008605957, + "learning_rate": 2e-05, + "loss": 0.03241234, + "step": 18020 + }, + { + "epoch": 36.042, + "grad_norm": 1.2367771863937378, + "learning_rate": 2e-05, + "loss": 0.04970008, + "step": 18021 + }, + { + "epoch": 36.044, + "grad_norm": 1.095942735671997, + "learning_rate": 2e-05, + "loss": 0.03803166, + "step": 18022 + }, + { + "epoch": 36.046, + "grad_norm": 1.2419805526733398, + "learning_rate": 2e-05, + "loss": 0.03578827, + "step": 18023 + }, + { + "epoch": 36.048, + "grad_norm": 0.9755739569664001, + "learning_rate": 2e-05, + "loss": 0.03225515, + "step": 18024 + }, + { + "epoch": 36.05, + "grad_norm": 0.9210935235023499, + 
"learning_rate": 2e-05, + "loss": 0.03545808, + "step": 18025 + }, + { + "epoch": 36.052, + "grad_norm": 1.514643669128418, + "learning_rate": 2e-05, + "loss": 0.04578691, + "step": 18026 + }, + { + "epoch": 36.054, + "grad_norm": 1.3199200630187988, + "learning_rate": 2e-05, + "loss": 0.04699472, + "step": 18027 + }, + { + "epoch": 36.056, + "grad_norm": 1.3400589227676392, + "learning_rate": 2e-05, + "loss": 0.04195932, + "step": 18028 + }, + { + "epoch": 36.058, + "grad_norm": 1.3302844762802124, + "learning_rate": 2e-05, + "loss": 0.05917486, + "step": 18029 + }, + { + "epoch": 36.06, + "grad_norm": 0.9098374247550964, + "learning_rate": 2e-05, + "loss": 0.02756627, + "step": 18030 + }, + { + "epoch": 36.062, + "grad_norm": 0.9550777673721313, + "learning_rate": 2e-05, + "loss": 0.02522951, + "step": 18031 + }, + { + "epoch": 36.064, + "grad_norm": 1.3498256206512451, + "learning_rate": 2e-05, + "loss": 0.03331952, + "step": 18032 + }, + { + "epoch": 36.066, + "grad_norm": 1.149773359298706, + "learning_rate": 2e-05, + "loss": 0.04008397, + "step": 18033 + }, + { + "epoch": 36.068, + "grad_norm": 1.6260327100753784, + "learning_rate": 2e-05, + "loss": 0.02002457, + "step": 18034 + }, + { + "epoch": 36.07, + "grad_norm": 1.0886062383651733, + "learning_rate": 2e-05, + "loss": 0.03689205, + "step": 18035 + }, + { + "epoch": 36.072, + "grad_norm": 2.1114742755889893, + "learning_rate": 2e-05, + "loss": 0.04384237, + "step": 18036 + }, + { + "epoch": 36.074, + "grad_norm": 1.199912428855896, + "learning_rate": 2e-05, + "loss": 0.03832989, + "step": 18037 + }, + { + "epoch": 36.076, + "grad_norm": 1.1416548490524292, + "learning_rate": 2e-05, + "loss": 0.04295663, + "step": 18038 + }, + { + "epoch": 36.078, + "grad_norm": 5.6245245933532715, + "learning_rate": 2e-05, + "loss": 0.04268225, + "step": 18039 + }, + { + "epoch": 36.08, + "grad_norm": 1.2393498420715332, + "learning_rate": 2e-05, + "loss": 0.03811008, + "step": 18040 + }, + { + "epoch": 36.082, + "grad_norm": 1.0027004480361938, + "learning_rate": 2e-05, + "loss": 0.03410269, + "step": 18041 + }, + { + "epoch": 36.084, + "grad_norm": 1.7057132720947266, + "learning_rate": 2e-05, + "loss": 0.03965345, + "step": 18042 + }, + { + "epoch": 36.086, + "grad_norm": 0.9601574540138245, + "learning_rate": 2e-05, + "loss": 0.02651468, + "step": 18043 + }, + { + "epoch": 36.088, + "grad_norm": 1.2537764310836792, + "learning_rate": 2e-05, + "loss": 0.03860223, + "step": 18044 + }, + { + "epoch": 36.09, + "grad_norm": 1.3177660703659058, + "learning_rate": 2e-05, + "loss": 0.05462038, + "step": 18045 + }, + { + "epoch": 36.092, + "grad_norm": 1.2594581842422485, + "learning_rate": 2e-05, + "loss": 0.04445657, + "step": 18046 + }, + { + "epoch": 36.094, + "grad_norm": 1.132057547569275, + "learning_rate": 2e-05, + "loss": 0.05026545, + "step": 18047 + }, + { + "epoch": 36.096, + "grad_norm": 1.5587725639343262, + "learning_rate": 2e-05, + "loss": 0.049737, + "step": 18048 + }, + { + "epoch": 36.098, + "grad_norm": 1.235729694366455, + "learning_rate": 2e-05, + "loss": 0.04665924, + "step": 18049 + }, + { + "epoch": 36.1, + "grad_norm": 1.4784846305847168, + "learning_rate": 2e-05, + "loss": 0.04161521, + "step": 18050 + }, + { + "epoch": 36.102, + "grad_norm": 1.898224115371704, + "learning_rate": 2e-05, + "loss": 0.05342315, + "step": 18051 + }, + { + "epoch": 36.104, + "grad_norm": 1.1921885013580322, + "learning_rate": 2e-05, + "loss": 0.03857575, + "step": 18052 + }, + { + "epoch": 36.106, + "grad_norm": 1.1267322301864624, + 
"learning_rate": 2e-05, + "loss": 0.04110962, + "step": 18053 + }, + { + "epoch": 36.108, + "grad_norm": 1.1211832761764526, + "learning_rate": 2e-05, + "loss": 0.04066541, + "step": 18054 + }, + { + "epoch": 36.11, + "grad_norm": 1.1396983861923218, + "learning_rate": 2e-05, + "loss": 0.03554956, + "step": 18055 + }, + { + "epoch": 36.112, + "grad_norm": 4.26861572265625, + "learning_rate": 2e-05, + "loss": 0.04480337, + "step": 18056 + }, + { + "epoch": 36.114, + "grad_norm": 1.3064028024673462, + "learning_rate": 2e-05, + "loss": 0.05451906, + "step": 18057 + }, + { + "epoch": 36.116, + "grad_norm": 1.1184580326080322, + "learning_rate": 2e-05, + "loss": 0.04466115, + "step": 18058 + }, + { + "epoch": 36.118, + "grad_norm": 2.734436273574829, + "learning_rate": 2e-05, + "loss": 0.04473677, + "step": 18059 + }, + { + "epoch": 36.12, + "grad_norm": 1.1926535367965698, + "learning_rate": 2e-05, + "loss": 0.04510562, + "step": 18060 + }, + { + "epoch": 36.122, + "grad_norm": 2.227550506591797, + "learning_rate": 2e-05, + "loss": 0.03720244, + "step": 18061 + }, + { + "epoch": 36.124, + "grad_norm": 1.2409284114837646, + "learning_rate": 2e-05, + "loss": 0.04574725, + "step": 18062 + }, + { + "epoch": 36.126, + "grad_norm": 1.0186352729797363, + "learning_rate": 2e-05, + "loss": 0.03966548, + "step": 18063 + }, + { + "epoch": 36.128, + "grad_norm": 1.098799467086792, + "learning_rate": 2e-05, + "loss": 0.04066562, + "step": 18064 + }, + { + "epoch": 36.13, + "grad_norm": 0.9615515470504761, + "learning_rate": 2e-05, + "loss": 0.02897996, + "step": 18065 + }, + { + "epoch": 36.132, + "grad_norm": 2.371126413345337, + "learning_rate": 2e-05, + "loss": 0.06829099, + "step": 18066 + }, + { + "epoch": 36.134, + "grad_norm": 1.4561325311660767, + "learning_rate": 2e-05, + "loss": 0.04208488, + "step": 18067 + }, + { + "epoch": 36.136, + "grad_norm": 1.1017025709152222, + "learning_rate": 2e-05, + "loss": 0.04764595, + "step": 18068 + }, + { + "epoch": 36.138, + "grad_norm": 1.6677707433700562, + "learning_rate": 2e-05, + "loss": 0.03898593, + "step": 18069 + }, + { + "epoch": 36.14, + "grad_norm": 1.2435901165008545, + "learning_rate": 2e-05, + "loss": 0.03861672, + "step": 18070 + }, + { + "epoch": 36.142, + "grad_norm": 1.3686686754226685, + "learning_rate": 2e-05, + "loss": 0.05133841, + "step": 18071 + }, + { + "epoch": 36.144, + "grad_norm": 1.1071763038635254, + "learning_rate": 2e-05, + "loss": 0.04213175, + "step": 18072 + }, + { + "epoch": 36.146, + "grad_norm": 1.1424134969711304, + "learning_rate": 2e-05, + "loss": 0.04421389, + "step": 18073 + }, + { + "epoch": 36.148, + "grad_norm": 1.2619765996932983, + "learning_rate": 2e-05, + "loss": 0.05113876, + "step": 18074 + }, + { + "epoch": 36.15, + "grad_norm": 2.3322865962982178, + "learning_rate": 2e-05, + "loss": 0.05380969, + "step": 18075 + }, + { + "epoch": 36.152, + "grad_norm": 1.5360093116760254, + "learning_rate": 2e-05, + "loss": 0.03704157, + "step": 18076 + }, + { + "epoch": 36.154, + "grad_norm": 2.7045676708221436, + "learning_rate": 2e-05, + "loss": 0.03724046, + "step": 18077 + }, + { + "epoch": 36.156, + "grad_norm": 1.778171181678772, + "learning_rate": 2e-05, + "loss": 0.06204282, + "step": 18078 + }, + { + "epoch": 36.158, + "grad_norm": 2.811089277267456, + "learning_rate": 2e-05, + "loss": 0.04121366, + "step": 18079 + }, + { + "epoch": 36.16, + "grad_norm": 1.1129862070083618, + "learning_rate": 2e-05, + "loss": 0.03516244, + "step": 18080 + }, + { + "epoch": 36.162, + "grad_norm": 1.1569979190826416, + 
"learning_rate": 2e-05, + "loss": 0.04278459, + "step": 18081 + }, + { + "epoch": 36.164, + "grad_norm": 1.064875841140747, + "learning_rate": 2e-05, + "loss": 0.03698279, + "step": 18082 + }, + { + "epoch": 36.166, + "grad_norm": 1.0886690616607666, + "learning_rate": 2e-05, + "loss": 0.04283421, + "step": 18083 + }, + { + "epoch": 36.168, + "grad_norm": 3.398261785507202, + "learning_rate": 2e-05, + "loss": 0.03867529, + "step": 18084 + }, + { + "epoch": 36.17, + "grad_norm": 1.657079815864563, + "learning_rate": 2e-05, + "loss": 0.04585808, + "step": 18085 + }, + { + "epoch": 36.172, + "grad_norm": 1.3320543766021729, + "learning_rate": 2e-05, + "loss": 0.04661385, + "step": 18086 + }, + { + "epoch": 36.174, + "grad_norm": 0.8988568782806396, + "learning_rate": 2e-05, + "loss": 0.02427765, + "step": 18087 + }, + { + "epoch": 36.176, + "grad_norm": 1.284536361694336, + "learning_rate": 2e-05, + "loss": 0.0532205, + "step": 18088 + }, + { + "epoch": 36.178, + "grad_norm": 1.1643673181533813, + "learning_rate": 2e-05, + "loss": 0.03632015, + "step": 18089 + }, + { + "epoch": 36.18, + "grad_norm": 1.3021336793899536, + "learning_rate": 2e-05, + "loss": 0.05177925, + "step": 18090 + }, + { + "epoch": 36.182, + "grad_norm": 2.0210723876953125, + "learning_rate": 2e-05, + "loss": 0.04610447, + "step": 18091 + }, + { + "epoch": 36.184, + "grad_norm": 0.9699888229370117, + "learning_rate": 2e-05, + "loss": 0.0398217, + "step": 18092 + }, + { + "epoch": 36.186, + "grad_norm": 1.0085978507995605, + "learning_rate": 2e-05, + "loss": 0.03745851, + "step": 18093 + }, + { + "epoch": 36.188, + "grad_norm": 1.3533179759979248, + "learning_rate": 2e-05, + "loss": 0.05687679, + "step": 18094 + }, + { + "epoch": 36.19, + "grad_norm": 1.0797785520553589, + "learning_rate": 2e-05, + "loss": 0.04193022, + "step": 18095 + }, + { + "epoch": 36.192, + "grad_norm": 1.4561572074890137, + "learning_rate": 2e-05, + "loss": 0.03306325, + "step": 18096 + }, + { + "epoch": 36.194, + "grad_norm": 1.2475993633270264, + "learning_rate": 2e-05, + "loss": 0.03667925, + "step": 18097 + }, + { + "epoch": 36.196, + "grad_norm": 0.9133176803588867, + "learning_rate": 2e-05, + "loss": 0.02343579, + "step": 18098 + }, + { + "epoch": 36.198, + "grad_norm": 1.6272944211959839, + "learning_rate": 2e-05, + "loss": 0.04554627, + "step": 18099 + }, + { + "epoch": 36.2, + "grad_norm": 1.5745600461959839, + "learning_rate": 2e-05, + "loss": 0.04788917, + "step": 18100 + }, + { + "epoch": 36.202, + "grad_norm": 1.2428606748580933, + "learning_rate": 2e-05, + "loss": 0.04026973, + "step": 18101 + }, + { + "epoch": 36.204, + "grad_norm": 1.6128360033035278, + "learning_rate": 2e-05, + "loss": 0.05024995, + "step": 18102 + }, + { + "epoch": 36.206, + "grad_norm": 1.0230610370635986, + "learning_rate": 2e-05, + "loss": 0.03382932, + "step": 18103 + }, + { + "epoch": 36.208, + "grad_norm": 1.075156331062317, + "learning_rate": 2e-05, + "loss": 0.04463224, + "step": 18104 + }, + { + "epoch": 36.21, + "grad_norm": 3.846419095993042, + "learning_rate": 2e-05, + "loss": 0.05278007, + "step": 18105 + }, + { + "epoch": 36.212, + "grad_norm": 0.9034450054168701, + "learning_rate": 2e-05, + "loss": 0.03557969, + "step": 18106 + }, + { + "epoch": 36.214, + "grad_norm": 0.923078179359436, + "learning_rate": 2e-05, + "loss": 0.03522691, + "step": 18107 + }, + { + "epoch": 36.216, + "grad_norm": 1.3479729890823364, + "learning_rate": 2e-05, + "loss": 0.0439695, + "step": 18108 + }, + { + "epoch": 36.218, + "grad_norm": 2.1420326232910156, + 
"learning_rate": 2e-05, + "loss": 0.06162531, + "step": 18109 + }, + { + "epoch": 36.22, + "grad_norm": 1.184019684791565, + "learning_rate": 2e-05, + "loss": 0.04064212, + "step": 18110 + }, + { + "epoch": 36.222, + "grad_norm": 1.0127112865447998, + "learning_rate": 2e-05, + "loss": 0.03657559, + "step": 18111 + }, + { + "epoch": 36.224, + "grad_norm": 0.9742773175239563, + "learning_rate": 2e-05, + "loss": 0.03809122, + "step": 18112 + }, + { + "epoch": 36.226, + "grad_norm": 1.1490448713302612, + "learning_rate": 2e-05, + "loss": 0.04694427, + "step": 18113 + }, + { + "epoch": 36.228, + "grad_norm": 0.9744361042976379, + "learning_rate": 2e-05, + "loss": 0.02423439, + "step": 18114 + }, + { + "epoch": 36.23, + "grad_norm": 1.1082568168640137, + "learning_rate": 2e-05, + "loss": 0.04402753, + "step": 18115 + }, + { + "epoch": 36.232, + "grad_norm": 1.1237958669662476, + "learning_rate": 2e-05, + "loss": 0.03403135, + "step": 18116 + }, + { + "epoch": 36.234, + "grad_norm": 1.0820505619049072, + "learning_rate": 2e-05, + "loss": 0.03654093, + "step": 18117 + }, + { + "epoch": 36.236, + "grad_norm": 1.0331096649169922, + "learning_rate": 2e-05, + "loss": 0.03929486, + "step": 18118 + }, + { + "epoch": 36.238, + "grad_norm": 1.2376075983047485, + "learning_rate": 2e-05, + "loss": 0.03313967, + "step": 18119 + }, + { + "epoch": 36.24, + "grad_norm": 1.596076250076294, + "learning_rate": 2e-05, + "loss": 0.04521656, + "step": 18120 + }, + { + "epoch": 36.242, + "grad_norm": 1.2083992958068848, + "learning_rate": 2e-05, + "loss": 0.04801769, + "step": 18121 + }, + { + "epoch": 36.244, + "grad_norm": 1.3693628311157227, + "learning_rate": 2e-05, + "loss": 0.05652841, + "step": 18122 + }, + { + "epoch": 36.246, + "grad_norm": 1.8372113704681396, + "learning_rate": 2e-05, + "loss": 0.03900075, + "step": 18123 + }, + { + "epoch": 36.248, + "grad_norm": 1.6199109554290771, + "learning_rate": 2e-05, + "loss": 0.05532604, + "step": 18124 + }, + { + "epoch": 36.25, + "grad_norm": 1.1799906492233276, + "learning_rate": 2e-05, + "loss": 0.03721724, + "step": 18125 + }, + { + "epoch": 36.252, + "grad_norm": 1.0089119672775269, + "learning_rate": 2e-05, + "loss": 0.03412523, + "step": 18126 + }, + { + "epoch": 36.254, + "grad_norm": 1.3146682977676392, + "learning_rate": 2e-05, + "loss": 0.03678686, + "step": 18127 + }, + { + "epoch": 36.256, + "grad_norm": 1.4293696880340576, + "learning_rate": 2e-05, + "loss": 0.05230099, + "step": 18128 + }, + { + "epoch": 36.258, + "grad_norm": 1.1753870248794556, + "learning_rate": 2e-05, + "loss": 0.04117392, + "step": 18129 + }, + { + "epoch": 36.26, + "grad_norm": 1.2926392555236816, + "learning_rate": 2e-05, + "loss": 0.04781886, + "step": 18130 + }, + { + "epoch": 36.262, + "grad_norm": 1.7987608909606934, + "learning_rate": 2e-05, + "loss": 0.05109328, + "step": 18131 + }, + { + "epoch": 36.264, + "grad_norm": 1.0095752477645874, + "learning_rate": 2e-05, + "loss": 0.04271962, + "step": 18132 + }, + { + "epoch": 36.266, + "grad_norm": 0.901914656162262, + "learning_rate": 2e-05, + "loss": 0.02623034, + "step": 18133 + }, + { + "epoch": 36.268, + "grad_norm": 1.1711057424545288, + "learning_rate": 2e-05, + "loss": 0.03598663, + "step": 18134 + }, + { + "epoch": 36.27, + "grad_norm": 0.9837378859519958, + "learning_rate": 2e-05, + "loss": 0.03688063, + "step": 18135 + }, + { + "epoch": 36.272, + "grad_norm": 1.2390130758285522, + "learning_rate": 2e-05, + "loss": 0.05092961, + "step": 18136 + }, + { + "epoch": 36.274, + "grad_norm": 1.270090103149414, + 
"learning_rate": 2e-05, + "loss": 0.0404551, + "step": 18137 + }, + { + "epoch": 36.276, + "grad_norm": 1.5756562948226929, + "learning_rate": 2e-05, + "loss": 0.04972642, + "step": 18138 + }, + { + "epoch": 36.278, + "grad_norm": 0.9940088987350464, + "learning_rate": 2e-05, + "loss": 0.03693637, + "step": 18139 + }, + { + "epoch": 36.28, + "grad_norm": 0.9043747782707214, + "learning_rate": 2e-05, + "loss": 0.02635769, + "step": 18140 + }, + { + "epoch": 36.282, + "grad_norm": 1.2040432691574097, + "learning_rate": 2e-05, + "loss": 0.04434269, + "step": 18141 + }, + { + "epoch": 36.284, + "grad_norm": 1.0949373245239258, + "learning_rate": 2e-05, + "loss": 0.03474148, + "step": 18142 + }, + { + "epoch": 36.286, + "grad_norm": 1.09050452709198, + "learning_rate": 2e-05, + "loss": 0.04845259, + "step": 18143 + }, + { + "epoch": 36.288, + "grad_norm": 1.3407831192016602, + "learning_rate": 2e-05, + "loss": 0.04304389, + "step": 18144 + }, + { + "epoch": 36.29, + "grad_norm": 1.1936886310577393, + "learning_rate": 2e-05, + "loss": 0.04481979, + "step": 18145 + }, + { + "epoch": 36.292, + "grad_norm": 2.4183106422424316, + "learning_rate": 2e-05, + "loss": 0.05331695, + "step": 18146 + }, + { + "epoch": 36.294, + "grad_norm": 1.1295486688613892, + "learning_rate": 2e-05, + "loss": 0.03321164, + "step": 18147 + }, + { + "epoch": 36.296, + "grad_norm": 1.360809087753296, + "learning_rate": 2e-05, + "loss": 0.03916159, + "step": 18148 + }, + { + "epoch": 36.298, + "grad_norm": 1.247570276260376, + "learning_rate": 2e-05, + "loss": 0.0428711, + "step": 18149 + }, + { + "epoch": 36.3, + "grad_norm": 1.036298394203186, + "learning_rate": 2e-05, + "loss": 0.04656066, + "step": 18150 + }, + { + "epoch": 36.302, + "grad_norm": 0.9646769165992737, + "learning_rate": 2e-05, + "loss": 0.03399408, + "step": 18151 + }, + { + "epoch": 36.304, + "grad_norm": 1.164666771888733, + "learning_rate": 2e-05, + "loss": 0.04837671, + "step": 18152 + }, + { + "epoch": 36.306, + "grad_norm": 1.2248945236206055, + "learning_rate": 2e-05, + "loss": 0.0307962, + "step": 18153 + }, + { + "epoch": 36.308, + "grad_norm": 1.1758040189743042, + "learning_rate": 2e-05, + "loss": 0.0404579, + "step": 18154 + }, + { + "epoch": 36.31, + "grad_norm": 1.0307793617248535, + "learning_rate": 2e-05, + "loss": 0.0290431, + "step": 18155 + }, + { + "epoch": 36.312, + "grad_norm": 0.920478880405426, + "learning_rate": 2e-05, + "loss": 0.03528172, + "step": 18156 + }, + { + "epoch": 36.314, + "grad_norm": 1.0334807634353638, + "learning_rate": 2e-05, + "loss": 0.05046558, + "step": 18157 + }, + { + "epoch": 36.316, + "grad_norm": 0.9907777309417725, + "learning_rate": 2e-05, + "loss": 0.03023825, + "step": 18158 + }, + { + "epoch": 36.318, + "grad_norm": 1.227904200553894, + "learning_rate": 2e-05, + "loss": 0.04913838, + "step": 18159 + }, + { + "epoch": 36.32, + "grad_norm": 1.2009419202804565, + "learning_rate": 2e-05, + "loss": 0.04932972, + "step": 18160 + }, + { + "epoch": 36.322, + "grad_norm": 0.9665030241012573, + "learning_rate": 2e-05, + "loss": 0.03171739, + "step": 18161 + }, + { + "epoch": 36.324, + "grad_norm": 1.2279425859451294, + "learning_rate": 2e-05, + "loss": 0.0608729, + "step": 18162 + }, + { + "epoch": 36.326, + "grad_norm": 0.9468924403190613, + "learning_rate": 2e-05, + "loss": 0.03156189, + "step": 18163 + }, + { + "epoch": 36.328, + "grad_norm": 1.0261448621749878, + "learning_rate": 2e-05, + "loss": 0.03610597, + "step": 18164 + }, + { + "epoch": 36.33, + "grad_norm": 1.276781678199768, + "learning_rate": 
2e-05, + "loss": 0.03549629, + "step": 18165 + }, + { + "epoch": 36.332, + "grad_norm": 0.9759575724601746, + "learning_rate": 2e-05, + "loss": 0.03994817, + "step": 18166 + }, + { + "epoch": 36.334, + "grad_norm": 1.7276750802993774, + "learning_rate": 2e-05, + "loss": 0.04654774, + "step": 18167 + }, + { + "epoch": 36.336, + "grad_norm": 0.9664547443389893, + "learning_rate": 2e-05, + "loss": 0.03177688, + "step": 18168 + }, + { + "epoch": 36.338, + "grad_norm": 1.9670281410217285, + "learning_rate": 2e-05, + "loss": 0.04651101, + "step": 18169 + }, + { + "epoch": 36.34, + "grad_norm": 1.802821159362793, + "learning_rate": 2e-05, + "loss": 0.04297075, + "step": 18170 + }, + { + "epoch": 36.342, + "grad_norm": 1.661774754524231, + "learning_rate": 2e-05, + "loss": 0.06117053, + "step": 18171 + }, + { + "epoch": 36.344, + "grad_norm": 2.3734986782073975, + "learning_rate": 2e-05, + "loss": 0.05952629, + "step": 18172 + }, + { + "epoch": 36.346, + "grad_norm": 1.2025349140167236, + "learning_rate": 2e-05, + "loss": 0.04102109, + "step": 18173 + }, + { + "epoch": 36.348, + "grad_norm": 1.0982835292816162, + "learning_rate": 2e-05, + "loss": 0.04104698, + "step": 18174 + }, + { + "epoch": 36.35, + "grad_norm": 1.157949686050415, + "learning_rate": 2e-05, + "loss": 0.03813836, + "step": 18175 + }, + { + "epoch": 36.352, + "grad_norm": 1.055647611618042, + "learning_rate": 2e-05, + "loss": 0.03598933, + "step": 18176 + }, + { + "epoch": 36.354, + "grad_norm": 1.3801038265228271, + "learning_rate": 2e-05, + "loss": 0.04944832, + "step": 18177 + }, + { + "epoch": 36.356, + "grad_norm": 1.322588324546814, + "learning_rate": 2e-05, + "loss": 0.03539138, + "step": 18178 + }, + { + "epoch": 36.358, + "grad_norm": 1.4064815044403076, + "learning_rate": 2e-05, + "loss": 0.05578845, + "step": 18179 + }, + { + "epoch": 36.36, + "grad_norm": 3.2461321353912354, + "learning_rate": 2e-05, + "loss": 0.04246819, + "step": 18180 + }, + { + "epoch": 36.362, + "grad_norm": 1.1064374446868896, + "learning_rate": 2e-05, + "loss": 0.03681324, + "step": 18181 + }, + { + "epoch": 36.364, + "grad_norm": 1.1665987968444824, + "learning_rate": 2e-05, + "loss": 0.04058225, + "step": 18182 + }, + { + "epoch": 36.366, + "grad_norm": 1.029982089996338, + "learning_rate": 2e-05, + "loss": 0.03287161, + "step": 18183 + }, + { + "epoch": 36.368, + "grad_norm": 1.1927478313446045, + "learning_rate": 2e-05, + "loss": 0.05367187, + "step": 18184 + }, + { + "epoch": 36.37, + "grad_norm": 1.2972198724746704, + "learning_rate": 2e-05, + "loss": 0.04106817, + "step": 18185 + }, + { + "epoch": 36.372, + "grad_norm": 1.027411699295044, + "learning_rate": 2e-05, + "loss": 0.04088916, + "step": 18186 + }, + { + "epoch": 36.374, + "grad_norm": 1.171830415725708, + "learning_rate": 2e-05, + "loss": 0.05644122, + "step": 18187 + }, + { + "epoch": 36.376, + "grad_norm": 1.1561020612716675, + "learning_rate": 2e-05, + "loss": 0.04188239, + "step": 18188 + }, + { + "epoch": 36.378, + "grad_norm": 1.0463422536849976, + "learning_rate": 2e-05, + "loss": 0.04296407, + "step": 18189 + }, + { + "epoch": 36.38, + "grad_norm": 1.2229862213134766, + "learning_rate": 2e-05, + "loss": 0.03844842, + "step": 18190 + }, + { + "epoch": 36.382, + "grad_norm": 0.8768106698989868, + "learning_rate": 2e-05, + "loss": 0.03120964, + "step": 18191 + }, + { + "epoch": 36.384, + "grad_norm": 1.1767603158950806, + "learning_rate": 2e-05, + "loss": 0.03380742, + "step": 18192 + }, + { + "epoch": 36.386, + "grad_norm": 0.9635298848152161, + "learning_rate": 2e-05, + 
"loss": 0.03339863, + "step": 18193 + }, + { + "epoch": 36.388, + "grad_norm": 0.9768321514129639, + "learning_rate": 2e-05, + "loss": 0.03579611, + "step": 18194 + }, + { + "epoch": 36.39, + "grad_norm": 0.8538011312484741, + "learning_rate": 2e-05, + "loss": 0.02970689, + "step": 18195 + }, + { + "epoch": 36.392, + "grad_norm": 1.0878775119781494, + "learning_rate": 2e-05, + "loss": 0.04001588, + "step": 18196 + }, + { + "epoch": 36.394, + "grad_norm": 1.142221212387085, + "learning_rate": 2e-05, + "loss": 0.04551936, + "step": 18197 + }, + { + "epoch": 36.396, + "grad_norm": 1.5530418157577515, + "learning_rate": 2e-05, + "loss": 0.03043629, + "step": 18198 + }, + { + "epoch": 36.398, + "grad_norm": 1.637148141860962, + "learning_rate": 2e-05, + "loss": 0.03448223, + "step": 18199 + }, + { + "epoch": 36.4, + "grad_norm": 1.4483563899993896, + "learning_rate": 2e-05, + "loss": 0.05348013, + "step": 18200 + }, + { + "epoch": 36.402, + "grad_norm": 1.0635143518447876, + "learning_rate": 2e-05, + "loss": 0.03588112, + "step": 18201 + }, + { + "epoch": 36.404, + "grad_norm": 2.611689329147339, + "learning_rate": 2e-05, + "loss": 0.06078885, + "step": 18202 + }, + { + "epoch": 36.406, + "grad_norm": 1.502625584602356, + "learning_rate": 2e-05, + "loss": 0.03423886, + "step": 18203 + }, + { + "epoch": 36.408, + "grad_norm": 1.3298054933547974, + "learning_rate": 2e-05, + "loss": 0.04151591, + "step": 18204 + }, + { + "epoch": 36.41, + "grad_norm": 1.27322518825531, + "learning_rate": 2e-05, + "loss": 0.05073592, + "step": 18205 + }, + { + "epoch": 36.412, + "grad_norm": 1.035677433013916, + "learning_rate": 2e-05, + "loss": 0.04113561, + "step": 18206 + }, + { + "epoch": 36.414, + "grad_norm": 1.7269580364227295, + "learning_rate": 2e-05, + "loss": 0.04068903, + "step": 18207 + }, + { + "epoch": 36.416, + "grad_norm": 1.314218521118164, + "learning_rate": 2e-05, + "loss": 0.04115914, + "step": 18208 + }, + { + "epoch": 36.418, + "grad_norm": 1.3631287813186646, + "learning_rate": 2e-05, + "loss": 0.05018599, + "step": 18209 + }, + { + "epoch": 36.42, + "grad_norm": 1.4252984523773193, + "learning_rate": 2e-05, + "loss": 0.05225445, + "step": 18210 + }, + { + "epoch": 36.422, + "grad_norm": 1.0618059635162354, + "learning_rate": 2e-05, + "loss": 0.03559672, + "step": 18211 + }, + { + "epoch": 36.424, + "grad_norm": 1.217882752418518, + "learning_rate": 2e-05, + "loss": 0.0485566, + "step": 18212 + }, + { + "epoch": 36.426, + "grad_norm": 1.2114055156707764, + "learning_rate": 2e-05, + "loss": 0.04762184, + "step": 18213 + }, + { + "epoch": 36.428, + "grad_norm": 2.0187506675720215, + "learning_rate": 2e-05, + "loss": 0.05630653, + "step": 18214 + }, + { + "epoch": 36.43, + "grad_norm": 1.9597657918930054, + "learning_rate": 2e-05, + "loss": 0.06257641, + "step": 18215 + }, + { + "epoch": 36.432, + "grad_norm": 0.9971281886100769, + "learning_rate": 2e-05, + "loss": 0.03872079, + "step": 18216 + }, + { + "epoch": 36.434, + "grad_norm": 1.2572100162506104, + "learning_rate": 2e-05, + "loss": 0.03614823, + "step": 18217 + }, + { + "epoch": 36.436, + "grad_norm": 1.1719560623168945, + "learning_rate": 2e-05, + "loss": 0.03753065, + "step": 18218 + }, + { + "epoch": 36.438, + "grad_norm": 1.0278385877609253, + "learning_rate": 2e-05, + "loss": 0.03851878, + "step": 18219 + }, + { + "epoch": 36.44, + "grad_norm": 1.2253730297088623, + "learning_rate": 2e-05, + "loss": 0.0503806, + "step": 18220 + }, + { + "epoch": 36.442, + "grad_norm": 0.88956218957901, + "learning_rate": 2e-05, + "loss": 
0.02707586, + "step": 18221 + }, + { + "epoch": 36.444, + "grad_norm": 0.8710851669311523, + "learning_rate": 2e-05, + "loss": 0.0222133, + "step": 18222 + }, + { + "epoch": 36.446, + "grad_norm": 1.2004845142364502, + "learning_rate": 2e-05, + "loss": 0.05466457, + "step": 18223 + }, + { + "epoch": 36.448, + "grad_norm": 1.3014144897460938, + "learning_rate": 2e-05, + "loss": 0.05169172, + "step": 18224 + }, + { + "epoch": 36.45, + "grad_norm": 1.0197980403900146, + "learning_rate": 2e-05, + "loss": 0.03822663, + "step": 18225 + }, + { + "epoch": 36.452, + "grad_norm": 1.3792431354522705, + "learning_rate": 2e-05, + "loss": 0.04293468, + "step": 18226 + }, + { + "epoch": 36.454, + "grad_norm": 1.1744886636734009, + "learning_rate": 2e-05, + "loss": 0.05211157, + "step": 18227 + }, + { + "epoch": 36.456, + "grad_norm": 1.598494052886963, + "learning_rate": 2e-05, + "loss": 0.04538838, + "step": 18228 + }, + { + "epoch": 36.458, + "grad_norm": 0.8673466444015503, + "learning_rate": 2e-05, + "loss": 0.0319301, + "step": 18229 + }, + { + "epoch": 36.46, + "grad_norm": 1.918244481086731, + "learning_rate": 2e-05, + "loss": 0.05689289, + "step": 18230 + }, + { + "epoch": 36.462, + "grad_norm": 1.1343375444412231, + "learning_rate": 2e-05, + "loss": 0.04852476, + "step": 18231 + }, + { + "epoch": 36.464, + "grad_norm": 1.910357117652893, + "learning_rate": 2e-05, + "loss": 0.04690136, + "step": 18232 + }, + { + "epoch": 36.466, + "grad_norm": 1.1695517301559448, + "learning_rate": 2e-05, + "loss": 0.04719057, + "step": 18233 + }, + { + "epoch": 36.468, + "grad_norm": 1.595226526260376, + "learning_rate": 2e-05, + "loss": 0.04680973, + "step": 18234 + }, + { + "epoch": 36.47, + "grad_norm": 1.0426281690597534, + "learning_rate": 2e-05, + "loss": 0.03990892, + "step": 18235 + }, + { + "epoch": 36.472, + "grad_norm": 1.362654209136963, + "learning_rate": 2e-05, + "loss": 0.03500029, + "step": 18236 + }, + { + "epoch": 36.474, + "grad_norm": 1.1419274806976318, + "learning_rate": 2e-05, + "loss": 0.04769664, + "step": 18237 + }, + { + "epoch": 36.476, + "grad_norm": 1.1063685417175293, + "learning_rate": 2e-05, + "loss": 0.03118099, + "step": 18238 + }, + { + "epoch": 36.478, + "grad_norm": 1.0712615251541138, + "learning_rate": 2e-05, + "loss": 0.0473005, + "step": 18239 + }, + { + "epoch": 36.48, + "grad_norm": 0.9738886952400208, + "learning_rate": 2e-05, + "loss": 0.03309727, + "step": 18240 + }, + { + "epoch": 36.482, + "grad_norm": 1.6552538871765137, + "learning_rate": 2e-05, + "loss": 0.05177336, + "step": 18241 + }, + { + "epoch": 36.484, + "grad_norm": 0.9547532200813293, + "learning_rate": 2e-05, + "loss": 0.03158896, + "step": 18242 + }, + { + "epoch": 36.486, + "grad_norm": 1.1574902534484863, + "learning_rate": 2e-05, + "loss": 0.03545589, + "step": 18243 + }, + { + "epoch": 36.488, + "grad_norm": 1.1547956466674805, + "learning_rate": 2e-05, + "loss": 0.05037494, + "step": 18244 + }, + { + "epoch": 36.49, + "grad_norm": 1.5758951902389526, + "learning_rate": 2e-05, + "loss": 0.03462842, + "step": 18245 + }, + { + "epoch": 36.492, + "grad_norm": 1.3058934211730957, + "learning_rate": 2e-05, + "loss": 0.05186077, + "step": 18246 + }, + { + "epoch": 36.494, + "grad_norm": 2.153010129928589, + "learning_rate": 2e-05, + "loss": 0.05587346, + "step": 18247 + }, + { + "epoch": 36.496, + "grad_norm": 1.0306788682937622, + "learning_rate": 2e-05, + "loss": 0.02917742, + "step": 18248 + }, + { + "epoch": 36.498, + "grad_norm": 1.1398119926452637, + "learning_rate": 2e-05, + "loss": 0.0482577, + 
"step": 18249 + }, + { + "epoch": 36.5, + "grad_norm": 1.2190545797348022, + "learning_rate": 2e-05, + "loss": 0.04970558, + "step": 18250 + }, + { + "epoch": 36.502, + "grad_norm": 1.0682768821716309, + "learning_rate": 2e-05, + "loss": 0.03955474, + "step": 18251 + }, + { + "epoch": 36.504, + "grad_norm": 1.061971664428711, + "learning_rate": 2e-05, + "loss": 0.03687964, + "step": 18252 + }, + { + "epoch": 36.506, + "grad_norm": 1.0158944129943848, + "learning_rate": 2e-05, + "loss": 0.03078072, + "step": 18253 + }, + { + "epoch": 36.508, + "grad_norm": 1.3401365280151367, + "learning_rate": 2e-05, + "loss": 0.0381853, + "step": 18254 + }, + { + "epoch": 36.51, + "grad_norm": 1.350685715675354, + "learning_rate": 2e-05, + "loss": 0.04724355, + "step": 18255 + }, + { + "epoch": 36.512, + "grad_norm": 2.3168015480041504, + "learning_rate": 2e-05, + "loss": 0.04308641, + "step": 18256 + }, + { + "epoch": 36.514, + "grad_norm": 1.6673002243041992, + "learning_rate": 2e-05, + "loss": 0.04929385, + "step": 18257 + }, + { + "epoch": 36.516, + "grad_norm": 0.9662206768989563, + "learning_rate": 2e-05, + "loss": 0.03089986, + "step": 18258 + }, + { + "epoch": 36.518, + "grad_norm": 1.062555193901062, + "learning_rate": 2e-05, + "loss": 0.0395766, + "step": 18259 + }, + { + "epoch": 36.52, + "grad_norm": 1.0343207120895386, + "learning_rate": 2e-05, + "loss": 0.02805191, + "step": 18260 + }, + { + "epoch": 36.522, + "grad_norm": 0.9246268272399902, + "learning_rate": 2e-05, + "loss": 0.02393855, + "step": 18261 + }, + { + "epoch": 36.524, + "grad_norm": 1.8312151432037354, + "learning_rate": 2e-05, + "loss": 0.04711398, + "step": 18262 + }, + { + "epoch": 36.526, + "grad_norm": 1.2133851051330566, + "learning_rate": 2e-05, + "loss": 0.04653526, + "step": 18263 + }, + { + "epoch": 36.528, + "grad_norm": 0.9863196015357971, + "learning_rate": 2e-05, + "loss": 0.02355134, + "step": 18264 + }, + { + "epoch": 36.53, + "grad_norm": 1.0515233278274536, + "learning_rate": 2e-05, + "loss": 0.03952796, + "step": 18265 + }, + { + "epoch": 36.532, + "grad_norm": 0.971024751663208, + "learning_rate": 2e-05, + "loss": 0.03000312, + "step": 18266 + }, + { + "epoch": 36.534, + "grad_norm": 1.2390265464782715, + "learning_rate": 2e-05, + "loss": 0.03372075, + "step": 18267 + }, + { + "epoch": 36.536, + "grad_norm": 0.8119271993637085, + "learning_rate": 2e-05, + "loss": 0.0300536, + "step": 18268 + }, + { + "epoch": 36.538, + "grad_norm": 0.9378778338432312, + "learning_rate": 2e-05, + "loss": 0.02700995, + "step": 18269 + }, + { + "epoch": 36.54, + "grad_norm": 2.0789153575897217, + "learning_rate": 2e-05, + "loss": 0.03948624, + "step": 18270 + }, + { + "epoch": 36.542, + "grad_norm": 1.76309335231781, + "learning_rate": 2e-05, + "loss": 0.04286005, + "step": 18271 + }, + { + "epoch": 36.544, + "grad_norm": 4.2802581787109375, + "learning_rate": 2e-05, + "loss": 0.03132661, + "step": 18272 + }, + { + "epoch": 36.546, + "grad_norm": 2.573993444442749, + "learning_rate": 2e-05, + "loss": 0.04764672, + "step": 18273 + }, + { + "epoch": 36.548, + "grad_norm": 0.87221759557724, + "learning_rate": 2e-05, + "loss": 0.02382921, + "step": 18274 + }, + { + "epoch": 36.55, + "grad_norm": 1.0562106370925903, + "learning_rate": 2e-05, + "loss": 0.04094611, + "step": 18275 + }, + { + "epoch": 36.552, + "grad_norm": 1.2717176675796509, + "learning_rate": 2e-05, + "loss": 0.0410012, + "step": 18276 + }, + { + "epoch": 36.554, + "grad_norm": 1.2382874488830566, + "learning_rate": 2e-05, + "loss": 0.03663464, + "step": 18277 + }, 
+ { + "epoch": 36.556, + "grad_norm": 1.964185118675232, + "learning_rate": 2e-05, + "loss": 0.04018112, + "step": 18278 + }, + { + "epoch": 36.558, + "grad_norm": 1.2324621677398682, + "learning_rate": 2e-05, + "loss": 0.03608364, + "step": 18279 + }, + { + "epoch": 36.56, + "grad_norm": 1.2070492506027222, + "learning_rate": 2e-05, + "loss": 0.05842055, + "step": 18280 + }, + { + "epoch": 36.562, + "grad_norm": 1.1493645906448364, + "learning_rate": 2e-05, + "loss": 0.05052029, + "step": 18281 + }, + { + "epoch": 36.564, + "grad_norm": 2.735915422439575, + "learning_rate": 2e-05, + "loss": 0.04691009, + "step": 18282 + }, + { + "epoch": 36.566, + "grad_norm": 1.2203706502914429, + "learning_rate": 2e-05, + "loss": 0.04751462, + "step": 18283 + }, + { + "epoch": 36.568, + "grad_norm": 1.1800801753997803, + "learning_rate": 2e-05, + "loss": 0.05025848, + "step": 18284 + }, + { + "epoch": 36.57, + "grad_norm": 1.0068162679672241, + "learning_rate": 2e-05, + "loss": 0.03809405, + "step": 18285 + }, + { + "epoch": 36.572, + "grad_norm": 1.2168195247650146, + "learning_rate": 2e-05, + "loss": 0.04326154, + "step": 18286 + }, + { + "epoch": 36.574, + "grad_norm": 1.434417724609375, + "learning_rate": 2e-05, + "loss": 0.0574247, + "step": 18287 + }, + { + "epoch": 36.576, + "grad_norm": 1.1110270023345947, + "learning_rate": 2e-05, + "loss": 0.0267412, + "step": 18288 + }, + { + "epoch": 36.578, + "grad_norm": 1.2842190265655518, + "learning_rate": 2e-05, + "loss": 0.04922368, + "step": 18289 + }, + { + "epoch": 36.58, + "grad_norm": 1.4136754274368286, + "learning_rate": 2e-05, + "loss": 0.03303508, + "step": 18290 + }, + { + "epoch": 36.582, + "grad_norm": 1.025667667388916, + "learning_rate": 2e-05, + "loss": 0.03115687, + "step": 18291 + }, + { + "epoch": 36.584, + "grad_norm": 1.3057501316070557, + "learning_rate": 2e-05, + "loss": 0.05253988, + "step": 18292 + }, + { + "epoch": 36.586, + "grad_norm": 1.0783129930496216, + "learning_rate": 2e-05, + "loss": 0.03761948, + "step": 18293 + }, + { + "epoch": 36.588, + "grad_norm": 1.1081454753875732, + "learning_rate": 2e-05, + "loss": 0.03277981, + "step": 18294 + }, + { + "epoch": 36.59, + "grad_norm": 1.0502467155456543, + "learning_rate": 2e-05, + "loss": 0.03914522, + "step": 18295 + }, + { + "epoch": 36.592, + "grad_norm": 1.080041766166687, + "learning_rate": 2e-05, + "loss": 0.03669895, + "step": 18296 + }, + { + "epoch": 36.594, + "grad_norm": 2.066941976547241, + "learning_rate": 2e-05, + "loss": 0.05867566, + "step": 18297 + }, + { + "epoch": 36.596, + "grad_norm": 1.1148244142532349, + "learning_rate": 2e-05, + "loss": 0.04393861, + "step": 18298 + }, + { + "epoch": 36.598, + "grad_norm": 1.0712250471115112, + "learning_rate": 2e-05, + "loss": 0.03914363, + "step": 18299 + }, + { + "epoch": 36.6, + "grad_norm": 1.2081435918807983, + "learning_rate": 2e-05, + "loss": 0.03771123, + "step": 18300 + }, + { + "epoch": 36.602, + "grad_norm": 1.143244743347168, + "learning_rate": 2e-05, + "loss": 0.04224782, + "step": 18301 + }, + { + "epoch": 36.604, + "grad_norm": 1.2254302501678467, + "learning_rate": 2e-05, + "loss": 0.04013958, + "step": 18302 + }, + { + "epoch": 36.606, + "grad_norm": 1.3752824068069458, + "learning_rate": 2e-05, + "loss": 0.050212, + "step": 18303 + }, + { + "epoch": 36.608, + "grad_norm": 0.94117671251297, + "learning_rate": 2e-05, + "loss": 0.03055703, + "step": 18304 + }, + { + "epoch": 36.61, + "grad_norm": 1.1006180047988892, + "learning_rate": 2e-05, + "loss": 0.03390218, + "step": 18305 + }, + { + "epoch": 
36.612, + "grad_norm": 1.3862262964248657, + "learning_rate": 2e-05, + "loss": 0.05241469, + "step": 18306 + }, + { + "epoch": 36.614, + "grad_norm": 1.8648090362548828, + "learning_rate": 2e-05, + "loss": 0.04927859, + "step": 18307 + }, + { + "epoch": 36.616, + "grad_norm": 1.4699233770370483, + "learning_rate": 2e-05, + "loss": 0.04433728, + "step": 18308 + }, + { + "epoch": 36.618, + "grad_norm": 1.4989550113677979, + "learning_rate": 2e-05, + "loss": 0.05942933, + "step": 18309 + }, + { + "epoch": 36.62, + "grad_norm": 1.1221908330917358, + "learning_rate": 2e-05, + "loss": 0.047569, + "step": 18310 + }, + { + "epoch": 36.622, + "grad_norm": 1.540700078010559, + "learning_rate": 2e-05, + "loss": 0.05225103, + "step": 18311 + }, + { + "epoch": 36.624, + "grad_norm": 1.2131165266036987, + "learning_rate": 2e-05, + "loss": 0.04262118, + "step": 18312 + }, + { + "epoch": 36.626, + "grad_norm": 1.1024413108825684, + "learning_rate": 2e-05, + "loss": 0.03573479, + "step": 18313 + }, + { + "epoch": 36.628, + "grad_norm": 1.9609568119049072, + "learning_rate": 2e-05, + "loss": 0.04972448, + "step": 18314 + }, + { + "epoch": 36.63, + "grad_norm": 1.236175775527954, + "learning_rate": 2e-05, + "loss": 0.03609653, + "step": 18315 + }, + { + "epoch": 36.632, + "grad_norm": 1.202167272567749, + "learning_rate": 2e-05, + "loss": 0.04706704, + "step": 18316 + }, + { + "epoch": 36.634, + "grad_norm": 1.1365604400634766, + "learning_rate": 2e-05, + "loss": 0.02697523, + "step": 18317 + }, + { + "epoch": 36.636, + "grad_norm": 2.451118230819702, + "learning_rate": 2e-05, + "loss": 0.04429175, + "step": 18318 + }, + { + "epoch": 36.638, + "grad_norm": 1.0182795524597168, + "learning_rate": 2e-05, + "loss": 0.03436865, + "step": 18319 + }, + { + "epoch": 36.64, + "grad_norm": 1.2875375747680664, + "learning_rate": 2e-05, + "loss": 0.0550098, + "step": 18320 + }, + { + "epoch": 36.642, + "grad_norm": 1.559504508972168, + "learning_rate": 2e-05, + "loss": 0.05564836, + "step": 18321 + }, + { + "epoch": 36.644, + "grad_norm": 1.1889086961746216, + "learning_rate": 2e-05, + "loss": 0.03784901, + "step": 18322 + }, + { + "epoch": 36.646, + "grad_norm": 0.8898120522499084, + "learning_rate": 2e-05, + "loss": 0.0278656, + "step": 18323 + }, + { + "epoch": 36.648, + "grad_norm": 1.1782766580581665, + "learning_rate": 2e-05, + "loss": 0.03811631, + "step": 18324 + }, + { + "epoch": 36.65, + "grad_norm": 1.2543221712112427, + "learning_rate": 2e-05, + "loss": 0.03640786, + "step": 18325 + }, + { + "epoch": 36.652, + "grad_norm": 1.0462356805801392, + "learning_rate": 2e-05, + "loss": 0.03829445, + "step": 18326 + }, + { + "epoch": 36.654, + "grad_norm": 1.3017123937606812, + "learning_rate": 2e-05, + "loss": 0.03575253, + "step": 18327 + }, + { + "epoch": 36.656, + "grad_norm": 1.342948317527771, + "learning_rate": 2e-05, + "loss": 0.03757192, + "step": 18328 + }, + { + "epoch": 36.658, + "grad_norm": 1.066953420639038, + "learning_rate": 2e-05, + "loss": 0.04221145, + "step": 18329 + }, + { + "epoch": 36.66, + "grad_norm": 1.3225663900375366, + "learning_rate": 2e-05, + "loss": 0.05248147, + "step": 18330 + }, + { + "epoch": 36.662, + "grad_norm": 2.983029842376709, + "learning_rate": 2e-05, + "loss": 0.03616619, + "step": 18331 + }, + { + "epoch": 36.664, + "grad_norm": 2.2221550941467285, + "learning_rate": 2e-05, + "loss": 0.04058126, + "step": 18332 + }, + { + "epoch": 36.666, + "grad_norm": 1.315987467765808, + "learning_rate": 2e-05, + "loss": 0.03588897, + "step": 18333 + }, + { + "epoch": 36.668, + 
"grad_norm": 0.9215741157531738, + "learning_rate": 2e-05, + "loss": 0.03925972, + "step": 18334 + }, + { + "epoch": 36.67, + "grad_norm": 1.2016059160232544, + "learning_rate": 2e-05, + "loss": 0.05430229, + "step": 18335 + }, + { + "epoch": 36.672, + "grad_norm": 0.921889066696167, + "learning_rate": 2e-05, + "loss": 0.03469507, + "step": 18336 + }, + { + "epoch": 36.674, + "grad_norm": 0.9315686821937561, + "learning_rate": 2e-05, + "loss": 0.02187958, + "step": 18337 + }, + { + "epoch": 36.676, + "grad_norm": 0.9967896938323975, + "learning_rate": 2e-05, + "loss": 0.0331551, + "step": 18338 + }, + { + "epoch": 36.678, + "grad_norm": 1.1743967533111572, + "learning_rate": 2e-05, + "loss": 0.04423882, + "step": 18339 + }, + { + "epoch": 36.68, + "grad_norm": 1.2126519680023193, + "learning_rate": 2e-05, + "loss": 0.03514692, + "step": 18340 + }, + { + "epoch": 36.682, + "grad_norm": 1.3199232816696167, + "learning_rate": 2e-05, + "loss": 0.03419685, + "step": 18341 + }, + { + "epoch": 36.684, + "grad_norm": 1.7518872022628784, + "learning_rate": 2e-05, + "loss": 0.05697691, + "step": 18342 + }, + { + "epoch": 36.686, + "grad_norm": 1.0076147317886353, + "learning_rate": 2e-05, + "loss": 0.0281866, + "step": 18343 + }, + { + "epoch": 36.688, + "grad_norm": 1.5172227621078491, + "learning_rate": 2e-05, + "loss": 0.05067704, + "step": 18344 + }, + { + "epoch": 36.69, + "grad_norm": 1.6824908256530762, + "learning_rate": 2e-05, + "loss": 0.04503788, + "step": 18345 + }, + { + "epoch": 36.692, + "grad_norm": 1.3745967149734497, + "learning_rate": 2e-05, + "loss": 0.0483752, + "step": 18346 + }, + { + "epoch": 36.694, + "grad_norm": 1.2463769912719727, + "learning_rate": 2e-05, + "loss": 0.05347317, + "step": 18347 + }, + { + "epoch": 36.696, + "grad_norm": 1.0087859630584717, + "learning_rate": 2e-05, + "loss": 0.03465716, + "step": 18348 + }, + { + "epoch": 36.698, + "grad_norm": 1.2997995615005493, + "learning_rate": 2e-05, + "loss": 0.03587429, + "step": 18349 + }, + { + "epoch": 36.7, + "grad_norm": 1.1634178161621094, + "learning_rate": 2e-05, + "loss": 0.02643599, + "step": 18350 + }, + { + "epoch": 36.702, + "grad_norm": 1.2005338668823242, + "learning_rate": 2e-05, + "loss": 0.04984972, + "step": 18351 + }, + { + "epoch": 36.704, + "grad_norm": 0.9318526387214661, + "learning_rate": 2e-05, + "loss": 0.02411564, + "step": 18352 + }, + { + "epoch": 36.706, + "grad_norm": 1.1339325904846191, + "learning_rate": 2e-05, + "loss": 0.03448115, + "step": 18353 + }, + { + "epoch": 36.708, + "grad_norm": 1.1679779291152954, + "learning_rate": 2e-05, + "loss": 0.03647584, + "step": 18354 + }, + { + "epoch": 36.71, + "grad_norm": 1.2104750871658325, + "learning_rate": 2e-05, + "loss": 0.0515867, + "step": 18355 + }, + { + "epoch": 36.712, + "grad_norm": 2.100127935409546, + "learning_rate": 2e-05, + "loss": 0.05232673, + "step": 18356 + }, + { + "epoch": 36.714, + "grad_norm": 1.754447340965271, + "learning_rate": 2e-05, + "loss": 0.04861075, + "step": 18357 + }, + { + "epoch": 36.716, + "grad_norm": 1.008870005607605, + "learning_rate": 2e-05, + "loss": 0.03333037, + "step": 18358 + }, + { + "epoch": 36.718, + "grad_norm": 1.7049437761306763, + "learning_rate": 2e-05, + "loss": 0.03618935, + "step": 18359 + }, + { + "epoch": 36.72, + "grad_norm": 1.8496776819229126, + "learning_rate": 2e-05, + "loss": 0.05417331, + "step": 18360 + }, + { + "epoch": 36.722, + "grad_norm": 1.0589567422866821, + "learning_rate": 2e-05, + "loss": 0.02868169, + "step": 18361 + }, + { + "epoch": 36.724, + "grad_norm": 
0.9607617259025574, + "learning_rate": 2e-05, + "loss": 0.03263365, + "step": 18362 + }, + { + "epoch": 36.726, + "grad_norm": 2.0316452980041504, + "learning_rate": 2e-05, + "loss": 0.04388903, + "step": 18363 + }, + { + "epoch": 36.728, + "grad_norm": 1.2382222414016724, + "learning_rate": 2e-05, + "loss": 0.04471454, + "step": 18364 + }, + { + "epoch": 36.73, + "grad_norm": 1.287028193473816, + "learning_rate": 2e-05, + "loss": 0.04440287, + "step": 18365 + }, + { + "epoch": 36.732, + "grad_norm": 1.2623907327651978, + "learning_rate": 2e-05, + "loss": 0.03233304, + "step": 18366 + }, + { + "epoch": 36.734, + "grad_norm": 1.9164531230926514, + "learning_rate": 2e-05, + "loss": 0.04776657, + "step": 18367 + }, + { + "epoch": 36.736, + "grad_norm": 1.3518916368484497, + "learning_rate": 2e-05, + "loss": 0.06090958, + "step": 18368 + }, + { + "epoch": 36.738, + "grad_norm": 1.8850350379943848, + "learning_rate": 2e-05, + "loss": 0.03893983, + "step": 18369 + }, + { + "epoch": 36.74, + "grad_norm": 1.0571264028549194, + "learning_rate": 2e-05, + "loss": 0.03137238, + "step": 18370 + }, + { + "epoch": 36.742, + "grad_norm": 1.1914355754852295, + "learning_rate": 2e-05, + "loss": 0.03433579, + "step": 18371 + }, + { + "epoch": 36.744, + "grad_norm": 0.8919147849082947, + "learning_rate": 2e-05, + "loss": 0.0218019, + "step": 18372 + }, + { + "epoch": 36.746, + "grad_norm": 1.1925781965255737, + "learning_rate": 2e-05, + "loss": 0.05873271, + "step": 18373 + }, + { + "epoch": 36.748, + "grad_norm": 1.1038137674331665, + "learning_rate": 2e-05, + "loss": 0.04641431, + "step": 18374 + }, + { + "epoch": 36.75, + "grad_norm": 0.965733528137207, + "learning_rate": 2e-05, + "loss": 0.02856233, + "step": 18375 + }, + { + "epoch": 36.752, + "grad_norm": 1.5396867990493774, + "learning_rate": 2e-05, + "loss": 0.04703198, + "step": 18376 + }, + { + "epoch": 36.754, + "grad_norm": 2.1341171264648438, + "learning_rate": 2e-05, + "loss": 0.04486314, + "step": 18377 + }, + { + "epoch": 36.756, + "grad_norm": 1.7395333051681519, + "learning_rate": 2e-05, + "loss": 0.04411499, + "step": 18378 + }, + { + "epoch": 36.758, + "grad_norm": 1.706061840057373, + "learning_rate": 2e-05, + "loss": 0.03239989, + "step": 18379 + }, + { + "epoch": 36.76, + "grad_norm": 1.6504026651382446, + "learning_rate": 2e-05, + "loss": 0.03501267, + "step": 18380 + }, + { + "epoch": 36.762, + "grad_norm": 1.3423389196395874, + "learning_rate": 2e-05, + "loss": 0.05261109, + "step": 18381 + }, + { + "epoch": 36.764, + "grad_norm": 1.3317161798477173, + "learning_rate": 2e-05, + "loss": 0.04133341, + "step": 18382 + }, + { + "epoch": 36.766, + "grad_norm": 1.6479923725128174, + "learning_rate": 2e-05, + "loss": 0.04605314, + "step": 18383 + }, + { + "epoch": 36.768, + "grad_norm": 1.1072478294372559, + "learning_rate": 2e-05, + "loss": 0.03600177, + "step": 18384 + }, + { + "epoch": 36.77, + "grad_norm": 1.1794970035552979, + "learning_rate": 2e-05, + "loss": 0.05206569, + "step": 18385 + }, + { + "epoch": 36.772, + "grad_norm": 1.075430989265442, + "learning_rate": 2e-05, + "loss": 0.03614403, + "step": 18386 + }, + { + "epoch": 36.774, + "grad_norm": 1.2344156503677368, + "learning_rate": 2e-05, + "loss": 0.06035028, + "step": 18387 + }, + { + "epoch": 36.776, + "grad_norm": 1.0581718683242798, + "learning_rate": 2e-05, + "loss": 0.0294181, + "step": 18388 + }, + { + "epoch": 36.778, + "grad_norm": 1.7531042098999023, + "learning_rate": 2e-05, + "loss": 0.04625558, + "step": 18389 + }, + { + "epoch": 36.78, + "grad_norm": 
1.1771091222763062, + "learning_rate": 2e-05, + "loss": 0.04027297, + "step": 18390 + }, + { + "epoch": 36.782, + "grad_norm": 0.9724439978599548, + "learning_rate": 2e-05, + "loss": 0.03231468, + "step": 18391 + }, + { + "epoch": 36.784, + "grad_norm": 1.1481809616088867, + "learning_rate": 2e-05, + "loss": 0.03959997, + "step": 18392 + }, + { + "epoch": 36.786, + "grad_norm": 1.2926446199417114, + "learning_rate": 2e-05, + "loss": 0.05425737, + "step": 18393 + }, + { + "epoch": 36.788, + "grad_norm": 1.4414113759994507, + "learning_rate": 2e-05, + "loss": 0.04928648, + "step": 18394 + }, + { + "epoch": 36.79, + "grad_norm": 1.5177888870239258, + "learning_rate": 2e-05, + "loss": 0.03690218, + "step": 18395 + }, + { + "epoch": 36.792, + "grad_norm": 1.5356523990631104, + "learning_rate": 2e-05, + "loss": 0.04718888, + "step": 18396 + }, + { + "epoch": 36.794, + "grad_norm": 2.2311973571777344, + "learning_rate": 2e-05, + "loss": 0.03536499, + "step": 18397 + }, + { + "epoch": 36.796, + "grad_norm": 2.834324836730957, + "learning_rate": 2e-05, + "loss": 0.04099808, + "step": 18398 + }, + { + "epoch": 36.798, + "grad_norm": 1.1920080184936523, + "learning_rate": 2e-05, + "loss": 0.04433641, + "step": 18399 + }, + { + "epoch": 36.8, + "grad_norm": 1.1526068449020386, + "learning_rate": 2e-05, + "loss": 0.04516728, + "step": 18400 + }, + { + "epoch": 36.802, + "grad_norm": 1.2687426805496216, + "learning_rate": 2e-05, + "loss": 0.03586322, + "step": 18401 + }, + { + "epoch": 36.804, + "grad_norm": 0.9818292856216431, + "learning_rate": 2e-05, + "loss": 0.02395935, + "step": 18402 + }, + { + "epoch": 36.806, + "grad_norm": 2.2147302627563477, + "learning_rate": 2e-05, + "loss": 0.04670376, + "step": 18403 + }, + { + "epoch": 36.808, + "grad_norm": 1.6681506633758545, + "learning_rate": 2e-05, + "loss": 0.05808145, + "step": 18404 + }, + { + "epoch": 36.81, + "grad_norm": 1.1924699544906616, + "learning_rate": 2e-05, + "loss": 0.04742082, + "step": 18405 + }, + { + "epoch": 36.812, + "grad_norm": 2.0133984088897705, + "learning_rate": 2e-05, + "loss": 0.03901449, + "step": 18406 + }, + { + "epoch": 36.814, + "grad_norm": 1.1701860427856445, + "learning_rate": 2e-05, + "loss": 0.02848022, + "step": 18407 + }, + { + "epoch": 36.816, + "grad_norm": 1.089707374572754, + "learning_rate": 2e-05, + "loss": 0.04020487, + "step": 18408 + }, + { + "epoch": 36.818, + "grad_norm": 1.0396581888198853, + "learning_rate": 2e-05, + "loss": 0.03704511, + "step": 18409 + }, + { + "epoch": 36.82, + "grad_norm": 1.6695880889892578, + "learning_rate": 2e-05, + "loss": 0.0519683, + "step": 18410 + }, + { + "epoch": 36.822, + "grad_norm": 1.1158127784729004, + "learning_rate": 2e-05, + "loss": 0.04286835, + "step": 18411 + }, + { + "epoch": 36.824, + "grad_norm": 1.1745147705078125, + "learning_rate": 2e-05, + "loss": 0.03942792, + "step": 18412 + }, + { + "epoch": 36.826, + "grad_norm": 0.8886756896972656, + "learning_rate": 2e-05, + "loss": 0.02771359, + "step": 18413 + }, + { + "epoch": 36.828, + "grad_norm": 1.2267519235610962, + "learning_rate": 2e-05, + "loss": 0.0511631, + "step": 18414 + }, + { + "epoch": 36.83, + "grad_norm": 2.179556369781494, + "learning_rate": 2e-05, + "loss": 0.06088002, + "step": 18415 + }, + { + "epoch": 36.832, + "grad_norm": 1.2211542129516602, + "learning_rate": 2e-05, + "loss": 0.04303684, + "step": 18416 + }, + { + "epoch": 36.834, + "grad_norm": 2.2510719299316406, + "learning_rate": 2e-05, + "loss": 0.05186936, + "step": 18417 + }, + { + "epoch": 36.836, + "grad_norm": 
1.2228981256484985, + "learning_rate": 2e-05, + "loss": 0.04879232, + "step": 18418 + }, + { + "epoch": 36.838, + "grad_norm": 1.098894715309143, + "learning_rate": 2e-05, + "loss": 0.03979594, + "step": 18419 + }, + { + "epoch": 36.84, + "grad_norm": 1.2254807949066162, + "learning_rate": 2e-05, + "loss": 0.03765118, + "step": 18420 + }, + { + "epoch": 36.842, + "grad_norm": 1.175650954246521, + "learning_rate": 2e-05, + "loss": 0.04586347, + "step": 18421 + }, + { + "epoch": 36.844, + "grad_norm": 1.22563636302948, + "learning_rate": 2e-05, + "loss": 0.0531856, + "step": 18422 + }, + { + "epoch": 36.846, + "grad_norm": 1.2885513305664062, + "learning_rate": 2e-05, + "loss": 0.04552082, + "step": 18423 + }, + { + "epoch": 36.848, + "grad_norm": 1.1295452117919922, + "learning_rate": 2e-05, + "loss": 0.0402256, + "step": 18424 + }, + { + "epoch": 36.85, + "grad_norm": 1.0516687631607056, + "learning_rate": 2e-05, + "loss": 0.04456215, + "step": 18425 + }, + { + "epoch": 36.852, + "grad_norm": 1.0733994245529175, + "learning_rate": 2e-05, + "loss": 0.03550437, + "step": 18426 + }, + { + "epoch": 36.854, + "grad_norm": 1.1007059812545776, + "learning_rate": 2e-05, + "loss": 0.04091225, + "step": 18427 + }, + { + "epoch": 36.856, + "grad_norm": 1.5547980070114136, + "learning_rate": 2e-05, + "loss": 0.04072155, + "step": 18428 + }, + { + "epoch": 36.858, + "grad_norm": 1.1082590818405151, + "learning_rate": 2e-05, + "loss": 0.03597355, + "step": 18429 + }, + { + "epoch": 36.86, + "grad_norm": 1.299734354019165, + "learning_rate": 2e-05, + "loss": 0.03989294, + "step": 18430 + }, + { + "epoch": 36.862, + "grad_norm": 0.9722706079483032, + "learning_rate": 2e-05, + "loss": 0.03567673, + "step": 18431 + }, + { + "epoch": 36.864, + "grad_norm": 1.2522664070129395, + "learning_rate": 2e-05, + "loss": 0.04869508, + "step": 18432 + }, + { + "epoch": 36.866, + "grad_norm": 1.638421654701233, + "learning_rate": 2e-05, + "loss": 0.03735339, + "step": 18433 + }, + { + "epoch": 36.868, + "grad_norm": 1.0237358808517456, + "learning_rate": 2e-05, + "loss": 0.04364359, + "step": 18434 + }, + { + "epoch": 36.87, + "grad_norm": 1.2411482334136963, + "learning_rate": 2e-05, + "loss": 0.03947395, + "step": 18435 + }, + { + "epoch": 36.872, + "grad_norm": 1.159613847732544, + "learning_rate": 2e-05, + "loss": 0.04042497, + "step": 18436 + }, + { + "epoch": 36.874, + "grad_norm": 1.2544336318969727, + "learning_rate": 2e-05, + "loss": 0.0312546, + "step": 18437 + }, + { + "epoch": 36.876, + "grad_norm": 1.9354642629623413, + "learning_rate": 2e-05, + "loss": 0.0527093, + "step": 18438 + }, + { + "epoch": 36.878, + "grad_norm": 3.310471773147583, + "learning_rate": 2e-05, + "loss": 0.04944807, + "step": 18439 + }, + { + "epoch": 36.88, + "grad_norm": 1.1568819284439087, + "learning_rate": 2e-05, + "loss": 0.0369446, + "step": 18440 + }, + { + "epoch": 36.882, + "grad_norm": 1.2008459568023682, + "learning_rate": 2e-05, + "loss": 0.04868155, + "step": 18441 + }, + { + "epoch": 36.884, + "grad_norm": 1.064301609992981, + "learning_rate": 2e-05, + "loss": 0.03515521, + "step": 18442 + }, + { + "epoch": 36.886, + "grad_norm": 1.3205647468566895, + "learning_rate": 2e-05, + "loss": 0.03227288, + "step": 18443 + }, + { + "epoch": 36.888, + "grad_norm": 1.5607926845550537, + "learning_rate": 2e-05, + "loss": 0.04700986, + "step": 18444 + }, + { + "epoch": 36.89, + "grad_norm": 1.1326090097427368, + "learning_rate": 2e-05, + "loss": 0.03524204, + "step": 18445 + }, + { + "epoch": 36.892, + "grad_norm": 
0.9644931554794312, + "learning_rate": 2e-05, + "loss": 0.0342176, + "step": 18446 + }, + { + "epoch": 36.894, + "grad_norm": 1.2272518873214722, + "learning_rate": 2e-05, + "loss": 0.03322006, + "step": 18447 + }, + { + "epoch": 36.896, + "grad_norm": 1.061239242553711, + "learning_rate": 2e-05, + "loss": 0.03333382, + "step": 18448 + }, + { + "epoch": 36.898, + "grad_norm": 1.1636090278625488, + "learning_rate": 2e-05, + "loss": 0.04022244, + "step": 18449 + }, + { + "epoch": 36.9, + "grad_norm": 1.0282191038131714, + "learning_rate": 2e-05, + "loss": 0.03084, + "step": 18450 + }, + { + "epoch": 36.902, + "grad_norm": 2.087050199508667, + "learning_rate": 2e-05, + "loss": 0.0529213, + "step": 18451 + }, + { + "epoch": 36.904, + "grad_norm": 1.0408107042312622, + "learning_rate": 2e-05, + "loss": 0.04363842, + "step": 18452 + }, + { + "epoch": 36.906, + "grad_norm": 1.284910798072815, + "learning_rate": 2e-05, + "loss": 0.0489226, + "step": 18453 + }, + { + "epoch": 36.908, + "grad_norm": 1.6543158292770386, + "learning_rate": 2e-05, + "loss": 0.06356652, + "step": 18454 + }, + { + "epoch": 36.91, + "grad_norm": 1.2143512964248657, + "learning_rate": 2e-05, + "loss": 0.05602705, + "step": 18455 + }, + { + "epoch": 36.912, + "grad_norm": 1.095168113708496, + "learning_rate": 2e-05, + "loss": 0.04102691, + "step": 18456 + }, + { + "epoch": 36.914, + "grad_norm": 1.4219430685043335, + "learning_rate": 2e-05, + "loss": 0.04681015, + "step": 18457 + }, + { + "epoch": 36.916, + "grad_norm": 1.1736583709716797, + "learning_rate": 2e-05, + "loss": 0.04344864, + "step": 18458 + }, + { + "epoch": 36.918, + "grad_norm": 0.9385769963264465, + "learning_rate": 2e-05, + "loss": 0.02884827, + "step": 18459 + }, + { + "epoch": 36.92, + "grad_norm": 1.4176242351531982, + "learning_rate": 2e-05, + "loss": 0.06416596, + "step": 18460 + }, + { + "epoch": 36.922, + "grad_norm": 1.288359522819519, + "learning_rate": 2e-05, + "loss": 0.04654737, + "step": 18461 + }, + { + "epoch": 36.924, + "grad_norm": 1.1389371156692505, + "learning_rate": 2e-05, + "loss": 0.04628037, + "step": 18462 + }, + { + "epoch": 36.926, + "grad_norm": 1.098805546760559, + "learning_rate": 2e-05, + "loss": 0.03729876, + "step": 18463 + }, + { + "epoch": 36.928, + "grad_norm": 1.4324229955673218, + "learning_rate": 2e-05, + "loss": 0.0529637, + "step": 18464 + }, + { + "epoch": 36.93, + "grad_norm": 1.4143197536468506, + "learning_rate": 2e-05, + "loss": 0.03819642, + "step": 18465 + }, + { + "epoch": 36.932, + "grad_norm": 1.4157121181488037, + "learning_rate": 2e-05, + "loss": 0.04541536, + "step": 18466 + }, + { + "epoch": 36.934, + "grad_norm": 1.393878698348999, + "learning_rate": 2e-05, + "loss": 0.04265588, + "step": 18467 + }, + { + "epoch": 36.936, + "grad_norm": 1.0789780616760254, + "learning_rate": 2e-05, + "loss": 0.04042588, + "step": 18468 + }, + { + "epoch": 36.938, + "grad_norm": 1.6265769004821777, + "learning_rate": 2e-05, + "loss": 0.05098584, + "step": 18469 + }, + { + "epoch": 36.94, + "grad_norm": 1.1690245866775513, + "learning_rate": 2e-05, + "loss": 0.04554411, + "step": 18470 + }, + { + "epoch": 36.942, + "grad_norm": 1.3794010877609253, + "learning_rate": 2e-05, + "loss": 0.044034, + "step": 18471 + }, + { + "epoch": 36.944, + "grad_norm": 0.8413645625114441, + "learning_rate": 2e-05, + "loss": 0.02523428, + "step": 18472 + }, + { + "epoch": 36.946, + "grad_norm": 0.9403594732284546, + "learning_rate": 2e-05, + "loss": 0.03138512, + "step": 18473 + }, + { + "epoch": 36.948, + "grad_norm": 1.1274261474609375, 
+ "learning_rate": 2e-05, + "loss": 0.0406789, + "step": 18474 + }, + { + "epoch": 36.95, + "grad_norm": 1.4938697814941406, + "learning_rate": 2e-05, + "loss": 0.05272559, + "step": 18475 + }, + { + "epoch": 36.952, + "grad_norm": 1.0316252708435059, + "learning_rate": 2e-05, + "loss": 0.03562522, + "step": 18476 + }, + { + "epoch": 36.954, + "grad_norm": 1.3863626718521118, + "learning_rate": 2e-05, + "loss": 0.03377339, + "step": 18477 + }, + { + "epoch": 36.956, + "grad_norm": 1.7842824459075928, + "learning_rate": 2e-05, + "loss": 0.05903577, + "step": 18478 + }, + { + "epoch": 36.958, + "grad_norm": 1.35276198387146, + "learning_rate": 2e-05, + "loss": 0.03431714, + "step": 18479 + }, + { + "epoch": 36.96, + "grad_norm": 1.2699631452560425, + "learning_rate": 2e-05, + "loss": 0.0535304, + "step": 18480 + }, + { + "epoch": 36.962, + "grad_norm": 1.2348072528839111, + "learning_rate": 2e-05, + "loss": 0.04691994, + "step": 18481 + }, + { + "epoch": 36.964, + "grad_norm": 0.9137774705886841, + "learning_rate": 2e-05, + "loss": 0.0308315, + "step": 18482 + }, + { + "epoch": 36.966, + "grad_norm": 1.0427780151367188, + "learning_rate": 2e-05, + "loss": 0.0346647, + "step": 18483 + }, + { + "epoch": 36.968, + "grad_norm": 2.189744472503662, + "learning_rate": 2e-05, + "loss": 0.03301712, + "step": 18484 + }, + { + "epoch": 36.97, + "grad_norm": 1.0989826917648315, + "learning_rate": 2e-05, + "loss": 0.02908636, + "step": 18485 + }, + { + "epoch": 36.972, + "grad_norm": 1.6174060106277466, + "learning_rate": 2e-05, + "loss": 0.04544824, + "step": 18486 + }, + { + "epoch": 36.974, + "grad_norm": 1.2668603658676147, + "learning_rate": 2e-05, + "loss": 0.05053727, + "step": 18487 + }, + { + "epoch": 36.976, + "grad_norm": 1.0576096773147583, + "learning_rate": 2e-05, + "loss": 0.03743074, + "step": 18488 + }, + { + "epoch": 36.978, + "grad_norm": 1.1801321506500244, + "learning_rate": 2e-05, + "loss": 0.03683414, + "step": 18489 + }, + { + "epoch": 36.98, + "grad_norm": 1.1503815650939941, + "learning_rate": 2e-05, + "loss": 0.03822848, + "step": 18490 + }, + { + "epoch": 36.982, + "grad_norm": 1.3549894094467163, + "learning_rate": 2e-05, + "loss": 0.05866414, + "step": 18491 + }, + { + "epoch": 36.984, + "grad_norm": 1.0250877141952515, + "learning_rate": 2e-05, + "loss": 0.03789775, + "step": 18492 + }, + { + "epoch": 36.986, + "grad_norm": 1.0828768014907837, + "learning_rate": 2e-05, + "loss": 0.04611751, + "step": 18493 + }, + { + "epoch": 36.988, + "grad_norm": 1.385727882385254, + "learning_rate": 2e-05, + "loss": 0.05194426, + "step": 18494 + }, + { + "epoch": 36.99, + "grad_norm": 1.17097008228302, + "learning_rate": 2e-05, + "loss": 0.05635056, + "step": 18495 + }, + { + "epoch": 36.992, + "grad_norm": 1.1106641292572021, + "learning_rate": 2e-05, + "loss": 0.04129127, + "step": 18496 + }, + { + "epoch": 36.994, + "grad_norm": 1.0685235261917114, + "learning_rate": 2e-05, + "loss": 0.04287387, + "step": 18497 + }, + { + "epoch": 36.996, + "grad_norm": 1.188862919807434, + "learning_rate": 2e-05, + "loss": 0.04415998, + "step": 18498 + }, + { + "epoch": 36.998, + "grad_norm": 0.956224799156189, + "learning_rate": 2e-05, + "loss": 0.03282576, + "step": 18499 + }, + { + "epoch": 37.0, + "grad_norm": 1.2222694158554077, + "learning_rate": 2e-05, + "loss": 0.0414125, + "step": 18500 + }, + { + "epoch": 37.0, + "eval_performance": { + "AngleClassification_1": 0.998, + "AngleClassification_2": 1.0, + "AngleClassification_3": 0.9880239520958084, + "Equal_1": 0.994, + "Equal_2": 
0.9880239520958084, + "Equal_3": 0.9900199600798403, + "LineComparison_1": 1.0, + "LineComparison_2": 1.0, + "LineComparison_3": 0.9940119760479041, + "Parallel_1": 0.9899799599198397, + "Parallel_2": 0.9959919839679359, + "Parallel_3": 0.994, + "Perpendicular_1": 0.996, + "Perpendicular_2": 0.988, + "Perpendicular_3": 0.9018036072144289, + "PointLiesOnCircle_1": 1.0, + "PointLiesOnCircle_2": 0.996, + "PointLiesOnCircle_3": 0.996, + "PointLiesOnLine_1": 1.0, + "PointLiesOnLine_2": 0.9959919839679359, + "PointLiesOnLine_3": 0.9880239520958084 + }, + "eval_runtime": 323.3803, + "eval_samples_per_second": 32.47, + "eval_steps_per_second": 0.649, + "step": 18500 + }, + { + "epoch": 37.002, + "grad_norm": 1.2086259126663208, + "learning_rate": 2e-05, + "loss": 0.05474807, + "step": 18501 + }, + { + "epoch": 37.004, + "grad_norm": 1.1692421436309814, + "learning_rate": 2e-05, + "loss": 0.04277514, + "step": 18502 + }, + { + "epoch": 37.006, + "grad_norm": 1.1070613861083984, + "learning_rate": 2e-05, + "loss": 0.03776816, + "step": 18503 + }, + { + "epoch": 37.008, + "grad_norm": 1.263321876525879, + "learning_rate": 2e-05, + "loss": 0.04286732, + "step": 18504 + }, + { + "epoch": 37.01, + "grad_norm": 1.0955123901367188, + "learning_rate": 2e-05, + "loss": 0.02977836, + "step": 18505 + }, + { + "epoch": 37.012, + "grad_norm": 1.2032184600830078, + "learning_rate": 2e-05, + "loss": 0.03929453, + "step": 18506 + }, + { + "epoch": 37.014, + "grad_norm": 1.0266116857528687, + "learning_rate": 2e-05, + "loss": 0.02832175, + "step": 18507 + }, + { + "epoch": 37.016, + "grad_norm": 1.862282633781433, + "learning_rate": 2e-05, + "loss": 0.04781446, + "step": 18508 + }, + { + "epoch": 37.018, + "grad_norm": 1.4959344863891602, + "learning_rate": 2e-05, + "loss": 0.05474871, + "step": 18509 + }, + { + "epoch": 37.02, + "grad_norm": 2.0433266162872314, + "learning_rate": 2e-05, + "loss": 0.05582491, + "step": 18510 + }, + { + "epoch": 37.022, + "grad_norm": 1.0982203483581543, + "learning_rate": 2e-05, + "loss": 0.03969976, + "step": 18511 + }, + { + "epoch": 37.024, + "grad_norm": 1.8421516418457031, + "learning_rate": 2e-05, + "loss": 0.05257302, + "step": 18512 + }, + { + "epoch": 37.026, + "grad_norm": 1.0704379081726074, + "learning_rate": 2e-05, + "loss": 0.03272648, + "step": 18513 + }, + { + "epoch": 37.028, + "grad_norm": 1.063231110572815, + "learning_rate": 2e-05, + "loss": 0.03291452, + "step": 18514 + }, + { + "epoch": 37.03, + "grad_norm": 1.1486153602600098, + "learning_rate": 2e-05, + "loss": 0.04854581, + "step": 18515 + }, + { + "epoch": 37.032, + "grad_norm": 1.0761146545410156, + "learning_rate": 2e-05, + "loss": 0.04210125, + "step": 18516 + }, + { + "epoch": 37.034, + "grad_norm": 1.477710247039795, + "learning_rate": 2e-05, + "loss": 0.05219606, + "step": 18517 + }, + { + "epoch": 37.036, + "grad_norm": 1.0120596885681152, + "learning_rate": 2e-05, + "loss": 0.02972842, + "step": 18518 + }, + { + "epoch": 37.038, + "grad_norm": 1.341340184211731, + "learning_rate": 2e-05, + "loss": 0.03905867, + "step": 18519 + }, + { + "epoch": 37.04, + "grad_norm": 1.8006370067596436, + "learning_rate": 2e-05, + "loss": 0.04028159, + "step": 18520 + }, + { + "epoch": 37.042, + "grad_norm": 1.06367027759552, + "learning_rate": 2e-05, + "loss": 0.0451144, + "step": 18521 + }, + { + "epoch": 37.044, + "grad_norm": 1.3736926317214966, + "learning_rate": 2e-05, + "loss": 0.05268521, + "step": 18522 + }, + { + "epoch": 37.046, + "grad_norm": 3.3430514335632324, + "learning_rate": 2e-05, + "loss": 
0.05855541, + "step": 18523 + }, + { + "epoch": 37.048, + "grad_norm": 1.0005395412445068, + "learning_rate": 2e-05, + "loss": 0.03178389, + "step": 18524 + }, + { + "epoch": 37.05, + "grad_norm": 1.4999123811721802, + "learning_rate": 2e-05, + "loss": 0.03547132, + "step": 18525 + }, + { + "epoch": 37.052, + "grad_norm": 1.2250981330871582, + "learning_rate": 2e-05, + "loss": 0.04226706, + "step": 18526 + }, + { + "epoch": 37.054, + "grad_norm": 1.1662254333496094, + "learning_rate": 2e-05, + "loss": 0.03948696, + "step": 18527 + }, + { + "epoch": 37.056, + "grad_norm": 1.1504510641098022, + "learning_rate": 2e-05, + "loss": 0.04601368, + "step": 18528 + }, + { + "epoch": 37.058, + "grad_norm": 1.0720382928848267, + "learning_rate": 2e-05, + "loss": 0.03848777, + "step": 18529 + }, + { + "epoch": 37.06, + "grad_norm": 1.3089898824691772, + "learning_rate": 2e-05, + "loss": 0.03624338, + "step": 18530 + }, + { + "epoch": 37.062, + "grad_norm": 1.2138144969940186, + "learning_rate": 2e-05, + "loss": 0.0507899, + "step": 18531 + }, + { + "epoch": 37.064, + "grad_norm": 1.297494888305664, + "learning_rate": 2e-05, + "loss": 0.04721798, + "step": 18532 + }, + { + "epoch": 37.066, + "grad_norm": 1.2399834394454956, + "learning_rate": 2e-05, + "loss": 0.04290456, + "step": 18533 + }, + { + "epoch": 37.068, + "grad_norm": 1.258151888847351, + "learning_rate": 2e-05, + "loss": 0.03323229, + "step": 18534 + }, + { + "epoch": 37.07, + "grad_norm": 2.544591188430786, + "learning_rate": 2e-05, + "loss": 0.04624221, + "step": 18535 + }, + { + "epoch": 37.072, + "grad_norm": 1.0994566679000854, + "learning_rate": 2e-05, + "loss": 0.03203875, + "step": 18536 + }, + { + "epoch": 37.074, + "grad_norm": 1.5202358961105347, + "learning_rate": 2e-05, + "loss": 0.04309528, + "step": 18537 + }, + { + "epoch": 37.076, + "grad_norm": 1.162284255027771, + "learning_rate": 2e-05, + "loss": 0.04106382, + "step": 18538 + }, + { + "epoch": 37.078, + "grad_norm": 1.5119906663894653, + "learning_rate": 2e-05, + "loss": 0.05221222, + "step": 18539 + }, + { + "epoch": 37.08, + "grad_norm": 1.0148025751113892, + "learning_rate": 2e-05, + "loss": 0.04121029, + "step": 18540 + }, + { + "epoch": 37.082, + "grad_norm": 1.2499613761901855, + "learning_rate": 2e-05, + "loss": 0.05167337, + "step": 18541 + }, + { + "epoch": 37.084, + "grad_norm": 1.3583709001541138, + "learning_rate": 2e-05, + "loss": 0.04471046, + "step": 18542 + }, + { + "epoch": 37.086, + "grad_norm": 2.9629805088043213, + "learning_rate": 2e-05, + "loss": 0.04842234, + "step": 18543 + }, + { + "epoch": 37.088, + "grad_norm": 1.327346920967102, + "learning_rate": 2e-05, + "loss": 0.05041461, + "step": 18544 + }, + { + "epoch": 37.09, + "grad_norm": 1.7716262340545654, + "learning_rate": 2e-05, + "loss": 0.03213685, + "step": 18545 + }, + { + "epoch": 37.092, + "grad_norm": 1.3329129219055176, + "learning_rate": 2e-05, + "loss": 0.05127243, + "step": 18546 + }, + { + "epoch": 37.094, + "grad_norm": 1.2340625524520874, + "learning_rate": 2e-05, + "loss": 0.04882501, + "step": 18547 + }, + { + "epoch": 37.096, + "grad_norm": 1.5020076036453247, + "learning_rate": 2e-05, + "loss": 0.05312416, + "step": 18548 + }, + { + "epoch": 37.098, + "grad_norm": 1.0506970882415771, + "learning_rate": 2e-05, + "loss": 0.04135638, + "step": 18549 + }, + { + "epoch": 37.1, + "grad_norm": 0.8881466388702393, + "learning_rate": 2e-05, + "loss": 0.02312918, + "step": 18550 + }, + { + "epoch": 37.102, + "grad_norm": 2.499708652496338, + "learning_rate": 2e-05, + "loss": 0.05453001, 
+ "step": 18551 + }, + { + "epoch": 37.104, + "grad_norm": 0.9243218302726746, + "learning_rate": 2e-05, + "loss": 0.02658354, + "step": 18552 + }, + { + "epoch": 37.106, + "grad_norm": 0.8885984420776367, + "learning_rate": 2e-05, + "loss": 0.03515668, + "step": 18553 + }, + { + "epoch": 37.108, + "grad_norm": 1.3641797304153442, + "learning_rate": 2e-05, + "loss": 0.05175686, + "step": 18554 + }, + { + "epoch": 37.11, + "grad_norm": 1.5672239065170288, + "learning_rate": 2e-05, + "loss": 0.05572821, + "step": 18555 + }, + { + "epoch": 37.112, + "grad_norm": 1.1886248588562012, + "learning_rate": 2e-05, + "loss": 0.03903022, + "step": 18556 + }, + { + "epoch": 37.114, + "grad_norm": 2.1297879219055176, + "learning_rate": 2e-05, + "loss": 0.04060325, + "step": 18557 + }, + { + "epoch": 37.116, + "grad_norm": 1.0565165281295776, + "learning_rate": 2e-05, + "loss": 0.03858557, + "step": 18558 + }, + { + "epoch": 37.118, + "grad_norm": 0.9887270927429199, + "learning_rate": 2e-05, + "loss": 0.0331764, + "step": 18559 + }, + { + "epoch": 37.12, + "grad_norm": 1.3902815580368042, + "learning_rate": 2e-05, + "loss": 0.04316443, + "step": 18560 + }, + { + "epoch": 37.122, + "grad_norm": 1.7422220706939697, + "learning_rate": 2e-05, + "loss": 0.05364904, + "step": 18561 + }, + { + "epoch": 37.124, + "grad_norm": 1.6973824501037598, + "learning_rate": 2e-05, + "loss": 0.06084717, + "step": 18562 + }, + { + "epoch": 37.126, + "grad_norm": 1.2839064598083496, + "learning_rate": 2e-05, + "loss": 0.04210863, + "step": 18563 + }, + { + "epoch": 37.128, + "grad_norm": 1.687252402305603, + "learning_rate": 2e-05, + "loss": 0.03391319, + "step": 18564 + }, + { + "epoch": 37.13, + "grad_norm": 1.2945353984832764, + "learning_rate": 2e-05, + "loss": 0.04598271, + "step": 18565 + }, + { + "epoch": 37.132, + "grad_norm": 1.3326228857040405, + "learning_rate": 2e-05, + "loss": 0.04093219, + "step": 18566 + }, + { + "epoch": 37.134, + "grad_norm": 2.1192924976348877, + "learning_rate": 2e-05, + "loss": 0.03747287, + "step": 18567 + }, + { + "epoch": 37.136, + "grad_norm": 0.9054780006408691, + "learning_rate": 2e-05, + "loss": 0.02849467, + "step": 18568 + }, + { + "epoch": 37.138, + "grad_norm": 1.0603820085525513, + "learning_rate": 2e-05, + "loss": 0.0402334, + "step": 18569 + }, + { + "epoch": 37.14, + "grad_norm": 1.367003083229065, + "learning_rate": 2e-05, + "loss": 0.05233489, + "step": 18570 + }, + { + "epoch": 37.142, + "grad_norm": 0.9761447906494141, + "learning_rate": 2e-05, + "loss": 0.03226305, + "step": 18571 + }, + { + "epoch": 37.144, + "grad_norm": 1.0491434335708618, + "learning_rate": 2e-05, + "loss": 0.03535546, + "step": 18572 + }, + { + "epoch": 37.146, + "grad_norm": 1.1767827272415161, + "learning_rate": 2e-05, + "loss": 0.0434572, + "step": 18573 + }, + { + "epoch": 37.148, + "grad_norm": 1.0024957656860352, + "learning_rate": 2e-05, + "loss": 0.03991302, + "step": 18574 + }, + { + "epoch": 37.15, + "grad_norm": 1.4435670375823975, + "learning_rate": 2e-05, + "loss": 0.04887334, + "step": 18575 + }, + { + "epoch": 37.152, + "grad_norm": 1.0476168394088745, + "learning_rate": 2e-05, + "loss": 0.03045881, + "step": 18576 + }, + { + "epoch": 37.154, + "grad_norm": 3.0771420001983643, + "learning_rate": 2e-05, + "loss": 0.05882487, + "step": 18577 + }, + { + "epoch": 37.156, + "grad_norm": 1.1227463483810425, + "learning_rate": 2e-05, + "loss": 0.04296419, + "step": 18578 + }, + { + "epoch": 37.158, + "grad_norm": 0.8635389804840088, + "learning_rate": 2e-05, + "loss": 0.02800985, + 
"step": 18579 + }, + { + "epoch": 37.16, + "grad_norm": 1.0923333168029785, + "learning_rate": 2e-05, + "loss": 0.03294976, + "step": 18580 + }, + { + "epoch": 37.162, + "grad_norm": 1.2803630828857422, + "learning_rate": 2e-05, + "loss": 0.05049956, + "step": 18581 + }, + { + "epoch": 37.164, + "grad_norm": 2.036414384841919, + "learning_rate": 2e-05, + "loss": 0.03803521, + "step": 18582 + }, + { + "epoch": 37.166, + "grad_norm": 1.534833312034607, + "learning_rate": 2e-05, + "loss": 0.03264375, + "step": 18583 + }, + { + "epoch": 37.168, + "grad_norm": 1.0317829847335815, + "learning_rate": 2e-05, + "loss": 0.02741832, + "step": 18584 + }, + { + "epoch": 37.17, + "grad_norm": 1.0632944107055664, + "learning_rate": 2e-05, + "loss": 0.03481572, + "step": 18585 + }, + { + "epoch": 37.172, + "grad_norm": 1.263419270515442, + "learning_rate": 2e-05, + "loss": 0.04441663, + "step": 18586 + }, + { + "epoch": 37.174, + "grad_norm": 1.0531634092330933, + "learning_rate": 2e-05, + "loss": 0.03552748, + "step": 18587 + }, + { + "epoch": 37.176, + "grad_norm": 1.569259524345398, + "learning_rate": 2e-05, + "loss": 0.05546115, + "step": 18588 + }, + { + "epoch": 37.178, + "grad_norm": 1.3170031309127808, + "learning_rate": 2e-05, + "loss": 0.05026477, + "step": 18589 + }, + { + "epoch": 37.18, + "grad_norm": 1.1640214920043945, + "learning_rate": 2e-05, + "loss": 0.04955488, + "step": 18590 + }, + { + "epoch": 37.182, + "grad_norm": 1.6744143962860107, + "learning_rate": 2e-05, + "loss": 0.03792101, + "step": 18591 + }, + { + "epoch": 37.184, + "grad_norm": 0.9118578433990479, + "learning_rate": 2e-05, + "loss": 0.02556996, + "step": 18592 + }, + { + "epoch": 37.186, + "grad_norm": 1.644417643547058, + "learning_rate": 2e-05, + "loss": 0.04530199, + "step": 18593 + }, + { + "epoch": 37.188, + "grad_norm": 1.7604995965957642, + "learning_rate": 2e-05, + "loss": 0.06252389, + "step": 18594 + }, + { + "epoch": 37.19, + "grad_norm": 1.0273133516311646, + "learning_rate": 2e-05, + "loss": 0.04975293, + "step": 18595 + }, + { + "epoch": 37.192, + "grad_norm": 1.05352783203125, + "learning_rate": 2e-05, + "loss": 0.04171479, + "step": 18596 + }, + { + "epoch": 37.194, + "grad_norm": 0.9587264060974121, + "learning_rate": 2e-05, + "loss": 0.03576777, + "step": 18597 + }, + { + "epoch": 37.196, + "grad_norm": 1.094084620475769, + "learning_rate": 2e-05, + "loss": 0.02543401, + "step": 18598 + }, + { + "epoch": 37.198, + "grad_norm": 1.139223575592041, + "learning_rate": 2e-05, + "loss": 0.03940034, + "step": 18599 + }, + { + "epoch": 37.2, + "grad_norm": 1.050518274307251, + "learning_rate": 2e-05, + "loss": 0.0513064, + "step": 18600 + }, + { + "epoch": 37.202, + "grad_norm": 1.5931733846664429, + "learning_rate": 2e-05, + "loss": 0.03497151, + "step": 18601 + }, + { + "epoch": 37.204, + "grad_norm": 1.2428843975067139, + "learning_rate": 2e-05, + "loss": 0.04900038, + "step": 18602 + }, + { + "epoch": 37.206, + "grad_norm": 5.434542179107666, + "learning_rate": 2e-05, + "loss": 0.05245988, + "step": 18603 + }, + { + "epoch": 37.208, + "grad_norm": 1.1642683744430542, + "learning_rate": 2e-05, + "loss": 0.05161486, + "step": 18604 + }, + { + "epoch": 37.21, + "grad_norm": 1.0116956233978271, + "learning_rate": 2e-05, + "loss": 0.03130523, + "step": 18605 + }, + { + "epoch": 37.212, + "grad_norm": 1.7632371187210083, + "learning_rate": 2e-05, + "loss": 0.04773512, + "step": 18606 + }, + { + "epoch": 37.214, + "grad_norm": 0.9744578003883362, + "learning_rate": 2e-05, + "loss": 0.03005046, + "step": 18607 + 
}, + { + "epoch": 37.216, + "grad_norm": 1.5116866827011108, + "learning_rate": 2e-05, + "loss": 0.04617791, + "step": 18608 + }, + { + "epoch": 37.218, + "grad_norm": 2.665987730026245, + "learning_rate": 2e-05, + "loss": 0.03156227, + "step": 18609 + }, + { + "epoch": 37.22, + "grad_norm": 1.0697249174118042, + "learning_rate": 2e-05, + "loss": 0.03917909, + "step": 18610 + }, + { + "epoch": 37.222, + "grad_norm": 1.0376712083816528, + "learning_rate": 2e-05, + "loss": 0.04070775, + "step": 18611 + }, + { + "epoch": 37.224, + "grad_norm": 1.1940553188323975, + "learning_rate": 2e-05, + "loss": 0.03836515, + "step": 18612 + }, + { + "epoch": 37.226, + "grad_norm": 0.9701217412948608, + "learning_rate": 2e-05, + "loss": 0.03590652, + "step": 18613 + }, + { + "epoch": 37.228, + "grad_norm": 1.2851488590240479, + "learning_rate": 2e-05, + "loss": 0.04535402, + "step": 18614 + }, + { + "epoch": 37.23, + "grad_norm": 2.4411749839782715, + "learning_rate": 2e-05, + "loss": 0.06194766, + "step": 18615 + }, + { + "epoch": 37.232, + "grad_norm": 1.3536734580993652, + "learning_rate": 2e-05, + "loss": 0.04599462, + "step": 18616 + }, + { + "epoch": 37.234, + "grad_norm": 1.2606672048568726, + "learning_rate": 2e-05, + "loss": 0.04580709, + "step": 18617 + }, + { + "epoch": 37.236, + "grad_norm": 1.3303290605545044, + "learning_rate": 2e-05, + "loss": 0.04706927, + "step": 18618 + }, + { + "epoch": 37.238, + "grad_norm": 1.4099715948104858, + "learning_rate": 2e-05, + "loss": 0.02947021, + "step": 18619 + }, + { + "epoch": 37.24, + "grad_norm": 1.0826631784439087, + "learning_rate": 2e-05, + "loss": 0.03750202, + "step": 18620 + }, + { + "epoch": 37.242, + "grad_norm": 1.1417129039764404, + "learning_rate": 2e-05, + "loss": 0.05023696, + "step": 18621 + }, + { + "epoch": 37.244, + "grad_norm": 1.0746430158615112, + "learning_rate": 2e-05, + "loss": 0.04541557, + "step": 18622 + }, + { + "epoch": 37.246, + "grad_norm": 1.4391956329345703, + "learning_rate": 2e-05, + "loss": 0.05088585, + "step": 18623 + }, + { + "epoch": 37.248, + "grad_norm": 1.7742950916290283, + "learning_rate": 2e-05, + "loss": 0.04011762, + "step": 18624 + }, + { + "epoch": 37.25, + "grad_norm": 2.4555623531341553, + "learning_rate": 2e-05, + "loss": 0.04191405, + "step": 18625 + }, + { + "epoch": 37.252, + "grad_norm": 1.0835415124893188, + "learning_rate": 2e-05, + "loss": 0.04408552, + "step": 18626 + }, + { + "epoch": 37.254, + "grad_norm": 1.494162917137146, + "learning_rate": 2e-05, + "loss": 0.0393875, + "step": 18627 + }, + { + "epoch": 37.256, + "grad_norm": 0.9933038353919983, + "learning_rate": 2e-05, + "loss": 0.0362031, + "step": 18628 + }, + { + "epoch": 37.258, + "grad_norm": 1.1506096124649048, + "learning_rate": 2e-05, + "loss": 0.0375497, + "step": 18629 + }, + { + "epoch": 37.26, + "grad_norm": 1.341207504272461, + "learning_rate": 2e-05, + "loss": 0.04895544, + "step": 18630 + }, + { + "epoch": 37.262, + "grad_norm": 1.4760468006134033, + "learning_rate": 2e-05, + "loss": 0.04239687, + "step": 18631 + }, + { + "epoch": 37.264, + "grad_norm": 1.1099522113800049, + "learning_rate": 2e-05, + "loss": 0.05547397, + "step": 18632 + }, + { + "epoch": 37.266, + "grad_norm": 1.1346218585968018, + "learning_rate": 2e-05, + "loss": 0.04982274, + "step": 18633 + }, + { + "epoch": 37.268, + "grad_norm": 0.9855700731277466, + "learning_rate": 2e-05, + "loss": 0.03562504, + "step": 18634 + }, + { + "epoch": 37.27, + "grad_norm": 1.561249017715454, + "learning_rate": 2e-05, + "loss": 0.05814129, + "step": 18635 + }, + { + 
"epoch": 37.272, + "grad_norm": 1.0120232105255127, + "learning_rate": 2e-05, + "loss": 0.0441656, + "step": 18636 + }, + { + "epoch": 37.274, + "grad_norm": 3.5975611209869385, + "learning_rate": 2e-05, + "loss": 0.03757409, + "step": 18637 + }, + { + "epoch": 37.276, + "grad_norm": 1.7085171937942505, + "learning_rate": 2e-05, + "loss": 0.03554228, + "step": 18638 + }, + { + "epoch": 37.278, + "grad_norm": 1.2262259721755981, + "learning_rate": 2e-05, + "loss": 0.02165484, + "step": 18639 + }, + { + "epoch": 37.28, + "grad_norm": 1.2670469284057617, + "learning_rate": 2e-05, + "loss": 0.03860283, + "step": 18640 + }, + { + "epoch": 37.282, + "grad_norm": 1.1696661710739136, + "learning_rate": 2e-05, + "loss": 0.04609601, + "step": 18641 + }, + { + "epoch": 37.284, + "grad_norm": 2.9907474517822266, + "learning_rate": 2e-05, + "loss": 0.05883395, + "step": 18642 + }, + { + "epoch": 37.286, + "grad_norm": 1.062591552734375, + "learning_rate": 2e-05, + "loss": 0.04270145, + "step": 18643 + }, + { + "epoch": 37.288, + "grad_norm": 1.022788166999817, + "learning_rate": 2e-05, + "loss": 0.02788961, + "step": 18644 + }, + { + "epoch": 37.29, + "grad_norm": 1.3785815238952637, + "learning_rate": 2e-05, + "loss": 0.04528718, + "step": 18645 + }, + { + "epoch": 37.292, + "grad_norm": 1.0437957048416138, + "learning_rate": 2e-05, + "loss": 0.03974991, + "step": 18646 + }, + { + "epoch": 37.294, + "grad_norm": 1.1609976291656494, + "learning_rate": 2e-05, + "loss": 0.05013696, + "step": 18647 + }, + { + "epoch": 37.296, + "grad_norm": 0.8697720170021057, + "learning_rate": 2e-05, + "loss": 0.03053538, + "step": 18648 + }, + { + "epoch": 37.298, + "grad_norm": 0.9573113918304443, + "learning_rate": 2e-05, + "loss": 0.03847982, + "step": 18649 + }, + { + "epoch": 37.3, + "grad_norm": 1.183646559715271, + "learning_rate": 2e-05, + "loss": 0.04569777, + "step": 18650 + }, + { + "epoch": 37.302, + "grad_norm": 0.8802410364151001, + "learning_rate": 2e-05, + "loss": 0.02939706, + "step": 18651 + }, + { + "epoch": 37.304, + "grad_norm": 1.2559164762496948, + "learning_rate": 2e-05, + "loss": 0.0539922, + "step": 18652 + }, + { + "epoch": 37.306, + "grad_norm": 1.3703943490982056, + "learning_rate": 2e-05, + "loss": 0.04603429, + "step": 18653 + }, + { + "epoch": 37.308, + "grad_norm": 1.1752663850784302, + "learning_rate": 2e-05, + "loss": 0.05147523, + "step": 18654 + }, + { + "epoch": 37.31, + "grad_norm": 1.039732575416565, + "learning_rate": 2e-05, + "loss": 0.03672284, + "step": 18655 + }, + { + "epoch": 37.312, + "grad_norm": 1.0590288639068604, + "learning_rate": 2e-05, + "loss": 0.04019672, + "step": 18656 + }, + { + "epoch": 37.314, + "grad_norm": 1.8295365571975708, + "learning_rate": 2e-05, + "loss": 0.05494248, + "step": 18657 + }, + { + "epoch": 37.316, + "grad_norm": 1.3013070821762085, + "learning_rate": 2e-05, + "loss": 0.03197949, + "step": 18658 + }, + { + "epoch": 37.318, + "grad_norm": 1.4694881439208984, + "learning_rate": 2e-05, + "loss": 0.02851127, + "step": 18659 + }, + { + "epoch": 37.32, + "grad_norm": 1.1061211824417114, + "learning_rate": 2e-05, + "loss": 0.03776077, + "step": 18660 + }, + { + "epoch": 37.322, + "grad_norm": 0.9431129097938538, + "learning_rate": 2e-05, + "loss": 0.03077835, + "step": 18661 + }, + { + "epoch": 37.324, + "grad_norm": 0.9578332901000977, + "learning_rate": 2e-05, + "loss": 0.02925043, + "step": 18662 + }, + { + "epoch": 37.326, + "grad_norm": 1.0546810626983643, + "learning_rate": 2e-05, + "loss": 0.04426049, + "step": 18663 + }, + { + "epoch": 
37.328, + "grad_norm": 1.1375724077224731, + "learning_rate": 2e-05, + "loss": 0.02645444, + "step": 18664 + }, + { + "epoch": 37.33, + "grad_norm": 1.3067609071731567, + "learning_rate": 2e-05, + "loss": 0.04584169, + "step": 18665 + }, + { + "epoch": 37.332, + "grad_norm": 2.0529701709747314, + "learning_rate": 2e-05, + "loss": 0.04706659, + "step": 18666 + }, + { + "epoch": 37.334, + "grad_norm": 1.4616862535476685, + "learning_rate": 2e-05, + "loss": 0.04822695, + "step": 18667 + }, + { + "epoch": 37.336, + "grad_norm": 1.1066008806228638, + "learning_rate": 2e-05, + "loss": 0.03952828, + "step": 18668 + }, + { + "epoch": 37.338, + "grad_norm": 2.8666927814483643, + "learning_rate": 2e-05, + "loss": 0.04431106, + "step": 18669 + }, + { + "epoch": 37.34, + "grad_norm": 1.526808500289917, + "learning_rate": 2e-05, + "loss": 0.04228971, + "step": 18670 + }, + { + "epoch": 37.342, + "grad_norm": 1.7227466106414795, + "learning_rate": 2e-05, + "loss": 0.03985681, + "step": 18671 + }, + { + "epoch": 37.344, + "grad_norm": 0.960451602935791, + "learning_rate": 2e-05, + "loss": 0.02552288, + "step": 18672 + }, + { + "epoch": 37.346, + "grad_norm": 1.1226481199264526, + "learning_rate": 2e-05, + "loss": 0.04143129, + "step": 18673 + }, + { + "epoch": 37.348, + "grad_norm": 2.2680718898773193, + "learning_rate": 2e-05, + "loss": 0.05270278, + "step": 18674 + }, + { + "epoch": 37.35, + "grad_norm": 0.8541618585586548, + "learning_rate": 2e-05, + "loss": 0.0224099, + "step": 18675 + }, + { + "epoch": 37.352, + "grad_norm": 1.2834433317184448, + "learning_rate": 2e-05, + "loss": 0.05758996, + "step": 18676 + }, + { + "epoch": 37.354, + "grad_norm": 1.2858184576034546, + "learning_rate": 2e-05, + "loss": 0.03408124, + "step": 18677 + }, + { + "epoch": 37.356, + "grad_norm": 1.2244290113449097, + "learning_rate": 2e-05, + "loss": 0.04213716, + "step": 18678 + }, + { + "epoch": 37.358, + "grad_norm": 1.783223271369934, + "learning_rate": 2e-05, + "loss": 0.05025373, + "step": 18679 + }, + { + "epoch": 37.36, + "grad_norm": 1.0761710405349731, + "learning_rate": 2e-05, + "loss": 0.04476252, + "step": 18680 + }, + { + "epoch": 37.362, + "grad_norm": 0.9174004197120667, + "learning_rate": 2e-05, + "loss": 0.02644821, + "step": 18681 + }, + { + "epoch": 37.364, + "grad_norm": 1.5571218729019165, + "learning_rate": 2e-05, + "loss": 0.03530311, + "step": 18682 + }, + { + "epoch": 37.366, + "grad_norm": 1.3173847198486328, + "learning_rate": 2e-05, + "loss": 0.0494157, + "step": 18683 + }, + { + "epoch": 37.368, + "grad_norm": 3.338667869567871, + "learning_rate": 2e-05, + "loss": 0.07118287, + "step": 18684 + }, + { + "epoch": 37.37, + "grad_norm": 1.157820701599121, + "learning_rate": 2e-05, + "loss": 0.0302717, + "step": 18685 + }, + { + "epoch": 37.372, + "grad_norm": 0.9768419861793518, + "learning_rate": 2e-05, + "loss": 0.04007539, + "step": 18686 + }, + { + "epoch": 37.374, + "grad_norm": 1.2794311046600342, + "learning_rate": 2e-05, + "loss": 0.04950311, + "step": 18687 + }, + { + "epoch": 37.376, + "grad_norm": 1.141780972480774, + "learning_rate": 2e-05, + "loss": 0.03149645, + "step": 18688 + }, + { + "epoch": 37.378, + "grad_norm": 2.4156718254089355, + "learning_rate": 2e-05, + "loss": 0.04496891, + "step": 18689 + }, + { + "epoch": 37.38, + "grad_norm": 0.9722008109092712, + "learning_rate": 2e-05, + "loss": 0.0443184, + "step": 18690 + }, + { + "epoch": 37.382, + "grad_norm": 1.1352218389511108, + "learning_rate": 2e-05, + "loss": 0.03867435, + "step": 18691 + }, + { + "epoch": 37.384, + 
"grad_norm": 1.627010464668274, + "learning_rate": 2e-05, + "loss": 0.06134836, + "step": 18692 + }, + { + "epoch": 37.386, + "grad_norm": 1.1376895904541016, + "learning_rate": 2e-05, + "loss": 0.03281328, + "step": 18693 + }, + { + "epoch": 37.388, + "grad_norm": 1.209214210510254, + "learning_rate": 2e-05, + "loss": 0.04794659, + "step": 18694 + }, + { + "epoch": 37.39, + "grad_norm": 1.2975291013717651, + "learning_rate": 2e-05, + "loss": 0.05038759, + "step": 18695 + }, + { + "epoch": 37.392, + "grad_norm": 1.3444797992706299, + "learning_rate": 2e-05, + "loss": 0.03968658, + "step": 18696 + }, + { + "epoch": 37.394, + "grad_norm": 1.1733344793319702, + "learning_rate": 2e-05, + "loss": 0.04530336, + "step": 18697 + }, + { + "epoch": 37.396, + "grad_norm": 0.9696661233901978, + "learning_rate": 2e-05, + "loss": 0.03088088, + "step": 18698 + }, + { + "epoch": 37.398, + "grad_norm": 1.0545789003372192, + "learning_rate": 2e-05, + "loss": 0.03587662, + "step": 18699 + }, + { + "epoch": 37.4, + "grad_norm": 1.031237006187439, + "learning_rate": 2e-05, + "loss": 0.03672967, + "step": 18700 + }, + { + "epoch": 37.402, + "grad_norm": 1.3680472373962402, + "learning_rate": 2e-05, + "loss": 0.05069418, + "step": 18701 + }, + { + "epoch": 37.404, + "grad_norm": 1.0634808540344238, + "learning_rate": 2e-05, + "loss": 0.04009442, + "step": 18702 + }, + { + "epoch": 37.406, + "grad_norm": 0.896915078163147, + "learning_rate": 2e-05, + "loss": 0.02137747, + "step": 18703 + }, + { + "epoch": 37.408, + "grad_norm": 1.0462524890899658, + "learning_rate": 2e-05, + "loss": 0.03831872, + "step": 18704 + }, + { + "epoch": 37.41, + "grad_norm": 1.36514151096344, + "learning_rate": 2e-05, + "loss": 0.05347653, + "step": 18705 + }, + { + "epoch": 37.412, + "grad_norm": 1.608876347541809, + "learning_rate": 2e-05, + "loss": 0.03230177, + "step": 18706 + }, + { + "epoch": 37.414, + "grad_norm": 1.5585176944732666, + "learning_rate": 2e-05, + "loss": 0.04574811, + "step": 18707 + }, + { + "epoch": 37.416, + "grad_norm": 1.5784344673156738, + "learning_rate": 2e-05, + "loss": 0.0467354, + "step": 18708 + }, + { + "epoch": 37.418, + "grad_norm": 1.5583195686340332, + "learning_rate": 2e-05, + "loss": 0.04075438, + "step": 18709 + }, + { + "epoch": 37.42, + "grad_norm": 1.0302294492721558, + "learning_rate": 2e-05, + "loss": 0.04351837, + "step": 18710 + }, + { + "epoch": 37.422, + "grad_norm": 1.7316805124282837, + "learning_rate": 2e-05, + "loss": 0.02402731, + "step": 18711 + }, + { + "epoch": 37.424, + "grad_norm": 1.5626541376113892, + "learning_rate": 2e-05, + "loss": 0.03232317, + "step": 18712 + }, + { + "epoch": 37.426, + "grad_norm": 0.9855737686157227, + "learning_rate": 2e-05, + "loss": 0.03887186, + "step": 18713 + }, + { + "epoch": 37.428, + "grad_norm": 1.4292187690734863, + "learning_rate": 2e-05, + "loss": 0.05129439, + "step": 18714 + }, + { + "epoch": 37.43, + "grad_norm": 1.3205820322036743, + "learning_rate": 2e-05, + "loss": 0.03848224, + "step": 18715 + }, + { + "epoch": 37.432, + "grad_norm": 1.474755048751831, + "learning_rate": 2e-05, + "loss": 0.0503183, + "step": 18716 + }, + { + "epoch": 37.434, + "grad_norm": 1.0868511199951172, + "learning_rate": 2e-05, + "loss": 0.02255207, + "step": 18717 + }, + { + "epoch": 37.436, + "grad_norm": 1.2556840181350708, + "learning_rate": 2e-05, + "loss": 0.05313279, + "step": 18718 + }, + { + "epoch": 37.438, + "grad_norm": 1.2861770391464233, + "learning_rate": 2e-05, + "loss": 0.05083389, + "step": 18719 + }, + { + "epoch": 37.44, + "grad_norm": 
1.3172714710235596, + "learning_rate": 2e-05, + "loss": 0.0380227, + "step": 18720 + }, + { + "epoch": 37.442, + "grad_norm": 1.6785846948623657, + "learning_rate": 2e-05, + "loss": 0.05099306, + "step": 18721 + }, + { + "epoch": 37.444, + "grad_norm": 1.040389895439148, + "learning_rate": 2e-05, + "loss": 0.03511147, + "step": 18722 + }, + { + "epoch": 37.446, + "grad_norm": 1.0567665100097656, + "learning_rate": 2e-05, + "loss": 0.02976083, + "step": 18723 + }, + { + "epoch": 37.448, + "grad_norm": 0.9617815613746643, + "learning_rate": 2e-05, + "loss": 0.03670631, + "step": 18724 + }, + { + "epoch": 37.45, + "grad_norm": 1.069361925125122, + "learning_rate": 2e-05, + "loss": 0.03904826, + "step": 18725 + }, + { + "epoch": 37.452, + "grad_norm": 1.329677939414978, + "learning_rate": 2e-05, + "loss": 0.05781191, + "step": 18726 + }, + { + "epoch": 37.454, + "grad_norm": 1.146031379699707, + "learning_rate": 2e-05, + "loss": 0.05027099, + "step": 18727 + }, + { + "epoch": 37.456, + "grad_norm": 1.0236581563949585, + "learning_rate": 2e-05, + "loss": 0.0411206, + "step": 18728 + }, + { + "epoch": 37.458, + "grad_norm": 1.2816264629364014, + "learning_rate": 2e-05, + "loss": 0.04896802, + "step": 18729 + }, + { + "epoch": 37.46, + "grad_norm": 1.1948260068893433, + "learning_rate": 2e-05, + "loss": 0.03480673, + "step": 18730 + }, + { + "epoch": 37.462, + "grad_norm": 1.7572070360183716, + "learning_rate": 2e-05, + "loss": 0.04478199, + "step": 18731 + }, + { + "epoch": 37.464, + "grad_norm": 1.0101138353347778, + "learning_rate": 2e-05, + "loss": 0.044527, + "step": 18732 + }, + { + "epoch": 37.466, + "grad_norm": 1.537125825881958, + "learning_rate": 2e-05, + "loss": 0.04904353, + "step": 18733 + }, + { + "epoch": 37.468, + "grad_norm": 2.0057132244110107, + "learning_rate": 2e-05, + "loss": 0.05284391, + "step": 18734 + }, + { + "epoch": 37.47, + "grad_norm": 1.139591932296753, + "learning_rate": 2e-05, + "loss": 0.03955012, + "step": 18735 + }, + { + "epoch": 37.472, + "grad_norm": 1.3117825984954834, + "learning_rate": 2e-05, + "loss": 0.03998091, + "step": 18736 + }, + { + "epoch": 37.474, + "grad_norm": 1.599098563194275, + "learning_rate": 2e-05, + "loss": 0.0488559, + "step": 18737 + }, + { + "epoch": 37.476, + "grad_norm": 1.0199345350265503, + "learning_rate": 2e-05, + "loss": 0.02837726, + "step": 18738 + }, + { + "epoch": 37.478, + "grad_norm": 1.672568678855896, + "learning_rate": 2e-05, + "loss": 0.05150389, + "step": 18739 + }, + { + "epoch": 37.48, + "grad_norm": 0.9359270334243774, + "learning_rate": 2e-05, + "loss": 0.03139933, + "step": 18740 + }, + { + "epoch": 37.482, + "grad_norm": 1.5860284566879272, + "learning_rate": 2e-05, + "loss": 0.04607878, + "step": 18741 + }, + { + "epoch": 37.484, + "grad_norm": 1.0930761098861694, + "learning_rate": 2e-05, + "loss": 0.03963002, + "step": 18742 + }, + { + "epoch": 37.486, + "grad_norm": 1.213152527809143, + "learning_rate": 2e-05, + "loss": 0.04821583, + "step": 18743 + }, + { + "epoch": 37.488, + "grad_norm": 0.9678628444671631, + "learning_rate": 2e-05, + "loss": 0.03491917, + "step": 18744 + }, + { + "epoch": 37.49, + "grad_norm": 1.1579011678695679, + "learning_rate": 2e-05, + "loss": 0.05182432, + "step": 18745 + }, + { + "epoch": 37.492, + "grad_norm": 0.9998135566711426, + "learning_rate": 2e-05, + "loss": 0.02899381, + "step": 18746 + }, + { + "epoch": 37.494, + "grad_norm": 1.365634560585022, + "learning_rate": 2e-05, + "loss": 0.03647054, + "step": 18747 + }, + { + "epoch": 37.496, + "grad_norm": 
2.8775148391723633, + "learning_rate": 2e-05, + "loss": 0.06468409, + "step": 18748 + }, + { + "epoch": 37.498, + "grad_norm": 1.3849204778671265, + "learning_rate": 2e-05, + "loss": 0.05843618, + "step": 18749 + }, + { + "epoch": 37.5, + "grad_norm": 1.4979232549667358, + "learning_rate": 2e-05, + "loss": 0.05652013, + "step": 18750 + }, + { + "epoch": 37.502, + "grad_norm": 1.2144519090652466, + "learning_rate": 2e-05, + "loss": 0.0389149, + "step": 18751 + }, + { + "epoch": 37.504, + "grad_norm": 1.0543650388717651, + "learning_rate": 2e-05, + "loss": 0.0313639, + "step": 18752 + }, + { + "epoch": 37.506, + "grad_norm": 1.038210391998291, + "learning_rate": 2e-05, + "loss": 0.04335963, + "step": 18753 + }, + { + "epoch": 37.508, + "grad_norm": 1.0079690217971802, + "learning_rate": 2e-05, + "loss": 0.03755116, + "step": 18754 + }, + { + "epoch": 37.51, + "grad_norm": 1.770876169204712, + "learning_rate": 2e-05, + "loss": 0.04107982, + "step": 18755 + }, + { + "epoch": 37.512, + "grad_norm": 1.6218074560165405, + "learning_rate": 2e-05, + "loss": 0.04625713, + "step": 18756 + }, + { + "epoch": 37.514, + "grad_norm": 1.1750694513320923, + "learning_rate": 2e-05, + "loss": 0.03950002, + "step": 18757 + }, + { + "epoch": 37.516, + "grad_norm": 1.2037353515625, + "learning_rate": 2e-05, + "loss": 0.04342839, + "step": 18758 + }, + { + "epoch": 37.518, + "grad_norm": 1.1445868015289307, + "learning_rate": 2e-05, + "loss": 0.04752634, + "step": 18759 + }, + { + "epoch": 37.52, + "grad_norm": 1.0486931800842285, + "learning_rate": 2e-05, + "loss": 0.04497886, + "step": 18760 + }, + { + "epoch": 37.522, + "grad_norm": 0.9221504330635071, + "learning_rate": 2e-05, + "loss": 0.0370011, + "step": 18761 + }, + { + "epoch": 37.524, + "grad_norm": 0.8999439477920532, + "learning_rate": 2e-05, + "loss": 0.03294341, + "step": 18762 + }, + { + "epoch": 37.526, + "grad_norm": 1.1722791194915771, + "learning_rate": 2e-05, + "loss": 0.03934911, + "step": 18763 + }, + { + "epoch": 37.528, + "grad_norm": 1.2961126565933228, + "learning_rate": 2e-05, + "loss": 0.04197449, + "step": 18764 + }, + { + "epoch": 37.53, + "grad_norm": 0.9199429154396057, + "learning_rate": 2e-05, + "loss": 0.02904882, + "step": 18765 + }, + { + "epoch": 37.532, + "grad_norm": 0.9045437574386597, + "learning_rate": 2e-05, + "loss": 0.03960499, + "step": 18766 + }, + { + "epoch": 37.534, + "grad_norm": 0.9374881982803345, + "learning_rate": 2e-05, + "loss": 0.03096394, + "step": 18767 + }, + { + "epoch": 37.536, + "grad_norm": 1.2470446825027466, + "learning_rate": 2e-05, + "loss": 0.04021191, + "step": 18768 + }, + { + "epoch": 37.538, + "grad_norm": 1.3754534721374512, + "learning_rate": 2e-05, + "loss": 0.0446906, + "step": 18769 + }, + { + "epoch": 37.54, + "grad_norm": 1.608644962310791, + "learning_rate": 2e-05, + "loss": 0.04486812, + "step": 18770 + }, + { + "epoch": 37.542, + "grad_norm": 1.4814962148666382, + "learning_rate": 2e-05, + "loss": 0.03845561, + "step": 18771 + }, + { + "epoch": 37.544, + "grad_norm": 0.8970831036567688, + "learning_rate": 2e-05, + "loss": 0.03752285, + "step": 18772 + }, + { + "epoch": 37.546, + "grad_norm": 1.1656574010849, + "learning_rate": 2e-05, + "loss": 0.03882036, + "step": 18773 + }, + { + "epoch": 37.548, + "grad_norm": 0.8688967823982239, + "learning_rate": 2e-05, + "loss": 0.01826111, + "step": 18774 + }, + { + "epoch": 37.55, + "grad_norm": 1.0525118112564087, + "learning_rate": 2e-05, + "loss": 0.04012558, + "step": 18775 + }, + { + "epoch": 37.552, + "grad_norm": 
1.1830229759216309, + "learning_rate": 2e-05, + "loss": 0.05035875, + "step": 18776 + }, + { + "epoch": 37.554, + "grad_norm": 1.1931424140930176, + "learning_rate": 2e-05, + "loss": 0.04494492, + "step": 18777 + }, + { + "epoch": 37.556, + "grad_norm": 1.3106215000152588, + "learning_rate": 2e-05, + "loss": 0.04554956, + "step": 18778 + }, + { + "epoch": 37.558, + "grad_norm": 1.2222228050231934, + "learning_rate": 2e-05, + "loss": 0.04136209, + "step": 18779 + }, + { + "epoch": 37.56, + "grad_norm": 2.1237621307373047, + "learning_rate": 2e-05, + "loss": 0.04145623, + "step": 18780 + }, + { + "epoch": 37.562, + "grad_norm": 1.1739251613616943, + "learning_rate": 2e-05, + "loss": 0.04080192, + "step": 18781 + }, + { + "epoch": 37.564, + "grad_norm": 2.191302537918091, + "learning_rate": 2e-05, + "loss": 0.06097953, + "step": 18782 + }, + { + "epoch": 37.566, + "grad_norm": 0.9671980738639832, + "learning_rate": 2e-05, + "loss": 0.03338189, + "step": 18783 + }, + { + "epoch": 37.568, + "grad_norm": 1.8375569581985474, + "learning_rate": 2e-05, + "loss": 0.04796746, + "step": 18784 + }, + { + "epoch": 37.57, + "grad_norm": 1.0496113300323486, + "learning_rate": 2e-05, + "loss": 0.02443472, + "step": 18785 + }, + { + "epoch": 37.572, + "grad_norm": 2.503190755844116, + "learning_rate": 2e-05, + "loss": 0.05313979, + "step": 18786 + }, + { + "epoch": 37.574, + "grad_norm": 0.965300977230072, + "learning_rate": 2e-05, + "loss": 0.03321171, + "step": 18787 + }, + { + "epoch": 37.576, + "grad_norm": 1.3297483921051025, + "learning_rate": 2e-05, + "loss": 0.03972304, + "step": 18788 + }, + { + "epoch": 37.578, + "grad_norm": 1.1176469326019287, + "learning_rate": 2e-05, + "loss": 0.03872283, + "step": 18789 + }, + { + "epoch": 37.58, + "grad_norm": 1.2309982776641846, + "learning_rate": 2e-05, + "loss": 0.05587587, + "step": 18790 + }, + { + "epoch": 37.582, + "grad_norm": 1.7290147542953491, + "learning_rate": 2e-05, + "loss": 0.04843904, + "step": 18791 + }, + { + "epoch": 37.584, + "grad_norm": 1.3155032396316528, + "learning_rate": 2e-05, + "loss": 0.04053347, + "step": 18792 + }, + { + "epoch": 37.586, + "grad_norm": 1.361668586730957, + "learning_rate": 2e-05, + "loss": 0.03224905, + "step": 18793 + }, + { + "epoch": 37.588, + "grad_norm": 1.1360480785369873, + "learning_rate": 2e-05, + "loss": 0.03972386, + "step": 18794 + }, + { + "epoch": 37.59, + "grad_norm": 1.3317919969558716, + "learning_rate": 2e-05, + "loss": 0.05043165, + "step": 18795 + }, + { + "epoch": 37.592, + "grad_norm": 1.1488837003707886, + "learning_rate": 2e-05, + "loss": 0.03248937, + "step": 18796 + }, + { + "epoch": 37.594, + "grad_norm": 1.4023540019989014, + "learning_rate": 2e-05, + "loss": 0.03502064, + "step": 18797 + }, + { + "epoch": 37.596, + "grad_norm": 1.0766652822494507, + "learning_rate": 2e-05, + "loss": 0.03397478, + "step": 18798 + }, + { + "epoch": 37.598, + "grad_norm": 1.0824081897735596, + "learning_rate": 2e-05, + "loss": 0.04138397, + "step": 18799 + }, + { + "epoch": 37.6, + "grad_norm": 1.0459606647491455, + "learning_rate": 2e-05, + "loss": 0.04331833, + "step": 18800 + }, + { + "epoch": 37.602, + "grad_norm": 1.3006701469421387, + "learning_rate": 2e-05, + "loss": 0.06298004, + "step": 18801 + }, + { + "epoch": 37.604, + "grad_norm": 0.9166358113288879, + "learning_rate": 2e-05, + "loss": 0.02415272, + "step": 18802 + }, + { + "epoch": 37.606, + "grad_norm": 1.091423511505127, + "learning_rate": 2e-05, + "loss": 0.03883186, + "step": 18803 + }, + { + "epoch": 37.608, + "grad_norm": 
1.242665410041809, + "learning_rate": 2e-05, + "loss": 0.04210323, + "step": 18804 + }, + { + "epoch": 37.61, + "grad_norm": 1.4640295505523682, + "learning_rate": 2e-05, + "loss": 0.05143019, + "step": 18805 + }, + { + "epoch": 37.612, + "grad_norm": 1.2153027057647705, + "learning_rate": 2e-05, + "loss": 0.05529271, + "step": 18806 + }, + { + "epoch": 37.614, + "grad_norm": 2.2144391536712646, + "learning_rate": 2e-05, + "loss": 0.04179824, + "step": 18807 + }, + { + "epoch": 37.616, + "grad_norm": 1.0586007833480835, + "learning_rate": 2e-05, + "loss": 0.0338909, + "step": 18808 + }, + { + "epoch": 37.618, + "grad_norm": 1.1268335580825806, + "learning_rate": 2e-05, + "loss": 0.03529718, + "step": 18809 + }, + { + "epoch": 37.62, + "grad_norm": 1.4889854192733765, + "learning_rate": 2e-05, + "loss": 0.0585743, + "step": 18810 + }, + { + "epoch": 37.622, + "grad_norm": 1.6791647672653198, + "learning_rate": 2e-05, + "loss": 0.05393176, + "step": 18811 + }, + { + "epoch": 37.624, + "grad_norm": 1.1075037717819214, + "learning_rate": 2e-05, + "loss": 0.02849288, + "step": 18812 + }, + { + "epoch": 37.626, + "grad_norm": 1.3440752029418945, + "learning_rate": 2e-05, + "loss": 0.04735216, + "step": 18813 + }, + { + "epoch": 37.628, + "grad_norm": 1.2121247053146362, + "learning_rate": 2e-05, + "loss": 0.03134248, + "step": 18814 + }, + { + "epoch": 37.63, + "grad_norm": 1.0615363121032715, + "learning_rate": 2e-05, + "loss": 0.03442275, + "step": 18815 + }, + { + "epoch": 37.632, + "grad_norm": 1.183545708656311, + "learning_rate": 2e-05, + "loss": 0.03669387, + "step": 18816 + }, + { + "epoch": 37.634, + "grad_norm": 0.9101192951202393, + "learning_rate": 2e-05, + "loss": 0.03575148, + "step": 18817 + }, + { + "epoch": 37.636, + "grad_norm": 1.5718504190444946, + "learning_rate": 2e-05, + "loss": 0.05680436, + "step": 18818 + }, + { + "epoch": 37.638, + "grad_norm": 1.2223116159439087, + "learning_rate": 2e-05, + "loss": 0.04046562, + "step": 18819 + }, + { + "epoch": 37.64, + "grad_norm": 1.1684964895248413, + "learning_rate": 2e-05, + "loss": 0.0525642, + "step": 18820 + }, + { + "epoch": 37.642, + "grad_norm": 1.0936496257781982, + "learning_rate": 2e-05, + "loss": 0.04073422, + "step": 18821 + }, + { + "epoch": 37.644, + "grad_norm": 1.0097864866256714, + "learning_rate": 2e-05, + "loss": 0.03097866, + "step": 18822 + }, + { + "epoch": 37.646, + "grad_norm": 1.2851216793060303, + "learning_rate": 2e-05, + "loss": 0.05580614, + "step": 18823 + }, + { + "epoch": 37.648, + "grad_norm": 1.9229702949523926, + "learning_rate": 2e-05, + "loss": 0.03958568, + "step": 18824 + }, + { + "epoch": 37.65, + "grad_norm": 1.3668385744094849, + "learning_rate": 2e-05, + "loss": 0.05524768, + "step": 18825 + }, + { + "epoch": 37.652, + "grad_norm": 1.417112946510315, + "learning_rate": 2e-05, + "loss": 0.04026988, + "step": 18826 + }, + { + "epoch": 37.654, + "grad_norm": 1.154919147491455, + "learning_rate": 2e-05, + "loss": 0.04654554, + "step": 18827 + }, + { + "epoch": 37.656, + "grad_norm": 0.9886472821235657, + "learning_rate": 2e-05, + "loss": 0.03133095, + "step": 18828 + }, + { + "epoch": 37.658, + "grad_norm": 1.2836391925811768, + "learning_rate": 2e-05, + "loss": 0.03864687, + "step": 18829 + }, + { + "epoch": 37.66, + "grad_norm": 1.2053261995315552, + "learning_rate": 2e-05, + "loss": 0.05016092, + "step": 18830 + }, + { + "epoch": 37.662, + "grad_norm": 1.1073130369186401, + "learning_rate": 2e-05, + "loss": 0.03775865, + "step": 18831 + }, + { + "epoch": 37.664, + "grad_norm": 
1.2172409296035767, + "learning_rate": 2e-05, + "loss": 0.05411567, + "step": 18832 + }, + { + "epoch": 37.666, + "grad_norm": 1.1823009252548218, + "learning_rate": 2e-05, + "loss": 0.0415944, + "step": 18833 + }, + { + "epoch": 37.668, + "grad_norm": 1.0593645572662354, + "learning_rate": 2e-05, + "loss": 0.03887521, + "step": 18834 + }, + { + "epoch": 37.67, + "grad_norm": 1.0737974643707275, + "learning_rate": 2e-05, + "loss": 0.03494824, + "step": 18835 + }, + { + "epoch": 37.672, + "grad_norm": 1.4756815433502197, + "learning_rate": 2e-05, + "loss": 0.05688382, + "step": 18836 + }, + { + "epoch": 37.674, + "grad_norm": 1.2232595682144165, + "learning_rate": 2e-05, + "loss": 0.05065658, + "step": 18837 + }, + { + "epoch": 37.676, + "grad_norm": 1.5146193504333496, + "learning_rate": 2e-05, + "loss": 0.04606935, + "step": 18838 + }, + { + "epoch": 37.678, + "grad_norm": 1.1179404258728027, + "learning_rate": 2e-05, + "loss": 0.04785991, + "step": 18839 + }, + { + "epoch": 37.68, + "grad_norm": 1.7704997062683105, + "learning_rate": 2e-05, + "loss": 0.03499816, + "step": 18840 + }, + { + "epoch": 37.682, + "grad_norm": 2.1645781993865967, + "learning_rate": 2e-05, + "loss": 0.06860285, + "step": 18841 + }, + { + "epoch": 37.684, + "grad_norm": 1.4239698648452759, + "learning_rate": 2e-05, + "loss": 0.05423044, + "step": 18842 + }, + { + "epoch": 37.686, + "grad_norm": 1.2061432600021362, + "learning_rate": 2e-05, + "loss": 0.0422, + "step": 18843 + }, + { + "epoch": 37.688, + "grad_norm": 1.434126615524292, + "learning_rate": 2e-05, + "loss": 0.05467582, + "step": 18844 + }, + { + "epoch": 37.69, + "grad_norm": 1.0668646097183228, + "learning_rate": 2e-05, + "loss": 0.04183818, + "step": 18845 + }, + { + "epoch": 37.692, + "grad_norm": 0.9105406999588013, + "learning_rate": 2e-05, + "loss": 0.0302218, + "step": 18846 + }, + { + "epoch": 37.694, + "grad_norm": 2.7572875022888184, + "learning_rate": 2e-05, + "loss": 0.04500583, + "step": 18847 + }, + { + "epoch": 37.696, + "grad_norm": 2.284607172012329, + "learning_rate": 2e-05, + "loss": 0.04284734, + "step": 18848 + }, + { + "epoch": 37.698, + "grad_norm": 2.073329210281372, + "learning_rate": 2e-05, + "loss": 0.03520484, + "step": 18849 + }, + { + "epoch": 37.7, + "grad_norm": 3.604762077331543, + "learning_rate": 2e-05, + "loss": 0.06385957, + "step": 18850 + }, + { + "epoch": 37.702, + "grad_norm": 1.1945736408233643, + "learning_rate": 2e-05, + "loss": 0.03985525, + "step": 18851 + }, + { + "epoch": 37.704, + "grad_norm": 1.2185908555984497, + "learning_rate": 2e-05, + "loss": 0.03318772, + "step": 18852 + }, + { + "epoch": 37.706, + "grad_norm": 1.2688206434249878, + "learning_rate": 2e-05, + "loss": 0.05495746, + "step": 18853 + }, + { + "epoch": 37.708, + "grad_norm": 1.0381929874420166, + "learning_rate": 2e-05, + "loss": 0.03600042, + "step": 18854 + }, + { + "epoch": 37.71, + "grad_norm": 0.952168345451355, + "learning_rate": 2e-05, + "loss": 0.02752374, + "step": 18855 + }, + { + "epoch": 37.712, + "grad_norm": 1.2322078943252563, + "learning_rate": 2e-05, + "loss": 0.04587801, + "step": 18856 + }, + { + "epoch": 37.714, + "grad_norm": 1.2536684274673462, + "learning_rate": 2e-05, + "loss": 0.03789248, + "step": 18857 + }, + { + "epoch": 37.716, + "grad_norm": 2.647052526473999, + "learning_rate": 2e-05, + "loss": 0.07668023, + "step": 18858 + }, + { + "epoch": 37.718, + "grad_norm": 2.344609022140503, + "learning_rate": 2e-05, + "loss": 0.06175537, + "step": 18859 + }, + { + "epoch": 37.72, + "grad_norm": 1.710271954536438, 
+ "learning_rate": 2e-05, + "loss": 0.07425263, + "step": 18860 + }, + { + "epoch": 37.722, + "grad_norm": 0.9385952353477478, + "learning_rate": 2e-05, + "loss": 0.03988194, + "step": 18861 + }, + { + "epoch": 37.724, + "grad_norm": 1.0760703086853027, + "learning_rate": 2e-05, + "loss": 0.03247745, + "step": 18862 + }, + { + "epoch": 37.726, + "grad_norm": 1.346384882926941, + "learning_rate": 2e-05, + "loss": 0.0547515, + "step": 18863 + }, + { + "epoch": 37.728, + "grad_norm": 1.0319280624389648, + "learning_rate": 2e-05, + "loss": 0.03719486, + "step": 18864 + }, + { + "epoch": 37.73, + "grad_norm": 1.327825665473938, + "learning_rate": 2e-05, + "loss": 0.02865359, + "step": 18865 + }, + { + "epoch": 37.732, + "grad_norm": 1.7671387195587158, + "learning_rate": 2e-05, + "loss": 0.03048402, + "step": 18866 + }, + { + "epoch": 37.734, + "grad_norm": 1.0499013662338257, + "learning_rate": 2e-05, + "loss": 0.03205416, + "step": 18867 + }, + { + "epoch": 37.736, + "grad_norm": 0.9944764971733093, + "learning_rate": 2e-05, + "loss": 0.03722709, + "step": 18868 + }, + { + "epoch": 37.738, + "grad_norm": 1.3089152574539185, + "learning_rate": 2e-05, + "loss": 0.04756982, + "step": 18869 + }, + { + "epoch": 37.74, + "grad_norm": 1.5196123123168945, + "learning_rate": 2e-05, + "loss": 0.06203455, + "step": 18870 + }, + { + "epoch": 37.742, + "grad_norm": 1.11556077003479, + "learning_rate": 2e-05, + "loss": 0.03433929, + "step": 18871 + }, + { + "epoch": 37.744, + "grad_norm": 2.001180410385132, + "learning_rate": 2e-05, + "loss": 0.046538, + "step": 18872 + }, + { + "epoch": 37.746, + "grad_norm": 1.2591460943222046, + "learning_rate": 2e-05, + "loss": 0.05003956, + "step": 18873 + }, + { + "epoch": 37.748, + "grad_norm": 1.6820541620254517, + "learning_rate": 2e-05, + "loss": 0.043096, + "step": 18874 + }, + { + "epoch": 37.75, + "grad_norm": 1.2268869876861572, + "learning_rate": 2e-05, + "loss": 0.04136671, + "step": 18875 + }, + { + "epoch": 37.752, + "grad_norm": 1.2533190250396729, + "learning_rate": 2e-05, + "loss": 0.0366976, + "step": 18876 + }, + { + "epoch": 37.754, + "grad_norm": 1.4866986274719238, + "learning_rate": 2e-05, + "loss": 0.03636875, + "step": 18877 + }, + { + "epoch": 37.756, + "grad_norm": 1.5848642587661743, + "learning_rate": 2e-05, + "loss": 0.04737707, + "step": 18878 + }, + { + "epoch": 37.758, + "grad_norm": 1.353268027305603, + "learning_rate": 2e-05, + "loss": 0.03133782, + "step": 18879 + }, + { + "epoch": 37.76, + "grad_norm": 0.9636505842208862, + "learning_rate": 2e-05, + "loss": 0.0301811, + "step": 18880 + }, + { + "epoch": 37.762, + "grad_norm": 1.2497457265853882, + "learning_rate": 2e-05, + "loss": 0.05001786, + "step": 18881 + }, + { + "epoch": 37.764, + "grad_norm": 1.0673328638076782, + "learning_rate": 2e-05, + "loss": 0.03139454, + "step": 18882 + }, + { + "epoch": 37.766, + "grad_norm": 1.557485580444336, + "learning_rate": 2e-05, + "loss": 0.0464134, + "step": 18883 + }, + { + "epoch": 37.768, + "grad_norm": 0.973440408706665, + "learning_rate": 2e-05, + "loss": 0.03469043, + "step": 18884 + }, + { + "epoch": 37.77, + "grad_norm": 1.1748510599136353, + "learning_rate": 2e-05, + "loss": 0.03730968, + "step": 18885 + }, + { + "epoch": 37.772, + "grad_norm": 1.0944515466690063, + "learning_rate": 2e-05, + "loss": 0.03888043, + "step": 18886 + }, + { + "epoch": 37.774, + "grad_norm": 3.6084587574005127, + "learning_rate": 2e-05, + "loss": 0.05079032, + "step": 18887 + }, + { + "epoch": 37.776, + "grad_norm": 1.7033201456069946, + "learning_rate": 
2e-05, + "loss": 0.0442396, + "step": 18888 + }, + { + "epoch": 37.778, + "grad_norm": 1.3825198411941528, + "learning_rate": 2e-05, + "loss": 0.05911713, + "step": 18889 + }, + { + "epoch": 37.78, + "grad_norm": 1.3935472965240479, + "learning_rate": 2e-05, + "loss": 0.04597136, + "step": 18890 + }, + { + "epoch": 37.782, + "grad_norm": 1.515001893043518, + "learning_rate": 2e-05, + "loss": 0.04437057, + "step": 18891 + }, + { + "epoch": 37.784, + "grad_norm": 1.1268634796142578, + "learning_rate": 2e-05, + "loss": 0.03411634, + "step": 18892 + }, + { + "epoch": 37.786, + "grad_norm": 0.9976750016212463, + "learning_rate": 2e-05, + "loss": 0.03444566, + "step": 18893 + }, + { + "epoch": 37.788, + "grad_norm": 1.5224945545196533, + "learning_rate": 2e-05, + "loss": 0.0607723, + "step": 18894 + }, + { + "epoch": 37.79, + "grad_norm": 1.3955048322677612, + "learning_rate": 2e-05, + "loss": 0.05213226, + "step": 18895 + }, + { + "epoch": 37.792, + "grad_norm": 1.1549447774887085, + "learning_rate": 2e-05, + "loss": 0.03736443, + "step": 18896 + }, + { + "epoch": 37.794, + "grad_norm": 0.8485814929008484, + "learning_rate": 2e-05, + "loss": 0.02462177, + "step": 18897 + }, + { + "epoch": 37.796, + "grad_norm": 1.3781180381774902, + "learning_rate": 2e-05, + "loss": 0.05236533, + "step": 18898 + }, + { + "epoch": 37.798, + "grad_norm": 1.4734482765197754, + "learning_rate": 2e-05, + "loss": 0.05728751, + "step": 18899 + }, + { + "epoch": 37.8, + "grad_norm": 1.1258692741394043, + "learning_rate": 2e-05, + "loss": 0.03739987, + "step": 18900 + }, + { + "epoch": 37.802, + "grad_norm": 1.2827837467193604, + "learning_rate": 2e-05, + "loss": 0.05210217, + "step": 18901 + }, + { + "epoch": 37.804, + "grad_norm": 1.1557444334030151, + "learning_rate": 2e-05, + "loss": 0.0390267, + "step": 18902 + }, + { + "epoch": 37.806, + "grad_norm": 1.1604247093200684, + "learning_rate": 2e-05, + "loss": 0.04971418, + "step": 18903 + }, + { + "epoch": 37.808, + "grad_norm": 1.0621238946914673, + "learning_rate": 2e-05, + "loss": 0.03640627, + "step": 18904 + }, + { + "epoch": 37.81, + "grad_norm": 0.9951264262199402, + "learning_rate": 2e-05, + "loss": 0.03904339, + "step": 18905 + }, + { + "epoch": 37.812, + "grad_norm": 1.199965476989746, + "learning_rate": 2e-05, + "loss": 0.04364878, + "step": 18906 + }, + { + "epoch": 37.814, + "grad_norm": 3.0767629146575928, + "learning_rate": 2e-05, + "loss": 0.04365045, + "step": 18907 + }, + { + "epoch": 37.816, + "grad_norm": 0.8647134304046631, + "learning_rate": 2e-05, + "loss": 0.0334196, + "step": 18908 + }, + { + "epoch": 37.818, + "grad_norm": 1.1921844482421875, + "learning_rate": 2e-05, + "loss": 0.03833631, + "step": 18909 + }, + { + "epoch": 37.82, + "grad_norm": 1.0273867845535278, + "learning_rate": 2e-05, + "loss": 0.04143481, + "step": 18910 + }, + { + "epoch": 37.822, + "grad_norm": 1.125104308128357, + "learning_rate": 2e-05, + "loss": 0.05544262, + "step": 18911 + }, + { + "epoch": 37.824, + "grad_norm": 1.0411149263381958, + "learning_rate": 2e-05, + "loss": 0.04289424, + "step": 18912 + }, + { + "epoch": 37.826, + "grad_norm": 1.2323323488235474, + "learning_rate": 2e-05, + "loss": 0.05162044, + "step": 18913 + }, + { + "epoch": 37.828, + "grad_norm": 1.2034226655960083, + "learning_rate": 2e-05, + "loss": 0.05214075, + "step": 18914 + }, + { + "epoch": 37.83, + "grad_norm": 1.3405710458755493, + "learning_rate": 2e-05, + "loss": 0.05168641, + "step": 18915 + }, + { + "epoch": 37.832, + "grad_norm": 1.2981528043746948, + "learning_rate": 2e-05, + 
"loss": 0.04356202, + "step": 18916 + }, + { + "epoch": 37.834, + "grad_norm": 0.9416662454605103, + "learning_rate": 2e-05, + "loss": 0.03926639, + "step": 18917 + }, + { + "epoch": 37.836, + "grad_norm": 1.080146312713623, + "learning_rate": 2e-05, + "loss": 0.03623664, + "step": 18918 + }, + { + "epoch": 37.838, + "grad_norm": 1.164093255996704, + "learning_rate": 2e-05, + "loss": 0.04676047, + "step": 18919 + }, + { + "epoch": 37.84, + "grad_norm": 1.0014550685882568, + "learning_rate": 2e-05, + "loss": 0.03417875, + "step": 18920 + }, + { + "epoch": 37.842, + "grad_norm": 1.2077052593231201, + "learning_rate": 2e-05, + "loss": 0.03918765, + "step": 18921 + }, + { + "epoch": 37.844, + "grad_norm": 1.121488094329834, + "learning_rate": 2e-05, + "loss": 0.04199528, + "step": 18922 + }, + { + "epoch": 37.846, + "grad_norm": 2.043963670730591, + "learning_rate": 2e-05, + "loss": 0.06683844, + "step": 18923 + }, + { + "epoch": 37.848, + "grad_norm": 1.0892078876495361, + "learning_rate": 2e-05, + "loss": 0.04384585, + "step": 18924 + }, + { + "epoch": 37.85, + "grad_norm": 1.333020567893982, + "learning_rate": 2e-05, + "loss": 0.04862942, + "step": 18925 + }, + { + "epoch": 37.852, + "grad_norm": 1.0511001348495483, + "learning_rate": 2e-05, + "loss": 0.03976622, + "step": 18926 + }, + { + "epoch": 37.854, + "grad_norm": 1.2701404094696045, + "learning_rate": 2e-05, + "loss": 0.04149718, + "step": 18927 + }, + { + "epoch": 37.856, + "grad_norm": 1.0651763677597046, + "learning_rate": 2e-05, + "loss": 0.03586752, + "step": 18928 + }, + { + "epoch": 37.858, + "grad_norm": 1.0089900493621826, + "learning_rate": 2e-05, + "loss": 0.03525446, + "step": 18929 + }, + { + "epoch": 37.86, + "grad_norm": 1.1876775026321411, + "learning_rate": 2e-05, + "loss": 0.04068927, + "step": 18930 + }, + { + "epoch": 37.862, + "grad_norm": 1.7308181524276733, + "learning_rate": 2e-05, + "loss": 0.04104917, + "step": 18931 + }, + { + "epoch": 37.864, + "grad_norm": 1.0084033012390137, + "learning_rate": 2e-05, + "loss": 0.03396938, + "step": 18932 + }, + { + "epoch": 37.866, + "grad_norm": 1.4454501867294312, + "learning_rate": 2e-05, + "loss": 0.05074462, + "step": 18933 + }, + { + "epoch": 37.868, + "grad_norm": 1.103598952293396, + "learning_rate": 2e-05, + "loss": 0.04150467, + "step": 18934 + }, + { + "epoch": 37.87, + "grad_norm": 1.0260006189346313, + "learning_rate": 2e-05, + "loss": 0.03909385, + "step": 18935 + }, + { + "epoch": 37.872, + "grad_norm": 1.1484661102294922, + "learning_rate": 2e-05, + "loss": 0.0446529, + "step": 18936 + }, + { + "epoch": 37.874, + "grad_norm": 0.9371235966682434, + "learning_rate": 2e-05, + "loss": 0.03469496, + "step": 18937 + }, + { + "epoch": 37.876, + "grad_norm": 2.441399097442627, + "learning_rate": 2e-05, + "loss": 0.04239362, + "step": 18938 + }, + { + "epoch": 37.878, + "grad_norm": 0.9973140358924866, + "learning_rate": 2e-05, + "loss": 0.03371353, + "step": 18939 + }, + { + "epoch": 37.88, + "grad_norm": 1.1159844398498535, + "learning_rate": 2e-05, + "loss": 0.04418877, + "step": 18940 + }, + { + "epoch": 37.882, + "grad_norm": 1.0986323356628418, + "learning_rate": 2e-05, + "loss": 0.0396911, + "step": 18941 + }, + { + "epoch": 37.884, + "grad_norm": 2.239150285720825, + "learning_rate": 2e-05, + "loss": 0.05810123, + "step": 18942 + }, + { + "epoch": 37.886, + "grad_norm": 1.1507476568222046, + "learning_rate": 2e-05, + "loss": 0.04050256, + "step": 18943 + }, + { + "epoch": 37.888, + "grad_norm": 1.643411636352539, + "learning_rate": 2e-05, + "loss": 
0.07198939, + "step": 18944 + }, + { + "epoch": 37.89, + "grad_norm": 1.7802282571792603, + "learning_rate": 2e-05, + "loss": 0.0361943, + "step": 18945 + }, + { + "epoch": 37.892, + "grad_norm": 1.0710393190383911, + "learning_rate": 2e-05, + "loss": 0.0276309, + "step": 18946 + }, + { + "epoch": 37.894, + "grad_norm": 1.3028510808944702, + "learning_rate": 2e-05, + "loss": 0.04614664, + "step": 18947 + }, + { + "epoch": 37.896, + "grad_norm": 1.6930269002914429, + "learning_rate": 2e-05, + "loss": 0.05176745, + "step": 18948 + }, + { + "epoch": 37.898, + "grad_norm": 1.3994868993759155, + "learning_rate": 2e-05, + "loss": 0.04597143, + "step": 18949 + }, + { + "epoch": 37.9, + "grad_norm": 1.300676703453064, + "learning_rate": 2e-05, + "loss": 0.0538766, + "step": 18950 + }, + { + "epoch": 37.902, + "grad_norm": 1.897986888885498, + "learning_rate": 2e-05, + "loss": 0.03771758, + "step": 18951 + }, + { + "epoch": 37.904, + "grad_norm": 1.1213583946228027, + "learning_rate": 2e-05, + "loss": 0.04674072, + "step": 18952 + }, + { + "epoch": 37.906, + "grad_norm": 1.3618868589401245, + "learning_rate": 2e-05, + "loss": 0.06035518, + "step": 18953 + }, + { + "epoch": 37.908, + "grad_norm": 1.2731349468231201, + "learning_rate": 2e-05, + "loss": 0.05713828, + "step": 18954 + }, + { + "epoch": 37.91, + "grad_norm": 1.0878880023956299, + "learning_rate": 2e-05, + "loss": 0.05491067, + "step": 18955 + }, + { + "epoch": 37.912, + "grad_norm": 1.5778956413269043, + "learning_rate": 2e-05, + "loss": 0.05252867, + "step": 18956 + }, + { + "epoch": 37.914, + "grad_norm": 0.9078095555305481, + "learning_rate": 2e-05, + "loss": 0.03131845, + "step": 18957 + }, + { + "epoch": 37.916, + "grad_norm": 1.0328174829483032, + "learning_rate": 2e-05, + "loss": 0.03666805, + "step": 18958 + }, + { + "epoch": 37.918, + "grad_norm": 1.363809585571289, + "learning_rate": 2e-05, + "loss": 0.05573125, + "step": 18959 + }, + { + "epoch": 37.92, + "grad_norm": 1.0942089557647705, + "learning_rate": 2e-05, + "loss": 0.05161095, + "step": 18960 + }, + { + "epoch": 37.922, + "grad_norm": 1.2652561664581299, + "learning_rate": 2e-05, + "loss": 0.05439752, + "step": 18961 + }, + { + "epoch": 37.924, + "grad_norm": 1.4360697269439697, + "learning_rate": 2e-05, + "loss": 0.04909812, + "step": 18962 + }, + { + "epoch": 37.926, + "grad_norm": 1.5713309049606323, + "learning_rate": 2e-05, + "loss": 0.05065858, + "step": 18963 + }, + { + "epoch": 37.928, + "grad_norm": 1.3560004234313965, + "learning_rate": 2e-05, + "loss": 0.05886821, + "step": 18964 + }, + { + "epoch": 37.93, + "grad_norm": 1.1961268186569214, + "learning_rate": 2e-05, + "loss": 0.04319284, + "step": 18965 + }, + { + "epoch": 37.932, + "grad_norm": 1.2720478773117065, + "learning_rate": 2e-05, + "loss": 0.07183904, + "step": 18966 + }, + { + "epoch": 37.934, + "grad_norm": 1.26603102684021, + "learning_rate": 2e-05, + "loss": 0.04299184, + "step": 18967 + }, + { + "epoch": 37.936, + "grad_norm": 1.0896682739257812, + "learning_rate": 2e-05, + "loss": 0.04101091, + "step": 18968 + }, + { + "epoch": 37.938, + "grad_norm": 1.333804726600647, + "learning_rate": 2e-05, + "loss": 0.03814703, + "step": 18969 + }, + { + "epoch": 37.94, + "grad_norm": 1.6315276622772217, + "learning_rate": 2e-05, + "loss": 0.02696334, + "step": 18970 + }, + { + "epoch": 37.942, + "grad_norm": 1.2752485275268555, + "learning_rate": 2e-05, + "loss": 0.04623385, + "step": 18971 + }, + { + "epoch": 37.944, + "grad_norm": 1.2738232612609863, + "learning_rate": 2e-05, + "loss": 0.05119524, + 
"step": 18972 + }, + { + "epoch": 37.946, + "grad_norm": 1.229262351989746, + "learning_rate": 2e-05, + "loss": 0.03859371, + "step": 18973 + }, + { + "epoch": 37.948, + "grad_norm": 1.292868971824646, + "learning_rate": 2e-05, + "loss": 0.03332417, + "step": 18974 + }, + { + "epoch": 37.95, + "grad_norm": 0.9618489742279053, + "learning_rate": 2e-05, + "loss": 0.03035795, + "step": 18975 + }, + { + "epoch": 37.952, + "grad_norm": 0.9348632097244263, + "learning_rate": 2e-05, + "loss": 0.03629586, + "step": 18976 + }, + { + "epoch": 37.954, + "grad_norm": 1.0433769226074219, + "learning_rate": 2e-05, + "loss": 0.04461107, + "step": 18977 + }, + { + "epoch": 37.956, + "grad_norm": 1.8684062957763672, + "learning_rate": 2e-05, + "loss": 0.04248746, + "step": 18978 + }, + { + "epoch": 37.958, + "grad_norm": 1.2349817752838135, + "learning_rate": 2e-05, + "loss": 0.04595061, + "step": 18979 + }, + { + "epoch": 37.96, + "grad_norm": 1.1329549551010132, + "learning_rate": 2e-05, + "loss": 0.05171024, + "step": 18980 + }, + { + "epoch": 37.962, + "grad_norm": 1.4046157598495483, + "learning_rate": 2e-05, + "loss": 0.04267633, + "step": 18981 + }, + { + "epoch": 37.964, + "grad_norm": 1.4222880601882935, + "learning_rate": 2e-05, + "loss": 0.03729562, + "step": 18982 + }, + { + "epoch": 37.966, + "grad_norm": 1.1951045989990234, + "learning_rate": 2e-05, + "loss": 0.04684995, + "step": 18983 + }, + { + "epoch": 37.968, + "grad_norm": 1.7370728254318237, + "learning_rate": 2e-05, + "loss": 0.0354794, + "step": 18984 + }, + { + "epoch": 37.97, + "grad_norm": 1.2649239301681519, + "learning_rate": 2e-05, + "loss": 0.03856641, + "step": 18985 + }, + { + "epoch": 37.972, + "grad_norm": 1.2516887187957764, + "learning_rate": 2e-05, + "loss": 0.03826671, + "step": 18986 + }, + { + "epoch": 37.974, + "grad_norm": 1.4137239456176758, + "learning_rate": 2e-05, + "loss": 0.04873248, + "step": 18987 + }, + { + "epoch": 37.976, + "grad_norm": 1.205417275428772, + "learning_rate": 2e-05, + "loss": 0.03422677, + "step": 18988 + }, + { + "epoch": 37.978, + "grad_norm": 1.3862630128860474, + "learning_rate": 2e-05, + "loss": 0.03816392, + "step": 18989 + }, + { + "epoch": 37.98, + "grad_norm": 1.0804787874221802, + "learning_rate": 2e-05, + "loss": 0.04000157, + "step": 18990 + }, + { + "epoch": 37.982, + "grad_norm": 1.2574692964553833, + "learning_rate": 2e-05, + "loss": 0.04244877, + "step": 18991 + }, + { + "epoch": 37.984, + "grad_norm": 0.8911446928977966, + "learning_rate": 2e-05, + "loss": 0.02988572, + "step": 18992 + }, + { + "epoch": 37.986, + "grad_norm": 1.157371997833252, + "learning_rate": 2e-05, + "loss": 0.03278516, + "step": 18993 + }, + { + "epoch": 37.988, + "grad_norm": 1.2546195983886719, + "learning_rate": 2e-05, + "loss": 0.04516232, + "step": 18994 + }, + { + "epoch": 37.99, + "grad_norm": 1.0913975238800049, + "learning_rate": 2e-05, + "loss": 0.03805106, + "step": 18995 + }, + { + "epoch": 37.992, + "grad_norm": 1.3969191312789917, + "learning_rate": 2e-05, + "loss": 0.04859225, + "step": 18996 + }, + { + "epoch": 37.994, + "grad_norm": 1.4129071235656738, + "learning_rate": 2e-05, + "loss": 0.03075989, + "step": 18997 + }, + { + "epoch": 37.996, + "grad_norm": 1.4782291650772095, + "learning_rate": 2e-05, + "loss": 0.05404112, + "step": 18998 + }, + { + "epoch": 37.998, + "grad_norm": 2.0971972942352295, + "learning_rate": 2e-05, + "loss": 0.06696081, + "step": 18999 + }, + { + "epoch": 38.0, + "grad_norm": 1.0580739974975586, + "learning_rate": 2e-05, + "loss": 0.03982267, + "step": 
19000 + }, + { + "epoch": 38.0, + "eval_performance": { + "AngleClassification_1": 1.0, + "AngleClassification_2": 1.0, + "AngleClassification_3": 0.9900199600798403, + "Equal_1": 0.998, + "Equal_2": 0.9820359281437125, + "Equal_3": 0.9840319361277445, + "LineComparison_1": 1.0, + "LineComparison_2": 0.998003992015968, + "LineComparison_3": 1.0, + "Parallel_1": 0.9919839679358717, + "Parallel_2": 0.9959919839679359, + "Parallel_3": 0.992, + "Perpendicular_1": 0.998, + "Perpendicular_2": 1.0, + "Perpendicular_3": 0.8977955911823647, + "PointLiesOnCircle_1": 1.0, + "PointLiesOnCircle_2": 1.0, + "PointLiesOnCircle_3": 0.9956, + "PointLiesOnLine_1": 0.9979959919839679, + "PointLiesOnLine_2": 0.9939879759519038, + "PointLiesOnLine_3": 0.9840319361277445 + }, + "eval_runtime": 319.6496, + "eval_samples_per_second": 32.848, + "eval_steps_per_second": 0.657, + "step": 19000 + }, + { + "epoch": 38.002, + "grad_norm": 1.2653032541275024, + "learning_rate": 2e-05, + "loss": 0.05122709, + "step": 19001 + }, + { + "epoch": 38.004, + "grad_norm": 1.1466037034988403, + "learning_rate": 2e-05, + "loss": 0.03442731, + "step": 19002 + }, + { + "epoch": 38.006, + "grad_norm": 2.500974178314209, + "learning_rate": 2e-05, + "loss": 0.05683993, + "step": 19003 + }, + { + "epoch": 38.008, + "grad_norm": 1.1632379293441772, + "learning_rate": 2e-05, + "loss": 0.04220579, + "step": 19004 + }, + { + "epoch": 38.01, + "grad_norm": 1.1766409873962402, + "learning_rate": 2e-05, + "loss": 0.0437943, + "step": 19005 + }, + { + "epoch": 38.012, + "grad_norm": 1.2098218202590942, + "learning_rate": 2e-05, + "loss": 0.04391178, + "step": 19006 + }, + { + "epoch": 38.014, + "grad_norm": 1.236122965812683, + "learning_rate": 2e-05, + "loss": 0.05343842, + "step": 19007 + }, + { + "epoch": 38.016, + "grad_norm": 1.6261658668518066, + "learning_rate": 2e-05, + "loss": 0.06093828, + "step": 19008 + }, + { + "epoch": 38.018, + "grad_norm": 1.1033320426940918, + "learning_rate": 2e-05, + "loss": 0.03286046, + "step": 19009 + }, + { + "epoch": 38.02, + "grad_norm": 2.3395283222198486, + "learning_rate": 2e-05, + "loss": 0.04223632, + "step": 19010 + }, + { + "epoch": 38.022, + "grad_norm": 1.4763344526290894, + "learning_rate": 2e-05, + "loss": 0.05102969, + "step": 19011 + }, + { + "epoch": 38.024, + "grad_norm": 1.1282676458358765, + "learning_rate": 2e-05, + "loss": 0.04249762, + "step": 19012 + }, + { + "epoch": 38.026, + "grad_norm": 1.2232784032821655, + "learning_rate": 2e-05, + "loss": 0.03590241, + "step": 19013 + }, + { + "epoch": 38.028, + "grad_norm": 1.187089204788208, + "learning_rate": 2e-05, + "loss": 0.05354986, + "step": 19014 + }, + { + "epoch": 38.03, + "grad_norm": 1.0039637088775635, + "learning_rate": 2e-05, + "loss": 0.03929551, + "step": 19015 + }, + { + "epoch": 38.032, + "grad_norm": 1.1649113893508911, + "learning_rate": 2e-05, + "loss": 0.04089183, + "step": 19016 + }, + { + "epoch": 38.034, + "grad_norm": 1.1364166736602783, + "learning_rate": 2e-05, + "loss": 0.0420451, + "step": 19017 + }, + { + "epoch": 38.036, + "grad_norm": 1.5335687398910522, + "learning_rate": 2e-05, + "loss": 0.04455702, + "step": 19018 + }, + { + "epoch": 38.038, + "grad_norm": 1.4168477058410645, + "learning_rate": 2e-05, + "loss": 0.05238315, + "step": 19019 + }, + { + "epoch": 38.04, + "grad_norm": 1.2343907356262207, + "learning_rate": 2e-05, + "loss": 0.04423891, + "step": 19020 + }, + { + "epoch": 38.042, + "grad_norm": 1.1726269721984863, + "learning_rate": 2e-05, + "loss": 0.05387194, + "step": 19021 + }, + { + 
"epoch": 38.044, + "grad_norm": 2.447615146636963, + "learning_rate": 2e-05, + "loss": 0.05135684, + "step": 19022 + }, + { + "epoch": 38.046, + "grad_norm": 1.5104469060897827, + "learning_rate": 2e-05, + "loss": 0.043165, + "step": 19023 + }, + { + "epoch": 38.048, + "grad_norm": 1.3275978565216064, + "learning_rate": 2e-05, + "loss": 0.04400131, + "step": 19024 + }, + { + "epoch": 38.05, + "grad_norm": 1.2232415676116943, + "learning_rate": 2e-05, + "loss": 0.04481234, + "step": 19025 + }, + { + "epoch": 38.052, + "grad_norm": 1.1241319179534912, + "learning_rate": 2e-05, + "loss": 0.04850707, + "step": 19026 + }, + { + "epoch": 38.054, + "grad_norm": 1.286199688911438, + "learning_rate": 2e-05, + "loss": 0.04397567, + "step": 19027 + }, + { + "epoch": 38.056, + "grad_norm": 1.2442455291748047, + "learning_rate": 2e-05, + "loss": 0.04638599, + "step": 19028 + }, + { + "epoch": 38.058, + "grad_norm": 1.691164493560791, + "learning_rate": 2e-05, + "loss": 0.05108991, + "step": 19029 + }, + { + "epoch": 38.06, + "grad_norm": 1.2171449661254883, + "learning_rate": 2e-05, + "loss": 0.04477761, + "step": 19030 + }, + { + "epoch": 38.062, + "grad_norm": 1.1396671533584595, + "learning_rate": 2e-05, + "loss": 0.05528203, + "step": 19031 + }, + { + "epoch": 38.064, + "grad_norm": 1.8548091650009155, + "learning_rate": 2e-05, + "loss": 0.05561259, + "step": 19032 + }, + { + "epoch": 38.066, + "grad_norm": 1.0096694231033325, + "learning_rate": 2e-05, + "loss": 0.03706703, + "step": 19033 + }, + { + "epoch": 38.068, + "grad_norm": 1.2540130615234375, + "learning_rate": 2e-05, + "loss": 0.0528646, + "step": 19034 + }, + { + "epoch": 38.07, + "grad_norm": 1.436498999595642, + "learning_rate": 2e-05, + "loss": 0.05726215, + "step": 19035 + }, + { + "epoch": 38.072, + "grad_norm": 1.2053461074829102, + "learning_rate": 2e-05, + "loss": 0.03862919, + "step": 19036 + }, + { + "epoch": 38.074, + "grad_norm": 1.2847776412963867, + "learning_rate": 2e-05, + "loss": 0.04858331, + "step": 19037 + }, + { + "epoch": 38.076, + "grad_norm": 2.5267117023468018, + "learning_rate": 2e-05, + "loss": 0.04583256, + "step": 19038 + }, + { + "epoch": 38.078, + "grad_norm": 1.9496853351593018, + "learning_rate": 2e-05, + "loss": 0.05861641, + "step": 19039 + }, + { + "epoch": 38.08, + "grad_norm": 1.204428791999817, + "learning_rate": 2e-05, + "loss": 0.04567886, + "step": 19040 + }, + { + "epoch": 38.082, + "grad_norm": 1.1530473232269287, + "learning_rate": 2e-05, + "loss": 0.03973469, + "step": 19041 + }, + { + "epoch": 38.084, + "grad_norm": 1.2695164680480957, + "learning_rate": 2e-05, + "loss": 0.05841737, + "step": 19042 + }, + { + "epoch": 38.086, + "grad_norm": 1.5974242687225342, + "learning_rate": 2e-05, + "loss": 0.04161192, + "step": 19043 + }, + { + "epoch": 38.088, + "grad_norm": 1.1564900875091553, + "learning_rate": 2e-05, + "loss": 0.05044547, + "step": 19044 + }, + { + "epoch": 38.09, + "grad_norm": 1.2367781400680542, + "learning_rate": 2e-05, + "loss": 0.03753296, + "step": 19045 + }, + { + "epoch": 38.092, + "grad_norm": 1.287509560585022, + "learning_rate": 2e-05, + "loss": 0.04154661, + "step": 19046 + }, + { + "epoch": 38.094, + "grad_norm": 1.1446880102157593, + "learning_rate": 2e-05, + "loss": 0.03765508, + "step": 19047 + }, + { + "epoch": 38.096, + "grad_norm": 1.177966594696045, + "learning_rate": 2e-05, + "loss": 0.05387244, + "step": 19048 + }, + { + "epoch": 38.098, + "grad_norm": 1.4184907674789429, + "learning_rate": 2e-05, + "loss": 0.03067533, + "step": 19049 + }, + { + "epoch": 
38.1, + "grad_norm": 1.4943373203277588, + "learning_rate": 2e-05, + "loss": 0.05171797, + "step": 19050 + }, + { + "epoch": 38.102, + "grad_norm": 1.4353896379470825, + "learning_rate": 2e-05, + "loss": 0.06505428, + "step": 19051 + }, + { + "epoch": 38.104, + "grad_norm": 1.2791118621826172, + "learning_rate": 2e-05, + "loss": 0.0403844, + "step": 19052 + }, + { + "epoch": 38.106, + "grad_norm": 1.1676243543624878, + "learning_rate": 2e-05, + "loss": 0.05121243, + "step": 19053 + }, + { + "epoch": 38.108, + "grad_norm": 1.0785608291625977, + "learning_rate": 2e-05, + "loss": 0.04726262, + "step": 19054 + }, + { + "epoch": 38.11, + "grad_norm": 0.9102175235748291, + "learning_rate": 2e-05, + "loss": 0.02758621, + "step": 19055 + }, + { + "epoch": 38.112, + "grad_norm": 1.3264470100402832, + "learning_rate": 2e-05, + "loss": 0.05410461, + "step": 19056 + }, + { + "epoch": 38.114, + "grad_norm": 1.18822181224823, + "learning_rate": 2e-05, + "loss": 0.04347523, + "step": 19057 + }, + { + "epoch": 38.116, + "grad_norm": 1.4534834623336792, + "learning_rate": 2e-05, + "loss": 0.04533903, + "step": 19058 + }, + { + "epoch": 38.118, + "grad_norm": 1.193938136100769, + "learning_rate": 2e-05, + "loss": 0.04152393, + "step": 19059 + }, + { + "epoch": 38.12, + "grad_norm": 0.9936333298683167, + "learning_rate": 2e-05, + "loss": 0.02456021, + "step": 19060 + }, + { + "epoch": 38.122, + "grad_norm": 1.4597140550613403, + "learning_rate": 2e-05, + "loss": 0.04886616, + "step": 19061 + }, + { + "epoch": 38.124, + "grad_norm": 1.4201385974884033, + "learning_rate": 2e-05, + "loss": 0.05720758, + "step": 19062 + }, + { + "epoch": 38.126, + "grad_norm": 1.197838544845581, + "learning_rate": 2e-05, + "loss": 0.04132029, + "step": 19063 + }, + { + "epoch": 38.128, + "grad_norm": 2.9631714820861816, + "learning_rate": 2e-05, + "loss": 0.05553018, + "step": 19064 + }, + { + "epoch": 38.13, + "grad_norm": 1.32180655002594, + "learning_rate": 2e-05, + "loss": 0.03765612, + "step": 19065 + }, + { + "epoch": 38.132, + "grad_norm": 1.2723172903060913, + "learning_rate": 2e-05, + "loss": 0.05038822, + "step": 19066 + }, + { + "epoch": 38.134, + "grad_norm": 1.2453999519348145, + "learning_rate": 2e-05, + "loss": 0.04746665, + "step": 19067 + }, + { + "epoch": 38.136, + "grad_norm": 1.2901151180267334, + "learning_rate": 2e-05, + "loss": 0.05191978, + "step": 19068 + }, + { + "epoch": 38.138, + "grad_norm": 1.393450379371643, + "learning_rate": 2e-05, + "loss": 0.04058621, + "step": 19069 + }, + { + "epoch": 38.14, + "grad_norm": 3.79284405708313, + "learning_rate": 2e-05, + "loss": 0.089285, + "step": 19070 + }, + { + "epoch": 38.142, + "grad_norm": 1.231909990310669, + "learning_rate": 2e-05, + "loss": 0.04528357, + "step": 19071 + }, + { + "epoch": 38.144, + "grad_norm": 0.9689210057258606, + "learning_rate": 2e-05, + "loss": 0.03057509, + "step": 19072 + }, + { + "epoch": 38.146, + "grad_norm": 1.169950246810913, + "learning_rate": 2e-05, + "loss": 0.04367265, + "step": 19073 + }, + { + "epoch": 38.148, + "grad_norm": 1.002368688583374, + "learning_rate": 2e-05, + "loss": 0.0362767, + "step": 19074 + }, + { + "epoch": 38.15, + "grad_norm": 1.0449949502944946, + "learning_rate": 2e-05, + "loss": 0.03191894, + "step": 19075 + }, + { + "epoch": 38.152, + "grad_norm": 1.23591947555542, + "learning_rate": 2e-05, + "loss": 0.0624435, + "step": 19076 + }, + { + "epoch": 38.154, + "grad_norm": 1.3809751272201538, + "learning_rate": 2e-05, + "loss": 0.04096212, + "step": 19077 + }, + { + "epoch": 38.156, + "grad_norm": 
1.3251644372940063, + "learning_rate": 2e-05, + "loss": 0.04161641, + "step": 19078 + }, + { + "epoch": 38.158, + "grad_norm": 1.9493850469589233, + "learning_rate": 2e-05, + "loss": 0.04814357, + "step": 19079 + }, + { + "epoch": 38.16, + "grad_norm": 1.3206309080123901, + "learning_rate": 2e-05, + "loss": 0.05091274, + "step": 19080 + }, + { + "epoch": 38.162, + "grad_norm": 1.2875269651412964, + "learning_rate": 2e-05, + "loss": 0.04496816, + "step": 19081 + }, + { + "epoch": 38.164, + "grad_norm": 0.9897270202636719, + "learning_rate": 2e-05, + "loss": 0.03702825, + "step": 19082 + }, + { + "epoch": 38.166, + "grad_norm": 1.2053303718566895, + "learning_rate": 2e-05, + "loss": 0.04418178, + "step": 19083 + }, + { + "epoch": 38.168, + "grad_norm": 1.2316017150878906, + "learning_rate": 2e-05, + "loss": 0.05090442, + "step": 19084 + }, + { + "epoch": 38.17, + "grad_norm": 1.5326013565063477, + "learning_rate": 2e-05, + "loss": 0.0485841, + "step": 19085 + }, + { + "epoch": 38.172, + "grad_norm": 1.6336983442306519, + "learning_rate": 2e-05, + "loss": 0.04477915, + "step": 19086 + }, + { + "epoch": 38.174, + "grad_norm": 1.2164212465286255, + "learning_rate": 2e-05, + "loss": 0.0437744, + "step": 19087 + }, + { + "epoch": 38.176, + "grad_norm": 1.078202724456787, + "learning_rate": 2e-05, + "loss": 0.04401769, + "step": 19088 + }, + { + "epoch": 38.178, + "grad_norm": 1.8545653820037842, + "learning_rate": 2e-05, + "loss": 0.05269055, + "step": 19089 + }, + { + "epoch": 38.18, + "grad_norm": 1.920038104057312, + "learning_rate": 2e-05, + "loss": 0.05271509, + "step": 19090 + }, + { + "epoch": 38.182, + "grad_norm": 1.242497205734253, + "learning_rate": 2e-05, + "loss": 0.0489504, + "step": 19091 + }, + { + "epoch": 38.184, + "grad_norm": 1.3824882507324219, + "learning_rate": 2e-05, + "loss": 0.04757221, + "step": 19092 + }, + { + "epoch": 38.186, + "grad_norm": 1.007732629776001, + "learning_rate": 2e-05, + "loss": 0.03562904, + "step": 19093 + }, + { + "epoch": 38.188, + "grad_norm": 1.1626834869384766, + "learning_rate": 2e-05, + "loss": 0.04818817, + "step": 19094 + }, + { + "epoch": 38.19, + "grad_norm": 2.008382558822632, + "learning_rate": 2e-05, + "loss": 0.05179016, + "step": 19095 + }, + { + "epoch": 38.192, + "grad_norm": 1.0385682582855225, + "learning_rate": 2e-05, + "loss": 0.03288983, + "step": 19096 + }, + { + "epoch": 38.194, + "grad_norm": 1.3801738023757935, + "learning_rate": 2e-05, + "loss": 0.05888782, + "step": 19097 + }, + { + "epoch": 38.196, + "grad_norm": 1.1773533821105957, + "learning_rate": 2e-05, + "loss": 0.04121737, + "step": 19098 + }, + { + "epoch": 38.198, + "grad_norm": 1.0565184354782104, + "learning_rate": 2e-05, + "loss": 0.03833776, + "step": 19099 + }, + { + "epoch": 38.2, + "grad_norm": 1.225994348526001, + "learning_rate": 2e-05, + "loss": 0.04604607, + "step": 19100 + }, + { + "epoch": 38.202, + "grad_norm": 1.1828093528747559, + "learning_rate": 2e-05, + "loss": 0.0390872, + "step": 19101 + }, + { + "epoch": 38.204, + "grad_norm": 1.1246726512908936, + "learning_rate": 2e-05, + "loss": 0.04674789, + "step": 19102 + }, + { + "epoch": 38.206, + "grad_norm": 1.1224784851074219, + "learning_rate": 2e-05, + "loss": 0.04065286, + "step": 19103 + }, + { + "epoch": 38.208, + "grad_norm": 1.1293448209762573, + "learning_rate": 2e-05, + "loss": 0.03387726, + "step": 19104 + }, + { + "epoch": 38.21, + "grad_norm": 1.6799935102462769, + "learning_rate": 2e-05, + "loss": 0.04963982, + "step": 19105 + }, + { + "epoch": 38.212, + "grad_norm": 
0.9128696322441101, + "learning_rate": 2e-05, + "loss": 0.03350312, + "step": 19106 + }, + { + "epoch": 38.214, + "grad_norm": 1.3261280059814453, + "learning_rate": 2e-05, + "loss": 0.04866079, + "step": 19107 + }, + { + "epoch": 38.216, + "grad_norm": 1.1664577722549438, + "learning_rate": 2e-05, + "loss": 0.03899412, + "step": 19108 + }, + { + "epoch": 38.218, + "grad_norm": 1.4886510372161865, + "learning_rate": 2e-05, + "loss": 0.05114747, + "step": 19109 + }, + { + "epoch": 38.22, + "grad_norm": 1.2345974445343018, + "learning_rate": 2e-05, + "loss": 0.05016024, + "step": 19110 + }, + { + "epoch": 38.222, + "grad_norm": 3.129671812057495, + "learning_rate": 2e-05, + "loss": 0.05178972, + "step": 19111 + }, + { + "epoch": 38.224, + "grad_norm": 1.3265200853347778, + "learning_rate": 2e-05, + "loss": 0.04992515, + "step": 19112 + }, + { + "epoch": 38.226, + "grad_norm": 1.359765887260437, + "learning_rate": 2e-05, + "loss": 0.05563714, + "step": 19113 + }, + { + "epoch": 38.228, + "grad_norm": 1.2336148023605347, + "learning_rate": 2e-05, + "loss": 0.03507271, + "step": 19114 + }, + { + "epoch": 38.23, + "grad_norm": 1.2638157606124878, + "learning_rate": 2e-05, + "loss": 0.06374335, + "step": 19115 + }, + { + "epoch": 38.232, + "grad_norm": 1.4349949359893799, + "learning_rate": 2e-05, + "loss": 0.05200667, + "step": 19116 + }, + { + "epoch": 38.234, + "grad_norm": 2.013317584991455, + "learning_rate": 2e-05, + "loss": 0.03400422, + "step": 19117 + }, + { + "epoch": 38.236, + "grad_norm": 1.2802153825759888, + "learning_rate": 2e-05, + "loss": 0.03919921, + "step": 19118 + }, + { + "epoch": 38.238, + "grad_norm": 0.9960325360298157, + "learning_rate": 2e-05, + "loss": 0.03037401, + "step": 19119 + }, + { + "epoch": 38.24, + "grad_norm": 2.0125842094421387, + "learning_rate": 2e-05, + "loss": 0.05429566, + "step": 19120 + }, + { + "epoch": 38.242, + "grad_norm": 1.291225552558899, + "learning_rate": 2e-05, + "loss": 0.05072563, + "step": 19121 + }, + { + "epoch": 38.244, + "grad_norm": 0.9041070938110352, + "learning_rate": 2e-05, + "loss": 0.03640107, + "step": 19122 + }, + { + "epoch": 38.246, + "grad_norm": 2.185776710510254, + "learning_rate": 2e-05, + "loss": 0.04717995, + "step": 19123 + }, + { + "epoch": 38.248, + "grad_norm": 1.4118475914001465, + "learning_rate": 2e-05, + "loss": 0.03875816, + "step": 19124 + }, + { + "epoch": 38.25, + "grad_norm": 1.0311429500579834, + "learning_rate": 2e-05, + "loss": 0.05210552, + "step": 19125 + }, + { + "epoch": 38.252, + "grad_norm": 0.9929075241088867, + "learning_rate": 2e-05, + "loss": 0.04484344, + "step": 19126 + }, + { + "epoch": 38.254, + "grad_norm": 1.266503930091858, + "learning_rate": 2e-05, + "loss": 0.05835935, + "step": 19127 + }, + { + "epoch": 38.256, + "grad_norm": 1.0839781761169434, + "learning_rate": 2e-05, + "loss": 0.04155935, + "step": 19128 + }, + { + "epoch": 38.258, + "grad_norm": 1.6843171119689941, + "learning_rate": 2e-05, + "loss": 0.05540327, + "step": 19129 + }, + { + "epoch": 38.26, + "grad_norm": 1.0930052995681763, + "learning_rate": 2e-05, + "loss": 0.04292402, + "step": 19130 + }, + { + "epoch": 38.262, + "grad_norm": 1.0712100267410278, + "learning_rate": 2e-05, + "loss": 0.0455328, + "step": 19131 + }, + { + "epoch": 38.264, + "grad_norm": 1.5520873069763184, + "learning_rate": 2e-05, + "loss": 0.05642373, + "step": 19132 + }, + { + "epoch": 38.266, + "grad_norm": 1.319484829902649, + "learning_rate": 2e-05, + "loss": 0.05045997, + "step": 19133 + }, + { + "epoch": 38.268, + "grad_norm": 
1.1289286613464355, + "learning_rate": 2e-05, + "loss": 0.03494524, + "step": 19134 + }, + { + "epoch": 38.27, + "grad_norm": 2.3391904830932617, + "learning_rate": 2e-05, + "loss": 0.05442752, + "step": 19135 + }, + { + "epoch": 38.272, + "grad_norm": 1.3560073375701904, + "learning_rate": 2e-05, + "loss": 0.0621442, + "step": 19136 + }, + { + "epoch": 38.274, + "grad_norm": 1.094653844833374, + "learning_rate": 2e-05, + "loss": 0.04534341, + "step": 19137 + }, + { + "epoch": 38.276, + "grad_norm": 1.2565248012542725, + "learning_rate": 2e-05, + "loss": 0.042792, + "step": 19138 + }, + { + "epoch": 38.278, + "grad_norm": 1.1656410694122314, + "learning_rate": 2e-05, + "loss": 0.0445144, + "step": 19139 + }, + { + "epoch": 38.28, + "grad_norm": 1.1632707118988037, + "learning_rate": 2e-05, + "loss": 0.04515301, + "step": 19140 + }, + { + "epoch": 38.282, + "grad_norm": 1.305557370185852, + "learning_rate": 2e-05, + "loss": 0.04854187, + "step": 19141 + }, + { + "epoch": 38.284, + "grad_norm": 1.1423861980438232, + "learning_rate": 2e-05, + "loss": 0.04725296, + "step": 19142 + }, + { + "epoch": 38.286, + "grad_norm": 1.8827954530715942, + "learning_rate": 2e-05, + "loss": 0.05522726, + "step": 19143 + }, + { + "epoch": 38.288, + "grad_norm": 1.2071250677108765, + "learning_rate": 2e-05, + "loss": 0.03631373, + "step": 19144 + }, + { + "epoch": 38.29, + "grad_norm": 1.5091750621795654, + "learning_rate": 2e-05, + "loss": 0.04798866, + "step": 19145 + }, + { + "epoch": 38.292, + "grad_norm": 1.3303464651107788, + "learning_rate": 2e-05, + "loss": 0.0500569, + "step": 19146 + }, + { + "epoch": 38.294, + "grad_norm": 1.247398018836975, + "learning_rate": 2e-05, + "loss": 0.04999819, + "step": 19147 + }, + { + "epoch": 38.296, + "grad_norm": 1.242234230041504, + "learning_rate": 2e-05, + "loss": 0.0521092, + "step": 19148 + }, + { + "epoch": 38.298, + "grad_norm": 1.7514032125473022, + "learning_rate": 2e-05, + "loss": 0.04611446, + "step": 19149 + }, + { + "epoch": 38.3, + "grad_norm": 2.0535500049591064, + "learning_rate": 2e-05, + "loss": 0.04753162, + "step": 19150 + }, + { + "epoch": 38.302, + "grad_norm": 1.1260801553726196, + "learning_rate": 2e-05, + "loss": 0.03980887, + "step": 19151 + }, + { + "epoch": 38.304, + "grad_norm": 1.2702425718307495, + "learning_rate": 2e-05, + "loss": 0.0628961, + "step": 19152 + }, + { + "epoch": 38.306, + "grad_norm": 1.7497895956039429, + "learning_rate": 2e-05, + "loss": 0.04326449, + "step": 19153 + }, + { + "epoch": 38.308, + "grad_norm": 1.2419395446777344, + "learning_rate": 2e-05, + "loss": 0.06270743, + "step": 19154 + }, + { + "epoch": 38.31, + "grad_norm": 2.1101601123809814, + "learning_rate": 2e-05, + "loss": 0.04829663, + "step": 19155 + }, + { + "epoch": 38.312, + "grad_norm": 1.3105263710021973, + "learning_rate": 2e-05, + "loss": 0.05578191, + "step": 19156 + }, + { + "epoch": 38.314, + "grad_norm": 1.2866804599761963, + "learning_rate": 2e-05, + "loss": 0.04316871, + "step": 19157 + }, + { + "epoch": 38.316, + "grad_norm": 1.2355014085769653, + "learning_rate": 2e-05, + "loss": 0.06234382, + "step": 19158 + }, + { + "epoch": 38.318, + "grad_norm": 1.1769856214523315, + "learning_rate": 2e-05, + "loss": 0.02845784, + "step": 19159 + }, + { + "epoch": 38.32, + "grad_norm": 1.315886378288269, + "learning_rate": 2e-05, + "loss": 0.05123859, + "step": 19160 + }, + { + "epoch": 38.322, + "grad_norm": 1.1736537218093872, + "learning_rate": 2e-05, + "loss": 0.05540813, + "step": 19161 + }, + { + "epoch": 38.324, + "grad_norm": 
1.1168758869171143, + "learning_rate": 2e-05, + "loss": 0.04026729, + "step": 19162 + }, + { + "epoch": 38.326, + "grad_norm": 1.267259120941162, + "learning_rate": 2e-05, + "loss": 0.05570924, + "step": 19163 + }, + { + "epoch": 38.328, + "grad_norm": 0.9030328989028931, + "learning_rate": 2e-05, + "loss": 0.03402457, + "step": 19164 + }, + { + "epoch": 38.33, + "grad_norm": 1.660753846168518, + "learning_rate": 2e-05, + "loss": 0.06479599, + "step": 19165 + }, + { + "epoch": 38.332, + "grad_norm": 1.6835894584655762, + "learning_rate": 2e-05, + "loss": 0.05633393, + "step": 19166 + }, + { + "epoch": 38.334, + "grad_norm": 1.3103505373001099, + "learning_rate": 2e-05, + "loss": 0.05098883, + "step": 19167 + }, + { + "epoch": 38.336, + "grad_norm": 1.1868908405303955, + "learning_rate": 2e-05, + "loss": 0.04712537, + "step": 19168 + }, + { + "epoch": 38.338, + "grad_norm": 1.0892060995101929, + "learning_rate": 2e-05, + "loss": 0.05058462, + "step": 19169 + }, + { + "epoch": 38.34, + "grad_norm": 1.2633219957351685, + "learning_rate": 2e-05, + "loss": 0.05579946, + "step": 19170 + }, + { + "epoch": 38.342, + "grad_norm": 1.1790961027145386, + "learning_rate": 2e-05, + "loss": 0.04451218, + "step": 19171 + }, + { + "epoch": 38.344, + "grad_norm": 1.1820404529571533, + "learning_rate": 2e-05, + "loss": 0.04325217, + "step": 19172 + }, + { + "epoch": 38.346, + "grad_norm": 1.4797415733337402, + "learning_rate": 2e-05, + "loss": 0.04494851, + "step": 19173 + }, + { + "epoch": 38.348, + "grad_norm": 1.6756718158721924, + "learning_rate": 2e-05, + "loss": 0.04998035, + "step": 19174 + }, + { + "epoch": 38.35, + "grad_norm": 1.0109578371047974, + "learning_rate": 2e-05, + "loss": 0.04028856, + "step": 19175 + }, + { + "epoch": 38.352, + "grad_norm": 1.486900806427002, + "learning_rate": 2e-05, + "loss": 0.04508706, + "step": 19176 + }, + { + "epoch": 38.354, + "grad_norm": 1.2009479999542236, + "learning_rate": 2e-05, + "loss": 0.05440203, + "step": 19177 + }, + { + "epoch": 38.356, + "grad_norm": 1.3738353252410889, + "learning_rate": 2e-05, + "loss": 0.04805979, + "step": 19178 + }, + { + "epoch": 38.358, + "grad_norm": 1.2725608348846436, + "learning_rate": 2e-05, + "loss": 0.05642783, + "step": 19179 + }, + { + "epoch": 38.36, + "grad_norm": 1.2467677593231201, + "learning_rate": 2e-05, + "loss": 0.07003997, + "step": 19180 + }, + { + "epoch": 38.362, + "grad_norm": 0.9856001734733582, + "learning_rate": 2e-05, + "loss": 0.0346032, + "step": 19181 + }, + { + "epoch": 38.364, + "grad_norm": 1.4484895467758179, + "learning_rate": 2e-05, + "loss": 0.0645525, + "step": 19182 + }, + { + "epoch": 38.366, + "grad_norm": 3.5123982429504395, + "learning_rate": 2e-05, + "loss": 0.03936257, + "step": 19183 + }, + { + "epoch": 38.368, + "grad_norm": 1.4622162580490112, + "learning_rate": 2e-05, + "loss": 0.03753219, + "step": 19184 + }, + { + "epoch": 38.37, + "grad_norm": 1.103468656539917, + "learning_rate": 2e-05, + "loss": 0.04038829, + "step": 19185 + }, + { + "epoch": 38.372, + "grad_norm": 1.114812970161438, + "learning_rate": 2e-05, + "loss": 0.04476861, + "step": 19186 + }, + { + "epoch": 38.374, + "grad_norm": 1.14467453956604, + "learning_rate": 2e-05, + "loss": 0.03987251, + "step": 19187 + }, + { + "epoch": 38.376, + "grad_norm": 1.4186773300170898, + "learning_rate": 2e-05, + "loss": 0.05686907, + "step": 19188 + }, + { + "epoch": 38.378, + "grad_norm": 2.0654118061065674, + "learning_rate": 2e-05, + "loss": 0.05757586, + "step": 19189 + }, + { + "epoch": 38.38, + "grad_norm": 
4.5302414894104, + "learning_rate": 2e-05, + "loss": 0.04920984, + "step": 19190 + }, + { + "epoch": 38.382, + "grad_norm": 1.2781339883804321, + "learning_rate": 2e-05, + "loss": 0.04979723, + "step": 19191 + }, + { + "epoch": 38.384, + "grad_norm": 1.0826224088668823, + "learning_rate": 2e-05, + "loss": 0.03591222, + "step": 19192 + }, + { + "epoch": 38.386, + "grad_norm": 1.9421765804290771, + "learning_rate": 2e-05, + "loss": 0.05677667, + "step": 19193 + }, + { + "epoch": 38.388, + "grad_norm": 1.4869078397750854, + "learning_rate": 2e-05, + "loss": 0.05089599, + "step": 19194 + }, + { + "epoch": 38.39, + "grad_norm": 0.8969262838363647, + "learning_rate": 2e-05, + "loss": 0.02550054, + "step": 19195 + }, + { + "epoch": 38.392, + "grad_norm": 2.2101852893829346, + "learning_rate": 2e-05, + "loss": 0.07438694, + "step": 19196 + }, + { + "epoch": 38.394, + "grad_norm": 1.3274297714233398, + "learning_rate": 2e-05, + "loss": 0.03078259, + "step": 19197 + }, + { + "epoch": 38.396, + "grad_norm": 1.6053683757781982, + "learning_rate": 2e-05, + "loss": 0.06297421, + "step": 19198 + }, + { + "epoch": 38.398, + "grad_norm": 1.0665053129196167, + "learning_rate": 2e-05, + "loss": 0.03194698, + "step": 19199 + }, + { + "epoch": 38.4, + "grad_norm": 1.062880277633667, + "learning_rate": 2e-05, + "loss": 0.03726966, + "step": 19200 + }, + { + "epoch": 38.402, + "grad_norm": 1.6072918176651, + "learning_rate": 2e-05, + "loss": 0.04004788, + "step": 19201 + }, + { + "epoch": 38.404, + "grad_norm": 1.026965856552124, + "learning_rate": 2e-05, + "loss": 0.04006281, + "step": 19202 + }, + { + "epoch": 38.406, + "grad_norm": 1.131760597229004, + "learning_rate": 2e-05, + "loss": 0.04987944, + "step": 19203 + }, + { + "epoch": 38.408, + "grad_norm": 1.1853435039520264, + "learning_rate": 2e-05, + "loss": 0.04732582, + "step": 19204 + }, + { + "epoch": 38.41, + "grad_norm": 1.2606431245803833, + "learning_rate": 2e-05, + "loss": 0.04184794, + "step": 19205 + }, + { + "epoch": 38.412, + "grad_norm": 1.06600022315979, + "learning_rate": 2e-05, + "loss": 0.04323639, + "step": 19206 + }, + { + "epoch": 38.414, + "grad_norm": 1.5144490003585815, + "learning_rate": 2e-05, + "loss": 0.05575287, + "step": 19207 + }, + { + "epoch": 38.416, + "grad_norm": 1.3510489463806152, + "learning_rate": 2e-05, + "loss": 0.05808666, + "step": 19208 + }, + { + "epoch": 38.418, + "grad_norm": 1.2126901149749756, + "learning_rate": 2e-05, + "loss": 0.04525844, + "step": 19209 + }, + { + "epoch": 38.42, + "grad_norm": 1.9178175926208496, + "learning_rate": 2e-05, + "loss": 0.04605162, + "step": 19210 + }, + { + "epoch": 38.422, + "grad_norm": 1.0999705791473389, + "learning_rate": 2e-05, + "loss": 0.03449193, + "step": 19211 + }, + { + "epoch": 38.424, + "grad_norm": 1.1063787937164307, + "learning_rate": 2e-05, + "loss": 0.04200126, + "step": 19212 + }, + { + "epoch": 38.426, + "grad_norm": 1.1517763137817383, + "learning_rate": 2e-05, + "loss": 0.03924587, + "step": 19213 + }, + { + "epoch": 38.428, + "grad_norm": 1.4197931289672852, + "learning_rate": 2e-05, + "loss": 0.06597655, + "step": 19214 + }, + { + "epoch": 38.43, + "grad_norm": 1.267537236213684, + "learning_rate": 2e-05, + "loss": 0.04415938, + "step": 19215 + }, + { + "epoch": 38.432, + "grad_norm": 1.7044909000396729, + "learning_rate": 2e-05, + "loss": 0.05466124, + "step": 19216 + }, + { + "epoch": 38.434, + "grad_norm": 1.201332449913025, + "learning_rate": 2e-05, + "loss": 0.04956157, + "step": 19217 + }, + { + "epoch": 38.436, + "grad_norm": 
1.2084065675735474, + "learning_rate": 2e-05, + "loss": 0.05828648, + "step": 19218 + }, + { + "epoch": 38.438, + "grad_norm": 0.9206735491752625, + "learning_rate": 2e-05, + "loss": 0.0277809, + "step": 19219 + }, + { + "epoch": 38.44, + "grad_norm": 1.2664018869400024, + "learning_rate": 2e-05, + "loss": 0.05090235, + "step": 19220 + }, + { + "epoch": 38.442, + "grad_norm": 1.4958431720733643, + "learning_rate": 2e-05, + "loss": 0.03930274, + "step": 19221 + }, + { + "epoch": 38.444, + "grad_norm": 1.0052601099014282, + "learning_rate": 2e-05, + "loss": 0.03331514, + "step": 19222 + }, + { + "epoch": 38.446, + "grad_norm": 1.4637385606765747, + "learning_rate": 2e-05, + "loss": 0.03536751, + "step": 19223 + }, + { + "epoch": 38.448, + "grad_norm": 2.041100263595581, + "learning_rate": 2e-05, + "loss": 0.05582372, + "step": 19224 + }, + { + "epoch": 38.45, + "grad_norm": 2.045063018798828, + "learning_rate": 2e-05, + "loss": 0.05946861, + "step": 19225 + }, + { + "epoch": 38.452, + "grad_norm": 1.763791799545288, + "learning_rate": 2e-05, + "loss": 0.03987541, + "step": 19226 + }, + { + "epoch": 38.454, + "grad_norm": 1.3544304370880127, + "learning_rate": 2e-05, + "loss": 0.04928677, + "step": 19227 + }, + { + "epoch": 38.456, + "grad_norm": 1.0796279907226562, + "learning_rate": 2e-05, + "loss": 0.04805015, + "step": 19228 + }, + { + "epoch": 38.458, + "grad_norm": 1.1521120071411133, + "learning_rate": 2e-05, + "loss": 0.0500932, + "step": 19229 + }, + { + "epoch": 38.46, + "grad_norm": 1.0432672500610352, + "learning_rate": 2e-05, + "loss": 0.0380001, + "step": 19230 + }, + { + "epoch": 38.462, + "grad_norm": 1.323717474937439, + "learning_rate": 2e-05, + "loss": 0.04665473, + "step": 19231 + }, + { + "epoch": 38.464, + "grad_norm": 1.5594381093978882, + "learning_rate": 2e-05, + "loss": 0.0410453, + "step": 19232 + }, + { + "epoch": 38.466, + "grad_norm": 1.2671560049057007, + "learning_rate": 2e-05, + "loss": 0.04604392, + "step": 19233 + }, + { + "epoch": 38.468, + "grad_norm": 2.227236032485962, + "learning_rate": 2e-05, + "loss": 0.06509386, + "step": 19234 + }, + { + "epoch": 38.47, + "grad_norm": 1.537432074546814, + "learning_rate": 2e-05, + "loss": 0.05568233, + "step": 19235 + }, + { + "epoch": 38.472, + "grad_norm": 1.411525011062622, + "learning_rate": 2e-05, + "loss": 0.05239115, + "step": 19236 + }, + { + "epoch": 38.474, + "grad_norm": 1.2611944675445557, + "learning_rate": 2e-05, + "loss": 0.03326932, + "step": 19237 + }, + { + "epoch": 38.476, + "grad_norm": 1.2077014446258545, + "learning_rate": 2e-05, + "loss": 0.05218486, + "step": 19238 + }, + { + "epoch": 38.478, + "grad_norm": 1.6436618566513062, + "learning_rate": 2e-05, + "loss": 0.05138172, + "step": 19239 + }, + { + "epoch": 38.48, + "grad_norm": 1.2838315963745117, + "learning_rate": 2e-05, + "loss": 0.04669505, + "step": 19240 + }, + { + "epoch": 38.482, + "grad_norm": 1.3253170251846313, + "learning_rate": 2e-05, + "loss": 0.05545774, + "step": 19241 + }, + { + "epoch": 38.484, + "grad_norm": 1.3036000728607178, + "learning_rate": 2e-05, + "loss": 0.05103177, + "step": 19242 + }, + { + "epoch": 38.486, + "grad_norm": 1.1623722314834595, + "learning_rate": 2e-05, + "loss": 0.04469373, + "step": 19243 + }, + { + "epoch": 38.488, + "grad_norm": 1.3823708295822144, + "learning_rate": 2e-05, + "loss": 0.05241356, + "step": 19244 + }, + { + "epoch": 38.49, + "grad_norm": 1.8385578393936157, + "learning_rate": 2e-05, + "loss": 0.05396533, + "step": 19245 + }, + { + "epoch": 38.492, + "grad_norm": 
1.2699154615402222, + "learning_rate": 2e-05, + "loss": 0.05115933, + "step": 19246 + }, + { + "epoch": 38.494, + "grad_norm": 1.23024582862854, + "learning_rate": 2e-05, + "loss": 0.03833042, + "step": 19247 + }, + { + "epoch": 38.496, + "grad_norm": 1.3315479755401611, + "learning_rate": 2e-05, + "loss": 0.0449665, + "step": 19248 + }, + { + "epoch": 38.498, + "grad_norm": 1.5424014329910278, + "learning_rate": 2e-05, + "loss": 0.04347795, + "step": 19249 + }, + { + "epoch": 38.5, + "grad_norm": 1.245294213294983, + "learning_rate": 2e-05, + "loss": 0.05182352, + "step": 19250 + }, + { + "epoch": 38.502, + "grad_norm": 2.529345750808716, + "learning_rate": 2e-05, + "loss": 0.04279209, + "step": 19251 + }, + { + "epoch": 38.504, + "grad_norm": 1.083033800125122, + "learning_rate": 2e-05, + "loss": 0.04436203, + "step": 19252 + }, + { + "epoch": 38.506, + "grad_norm": 1.3808947801589966, + "learning_rate": 2e-05, + "loss": 0.06541286, + "step": 19253 + }, + { + "epoch": 38.508, + "grad_norm": 1.0596139430999756, + "learning_rate": 2e-05, + "loss": 0.03044969, + "step": 19254 + }, + { + "epoch": 38.51, + "grad_norm": 1.435849666595459, + "learning_rate": 2e-05, + "loss": 0.06559452, + "step": 19255 + }, + { + "epoch": 38.512, + "grad_norm": 1.3231841325759888, + "learning_rate": 2e-05, + "loss": 0.05220275, + "step": 19256 + }, + { + "epoch": 38.514, + "grad_norm": 1.2284640073776245, + "learning_rate": 2e-05, + "loss": 0.05421726, + "step": 19257 + }, + { + "epoch": 38.516, + "grad_norm": 1.0423113107681274, + "learning_rate": 2e-05, + "loss": 0.0366753, + "step": 19258 + }, + { + "epoch": 38.518, + "grad_norm": 1.5633624792099, + "learning_rate": 2e-05, + "loss": 0.05434642, + "step": 19259 + }, + { + "epoch": 38.52, + "grad_norm": 2.032729148864746, + "learning_rate": 2e-05, + "loss": 0.04470177, + "step": 19260 + }, + { + "epoch": 38.522, + "grad_norm": 1.3375166654586792, + "learning_rate": 2e-05, + "loss": 0.04499324, + "step": 19261 + }, + { + "epoch": 38.524, + "grad_norm": 1.552940011024475, + "learning_rate": 2e-05, + "loss": 0.05302621, + "step": 19262 + }, + { + "epoch": 38.526, + "grad_norm": 1.896203637123108, + "learning_rate": 2e-05, + "loss": 0.05639236, + "step": 19263 + }, + { + "epoch": 38.528, + "grad_norm": 1.3673580884933472, + "learning_rate": 2e-05, + "loss": 0.05501523, + "step": 19264 + }, + { + "epoch": 38.53, + "grad_norm": 1.3088792562484741, + "learning_rate": 2e-05, + "loss": 0.04335637, + "step": 19265 + }, + { + "epoch": 38.532, + "grad_norm": 1.1292967796325684, + "learning_rate": 2e-05, + "loss": 0.04752572, + "step": 19266 + }, + { + "epoch": 38.534, + "grad_norm": 1.1001681089401245, + "learning_rate": 2e-05, + "loss": 0.04105626, + "step": 19267 + }, + { + "epoch": 38.536, + "grad_norm": 1.1444545984268188, + "learning_rate": 2e-05, + "loss": 0.03688351, + "step": 19268 + }, + { + "epoch": 38.538, + "grad_norm": 1.626502275466919, + "learning_rate": 2e-05, + "loss": 0.0603931, + "step": 19269 + }, + { + "epoch": 38.54, + "grad_norm": 2.7609822750091553, + "learning_rate": 2e-05, + "loss": 0.04431499, + "step": 19270 + }, + { + "epoch": 38.542, + "grad_norm": 1.9874058961868286, + "learning_rate": 2e-05, + "loss": 0.04629558, + "step": 19271 + }, + { + "epoch": 38.544, + "grad_norm": 1.3202643394470215, + "learning_rate": 2e-05, + "loss": 0.0395207, + "step": 19272 + }, + { + "epoch": 38.546, + "grad_norm": 3.389218807220459, + "learning_rate": 2e-05, + "loss": 0.03313368, + "step": 19273 + }, + { + "epoch": 38.548, + "grad_norm": 1.8491859436035156, + 
"learning_rate": 2e-05, + "loss": 0.05389631, + "step": 19274 + }, + { + "epoch": 38.55, + "grad_norm": 1.5536370277404785, + "learning_rate": 2e-05, + "loss": 0.04745733, + "step": 19275 + }, + { + "epoch": 38.552, + "grad_norm": 1.2798070907592773, + "learning_rate": 2e-05, + "loss": 0.04839146, + "step": 19276 + }, + { + "epoch": 38.554, + "grad_norm": 1.372031807899475, + "learning_rate": 2e-05, + "loss": 0.04501195, + "step": 19277 + }, + { + "epoch": 38.556, + "grad_norm": 2.0433499813079834, + "learning_rate": 2e-05, + "loss": 0.04445722, + "step": 19278 + }, + { + "epoch": 38.558, + "grad_norm": 2.050704002380371, + "learning_rate": 2e-05, + "loss": 0.06163099, + "step": 19279 + }, + { + "epoch": 38.56, + "grad_norm": 1.116773247718811, + "learning_rate": 2e-05, + "loss": 0.04683295, + "step": 19280 + }, + { + "epoch": 38.562, + "grad_norm": 1.1566107273101807, + "learning_rate": 2e-05, + "loss": 0.04381316, + "step": 19281 + }, + { + "epoch": 38.564, + "grad_norm": 1.1835641860961914, + "learning_rate": 2e-05, + "loss": 0.05227757, + "step": 19282 + }, + { + "epoch": 38.566, + "grad_norm": 1.1514893770217896, + "learning_rate": 2e-05, + "loss": 0.04288551, + "step": 19283 + }, + { + "epoch": 38.568, + "grad_norm": 0.9600772857666016, + "learning_rate": 2e-05, + "loss": 0.03298654, + "step": 19284 + }, + { + "epoch": 38.57, + "grad_norm": 1.2036709785461426, + "learning_rate": 2e-05, + "loss": 0.050576, + "step": 19285 + }, + { + "epoch": 38.572, + "grad_norm": 3.4883370399475098, + "learning_rate": 2e-05, + "loss": 0.05142732, + "step": 19286 + }, + { + "epoch": 38.574, + "grad_norm": 1.1536023616790771, + "learning_rate": 2e-05, + "loss": 0.04423029, + "step": 19287 + }, + { + "epoch": 38.576, + "grad_norm": 1.2535980939865112, + "learning_rate": 2e-05, + "loss": 0.04615848, + "step": 19288 + }, + { + "epoch": 38.578, + "grad_norm": 1.3373180627822876, + "learning_rate": 2e-05, + "loss": 0.0535438, + "step": 19289 + }, + { + "epoch": 38.58, + "grad_norm": 1.3583929538726807, + "learning_rate": 2e-05, + "loss": 0.04743036, + "step": 19290 + }, + { + "epoch": 38.582, + "grad_norm": 1.2381739616394043, + "learning_rate": 2e-05, + "loss": 0.04024129, + "step": 19291 + }, + { + "epoch": 38.584, + "grad_norm": 1.3908512592315674, + "learning_rate": 2e-05, + "loss": 0.03594154, + "step": 19292 + }, + { + "epoch": 38.586, + "grad_norm": 2.500706911087036, + "learning_rate": 2e-05, + "loss": 0.0516097, + "step": 19293 + }, + { + "epoch": 38.588, + "grad_norm": 1.6076672077178955, + "learning_rate": 2e-05, + "loss": 0.04902358, + "step": 19294 + }, + { + "epoch": 38.59, + "grad_norm": 1.1396098136901855, + "learning_rate": 2e-05, + "loss": 0.04243046, + "step": 19295 + }, + { + "epoch": 38.592, + "grad_norm": 1.1097677946090698, + "learning_rate": 2e-05, + "loss": 0.04250278, + "step": 19296 + }, + { + "epoch": 38.594, + "grad_norm": 1.5975714921951294, + "learning_rate": 2e-05, + "loss": 0.05274806, + "step": 19297 + }, + { + "epoch": 38.596, + "grad_norm": 1.1447633504867554, + "learning_rate": 2e-05, + "loss": 0.03151781, + "step": 19298 + }, + { + "epoch": 38.598, + "grad_norm": 0.938985288143158, + "learning_rate": 2e-05, + "loss": 0.03843331, + "step": 19299 + }, + { + "epoch": 38.6, + "grad_norm": 2.130866050720215, + "learning_rate": 2e-05, + "loss": 0.06860761, + "step": 19300 + }, + { + "epoch": 38.602, + "grad_norm": 1.0700651407241821, + "learning_rate": 2e-05, + "loss": 0.03487083, + "step": 19301 + }, + { + "epoch": 38.604, + "grad_norm": 1.744493007659912, + "learning_rate": 
2e-05, + "loss": 0.06696406, + "step": 19302 + }, + { + "epoch": 38.606, + "grad_norm": 1.0361813306808472, + "learning_rate": 2e-05, + "loss": 0.04734627, + "step": 19303 + }, + { + "epoch": 38.608, + "grad_norm": 2.7028353214263916, + "learning_rate": 2e-05, + "loss": 0.0526311, + "step": 19304 + }, + { + "epoch": 38.61, + "grad_norm": 1.9693485498428345, + "learning_rate": 2e-05, + "loss": 0.04218089, + "step": 19305 + }, + { + "epoch": 38.612, + "grad_norm": 1.2493609189987183, + "learning_rate": 2e-05, + "loss": 0.05733298, + "step": 19306 + }, + { + "epoch": 38.614, + "grad_norm": 1.406848669052124, + "learning_rate": 2e-05, + "loss": 0.04433504, + "step": 19307 + }, + { + "epoch": 38.616, + "grad_norm": 1.199169397354126, + "learning_rate": 2e-05, + "loss": 0.04781777, + "step": 19308 + }, + { + "epoch": 38.618, + "grad_norm": 1.0775588750839233, + "learning_rate": 2e-05, + "loss": 0.0368211, + "step": 19309 + }, + { + "epoch": 38.62, + "grad_norm": 1.0789852142333984, + "learning_rate": 2e-05, + "loss": 0.05344281, + "step": 19310 + }, + { + "epoch": 38.622, + "grad_norm": 1.7068625688552856, + "learning_rate": 2e-05, + "loss": 0.03457305, + "step": 19311 + }, + { + "epoch": 38.624, + "grad_norm": 1.6094670295715332, + "learning_rate": 2e-05, + "loss": 0.04848836, + "step": 19312 + }, + { + "epoch": 38.626, + "grad_norm": 1.0214130878448486, + "learning_rate": 2e-05, + "loss": 0.03269782, + "step": 19313 + }, + { + "epoch": 38.628, + "grad_norm": 2.0523335933685303, + "learning_rate": 2e-05, + "loss": 0.04664942, + "step": 19314 + }, + { + "epoch": 38.63, + "grad_norm": 1.2456196546554565, + "learning_rate": 2e-05, + "loss": 0.03976301, + "step": 19315 + }, + { + "epoch": 38.632, + "grad_norm": 1.076731562614441, + "learning_rate": 2e-05, + "loss": 0.03696981, + "step": 19316 + }, + { + "epoch": 38.634, + "grad_norm": 1.4856189489364624, + "learning_rate": 2e-05, + "loss": 0.05270289, + "step": 19317 + }, + { + "epoch": 38.636, + "grad_norm": 1.0082216262817383, + "learning_rate": 2e-05, + "loss": 0.03996618, + "step": 19318 + }, + { + "epoch": 38.638, + "grad_norm": 1.2000486850738525, + "learning_rate": 2e-05, + "loss": 0.05204409, + "step": 19319 + }, + { + "epoch": 38.64, + "grad_norm": 1.0704103708267212, + "learning_rate": 2e-05, + "loss": 0.04252416, + "step": 19320 + }, + { + "epoch": 38.642, + "grad_norm": 1.4279651641845703, + "learning_rate": 2e-05, + "loss": 0.05704008, + "step": 19321 + }, + { + "epoch": 38.644, + "grad_norm": 1.1782804727554321, + "learning_rate": 2e-05, + "loss": 0.04499515, + "step": 19322 + }, + { + "epoch": 38.646, + "grad_norm": 1.00314199924469, + "learning_rate": 2e-05, + "loss": 0.03433149, + "step": 19323 + }, + { + "epoch": 38.648, + "grad_norm": 1.2116609811782837, + "learning_rate": 2e-05, + "loss": 0.04214825, + "step": 19324 + }, + { + "epoch": 38.65, + "grad_norm": 1.176286220550537, + "learning_rate": 2e-05, + "loss": 0.04334562, + "step": 19325 + }, + { + "epoch": 38.652, + "grad_norm": 1.6107664108276367, + "learning_rate": 2e-05, + "loss": 0.04943176, + "step": 19326 + }, + { + "epoch": 38.654, + "grad_norm": 1.3699370622634888, + "learning_rate": 2e-05, + "loss": 0.05509346, + "step": 19327 + }, + { + "epoch": 38.656, + "grad_norm": 2.1541030406951904, + "learning_rate": 2e-05, + "loss": 0.0614479, + "step": 19328 + }, + { + "epoch": 38.658, + "grad_norm": 1.3070214986801147, + "learning_rate": 2e-05, + "loss": 0.04495452, + "step": 19329 + }, + { + "epoch": 38.66, + "grad_norm": 1.4150477647781372, + "learning_rate": 2e-05, + 
"loss": 0.04712527, + "step": 19330 + }, + { + "epoch": 38.662, + "grad_norm": 2.080083131790161, + "learning_rate": 2e-05, + "loss": 0.04044828, + "step": 19331 + }, + { + "epoch": 38.664, + "grad_norm": 1.225448489189148, + "learning_rate": 2e-05, + "loss": 0.05349731, + "step": 19332 + }, + { + "epoch": 38.666, + "grad_norm": 1.4030938148498535, + "learning_rate": 2e-05, + "loss": 0.04746981, + "step": 19333 + }, + { + "epoch": 38.668, + "grad_norm": 1.162904143333435, + "learning_rate": 2e-05, + "loss": 0.05157799, + "step": 19334 + }, + { + "epoch": 38.67, + "grad_norm": 1.4816622734069824, + "learning_rate": 2e-05, + "loss": 0.04534583, + "step": 19335 + }, + { + "epoch": 38.672, + "grad_norm": 3.349114179611206, + "learning_rate": 2e-05, + "loss": 0.04592524, + "step": 19336 + }, + { + "epoch": 38.674, + "grad_norm": 1.2360336780548096, + "learning_rate": 2e-05, + "loss": 0.04279222, + "step": 19337 + }, + { + "epoch": 38.676, + "grad_norm": 2.137404441833496, + "learning_rate": 2e-05, + "loss": 0.04810013, + "step": 19338 + }, + { + "epoch": 38.678, + "grad_norm": 1.7355518341064453, + "learning_rate": 2e-05, + "loss": 0.0334636, + "step": 19339 + }, + { + "epoch": 38.68, + "grad_norm": 1.9419608116149902, + "learning_rate": 2e-05, + "loss": 0.05887587, + "step": 19340 + }, + { + "epoch": 38.682, + "grad_norm": 1.431899905204773, + "learning_rate": 2e-05, + "loss": 0.05057277, + "step": 19341 + }, + { + "epoch": 38.684, + "grad_norm": 1.284608244895935, + "learning_rate": 2e-05, + "loss": 0.04898845, + "step": 19342 + }, + { + "epoch": 38.686, + "grad_norm": 1.426267385482788, + "learning_rate": 2e-05, + "loss": 0.05257843, + "step": 19343 + }, + { + "epoch": 38.688, + "grad_norm": 1.2423020601272583, + "learning_rate": 2e-05, + "loss": 0.05175748, + "step": 19344 + }, + { + "epoch": 38.69, + "grad_norm": 1.3101670742034912, + "learning_rate": 2e-05, + "loss": 0.04395001, + "step": 19345 + }, + { + "epoch": 38.692, + "grad_norm": 1.2051000595092773, + "learning_rate": 2e-05, + "loss": 0.04174311, + "step": 19346 + }, + { + "epoch": 38.694, + "grad_norm": 1.1244724988937378, + "learning_rate": 2e-05, + "loss": 0.04402897, + "step": 19347 + }, + { + "epoch": 38.696, + "grad_norm": 1.2426832914352417, + "learning_rate": 2e-05, + "loss": 0.04704241, + "step": 19348 + }, + { + "epoch": 38.698, + "grad_norm": 1.4777288436889648, + "learning_rate": 2e-05, + "loss": 0.06701204, + "step": 19349 + }, + { + "epoch": 38.7, + "grad_norm": 2.267500162124634, + "learning_rate": 2e-05, + "loss": 0.05487394, + "step": 19350 + }, + { + "epoch": 38.702, + "grad_norm": 1.73554527759552, + "learning_rate": 2e-05, + "loss": 0.05015863, + "step": 19351 + }, + { + "epoch": 38.704, + "grad_norm": 2.264845371246338, + "learning_rate": 2e-05, + "loss": 0.03342777, + "step": 19352 + }, + { + "epoch": 38.706, + "grad_norm": 1.1378469467163086, + "learning_rate": 2e-05, + "loss": 0.04409365, + "step": 19353 + }, + { + "epoch": 38.708, + "grad_norm": 1.3917489051818848, + "learning_rate": 2e-05, + "loss": 0.04507478, + "step": 19354 + }, + { + "epoch": 38.71, + "grad_norm": 1.3060683012008667, + "learning_rate": 2e-05, + "loss": 0.05471382, + "step": 19355 + }, + { + "epoch": 38.712, + "grad_norm": 1.3260350227355957, + "learning_rate": 2e-05, + "loss": 0.04912686, + "step": 19356 + }, + { + "epoch": 38.714, + "grad_norm": 2.3827006816864014, + "learning_rate": 2e-05, + "loss": 0.05015289, + "step": 19357 + }, + { + "epoch": 38.716, + "grad_norm": 1.5596778392791748, + "learning_rate": 2e-05, + "loss": 
0.0522569, + "step": 19358 + }, + { + "epoch": 38.718, + "grad_norm": 2.026498556137085, + "learning_rate": 2e-05, + "loss": 0.04310253, + "step": 19359 + }, + { + "epoch": 38.72, + "grad_norm": 1.090510606765747, + "learning_rate": 2e-05, + "loss": 0.03988282, + "step": 19360 + }, + { + "epoch": 38.722, + "grad_norm": 1.136027455329895, + "learning_rate": 2e-05, + "loss": 0.04197805, + "step": 19361 + }, + { + "epoch": 38.724, + "grad_norm": 1.0448952913284302, + "learning_rate": 2e-05, + "loss": 0.04863923, + "step": 19362 + }, + { + "epoch": 38.726, + "grad_norm": 1.3965747356414795, + "learning_rate": 2e-05, + "loss": 0.05056681, + "step": 19363 + }, + { + "epoch": 38.728, + "grad_norm": 1.705721378326416, + "learning_rate": 2e-05, + "loss": 0.05423497, + "step": 19364 + }, + { + "epoch": 38.73, + "grad_norm": 2.248636245727539, + "learning_rate": 2e-05, + "loss": 0.06463277, + "step": 19365 + }, + { + "epoch": 38.732, + "grad_norm": 1.4248642921447754, + "learning_rate": 2e-05, + "loss": 0.04670746, + "step": 19366 + }, + { + "epoch": 38.734, + "grad_norm": 1.1761763095855713, + "learning_rate": 2e-05, + "loss": 0.05271841, + "step": 19367 + }, + { + "epoch": 38.736, + "grad_norm": 1.2061562538146973, + "learning_rate": 2e-05, + "loss": 0.04730795, + "step": 19368 + }, + { + "epoch": 38.738, + "grad_norm": 1.3923081159591675, + "learning_rate": 2e-05, + "loss": 0.05840427, + "step": 19369 + }, + { + "epoch": 38.74, + "grad_norm": 1.250772476196289, + "learning_rate": 2e-05, + "loss": 0.04998203, + "step": 19370 + }, + { + "epoch": 38.742, + "grad_norm": 1.2058557271957397, + "learning_rate": 2e-05, + "loss": 0.04659589, + "step": 19371 + }, + { + "epoch": 38.744, + "grad_norm": 1.7007390260696411, + "learning_rate": 2e-05, + "loss": 0.05928213, + "step": 19372 + }, + { + "epoch": 38.746, + "grad_norm": 2.651254415512085, + "learning_rate": 2e-05, + "loss": 0.05975854, + "step": 19373 + }, + { + "epoch": 38.748, + "grad_norm": 3.1047842502593994, + "learning_rate": 2e-05, + "loss": 0.06387729, + "step": 19374 + }, + { + "epoch": 38.75, + "grad_norm": 5.55999755859375, + "learning_rate": 2e-05, + "loss": 0.06207875, + "step": 19375 + }, + { + "epoch": 38.752, + "grad_norm": 1.1087514162063599, + "learning_rate": 2e-05, + "loss": 0.03848689, + "step": 19376 + }, + { + "epoch": 38.754, + "grad_norm": 1.1218056678771973, + "learning_rate": 2e-05, + "loss": 0.045539, + "step": 19377 + }, + { + "epoch": 38.756, + "grad_norm": 1.1395851373672485, + "learning_rate": 2e-05, + "loss": 0.05567363, + "step": 19378 + }, + { + "epoch": 38.758, + "grad_norm": 1.1848268508911133, + "learning_rate": 2e-05, + "loss": 0.04403425, + "step": 19379 + }, + { + "epoch": 38.76, + "grad_norm": 1.0772857666015625, + "learning_rate": 2e-05, + "loss": 0.04982436, + "step": 19380 + }, + { + "epoch": 38.762, + "grad_norm": 1.0320247411727905, + "learning_rate": 2e-05, + "loss": 0.03512707, + "step": 19381 + }, + { + "epoch": 38.764, + "grad_norm": 1.3068004846572876, + "learning_rate": 2e-05, + "loss": 0.04124657, + "step": 19382 + }, + { + "epoch": 38.766, + "grad_norm": 1.5414135456085205, + "learning_rate": 2e-05, + "loss": 0.05403698, + "step": 19383 + }, + { + "epoch": 38.768, + "grad_norm": 1.0777575969696045, + "learning_rate": 2e-05, + "loss": 0.05617758, + "step": 19384 + }, + { + "epoch": 38.77, + "grad_norm": 1.1199369430541992, + "learning_rate": 2e-05, + "loss": 0.04268428, + "step": 19385 + }, + { + "epoch": 38.772, + "grad_norm": 1.1600946187973022, + "learning_rate": 2e-05, + "loss": 0.0405513, + 
"step": 19386 + }, + { + "epoch": 38.774, + "grad_norm": 3.984130382537842, + "learning_rate": 2e-05, + "loss": 0.05990764, + "step": 19387 + }, + { + "epoch": 38.776, + "grad_norm": 1.1094694137573242, + "learning_rate": 2e-05, + "loss": 0.03420043, + "step": 19388 + }, + { + "epoch": 38.778, + "grad_norm": 2.123666286468506, + "learning_rate": 2e-05, + "loss": 0.05570301, + "step": 19389 + }, + { + "epoch": 38.78, + "grad_norm": 1.096800446510315, + "learning_rate": 2e-05, + "loss": 0.04737834, + "step": 19390 + }, + { + "epoch": 38.782, + "grad_norm": 1.0921696424484253, + "learning_rate": 2e-05, + "loss": 0.04886656, + "step": 19391 + }, + { + "epoch": 38.784, + "grad_norm": 1.10837984085083, + "learning_rate": 2e-05, + "loss": 0.03572984, + "step": 19392 + }, + { + "epoch": 38.786, + "grad_norm": 0.9516921639442444, + "learning_rate": 2e-05, + "loss": 0.03601223, + "step": 19393 + }, + { + "epoch": 38.788, + "grad_norm": 1.0108002424240112, + "learning_rate": 2e-05, + "loss": 0.04009066, + "step": 19394 + }, + { + "epoch": 38.79, + "grad_norm": 1.7825038433074951, + "learning_rate": 2e-05, + "loss": 0.05417824, + "step": 19395 + }, + { + "epoch": 38.792, + "grad_norm": 1.7519702911376953, + "learning_rate": 2e-05, + "loss": 0.05254176, + "step": 19396 + }, + { + "epoch": 38.794, + "grad_norm": 1.3303686380386353, + "learning_rate": 2e-05, + "loss": 0.05503142, + "step": 19397 + }, + { + "epoch": 38.796, + "grad_norm": 1.049363136291504, + "learning_rate": 2e-05, + "loss": 0.0418441, + "step": 19398 + }, + { + "epoch": 38.798, + "grad_norm": 1.388549566268921, + "learning_rate": 2e-05, + "loss": 0.04791044, + "step": 19399 + }, + { + "epoch": 38.8, + "grad_norm": 1.4452887773513794, + "learning_rate": 2e-05, + "loss": 0.04734738, + "step": 19400 + }, + { + "epoch": 38.802, + "grad_norm": 1.2454729080200195, + "learning_rate": 2e-05, + "loss": 0.05771025, + "step": 19401 + }, + { + "epoch": 38.804, + "grad_norm": 1.1745954751968384, + "learning_rate": 2e-05, + "loss": 0.04313898, + "step": 19402 + }, + { + "epoch": 38.806, + "grad_norm": 1.3334699869155884, + "learning_rate": 2e-05, + "loss": 0.06793031, + "step": 19403 + }, + { + "epoch": 38.808, + "grad_norm": 1.42832350730896, + "learning_rate": 2e-05, + "loss": 0.03323733, + "step": 19404 + }, + { + "epoch": 38.81, + "grad_norm": 1.0687870979309082, + "learning_rate": 2e-05, + "loss": 0.0360961, + "step": 19405 + }, + { + "epoch": 38.812, + "grad_norm": 1.741623878479004, + "learning_rate": 2e-05, + "loss": 0.06625956, + "step": 19406 + }, + { + "epoch": 38.814, + "grad_norm": 1.6357783079147339, + "learning_rate": 2e-05, + "loss": 0.05045918, + "step": 19407 + }, + { + "epoch": 38.816, + "grad_norm": 1.15137779712677, + "learning_rate": 2e-05, + "loss": 0.03751816, + "step": 19408 + }, + { + "epoch": 38.818, + "grad_norm": 1.2414789199829102, + "learning_rate": 2e-05, + "loss": 0.04433636, + "step": 19409 + }, + { + "epoch": 38.82, + "grad_norm": 1.303000569343567, + "learning_rate": 2e-05, + "loss": 0.05153903, + "step": 19410 + }, + { + "epoch": 38.822, + "grad_norm": 2.2832419872283936, + "learning_rate": 2e-05, + "loss": 0.05765641, + "step": 19411 + }, + { + "epoch": 38.824, + "grad_norm": 1.1722428798675537, + "learning_rate": 2e-05, + "loss": 0.04241635, + "step": 19412 + }, + { + "epoch": 38.826, + "grad_norm": 1.613328218460083, + "learning_rate": 2e-05, + "loss": 0.03948274, + "step": 19413 + }, + { + "epoch": 38.828, + "grad_norm": 1.2444097995758057, + "learning_rate": 2e-05, + "loss": 0.04656627, + "step": 19414 + }, + 
{ + "epoch": 38.83, + "grad_norm": 1.2532914876937866, + "learning_rate": 2e-05, + "loss": 0.0406023, + "step": 19415 + }, + { + "epoch": 38.832, + "grad_norm": 1.4562067985534668, + "learning_rate": 2e-05, + "loss": 0.06386394, + "step": 19416 + }, + { + "epoch": 38.834, + "grad_norm": 1.2982302904129028, + "learning_rate": 2e-05, + "loss": 0.04366529, + "step": 19417 + }, + { + "epoch": 38.836, + "grad_norm": 1.3933048248291016, + "learning_rate": 2e-05, + "loss": 0.05839304, + "step": 19418 + }, + { + "epoch": 38.838, + "grad_norm": 1.2442266941070557, + "learning_rate": 2e-05, + "loss": 0.04477119, + "step": 19419 + }, + { + "epoch": 38.84, + "grad_norm": 1.2520731687545776, + "learning_rate": 2e-05, + "loss": 0.05134234, + "step": 19420 + }, + { + "epoch": 38.842, + "grad_norm": 1.3517646789550781, + "learning_rate": 2e-05, + "loss": 0.04959063, + "step": 19421 + }, + { + "epoch": 38.844, + "grad_norm": 1.1629074811935425, + "learning_rate": 2e-05, + "loss": 0.04994313, + "step": 19422 + }, + { + "epoch": 38.846, + "grad_norm": 4.365392684936523, + "learning_rate": 2e-05, + "loss": 0.0617961, + "step": 19423 + }, + { + "epoch": 38.848, + "grad_norm": 2.191051959991455, + "learning_rate": 2e-05, + "loss": 0.04884791, + "step": 19424 + }, + { + "epoch": 38.85, + "grad_norm": 1.9126558303833008, + "learning_rate": 2e-05, + "loss": 0.04287531, + "step": 19425 + }, + { + "epoch": 38.852, + "grad_norm": 1.3821977376937866, + "learning_rate": 2e-05, + "loss": 0.0512047, + "step": 19426 + }, + { + "epoch": 38.854, + "grad_norm": 2.440401554107666, + "learning_rate": 2e-05, + "loss": 0.06993075, + "step": 19427 + }, + { + "epoch": 38.856, + "grad_norm": 1.3591678142547607, + "learning_rate": 2e-05, + "loss": 0.05418681, + "step": 19428 + }, + { + "epoch": 38.858, + "grad_norm": 1.1129034757614136, + "learning_rate": 2e-05, + "loss": 0.04099159, + "step": 19429 + }, + { + "epoch": 38.86, + "grad_norm": 0.9613949656486511, + "learning_rate": 2e-05, + "loss": 0.02941245, + "step": 19430 + }, + { + "epoch": 38.862, + "grad_norm": 0.8730504512786865, + "learning_rate": 2e-05, + "loss": 0.02738763, + "step": 19431 + }, + { + "epoch": 38.864, + "grad_norm": 1.091715931892395, + "learning_rate": 2e-05, + "loss": 0.03717642, + "step": 19432 + }, + { + "epoch": 38.866, + "grad_norm": 1.1144849061965942, + "learning_rate": 2e-05, + "loss": 0.04412191, + "step": 19433 + }, + { + "epoch": 38.868, + "grad_norm": 1.2194790840148926, + "learning_rate": 2e-05, + "loss": 0.04663887, + "step": 19434 + }, + { + "epoch": 38.87, + "grad_norm": 1.3206437826156616, + "learning_rate": 2e-05, + "loss": 0.05822754, + "step": 19435 + }, + { + "epoch": 38.872, + "grad_norm": 1.0756211280822754, + "learning_rate": 2e-05, + "loss": 0.03367351, + "step": 19436 + }, + { + "epoch": 38.874, + "grad_norm": 1.2603015899658203, + "learning_rate": 2e-05, + "loss": 0.04884558, + "step": 19437 + }, + { + "epoch": 38.876, + "grad_norm": 1.3238136768341064, + "learning_rate": 2e-05, + "loss": 0.0366739, + "step": 19438 + }, + { + "epoch": 38.878, + "grad_norm": 1.2530521154403687, + "learning_rate": 2e-05, + "loss": 0.05306315, + "step": 19439 + }, + { + "epoch": 38.88, + "grad_norm": 1.3359322547912598, + "learning_rate": 2e-05, + "loss": 0.04838687, + "step": 19440 + }, + { + "epoch": 38.882, + "grad_norm": 1.201737880706787, + "learning_rate": 2e-05, + "loss": 0.05184001, + "step": 19441 + }, + { + "epoch": 38.884, + "grad_norm": 1.2076383829116821, + "learning_rate": 2e-05, + "loss": 0.04534627, + "step": 19442 + }, + { + "epoch": 
38.886, + "grad_norm": 0.982481062412262, + "learning_rate": 2e-05, + "loss": 0.03846693, + "step": 19443 + }, + { + "epoch": 38.888, + "grad_norm": 1.109921932220459, + "learning_rate": 2e-05, + "loss": 0.04263131, + "step": 19444 + }, + { + "epoch": 38.89, + "grad_norm": 1.115616798400879, + "learning_rate": 2e-05, + "loss": 0.05047696, + "step": 19445 + }, + { + "epoch": 38.892, + "grad_norm": 1.1365337371826172, + "learning_rate": 2e-05, + "loss": 0.0466363, + "step": 19446 + }, + { + "epoch": 38.894, + "grad_norm": 1.30055832862854, + "learning_rate": 2e-05, + "loss": 0.06107081, + "step": 19447 + }, + { + "epoch": 38.896, + "grad_norm": 1.2148528099060059, + "learning_rate": 2e-05, + "loss": 0.04618006, + "step": 19448 + }, + { + "epoch": 38.898, + "grad_norm": 1.095479130744934, + "learning_rate": 2e-05, + "loss": 0.04486157, + "step": 19449 + }, + { + "epoch": 38.9, + "grad_norm": 1.2125225067138672, + "learning_rate": 2e-05, + "loss": 0.04702915, + "step": 19450 + }, + { + "epoch": 38.902, + "grad_norm": 1.1719056367874146, + "learning_rate": 2e-05, + "loss": 0.05031497, + "step": 19451 + }, + { + "epoch": 38.904, + "grad_norm": 1.9016114473342896, + "learning_rate": 2e-05, + "loss": 0.06579206, + "step": 19452 + }, + { + "epoch": 38.906, + "grad_norm": 2.084883213043213, + "learning_rate": 2e-05, + "loss": 0.04503134, + "step": 19453 + }, + { + "epoch": 38.908, + "grad_norm": 3.5525739192962646, + "learning_rate": 2e-05, + "loss": 0.06399915, + "step": 19454 + }, + { + "epoch": 38.91, + "grad_norm": 1.238961935043335, + "learning_rate": 2e-05, + "loss": 0.05072328, + "step": 19455 + }, + { + "epoch": 38.912, + "grad_norm": 1.1139464378356934, + "learning_rate": 2e-05, + "loss": 0.05063874, + "step": 19456 + }, + { + "epoch": 38.914, + "grad_norm": 1.591186285018921, + "learning_rate": 2e-05, + "loss": 0.04680803, + "step": 19457 + }, + { + "epoch": 38.916, + "grad_norm": 1.3666948080062866, + "learning_rate": 2e-05, + "loss": 0.05156454, + "step": 19458 + }, + { + "epoch": 38.918, + "grad_norm": 1.5260530710220337, + "learning_rate": 2e-05, + "loss": 0.07067595, + "step": 19459 + }, + { + "epoch": 38.92, + "grad_norm": 1.2484623193740845, + "learning_rate": 2e-05, + "loss": 0.03980666, + "step": 19460 + }, + { + "epoch": 38.922, + "grad_norm": 1.2105351686477661, + "learning_rate": 2e-05, + "loss": 0.05022258, + "step": 19461 + }, + { + "epoch": 38.924, + "grad_norm": 0.9624425172805786, + "learning_rate": 2e-05, + "loss": 0.02262569, + "step": 19462 + }, + { + "epoch": 38.926, + "grad_norm": 1.1966172456741333, + "learning_rate": 2e-05, + "loss": 0.04677077, + "step": 19463 + }, + { + "epoch": 38.928, + "grad_norm": 2.7761287689208984, + "learning_rate": 2e-05, + "loss": 0.04095355, + "step": 19464 + }, + { + "epoch": 38.93, + "grad_norm": 1.3439223766326904, + "learning_rate": 2e-05, + "loss": 0.05076102, + "step": 19465 + }, + { + "epoch": 38.932, + "grad_norm": 1.281233310699463, + "learning_rate": 2e-05, + "loss": 0.05176672, + "step": 19466 + }, + { + "epoch": 38.934, + "grad_norm": 1.2068181037902832, + "learning_rate": 2e-05, + "loss": 0.03931053, + "step": 19467 + }, + { + "epoch": 38.936, + "grad_norm": 2.3821423053741455, + "learning_rate": 2e-05, + "loss": 0.05391637, + "step": 19468 + }, + { + "epoch": 38.938, + "grad_norm": 0.8565077781677246, + "learning_rate": 2e-05, + "loss": 0.02748447, + "step": 19469 + }, + { + "epoch": 38.94, + "grad_norm": 1.593507170677185, + "learning_rate": 2e-05, + "loss": 0.05532858, + "step": 19470 + }, + { + "epoch": 38.942, + 
"grad_norm": 1.2847896814346313, + "learning_rate": 2e-05, + "loss": 0.052665, + "step": 19471 + }, + { + "epoch": 38.944, + "grad_norm": 3.365082263946533, + "learning_rate": 2e-05, + "loss": 0.04390424, + "step": 19472 + }, + { + "epoch": 38.946, + "grad_norm": 1.228043556213379, + "learning_rate": 2e-05, + "loss": 0.03701095, + "step": 19473 + }, + { + "epoch": 38.948, + "grad_norm": 1.2872579097747803, + "learning_rate": 2e-05, + "loss": 0.06041116, + "step": 19474 + }, + { + "epoch": 38.95, + "grad_norm": 5.743537902832031, + "learning_rate": 2e-05, + "loss": 0.05942594, + "step": 19475 + }, + { + "epoch": 38.952, + "grad_norm": 1.4819161891937256, + "learning_rate": 2e-05, + "loss": 0.05315714, + "step": 19476 + }, + { + "epoch": 38.954, + "grad_norm": 4.263488292694092, + "learning_rate": 2e-05, + "loss": 0.05684969, + "step": 19477 + }, + { + "epoch": 38.956, + "grad_norm": 1.293532133102417, + "learning_rate": 2e-05, + "loss": 0.05474854, + "step": 19478 + }, + { + "epoch": 38.958, + "grad_norm": 1.5321731567382812, + "learning_rate": 2e-05, + "loss": 0.0521361, + "step": 19479 + }, + { + "epoch": 38.96, + "grad_norm": 1.5575649738311768, + "learning_rate": 2e-05, + "loss": 0.0335691, + "step": 19480 + }, + { + "epoch": 38.962, + "grad_norm": 1.2537888288497925, + "learning_rate": 2e-05, + "loss": 0.04636952, + "step": 19481 + }, + { + "epoch": 38.964, + "grad_norm": 1.7497940063476562, + "learning_rate": 2e-05, + "loss": 0.06122085, + "step": 19482 + }, + { + "epoch": 38.966, + "grad_norm": 0.9865143299102783, + "learning_rate": 2e-05, + "loss": 0.03592449, + "step": 19483 + }, + { + "epoch": 38.968, + "grad_norm": 1.0101889371871948, + "learning_rate": 2e-05, + "loss": 0.04233185, + "step": 19484 + }, + { + "epoch": 38.97, + "grad_norm": 1.1617642641067505, + "learning_rate": 2e-05, + "loss": 0.04329985, + "step": 19485 + }, + { + "epoch": 38.972, + "grad_norm": 1.1280710697174072, + "learning_rate": 2e-05, + "loss": 0.03594364, + "step": 19486 + }, + { + "epoch": 38.974, + "grad_norm": 1.4211704730987549, + "learning_rate": 2e-05, + "loss": 0.06367813, + "step": 19487 + }, + { + "epoch": 38.976, + "grad_norm": 1.0155389308929443, + "learning_rate": 2e-05, + "loss": 0.04221959, + "step": 19488 + }, + { + "epoch": 38.978, + "grad_norm": 0.9778318405151367, + "learning_rate": 2e-05, + "loss": 0.0305156, + "step": 19489 + }, + { + "epoch": 38.98, + "grad_norm": 1.2039105892181396, + "learning_rate": 2e-05, + "loss": 0.04294903, + "step": 19490 + }, + { + "epoch": 38.982, + "grad_norm": 0.9735546112060547, + "learning_rate": 2e-05, + "loss": 0.03935603, + "step": 19491 + }, + { + "epoch": 38.984, + "grad_norm": 2.8653101921081543, + "learning_rate": 2e-05, + "loss": 0.05370308, + "step": 19492 + }, + { + "epoch": 38.986, + "grad_norm": 1.1377127170562744, + "learning_rate": 2e-05, + "loss": 0.04588469, + "step": 19493 + }, + { + "epoch": 38.988, + "grad_norm": 1.1004186868667603, + "learning_rate": 2e-05, + "loss": 0.05202518, + "step": 19494 + }, + { + "epoch": 38.99, + "grad_norm": 1.7122163772583008, + "learning_rate": 2e-05, + "loss": 0.03624396, + "step": 19495 + }, + { + "epoch": 38.992, + "grad_norm": 2.000995635986328, + "learning_rate": 2e-05, + "loss": 0.04281428, + "step": 19496 + }, + { + "epoch": 38.994, + "grad_norm": 1.3111525774002075, + "learning_rate": 2e-05, + "loss": 0.05325833, + "step": 19497 + }, + { + "epoch": 38.996, + "grad_norm": 1.5315814018249512, + "learning_rate": 2e-05, + "loss": 0.06504035, + "step": 19498 + }, + { + "epoch": 38.998, + "grad_norm": 
1.1684447526931763, + "learning_rate": 2e-05, + "loss": 0.04492722, + "step": 19499 + }, + { + "epoch": 39.0, + "grad_norm": 1.3859971761703491, + "learning_rate": 2e-05, + "loss": 0.06541097, + "step": 19500 + }, + { + "epoch": 39.0, + "eval_performance": { + "AngleClassification_1": 0.998, + "AngleClassification_2": 1.0, + "AngleClassification_3": 0.9920159680638723, + "Equal_1": 1.0, + "Equal_2": 0.9860279441117764, + "Equal_3": 0.9900199600798403, + "LineComparison_1": 1.0, + "LineComparison_2": 1.0, + "LineComparison_3": 1.0, + "Parallel_1": 0.9879759519038076, + "Parallel_2": 0.9939879759519038, + "Parallel_3": 0.994, + "Perpendicular_1": 0.996, + "Perpendicular_2": 0.992, + "Perpendicular_3": 0.8847695390781564, + "PointLiesOnCircle_1": 1.0, + "PointLiesOnCircle_2": 0.9976666666666667, + "PointLiesOnCircle_3": 0.9972666666666666, + "PointLiesOnLine_1": 0.9979959919839679, + "PointLiesOnLine_2": 0.9959919839679359, + "PointLiesOnLine_3": 0.9840319361277445 + }, + "eval_runtime": 320.8325, + "eval_samples_per_second": 32.727, + "eval_steps_per_second": 0.655, + "step": 19500 + }, + { + "epoch": 39.002, + "grad_norm": 6.220664024353027, + "learning_rate": 2e-05, + "loss": 0.04667213, + "step": 19501 + }, + { + "epoch": 39.004, + "grad_norm": 1.135877013206482, + "learning_rate": 2e-05, + "loss": 0.03528913, + "step": 19502 + }, + { + "epoch": 39.006, + "grad_norm": 1.918533205986023, + "learning_rate": 2e-05, + "loss": 0.05770627, + "step": 19503 + }, + { + "epoch": 39.008, + "grad_norm": 1.3888102769851685, + "learning_rate": 2e-05, + "loss": 0.05147287, + "step": 19504 + }, + { + "epoch": 39.01, + "grad_norm": 1.2796348333358765, + "learning_rate": 2e-05, + "loss": 0.04314057, + "step": 19505 + }, + { + "epoch": 39.012, + "grad_norm": 1.2352646589279175, + "learning_rate": 2e-05, + "loss": 0.04238914, + "step": 19506 + }, + { + "epoch": 39.014, + "grad_norm": 1.5327287912368774, + "learning_rate": 2e-05, + "loss": 0.06135549, + "step": 19507 + }, + { + "epoch": 39.016, + "grad_norm": 1.9634387493133545, + "learning_rate": 2e-05, + "loss": 0.04630491, + "step": 19508 + }, + { + "epoch": 39.018, + "grad_norm": 1.3204143047332764, + "learning_rate": 2e-05, + "loss": 0.04790439, + "step": 19509 + }, + { + "epoch": 39.02, + "grad_norm": 1.5740340948104858, + "learning_rate": 2e-05, + "loss": 0.04070012, + "step": 19510 + }, + { + "epoch": 39.022, + "grad_norm": 1.3499553203582764, + "learning_rate": 2e-05, + "loss": 0.05047411, + "step": 19511 + }, + { + "epoch": 39.024, + "grad_norm": 0.9350709915161133, + "learning_rate": 2e-05, + "loss": 0.02976116, + "step": 19512 + }, + { + "epoch": 39.026, + "grad_norm": 1.0683033466339111, + "learning_rate": 2e-05, + "loss": 0.03930287, + "step": 19513 + }, + { + "epoch": 39.028, + "grad_norm": 1.1646449565887451, + "learning_rate": 2e-05, + "loss": 0.04475568, + "step": 19514 + }, + { + "epoch": 39.03, + "grad_norm": 1.3507591485977173, + "learning_rate": 2e-05, + "loss": 0.05256594, + "step": 19515 + }, + { + "epoch": 39.032, + "grad_norm": 1.2592138051986694, + "learning_rate": 2e-05, + "loss": 0.04029159, + "step": 19516 + }, + { + "epoch": 39.034, + "grad_norm": 1.0979448556900024, + "learning_rate": 2e-05, + "loss": 0.05017549, + "step": 19517 + }, + { + "epoch": 39.036, + "grad_norm": 1.0506963729858398, + "learning_rate": 2e-05, + "loss": 0.04544584, + "step": 19518 + }, + { + "epoch": 39.038, + "grad_norm": 1.2208514213562012, + "learning_rate": 2e-05, + "loss": 0.04804792, + "step": 19519 + }, + { + "epoch": 39.04, + "grad_norm": 
1.086912989616394, + "learning_rate": 2e-05, + "loss": 0.04290297, + "step": 19520 + }, + { + "epoch": 39.042, + "grad_norm": 2.351929187774658, + "learning_rate": 2e-05, + "loss": 0.0460132, + "step": 19521 + }, + { + "epoch": 39.044, + "grad_norm": 1.1975494623184204, + "learning_rate": 2e-05, + "loss": 0.04094232, + "step": 19522 + }, + { + "epoch": 39.046, + "grad_norm": 1.4654606580734253, + "learning_rate": 2e-05, + "loss": 0.0614688, + "step": 19523 + }, + { + "epoch": 39.048, + "grad_norm": 1.3602313995361328, + "learning_rate": 2e-05, + "loss": 0.05025218, + "step": 19524 + }, + { + "epoch": 39.05, + "grad_norm": 1.1130294799804688, + "learning_rate": 2e-05, + "loss": 0.03360223, + "step": 19525 + }, + { + "epoch": 39.052, + "grad_norm": 1.3653548955917358, + "learning_rate": 2e-05, + "loss": 0.0470277, + "step": 19526 + }, + { + "epoch": 39.054, + "grad_norm": 1.0190237760543823, + "learning_rate": 2e-05, + "loss": 0.03658837, + "step": 19527 + }, + { + "epoch": 39.056, + "grad_norm": 1.2680890560150146, + "learning_rate": 2e-05, + "loss": 0.04371505, + "step": 19528 + }, + { + "epoch": 39.058, + "grad_norm": 1.6253620386123657, + "learning_rate": 2e-05, + "loss": 0.05293868, + "step": 19529 + }, + { + "epoch": 39.06, + "grad_norm": 1.2560430765151978, + "learning_rate": 2e-05, + "loss": 0.03810944, + "step": 19530 + }, + { + "epoch": 39.062, + "grad_norm": 1.2208555936813354, + "learning_rate": 2e-05, + "loss": 0.04190925, + "step": 19531 + }, + { + "epoch": 39.064, + "grad_norm": 1.098758578300476, + "learning_rate": 2e-05, + "loss": 0.03760337, + "step": 19532 + }, + { + "epoch": 39.066, + "grad_norm": 1.0823376178741455, + "learning_rate": 2e-05, + "loss": 0.040763, + "step": 19533 + }, + { + "epoch": 39.068, + "grad_norm": 1.1920530796051025, + "learning_rate": 2e-05, + "loss": 0.03933562, + "step": 19534 + }, + { + "epoch": 39.07, + "grad_norm": 1.3417820930480957, + "learning_rate": 2e-05, + "loss": 0.07266225, + "step": 19535 + }, + { + "epoch": 39.072, + "grad_norm": 1.156474232673645, + "learning_rate": 2e-05, + "loss": 0.03301832, + "step": 19536 + }, + { + "epoch": 39.074, + "grad_norm": 1.7914104461669922, + "learning_rate": 2e-05, + "loss": 0.05243705, + "step": 19537 + }, + { + "epoch": 39.076, + "grad_norm": 1.2239590883255005, + "learning_rate": 2e-05, + "loss": 0.05815011, + "step": 19538 + }, + { + "epoch": 39.078, + "grad_norm": 1.9638248682022095, + "learning_rate": 2e-05, + "loss": 0.058245, + "step": 19539 + }, + { + "epoch": 39.08, + "grad_norm": 1.095078468322754, + "learning_rate": 2e-05, + "loss": 0.041117, + "step": 19540 + }, + { + "epoch": 39.082, + "grad_norm": 1.0802770853042603, + "learning_rate": 2e-05, + "loss": 0.04389591, + "step": 19541 + }, + { + "epoch": 39.084, + "grad_norm": 2.124570369720459, + "learning_rate": 2e-05, + "loss": 0.05145385, + "step": 19542 + }, + { + "epoch": 39.086, + "grad_norm": 1.1902073621749878, + "learning_rate": 2e-05, + "loss": 0.03708824, + "step": 19543 + }, + { + "epoch": 39.088, + "grad_norm": 1.1799651384353638, + "learning_rate": 2e-05, + "loss": 0.03826425, + "step": 19544 + }, + { + "epoch": 39.09, + "grad_norm": 2.205336570739746, + "learning_rate": 2e-05, + "loss": 0.0574959, + "step": 19545 + }, + { + "epoch": 39.092, + "grad_norm": 1.5711607933044434, + "learning_rate": 2e-05, + "loss": 0.04823216, + "step": 19546 + }, + { + "epoch": 39.094, + "grad_norm": 1.6747071743011475, + "learning_rate": 2e-05, + "loss": 0.063738, + "step": 19547 + }, + { + "epoch": 39.096, + "grad_norm": 1.288379430770874, + 
"learning_rate": 2e-05, + "loss": 0.05128692, + "step": 19548 + }, + { + "epoch": 39.098, + "grad_norm": 1.3630702495574951, + "learning_rate": 2e-05, + "loss": 0.04941294, + "step": 19549 + }, + { + "epoch": 39.1, + "grad_norm": 2.2876434326171875, + "learning_rate": 2e-05, + "loss": 0.05177325, + "step": 19550 + }, + { + "epoch": 39.102, + "grad_norm": 1.884543538093567, + "learning_rate": 2e-05, + "loss": 0.0452019, + "step": 19551 + }, + { + "epoch": 39.104, + "grad_norm": 2.4394567012786865, + "learning_rate": 2e-05, + "loss": 0.05596255, + "step": 19552 + }, + { + "epoch": 39.106, + "grad_norm": 1.3302148580551147, + "learning_rate": 2e-05, + "loss": 0.04958398, + "step": 19553 + }, + { + "epoch": 39.108, + "grad_norm": 0.9878301620483398, + "learning_rate": 2e-05, + "loss": 0.03418873, + "step": 19554 + }, + { + "epoch": 39.11, + "grad_norm": 1.4796167612075806, + "learning_rate": 2e-05, + "loss": 0.06076609, + "step": 19555 + }, + { + "epoch": 39.112, + "grad_norm": 0.9968535304069519, + "learning_rate": 2e-05, + "loss": 0.0371155, + "step": 19556 + }, + { + "epoch": 39.114, + "grad_norm": 1.0637849569320679, + "learning_rate": 2e-05, + "loss": 0.03895561, + "step": 19557 + }, + { + "epoch": 39.116, + "grad_norm": 1.259835958480835, + "learning_rate": 2e-05, + "loss": 0.05485725, + "step": 19558 + }, + { + "epoch": 39.118, + "grad_norm": 1.407067894935608, + "learning_rate": 2e-05, + "loss": 0.03801053, + "step": 19559 + }, + { + "epoch": 39.12, + "grad_norm": 1.506779432296753, + "learning_rate": 2e-05, + "loss": 0.0610331, + "step": 19560 + }, + { + "epoch": 39.122, + "grad_norm": 1.299699306488037, + "learning_rate": 2e-05, + "loss": 0.06505786, + "step": 19561 + }, + { + "epoch": 39.124, + "grad_norm": 1.213064193725586, + "learning_rate": 2e-05, + "loss": 0.04449017, + "step": 19562 + }, + { + "epoch": 39.126, + "grad_norm": 1.0915803909301758, + "learning_rate": 2e-05, + "loss": 0.04768021, + "step": 19563 + }, + { + "epoch": 39.128, + "grad_norm": 1.2898131608963013, + "learning_rate": 2e-05, + "loss": 0.06006561, + "step": 19564 + }, + { + "epoch": 39.13, + "grad_norm": 1.134291410446167, + "learning_rate": 2e-05, + "loss": 0.04017518, + "step": 19565 + }, + { + "epoch": 39.132, + "grad_norm": 1.839423656463623, + "learning_rate": 2e-05, + "loss": 0.05884816, + "step": 19566 + }, + { + "epoch": 39.134, + "grad_norm": 3.196049451828003, + "learning_rate": 2e-05, + "loss": 0.0544239, + "step": 19567 + }, + { + "epoch": 39.136, + "grad_norm": 1.3496533632278442, + "learning_rate": 2e-05, + "loss": 0.06023031, + "step": 19568 + }, + { + "epoch": 39.138, + "grad_norm": 1.06864595413208, + "learning_rate": 2e-05, + "loss": 0.04296235, + "step": 19569 + }, + { + "epoch": 39.14, + "grad_norm": 0.9733324646949768, + "learning_rate": 2e-05, + "loss": 0.03551897, + "step": 19570 + }, + { + "epoch": 39.142, + "grad_norm": 1.19362211227417, + "learning_rate": 2e-05, + "loss": 0.05036366, + "step": 19571 + }, + { + "epoch": 39.144, + "grad_norm": 1.0025893449783325, + "learning_rate": 2e-05, + "loss": 0.0287686, + "step": 19572 + }, + { + "epoch": 39.146, + "grad_norm": 2.9667563438415527, + "learning_rate": 2e-05, + "loss": 0.0478669, + "step": 19573 + }, + { + "epoch": 39.148, + "grad_norm": 1.4983551502227783, + "learning_rate": 2e-05, + "loss": 0.05278377, + "step": 19574 + }, + { + "epoch": 39.15, + "grad_norm": 1.3283973932266235, + "learning_rate": 2e-05, + "loss": 0.05365999, + "step": 19575 + }, + { + "epoch": 39.152, + "grad_norm": 1.4052914381027222, + "learning_rate": 2e-05, 
+ "loss": 0.04093612, + "step": 19576 + }, + { + "epoch": 39.154, + "grad_norm": 1.2109755277633667, + "learning_rate": 2e-05, + "loss": 0.04671524, + "step": 19577 + }, + { + "epoch": 39.156, + "grad_norm": 1.4497380256652832, + "learning_rate": 2e-05, + "loss": 0.05801364, + "step": 19578 + }, + { + "epoch": 39.158, + "grad_norm": 1.0449851751327515, + "learning_rate": 2e-05, + "loss": 0.03876752, + "step": 19579 + }, + { + "epoch": 39.16, + "grad_norm": 0.9883303642272949, + "learning_rate": 2e-05, + "loss": 0.02868701, + "step": 19580 + }, + { + "epoch": 39.162, + "grad_norm": 1.0830588340759277, + "learning_rate": 2e-05, + "loss": 0.03940789, + "step": 19581 + }, + { + "epoch": 39.164, + "grad_norm": 1.4366869926452637, + "learning_rate": 2e-05, + "loss": 0.07369938, + "step": 19582 + }, + { + "epoch": 39.166, + "grad_norm": 1.415928840637207, + "learning_rate": 2e-05, + "loss": 0.04259369, + "step": 19583 + }, + { + "epoch": 39.168, + "grad_norm": 1.3500007390975952, + "learning_rate": 2e-05, + "loss": 0.05315146, + "step": 19584 + }, + { + "epoch": 39.17, + "grad_norm": 1.1341204643249512, + "learning_rate": 2e-05, + "loss": 0.03991456, + "step": 19585 + }, + { + "epoch": 39.172, + "grad_norm": 1.41619074344635, + "learning_rate": 2e-05, + "loss": 0.05198916, + "step": 19586 + }, + { + "epoch": 39.174, + "grad_norm": 1.457855224609375, + "learning_rate": 2e-05, + "loss": 0.06273693, + "step": 19587 + }, + { + "epoch": 39.176, + "grad_norm": 1.1896886825561523, + "learning_rate": 2e-05, + "loss": 0.05241989, + "step": 19588 + }, + { + "epoch": 39.178, + "grad_norm": 1.280698299407959, + "learning_rate": 2e-05, + "loss": 0.04846141, + "step": 19589 + }, + { + "epoch": 39.18, + "grad_norm": 1.0273659229278564, + "learning_rate": 2e-05, + "loss": 0.03391914, + "step": 19590 + }, + { + "epoch": 39.182, + "grad_norm": 1.2925435304641724, + "learning_rate": 2e-05, + "loss": 0.05286665, + "step": 19591 + }, + { + "epoch": 39.184, + "grad_norm": 0.9915168881416321, + "learning_rate": 2e-05, + "loss": 0.04317682, + "step": 19592 + }, + { + "epoch": 39.186, + "grad_norm": 1.0538667440414429, + "learning_rate": 2e-05, + "loss": 0.02936137, + "step": 19593 + }, + { + "epoch": 39.188, + "grad_norm": 1.22129225730896, + "learning_rate": 2e-05, + "loss": 0.0674229, + "step": 19594 + }, + { + "epoch": 39.19, + "grad_norm": 1.3039966821670532, + "learning_rate": 2e-05, + "loss": 0.04973481, + "step": 19595 + }, + { + "epoch": 39.192, + "grad_norm": 1.0738667249679565, + "learning_rate": 2e-05, + "loss": 0.03898337, + "step": 19596 + }, + { + "epoch": 39.194, + "grad_norm": 1.290103793144226, + "learning_rate": 2e-05, + "loss": 0.04972884, + "step": 19597 + }, + { + "epoch": 39.196, + "grad_norm": 1.3575490713119507, + "learning_rate": 2e-05, + "loss": 0.0455936, + "step": 19598 + }, + { + "epoch": 39.198, + "grad_norm": 1.1250419616699219, + "learning_rate": 2e-05, + "loss": 0.03418372, + "step": 19599 + }, + { + "epoch": 39.2, + "grad_norm": 1.4324772357940674, + "learning_rate": 2e-05, + "loss": 0.04676975, + "step": 19600 + }, + { + "epoch": 39.202, + "grad_norm": 1.14113187789917, + "learning_rate": 2e-05, + "loss": 0.036985, + "step": 19601 + }, + { + "epoch": 39.204, + "grad_norm": 1.313949704170227, + "learning_rate": 2e-05, + "loss": 0.05859777, + "step": 19602 + }, + { + "epoch": 39.206, + "grad_norm": 1.142185926437378, + "learning_rate": 2e-05, + "loss": 0.04276698, + "step": 19603 + }, + { + "epoch": 39.208, + "grad_norm": 1.3142091035842896, + "learning_rate": 2e-05, + "loss": 
0.05003233, + "step": 19604 + }, + { + "epoch": 39.21, + "grad_norm": 1.3124581575393677, + "learning_rate": 2e-05, + "loss": 0.06136658, + "step": 19605 + }, + { + "epoch": 39.212, + "grad_norm": 1.2220021486282349, + "learning_rate": 2e-05, + "loss": 0.04311784, + "step": 19606 + }, + { + "epoch": 39.214, + "grad_norm": 1.3002612590789795, + "learning_rate": 2e-05, + "loss": 0.0366911, + "step": 19607 + }, + { + "epoch": 39.216, + "grad_norm": 1.4152746200561523, + "learning_rate": 2e-05, + "loss": 0.0430445, + "step": 19608 + }, + { + "epoch": 39.218, + "grad_norm": 1.1827176809310913, + "learning_rate": 2e-05, + "loss": 0.04736789, + "step": 19609 + }, + { + "epoch": 39.22, + "grad_norm": 1.1137902736663818, + "learning_rate": 2e-05, + "loss": 0.04015935, + "step": 19610 + }, + { + "epoch": 39.222, + "grad_norm": 1.7880975008010864, + "learning_rate": 2e-05, + "loss": 0.04855542, + "step": 19611 + }, + { + "epoch": 39.224, + "grad_norm": 1.182569146156311, + "learning_rate": 2e-05, + "loss": 0.05796768, + "step": 19612 + }, + { + "epoch": 39.226, + "grad_norm": 2.2326362133026123, + "learning_rate": 2e-05, + "loss": 0.04780351, + "step": 19613 + }, + { + "epoch": 39.228, + "grad_norm": 1.2634199857711792, + "learning_rate": 2e-05, + "loss": 0.04764343, + "step": 19614 + }, + { + "epoch": 39.23, + "grad_norm": 1.3403682708740234, + "learning_rate": 2e-05, + "loss": 0.04632024, + "step": 19615 + }, + { + "epoch": 39.232, + "grad_norm": 1.009716510772705, + "learning_rate": 2e-05, + "loss": 0.03490862, + "step": 19616 + }, + { + "epoch": 39.234, + "grad_norm": 1.03059983253479, + "learning_rate": 2e-05, + "loss": 0.03645796, + "step": 19617 + }, + { + "epoch": 39.236, + "grad_norm": 1.0769610404968262, + "learning_rate": 2e-05, + "loss": 0.03788844, + "step": 19618 + }, + { + "epoch": 39.238, + "grad_norm": 1.1544963121414185, + "learning_rate": 2e-05, + "loss": 0.04667365, + "step": 19619 + }, + { + "epoch": 39.24, + "grad_norm": 1.1700681447982788, + "learning_rate": 2e-05, + "loss": 0.04401879, + "step": 19620 + }, + { + "epoch": 39.242, + "grad_norm": 1.196091890335083, + "learning_rate": 2e-05, + "loss": 0.05693919, + "step": 19621 + }, + { + "epoch": 39.244, + "grad_norm": 2.257040023803711, + "learning_rate": 2e-05, + "loss": 0.03902316, + "step": 19622 + }, + { + "epoch": 39.246, + "grad_norm": 1.00798499584198, + "learning_rate": 2e-05, + "loss": 0.03313605, + "step": 19623 + }, + { + "epoch": 39.248, + "grad_norm": 1.4052647352218628, + "learning_rate": 2e-05, + "loss": 0.05165982, + "step": 19624 + }, + { + "epoch": 39.25, + "grad_norm": 1.2141923904418945, + "learning_rate": 2e-05, + "loss": 0.03951096, + "step": 19625 + }, + { + "epoch": 39.252, + "grad_norm": 1.3865104913711548, + "learning_rate": 2e-05, + "loss": 0.04825272, + "step": 19626 + }, + { + "epoch": 39.254, + "grad_norm": 1.102805733680725, + "learning_rate": 2e-05, + "loss": 0.03778962, + "step": 19627 + }, + { + "epoch": 39.256, + "grad_norm": 0.9773703813552856, + "learning_rate": 2e-05, + "loss": 0.03974072, + "step": 19628 + }, + { + "epoch": 39.258, + "grad_norm": 1.361572027206421, + "learning_rate": 2e-05, + "loss": 0.03857236, + "step": 19629 + }, + { + "epoch": 39.26, + "grad_norm": 2.139104127883911, + "learning_rate": 2e-05, + "loss": 0.05093241, + "step": 19630 + }, + { + "epoch": 39.262, + "grad_norm": 1.442545771598816, + "learning_rate": 2e-05, + "loss": 0.04413161, + "step": 19631 + }, + { + "epoch": 39.264, + "grad_norm": 1.0915734767913818, + "learning_rate": 2e-05, + "loss": 0.03375326, + 
"step": 19632 + }, + { + "epoch": 39.266, + "grad_norm": 1.3909891843795776, + "learning_rate": 2e-05, + "loss": 0.0558477, + "step": 19633 + }, + { + "epoch": 39.268, + "grad_norm": 1.2892683744430542, + "learning_rate": 2e-05, + "loss": 0.05482936, + "step": 19634 + }, + { + "epoch": 39.27, + "grad_norm": 1.4405442476272583, + "learning_rate": 2e-05, + "loss": 0.05377769, + "step": 19635 + }, + { + "epoch": 39.272, + "grad_norm": 1.9378975629806519, + "learning_rate": 2e-05, + "loss": 0.04612908, + "step": 19636 + }, + { + "epoch": 39.274, + "grad_norm": 0.9840026497840881, + "learning_rate": 2e-05, + "loss": 0.04712234, + "step": 19637 + }, + { + "epoch": 39.276, + "grad_norm": 1.2211089134216309, + "learning_rate": 2e-05, + "loss": 0.05115188, + "step": 19638 + }, + { + "epoch": 39.278, + "grad_norm": 2.96844220161438, + "learning_rate": 2e-05, + "loss": 0.04160873, + "step": 19639 + }, + { + "epoch": 39.28, + "grad_norm": 2.2236135005950928, + "learning_rate": 2e-05, + "loss": 0.03255095, + "step": 19640 + }, + { + "epoch": 39.282, + "grad_norm": 1.4627702236175537, + "learning_rate": 2e-05, + "loss": 0.06116011, + "step": 19641 + }, + { + "epoch": 39.284, + "grad_norm": 1.137810468673706, + "learning_rate": 2e-05, + "loss": 0.04151687, + "step": 19642 + }, + { + "epoch": 39.286, + "grad_norm": 1.4149549007415771, + "learning_rate": 2e-05, + "loss": 0.0420138, + "step": 19643 + }, + { + "epoch": 39.288, + "grad_norm": 1.1547538042068481, + "learning_rate": 2e-05, + "loss": 0.04780672, + "step": 19644 + }, + { + "epoch": 39.29, + "grad_norm": 1.460732340812683, + "learning_rate": 2e-05, + "loss": 0.04886123, + "step": 19645 + }, + { + "epoch": 39.292, + "grad_norm": 1.4473869800567627, + "learning_rate": 2e-05, + "loss": 0.04399436, + "step": 19646 + }, + { + "epoch": 39.294, + "grad_norm": 1.1946806907653809, + "learning_rate": 2e-05, + "loss": 0.04288406, + "step": 19647 + }, + { + "epoch": 39.296, + "grad_norm": 1.7457919120788574, + "learning_rate": 2e-05, + "loss": 0.05616899, + "step": 19648 + }, + { + "epoch": 39.298, + "grad_norm": 1.1243897676467896, + "learning_rate": 2e-05, + "loss": 0.04388375, + "step": 19649 + }, + { + "epoch": 39.3, + "grad_norm": 1.2903263568878174, + "learning_rate": 2e-05, + "loss": 0.0539849, + "step": 19650 + }, + { + "epoch": 39.302, + "grad_norm": 1.4681276082992554, + "learning_rate": 2e-05, + "loss": 0.06013412, + "step": 19651 + }, + { + "epoch": 39.304, + "grad_norm": 1.7446203231811523, + "learning_rate": 2e-05, + "loss": 0.07473246, + "step": 19652 + }, + { + "epoch": 39.306, + "grad_norm": 1.5309257507324219, + "learning_rate": 2e-05, + "loss": 0.06777739, + "step": 19653 + }, + { + "epoch": 39.308, + "grad_norm": 1.5686246156692505, + "learning_rate": 2e-05, + "loss": 0.06154291, + "step": 19654 + }, + { + "epoch": 39.31, + "grad_norm": 1.139773964881897, + "learning_rate": 2e-05, + "loss": 0.03822503, + "step": 19655 + }, + { + "epoch": 39.312, + "grad_norm": 2.308309555053711, + "learning_rate": 2e-05, + "loss": 0.04015625, + "step": 19656 + }, + { + "epoch": 39.314, + "grad_norm": 1.5338242053985596, + "learning_rate": 2e-05, + "loss": 0.05909691, + "step": 19657 + }, + { + "epoch": 39.316, + "grad_norm": 1.2191451787948608, + "learning_rate": 2e-05, + "loss": 0.04259738, + "step": 19658 + }, + { + "epoch": 39.318, + "grad_norm": 1.2544686794281006, + "learning_rate": 2e-05, + "loss": 0.06194799, + "step": 19659 + }, + { + "epoch": 39.32, + "grad_norm": 1.1385127305984497, + "learning_rate": 2e-05, + "loss": 0.04202759, + "step": 19660 
+ }, + { + "epoch": 39.322, + "grad_norm": 1.508982539176941, + "learning_rate": 2e-05, + "loss": 0.0428558, + "step": 19661 + }, + { + "epoch": 39.324, + "grad_norm": 1.4978950023651123, + "learning_rate": 2e-05, + "loss": 0.06243214, + "step": 19662 + }, + { + "epoch": 39.326, + "grad_norm": 2.656769037246704, + "learning_rate": 2e-05, + "loss": 0.06248422, + "step": 19663 + }, + { + "epoch": 39.328, + "grad_norm": 1.2013676166534424, + "learning_rate": 2e-05, + "loss": 0.04629922, + "step": 19664 + }, + { + "epoch": 39.33, + "grad_norm": 1.4212125539779663, + "learning_rate": 2e-05, + "loss": 0.0516174, + "step": 19665 + }, + { + "epoch": 39.332, + "grad_norm": 1.4654594659805298, + "learning_rate": 2e-05, + "loss": 0.04654077, + "step": 19666 + }, + { + "epoch": 39.334, + "grad_norm": 1.1820266246795654, + "learning_rate": 2e-05, + "loss": 0.04918681, + "step": 19667 + }, + { + "epoch": 39.336, + "grad_norm": 1.7852951288223267, + "learning_rate": 2e-05, + "loss": 0.05676861, + "step": 19668 + }, + { + "epoch": 39.338, + "grad_norm": 1.3146127462387085, + "learning_rate": 2e-05, + "loss": 0.06642872, + "step": 19669 + }, + { + "epoch": 39.34, + "grad_norm": 1.1040120124816895, + "learning_rate": 2e-05, + "loss": 0.03930391, + "step": 19670 + }, + { + "epoch": 39.342, + "grad_norm": 1.4802639484405518, + "learning_rate": 2e-05, + "loss": 0.05159596, + "step": 19671 + }, + { + "epoch": 39.344, + "grad_norm": 1.3350907564163208, + "learning_rate": 2e-05, + "loss": 0.05917858, + "step": 19672 + }, + { + "epoch": 39.346, + "grad_norm": 1.3922500610351562, + "learning_rate": 2e-05, + "loss": 0.04254092, + "step": 19673 + }, + { + "epoch": 39.348, + "grad_norm": 1.271043062210083, + "learning_rate": 2e-05, + "loss": 0.05474164, + "step": 19674 + }, + { + "epoch": 39.35, + "grad_norm": 1.4507440328598022, + "learning_rate": 2e-05, + "loss": 0.05754908, + "step": 19675 + }, + { + "epoch": 39.352, + "grad_norm": 1.2747180461883545, + "learning_rate": 2e-05, + "loss": 0.06754243, + "step": 19676 + }, + { + "epoch": 39.354, + "grad_norm": 1.7789802551269531, + "learning_rate": 2e-05, + "loss": 0.0436502, + "step": 19677 + }, + { + "epoch": 39.356, + "grad_norm": 2.4285247325897217, + "learning_rate": 2e-05, + "loss": 0.05950401, + "step": 19678 + }, + { + "epoch": 39.358, + "grad_norm": 1.8559372425079346, + "learning_rate": 2e-05, + "loss": 0.04614679, + "step": 19679 + }, + { + "epoch": 39.36, + "grad_norm": 1.1284407377243042, + "learning_rate": 2e-05, + "loss": 0.05482705, + "step": 19680 + }, + { + "epoch": 39.362, + "grad_norm": 1.2401177883148193, + "learning_rate": 2e-05, + "loss": 0.04206383, + "step": 19681 + }, + { + "epoch": 39.364, + "grad_norm": 1.6358333826065063, + "learning_rate": 2e-05, + "loss": 0.04896562, + "step": 19682 + }, + { + "epoch": 39.366, + "grad_norm": 1.0610060691833496, + "learning_rate": 2e-05, + "loss": 0.03826354, + "step": 19683 + }, + { + "epoch": 39.368, + "grad_norm": 1.349070429801941, + "learning_rate": 2e-05, + "loss": 0.05081733, + "step": 19684 + }, + { + "epoch": 39.37, + "grad_norm": 1.0122675895690918, + "learning_rate": 2e-05, + "loss": 0.03560245, + "step": 19685 + }, + { + "epoch": 39.372, + "grad_norm": 1.1908326148986816, + "learning_rate": 2e-05, + "loss": 0.04643635, + "step": 19686 + }, + { + "epoch": 39.374, + "grad_norm": 1.500412106513977, + "learning_rate": 2e-05, + "loss": 0.04018075, + "step": 19687 + }, + { + "epoch": 39.376, + "grad_norm": 1.1961153745651245, + "learning_rate": 2e-05, + "loss": 0.03434346, + "step": 19688 + }, + { + 
"epoch": 39.378, + "grad_norm": 1.0913282632827759, + "learning_rate": 2e-05, + "loss": 0.03738025, + "step": 19689 + }, + { + "epoch": 39.38, + "grad_norm": 1.1147562265396118, + "learning_rate": 2e-05, + "loss": 0.04534988, + "step": 19690 + }, + { + "epoch": 39.382, + "grad_norm": 1.3109427690505981, + "learning_rate": 2e-05, + "loss": 0.04121095, + "step": 19691 + }, + { + "epoch": 39.384, + "grad_norm": 1.143779993057251, + "learning_rate": 2e-05, + "loss": 0.03789821, + "step": 19692 + }, + { + "epoch": 39.386, + "grad_norm": 1.3124995231628418, + "learning_rate": 2e-05, + "loss": 0.04957093, + "step": 19693 + }, + { + "epoch": 39.388, + "grad_norm": 1.287955403327942, + "learning_rate": 2e-05, + "loss": 0.05975793, + "step": 19694 + }, + { + "epoch": 39.39, + "grad_norm": 2.0113747119903564, + "learning_rate": 2e-05, + "loss": 0.0466843, + "step": 19695 + }, + { + "epoch": 39.392, + "grad_norm": 1.1586730480194092, + "learning_rate": 2e-05, + "loss": 0.04480777, + "step": 19696 + }, + { + "epoch": 39.394, + "grad_norm": 1.1337339878082275, + "learning_rate": 2e-05, + "loss": 0.03835721, + "step": 19697 + }, + { + "epoch": 39.396, + "grad_norm": 1.8198338747024536, + "learning_rate": 2e-05, + "loss": 0.04342664, + "step": 19698 + }, + { + "epoch": 39.398, + "grad_norm": 1.068575382232666, + "learning_rate": 2e-05, + "loss": 0.02720687, + "step": 19699 + }, + { + "epoch": 39.4, + "grad_norm": 2.4282875061035156, + "learning_rate": 2e-05, + "loss": 0.05048774, + "step": 19700 + }, + { + "epoch": 39.402, + "grad_norm": 1.351075291633606, + "learning_rate": 2e-05, + "loss": 0.05489894, + "step": 19701 + }, + { + "epoch": 39.404, + "grad_norm": 1.1262435913085938, + "learning_rate": 2e-05, + "loss": 0.0376655, + "step": 19702 + }, + { + "epoch": 39.406, + "grad_norm": 1.3584601879119873, + "learning_rate": 2e-05, + "loss": 0.05599149, + "step": 19703 + }, + { + "epoch": 39.408, + "grad_norm": 1.2703129053115845, + "learning_rate": 2e-05, + "loss": 0.06736299, + "step": 19704 + }, + { + "epoch": 39.41, + "grad_norm": 1.1085890531539917, + "learning_rate": 2e-05, + "loss": 0.04166878, + "step": 19705 + }, + { + "epoch": 39.412, + "grad_norm": 1.3138976097106934, + "learning_rate": 2e-05, + "loss": 0.05483936, + "step": 19706 + }, + { + "epoch": 39.414, + "grad_norm": 1.3568849563598633, + "learning_rate": 2e-05, + "loss": 0.0636967, + "step": 19707 + }, + { + "epoch": 39.416, + "grad_norm": 1.2971667051315308, + "learning_rate": 2e-05, + "loss": 0.05253425, + "step": 19708 + }, + { + "epoch": 39.418, + "grad_norm": 1.1735317707061768, + "learning_rate": 2e-05, + "loss": 0.03824739, + "step": 19709 + }, + { + "epoch": 39.42, + "grad_norm": 1.3102675676345825, + "learning_rate": 2e-05, + "loss": 0.05568937, + "step": 19710 + }, + { + "epoch": 39.422, + "grad_norm": 1.6656438112258911, + "learning_rate": 2e-05, + "loss": 0.04870186, + "step": 19711 + }, + { + "epoch": 39.424, + "grad_norm": 1.870893120765686, + "learning_rate": 2e-05, + "loss": 0.06180416, + "step": 19712 + }, + { + "epoch": 39.426, + "grad_norm": 1.1797996759414673, + "learning_rate": 2e-05, + "loss": 0.04374686, + "step": 19713 + }, + { + "epoch": 39.428, + "grad_norm": 1.4457714557647705, + "learning_rate": 2e-05, + "loss": 0.04901951, + "step": 19714 + }, + { + "epoch": 39.43, + "grad_norm": 1.1880093812942505, + "learning_rate": 2e-05, + "loss": 0.05779697, + "step": 19715 + }, + { + "epoch": 39.432, + "grad_norm": 1.140159249305725, + "learning_rate": 2e-05, + "loss": 0.04950743, + "step": 19716 + }, + { + "epoch": 
39.434, + "grad_norm": 1.0973641872406006, + "learning_rate": 2e-05, + "loss": 0.03795099, + "step": 19717 + }, + { + "epoch": 39.436, + "grad_norm": 1.176837682723999, + "learning_rate": 2e-05, + "loss": 0.04808137, + "step": 19718 + }, + { + "epoch": 39.438, + "grad_norm": 1.2320091724395752, + "learning_rate": 2e-05, + "loss": 0.04875737, + "step": 19719 + }, + { + "epoch": 39.44, + "grad_norm": 1.197391390800476, + "learning_rate": 2e-05, + "loss": 0.03575107, + "step": 19720 + }, + { + "epoch": 39.442, + "grad_norm": 1.4280883073806763, + "learning_rate": 2e-05, + "loss": 0.05063617, + "step": 19721 + }, + { + "epoch": 39.444, + "grad_norm": 1.6574267148971558, + "learning_rate": 2e-05, + "loss": 0.04451539, + "step": 19722 + }, + { + "epoch": 39.446, + "grad_norm": 1.1137486696243286, + "learning_rate": 2e-05, + "loss": 0.03630831, + "step": 19723 + }, + { + "epoch": 39.448, + "grad_norm": 1.1743730306625366, + "learning_rate": 2e-05, + "loss": 0.04013621, + "step": 19724 + }, + { + "epoch": 39.45, + "grad_norm": 1.239919900894165, + "learning_rate": 2e-05, + "loss": 0.05531546, + "step": 19725 + }, + { + "epoch": 39.452, + "grad_norm": 1.6629247665405273, + "learning_rate": 2e-05, + "loss": 0.06353351, + "step": 19726 + }, + { + "epoch": 39.454, + "grad_norm": 1.5018807649612427, + "learning_rate": 2e-05, + "loss": 0.07370369, + "step": 19727 + }, + { + "epoch": 39.456, + "grad_norm": 1.5102373361587524, + "learning_rate": 2e-05, + "loss": 0.0483415, + "step": 19728 + }, + { + "epoch": 39.458, + "grad_norm": 1.106083631515503, + "learning_rate": 2e-05, + "loss": 0.04570149, + "step": 19729 + }, + { + "epoch": 39.46, + "grad_norm": 1.376544713973999, + "learning_rate": 2e-05, + "loss": 0.05271851, + "step": 19730 + }, + { + "epoch": 39.462, + "grad_norm": 1.1500600576400757, + "learning_rate": 2e-05, + "loss": 0.05022399, + "step": 19731 + }, + { + "epoch": 39.464, + "grad_norm": 1.136629343032837, + "learning_rate": 2e-05, + "loss": 0.04801833, + "step": 19732 + }, + { + "epoch": 39.466, + "grad_norm": 1.6498074531555176, + "learning_rate": 2e-05, + "loss": 0.06893224, + "step": 19733 + }, + { + "epoch": 39.468, + "grad_norm": 1.306951642036438, + "learning_rate": 2e-05, + "loss": 0.06067065, + "step": 19734 + }, + { + "epoch": 39.47, + "grad_norm": 1.1913270950317383, + "learning_rate": 2e-05, + "loss": 0.03944825, + "step": 19735 + }, + { + "epoch": 39.472, + "grad_norm": 1.2636955976486206, + "learning_rate": 2e-05, + "loss": 0.04280094, + "step": 19736 + }, + { + "epoch": 39.474, + "grad_norm": 1.0089428424835205, + "learning_rate": 2e-05, + "loss": 0.03867158, + "step": 19737 + }, + { + "epoch": 39.476, + "grad_norm": 1.6654586791992188, + "learning_rate": 2e-05, + "loss": 0.06643804, + "step": 19738 + }, + { + "epoch": 39.478, + "grad_norm": 2.2843704223632812, + "learning_rate": 2e-05, + "loss": 0.05996677, + "step": 19739 + }, + { + "epoch": 39.48, + "grad_norm": 1.0888022184371948, + "learning_rate": 2e-05, + "loss": 0.0362496, + "step": 19740 + }, + { + "epoch": 39.482, + "grad_norm": 1.196286916732788, + "learning_rate": 2e-05, + "loss": 0.04796268, + "step": 19741 + }, + { + "epoch": 39.484, + "grad_norm": 1.6246408224105835, + "learning_rate": 2e-05, + "loss": 0.06164134, + "step": 19742 + }, + { + "epoch": 39.486, + "grad_norm": 1.4118708372116089, + "learning_rate": 2e-05, + "loss": 0.06805681, + "step": 19743 + }, + { + "epoch": 39.488, + "grad_norm": 1.5130550861358643, + "learning_rate": 2e-05, + "loss": 0.0542859, + "step": 19744 + }, + { + "epoch": 39.49, + 
"grad_norm": 0.8534611463546753, + "learning_rate": 2e-05, + "loss": 0.02287557, + "step": 19745 + }, + { + "epoch": 39.492, + "grad_norm": 1.087668538093567, + "learning_rate": 2e-05, + "loss": 0.03380512, + "step": 19746 + }, + { + "epoch": 39.494, + "grad_norm": 1.2685211896896362, + "learning_rate": 2e-05, + "loss": 0.05318061, + "step": 19747 + }, + { + "epoch": 39.496, + "grad_norm": 1.2841920852661133, + "learning_rate": 2e-05, + "loss": 0.05055482, + "step": 19748 + }, + { + "epoch": 39.498, + "grad_norm": 1.1762704849243164, + "learning_rate": 2e-05, + "loss": 0.04554278, + "step": 19749 + }, + { + "epoch": 39.5, + "grad_norm": 3.7946743965148926, + "learning_rate": 2e-05, + "loss": 0.03693722, + "step": 19750 + }, + { + "epoch": 39.502, + "grad_norm": 1.1349517107009888, + "learning_rate": 2e-05, + "loss": 0.02860119, + "step": 19751 + }, + { + "epoch": 39.504, + "grad_norm": 2.01347017288208, + "learning_rate": 2e-05, + "loss": 0.04260724, + "step": 19752 + }, + { + "epoch": 39.506, + "grad_norm": 1.267555594444275, + "learning_rate": 2e-05, + "loss": 0.05181779, + "step": 19753 + }, + { + "epoch": 39.508, + "grad_norm": 1.1315144300460815, + "learning_rate": 2e-05, + "loss": 0.04263924, + "step": 19754 + }, + { + "epoch": 39.51, + "grad_norm": 2.0047526359558105, + "learning_rate": 2e-05, + "loss": 0.04533667, + "step": 19755 + }, + { + "epoch": 39.512, + "grad_norm": 1.342085361480713, + "learning_rate": 2e-05, + "loss": 0.04838677, + "step": 19756 + }, + { + "epoch": 39.514, + "grad_norm": 1.331516146659851, + "learning_rate": 2e-05, + "loss": 0.06132433, + "step": 19757 + }, + { + "epoch": 39.516, + "grad_norm": 1.1576191186904907, + "learning_rate": 2e-05, + "loss": 0.04208809, + "step": 19758 + }, + { + "epoch": 39.518, + "grad_norm": 0.9372735023498535, + "learning_rate": 2e-05, + "loss": 0.03229483, + "step": 19759 + }, + { + "epoch": 39.52, + "grad_norm": 2.456406831741333, + "learning_rate": 2e-05, + "loss": 0.05844424, + "step": 19760 + }, + { + "epoch": 39.522, + "grad_norm": 1.2554352283477783, + "learning_rate": 2e-05, + "loss": 0.05017581, + "step": 19761 + }, + { + "epoch": 39.524, + "grad_norm": 1.6045249700546265, + "learning_rate": 2e-05, + "loss": 0.06052469, + "step": 19762 + }, + { + "epoch": 39.526, + "grad_norm": 1.3831950426101685, + "learning_rate": 2e-05, + "loss": 0.05542463, + "step": 19763 + }, + { + "epoch": 39.528, + "grad_norm": 1.6362686157226562, + "learning_rate": 2e-05, + "loss": 0.05046823, + "step": 19764 + }, + { + "epoch": 39.53, + "grad_norm": 1.8844678401947021, + "learning_rate": 2e-05, + "loss": 0.05892744, + "step": 19765 + }, + { + "epoch": 39.532, + "grad_norm": 1.026892066001892, + "learning_rate": 2e-05, + "loss": 0.04069612, + "step": 19766 + }, + { + "epoch": 39.534, + "grad_norm": 1.6162285804748535, + "learning_rate": 2e-05, + "loss": 0.04750034, + "step": 19767 + }, + { + "epoch": 39.536, + "grad_norm": 1.3901042938232422, + "learning_rate": 2e-05, + "loss": 0.05620121, + "step": 19768 + }, + { + "epoch": 39.538, + "grad_norm": 1.1447288990020752, + "learning_rate": 2e-05, + "loss": 0.04215189, + "step": 19769 + }, + { + "epoch": 39.54, + "grad_norm": 2.04343843460083, + "learning_rate": 2e-05, + "loss": 0.04056927, + "step": 19770 + }, + { + "epoch": 39.542, + "grad_norm": 1.376480221748352, + "learning_rate": 2e-05, + "loss": 0.04138695, + "step": 19771 + }, + { + "epoch": 39.544, + "grad_norm": 1.5023736953735352, + "learning_rate": 2e-05, + "loss": 0.04907219, + "step": 19772 + }, + { + "epoch": 39.546, + "grad_norm": 
2.4118905067443848, + "learning_rate": 2e-05, + "loss": 0.05148922, + "step": 19773 + }, + { + "epoch": 39.548, + "grad_norm": 3.517075300216675, + "learning_rate": 2e-05, + "loss": 0.05087829, + "step": 19774 + }, + { + "epoch": 39.55, + "grad_norm": 1.0423048734664917, + "learning_rate": 2e-05, + "loss": 0.03624145, + "step": 19775 + }, + { + "epoch": 39.552, + "grad_norm": 1.3357534408569336, + "learning_rate": 2e-05, + "loss": 0.04405185, + "step": 19776 + }, + { + "epoch": 39.554, + "grad_norm": 1.2442307472229004, + "learning_rate": 2e-05, + "loss": 0.04776893, + "step": 19777 + }, + { + "epoch": 39.556, + "grad_norm": 1.0064873695373535, + "learning_rate": 2e-05, + "loss": 0.03839774, + "step": 19778 + }, + { + "epoch": 39.558, + "grad_norm": 1.5154045820236206, + "learning_rate": 2e-05, + "loss": 0.04444873, + "step": 19779 + }, + { + "epoch": 39.56, + "grad_norm": 1.2269160747528076, + "learning_rate": 2e-05, + "loss": 0.04485428, + "step": 19780 + }, + { + "epoch": 39.562, + "grad_norm": 1.1330827474594116, + "learning_rate": 2e-05, + "loss": 0.04535697, + "step": 19781 + }, + { + "epoch": 39.564, + "grad_norm": 1.6141606569290161, + "learning_rate": 2e-05, + "loss": 0.04330592, + "step": 19782 + }, + { + "epoch": 39.566, + "grad_norm": 1.2627594470977783, + "learning_rate": 2e-05, + "loss": 0.03273316, + "step": 19783 + }, + { + "epoch": 39.568, + "grad_norm": 1.0792535543441772, + "learning_rate": 2e-05, + "loss": 0.0406695, + "step": 19784 + }, + { + "epoch": 39.57, + "grad_norm": 1.397845983505249, + "learning_rate": 2e-05, + "loss": 0.05157507, + "step": 19785 + }, + { + "epoch": 39.572, + "grad_norm": 2.2438647747039795, + "learning_rate": 2e-05, + "loss": 0.067011, + "step": 19786 + }, + { + "epoch": 39.574, + "grad_norm": 1.2847806215286255, + "learning_rate": 2e-05, + "loss": 0.03849096, + "step": 19787 + }, + { + "epoch": 39.576, + "grad_norm": 1.204473614692688, + "learning_rate": 2e-05, + "loss": 0.05438119, + "step": 19788 + }, + { + "epoch": 39.578, + "grad_norm": 1.9760804176330566, + "learning_rate": 2e-05, + "loss": 0.05682136, + "step": 19789 + }, + { + "epoch": 39.58, + "grad_norm": 1.181208610534668, + "learning_rate": 2e-05, + "loss": 0.0492033, + "step": 19790 + }, + { + "epoch": 39.582, + "grad_norm": 1.273768424987793, + "learning_rate": 2e-05, + "loss": 0.04544124, + "step": 19791 + }, + { + "epoch": 39.584, + "grad_norm": 1.5114247798919678, + "learning_rate": 2e-05, + "loss": 0.06623958, + "step": 19792 + }, + { + "epoch": 39.586, + "grad_norm": 1.1232013702392578, + "learning_rate": 2e-05, + "loss": 0.04789654, + "step": 19793 + }, + { + "epoch": 39.588, + "grad_norm": 1.1755037307739258, + "learning_rate": 2e-05, + "loss": 0.050361, + "step": 19794 + }, + { + "epoch": 39.59, + "grad_norm": 1.6072686910629272, + "learning_rate": 2e-05, + "loss": 0.06229908, + "step": 19795 + }, + { + "epoch": 39.592, + "grad_norm": 1.416585087776184, + "learning_rate": 2e-05, + "loss": 0.04778691, + "step": 19796 + }, + { + "epoch": 39.594, + "grad_norm": 1.3451114892959595, + "learning_rate": 2e-05, + "loss": 0.05900617, + "step": 19797 + }, + { + "epoch": 39.596, + "grad_norm": 1.1347873210906982, + "learning_rate": 2e-05, + "loss": 0.04485462, + "step": 19798 + }, + { + "epoch": 39.598, + "grad_norm": 1.158407211303711, + "learning_rate": 2e-05, + "loss": 0.04194099, + "step": 19799 + }, + { + "epoch": 39.6, + "grad_norm": 1.222896695137024, + "learning_rate": 2e-05, + "loss": 0.04154717, + "step": 19800 + }, + { + "epoch": 39.602, + "grad_norm": 1.1443710327148438, 
+ "learning_rate": 2e-05, + "loss": 0.04118493, + "step": 19801 + }, + { + "epoch": 39.604, + "grad_norm": 1.2057150602340698, + "learning_rate": 2e-05, + "loss": 0.05563217, + "step": 19802 + }, + { + "epoch": 39.606, + "grad_norm": 1.192069172859192, + "learning_rate": 2e-05, + "loss": 0.0467116, + "step": 19803 + }, + { + "epoch": 39.608, + "grad_norm": 1.410652756690979, + "learning_rate": 2e-05, + "loss": 0.03325736, + "step": 19804 + }, + { + "epoch": 39.61, + "grad_norm": 1.1169686317443848, + "learning_rate": 2e-05, + "loss": 0.03927244, + "step": 19805 + }, + { + "epoch": 39.612, + "grad_norm": 1.2299331426620483, + "learning_rate": 2e-05, + "loss": 0.04283129, + "step": 19806 + }, + { + "epoch": 39.614, + "grad_norm": 1.020370602607727, + "learning_rate": 2e-05, + "loss": 0.03461327, + "step": 19807 + }, + { + "epoch": 39.616, + "grad_norm": 1.2263249158859253, + "learning_rate": 2e-05, + "loss": 0.05317787, + "step": 19808 + }, + { + "epoch": 39.618, + "grad_norm": 2.3587546348571777, + "learning_rate": 2e-05, + "loss": 0.04611384, + "step": 19809 + }, + { + "epoch": 39.62, + "grad_norm": 1.2532998323440552, + "learning_rate": 2e-05, + "loss": 0.04584979, + "step": 19810 + }, + { + "epoch": 39.622, + "grad_norm": 0.9980117082595825, + "learning_rate": 2e-05, + "loss": 0.03598379, + "step": 19811 + }, + { + "epoch": 39.624, + "grad_norm": 1.3296000957489014, + "learning_rate": 2e-05, + "loss": 0.06023554, + "step": 19812 + }, + { + "epoch": 39.626, + "grad_norm": 1.3050363063812256, + "learning_rate": 2e-05, + "loss": 0.03866673, + "step": 19813 + }, + { + "epoch": 39.628, + "grad_norm": 1.0335310697555542, + "learning_rate": 2e-05, + "loss": 0.03773557, + "step": 19814 + }, + { + "epoch": 39.63, + "grad_norm": 1.102943778038025, + "learning_rate": 2e-05, + "loss": 0.05241087, + "step": 19815 + }, + { + "epoch": 39.632, + "grad_norm": 2.9029152393341064, + "learning_rate": 2e-05, + "loss": 0.06153958, + "step": 19816 + }, + { + "epoch": 39.634, + "grad_norm": 1.207022786140442, + "learning_rate": 2e-05, + "loss": 0.0496684, + "step": 19817 + }, + { + "epoch": 39.636, + "grad_norm": 1.147627353668213, + "learning_rate": 2e-05, + "loss": 0.04096475, + "step": 19818 + }, + { + "epoch": 39.638, + "grad_norm": 1.1633808612823486, + "learning_rate": 2e-05, + "loss": 0.0518549, + "step": 19819 + }, + { + "epoch": 39.64, + "grad_norm": 2.9582087993621826, + "learning_rate": 2e-05, + "loss": 0.04467195, + "step": 19820 + }, + { + "epoch": 39.642, + "grad_norm": 1.5623703002929688, + "learning_rate": 2e-05, + "loss": 0.04819065, + "step": 19821 + }, + { + "epoch": 39.644, + "grad_norm": 1.2664448022842407, + "learning_rate": 2e-05, + "loss": 0.04783477, + "step": 19822 + }, + { + "epoch": 39.646, + "grad_norm": 1.313110589981079, + "learning_rate": 2e-05, + "loss": 0.03875414, + "step": 19823 + }, + { + "epoch": 39.648, + "grad_norm": 1.150399923324585, + "learning_rate": 2e-05, + "loss": 0.05495393, + "step": 19824 + }, + { + "epoch": 39.65, + "grad_norm": 1.273444414138794, + "learning_rate": 2e-05, + "loss": 0.05313744, + "step": 19825 + }, + { + "epoch": 39.652, + "grad_norm": 1.288045048713684, + "learning_rate": 2e-05, + "loss": 0.05415475, + "step": 19826 + }, + { + "epoch": 39.654, + "grad_norm": 1.1999372243881226, + "learning_rate": 2e-05, + "loss": 0.04567619, + "step": 19827 + }, + { + "epoch": 39.656, + "grad_norm": 1.0808748006820679, + "learning_rate": 2e-05, + "loss": 0.04751875, + "step": 19828 + }, + { + "epoch": 39.658, + "grad_norm": 1.122815489768982, + 
"learning_rate": 2e-05, + "loss": 0.0411781, + "step": 19829 + }, + { + "epoch": 39.66, + "grad_norm": 1.6972079277038574, + "learning_rate": 2e-05, + "loss": 0.05976425, + "step": 19830 + }, + { + "epoch": 39.662, + "grad_norm": 1.0641003847122192, + "learning_rate": 2e-05, + "loss": 0.0371467, + "step": 19831 + }, + { + "epoch": 39.664, + "grad_norm": 1.3201067447662354, + "learning_rate": 2e-05, + "loss": 0.04569467, + "step": 19832 + }, + { + "epoch": 39.666, + "grad_norm": 1.5276159048080444, + "learning_rate": 2e-05, + "loss": 0.04464874, + "step": 19833 + }, + { + "epoch": 39.668, + "grad_norm": 1.1895759105682373, + "learning_rate": 2e-05, + "loss": 0.04681417, + "step": 19834 + }, + { + "epoch": 39.67, + "grad_norm": 1.2756211757659912, + "learning_rate": 2e-05, + "loss": 0.04865978, + "step": 19835 + }, + { + "epoch": 39.672, + "grad_norm": 1.315328598022461, + "learning_rate": 2e-05, + "loss": 0.07096517, + "step": 19836 + }, + { + "epoch": 39.674, + "grad_norm": 1.2672423124313354, + "learning_rate": 2e-05, + "loss": 0.04316231, + "step": 19837 + }, + { + "epoch": 39.676, + "grad_norm": 1.077695369720459, + "learning_rate": 2e-05, + "loss": 0.03674557, + "step": 19838 + }, + { + "epoch": 39.678, + "grad_norm": 1.8062963485717773, + "learning_rate": 2e-05, + "loss": 0.05743162, + "step": 19839 + }, + { + "epoch": 39.68, + "grad_norm": 1.1730942726135254, + "learning_rate": 2e-05, + "loss": 0.03555157, + "step": 19840 + }, + { + "epoch": 39.682, + "grad_norm": 1.0135302543640137, + "learning_rate": 2e-05, + "loss": 0.02771145, + "step": 19841 + }, + { + "epoch": 39.684, + "grad_norm": 1.5072895288467407, + "learning_rate": 2e-05, + "loss": 0.06209271, + "step": 19842 + }, + { + "epoch": 39.686, + "grad_norm": 1.412246823310852, + "learning_rate": 2e-05, + "loss": 0.04875721, + "step": 19843 + }, + { + "epoch": 39.688, + "grad_norm": 1.330792784690857, + "learning_rate": 2e-05, + "loss": 0.05368386, + "step": 19844 + }, + { + "epoch": 39.69, + "grad_norm": 1.3758635520935059, + "learning_rate": 2e-05, + "loss": 0.05050598, + "step": 19845 + }, + { + "epoch": 39.692, + "grad_norm": 1.2462283372879028, + "learning_rate": 2e-05, + "loss": 0.0519301, + "step": 19846 + }, + { + "epoch": 39.694, + "grad_norm": 1.4314063787460327, + "learning_rate": 2e-05, + "loss": 0.04765672, + "step": 19847 + }, + { + "epoch": 39.696, + "grad_norm": 1.3259481191635132, + "learning_rate": 2e-05, + "loss": 0.05249347, + "step": 19848 + }, + { + "epoch": 39.698, + "grad_norm": 1.4222193956375122, + "learning_rate": 2e-05, + "loss": 0.0517953, + "step": 19849 + }, + { + "epoch": 39.7, + "grad_norm": 1.3300631046295166, + "learning_rate": 2e-05, + "loss": 0.05170576, + "step": 19850 + }, + { + "epoch": 39.702, + "grad_norm": 1.3391106128692627, + "learning_rate": 2e-05, + "loss": 0.05839144, + "step": 19851 + }, + { + "epoch": 39.704, + "grad_norm": 0.9856824278831482, + "learning_rate": 2e-05, + "loss": 0.03652864, + "step": 19852 + }, + { + "epoch": 39.706, + "grad_norm": 1.2734506130218506, + "learning_rate": 2e-05, + "loss": 0.05710208, + "step": 19853 + }, + { + "epoch": 39.708, + "grad_norm": 1.9725619554519653, + "learning_rate": 2e-05, + "loss": 0.05481605, + "step": 19854 + }, + { + "epoch": 39.71, + "grad_norm": 1.1700278520584106, + "learning_rate": 2e-05, + "loss": 0.05501316, + "step": 19855 + }, + { + "epoch": 39.712, + "grad_norm": 0.8477627635002136, + "learning_rate": 2e-05, + "loss": 0.02406826, + "step": 19856 + }, + { + "epoch": 39.714, + "grad_norm": 1.3238353729248047, + 
"learning_rate": 2e-05, + "loss": 0.04450976, + "step": 19857 + }, + { + "epoch": 39.716, + "grad_norm": 2.597429037094116, + "learning_rate": 2e-05, + "loss": 0.04625923, + "step": 19858 + }, + { + "epoch": 39.718, + "grad_norm": 1.2103281021118164, + "learning_rate": 2e-05, + "loss": 0.05398448, + "step": 19859 + }, + { + "epoch": 39.72, + "grad_norm": 1.249700665473938, + "learning_rate": 2e-05, + "loss": 0.05766566, + "step": 19860 + }, + { + "epoch": 39.722, + "grad_norm": 1.2194814682006836, + "learning_rate": 2e-05, + "loss": 0.05054, + "step": 19861 + }, + { + "epoch": 39.724, + "grad_norm": 1.3230112791061401, + "learning_rate": 2e-05, + "loss": 0.04640412, + "step": 19862 + }, + { + "epoch": 39.726, + "grad_norm": 1.38727867603302, + "learning_rate": 2e-05, + "loss": 0.04579895, + "step": 19863 + }, + { + "epoch": 39.728, + "grad_norm": 1.1224366426467896, + "learning_rate": 2e-05, + "loss": 0.03637297, + "step": 19864 + }, + { + "epoch": 39.73, + "grad_norm": 1.8750441074371338, + "learning_rate": 2e-05, + "loss": 0.06596401, + "step": 19865 + }, + { + "epoch": 39.732, + "grad_norm": 1.2330036163330078, + "learning_rate": 2e-05, + "loss": 0.06313409, + "step": 19866 + }, + { + "epoch": 39.734, + "grad_norm": 1.1567729711532593, + "learning_rate": 2e-05, + "loss": 0.03799208, + "step": 19867 + }, + { + "epoch": 39.736, + "grad_norm": 1.8047136068344116, + "learning_rate": 2e-05, + "loss": 0.05676999, + "step": 19868 + }, + { + "epoch": 39.738, + "grad_norm": 1.2310786247253418, + "learning_rate": 2e-05, + "loss": 0.04846741, + "step": 19869 + }, + { + "epoch": 39.74, + "grad_norm": 1.0939598083496094, + "learning_rate": 2e-05, + "loss": 0.05183221, + "step": 19870 + }, + { + "epoch": 39.742, + "grad_norm": 1.2205032110214233, + "learning_rate": 2e-05, + "loss": 0.05188471, + "step": 19871 + }, + { + "epoch": 39.744, + "grad_norm": 1.108619213104248, + "learning_rate": 2e-05, + "loss": 0.05545724, + "step": 19872 + }, + { + "epoch": 39.746, + "grad_norm": 1.2437492609024048, + "learning_rate": 2e-05, + "loss": 0.057475, + "step": 19873 + }, + { + "epoch": 39.748, + "grad_norm": 1.030137300491333, + "learning_rate": 2e-05, + "loss": 0.03131631, + "step": 19874 + }, + { + "epoch": 39.75, + "grad_norm": 1.290551781654358, + "learning_rate": 2e-05, + "loss": 0.05823213, + "step": 19875 + }, + { + "epoch": 39.752, + "grad_norm": 1.6363590955734253, + "learning_rate": 2e-05, + "loss": 0.05555134, + "step": 19876 + }, + { + "epoch": 39.754, + "grad_norm": 2.338663101196289, + "learning_rate": 2e-05, + "loss": 0.05460815, + "step": 19877 + }, + { + "epoch": 39.756, + "grad_norm": 1.5691533088684082, + "learning_rate": 2e-05, + "loss": 0.05019718, + "step": 19878 + }, + { + "epoch": 39.758, + "grad_norm": 1.2325677871704102, + "learning_rate": 2e-05, + "loss": 0.04595042, + "step": 19879 + }, + { + "epoch": 39.76, + "grad_norm": 1.2312854528427124, + "learning_rate": 2e-05, + "loss": 0.03850013, + "step": 19880 + }, + { + "epoch": 39.762, + "grad_norm": 1.3569062948226929, + "learning_rate": 2e-05, + "loss": 0.05348967, + "step": 19881 + }, + { + "epoch": 39.764, + "grad_norm": 2.510478973388672, + "learning_rate": 2e-05, + "loss": 0.05543127, + "step": 19882 + }, + { + "epoch": 39.766, + "grad_norm": 1.1178513765335083, + "learning_rate": 2e-05, + "loss": 0.05213483, + "step": 19883 + }, + { + "epoch": 39.768, + "grad_norm": 1.2882338762283325, + "learning_rate": 2e-05, + "loss": 0.06439535, + "step": 19884 + }, + { + "epoch": 39.77, + "grad_norm": 1.2819217443466187, + "learning_rate": 
2e-05, + "loss": 0.06040879, + "step": 19885 + }, + { + "epoch": 39.772, + "grad_norm": 1.1934309005737305, + "learning_rate": 2e-05, + "loss": 0.04919674, + "step": 19886 + }, + { + "epoch": 39.774, + "grad_norm": 1.0528241395950317, + "learning_rate": 2e-05, + "loss": 0.03462794, + "step": 19887 + }, + { + "epoch": 39.776, + "grad_norm": 1.1237131357192993, + "learning_rate": 2e-05, + "loss": 0.03184299, + "step": 19888 + }, + { + "epoch": 39.778, + "grad_norm": 1.4120211601257324, + "learning_rate": 2e-05, + "loss": 0.05772817, + "step": 19889 + }, + { + "epoch": 39.78, + "grad_norm": 1.1867109537124634, + "learning_rate": 2e-05, + "loss": 0.04693531, + "step": 19890 + }, + { + "epoch": 39.782, + "grad_norm": 1.1656479835510254, + "learning_rate": 2e-05, + "loss": 0.03705209, + "step": 19891 + }, + { + "epoch": 39.784, + "grad_norm": 1.0704561471939087, + "learning_rate": 2e-05, + "loss": 0.04141489, + "step": 19892 + }, + { + "epoch": 39.786, + "grad_norm": 1.3049782514572144, + "learning_rate": 2e-05, + "loss": 0.06903137, + "step": 19893 + }, + { + "epoch": 39.788, + "grad_norm": 1.3069772720336914, + "learning_rate": 2e-05, + "loss": 0.0366349, + "step": 19894 + }, + { + "epoch": 39.79, + "grad_norm": 1.3293408155441284, + "learning_rate": 2e-05, + "loss": 0.05534213, + "step": 19895 + }, + { + "epoch": 39.792, + "grad_norm": 1.7149981260299683, + "learning_rate": 2e-05, + "loss": 0.06210932, + "step": 19896 + }, + { + "epoch": 39.794, + "grad_norm": 1.0320147275924683, + "learning_rate": 2e-05, + "loss": 0.04126735, + "step": 19897 + }, + { + "epoch": 39.796, + "grad_norm": 1.9870035648345947, + "learning_rate": 2e-05, + "loss": 0.05784594, + "step": 19898 + }, + { + "epoch": 39.798, + "grad_norm": 0.9793746471405029, + "learning_rate": 2e-05, + "loss": 0.03283355, + "step": 19899 + }, + { + "epoch": 39.8, + "grad_norm": 1.228532075881958, + "learning_rate": 2e-05, + "loss": 0.04221442, + "step": 19900 + }, + { + "epoch": 39.802, + "grad_norm": 1.2652114629745483, + "learning_rate": 2e-05, + "loss": 0.05898855, + "step": 19901 + }, + { + "epoch": 39.804, + "grad_norm": 1.1174923181533813, + "learning_rate": 2e-05, + "loss": 0.04276735, + "step": 19902 + }, + { + "epoch": 39.806, + "grad_norm": 1.0496095418930054, + "learning_rate": 2e-05, + "loss": 0.04212025, + "step": 19903 + }, + { + "epoch": 39.808, + "grad_norm": 1.0706851482391357, + "learning_rate": 2e-05, + "loss": 0.03774155, + "step": 19904 + }, + { + "epoch": 39.81, + "grad_norm": 1.1920790672302246, + "learning_rate": 2e-05, + "loss": 0.04768289, + "step": 19905 + }, + { + "epoch": 39.812, + "grad_norm": 1.502715826034546, + "learning_rate": 2e-05, + "loss": 0.04195279, + "step": 19906 + }, + { + "epoch": 39.814, + "grad_norm": 1.9300915002822876, + "learning_rate": 2e-05, + "loss": 0.04984208, + "step": 19907 + }, + { + "epoch": 39.816, + "grad_norm": 2.316269636154175, + "learning_rate": 2e-05, + "loss": 0.04897273, + "step": 19908 + }, + { + "epoch": 39.818, + "grad_norm": 1.250138759613037, + "learning_rate": 2e-05, + "loss": 0.04602588, + "step": 19909 + }, + { + "epoch": 39.82, + "grad_norm": 1.653391718864441, + "learning_rate": 2e-05, + "loss": 0.06034857, + "step": 19910 + }, + { + "epoch": 39.822, + "grad_norm": 1.1049057245254517, + "learning_rate": 2e-05, + "loss": 0.03826465, + "step": 19911 + }, + { + "epoch": 39.824, + "grad_norm": 1.1933602094650269, + "learning_rate": 2e-05, + "loss": 0.03310779, + "step": 19912 + }, + { + "epoch": 39.826, + "grad_norm": 3.675931930541992, + "learning_rate": 2e-05, + 
"loss": 0.05223902, + "step": 19913 + }, + { + "epoch": 39.828, + "grad_norm": 1.1655473709106445, + "learning_rate": 2e-05, + "loss": 0.0628321, + "step": 19914 + }, + { + "epoch": 39.83, + "grad_norm": 1.7604655027389526, + "learning_rate": 2e-05, + "loss": 0.06078061, + "step": 19915 + }, + { + "epoch": 39.832, + "grad_norm": 1.100632667541504, + "learning_rate": 2e-05, + "loss": 0.04568919, + "step": 19916 + }, + { + "epoch": 39.834, + "grad_norm": 1.2774721384048462, + "learning_rate": 2e-05, + "loss": 0.05435498, + "step": 19917 + }, + { + "epoch": 39.836, + "grad_norm": 1.217751145362854, + "learning_rate": 2e-05, + "loss": 0.05859322, + "step": 19918 + }, + { + "epoch": 39.838, + "grad_norm": 1.317592978477478, + "learning_rate": 2e-05, + "loss": 0.05665311, + "step": 19919 + }, + { + "epoch": 39.84, + "grad_norm": 0.949860692024231, + "learning_rate": 2e-05, + "loss": 0.02791356, + "step": 19920 + }, + { + "epoch": 39.842, + "grad_norm": 1.1154917478561401, + "learning_rate": 2e-05, + "loss": 0.05110167, + "step": 19921 + }, + { + "epoch": 39.844, + "grad_norm": 1.161690354347229, + "learning_rate": 2e-05, + "loss": 0.0497525, + "step": 19922 + }, + { + "epoch": 39.846, + "grad_norm": 1.190811038017273, + "learning_rate": 2e-05, + "loss": 0.0368493, + "step": 19923 + }, + { + "epoch": 39.848, + "grad_norm": 1.6861010789871216, + "learning_rate": 2e-05, + "loss": 0.05476474, + "step": 19924 + }, + { + "epoch": 39.85, + "grad_norm": 1.9949997663497925, + "learning_rate": 2e-05, + "loss": 0.0414875, + "step": 19925 + }, + { + "epoch": 39.852, + "grad_norm": 1.2739412784576416, + "learning_rate": 2e-05, + "loss": 0.05026732, + "step": 19926 + }, + { + "epoch": 39.854, + "grad_norm": 1.3758777379989624, + "learning_rate": 2e-05, + "loss": 0.06855834, + "step": 19927 + }, + { + "epoch": 39.856, + "grad_norm": 1.577846884727478, + "learning_rate": 2e-05, + "loss": 0.05498583, + "step": 19928 + }, + { + "epoch": 39.858, + "grad_norm": 1.269954800605774, + "learning_rate": 2e-05, + "loss": 0.0476332, + "step": 19929 + }, + { + "epoch": 39.86, + "grad_norm": 0.9598180055618286, + "learning_rate": 2e-05, + "loss": 0.04460917, + "step": 19930 + }, + { + "epoch": 39.862, + "grad_norm": 1.1671171188354492, + "learning_rate": 2e-05, + "loss": 0.04713274, + "step": 19931 + }, + { + "epoch": 39.864, + "grad_norm": 1.8716930150985718, + "learning_rate": 2e-05, + "loss": 0.0466135, + "step": 19932 + }, + { + "epoch": 39.866, + "grad_norm": 1.3691669702529907, + "learning_rate": 2e-05, + "loss": 0.04337586, + "step": 19933 + }, + { + "epoch": 39.868, + "grad_norm": 1.1902244091033936, + "learning_rate": 2e-05, + "loss": 0.04896775, + "step": 19934 + }, + { + "epoch": 39.87, + "grad_norm": 1.0358586311340332, + "learning_rate": 2e-05, + "loss": 0.04044461, + "step": 19935 + }, + { + "epoch": 39.872, + "grad_norm": 1.4702568054199219, + "learning_rate": 2e-05, + "loss": 0.06271239, + "step": 19936 + }, + { + "epoch": 39.874, + "grad_norm": 1.0855350494384766, + "learning_rate": 2e-05, + "loss": 0.03931961, + "step": 19937 + }, + { + "epoch": 39.876, + "grad_norm": 1.1319669485092163, + "learning_rate": 2e-05, + "loss": 0.03855097, + "step": 19938 + }, + { + "epoch": 39.878, + "grad_norm": 1.354026436805725, + "learning_rate": 2e-05, + "loss": 0.06375487, + "step": 19939 + }, + { + "epoch": 39.88, + "grad_norm": 0.9435744285583496, + "learning_rate": 2e-05, + "loss": 0.03325384, + "step": 19940 + }, + { + "epoch": 39.882, + "grad_norm": 1.2593286037445068, + "learning_rate": 2e-05, + "loss": 0.05153989, 
+ "step": 19941 + }, + { + "epoch": 39.884, + "grad_norm": 0.9804494976997375, + "learning_rate": 2e-05, + "loss": 0.03355004, + "step": 19942 + }, + { + "epoch": 39.886, + "grad_norm": 1.18596351146698, + "learning_rate": 2e-05, + "loss": 0.0458352, + "step": 19943 + }, + { + "epoch": 39.888, + "grad_norm": 1.5126147270202637, + "learning_rate": 2e-05, + "loss": 0.05285957, + "step": 19944 + }, + { + "epoch": 39.89, + "grad_norm": 1.1133012771606445, + "learning_rate": 2e-05, + "loss": 0.03969733, + "step": 19945 + }, + { + "epoch": 39.892, + "grad_norm": 2.6804747581481934, + "learning_rate": 2e-05, + "loss": 0.06336376, + "step": 19946 + }, + { + "epoch": 39.894, + "grad_norm": 3.003247022628784, + "learning_rate": 2e-05, + "loss": 0.04598739, + "step": 19947 + }, + { + "epoch": 39.896, + "grad_norm": 1.1411057710647583, + "learning_rate": 2e-05, + "loss": 0.03566032, + "step": 19948 + }, + { + "epoch": 39.898, + "grad_norm": 1.0855693817138672, + "learning_rate": 2e-05, + "loss": 0.03875684, + "step": 19949 + }, + { + "epoch": 39.9, + "grad_norm": 1.9709842205047607, + "learning_rate": 2e-05, + "loss": 0.05917383, + "step": 19950 + }, + { + "epoch": 39.902, + "grad_norm": 1.1566156148910522, + "learning_rate": 2e-05, + "loss": 0.05174723, + "step": 19951 + }, + { + "epoch": 39.904, + "grad_norm": 1.0482864379882812, + "learning_rate": 2e-05, + "loss": 0.04446269, + "step": 19952 + }, + { + "epoch": 39.906, + "grad_norm": 1.2975873947143555, + "learning_rate": 2e-05, + "loss": 0.04888238, + "step": 19953 + }, + { + "epoch": 39.908, + "grad_norm": 1.3561941385269165, + "learning_rate": 2e-05, + "loss": 0.05117349, + "step": 19954 + }, + { + "epoch": 39.91, + "grad_norm": 0.9680836200714111, + "learning_rate": 2e-05, + "loss": 0.03723207, + "step": 19955 + }, + { + "epoch": 39.912, + "grad_norm": 1.2413675785064697, + "learning_rate": 2e-05, + "loss": 0.05808984, + "step": 19956 + }, + { + "epoch": 39.914, + "grad_norm": 1.634905219078064, + "learning_rate": 2e-05, + "loss": 0.05397768, + "step": 19957 + }, + { + "epoch": 39.916, + "grad_norm": 1.1176763772964478, + "learning_rate": 2e-05, + "loss": 0.04666106, + "step": 19958 + }, + { + "epoch": 39.918, + "grad_norm": 1.37368643283844, + "learning_rate": 2e-05, + "loss": 0.04420487, + "step": 19959 + }, + { + "epoch": 39.92, + "grad_norm": 1.1115273237228394, + "learning_rate": 2e-05, + "loss": 0.04129547, + "step": 19960 + }, + { + "epoch": 39.922, + "grad_norm": 1.0844300985336304, + "learning_rate": 2e-05, + "loss": 0.02857055, + "step": 19961 + }, + { + "epoch": 39.924, + "grad_norm": 2.0654237270355225, + "learning_rate": 2e-05, + "loss": 0.05185968, + "step": 19962 + }, + { + "epoch": 39.926, + "grad_norm": 1.4213045835494995, + "learning_rate": 2e-05, + "loss": 0.03576629, + "step": 19963 + }, + { + "epoch": 39.928, + "grad_norm": 1.399935007095337, + "learning_rate": 2e-05, + "loss": 0.05327077, + "step": 19964 + }, + { + "epoch": 39.93, + "grad_norm": 1.1581571102142334, + "learning_rate": 2e-05, + "loss": 0.04553617, + "step": 19965 + }, + { + "epoch": 39.932, + "grad_norm": 2.8895864486694336, + "learning_rate": 2e-05, + "loss": 0.0493044, + "step": 19966 + }, + { + "epoch": 39.934, + "grad_norm": 1.4619674682617188, + "learning_rate": 2e-05, + "loss": 0.04714632, + "step": 19967 + }, + { + "epoch": 39.936, + "grad_norm": 1.8203679323196411, + "learning_rate": 2e-05, + "loss": 0.06075837, + "step": 19968 + }, + { + "epoch": 39.938, + "grad_norm": 1.141274333000183, + "learning_rate": 2e-05, + "loss": 0.04157832, + "step": 
19969 + }, + { + "epoch": 39.94, + "grad_norm": 1.180153727531433, + "learning_rate": 2e-05, + "loss": 0.05030315, + "step": 19970 + }, + { + "epoch": 39.942, + "grad_norm": 1.2318646907806396, + "learning_rate": 2e-05, + "loss": 0.04882877, + "step": 19971 + }, + { + "epoch": 39.944, + "grad_norm": 1.3025565147399902, + "learning_rate": 2e-05, + "loss": 0.0572556, + "step": 19972 + }, + { + "epoch": 39.946, + "grad_norm": 1.211354374885559, + "learning_rate": 2e-05, + "loss": 0.04019732, + "step": 19973 + }, + { + "epoch": 39.948, + "grad_norm": 1.2427458763122559, + "learning_rate": 2e-05, + "loss": 0.05008765, + "step": 19974 + }, + { + "epoch": 39.95, + "grad_norm": 1.2193975448608398, + "learning_rate": 2e-05, + "loss": 0.05430697, + "step": 19975 + }, + { + "epoch": 39.952, + "grad_norm": 1.3001223802566528, + "learning_rate": 2e-05, + "loss": 0.05283924, + "step": 19976 + }, + { + "epoch": 39.954, + "grad_norm": 1.1963660717010498, + "learning_rate": 2e-05, + "loss": 0.04262936, + "step": 19977 + }, + { + "epoch": 39.956, + "grad_norm": 1.0662003755569458, + "learning_rate": 2e-05, + "loss": 0.02899798, + "step": 19978 + }, + { + "epoch": 39.958, + "grad_norm": 1.1126896142959595, + "learning_rate": 2e-05, + "loss": 0.04382278, + "step": 19979 + }, + { + "epoch": 39.96, + "grad_norm": 1.4259101152420044, + "learning_rate": 2e-05, + "loss": 0.04453399, + "step": 19980 + }, + { + "epoch": 39.962, + "grad_norm": 1.2776509523391724, + "learning_rate": 2e-05, + "loss": 0.05190249, + "step": 19981 + }, + { + "epoch": 39.964, + "grad_norm": 1.6215944290161133, + "learning_rate": 2e-05, + "loss": 0.04128952, + "step": 19982 + }, + { + "epoch": 39.966, + "grad_norm": 1.6409664154052734, + "learning_rate": 2e-05, + "loss": 0.06073495, + "step": 19983 + }, + { + "epoch": 39.968, + "grad_norm": 1.4451102018356323, + "learning_rate": 2e-05, + "loss": 0.04271209, + "step": 19984 + }, + { + "epoch": 39.97, + "grad_norm": 1.8261207342147827, + "learning_rate": 2e-05, + "loss": 0.04644296, + "step": 19985 + }, + { + "epoch": 39.972, + "grad_norm": 1.125677466392517, + "learning_rate": 2e-05, + "loss": 0.0407895, + "step": 19986 + }, + { + "epoch": 39.974, + "grad_norm": 1.1863775253295898, + "learning_rate": 2e-05, + "loss": 0.04969215, + "step": 19987 + }, + { + "epoch": 39.976, + "grad_norm": 1.3171180486679077, + "learning_rate": 2e-05, + "loss": 0.04542191, + "step": 19988 + }, + { + "epoch": 39.978, + "grad_norm": 1.259590983390808, + "learning_rate": 2e-05, + "loss": 0.04899602, + "step": 19989 + }, + { + "epoch": 39.98, + "grad_norm": 1.566371202468872, + "learning_rate": 2e-05, + "loss": 0.05686514, + "step": 19990 + }, + { + "epoch": 39.982, + "grad_norm": 1.311056137084961, + "learning_rate": 2e-05, + "loss": 0.05832597, + "step": 19991 + }, + { + "epoch": 39.984, + "grad_norm": 1.8373768329620361, + "learning_rate": 2e-05, + "loss": 0.06506597, + "step": 19992 + }, + { + "epoch": 39.986, + "grad_norm": 1.1901731491088867, + "learning_rate": 2e-05, + "loss": 0.04623218, + "step": 19993 + }, + { + "epoch": 39.988, + "grad_norm": 1.0414644479751587, + "learning_rate": 2e-05, + "loss": 0.034441, + "step": 19994 + }, + { + "epoch": 39.99, + "grad_norm": 1.188096284866333, + "learning_rate": 2e-05, + "loss": 0.04996618, + "step": 19995 + }, + { + "epoch": 39.992, + "grad_norm": 0.9998059868812561, + "learning_rate": 2e-05, + "loss": 0.03086119, + "step": 19996 + }, + { + "epoch": 39.994, + "grad_norm": 1.2495959997177124, + "learning_rate": 2e-05, + "loss": 0.04241527, + "step": 19997 + }, + { 
+ "epoch": 39.996, + "grad_norm": 1.2383801937103271, + "learning_rate": 2e-05, + "loss": 0.049159, + "step": 19998 + }, + { + "epoch": 39.998, + "grad_norm": 3.58790922164917, + "learning_rate": 2e-05, + "loss": 0.05261368, + "step": 19999 + }, + { + "epoch": 40.0, + "grad_norm": 2.5835986137390137, + "learning_rate": 2e-05, + "loss": 0.05580256, + "step": 20000 + }, + { + "epoch": 40.0, + "eval_performance": { + "AngleClassification_1": 1.0, + "AngleClassification_2": 1.0, + "AngleClassification_3": 0.9840319361277445, + "Equal_1": 1.0, + "Equal_2": 0.9880239520958084, + "Equal_3": 0.9940119760479041, + "LineComparison_1": 1.0, + "LineComparison_2": 1.0, + "LineComparison_3": 0.998003992015968, + "Parallel_1": 0.9959919839679359, + "Parallel_2": 0.9939879759519038, + "Parallel_3": 0.992, + "Perpendicular_1": 0.998, + "Perpendicular_2": 0.992, + "Perpendicular_3": 0.9018036072144289, + "PointLiesOnCircle_1": 0.9979959919839679, + "PointLiesOnCircle_2": 1.0, + "PointLiesOnCircle_3": 0.99, + "PointLiesOnLine_1": 0.9979959919839679, + "PointLiesOnLine_2": 0.9959919839679359, + "PointLiesOnLine_3": 0.9860279441117764 + }, + "eval_runtime": 319.7794, + "eval_samples_per_second": 32.835, + "eval_steps_per_second": 0.657, + "step": 20000 + }, + { + "epoch": 40.002, + "grad_norm": 1.5618497133255005, + "learning_rate": 2e-05, + "loss": 0.05388392, + "step": 20001 + }, + { + "epoch": 40.004, + "grad_norm": 1.2499672174453735, + "learning_rate": 2e-05, + "loss": 0.04120894, + "step": 20002 + }, + { + "epoch": 40.006, + "grad_norm": 1.2109254598617554, + "learning_rate": 2e-05, + "loss": 0.03839901, + "step": 20003 + }, + { + "epoch": 40.008, + "grad_norm": 1.3738662004470825, + "learning_rate": 2e-05, + "loss": 0.05607092, + "step": 20004 + }, + { + "epoch": 40.01, + "grad_norm": 1.273634672164917, + "learning_rate": 2e-05, + "loss": 0.05029005, + "step": 20005 + }, + { + "epoch": 40.012, + "grad_norm": 1.2355257272720337, + "learning_rate": 2e-05, + "loss": 0.04626942, + "step": 20006 + }, + { + "epoch": 40.014, + "grad_norm": 1.1767611503601074, + "learning_rate": 2e-05, + "loss": 0.04972693, + "step": 20007 + }, + { + "epoch": 40.016, + "grad_norm": 1.1207237243652344, + "learning_rate": 2e-05, + "loss": 0.03946103, + "step": 20008 + }, + { + "epoch": 40.018, + "grad_norm": 2.209026336669922, + "learning_rate": 2e-05, + "loss": 0.05307207, + "step": 20009 + }, + { + "epoch": 40.02, + "grad_norm": 1.8616058826446533, + "learning_rate": 2e-05, + "loss": 0.05713968, + "step": 20010 + }, + { + "epoch": 40.022, + "grad_norm": 1.272833228111267, + "learning_rate": 2e-05, + "loss": 0.05387183, + "step": 20011 + }, + { + "epoch": 40.024, + "grad_norm": 1.3753788471221924, + "learning_rate": 2e-05, + "loss": 0.04635529, + "step": 20012 + }, + { + "epoch": 40.026, + "grad_norm": 1.1166011095046997, + "learning_rate": 2e-05, + "loss": 0.03984597, + "step": 20013 + }, + { + "epoch": 40.028, + "grad_norm": 1.2804628610610962, + "learning_rate": 2e-05, + "loss": 0.04099889, + "step": 20014 + }, + { + "epoch": 40.03, + "grad_norm": 1.6008166074752808, + "learning_rate": 2e-05, + "loss": 0.04193804, + "step": 20015 + }, + { + "epoch": 40.032, + "grad_norm": 1.178210735321045, + "learning_rate": 2e-05, + "loss": 0.05431996, + "step": 20016 + }, + { + "epoch": 40.034, + "grad_norm": 1.2867320775985718, + "learning_rate": 2e-05, + "loss": 0.04413334, + "step": 20017 + }, + { + "epoch": 40.036, + "grad_norm": 1.4618786573410034, + "learning_rate": 2e-05, + "loss": 0.04305667, + "step": 20018 + }, + { + "epoch": 
40.038, + "grad_norm": 1.2682616710662842, + "learning_rate": 2e-05, + "loss": 0.04806349, + "step": 20019 + }, + { + "epoch": 40.04, + "grad_norm": 1.1537147760391235, + "learning_rate": 2e-05, + "loss": 0.0417571, + "step": 20020 + }, + { + "epoch": 40.042, + "grad_norm": 0.8992504477500916, + "learning_rate": 2e-05, + "loss": 0.03431882, + "step": 20021 + }, + { + "epoch": 40.044, + "grad_norm": 2.1597468852996826, + "learning_rate": 2e-05, + "loss": 0.05409172, + "step": 20022 + }, + { + "epoch": 40.046, + "grad_norm": 1.5585830211639404, + "learning_rate": 2e-05, + "loss": 0.0792776, + "step": 20023 + }, + { + "epoch": 40.048, + "grad_norm": 1.5551427602767944, + "learning_rate": 2e-05, + "loss": 0.05392529, + "step": 20024 + }, + { + "epoch": 40.05, + "grad_norm": 1.1377781629562378, + "learning_rate": 2e-05, + "loss": 0.04901553, + "step": 20025 + }, + { + "epoch": 40.052, + "grad_norm": 1.1749012470245361, + "learning_rate": 2e-05, + "loss": 0.04288197, + "step": 20026 + }, + { + "epoch": 40.054, + "grad_norm": 1.2844057083129883, + "learning_rate": 2e-05, + "loss": 0.05152037, + "step": 20027 + }, + { + "epoch": 40.056, + "grad_norm": 0.8824015855789185, + "learning_rate": 2e-05, + "loss": 0.02397311, + "step": 20028 + }, + { + "epoch": 40.058, + "grad_norm": 1.6838172674179077, + "learning_rate": 2e-05, + "loss": 0.05278652, + "step": 20029 + }, + { + "epoch": 40.06, + "grad_norm": 1.0910221338272095, + "learning_rate": 2e-05, + "loss": 0.03923189, + "step": 20030 + }, + { + "epoch": 40.062, + "grad_norm": 1.179529070854187, + "learning_rate": 2e-05, + "loss": 0.04087804, + "step": 20031 + }, + { + "epoch": 40.064, + "grad_norm": 1.462498664855957, + "learning_rate": 2e-05, + "loss": 0.05278556, + "step": 20032 + }, + { + "epoch": 40.066, + "grad_norm": 1.1131311655044556, + "learning_rate": 2e-05, + "loss": 0.04452292, + "step": 20033 + }, + { + "epoch": 40.068, + "grad_norm": 1.4193816184997559, + "learning_rate": 2e-05, + "loss": 0.05834501, + "step": 20034 + }, + { + "epoch": 40.07, + "grad_norm": 1.2489386796951294, + "learning_rate": 2e-05, + "loss": 0.04075392, + "step": 20035 + }, + { + "epoch": 40.072, + "grad_norm": 1.0575319528579712, + "learning_rate": 2e-05, + "loss": 0.0387546, + "step": 20036 + }, + { + "epoch": 40.074, + "grad_norm": 1.1875722408294678, + "learning_rate": 2e-05, + "loss": 0.04389498, + "step": 20037 + }, + { + "epoch": 40.076, + "grad_norm": 1.1869674921035767, + "learning_rate": 2e-05, + "loss": 0.02951438, + "step": 20038 + }, + { + "epoch": 40.078, + "grad_norm": 2.2647807598114014, + "learning_rate": 2e-05, + "loss": 0.06623721, + "step": 20039 + }, + { + "epoch": 40.08, + "grad_norm": 1.8764697313308716, + "learning_rate": 2e-05, + "loss": 0.04736568, + "step": 20040 + }, + { + "epoch": 40.082, + "grad_norm": 1.6881517171859741, + "learning_rate": 2e-05, + "loss": 0.05365419, + "step": 20041 + }, + { + "epoch": 40.084, + "grad_norm": 1.8008352518081665, + "learning_rate": 2e-05, + "loss": 0.06396797, + "step": 20042 + }, + { + "epoch": 40.086, + "grad_norm": 1.4374672174453735, + "learning_rate": 2e-05, + "loss": 0.06762437, + "step": 20043 + }, + { + "epoch": 40.088, + "grad_norm": 1.4115700721740723, + "learning_rate": 2e-05, + "loss": 0.04840955, + "step": 20044 + }, + { + "epoch": 40.09, + "grad_norm": 1.0511454343795776, + "learning_rate": 2e-05, + "loss": 0.03845333, + "step": 20045 + }, + { + "epoch": 40.092, + "grad_norm": 1.0274492502212524, + "learning_rate": 2e-05, + "loss": 0.03665432, + "step": 20046 + }, + { + "epoch": 40.094, + 
"grad_norm": 2.0367865562438965, + "learning_rate": 2e-05, + "loss": 0.04981969, + "step": 20047 + }, + { + "epoch": 40.096, + "grad_norm": 1.2091890573501587, + "learning_rate": 2e-05, + "loss": 0.0455159, + "step": 20048 + }, + { + "epoch": 40.098, + "grad_norm": 1.1141090393066406, + "learning_rate": 2e-05, + "loss": 0.04782473, + "step": 20049 + }, + { + "epoch": 40.1, + "grad_norm": 1.4072805643081665, + "learning_rate": 2e-05, + "loss": 0.05027459, + "step": 20050 + }, + { + "epoch": 40.102, + "grad_norm": 1.0128663778305054, + "learning_rate": 2e-05, + "loss": 0.03786948, + "step": 20051 + }, + { + "epoch": 40.104, + "grad_norm": 1.241088628768921, + "learning_rate": 2e-05, + "loss": 0.04034789, + "step": 20052 + }, + { + "epoch": 40.106, + "grad_norm": 2.3295323848724365, + "learning_rate": 2e-05, + "loss": 0.04255938, + "step": 20053 + }, + { + "epoch": 40.108, + "grad_norm": 1.2355490922927856, + "learning_rate": 2e-05, + "loss": 0.04925885, + "step": 20054 + }, + { + "epoch": 40.11, + "grad_norm": 1.342072606086731, + "learning_rate": 2e-05, + "loss": 0.05003198, + "step": 20055 + }, + { + "epoch": 40.112, + "grad_norm": 1.1754595041275024, + "learning_rate": 2e-05, + "loss": 0.04749743, + "step": 20056 + }, + { + "epoch": 40.114, + "grad_norm": 1.235671043395996, + "learning_rate": 2e-05, + "loss": 0.05393402, + "step": 20057 + }, + { + "epoch": 40.116, + "grad_norm": 1.2485733032226562, + "learning_rate": 2e-05, + "loss": 0.0627479, + "step": 20058 + }, + { + "epoch": 40.118, + "grad_norm": 1.5328861474990845, + "learning_rate": 2e-05, + "loss": 0.05113653, + "step": 20059 + }, + { + "epoch": 40.12, + "grad_norm": 1.499184489250183, + "learning_rate": 2e-05, + "loss": 0.05487531, + "step": 20060 + }, + { + "epoch": 40.122, + "grad_norm": 2.340745449066162, + "learning_rate": 2e-05, + "loss": 0.05198453, + "step": 20061 + }, + { + "epoch": 40.124, + "grad_norm": 1.106819748878479, + "learning_rate": 2e-05, + "loss": 0.0406749, + "step": 20062 + }, + { + "epoch": 40.126, + "grad_norm": 0.9359315037727356, + "learning_rate": 2e-05, + "loss": 0.02896912, + "step": 20063 + }, + { + "epoch": 40.128, + "grad_norm": 1.3963433504104614, + "learning_rate": 2e-05, + "loss": 0.05835143, + "step": 20064 + }, + { + "epoch": 40.13, + "grad_norm": 1.1196558475494385, + "learning_rate": 2e-05, + "loss": 0.04942945, + "step": 20065 + }, + { + "epoch": 40.132, + "grad_norm": 1.1688649654388428, + "learning_rate": 2e-05, + "loss": 0.05483418, + "step": 20066 + }, + { + "epoch": 40.134, + "grad_norm": 1.5023531913757324, + "learning_rate": 2e-05, + "loss": 0.05200912, + "step": 20067 + }, + { + "epoch": 40.136, + "grad_norm": 1.1763683557510376, + "learning_rate": 2e-05, + "loss": 0.04295605, + "step": 20068 + }, + { + "epoch": 40.138, + "grad_norm": 1.2277802228927612, + "learning_rate": 2e-05, + "loss": 0.04733496, + "step": 20069 + }, + { + "epoch": 40.14, + "grad_norm": 1.6241992712020874, + "learning_rate": 2e-05, + "loss": 0.06077222, + "step": 20070 + }, + { + "epoch": 40.142, + "grad_norm": 2.3605220317840576, + "learning_rate": 2e-05, + "loss": 0.05581881, + "step": 20071 + }, + { + "epoch": 40.144, + "grad_norm": 1.1370664834976196, + "learning_rate": 2e-05, + "loss": 0.04241094, + "step": 20072 + }, + { + "epoch": 40.146, + "grad_norm": 1.6637158393859863, + "learning_rate": 2e-05, + "loss": 0.05778241, + "step": 20073 + }, + { + "epoch": 40.148, + "grad_norm": 1.4280550479888916, + "learning_rate": 2e-05, + "loss": 0.05228476, + "step": 20074 + }, + { + "epoch": 40.15, + "grad_norm": 
1.5583044290542603, + "learning_rate": 2e-05, + "loss": 0.05583321, + "step": 20075 + }, + { + "epoch": 40.152, + "grad_norm": 1.119242548942566, + "learning_rate": 2e-05, + "loss": 0.04524466, + "step": 20076 + }, + { + "epoch": 40.154, + "grad_norm": 1.4672080278396606, + "learning_rate": 2e-05, + "loss": 0.06838915, + "step": 20077 + }, + { + "epoch": 40.156, + "grad_norm": 2.8078713417053223, + "learning_rate": 2e-05, + "loss": 0.06085558, + "step": 20078 + }, + { + "epoch": 40.158, + "grad_norm": 7.261538028717041, + "learning_rate": 2e-05, + "loss": 0.04814359, + "step": 20079 + }, + { + "epoch": 40.16, + "grad_norm": 1.254329800605774, + "learning_rate": 2e-05, + "loss": 0.04063053, + "step": 20080 + }, + { + "epoch": 40.162, + "grad_norm": 1.3946176767349243, + "learning_rate": 2e-05, + "loss": 0.04426568, + "step": 20081 + }, + { + "epoch": 40.164, + "grad_norm": 1.089093565940857, + "learning_rate": 2e-05, + "loss": 0.04132479, + "step": 20082 + }, + { + "epoch": 40.166, + "grad_norm": 1.2767221927642822, + "learning_rate": 2e-05, + "loss": 0.05414563, + "step": 20083 + }, + { + "epoch": 40.168, + "grad_norm": 1.287773609161377, + "learning_rate": 2e-05, + "loss": 0.04473702, + "step": 20084 + }, + { + "epoch": 40.17, + "grad_norm": 0.8289897441864014, + "learning_rate": 2e-05, + "loss": 0.03129639, + "step": 20085 + }, + { + "epoch": 40.172, + "grad_norm": 1.047951579093933, + "learning_rate": 2e-05, + "loss": 0.03783755, + "step": 20086 + }, + { + "epoch": 40.174, + "grad_norm": 0.9991368651390076, + "learning_rate": 2e-05, + "loss": 0.03878902, + "step": 20087 + }, + { + "epoch": 40.176, + "grad_norm": 1.249922275543213, + "learning_rate": 2e-05, + "loss": 0.04872551, + "step": 20088 + }, + { + "epoch": 40.178, + "grad_norm": 1.3479992151260376, + "learning_rate": 2e-05, + "loss": 0.07139233, + "step": 20089 + }, + { + "epoch": 40.18, + "grad_norm": 2.5222294330596924, + "learning_rate": 2e-05, + "loss": 0.05283742, + "step": 20090 + }, + { + "epoch": 40.182, + "grad_norm": 1.0991319417953491, + "learning_rate": 2e-05, + "loss": 0.05567234, + "step": 20091 + }, + { + "epoch": 40.184, + "grad_norm": 1.6871635913848877, + "learning_rate": 2e-05, + "loss": 0.08690213, + "step": 20092 + }, + { + "epoch": 40.186, + "grad_norm": 1.2134432792663574, + "learning_rate": 2e-05, + "loss": 0.04679528, + "step": 20093 + }, + { + "epoch": 40.188, + "grad_norm": 1.456554889678955, + "learning_rate": 2e-05, + "loss": 0.05808542, + "step": 20094 + }, + { + "epoch": 40.19, + "grad_norm": 1.1095123291015625, + "learning_rate": 2e-05, + "loss": 0.04110887, + "step": 20095 + }, + { + "epoch": 40.192, + "grad_norm": 0.9948191046714783, + "learning_rate": 2e-05, + "loss": 0.0418423, + "step": 20096 + }, + { + "epoch": 40.194, + "grad_norm": 1.4417386054992676, + "learning_rate": 2e-05, + "loss": 0.06203289, + "step": 20097 + }, + { + "epoch": 40.196, + "grad_norm": 2.3071508407592773, + "learning_rate": 2e-05, + "loss": 0.04821993, + "step": 20098 + }, + { + "epoch": 40.198, + "grad_norm": 1.352673888206482, + "learning_rate": 2e-05, + "loss": 0.03600384, + "step": 20099 + }, + { + "epoch": 40.2, + "grad_norm": 1.105892300605774, + "learning_rate": 2e-05, + "loss": 0.03900053, + "step": 20100 + }, + { + "epoch": 40.202, + "grad_norm": 1.2061972618103027, + "learning_rate": 2e-05, + "loss": 0.05580106, + "step": 20101 + }, + { + "epoch": 40.204, + "grad_norm": 1.2827521562576294, + "learning_rate": 2e-05, + "loss": 0.04791928, + "step": 20102 + }, + { + "epoch": 40.206, + "grad_norm": 
1.2111438512802124, + "learning_rate": 2e-05, + "loss": 0.04924633, + "step": 20103 + }, + { + "epoch": 40.208, + "grad_norm": 1.4191051721572876, + "learning_rate": 2e-05, + "loss": 0.04869965, + "step": 20104 + }, + { + "epoch": 40.21, + "grad_norm": 1.1339068412780762, + "learning_rate": 2e-05, + "loss": 0.05062132, + "step": 20105 + }, + { + "epoch": 40.212, + "grad_norm": 1.2514359951019287, + "learning_rate": 2e-05, + "loss": 0.04554267, + "step": 20106 + }, + { + "epoch": 40.214, + "grad_norm": 1.1411757469177246, + "learning_rate": 2e-05, + "loss": 0.04453161, + "step": 20107 + }, + { + "epoch": 40.216, + "grad_norm": 1.28267240524292, + "learning_rate": 2e-05, + "loss": 0.03762868, + "step": 20108 + }, + { + "epoch": 40.218, + "grad_norm": 1.2932469844818115, + "learning_rate": 2e-05, + "loss": 0.05375424, + "step": 20109 + }, + { + "epoch": 40.22, + "grad_norm": 1.5914359092712402, + "learning_rate": 2e-05, + "loss": 0.05069669, + "step": 20110 + }, + { + "epoch": 40.222, + "grad_norm": 1.1139858961105347, + "learning_rate": 2e-05, + "loss": 0.03766536, + "step": 20111 + }, + { + "epoch": 40.224, + "grad_norm": 1.3693923950195312, + "learning_rate": 2e-05, + "loss": 0.04846753, + "step": 20112 + }, + { + "epoch": 40.226, + "grad_norm": 1.1039000749588013, + "learning_rate": 2e-05, + "loss": 0.035409, + "step": 20113 + }, + { + "epoch": 40.228, + "grad_norm": 1.1846332550048828, + "learning_rate": 2e-05, + "loss": 0.04201246, + "step": 20114 + }, + { + "epoch": 40.23, + "grad_norm": 2.8767950534820557, + "learning_rate": 2e-05, + "loss": 0.06352147, + "step": 20115 + }, + { + "epoch": 40.232, + "grad_norm": 1.286635160446167, + "learning_rate": 2e-05, + "loss": 0.05608144, + "step": 20116 + }, + { + "epoch": 40.234, + "grad_norm": 1.0336881875991821, + "learning_rate": 2e-05, + "loss": 0.03256797, + "step": 20117 + }, + { + "epoch": 40.236, + "grad_norm": 8.691652297973633, + "learning_rate": 2e-05, + "loss": 0.0322794, + "step": 20118 + }, + { + "epoch": 40.238, + "grad_norm": 2.27980375289917, + "learning_rate": 2e-05, + "loss": 0.03699232, + "step": 20119 + }, + { + "epoch": 40.24, + "grad_norm": 1.9396880865097046, + "learning_rate": 2e-05, + "loss": 0.06942445, + "step": 20120 + }, + { + "epoch": 40.242, + "grad_norm": 1.0819038152694702, + "learning_rate": 2e-05, + "loss": 0.04012003, + "step": 20121 + }, + { + "epoch": 40.244, + "grad_norm": 1.3157330751419067, + "learning_rate": 2e-05, + "loss": 0.05315004, + "step": 20122 + }, + { + "epoch": 40.246, + "grad_norm": 1.216844081878662, + "learning_rate": 2e-05, + "loss": 0.05938418, + "step": 20123 + }, + { + "epoch": 40.248, + "grad_norm": 1.3893853425979614, + "learning_rate": 2e-05, + "loss": 0.04685809, + "step": 20124 + }, + { + "epoch": 40.25, + "grad_norm": 1.0636508464813232, + "learning_rate": 2e-05, + "loss": 0.03838153, + "step": 20125 + }, + { + "epoch": 40.252, + "grad_norm": 1.0984792709350586, + "learning_rate": 2e-05, + "loss": 0.05215897, + "step": 20126 + }, + { + "epoch": 40.254, + "grad_norm": 1.0352123975753784, + "learning_rate": 2e-05, + "loss": 0.04288472, + "step": 20127 + }, + { + "epoch": 40.256, + "grad_norm": 1.0150216817855835, + "learning_rate": 2e-05, + "loss": 0.03558073, + "step": 20128 + }, + { + "epoch": 40.258, + "grad_norm": 1.1401340961456299, + "learning_rate": 2e-05, + "loss": 0.03874261, + "step": 20129 + }, + { + "epoch": 40.26, + "grad_norm": 1.2564568519592285, + "learning_rate": 2e-05, + "loss": 0.05399378, + "step": 20130 + }, + { + "epoch": 40.262, + "grad_norm": 
1.2055622339248657, + "learning_rate": 2e-05, + "loss": 0.05442786, + "step": 20131 + }, + { + "epoch": 40.264, + "grad_norm": 1.2611266374588013, + "learning_rate": 2e-05, + "loss": 0.04908681, + "step": 20132 + }, + { + "epoch": 40.266, + "grad_norm": 1.8219330310821533, + "learning_rate": 2e-05, + "loss": 0.05871871, + "step": 20133 + }, + { + "epoch": 40.268, + "grad_norm": 1.118198275566101, + "learning_rate": 2e-05, + "loss": 0.04319108, + "step": 20134 + }, + { + "epoch": 40.27, + "grad_norm": 1.0127724409103394, + "learning_rate": 2e-05, + "loss": 0.03908618, + "step": 20135 + }, + { + "epoch": 40.272, + "grad_norm": 1.1128853559494019, + "learning_rate": 2e-05, + "loss": 0.04767665, + "step": 20136 + }, + { + "epoch": 40.274, + "grad_norm": 1.198813557624817, + "learning_rate": 2e-05, + "loss": 0.04259463, + "step": 20137 + }, + { + "epoch": 40.276, + "grad_norm": 1.0490918159484863, + "learning_rate": 2e-05, + "loss": 0.02958579, + "step": 20138 + }, + { + "epoch": 40.278, + "grad_norm": 1.3297768831253052, + "learning_rate": 2e-05, + "loss": 0.05636571, + "step": 20139 + }, + { + "epoch": 40.28, + "grad_norm": 1.2086269855499268, + "learning_rate": 2e-05, + "loss": 0.04821461, + "step": 20140 + }, + { + "epoch": 40.282, + "grad_norm": 2.00396728515625, + "learning_rate": 2e-05, + "loss": 0.03132957, + "step": 20141 + }, + { + "epoch": 40.284, + "grad_norm": 1.4229713678359985, + "learning_rate": 2e-05, + "loss": 0.04370468, + "step": 20142 + }, + { + "epoch": 40.286, + "grad_norm": 1.6150267124176025, + "learning_rate": 2e-05, + "loss": 0.05704456, + "step": 20143 + }, + { + "epoch": 40.288, + "grad_norm": 0.9842348098754883, + "learning_rate": 2e-05, + "loss": 0.02401066, + "step": 20144 + }, + { + "epoch": 40.29, + "grad_norm": 1.246802806854248, + "learning_rate": 2e-05, + "loss": 0.03580172, + "step": 20145 + }, + { + "epoch": 40.292, + "grad_norm": 1.4223207235336304, + "learning_rate": 2e-05, + "loss": 0.04059513, + "step": 20146 + }, + { + "epoch": 40.294, + "grad_norm": 1.252277135848999, + "learning_rate": 2e-05, + "loss": 0.05555695, + "step": 20147 + }, + { + "epoch": 40.296, + "grad_norm": 1.1984466314315796, + "learning_rate": 2e-05, + "loss": 0.04715933, + "step": 20148 + }, + { + "epoch": 40.298, + "grad_norm": 1.1603848934173584, + "learning_rate": 2e-05, + "loss": 0.04525081, + "step": 20149 + }, + { + "epoch": 40.3, + "grad_norm": 1.5191564559936523, + "learning_rate": 2e-05, + "loss": 0.05356734, + "step": 20150 + }, + { + "epoch": 40.302, + "grad_norm": 2.3878793716430664, + "learning_rate": 2e-05, + "loss": 0.04827441, + "step": 20151 + }, + { + "epoch": 40.304, + "grad_norm": 1.3220070600509644, + "learning_rate": 2e-05, + "loss": 0.05512287, + "step": 20152 + }, + { + "epoch": 40.306, + "grad_norm": 1.7690134048461914, + "learning_rate": 2e-05, + "loss": 0.07210412, + "step": 20153 + }, + { + "epoch": 40.308, + "grad_norm": 3.268732786178589, + "learning_rate": 2e-05, + "loss": 0.05046576, + "step": 20154 + }, + { + "epoch": 40.31, + "grad_norm": 1.3176329135894775, + "learning_rate": 2e-05, + "loss": 0.04881255, + "step": 20155 + }, + { + "epoch": 40.312, + "grad_norm": 1.2548283338546753, + "learning_rate": 2e-05, + "loss": 0.04261008, + "step": 20156 + }, + { + "epoch": 40.314, + "grad_norm": 1.4517444372177124, + "learning_rate": 2e-05, + "loss": 0.04030664, + "step": 20157 + }, + { + "epoch": 40.316, + "grad_norm": 1.297654628753662, + "learning_rate": 2e-05, + "loss": 0.04864812, + "step": 20158 + }, + { + "epoch": 40.318, + "grad_norm": 
1.3605537414550781, + "learning_rate": 2e-05, + "loss": 0.05398957, + "step": 20159 + }, + { + "epoch": 40.32, + "grad_norm": 1.0863500833511353, + "learning_rate": 2e-05, + "loss": 0.04536635, + "step": 20160 + }, + { + "epoch": 40.322, + "grad_norm": 1.3427700996398926, + "learning_rate": 2e-05, + "loss": 0.04969367, + "step": 20161 + }, + { + "epoch": 40.324, + "grad_norm": 1.6353856325149536, + "learning_rate": 2e-05, + "loss": 0.04320132, + "step": 20162 + }, + { + "epoch": 40.326, + "grad_norm": 1.2196747064590454, + "learning_rate": 2e-05, + "loss": 0.03800016, + "step": 20163 + }, + { + "epoch": 40.328, + "grad_norm": 1.7924127578735352, + "learning_rate": 2e-05, + "loss": 0.07237592, + "step": 20164 + }, + { + "epoch": 40.33, + "grad_norm": 1.7914211750030518, + "learning_rate": 2e-05, + "loss": 0.05794446, + "step": 20165 + }, + { + "epoch": 40.332, + "grad_norm": 1.2573678493499756, + "learning_rate": 2e-05, + "loss": 0.04247341, + "step": 20166 + }, + { + "epoch": 40.334, + "grad_norm": 1.1113066673278809, + "learning_rate": 2e-05, + "loss": 0.05276919, + "step": 20167 + }, + { + "epoch": 40.336, + "grad_norm": 1.5416284799575806, + "learning_rate": 2e-05, + "loss": 0.05891626, + "step": 20168 + }, + { + "epoch": 40.338, + "grad_norm": 1.1009063720703125, + "learning_rate": 2e-05, + "loss": 0.0362423, + "step": 20169 + }, + { + "epoch": 40.34, + "grad_norm": 1.2229732275009155, + "learning_rate": 2e-05, + "loss": 0.04161092, + "step": 20170 + }, + { + "epoch": 40.342, + "grad_norm": 1.4811763763427734, + "learning_rate": 2e-05, + "loss": 0.05466798, + "step": 20171 + }, + { + "epoch": 40.344, + "grad_norm": 1.1758745908737183, + "learning_rate": 2e-05, + "loss": 0.04076952, + "step": 20172 + }, + { + "epoch": 40.346, + "grad_norm": 1.35615873336792, + "learning_rate": 2e-05, + "loss": 0.0481509, + "step": 20173 + }, + { + "epoch": 40.348, + "grad_norm": 1.1503114700317383, + "learning_rate": 2e-05, + "loss": 0.03523769, + "step": 20174 + }, + { + "epoch": 40.35, + "grad_norm": 1.1574171781539917, + "learning_rate": 2e-05, + "loss": 0.03359126, + "step": 20175 + }, + { + "epoch": 40.352, + "grad_norm": 1.2408151626586914, + "learning_rate": 2e-05, + "loss": 0.05215861, + "step": 20176 + }, + { + "epoch": 40.354, + "grad_norm": 1.2449312210083008, + "learning_rate": 2e-05, + "loss": 0.05467416, + "step": 20177 + }, + { + "epoch": 40.356, + "grad_norm": 0.9955729246139526, + "learning_rate": 2e-05, + "loss": 0.03369155, + "step": 20178 + }, + { + "epoch": 40.358, + "grad_norm": 1.5078949928283691, + "learning_rate": 2e-05, + "loss": 0.0526335, + "step": 20179 + }, + { + "epoch": 40.36, + "grad_norm": 1.6468613147735596, + "learning_rate": 2e-05, + "loss": 0.0292394, + "step": 20180 + }, + { + "epoch": 40.362, + "grad_norm": 1.2313929796218872, + "learning_rate": 2e-05, + "loss": 0.05515527, + "step": 20181 + }, + { + "epoch": 40.364, + "grad_norm": 1.1853318214416504, + "learning_rate": 2e-05, + "loss": 0.04114341, + "step": 20182 + }, + { + "epoch": 40.366, + "grad_norm": 2.5944883823394775, + "learning_rate": 2e-05, + "loss": 0.04679396, + "step": 20183 + }, + { + "epoch": 40.368, + "grad_norm": 1.1291593313217163, + "learning_rate": 2e-05, + "loss": 0.03800917, + "step": 20184 + }, + { + "epoch": 40.37, + "grad_norm": 1.2536524534225464, + "learning_rate": 2e-05, + "loss": 0.03788491, + "step": 20185 + }, + { + "epoch": 40.372, + "grad_norm": 1.110937237739563, + "learning_rate": 2e-05, + "loss": 0.03799309, + "step": 20186 + }, + { + "epoch": 40.374, + "grad_norm": 
1.246289610862732, + "learning_rate": 2e-05, + "loss": 0.05560606, + "step": 20187 + }, + { + "epoch": 40.376, + "grad_norm": 1.2249597311019897, + "learning_rate": 2e-05, + "loss": 0.05621621, + "step": 20188 + }, + { + "epoch": 40.378, + "grad_norm": 1.4832850694656372, + "learning_rate": 2e-05, + "loss": 0.05166864, + "step": 20189 + }, + { + "epoch": 40.38, + "grad_norm": 1.8920128345489502, + "learning_rate": 2e-05, + "loss": 0.06390877, + "step": 20190 + }, + { + "epoch": 40.382, + "grad_norm": 1.3039014339447021, + "learning_rate": 2e-05, + "loss": 0.04282786, + "step": 20191 + }, + { + "epoch": 40.384, + "grad_norm": 1.2546817064285278, + "learning_rate": 2e-05, + "loss": 0.05530909, + "step": 20192 + }, + { + "epoch": 40.386, + "grad_norm": 1.0137908458709717, + "learning_rate": 2e-05, + "loss": 0.03802532, + "step": 20193 + }, + { + "epoch": 40.388, + "grad_norm": 3.2582902908325195, + "learning_rate": 2e-05, + "loss": 0.04777801, + "step": 20194 + }, + { + "epoch": 40.39, + "grad_norm": 1.6945880651474, + "learning_rate": 2e-05, + "loss": 0.06353603, + "step": 20195 + }, + { + "epoch": 40.392, + "grad_norm": 1.861193299293518, + "learning_rate": 2e-05, + "loss": 0.0449325, + "step": 20196 + }, + { + "epoch": 40.394, + "grad_norm": 1.4629795551300049, + "learning_rate": 2e-05, + "loss": 0.06012824, + "step": 20197 + }, + { + "epoch": 40.396, + "grad_norm": 1.067794919013977, + "learning_rate": 2e-05, + "loss": 0.04181834, + "step": 20198 + }, + { + "epoch": 40.398, + "grad_norm": 1.1766245365142822, + "learning_rate": 2e-05, + "loss": 0.04275657, + "step": 20199 + }, + { + "epoch": 40.4, + "grad_norm": 3.7547171115875244, + "learning_rate": 2e-05, + "loss": 0.05930864, + "step": 20200 + }, + { + "epoch": 40.402, + "grad_norm": 1.0909967422485352, + "learning_rate": 2e-05, + "loss": 0.04474407, + "step": 20201 + }, + { + "epoch": 40.404, + "grad_norm": 1.1844147443771362, + "learning_rate": 2e-05, + "loss": 0.04869113, + "step": 20202 + }, + { + "epoch": 40.406, + "grad_norm": 0.9126821160316467, + "learning_rate": 2e-05, + "loss": 0.03085716, + "step": 20203 + }, + { + "epoch": 40.408, + "grad_norm": 1.0406267642974854, + "learning_rate": 2e-05, + "loss": 0.03293871, + "step": 20204 + }, + { + "epoch": 40.41, + "grad_norm": 0.9543370008468628, + "learning_rate": 2e-05, + "loss": 0.03391955, + "step": 20205 + }, + { + "epoch": 40.412, + "grad_norm": 0.8653976917266846, + "learning_rate": 2e-05, + "loss": 0.02381743, + "step": 20206 + }, + { + "epoch": 40.414, + "grad_norm": 1.078957200050354, + "learning_rate": 2e-05, + "loss": 0.04199693, + "step": 20207 + }, + { + "epoch": 40.416, + "grad_norm": 4.299551963806152, + "learning_rate": 2e-05, + "loss": 0.05353047, + "step": 20208 + }, + { + "epoch": 40.418, + "grad_norm": 1.4246419668197632, + "learning_rate": 2e-05, + "loss": 0.04532846, + "step": 20209 + }, + { + "epoch": 40.42, + "grad_norm": 1.2102235555648804, + "learning_rate": 2e-05, + "loss": 0.05503858, + "step": 20210 + }, + { + "epoch": 40.422, + "grad_norm": 1.1240041255950928, + "learning_rate": 2e-05, + "loss": 0.03953891, + "step": 20211 + }, + { + "epoch": 40.424, + "grad_norm": 1.3952480554580688, + "learning_rate": 2e-05, + "loss": 0.05917449, + "step": 20212 + }, + { + "epoch": 40.426, + "grad_norm": 1.2208077907562256, + "learning_rate": 2e-05, + "loss": 0.03653198, + "step": 20213 + }, + { + "epoch": 40.428, + "grad_norm": 1.1388483047485352, + "learning_rate": 2e-05, + "loss": 0.04844316, + "step": 20214 + }, + { + "epoch": 40.43, + "grad_norm": 
1.294251561164856, + "learning_rate": 2e-05, + "loss": 0.04915586, + "step": 20215 + }, + { + "epoch": 40.432, + "grad_norm": 2.817385196685791, + "learning_rate": 2e-05, + "loss": 0.0448599, + "step": 20216 + }, + { + "epoch": 40.434, + "grad_norm": 1.1939760446548462, + "learning_rate": 2e-05, + "loss": 0.04789099, + "step": 20217 + }, + { + "epoch": 40.436, + "grad_norm": 1.2851756811141968, + "learning_rate": 2e-05, + "loss": 0.05403595, + "step": 20218 + }, + { + "epoch": 40.438, + "grad_norm": 1.0391663312911987, + "learning_rate": 2e-05, + "loss": 0.04843492, + "step": 20219 + }, + { + "epoch": 40.44, + "grad_norm": 1.0590914487838745, + "learning_rate": 2e-05, + "loss": 0.04758727, + "step": 20220 + }, + { + "epoch": 40.442, + "grad_norm": 1.6666922569274902, + "learning_rate": 2e-05, + "loss": 0.05023216, + "step": 20221 + }, + { + "epoch": 40.444, + "grad_norm": 1.1484979391098022, + "learning_rate": 2e-05, + "loss": 0.0386183, + "step": 20222 + }, + { + "epoch": 40.446, + "grad_norm": 2.4382147789001465, + "learning_rate": 2e-05, + "loss": 0.06324873, + "step": 20223 + }, + { + "epoch": 40.448, + "grad_norm": 1.1172566413879395, + "learning_rate": 2e-05, + "loss": 0.04193846, + "step": 20224 + }, + { + "epoch": 40.45, + "grad_norm": 4.376603603363037, + "learning_rate": 2e-05, + "loss": 0.05779196, + "step": 20225 + }, + { + "epoch": 40.452, + "grad_norm": 1.0015493631362915, + "learning_rate": 2e-05, + "loss": 0.02681638, + "step": 20226 + }, + { + "epoch": 40.454, + "grad_norm": 1.31887948513031, + "learning_rate": 2e-05, + "loss": 0.05519604, + "step": 20227 + }, + { + "epoch": 40.456, + "grad_norm": 1.7096487283706665, + "learning_rate": 2e-05, + "loss": 0.07397369, + "step": 20228 + }, + { + "epoch": 40.458, + "grad_norm": 1.3221633434295654, + "learning_rate": 2e-05, + "loss": 0.05340026, + "step": 20229 + }, + { + "epoch": 40.46, + "grad_norm": 1.0366623401641846, + "learning_rate": 2e-05, + "loss": 0.03435313, + "step": 20230 + }, + { + "epoch": 40.462, + "grad_norm": 1.385132908821106, + "learning_rate": 2e-05, + "loss": 0.06896439, + "step": 20231 + }, + { + "epoch": 40.464, + "grad_norm": 2.499610662460327, + "learning_rate": 2e-05, + "loss": 0.06424156, + "step": 20232 + }, + { + "epoch": 40.466, + "grad_norm": 1.1677004098892212, + "learning_rate": 2e-05, + "loss": 0.03992238, + "step": 20233 + }, + { + "epoch": 40.468, + "grad_norm": 3.2381439208984375, + "learning_rate": 2e-05, + "loss": 0.05073539, + "step": 20234 + }, + { + "epoch": 40.47, + "grad_norm": 1.6042966842651367, + "learning_rate": 2e-05, + "loss": 0.03290394, + "step": 20235 + }, + { + "epoch": 40.472, + "grad_norm": 1.2547248601913452, + "learning_rate": 2e-05, + "loss": 0.04911025, + "step": 20236 + }, + { + "epoch": 40.474, + "grad_norm": 1.3358653783798218, + "learning_rate": 2e-05, + "loss": 0.0705348, + "step": 20237 + }, + { + "epoch": 40.476, + "grad_norm": 1.3888798952102661, + "learning_rate": 2e-05, + "loss": 0.04967166, + "step": 20238 + }, + { + "epoch": 40.478, + "grad_norm": 1.2543953657150269, + "learning_rate": 2e-05, + "loss": 0.04666407, + "step": 20239 + }, + { + "epoch": 40.48, + "grad_norm": 1.0593624114990234, + "learning_rate": 2e-05, + "loss": 0.03600551, + "step": 20240 + }, + { + "epoch": 40.482, + "grad_norm": 1.375593900680542, + "learning_rate": 2e-05, + "loss": 0.06600478, + "step": 20241 + }, + { + "epoch": 40.484, + "grad_norm": 1.3863525390625, + "learning_rate": 2e-05, + "loss": 0.03970123, + "step": 20242 + }, + { + "epoch": 40.486, + "grad_norm": 
1.4013750553131104, + "learning_rate": 2e-05, + "loss": 0.05529185, + "step": 20243 + }, + { + "epoch": 40.488, + "grad_norm": 1.3088881969451904, + "learning_rate": 2e-05, + "loss": 0.05219998, + "step": 20244 + }, + { + "epoch": 40.49, + "grad_norm": 1.4817276000976562, + "learning_rate": 2e-05, + "loss": 0.05724048, + "step": 20245 + }, + { + "epoch": 40.492, + "grad_norm": 1.4716103076934814, + "learning_rate": 2e-05, + "loss": 0.05594869, + "step": 20246 + }, + { + "epoch": 40.494, + "grad_norm": 1.3419957160949707, + "learning_rate": 2e-05, + "loss": 0.06096876, + "step": 20247 + }, + { + "epoch": 40.496, + "grad_norm": 0.9824522137641907, + "learning_rate": 2e-05, + "loss": 0.03611157, + "step": 20248 + }, + { + "epoch": 40.498, + "grad_norm": 1.4187341928482056, + "learning_rate": 2e-05, + "loss": 0.05075159, + "step": 20249 + }, + { + "epoch": 40.5, + "grad_norm": 1.4452311992645264, + "learning_rate": 2e-05, + "loss": 0.04892913, + "step": 20250 + }, + { + "epoch": 40.502, + "grad_norm": 1.3794788122177124, + "learning_rate": 2e-05, + "loss": 0.04053912, + "step": 20251 + }, + { + "epoch": 40.504, + "grad_norm": 1.2607942819595337, + "learning_rate": 2e-05, + "loss": 0.04043246, + "step": 20252 + }, + { + "epoch": 40.506, + "grad_norm": 1.2887449264526367, + "learning_rate": 2e-05, + "loss": 0.04719897, + "step": 20253 + }, + { + "epoch": 40.508, + "grad_norm": 1.3014177083969116, + "learning_rate": 2e-05, + "loss": 0.05486921, + "step": 20254 + }, + { + "epoch": 40.51, + "grad_norm": 1.0283253192901611, + "learning_rate": 2e-05, + "loss": 0.03235307, + "step": 20255 + }, + { + "epoch": 40.512, + "grad_norm": 2.2812840938568115, + "learning_rate": 2e-05, + "loss": 0.05784634, + "step": 20256 + }, + { + "epoch": 40.514, + "grad_norm": 1.0626236200332642, + "learning_rate": 2e-05, + "loss": 0.0345856, + "step": 20257 + }, + { + "epoch": 40.516, + "grad_norm": 1.3558917045593262, + "learning_rate": 2e-05, + "loss": 0.05744244, + "step": 20258 + }, + { + "epoch": 40.518, + "grad_norm": 1.1966677904129028, + "learning_rate": 2e-05, + "loss": 0.04859721, + "step": 20259 + }, + { + "epoch": 40.52, + "grad_norm": 2.496509552001953, + "learning_rate": 2e-05, + "loss": 0.05492661, + "step": 20260 + }, + { + "epoch": 40.522, + "grad_norm": 1.3958922624588013, + "learning_rate": 2e-05, + "loss": 0.04244683, + "step": 20261 + }, + { + "epoch": 40.524, + "grad_norm": 1.9222729206085205, + "learning_rate": 2e-05, + "loss": 0.04568821, + "step": 20262 + }, + { + "epoch": 40.526, + "grad_norm": 1.3235441446304321, + "learning_rate": 2e-05, + "loss": 0.04356699, + "step": 20263 + }, + { + "epoch": 40.528, + "grad_norm": 1.4533641338348389, + "learning_rate": 2e-05, + "loss": 0.052623, + "step": 20264 + }, + { + "epoch": 40.53, + "grad_norm": 1.1498498916625977, + "learning_rate": 2e-05, + "loss": 0.04153707, + "step": 20265 + }, + { + "epoch": 40.532, + "grad_norm": 1.6196757555007935, + "learning_rate": 2e-05, + "loss": 0.05171479, + "step": 20266 + }, + { + "epoch": 40.534, + "grad_norm": 1.4504308700561523, + "learning_rate": 2e-05, + "loss": 0.04189808, + "step": 20267 + }, + { + "epoch": 40.536, + "grad_norm": 2.833212375640869, + "learning_rate": 2e-05, + "loss": 0.05008708, + "step": 20268 + }, + { + "epoch": 40.538, + "grad_norm": 1.1308425664901733, + "learning_rate": 2e-05, + "loss": 0.03203578, + "step": 20269 + }, + { + "epoch": 40.54, + "grad_norm": 1.2283337116241455, + "learning_rate": 2e-05, + "loss": 0.05175584, + "step": 20270 + }, + { + "epoch": 40.542, + "grad_norm": 
1.1894733905792236, + "learning_rate": 2e-05, + "loss": 0.04817605, + "step": 20271 + }, + { + "epoch": 40.544, + "grad_norm": 1.2060327529907227, + "learning_rate": 2e-05, + "loss": 0.04768998, + "step": 20272 + }, + { + "epoch": 40.546, + "grad_norm": 1.2168883085250854, + "learning_rate": 2e-05, + "loss": 0.05099332, + "step": 20273 + }, + { + "epoch": 40.548, + "grad_norm": 1.1243617534637451, + "learning_rate": 2e-05, + "loss": 0.03701821, + "step": 20274 + }, + { + "epoch": 40.55, + "grad_norm": 1.3077539205551147, + "learning_rate": 2e-05, + "loss": 0.05925671, + "step": 20275 + }, + { + "epoch": 40.552, + "grad_norm": 1.9214268922805786, + "learning_rate": 2e-05, + "loss": 0.05207685, + "step": 20276 + }, + { + "epoch": 40.554, + "grad_norm": 1.2245525121688843, + "learning_rate": 2e-05, + "loss": 0.03042131, + "step": 20277 + }, + { + "epoch": 40.556, + "grad_norm": 2.7672958374023438, + "learning_rate": 2e-05, + "loss": 0.06857204, + "step": 20278 + }, + { + "epoch": 40.558, + "grad_norm": 1.2869014739990234, + "learning_rate": 2e-05, + "loss": 0.04283034, + "step": 20279 + }, + { + "epoch": 40.56, + "grad_norm": 1.1815762519836426, + "learning_rate": 2e-05, + "loss": 0.04052944, + "step": 20280 + }, + { + "epoch": 40.562, + "grad_norm": 2.087244749069214, + "learning_rate": 2e-05, + "loss": 0.06331733, + "step": 20281 + }, + { + "epoch": 40.564, + "grad_norm": 1.3826656341552734, + "learning_rate": 2e-05, + "loss": 0.04571678, + "step": 20282 + }, + { + "epoch": 40.566, + "grad_norm": 3.096768617630005, + "learning_rate": 2e-05, + "loss": 0.05192983, + "step": 20283 + }, + { + "epoch": 40.568, + "grad_norm": 1.186075210571289, + "learning_rate": 2e-05, + "loss": 0.05161081, + "step": 20284 + }, + { + "epoch": 40.57, + "grad_norm": 1.5007069110870361, + "learning_rate": 2e-05, + "loss": 0.0494061, + "step": 20285 + }, + { + "epoch": 40.572, + "grad_norm": 1.4686963558197021, + "learning_rate": 2e-05, + "loss": 0.04155823, + "step": 20286 + }, + { + "epoch": 40.574, + "grad_norm": 1.156620979309082, + "learning_rate": 2e-05, + "loss": 0.04370104, + "step": 20287 + }, + { + "epoch": 40.576, + "grad_norm": 0.9128440022468567, + "learning_rate": 2e-05, + "loss": 0.03572758, + "step": 20288 + }, + { + "epoch": 40.578, + "grad_norm": 2.242276668548584, + "learning_rate": 2e-05, + "loss": 0.0504857, + "step": 20289 + }, + { + "epoch": 40.58, + "grad_norm": 3.176802396774292, + "learning_rate": 2e-05, + "loss": 0.05922861, + "step": 20290 + }, + { + "epoch": 40.582, + "grad_norm": 1.0698144435882568, + "learning_rate": 2e-05, + "loss": 0.03699166, + "step": 20291 + }, + { + "epoch": 40.584, + "grad_norm": 1.1629245281219482, + "learning_rate": 2e-05, + "loss": 0.04336737, + "step": 20292 + }, + { + "epoch": 40.586, + "grad_norm": 1.35090172290802, + "learning_rate": 2e-05, + "loss": 0.05229773, + "step": 20293 + }, + { + "epoch": 40.588, + "grad_norm": 1.5074291229248047, + "learning_rate": 2e-05, + "loss": 0.03177891, + "step": 20294 + }, + { + "epoch": 40.59, + "grad_norm": 1.5969630479812622, + "learning_rate": 2e-05, + "loss": 0.02993333, + "step": 20295 + }, + { + "epoch": 40.592, + "grad_norm": 1.2182235717773438, + "learning_rate": 2e-05, + "loss": 0.05838227, + "step": 20296 + }, + { + "epoch": 40.594, + "grad_norm": 1.0844266414642334, + "learning_rate": 2e-05, + "loss": 0.04683068, + "step": 20297 + }, + { + "epoch": 40.596, + "grad_norm": 1.113297939300537, + "learning_rate": 2e-05, + "loss": 0.04225264, + "step": 20298 + }, + { + "epoch": 40.598, + "grad_norm": 
1.3273909091949463, + "learning_rate": 2e-05, + "loss": 0.05528468, + "step": 20299 + }, + { + "epoch": 40.6, + "grad_norm": 1.0608805418014526, + "learning_rate": 2e-05, + "loss": 0.04179974, + "step": 20300 + }, + { + "epoch": 40.602, + "grad_norm": 2.0102715492248535, + "learning_rate": 2e-05, + "loss": 0.05286686, + "step": 20301 + }, + { + "epoch": 40.604, + "grad_norm": 1.4773969650268555, + "learning_rate": 2e-05, + "loss": 0.07245906, + "step": 20302 + }, + { + "epoch": 40.606, + "grad_norm": 1.4324716329574585, + "learning_rate": 2e-05, + "loss": 0.0385082, + "step": 20303 + }, + { + "epoch": 40.608, + "grad_norm": 1.1375013589859009, + "learning_rate": 2e-05, + "loss": 0.04455408, + "step": 20304 + }, + { + "epoch": 40.61, + "grad_norm": 1.8112099170684814, + "learning_rate": 2e-05, + "loss": 0.04102049, + "step": 20305 + }, + { + "epoch": 40.612, + "grad_norm": 1.1016380786895752, + "learning_rate": 2e-05, + "loss": 0.04536098, + "step": 20306 + }, + { + "epoch": 40.614, + "grad_norm": 1.2239372730255127, + "learning_rate": 2e-05, + "loss": 0.03934653, + "step": 20307 + }, + { + "epoch": 40.616, + "grad_norm": 1.4652773141860962, + "learning_rate": 2e-05, + "loss": 0.04647703, + "step": 20308 + }, + { + "epoch": 40.618, + "grad_norm": 1.3046026229858398, + "learning_rate": 2e-05, + "loss": 0.04674635, + "step": 20309 + }, + { + "epoch": 40.62, + "grad_norm": 1.244250774383545, + "learning_rate": 2e-05, + "loss": 0.04536857, + "step": 20310 + }, + { + "epoch": 40.622, + "grad_norm": 1.1843619346618652, + "learning_rate": 2e-05, + "loss": 0.05122893, + "step": 20311 + }, + { + "epoch": 40.624, + "grad_norm": 1.3475395441055298, + "learning_rate": 2e-05, + "loss": 0.05014464, + "step": 20312 + }, + { + "epoch": 40.626, + "grad_norm": 3.323194742202759, + "learning_rate": 2e-05, + "loss": 0.05366961, + "step": 20313 + }, + { + "epoch": 40.628, + "grad_norm": 1.2654601335525513, + "learning_rate": 2e-05, + "loss": 0.03832448, + "step": 20314 + }, + { + "epoch": 40.63, + "grad_norm": 1.2079969644546509, + "learning_rate": 2e-05, + "loss": 0.03466445, + "step": 20315 + }, + { + "epoch": 40.632, + "grad_norm": 1.2388030290603638, + "learning_rate": 2e-05, + "loss": 0.05009665, + "step": 20316 + }, + { + "epoch": 40.634, + "grad_norm": 1.3457781076431274, + "learning_rate": 2e-05, + "loss": 0.06065813, + "step": 20317 + }, + { + "epoch": 40.636, + "grad_norm": 1.2236225605010986, + "learning_rate": 2e-05, + "loss": 0.05330027, + "step": 20318 + }, + { + "epoch": 40.638, + "grad_norm": 1.1197084188461304, + "learning_rate": 2e-05, + "loss": 0.04628935, + "step": 20319 + }, + { + "epoch": 40.64, + "grad_norm": 2.9297568798065186, + "learning_rate": 2e-05, + "loss": 0.06352062, + "step": 20320 + }, + { + "epoch": 40.642, + "grad_norm": 1.181127667427063, + "learning_rate": 2e-05, + "loss": 0.04264445, + "step": 20321 + }, + { + "epoch": 40.644, + "grad_norm": 3.967763662338257, + "learning_rate": 2e-05, + "loss": 0.06111223, + "step": 20322 + }, + { + "epoch": 40.646, + "grad_norm": 1.2990795373916626, + "learning_rate": 2e-05, + "loss": 0.03806433, + "step": 20323 + }, + { + "epoch": 40.648, + "grad_norm": 1.437103271484375, + "learning_rate": 2e-05, + "loss": 0.05348209, + "step": 20324 + }, + { + "epoch": 40.65, + "grad_norm": 1.2915222644805908, + "learning_rate": 2e-05, + "loss": 0.04917194, + "step": 20325 + }, + { + "epoch": 40.652, + "grad_norm": 1.3285936117172241, + "learning_rate": 2e-05, + "loss": 0.05632872, + "step": 20326 + }, + { + "epoch": 40.654, + "grad_norm": 
1.1885918378829956, + "learning_rate": 2e-05, + "loss": 0.04639854, + "step": 20327 + }, + { + "epoch": 40.656, + "grad_norm": 1.2425380945205688, + "learning_rate": 2e-05, + "loss": 0.04708571, + "step": 20328 + }, + { + "epoch": 40.658, + "grad_norm": 1.2747244834899902, + "learning_rate": 2e-05, + "loss": 0.05863928, + "step": 20329 + }, + { + "epoch": 40.66, + "grad_norm": 1.2093666791915894, + "learning_rate": 2e-05, + "loss": 0.04224218, + "step": 20330 + }, + { + "epoch": 40.662, + "grad_norm": 1.0434030294418335, + "learning_rate": 2e-05, + "loss": 0.0403709, + "step": 20331 + }, + { + "epoch": 40.664, + "grad_norm": 1.101930856704712, + "learning_rate": 2e-05, + "loss": 0.04556001, + "step": 20332 + }, + { + "epoch": 40.666, + "grad_norm": 1.3830711841583252, + "learning_rate": 2e-05, + "loss": 0.0546868, + "step": 20333 + }, + { + "epoch": 40.668, + "grad_norm": 1.0567309856414795, + "learning_rate": 2e-05, + "loss": 0.04143964, + "step": 20334 + }, + { + "epoch": 40.67, + "grad_norm": 1.3042546510696411, + "learning_rate": 2e-05, + "loss": 0.04565054, + "step": 20335 + }, + { + "epoch": 40.672, + "grad_norm": 1.3436415195465088, + "learning_rate": 2e-05, + "loss": 0.06098578, + "step": 20336 + }, + { + "epoch": 40.674, + "grad_norm": 1.1577244997024536, + "learning_rate": 2e-05, + "loss": 0.04822986, + "step": 20337 + }, + { + "epoch": 40.676, + "grad_norm": 1.5855461359024048, + "learning_rate": 2e-05, + "loss": 0.0444434, + "step": 20338 + }, + { + "epoch": 40.678, + "grad_norm": 1.7393256425857544, + "learning_rate": 2e-05, + "loss": 0.0486975, + "step": 20339 + }, + { + "epoch": 40.68, + "grad_norm": 3.428238868713379, + "learning_rate": 2e-05, + "loss": 0.04472708, + "step": 20340 + }, + { + "epoch": 40.682, + "grad_norm": 1.6236741542816162, + "learning_rate": 2e-05, + "loss": 0.05305525, + "step": 20341 + }, + { + "epoch": 40.684, + "grad_norm": 1.2626410722732544, + "learning_rate": 2e-05, + "loss": 0.05415434, + "step": 20342 + }, + { + "epoch": 40.686, + "grad_norm": 1.0027401447296143, + "learning_rate": 2e-05, + "loss": 0.03447507, + "step": 20343 + }, + { + "epoch": 40.688, + "grad_norm": 1.2402012348175049, + "learning_rate": 2e-05, + "loss": 0.04778074, + "step": 20344 + }, + { + "epoch": 40.69, + "grad_norm": 1.050713300704956, + "learning_rate": 2e-05, + "loss": 0.03177363, + "step": 20345 + }, + { + "epoch": 40.692, + "grad_norm": 1.2489185333251953, + "learning_rate": 2e-05, + "loss": 0.04607525, + "step": 20346 + }, + { + "epoch": 40.694, + "grad_norm": 1.1416654586791992, + "learning_rate": 2e-05, + "loss": 0.04840235, + "step": 20347 + }, + { + "epoch": 40.696, + "grad_norm": 1.1122640371322632, + "learning_rate": 2e-05, + "loss": 0.03137276, + "step": 20348 + }, + { + "epoch": 40.698, + "grad_norm": 1.4408119916915894, + "learning_rate": 2e-05, + "loss": 0.05424093, + "step": 20349 + }, + { + "epoch": 40.7, + "grad_norm": 0.9952658414840698, + "learning_rate": 2e-05, + "loss": 0.02818977, + "step": 20350 + }, + { + "epoch": 40.702, + "grad_norm": 1.1479781866073608, + "learning_rate": 2e-05, + "loss": 0.03366838, + "step": 20351 + }, + { + "epoch": 40.704, + "grad_norm": 1.1322118043899536, + "learning_rate": 2e-05, + "loss": 0.04626714, + "step": 20352 + }, + { + "epoch": 40.706, + "grad_norm": 1.3414576053619385, + "learning_rate": 2e-05, + "loss": 0.04931616, + "step": 20353 + }, + { + "epoch": 40.708, + "grad_norm": 1.433391809463501, + "learning_rate": 2e-05, + "loss": 0.04131094, + "step": 20354 + }, + { + "epoch": 40.71, + "grad_norm": 
1.145910382270813, + "learning_rate": 2e-05, + "loss": 0.04541127, + "step": 20355 + }, + { + "epoch": 40.712, + "grad_norm": 1.17069411277771, + "learning_rate": 2e-05, + "loss": 0.04991463, + "step": 20356 + }, + { + "epoch": 40.714, + "grad_norm": 1.102471113204956, + "learning_rate": 2e-05, + "loss": 0.04218239, + "step": 20357 + }, + { + "epoch": 40.716, + "grad_norm": 1.172473430633545, + "learning_rate": 2e-05, + "loss": 0.0438892, + "step": 20358 + }, + { + "epoch": 40.718, + "grad_norm": 1.8067024946212769, + "learning_rate": 2e-05, + "loss": 0.05643699, + "step": 20359 + }, + { + "epoch": 40.72, + "grad_norm": 1.3518229722976685, + "learning_rate": 2e-05, + "loss": 0.0555013, + "step": 20360 + }, + { + "epoch": 40.722, + "grad_norm": 1.3783165216445923, + "learning_rate": 2e-05, + "loss": 0.05945724, + "step": 20361 + }, + { + "epoch": 40.724, + "grad_norm": 1.1957453489303589, + "learning_rate": 2e-05, + "loss": 0.05680922, + "step": 20362 + }, + { + "epoch": 40.726, + "grad_norm": 1.1252014636993408, + "learning_rate": 2e-05, + "loss": 0.05653615, + "step": 20363 + }, + { + "epoch": 40.728, + "grad_norm": 1.2244309186935425, + "learning_rate": 2e-05, + "loss": 0.04203859, + "step": 20364 + }, + { + "epoch": 40.73, + "grad_norm": 5.841495037078857, + "learning_rate": 2e-05, + "loss": 0.05102485, + "step": 20365 + }, + { + "epoch": 40.732, + "grad_norm": 2.0462138652801514, + "learning_rate": 2e-05, + "loss": 0.05800704, + "step": 20366 + }, + { + "epoch": 40.734, + "grad_norm": 1.3569488525390625, + "learning_rate": 2e-05, + "loss": 0.04887813, + "step": 20367 + }, + { + "epoch": 40.736, + "grad_norm": 0.9353485703468323, + "learning_rate": 2e-05, + "loss": 0.0317823, + "step": 20368 + }, + { + "epoch": 40.738, + "grad_norm": 1.1075727939605713, + "learning_rate": 2e-05, + "loss": 0.04405333, + "step": 20369 + }, + { + "epoch": 40.74, + "grad_norm": 1.120461344718933, + "learning_rate": 2e-05, + "loss": 0.04158332, + "step": 20370 + }, + { + "epoch": 40.742, + "grad_norm": 1.173218846321106, + "learning_rate": 2e-05, + "loss": 0.03538671, + "step": 20371 + }, + { + "epoch": 40.744, + "grad_norm": 1.3119045495986938, + "learning_rate": 2e-05, + "loss": 0.04965501, + "step": 20372 + }, + { + "epoch": 40.746, + "grad_norm": 1.4258747100830078, + "learning_rate": 2e-05, + "loss": 0.04786938, + "step": 20373 + }, + { + "epoch": 40.748, + "grad_norm": 1.2724413871765137, + "learning_rate": 2e-05, + "loss": 0.04649073, + "step": 20374 + }, + { + "epoch": 40.75, + "grad_norm": 1.6921082735061646, + "learning_rate": 2e-05, + "loss": 0.03883822, + "step": 20375 + }, + { + "epoch": 40.752, + "grad_norm": 3.84258770942688, + "learning_rate": 2e-05, + "loss": 0.05532685, + "step": 20376 + }, + { + "epoch": 40.754, + "grad_norm": 1.4394497871398926, + "learning_rate": 2e-05, + "loss": 0.04756043, + "step": 20377 + }, + { + "epoch": 40.756, + "grad_norm": 1.1952511072158813, + "learning_rate": 2e-05, + "loss": 0.05040652, + "step": 20378 + }, + { + "epoch": 40.758, + "grad_norm": 1.139320969581604, + "learning_rate": 2e-05, + "loss": 0.0432628, + "step": 20379 + }, + { + "epoch": 40.76, + "grad_norm": 1.0140758752822876, + "learning_rate": 2e-05, + "loss": 0.0302791, + "step": 20380 + }, + { + "epoch": 40.762, + "grad_norm": 1.783044695854187, + "learning_rate": 2e-05, + "loss": 0.05844506, + "step": 20381 + }, + { + "epoch": 40.764, + "grad_norm": 1.1988928318023682, + "learning_rate": 2e-05, + "loss": 0.03283492, + "step": 20382 + }, + { + "epoch": 40.766, + "grad_norm": 0.9184706807136536, 
+ "learning_rate": 2e-05, + "loss": 0.03624507, + "step": 20383 + }, + { + "epoch": 40.768, + "grad_norm": 1.7441927194595337, + "learning_rate": 2e-05, + "loss": 0.03876618, + "step": 20384 + }, + { + "epoch": 40.77, + "grad_norm": 1.261252522468567, + "learning_rate": 2e-05, + "loss": 0.05795658, + "step": 20385 + }, + { + "epoch": 40.772, + "grad_norm": 1.1532663106918335, + "learning_rate": 2e-05, + "loss": 0.050796, + "step": 20386 + }, + { + "epoch": 40.774, + "grad_norm": 1.3474946022033691, + "learning_rate": 2e-05, + "loss": 0.0455665, + "step": 20387 + }, + { + "epoch": 40.776, + "grad_norm": 1.175278663635254, + "learning_rate": 2e-05, + "loss": 0.03921409, + "step": 20388 + }, + { + "epoch": 40.778, + "grad_norm": 1.2267823219299316, + "learning_rate": 2e-05, + "loss": 0.05682051, + "step": 20389 + }, + { + "epoch": 40.78, + "grad_norm": 3.2952206134796143, + "learning_rate": 2e-05, + "loss": 0.04363419, + "step": 20390 + }, + { + "epoch": 40.782, + "grad_norm": 1.7365856170654297, + "learning_rate": 2e-05, + "loss": 0.05543724, + "step": 20391 + }, + { + "epoch": 40.784, + "grad_norm": 1.2324695587158203, + "learning_rate": 2e-05, + "loss": 0.05429393, + "step": 20392 + }, + { + "epoch": 40.786, + "grad_norm": 1.2314908504486084, + "learning_rate": 2e-05, + "loss": 0.0417459, + "step": 20393 + }, + { + "epoch": 40.788, + "grad_norm": 1.052890658378601, + "learning_rate": 2e-05, + "loss": 0.04568375, + "step": 20394 + }, + { + "epoch": 40.79, + "grad_norm": 1.1527416706085205, + "learning_rate": 2e-05, + "loss": 0.05163721, + "step": 20395 + }, + { + "epoch": 40.792, + "grad_norm": 1.255745768547058, + "learning_rate": 2e-05, + "loss": 0.04441888, + "step": 20396 + }, + { + "epoch": 40.794, + "grad_norm": 2.4157254695892334, + "learning_rate": 2e-05, + "loss": 0.05280969, + "step": 20397 + }, + { + "epoch": 40.796, + "grad_norm": 1.297743797302246, + "learning_rate": 2e-05, + "loss": 0.03922386, + "step": 20398 + }, + { + "epoch": 40.798, + "grad_norm": 1.3191263675689697, + "learning_rate": 2e-05, + "loss": 0.0459812, + "step": 20399 + }, + { + "epoch": 40.8, + "grad_norm": 1.6138203144073486, + "learning_rate": 2e-05, + "loss": 0.04809796, + "step": 20400 + }, + { + "epoch": 40.802, + "grad_norm": 1.0761232376098633, + "learning_rate": 2e-05, + "loss": 0.04102475, + "step": 20401 + }, + { + "epoch": 40.804, + "grad_norm": 1.3820741176605225, + "learning_rate": 2e-05, + "loss": 0.05293287, + "step": 20402 + }, + { + "epoch": 40.806, + "grad_norm": 0.8618505597114563, + "learning_rate": 2e-05, + "loss": 0.03630167, + "step": 20403 + }, + { + "epoch": 40.808, + "grad_norm": 1.2943086624145508, + "learning_rate": 2e-05, + "loss": 0.04525239, + "step": 20404 + }, + { + "epoch": 40.81, + "grad_norm": 1.6318711042404175, + "learning_rate": 2e-05, + "loss": 0.04626258, + "step": 20405 + }, + { + "epoch": 40.812, + "grad_norm": 1.0166434049606323, + "learning_rate": 2e-05, + "loss": 0.03998704, + "step": 20406 + }, + { + "epoch": 40.814, + "grad_norm": 2.893320083618164, + "learning_rate": 2e-05, + "loss": 0.0571047, + "step": 20407 + }, + { + "epoch": 40.816, + "grad_norm": 1.0632132291793823, + "learning_rate": 2e-05, + "loss": 0.03629584, + "step": 20408 + }, + { + "epoch": 40.818, + "grad_norm": 1.168479084968567, + "learning_rate": 2e-05, + "loss": 0.03368823, + "step": 20409 + }, + { + "epoch": 40.82, + "grad_norm": 1.0956116914749146, + "learning_rate": 2e-05, + "loss": 0.03800235, + "step": 20410 + }, + { + "epoch": 40.822, + "grad_norm": 1.0127363204956055, + "learning_rate": 
2e-05, + "loss": 0.03506695, + "step": 20411 + }, + { + "epoch": 40.824, + "grad_norm": 1.6961843967437744, + "learning_rate": 2e-05, + "loss": 0.04757579, + "step": 20412 + }, + { + "epoch": 40.826, + "grad_norm": 1.0485764741897583, + "learning_rate": 2e-05, + "loss": 0.03640955, + "step": 20413 + }, + { + "epoch": 40.828, + "grad_norm": 1.4049381017684937, + "learning_rate": 2e-05, + "loss": 0.05531392, + "step": 20414 + }, + { + "epoch": 40.83, + "grad_norm": 1.3320997953414917, + "learning_rate": 2e-05, + "loss": 0.04148456, + "step": 20415 + }, + { + "epoch": 40.832, + "grad_norm": 1.2389609813690186, + "learning_rate": 2e-05, + "loss": 0.04860774, + "step": 20416 + }, + { + "epoch": 40.834, + "grad_norm": 0.9642683863639832, + "learning_rate": 2e-05, + "loss": 0.04272363, + "step": 20417 + }, + { + "epoch": 40.836, + "grad_norm": 1.2460993528366089, + "learning_rate": 2e-05, + "loss": 0.05987071, + "step": 20418 + }, + { + "epoch": 40.838, + "grad_norm": 1.2246595621109009, + "learning_rate": 2e-05, + "loss": 0.05187958, + "step": 20419 + }, + { + "epoch": 40.84, + "grad_norm": 1.6453964710235596, + "learning_rate": 2e-05, + "loss": 0.04696222, + "step": 20420 + }, + { + "epoch": 40.842, + "grad_norm": 1.2105334997177124, + "learning_rate": 2e-05, + "loss": 0.04552323, + "step": 20421 + }, + { + "epoch": 40.844, + "grad_norm": 1.1809000968933105, + "learning_rate": 2e-05, + "loss": 0.04364845, + "step": 20422 + }, + { + "epoch": 40.846, + "grad_norm": 2.2245469093322754, + "learning_rate": 2e-05, + "loss": 0.04627731, + "step": 20423 + }, + { + "epoch": 40.848, + "grad_norm": 1.180808186531067, + "learning_rate": 2e-05, + "loss": 0.04993726, + "step": 20424 + }, + { + "epoch": 40.85, + "grad_norm": 1.7033965587615967, + "learning_rate": 2e-05, + "loss": 0.06255874, + "step": 20425 + }, + { + "epoch": 40.852, + "grad_norm": 1.2189267873764038, + "learning_rate": 2e-05, + "loss": 0.04405816, + "step": 20426 + }, + { + "epoch": 40.854, + "grad_norm": 3.7718513011932373, + "learning_rate": 2e-05, + "loss": 0.05504301, + "step": 20427 + }, + { + "epoch": 40.856, + "grad_norm": 1.1234560012817383, + "learning_rate": 2e-05, + "loss": 0.04025323, + "step": 20428 + }, + { + "epoch": 40.858, + "grad_norm": 1.3701295852661133, + "learning_rate": 2e-05, + "loss": 0.04871433, + "step": 20429 + }, + { + "epoch": 40.86, + "grad_norm": 1.6501892805099487, + "learning_rate": 2e-05, + "loss": 0.06243546, + "step": 20430 + }, + { + "epoch": 40.862, + "grad_norm": 1.19022536277771, + "learning_rate": 2e-05, + "loss": 0.04894875, + "step": 20431 + }, + { + "epoch": 40.864, + "grad_norm": 1.0078957080841064, + "learning_rate": 2e-05, + "loss": 0.03035384, + "step": 20432 + }, + { + "epoch": 40.866, + "grad_norm": 1.052426815032959, + "learning_rate": 2e-05, + "loss": 0.0399336, + "step": 20433 + }, + { + "epoch": 40.868, + "grad_norm": 1.7193706035614014, + "learning_rate": 2e-05, + "loss": 0.05601721, + "step": 20434 + }, + { + "epoch": 40.87, + "grad_norm": 1.1827843189239502, + "learning_rate": 2e-05, + "loss": 0.0472433, + "step": 20435 + }, + { + "epoch": 40.872, + "grad_norm": 2.2266764640808105, + "learning_rate": 2e-05, + "loss": 0.05473574, + "step": 20436 + }, + { + "epoch": 40.874, + "grad_norm": 1.7117822170257568, + "learning_rate": 2e-05, + "loss": 0.0380583, + "step": 20437 + }, + { + "epoch": 40.876, + "grad_norm": 1.234930396080017, + "learning_rate": 2e-05, + "loss": 0.04837093, + "step": 20438 + }, + { + "epoch": 40.878, + "grad_norm": 1.377922773361206, + "learning_rate": 2e-05, + 
"loss": 0.04900629, + "step": 20439 + }, + { + "epoch": 40.88, + "grad_norm": 1.206617832183838, + "learning_rate": 2e-05, + "loss": 0.05211646, + "step": 20440 + }, + { + "epoch": 40.882, + "grad_norm": 1.1824190616607666, + "learning_rate": 2e-05, + "loss": 0.03713381, + "step": 20441 + }, + { + "epoch": 40.884, + "grad_norm": 0.9823435544967651, + "learning_rate": 2e-05, + "loss": 0.03660665, + "step": 20442 + }, + { + "epoch": 40.886, + "grad_norm": 1.2949517965316772, + "learning_rate": 2e-05, + "loss": 0.05451689, + "step": 20443 + }, + { + "epoch": 40.888, + "grad_norm": 1.2883609533309937, + "learning_rate": 2e-05, + "loss": 0.05717325, + "step": 20444 + }, + { + "epoch": 40.89, + "grad_norm": 0.9961985945701599, + "learning_rate": 2e-05, + "loss": 0.03337915, + "step": 20445 + }, + { + "epoch": 40.892, + "grad_norm": 1.195377230644226, + "learning_rate": 2e-05, + "loss": 0.0425334, + "step": 20446 + }, + { + "epoch": 40.894, + "grad_norm": 1.027193546295166, + "learning_rate": 2e-05, + "loss": 0.02752158, + "step": 20447 + }, + { + "epoch": 40.896, + "grad_norm": 1.1972559690475464, + "learning_rate": 2e-05, + "loss": 0.04988524, + "step": 20448 + }, + { + "epoch": 40.898, + "grad_norm": 1.175160527229309, + "learning_rate": 2e-05, + "loss": 0.04154052, + "step": 20449 + }, + { + "epoch": 40.9, + "grad_norm": 1.1122380495071411, + "learning_rate": 2e-05, + "loss": 0.03493803, + "step": 20450 + }, + { + "epoch": 40.902, + "grad_norm": 1.1690434217453003, + "learning_rate": 2e-05, + "loss": 0.04058939, + "step": 20451 + }, + { + "epoch": 40.904, + "grad_norm": 2.595667839050293, + "learning_rate": 2e-05, + "loss": 0.08465319, + "step": 20452 + }, + { + "epoch": 40.906, + "grad_norm": 1.2236500978469849, + "learning_rate": 2e-05, + "loss": 0.04924143, + "step": 20453 + }, + { + "epoch": 40.908, + "grad_norm": 2.681835174560547, + "learning_rate": 2e-05, + "loss": 0.06435752, + "step": 20454 + }, + { + "epoch": 40.91, + "grad_norm": 1.396338701248169, + "learning_rate": 2e-05, + "loss": 0.04998089, + "step": 20455 + }, + { + "epoch": 40.912, + "grad_norm": 1.3858076333999634, + "learning_rate": 2e-05, + "loss": 0.05152861, + "step": 20456 + }, + { + "epoch": 40.914, + "grad_norm": 1.376036286354065, + "learning_rate": 2e-05, + "loss": 0.04543985, + "step": 20457 + }, + { + "epoch": 40.916, + "grad_norm": 1.57783842086792, + "learning_rate": 2e-05, + "loss": 0.05842921, + "step": 20458 + }, + { + "epoch": 40.918, + "grad_norm": 1.5693801641464233, + "learning_rate": 2e-05, + "loss": 0.0650731, + "step": 20459 + }, + { + "epoch": 40.92, + "grad_norm": 1.4704762697219849, + "learning_rate": 2e-05, + "loss": 0.05608998, + "step": 20460 + }, + { + "epoch": 40.922, + "grad_norm": 1.1098164319992065, + "learning_rate": 2e-05, + "loss": 0.04608128, + "step": 20461 + }, + { + "epoch": 40.924, + "grad_norm": 1.1128933429718018, + "learning_rate": 2e-05, + "loss": 0.03824999, + "step": 20462 + }, + { + "epoch": 40.926, + "grad_norm": 1.244307518005371, + "learning_rate": 2e-05, + "loss": 0.06200198, + "step": 20463 + }, + { + "epoch": 40.928, + "grad_norm": 1.2337552309036255, + "learning_rate": 2e-05, + "loss": 0.03821502, + "step": 20464 + }, + { + "epoch": 40.93, + "grad_norm": 1.2475448846817017, + "learning_rate": 2e-05, + "loss": 0.05957456, + "step": 20465 + }, + { + "epoch": 40.932, + "grad_norm": 2.0048727989196777, + "learning_rate": 2e-05, + "loss": 0.0534838, + "step": 20466 + }, + { + "epoch": 40.934, + "grad_norm": 1.6427809000015259, + "learning_rate": 2e-05, + "loss": 0.03990686, 
+ "step": 20467 + }, + { + "epoch": 40.936, + "grad_norm": 1.199326753616333, + "learning_rate": 2e-05, + "loss": 0.04839782, + "step": 20468 + }, + { + "epoch": 40.938, + "grad_norm": 1.3400622606277466, + "learning_rate": 2e-05, + "loss": 0.05205151, + "step": 20469 + }, + { + "epoch": 40.94, + "grad_norm": 1.2996052503585815, + "learning_rate": 2e-05, + "loss": 0.06201072, + "step": 20470 + }, + { + "epoch": 40.942, + "grad_norm": 1.287192463874817, + "learning_rate": 2e-05, + "loss": 0.05062536, + "step": 20471 + }, + { + "epoch": 40.944, + "grad_norm": 3.311911106109619, + "learning_rate": 2e-05, + "loss": 0.05273374, + "step": 20472 + }, + { + "epoch": 40.946, + "grad_norm": 1.086965799331665, + "learning_rate": 2e-05, + "loss": 0.02739996, + "step": 20473 + }, + { + "epoch": 40.948, + "grad_norm": 1.349342942237854, + "learning_rate": 2e-05, + "loss": 0.05531573, + "step": 20474 + }, + { + "epoch": 40.95, + "grad_norm": 1.4824485778808594, + "learning_rate": 2e-05, + "loss": 0.05945705, + "step": 20475 + }, + { + "epoch": 40.952, + "grad_norm": 1.3184701204299927, + "learning_rate": 2e-05, + "loss": 0.04314397, + "step": 20476 + }, + { + "epoch": 40.954, + "grad_norm": 1.385398030281067, + "learning_rate": 2e-05, + "loss": 0.06099976, + "step": 20477 + }, + { + "epoch": 40.956, + "grad_norm": 1.1655802726745605, + "learning_rate": 2e-05, + "loss": 0.04490764, + "step": 20478 + }, + { + "epoch": 40.958, + "grad_norm": 4.062130928039551, + "learning_rate": 2e-05, + "loss": 0.05665839, + "step": 20479 + }, + { + "epoch": 40.96, + "grad_norm": 1.4232219457626343, + "learning_rate": 2e-05, + "loss": 0.0396259, + "step": 20480 + }, + { + "epoch": 40.962, + "grad_norm": 1.2747482061386108, + "learning_rate": 2e-05, + "loss": 0.05678995, + "step": 20481 + }, + { + "epoch": 40.964, + "grad_norm": 1.5956790447235107, + "learning_rate": 2e-05, + "loss": 0.04028114, + "step": 20482 + }, + { + "epoch": 40.966, + "grad_norm": 1.2978450059890747, + "learning_rate": 2e-05, + "loss": 0.04526225, + "step": 20483 + }, + { + "epoch": 40.968, + "grad_norm": 1.3045350313186646, + "learning_rate": 2e-05, + "loss": 0.05053577, + "step": 20484 + }, + { + "epoch": 40.97, + "grad_norm": 1.1985734701156616, + "learning_rate": 2e-05, + "loss": 0.04328226, + "step": 20485 + }, + { + "epoch": 40.972, + "grad_norm": 1.285112977027893, + "learning_rate": 2e-05, + "loss": 0.04697558, + "step": 20486 + }, + { + "epoch": 40.974, + "grad_norm": 1.4183571338653564, + "learning_rate": 2e-05, + "loss": 0.05476032, + "step": 20487 + }, + { + "epoch": 40.976, + "grad_norm": 1.4861711263656616, + "learning_rate": 2e-05, + "loss": 0.05896805, + "step": 20488 + }, + { + "epoch": 40.978, + "grad_norm": 1.6024667024612427, + "learning_rate": 2e-05, + "loss": 0.04853802, + "step": 20489 + }, + { + "epoch": 40.98, + "grad_norm": 1.2148326635360718, + "learning_rate": 2e-05, + "loss": 0.05488897, + "step": 20490 + }, + { + "epoch": 40.982, + "grad_norm": 1.6598411798477173, + "learning_rate": 2e-05, + "loss": 0.06064874, + "step": 20491 + }, + { + "epoch": 40.984, + "grad_norm": 1.2032941579818726, + "learning_rate": 2e-05, + "loss": 0.04319712, + "step": 20492 + }, + { + "epoch": 40.986, + "grad_norm": 2.1562469005584717, + "learning_rate": 2e-05, + "loss": 0.04315053, + "step": 20493 + }, + { + "epoch": 40.988, + "grad_norm": 1.3768160343170166, + "learning_rate": 2e-05, + "loss": 0.0549387, + "step": 20494 + }, + { + "epoch": 40.99, + "grad_norm": 1.250068187713623, + "learning_rate": 2e-05, + "loss": 0.04305088, + "step": 20495 
+ }, + { + "epoch": 40.992, + "grad_norm": 1.2710068225860596, + "learning_rate": 2e-05, + "loss": 0.04697052, + "step": 20496 + }, + { + "epoch": 40.994, + "grad_norm": 1.4347896575927734, + "learning_rate": 2e-05, + "loss": 0.050442, + "step": 20497 + }, + { + "epoch": 40.996, + "grad_norm": 0.8269062638282776, + "learning_rate": 2e-05, + "loss": 0.02071571, + "step": 20498 + }, + { + "epoch": 40.998, + "grad_norm": 4.110866069793701, + "learning_rate": 2e-05, + "loss": 0.03941682, + "step": 20499 + }, + { + "epoch": 41.0, + "grad_norm": 1.8041988611221313, + "learning_rate": 2e-05, + "loss": 0.04850418, + "step": 20500 + }, + { + "epoch": 41.0, + "eval_performance": { + "AngleClassification_1": 0.998, + "AngleClassification_2": 1.0, + "AngleClassification_3": 0.9880239520958084, + "Equal_1": 0.998, + "Equal_2": 0.9880239520958084, + "Equal_3": 0.9900199600798403, + "LineComparison_1": 1.0, + "LineComparison_2": 1.0, + "LineComparison_3": 0.9940119760479041, + "Parallel_1": 0.9839679358717435, + "Parallel_2": 0.9959919839679359, + "Parallel_3": 0.986, + "Perpendicular_1": 0.998, + "Perpendicular_2": 0.996, + "Perpendicular_3": 0.8977955911823647, + "PointLiesOnCircle_1": 0.9979959919839679, + "PointLiesOnCircle_2": 0.994, + "PointLiesOnCircle_3": 0.998, + "PointLiesOnLine_1": 0.9979959919839679, + "PointLiesOnLine_2": 0.9959919839679359, + "PointLiesOnLine_3": 0.9820359281437125 + }, + "eval_runtime": 321.056, + "eval_samples_per_second": 32.705, + "eval_steps_per_second": 0.654, + "step": 20500 + }, + { + "epoch": 41.002, + "grad_norm": 1.1957684755325317, + "learning_rate": 2e-05, + "loss": 0.03748061, + "step": 20501 + }, + { + "epoch": 41.004, + "grad_norm": 1.1992721557617188, + "learning_rate": 2e-05, + "loss": 0.05174942, + "step": 20502 + }, + { + "epoch": 41.006, + "grad_norm": 1.3277416229248047, + "learning_rate": 2e-05, + "loss": 0.04400465, + "step": 20503 + }, + { + "epoch": 41.008, + "grad_norm": 1.6822006702423096, + "learning_rate": 2e-05, + "loss": 0.05937324, + "step": 20504 + }, + { + "epoch": 41.01, + "grad_norm": 2.2135109901428223, + "learning_rate": 2e-05, + "loss": 0.06282458, + "step": 20505 + }, + { + "epoch": 41.012, + "grad_norm": 1.2385971546173096, + "learning_rate": 2e-05, + "loss": 0.05500904, + "step": 20506 + }, + { + "epoch": 41.014, + "grad_norm": 1.4644334316253662, + "learning_rate": 2e-05, + "loss": 0.03757091, + "step": 20507 + }, + { + "epoch": 41.016, + "grad_norm": 1.2504773139953613, + "learning_rate": 2e-05, + "loss": 0.04865737, + "step": 20508 + }, + { + "epoch": 41.018, + "grad_norm": 1.0606939792633057, + "learning_rate": 2e-05, + "loss": 0.046809, + "step": 20509 + }, + { + "epoch": 41.02, + "grad_norm": 0.9825658798217773, + "learning_rate": 2e-05, + "loss": 0.03104939, + "step": 20510 + }, + { + "epoch": 41.022, + "grad_norm": 0.9851643443107605, + "learning_rate": 2e-05, + "loss": 0.03083806, + "step": 20511 + }, + { + "epoch": 41.024, + "grad_norm": 1.2548247575759888, + "learning_rate": 2e-05, + "loss": 0.05934072, + "step": 20512 + }, + { + "epoch": 41.026, + "grad_norm": 2.8248443603515625, + "learning_rate": 2e-05, + "loss": 0.05007793, + "step": 20513 + }, + { + "epoch": 41.028, + "grad_norm": 1.832990288734436, + "learning_rate": 2e-05, + "loss": 0.0635509, + "step": 20514 + }, + { + "epoch": 41.03, + "grad_norm": 1.1892746686935425, + "learning_rate": 2e-05, + "loss": 0.04696298, + "step": 20515 + }, + { + "epoch": 41.032, + "grad_norm": 1.8308331966400146, + "learning_rate": 2e-05, + "loss": 0.04524075, + "step": 20516 + }, + 
{ + "epoch": 41.034, + "grad_norm": 1.3011291027069092, + "learning_rate": 2e-05, + "loss": 0.04992253, + "step": 20517 + }, + { + "epoch": 41.036, + "grad_norm": 1.1970986127853394, + "learning_rate": 2e-05, + "loss": 0.05594495, + "step": 20518 + }, + { + "epoch": 41.038, + "grad_norm": 1.1719766855239868, + "learning_rate": 2e-05, + "loss": 0.04220466, + "step": 20519 + }, + { + "epoch": 41.04, + "grad_norm": 1.5280590057373047, + "learning_rate": 2e-05, + "loss": 0.05499358, + "step": 20520 + }, + { + "epoch": 41.042, + "grad_norm": 1.2882308959960938, + "learning_rate": 2e-05, + "loss": 0.04562774, + "step": 20521 + }, + { + "epoch": 41.044, + "grad_norm": 2.0022976398468018, + "learning_rate": 2e-05, + "loss": 0.04248636, + "step": 20522 + }, + { + "epoch": 41.046, + "grad_norm": 1.5285600423812866, + "learning_rate": 2e-05, + "loss": 0.06069652, + "step": 20523 + }, + { + "epoch": 41.048, + "grad_norm": 1.181755781173706, + "learning_rate": 2e-05, + "loss": 0.04825043, + "step": 20524 + }, + { + "epoch": 41.05, + "grad_norm": 1.338629126548767, + "learning_rate": 2e-05, + "loss": 0.06323239, + "step": 20525 + }, + { + "epoch": 41.052, + "grad_norm": 1.1997042894363403, + "learning_rate": 2e-05, + "loss": 0.05054116, + "step": 20526 + }, + { + "epoch": 41.054, + "grad_norm": 1.3715033531188965, + "learning_rate": 2e-05, + "loss": 0.04951176, + "step": 20527 + }, + { + "epoch": 41.056, + "grad_norm": 1.3828208446502686, + "learning_rate": 2e-05, + "loss": 0.04772475, + "step": 20528 + }, + { + "epoch": 41.058, + "grad_norm": 1.0656706094741821, + "learning_rate": 2e-05, + "loss": 0.04354571, + "step": 20529 + }, + { + "epoch": 41.06, + "grad_norm": 0.9382243156433105, + "learning_rate": 2e-05, + "loss": 0.03308392, + "step": 20530 + }, + { + "epoch": 41.062, + "grad_norm": 1.2257845401763916, + "learning_rate": 2e-05, + "loss": 0.05354673, + "step": 20531 + }, + { + "epoch": 41.064, + "grad_norm": 1.0932214260101318, + "learning_rate": 2e-05, + "loss": 0.05004593, + "step": 20532 + }, + { + "epoch": 41.066, + "grad_norm": 1.3086014986038208, + "learning_rate": 2e-05, + "loss": 0.05718559, + "step": 20533 + }, + { + "epoch": 41.068, + "grad_norm": 1.2380306720733643, + "learning_rate": 2e-05, + "loss": 0.05476848, + "step": 20534 + }, + { + "epoch": 41.07, + "grad_norm": 1.06734037399292, + "learning_rate": 2e-05, + "loss": 0.03656657, + "step": 20535 + }, + { + "epoch": 41.072, + "grad_norm": 1.3546667098999023, + "learning_rate": 2e-05, + "loss": 0.05762174, + "step": 20536 + }, + { + "epoch": 41.074, + "grad_norm": 1.0288851261138916, + "learning_rate": 2e-05, + "loss": 0.0424634, + "step": 20537 + }, + { + "epoch": 41.076, + "grad_norm": 1.131334900856018, + "learning_rate": 2e-05, + "loss": 0.05600655, + "step": 20538 + }, + { + "epoch": 41.078, + "grad_norm": 1.6864545345306396, + "learning_rate": 2e-05, + "loss": 0.04475303, + "step": 20539 + }, + { + "epoch": 41.08, + "grad_norm": 1.9584969282150269, + "learning_rate": 2e-05, + "loss": 0.04874922, + "step": 20540 + }, + { + "epoch": 41.082, + "grad_norm": 1.2000778913497925, + "learning_rate": 2e-05, + "loss": 0.04340697, + "step": 20541 + }, + { + "epoch": 41.084, + "grad_norm": 1.0453670024871826, + "learning_rate": 2e-05, + "loss": 0.05378638, + "step": 20542 + }, + { + "epoch": 41.086, + "grad_norm": 1.0404855012893677, + "learning_rate": 2e-05, + "loss": 0.0372684, + "step": 20543 + }, + { + "epoch": 41.088, + "grad_norm": 1.7498598098754883, + "learning_rate": 2e-05, + "loss": 0.04698805, + "step": 20544 + }, + { + 
"epoch": 41.09, + "grad_norm": 1.0921574831008911, + "learning_rate": 2e-05, + "loss": 0.04534142, + "step": 20545 + }, + { + "epoch": 41.092, + "grad_norm": 1.1065009832382202, + "learning_rate": 2e-05, + "loss": 0.05014817, + "step": 20546 + }, + { + "epoch": 41.094, + "grad_norm": 1.1455692052841187, + "learning_rate": 2e-05, + "loss": 0.06051344, + "step": 20547 + }, + { + "epoch": 41.096, + "grad_norm": 1.4879392385482788, + "learning_rate": 2e-05, + "loss": 0.04928049, + "step": 20548 + }, + { + "epoch": 41.098, + "grad_norm": 1.4718266725540161, + "learning_rate": 2e-05, + "loss": 0.05406002, + "step": 20549 + }, + { + "epoch": 41.1, + "grad_norm": 1.2394520044326782, + "learning_rate": 2e-05, + "loss": 0.04753721, + "step": 20550 + }, + { + "epoch": 41.102, + "grad_norm": 1.0697596073150635, + "learning_rate": 2e-05, + "loss": 0.03903735, + "step": 20551 + }, + { + "epoch": 41.104, + "grad_norm": 1.3443745374679565, + "learning_rate": 2e-05, + "loss": 0.0416825, + "step": 20552 + }, + { + "epoch": 41.106, + "grad_norm": 1.0805517435073853, + "learning_rate": 2e-05, + "loss": 0.02990261, + "step": 20553 + }, + { + "epoch": 41.108, + "grad_norm": 1.2133495807647705, + "learning_rate": 2e-05, + "loss": 0.04597092, + "step": 20554 + }, + { + "epoch": 41.11, + "grad_norm": 2.3126041889190674, + "learning_rate": 2e-05, + "loss": 0.04624292, + "step": 20555 + }, + { + "epoch": 41.112, + "grad_norm": 1.1802953481674194, + "learning_rate": 2e-05, + "loss": 0.05155454, + "step": 20556 + }, + { + "epoch": 41.114, + "grad_norm": 1.4480260610580444, + "learning_rate": 2e-05, + "loss": 0.0588602, + "step": 20557 + }, + { + "epoch": 41.116, + "grad_norm": 1.3759326934814453, + "learning_rate": 2e-05, + "loss": 0.04992274, + "step": 20558 + }, + { + "epoch": 41.118, + "grad_norm": 1.1570967435836792, + "learning_rate": 2e-05, + "loss": 0.04519217, + "step": 20559 + }, + { + "epoch": 41.12, + "grad_norm": 1.2483819723129272, + "learning_rate": 2e-05, + "loss": 0.04176389, + "step": 20560 + }, + { + "epoch": 41.122, + "grad_norm": 1.4002913236618042, + "learning_rate": 2e-05, + "loss": 0.04440395, + "step": 20561 + }, + { + "epoch": 41.124, + "grad_norm": 1.4284930229187012, + "learning_rate": 2e-05, + "loss": 0.05776929, + "step": 20562 + }, + { + "epoch": 41.126, + "grad_norm": 1.3323142528533936, + "learning_rate": 2e-05, + "loss": 0.0383847, + "step": 20563 + }, + { + "epoch": 41.128, + "grad_norm": 1.1827408075332642, + "learning_rate": 2e-05, + "loss": 0.0447496, + "step": 20564 + }, + { + "epoch": 41.13, + "grad_norm": 1.2362679243087769, + "learning_rate": 2e-05, + "loss": 0.04746515, + "step": 20565 + }, + { + "epoch": 41.132, + "grad_norm": 1.9688647985458374, + "learning_rate": 2e-05, + "loss": 0.04489559, + "step": 20566 + }, + { + "epoch": 41.134, + "grad_norm": 1.304279088973999, + "learning_rate": 2e-05, + "loss": 0.04824959, + "step": 20567 + }, + { + "epoch": 41.136, + "grad_norm": 1.7030314207077026, + "learning_rate": 2e-05, + "loss": 0.04330122, + "step": 20568 + }, + { + "epoch": 41.138, + "grad_norm": 1.1935453414916992, + "learning_rate": 2e-05, + "loss": 0.05471293, + "step": 20569 + }, + { + "epoch": 41.14, + "grad_norm": 1.0641679763793945, + "learning_rate": 2e-05, + "loss": 0.04133367, + "step": 20570 + }, + { + "epoch": 41.142, + "grad_norm": 0.9642373323440552, + "learning_rate": 2e-05, + "loss": 0.03811979, + "step": 20571 + }, + { + "epoch": 41.144, + "grad_norm": 1.7914299964904785, + "learning_rate": 2e-05, + "loss": 0.04343174, + "step": 20572 + }, + { + "epoch": 
41.146, + "grad_norm": 1.4745582342147827, + "learning_rate": 2e-05, + "loss": 0.03861541, + "step": 20573 + }, + { + "epoch": 41.148, + "grad_norm": 0.9670395255088806, + "learning_rate": 2e-05, + "loss": 0.03732292, + "step": 20574 + }, + { + "epoch": 41.15, + "grad_norm": 1.2237446308135986, + "learning_rate": 2e-05, + "loss": 0.04279939, + "step": 20575 + }, + { + "epoch": 41.152, + "grad_norm": 1.173431634902954, + "learning_rate": 2e-05, + "loss": 0.05236944, + "step": 20576 + }, + { + "epoch": 41.154, + "grad_norm": 2.5610194206237793, + "learning_rate": 2e-05, + "loss": 0.05082991, + "step": 20577 + }, + { + "epoch": 41.156, + "grad_norm": 1.1480480432510376, + "learning_rate": 2e-05, + "loss": 0.03853033, + "step": 20578 + }, + { + "epoch": 41.158, + "grad_norm": 1.0263923406600952, + "learning_rate": 2e-05, + "loss": 0.0398251, + "step": 20579 + }, + { + "epoch": 41.16, + "grad_norm": 1.241070032119751, + "learning_rate": 2e-05, + "loss": 0.04581333, + "step": 20580 + }, + { + "epoch": 41.162, + "grad_norm": 0.9623987674713135, + "learning_rate": 2e-05, + "loss": 0.02442548, + "step": 20581 + }, + { + "epoch": 41.164, + "grad_norm": 1.2360503673553467, + "learning_rate": 2e-05, + "loss": 0.04588886, + "step": 20582 + }, + { + "epoch": 41.166, + "grad_norm": 1.4302246570587158, + "learning_rate": 2e-05, + "loss": 0.0555368, + "step": 20583 + }, + { + "epoch": 41.168, + "grad_norm": 1.1271694898605347, + "learning_rate": 2e-05, + "loss": 0.04541831, + "step": 20584 + }, + { + "epoch": 41.17, + "grad_norm": 1.3913869857788086, + "learning_rate": 2e-05, + "loss": 0.05388426, + "step": 20585 + }, + { + "epoch": 41.172, + "grad_norm": 1.4658206701278687, + "learning_rate": 2e-05, + "loss": 0.05629757, + "step": 20586 + }, + { + "epoch": 41.174, + "grad_norm": 1.5316065549850464, + "learning_rate": 2e-05, + "loss": 0.04545237, + "step": 20587 + }, + { + "epoch": 41.176, + "grad_norm": 1.3376739025115967, + "learning_rate": 2e-05, + "loss": 0.04239886, + "step": 20588 + }, + { + "epoch": 41.178, + "grad_norm": 1.168369174003601, + "learning_rate": 2e-05, + "loss": 0.04730329, + "step": 20589 + }, + { + "epoch": 41.18, + "grad_norm": 1.132712483406067, + "learning_rate": 2e-05, + "loss": 0.0453882, + "step": 20590 + }, + { + "epoch": 41.182, + "grad_norm": 1.2484111785888672, + "learning_rate": 2e-05, + "loss": 0.05077428, + "step": 20591 + }, + { + "epoch": 41.184, + "grad_norm": 1.2351433038711548, + "learning_rate": 2e-05, + "loss": 0.04053054, + "step": 20592 + }, + { + "epoch": 41.186, + "grad_norm": 1.127816081047058, + "learning_rate": 2e-05, + "loss": 0.03557635, + "step": 20593 + }, + { + "epoch": 41.188, + "grad_norm": 1.6981974840164185, + "learning_rate": 2e-05, + "loss": 0.05980735, + "step": 20594 + }, + { + "epoch": 41.19, + "grad_norm": 1.2019031047821045, + "learning_rate": 2e-05, + "loss": 0.04414729, + "step": 20595 + }, + { + "epoch": 41.192, + "grad_norm": 2.026304244995117, + "learning_rate": 2e-05, + "loss": 0.06682765, + "step": 20596 + }, + { + "epoch": 41.194, + "grad_norm": 1.1906565427780151, + "learning_rate": 2e-05, + "loss": 0.04870375, + "step": 20597 + }, + { + "epoch": 41.196, + "grad_norm": 1.194569706916809, + "learning_rate": 2e-05, + "loss": 0.05204153, + "step": 20598 + }, + { + "epoch": 41.198, + "grad_norm": 1.4061965942382812, + "learning_rate": 2e-05, + "loss": 0.05822208, + "step": 20599 + }, + { + "epoch": 41.2, + "grad_norm": 1.2633558511734009, + "learning_rate": 2e-05, + "loss": 0.0512266, + "step": 20600 + }, + { + "epoch": 41.202, + 
"grad_norm": 1.4011318683624268, + "learning_rate": 2e-05, + "loss": 0.05827274, + "step": 20601 + }, + { + "epoch": 41.204, + "grad_norm": 1.0192710161209106, + "learning_rate": 2e-05, + "loss": 0.03732585, + "step": 20602 + }, + { + "epoch": 41.206, + "grad_norm": 1.1007825136184692, + "learning_rate": 2e-05, + "loss": 0.04062988, + "step": 20603 + }, + { + "epoch": 41.208, + "grad_norm": 1.4489682912826538, + "learning_rate": 2e-05, + "loss": 0.04679845, + "step": 20604 + }, + { + "epoch": 41.21, + "grad_norm": 1.3930132389068604, + "learning_rate": 2e-05, + "loss": 0.04038769, + "step": 20605 + }, + { + "epoch": 41.212, + "grad_norm": 1.1776801347732544, + "learning_rate": 2e-05, + "loss": 0.05422183, + "step": 20606 + }, + { + "epoch": 41.214, + "grad_norm": 1.3811092376708984, + "learning_rate": 2e-05, + "loss": 0.05448051, + "step": 20607 + }, + { + "epoch": 41.216, + "grad_norm": 1.6993544101715088, + "learning_rate": 2e-05, + "loss": 0.05991393, + "step": 20608 + }, + { + "epoch": 41.218, + "grad_norm": 1.1804338693618774, + "learning_rate": 2e-05, + "loss": 0.0513094, + "step": 20609 + }, + { + "epoch": 41.22, + "grad_norm": 1.2084537744522095, + "learning_rate": 2e-05, + "loss": 0.04306343, + "step": 20610 + }, + { + "epoch": 41.222, + "grad_norm": 2.1294620037078857, + "learning_rate": 2e-05, + "loss": 0.05107892, + "step": 20611 + }, + { + "epoch": 41.224, + "grad_norm": 1.1537929773330688, + "learning_rate": 2e-05, + "loss": 0.04525739, + "step": 20612 + }, + { + "epoch": 41.226, + "grad_norm": 1.0452954769134521, + "learning_rate": 2e-05, + "loss": 0.03667298, + "step": 20613 + }, + { + "epoch": 41.228, + "grad_norm": 1.2010400295257568, + "learning_rate": 2e-05, + "loss": 0.03450677, + "step": 20614 + }, + { + "epoch": 41.23, + "grad_norm": 1.1608492136001587, + "learning_rate": 2e-05, + "loss": 0.0470752, + "step": 20615 + }, + { + "epoch": 41.232, + "grad_norm": 1.367234706878662, + "learning_rate": 2e-05, + "loss": 0.04004624, + "step": 20616 + }, + { + "epoch": 41.234, + "grad_norm": 3.438629388809204, + "learning_rate": 2e-05, + "loss": 0.05718534, + "step": 20617 + }, + { + "epoch": 41.236, + "grad_norm": 1.245473861694336, + "learning_rate": 2e-05, + "loss": 0.04939371, + "step": 20618 + }, + { + "epoch": 41.238, + "grad_norm": 1.5737378597259521, + "learning_rate": 2e-05, + "loss": 0.06649335, + "step": 20619 + }, + { + "epoch": 41.24, + "grad_norm": 2.474226951599121, + "learning_rate": 2e-05, + "loss": 0.05714843, + "step": 20620 + }, + { + "epoch": 41.242, + "grad_norm": 1.1201341152191162, + "learning_rate": 2e-05, + "loss": 0.03632017, + "step": 20621 + }, + { + "epoch": 41.244, + "grad_norm": 1.3226417303085327, + "learning_rate": 2e-05, + "loss": 0.05535428, + "step": 20622 + }, + { + "epoch": 41.246, + "grad_norm": 1.5820775032043457, + "learning_rate": 2e-05, + "loss": 0.06497006, + "step": 20623 + }, + { + "epoch": 41.248, + "grad_norm": 1.6483792066574097, + "learning_rate": 2e-05, + "loss": 0.04980063, + "step": 20624 + }, + { + "epoch": 41.25, + "grad_norm": 0.8462343215942383, + "learning_rate": 2e-05, + "loss": 0.0305022, + "step": 20625 + }, + { + "epoch": 41.252, + "grad_norm": 1.3294532299041748, + "learning_rate": 2e-05, + "loss": 0.05518986, + "step": 20626 + }, + { + "epoch": 41.254, + "grad_norm": 1.0134047269821167, + "learning_rate": 2e-05, + "loss": 0.04463737, + "step": 20627 + }, + { + "epoch": 41.256, + "grad_norm": 1.3394545316696167, + "learning_rate": 2e-05, + "loss": 0.06053488, + "step": 20628 + }, + { + "epoch": 41.258, + 
"grad_norm": 1.1298450231552124, + "learning_rate": 2e-05, + "loss": 0.03974798, + "step": 20629 + }, + { + "epoch": 41.26, + "grad_norm": 1.2260725498199463, + "learning_rate": 2e-05, + "loss": 0.04857574, + "step": 20630 + }, + { + "epoch": 41.262, + "grad_norm": 1.2204060554504395, + "learning_rate": 2e-05, + "loss": 0.04478389, + "step": 20631 + }, + { + "epoch": 41.264, + "grad_norm": 2.948246479034424, + "learning_rate": 2e-05, + "loss": 0.05024678, + "step": 20632 + }, + { + "epoch": 41.266, + "grad_norm": 1.3143848180770874, + "learning_rate": 2e-05, + "loss": 0.04200232, + "step": 20633 + }, + { + "epoch": 41.268, + "grad_norm": 1.1079685688018799, + "learning_rate": 2e-05, + "loss": 0.03887783, + "step": 20634 + }, + { + "epoch": 41.27, + "grad_norm": 1.3387101888656616, + "learning_rate": 2e-05, + "loss": 0.04824906, + "step": 20635 + }, + { + "epoch": 41.272, + "grad_norm": 1.2918139696121216, + "learning_rate": 2e-05, + "loss": 0.05828239, + "step": 20636 + }, + { + "epoch": 41.274, + "grad_norm": 1.2324050664901733, + "learning_rate": 2e-05, + "loss": 0.05050058, + "step": 20637 + }, + { + "epoch": 41.276, + "grad_norm": 1.08914315700531, + "learning_rate": 2e-05, + "loss": 0.03811881, + "step": 20638 + }, + { + "epoch": 41.278, + "grad_norm": 1.5355799198150635, + "learning_rate": 2e-05, + "loss": 0.06125507, + "step": 20639 + }, + { + "epoch": 41.28, + "grad_norm": 1.1275166273117065, + "learning_rate": 2e-05, + "loss": 0.04609001, + "step": 20640 + }, + { + "epoch": 41.282, + "grad_norm": 1.394644021987915, + "learning_rate": 2e-05, + "loss": 0.05548574, + "step": 20641 + }, + { + "epoch": 41.284, + "grad_norm": 1.1803781986236572, + "learning_rate": 2e-05, + "loss": 0.04673609, + "step": 20642 + }, + { + "epoch": 41.286, + "grad_norm": 1.088181972503662, + "learning_rate": 2e-05, + "loss": 0.04835239, + "step": 20643 + }, + { + "epoch": 41.288, + "grad_norm": 1.2936146259307861, + "learning_rate": 2e-05, + "loss": 0.0537074, + "step": 20644 + }, + { + "epoch": 41.29, + "grad_norm": 1.0376815795898438, + "learning_rate": 2e-05, + "loss": 0.04589043, + "step": 20645 + }, + { + "epoch": 41.292, + "grad_norm": 1.2658627033233643, + "learning_rate": 2e-05, + "loss": 0.04795525, + "step": 20646 + }, + { + "epoch": 41.294, + "grad_norm": 1.5672342777252197, + "learning_rate": 2e-05, + "loss": 0.06098137, + "step": 20647 + }, + { + "epoch": 41.296, + "grad_norm": 1.1743220090866089, + "learning_rate": 2e-05, + "loss": 0.03908478, + "step": 20648 + }, + { + "epoch": 41.298, + "grad_norm": 1.368518590927124, + "learning_rate": 2e-05, + "loss": 0.03311605, + "step": 20649 + }, + { + "epoch": 41.3, + "grad_norm": 0.9655233025550842, + "learning_rate": 2e-05, + "loss": 0.03225183, + "step": 20650 + }, + { + "epoch": 41.302, + "grad_norm": 2.152411937713623, + "learning_rate": 2e-05, + "loss": 0.04487934, + "step": 20651 + }, + { + "epoch": 41.304, + "grad_norm": 1.6587146520614624, + "learning_rate": 2e-05, + "loss": 0.05637205, + "step": 20652 + }, + { + "epoch": 41.306, + "grad_norm": 1.024804949760437, + "learning_rate": 2e-05, + "loss": 0.03787657, + "step": 20653 + }, + { + "epoch": 41.308, + "grad_norm": 0.8710086941719055, + "learning_rate": 2e-05, + "loss": 0.02578922, + "step": 20654 + }, + { + "epoch": 41.31, + "grad_norm": 1.093598484992981, + "learning_rate": 2e-05, + "loss": 0.03931287, + "step": 20655 + }, + { + "epoch": 41.312, + "grad_norm": 1.3158975839614868, + "learning_rate": 2e-05, + "loss": 0.0523903, + "step": 20656 + }, + { + "epoch": 41.314, + "grad_norm": 
1.2842652797698975, + "learning_rate": 2e-05, + "loss": 0.0551443, + "step": 20657 + }, + { + "epoch": 41.316, + "grad_norm": 0.9673425555229187, + "learning_rate": 2e-05, + "loss": 0.03543061, + "step": 20658 + }, + { + "epoch": 41.318, + "grad_norm": 1.4158899784088135, + "learning_rate": 2e-05, + "loss": 0.05106701, + "step": 20659 + }, + { + "epoch": 41.32, + "grad_norm": 1.7559031248092651, + "learning_rate": 2e-05, + "loss": 0.03872785, + "step": 20660 + }, + { + "epoch": 41.322, + "grad_norm": 1.1263608932495117, + "learning_rate": 2e-05, + "loss": 0.03599942, + "step": 20661 + }, + { + "epoch": 41.324, + "grad_norm": 0.9912429451942444, + "learning_rate": 2e-05, + "loss": 0.02866612, + "step": 20662 + }, + { + "epoch": 41.326, + "grad_norm": 1.0485600233078003, + "learning_rate": 2e-05, + "loss": 0.03809555, + "step": 20663 + }, + { + "epoch": 41.328, + "grad_norm": 2.218872308731079, + "learning_rate": 2e-05, + "loss": 0.05056027, + "step": 20664 + }, + { + "epoch": 41.33, + "grad_norm": 1.2448780536651611, + "learning_rate": 2e-05, + "loss": 0.06671609, + "step": 20665 + }, + { + "epoch": 41.332, + "grad_norm": 1.1735291481018066, + "learning_rate": 2e-05, + "loss": 0.04490344, + "step": 20666 + }, + { + "epoch": 41.334, + "grad_norm": 1.6742703914642334, + "learning_rate": 2e-05, + "loss": 0.06206613, + "step": 20667 + }, + { + "epoch": 41.336, + "grad_norm": 1.2419872283935547, + "learning_rate": 2e-05, + "loss": 0.03979301, + "step": 20668 + }, + { + "epoch": 41.338, + "grad_norm": 2.8356029987335205, + "learning_rate": 2e-05, + "loss": 0.05524603, + "step": 20669 + }, + { + "epoch": 41.34, + "grad_norm": 2.10284161567688, + "learning_rate": 2e-05, + "loss": 0.05853007, + "step": 20670 + }, + { + "epoch": 41.342, + "grad_norm": 1.0346118211746216, + "learning_rate": 2e-05, + "loss": 0.0342777, + "step": 20671 + }, + { + "epoch": 41.344, + "grad_norm": 1.1584864854812622, + "learning_rate": 2e-05, + "loss": 0.04522749, + "step": 20672 + }, + { + "epoch": 41.346, + "grad_norm": 1.0336374044418335, + "learning_rate": 2e-05, + "loss": 0.0341766, + "step": 20673 + }, + { + "epoch": 41.348, + "grad_norm": 1.2833257913589478, + "learning_rate": 2e-05, + "loss": 0.04493989, + "step": 20674 + }, + { + "epoch": 41.35, + "grad_norm": 1.196931004524231, + "learning_rate": 2e-05, + "loss": 0.03288671, + "step": 20675 + }, + { + "epoch": 41.352, + "grad_norm": 1.6481664180755615, + "learning_rate": 2e-05, + "loss": 0.04984744, + "step": 20676 + }, + { + "epoch": 41.354, + "grad_norm": 1.3201029300689697, + "learning_rate": 2e-05, + "loss": 0.05488572, + "step": 20677 + }, + { + "epoch": 41.356, + "grad_norm": 1.2787818908691406, + "learning_rate": 2e-05, + "loss": 0.0575133, + "step": 20678 + }, + { + "epoch": 41.358, + "grad_norm": 1.1592620611190796, + "learning_rate": 2e-05, + "loss": 0.04519965, + "step": 20679 + }, + { + "epoch": 41.36, + "grad_norm": 1.7062658071517944, + "learning_rate": 2e-05, + "loss": 0.04929303, + "step": 20680 + }, + { + "epoch": 41.362, + "grad_norm": 1.4524036645889282, + "learning_rate": 2e-05, + "loss": 0.05601212, + "step": 20681 + }, + { + "epoch": 41.364, + "grad_norm": 1.2186477184295654, + "learning_rate": 2e-05, + "loss": 0.0421271, + "step": 20682 + }, + { + "epoch": 41.366, + "grad_norm": 1.7632216215133667, + "learning_rate": 2e-05, + "loss": 0.05948129, + "step": 20683 + }, + { + "epoch": 41.368, + "grad_norm": 1.5571186542510986, + "learning_rate": 2e-05, + "loss": 0.04970782, + "step": 20684 + }, + { + "epoch": 41.37, + "grad_norm": 
1.611167311668396, + "learning_rate": 2e-05, + "loss": 0.04187579, + "step": 20685 + }, + { + "epoch": 41.372, + "grad_norm": 1.0352783203125, + "learning_rate": 2e-05, + "loss": 0.02740442, + "step": 20686 + }, + { + "epoch": 41.374, + "grad_norm": 1.3376938104629517, + "learning_rate": 2e-05, + "loss": 0.06241329, + "step": 20687 + }, + { + "epoch": 41.376, + "grad_norm": 1.135635495185852, + "learning_rate": 2e-05, + "loss": 0.04911846, + "step": 20688 + }, + { + "epoch": 41.378, + "grad_norm": 1.2492220401763916, + "learning_rate": 2e-05, + "loss": 0.05272296, + "step": 20689 + }, + { + "epoch": 41.38, + "grad_norm": 1.3152111768722534, + "learning_rate": 2e-05, + "loss": 0.05239505, + "step": 20690 + }, + { + "epoch": 41.382, + "grad_norm": 1.3397350311279297, + "learning_rate": 2e-05, + "loss": 0.05067999, + "step": 20691 + }, + { + "epoch": 41.384, + "grad_norm": 1.1833877563476562, + "learning_rate": 2e-05, + "loss": 0.04897395, + "step": 20692 + }, + { + "epoch": 41.386, + "grad_norm": 1.2837327718734741, + "learning_rate": 2e-05, + "loss": 0.05838244, + "step": 20693 + }, + { + "epoch": 41.388, + "grad_norm": 1.150177240371704, + "learning_rate": 2e-05, + "loss": 0.04375577, + "step": 20694 + }, + { + "epoch": 41.39, + "grad_norm": 1.4795578718185425, + "learning_rate": 2e-05, + "loss": 0.05091795, + "step": 20695 + }, + { + "epoch": 41.392, + "grad_norm": 1.2051365375518799, + "learning_rate": 2e-05, + "loss": 0.05093031, + "step": 20696 + }, + { + "epoch": 41.394, + "grad_norm": 0.9962400197982788, + "learning_rate": 2e-05, + "loss": 0.04091921, + "step": 20697 + }, + { + "epoch": 41.396, + "grad_norm": 2.7451603412628174, + "learning_rate": 2e-05, + "loss": 0.05224776, + "step": 20698 + }, + { + "epoch": 41.398, + "grad_norm": 1.16928231716156, + "learning_rate": 2e-05, + "loss": 0.05099126, + "step": 20699 + }, + { + "epoch": 41.4, + "grad_norm": 1.571977972984314, + "learning_rate": 2e-05, + "loss": 0.04833336, + "step": 20700 + }, + { + "epoch": 41.402, + "grad_norm": 1.3276727199554443, + "learning_rate": 2e-05, + "loss": 0.04746715, + "step": 20701 + }, + { + "epoch": 41.404, + "grad_norm": 1.438899040222168, + "learning_rate": 2e-05, + "loss": 0.05122091, + "step": 20702 + }, + { + "epoch": 41.406, + "grad_norm": 1.1340113878250122, + "learning_rate": 2e-05, + "loss": 0.04921956, + "step": 20703 + }, + { + "epoch": 41.408, + "grad_norm": 1.3475137948989868, + "learning_rate": 2e-05, + "loss": 0.0532689, + "step": 20704 + }, + { + "epoch": 41.41, + "grad_norm": 3.48881196975708, + "learning_rate": 2e-05, + "loss": 0.05500069, + "step": 20705 + }, + { + "epoch": 41.412, + "grad_norm": 1.010732889175415, + "learning_rate": 2e-05, + "loss": 0.03719215, + "step": 20706 + }, + { + "epoch": 41.414, + "grad_norm": 1.2824602127075195, + "learning_rate": 2e-05, + "loss": 0.05517687, + "step": 20707 + }, + { + "epoch": 41.416, + "grad_norm": 1.1974467039108276, + "learning_rate": 2e-05, + "loss": 0.03475419, + "step": 20708 + }, + { + "epoch": 41.418, + "grad_norm": 0.9742454290390015, + "learning_rate": 2e-05, + "loss": 0.03150909, + "step": 20709 + }, + { + "epoch": 41.42, + "grad_norm": 4.395928382873535, + "learning_rate": 2e-05, + "loss": 0.06255428, + "step": 20710 + }, + { + "epoch": 41.422, + "grad_norm": 2.005077362060547, + "learning_rate": 2e-05, + "loss": 0.05040678, + "step": 20711 + }, + { + "epoch": 41.424, + "grad_norm": 1.1284871101379395, + "learning_rate": 2e-05, + "loss": 0.04268694, + "step": 20712 + }, + { + "epoch": 41.426, + "grad_norm": 1.1108037233352661, 
+ "learning_rate": 2e-05, + "loss": 0.05018146, + "step": 20713 + }, + { + "epoch": 41.428, + "grad_norm": 1.2900370359420776, + "learning_rate": 2e-05, + "loss": 0.05577334, + "step": 20714 + }, + { + "epoch": 41.43, + "grad_norm": 1.625792145729065, + "learning_rate": 2e-05, + "loss": 0.0519428, + "step": 20715 + }, + { + "epoch": 41.432, + "grad_norm": 1.2618361711502075, + "learning_rate": 2e-05, + "loss": 0.06108624, + "step": 20716 + }, + { + "epoch": 41.434, + "grad_norm": 2.6890978813171387, + "learning_rate": 2e-05, + "loss": 0.05818431, + "step": 20717 + }, + { + "epoch": 41.436, + "grad_norm": 1.5866303443908691, + "learning_rate": 2e-05, + "loss": 0.05809556, + "step": 20718 + }, + { + "epoch": 41.438, + "grad_norm": 3.1994481086730957, + "learning_rate": 2e-05, + "loss": 0.03941204, + "step": 20719 + }, + { + "epoch": 41.44, + "grad_norm": 0.9686678051948547, + "learning_rate": 2e-05, + "loss": 0.03777551, + "step": 20720 + }, + { + "epoch": 41.442, + "grad_norm": 1.2619547843933105, + "learning_rate": 2e-05, + "loss": 0.05709205, + "step": 20721 + }, + { + "epoch": 41.444, + "grad_norm": 1.1282532215118408, + "learning_rate": 2e-05, + "loss": 0.03835287, + "step": 20722 + }, + { + "epoch": 41.446, + "grad_norm": 1.2010022401809692, + "learning_rate": 2e-05, + "loss": 0.04767352, + "step": 20723 + }, + { + "epoch": 41.448, + "grad_norm": 1.447494387626648, + "learning_rate": 2e-05, + "loss": 0.05089083, + "step": 20724 + }, + { + "epoch": 41.45, + "grad_norm": 1.101004719734192, + "learning_rate": 2e-05, + "loss": 0.0408543, + "step": 20725 + }, + { + "epoch": 41.452, + "grad_norm": 1.2500224113464355, + "learning_rate": 2e-05, + "loss": 0.03975309, + "step": 20726 + }, + { + "epoch": 41.454, + "grad_norm": 1.746552586555481, + "learning_rate": 2e-05, + "loss": 0.06327257, + "step": 20727 + }, + { + "epoch": 41.456, + "grad_norm": 1.121049165725708, + "learning_rate": 2e-05, + "loss": 0.03917, + "step": 20728 + }, + { + "epoch": 41.458, + "grad_norm": 1.5828957557678223, + "learning_rate": 2e-05, + "loss": 0.04092988, + "step": 20729 + }, + { + "epoch": 41.46, + "grad_norm": 1.3952829837799072, + "learning_rate": 2e-05, + "loss": 0.05349658, + "step": 20730 + }, + { + "epoch": 41.462, + "grad_norm": 1.1374624967575073, + "learning_rate": 2e-05, + "loss": 0.04330311, + "step": 20731 + }, + { + "epoch": 41.464, + "grad_norm": 2.1786279678344727, + "learning_rate": 2e-05, + "loss": 0.04306266, + "step": 20732 + }, + { + "epoch": 41.466, + "grad_norm": 1.1938217878341675, + "learning_rate": 2e-05, + "loss": 0.04537656, + "step": 20733 + }, + { + "epoch": 41.468, + "grad_norm": 1.673171877861023, + "learning_rate": 2e-05, + "loss": 0.04567283, + "step": 20734 + }, + { + "epoch": 41.47, + "grad_norm": 1.8601455688476562, + "learning_rate": 2e-05, + "loss": 0.04573951, + "step": 20735 + }, + { + "epoch": 41.472, + "grad_norm": 1.2552533149719238, + "learning_rate": 2e-05, + "loss": 0.05038974, + "step": 20736 + }, + { + "epoch": 41.474, + "grad_norm": 1.2557790279388428, + "learning_rate": 2e-05, + "loss": 0.04997993, + "step": 20737 + }, + { + "epoch": 41.476, + "grad_norm": 1.156934142112732, + "learning_rate": 2e-05, + "loss": 0.04020359, + "step": 20738 + }, + { + "epoch": 41.478, + "grad_norm": 1.0187697410583496, + "learning_rate": 2e-05, + "loss": 0.0350149, + "step": 20739 + }, + { + "epoch": 41.48, + "grad_norm": 1.5980960130691528, + "learning_rate": 2e-05, + "loss": 0.05507297, + "step": 20740 + }, + { + "epoch": 41.482, + "grad_norm": 1.2604917287826538, + 
"learning_rate": 2e-05, + "loss": 0.04280219, + "step": 20741 + }, + { + "epoch": 41.484, + "grad_norm": 1.2176854610443115, + "learning_rate": 2e-05, + "loss": 0.04632607, + "step": 20742 + }, + { + "epoch": 41.486, + "grad_norm": 1.7439926862716675, + "learning_rate": 2e-05, + "loss": 0.06102863, + "step": 20743 + }, + { + "epoch": 41.488, + "grad_norm": 1.1306809186935425, + "learning_rate": 2e-05, + "loss": 0.04056504, + "step": 20744 + }, + { + "epoch": 41.49, + "grad_norm": 1.6267520189285278, + "learning_rate": 2e-05, + "loss": 0.07167388, + "step": 20745 + }, + { + "epoch": 41.492, + "grad_norm": 1.483768343925476, + "learning_rate": 2e-05, + "loss": 0.03321824, + "step": 20746 + }, + { + "epoch": 41.494, + "grad_norm": 1.3936223983764648, + "learning_rate": 2e-05, + "loss": 0.06460038, + "step": 20747 + }, + { + "epoch": 41.496, + "grad_norm": 1.23568594455719, + "learning_rate": 2e-05, + "loss": 0.06132279, + "step": 20748 + }, + { + "epoch": 41.498, + "grad_norm": 1.2954548597335815, + "learning_rate": 2e-05, + "loss": 0.04977329, + "step": 20749 + }, + { + "epoch": 41.5, + "grad_norm": 1.190082311630249, + "learning_rate": 2e-05, + "loss": 0.05261871, + "step": 20750 + }, + { + "epoch": 41.502, + "grad_norm": 1.1780047416687012, + "learning_rate": 2e-05, + "loss": 0.03741407, + "step": 20751 + }, + { + "epoch": 41.504, + "grad_norm": 1.4033395051956177, + "learning_rate": 2e-05, + "loss": 0.04731531, + "step": 20752 + }, + { + "epoch": 41.506, + "grad_norm": 0.8769941329956055, + "learning_rate": 2e-05, + "loss": 0.02578051, + "step": 20753 + }, + { + "epoch": 41.508, + "grad_norm": 1.502710223197937, + "learning_rate": 2e-05, + "loss": 0.05851678, + "step": 20754 + }, + { + "epoch": 41.51, + "grad_norm": 1.0792176723480225, + "learning_rate": 2e-05, + "loss": 0.03291167, + "step": 20755 + }, + { + "epoch": 41.512, + "grad_norm": 1.7319297790527344, + "learning_rate": 2e-05, + "loss": 0.05475036, + "step": 20756 + }, + { + "epoch": 41.514, + "grad_norm": 1.918562889099121, + "learning_rate": 2e-05, + "loss": 0.04272918, + "step": 20757 + }, + { + "epoch": 41.516, + "grad_norm": 2.9401967525482178, + "learning_rate": 2e-05, + "loss": 0.05610197, + "step": 20758 + }, + { + "epoch": 41.518, + "grad_norm": 0.8515689373016357, + "learning_rate": 2e-05, + "loss": 0.02853437, + "step": 20759 + }, + { + "epoch": 41.52, + "grad_norm": 1.914589285850525, + "learning_rate": 2e-05, + "loss": 0.05829733, + "step": 20760 + }, + { + "epoch": 41.522, + "grad_norm": 1.0378468036651611, + "learning_rate": 2e-05, + "loss": 0.03754158, + "step": 20761 + }, + { + "epoch": 41.524, + "grad_norm": 1.3927786350250244, + "learning_rate": 2e-05, + "loss": 0.05459459, + "step": 20762 + }, + { + "epoch": 41.526, + "grad_norm": 4.084712028503418, + "learning_rate": 2e-05, + "loss": 0.0557055, + "step": 20763 + }, + { + "epoch": 41.528, + "grad_norm": 1.2816001176834106, + "learning_rate": 2e-05, + "loss": 0.05118892, + "step": 20764 + }, + { + "epoch": 41.53, + "grad_norm": 1.243981122970581, + "learning_rate": 2e-05, + "loss": 0.053567, + "step": 20765 + }, + { + "epoch": 41.532, + "grad_norm": 1.179433822631836, + "learning_rate": 2e-05, + "loss": 0.04334904, + "step": 20766 + }, + { + "epoch": 41.534, + "grad_norm": 1.8389700651168823, + "learning_rate": 2e-05, + "loss": 0.03809846, + "step": 20767 + }, + { + "epoch": 41.536, + "grad_norm": 1.2287765741348267, + "learning_rate": 2e-05, + "loss": 0.05584181, + "step": 20768 + }, + { + "epoch": 41.538, + "grad_norm": 1.2944599390029907, + "learning_rate": 
2e-05, + "loss": 0.04906465, + "step": 20769 + }, + { + "epoch": 41.54, + "grad_norm": 1.9301244020462036, + "learning_rate": 2e-05, + "loss": 0.04297593, + "step": 20770 + }, + { + "epoch": 41.542, + "grad_norm": 1.5391250848770142, + "learning_rate": 2e-05, + "loss": 0.05037332, + "step": 20771 + }, + { + "epoch": 41.544, + "grad_norm": 1.3913311958312988, + "learning_rate": 2e-05, + "loss": 0.0544315, + "step": 20772 + }, + { + "epoch": 41.546, + "grad_norm": 0.9265551567077637, + "learning_rate": 2e-05, + "loss": 0.0323727, + "step": 20773 + }, + { + "epoch": 41.548, + "grad_norm": 1.0124329328536987, + "learning_rate": 2e-05, + "loss": 0.0399297, + "step": 20774 + }, + { + "epoch": 41.55, + "grad_norm": 1.2719837427139282, + "learning_rate": 2e-05, + "loss": 0.04783987, + "step": 20775 + }, + { + "epoch": 41.552, + "grad_norm": 1.9327387809753418, + "learning_rate": 2e-05, + "loss": 0.04304921, + "step": 20776 + }, + { + "epoch": 41.554, + "grad_norm": 1.8879715204238892, + "learning_rate": 2e-05, + "loss": 0.03215152, + "step": 20777 + }, + { + "epoch": 41.556, + "grad_norm": 1.0443545579910278, + "learning_rate": 2e-05, + "loss": 0.04257246, + "step": 20778 + }, + { + "epoch": 41.558, + "grad_norm": 1.7671880722045898, + "learning_rate": 2e-05, + "loss": 0.04795214, + "step": 20779 + }, + { + "epoch": 41.56, + "grad_norm": 1.3257173299789429, + "learning_rate": 2e-05, + "loss": 0.05144795, + "step": 20780 + }, + { + "epoch": 41.562, + "grad_norm": 1.5164159536361694, + "learning_rate": 2e-05, + "loss": 0.05290364, + "step": 20781 + }, + { + "epoch": 41.564, + "grad_norm": 1.2703793048858643, + "learning_rate": 2e-05, + "loss": 0.05436259, + "step": 20782 + }, + { + "epoch": 41.566, + "grad_norm": 1.13009774684906, + "learning_rate": 2e-05, + "loss": 0.03706975, + "step": 20783 + }, + { + "epoch": 41.568, + "grad_norm": 1.3934050798416138, + "learning_rate": 2e-05, + "loss": 0.04990171, + "step": 20784 + }, + { + "epoch": 41.57, + "grad_norm": 1.6412134170532227, + "learning_rate": 2e-05, + "loss": 0.04792435, + "step": 20785 + }, + { + "epoch": 41.572, + "grad_norm": 1.3018213510513306, + "learning_rate": 2e-05, + "loss": 0.06655528, + "step": 20786 + }, + { + "epoch": 41.574, + "grad_norm": 1.3327934741973877, + "learning_rate": 2e-05, + "loss": 0.04949384, + "step": 20787 + }, + { + "epoch": 41.576, + "grad_norm": 1.173862338066101, + "learning_rate": 2e-05, + "loss": 0.05083285, + "step": 20788 + }, + { + "epoch": 41.578, + "grad_norm": 1.3000322580337524, + "learning_rate": 2e-05, + "loss": 0.05392583, + "step": 20789 + }, + { + "epoch": 41.58, + "grad_norm": 1.50508451461792, + "learning_rate": 2e-05, + "loss": 0.03634304, + "step": 20790 + }, + { + "epoch": 41.582, + "grad_norm": 1.342342734336853, + "learning_rate": 2e-05, + "loss": 0.05079833, + "step": 20791 + }, + { + "epoch": 41.584, + "grad_norm": 1.1908931732177734, + "learning_rate": 2e-05, + "loss": 0.0528966, + "step": 20792 + }, + { + "epoch": 41.586, + "grad_norm": 1.2644261121749878, + "learning_rate": 2e-05, + "loss": 0.04991703, + "step": 20793 + }, + { + "epoch": 41.588, + "grad_norm": 1.19503915309906, + "learning_rate": 2e-05, + "loss": 0.04296641, + "step": 20794 + }, + { + "epoch": 41.59, + "grad_norm": 1.3065739870071411, + "learning_rate": 2e-05, + "loss": 0.04288668, + "step": 20795 + }, + { + "epoch": 41.592, + "grad_norm": 1.5874643325805664, + "learning_rate": 2e-05, + "loss": 0.06031433, + "step": 20796 + }, + { + "epoch": 41.594, + "grad_norm": 1.4615437984466553, + "learning_rate": 2e-05, + "loss": 
0.06782658, + "step": 20797 + }, + { + "epoch": 41.596, + "grad_norm": 1.0287331342697144, + "learning_rate": 2e-05, + "loss": 0.03726696, + "step": 20798 + }, + { + "epoch": 41.598, + "grad_norm": 2.5669822692871094, + "learning_rate": 2e-05, + "loss": 0.05073772, + "step": 20799 + }, + { + "epoch": 41.6, + "grad_norm": 1.9865378141403198, + "learning_rate": 2e-05, + "loss": 0.06835793, + "step": 20800 + }, + { + "epoch": 41.602, + "grad_norm": 1.1996169090270996, + "learning_rate": 2e-05, + "loss": 0.0441338, + "step": 20801 + }, + { + "epoch": 41.604, + "grad_norm": 1.1404577493667603, + "learning_rate": 2e-05, + "loss": 0.05091837, + "step": 20802 + }, + { + "epoch": 41.606, + "grad_norm": 1.366138219833374, + "learning_rate": 2e-05, + "loss": 0.05536352, + "step": 20803 + }, + { + "epoch": 41.608, + "grad_norm": 1.2868354320526123, + "learning_rate": 2e-05, + "loss": 0.05645128, + "step": 20804 + }, + { + "epoch": 41.61, + "grad_norm": 1.2163975238800049, + "learning_rate": 2e-05, + "loss": 0.04356652, + "step": 20805 + }, + { + "epoch": 41.612, + "grad_norm": 1.5239417552947998, + "learning_rate": 2e-05, + "loss": 0.05934812, + "step": 20806 + }, + { + "epoch": 41.614, + "grad_norm": 1.641875982284546, + "learning_rate": 2e-05, + "loss": 0.0451197, + "step": 20807 + }, + { + "epoch": 41.616, + "grad_norm": 1.4720548391342163, + "learning_rate": 2e-05, + "loss": 0.05895507, + "step": 20808 + }, + { + "epoch": 41.618, + "grad_norm": 1.20379638671875, + "learning_rate": 2e-05, + "loss": 0.05722809, + "step": 20809 + }, + { + "epoch": 41.62, + "grad_norm": 1.206185221672058, + "learning_rate": 2e-05, + "loss": 0.04924655, + "step": 20810 + }, + { + "epoch": 41.622, + "grad_norm": 1.3422046899795532, + "learning_rate": 2e-05, + "loss": 0.04705754, + "step": 20811 + }, + { + "epoch": 41.624, + "grad_norm": 1.251327395439148, + "learning_rate": 2e-05, + "loss": 0.03359656, + "step": 20812 + }, + { + "epoch": 41.626, + "grad_norm": 1.0493559837341309, + "learning_rate": 2e-05, + "loss": 0.03650389, + "step": 20813 + }, + { + "epoch": 41.628, + "grad_norm": 1.1283539533615112, + "learning_rate": 2e-05, + "loss": 0.04552049, + "step": 20814 + }, + { + "epoch": 41.63, + "grad_norm": 1.2148489952087402, + "learning_rate": 2e-05, + "loss": 0.04333513, + "step": 20815 + }, + { + "epoch": 41.632, + "grad_norm": 1.1849173307418823, + "learning_rate": 2e-05, + "loss": 0.04451859, + "step": 20816 + }, + { + "epoch": 41.634, + "grad_norm": 1.035466194152832, + "learning_rate": 2e-05, + "loss": 0.03730474, + "step": 20817 + }, + { + "epoch": 41.636, + "grad_norm": 1.3401261568069458, + "learning_rate": 2e-05, + "loss": 0.03913242, + "step": 20818 + }, + { + "epoch": 41.638, + "grad_norm": 1.3163193464279175, + "learning_rate": 2e-05, + "loss": 0.04887679, + "step": 20819 + }, + { + "epoch": 41.64, + "grad_norm": 1.2960766553878784, + "learning_rate": 2e-05, + "loss": 0.05596934, + "step": 20820 + }, + { + "epoch": 41.642, + "grad_norm": 1.2313705682754517, + "learning_rate": 2e-05, + "loss": 0.05113539, + "step": 20821 + }, + { + "epoch": 41.644, + "grad_norm": 1.16353178024292, + "learning_rate": 2e-05, + "loss": 0.04695116, + "step": 20822 + }, + { + "epoch": 41.646, + "grad_norm": 1.308196783065796, + "learning_rate": 2e-05, + "loss": 0.05837334, + "step": 20823 + }, + { + "epoch": 41.648, + "grad_norm": 1.0429937839508057, + "learning_rate": 2e-05, + "loss": 0.0384528, + "step": 20824 + }, + { + "epoch": 41.65, + "grad_norm": 1.335108995437622, + "learning_rate": 2e-05, + "loss": 0.04670976, + 
"step": 20825 + }, + { + "epoch": 41.652, + "grad_norm": 1.6604396104812622, + "learning_rate": 2e-05, + "loss": 0.04694479, + "step": 20826 + }, + { + "epoch": 41.654, + "grad_norm": 1.2151720523834229, + "learning_rate": 2e-05, + "loss": 0.0505834, + "step": 20827 + }, + { + "epoch": 41.656, + "grad_norm": 1.6010440587997437, + "learning_rate": 2e-05, + "loss": 0.0715237, + "step": 20828 + }, + { + "epoch": 41.658, + "grad_norm": 1.2786024808883667, + "learning_rate": 2e-05, + "loss": 0.06591807, + "step": 20829 + }, + { + "epoch": 41.66, + "grad_norm": 1.1999880075454712, + "learning_rate": 2e-05, + "loss": 0.05343919, + "step": 20830 + }, + { + "epoch": 41.662, + "grad_norm": 1.5012918710708618, + "learning_rate": 2e-05, + "loss": 0.05417911, + "step": 20831 + }, + { + "epoch": 41.664, + "grad_norm": 1.03823983669281, + "learning_rate": 2e-05, + "loss": 0.03913938, + "step": 20832 + }, + { + "epoch": 41.666, + "grad_norm": 1.4988044500350952, + "learning_rate": 2e-05, + "loss": 0.05051984, + "step": 20833 + }, + { + "epoch": 41.668, + "grad_norm": 1.3459997177124023, + "learning_rate": 2e-05, + "loss": 0.05139535, + "step": 20834 + }, + { + "epoch": 41.67, + "grad_norm": 1.1768592596054077, + "learning_rate": 2e-05, + "loss": 0.05571685, + "step": 20835 + }, + { + "epoch": 41.672, + "grad_norm": 1.1096575260162354, + "learning_rate": 2e-05, + "loss": 0.04181407, + "step": 20836 + }, + { + "epoch": 41.674, + "grad_norm": 1.260703444480896, + "learning_rate": 2e-05, + "loss": 0.05937716, + "step": 20837 + }, + { + "epoch": 41.676, + "grad_norm": 1.1465245485305786, + "learning_rate": 2e-05, + "loss": 0.05365495, + "step": 20838 + }, + { + "epoch": 41.678, + "grad_norm": 1.7328318357467651, + "learning_rate": 2e-05, + "loss": 0.06675532, + "step": 20839 + }, + { + "epoch": 41.68, + "grad_norm": 1.3580307960510254, + "learning_rate": 2e-05, + "loss": 0.06565544, + "step": 20840 + }, + { + "epoch": 41.682, + "grad_norm": 1.1711437702178955, + "learning_rate": 2e-05, + "loss": 0.04366954, + "step": 20841 + }, + { + "epoch": 41.684, + "grad_norm": 1.7624998092651367, + "learning_rate": 2e-05, + "loss": 0.05125007, + "step": 20842 + }, + { + "epoch": 41.686, + "grad_norm": 1.7056723833084106, + "learning_rate": 2e-05, + "loss": 0.06327073, + "step": 20843 + }, + { + "epoch": 41.688, + "grad_norm": 1.4002678394317627, + "learning_rate": 2e-05, + "loss": 0.05082846, + "step": 20844 + }, + { + "epoch": 41.69, + "grad_norm": 1.8663007020950317, + "learning_rate": 2e-05, + "loss": 0.06116794, + "step": 20845 + }, + { + "epoch": 41.692, + "grad_norm": 1.311848759651184, + "learning_rate": 2e-05, + "loss": 0.05300967, + "step": 20846 + }, + { + "epoch": 41.694, + "grad_norm": 1.2102714776992798, + "learning_rate": 2e-05, + "loss": 0.05376907, + "step": 20847 + }, + { + "epoch": 41.696, + "grad_norm": 1.0833280086517334, + "learning_rate": 2e-05, + "loss": 0.04561132, + "step": 20848 + }, + { + "epoch": 41.698, + "grad_norm": 1.1252832412719727, + "learning_rate": 2e-05, + "loss": 0.04921568, + "step": 20849 + }, + { + "epoch": 41.7, + "grad_norm": 1.2025973796844482, + "learning_rate": 2e-05, + "loss": 0.04832213, + "step": 20850 + }, + { + "epoch": 41.702, + "grad_norm": 1.0918816328048706, + "learning_rate": 2e-05, + "loss": 0.04648707, + "step": 20851 + }, + { + "epoch": 41.704, + "grad_norm": 1.1413060426712036, + "learning_rate": 2e-05, + "loss": 0.05401688, + "step": 20852 + }, + { + "epoch": 41.706, + "grad_norm": 1.722861886024475, + "learning_rate": 2e-05, + "loss": 0.04710011, + "step": 
20853 + }, + { + "epoch": 41.708, + "grad_norm": 1.183224081993103, + "learning_rate": 2e-05, + "loss": 0.04919672, + "step": 20854 + }, + { + "epoch": 41.71, + "grad_norm": 1.0682933330535889, + "learning_rate": 2e-05, + "loss": 0.04360975, + "step": 20855 + }, + { + "epoch": 41.712, + "grad_norm": 1.3909543752670288, + "learning_rate": 2e-05, + "loss": 0.03963952, + "step": 20856 + }, + { + "epoch": 41.714, + "grad_norm": 1.2624576091766357, + "learning_rate": 2e-05, + "loss": 0.04159348, + "step": 20857 + }, + { + "epoch": 41.716, + "grad_norm": 1.2852903604507446, + "learning_rate": 2e-05, + "loss": 0.06388488, + "step": 20858 + }, + { + "epoch": 41.718, + "grad_norm": 1.2193105220794678, + "learning_rate": 2e-05, + "loss": 0.04823635, + "step": 20859 + }, + { + "epoch": 41.72, + "grad_norm": 1.3638604879379272, + "learning_rate": 2e-05, + "loss": 0.0435699, + "step": 20860 + }, + { + "epoch": 41.722, + "grad_norm": 1.2816431522369385, + "learning_rate": 2e-05, + "loss": 0.05209901, + "step": 20861 + }, + { + "epoch": 41.724, + "grad_norm": 1.994900107383728, + "learning_rate": 2e-05, + "loss": 0.0689984, + "step": 20862 + }, + { + "epoch": 41.726, + "grad_norm": 1.2706905603408813, + "learning_rate": 2e-05, + "loss": 0.04907268, + "step": 20863 + }, + { + "epoch": 41.728, + "grad_norm": 1.134853482246399, + "learning_rate": 2e-05, + "loss": 0.03874713, + "step": 20864 + }, + { + "epoch": 41.73, + "grad_norm": 1.305557370185852, + "learning_rate": 2e-05, + "loss": 0.0443382, + "step": 20865 + }, + { + "epoch": 41.732, + "grad_norm": 1.287753701210022, + "learning_rate": 2e-05, + "loss": 0.05005742, + "step": 20866 + }, + { + "epoch": 41.734, + "grad_norm": 1.2980296611785889, + "learning_rate": 2e-05, + "loss": 0.04606131, + "step": 20867 + }, + { + "epoch": 41.736, + "grad_norm": 1.003082036972046, + "learning_rate": 2e-05, + "loss": 0.03791095, + "step": 20868 + }, + { + "epoch": 41.738, + "grad_norm": 1.2747972011566162, + "learning_rate": 2e-05, + "loss": 0.04731661, + "step": 20869 + }, + { + "epoch": 41.74, + "grad_norm": 1.4059251546859741, + "learning_rate": 2e-05, + "loss": 0.03448366, + "step": 20870 + }, + { + "epoch": 41.742, + "grad_norm": 2.1521401405334473, + "learning_rate": 2e-05, + "loss": 0.06867001, + "step": 20871 + }, + { + "epoch": 41.744, + "grad_norm": 1.4154833555221558, + "learning_rate": 2e-05, + "loss": 0.03487004, + "step": 20872 + }, + { + "epoch": 41.746, + "grad_norm": 1.2275240421295166, + "learning_rate": 2e-05, + "loss": 0.05700513, + "step": 20873 + }, + { + "epoch": 41.748, + "grad_norm": 2.4994146823883057, + "learning_rate": 2e-05, + "loss": 0.05144731, + "step": 20874 + }, + { + "epoch": 41.75, + "grad_norm": 1.8417681455612183, + "learning_rate": 2e-05, + "loss": 0.05225058, + "step": 20875 + }, + { + "epoch": 41.752, + "grad_norm": 1.2966296672821045, + "learning_rate": 2e-05, + "loss": 0.04987405, + "step": 20876 + }, + { + "epoch": 41.754, + "grad_norm": 1.2915031909942627, + "learning_rate": 2e-05, + "loss": 0.05634173, + "step": 20877 + }, + { + "epoch": 41.756, + "grad_norm": 1.4315176010131836, + "learning_rate": 2e-05, + "loss": 0.05782579, + "step": 20878 + }, + { + "epoch": 41.758, + "grad_norm": 1.8965885639190674, + "learning_rate": 2e-05, + "loss": 0.05472075, + "step": 20879 + }, + { + "epoch": 41.76, + "grad_norm": 1.0366774797439575, + "learning_rate": 2e-05, + "loss": 0.03605853, + "step": 20880 + }, + { + "epoch": 41.762, + "grad_norm": 1.6545394659042358, + "learning_rate": 2e-05, + "loss": 0.05220097, + "step": 20881 + }, + 
{ + "epoch": 41.764, + "grad_norm": 1.9253038167953491, + "learning_rate": 2e-05, + "loss": 0.04118697, + "step": 20882 + }, + { + "epoch": 41.766, + "grad_norm": 1.229835033416748, + "learning_rate": 2e-05, + "loss": 0.05954335, + "step": 20883 + }, + { + "epoch": 41.768, + "grad_norm": 2.4672608375549316, + "learning_rate": 2e-05, + "loss": 0.03520424, + "step": 20884 + }, + { + "epoch": 41.77, + "grad_norm": 1.317997694015503, + "learning_rate": 2e-05, + "loss": 0.05394396, + "step": 20885 + }, + { + "epoch": 41.772, + "grad_norm": 1.1028608083724976, + "learning_rate": 2e-05, + "loss": 0.05019546, + "step": 20886 + }, + { + "epoch": 41.774, + "grad_norm": 1.4625998735427856, + "learning_rate": 2e-05, + "loss": 0.05548112, + "step": 20887 + }, + { + "epoch": 41.776, + "grad_norm": 1.5702537298202515, + "learning_rate": 2e-05, + "loss": 0.0497899, + "step": 20888 + }, + { + "epoch": 41.778, + "grad_norm": 1.258571743965149, + "learning_rate": 2e-05, + "loss": 0.04912242, + "step": 20889 + }, + { + "epoch": 41.78, + "grad_norm": 1.0844334363937378, + "learning_rate": 2e-05, + "loss": 0.04057037, + "step": 20890 + }, + { + "epoch": 41.782, + "grad_norm": 1.1522717475891113, + "learning_rate": 2e-05, + "loss": 0.04750651, + "step": 20891 + }, + { + "epoch": 41.784, + "grad_norm": 1.290791630744934, + "learning_rate": 2e-05, + "loss": 0.06190044, + "step": 20892 + }, + { + "epoch": 41.786, + "grad_norm": 1.1737576723098755, + "learning_rate": 2e-05, + "loss": 0.03324655, + "step": 20893 + }, + { + "epoch": 41.788, + "grad_norm": 1.2524381875991821, + "learning_rate": 2e-05, + "loss": 0.03917806, + "step": 20894 + }, + { + "epoch": 41.79, + "grad_norm": 1.3281619548797607, + "learning_rate": 2e-05, + "loss": 0.04660825, + "step": 20895 + }, + { + "epoch": 41.792, + "grad_norm": 1.430392861366272, + "learning_rate": 2e-05, + "loss": 0.07056463, + "step": 20896 + }, + { + "epoch": 41.794, + "grad_norm": 1.4818058013916016, + "learning_rate": 2e-05, + "loss": 0.05624013, + "step": 20897 + }, + { + "epoch": 41.796, + "grad_norm": 1.2507944107055664, + "learning_rate": 2e-05, + "loss": 0.05323803, + "step": 20898 + }, + { + "epoch": 41.798, + "grad_norm": 1.244249701499939, + "learning_rate": 2e-05, + "loss": 0.04751854, + "step": 20899 + }, + { + "epoch": 41.8, + "grad_norm": 2.58414363861084, + "learning_rate": 2e-05, + "loss": 0.05688569, + "step": 20900 + }, + { + "epoch": 41.802, + "grad_norm": 1.0846785306930542, + "learning_rate": 2e-05, + "loss": 0.03785557, + "step": 20901 + }, + { + "epoch": 41.804, + "grad_norm": 1.3131710290908813, + "learning_rate": 2e-05, + "loss": 0.05168329, + "step": 20902 + }, + { + "epoch": 41.806, + "grad_norm": 1.262276530265808, + "learning_rate": 2e-05, + "loss": 0.04058243, + "step": 20903 + }, + { + "epoch": 41.808, + "grad_norm": 1.134749412536621, + "learning_rate": 2e-05, + "loss": 0.033862, + "step": 20904 + }, + { + "epoch": 41.81, + "grad_norm": 1.2034157514572144, + "learning_rate": 2e-05, + "loss": 0.0472676, + "step": 20905 + }, + { + "epoch": 41.812, + "grad_norm": 2.1183056831359863, + "learning_rate": 2e-05, + "loss": 0.04632457, + "step": 20906 + }, + { + "epoch": 41.814, + "grad_norm": 1.1431421041488647, + "learning_rate": 2e-05, + "loss": 0.04334393, + "step": 20907 + }, + { + "epoch": 41.816, + "grad_norm": 1.328840732574463, + "learning_rate": 2e-05, + "loss": 0.04690224, + "step": 20908 + }, + { + "epoch": 41.818, + "grad_norm": 0.96977299451828, + "learning_rate": 2e-05, + "loss": 0.03683947, + "step": 20909 + }, + { + "epoch": 41.82, + 
"grad_norm": 1.1125136613845825, + "learning_rate": 2e-05, + "loss": 0.04391717, + "step": 20910 + }, + { + "epoch": 41.822, + "grad_norm": 1.0546776056289673, + "learning_rate": 2e-05, + "loss": 0.0294345, + "step": 20911 + }, + { + "epoch": 41.824, + "grad_norm": 1.0246164798736572, + "learning_rate": 2e-05, + "loss": 0.04162443, + "step": 20912 + }, + { + "epoch": 41.826, + "grad_norm": 1.2705532312393188, + "learning_rate": 2e-05, + "loss": 0.06132779, + "step": 20913 + }, + { + "epoch": 41.828, + "grad_norm": 1.0475337505340576, + "learning_rate": 2e-05, + "loss": 0.03245585, + "step": 20914 + }, + { + "epoch": 41.83, + "grad_norm": 1.4540488719940186, + "learning_rate": 2e-05, + "loss": 0.04266465, + "step": 20915 + }, + { + "epoch": 41.832, + "grad_norm": 1.089884638786316, + "learning_rate": 2e-05, + "loss": 0.03583428, + "step": 20916 + }, + { + "epoch": 41.834, + "grad_norm": 1.248226284980774, + "learning_rate": 2e-05, + "loss": 0.04630677, + "step": 20917 + }, + { + "epoch": 41.836, + "grad_norm": 1.2985204458236694, + "learning_rate": 2e-05, + "loss": 0.05467469, + "step": 20918 + }, + { + "epoch": 41.838, + "grad_norm": 1.6257275342941284, + "learning_rate": 2e-05, + "loss": 0.05300467, + "step": 20919 + }, + { + "epoch": 41.84, + "grad_norm": 1.0730141401290894, + "learning_rate": 2e-05, + "loss": 0.04086423, + "step": 20920 + }, + { + "epoch": 41.842, + "grad_norm": 1.7589372396469116, + "learning_rate": 2e-05, + "loss": 0.07659753, + "step": 20921 + }, + { + "epoch": 41.844, + "grad_norm": 1.650455355644226, + "learning_rate": 2e-05, + "loss": 0.04704353, + "step": 20922 + }, + { + "epoch": 41.846, + "grad_norm": 0.9814362525939941, + "learning_rate": 2e-05, + "loss": 0.04067108, + "step": 20923 + }, + { + "epoch": 41.848, + "grad_norm": 2.033730983734131, + "learning_rate": 2e-05, + "loss": 0.03695693, + "step": 20924 + }, + { + "epoch": 41.85, + "grad_norm": 1.1429461240768433, + "learning_rate": 2e-05, + "loss": 0.04535512, + "step": 20925 + }, + { + "epoch": 41.852, + "grad_norm": 1.354186773300171, + "learning_rate": 2e-05, + "loss": 0.05408484, + "step": 20926 + }, + { + "epoch": 41.854, + "grad_norm": 2.6549506187438965, + "learning_rate": 2e-05, + "loss": 0.05873822, + "step": 20927 + }, + { + "epoch": 41.856, + "grad_norm": 1.1592339277267456, + "learning_rate": 2e-05, + "loss": 0.04995368, + "step": 20928 + }, + { + "epoch": 41.858, + "grad_norm": 1.7437788248062134, + "learning_rate": 2e-05, + "loss": 0.07510689, + "step": 20929 + }, + { + "epoch": 41.86, + "grad_norm": 1.1084489822387695, + "learning_rate": 2e-05, + "loss": 0.04345679, + "step": 20930 + }, + { + "epoch": 41.862, + "grad_norm": 1.3295727968215942, + "learning_rate": 2e-05, + "loss": 0.04446131, + "step": 20931 + }, + { + "epoch": 41.864, + "grad_norm": 1.3433420658111572, + "learning_rate": 2e-05, + "loss": 0.05595826, + "step": 20932 + }, + { + "epoch": 41.866, + "grad_norm": 1.3462584018707275, + "learning_rate": 2e-05, + "loss": 0.05649468, + "step": 20933 + }, + { + "epoch": 41.868, + "grad_norm": 1.4262540340423584, + "learning_rate": 2e-05, + "loss": 0.04885809, + "step": 20934 + }, + { + "epoch": 41.87, + "grad_norm": 2.7343051433563232, + "learning_rate": 2e-05, + "loss": 0.06057374, + "step": 20935 + }, + { + "epoch": 41.872, + "grad_norm": 1.2475897073745728, + "learning_rate": 2e-05, + "loss": 0.05510751, + "step": 20936 + }, + { + "epoch": 41.874, + "grad_norm": 1.5104964971542358, + "learning_rate": 2e-05, + "loss": 0.05848288, + "step": 20937 + }, + { + "epoch": 41.876, + 
"grad_norm": 1.10996675491333, + "learning_rate": 2e-05, + "loss": 0.04091154, + "step": 20938 + }, + { + "epoch": 41.878, + "grad_norm": 1.2295801639556885, + "learning_rate": 2e-05, + "loss": 0.05182572, + "step": 20939 + }, + { + "epoch": 41.88, + "grad_norm": 1.200059413909912, + "learning_rate": 2e-05, + "loss": 0.04649248, + "step": 20940 + }, + { + "epoch": 41.882, + "grad_norm": 1.1000398397445679, + "learning_rate": 2e-05, + "loss": 0.03840358, + "step": 20941 + }, + { + "epoch": 41.884, + "grad_norm": 1.3613183498382568, + "learning_rate": 2e-05, + "loss": 0.05442974, + "step": 20942 + }, + { + "epoch": 41.886, + "grad_norm": 1.567658543586731, + "learning_rate": 2e-05, + "loss": 0.05978185, + "step": 20943 + }, + { + "epoch": 41.888, + "grad_norm": 1.0944557189941406, + "learning_rate": 2e-05, + "loss": 0.03656168, + "step": 20944 + }, + { + "epoch": 41.89, + "grad_norm": 1.177577018737793, + "learning_rate": 2e-05, + "loss": 0.04644332, + "step": 20945 + }, + { + "epoch": 41.892, + "grad_norm": 1.2059484720230103, + "learning_rate": 2e-05, + "loss": 0.03849885, + "step": 20946 + }, + { + "epoch": 41.894, + "grad_norm": 4.2382283210754395, + "learning_rate": 2e-05, + "loss": 0.05498065, + "step": 20947 + }, + { + "epoch": 41.896, + "grad_norm": 1.1419352293014526, + "learning_rate": 2e-05, + "loss": 0.04763758, + "step": 20948 + }, + { + "epoch": 41.898, + "grad_norm": 2.648472785949707, + "learning_rate": 2e-05, + "loss": 0.06559248, + "step": 20949 + }, + { + "epoch": 41.9, + "grad_norm": 1.1834135055541992, + "learning_rate": 2e-05, + "loss": 0.06024366, + "step": 20950 + }, + { + "epoch": 41.902, + "grad_norm": 1.1037529706954956, + "learning_rate": 2e-05, + "loss": 0.05040961, + "step": 20951 + }, + { + "epoch": 41.904, + "grad_norm": 1.312083125114441, + "learning_rate": 2e-05, + "loss": 0.05827889, + "step": 20952 + }, + { + "epoch": 41.906, + "grad_norm": 1.707861065864563, + "learning_rate": 2e-05, + "loss": 0.05063179, + "step": 20953 + }, + { + "epoch": 41.908, + "grad_norm": 1.2550243139266968, + "learning_rate": 2e-05, + "loss": 0.04975577, + "step": 20954 + }, + { + "epoch": 41.91, + "grad_norm": 1.1055585145950317, + "learning_rate": 2e-05, + "loss": 0.04447905, + "step": 20955 + }, + { + "epoch": 41.912, + "grad_norm": 1.2128030061721802, + "learning_rate": 2e-05, + "loss": 0.04356065, + "step": 20956 + }, + { + "epoch": 41.914, + "grad_norm": 2.053502321243286, + "learning_rate": 2e-05, + "loss": 0.04877847, + "step": 20957 + }, + { + "epoch": 41.916, + "grad_norm": 1.2415821552276611, + "learning_rate": 2e-05, + "loss": 0.05251724, + "step": 20958 + }, + { + "epoch": 41.918, + "grad_norm": 1.246356725692749, + "learning_rate": 2e-05, + "loss": 0.03539124, + "step": 20959 + }, + { + "epoch": 41.92, + "grad_norm": 3.0229344367980957, + "learning_rate": 2e-05, + "loss": 0.04473191, + "step": 20960 + }, + { + "epoch": 41.922, + "grad_norm": 1.5012351274490356, + "learning_rate": 2e-05, + "loss": 0.04717286, + "step": 20961 + }, + { + "epoch": 41.924, + "grad_norm": 1.6548337936401367, + "learning_rate": 2e-05, + "loss": 0.05977109, + "step": 20962 + }, + { + "epoch": 41.926, + "grad_norm": 1.0772522687911987, + "learning_rate": 2e-05, + "loss": 0.04247387, + "step": 20963 + }, + { + "epoch": 41.928, + "grad_norm": 1.259922981262207, + "learning_rate": 2e-05, + "loss": 0.05617234, + "step": 20964 + }, + { + "epoch": 41.93, + "grad_norm": 1.1000100374221802, + "learning_rate": 2e-05, + "loss": 0.04250489, + "step": 20965 + }, + { + "epoch": 41.932, + "grad_norm": 
0.9738196730613708, + "learning_rate": 2e-05, + "loss": 0.02657899, + "step": 20966 + }, + { + "epoch": 41.934, + "grad_norm": 1.1028367280960083, + "learning_rate": 2e-05, + "loss": 0.03186112, + "step": 20967 + }, + { + "epoch": 41.936, + "grad_norm": 1.2720319032669067, + "learning_rate": 2e-05, + "loss": 0.04869255, + "step": 20968 + }, + { + "epoch": 41.938, + "grad_norm": 1.1687405109405518, + "learning_rate": 2e-05, + "loss": 0.03809325, + "step": 20969 + }, + { + "epoch": 41.94, + "grad_norm": 0.9612535238265991, + "learning_rate": 2e-05, + "loss": 0.02938464, + "step": 20970 + }, + { + "epoch": 41.942, + "grad_norm": 1.5162972211837769, + "learning_rate": 2e-05, + "loss": 0.06188224, + "step": 20971 + }, + { + "epoch": 41.944, + "grad_norm": 2.4762864112854004, + "learning_rate": 2e-05, + "loss": 0.05203952, + "step": 20972 + }, + { + "epoch": 41.946, + "grad_norm": 1.2529581785202026, + "learning_rate": 2e-05, + "loss": 0.04696166, + "step": 20973 + }, + { + "epoch": 41.948, + "grad_norm": 1.1282131671905518, + "learning_rate": 2e-05, + "loss": 0.03645737, + "step": 20974 + }, + { + "epoch": 41.95, + "grad_norm": 1.2952510118484497, + "learning_rate": 2e-05, + "loss": 0.04191312, + "step": 20975 + }, + { + "epoch": 41.952, + "grad_norm": 1.780375599861145, + "learning_rate": 2e-05, + "loss": 0.05335905, + "step": 20976 + }, + { + "epoch": 41.954, + "grad_norm": 1.3107290267944336, + "learning_rate": 2e-05, + "loss": 0.04631649, + "step": 20977 + }, + { + "epoch": 41.956, + "grad_norm": 1.3172683715820312, + "learning_rate": 2e-05, + "loss": 0.05279814, + "step": 20978 + }, + { + "epoch": 41.958, + "grad_norm": 1.3357253074645996, + "learning_rate": 2e-05, + "loss": 0.05109545, + "step": 20979 + }, + { + "epoch": 41.96, + "grad_norm": 1.298035740852356, + "learning_rate": 2e-05, + "loss": 0.05070015, + "step": 20980 + }, + { + "epoch": 41.962, + "grad_norm": 1.3245327472686768, + "learning_rate": 2e-05, + "loss": 0.05043676, + "step": 20981 + }, + { + "epoch": 41.964, + "grad_norm": 1.8343544006347656, + "learning_rate": 2e-05, + "loss": 0.05134499, + "step": 20982 + }, + { + "epoch": 41.966, + "grad_norm": 1.2885112762451172, + "learning_rate": 2e-05, + "loss": 0.04808827, + "step": 20983 + }, + { + "epoch": 41.968, + "grad_norm": 1.0060571432113647, + "learning_rate": 2e-05, + "loss": 0.04182211, + "step": 20984 + }, + { + "epoch": 41.97, + "grad_norm": 1.8609304428100586, + "learning_rate": 2e-05, + "loss": 0.05335212, + "step": 20985 + }, + { + "epoch": 41.972, + "grad_norm": 1.4980871677398682, + "learning_rate": 2e-05, + "loss": 0.04966426, + "step": 20986 + }, + { + "epoch": 41.974, + "grad_norm": 1.4951345920562744, + "learning_rate": 2e-05, + "loss": 0.04603991, + "step": 20987 + }, + { + "epoch": 41.976, + "grad_norm": 0.9615522623062134, + "learning_rate": 2e-05, + "loss": 0.02445277, + "step": 20988 + }, + { + "epoch": 41.978, + "grad_norm": 1.2384010553359985, + "learning_rate": 2e-05, + "loss": 0.04333053, + "step": 20989 + }, + { + "epoch": 41.98, + "grad_norm": 1.366593837738037, + "learning_rate": 2e-05, + "loss": 0.06016916, + "step": 20990 + }, + { + "epoch": 41.982, + "grad_norm": 1.1202651262283325, + "learning_rate": 2e-05, + "loss": 0.0522389, + "step": 20991 + }, + { + "epoch": 41.984, + "grad_norm": 1.1914691925048828, + "learning_rate": 2e-05, + "loss": 0.04286882, + "step": 20992 + }, + { + "epoch": 41.986, + "grad_norm": 1.3832005262374878, + "learning_rate": 2e-05, + "loss": 0.05637228, + "step": 20993 + }, + { + "epoch": 41.988, + "grad_norm": 
1.253294587135315, + "learning_rate": 2e-05, + "loss": 0.04640395, + "step": 20994 + }, + { + "epoch": 41.99, + "grad_norm": 2.2488229274749756, + "learning_rate": 2e-05, + "loss": 0.04534402, + "step": 20995 + }, + { + "epoch": 41.992, + "grad_norm": 1.515738606452942, + "learning_rate": 2e-05, + "loss": 0.0565161, + "step": 20996 + }, + { + "epoch": 41.994, + "grad_norm": 2.047625780105591, + "learning_rate": 2e-05, + "loss": 0.04121755, + "step": 20997 + }, + { + "epoch": 41.996, + "grad_norm": 2.0094411373138428, + "learning_rate": 2e-05, + "loss": 0.06695313, + "step": 20998 + }, + { + "epoch": 41.998, + "grad_norm": 1.419292688369751, + "learning_rate": 2e-05, + "loss": 0.04039873, + "step": 20999 + }, + { + "epoch": 42.0, + "grad_norm": 2.344158887863159, + "learning_rate": 2e-05, + "loss": 0.047058, + "step": 21000 + }, + { + "epoch": 42.0, + "eval_performance": { + "AngleClassification_1": 0.994, + "AngleClassification_2": 1.0, + "AngleClassification_3": 0.9820359281437125, + "Equal_1": 0.996, + "Equal_2": 0.9820359281437125, + "Equal_3": 0.9880239520958084, + "LineComparison_1": 1.0, + "LineComparison_2": 1.0, + "LineComparison_3": 0.9940119760479041, + "Parallel_1": 0.9959919839679359, + "Parallel_2": 0.9979959919839679, + "Parallel_3": 0.996, + "Perpendicular_1": 1.0, + "Perpendicular_2": 0.998, + "Perpendicular_3": 0.9038076152304609, + "PointLiesOnCircle_1": 0.9979959919839679, + "PointLiesOnCircle_2": 1.0, + "PointLiesOnCircle_3": 0.9956, + "PointLiesOnLine_1": 0.9939879759519038, + "PointLiesOnLine_2": 0.9959919839679359, + "PointLiesOnLine_3": 0.9860279441117764 + }, + "eval_runtime": 320.4536, + "eval_samples_per_second": 32.766, + "eval_steps_per_second": 0.655, + "step": 21000 + }, + { + "epoch": 42.002, + "grad_norm": 2.0079424381256104, + "learning_rate": 2e-05, + "loss": 0.03449755, + "step": 21001 + }, + { + "epoch": 42.004, + "grad_norm": 1.365389108657837, + "learning_rate": 2e-05, + "loss": 0.0548946, + "step": 21002 + }, + { + "epoch": 42.006, + "grad_norm": 2.7778944969177246, + "learning_rate": 2e-05, + "loss": 0.04609371, + "step": 21003 + }, + { + "epoch": 42.008, + "grad_norm": 1.0593626499176025, + "learning_rate": 2e-05, + "loss": 0.0348668, + "step": 21004 + }, + { + "epoch": 42.01, + "grad_norm": 1.3281819820404053, + "learning_rate": 2e-05, + "loss": 0.05575937, + "step": 21005 + }, + { + "epoch": 42.012, + "grad_norm": 1.5199637413024902, + "learning_rate": 2e-05, + "loss": 0.04298349, + "step": 21006 + }, + { + "epoch": 42.014, + "grad_norm": 1.4977668523788452, + "learning_rate": 2e-05, + "loss": 0.03856365, + "step": 21007 + }, + { + "epoch": 42.016, + "grad_norm": 1.0063074827194214, + "learning_rate": 2e-05, + "loss": 0.02791759, + "step": 21008 + }, + { + "epoch": 42.018, + "grad_norm": 1.3319441080093384, + "learning_rate": 2e-05, + "loss": 0.05699763, + "step": 21009 + }, + { + "epoch": 42.02, + "grad_norm": 1.0372650623321533, + "learning_rate": 2e-05, + "loss": 0.04131971, + "step": 21010 + }, + { + "epoch": 42.022, + "grad_norm": 1.1082048416137695, + "learning_rate": 2e-05, + "loss": 0.04324374, + "step": 21011 + }, + { + "epoch": 42.024, + "grad_norm": 1.2265472412109375, + "learning_rate": 2e-05, + "loss": 0.04790197, + "step": 21012 + }, + { + "epoch": 42.026, + "grad_norm": 1.5326513051986694, + "learning_rate": 2e-05, + "loss": 0.05717977, + "step": 21013 + }, + { + "epoch": 42.028, + "grad_norm": 1.2501510381698608, + "learning_rate": 2e-05, + "loss": 0.04369808, + "step": 21014 + }, + { + "epoch": 42.03, + "grad_norm": 
0.9996318817138672, + "learning_rate": 2e-05, + "loss": 0.03566208, + "step": 21015 + }, + { + "epoch": 42.032, + "grad_norm": 1.2359881401062012, + "learning_rate": 2e-05, + "loss": 0.05509493, + "step": 21016 + }, + { + "epoch": 42.034, + "grad_norm": 1.4756462574005127, + "learning_rate": 2e-05, + "loss": 0.06215252, + "step": 21017 + }, + { + "epoch": 42.036, + "grad_norm": 1.2540290355682373, + "learning_rate": 2e-05, + "loss": 0.03848273, + "step": 21018 + }, + { + "epoch": 42.038, + "grad_norm": 1.3934910297393799, + "learning_rate": 2e-05, + "loss": 0.0600573, + "step": 21019 + }, + { + "epoch": 42.04, + "grad_norm": 1.3779077529907227, + "learning_rate": 2e-05, + "loss": 0.05684218, + "step": 21020 + }, + { + "epoch": 42.042, + "grad_norm": 1.3131364583969116, + "learning_rate": 2e-05, + "loss": 0.04408994, + "step": 21021 + }, + { + "epoch": 42.044, + "grad_norm": 1.862250804901123, + "learning_rate": 2e-05, + "loss": 0.05622796, + "step": 21022 + }, + { + "epoch": 42.046, + "grad_norm": 1.1200668811798096, + "learning_rate": 2e-05, + "loss": 0.03651465, + "step": 21023 + }, + { + "epoch": 42.048, + "grad_norm": 1.217587947845459, + "learning_rate": 2e-05, + "loss": 0.03259541, + "step": 21024 + }, + { + "epoch": 42.05, + "grad_norm": 1.5544425249099731, + "learning_rate": 2e-05, + "loss": 0.03680165, + "step": 21025 + }, + { + "epoch": 42.052, + "grad_norm": 1.274043083190918, + "learning_rate": 2e-05, + "loss": 0.04595189, + "step": 21026 + }, + { + "epoch": 42.054, + "grad_norm": 2.187490463256836, + "learning_rate": 2e-05, + "loss": 0.04805189, + "step": 21027 + }, + { + "epoch": 42.056, + "grad_norm": 1.2666808366775513, + "learning_rate": 2e-05, + "loss": 0.05547645, + "step": 21028 + }, + { + "epoch": 42.058, + "grad_norm": 1.4787068367004395, + "learning_rate": 2e-05, + "loss": 0.06114297, + "step": 21029 + }, + { + "epoch": 42.06, + "grad_norm": 1.4573709964752197, + "learning_rate": 2e-05, + "loss": 0.04937818, + "step": 21030 + }, + { + "epoch": 42.062, + "grad_norm": 1.2699495553970337, + "learning_rate": 2e-05, + "loss": 0.04790761, + "step": 21031 + }, + { + "epoch": 42.064, + "grad_norm": 1.1611040830612183, + "learning_rate": 2e-05, + "loss": 0.03836013, + "step": 21032 + }, + { + "epoch": 42.066, + "grad_norm": 1.162618637084961, + "learning_rate": 2e-05, + "loss": 0.04036701, + "step": 21033 + }, + { + "epoch": 42.068, + "grad_norm": 1.3911733627319336, + "learning_rate": 2e-05, + "loss": 0.05297327, + "step": 21034 + }, + { + "epoch": 42.07, + "grad_norm": 1.1646391153335571, + "learning_rate": 2e-05, + "loss": 0.05322537, + "step": 21035 + }, + { + "epoch": 42.072, + "grad_norm": 1.2405617237091064, + "learning_rate": 2e-05, + "loss": 0.03772672, + "step": 21036 + }, + { + "epoch": 42.074, + "grad_norm": 1.0591578483581543, + "learning_rate": 2e-05, + "loss": 0.03944032, + "step": 21037 + }, + { + "epoch": 42.076, + "grad_norm": 1.182476282119751, + "learning_rate": 2e-05, + "loss": 0.05112866, + "step": 21038 + }, + { + "epoch": 42.078, + "grad_norm": 1.7845994234085083, + "learning_rate": 2e-05, + "loss": 0.04743159, + "step": 21039 + }, + { + "epoch": 42.08, + "grad_norm": 1.1651856899261475, + "learning_rate": 2e-05, + "loss": 0.04408582, + "step": 21040 + }, + { + "epoch": 42.082, + "grad_norm": 1.3968032598495483, + "learning_rate": 2e-05, + "loss": 0.06172002, + "step": 21041 + }, + { + "epoch": 42.084, + "grad_norm": 1.2377245426177979, + "learning_rate": 2e-05, + "loss": 0.06204825, + "step": 21042 + }, + { + "epoch": 42.086, + "grad_norm": 
1.1847991943359375, + "learning_rate": 2e-05, + "loss": 0.03809762, + "step": 21043 + }, + { + "epoch": 42.088, + "grad_norm": 1.6209152936935425, + "learning_rate": 2e-05, + "loss": 0.0557748, + "step": 21044 + }, + { + "epoch": 42.09, + "grad_norm": 1.1455878019332886, + "learning_rate": 2e-05, + "loss": 0.04782082, + "step": 21045 + }, + { + "epoch": 42.092, + "grad_norm": 1.1409224271774292, + "learning_rate": 2e-05, + "loss": 0.04084107, + "step": 21046 + }, + { + "epoch": 42.094, + "grad_norm": 1.6999880075454712, + "learning_rate": 2e-05, + "loss": 0.0430681, + "step": 21047 + }, + { + "epoch": 42.096, + "grad_norm": 1.3535499572753906, + "learning_rate": 2e-05, + "loss": 0.04580639, + "step": 21048 + }, + { + "epoch": 42.098, + "grad_norm": 1.6721376180648804, + "learning_rate": 2e-05, + "loss": 0.04870587, + "step": 21049 + }, + { + "epoch": 42.1, + "grad_norm": 3.2024240493774414, + "learning_rate": 2e-05, + "loss": 0.04476042, + "step": 21050 + }, + { + "epoch": 42.102, + "grad_norm": 0.9818119406700134, + "learning_rate": 2e-05, + "loss": 0.0436222, + "step": 21051 + }, + { + "epoch": 42.104, + "grad_norm": 1.2789314985275269, + "learning_rate": 2e-05, + "loss": 0.04346582, + "step": 21052 + }, + { + "epoch": 42.106, + "grad_norm": 1.2788130044937134, + "learning_rate": 2e-05, + "loss": 0.05709602, + "step": 21053 + }, + { + "epoch": 42.108, + "grad_norm": 2.102720260620117, + "learning_rate": 2e-05, + "loss": 0.06700805, + "step": 21054 + }, + { + "epoch": 42.11, + "grad_norm": 1.278295636177063, + "learning_rate": 2e-05, + "loss": 0.05733266, + "step": 21055 + }, + { + "epoch": 42.112, + "grad_norm": 1.3988900184631348, + "learning_rate": 2e-05, + "loss": 0.04855637, + "step": 21056 + }, + { + "epoch": 42.114, + "grad_norm": 1.2994776964187622, + "learning_rate": 2e-05, + "loss": 0.04653971, + "step": 21057 + }, + { + "epoch": 42.116, + "grad_norm": 1.2954206466674805, + "learning_rate": 2e-05, + "loss": 0.051019, + "step": 21058 + }, + { + "epoch": 42.118, + "grad_norm": 1.1361409425735474, + "learning_rate": 2e-05, + "loss": 0.03964051, + "step": 21059 + }, + { + "epoch": 42.12, + "grad_norm": 0.9857589602470398, + "learning_rate": 2e-05, + "loss": 0.0420249, + "step": 21060 + }, + { + "epoch": 42.122, + "grad_norm": 1.19060480594635, + "learning_rate": 2e-05, + "loss": 0.03876651, + "step": 21061 + }, + { + "epoch": 42.124, + "grad_norm": 1.0141627788543701, + "learning_rate": 2e-05, + "loss": 0.04508419, + "step": 21062 + }, + { + "epoch": 42.126, + "grad_norm": 1.3992326259613037, + "learning_rate": 2e-05, + "loss": 0.05133965, + "step": 21063 + }, + { + "epoch": 42.128, + "grad_norm": 1.2171835899353027, + "learning_rate": 2e-05, + "loss": 0.03061209, + "step": 21064 + }, + { + "epoch": 42.13, + "grad_norm": 0.9995740652084351, + "learning_rate": 2e-05, + "loss": 0.045886, + "step": 21065 + }, + { + "epoch": 42.132, + "grad_norm": 1.2155423164367676, + "learning_rate": 2e-05, + "loss": 0.05031782, + "step": 21066 + }, + { + "epoch": 42.134, + "grad_norm": 3.3475124835968018, + "learning_rate": 2e-05, + "loss": 0.05089858, + "step": 21067 + }, + { + "epoch": 42.136, + "grad_norm": 1.1842341423034668, + "learning_rate": 2e-05, + "loss": 0.03898914, + "step": 21068 + }, + { + "epoch": 42.138, + "grad_norm": 1.3484647274017334, + "learning_rate": 2e-05, + "loss": 0.05519821, + "step": 21069 + }, + { + "epoch": 42.14, + "grad_norm": 1.2679609060287476, + "learning_rate": 2e-05, + "loss": 0.04845376, + "step": 21070 + }, + { + "epoch": 42.142, + "grad_norm": 
1.0729557275772095, + "learning_rate": 2e-05, + "loss": 0.03485009, + "step": 21071 + }, + { + "epoch": 42.144, + "grad_norm": 1.442091464996338, + "learning_rate": 2e-05, + "loss": 0.04761119, + "step": 21072 + }, + { + "epoch": 42.146, + "grad_norm": 1.2365809679031372, + "learning_rate": 2e-05, + "loss": 0.04327618, + "step": 21073 + }, + { + "epoch": 42.148, + "grad_norm": 1.1715463399887085, + "learning_rate": 2e-05, + "loss": 0.04974968, + "step": 21074 + }, + { + "epoch": 42.15, + "grad_norm": 1.1144353151321411, + "learning_rate": 2e-05, + "loss": 0.04348652, + "step": 21075 + }, + { + "epoch": 42.152, + "grad_norm": 1.4729726314544678, + "learning_rate": 2e-05, + "loss": 0.05958918, + "step": 21076 + }, + { + "epoch": 42.154, + "grad_norm": 1.1265980005264282, + "learning_rate": 2e-05, + "loss": 0.05273468, + "step": 21077 + }, + { + "epoch": 42.156, + "grad_norm": 1.2327011823654175, + "learning_rate": 2e-05, + "loss": 0.05835613, + "step": 21078 + }, + { + "epoch": 42.158, + "grad_norm": 1.3347694873809814, + "learning_rate": 2e-05, + "loss": 0.03290975, + "step": 21079 + }, + { + "epoch": 42.16, + "grad_norm": 1.2273168563842773, + "learning_rate": 2e-05, + "loss": 0.05718467, + "step": 21080 + }, + { + "epoch": 42.162, + "grad_norm": 1.1186082363128662, + "learning_rate": 2e-05, + "loss": 0.0496081, + "step": 21081 + }, + { + "epoch": 42.164, + "grad_norm": 1.4576385021209717, + "learning_rate": 2e-05, + "loss": 0.0473141, + "step": 21082 + }, + { + "epoch": 42.166, + "grad_norm": 1.3093568086624146, + "learning_rate": 2e-05, + "loss": 0.05596102, + "step": 21083 + }, + { + "epoch": 42.168, + "grad_norm": 1.2706962823867798, + "learning_rate": 2e-05, + "loss": 0.050145, + "step": 21084 + }, + { + "epoch": 42.17, + "grad_norm": 1.4902071952819824, + "learning_rate": 2e-05, + "loss": 0.04793771, + "step": 21085 + }, + { + "epoch": 42.172, + "grad_norm": 1.174926519393921, + "learning_rate": 2e-05, + "loss": 0.05029378, + "step": 21086 + }, + { + "epoch": 42.174, + "grad_norm": 1.2596014738082886, + "learning_rate": 2e-05, + "loss": 0.03948942, + "step": 21087 + }, + { + "epoch": 42.176, + "grad_norm": 1.0307563543319702, + "learning_rate": 2e-05, + "loss": 0.03940136, + "step": 21088 + }, + { + "epoch": 42.178, + "grad_norm": 1.1724040508270264, + "learning_rate": 2e-05, + "loss": 0.04482014, + "step": 21089 + }, + { + "epoch": 42.18, + "grad_norm": 1.143633484840393, + "learning_rate": 2e-05, + "loss": 0.03999674, + "step": 21090 + }, + { + "epoch": 42.182, + "grad_norm": 1.217869758605957, + "learning_rate": 2e-05, + "loss": 0.03607331, + "step": 21091 + }, + { + "epoch": 42.184, + "grad_norm": 1.5815616846084595, + "learning_rate": 2e-05, + "loss": 0.03352413, + "step": 21092 + }, + { + "epoch": 42.186, + "grad_norm": 1.3018903732299805, + "learning_rate": 2e-05, + "loss": 0.04986539, + "step": 21093 + }, + { + "epoch": 42.188, + "grad_norm": 0.9588651657104492, + "learning_rate": 2e-05, + "loss": 0.03948629, + "step": 21094 + }, + { + "epoch": 42.19, + "grad_norm": 1.2913947105407715, + "learning_rate": 2e-05, + "loss": 0.04663259, + "step": 21095 + }, + { + "epoch": 42.192, + "grad_norm": 1.219344139099121, + "learning_rate": 2e-05, + "loss": 0.0564451, + "step": 21096 + }, + { + "epoch": 42.194, + "grad_norm": 1.4023454189300537, + "learning_rate": 2e-05, + "loss": 0.06140105, + "step": 21097 + }, + { + "epoch": 42.196, + "grad_norm": 1.1653475761413574, + "learning_rate": 2e-05, + "loss": 0.03363206, + "step": 21098 + }, + { + "epoch": 42.198, + "grad_norm": 
1.1109018325805664, + "learning_rate": 2e-05, + "loss": 0.0404594, + "step": 21099 + }, + { + "epoch": 42.2, + "grad_norm": 1.053236722946167, + "learning_rate": 2e-05, + "loss": 0.02968258, + "step": 21100 + }, + { + "epoch": 42.202, + "grad_norm": 1.2577223777770996, + "learning_rate": 2e-05, + "loss": 0.05090709, + "step": 21101 + }, + { + "epoch": 42.204, + "grad_norm": 1.3133550882339478, + "learning_rate": 2e-05, + "loss": 0.05599451, + "step": 21102 + }, + { + "epoch": 42.206, + "grad_norm": 2.2289819717407227, + "learning_rate": 2e-05, + "loss": 0.04835991, + "step": 21103 + }, + { + "epoch": 42.208, + "grad_norm": 1.7648617029190063, + "learning_rate": 2e-05, + "loss": 0.04039288, + "step": 21104 + }, + { + "epoch": 42.21, + "grad_norm": 1.5205116271972656, + "learning_rate": 2e-05, + "loss": 0.06169431, + "step": 21105 + }, + { + "epoch": 42.212, + "grad_norm": 1.573837161064148, + "learning_rate": 2e-05, + "loss": 0.06739548, + "step": 21106 + }, + { + "epoch": 42.214, + "grad_norm": 1.3160827159881592, + "learning_rate": 2e-05, + "loss": 0.04701831, + "step": 21107 + }, + { + "epoch": 42.216, + "grad_norm": 1.2954484224319458, + "learning_rate": 2e-05, + "loss": 0.05630885, + "step": 21108 + }, + { + "epoch": 42.218, + "grad_norm": 1.2000118494033813, + "learning_rate": 2e-05, + "loss": 0.0458281, + "step": 21109 + }, + { + "epoch": 42.22, + "grad_norm": 1.564306378364563, + "learning_rate": 2e-05, + "loss": 0.05043546, + "step": 21110 + }, + { + "epoch": 42.222, + "grad_norm": 1.0061910152435303, + "learning_rate": 2e-05, + "loss": 0.03526705, + "step": 21111 + }, + { + "epoch": 42.224, + "grad_norm": 1.4038654565811157, + "learning_rate": 2e-05, + "loss": 0.05683837, + "step": 21112 + }, + { + "epoch": 42.226, + "grad_norm": 1.0469942092895508, + "learning_rate": 2e-05, + "loss": 0.04538111, + "step": 21113 + }, + { + "epoch": 42.228, + "grad_norm": 1.2826262712478638, + "learning_rate": 2e-05, + "loss": 0.04978053, + "step": 21114 + }, + { + "epoch": 42.23, + "grad_norm": 1.3049055337905884, + "learning_rate": 2e-05, + "loss": 0.04201498, + "step": 21115 + }, + { + "epoch": 42.232, + "grad_norm": 1.076093077659607, + "learning_rate": 2e-05, + "loss": 0.04795041, + "step": 21116 + }, + { + "epoch": 42.234, + "grad_norm": 1.1158887147903442, + "learning_rate": 2e-05, + "loss": 0.03639564, + "step": 21117 + }, + { + "epoch": 42.236, + "grad_norm": 1.251383900642395, + "learning_rate": 2e-05, + "loss": 0.04058103, + "step": 21118 + }, + { + "epoch": 42.238, + "grad_norm": 1.0459401607513428, + "learning_rate": 2e-05, + "loss": 0.04287563, + "step": 21119 + }, + { + "epoch": 42.24, + "grad_norm": 1.3883095979690552, + "learning_rate": 2e-05, + "loss": 0.06586547, + "step": 21120 + }, + { + "epoch": 42.242, + "grad_norm": 1.1663405895233154, + "learning_rate": 2e-05, + "loss": 0.03624522, + "step": 21121 + }, + { + "epoch": 42.244, + "grad_norm": 1.8142751455307007, + "learning_rate": 2e-05, + "loss": 0.0545116, + "step": 21122 + }, + { + "epoch": 42.246, + "grad_norm": 2.438311815261841, + "learning_rate": 2e-05, + "loss": 0.05591767, + "step": 21123 + }, + { + "epoch": 42.248, + "grad_norm": 1.2450895309448242, + "learning_rate": 2e-05, + "loss": 0.04676469, + "step": 21124 + }, + { + "epoch": 42.25, + "grad_norm": 1.227967619895935, + "learning_rate": 2e-05, + "loss": 0.05129366, + "step": 21125 + }, + { + "epoch": 42.252, + "grad_norm": 1.1938300132751465, + "learning_rate": 2e-05, + "loss": 0.05615389, + "step": 21126 + }, + { + "epoch": 42.254, + "grad_norm": 
1.1461693048477173, + "learning_rate": 2e-05, + "loss": 0.04713152, + "step": 21127 + }, + { + "epoch": 42.256, + "grad_norm": 1.187277913093567, + "learning_rate": 2e-05, + "loss": 0.04280775, + "step": 21128 + }, + { + "epoch": 42.258, + "grad_norm": 1.5151171684265137, + "learning_rate": 2e-05, + "loss": 0.04615886, + "step": 21129 + }, + { + "epoch": 42.26, + "grad_norm": 1.1889163255691528, + "learning_rate": 2e-05, + "loss": 0.0394232, + "step": 21130 + }, + { + "epoch": 42.262, + "grad_norm": 1.3880109786987305, + "learning_rate": 2e-05, + "loss": 0.06245904, + "step": 21131 + }, + { + "epoch": 42.264, + "grad_norm": 1.1990442276000977, + "learning_rate": 2e-05, + "loss": 0.05073933, + "step": 21132 + }, + { + "epoch": 42.266, + "grad_norm": 1.3353129625320435, + "learning_rate": 2e-05, + "loss": 0.04800763, + "step": 21133 + }, + { + "epoch": 42.268, + "grad_norm": 1.1701955795288086, + "learning_rate": 2e-05, + "loss": 0.05479547, + "step": 21134 + }, + { + "epoch": 42.27, + "grad_norm": 1.744545578956604, + "learning_rate": 2e-05, + "loss": 0.04801206, + "step": 21135 + }, + { + "epoch": 42.272, + "grad_norm": 1.2501215934753418, + "learning_rate": 2e-05, + "loss": 0.05587947, + "step": 21136 + }, + { + "epoch": 42.274, + "grad_norm": 1.1730159521102905, + "learning_rate": 2e-05, + "loss": 0.03659301, + "step": 21137 + }, + { + "epoch": 42.276, + "grad_norm": 1.2556431293487549, + "learning_rate": 2e-05, + "loss": 0.06406794, + "step": 21138 + }, + { + "epoch": 42.278, + "grad_norm": 0.9709025621414185, + "learning_rate": 2e-05, + "loss": 0.02665532, + "step": 21139 + }, + { + "epoch": 42.28, + "grad_norm": 1.0300137996673584, + "learning_rate": 2e-05, + "loss": 0.0415834, + "step": 21140 + }, + { + "epoch": 42.282, + "grad_norm": 1.334498643875122, + "learning_rate": 2e-05, + "loss": 0.04262113, + "step": 21141 + }, + { + "epoch": 42.284, + "grad_norm": 0.9960622191429138, + "learning_rate": 2e-05, + "loss": 0.03293207, + "step": 21142 + }, + { + "epoch": 42.286, + "grad_norm": 1.19757878780365, + "learning_rate": 2e-05, + "loss": 0.05076621, + "step": 21143 + }, + { + "epoch": 42.288, + "grad_norm": 1.1853511333465576, + "learning_rate": 2e-05, + "loss": 0.05421963, + "step": 21144 + }, + { + "epoch": 42.29, + "grad_norm": 1.0343493223190308, + "learning_rate": 2e-05, + "loss": 0.03314459, + "step": 21145 + }, + { + "epoch": 42.292, + "grad_norm": 6.21450662612915, + "learning_rate": 2e-05, + "loss": 0.05752373, + "step": 21146 + }, + { + "epoch": 42.294, + "grad_norm": 1.1545538902282715, + "learning_rate": 2e-05, + "loss": 0.04223744, + "step": 21147 + }, + { + "epoch": 42.296, + "grad_norm": 1.1654874086380005, + "learning_rate": 2e-05, + "loss": 0.03923909, + "step": 21148 + }, + { + "epoch": 42.298, + "grad_norm": 1.1227787733078003, + "learning_rate": 2e-05, + "loss": 0.03867494, + "step": 21149 + }, + { + "epoch": 42.3, + "grad_norm": 1.422080397605896, + "learning_rate": 2e-05, + "loss": 0.05246984, + "step": 21150 + }, + { + "epoch": 42.302, + "grad_norm": 1.0861878395080566, + "learning_rate": 2e-05, + "loss": 0.03628934, + "step": 21151 + }, + { + "epoch": 42.304, + "grad_norm": 1.113157868385315, + "learning_rate": 2e-05, + "loss": 0.03985634, + "step": 21152 + }, + { + "epoch": 42.306, + "grad_norm": 1.102103352546692, + "learning_rate": 2e-05, + "loss": 0.04545946, + "step": 21153 + }, + { + "epoch": 42.308, + "grad_norm": 2.6324360370635986, + "learning_rate": 2e-05, + "loss": 0.052878, + "step": 21154 + }, + { + "epoch": 42.31, + "grad_norm": 3.8525304794311523, 
+ "learning_rate": 2e-05, + "loss": 0.06736265, + "step": 21155 + }, + { + "epoch": 42.312, + "grad_norm": 1.187397837638855, + "learning_rate": 2e-05, + "loss": 0.04645222, + "step": 21156 + }, + { + "epoch": 42.314, + "grad_norm": 0.9652663469314575, + "learning_rate": 2e-05, + "loss": 0.03505144, + "step": 21157 + }, + { + "epoch": 42.316, + "grad_norm": 0.9658792018890381, + "learning_rate": 2e-05, + "loss": 0.0281552, + "step": 21158 + }, + { + "epoch": 42.318, + "grad_norm": 2.401021718978882, + "learning_rate": 2e-05, + "loss": 0.05007901, + "step": 21159 + }, + { + "epoch": 42.32, + "grad_norm": 1.2055039405822754, + "learning_rate": 2e-05, + "loss": 0.05365395, + "step": 21160 + }, + { + "epoch": 42.322, + "grad_norm": 3.1883809566497803, + "learning_rate": 2e-05, + "loss": 0.05920103, + "step": 21161 + }, + { + "epoch": 42.324, + "grad_norm": 1.5328859090805054, + "learning_rate": 2e-05, + "loss": 0.04159598, + "step": 21162 + }, + { + "epoch": 42.326, + "grad_norm": 1.2868179082870483, + "learning_rate": 2e-05, + "loss": 0.05470748, + "step": 21163 + }, + { + "epoch": 42.328, + "grad_norm": 1.2141040563583374, + "learning_rate": 2e-05, + "loss": 0.0402927, + "step": 21164 + }, + { + "epoch": 42.33, + "grad_norm": 1.7615190744400024, + "learning_rate": 2e-05, + "loss": 0.03097968, + "step": 21165 + }, + { + "epoch": 42.332, + "grad_norm": 1.5224100351333618, + "learning_rate": 2e-05, + "loss": 0.04342334, + "step": 21166 + }, + { + "epoch": 42.334, + "grad_norm": 1.1339821815490723, + "learning_rate": 2e-05, + "loss": 0.03738585, + "step": 21167 + }, + { + "epoch": 42.336, + "grad_norm": 1.207984209060669, + "learning_rate": 2e-05, + "loss": 0.0441434, + "step": 21168 + }, + { + "epoch": 42.338, + "grad_norm": 3.2836313247680664, + "learning_rate": 2e-05, + "loss": 0.03989324, + "step": 21169 + }, + { + "epoch": 42.34, + "grad_norm": 1.7195615768432617, + "learning_rate": 2e-05, + "loss": 0.04979802, + "step": 21170 + }, + { + "epoch": 42.342, + "grad_norm": 1.368320107460022, + "learning_rate": 2e-05, + "loss": 0.0440874, + "step": 21171 + }, + { + "epoch": 42.344, + "grad_norm": 1.2935134172439575, + "learning_rate": 2e-05, + "loss": 0.06269786, + "step": 21172 + }, + { + "epoch": 42.346, + "grad_norm": 1.5454837083816528, + "learning_rate": 2e-05, + "loss": 0.05150343, + "step": 21173 + }, + { + "epoch": 42.348, + "grad_norm": 1.0489261150360107, + "learning_rate": 2e-05, + "loss": 0.03613845, + "step": 21174 + }, + { + "epoch": 42.35, + "grad_norm": 2.1777830123901367, + "learning_rate": 2e-05, + "loss": 0.05178384, + "step": 21175 + }, + { + "epoch": 42.352, + "grad_norm": 1.3198027610778809, + "learning_rate": 2e-05, + "loss": 0.04619918, + "step": 21176 + }, + { + "epoch": 42.354, + "grad_norm": 1.1379380226135254, + "learning_rate": 2e-05, + "loss": 0.0368214, + "step": 21177 + }, + { + "epoch": 42.356, + "grad_norm": 2.0367469787597656, + "learning_rate": 2e-05, + "loss": 0.04297212, + "step": 21178 + }, + { + "epoch": 42.358, + "grad_norm": 1.455183506011963, + "learning_rate": 2e-05, + "loss": 0.06377006, + "step": 21179 + }, + { + "epoch": 42.36, + "grad_norm": 1.1051256656646729, + "learning_rate": 2e-05, + "loss": 0.03730104, + "step": 21180 + }, + { + "epoch": 42.362, + "grad_norm": 1.163114070892334, + "learning_rate": 2e-05, + "loss": 0.04407148, + "step": 21181 + }, + { + "epoch": 42.364, + "grad_norm": 1.0142223834991455, + "learning_rate": 2e-05, + "loss": 0.03786562, + "step": 21182 + }, + { + "epoch": 42.366, + "grad_norm": 1.0811436176300049, + 
"learning_rate": 2e-05, + "loss": 0.03777923, + "step": 21183 + }, + { + "epoch": 42.368, + "grad_norm": 1.1761528253555298, + "learning_rate": 2e-05, + "loss": 0.04415475, + "step": 21184 + }, + { + "epoch": 42.37, + "grad_norm": 0.9909497499465942, + "learning_rate": 2e-05, + "loss": 0.04150863, + "step": 21185 + }, + { + "epoch": 42.372, + "grad_norm": 1.077889323234558, + "learning_rate": 2e-05, + "loss": 0.04909585, + "step": 21186 + }, + { + "epoch": 42.374, + "grad_norm": 1.0327900648117065, + "learning_rate": 2e-05, + "loss": 0.03843714, + "step": 21187 + }, + { + "epoch": 42.376, + "grad_norm": 1.582082748413086, + "learning_rate": 2e-05, + "loss": 0.04728288, + "step": 21188 + }, + { + "epoch": 42.378, + "grad_norm": 1.148047924041748, + "learning_rate": 2e-05, + "loss": 0.04307776, + "step": 21189 + }, + { + "epoch": 42.38, + "grad_norm": 1.0733706951141357, + "learning_rate": 2e-05, + "loss": 0.04168211, + "step": 21190 + }, + { + "epoch": 42.382, + "grad_norm": 1.2072162628173828, + "learning_rate": 2e-05, + "loss": 0.05512146, + "step": 21191 + }, + { + "epoch": 42.384, + "grad_norm": 1.1533020734786987, + "learning_rate": 2e-05, + "loss": 0.03808629, + "step": 21192 + }, + { + "epoch": 42.386, + "grad_norm": 1.307199478149414, + "learning_rate": 2e-05, + "loss": 0.05116679, + "step": 21193 + }, + { + "epoch": 42.388, + "grad_norm": 1.3861874341964722, + "learning_rate": 2e-05, + "loss": 0.06823573, + "step": 21194 + }, + { + "epoch": 42.39, + "grad_norm": 1.1966004371643066, + "learning_rate": 2e-05, + "loss": 0.0478618, + "step": 21195 + }, + { + "epoch": 42.392, + "grad_norm": 1.1855981349945068, + "learning_rate": 2e-05, + "loss": 0.0414066, + "step": 21196 + }, + { + "epoch": 42.394, + "grad_norm": 1.1840327978134155, + "learning_rate": 2e-05, + "loss": 0.05320561, + "step": 21197 + }, + { + "epoch": 42.396, + "grad_norm": 1.4169137477874756, + "learning_rate": 2e-05, + "loss": 0.05518955, + "step": 21198 + }, + { + "epoch": 42.398, + "grad_norm": 1.1781421899795532, + "learning_rate": 2e-05, + "loss": 0.04477818, + "step": 21199 + }, + { + "epoch": 42.4, + "grad_norm": 1.092429757118225, + "learning_rate": 2e-05, + "loss": 0.04341368, + "step": 21200 + }, + { + "epoch": 42.402, + "grad_norm": 1.3216253519058228, + "learning_rate": 2e-05, + "loss": 0.05887017, + "step": 21201 + }, + { + "epoch": 42.404, + "grad_norm": 1.205215573310852, + "learning_rate": 2e-05, + "loss": 0.04666806, + "step": 21202 + }, + { + "epoch": 42.406, + "grad_norm": 1.2955050468444824, + "learning_rate": 2e-05, + "loss": 0.05575846, + "step": 21203 + }, + { + "epoch": 42.408, + "grad_norm": 2.5864737033843994, + "learning_rate": 2e-05, + "loss": 0.03791555, + "step": 21204 + }, + { + "epoch": 42.41, + "grad_norm": 1.188843011856079, + "learning_rate": 2e-05, + "loss": 0.05555812, + "step": 21205 + }, + { + "epoch": 42.412, + "grad_norm": 0.9318320751190186, + "learning_rate": 2e-05, + "loss": 0.02653901, + "step": 21206 + }, + { + "epoch": 42.414, + "grad_norm": 1.1733531951904297, + "learning_rate": 2e-05, + "loss": 0.04083906, + "step": 21207 + }, + { + "epoch": 42.416, + "grad_norm": 1.1449897289276123, + "learning_rate": 2e-05, + "loss": 0.04881506, + "step": 21208 + }, + { + "epoch": 42.418, + "grad_norm": 1.965896487236023, + "learning_rate": 2e-05, + "loss": 0.04963106, + "step": 21209 + }, + { + "epoch": 42.42, + "grad_norm": 1.0945767164230347, + "learning_rate": 2e-05, + "loss": 0.04849849, + "step": 21210 + }, + { + "epoch": 42.422, + "grad_norm": 1.308809757232666, + "learning_rate": 
2e-05, + "loss": 0.06706718, + "step": 21211 + }, + { + "epoch": 42.424, + "grad_norm": 1.53829026222229, + "learning_rate": 2e-05, + "loss": 0.06904881, + "step": 21212 + }, + { + "epoch": 42.426, + "grad_norm": 1.2670159339904785, + "learning_rate": 2e-05, + "loss": 0.05303366, + "step": 21213 + }, + { + "epoch": 42.428, + "grad_norm": 1.2741113901138306, + "learning_rate": 2e-05, + "loss": 0.0492133, + "step": 21214 + }, + { + "epoch": 42.43, + "grad_norm": 1.2569606304168701, + "learning_rate": 2e-05, + "loss": 0.06223689, + "step": 21215 + }, + { + "epoch": 42.432, + "grad_norm": 1.107643485069275, + "learning_rate": 2e-05, + "loss": 0.04277761, + "step": 21216 + }, + { + "epoch": 42.434, + "grad_norm": 1.122188925743103, + "learning_rate": 2e-05, + "loss": 0.04128901, + "step": 21217 + }, + { + "epoch": 42.436, + "grad_norm": 1.1837259531021118, + "learning_rate": 2e-05, + "loss": 0.04776137, + "step": 21218 + }, + { + "epoch": 42.438, + "grad_norm": 3.169210910797119, + "learning_rate": 2e-05, + "loss": 0.05383972, + "step": 21219 + }, + { + "epoch": 42.44, + "grad_norm": 0.9989545941352844, + "learning_rate": 2e-05, + "loss": 0.03308531, + "step": 21220 + }, + { + "epoch": 42.442, + "grad_norm": 1.1423425674438477, + "learning_rate": 2e-05, + "loss": 0.04700923, + "step": 21221 + }, + { + "epoch": 42.444, + "grad_norm": 1.0154725313186646, + "learning_rate": 2e-05, + "loss": 0.03601583, + "step": 21222 + }, + { + "epoch": 42.446, + "grad_norm": 1.450246810913086, + "learning_rate": 2e-05, + "loss": 0.0615345, + "step": 21223 + }, + { + "epoch": 42.448, + "grad_norm": 1.1445060968399048, + "learning_rate": 2e-05, + "loss": 0.03595731, + "step": 21224 + }, + { + "epoch": 42.45, + "grad_norm": 1.6389358043670654, + "learning_rate": 2e-05, + "loss": 0.0496392, + "step": 21225 + }, + { + "epoch": 42.452, + "grad_norm": 1.141597867012024, + "learning_rate": 2e-05, + "loss": 0.03886272, + "step": 21226 + }, + { + "epoch": 42.454, + "grad_norm": 2.0984458923339844, + "learning_rate": 2e-05, + "loss": 0.0533991, + "step": 21227 + }, + { + "epoch": 42.456, + "grad_norm": 1.5619630813598633, + "learning_rate": 2e-05, + "loss": 0.0451661, + "step": 21228 + }, + { + "epoch": 42.458, + "grad_norm": 1.1230692863464355, + "learning_rate": 2e-05, + "loss": 0.02888122, + "step": 21229 + }, + { + "epoch": 42.46, + "grad_norm": 1.5001888275146484, + "learning_rate": 2e-05, + "loss": 0.04741661, + "step": 21230 + }, + { + "epoch": 42.462, + "grad_norm": 1.1775398254394531, + "learning_rate": 2e-05, + "loss": 0.05408292, + "step": 21231 + }, + { + "epoch": 42.464, + "grad_norm": 1.1655455827713013, + "learning_rate": 2e-05, + "loss": 0.04785192, + "step": 21232 + }, + { + "epoch": 42.466, + "grad_norm": 1.3385460376739502, + "learning_rate": 2e-05, + "loss": 0.04888346, + "step": 21233 + }, + { + "epoch": 42.468, + "grad_norm": 1.5374442338943481, + "learning_rate": 2e-05, + "loss": 0.048363, + "step": 21234 + }, + { + "epoch": 42.47, + "grad_norm": 1.1293838024139404, + "learning_rate": 2e-05, + "loss": 0.03480672, + "step": 21235 + }, + { + "epoch": 42.472, + "grad_norm": 1.4131519794464111, + "learning_rate": 2e-05, + "loss": 0.059271, + "step": 21236 + }, + { + "epoch": 42.474, + "grad_norm": 1.091775894165039, + "learning_rate": 2e-05, + "loss": 0.04225942, + "step": 21237 + }, + { + "epoch": 42.476, + "grad_norm": 1.5270899534225464, + "learning_rate": 2e-05, + "loss": 0.05104741, + "step": 21238 + }, + { + "epoch": 42.478, + "grad_norm": 1.4533936977386475, + "learning_rate": 2e-05, + "loss": 
0.04931673, + "step": 21239 + }, + { + "epoch": 42.48, + "grad_norm": 1.187991738319397, + "learning_rate": 2e-05, + "loss": 0.05418856, + "step": 21240 + }, + { + "epoch": 42.482, + "grad_norm": 1.3054866790771484, + "learning_rate": 2e-05, + "loss": 0.05543121, + "step": 21241 + }, + { + "epoch": 42.484, + "grad_norm": 1.1094039678573608, + "learning_rate": 2e-05, + "loss": 0.03574315, + "step": 21242 + }, + { + "epoch": 42.486, + "grad_norm": 1.1816613674163818, + "learning_rate": 2e-05, + "loss": 0.04213757, + "step": 21243 + }, + { + "epoch": 42.488, + "grad_norm": 1.0031068325042725, + "learning_rate": 2e-05, + "loss": 0.03389908, + "step": 21244 + }, + { + "epoch": 42.49, + "grad_norm": 1.4081437587738037, + "learning_rate": 2e-05, + "loss": 0.05755498, + "step": 21245 + }, + { + "epoch": 42.492, + "grad_norm": 1.0443217754364014, + "learning_rate": 2e-05, + "loss": 0.0272135, + "step": 21246 + }, + { + "epoch": 42.494, + "grad_norm": 2.372973918914795, + "learning_rate": 2e-05, + "loss": 0.0727305, + "step": 21247 + }, + { + "epoch": 42.496, + "grad_norm": 1.1258074045181274, + "learning_rate": 2e-05, + "loss": 0.04463324, + "step": 21248 + }, + { + "epoch": 42.498, + "grad_norm": 1.0638490915298462, + "learning_rate": 2e-05, + "loss": 0.04064821, + "step": 21249 + }, + { + "epoch": 42.5, + "grad_norm": 2.074948787689209, + "learning_rate": 2e-05, + "loss": 0.04368262, + "step": 21250 + }, + { + "epoch": 42.502, + "grad_norm": 1.1462494134902954, + "learning_rate": 2e-05, + "loss": 0.05238108, + "step": 21251 + }, + { + "epoch": 42.504, + "grad_norm": 1.1089760065078735, + "learning_rate": 2e-05, + "loss": 0.05240737, + "step": 21252 + }, + { + "epoch": 42.506, + "grad_norm": 1.010260820388794, + "learning_rate": 2e-05, + "loss": 0.0344992, + "step": 21253 + }, + { + "epoch": 42.508, + "grad_norm": 1.2549633979797363, + "learning_rate": 2e-05, + "loss": 0.05022692, + "step": 21254 + }, + { + "epoch": 42.51, + "grad_norm": 1.307962417602539, + "learning_rate": 2e-05, + "loss": 0.05273689, + "step": 21255 + }, + { + "epoch": 42.512, + "grad_norm": 1.516259789466858, + "learning_rate": 2e-05, + "loss": 0.03713296, + "step": 21256 + }, + { + "epoch": 42.514, + "grad_norm": 1.1646299362182617, + "learning_rate": 2e-05, + "loss": 0.04224031, + "step": 21257 + }, + { + "epoch": 42.516, + "grad_norm": 1.3015379905700684, + "learning_rate": 2e-05, + "loss": 0.05681949, + "step": 21258 + }, + { + "epoch": 42.518, + "grad_norm": 1.2873107194900513, + "learning_rate": 2e-05, + "loss": 0.06247327, + "step": 21259 + }, + { + "epoch": 42.52, + "grad_norm": 1.2081667184829712, + "learning_rate": 2e-05, + "loss": 0.04261228, + "step": 21260 + }, + { + "epoch": 42.522, + "grad_norm": 2.0166215896606445, + "learning_rate": 2e-05, + "loss": 0.05025642, + "step": 21261 + }, + { + "epoch": 42.524, + "grad_norm": 1.4932124614715576, + "learning_rate": 2e-05, + "loss": 0.04905674, + "step": 21262 + }, + { + "epoch": 42.526, + "grad_norm": 1.2398021221160889, + "learning_rate": 2e-05, + "loss": 0.0445546, + "step": 21263 + }, + { + "epoch": 42.528, + "grad_norm": 7.126203536987305, + "learning_rate": 2e-05, + "loss": 0.05969048, + "step": 21264 + }, + { + "epoch": 42.53, + "grad_norm": 1.2210969924926758, + "learning_rate": 2e-05, + "loss": 0.03673073, + "step": 21265 + }, + { + "epoch": 42.532, + "grad_norm": 1.105908751487732, + "learning_rate": 2e-05, + "loss": 0.04691067, + "step": 21266 + }, + { + "epoch": 42.534, + "grad_norm": 1.393251657485962, + "learning_rate": 2e-05, + "loss": 0.04946836, + 
"step": 21267 + }, + { + "epoch": 42.536, + "grad_norm": 1.284510850906372, + "learning_rate": 2e-05, + "loss": 0.04197853, + "step": 21268 + }, + { + "epoch": 42.538, + "grad_norm": 1.0972687005996704, + "learning_rate": 2e-05, + "loss": 0.03820524, + "step": 21269 + }, + { + "epoch": 42.54, + "grad_norm": 1.1322238445281982, + "learning_rate": 2e-05, + "loss": 0.04426937, + "step": 21270 + }, + { + "epoch": 42.542, + "grad_norm": 2.50138783454895, + "learning_rate": 2e-05, + "loss": 0.06171612, + "step": 21271 + }, + { + "epoch": 42.544, + "grad_norm": 1.0176743268966675, + "learning_rate": 2e-05, + "loss": 0.03656572, + "step": 21272 + }, + { + "epoch": 42.546, + "grad_norm": 1.3137876987457275, + "learning_rate": 2e-05, + "loss": 0.05632983, + "step": 21273 + }, + { + "epoch": 42.548, + "grad_norm": 2.6000397205352783, + "learning_rate": 2e-05, + "loss": 0.04389685, + "step": 21274 + }, + { + "epoch": 42.55, + "grad_norm": 2.7255218029022217, + "learning_rate": 2e-05, + "loss": 0.06559823, + "step": 21275 + }, + { + "epoch": 42.552, + "grad_norm": 1.1543296575546265, + "learning_rate": 2e-05, + "loss": 0.05046273, + "step": 21276 + }, + { + "epoch": 42.554, + "grad_norm": 1.017396092414856, + "learning_rate": 2e-05, + "loss": 0.03115503, + "step": 21277 + }, + { + "epoch": 42.556, + "grad_norm": 1.3216530084609985, + "learning_rate": 2e-05, + "loss": 0.06245044, + "step": 21278 + }, + { + "epoch": 42.558, + "grad_norm": 12.367131233215332, + "learning_rate": 2e-05, + "loss": 0.05980074, + "step": 21279 + }, + { + "epoch": 42.56, + "grad_norm": 1.1518374681472778, + "learning_rate": 2e-05, + "loss": 0.03418801, + "step": 21280 + }, + { + "epoch": 42.562, + "grad_norm": 2.031050443649292, + "learning_rate": 2e-05, + "loss": 0.04848942, + "step": 21281 + }, + { + "epoch": 42.564, + "grad_norm": 1.0938751697540283, + "learning_rate": 2e-05, + "loss": 0.0451759, + "step": 21282 + }, + { + "epoch": 42.566, + "grad_norm": 1.1018202304840088, + "learning_rate": 2e-05, + "loss": 0.04848454, + "step": 21283 + }, + { + "epoch": 42.568, + "grad_norm": 1.2324109077453613, + "learning_rate": 2e-05, + "loss": 0.04794756, + "step": 21284 + }, + { + "epoch": 42.57, + "grad_norm": 2.7218592166900635, + "learning_rate": 2e-05, + "loss": 0.0646372, + "step": 21285 + }, + { + "epoch": 42.572, + "grad_norm": 1.2674838304519653, + "learning_rate": 2e-05, + "loss": 0.04550756, + "step": 21286 + }, + { + "epoch": 42.574, + "grad_norm": 1.5812036991119385, + "learning_rate": 2e-05, + "loss": 0.05703519, + "step": 21287 + }, + { + "epoch": 42.576, + "grad_norm": 1.1205166578292847, + "learning_rate": 2e-05, + "loss": 0.036104, + "step": 21288 + }, + { + "epoch": 42.578, + "grad_norm": 1.0537066459655762, + "learning_rate": 2e-05, + "loss": 0.03754063, + "step": 21289 + }, + { + "epoch": 42.58, + "grad_norm": 1.2235515117645264, + "learning_rate": 2e-05, + "loss": 0.05299861, + "step": 21290 + }, + { + "epoch": 42.582, + "grad_norm": 1.3739643096923828, + "learning_rate": 2e-05, + "loss": 0.0461156, + "step": 21291 + }, + { + "epoch": 42.584, + "grad_norm": 1.5320465564727783, + "learning_rate": 2e-05, + "loss": 0.04836542, + "step": 21292 + }, + { + "epoch": 42.586, + "grad_norm": 0.9427246451377869, + "learning_rate": 2e-05, + "loss": 0.0362893, + "step": 21293 + }, + { + "epoch": 42.588, + "grad_norm": 1.3789801597595215, + "learning_rate": 2e-05, + "loss": 0.06239626, + "step": 21294 + }, + { + "epoch": 42.59, + "grad_norm": 1.258785605430603, + "learning_rate": 2e-05, + "loss": 0.05440217, + "step": 21295 + 
}, + { + "epoch": 42.592, + "grad_norm": 1.2506073713302612, + "learning_rate": 2e-05, + "loss": 0.04535231, + "step": 21296 + }, + { + "epoch": 42.594, + "grad_norm": 1.300620436668396, + "learning_rate": 2e-05, + "loss": 0.0481655, + "step": 21297 + }, + { + "epoch": 42.596, + "grad_norm": 1.1247934103012085, + "learning_rate": 2e-05, + "loss": 0.03969868, + "step": 21298 + }, + { + "epoch": 42.598, + "grad_norm": 1.2077419757843018, + "learning_rate": 2e-05, + "loss": 0.04599006, + "step": 21299 + }, + { + "epoch": 42.6, + "grad_norm": 4.433143615722656, + "learning_rate": 2e-05, + "loss": 0.05563483, + "step": 21300 + }, + { + "epoch": 42.602, + "grad_norm": 1.223757266998291, + "learning_rate": 2e-05, + "loss": 0.04834705, + "step": 21301 + }, + { + "epoch": 42.604, + "grad_norm": 1.001859426498413, + "learning_rate": 2e-05, + "loss": 0.03891331, + "step": 21302 + }, + { + "epoch": 42.606, + "grad_norm": 1.4744184017181396, + "learning_rate": 2e-05, + "loss": 0.04635298, + "step": 21303 + }, + { + "epoch": 42.608, + "grad_norm": 1.0292778015136719, + "learning_rate": 2e-05, + "loss": 0.03995837, + "step": 21304 + }, + { + "epoch": 42.61, + "grad_norm": 1.2195087671279907, + "learning_rate": 2e-05, + "loss": 0.04556499, + "step": 21305 + }, + { + "epoch": 42.612, + "grad_norm": 2.625493049621582, + "learning_rate": 2e-05, + "loss": 0.0414276, + "step": 21306 + }, + { + "epoch": 42.614, + "grad_norm": 1.3295499086380005, + "learning_rate": 2e-05, + "loss": 0.05958395, + "step": 21307 + }, + { + "epoch": 42.616, + "grad_norm": 2.4811103343963623, + "learning_rate": 2e-05, + "loss": 0.08586189, + "step": 21308 + }, + { + "epoch": 42.618, + "grad_norm": 0.9455074667930603, + "learning_rate": 2e-05, + "loss": 0.02827884, + "step": 21309 + }, + { + "epoch": 42.62, + "grad_norm": 1.2621259689331055, + "learning_rate": 2e-05, + "loss": 0.04555801, + "step": 21310 + }, + { + "epoch": 42.622, + "grad_norm": 1.1620298624038696, + "learning_rate": 2e-05, + "loss": 0.0330307, + "step": 21311 + }, + { + "epoch": 42.624, + "grad_norm": 1.165947437286377, + "learning_rate": 2e-05, + "loss": 0.04790859, + "step": 21312 + }, + { + "epoch": 42.626, + "grad_norm": 2.1621484756469727, + "learning_rate": 2e-05, + "loss": 0.06904516, + "step": 21313 + }, + { + "epoch": 42.628, + "grad_norm": 1.2981723546981812, + "learning_rate": 2e-05, + "loss": 0.04969555, + "step": 21314 + }, + { + "epoch": 42.63, + "grad_norm": 1.206282377243042, + "learning_rate": 2e-05, + "loss": 0.05298881, + "step": 21315 + }, + { + "epoch": 42.632, + "grad_norm": 1.1706401109695435, + "learning_rate": 2e-05, + "loss": 0.04146463, + "step": 21316 + }, + { + "epoch": 42.634, + "grad_norm": 1.349630355834961, + "learning_rate": 2e-05, + "loss": 0.05339145, + "step": 21317 + }, + { + "epoch": 42.636, + "grad_norm": 1.3318768739700317, + "learning_rate": 2e-05, + "loss": 0.04014498, + "step": 21318 + }, + { + "epoch": 42.638, + "grad_norm": 1.787174105644226, + "learning_rate": 2e-05, + "loss": 0.07187255, + "step": 21319 + }, + { + "epoch": 42.64, + "grad_norm": 1.0085779428482056, + "learning_rate": 2e-05, + "loss": 0.03416509, + "step": 21320 + }, + { + "epoch": 42.642, + "grad_norm": 1.366184115409851, + "learning_rate": 2e-05, + "loss": 0.05623956, + "step": 21321 + }, + { + "epoch": 42.644, + "grad_norm": 1.2050182819366455, + "learning_rate": 2e-05, + "loss": 0.04033398, + "step": 21322 + }, + { + "epoch": 42.646, + "grad_norm": 1.4368079900741577, + "learning_rate": 2e-05, + "loss": 0.04438941, + "step": 21323 + }, + { + "epoch": 
42.648, + "grad_norm": 1.231373906135559, + "learning_rate": 2e-05, + "loss": 0.04198126, + "step": 21324 + }, + { + "epoch": 42.65, + "grad_norm": 3.3663556575775146, + "learning_rate": 2e-05, + "loss": 0.05418935, + "step": 21325 + }, + { + "epoch": 42.652, + "grad_norm": 1.528099536895752, + "learning_rate": 2e-05, + "loss": 0.06321813, + "step": 21326 + }, + { + "epoch": 42.654, + "grad_norm": 1.117147445678711, + "learning_rate": 2e-05, + "loss": 0.0353119, + "step": 21327 + }, + { + "epoch": 42.656, + "grad_norm": 1.3439724445343018, + "learning_rate": 2e-05, + "loss": 0.05230685, + "step": 21328 + }, + { + "epoch": 42.658, + "grad_norm": 1.0160696506500244, + "learning_rate": 2e-05, + "loss": 0.04300717, + "step": 21329 + }, + { + "epoch": 42.66, + "grad_norm": 2.5469577312469482, + "learning_rate": 2e-05, + "loss": 0.04865529, + "step": 21330 + }, + { + "epoch": 42.662, + "grad_norm": 1.6859334707260132, + "learning_rate": 2e-05, + "loss": 0.04481193, + "step": 21331 + }, + { + "epoch": 42.664, + "grad_norm": 1.0845061540603638, + "learning_rate": 2e-05, + "loss": 0.03940871, + "step": 21332 + }, + { + "epoch": 42.666, + "grad_norm": 3.1730074882507324, + "learning_rate": 2e-05, + "loss": 0.06717253, + "step": 21333 + }, + { + "epoch": 42.668, + "grad_norm": 0.7828595638275146, + "learning_rate": 2e-05, + "loss": 0.02153579, + "step": 21334 + }, + { + "epoch": 42.67, + "grad_norm": 1.4767457246780396, + "learning_rate": 2e-05, + "loss": 0.05914707, + "step": 21335 + }, + { + "epoch": 42.672, + "grad_norm": 1.4632948637008667, + "learning_rate": 2e-05, + "loss": 0.0467616, + "step": 21336 + }, + { + "epoch": 42.674, + "grad_norm": 1.245827078819275, + "learning_rate": 2e-05, + "loss": 0.04130943, + "step": 21337 + }, + { + "epoch": 42.676, + "grad_norm": 1.1984535455703735, + "learning_rate": 2e-05, + "loss": 0.05097179, + "step": 21338 + }, + { + "epoch": 42.678, + "grad_norm": 1.018325686454773, + "learning_rate": 2e-05, + "loss": 0.04212132, + "step": 21339 + }, + { + "epoch": 42.68, + "grad_norm": 2.334608316421509, + "learning_rate": 2e-05, + "loss": 0.0457168, + "step": 21340 + }, + { + "epoch": 42.682, + "grad_norm": 1.2345631122589111, + "learning_rate": 2e-05, + "loss": 0.04911385, + "step": 21341 + }, + { + "epoch": 42.684, + "grad_norm": 1.1206731796264648, + "learning_rate": 2e-05, + "loss": 0.0280387, + "step": 21342 + }, + { + "epoch": 42.686, + "grad_norm": 1.4151371717453003, + "learning_rate": 2e-05, + "loss": 0.06245557, + "step": 21343 + }, + { + "epoch": 42.688, + "grad_norm": 1.2239195108413696, + "learning_rate": 2e-05, + "loss": 0.04748405, + "step": 21344 + }, + { + "epoch": 42.69, + "grad_norm": 1.0787338018417358, + "learning_rate": 2e-05, + "loss": 0.03307759, + "step": 21345 + }, + { + "epoch": 42.692, + "grad_norm": 1.2297214269638062, + "learning_rate": 2e-05, + "loss": 0.05256373, + "step": 21346 + }, + { + "epoch": 42.694, + "grad_norm": 1.2633470296859741, + "learning_rate": 2e-05, + "loss": 0.05449621, + "step": 21347 + }, + { + "epoch": 42.696, + "grad_norm": 1.178499460220337, + "learning_rate": 2e-05, + "loss": 0.06108705, + "step": 21348 + }, + { + "epoch": 42.698, + "grad_norm": 1.030000925064087, + "learning_rate": 2e-05, + "loss": 0.03802224, + "step": 21349 + }, + { + "epoch": 42.7, + "grad_norm": 1.2210856676101685, + "learning_rate": 2e-05, + "loss": 0.04589963, + "step": 21350 + }, + { + "epoch": 42.702, + "grad_norm": 2.0504300594329834, + "learning_rate": 2e-05, + "loss": 0.06983016, + "step": 21351 + }, + { + "epoch": 42.704, + 
"grad_norm": 1.2079181671142578, + "learning_rate": 2e-05, + "loss": 0.04810051, + "step": 21352 + }, + { + "epoch": 42.706, + "grad_norm": 1.1589932441711426, + "learning_rate": 2e-05, + "loss": 0.04745119, + "step": 21353 + }, + { + "epoch": 42.708, + "grad_norm": 4.482708930969238, + "learning_rate": 2e-05, + "loss": 0.0591661, + "step": 21354 + }, + { + "epoch": 42.71, + "grad_norm": 1.2510643005371094, + "learning_rate": 2e-05, + "loss": 0.05218077, + "step": 21355 + }, + { + "epoch": 42.712, + "grad_norm": 1.295291543006897, + "learning_rate": 2e-05, + "loss": 0.05780672, + "step": 21356 + }, + { + "epoch": 42.714, + "grad_norm": 1.1533124446868896, + "learning_rate": 2e-05, + "loss": 0.03984768, + "step": 21357 + }, + { + "epoch": 42.716, + "grad_norm": 3.040389060974121, + "learning_rate": 2e-05, + "loss": 0.04302094, + "step": 21358 + }, + { + "epoch": 42.718, + "grad_norm": 1.2422996759414673, + "learning_rate": 2e-05, + "loss": 0.03962889, + "step": 21359 + }, + { + "epoch": 42.72, + "grad_norm": 1.4480630159378052, + "learning_rate": 2e-05, + "loss": 0.06389065, + "step": 21360 + }, + { + "epoch": 42.722, + "grad_norm": 1.3611098527908325, + "learning_rate": 2e-05, + "loss": 0.0595374, + "step": 21361 + }, + { + "epoch": 42.724, + "grad_norm": 1.2345424890518188, + "learning_rate": 2e-05, + "loss": 0.03755029, + "step": 21362 + }, + { + "epoch": 42.726, + "grad_norm": 1.8066527843475342, + "learning_rate": 2e-05, + "loss": 0.05974579, + "step": 21363 + }, + { + "epoch": 42.728, + "grad_norm": 1.1717407703399658, + "learning_rate": 2e-05, + "loss": 0.04459912, + "step": 21364 + }, + { + "epoch": 42.73, + "grad_norm": 1.143455982208252, + "learning_rate": 2e-05, + "loss": 0.04196816, + "step": 21365 + }, + { + "epoch": 42.732, + "grad_norm": 1.2013553380966187, + "learning_rate": 2e-05, + "loss": 0.03594541, + "step": 21366 + }, + { + "epoch": 42.734, + "grad_norm": 1.1833301782608032, + "learning_rate": 2e-05, + "loss": 0.04475646, + "step": 21367 + }, + { + "epoch": 42.736, + "grad_norm": 1.0608525276184082, + "learning_rate": 2e-05, + "loss": 0.0312412, + "step": 21368 + }, + { + "epoch": 42.738, + "grad_norm": 1.3239516019821167, + "learning_rate": 2e-05, + "loss": 0.05588631, + "step": 21369 + }, + { + "epoch": 42.74, + "grad_norm": 1.2072561979293823, + "learning_rate": 2e-05, + "loss": 0.0523868, + "step": 21370 + }, + { + "epoch": 42.742, + "grad_norm": 2.1657562255859375, + "learning_rate": 2e-05, + "loss": 0.0605137, + "step": 21371 + }, + { + "epoch": 42.744, + "grad_norm": 1.158803105354309, + "learning_rate": 2e-05, + "loss": 0.04379743, + "step": 21372 + }, + { + "epoch": 42.746, + "grad_norm": 1.2670451402664185, + "learning_rate": 2e-05, + "loss": 0.04156856, + "step": 21373 + }, + { + "epoch": 42.748, + "grad_norm": 1.0824308395385742, + "learning_rate": 2e-05, + "loss": 0.04734112, + "step": 21374 + }, + { + "epoch": 42.75, + "grad_norm": 1.4957252740859985, + "learning_rate": 2e-05, + "loss": 0.04622442, + "step": 21375 + }, + { + "epoch": 42.752, + "grad_norm": 1.1113194227218628, + "learning_rate": 2e-05, + "loss": 0.03722591, + "step": 21376 + }, + { + "epoch": 42.754, + "grad_norm": 1.513335108757019, + "learning_rate": 2e-05, + "loss": 0.05863171, + "step": 21377 + }, + { + "epoch": 42.756, + "grad_norm": 1.463580846786499, + "learning_rate": 2e-05, + "loss": 0.0668181, + "step": 21378 + }, + { + "epoch": 42.758, + "grad_norm": 1.4309571981430054, + "learning_rate": 2e-05, + "loss": 0.03635763, + "step": 21379 + }, + { + "epoch": 42.76, + "grad_norm": 
1.273354172706604, + "learning_rate": 2e-05, + "loss": 0.04854331, + "step": 21380 + }, + { + "epoch": 42.762, + "grad_norm": 1.6273373365402222, + "learning_rate": 2e-05, + "loss": 0.05022759, + "step": 21381 + }, + { + "epoch": 42.764, + "grad_norm": 1.2511389255523682, + "learning_rate": 2e-05, + "loss": 0.06143327, + "step": 21382 + }, + { + "epoch": 42.766, + "grad_norm": 2.7621679306030273, + "learning_rate": 2e-05, + "loss": 0.06704356, + "step": 21383 + }, + { + "epoch": 42.768, + "grad_norm": 1.5374120473861694, + "learning_rate": 2e-05, + "loss": 0.04694888, + "step": 21384 + }, + { + "epoch": 42.77, + "grad_norm": 1.2802921533584595, + "learning_rate": 2e-05, + "loss": 0.05271628, + "step": 21385 + }, + { + "epoch": 42.772, + "grad_norm": 1.4109792709350586, + "learning_rate": 2e-05, + "loss": 0.05482592, + "step": 21386 + }, + { + "epoch": 42.774, + "grad_norm": 1.5210480690002441, + "learning_rate": 2e-05, + "loss": 0.05803951, + "step": 21387 + }, + { + "epoch": 42.776, + "grad_norm": 1.1467036008834839, + "learning_rate": 2e-05, + "loss": 0.0471014, + "step": 21388 + }, + { + "epoch": 42.778, + "grad_norm": 1.3119735717773438, + "learning_rate": 2e-05, + "loss": 0.0533366, + "step": 21389 + }, + { + "epoch": 42.78, + "grad_norm": 1.1641420125961304, + "learning_rate": 2e-05, + "loss": 0.03897999, + "step": 21390 + }, + { + "epoch": 42.782, + "grad_norm": 1.1525665521621704, + "learning_rate": 2e-05, + "loss": 0.04090577, + "step": 21391 + }, + { + "epoch": 42.784, + "grad_norm": 1.2021316289901733, + "learning_rate": 2e-05, + "loss": 0.04947101, + "step": 21392 + }, + { + "epoch": 42.786, + "grad_norm": 1.2240577936172485, + "learning_rate": 2e-05, + "loss": 0.05535308, + "step": 21393 + }, + { + "epoch": 42.788, + "grad_norm": 1.2875986099243164, + "learning_rate": 2e-05, + "loss": 0.03962916, + "step": 21394 + }, + { + "epoch": 42.79, + "grad_norm": 1.526619553565979, + "learning_rate": 2e-05, + "loss": 0.03942203, + "step": 21395 + }, + { + "epoch": 42.792, + "grad_norm": 1.133887529373169, + "learning_rate": 2e-05, + "loss": 0.04529195, + "step": 21396 + }, + { + "epoch": 42.794, + "grad_norm": 1.195630669593811, + "learning_rate": 2e-05, + "loss": 0.04189469, + "step": 21397 + }, + { + "epoch": 42.796, + "grad_norm": 1.2251441478729248, + "learning_rate": 2e-05, + "loss": 0.04151393, + "step": 21398 + }, + { + "epoch": 42.798, + "grad_norm": 1.0438613891601562, + "learning_rate": 2e-05, + "loss": 0.03437603, + "step": 21399 + }, + { + "epoch": 42.8, + "grad_norm": 1.001349687576294, + "learning_rate": 2e-05, + "loss": 0.03562608, + "step": 21400 + }, + { + "epoch": 42.802, + "grad_norm": 1.156879186630249, + "learning_rate": 2e-05, + "loss": 0.03687527, + "step": 21401 + }, + { + "epoch": 42.804, + "grad_norm": 1.0168277025222778, + "learning_rate": 2e-05, + "loss": 0.04121634, + "step": 21402 + }, + { + "epoch": 42.806, + "grad_norm": 1.6192265748977661, + "learning_rate": 2e-05, + "loss": 0.05773157, + "step": 21403 + }, + { + "epoch": 42.808, + "grad_norm": 1.384158968925476, + "learning_rate": 2e-05, + "loss": 0.04894597, + "step": 21404 + }, + { + "epoch": 42.81, + "grad_norm": 1.0740807056427002, + "learning_rate": 2e-05, + "loss": 0.03360681, + "step": 21405 + }, + { + "epoch": 42.812, + "grad_norm": 1.36252760887146, + "learning_rate": 2e-05, + "loss": 0.04959939, + "step": 21406 + }, + { + "epoch": 42.814, + "grad_norm": 1.1320419311523438, + "learning_rate": 2e-05, + "loss": 0.03762558, + "step": 21407 + }, + { + "epoch": 42.816, + "grad_norm": 
1.1921322345733643, + "learning_rate": 2e-05, + "loss": 0.04960704, + "step": 21408 + }, + { + "epoch": 42.818, + "grad_norm": 1.363591194152832, + "learning_rate": 2e-05, + "loss": 0.04697926, + "step": 21409 + }, + { + "epoch": 42.82, + "grad_norm": 1.330572485923767, + "learning_rate": 2e-05, + "loss": 0.03431287, + "step": 21410 + }, + { + "epoch": 42.822, + "grad_norm": 1.108099102973938, + "learning_rate": 2e-05, + "loss": 0.04299867, + "step": 21411 + }, + { + "epoch": 42.824, + "grad_norm": 1.28284752368927, + "learning_rate": 2e-05, + "loss": 0.05352758, + "step": 21412 + }, + { + "epoch": 42.826, + "grad_norm": 1.2229121923446655, + "learning_rate": 2e-05, + "loss": 0.0591327, + "step": 21413 + }, + { + "epoch": 42.828, + "grad_norm": 1.152987003326416, + "learning_rate": 2e-05, + "loss": 0.03737348, + "step": 21414 + }, + { + "epoch": 42.83, + "grad_norm": 1.1505484580993652, + "learning_rate": 2e-05, + "loss": 0.04128478, + "step": 21415 + }, + { + "epoch": 42.832, + "grad_norm": 1.2108104228973389, + "learning_rate": 2e-05, + "loss": 0.0475995, + "step": 21416 + }, + { + "epoch": 42.834, + "grad_norm": 1.315833330154419, + "learning_rate": 2e-05, + "loss": 0.04457718, + "step": 21417 + }, + { + "epoch": 42.836, + "grad_norm": 1.3256268501281738, + "learning_rate": 2e-05, + "loss": 0.04045897, + "step": 21418 + }, + { + "epoch": 42.838, + "grad_norm": 2.0043587684631348, + "learning_rate": 2e-05, + "loss": 0.05085915, + "step": 21419 + }, + { + "epoch": 42.84, + "grad_norm": 1.0381031036376953, + "learning_rate": 2e-05, + "loss": 0.03765831, + "step": 21420 + }, + { + "epoch": 42.842, + "grad_norm": 1.3118317127227783, + "learning_rate": 2e-05, + "loss": 0.05129515, + "step": 21421 + }, + { + "epoch": 42.844, + "grad_norm": 1.2408820390701294, + "learning_rate": 2e-05, + "loss": 0.05141382, + "step": 21422 + }, + { + "epoch": 42.846, + "grad_norm": 1.1298905611038208, + "learning_rate": 2e-05, + "loss": 0.03599843, + "step": 21423 + }, + { + "epoch": 42.848, + "grad_norm": 1.4461491107940674, + "learning_rate": 2e-05, + "loss": 0.06431194, + "step": 21424 + }, + { + "epoch": 42.85, + "grad_norm": 1.4541326761245728, + "learning_rate": 2e-05, + "loss": 0.0566168, + "step": 21425 + }, + { + "epoch": 42.852, + "grad_norm": 1.6744414567947388, + "learning_rate": 2e-05, + "loss": 0.04111119, + "step": 21426 + }, + { + "epoch": 42.854, + "grad_norm": 1.2035828828811646, + "learning_rate": 2e-05, + "loss": 0.05423025, + "step": 21427 + }, + { + "epoch": 42.856, + "grad_norm": 3.0272951126098633, + "learning_rate": 2e-05, + "loss": 0.05712648, + "step": 21428 + }, + { + "epoch": 42.858, + "grad_norm": 0.9234744310379028, + "learning_rate": 2e-05, + "loss": 0.03638177, + "step": 21429 + }, + { + "epoch": 42.86, + "grad_norm": 1.2166481018066406, + "learning_rate": 2e-05, + "loss": 0.04876867, + "step": 21430 + }, + { + "epoch": 42.862, + "grad_norm": 1.454555630683899, + "learning_rate": 2e-05, + "loss": 0.06422681, + "step": 21431 + }, + { + "epoch": 42.864, + "grad_norm": 1.2168675661087036, + "learning_rate": 2e-05, + "loss": 0.05015201, + "step": 21432 + }, + { + "epoch": 42.866, + "grad_norm": 1.659549355506897, + "learning_rate": 2e-05, + "loss": 0.03892962, + "step": 21433 + }, + { + "epoch": 42.868, + "grad_norm": 1.268166422843933, + "learning_rate": 2e-05, + "loss": 0.04936218, + "step": 21434 + }, + { + "epoch": 42.87, + "grad_norm": 1.0668729543685913, + "learning_rate": 2e-05, + "loss": 0.03783253, + "step": 21435 + }, + { + "epoch": 42.872, + "grad_norm": 
1.1927694082260132, + "learning_rate": 2e-05, + "loss": 0.03492489, + "step": 21436 + }, + { + "epoch": 42.874, + "grad_norm": 1.2407193183898926, + "learning_rate": 2e-05, + "loss": 0.04802696, + "step": 21437 + }, + { + "epoch": 42.876, + "grad_norm": 1.156215786933899, + "learning_rate": 2e-05, + "loss": 0.0512966, + "step": 21438 + }, + { + "epoch": 42.878, + "grad_norm": 1.2110116481781006, + "learning_rate": 2e-05, + "loss": 0.03545962, + "step": 21439 + }, + { + "epoch": 42.88, + "grad_norm": 1.1005175113677979, + "learning_rate": 2e-05, + "loss": 0.04559709, + "step": 21440 + }, + { + "epoch": 42.882, + "grad_norm": 1.329023003578186, + "learning_rate": 2e-05, + "loss": 0.0600805, + "step": 21441 + }, + { + "epoch": 42.884, + "grad_norm": 1.2809315919876099, + "learning_rate": 2e-05, + "loss": 0.03948345, + "step": 21442 + }, + { + "epoch": 42.886, + "grad_norm": 1.0475729703903198, + "learning_rate": 2e-05, + "loss": 0.02779887, + "step": 21443 + }, + { + "epoch": 42.888, + "grad_norm": 1.2448220252990723, + "learning_rate": 2e-05, + "loss": 0.05767167, + "step": 21444 + }, + { + "epoch": 42.89, + "grad_norm": 1.1443229913711548, + "learning_rate": 2e-05, + "loss": 0.05177973, + "step": 21445 + }, + { + "epoch": 42.892, + "grad_norm": 1.5171314477920532, + "learning_rate": 2e-05, + "loss": 0.04922928, + "step": 21446 + }, + { + "epoch": 42.894, + "grad_norm": 1.217365026473999, + "learning_rate": 2e-05, + "loss": 0.05303325, + "step": 21447 + }, + { + "epoch": 42.896, + "grad_norm": 1.2820359468460083, + "learning_rate": 2e-05, + "loss": 0.06319527, + "step": 21448 + }, + { + "epoch": 42.898, + "grad_norm": 1.3315848112106323, + "learning_rate": 2e-05, + "loss": 0.05307559, + "step": 21449 + }, + { + "epoch": 42.9, + "grad_norm": 1.4399700164794922, + "learning_rate": 2e-05, + "loss": 0.0532106, + "step": 21450 + }, + { + "epoch": 42.902, + "grad_norm": 1.1903473138809204, + "learning_rate": 2e-05, + "loss": 0.03630826, + "step": 21451 + }, + { + "epoch": 42.904, + "grad_norm": 1.3730727434158325, + "learning_rate": 2e-05, + "loss": 0.04516565, + "step": 21452 + }, + { + "epoch": 42.906, + "grad_norm": 1.2712746858596802, + "learning_rate": 2e-05, + "loss": 0.04588057, + "step": 21453 + }, + { + "epoch": 42.908, + "grad_norm": 1.266530990600586, + "learning_rate": 2e-05, + "loss": 0.05008287, + "step": 21454 + }, + { + "epoch": 42.91, + "grad_norm": 1.0627597570419312, + "learning_rate": 2e-05, + "loss": 0.03308658, + "step": 21455 + }, + { + "epoch": 42.912, + "grad_norm": 1.8096920251846313, + "learning_rate": 2e-05, + "loss": 0.04926098, + "step": 21456 + }, + { + "epoch": 42.914, + "grad_norm": 1.3828647136688232, + "learning_rate": 2e-05, + "loss": 0.05100198, + "step": 21457 + }, + { + "epoch": 42.916, + "grad_norm": 1.3538262844085693, + "learning_rate": 2e-05, + "loss": 0.05305772, + "step": 21458 + }, + { + "epoch": 42.918, + "grad_norm": 1.327880620956421, + "learning_rate": 2e-05, + "loss": 0.05849855, + "step": 21459 + }, + { + "epoch": 42.92, + "grad_norm": 2.1985175609588623, + "learning_rate": 2e-05, + "loss": 0.04045462, + "step": 21460 + }, + { + "epoch": 42.922, + "grad_norm": 1.3796802759170532, + "learning_rate": 2e-05, + "loss": 0.05500578, + "step": 21461 + }, + { + "epoch": 42.924, + "grad_norm": 1.4442265033721924, + "learning_rate": 2e-05, + "loss": 0.06637794, + "step": 21462 + }, + { + "epoch": 42.926, + "grad_norm": 1.2436082363128662, + "learning_rate": 2e-05, + "loss": 0.05103221, + "step": 21463 + }, + { + "epoch": 42.928, + "grad_norm": 
1.0881454944610596, + "learning_rate": 2e-05, + "loss": 0.05290727, + "step": 21464 + }, + { + "epoch": 42.93, + "grad_norm": 1.2928407192230225, + "learning_rate": 2e-05, + "loss": 0.06411622, + "step": 21465 + }, + { + "epoch": 42.932, + "grad_norm": 3.3758764266967773, + "learning_rate": 2e-05, + "loss": 0.05194551, + "step": 21466 + }, + { + "epoch": 42.934, + "grad_norm": 1.0337151288986206, + "learning_rate": 2e-05, + "loss": 0.0436775, + "step": 21467 + }, + { + "epoch": 42.936, + "grad_norm": 1.7715908288955688, + "learning_rate": 2e-05, + "loss": 0.05398581, + "step": 21468 + }, + { + "epoch": 42.938, + "grad_norm": 1.2396409511566162, + "learning_rate": 2e-05, + "loss": 0.06144951, + "step": 21469 + }, + { + "epoch": 42.94, + "grad_norm": 1.1984061002731323, + "learning_rate": 2e-05, + "loss": 0.04839371, + "step": 21470 + }, + { + "epoch": 42.942, + "grad_norm": 1.130316138267517, + "learning_rate": 2e-05, + "loss": 0.05355977, + "step": 21471 + }, + { + "epoch": 42.944, + "grad_norm": 1.217844009399414, + "learning_rate": 2e-05, + "loss": 0.04676696, + "step": 21472 + }, + { + "epoch": 42.946, + "grad_norm": 1.0561518669128418, + "learning_rate": 2e-05, + "loss": 0.0412074, + "step": 21473 + }, + { + "epoch": 42.948, + "grad_norm": 1.0529133081436157, + "learning_rate": 2e-05, + "loss": 0.03983943, + "step": 21474 + }, + { + "epoch": 42.95, + "grad_norm": 1.1867733001708984, + "learning_rate": 2e-05, + "loss": 0.05299602, + "step": 21475 + }, + { + "epoch": 42.952, + "grad_norm": 1.284942865371704, + "learning_rate": 2e-05, + "loss": 0.04374795, + "step": 21476 + }, + { + "epoch": 42.954, + "grad_norm": 1.425246000289917, + "learning_rate": 2e-05, + "loss": 0.07214919, + "step": 21477 + }, + { + "epoch": 42.956, + "grad_norm": 1.3944209814071655, + "learning_rate": 2e-05, + "loss": 0.06095017, + "step": 21478 + }, + { + "epoch": 42.958, + "grad_norm": 1.2820805311203003, + "learning_rate": 2e-05, + "loss": 0.039196, + "step": 21479 + }, + { + "epoch": 42.96, + "grad_norm": 1.0585222244262695, + "learning_rate": 2e-05, + "loss": 0.04182017, + "step": 21480 + }, + { + "epoch": 42.962, + "grad_norm": 1.012815237045288, + "learning_rate": 2e-05, + "loss": 0.03696268, + "step": 21481 + }, + { + "epoch": 42.964, + "grad_norm": 1.024418592453003, + "learning_rate": 2e-05, + "loss": 0.03427555, + "step": 21482 + }, + { + "epoch": 42.966, + "grad_norm": 1.316025972366333, + "learning_rate": 2e-05, + "loss": 0.06875788, + "step": 21483 + }, + { + "epoch": 42.968, + "grad_norm": 1.1476069688796997, + "learning_rate": 2e-05, + "loss": 0.05424814, + "step": 21484 + }, + { + "epoch": 42.97, + "grad_norm": 1.8176919221878052, + "learning_rate": 2e-05, + "loss": 0.06173361, + "step": 21485 + }, + { + "epoch": 42.972, + "grad_norm": 1.1928600072860718, + "learning_rate": 2e-05, + "loss": 0.05281177, + "step": 21486 + }, + { + "epoch": 42.974, + "grad_norm": 0.9522770047187805, + "learning_rate": 2e-05, + "loss": 0.03397568, + "step": 21487 + }, + { + "epoch": 42.976, + "grad_norm": 1.3486912250518799, + "learning_rate": 2e-05, + "loss": 0.05722672, + "step": 21488 + }, + { + "epoch": 42.978, + "grad_norm": 1.3525702953338623, + "learning_rate": 2e-05, + "loss": 0.05092058, + "step": 21489 + }, + { + "epoch": 42.98, + "grad_norm": 1.5690195560455322, + "learning_rate": 2e-05, + "loss": 0.0533328, + "step": 21490 + }, + { + "epoch": 42.982, + "grad_norm": 1.4370635747909546, + "learning_rate": 2e-05, + "loss": 0.06423539, + "step": 21491 + }, + { + "epoch": 42.984, + "grad_norm": 
1.0913952589035034, + "learning_rate": 2e-05, + "loss": 0.03920151, + "step": 21492 + }, + { + "epoch": 42.986, + "grad_norm": 1.426624059677124, + "learning_rate": 2e-05, + "loss": 0.07058513, + "step": 21493 + }, + { + "epoch": 42.988, + "grad_norm": 1.0381181240081787, + "learning_rate": 2e-05, + "loss": 0.03804436, + "step": 21494 + }, + { + "epoch": 42.99, + "grad_norm": 0.9893444776535034, + "learning_rate": 2e-05, + "loss": 0.03246005, + "step": 21495 + }, + { + "epoch": 42.992, + "grad_norm": 1.4209214448928833, + "learning_rate": 2e-05, + "loss": 0.05637572, + "step": 21496 + }, + { + "epoch": 42.994, + "grad_norm": 1.1668082475662231, + "learning_rate": 2e-05, + "loss": 0.04799629, + "step": 21497 + }, + { + "epoch": 42.996, + "grad_norm": 1.103164792060852, + "learning_rate": 2e-05, + "loss": 0.04127581, + "step": 21498 + }, + { + "epoch": 42.998, + "grad_norm": 2.728940725326538, + "learning_rate": 2e-05, + "loss": 0.05009279, + "step": 21499 + }, + { + "epoch": 43.0, + "grad_norm": 1.1477065086364746, + "learning_rate": 2e-05, + "loss": 0.03524557, + "step": 21500 + }, + { + "epoch": 43.0, + "eval_performance": { + "AngleClassification_1": 1.0, + "AngleClassification_2": 1.0, + "AngleClassification_3": 0.9860279441117764, + "Equal_1": 1.0, + "Equal_2": 0.9860279441117764, + "Equal_3": 0.9920159680638723, + "LineComparison_1": 1.0, + "LineComparison_2": 0.998003992015968, + "LineComparison_3": 0.998003992015968, + "Parallel_1": 0.9959919839679359, + "Parallel_2": 0.9959919839679359, + "Parallel_3": 0.99, + "Perpendicular_1": 0.998, + "Perpendicular_2": 0.994, + "Perpendicular_3": 0.9018036072144289, + "PointLiesOnCircle_1": 0.9979959919839679, + "PointLiesOnCircle_2": 1.0, + "PointLiesOnCircle_3": 0.99, + "PointLiesOnLine_1": 0.9939879759519038, + "PointLiesOnLine_2": 0.9919839679358717, + "PointLiesOnLine_3": 0.9860279441117764 + }, + "eval_runtime": 319.7331, + "eval_samples_per_second": 32.84, + "eval_steps_per_second": 0.657, + "step": 21500 + }, + { + "epoch": 43.002, + "grad_norm": 1.0772887468338013, + "learning_rate": 2e-05, + "loss": 0.04629757, + "step": 21501 + }, + { + "epoch": 43.004, + "grad_norm": 1.0623633861541748, + "learning_rate": 2e-05, + "loss": 0.03715014, + "step": 21502 + }, + { + "epoch": 43.006, + "grad_norm": 1.1207759380340576, + "learning_rate": 2e-05, + "loss": 0.05135489, + "step": 21503 + }, + { + "epoch": 43.008, + "grad_norm": 1.6616121530532837, + "learning_rate": 2e-05, + "loss": 0.04297728, + "step": 21504 + }, + { + "epoch": 43.01, + "grad_norm": 1.3650739192962646, + "learning_rate": 2e-05, + "loss": 0.05856933, + "step": 21505 + }, + { + "epoch": 43.012, + "grad_norm": 1.8282989263534546, + "learning_rate": 2e-05, + "loss": 0.04885706, + "step": 21506 + }, + { + "epoch": 43.014, + "grad_norm": 1.361122727394104, + "learning_rate": 2e-05, + "loss": 0.05703342, + "step": 21507 + }, + { + "epoch": 43.016, + "grad_norm": 1.263135313987732, + "learning_rate": 2e-05, + "loss": 0.04379937, + "step": 21508 + }, + { + "epoch": 43.018, + "grad_norm": 1.3780148029327393, + "learning_rate": 2e-05, + "loss": 0.05682152, + "step": 21509 + }, + { + "epoch": 43.02, + "grad_norm": 1.3219780921936035, + "learning_rate": 2e-05, + "loss": 0.0547793, + "step": 21510 + }, + { + "epoch": 43.022, + "grad_norm": 1.2654283046722412, + "learning_rate": 2e-05, + "loss": 0.047323, + "step": 21511 + }, + { + "epoch": 43.024, + "grad_norm": 1.272671103477478, + "learning_rate": 2e-05, + "loss": 0.04931159, + "step": 21512 + }, + { + "epoch": 43.026, + "grad_norm": 
1.1386663913726807, + "learning_rate": 2e-05, + "loss": 0.03507162, + "step": 21513 + }, + { + "epoch": 43.028, + "grad_norm": 1.292199730873108, + "learning_rate": 2e-05, + "loss": 0.04890214, + "step": 21514 + }, + { + "epoch": 43.03, + "grad_norm": 1.5617351531982422, + "learning_rate": 2e-05, + "loss": 0.05029353, + "step": 21515 + }, + { + "epoch": 43.032, + "grad_norm": 2.036496639251709, + "learning_rate": 2e-05, + "loss": 0.07027332, + "step": 21516 + }, + { + "epoch": 43.034, + "grad_norm": 0.9926701784133911, + "learning_rate": 2e-05, + "loss": 0.03244736, + "step": 21517 + }, + { + "epoch": 43.036, + "grad_norm": 1.0814783573150635, + "learning_rate": 2e-05, + "loss": 0.04349055, + "step": 21518 + }, + { + "epoch": 43.038, + "grad_norm": 1.1061608791351318, + "learning_rate": 2e-05, + "loss": 0.04114666, + "step": 21519 + }, + { + "epoch": 43.04, + "grad_norm": 1.1178923845291138, + "learning_rate": 2e-05, + "loss": 0.04589276, + "step": 21520 + }, + { + "epoch": 43.042, + "grad_norm": 1.3732867240905762, + "learning_rate": 2e-05, + "loss": 0.0422243, + "step": 21521 + }, + { + "epoch": 43.044, + "grad_norm": 2.6664786338806152, + "learning_rate": 2e-05, + "loss": 0.06500043, + "step": 21522 + }, + { + "epoch": 43.046, + "grad_norm": 1.1350311040878296, + "learning_rate": 2e-05, + "loss": 0.03851867, + "step": 21523 + }, + { + "epoch": 43.048, + "grad_norm": 1.0867100954055786, + "learning_rate": 2e-05, + "loss": 0.04170727, + "step": 21524 + }, + { + "epoch": 43.05, + "grad_norm": 1.3278635740280151, + "learning_rate": 2e-05, + "loss": 0.05376638, + "step": 21525 + }, + { + "epoch": 43.052, + "grad_norm": 1.2718002796173096, + "learning_rate": 2e-05, + "loss": 0.06238486, + "step": 21526 + }, + { + "epoch": 43.054, + "grad_norm": 1.2181652784347534, + "learning_rate": 2e-05, + "loss": 0.05618663, + "step": 21527 + }, + { + "epoch": 43.056, + "grad_norm": 1.2053866386413574, + "learning_rate": 2e-05, + "loss": 0.05018714, + "step": 21528 + }, + { + "epoch": 43.058, + "grad_norm": 1.7344564199447632, + "learning_rate": 2e-05, + "loss": 0.05830881, + "step": 21529 + }, + { + "epoch": 43.06, + "grad_norm": 1.2700293064117432, + "learning_rate": 2e-05, + "loss": 0.05242781, + "step": 21530 + }, + { + "epoch": 43.062, + "grad_norm": 1.3638324737548828, + "learning_rate": 2e-05, + "loss": 0.05736677, + "step": 21531 + }, + { + "epoch": 43.064, + "grad_norm": 1.4970639944076538, + "learning_rate": 2e-05, + "loss": 0.07066285, + "step": 21532 + }, + { + "epoch": 43.066, + "grad_norm": 1.4570978879928589, + "learning_rate": 2e-05, + "loss": 0.05045886, + "step": 21533 + }, + { + "epoch": 43.068, + "grad_norm": 1.0011194944381714, + "learning_rate": 2e-05, + "loss": 0.04105179, + "step": 21534 + }, + { + "epoch": 43.07, + "grad_norm": 1.718483805656433, + "learning_rate": 2e-05, + "loss": 0.0531066, + "step": 21535 + }, + { + "epoch": 43.072, + "grad_norm": 1.9208921194076538, + "learning_rate": 2e-05, + "loss": 0.07216385, + "step": 21536 + }, + { + "epoch": 43.074, + "grad_norm": 1.464358925819397, + "learning_rate": 2e-05, + "loss": 0.05128159, + "step": 21537 + }, + { + "epoch": 43.076, + "grad_norm": 1.617860198020935, + "learning_rate": 2e-05, + "loss": 0.05280685, + "step": 21538 + }, + { + "epoch": 43.078, + "grad_norm": 1.2591980695724487, + "learning_rate": 2e-05, + "loss": 0.03685856, + "step": 21539 + }, + { + "epoch": 43.08, + "grad_norm": 1.155810832977295, + "learning_rate": 2e-05, + "loss": 0.04095223, + "step": 21540 + }, + { + "epoch": 43.082, + "grad_norm": 
1.4928919076919556, + "learning_rate": 2e-05, + "loss": 0.0549767, + "step": 21541 + }, + { + "epoch": 43.084, + "grad_norm": 1.7492938041687012, + "learning_rate": 2e-05, + "loss": 0.06689818, + "step": 21542 + }, + { + "epoch": 43.086, + "grad_norm": 1.3056284189224243, + "learning_rate": 2e-05, + "loss": 0.05678186, + "step": 21543 + }, + { + "epoch": 43.088, + "grad_norm": 1.1400055885314941, + "learning_rate": 2e-05, + "loss": 0.05558588, + "step": 21544 + }, + { + "epoch": 43.09, + "grad_norm": 1.2211138010025024, + "learning_rate": 2e-05, + "loss": 0.0515589, + "step": 21545 + }, + { + "epoch": 43.092, + "grad_norm": 1.1707724332809448, + "learning_rate": 2e-05, + "loss": 0.03701507, + "step": 21546 + }, + { + "epoch": 43.094, + "grad_norm": 1.9183954000473022, + "learning_rate": 2e-05, + "loss": 0.06833592, + "step": 21547 + }, + { + "epoch": 43.096, + "grad_norm": 1.1282914876937866, + "learning_rate": 2e-05, + "loss": 0.04579955, + "step": 21548 + }, + { + "epoch": 43.098, + "grad_norm": 1.173416256904602, + "learning_rate": 2e-05, + "loss": 0.0542321, + "step": 21549 + }, + { + "epoch": 43.1, + "grad_norm": 1.2352608442306519, + "learning_rate": 2e-05, + "loss": 0.05904563, + "step": 21550 + }, + { + "epoch": 43.102, + "grad_norm": 1.1427820920944214, + "learning_rate": 2e-05, + "loss": 0.04212008, + "step": 21551 + }, + { + "epoch": 43.104, + "grad_norm": 1.5755908489227295, + "learning_rate": 2e-05, + "loss": 0.04956923, + "step": 21552 + }, + { + "epoch": 43.106, + "grad_norm": 1.5542879104614258, + "learning_rate": 2e-05, + "loss": 0.06011878, + "step": 21553 + }, + { + "epoch": 43.108, + "grad_norm": 1.389430284500122, + "learning_rate": 2e-05, + "loss": 0.04906828, + "step": 21554 + }, + { + "epoch": 43.11, + "grad_norm": 1.4012058973312378, + "learning_rate": 2e-05, + "loss": 0.06038637, + "step": 21555 + }, + { + "epoch": 43.112, + "grad_norm": 3.3362956047058105, + "learning_rate": 2e-05, + "loss": 0.05289589, + "step": 21556 + }, + { + "epoch": 43.114, + "grad_norm": 1.240237832069397, + "learning_rate": 2e-05, + "loss": 0.04178648, + "step": 21557 + }, + { + "epoch": 43.116, + "grad_norm": 1.0733537673950195, + "learning_rate": 2e-05, + "loss": 0.0356668, + "step": 21558 + }, + { + "epoch": 43.118, + "grad_norm": 1.472996711730957, + "learning_rate": 2e-05, + "loss": 0.04951042, + "step": 21559 + }, + { + "epoch": 43.12, + "grad_norm": 1.41386878490448, + "learning_rate": 2e-05, + "loss": 0.05777404, + "step": 21560 + }, + { + "epoch": 43.122, + "grad_norm": 1.279788851737976, + "learning_rate": 2e-05, + "loss": 0.04153184, + "step": 21561 + }, + { + "epoch": 43.124, + "grad_norm": 1.375981092453003, + "learning_rate": 2e-05, + "loss": 0.05480566, + "step": 21562 + }, + { + "epoch": 43.126, + "grad_norm": 1.1689554452896118, + "learning_rate": 2e-05, + "loss": 0.03462553, + "step": 21563 + }, + { + "epoch": 43.128, + "grad_norm": 1.4414396286010742, + "learning_rate": 2e-05, + "loss": 0.05469351, + "step": 21564 + }, + { + "epoch": 43.13, + "grad_norm": 1.1081470251083374, + "learning_rate": 2e-05, + "loss": 0.0445482, + "step": 21565 + }, + { + "epoch": 43.132, + "grad_norm": 1.3130306005477905, + "learning_rate": 2e-05, + "loss": 0.03380114, + "step": 21566 + }, + { + "epoch": 43.134, + "grad_norm": 1.2124284505844116, + "learning_rate": 2e-05, + "loss": 0.0559901, + "step": 21567 + }, + { + "epoch": 43.136, + "grad_norm": 1.755231499671936, + "learning_rate": 2e-05, + "loss": 0.05166025, + "step": 21568 + }, + { + "epoch": 43.138, + "grad_norm": 1.3129682540893555, 
+ "learning_rate": 2e-05, + "loss": 0.05173158, + "step": 21569 + }, + { + "epoch": 43.14, + "grad_norm": 1.3563148975372314, + "learning_rate": 2e-05, + "loss": 0.05071842, + "step": 21570 + }, + { + "epoch": 43.142, + "grad_norm": 1.0146980285644531, + "learning_rate": 2e-05, + "loss": 0.04583877, + "step": 21571 + }, + { + "epoch": 43.144, + "grad_norm": 1.3281503915786743, + "learning_rate": 2e-05, + "loss": 0.04625765, + "step": 21572 + }, + { + "epoch": 43.146, + "grad_norm": 1.9704128503799438, + "learning_rate": 2e-05, + "loss": 0.04025682, + "step": 21573 + }, + { + "epoch": 43.148, + "grad_norm": 1.2888715267181396, + "learning_rate": 2e-05, + "loss": 0.05024606, + "step": 21574 + }, + { + "epoch": 43.15, + "grad_norm": 1.2609823942184448, + "learning_rate": 2e-05, + "loss": 0.04634705, + "step": 21575 + }, + { + "epoch": 43.152, + "grad_norm": 1.2393063306808472, + "learning_rate": 2e-05, + "loss": 0.05042982, + "step": 21576 + }, + { + "epoch": 43.154, + "grad_norm": 1.1345429420471191, + "learning_rate": 2e-05, + "loss": 0.02828003, + "step": 21577 + }, + { + "epoch": 43.156, + "grad_norm": 1.1094892024993896, + "learning_rate": 2e-05, + "loss": 0.04609562, + "step": 21578 + }, + { + "epoch": 43.158, + "grad_norm": 1.19045889377594, + "learning_rate": 2e-05, + "loss": 0.05094993, + "step": 21579 + }, + { + "epoch": 43.16, + "grad_norm": 1.1686400175094604, + "learning_rate": 2e-05, + "loss": 0.03705244, + "step": 21580 + }, + { + "epoch": 43.162, + "grad_norm": 1.0274404287338257, + "learning_rate": 2e-05, + "loss": 0.03788059, + "step": 21581 + }, + { + "epoch": 43.164, + "grad_norm": 0.9964593052864075, + "learning_rate": 2e-05, + "loss": 0.03574125, + "step": 21582 + }, + { + "epoch": 43.166, + "grad_norm": 1.8188575506210327, + "learning_rate": 2e-05, + "loss": 0.05399974, + "step": 21583 + }, + { + "epoch": 43.168, + "grad_norm": 2.3859317302703857, + "learning_rate": 2e-05, + "loss": 0.04840841, + "step": 21584 + }, + { + "epoch": 43.17, + "grad_norm": 1.4382333755493164, + "learning_rate": 2e-05, + "loss": 0.05152167, + "step": 21585 + }, + { + "epoch": 43.172, + "grad_norm": 1.2169090509414673, + "learning_rate": 2e-05, + "loss": 0.04776974, + "step": 21586 + }, + { + "epoch": 43.174, + "grad_norm": 1.3083583116531372, + "learning_rate": 2e-05, + "loss": 0.05877044, + "step": 21587 + }, + { + "epoch": 43.176, + "grad_norm": 1.264892578125, + "learning_rate": 2e-05, + "loss": 0.04724294, + "step": 21588 + }, + { + "epoch": 43.178, + "grad_norm": 1.7333793640136719, + "learning_rate": 2e-05, + "loss": 0.05662074, + "step": 21589 + }, + { + "epoch": 43.18, + "grad_norm": 1.1243252754211426, + "learning_rate": 2e-05, + "loss": 0.0430276, + "step": 21590 + }, + { + "epoch": 43.182, + "grad_norm": 1.2926504611968994, + "learning_rate": 2e-05, + "loss": 0.04414042, + "step": 21591 + }, + { + "epoch": 43.184, + "grad_norm": 1.2931801080703735, + "learning_rate": 2e-05, + "loss": 0.04339027, + "step": 21592 + }, + { + "epoch": 43.186, + "grad_norm": 1.1842244863510132, + "learning_rate": 2e-05, + "loss": 0.05153984, + "step": 21593 + }, + { + "epoch": 43.188, + "grad_norm": 1.176692008972168, + "learning_rate": 2e-05, + "loss": 0.04030944, + "step": 21594 + }, + { + "epoch": 43.19, + "grad_norm": 1.0769304037094116, + "learning_rate": 2e-05, + "loss": 0.03942397, + "step": 21595 + }, + { + "epoch": 43.192, + "grad_norm": 1.4324592351913452, + "learning_rate": 2e-05, + "loss": 0.04200315, + "step": 21596 + }, + { + "epoch": 43.194, + "grad_norm": 1.7846524715423584, + 
"learning_rate": 2e-05, + "loss": 0.05106489, + "step": 21597 + }, + { + "epoch": 43.196, + "grad_norm": 1.0251342058181763, + "learning_rate": 2e-05, + "loss": 0.03952106, + "step": 21598 + }, + { + "epoch": 43.198, + "grad_norm": 1.1627730131149292, + "learning_rate": 2e-05, + "loss": 0.02958023, + "step": 21599 + }, + { + "epoch": 43.2, + "grad_norm": 1.2952426671981812, + "learning_rate": 2e-05, + "loss": 0.04737274, + "step": 21600 + }, + { + "epoch": 43.202, + "grad_norm": 1.7284358739852905, + "learning_rate": 2e-05, + "loss": 0.06269697, + "step": 21601 + }, + { + "epoch": 43.204, + "grad_norm": 1.3142390251159668, + "learning_rate": 2e-05, + "loss": 0.0550441, + "step": 21602 + }, + { + "epoch": 43.206, + "grad_norm": 2.0693068504333496, + "learning_rate": 2e-05, + "loss": 0.05673666, + "step": 21603 + }, + { + "epoch": 43.208, + "grad_norm": 1.3296486139297485, + "learning_rate": 2e-05, + "loss": 0.04539488, + "step": 21604 + }, + { + "epoch": 43.21, + "grad_norm": 1.2718302011489868, + "learning_rate": 2e-05, + "loss": 0.04932036, + "step": 21605 + }, + { + "epoch": 43.212, + "grad_norm": 1.0521125793457031, + "learning_rate": 2e-05, + "loss": 0.04189048, + "step": 21606 + }, + { + "epoch": 43.214, + "grad_norm": 1.1329081058502197, + "learning_rate": 2e-05, + "loss": 0.04553405, + "step": 21607 + }, + { + "epoch": 43.216, + "grad_norm": 1.156334400177002, + "learning_rate": 2e-05, + "loss": 0.05069153, + "step": 21608 + }, + { + "epoch": 43.218, + "grad_norm": 1.0390546321868896, + "learning_rate": 2e-05, + "loss": 0.04528834, + "step": 21609 + }, + { + "epoch": 43.22, + "grad_norm": 0.9134246110916138, + "learning_rate": 2e-05, + "loss": 0.02971689, + "step": 21610 + }, + { + "epoch": 43.222, + "grad_norm": 1.1092662811279297, + "learning_rate": 2e-05, + "loss": 0.03292485, + "step": 21611 + }, + { + "epoch": 43.224, + "grad_norm": 2.882272958755493, + "learning_rate": 2e-05, + "loss": 0.05360059, + "step": 21612 + }, + { + "epoch": 43.226, + "grad_norm": 1.372489094734192, + "learning_rate": 2e-05, + "loss": 0.04475276, + "step": 21613 + }, + { + "epoch": 43.228, + "grad_norm": 1.0251941680908203, + "learning_rate": 2e-05, + "loss": 0.04171642, + "step": 21614 + }, + { + "epoch": 43.23, + "grad_norm": 1.3642363548278809, + "learning_rate": 2e-05, + "loss": 0.05339691, + "step": 21615 + }, + { + "epoch": 43.232, + "grad_norm": 0.9985992908477783, + "learning_rate": 2e-05, + "loss": 0.04003511, + "step": 21616 + }, + { + "epoch": 43.234, + "grad_norm": 1.0361459255218506, + "learning_rate": 2e-05, + "loss": 0.04068734, + "step": 21617 + }, + { + "epoch": 43.236, + "grad_norm": 1.290665864944458, + "learning_rate": 2e-05, + "loss": 0.05313302, + "step": 21618 + }, + { + "epoch": 43.238, + "grad_norm": 1.1882466077804565, + "learning_rate": 2e-05, + "loss": 0.04230958, + "step": 21619 + }, + { + "epoch": 43.24, + "grad_norm": 1.1811344623565674, + "learning_rate": 2e-05, + "loss": 0.04467118, + "step": 21620 + }, + { + "epoch": 43.242, + "grad_norm": 1.3368701934814453, + "learning_rate": 2e-05, + "loss": 0.04908449, + "step": 21621 + }, + { + "epoch": 43.244, + "grad_norm": 1.071096658706665, + "learning_rate": 2e-05, + "loss": 0.04370485, + "step": 21622 + }, + { + "epoch": 43.246, + "grad_norm": 1.1309154033660889, + "learning_rate": 2e-05, + "loss": 0.04717417, + "step": 21623 + }, + { + "epoch": 43.248, + "grad_norm": 2.024557590484619, + "learning_rate": 2e-05, + "loss": 0.04132883, + "step": 21624 + }, + { + "epoch": 43.25, + "grad_norm": 1.165766954421997, + 
"learning_rate": 2e-05, + "loss": 0.0444683, + "step": 21625 + }, + { + "epoch": 43.252, + "grad_norm": 1.0271003246307373, + "learning_rate": 2e-05, + "loss": 0.04147922, + "step": 21626 + }, + { + "epoch": 43.254, + "grad_norm": 1.3019037246704102, + "learning_rate": 2e-05, + "loss": 0.04356762, + "step": 21627 + }, + { + "epoch": 43.256, + "grad_norm": 1.1984200477600098, + "learning_rate": 2e-05, + "loss": 0.04804221, + "step": 21628 + }, + { + "epoch": 43.258, + "grad_norm": 0.9510525465011597, + "learning_rate": 2e-05, + "loss": 0.03916862, + "step": 21629 + }, + { + "epoch": 43.26, + "grad_norm": 1.4483345746994019, + "learning_rate": 2e-05, + "loss": 0.05215236, + "step": 21630 + }, + { + "epoch": 43.262, + "grad_norm": 1.4785760641098022, + "learning_rate": 2e-05, + "loss": 0.05577825, + "step": 21631 + }, + { + "epoch": 43.264, + "grad_norm": 2.0617661476135254, + "learning_rate": 2e-05, + "loss": 0.05145205, + "step": 21632 + }, + { + "epoch": 43.266, + "grad_norm": 1.1790543794631958, + "learning_rate": 2e-05, + "loss": 0.04180574, + "step": 21633 + }, + { + "epoch": 43.268, + "grad_norm": 1.2387884855270386, + "learning_rate": 2e-05, + "loss": 0.04779726, + "step": 21634 + }, + { + "epoch": 43.27, + "grad_norm": 1.2821975946426392, + "learning_rate": 2e-05, + "loss": 0.04490368, + "step": 21635 + }, + { + "epoch": 43.272, + "grad_norm": 1.1714630126953125, + "learning_rate": 2e-05, + "loss": 0.04618052, + "step": 21636 + }, + { + "epoch": 43.274, + "grad_norm": 1.2499911785125732, + "learning_rate": 2e-05, + "loss": 0.04752894, + "step": 21637 + }, + { + "epoch": 43.276, + "grad_norm": 1.9510254859924316, + "learning_rate": 2e-05, + "loss": 0.0413634, + "step": 21638 + }, + { + "epoch": 43.278, + "grad_norm": 0.9319188594818115, + "learning_rate": 2e-05, + "loss": 0.03260102, + "step": 21639 + }, + { + "epoch": 43.28, + "grad_norm": 1.2887738943099976, + "learning_rate": 2e-05, + "loss": 0.05729091, + "step": 21640 + }, + { + "epoch": 43.282, + "grad_norm": 0.9247579574584961, + "learning_rate": 2e-05, + "loss": 0.03868435, + "step": 21641 + }, + { + "epoch": 43.284, + "grad_norm": 1.2112103700637817, + "learning_rate": 2e-05, + "loss": 0.04973643, + "step": 21642 + }, + { + "epoch": 43.286, + "grad_norm": 1.4075590372085571, + "learning_rate": 2e-05, + "loss": 0.0373517, + "step": 21643 + }, + { + "epoch": 43.288, + "grad_norm": 1.0551819801330566, + "learning_rate": 2e-05, + "loss": 0.04350845, + "step": 21644 + }, + { + "epoch": 43.29, + "grad_norm": 1.4103049039840698, + "learning_rate": 2e-05, + "loss": 0.0423544, + "step": 21645 + }, + { + "epoch": 43.292, + "grad_norm": 1.1112929582595825, + "learning_rate": 2e-05, + "loss": 0.03513883, + "step": 21646 + }, + { + "epoch": 43.294, + "grad_norm": 0.9542357325553894, + "learning_rate": 2e-05, + "loss": 0.02830679, + "step": 21647 + }, + { + "epoch": 43.296, + "grad_norm": 1.5825624465942383, + "learning_rate": 2e-05, + "loss": 0.04779094, + "step": 21648 + }, + { + "epoch": 43.298, + "grad_norm": 1.1777398586273193, + "learning_rate": 2e-05, + "loss": 0.0480368, + "step": 21649 + }, + { + "epoch": 43.3, + "grad_norm": 1.8794219493865967, + "learning_rate": 2e-05, + "loss": 0.05965583, + "step": 21650 + }, + { + "epoch": 43.302, + "grad_norm": 1.262937307357788, + "learning_rate": 2e-05, + "loss": 0.06756496, + "step": 21651 + }, + { + "epoch": 43.304, + "grad_norm": 2.5823323726654053, + "learning_rate": 2e-05, + "loss": 0.08038095, + "step": 21652 + }, + { + "epoch": 43.306, + "grad_norm": 1.1853344440460205, + 
"learning_rate": 2e-05, + "loss": 0.03952831, + "step": 21653 + }, + { + "epoch": 43.308, + "grad_norm": 1.780608892440796, + "learning_rate": 2e-05, + "loss": 0.0625229, + "step": 21654 + }, + { + "epoch": 43.31, + "grad_norm": 1.1544749736785889, + "learning_rate": 2e-05, + "loss": 0.04620733, + "step": 21655 + }, + { + "epoch": 43.312, + "grad_norm": 1.3556686639785767, + "learning_rate": 2e-05, + "loss": 0.04416988, + "step": 21656 + }, + { + "epoch": 43.314, + "grad_norm": 1.0415023565292358, + "learning_rate": 2e-05, + "loss": 0.04206276, + "step": 21657 + }, + { + "epoch": 43.316, + "grad_norm": 1.2264755964279175, + "learning_rate": 2e-05, + "loss": 0.0508853, + "step": 21658 + }, + { + "epoch": 43.318, + "grad_norm": 1.3368821144104004, + "learning_rate": 2e-05, + "loss": 0.05324509, + "step": 21659 + }, + { + "epoch": 43.32, + "grad_norm": 1.3012473583221436, + "learning_rate": 2e-05, + "loss": 0.05128824, + "step": 21660 + }, + { + "epoch": 43.322, + "grad_norm": 0.9434711933135986, + "learning_rate": 2e-05, + "loss": 0.0308494, + "step": 21661 + }, + { + "epoch": 43.324, + "grad_norm": 1.6305221319198608, + "learning_rate": 2e-05, + "loss": 0.05408993, + "step": 21662 + }, + { + "epoch": 43.326, + "grad_norm": 1.6955217123031616, + "learning_rate": 2e-05, + "loss": 0.07358453, + "step": 21663 + }, + { + "epoch": 43.328, + "grad_norm": 1.4787384271621704, + "learning_rate": 2e-05, + "loss": 0.05799458, + "step": 21664 + }, + { + "epoch": 43.33, + "grad_norm": 1.2819902896881104, + "learning_rate": 2e-05, + "loss": 0.040464, + "step": 21665 + }, + { + "epoch": 43.332, + "grad_norm": 1.4899117946624756, + "learning_rate": 2e-05, + "loss": 0.05940196, + "step": 21666 + }, + { + "epoch": 43.334, + "grad_norm": 1.644298791885376, + "learning_rate": 2e-05, + "loss": 0.06026432, + "step": 21667 + }, + { + "epoch": 43.336, + "grad_norm": 1.2563204765319824, + "learning_rate": 2e-05, + "loss": 0.05732097, + "step": 21668 + }, + { + "epoch": 43.338, + "grad_norm": 1.1418761014938354, + "learning_rate": 2e-05, + "loss": 0.04549374, + "step": 21669 + }, + { + "epoch": 43.34, + "grad_norm": 1.0015463829040527, + "learning_rate": 2e-05, + "loss": 0.03093121, + "step": 21670 + }, + { + "epoch": 43.342, + "grad_norm": 1.0512152910232544, + "learning_rate": 2e-05, + "loss": 0.04237778, + "step": 21671 + }, + { + "epoch": 43.344, + "grad_norm": 1.1289246082305908, + "learning_rate": 2e-05, + "loss": 0.04607947, + "step": 21672 + }, + { + "epoch": 43.346, + "grad_norm": 1.371304988861084, + "learning_rate": 2e-05, + "loss": 0.05412474, + "step": 21673 + }, + { + "epoch": 43.348, + "grad_norm": 1.2538726329803467, + "learning_rate": 2e-05, + "loss": 0.0535718, + "step": 21674 + }, + { + "epoch": 43.35, + "grad_norm": 1.3319662809371948, + "learning_rate": 2e-05, + "loss": 0.04659107, + "step": 21675 + }, + { + "epoch": 43.352, + "grad_norm": 1.1024374961853027, + "learning_rate": 2e-05, + "loss": 0.03754035, + "step": 21676 + }, + { + "epoch": 43.354, + "grad_norm": 1.2166569232940674, + "learning_rate": 2e-05, + "loss": 0.06684306, + "step": 21677 + }, + { + "epoch": 43.356, + "grad_norm": 2.1136088371276855, + "learning_rate": 2e-05, + "loss": 0.04331131, + "step": 21678 + }, + { + "epoch": 43.358, + "grad_norm": 1.2272017002105713, + "learning_rate": 2e-05, + "loss": 0.04815719, + "step": 21679 + }, + { + "epoch": 43.36, + "grad_norm": 1.0744694471359253, + "learning_rate": 2e-05, + "loss": 0.04111854, + "step": 21680 + }, + { + "epoch": 43.362, + "grad_norm": 1.242724061012268, + 
"learning_rate": 2e-05, + "loss": 0.06325316, + "step": 21681 + }, + { + "epoch": 43.364, + "grad_norm": 1.985144853591919, + "learning_rate": 2e-05, + "loss": 0.04359365, + "step": 21682 + }, + { + "epoch": 43.366, + "grad_norm": 1.1064767837524414, + "learning_rate": 2e-05, + "loss": 0.05320355, + "step": 21683 + }, + { + "epoch": 43.368, + "grad_norm": 1.189843773841858, + "learning_rate": 2e-05, + "loss": 0.04457613, + "step": 21684 + }, + { + "epoch": 43.37, + "grad_norm": 1.203808069229126, + "learning_rate": 2e-05, + "loss": 0.05187619, + "step": 21685 + }, + { + "epoch": 43.372, + "grad_norm": 1.1101083755493164, + "learning_rate": 2e-05, + "loss": 0.04831094, + "step": 21686 + }, + { + "epoch": 43.374, + "grad_norm": 1.428242564201355, + "learning_rate": 2e-05, + "loss": 0.06388721, + "step": 21687 + }, + { + "epoch": 43.376, + "grad_norm": 1.2503690719604492, + "learning_rate": 2e-05, + "loss": 0.04847308, + "step": 21688 + }, + { + "epoch": 43.378, + "grad_norm": 1.1128005981445312, + "learning_rate": 2e-05, + "loss": 0.03843355, + "step": 21689 + }, + { + "epoch": 43.38, + "grad_norm": 0.9491602778434753, + "learning_rate": 2e-05, + "loss": 0.0259631, + "step": 21690 + }, + { + "epoch": 43.382, + "grad_norm": 1.233364224433899, + "learning_rate": 2e-05, + "loss": 0.0568186, + "step": 21691 + }, + { + "epoch": 43.384, + "grad_norm": 0.9893776774406433, + "learning_rate": 2e-05, + "loss": 0.04174522, + "step": 21692 + }, + { + "epoch": 43.386, + "grad_norm": 1.187612771987915, + "learning_rate": 2e-05, + "loss": 0.0390992, + "step": 21693 + }, + { + "epoch": 43.388, + "grad_norm": 1.3221521377563477, + "learning_rate": 2e-05, + "loss": 0.05131374, + "step": 21694 + }, + { + "epoch": 43.39, + "grad_norm": 1.2869195938110352, + "learning_rate": 2e-05, + "loss": 0.05698796, + "step": 21695 + }, + { + "epoch": 43.392, + "grad_norm": 1.5506631135940552, + "learning_rate": 2e-05, + "loss": 0.05239779, + "step": 21696 + }, + { + "epoch": 43.394, + "grad_norm": 1.1303107738494873, + "learning_rate": 2e-05, + "loss": 0.0321939, + "step": 21697 + }, + { + "epoch": 43.396, + "grad_norm": 1.2023539543151855, + "learning_rate": 2e-05, + "loss": 0.04435757, + "step": 21698 + }, + { + "epoch": 43.398, + "grad_norm": 1.644532322883606, + "learning_rate": 2e-05, + "loss": 0.05108094, + "step": 21699 + }, + { + "epoch": 43.4, + "grad_norm": 0.9367179274559021, + "learning_rate": 2e-05, + "loss": 0.03235884, + "step": 21700 + }, + { + "epoch": 43.402, + "grad_norm": 1.0146784782409668, + "learning_rate": 2e-05, + "loss": 0.03669849, + "step": 21701 + }, + { + "epoch": 43.404, + "grad_norm": 1.1251667737960815, + "learning_rate": 2e-05, + "loss": 0.04423268, + "step": 21702 + }, + { + "epoch": 43.406, + "grad_norm": 1.0853943824768066, + "learning_rate": 2e-05, + "loss": 0.04164097, + "step": 21703 + }, + { + "epoch": 43.408, + "grad_norm": 1.3860262632369995, + "learning_rate": 2e-05, + "loss": 0.06020261, + "step": 21704 + }, + { + "epoch": 43.41, + "grad_norm": 1.621960163116455, + "learning_rate": 2e-05, + "loss": 0.05325014, + "step": 21705 + }, + { + "epoch": 43.412, + "grad_norm": 1.4392348527908325, + "learning_rate": 2e-05, + "loss": 0.03889326, + "step": 21706 + }, + { + "epoch": 43.414, + "grad_norm": 1.0264551639556885, + "learning_rate": 2e-05, + "loss": 0.03898916, + "step": 21707 + }, + { + "epoch": 43.416, + "grad_norm": 1.2684388160705566, + "learning_rate": 2e-05, + "loss": 0.05191773, + "step": 21708 + }, + { + "epoch": 43.418, + "grad_norm": 1.1081558465957642, + "learning_rate": 
2e-05, + "loss": 0.04196195, + "step": 21709 + }, + { + "epoch": 43.42, + "grad_norm": 1.1390728950500488, + "learning_rate": 2e-05, + "loss": 0.03881846, + "step": 21710 + }, + { + "epoch": 43.422, + "grad_norm": 1.3561352491378784, + "learning_rate": 2e-05, + "loss": 0.05348574, + "step": 21711 + }, + { + "epoch": 43.424, + "grad_norm": 1.4882853031158447, + "learning_rate": 2e-05, + "loss": 0.06980874, + "step": 21712 + }, + { + "epoch": 43.426, + "grad_norm": 1.37758469581604, + "learning_rate": 2e-05, + "loss": 0.06751367, + "step": 21713 + }, + { + "epoch": 43.428, + "grad_norm": 1.1444541215896606, + "learning_rate": 2e-05, + "loss": 0.04994627, + "step": 21714 + }, + { + "epoch": 43.43, + "grad_norm": 1.2537963390350342, + "learning_rate": 2e-05, + "loss": 0.05444419, + "step": 21715 + }, + { + "epoch": 43.432, + "grad_norm": 1.0901340246200562, + "learning_rate": 2e-05, + "loss": 0.05521618, + "step": 21716 + }, + { + "epoch": 43.434, + "grad_norm": 1.0437510013580322, + "learning_rate": 2e-05, + "loss": 0.0424891, + "step": 21717 + }, + { + "epoch": 43.436, + "grad_norm": 1.073522925376892, + "learning_rate": 2e-05, + "loss": 0.03688889, + "step": 21718 + }, + { + "epoch": 43.438, + "grad_norm": 1.129959225654602, + "learning_rate": 2e-05, + "loss": 0.04021513, + "step": 21719 + }, + { + "epoch": 43.44, + "grad_norm": 1.2731077671051025, + "learning_rate": 2e-05, + "loss": 0.0577286, + "step": 21720 + }, + { + "epoch": 43.442, + "grad_norm": 1.4466447830200195, + "learning_rate": 2e-05, + "loss": 0.06440642, + "step": 21721 + }, + { + "epoch": 43.444, + "grad_norm": 1.0096185207366943, + "learning_rate": 2e-05, + "loss": 0.03558685, + "step": 21722 + }, + { + "epoch": 43.446, + "grad_norm": 1.1729049682617188, + "learning_rate": 2e-05, + "loss": 0.05121062, + "step": 21723 + }, + { + "epoch": 43.448, + "grad_norm": 1.3237818479537964, + "learning_rate": 2e-05, + "loss": 0.05145048, + "step": 21724 + }, + { + "epoch": 43.45, + "grad_norm": 1.4835050106048584, + "learning_rate": 2e-05, + "loss": 0.04858468, + "step": 21725 + }, + { + "epoch": 43.452, + "grad_norm": 1.2140564918518066, + "learning_rate": 2e-05, + "loss": 0.04449757, + "step": 21726 + }, + { + "epoch": 43.454, + "grad_norm": 1.1771492958068848, + "learning_rate": 2e-05, + "loss": 0.05757561, + "step": 21727 + }, + { + "epoch": 43.456, + "grad_norm": 1.0299850702285767, + "learning_rate": 2e-05, + "loss": 0.03556947, + "step": 21728 + }, + { + "epoch": 43.458, + "grad_norm": 1.5381124019622803, + "learning_rate": 2e-05, + "loss": 0.06413664, + "step": 21729 + }, + { + "epoch": 43.46, + "grad_norm": 2.5248165130615234, + "learning_rate": 2e-05, + "loss": 0.05302243, + "step": 21730 + }, + { + "epoch": 43.462, + "grad_norm": 1.3111073970794678, + "learning_rate": 2e-05, + "loss": 0.04211747, + "step": 21731 + }, + { + "epoch": 43.464, + "grad_norm": 1.2731165885925293, + "learning_rate": 2e-05, + "loss": 0.05004764, + "step": 21732 + }, + { + "epoch": 43.466, + "grad_norm": 0.9703748822212219, + "learning_rate": 2e-05, + "loss": 0.03877125, + "step": 21733 + }, + { + "epoch": 43.468, + "grad_norm": 1.021268367767334, + "learning_rate": 2e-05, + "loss": 0.03762333, + "step": 21734 + }, + { + "epoch": 43.47, + "grad_norm": 1.2705427408218384, + "learning_rate": 2e-05, + "loss": 0.05988788, + "step": 21735 + }, + { + "epoch": 43.472, + "grad_norm": 1.2365317344665527, + "learning_rate": 2e-05, + "loss": 0.05386936, + "step": 21736 + }, + { + "epoch": 43.474, + "grad_norm": 1.3400436639785767, + "learning_rate": 2e-05, + 
"loss": 0.04573278, + "step": 21737 + }, + { + "epoch": 43.476, + "grad_norm": 1.3688085079193115, + "learning_rate": 2e-05, + "loss": 0.05334489, + "step": 21738 + }, + { + "epoch": 43.478, + "grad_norm": 1.086027979850769, + "learning_rate": 2e-05, + "loss": 0.0416183, + "step": 21739 + }, + { + "epoch": 43.48, + "grad_norm": 1.3055415153503418, + "learning_rate": 2e-05, + "loss": 0.03842257, + "step": 21740 + }, + { + "epoch": 43.482, + "grad_norm": 1.111611247062683, + "learning_rate": 2e-05, + "loss": 0.0508645, + "step": 21741 + }, + { + "epoch": 43.484, + "grad_norm": 1.194685935974121, + "learning_rate": 2e-05, + "loss": 0.03484659, + "step": 21742 + }, + { + "epoch": 43.486, + "grad_norm": 1.132163405418396, + "learning_rate": 2e-05, + "loss": 0.03633229, + "step": 21743 + }, + { + "epoch": 43.488, + "grad_norm": 1.3565982580184937, + "learning_rate": 2e-05, + "loss": 0.04545782, + "step": 21744 + }, + { + "epoch": 43.49, + "grad_norm": 1.4622793197631836, + "learning_rate": 2e-05, + "loss": 0.05259608, + "step": 21745 + }, + { + "epoch": 43.492, + "grad_norm": 1.0451035499572754, + "learning_rate": 2e-05, + "loss": 0.04075598, + "step": 21746 + }, + { + "epoch": 43.494, + "grad_norm": 2.7072818279266357, + "learning_rate": 2e-05, + "loss": 0.06001053, + "step": 21747 + }, + { + "epoch": 43.496, + "grad_norm": 1.031333088874817, + "learning_rate": 2e-05, + "loss": 0.03389198, + "step": 21748 + }, + { + "epoch": 43.498, + "grad_norm": 0.9913927316665649, + "learning_rate": 2e-05, + "loss": 0.03781866, + "step": 21749 + }, + { + "epoch": 43.5, + "grad_norm": 1.303167462348938, + "learning_rate": 2e-05, + "loss": 0.03760751, + "step": 21750 + }, + { + "epoch": 43.502, + "grad_norm": 1.3275164365768433, + "learning_rate": 2e-05, + "loss": 0.04503237, + "step": 21751 + }, + { + "epoch": 43.504, + "grad_norm": 1.729905366897583, + "learning_rate": 2e-05, + "loss": 0.04633639, + "step": 21752 + }, + { + "epoch": 43.506, + "grad_norm": 1.1333032846450806, + "learning_rate": 2e-05, + "loss": 0.05070849, + "step": 21753 + }, + { + "epoch": 43.508, + "grad_norm": 1.3890432119369507, + "learning_rate": 2e-05, + "loss": 0.0517424, + "step": 21754 + }, + { + "epoch": 43.51, + "grad_norm": 1.8249478340148926, + "learning_rate": 2e-05, + "loss": 0.03821604, + "step": 21755 + }, + { + "epoch": 43.512, + "grad_norm": 1.1964874267578125, + "learning_rate": 2e-05, + "loss": 0.04850964, + "step": 21756 + }, + { + "epoch": 43.514, + "grad_norm": 1.322390079498291, + "learning_rate": 2e-05, + "loss": 0.04571023, + "step": 21757 + }, + { + "epoch": 43.516, + "grad_norm": 1.1808900833129883, + "learning_rate": 2e-05, + "loss": 0.05330661, + "step": 21758 + }, + { + "epoch": 43.518, + "grad_norm": 1.255068063735962, + "learning_rate": 2e-05, + "loss": 0.05100637, + "step": 21759 + }, + { + "epoch": 43.52, + "grad_norm": 1.2289645671844482, + "learning_rate": 2e-05, + "loss": 0.04602298, + "step": 21760 + }, + { + "epoch": 43.522, + "grad_norm": 1.1310776472091675, + "learning_rate": 2e-05, + "loss": 0.05270489, + "step": 21761 + }, + { + "epoch": 43.524, + "grad_norm": 1.2839418649673462, + "learning_rate": 2e-05, + "loss": 0.04927288, + "step": 21762 + }, + { + "epoch": 43.526, + "grad_norm": 1.0740174055099487, + "learning_rate": 2e-05, + "loss": 0.03569305, + "step": 21763 + }, + { + "epoch": 43.528, + "grad_norm": 1.2840776443481445, + "learning_rate": 2e-05, + "loss": 0.0497875, + "step": 21764 + }, + { + "epoch": 43.53, + "grad_norm": 1.718205213546753, + "learning_rate": 2e-05, + "loss": 0.05506313, 
+ "step": 21765 + }, + { + "epoch": 43.532, + "grad_norm": 1.0014818906784058, + "learning_rate": 2e-05, + "loss": 0.03927164, + "step": 21766 + }, + { + "epoch": 43.534, + "grad_norm": 1.1391730308532715, + "learning_rate": 2e-05, + "loss": 0.03087741, + "step": 21767 + }, + { + "epoch": 43.536, + "grad_norm": 1.2874438762664795, + "learning_rate": 2e-05, + "loss": 0.05720982, + "step": 21768 + }, + { + "epoch": 43.538, + "grad_norm": 1.5402920246124268, + "learning_rate": 2e-05, + "loss": 0.05315606, + "step": 21769 + }, + { + "epoch": 43.54, + "grad_norm": 1.420612096786499, + "learning_rate": 2e-05, + "loss": 0.04444424, + "step": 21770 + }, + { + "epoch": 43.542, + "grad_norm": 0.9867546558380127, + "learning_rate": 2e-05, + "loss": 0.03263182, + "step": 21771 + }, + { + "epoch": 43.544, + "grad_norm": 1.4081987142562866, + "learning_rate": 2e-05, + "loss": 0.05344656, + "step": 21772 + }, + { + "epoch": 43.546, + "grad_norm": 1.277359962463379, + "learning_rate": 2e-05, + "loss": 0.05032256, + "step": 21773 + }, + { + "epoch": 43.548, + "grad_norm": 1.1443043947219849, + "learning_rate": 2e-05, + "loss": 0.04011863, + "step": 21774 + }, + { + "epoch": 43.55, + "grad_norm": 1.1632673740386963, + "learning_rate": 2e-05, + "loss": 0.03645647, + "step": 21775 + }, + { + "epoch": 43.552, + "grad_norm": 1.0224082469940186, + "learning_rate": 2e-05, + "loss": 0.02952418, + "step": 21776 + }, + { + "epoch": 43.554, + "grad_norm": 1.3022348880767822, + "learning_rate": 2e-05, + "loss": 0.05518476, + "step": 21777 + }, + { + "epoch": 43.556, + "grad_norm": 1.1476200819015503, + "learning_rate": 2e-05, + "loss": 0.03775834, + "step": 21778 + }, + { + "epoch": 43.558, + "grad_norm": 1.0009914636611938, + "learning_rate": 2e-05, + "loss": 0.03371383, + "step": 21779 + }, + { + "epoch": 43.56, + "grad_norm": 1.1276997327804565, + "learning_rate": 2e-05, + "loss": 0.0438572, + "step": 21780 + }, + { + "epoch": 43.562, + "grad_norm": 1.9971312284469604, + "learning_rate": 2e-05, + "loss": 0.05232399, + "step": 21781 + }, + { + "epoch": 43.564, + "grad_norm": 1.3514554500579834, + "learning_rate": 2e-05, + "loss": 0.0524235, + "step": 21782 + }, + { + "epoch": 43.566, + "grad_norm": 1.3473267555236816, + "learning_rate": 2e-05, + "loss": 0.05278003, + "step": 21783 + }, + { + "epoch": 43.568, + "grad_norm": 1.1983340978622437, + "learning_rate": 2e-05, + "loss": 0.05057519, + "step": 21784 + }, + { + "epoch": 43.57, + "grad_norm": 1.5899887084960938, + "learning_rate": 2e-05, + "loss": 0.03519575, + "step": 21785 + }, + { + "epoch": 43.572, + "grad_norm": 1.1169793605804443, + "learning_rate": 2e-05, + "loss": 0.04725049, + "step": 21786 + }, + { + "epoch": 43.574, + "grad_norm": 1.2387906312942505, + "learning_rate": 2e-05, + "loss": 0.03475452, + "step": 21787 + }, + { + "epoch": 43.576, + "grad_norm": 2.203519821166992, + "learning_rate": 2e-05, + "loss": 0.04944497, + "step": 21788 + }, + { + "epoch": 43.578, + "grad_norm": 1.1906425952911377, + "learning_rate": 2e-05, + "loss": 0.04174117, + "step": 21789 + }, + { + "epoch": 43.58, + "grad_norm": 1.4967057704925537, + "learning_rate": 2e-05, + "loss": 0.05349187, + "step": 21790 + }, + { + "epoch": 43.582, + "grad_norm": 1.9748947620391846, + "learning_rate": 2e-05, + "loss": 0.03838755, + "step": 21791 + }, + { + "epoch": 43.584, + "grad_norm": 1.223773717880249, + "learning_rate": 2e-05, + "loss": 0.03617196, + "step": 21792 + }, + { + "epoch": 43.586, + "grad_norm": 1.0227309465408325, + "learning_rate": 2e-05, + "loss": 0.04097284, + "step": 
21793 + }, + { + "epoch": 43.588, + "grad_norm": 1.2495092153549194, + "learning_rate": 2e-05, + "loss": 0.05023031, + "step": 21794 + }, + { + "epoch": 43.59, + "grad_norm": 1.443373203277588, + "learning_rate": 2e-05, + "loss": 0.05545416, + "step": 21795 + }, + { + "epoch": 43.592, + "grad_norm": 1.3429638147354126, + "learning_rate": 2e-05, + "loss": 0.04617237, + "step": 21796 + }, + { + "epoch": 43.594, + "grad_norm": 1.17878258228302, + "learning_rate": 2e-05, + "loss": 0.04778713, + "step": 21797 + }, + { + "epoch": 43.596, + "grad_norm": 1.4965510368347168, + "learning_rate": 2e-05, + "loss": 0.07662305, + "step": 21798 + }, + { + "epoch": 43.598, + "grad_norm": 1.0231486558914185, + "learning_rate": 2e-05, + "loss": 0.03974038, + "step": 21799 + }, + { + "epoch": 43.6, + "grad_norm": 1.3388340473175049, + "learning_rate": 2e-05, + "loss": 0.05395049, + "step": 21800 + }, + { + "epoch": 43.602, + "grad_norm": 1.15225088596344, + "learning_rate": 2e-05, + "loss": 0.05144007, + "step": 21801 + }, + { + "epoch": 43.604, + "grad_norm": 1.2700670957565308, + "learning_rate": 2e-05, + "loss": 0.05085467, + "step": 21802 + }, + { + "epoch": 43.606, + "grad_norm": 1.434041976928711, + "learning_rate": 2e-05, + "loss": 0.05072808, + "step": 21803 + }, + { + "epoch": 43.608, + "grad_norm": 1.410075306892395, + "learning_rate": 2e-05, + "loss": 0.04461277, + "step": 21804 + }, + { + "epoch": 43.61, + "grad_norm": 1.0994086265563965, + "learning_rate": 2e-05, + "loss": 0.03965208, + "step": 21805 + }, + { + "epoch": 43.612, + "grad_norm": 1.2478328943252563, + "learning_rate": 2e-05, + "loss": 0.05292454, + "step": 21806 + }, + { + "epoch": 43.614, + "grad_norm": 1.4595047235488892, + "learning_rate": 2e-05, + "loss": 0.05401939, + "step": 21807 + }, + { + "epoch": 43.616, + "grad_norm": 1.2957178354263306, + "learning_rate": 2e-05, + "loss": 0.05286802, + "step": 21808 + }, + { + "epoch": 43.618, + "grad_norm": 1.0460758209228516, + "learning_rate": 2e-05, + "loss": 0.03856381, + "step": 21809 + }, + { + "epoch": 43.62, + "grad_norm": 1.4046540260314941, + "learning_rate": 2e-05, + "loss": 0.04930651, + "step": 21810 + }, + { + "epoch": 43.622, + "grad_norm": 0.9765205383300781, + "learning_rate": 2e-05, + "loss": 0.03233945, + "step": 21811 + }, + { + "epoch": 43.624, + "grad_norm": 1.4802238941192627, + "learning_rate": 2e-05, + "loss": 0.05282279, + "step": 21812 + }, + { + "epoch": 43.626, + "grad_norm": 1.6739566326141357, + "learning_rate": 2e-05, + "loss": 0.04126877, + "step": 21813 + }, + { + "epoch": 43.628, + "grad_norm": 1.0595064163208008, + "learning_rate": 2e-05, + "loss": 0.04565667, + "step": 21814 + }, + { + "epoch": 43.63, + "grad_norm": 1.4827672243118286, + "learning_rate": 2e-05, + "loss": 0.03800133, + "step": 21815 + }, + { + "epoch": 43.632, + "grad_norm": 1.0282008647918701, + "learning_rate": 2e-05, + "loss": 0.03047697, + "step": 21816 + }, + { + "epoch": 43.634, + "grad_norm": 2.9951870441436768, + "learning_rate": 2e-05, + "loss": 0.03744018, + "step": 21817 + }, + { + "epoch": 43.636, + "grad_norm": 1.112250566482544, + "learning_rate": 2e-05, + "loss": 0.04401822, + "step": 21818 + }, + { + "epoch": 43.638, + "grad_norm": 1.140373706817627, + "learning_rate": 2e-05, + "loss": 0.0461672, + "step": 21819 + }, + { + "epoch": 43.64, + "grad_norm": 1.1561338901519775, + "learning_rate": 2e-05, + "loss": 0.0520758, + "step": 21820 + }, + { + "epoch": 43.642, + "grad_norm": 1.1361647844314575, + "learning_rate": 2e-05, + "loss": 0.04934377, + "step": 21821 + }, + { + 
"epoch": 43.644, + "grad_norm": 0.7384290099143982, + "learning_rate": 2e-05, + "loss": 0.02234074, + "step": 21822 + }, + { + "epoch": 43.646, + "grad_norm": 1.0414352416992188, + "learning_rate": 2e-05, + "loss": 0.03685948, + "step": 21823 + }, + { + "epoch": 43.648, + "grad_norm": 1.0054208040237427, + "learning_rate": 2e-05, + "loss": 0.03611515, + "step": 21824 + }, + { + "epoch": 43.65, + "grad_norm": 1.6434670686721802, + "learning_rate": 2e-05, + "loss": 0.04983237, + "step": 21825 + }, + { + "epoch": 43.652, + "grad_norm": 1.1644682884216309, + "learning_rate": 2e-05, + "loss": 0.04886763, + "step": 21826 + }, + { + "epoch": 43.654, + "grad_norm": 2.3917458057403564, + "learning_rate": 2e-05, + "loss": 0.07453853, + "step": 21827 + }, + { + "epoch": 43.656, + "grad_norm": 1.2234466075897217, + "learning_rate": 2e-05, + "loss": 0.05833396, + "step": 21828 + }, + { + "epoch": 43.658, + "grad_norm": 1.1793771982192993, + "learning_rate": 2e-05, + "loss": 0.04483267, + "step": 21829 + }, + { + "epoch": 43.66, + "grad_norm": 1.5220239162445068, + "learning_rate": 2e-05, + "loss": 0.06673879, + "step": 21830 + }, + { + "epoch": 43.662, + "grad_norm": 1.2569732666015625, + "learning_rate": 2e-05, + "loss": 0.04199339, + "step": 21831 + }, + { + "epoch": 43.664, + "grad_norm": 1.400726079940796, + "learning_rate": 2e-05, + "loss": 0.04842927, + "step": 21832 + }, + { + "epoch": 43.666, + "grad_norm": 1.587497591972351, + "learning_rate": 2e-05, + "loss": 0.06091474, + "step": 21833 + }, + { + "epoch": 43.668, + "grad_norm": 2.467782735824585, + "learning_rate": 2e-05, + "loss": 0.057216, + "step": 21834 + }, + { + "epoch": 43.67, + "grad_norm": 1.1979477405548096, + "learning_rate": 2e-05, + "loss": 0.05099071, + "step": 21835 + }, + { + "epoch": 43.672, + "grad_norm": 1.6900266408920288, + "learning_rate": 2e-05, + "loss": 0.04967137, + "step": 21836 + }, + { + "epoch": 43.674, + "grad_norm": 1.1722626686096191, + "learning_rate": 2e-05, + "loss": 0.0484486, + "step": 21837 + }, + { + "epoch": 43.676, + "grad_norm": 1.4999947547912598, + "learning_rate": 2e-05, + "loss": 0.06908549, + "step": 21838 + }, + { + "epoch": 43.678, + "grad_norm": 1.2642544507980347, + "learning_rate": 2e-05, + "loss": 0.04293026, + "step": 21839 + }, + { + "epoch": 43.68, + "grad_norm": 1.0956248044967651, + "learning_rate": 2e-05, + "loss": 0.03712745, + "step": 21840 + }, + { + "epoch": 43.682, + "grad_norm": 2.061314344406128, + "learning_rate": 2e-05, + "loss": 0.05671034, + "step": 21841 + }, + { + "epoch": 43.684, + "grad_norm": 1.2066631317138672, + "learning_rate": 2e-05, + "loss": 0.05251576, + "step": 21842 + }, + { + "epoch": 43.686, + "grad_norm": 1.1805452108383179, + "learning_rate": 2e-05, + "loss": 0.05300939, + "step": 21843 + }, + { + "epoch": 43.688, + "grad_norm": 1.5431995391845703, + "learning_rate": 2e-05, + "loss": 0.06416997, + "step": 21844 + }, + { + "epoch": 43.69, + "grad_norm": 0.9500445127487183, + "learning_rate": 2e-05, + "loss": 0.02867823, + "step": 21845 + }, + { + "epoch": 43.692, + "grad_norm": 1.0574393272399902, + "learning_rate": 2e-05, + "loss": 0.03581215, + "step": 21846 + }, + { + "epoch": 43.694, + "grad_norm": 1.081818699836731, + "learning_rate": 2e-05, + "loss": 0.03437539, + "step": 21847 + }, + { + "epoch": 43.696, + "grad_norm": 1.1467571258544922, + "learning_rate": 2e-05, + "loss": 0.04538688, + "step": 21848 + }, + { + "epoch": 43.698, + "grad_norm": 1.7992892265319824, + "learning_rate": 2e-05, + "loss": 0.05461973, + "step": 21849 + }, + { + "epoch": 
43.7, + "grad_norm": 1.0957931280136108, + "learning_rate": 2e-05, + "loss": 0.03794325, + "step": 21850 + }, + { + "epoch": 43.702, + "grad_norm": 1.1432968378067017, + "learning_rate": 2e-05, + "loss": 0.04415257, + "step": 21851 + }, + { + "epoch": 43.704, + "grad_norm": 1.2000794410705566, + "learning_rate": 2e-05, + "loss": 0.03692243, + "step": 21852 + }, + { + "epoch": 43.706, + "grad_norm": 2.0741629600524902, + "learning_rate": 2e-05, + "loss": 0.05918703, + "step": 21853 + }, + { + "epoch": 43.708, + "grad_norm": 5.487260341644287, + "learning_rate": 2e-05, + "loss": 0.07302981, + "step": 21854 + }, + { + "epoch": 43.71, + "grad_norm": 1.12295401096344, + "learning_rate": 2e-05, + "loss": 0.03448126, + "step": 21855 + }, + { + "epoch": 43.712, + "grad_norm": 1.3459410667419434, + "learning_rate": 2e-05, + "loss": 0.05415928, + "step": 21856 + }, + { + "epoch": 43.714, + "grad_norm": 2.604322910308838, + "learning_rate": 2e-05, + "loss": 0.0712513, + "step": 21857 + }, + { + "epoch": 43.716, + "grad_norm": 1.216117024421692, + "learning_rate": 2e-05, + "loss": 0.04590901, + "step": 21858 + }, + { + "epoch": 43.718, + "grad_norm": 1.3675901889801025, + "learning_rate": 2e-05, + "loss": 0.04854389, + "step": 21859 + }, + { + "epoch": 43.72, + "grad_norm": 2.20420241355896, + "learning_rate": 2e-05, + "loss": 0.05691512, + "step": 21860 + }, + { + "epoch": 43.722, + "grad_norm": 1.0691967010498047, + "learning_rate": 2e-05, + "loss": 0.04248612, + "step": 21861 + }, + { + "epoch": 43.724, + "grad_norm": 1.1441617012023926, + "learning_rate": 2e-05, + "loss": 0.04443025, + "step": 21862 + }, + { + "epoch": 43.726, + "grad_norm": 1.215785264968872, + "learning_rate": 2e-05, + "loss": 0.04109699, + "step": 21863 + }, + { + "epoch": 43.728, + "grad_norm": 1.1096992492675781, + "learning_rate": 2e-05, + "loss": 0.04342065, + "step": 21864 + }, + { + "epoch": 43.73, + "grad_norm": 1.2204557657241821, + "learning_rate": 2e-05, + "loss": 0.04820478, + "step": 21865 + }, + { + "epoch": 43.732, + "grad_norm": 1.8152563571929932, + "learning_rate": 2e-05, + "loss": 0.04331843, + "step": 21866 + }, + { + "epoch": 43.734, + "grad_norm": 1.228575587272644, + "learning_rate": 2e-05, + "loss": 0.0453392, + "step": 21867 + }, + { + "epoch": 43.736, + "grad_norm": 1.2545289993286133, + "learning_rate": 2e-05, + "loss": 0.05211987, + "step": 21868 + }, + { + "epoch": 43.738, + "grad_norm": 1.061532974243164, + "learning_rate": 2e-05, + "loss": 0.03736347, + "step": 21869 + }, + { + "epoch": 43.74, + "grad_norm": 1.4098767042160034, + "learning_rate": 2e-05, + "loss": 0.04484908, + "step": 21870 + }, + { + "epoch": 43.742, + "grad_norm": 1.2383657693862915, + "learning_rate": 2e-05, + "loss": 0.05023118, + "step": 21871 + }, + { + "epoch": 43.744, + "grad_norm": 1.0549107789993286, + "learning_rate": 2e-05, + "loss": 0.03575772, + "step": 21872 + }, + { + "epoch": 43.746, + "grad_norm": 1.266374945640564, + "learning_rate": 2e-05, + "loss": 0.05691652, + "step": 21873 + }, + { + "epoch": 43.748, + "grad_norm": 1.2940677404403687, + "learning_rate": 2e-05, + "loss": 0.05207343, + "step": 21874 + }, + { + "epoch": 43.75, + "grad_norm": 1.380552887916565, + "learning_rate": 2e-05, + "loss": 0.03950589, + "step": 21875 + }, + { + "epoch": 43.752, + "grad_norm": 1.1680132150650024, + "learning_rate": 2e-05, + "loss": 0.0435213, + "step": 21876 + }, + { + "epoch": 43.754, + "grad_norm": 2.5544819831848145, + "learning_rate": 2e-05, + "loss": 0.0586822, + "step": 21877 + }, + { + "epoch": 43.756, + 
"grad_norm": 1.1925801038742065, + "learning_rate": 2e-05, + "loss": 0.0523452, + "step": 21878 + }, + { + "epoch": 43.758, + "grad_norm": 1.2226167917251587, + "learning_rate": 2e-05, + "loss": 0.04687249, + "step": 21879 + }, + { + "epoch": 43.76, + "grad_norm": 1.0646800994873047, + "learning_rate": 2e-05, + "loss": 0.04498032, + "step": 21880 + }, + { + "epoch": 43.762, + "grad_norm": 1.3412402868270874, + "learning_rate": 2e-05, + "loss": 0.05164725, + "step": 21881 + }, + { + "epoch": 43.764, + "grad_norm": 1.0574461221694946, + "learning_rate": 2e-05, + "loss": 0.0448842, + "step": 21882 + }, + { + "epoch": 43.766, + "grad_norm": 1.0488017797470093, + "learning_rate": 2e-05, + "loss": 0.02794564, + "step": 21883 + }, + { + "epoch": 43.768, + "grad_norm": 1.2221869230270386, + "learning_rate": 2e-05, + "loss": 0.05013753, + "step": 21884 + }, + { + "epoch": 43.77, + "grad_norm": 1.2905614376068115, + "learning_rate": 2e-05, + "loss": 0.04898647, + "step": 21885 + }, + { + "epoch": 43.772, + "grad_norm": 1.3449629545211792, + "learning_rate": 2e-05, + "loss": 0.05343453, + "step": 21886 + }, + { + "epoch": 43.774, + "grad_norm": 1.0985403060913086, + "learning_rate": 2e-05, + "loss": 0.04567765, + "step": 21887 + }, + { + "epoch": 43.776, + "grad_norm": 1.7854421138763428, + "learning_rate": 2e-05, + "loss": 0.05000356, + "step": 21888 + }, + { + "epoch": 43.778, + "grad_norm": 0.9867087602615356, + "learning_rate": 2e-05, + "loss": 0.03435191, + "step": 21889 + }, + { + "epoch": 43.78, + "grad_norm": 1.1816110610961914, + "learning_rate": 2e-05, + "loss": 0.04493886, + "step": 21890 + }, + { + "epoch": 43.782, + "grad_norm": 1.8454886674880981, + "learning_rate": 2e-05, + "loss": 0.07082605, + "step": 21891 + }, + { + "epoch": 43.784, + "grad_norm": 1.6479289531707764, + "learning_rate": 2e-05, + "loss": 0.05434608, + "step": 21892 + }, + { + "epoch": 43.786, + "grad_norm": 1.2260222434997559, + "learning_rate": 2e-05, + "loss": 0.0397054, + "step": 21893 + }, + { + "epoch": 43.788, + "grad_norm": 1.0151551961898804, + "learning_rate": 2e-05, + "loss": 0.03879777, + "step": 21894 + }, + { + "epoch": 43.79, + "grad_norm": 1.233112096786499, + "learning_rate": 2e-05, + "loss": 0.04530869, + "step": 21895 + }, + { + "epoch": 43.792, + "grad_norm": 1.4653815031051636, + "learning_rate": 2e-05, + "loss": 0.06113261, + "step": 21896 + }, + { + "epoch": 43.794, + "grad_norm": 1.832763671875, + "learning_rate": 2e-05, + "loss": 0.06107483, + "step": 21897 + }, + { + "epoch": 43.796, + "grad_norm": 1.1519736051559448, + "learning_rate": 2e-05, + "loss": 0.04647954, + "step": 21898 + }, + { + "epoch": 43.798, + "grad_norm": 1.4427329301834106, + "learning_rate": 2e-05, + "loss": 0.0451367, + "step": 21899 + }, + { + "epoch": 43.8, + "grad_norm": 1.0031224489212036, + "learning_rate": 2e-05, + "loss": 0.04659247, + "step": 21900 + }, + { + "epoch": 43.802, + "grad_norm": 1.3513147830963135, + "learning_rate": 2e-05, + "loss": 0.03858761, + "step": 21901 + }, + { + "epoch": 43.804, + "grad_norm": 1.4595290422439575, + "learning_rate": 2e-05, + "loss": 0.04975832, + "step": 21902 + }, + { + "epoch": 43.806, + "grad_norm": 1.4404231309890747, + "learning_rate": 2e-05, + "loss": 0.04388293, + "step": 21903 + }, + { + "epoch": 43.808, + "grad_norm": 1.1193671226501465, + "learning_rate": 2e-05, + "loss": 0.0433252, + "step": 21904 + }, + { + "epoch": 43.81, + "grad_norm": 1.7537304162979126, + "learning_rate": 2e-05, + "loss": 0.05322967, + "step": 21905 + }, + { + "epoch": 43.812, + "grad_norm": 
1.0735844373703003, + "learning_rate": 2e-05, + "loss": 0.03970418, + "step": 21906 + }, + { + "epoch": 43.814, + "grad_norm": 1.533126711845398, + "learning_rate": 2e-05, + "loss": 0.05987874, + "step": 21907 + }, + { + "epoch": 43.816, + "grad_norm": 1.0501418113708496, + "learning_rate": 2e-05, + "loss": 0.03239897, + "step": 21908 + }, + { + "epoch": 43.818, + "grad_norm": 1.326298475265503, + "learning_rate": 2e-05, + "loss": 0.05782353, + "step": 21909 + }, + { + "epoch": 43.82, + "grad_norm": 1.2999595403671265, + "learning_rate": 2e-05, + "loss": 0.04503709, + "step": 21910 + }, + { + "epoch": 43.822, + "grad_norm": 1.109490156173706, + "learning_rate": 2e-05, + "loss": 0.0462186, + "step": 21911 + }, + { + "epoch": 43.824, + "grad_norm": 1.1068756580352783, + "learning_rate": 2e-05, + "loss": 0.03144252, + "step": 21912 + }, + { + "epoch": 43.826, + "grad_norm": 1.1656792163848877, + "learning_rate": 2e-05, + "loss": 0.04489972, + "step": 21913 + }, + { + "epoch": 43.828, + "grad_norm": 1.3886311054229736, + "learning_rate": 2e-05, + "loss": 0.04568454, + "step": 21914 + }, + { + "epoch": 43.83, + "grad_norm": 1.2280189990997314, + "learning_rate": 2e-05, + "loss": 0.04975533, + "step": 21915 + }, + { + "epoch": 43.832, + "grad_norm": 1.2794169187545776, + "learning_rate": 2e-05, + "loss": 0.04802956, + "step": 21916 + }, + { + "epoch": 43.834, + "grad_norm": 1.431793451309204, + "learning_rate": 2e-05, + "loss": 0.0503118, + "step": 21917 + }, + { + "epoch": 43.836, + "grad_norm": 1.1943049430847168, + "learning_rate": 2e-05, + "loss": 0.05117747, + "step": 21918 + }, + { + "epoch": 43.838, + "grad_norm": 1.1528602838516235, + "learning_rate": 2e-05, + "loss": 0.03906956, + "step": 21919 + }, + { + "epoch": 43.84, + "grad_norm": 1.3231438398361206, + "learning_rate": 2e-05, + "loss": 0.04882731, + "step": 21920 + }, + { + "epoch": 43.842, + "grad_norm": 1.6413450241088867, + "learning_rate": 2e-05, + "loss": 0.04453898, + "step": 21921 + }, + { + "epoch": 43.844, + "grad_norm": 1.188307285308838, + "learning_rate": 2e-05, + "loss": 0.04830351, + "step": 21922 + }, + { + "epoch": 43.846, + "grad_norm": 1.5874241590499878, + "learning_rate": 2e-05, + "loss": 0.05982009, + "step": 21923 + }, + { + "epoch": 43.848, + "grad_norm": 0.9717968702316284, + "learning_rate": 2e-05, + "loss": 0.03369679, + "step": 21924 + }, + { + "epoch": 43.85, + "grad_norm": 1.2073335647583008, + "learning_rate": 2e-05, + "loss": 0.03507089, + "step": 21925 + }, + { + "epoch": 43.852, + "grad_norm": 1.133551001548767, + "learning_rate": 2e-05, + "loss": 0.04241706, + "step": 21926 + }, + { + "epoch": 43.854, + "grad_norm": 1.2795426845550537, + "learning_rate": 2e-05, + "loss": 0.04569482, + "step": 21927 + }, + { + "epoch": 43.856, + "grad_norm": 1.216707706451416, + "learning_rate": 2e-05, + "loss": 0.05699688, + "step": 21928 + }, + { + "epoch": 43.858, + "grad_norm": 1.9533652067184448, + "learning_rate": 2e-05, + "loss": 0.07641211, + "step": 21929 + }, + { + "epoch": 43.86, + "grad_norm": 1.0922882556915283, + "learning_rate": 2e-05, + "loss": 0.02667849, + "step": 21930 + }, + { + "epoch": 43.862, + "grad_norm": 1.0828759670257568, + "learning_rate": 2e-05, + "loss": 0.04916801, + "step": 21931 + }, + { + "epoch": 43.864, + "grad_norm": 1.2230288982391357, + "learning_rate": 2e-05, + "loss": 0.05529765, + "step": 21932 + }, + { + "epoch": 43.866, + "grad_norm": 1.1177533864974976, + "learning_rate": 2e-05, + "loss": 0.03495992, + "step": 21933 + }, + { + "epoch": 43.868, + "grad_norm": 
1.0803815126419067, + "learning_rate": 2e-05, + "loss": 0.04231985, + "step": 21934 + }, + { + "epoch": 43.87, + "grad_norm": 1.170892357826233, + "learning_rate": 2e-05, + "loss": 0.04216111, + "step": 21935 + }, + { + "epoch": 43.872, + "grad_norm": 1.6169553995132446, + "learning_rate": 2e-05, + "loss": 0.04968107, + "step": 21936 + }, + { + "epoch": 43.874, + "grad_norm": 1.2540957927703857, + "learning_rate": 2e-05, + "loss": 0.05158932, + "step": 21937 + }, + { + "epoch": 43.876, + "grad_norm": 0.9766530394554138, + "learning_rate": 2e-05, + "loss": 0.02565751, + "step": 21938 + }, + { + "epoch": 43.878, + "grad_norm": 1.1864181756973267, + "learning_rate": 2e-05, + "loss": 0.04187727, + "step": 21939 + }, + { + "epoch": 43.88, + "grad_norm": 1.157779335975647, + "learning_rate": 2e-05, + "loss": 0.04422162, + "step": 21940 + }, + { + "epoch": 43.882, + "grad_norm": 3.2718393802642822, + "learning_rate": 2e-05, + "loss": 0.03786396, + "step": 21941 + }, + { + "epoch": 43.884, + "grad_norm": 1.7684119939804077, + "learning_rate": 2e-05, + "loss": 0.04769549, + "step": 21942 + }, + { + "epoch": 43.886, + "grad_norm": 1.5492713451385498, + "learning_rate": 2e-05, + "loss": 0.06170796, + "step": 21943 + }, + { + "epoch": 43.888, + "grad_norm": 1.0928599834442139, + "learning_rate": 2e-05, + "loss": 0.03716299, + "step": 21944 + }, + { + "epoch": 43.89, + "grad_norm": 2.3191888332366943, + "learning_rate": 2e-05, + "loss": 0.06007298, + "step": 21945 + }, + { + "epoch": 43.892, + "grad_norm": 1.7292722463607788, + "learning_rate": 2e-05, + "loss": 0.04911011, + "step": 21946 + }, + { + "epoch": 43.894, + "grad_norm": 1.0400488376617432, + "learning_rate": 2e-05, + "loss": 0.03599721, + "step": 21947 + }, + { + "epoch": 43.896, + "grad_norm": 1.2575953006744385, + "learning_rate": 2e-05, + "loss": 0.05165, + "step": 21948 + }, + { + "epoch": 43.898, + "grad_norm": 1.1358708143234253, + "learning_rate": 2e-05, + "loss": 0.04962153, + "step": 21949 + }, + { + "epoch": 43.9, + "grad_norm": 1.5051313638687134, + "learning_rate": 2e-05, + "loss": 0.04658557, + "step": 21950 + }, + { + "epoch": 43.902, + "grad_norm": 0.9277594685554504, + "learning_rate": 2e-05, + "loss": 0.03316977, + "step": 21951 + }, + { + "epoch": 43.904, + "grad_norm": 1.3635467290878296, + "learning_rate": 2e-05, + "loss": 0.05430625, + "step": 21952 + }, + { + "epoch": 43.906, + "grad_norm": 1.391068458557129, + "learning_rate": 2e-05, + "loss": 0.06159748, + "step": 21953 + }, + { + "epoch": 43.908, + "grad_norm": 1.538338303565979, + "learning_rate": 2e-05, + "loss": 0.04379988, + "step": 21954 + }, + { + "epoch": 43.91, + "grad_norm": 3.397902250289917, + "learning_rate": 2e-05, + "loss": 0.06411252, + "step": 21955 + }, + { + "epoch": 43.912, + "grad_norm": 1.0770752429962158, + "learning_rate": 2e-05, + "loss": 0.04424539, + "step": 21956 + }, + { + "epoch": 43.914, + "grad_norm": 1.3942207098007202, + "learning_rate": 2e-05, + "loss": 0.04902345, + "step": 21957 + }, + { + "epoch": 43.916, + "grad_norm": 1.1789013147354126, + "learning_rate": 2e-05, + "loss": 0.04915366, + "step": 21958 + }, + { + "epoch": 43.918, + "grad_norm": 1.31106698513031, + "learning_rate": 2e-05, + "loss": 0.03723363, + "step": 21959 + }, + { + "epoch": 43.92, + "grad_norm": 1.3015276193618774, + "learning_rate": 2e-05, + "loss": 0.04564174, + "step": 21960 + }, + { + "epoch": 43.922, + "grad_norm": 1.144248127937317, + "learning_rate": 2e-05, + "loss": 0.0414278, + "step": 21961 + }, + { + "epoch": 43.924, + "grad_norm": 
1.1458982229232788, + "learning_rate": 2e-05, + "loss": 0.04920912, + "step": 21962 + }, + { + "epoch": 43.926, + "grad_norm": 1.6706799268722534, + "learning_rate": 2e-05, + "loss": 0.04606791, + "step": 21963 + }, + { + "epoch": 43.928, + "grad_norm": 1.1837114095687866, + "learning_rate": 2e-05, + "loss": 0.04852568, + "step": 21964 + }, + { + "epoch": 43.93, + "grad_norm": 1.614950180053711, + "learning_rate": 2e-05, + "loss": 0.05900152, + "step": 21965 + }, + { + "epoch": 43.932, + "grad_norm": 1.168605923652649, + "learning_rate": 2e-05, + "loss": 0.04590261, + "step": 21966 + }, + { + "epoch": 43.934, + "grad_norm": 1.91372811794281, + "learning_rate": 2e-05, + "loss": 0.05605849, + "step": 21967 + }, + { + "epoch": 43.936, + "grad_norm": 1.2598594427108765, + "learning_rate": 2e-05, + "loss": 0.05659298, + "step": 21968 + }, + { + "epoch": 43.938, + "grad_norm": 1.135242223739624, + "learning_rate": 2e-05, + "loss": 0.03806643, + "step": 21969 + }, + { + "epoch": 43.94, + "grad_norm": 1.2540684938430786, + "learning_rate": 2e-05, + "loss": 0.04728854, + "step": 21970 + }, + { + "epoch": 43.942, + "grad_norm": 1.2394566535949707, + "learning_rate": 2e-05, + "loss": 0.05657537, + "step": 21971 + }, + { + "epoch": 43.944, + "grad_norm": 1.1225388050079346, + "learning_rate": 2e-05, + "loss": 0.05487181, + "step": 21972 + }, + { + "epoch": 43.946, + "grad_norm": 1.3079673051834106, + "learning_rate": 2e-05, + "loss": 0.04643994, + "step": 21973 + }, + { + "epoch": 43.948, + "grad_norm": 1.1278849840164185, + "learning_rate": 2e-05, + "loss": 0.05071158, + "step": 21974 + }, + { + "epoch": 43.95, + "grad_norm": 1.311438798904419, + "learning_rate": 2e-05, + "loss": 0.0565824, + "step": 21975 + }, + { + "epoch": 43.952, + "grad_norm": 1.0105366706848145, + "learning_rate": 2e-05, + "loss": 0.04423777, + "step": 21976 + }, + { + "epoch": 43.954, + "grad_norm": 5.449398040771484, + "learning_rate": 2e-05, + "loss": 0.06142551, + "step": 21977 + }, + { + "epoch": 43.956, + "grad_norm": 1.5247281789779663, + "learning_rate": 2e-05, + "loss": 0.05841498, + "step": 21978 + }, + { + "epoch": 43.958, + "grad_norm": 1.2645142078399658, + "learning_rate": 2e-05, + "loss": 0.05982171, + "step": 21979 + }, + { + "epoch": 43.96, + "grad_norm": 1.3923887014389038, + "learning_rate": 2e-05, + "loss": 0.05867479, + "step": 21980 + }, + { + "epoch": 43.962, + "grad_norm": 1.2104207277297974, + "learning_rate": 2e-05, + "loss": 0.05177038, + "step": 21981 + }, + { + "epoch": 43.964, + "grad_norm": 1.2273800373077393, + "learning_rate": 2e-05, + "loss": 0.03514304, + "step": 21982 + }, + { + "epoch": 43.966, + "grad_norm": 1.2372844219207764, + "learning_rate": 2e-05, + "loss": 0.05396766, + "step": 21983 + }, + { + "epoch": 43.968, + "grad_norm": 1.1264572143554688, + "learning_rate": 2e-05, + "loss": 0.03575965, + "step": 21984 + }, + { + "epoch": 43.97, + "grad_norm": 1.2060368061065674, + "learning_rate": 2e-05, + "loss": 0.05158073, + "step": 21985 + }, + { + "epoch": 43.972, + "grad_norm": 1.7094749212265015, + "learning_rate": 2e-05, + "loss": 0.06007314, + "step": 21986 + }, + { + "epoch": 43.974, + "grad_norm": 1.0798571109771729, + "learning_rate": 2e-05, + "loss": 0.05095109, + "step": 21987 + }, + { + "epoch": 43.976, + "grad_norm": 1.1857688426971436, + "learning_rate": 2e-05, + "loss": 0.04817619, + "step": 21988 + }, + { + "epoch": 43.978, + "grad_norm": 1.956149697303772, + "learning_rate": 2e-05, + "loss": 0.05255031, + "step": 21989 + }, + { + "epoch": 43.98, + "grad_norm": 
1.298830270767212, + "learning_rate": 2e-05, + "loss": 0.0518288, + "step": 21990 + }, + { + "epoch": 43.982, + "grad_norm": 2.401399612426758, + "learning_rate": 2e-05, + "loss": 0.0452053, + "step": 21991 + }, + { + "epoch": 43.984, + "grad_norm": 1.1325953006744385, + "learning_rate": 2e-05, + "loss": 0.039924, + "step": 21992 + }, + { + "epoch": 43.986, + "grad_norm": 1.7556862831115723, + "learning_rate": 2e-05, + "loss": 0.052302, + "step": 21993 + }, + { + "epoch": 43.988, + "grad_norm": 1.3267320394515991, + "learning_rate": 2e-05, + "loss": 0.05184503, + "step": 21994 + }, + { + "epoch": 43.99, + "grad_norm": 1.1495281457901, + "learning_rate": 2e-05, + "loss": 0.04765497, + "step": 21995 + }, + { + "epoch": 43.992, + "grad_norm": 1.6266371011734009, + "learning_rate": 2e-05, + "loss": 0.05227334, + "step": 21996 + }, + { + "epoch": 43.994, + "grad_norm": 0.914303719997406, + "learning_rate": 2e-05, + "loss": 0.02800883, + "step": 21997 + }, + { + "epoch": 43.996, + "grad_norm": 1.2601372003555298, + "learning_rate": 2e-05, + "loss": 0.04211642, + "step": 21998 + }, + { + "epoch": 43.998, + "grad_norm": 1.1143946647644043, + "learning_rate": 2e-05, + "loss": 0.0415227, + "step": 21999 + }, + { + "epoch": 44.0, + "grad_norm": 1.461881160736084, + "learning_rate": 2e-05, + "loss": 0.06111758, + "step": 22000 + }, + { + "epoch": 44.0, + "eval_performance": { + "AngleClassification_1": 1.0, + "AngleClassification_2": 1.0, + "AngleClassification_3": 0.9860279441117764, + "Equal_1": 1.0, + "Equal_2": 0.9920159680638723, + "Equal_3": 0.9900199600798403, + "LineComparison_1": 1.0, + "LineComparison_2": 1.0, + "LineComparison_3": 0.998003992015968, + "Parallel_1": 0.9899799599198397, + "Parallel_2": 0.9979959919839679, + "Parallel_3": 0.992, + "Perpendicular_1": 1.0, + "Perpendicular_2": 0.992, + "Perpendicular_3": 0.8937875751503006, + "PointLiesOnCircle_1": 0.9979959919839679, + "PointLiesOnCircle_2": 1.0, + "PointLiesOnCircle_3": 0.99, + "PointLiesOnLine_1": 0.9979959919839679, + "PointLiesOnLine_2": 0.9939879759519038, + "PointLiesOnLine_3": 0.9840319361277445 + }, + "eval_runtime": 319.306, + "eval_samples_per_second": 32.884, + "eval_steps_per_second": 0.658, + "step": 22000 + }, + { + "epoch": 44.002, + "grad_norm": 1.1951003074645996, + "learning_rate": 2e-05, + "loss": 0.04387069, + "step": 22001 + }, + { + "epoch": 44.004, + "grad_norm": 1.2349199056625366, + "learning_rate": 2e-05, + "loss": 0.04709039, + "step": 22002 + }, + { + "epoch": 44.006, + "grad_norm": 1.0489506721496582, + "learning_rate": 2e-05, + "loss": 0.03470477, + "step": 22003 + }, + { + "epoch": 44.008, + "grad_norm": 1.1739320755004883, + "learning_rate": 2e-05, + "loss": 0.03277641, + "step": 22004 + }, + { + "epoch": 44.01, + "grad_norm": 2.5618724822998047, + "learning_rate": 2e-05, + "loss": 0.0545634, + "step": 22005 + }, + { + "epoch": 44.012, + "grad_norm": 1.349908471107483, + "learning_rate": 2e-05, + "loss": 0.04546019, + "step": 22006 + }, + { + "epoch": 44.014, + "grad_norm": 1.1877539157867432, + "learning_rate": 2e-05, + "loss": 0.05321749, + "step": 22007 + }, + { + "epoch": 44.016, + "grad_norm": 1.0178004503250122, + "learning_rate": 2e-05, + "loss": 0.02961883, + "step": 22008 + }, + { + "epoch": 44.018, + "grad_norm": 1.0790722370147705, + "learning_rate": 2e-05, + "loss": 0.04679262, + "step": 22009 + }, + { + "epoch": 44.02, + "grad_norm": 1.1426124572753906, + "learning_rate": 2e-05, + "loss": 0.06072292, + "step": 22010 + }, + { + "epoch": 44.022, + "grad_norm": 1.529797077178955, + 
"learning_rate": 2e-05, + "loss": 0.05173718, + "step": 22011 + }, + { + "epoch": 44.024, + "grad_norm": 1.4179189205169678, + "learning_rate": 2e-05, + "loss": 0.04669178, + "step": 22012 + }, + { + "epoch": 44.026, + "grad_norm": 1.4494482278823853, + "learning_rate": 2e-05, + "loss": 0.04122728, + "step": 22013 + }, + { + "epoch": 44.028, + "grad_norm": 1.4146836996078491, + "learning_rate": 2e-05, + "loss": 0.05055712, + "step": 22014 + }, + { + "epoch": 44.03, + "grad_norm": 1.372752070426941, + "learning_rate": 2e-05, + "loss": 0.05814591, + "step": 22015 + }, + { + "epoch": 44.032, + "grad_norm": 1.9753098487854004, + "learning_rate": 2e-05, + "loss": 0.0562949, + "step": 22016 + }, + { + "epoch": 44.034, + "grad_norm": 0.9869195222854614, + "learning_rate": 2e-05, + "loss": 0.04519263, + "step": 22017 + }, + { + "epoch": 44.036, + "grad_norm": 1.016710638999939, + "learning_rate": 2e-05, + "loss": 0.03520773, + "step": 22018 + }, + { + "epoch": 44.038, + "grad_norm": 1.2054697275161743, + "learning_rate": 2e-05, + "loss": 0.05319132, + "step": 22019 + }, + { + "epoch": 44.04, + "grad_norm": 1.1351009607315063, + "learning_rate": 2e-05, + "loss": 0.03978164, + "step": 22020 + }, + { + "epoch": 44.042, + "grad_norm": 1.0979878902435303, + "learning_rate": 2e-05, + "loss": 0.03140669, + "step": 22021 + }, + { + "epoch": 44.044, + "grad_norm": 1.446884036064148, + "learning_rate": 2e-05, + "loss": 0.05437474, + "step": 22022 + }, + { + "epoch": 44.046, + "grad_norm": 1.3147832155227661, + "learning_rate": 2e-05, + "loss": 0.05483336, + "step": 22023 + }, + { + "epoch": 44.048, + "grad_norm": 1.4436129331588745, + "learning_rate": 2e-05, + "loss": 0.05707975, + "step": 22024 + }, + { + "epoch": 44.05, + "grad_norm": 1.2296398878097534, + "learning_rate": 2e-05, + "loss": 0.05186768, + "step": 22025 + }, + { + "epoch": 44.052, + "grad_norm": 1.414892554283142, + "learning_rate": 2e-05, + "loss": 0.04964254, + "step": 22026 + }, + { + "epoch": 44.054, + "grad_norm": 0.9986278414726257, + "learning_rate": 2e-05, + "loss": 0.04020878, + "step": 22027 + }, + { + "epoch": 44.056, + "grad_norm": 1.0219697952270508, + "learning_rate": 2e-05, + "loss": 0.03773836, + "step": 22028 + }, + { + "epoch": 44.058, + "grad_norm": 0.9788745045661926, + "learning_rate": 2e-05, + "loss": 0.03778582, + "step": 22029 + }, + { + "epoch": 44.06, + "grad_norm": 1.209914207458496, + "learning_rate": 2e-05, + "loss": 0.03958928, + "step": 22030 + }, + { + "epoch": 44.062, + "grad_norm": 1.2830939292907715, + "learning_rate": 2e-05, + "loss": 0.04328164, + "step": 22031 + }, + { + "epoch": 44.064, + "grad_norm": 1.635453462600708, + "learning_rate": 2e-05, + "loss": 0.05318267, + "step": 22032 + }, + { + "epoch": 44.066, + "grad_norm": 0.9193920493125916, + "learning_rate": 2e-05, + "loss": 0.02856221, + "step": 22033 + }, + { + "epoch": 44.068, + "grad_norm": 1.4379791021347046, + "learning_rate": 2e-05, + "loss": 0.05032479, + "step": 22034 + }, + { + "epoch": 44.07, + "grad_norm": 2.1090707778930664, + "learning_rate": 2e-05, + "loss": 0.08148749, + "step": 22035 + }, + { + "epoch": 44.072, + "grad_norm": 1.4268964529037476, + "learning_rate": 2e-05, + "loss": 0.03891208, + "step": 22036 + }, + { + "epoch": 44.074, + "grad_norm": 1.3950912952423096, + "learning_rate": 2e-05, + "loss": 0.03417912, + "step": 22037 + }, + { + "epoch": 44.076, + "grad_norm": 1.4302080869674683, + "learning_rate": 2e-05, + "loss": 0.04385927, + "step": 22038 + }, + { + "epoch": 44.078, + "grad_norm": 1.038596272468567, + 
"learning_rate": 2e-05, + "loss": 0.03837661, + "step": 22039 + }, + { + "epoch": 44.08, + "grad_norm": 2.1667323112487793, + "learning_rate": 2e-05, + "loss": 0.06082205, + "step": 22040 + }, + { + "epoch": 44.082, + "grad_norm": 1.3383917808532715, + "learning_rate": 2e-05, + "loss": 0.05682547, + "step": 22041 + }, + { + "epoch": 44.084, + "grad_norm": 1.3785979747772217, + "learning_rate": 2e-05, + "loss": 0.05159343, + "step": 22042 + }, + { + "epoch": 44.086, + "grad_norm": 1.1491683721542358, + "learning_rate": 2e-05, + "loss": 0.04531821, + "step": 22043 + }, + { + "epoch": 44.088, + "grad_norm": 1.113952398300171, + "learning_rate": 2e-05, + "loss": 0.04688843, + "step": 22044 + }, + { + "epoch": 44.09, + "grad_norm": 1.2150129079818726, + "learning_rate": 2e-05, + "loss": 0.03948733, + "step": 22045 + }, + { + "epoch": 44.092, + "grad_norm": 1.161972999572754, + "learning_rate": 2e-05, + "loss": 0.03694011, + "step": 22046 + }, + { + "epoch": 44.094, + "grad_norm": 1.6014569997787476, + "learning_rate": 2e-05, + "loss": 0.06034542, + "step": 22047 + }, + { + "epoch": 44.096, + "grad_norm": 1.49050772190094, + "learning_rate": 2e-05, + "loss": 0.05083546, + "step": 22048 + }, + { + "epoch": 44.098, + "grad_norm": 1.4574774503707886, + "learning_rate": 2e-05, + "loss": 0.0556547, + "step": 22049 + }, + { + "epoch": 44.1, + "grad_norm": 1.13857102394104, + "learning_rate": 2e-05, + "loss": 0.04332735, + "step": 22050 + }, + { + "epoch": 44.102, + "grad_norm": 1.2549856901168823, + "learning_rate": 2e-05, + "loss": 0.05079667, + "step": 22051 + }, + { + "epoch": 44.104, + "grad_norm": 1.2218992710113525, + "learning_rate": 2e-05, + "loss": 0.05055429, + "step": 22052 + }, + { + "epoch": 44.106, + "grad_norm": 1.3079739809036255, + "learning_rate": 2e-05, + "loss": 0.05005455, + "step": 22053 + }, + { + "epoch": 44.108, + "grad_norm": 1.148532748222351, + "learning_rate": 2e-05, + "loss": 0.04688092, + "step": 22054 + }, + { + "epoch": 44.11, + "grad_norm": 1.397567629814148, + "learning_rate": 2e-05, + "loss": 0.0634838, + "step": 22055 + }, + { + "epoch": 44.112, + "grad_norm": 1.166181206703186, + "learning_rate": 2e-05, + "loss": 0.0324477, + "step": 22056 + }, + { + "epoch": 44.114, + "grad_norm": 1.310949444770813, + "learning_rate": 2e-05, + "loss": 0.04780035, + "step": 22057 + }, + { + "epoch": 44.116, + "grad_norm": 1.1098144054412842, + "learning_rate": 2e-05, + "loss": 0.03849977, + "step": 22058 + }, + { + "epoch": 44.118, + "grad_norm": 1.5903892517089844, + "learning_rate": 2e-05, + "loss": 0.05317472, + "step": 22059 + }, + { + "epoch": 44.12, + "grad_norm": 1.1845721006393433, + "learning_rate": 2e-05, + "loss": 0.03562384, + "step": 22060 + }, + { + "epoch": 44.122, + "grad_norm": 1.2428687810897827, + "learning_rate": 2e-05, + "loss": 0.05235748, + "step": 22061 + }, + { + "epoch": 44.124, + "grad_norm": 1.2767870426177979, + "learning_rate": 2e-05, + "loss": 0.05173537, + "step": 22062 + }, + { + "epoch": 44.126, + "grad_norm": 1.3087096214294434, + "learning_rate": 2e-05, + "loss": 0.05469508, + "step": 22063 + }, + { + "epoch": 44.128, + "grad_norm": 1.2753592729568481, + "learning_rate": 2e-05, + "loss": 0.05185987, + "step": 22064 + }, + { + "epoch": 44.13, + "grad_norm": 1.3028455972671509, + "learning_rate": 2e-05, + "loss": 0.05058619, + "step": 22065 + }, + { + "epoch": 44.132, + "grad_norm": 1.2384607791900635, + "learning_rate": 2e-05, + "loss": 0.04439488, + "step": 22066 + }, + { + "epoch": 44.134, + "grad_norm": 1.494373083114624, + "learning_rate": 
2e-05, + "loss": 0.06326284, + "step": 22067 + }, + { + "epoch": 44.136, + "grad_norm": 4.574721813201904, + "learning_rate": 2e-05, + "loss": 0.0534743, + "step": 22068 + }, + { + "epoch": 44.138, + "grad_norm": 1.5381698608398438, + "learning_rate": 2e-05, + "loss": 0.06085927, + "step": 22069 + }, + { + "epoch": 44.14, + "grad_norm": 1.221481442451477, + "learning_rate": 2e-05, + "loss": 0.04375359, + "step": 22070 + }, + { + "epoch": 44.142, + "grad_norm": 1.3349063396453857, + "learning_rate": 2e-05, + "loss": 0.04402313, + "step": 22071 + }, + { + "epoch": 44.144, + "grad_norm": 1.3375403881072998, + "learning_rate": 2e-05, + "loss": 0.05106883, + "step": 22072 + }, + { + "epoch": 44.146, + "grad_norm": 1.1496771574020386, + "learning_rate": 2e-05, + "loss": 0.04525649, + "step": 22073 + }, + { + "epoch": 44.148, + "grad_norm": 1.2824701070785522, + "learning_rate": 2e-05, + "loss": 0.04934424, + "step": 22074 + }, + { + "epoch": 44.15, + "grad_norm": 1.1325626373291016, + "learning_rate": 2e-05, + "loss": 0.0284467, + "step": 22075 + }, + { + "epoch": 44.152, + "grad_norm": 1.3874157667160034, + "learning_rate": 2e-05, + "loss": 0.04672045, + "step": 22076 + }, + { + "epoch": 44.154, + "grad_norm": 3.2082982063293457, + "learning_rate": 2e-05, + "loss": 0.0601771, + "step": 22077 + }, + { + "epoch": 44.156, + "grad_norm": 2.588113784790039, + "learning_rate": 2e-05, + "loss": 0.07006887, + "step": 22078 + }, + { + "epoch": 44.158, + "grad_norm": 1.3412790298461914, + "learning_rate": 2e-05, + "loss": 0.04630299, + "step": 22079 + }, + { + "epoch": 44.16, + "grad_norm": 1.2379393577575684, + "learning_rate": 2e-05, + "loss": 0.04339472, + "step": 22080 + }, + { + "epoch": 44.162, + "grad_norm": 1.1797010898590088, + "learning_rate": 2e-05, + "loss": 0.0429916, + "step": 22081 + }, + { + "epoch": 44.164, + "grad_norm": 3.3057401180267334, + "learning_rate": 2e-05, + "loss": 0.04493307, + "step": 22082 + }, + { + "epoch": 44.166, + "grad_norm": 1.1744298934936523, + "learning_rate": 2e-05, + "loss": 0.03810238, + "step": 22083 + }, + { + "epoch": 44.168, + "grad_norm": 1.669509768486023, + "learning_rate": 2e-05, + "loss": 0.06496652, + "step": 22084 + }, + { + "epoch": 44.17, + "grad_norm": 1.5125724077224731, + "learning_rate": 2e-05, + "loss": 0.04929143, + "step": 22085 + }, + { + "epoch": 44.172, + "grad_norm": 1.4067198038101196, + "learning_rate": 2e-05, + "loss": 0.05213205, + "step": 22086 + }, + { + "epoch": 44.174, + "grad_norm": 1.189210057258606, + "learning_rate": 2e-05, + "loss": 0.04036959, + "step": 22087 + }, + { + "epoch": 44.176, + "grad_norm": 1.154815673828125, + "learning_rate": 2e-05, + "loss": 0.04020704, + "step": 22088 + }, + { + "epoch": 44.178, + "grad_norm": 1.233259677886963, + "learning_rate": 2e-05, + "loss": 0.05813593, + "step": 22089 + }, + { + "epoch": 44.18, + "grad_norm": 1.2222100496292114, + "learning_rate": 2e-05, + "loss": 0.04773624, + "step": 22090 + }, + { + "epoch": 44.182, + "grad_norm": 1.3617769479751587, + "learning_rate": 2e-05, + "loss": 0.04983572, + "step": 22091 + }, + { + "epoch": 44.184, + "grad_norm": 1.1668729782104492, + "learning_rate": 2e-05, + "loss": 0.04586762, + "step": 22092 + }, + { + "epoch": 44.186, + "grad_norm": 1.3036890029907227, + "learning_rate": 2e-05, + "loss": 0.05323526, + "step": 22093 + }, + { + "epoch": 44.188, + "grad_norm": 3.2073822021484375, + "learning_rate": 2e-05, + "loss": 0.04346972, + "step": 22094 + }, + { + "epoch": 44.19, + "grad_norm": 1.1499048471450806, + "learning_rate": 2e-05, + 
"loss": 0.05191308, + "step": 22095 + }, + { + "epoch": 44.192, + "grad_norm": 1.2350811958312988, + "learning_rate": 2e-05, + "loss": 0.04665541, + "step": 22096 + }, + { + "epoch": 44.194, + "grad_norm": 1.1341971158981323, + "learning_rate": 2e-05, + "loss": 0.03508368, + "step": 22097 + }, + { + "epoch": 44.196, + "grad_norm": 1.3536897897720337, + "learning_rate": 2e-05, + "loss": 0.04077377, + "step": 22098 + }, + { + "epoch": 44.198, + "grad_norm": 1.0278751850128174, + "learning_rate": 2e-05, + "loss": 0.03572151, + "step": 22099 + }, + { + "epoch": 44.2, + "grad_norm": 1.1718095541000366, + "learning_rate": 2e-05, + "loss": 0.04011806, + "step": 22100 + }, + { + "epoch": 44.202, + "grad_norm": 1.184627652168274, + "learning_rate": 2e-05, + "loss": 0.04278344, + "step": 22101 + }, + { + "epoch": 44.204, + "grad_norm": 1.8265912532806396, + "learning_rate": 2e-05, + "loss": 0.04747808, + "step": 22102 + }, + { + "epoch": 44.206, + "grad_norm": 1.1171393394470215, + "learning_rate": 2e-05, + "loss": 0.04222995, + "step": 22103 + }, + { + "epoch": 44.208, + "grad_norm": 1.0637423992156982, + "learning_rate": 2e-05, + "loss": 0.04015394, + "step": 22104 + }, + { + "epoch": 44.21, + "grad_norm": 1.0722296237945557, + "learning_rate": 2e-05, + "loss": 0.04177207, + "step": 22105 + }, + { + "epoch": 44.212, + "grad_norm": 1.208267092704773, + "learning_rate": 2e-05, + "loss": 0.04094924, + "step": 22106 + }, + { + "epoch": 44.214, + "grad_norm": 2.877411365509033, + "learning_rate": 2e-05, + "loss": 0.05537339, + "step": 22107 + }, + { + "epoch": 44.216, + "grad_norm": 3.210109233856201, + "learning_rate": 2e-05, + "loss": 0.06678712, + "step": 22108 + }, + { + "epoch": 44.218, + "grad_norm": 1.6754549741744995, + "learning_rate": 2e-05, + "loss": 0.05157789, + "step": 22109 + }, + { + "epoch": 44.22, + "grad_norm": 1.1392402648925781, + "learning_rate": 2e-05, + "loss": 0.04350193, + "step": 22110 + }, + { + "epoch": 44.222, + "grad_norm": 1.8385316133499146, + "learning_rate": 2e-05, + "loss": 0.03925511, + "step": 22111 + }, + { + "epoch": 44.224, + "grad_norm": 1.453076958656311, + "learning_rate": 2e-05, + "loss": 0.05029003, + "step": 22112 + }, + { + "epoch": 44.226, + "grad_norm": 1.1068811416625977, + "learning_rate": 2e-05, + "loss": 0.05229628, + "step": 22113 + }, + { + "epoch": 44.228, + "grad_norm": 1.3013540506362915, + "learning_rate": 2e-05, + "loss": 0.05385014, + "step": 22114 + }, + { + "epoch": 44.23, + "grad_norm": 1.258337140083313, + "learning_rate": 2e-05, + "loss": 0.04877231, + "step": 22115 + }, + { + "epoch": 44.232, + "grad_norm": 1.5271419286727905, + "learning_rate": 2e-05, + "loss": 0.0496937, + "step": 22116 + }, + { + "epoch": 44.234, + "grad_norm": 1.1020830869674683, + "learning_rate": 2e-05, + "loss": 0.0386355, + "step": 22117 + }, + { + "epoch": 44.236, + "grad_norm": 0.9935838580131531, + "learning_rate": 2e-05, + "loss": 0.02613112, + "step": 22118 + }, + { + "epoch": 44.238, + "grad_norm": 1.5669888257980347, + "learning_rate": 2e-05, + "loss": 0.05612132, + "step": 22119 + }, + { + "epoch": 44.24, + "grad_norm": 1.1000120639801025, + "learning_rate": 2e-05, + "loss": 0.041154, + "step": 22120 + }, + { + "epoch": 44.242, + "grad_norm": 1.1130436658859253, + "learning_rate": 2e-05, + "loss": 0.03398903, + "step": 22121 + }, + { + "epoch": 44.244, + "grad_norm": 1.3796424865722656, + "learning_rate": 2e-05, + "loss": 0.05808487, + "step": 22122 + }, + { + "epoch": 44.246, + "grad_norm": 1.7627161741256714, + "learning_rate": 2e-05, + "loss": 
0.05050261, + "step": 22123 + }, + { + "epoch": 44.248, + "grad_norm": 1.2039992809295654, + "learning_rate": 2e-05, + "loss": 0.0452828, + "step": 22124 + }, + { + "epoch": 44.25, + "grad_norm": 1.6339401006698608, + "learning_rate": 2e-05, + "loss": 0.07191573, + "step": 22125 + }, + { + "epoch": 44.252, + "grad_norm": 1.870229959487915, + "learning_rate": 2e-05, + "loss": 0.04964413, + "step": 22126 + }, + { + "epoch": 44.254, + "grad_norm": 1.4508622884750366, + "learning_rate": 2e-05, + "loss": 0.05981822, + "step": 22127 + }, + { + "epoch": 44.256, + "grad_norm": 1.3774610757827759, + "learning_rate": 2e-05, + "loss": 0.04445232, + "step": 22128 + }, + { + "epoch": 44.258, + "grad_norm": 1.1424156427383423, + "learning_rate": 2e-05, + "loss": 0.04557105, + "step": 22129 + }, + { + "epoch": 44.26, + "grad_norm": 1.0394034385681152, + "learning_rate": 2e-05, + "loss": 0.0336099, + "step": 22130 + }, + { + "epoch": 44.262, + "grad_norm": 1.2787925004959106, + "learning_rate": 2e-05, + "loss": 0.04919507, + "step": 22131 + }, + { + "epoch": 44.264, + "grad_norm": 0.9578052759170532, + "learning_rate": 2e-05, + "loss": 0.03036173, + "step": 22132 + }, + { + "epoch": 44.266, + "grad_norm": 1.0824671983718872, + "learning_rate": 2e-05, + "loss": 0.04263349, + "step": 22133 + }, + { + "epoch": 44.268, + "grad_norm": 1.273948311805725, + "learning_rate": 2e-05, + "loss": 0.03732668, + "step": 22134 + }, + { + "epoch": 44.27, + "grad_norm": 1.1770116090774536, + "learning_rate": 2e-05, + "loss": 0.04749018, + "step": 22135 + }, + { + "epoch": 44.272, + "grad_norm": 1.3313310146331787, + "learning_rate": 2e-05, + "loss": 0.04976882, + "step": 22136 + }, + { + "epoch": 44.274, + "grad_norm": 2.539590358734131, + "learning_rate": 2e-05, + "loss": 0.04594205, + "step": 22137 + }, + { + "epoch": 44.276, + "grad_norm": 1.6731163263320923, + "learning_rate": 2e-05, + "loss": 0.06411099, + "step": 22138 + }, + { + "epoch": 44.278, + "grad_norm": 1.0700182914733887, + "learning_rate": 2e-05, + "loss": 0.03831624, + "step": 22139 + }, + { + "epoch": 44.28, + "grad_norm": 1.190203309059143, + "learning_rate": 2e-05, + "loss": 0.04036564, + "step": 22140 + }, + { + "epoch": 44.282, + "grad_norm": 3.227332830429077, + "learning_rate": 2e-05, + "loss": 0.06097856, + "step": 22141 + }, + { + "epoch": 44.284, + "grad_norm": 1.274222493171692, + "learning_rate": 2e-05, + "loss": 0.04809301, + "step": 22142 + }, + { + "epoch": 44.286, + "grad_norm": 1.2631959915161133, + "learning_rate": 2e-05, + "loss": 0.0570578, + "step": 22143 + }, + { + "epoch": 44.288, + "grad_norm": 1.4169923067092896, + "learning_rate": 2e-05, + "loss": 0.05213282, + "step": 22144 + }, + { + "epoch": 44.29, + "grad_norm": 1.2868964672088623, + "learning_rate": 2e-05, + "loss": 0.05694466, + "step": 22145 + }, + { + "epoch": 44.292, + "grad_norm": 1.0576223134994507, + "learning_rate": 2e-05, + "loss": 0.04163115, + "step": 22146 + }, + { + "epoch": 44.294, + "grad_norm": 2.8826777935028076, + "learning_rate": 2e-05, + "loss": 0.05488094, + "step": 22147 + }, + { + "epoch": 44.296, + "grad_norm": 1.1898715496063232, + "learning_rate": 2e-05, + "loss": 0.05081913, + "step": 22148 + }, + { + "epoch": 44.298, + "grad_norm": 1.2379268407821655, + "learning_rate": 2e-05, + "loss": 0.05061274, + "step": 22149 + }, + { + "epoch": 44.3, + "grad_norm": 1.355218529701233, + "learning_rate": 2e-05, + "loss": 0.05804717, + "step": 22150 + }, + { + "epoch": 44.302, + "grad_norm": 1.3628958463668823, + "learning_rate": 2e-05, + "loss": 0.04630186, + 
"step": 22151 + }, + { + "epoch": 44.304, + "grad_norm": 1.064475178718567, + "learning_rate": 2e-05, + "loss": 0.0363181, + "step": 22152 + }, + { + "epoch": 44.306, + "grad_norm": 1.3461925983428955, + "learning_rate": 2e-05, + "loss": 0.03945461, + "step": 22153 + }, + { + "epoch": 44.308, + "grad_norm": 1.2384178638458252, + "learning_rate": 2e-05, + "loss": 0.03994835, + "step": 22154 + }, + { + "epoch": 44.31, + "grad_norm": 1.2434017658233643, + "learning_rate": 2e-05, + "loss": 0.05772359, + "step": 22155 + }, + { + "epoch": 44.312, + "grad_norm": 1.719817876815796, + "learning_rate": 2e-05, + "loss": 0.04721334, + "step": 22156 + }, + { + "epoch": 44.314, + "grad_norm": 1.0811158418655396, + "learning_rate": 2e-05, + "loss": 0.05218527, + "step": 22157 + }, + { + "epoch": 44.316, + "grad_norm": 1.283691644668579, + "learning_rate": 2e-05, + "loss": 0.05287713, + "step": 22158 + }, + { + "epoch": 44.318, + "grad_norm": 1.1990177631378174, + "learning_rate": 2e-05, + "loss": 0.04951636, + "step": 22159 + }, + { + "epoch": 44.32, + "grad_norm": 2.536614418029785, + "learning_rate": 2e-05, + "loss": 0.04560195, + "step": 22160 + }, + { + "epoch": 44.322, + "grad_norm": 1.0507843494415283, + "learning_rate": 2e-05, + "loss": 0.03920343, + "step": 22161 + }, + { + "epoch": 44.324, + "grad_norm": 1.4071505069732666, + "learning_rate": 2e-05, + "loss": 0.05998628, + "step": 22162 + }, + { + "epoch": 44.326, + "grad_norm": 1.6779695749282837, + "learning_rate": 2e-05, + "loss": 0.05833928, + "step": 22163 + }, + { + "epoch": 44.328, + "grad_norm": 1.0807090997695923, + "learning_rate": 2e-05, + "loss": 0.04173718, + "step": 22164 + }, + { + "epoch": 44.33, + "grad_norm": 1.7728112936019897, + "learning_rate": 2e-05, + "loss": 0.05153607, + "step": 22165 + }, + { + "epoch": 44.332, + "grad_norm": 1.454960584640503, + "learning_rate": 2e-05, + "loss": 0.05345749, + "step": 22166 + }, + { + "epoch": 44.334, + "grad_norm": 1.146530032157898, + "learning_rate": 2e-05, + "loss": 0.05334022, + "step": 22167 + }, + { + "epoch": 44.336, + "grad_norm": 1.7574084997177124, + "learning_rate": 2e-05, + "loss": 0.03719595, + "step": 22168 + }, + { + "epoch": 44.338, + "grad_norm": 1.4816433191299438, + "learning_rate": 2e-05, + "loss": 0.07536916, + "step": 22169 + }, + { + "epoch": 44.34, + "grad_norm": 1.3355191946029663, + "learning_rate": 2e-05, + "loss": 0.04837946, + "step": 22170 + }, + { + "epoch": 44.342, + "grad_norm": 1.2563939094543457, + "learning_rate": 2e-05, + "loss": 0.04800646, + "step": 22171 + }, + { + "epoch": 44.344, + "grad_norm": 1.0994011163711548, + "learning_rate": 2e-05, + "loss": 0.03690449, + "step": 22172 + }, + { + "epoch": 44.346, + "grad_norm": 1.6743828058242798, + "learning_rate": 2e-05, + "loss": 0.04229587, + "step": 22173 + }, + { + "epoch": 44.348, + "grad_norm": 1.1569823026657104, + "learning_rate": 2e-05, + "loss": 0.06016696, + "step": 22174 + }, + { + "epoch": 44.35, + "grad_norm": 1.8598023653030396, + "learning_rate": 2e-05, + "loss": 0.06339575, + "step": 22175 + }, + { + "epoch": 44.352, + "grad_norm": 1.2611335515975952, + "learning_rate": 2e-05, + "loss": 0.05222043, + "step": 22176 + }, + { + "epoch": 44.354, + "grad_norm": 0.9472651481628418, + "learning_rate": 2e-05, + "loss": 0.03059671, + "step": 22177 + }, + { + "epoch": 44.356, + "grad_norm": 1.521095633506775, + "learning_rate": 2e-05, + "loss": 0.05748924, + "step": 22178 + }, + { + "epoch": 44.358, + "grad_norm": 1.19306218624115, + "learning_rate": 2e-05, + "loss": 0.04892451, + "step": 22179 
+ }, + { + "epoch": 44.36, + "grad_norm": 1.12234628200531, + "learning_rate": 2e-05, + "loss": 0.0413163, + "step": 22180 + }, + { + "epoch": 44.362, + "grad_norm": 1.408831000328064, + "learning_rate": 2e-05, + "loss": 0.04378711, + "step": 22181 + }, + { + "epoch": 44.364, + "grad_norm": 1.1348880529403687, + "learning_rate": 2e-05, + "loss": 0.04664857, + "step": 22182 + }, + { + "epoch": 44.366, + "grad_norm": 1.254754662513733, + "learning_rate": 2e-05, + "loss": 0.03291727, + "step": 22183 + }, + { + "epoch": 44.368, + "grad_norm": 2.050689458847046, + "learning_rate": 2e-05, + "loss": 0.06108903, + "step": 22184 + }, + { + "epoch": 44.37, + "grad_norm": 1.1247540712356567, + "learning_rate": 2e-05, + "loss": 0.04102144, + "step": 22185 + }, + { + "epoch": 44.372, + "grad_norm": 1.2489968538284302, + "learning_rate": 2e-05, + "loss": 0.04336529, + "step": 22186 + }, + { + "epoch": 44.374, + "grad_norm": 1.2883358001708984, + "learning_rate": 2e-05, + "loss": 0.04382469, + "step": 22187 + }, + { + "epoch": 44.376, + "grad_norm": 1.5653047561645508, + "learning_rate": 2e-05, + "loss": 0.05455358, + "step": 22188 + }, + { + "epoch": 44.378, + "grad_norm": 1.1407490968704224, + "learning_rate": 2e-05, + "loss": 0.04385475, + "step": 22189 + }, + { + "epoch": 44.38, + "grad_norm": 0.9423237442970276, + "learning_rate": 2e-05, + "loss": 0.03583224, + "step": 22190 + }, + { + "epoch": 44.382, + "grad_norm": 1.5692898035049438, + "learning_rate": 2e-05, + "loss": 0.06093635, + "step": 22191 + }, + { + "epoch": 44.384, + "grad_norm": 1.2798019647598267, + "learning_rate": 2e-05, + "loss": 0.04287341, + "step": 22192 + }, + { + "epoch": 44.386, + "grad_norm": 2.308946132659912, + "learning_rate": 2e-05, + "loss": 0.0538884, + "step": 22193 + }, + { + "epoch": 44.388, + "grad_norm": 1.9377248287200928, + "learning_rate": 2e-05, + "loss": 0.06390283, + "step": 22194 + }, + { + "epoch": 44.39, + "grad_norm": 1.157041072845459, + "learning_rate": 2e-05, + "loss": 0.04512091, + "step": 22195 + }, + { + "epoch": 44.392, + "grad_norm": 1.2009409666061401, + "learning_rate": 2e-05, + "loss": 0.04480921, + "step": 22196 + }, + { + "epoch": 44.394, + "grad_norm": 1.0632206201553345, + "learning_rate": 2e-05, + "loss": 0.04720378, + "step": 22197 + }, + { + "epoch": 44.396, + "grad_norm": 2.500654935836792, + "learning_rate": 2e-05, + "loss": 0.07204752, + "step": 22198 + }, + { + "epoch": 44.398, + "grad_norm": 2.257668972015381, + "learning_rate": 2e-05, + "loss": 0.06277246, + "step": 22199 + }, + { + "epoch": 44.4, + "grad_norm": 1.4456931352615356, + "learning_rate": 2e-05, + "loss": 0.06364234, + "step": 22200 + }, + { + "epoch": 44.402, + "grad_norm": 1.1703506708145142, + "learning_rate": 2e-05, + "loss": 0.04526365, + "step": 22201 + }, + { + "epoch": 44.404, + "grad_norm": 0.9013957381248474, + "learning_rate": 2e-05, + "loss": 0.02716382, + "step": 22202 + }, + { + "epoch": 44.406, + "grad_norm": 1.0054999589920044, + "learning_rate": 2e-05, + "loss": 0.04023641, + "step": 22203 + }, + { + "epoch": 44.408, + "grad_norm": 1.7909125089645386, + "learning_rate": 2e-05, + "loss": 0.05618212, + "step": 22204 + }, + { + "epoch": 44.41, + "grad_norm": 1.3415671586990356, + "learning_rate": 2e-05, + "loss": 0.04464972, + "step": 22205 + }, + { + "epoch": 44.412, + "grad_norm": 1.8720018863677979, + "learning_rate": 2e-05, + "loss": 0.06667548, + "step": 22206 + }, + { + "epoch": 44.414, + "grad_norm": 1.0374834537506104, + "learning_rate": 2e-05, + "loss": 0.03846564, + "step": 22207 + }, + { + 
"epoch": 44.416, + "grad_norm": 1.2723255157470703, + "learning_rate": 2e-05, + "loss": 0.05234695, + "step": 22208 + }, + { + "epoch": 44.418, + "grad_norm": 1.134573221206665, + "learning_rate": 2e-05, + "loss": 0.04465498, + "step": 22209 + }, + { + "epoch": 44.42, + "grad_norm": 1.5191528797149658, + "learning_rate": 2e-05, + "loss": 0.0526737, + "step": 22210 + }, + { + "epoch": 44.422, + "grad_norm": 1.506665825843811, + "learning_rate": 2e-05, + "loss": 0.06554738, + "step": 22211 + }, + { + "epoch": 44.424, + "grad_norm": 1.1496007442474365, + "learning_rate": 2e-05, + "loss": 0.05143321, + "step": 22212 + }, + { + "epoch": 44.426, + "grad_norm": 1.3427084684371948, + "learning_rate": 2e-05, + "loss": 0.05444918, + "step": 22213 + }, + { + "epoch": 44.428, + "grad_norm": 1.3482060432434082, + "learning_rate": 2e-05, + "loss": 0.04946334, + "step": 22214 + }, + { + "epoch": 44.43, + "grad_norm": 2.4248390197753906, + "learning_rate": 2e-05, + "loss": 0.06155992, + "step": 22215 + }, + { + "epoch": 44.432, + "grad_norm": 1.5861310958862305, + "learning_rate": 2e-05, + "loss": 0.0607713, + "step": 22216 + }, + { + "epoch": 44.434, + "grad_norm": 1.7854304313659668, + "learning_rate": 2e-05, + "loss": 0.05506391, + "step": 22217 + }, + { + "epoch": 44.436, + "grad_norm": 1.5562728643417358, + "learning_rate": 2e-05, + "loss": 0.05874497, + "step": 22218 + }, + { + "epoch": 44.438, + "grad_norm": 1.5572162866592407, + "learning_rate": 2e-05, + "loss": 0.04674251, + "step": 22219 + }, + { + "epoch": 44.44, + "grad_norm": 2.5241782665252686, + "learning_rate": 2e-05, + "loss": 0.05468413, + "step": 22220 + }, + { + "epoch": 44.442, + "grad_norm": 1.4192159175872803, + "learning_rate": 2e-05, + "loss": 0.05214172, + "step": 22221 + }, + { + "epoch": 44.444, + "grad_norm": 1.1719669103622437, + "learning_rate": 2e-05, + "loss": 0.0375664, + "step": 22222 + }, + { + "epoch": 44.446, + "grad_norm": 1.4639357328414917, + "learning_rate": 2e-05, + "loss": 0.06297034, + "step": 22223 + }, + { + "epoch": 44.448, + "grad_norm": 1.3242526054382324, + "learning_rate": 2e-05, + "loss": 0.05253812, + "step": 22224 + }, + { + "epoch": 44.45, + "grad_norm": 1.2953046560287476, + "learning_rate": 2e-05, + "loss": 0.05739931, + "step": 22225 + }, + { + "epoch": 44.452, + "grad_norm": 2.824225425720215, + "learning_rate": 2e-05, + "loss": 0.05475694, + "step": 22226 + }, + { + "epoch": 44.454, + "grad_norm": 2.2453224658966064, + "learning_rate": 2e-05, + "loss": 0.04598641, + "step": 22227 + }, + { + "epoch": 44.456, + "grad_norm": 1.2624878883361816, + "learning_rate": 2e-05, + "loss": 0.05295527, + "step": 22228 + }, + { + "epoch": 44.458, + "grad_norm": 1.4258654117584229, + "learning_rate": 2e-05, + "loss": 0.05860892, + "step": 22229 + }, + { + "epoch": 44.46, + "grad_norm": 1.2828129529953003, + "learning_rate": 2e-05, + "loss": 0.04799199, + "step": 22230 + }, + { + "epoch": 44.462, + "grad_norm": 1.1467986106872559, + "learning_rate": 2e-05, + "loss": 0.05302534, + "step": 22231 + }, + { + "epoch": 44.464, + "grad_norm": 1.229738473892212, + "learning_rate": 2e-05, + "loss": 0.04447974, + "step": 22232 + }, + { + "epoch": 44.466, + "grad_norm": 1.252163052558899, + "learning_rate": 2e-05, + "loss": 0.04261618, + "step": 22233 + }, + { + "epoch": 44.468, + "grad_norm": 1.4980531930923462, + "learning_rate": 2e-05, + "loss": 0.04990222, + "step": 22234 + }, + { + "epoch": 44.47, + "grad_norm": 0.8647037148475647, + "learning_rate": 2e-05, + "loss": 0.03477815, + "step": 22235 + }, + { + "epoch": 
44.472, + "grad_norm": 1.2490501403808594, + "learning_rate": 2e-05, + "loss": 0.05593645, + "step": 22236 + }, + { + "epoch": 44.474, + "grad_norm": 2.7238969802856445, + "learning_rate": 2e-05, + "loss": 0.07153691, + "step": 22237 + }, + { + "epoch": 44.476, + "grad_norm": 1.0918712615966797, + "learning_rate": 2e-05, + "loss": 0.03975195, + "step": 22238 + }, + { + "epoch": 44.478, + "grad_norm": 1.2695947885513306, + "learning_rate": 2e-05, + "loss": 0.0505439, + "step": 22239 + }, + { + "epoch": 44.48, + "grad_norm": 1.305942177772522, + "learning_rate": 2e-05, + "loss": 0.05622169, + "step": 22240 + }, + { + "epoch": 44.482, + "grad_norm": 1.5338377952575684, + "learning_rate": 2e-05, + "loss": 0.06709421, + "step": 22241 + }, + { + "epoch": 44.484, + "grad_norm": 1.2748602628707886, + "learning_rate": 2e-05, + "loss": 0.05205786, + "step": 22242 + }, + { + "epoch": 44.486, + "grad_norm": 1.1404742002487183, + "learning_rate": 2e-05, + "loss": 0.04140478, + "step": 22243 + }, + { + "epoch": 44.488, + "grad_norm": 1.2886909246444702, + "learning_rate": 2e-05, + "loss": 0.05009973, + "step": 22244 + }, + { + "epoch": 44.49, + "grad_norm": 1.3291938304901123, + "learning_rate": 2e-05, + "loss": 0.03881418, + "step": 22245 + }, + { + "epoch": 44.492, + "grad_norm": 1.743362307548523, + "learning_rate": 2e-05, + "loss": 0.06562784, + "step": 22246 + }, + { + "epoch": 44.494, + "grad_norm": 1.1744399070739746, + "learning_rate": 2e-05, + "loss": 0.03672204, + "step": 22247 + }, + { + "epoch": 44.496, + "grad_norm": 1.9760868549346924, + "learning_rate": 2e-05, + "loss": 0.04297481, + "step": 22248 + }, + { + "epoch": 44.498, + "grad_norm": 1.2893195152282715, + "learning_rate": 2e-05, + "loss": 0.0419421, + "step": 22249 + }, + { + "epoch": 44.5, + "grad_norm": 1.3587701320648193, + "learning_rate": 2e-05, + "loss": 0.03800445, + "step": 22250 + }, + { + "epoch": 44.502, + "grad_norm": 1.7126094102859497, + "learning_rate": 2e-05, + "loss": 0.04739122, + "step": 22251 + }, + { + "epoch": 44.504, + "grad_norm": 1.3062655925750732, + "learning_rate": 2e-05, + "loss": 0.0504301, + "step": 22252 + }, + { + "epoch": 44.506, + "grad_norm": 1.2278339862823486, + "learning_rate": 2e-05, + "loss": 0.05388809, + "step": 22253 + }, + { + "epoch": 44.508, + "grad_norm": 1.3173941373825073, + "learning_rate": 2e-05, + "loss": 0.03916169, + "step": 22254 + }, + { + "epoch": 44.51, + "grad_norm": 1.1290494203567505, + "learning_rate": 2e-05, + "loss": 0.04224993, + "step": 22255 + }, + { + "epoch": 44.512, + "grad_norm": 1.289548397064209, + "learning_rate": 2e-05, + "loss": 0.05193936, + "step": 22256 + }, + { + "epoch": 44.514, + "grad_norm": 1.2119817733764648, + "learning_rate": 2e-05, + "loss": 0.04464715, + "step": 22257 + }, + { + "epoch": 44.516, + "grad_norm": 1.1168314218521118, + "learning_rate": 2e-05, + "loss": 0.05150532, + "step": 22258 + }, + { + "epoch": 44.518, + "grad_norm": 1.120448112487793, + "learning_rate": 2e-05, + "loss": 0.04792247, + "step": 22259 + }, + { + "epoch": 44.52, + "grad_norm": 1.3556797504425049, + "learning_rate": 2e-05, + "loss": 0.05941795, + "step": 22260 + }, + { + "epoch": 44.522, + "grad_norm": 1.177809715270996, + "learning_rate": 2e-05, + "loss": 0.04000936, + "step": 22261 + }, + { + "epoch": 44.524, + "grad_norm": 1.567018747329712, + "learning_rate": 2e-05, + "loss": 0.04125568, + "step": 22262 + }, + { + "epoch": 44.526, + "grad_norm": 1.170555830001831, + "learning_rate": 2e-05, + "loss": 0.03684732, + "step": 22263 + }, + { + "epoch": 44.528, + 
"grad_norm": 1.2405741214752197, + "learning_rate": 2e-05, + "loss": 0.04239763, + "step": 22264 + }, + { + "epoch": 44.53, + "grad_norm": 1.0276159048080444, + "learning_rate": 2e-05, + "loss": 0.0344376, + "step": 22265 + }, + { + "epoch": 44.532, + "grad_norm": 1.2948598861694336, + "learning_rate": 2e-05, + "loss": 0.05724182, + "step": 22266 + }, + { + "epoch": 44.534, + "grad_norm": 1.2418267726898193, + "learning_rate": 2e-05, + "loss": 0.05686314, + "step": 22267 + }, + { + "epoch": 44.536, + "grad_norm": 1.2589337825775146, + "learning_rate": 2e-05, + "loss": 0.04628392, + "step": 22268 + }, + { + "epoch": 44.538, + "grad_norm": 1.343007206916809, + "learning_rate": 2e-05, + "loss": 0.06061236, + "step": 22269 + }, + { + "epoch": 44.54, + "grad_norm": 1.2776858806610107, + "learning_rate": 2e-05, + "loss": 0.0425747, + "step": 22270 + }, + { + "epoch": 44.542, + "grad_norm": 1.1150517463684082, + "learning_rate": 2e-05, + "loss": 0.03600602, + "step": 22271 + }, + { + "epoch": 44.544, + "grad_norm": 1.9361085891723633, + "learning_rate": 2e-05, + "loss": 0.04623732, + "step": 22272 + }, + { + "epoch": 44.546, + "grad_norm": 1.4865039587020874, + "learning_rate": 2e-05, + "loss": 0.04633596, + "step": 22273 + }, + { + "epoch": 44.548, + "grad_norm": 1.1509850025177002, + "learning_rate": 2e-05, + "loss": 0.03759123, + "step": 22274 + }, + { + "epoch": 44.55, + "grad_norm": 1.38397216796875, + "learning_rate": 2e-05, + "loss": 0.05729149, + "step": 22275 + }, + { + "epoch": 44.552, + "grad_norm": 1.1179465055465698, + "learning_rate": 2e-05, + "loss": 0.03808307, + "step": 22276 + }, + { + "epoch": 44.554, + "grad_norm": 2.435750961303711, + "learning_rate": 2e-05, + "loss": 0.02759157, + "step": 22277 + }, + { + "epoch": 44.556, + "grad_norm": 1.195075273513794, + "learning_rate": 2e-05, + "loss": 0.03713936, + "step": 22278 + }, + { + "epoch": 44.558, + "grad_norm": 1.1682522296905518, + "learning_rate": 2e-05, + "loss": 0.04102589, + "step": 22279 + }, + { + "epoch": 44.56, + "grad_norm": 1.4527888298034668, + "learning_rate": 2e-05, + "loss": 0.06393797, + "step": 22280 + }, + { + "epoch": 44.562, + "grad_norm": 1.5062603950500488, + "learning_rate": 2e-05, + "loss": 0.07412426, + "step": 22281 + }, + { + "epoch": 44.564, + "grad_norm": 1.2519084215164185, + "learning_rate": 2e-05, + "loss": 0.04865194, + "step": 22282 + }, + { + "epoch": 44.566, + "grad_norm": 1.912338137626648, + "learning_rate": 2e-05, + "loss": 0.0646146, + "step": 22283 + }, + { + "epoch": 44.568, + "grad_norm": 1.1479963064193726, + "learning_rate": 2e-05, + "loss": 0.04428418, + "step": 22284 + }, + { + "epoch": 44.57, + "grad_norm": 1.3839733600616455, + "learning_rate": 2e-05, + "loss": 0.03918359, + "step": 22285 + }, + { + "epoch": 44.572, + "grad_norm": 1.479257345199585, + "learning_rate": 2e-05, + "loss": 0.06053314, + "step": 22286 + }, + { + "epoch": 44.574, + "grad_norm": 2.5502092838287354, + "learning_rate": 2e-05, + "loss": 0.07071359, + "step": 22287 + }, + { + "epoch": 44.576, + "grad_norm": 1.1668108701705933, + "learning_rate": 2e-05, + "loss": 0.02986339, + "step": 22288 + }, + { + "epoch": 44.578, + "grad_norm": 1.1986876726150513, + "learning_rate": 2e-05, + "loss": 0.05737267, + "step": 22289 + }, + { + "epoch": 44.58, + "grad_norm": 1.2797632217407227, + "learning_rate": 2e-05, + "loss": 0.05133417, + "step": 22290 + }, + { + "epoch": 44.582, + "grad_norm": 1.1211811304092407, + "learning_rate": 2e-05, + "loss": 0.04330084, + "step": 22291 + }, + { + "epoch": 44.584, + "grad_norm": 
1.906973123550415, + "learning_rate": 2e-05, + "loss": 0.05037066, + "step": 22292 + }, + { + "epoch": 44.586, + "grad_norm": 1.128289818763733, + "learning_rate": 2e-05, + "loss": 0.03821924, + "step": 22293 + }, + { + "epoch": 44.588, + "grad_norm": 2.022193670272827, + "learning_rate": 2e-05, + "loss": 0.06020941, + "step": 22294 + }, + { + "epoch": 44.59, + "grad_norm": 1.26398503780365, + "learning_rate": 2e-05, + "loss": 0.04926804, + "step": 22295 + }, + { + "epoch": 44.592, + "grad_norm": 1.2855454683303833, + "learning_rate": 2e-05, + "loss": 0.05323242, + "step": 22296 + }, + { + "epoch": 44.594, + "grad_norm": 1.0625572204589844, + "learning_rate": 2e-05, + "loss": 0.03464797, + "step": 22297 + }, + { + "epoch": 44.596, + "grad_norm": 1.904098391532898, + "learning_rate": 2e-05, + "loss": 0.07098804, + "step": 22298 + }, + { + "epoch": 44.598, + "grad_norm": 1.219960331916809, + "learning_rate": 2e-05, + "loss": 0.04161097, + "step": 22299 + }, + { + "epoch": 44.6, + "grad_norm": 1.6203744411468506, + "learning_rate": 2e-05, + "loss": 0.0535806, + "step": 22300 + }, + { + "epoch": 44.602, + "grad_norm": 1.138048529624939, + "learning_rate": 2e-05, + "loss": 0.04342931, + "step": 22301 + }, + { + "epoch": 44.604, + "grad_norm": 1.2476096153259277, + "learning_rate": 2e-05, + "loss": 0.0449223, + "step": 22302 + }, + { + "epoch": 44.606, + "grad_norm": 2.6473846435546875, + "learning_rate": 2e-05, + "loss": 0.04433635, + "step": 22303 + }, + { + "epoch": 44.608, + "grad_norm": 1.2410426139831543, + "learning_rate": 2e-05, + "loss": 0.05194189, + "step": 22304 + }, + { + "epoch": 44.61, + "grad_norm": 1.4751123189926147, + "learning_rate": 2e-05, + "loss": 0.04747447, + "step": 22305 + }, + { + "epoch": 44.612, + "grad_norm": 1.1794182062149048, + "learning_rate": 2e-05, + "loss": 0.04860191, + "step": 22306 + }, + { + "epoch": 44.614, + "grad_norm": 0.9958474636077881, + "learning_rate": 2e-05, + "loss": 0.03155885, + "step": 22307 + }, + { + "epoch": 44.616, + "grad_norm": 1.1580699682235718, + "learning_rate": 2e-05, + "loss": 0.0375239, + "step": 22308 + }, + { + "epoch": 44.618, + "grad_norm": 1.1620581150054932, + "learning_rate": 2e-05, + "loss": 0.05345889, + "step": 22309 + }, + { + "epoch": 44.62, + "grad_norm": 1.1996610164642334, + "learning_rate": 2e-05, + "loss": 0.05827244, + "step": 22310 + }, + { + "epoch": 44.622, + "grad_norm": 1.862815260887146, + "learning_rate": 2e-05, + "loss": 0.04650091, + "step": 22311 + }, + { + "epoch": 44.624, + "grad_norm": 1.2667579650878906, + "learning_rate": 2e-05, + "loss": 0.0441907, + "step": 22312 + }, + { + "epoch": 44.626, + "grad_norm": 1.3493322134017944, + "learning_rate": 2e-05, + "loss": 0.05513781, + "step": 22313 + }, + { + "epoch": 44.628, + "grad_norm": 1.2868801355361938, + "learning_rate": 2e-05, + "loss": 0.05301888, + "step": 22314 + }, + { + "epoch": 44.63, + "grad_norm": 1.0114258527755737, + "learning_rate": 2e-05, + "loss": 0.04150274, + "step": 22315 + }, + { + "epoch": 44.632, + "grad_norm": 1.1847352981567383, + "learning_rate": 2e-05, + "loss": 0.04998568, + "step": 22316 + }, + { + "epoch": 44.634, + "grad_norm": 1.4043679237365723, + "learning_rate": 2e-05, + "loss": 0.04988774, + "step": 22317 + }, + { + "epoch": 44.636, + "grad_norm": 2.041414260864258, + "learning_rate": 2e-05, + "loss": 0.05052672, + "step": 22318 + }, + { + "epoch": 44.638, + "grad_norm": 2.7315549850463867, + "learning_rate": 2e-05, + "loss": 0.05014642, + "step": 22319 + }, + { + "epoch": 44.64, + "grad_norm": 1.9132074117660522, 
+ "learning_rate": 2e-05, + "loss": 0.04718821, + "step": 22320 + }, + { + "epoch": 44.642, + "grad_norm": 1.4658045768737793, + "learning_rate": 2e-05, + "loss": 0.04350495, + "step": 22321 + }, + { + "epoch": 44.644, + "grad_norm": 1.4384865760803223, + "learning_rate": 2e-05, + "loss": 0.04404975, + "step": 22322 + }, + { + "epoch": 44.646, + "grad_norm": 1.4497653245925903, + "learning_rate": 2e-05, + "loss": 0.03452429, + "step": 22323 + }, + { + "epoch": 44.648, + "grad_norm": 1.3256078958511353, + "learning_rate": 2e-05, + "loss": 0.040176, + "step": 22324 + }, + { + "epoch": 44.65, + "grad_norm": 1.1343733072280884, + "learning_rate": 2e-05, + "loss": 0.03543989, + "step": 22325 + }, + { + "epoch": 44.652, + "grad_norm": 1.8382833003997803, + "learning_rate": 2e-05, + "loss": 0.05061209, + "step": 22326 + }, + { + "epoch": 44.654, + "grad_norm": 1.3723702430725098, + "learning_rate": 2e-05, + "loss": 0.04715881, + "step": 22327 + }, + { + "epoch": 44.656, + "grad_norm": 1.6342358589172363, + "learning_rate": 2e-05, + "loss": 0.05269512, + "step": 22328 + }, + { + "epoch": 44.658, + "grad_norm": 1.3743045330047607, + "learning_rate": 2e-05, + "loss": 0.04504273, + "step": 22329 + }, + { + "epoch": 44.66, + "grad_norm": 3.2658958435058594, + "learning_rate": 2e-05, + "loss": 0.05143912, + "step": 22330 + }, + { + "epoch": 44.662, + "grad_norm": 2.2157278060913086, + "learning_rate": 2e-05, + "loss": 0.05146747, + "step": 22331 + }, + { + "epoch": 44.664, + "grad_norm": 1.1454540491104126, + "learning_rate": 2e-05, + "loss": 0.04466172, + "step": 22332 + }, + { + "epoch": 44.666, + "grad_norm": 1.2237814664840698, + "learning_rate": 2e-05, + "loss": 0.05431961, + "step": 22333 + }, + { + "epoch": 44.668, + "grad_norm": 1.3983421325683594, + "learning_rate": 2e-05, + "loss": 0.03210323, + "step": 22334 + }, + { + "epoch": 44.67, + "grad_norm": 1.5525994300842285, + "learning_rate": 2e-05, + "loss": 0.06114915, + "step": 22335 + }, + { + "epoch": 44.672, + "grad_norm": 1.054101586341858, + "learning_rate": 2e-05, + "loss": 0.04592917, + "step": 22336 + }, + { + "epoch": 44.674, + "grad_norm": 1.3390069007873535, + "learning_rate": 2e-05, + "loss": 0.04546981, + "step": 22337 + }, + { + "epoch": 44.676, + "grad_norm": 1.3477840423583984, + "learning_rate": 2e-05, + "loss": 0.05513988, + "step": 22338 + }, + { + "epoch": 44.678, + "grad_norm": 1.9911638498306274, + "learning_rate": 2e-05, + "loss": 0.04411037, + "step": 22339 + }, + { + "epoch": 44.68, + "grad_norm": 1.5710328817367554, + "learning_rate": 2e-05, + "loss": 0.0609922, + "step": 22340 + }, + { + "epoch": 44.682, + "grad_norm": 1.218172550201416, + "learning_rate": 2e-05, + "loss": 0.05370003, + "step": 22341 + }, + { + "epoch": 44.684, + "grad_norm": 1.1348539590835571, + "learning_rate": 2e-05, + "loss": 0.0526722, + "step": 22342 + }, + { + "epoch": 44.686, + "grad_norm": 1.610276460647583, + "learning_rate": 2e-05, + "loss": 0.070089, + "step": 22343 + }, + { + "epoch": 44.688, + "grad_norm": 1.2752341032028198, + "learning_rate": 2e-05, + "loss": 0.04979321, + "step": 22344 + }, + { + "epoch": 44.69, + "grad_norm": 1.252686619758606, + "learning_rate": 2e-05, + "loss": 0.04036684, + "step": 22345 + }, + { + "epoch": 44.692, + "grad_norm": 1.023863673210144, + "learning_rate": 2e-05, + "loss": 0.02940369, + "step": 22346 + }, + { + "epoch": 44.694, + "grad_norm": 0.9316999316215515, + "learning_rate": 2e-05, + "loss": 0.03571773, + "step": 22347 + }, + { + "epoch": 44.696, + "grad_norm": 1.4448881149291992, + 
"learning_rate": 2e-05, + "loss": 0.06954427, + "step": 22348 + }, + { + "epoch": 44.698, + "grad_norm": 1.6992919445037842, + "learning_rate": 2e-05, + "loss": 0.04004553, + "step": 22349 + }, + { + "epoch": 44.7, + "grad_norm": 1.3580870628356934, + "learning_rate": 2e-05, + "loss": 0.04999983, + "step": 22350 + }, + { + "epoch": 44.702, + "grad_norm": 1.1199650764465332, + "learning_rate": 2e-05, + "loss": 0.04584752, + "step": 22351 + }, + { + "epoch": 44.704, + "grad_norm": 1.46734619140625, + "learning_rate": 2e-05, + "loss": 0.04816905, + "step": 22352 + }, + { + "epoch": 44.706, + "grad_norm": 1.2117671966552734, + "learning_rate": 2e-05, + "loss": 0.03972312, + "step": 22353 + }, + { + "epoch": 44.708, + "grad_norm": 1.2888193130493164, + "learning_rate": 2e-05, + "loss": 0.03945049, + "step": 22354 + }, + { + "epoch": 44.71, + "grad_norm": 1.204562783241272, + "learning_rate": 2e-05, + "loss": 0.04760649, + "step": 22355 + }, + { + "epoch": 44.712, + "grad_norm": 1.2271558046340942, + "learning_rate": 2e-05, + "loss": 0.04851292, + "step": 22356 + }, + { + "epoch": 44.714, + "grad_norm": 1.2417253255844116, + "learning_rate": 2e-05, + "loss": 0.05184454, + "step": 22357 + }, + { + "epoch": 44.716, + "grad_norm": 1.2915658950805664, + "learning_rate": 2e-05, + "loss": 0.0653154, + "step": 22358 + }, + { + "epoch": 44.718, + "grad_norm": 1.126898169517517, + "learning_rate": 2e-05, + "loss": 0.04059651, + "step": 22359 + }, + { + "epoch": 44.72, + "grad_norm": 1.2646050453186035, + "learning_rate": 2e-05, + "loss": 0.0460304, + "step": 22360 + }, + { + "epoch": 44.722, + "grad_norm": 1.2595983743667603, + "learning_rate": 2e-05, + "loss": 0.0446999, + "step": 22361 + }, + { + "epoch": 44.724, + "grad_norm": 1.0106483697891235, + "learning_rate": 2e-05, + "loss": 0.03425622, + "step": 22362 + }, + { + "epoch": 44.726, + "grad_norm": 1.0983266830444336, + "learning_rate": 2e-05, + "loss": 0.04440197, + "step": 22363 + }, + { + "epoch": 44.728, + "grad_norm": 1.0528661012649536, + "learning_rate": 2e-05, + "loss": 0.03264795, + "step": 22364 + }, + { + "epoch": 44.73, + "grad_norm": 1.485556721687317, + "learning_rate": 2e-05, + "loss": 0.04851057, + "step": 22365 + }, + { + "epoch": 44.732, + "grad_norm": 1.0484004020690918, + "learning_rate": 2e-05, + "loss": 0.03785881, + "step": 22366 + }, + { + "epoch": 44.734, + "grad_norm": 1.0544923543930054, + "learning_rate": 2e-05, + "loss": 0.04710227, + "step": 22367 + }, + { + "epoch": 44.736, + "grad_norm": 1.6099900007247925, + "learning_rate": 2e-05, + "loss": 0.0503059, + "step": 22368 + }, + { + "epoch": 44.738, + "grad_norm": 1.0800563097000122, + "learning_rate": 2e-05, + "loss": 0.03581601, + "step": 22369 + }, + { + "epoch": 44.74, + "grad_norm": 1.2417271137237549, + "learning_rate": 2e-05, + "loss": 0.05445302, + "step": 22370 + }, + { + "epoch": 44.742, + "grad_norm": 1.3776066303253174, + "learning_rate": 2e-05, + "loss": 0.05917332, + "step": 22371 + }, + { + "epoch": 44.744, + "grad_norm": 6.051969051361084, + "learning_rate": 2e-05, + "loss": 0.03186266, + "step": 22372 + }, + { + "epoch": 44.746, + "grad_norm": 1.193009614944458, + "learning_rate": 2e-05, + "loss": 0.04203957, + "step": 22373 + }, + { + "epoch": 44.748, + "grad_norm": 1.1934806108474731, + "learning_rate": 2e-05, + "loss": 0.05293836, + "step": 22374 + }, + { + "epoch": 44.75, + "grad_norm": 1.2843340635299683, + "learning_rate": 2e-05, + "loss": 0.05309843, + "step": 22375 + }, + { + "epoch": 44.752, + "grad_norm": 1.2187143564224243, + "learning_rate": 
2e-05, + "loss": 0.05113508, + "step": 22376 + }, + { + "epoch": 44.754, + "grad_norm": 1.5703728199005127, + "learning_rate": 2e-05, + "loss": 0.06165824, + "step": 22377 + }, + { + "epoch": 44.756, + "grad_norm": 1.9462801218032837, + "learning_rate": 2e-05, + "loss": 0.06039472, + "step": 22378 + }, + { + "epoch": 44.758, + "grad_norm": 1.2913825511932373, + "learning_rate": 2e-05, + "loss": 0.04901668, + "step": 22379 + }, + { + "epoch": 44.76, + "grad_norm": 1.484749436378479, + "learning_rate": 2e-05, + "loss": 0.05032424, + "step": 22380 + }, + { + "epoch": 44.762, + "grad_norm": 1.2912980318069458, + "learning_rate": 2e-05, + "loss": 0.05285494, + "step": 22381 + }, + { + "epoch": 44.764, + "grad_norm": 1.471806526184082, + "learning_rate": 2e-05, + "loss": 0.05935736, + "step": 22382 + }, + { + "epoch": 44.766, + "grad_norm": 1.4022643566131592, + "learning_rate": 2e-05, + "loss": 0.06408676, + "step": 22383 + }, + { + "epoch": 44.768, + "grad_norm": 1.3989965915679932, + "learning_rate": 2e-05, + "loss": 0.0560586, + "step": 22384 + }, + { + "epoch": 44.77, + "grad_norm": 1.7269079685211182, + "learning_rate": 2e-05, + "loss": 0.0490585, + "step": 22385 + }, + { + "epoch": 44.772, + "grad_norm": 2.0063679218292236, + "learning_rate": 2e-05, + "loss": 0.06107655, + "step": 22386 + }, + { + "epoch": 44.774, + "grad_norm": 1.0884474515914917, + "learning_rate": 2e-05, + "loss": 0.04211147, + "step": 22387 + }, + { + "epoch": 44.776, + "grad_norm": 1.1166996955871582, + "learning_rate": 2e-05, + "loss": 0.04605059, + "step": 22388 + }, + { + "epoch": 44.778, + "grad_norm": 1.3494291305541992, + "learning_rate": 2e-05, + "loss": 0.05825819, + "step": 22389 + }, + { + "epoch": 44.78, + "grad_norm": 1.4325668811798096, + "learning_rate": 2e-05, + "loss": 0.0605996, + "step": 22390 + }, + { + "epoch": 44.782, + "grad_norm": 1.1748969554901123, + "learning_rate": 2e-05, + "loss": 0.03148333, + "step": 22391 + }, + { + "epoch": 44.784, + "grad_norm": 1.8170502185821533, + "learning_rate": 2e-05, + "loss": 0.06581346, + "step": 22392 + }, + { + "epoch": 44.786, + "grad_norm": 1.3329365253448486, + "learning_rate": 2e-05, + "loss": 0.05404956, + "step": 22393 + }, + { + "epoch": 44.788, + "grad_norm": 1.3301454782485962, + "learning_rate": 2e-05, + "loss": 0.04755008, + "step": 22394 + }, + { + "epoch": 44.79, + "grad_norm": 1.2659374475479126, + "learning_rate": 2e-05, + "loss": 0.04017854, + "step": 22395 + }, + { + "epoch": 44.792, + "grad_norm": 1.4887542724609375, + "learning_rate": 2e-05, + "loss": 0.04862245, + "step": 22396 + }, + { + "epoch": 44.794, + "grad_norm": 1.8280566930770874, + "learning_rate": 2e-05, + "loss": 0.05217675, + "step": 22397 + }, + { + "epoch": 44.796, + "grad_norm": 1.307554841041565, + "learning_rate": 2e-05, + "loss": 0.051239, + "step": 22398 + }, + { + "epoch": 44.798, + "grad_norm": 1.391467571258545, + "learning_rate": 2e-05, + "loss": 0.041818, + "step": 22399 + }, + { + "epoch": 44.8, + "grad_norm": 1.2929948568344116, + "learning_rate": 2e-05, + "loss": 0.03885982, + "step": 22400 + }, + { + "epoch": 44.802, + "grad_norm": 0.8741863965988159, + "learning_rate": 2e-05, + "loss": 0.02871921, + "step": 22401 + }, + { + "epoch": 44.804, + "grad_norm": 1.1634330749511719, + "learning_rate": 2e-05, + "loss": 0.05327924, + "step": 22402 + }, + { + "epoch": 44.806, + "grad_norm": 1.2257028818130493, + "learning_rate": 2e-05, + "loss": 0.04897387, + "step": 22403 + }, + { + "epoch": 44.808, + "grad_norm": 1.1557226181030273, + "learning_rate": 2e-05, + 
"loss": 0.04159442, + "step": 22404 + }, + { + "epoch": 44.81, + "grad_norm": 2.137556791305542, + "learning_rate": 2e-05, + "loss": 0.05608717, + "step": 22405 + }, + { + "epoch": 44.812, + "grad_norm": 1.224327802658081, + "learning_rate": 2e-05, + "loss": 0.04210414, + "step": 22406 + }, + { + "epoch": 44.814, + "grad_norm": 1.342980146408081, + "learning_rate": 2e-05, + "loss": 0.06127755, + "step": 22407 + }, + { + "epoch": 44.816, + "grad_norm": 1.5353951454162598, + "learning_rate": 2e-05, + "loss": 0.06302527, + "step": 22408 + }, + { + "epoch": 44.818, + "grad_norm": 1.3694101572036743, + "learning_rate": 2e-05, + "loss": 0.04164361, + "step": 22409 + }, + { + "epoch": 44.82, + "grad_norm": 1.6121137142181396, + "learning_rate": 2e-05, + "loss": 0.05497941, + "step": 22410 + }, + { + "epoch": 44.822, + "grad_norm": 1.375234842300415, + "learning_rate": 2e-05, + "loss": 0.05874093, + "step": 22411 + }, + { + "epoch": 44.824, + "grad_norm": 1.243156909942627, + "learning_rate": 2e-05, + "loss": 0.05009591, + "step": 22412 + }, + { + "epoch": 44.826, + "grad_norm": 1.365031123161316, + "learning_rate": 2e-05, + "loss": 0.04909578, + "step": 22413 + }, + { + "epoch": 44.828, + "grad_norm": 0.9832285046577454, + "learning_rate": 2e-05, + "loss": 0.0287962, + "step": 22414 + }, + { + "epoch": 44.83, + "grad_norm": 1.187656044960022, + "learning_rate": 2e-05, + "loss": 0.04923886, + "step": 22415 + }, + { + "epoch": 44.832, + "grad_norm": 1.2411973476409912, + "learning_rate": 2e-05, + "loss": 0.05921061, + "step": 22416 + }, + { + "epoch": 44.834, + "grad_norm": 1.7300316095352173, + "learning_rate": 2e-05, + "loss": 0.0390442, + "step": 22417 + }, + { + "epoch": 44.836, + "grad_norm": 1.0223551988601685, + "learning_rate": 2e-05, + "loss": 0.03893992, + "step": 22418 + }, + { + "epoch": 44.838, + "grad_norm": 1.9662129878997803, + "learning_rate": 2e-05, + "loss": 0.03231378, + "step": 22419 + }, + { + "epoch": 44.84, + "grad_norm": 1.0947517156600952, + "learning_rate": 2e-05, + "loss": 0.03922854, + "step": 22420 + }, + { + "epoch": 44.842, + "grad_norm": 1.6704392433166504, + "learning_rate": 2e-05, + "loss": 0.04813308, + "step": 22421 + }, + { + "epoch": 44.844, + "grad_norm": 1.2903733253479004, + "learning_rate": 2e-05, + "loss": 0.04857474, + "step": 22422 + }, + { + "epoch": 44.846, + "grad_norm": 1.159749984741211, + "learning_rate": 2e-05, + "loss": 0.04078946, + "step": 22423 + }, + { + "epoch": 44.848, + "grad_norm": 1.364219307899475, + "learning_rate": 2e-05, + "loss": 0.04483482, + "step": 22424 + }, + { + "epoch": 44.85, + "grad_norm": 1.2046773433685303, + "learning_rate": 2e-05, + "loss": 0.05179583, + "step": 22425 + }, + { + "epoch": 44.852, + "grad_norm": 1.419978380203247, + "learning_rate": 2e-05, + "loss": 0.0563489, + "step": 22426 + }, + { + "epoch": 44.854, + "grad_norm": 1.4151443243026733, + "learning_rate": 2e-05, + "loss": 0.05637079, + "step": 22427 + }, + { + "epoch": 44.856, + "grad_norm": 1.2609163522720337, + "learning_rate": 2e-05, + "loss": 0.05354507, + "step": 22428 + }, + { + "epoch": 44.858, + "grad_norm": 1.2031396627426147, + "learning_rate": 2e-05, + "loss": 0.04809187, + "step": 22429 + }, + { + "epoch": 44.86, + "grad_norm": 1.2825238704681396, + "learning_rate": 2e-05, + "loss": 0.05121071, + "step": 22430 + }, + { + "epoch": 44.862, + "grad_norm": 1.0715117454528809, + "learning_rate": 2e-05, + "loss": 0.03546477, + "step": 22431 + }, + { + "epoch": 44.864, + "grad_norm": 1.407547116279602, + "learning_rate": 2e-05, + "loss": 
0.05819602, + "step": 22432 + }, + { + "epoch": 44.866, + "grad_norm": 1.705998182296753, + "learning_rate": 2e-05, + "loss": 0.05690518, + "step": 22433 + }, + { + "epoch": 44.868, + "grad_norm": 1.3436830043792725, + "learning_rate": 2e-05, + "loss": 0.04754122, + "step": 22434 + }, + { + "epoch": 44.87, + "grad_norm": 1.3858736753463745, + "learning_rate": 2e-05, + "loss": 0.06503775, + "step": 22435 + }, + { + "epoch": 44.872, + "grad_norm": 1.100571632385254, + "learning_rate": 2e-05, + "loss": 0.04838911, + "step": 22436 + }, + { + "epoch": 44.874, + "grad_norm": 1.277199387550354, + "learning_rate": 2e-05, + "loss": 0.0342557, + "step": 22437 + }, + { + "epoch": 44.876, + "grad_norm": 1.04167640209198, + "learning_rate": 2e-05, + "loss": 0.03733697, + "step": 22438 + }, + { + "epoch": 44.878, + "grad_norm": 1.1401691436767578, + "learning_rate": 2e-05, + "loss": 0.04081557, + "step": 22439 + }, + { + "epoch": 44.88, + "grad_norm": 1.1583184003829956, + "learning_rate": 2e-05, + "loss": 0.04236028, + "step": 22440 + }, + { + "epoch": 44.882, + "grad_norm": 1.224388837814331, + "learning_rate": 2e-05, + "loss": 0.04657396, + "step": 22441 + }, + { + "epoch": 44.884, + "grad_norm": 1.3904510736465454, + "learning_rate": 2e-05, + "loss": 0.05583341, + "step": 22442 + }, + { + "epoch": 44.886, + "grad_norm": 1.3104639053344727, + "learning_rate": 2e-05, + "loss": 0.04422941, + "step": 22443 + }, + { + "epoch": 44.888, + "grad_norm": 1.4339663982391357, + "learning_rate": 2e-05, + "loss": 0.05825556, + "step": 22444 + }, + { + "epoch": 44.89, + "grad_norm": 2.071310520172119, + "learning_rate": 2e-05, + "loss": 0.04446717, + "step": 22445 + }, + { + "epoch": 44.892, + "grad_norm": 1.579842448234558, + "learning_rate": 2e-05, + "loss": 0.06342889, + "step": 22446 + }, + { + "epoch": 44.894, + "grad_norm": 1.1288223266601562, + "learning_rate": 2e-05, + "loss": 0.04180889, + "step": 22447 + }, + { + "epoch": 44.896, + "grad_norm": 0.9220855832099915, + "learning_rate": 2e-05, + "loss": 0.03075635, + "step": 22448 + }, + { + "epoch": 44.898, + "grad_norm": 1.1610661745071411, + "learning_rate": 2e-05, + "loss": 0.04427385, + "step": 22449 + }, + { + "epoch": 44.9, + "grad_norm": 1.0175820589065552, + "learning_rate": 2e-05, + "loss": 0.03336541, + "step": 22450 + }, + { + "epoch": 44.902, + "grad_norm": 1.2916738986968994, + "learning_rate": 2e-05, + "loss": 0.04845192, + "step": 22451 + }, + { + "epoch": 44.904, + "grad_norm": 1.0840328931808472, + "learning_rate": 2e-05, + "loss": 0.03961993, + "step": 22452 + }, + { + "epoch": 44.906, + "grad_norm": 1.168332815170288, + "learning_rate": 2e-05, + "loss": 0.04420353, + "step": 22453 + }, + { + "epoch": 44.908, + "grad_norm": 1.2330690622329712, + "learning_rate": 2e-05, + "loss": 0.046922, + "step": 22454 + }, + { + "epoch": 44.91, + "grad_norm": 1.777421236038208, + "learning_rate": 2e-05, + "loss": 0.06229702, + "step": 22455 + }, + { + "epoch": 44.912, + "grad_norm": 1.3001389503479004, + "learning_rate": 2e-05, + "loss": 0.05773286, + "step": 22456 + }, + { + "epoch": 44.914, + "grad_norm": 1.3984302282333374, + "learning_rate": 2e-05, + "loss": 0.06564485, + "step": 22457 + }, + { + "epoch": 44.916, + "grad_norm": 1.200603723526001, + "learning_rate": 2e-05, + "loss": 0.04599864, + "step": 22458 + }, + { + "epoch": 44.918, + "grad_norm": 1.0556275844573975, + "learning_rate": 2e-05, + "loss": 0.03204244, + "step": 22459 + }, + { + "epoch": 44.92, + "grad_norm": 1.3520619869232178, + "learning_rate": 2e-05, + "loss": 0.05546412, + 
"step": 22460 + }, + { + "epoch": 44.922, + "grad_norm": 1.3469730615615845, + "learning_rate": 2e-05, + "loss": 0.04959213, + "step": 22461 + }, + { + "epoch": 44.924, + "grad_norm": 1.6626015901565552, + "learning_rate": 2e-05, + "loss": 0.06908607, + "step": 22462 + }, + { + "epoch": 44.926, + "grad_norm": 1.079907774925232, + "learning_rate": 2e-05, + "loss": 0.03825868, + "step": 22463 + }, + { + "epoch": 44.928, + "grad_norm": 1.4360984563827515, + "learning_rate": 2e-05, + "loss": 0.06819391, + "step": 22464 + }, + { + "epoch": 44.93, + "grad_norm": 1.118546485900879, + "learning_rate": 2e-05, + "loss": 0.03266671, + "step": 22465 + }, + { + "epoch": 44.932, + "grad_norm": 1.7453476190567017, + "learning_rate": 2e-05, + "loss": 0.06611872, + "step": 22466 + }, + { + "epoch": 44.934, + "grad_norm": 1.403899073600769, + "learning_rate": 2e-05, + "loss": 0.05418711, + "step": 22467 + }, + { + "epoch": 44.936, + "grad_norm": 4.624436378479004, + "learning_rate": 2e-05, + "loss": 0.07199771, + "step": 22468 + }, + { + "epoch": 44.938, + "grad_norm": 1.1568007469177246, + "learning_rate": 2e-05, + "loss": 0.05059115, + "step": 22469 + }, + { + "epoch": 44.94, + "grad_norm": 1.137229561805725, + "learning_rate": 2e-05, + "loss": 0.03843639, + "step": 22470 + }, + { + "epoch": 44.942, + "grad_norm": 1.4798380136489868, + "learning_rate": 2e-05, + "loss": 0.04668789, + "step": 22471 + }, + { + "epoch": 44.944, + "grad_norm": 1.986149549484253, + "learning_rate": 2e-05, + "loss": 0.03478555, + "step": 22472 + }, + { + "epoch": 44.946, + "grad_norm": 1.0867761373519897, + "learning_rate": 2e-05, + "loss": 0.03376482, + "step": 22473 + }, + { + "epoch": 44.948, + "grad_norm": 1.3647146224975586, + "learning_rate": 2e-05, + "loss": 0.06791535, + "step": 22474 + }, + { + "epoch": 44.95, + "grad_norm": 1.3892532587051392, + "learning_rate": 2e-05, + "loss": 0.05850136, + "step": 22475 + }, + { + "epoch": 44.952, + "grad_norm": 1.2366359233856201, + "learning_rate": 2e-05, + "loss": 0.04139677, + "step": 22476 + }, + { + "epoch": 44.954, + "grad_norm": 2.012018918991089, + "learning_rate": 2e-05, + "loss": 0.05484272, + "step": 22477 + }, + { + "epoch": 44.956, + "grad_norm": 1.2772232294082642, + "learning_rate": 2e-05, + "loss": 0.04107056, + "step": 22478 + }, + { + "epoch": 44.958, + "grad_norm": 1.0646635293960571, + "learning_rate": 2e-05, + "loss": 0.04107633, + "step": 22479 + }, + { + "epoch": 44.96, + "grad_norm": 3.412080764770508, + "learning_rate": 2e-05, + "loss": 0.06652329, + "step": 22480 + }, + { + "epoch": 44.962, + "grad_norm": 1.39540696144104, + "learning_rate": 2e-05, + "loss": 0.04058058, + "step": 22481 + }, + { + "epoch": 44.964, + "grad_norm": 1.372216820716858, + "learning_rate": 2e-05, + "loss": 0.05258819, + "step": 22482 + }, + { + "epoch": 44.966, + "grad_norm": 1.1605424880981445, + "learning_rate": 2e-05, + "loss": 0.05386285, + "step": 22483 + }, + { + "epoch": 44.968, + "grad_norm": 1.0500538349151611, + "learning_rate": 2e-05, + "loss": 0.03223278, + "step": 22484 + }, + { + "epoch": 44.97, + "grad_norm": 1.171493411064148, + "learning_rate": 2e-05, + "loss": 0.0475014, + "step": 22485 + }, + { + "epoch": 44.972, + "grad_norm": 1.1889984607696533, + "learning_rate": 2e-05, + "loss": 0.03335759, + "step": 22486 + }, + { + "epoch": 44.974, + "grad_norm": 1.488581895828247, + "learning_rate": 2e-05, + "loss": 0.04842, + "step": 22487 + }, + { + "epoch": 44.976, + "grad_norm": 1.2926756143569946, + "learning_rate": 2e-05, + "loss": 0.04585892, + "step": 22488 + }, + 
{ + "epoch": 44.978, + "grad_norm": 1.0679482221603394, + "learning_rate": 2e-05, + "loss": 0.03685507, + "step": 22489 + }, + { + "epoch": 44.98, + "grad_norm": 1.2621574401855469, + "learning_rate": 2e-05, + "loss": 0.05050556, + "step": 22490 + }, + { + "epoch": 44.982, + "grad_norm": 1.104034185409546, + "learning_rate": 2e-05, + "loss": 0.03997359, + "step": 22491 + }, + { + "epoch": 44.984, + "grad_norm": 1.1348416805267334, + "learning_rate": 2e-05, + "loss": 0.03981521, + "step": 22492 + }, + { + "epoch": 44.986, + "grad_norm": 1.5478214025497437, + "learning_rate": 2e-05, + "loss": 0.06456232, + "step": 22493 + }, + { + "epoch": 44.988, + "grad_norm": 1.2740987539291382, + "learning_rate": 2e-05, + "loss": 0.04914555, + "step": 22494 + }, + { + "epoch": 44.99, + "grad_norm": 1.046753168106079, + "learning_rate": 2e-05, + "loss": 0.04628645, + "step": 22495 + }, + { + "epoch": 44.992, + "grad_norm": 2.204502820968628, + "learning_rate": 2e-05, + "loss": 0.04956776, + "step": 22496 + }, + { + "epoch": 44.994, + "grad_norm": 1.2961649894714355, + "learning_rate": 2e-05, + "loss": 0.04877903, + "step": 22497 + }, + { + "epoch": 44.996, + "grad_norm": 1.6191555261611938, + "learning_rate": 2e-05, + "loss": 0.05243221, + "step": 22498 + }, + { + "epoch": 44.998, + "grad_norm": 1.054588794708252, + "learning_rate": 2e-05, + "loss": 0.04243375, + "step": 22499 + }, + { + "epoch": 45.0, + "grad_norm": 1.2328218221664429, + "learning_rate": 2e-05, + "loss": 0.04469647, + "step": 22500 + }, + { + "epoch": 45.0, + "eval_performance": { + "AngleClassification_1": 0.998, + "AngleClassification_2": 1.0, + "AngleClassification_3": 0.9860279441117764, + "Equal_1": 1.0, + "Equal_2": 0.9860279441117764, + "Equal_3": 0.9960079840319361, + "LineComparison_1": 1.0, + "LineComparison_2": 1.0, + "LineComparison_3": 0.998003992015968, + "Parallel_1": 0.9959919839679359, + "Parallel_2": 0.9939879759519038, + "Parallel_3": 0.996, + "Perpendicular_1": 0.998, + "Perpendicular_2": 0.992, + "Perpendicular_3": 0.8877755511022044, + "PointLiesOnCircle_1": 0.9979959919839679, + "PointLiesOnCircle_2": 1.0, + "PointLiesOnCircle_3": 0.99, + "PointLiesOnLine_1": 1.0, + "PointLiesOnLine_2": 0.9939879759519038, + "PointLiesOnLine_3": 0.9820359281437125 + }, + "eval_runtime": 321.1082, + "eval_samples_per_second": 32.699, + "eval_steps_per_second": 0.654, + "step": 22500 + }, + { + "epoch": 45.002, + "grad_norm": 1.2806766033172607, + "learning_rate": 2e-05, + "loss": 0.03703031, + "step": 22501 + }, + { + "epoch": 45.004, + "grad_norm": 1.2303308248519897, + "learning_rate": 2e-05, + "loss": 0.0578556, + "step": 22502 + }, + { + "epoch": 45.006, + "grad_norm": 1.2770564556121826, + "learning_rate": 2e-05, + "loss": 0.0442674, + "step": 22503 + }, + { + "epoch": 45.008, + "grad_norm": 1.6194312572479248, + "learning_rate": 2e-05, + "loss": 0.06683954, + "step": 22504 + }, + { + "epoch": 45.01, + "grad_norm": 1.137593388557434, + "learning_rate": 2e-05, + "loss": 0.03966604, + "step": 22505 + }, + { + "epoch": 45.012, + "grad_norm": 3.7040164470672607, + "learning_rate": 2e-05, + "loss": 0.05335582, + "step": 22506 + }, + { + "epoch": 45.014, + "grad_norm": 1.019702672958374, + "learning_rate": 2e-05, + "loss": 0.039069, + "step": 22507 + }, + { + "epoch": 45.016, + "grad_norm": 1.11601984500885, + "learning_rate": 2e-05, + "loss": 0.03871304, + "step": 22508 + }, + { + "epoch": 45.018, + "grad_norm": 1.1694669723510742, + "learning_rate": 2e-05, + "loss": 0.04809815, + "step": 22509 + }, + { + "epoch": 45.02, + 
"grad_norm": 1.0752638578414917, + "learning_rate": 2e-05, + "loss": 0.03829714, + "step": 22510 + }, + { + "epoch": 45.022, + "grad_norm": 1.298243522644043, + "learning_rate": 2e-05, + "loss": 0.03895707, + "step": 22511 + }, + { + "epoch": 45.024, + "grad_norm": 0.9618425965309143, + "learning_rate": 2e-05, + "loss": 0.03141353, + "step": 22512 + }, + { + "epoch": 45.026, + "grad_norm": 1.0746543407440186, + "learning_rate": 2e-05, + "loss": 0.03593572, + "step": 22513 + }, + { + "epoch": 45.028, + "grad_norm": 1.0014342069625854, + "learning_rate": 2e-05, + "loss": 0.0362047, + "step": 22514 + }, + { + "epoch": 45.03, + "grad_norm": 1.230353593826294, + "learning_rate": 2e-05, + "loss": 0.04621077, + "step": 22515 + }, + { + "epoch": 45.032, + "grad_norm": 1.305424690246582, + "learning_rate": 2e-05, + "loss": 0.06496035, + "step": 22516 + }, + { + "epoch": 45.034, + "grad_norm": 0.9447730183601379, + "learning_rate": 2e-05, + "loss": 0.02817334, + "step": 22517 + }, + { + "epoch": 45.036, + "grad_norm": 1.625741958618164, + "learning_rate": 2e-05, + "loss": 0.06509583, + "step": 22518 + }, + { + "epoch": 45.038, + "grad_norm": 1.1020500659942627, + "learning_rate": 2e-05, + "loss": 0.04046689, + "step": 22519 + }, + { + "epoch": 45.04, + "grad_norm": 1.0736149549484253, + "learning_rate": 2e-05, + "loss": 0.04837882, + "step": 22520 + }, + { + "epoch": 45.042, + "grad_norm": 1.7894287109375, + "learning_rate": 2e-05, + "loss": 0.04507787, + "step": 22521 + }, + { + "epoch": 45.044, + "grad_norm": 0.9819255471229553, + "learning_rate": 2e-05, + "loss": 0.02709861, + "step": 22522 + }, + { + "epoch": 45.046, + "grad_norm": 1.2546124458312988, + "learning_rate": 2e-05, + "loss": 0.05152019, + "step": 22523 + }, + { + "epoch": 45.048, + "grad_norm": 1.2507061958312988, + "learning_rate": 2e-05, + "loss": 0.0508112, + "step": 22524 + }, + { + "epoch": 45.05, + "grad_norm": 1.0991570949554443, + "learning_rate": 2e-05, + "loss": 0.04952764, + "step": 22525 + }, + { + "epoch": 45.052, + "grad_norm": 0.895489513874054, + "learning_rate": 2e-05, + "loss": 0.02615194, + "step": 22526 + }, + { + "epoch": 45.054, + "grad_norm": 1.2041149139404297, + "learning_rate": 2e-05, + "loss": 0.05550298, + "step": 22527 + }, + { + "epoch": 45.056, + "grad_norm": 1.2021666765213013, + "learning_rate": 2e-05, + "loss": 0.0547464, + "step": 22528 + }, + { + "epoch": 45.058, + "grad_norm": 1.1511567831039429, + "learning_rate": 2e-05, + "loss": 0.04542843, + "step": 22529 + }, + { + "epoch": 45.06, + "grad_norm": 1.7193586826324463, + "learning_rate": 2e-05, + "loss": 0.06118374, + "step": 22530 + }, + { + "epoch": 45.062, + "grad_norm": 1.1019095182418823, + "learning_rate": 2e-05, + "loss": 0.04117382, + "step": 22531 + }, + { + "epoch": 45.064, + "grad_norm": 0.9580153822898865, + "learning_rate": 2e-05, + "loss": 0.04075727, + "step": 22532 + }, + { + "epoch": 45.066, + "grad_norm": 1.383394718170166, + "learning_rate": 2e-05, + "loss": 0.05107236, + "step": 22533 + }, + { + "epoch": 45.068, + "grad_norm": 1.2198097705841064, + "learning_rate": 2e-05, + "loss": 0.05111481, + "step": 22534 + }, + { + "epoch": 45.07, + "grad_norm": 1.1172356605529785, + "learning_rate": 2e-05, + "loss": 0.04869189, + "step": 22535 + }, + { + "epoch": 45.072, + "grad_norm": 1.1854000091552734, + "learning_rate": 2e-05, + "loss": 0.05265629, + "step": 22536 + }, + { + "epoch": 45.074, + "grad_norm": 1.2538152933120728, + "learning_rate": 2e-05, + "loss": 0.05185087, + "step": 22537 + }, + { + "epoch": 45.076, + "grad_norm": 
1.1875842809677124, + "learning_rate": 2e-05, + "loss": 0.05062048, + "step": 22538 + }, + { + "epoch": 45.078, + "grad_norm": 1.1964948177337646, + "learning_rate": 2e-05, + "loss": 0.04546596, + "step": 22539 + }, + { + "epoch": 45.08, + "grad_norm": 0.9874855279922485, + "learning_rate": 2e-05, + "loss": 0.03447803, + "step": 22540 + }, + { + "epoch": 45.082, + "grad_norm": 1.2589225769042969, + "learning_rate": 2e-05, + "loss": 0.052671, + "step": 22541 + }, + { + "epoch": 45.084, + "grad_norm": 1.2300832271575928, + "learning_rate": 2e-05, + "loss": 0.05471198, + "step": 22542 + }, + { + "epoch": 45.086, + "grad_norm": 1.014027714729309, + "learning_rate": 2e-05, + "loss": 0.03973835, + "step": 22543 + }, + { + "epoch": 45.088, + "grad_norm": 1.0800725221633911, + "learning_rate": 2e-05, + "loss": 0.0450437, + "step": 22544 + }, + { + "epoch": 45.09, + "grad_norm": 1.2067081928253174, + "learning_rate": 2e-05, + "loss": 0.04083369, + "step": 22545 + }, + { + "epoch": 45.092, + "grad_norm": 1.1328651905059814, + "learning_rate": 2e-05, + "loss": 0.05992271, + "step": 22546 + }, + { + "epoch": 45.094, + "grad_norm": 1.180773138999939, + "learning_rate": 2e-05, + "loss": 0.04769462, + "step": 22547 + }, + { + "epoch": 45.096, + "grad_norm": 1.1579021215438843, + "learning_rate": 2e-05, + "loss": 0.05516424, + "step": 22548 + }, + { + "epoch": 45.098, + "grad_norm": 1.7966943979263306, + "learning_rate": 2e-05, + "loss": 0.0489285, + "step": 22549 + }, + { + "epoch": 45.1, + "grad_norm": 1.2300606966018677, + "learning_rate": 2e-05, + "loss": 0.05087769, + "step": 22550 + }, + { + "epoch": 45.102, + "grad_norm": 1.3096156120300293, + "learning_rate": 2e-05, + "loss": 0.04192181, + "step": 22551 + }, + { + "epoch": 45.104, + "grad_norm": 1.259109377861023, + "learning_rate": 2e-05, + "loss": 0.05914876, + "step": 22552 + }, + { + "epoch": 45.106, + "grad_norm": 0.9652984142303467, + "learning_rate": 2e-05, + "loss": 0.03940653, + "step": 22553 + }, + { + "epoch": 45.108, + "grad_norm": 1.1099121570587158, + "learning_rate": 2e-05, + "loss": 0.05075653, + "step": 22554 + }, + { + "epoch": 45.11, + "grad_norm": 1.2321460247039795, + "learning_rate": 2e-05, + "loss": 0.05328397, + "step": 22555 + }, + { + "epoch": 45.112, + "grad_norm": 1.3149770498275757, + "learning_rate": 2e-05, + "loss": 0.0476843, + "step": 22556 + }, + { + "epoch": 45.114, + "grad_norm": 0.9816151261329651, + "learning_rate": 2e-05, + "loss": 0.02892705, + "step": 22557 + }, + { + "epoch": 45.116, + "grad_norm": 1.0387154817581177, + "learning_rate": 2e-05, + "loss": 0.029668, + "step": 22558 + }, + { + "epoch": 45.118, + "grad_norm": 1.4174152612686157, + "learning_rate": 2e-05, + "loss": 0.03911356, + "step": 22559 + }, + { + "epoch": 45.12, + "grad_norm": 1.172924518585205, + "learning_rate": 2e-05, + "loss": 0.04783778, + "step": 22560 + }, + { + "epoch": 45.122, + "grad_norm": 1.978395700454712, + "learning_rate": 2e-05, + "loss": 0.06169991, + "step": 22561 + }, + { + "epoch": 45.124, + "grad_norm": 1.215388298034668, + "learning_rate": 2e-05, + "loss": 0.05454685, + "step": 22562 + }, + { + "epoch": 45.126, + "grad_norm": 0.908128559589386, + "learning_rate": 2e-05, + "loss": 0.028432, + "step": 22563 + }, + { + "epoch": 45.128, + "grad_norm": 1.1321818828582764, + "learning_rate": 2e-05, + "loss": 0.0391438, + "step": 22564 + }, + { + "epoch": 45.13, + "grad_norm": 1.0603052377700806, + "learning_rate": 2e-05, + "loss": 0.04576109, + "step": 22565 + }, + { + "epoch": 45.132, + "grad_norm": 1.1814618110656738, + 
"learning_rate": 2e-05, + "loss": 0.04789328, + "step": 22566 + }, + { + "epoch": 45.134, + "grad_norm": 1.4475892782211304, + "learning_rate": 2e-05, + "loss": 0.03765525, + "step": 22567 + }, + { + "epoch": 45.136, + "grad_norm": 1.2016019821166992, + "learning_rate": 2e-05, + "loss": 0.0461702, + "step": 22568 + }, + { + "epoch": 45.138, + "grad_norm": 1.2433879375457764, + "learning_rate": 2e-05, + "loss": 0.04532842, + "step": 22569 + }, + { + "epoch": 45.14, + "grad_norm": 1.0823651552200317, + "learning_rate": 2e-05, + "loss": 0.03890754, + "step": 22570 + }, + { + "epoch": 45.142, + "grad_norm": 1.3277068138122559, + "learning_rate": 2e-05, + "loss": 0.05912031, + "step": 22571 + }, + { + "epoch": 45.144, + "grad_norm": 1.2994036674499512, + "learning_rate": 2e-05, + "loss": 0.06122985, + "step": 22572 + }, + { + "epoch": 45.146, + "grad_norm": 6.131012916564941, + "learning_rate": 2e-05, + "loss": 0.06587172, + "step": 22573 + }, + { + "epoch": 45.148, + "grad_norm": 1.9716764688491821, + "learning_rate": 2e-05, + "loss": 0.06575805, + "step": 22574 + }, + { + "epoch": 45.15, + "grad_norm": 1.6159476041793823, + "learning_rate": 2e-05, + "loss": 0.04265655, + "step": 22575 + }, + { + "epoch": 45.152, + "grad_norm": 1.4026684761047363, + "learning_rate": 2e-05, + "loss": 0.04744727, + "step": 22576 + }, + { + "epoch": 45.154, + "grad_norm": 1.106533169746399, + "learning_rate": 2e-05, + "loss": 0.04791871, + "step": 22577 + }, + { + "epoch": 45.156, + "grad_norm": 1.9970992803573608, + "learning_rate": 2e-05, + "loss": 0.05857123, + "step": 22578 + }, + { + "epoch": 45.158, + "grad_norm": 2.6338624954223633, + "learning_rate": 2e-05, + "loss": 0.06465561, + "step": 22579 + }, + { + "epoch": 45.16, + "grad_norm": 3.545212507247925, + "learning_rate": 2e-05, + "loss": 0.0638639, + "step": 22580 + }, + { + "epoch": 45.162, + "grad_norm": 1.2237744331359863, + "learning_rate": 2e-05, + "loss": 0.05294487, + "step": 22581 + }, + { + "epoch": 45.164, + "grad_norm": 0.872916042804718, + "learning_rate": 2e-05, + "loss": 0.02183536, + "step": 22582 + }, + { + "epoch": 45.166, + "grad_norm": 1.3746517896652222, + "learning_rate": 2e-05, + "loss": 0.05417369, + "step": 22583 + }, + { + "epoch": 45.168, + "grad_norm": 1.1517207622528076, + "learning_rate": 2e-05, + "loss": 0.03980051, + "step": 22584 + }, + { + "epoch": 45.17, + "grad_norm": 1.2823890447616577, + "learning_rate": 2e-05, + "loss": 0.05957523, + "step": 22585 + }, + { + "epoch": 45.172, + "grad_norm": 1.4123610258102417, + "learning_rate": 2e-05, + "loss": 0.05167025, + "step": 22586 + }, + { + "epoch": 45.174, + "grad_norm": 1.3554683923721313, + "learning_rate": 2e-05, + "loss": 0.0463458, + "step": 22587 + }, + { + "epoch": 45.176, + "grad_norm": 1.6008424758911133, + "learning_rate": 2e-05, + "loss": 0.05228919, + "step": 22588 + }, + { + "epoch": 45.178, + "grad_norm": 1.1501260995864868, + "learning_rate": 2e-05, + "loss": 0.04863862, + "step": 22589 + }, + { + "epoch": 45.18, + "grad_norm": 0.9484877586364746, + "learning_rate": 2e-05, + "loss": 0.03127411, + "step": 22590 + }, + { + "epoch": 45.182, + "grad_norm": 1.6070059537887573, + "learning_rate": 2e-05, + "loss": 0.05212674, + "step": 22591 + }, + { + "epoch": 45.184, + "grad_norm": 1.3073357343673706, + "learning_rate": 2e-05, + "loss": 0.05999123, + "step": 22592 + }, + { + "epoch": 45.186, + "grad_norm": 1.1207677125930786, + "learning_rate": 2e-05, + "loss": 0.05001438, + "step": 22593 + }, + { + "epoch": 45.188, + "grad_norm": 1.6346867084503174, + 
"learning_rate": 2e-05, + "loss": 0.04875707, + "step": 22594 + }, + { + "epoch": 45.19, + "grad_norm": 1.2604049444198608, + "learning_rate": 2e-05, + "loss": 0.0618588, + "step": 22595 + }, + { + "epoch": 45.192, + "grad_norm": 1.1987062692642212, + "learning_rate": 2e-05, + "loss": 0.05671919, + "step": 22596 + }, + { + "epoch": 45.194, + "grad_norm": 1.0913773775100708, + "learning_rate": 2e-05, + "loss": 0.05150122, + "step": 22597 + }, + { + "epoch": 45.196, + "grad_norm": 1.0405243635177612, + "learning_rate": 2e-05, + "loss": 0.03671101, + "step": 22598 + }, + { + "epoch": 45.198, + "grad_norm": 1.0459086894989014, + "learning_rate": 2e-05, + "loss": 0.04587247, + "step": 22599 + }, + { + "epoch": 45.2, + "grad_norm": 0.8678653836250305, + "learning_rate": 2e-05, + "loss": 0.03158401, + "step": 22600 + }, + { + "epoch": 45.202, + "grad_norm": 1.2741189002990723, + "learning_rate": 2e-05, + "loss": 0.05065588, + "step": 22601 + }, + { + "epoch": 45.204, + "grad_norm": 2.527108907699585, + "learning_rate": 2e-05, + "loss": 0.05647211, + "step": 22602 + }, + { + "epoch": 45.206, + "grad_norm": 1.1058838367462158, + "learning_rate": 2e-05, + "loss": 0.04199415, + "step": 22603 + }, + { + "epoch": 45.208, + "grad_norm": 1.1132142543792725, + "learning_rate": 2e-05, + "loss": 0.06637686, + "step": 22604 + }, + { + "epoch": 45.21, + "grad_norm": 1.6548913717269897, + "learning_rate": 2e-05, + "loss": 0.05278435, + "step": 22605 + }, + { + "epoch": 45.212, + "grad_norm": 1.0226762294769287, + "learning_rate": 2e-05, + "loss": 0.03409335, + "step": 22606 + }, + { + "epoch": 45.214, + "grad_norm": 1.163483738899231, + "learning_rate": 2e-05, + "loss": 0.0428452, + "step": 22607 + }, + { + "epoch": 45.216, + "grad_norm": 0.9998294115066528, + "learning_rate": 2e-05, + "loss": 0.03900335, + "step": 22608 + }, + { + "epoch": 45.218, + "grad_norm": 1.5138134956359863, + "learning_rate": 2e-05, + "loss": 0.05385974, + "step": 22609 + }, + { + "epoch": 45.22, + "grad_norm": 1.0882453918457031, + "learning_rate": 2e-05, + "loss": 0.04675626, + "step": 22610 + }, + { + "epoch": 45.222, + "grad_norm": 1.041988730430603, + "learning_rate": 2e-05, + "loss": 0.04659366, + "step": 22611 + }, + { + "epoch": 45.224, + "grad_norm": 1.9339371919631958, + "learning_rate": 2e-05, + "loss": 0.05996863, + "step": 22612 + }, + { + "epoch": 45.226, + "grad_norm": 1.7372865676879883, + "learning_rate": 2e-05, + "loss": 0.06137735, + "step": 22613 + }, + { + "epoch": 45.228, + "grad_norm": 1.2295641899108887, + "learning_rate": 2e-05, + "loss": 0.05497001, + "step": 22614 + }, + { + "epoch": 45.23, + "grad_norm": 1.282944679260254, + "learning_rate": 2e-05, + "loss": 0.04878019, + "step": 22615 + }, + { + "epoch": 45.232, + "grad_norm": 1.0056757926940918, + "learning_rate": 2e-05, + "loss": 0.03496859, + "step": 22616 + }, + { + "epoch": 45.234, + "grad_norm": 1.1216771602630615, + "learning_rate": 2e-05, + "loss": 0.04637222, + "step": 22617 + }, + { + "epoch": 45.236, + "grad_norm": 1.1101162433624268, + "learning_rate": 2e-05, + "loss": 0.05418177, + "step": 22618 + }, + { + "epoch": 45.238, + "grad_norm": 2.1140050888061523, + "learning_rate": 2e-05, + "loss": 0.05144568, + "step": 22619 + }, + { + "epoch": 45.24, + "grad_norm": 1.0529059171676636, + "learning_rate": 2e-05, + "loss": 0.04527819, + "step": 22620 + }, + { + "epoch": 45.242, + "grad_norm": 1.2318655252456665, + "learning_rate": 2e-05, + "loss": 0.07417886, + "step": 22621 + }, + { + "epoch": 45.244, + "grad_norm": 1.1625219583511353, + 
"learning_rate": 2e-05, + "loss": 0.04924101, + "step": 22622 + }, + { + "epoch": 45.246, + "grad_norm": 1.1424137353897095, + "learning_rate": 2e-05, + "loss": 0.0479248, + "step": 22623 + }, + { + "epoch": 45.248, + "grad_norm": 0.9770327806472778, + "learning_rate": 2e-05, + "loss": 0.04924466, + "step": 22624 + }, + { + "epoch": 45.25, + "grad_norm": 1.0025733709335327, + "learning_rate": 2e-05, + "loss": 0.04422404, + "step": 22625 + }, + { + "epoch": 45.252, + "grad_norm": 1.084976315498352, + "learning_rate": 2e-05, + "loss": 0.04491837, + "step": 22626 + }, + { + "epoch": 45.254, + "grad_norm": 1.000255823135376, + "learning_rate": 2e-05, + "loss": 0.04638529, + "step": 22627 + }, + { + "epoch": 45.256, + "grad_norm": 1.7140803337097168, + "learning_rate": 2e-05, + "loss": 0.0516253, + "step": 22628 + }, + { + "epoch": 45.258, + "grad_norm": 1.0502322912216187, + "learning_rate": 2e-05, + "loss": 0.03990895, + "step": 22629 + }, + { + "epoch": 45.26, + "grad_norm": 1.12234628200531, + "learning_rate": 2e-05, + "loss": 0.04589079, + "step": 22630 + }, + { + "epoch": 45.262, + "grad_norm": 1.0489299297332764, + "learning_rate": 2e-05, + "loss": 0.03381399, + "step": 22631 + }, + { + "epoch": 45.264, + "grad_norm": 1.171677589416504, + "learning_rate": 2e-05, + "loss": 0.05054912, + "step": 22632 + }, + { + "epoch": 45.266, + "grad_norm": 1.0880128145217896, + "learning_rate": 2e-05, + "loss": 0.03867623, + "step": 22633 + }, + { + "epoch": 45.268, + "grad_norm": 2.91977596282959, + "learning_rate": 2e-05, + "loss": 0.05142038, + "step": 22634 + }, + { + "epoch": 45.27, + "grad_norm": 1.1017811298370361, + "learning_rate": 2e-05, + "loss": 0.04421684, + "step": 22635 + }, + { + "epoch": 45.272, + "grad_norm": 1.2263803482055664, + "learning_rate": 2e-05, + "loss": 0.0688946, + "step": 22636 + }, + { + "epoch": 45.274, + "grad_norm": 1.1422224044799805, + "learning_rate": 2e-05, + "loss": 0.04059188, + "step": 22637 + }, + { + "epoch": 45.276, + "grad_norm": 1.0902363061904907, + "learning_rate": 2e-05, + "loss": 0.04885675, + "step": 22638 + }, + { + "epoch": 45.278, + "grad_norm": 1.2119808197021484, + "learning_rate": 2e-05, + "loss": 0.04337999, + "step": 22639 + }, + { + "epoch": 45.28, + "grad_norm": 1.135478138923645, + "learning_rate": 2e-05, + "loss": 0.04769107, + "step": 22640 + }, + { + "epoch": 45.282, + "grad_norm": 0.9367907643318176, + "learning_rate": 2e-05, + "loss": 0.02835327, + "step": 22641 + }, + { + "epoch": 45.284, + "grad_norm": 1.128351092338562, + "learning_rate": 2e-05, + "loss": 0.05129791, + "step": 22642 + }, + { + "epoch": 45.286, + "grad_norm": 1.2856783866882324, + "learning_rate": 2e-05, + "loss": 0.05416092, + "step": 22643 + }, + { + "epoch": 45.288, + "grad_norm": 1.115014672279358, + "learning_rate": 2e-05, + "loss": 0.03043066, + "step": 22644 + }, + { + "epoch": 45.29, + "grad_norm": 1.2709392309188843, + "learning_rate": 2e-05, + "loss": 0.04415888, + "step": 22645 + }, + { + "epoch": 45.292, + "grad_norm": 1.6946849822998047, + "learning_rate": 2e-05, + "loss": 0.0498039, + "step": 22646 + }, + { + "epoch": 45.294, + "grad_norm": 3.176492929458618, + "learning_rate": 2e-05, + "loss": 0.08013065, + "step": 22647 + }, + { + "epoch": 45.296, + "grad_norm": 1.2074462175369263, + "learning_rate": 2e-05, + "loss": 0.05039094, + "step": 22648 + }, + { + "epoch": 45.298, + "grad_norm": 1.2501564025878906, + "learning_rate": 2e-05, + "loss": 0.05088755, + "step": 22649 + }, + { + "epoch": 45.3, + "grad_norm": 0.9809415340423584, + "learning_rate": 
2e-05, + "loss": 0.038453, + "step": 22650 + }, + { + "epoch": 45.302, + "grad_norm": 1.3970571756362915, + "learning_rate": 2e-05, + "loss": 0.04629774, + "step": 22651 + }, + { + "epoch": 45.304, + "grad_norm": 1.4258559942245483, + "learning_rate": 2e-05, + "loss": 0.04505184, + "step": 22652 + }, + { + "epoch": 45.306, + "grad_norm": 1.1975897550582886, + "learning_rate": 2e-05, + "loss": 0.04722192, + "step": 22653 + }, + { + "epoch": 45.308, + "grad_norm": 1.2232824563980103, + "learning_rate": 2e-05, + "loss": 0.06708793, + "step": 22654 + }, + { + "epoch": 45.31, + "grad_norm": 0.874752938747406, + "learning_rate": 2e-05, + "loss": 0.03515249, + "step": 22655 + }, + { + "epoch": 45.312, + "grad_norm": 1.2584034204483032, + "learning_rate": 2e-05, + "loss": 0.04658163, + "step": 22656 + }, + { + "epoch": 45.314, + "grad_norm": 1.2609401941299438, + "learning_rate": 2e-05, + "loss": 0.05220157, + "step": 22657 + }, + { + "epoch": 45.316, + "grad_norm": 2.2510557174682617, + "learning_rate": 2e-05, + "loss": 0.04448275, + "step": 22658 + }, + { + "epoch": 45.318, + "grad_norm": 1.55043625831604, + "learning_rate": 2e-05, + "loss": 0.05576836, + "step": 22659 + }, + { + "epoch": 45.32, + "grad_norm": 1.1508514881134033, + "learning_rate": 2e-05, + "loss": 0.05758848, + "step": 22660 + }, + { + "epoch": 45.322, + "grad_norm": 1.7243223190307617, + "learning_rate": 2e-05, + "loss": 0.0692929, + "step": 22661 + }, + { + "epoch": 45.324, + "grad_norm": 1.220308780670166, + "learning_rate": 2e-05, + "loss": 0.04048155, + "step": 22662 + }, + { + "epoch": 45.326, + "grad_norm": 1.149865984916687, + "learning_rate": 2e-05, + "loss": 0.04047617, + "step": 22663 + }, + { + "epoch": 45.328, + "grad_norm": 1.3912262916564941, + "learning_rate": 2e-05, + "loss": 0.04581297, + "step": 22664 + }, + { + "epoch": 45.33, + "grad_norm": 1.1557172536849976, + "learning_rate": 2e-05, + "loss": 0.04854126, + "step": 22665 + }, + { + "epoch": 45.332, + "grad_norm": 1.5989816188812256, + "learning_rate": 2e-05, + "loss": 0.05772961, + "step": 22666 + }, + { + "epoch": 45.334, + "grad_norm": 0.9497352838516235, + "learning_rate": 2e-05, + "loss": 0.03526705, + "step": 22667 + }, + { + "epoch": 45.336, + "grad_norm": 1.0269160270690918, + "learning_rate": 2e-05, + "loss": 0.0354956, + "step": 22668 + }, + { + "epoch": 45.338, + "grad_norm": 0.9239309430122375, + "learning_rate": 2e-05, + "loss": 0.03152385, + "step": 22669 + }, + { + "epoch": 45.34, + "grad_norm": 1.0795482397079468, + "learning_rate": 2e-05, + "loss": 0.04576925, + "step": 22670 + }, + { + "epoch": 45.342, + "grad_norm": 1.2527785301208496, + "learning_rate": 2e-05, + "loss": 0.0519852, + "step": 22671 + }, + { + "epoch": 45.344, + "grad_norm": 1.2454344034194946, + "learning_rate": 2e-05, + "loss": 0.05516186, + "step": 22672 + }, + { + "epoch": 45.346, + "grad_norm": 1.122444748878479, + "learning_rate": 2e-05, + "loss": 0.05303994, + "step": 22673 + }, + { + "epoch": 45.348, + "grad_norm": 1.169447660446167, + "learning_rate": 2e-05, + "loss": 0.05172439, + "step": 22674 + }, + { + "epoch": 45.35, + "grad_norm": 1.2407636642456055, + "learning_rate": 2e-05, + "loss": 0.05318564, + "step": 22675 + }, + { + "epoch": 45.352, + "grad_norm": 1.0374010801315308, + "learning_rate": 2e-05, + "loss": 0.0399787, + "step": 22676 + }, + { + "epoch": 45.354, + "grad_norm": 1.6039947271347046, + "learning_rate": 2e-05, + "loss": 0.04658148, + "step": 22677 + }, + { + "epoch": 45.356, + "grad_norm": 2.6198055744171143, + "learning_rate": 2e-05, + "loss": 
0.04925442, + "step": 22678 + }, + { + "epoch": 45.358, + "grad_norm": 1.2547622919082642, + "learning_rate": 2e-05, + "loss": 0.05453364, + "step": 22679 + }, + { + "epoch": 45.36, + "grad_norm": 1.2854013442993164, + "learning_rate": 2e-05, + "loss": 0.05297067, + "step": 22680 + }, + { + "epoch": 45.362, + "grad_norm": 1.3244527578353882, + "learning_rate": 2e-05, + "loss": 0.0462161, + "step": 22681 + }, + { + "epoch": 45.364, + "grad_norm": 1.1322252750396729, + "learning_rate": 2e-05, + "loss": 0.04703336, + "step": 22682 + }, + { + "epoch": 45.366, + "grad_norm": 1.50275456905365, + "learning_rate": 2e-05, + "loss": 0.0487467, + "step": 22683 + }, + { + "epoch": 45.368, + "grad_norm": 1.0968610048294067, + "learning_rate": 2e-05, + "loss": 0.03857952, + "step": 22684 + }, + { + "epoch": 45.37, + "grad_norm": 1.4444911479949951, + "learning_rate": 2e-05, + "loss": 0.0504374, + "step": 22685 + }, + { + "epoch": 45.372, + "grad_norm": 1.5214262008666992, + "learning_rate": 2e-05, + "loss": 0.05310171, + "step": 22686 + }, + { + "epoch": 45.374, + "grad_norm": 1.0097070932388306, + "learning_rate": 2e-05, + "loss": 0.04260479, + "step": 22687 + }, + { + "epoch": 45.376, + "grad_norm": 0.984809160232544, + "learning_rate": 2e-05, + "loss": 0.03620268, + "step": 22688 + }, + { + "epoch": 45.378, + "grad_norm": 3.517108201980591, + "learning_rate": 2e-05, + "loss": 0.05183204, + "step": 22689 + }, + { + "epoch": 45.38, + "grad_norm": 1.4225305318832397, + "learning_rate": 2e-05, + "loss": 0.05968772, + "step": 22690 + }, + { + "epoch": 45.382, + "grad_norm": 1.4418306350708008, + "learning_rate": 2e-05, + "loss": 0.04406624, + "step": 22691 + }, + { + "epoch": 45.384, + "grad_norm": 1.0155731439590454, + "learning_rate": 2e-05, + "loss": 0.04117178, + "step": 22692 + }, + { + "epoch": 45.386, + "grad_norm": 1.1497960090637207, + "learning_rate": 2e-05, + "loss": 0.04193247, + "step": 22693 + }, + { + "epoch": 45.388, + "grad_norm": 1.2186371088027954, + "learning_rate": 2e-05, + "loss": 0.05743311, + "step": 22694 + }, + { + "epoch": 45.39, + "grad_norm": 1.2327183485031128, + "learning_rate": 2e-05, + "loss": 0.04604057, + "step": 22695 + }, + { + "epoch": 45.392, + "grad_norm": 1.4449176788330078, + "learning_rate": 2e-05, + "loss": 0.06641807, + "step": 22696 + }, + { + "epoch": 45.394, + "grad_norm": 1.2744368314743042, + "learning_rate": 2e-05, + "loss": 0.0488281, + "step": 22697 + }, + { + "epoch": 45.396, + "grad_norm": 1.1756504774093628, + "learning_rate": 2e-05, + "loss": 0.04841977, + "step": 22698 + }, + { + "epoch": 45.398, + "grad_norm": 1.5969294309616089, + "learning_rate": 2e-05, + "loss": 0.0381519, + "step": 22699 + }, + { + "epoch": 45.4, + "grad_norm": 1.177388310432434, + "learning_rate": 2e-05, + "loss": 0.05469193, + "step": 22700 + }, + { + "epoch": 45.402, + "grad_norm": 1.2466199398040771, + "learning_rate": 2e-05, + "loss": 0.04355872, + "step": 22701 + }, + { + "epoch": 45.404, + "grad_norm": 1.1346672773361206, + "learning_rate": 2e-05, + "loss": 0.0505007, + "step": 22702 + }, + { + "epoch": 45.406, + "grad_norm": 1.0802158117294312, + "learning_rate": 2e-05, + "loss": 0.03773396, + "step": 22703 + }, + { + "epoch": 45.408, + "grad_norm": 1.0188031196594238, + "learning_rate": 2e-05, + "loss": 0.03674341, + "step": 22704 + }, + { + "epoch": 45.41, + "grad_norm": 0.9986094236373901, + "learning_rate": 2e-05, + "loss": 0.04618426, + "step": 22705 + }, + { + "epoch": 45.412, + "grad_norm": 1.0472339391708374, + "learning_rate": 2e-05, + "loss": 0.04356561, + 
"step": 22706 + }, + { + "epoch": 45.414, + "grad_norm": 1.2004998922348022, + "learning_rate": 2e-05, + "loss": 0.05084341, + "step": 22707 + }, + { + "epoch": 45.416, + "grad_norm": 1.2121773958206177, + "learning_rate": 2e-05, + "loss": 0.04511089, + "step": 22708 + }, + { + "epoch": 45.418, + "grad_norm": 1.7242828607559204, + "learning_rate": 2e-05, + "loss": 0.0549062, + "step": 22709 + }, + { + "epoch": 45.42, + "grad_norm": 1.154462218284607, + "learning_rate": 2e-05, + "loss": 0.04893161, + "step": 22710 + }, + { + "epoch": 45.422, + "grad_norm": 1.1248419284820557, + "learning_rate": 2e-05, + "loss": 0.04025434, + "step": 22711 + }, + { + "epoch": 45.424, + "grad_norm": 1.2144428491592407, + "learning_rate": 2e-05, + "loss": 0.04915642, + "step": 22712 + }, + { + "epoch": 45.426, + "grad_norm": 1.1011919975280762, + "learning_rate": 2e-05, + "loss": 0.04912124, + "step": 22713 + }, + { + "epoch": 45.428, + "grad_norm": 1.134437084197998, + "learning_rate": 2e-05, + "loss": 0.0618168, + "step": 22714 + }, + { + "epoch": 45.43, + "grad_norm": 1.2620517015457153, + "learning_rate": 2e-05, + "loss": 0.06262559, + "step": 22715 + }, + { + "epoch": 45.432, + "grad_norm": 1.0983620882034302, + "learning_rate": 2e-05, + "loss": 0.04276703, + "step": 22716 + }, + { + "epoch": 45.434, + "grad_norm": 1.1140186786651611, + "learning_rate": 2e-05, + "loss": 0.05332418, + "step": 22717 + }, + { + "epoch": 45.436, + "grad_norm": 1.3023191690444946, + "learning_rate": 2e-05, + "loss": 0.06473353, + "step": 22718 + }, + { + "epoch": 45.438, + "grad_norm": 0.8834937810897827, + "learning_rate": 2e-05, + "loss": 0.04048298, + "step": 22719 + }, + { + "epoch": 45.44, + "grad_norm": 1.276069164276123, + "learning_rate": 2e-05, + "loss": 0.0446801, + "step": 22720 + }, + { + "epoch": 45.442, + "grad_norm": 1.2004058361053467, + "learning_rate": 2e-05, + "loss": 0.05174657, + "step": 22721 + }, + { + "epoch": 45.444, + "grad_norm": 1.3467042446136475, + "learning_rate": 2e-05, + "loss": 0.06729078, + "step": 22722 + }, + { + "epoch": 45.446, + "grad_norm": 1.200899600982666, + "learning_rate": 2e-05, + "loss": 0.04949638, + "step": 22723 + }, + { + "epoch": 45.448, + "grad_norm": 1.2946702241897583, + "learning_rate": 2e-05, + "loss": 0.05751118, + "step": 22724 + }, + { + "epoch": 45.45, + "grad_norm": 1.094891905784607, + "learning_rate": 2e-05, + "loss": 0.04523872, + "step": 22725 + }, + { + "epoch": 45.452, + "grad_norm": 1.049796462059021, + "learning_rate": 2e-05, + "loss": 0.05196452, + "step": 22726 + }, + { + "epoch": 45.454, + "grad_norm": 1.2776124477386475, + "learning_rate": 2e-05, + "loss": 0.04980614, + "step": 22727 + }, + { + "epoch": 45.456, + "grad_norm": 1.2622987031936646, + "learning_rate": 2e-05, + "loss": 0.05976017, + "step": 22728 + }, + { + "epoch": 45.458, + "grad_norm": 1.2927825450897217, + "learning_rate": 2e-05, + "loss": 0.05953868, + "step": 22729 + }, + { + "epoch": 45.46, + "grad_norm": 1.0884168148040771, + "learning_rate": 2e-05, + "loss": 0.03435563, + "step": 22730 + }, + { + "epoch": 45.462, + "grad_norm": 1.2159061431884766, + "learning_rate": 2e-05, + "loss": 0.05816945, + "step": 22731 + }, + { + "epoch": 45.464, + "grad_norm": 1.9903509616851807, + "learning_rate": 2e-05, + "loss": 0.05068534, + "step": 22732 + }, + { + "epoch": 45.466, + "grad_norm": 1.7449239492416382, + "learning_rate": 2e-05, + "loss": 0.05164981, + "step": 22733 + }, + { + "epoch": 45.468, + "grad_norm": 0.8933517336845398, + "learning_rate": 2e-05, + "loss": 0.03138543, + "step": 
22734 + }, + { + "epoch": 45.47, + "grad_norm": 1.1336041688919067, + "learning_rate": 2e-05, + "loss": 0.04539839, + "step": 22735 + }, + { + "epoch": 45.472, + "grad_norm": 0.8588364720344543, + "learning_rate": 2e-05, + "loss": 0.03250713, + "step": 22736 + }, + { + "epoch": 45.474, + "grad_norm": 0.9573074579238892, + "learning_rate": 2e-05, + "loss": 0.03433614, + "step": 22737 + }, + { + "epoch": 45.476, + "grad_norm": 3.1477766036987305, + "learning_rate": 2e-05, + "loss": 0.07124802, + "step": 22738 + }, + { + "epoch": 45.478, + "grad_norm": 1.1136095523834229, + "learning_rate": 2e-05, + "loss": 0.04518589, + "step": 22739 + }, + { + "epoch": 45.48, + "grad_norm": 1.7395950555801392, + "learning_rate": 2e-05, + "loss": 0.03977414, + "step": 22740 + }, + { + "epoch": 45.482, + "grad_norm": 1.2946288585662842, + "learning_rate": 2e-05, + "loss": 0.04495868, + "step": 22741 + }, + { + "epoch": 45.484, + "grad_norm": 1.1849554777145386, + "learning_rate": 2e-05, + "loss": 0.04799411, + "step": 22742 + }, + { + "epoch": 45.486, + "grad_norm": 1.13692307472229, + "learning_rate": 2e-05, + "loss": 0.04666179, + "step": 22743 + }, + { + "epoch": 45.488, + "grad_norm": 1.1994067430496216, + "learning_rate": 2e-05, + "loss": 0.04230376, + "step": 22744 + }, + { + "epoch": 45.49, + "grad_norm": 1.5030912160873413, + "learning_rate": 2e-05, + "loss": 0.05279317, + "step": 22745 + }, + { + "epoch": 45.492, + "grad_norm": 0.8699735999107361, + "learning_rate": 2e-05, + "loss": 0.02855832, + "step": 22746 + }, + { + "epoch": 45.494, + "grad_norm": 1.2766979932785034, + "learning_rate": 2e-05, + "loss": 0.04953785, + "step": 22747 + }, + { + "epoch": 45.496, + "grad_norm": 1.0743104219436646, + "learning_rate": 2e-05, + "loss": 0.0503507, + "step": 22748 + }, + { + "epoch": 45.498, + "grad_norm": 1.15504789352417, + "learning_rate": 2e-05, + "loss": 0.04353083, + "step": 22749 + }, + { + "epoch": 45.5, + "grad_norm": 1.5795592069625854, + "learning_rate": 2e-05, + "loss": 0.05220584, + "step": 22750 + }, + { + "epoch": 45.502, + "grad_norm": 1.1085896492004395, + "learning_rate": 2e-05, + "loss": 0.05066767, + "step": 22751 + }, + { + "epoch": 45.504, + "grad_norm": 1.2716381549835205, + "learning_rate": 2e-05, + "loss": 0.06609177, + "step": 22752 + }, + { + "epoch": 45.506, + "grad_norm": 1.9817065000534058, + "learning_rate": 2e-05, + "loss": 0.06183188, + "step": 22753 + }, + { + "epoch": 45.508, + "grad_norm": 1.3558555841445923, + "learning_rate": 2e-05, + "loss": 0.05611308, + "step": 22754 + }, + { + "epoch": 45.51, + "grad_norm": 1.2409502267837524, + "learning_rate": 2e-05, + "loss": 0.03493334, + "step": 22755 + }, + { + "epoch": 45.512, + "grad_norm": 1.2743219137191772, + "learning_rate": 2e-05, + "loss": 0.04769837, + "step": 22756 + }, + { + "epoch": 45.514, + "grad_norm": 0.9491370320320129, + "learning_rate": 2e-05, + "loss": 0.03029821, + "step": 22757 + }, + { + "epoch": 45.516, + "grad_norm": 4.352717876434326, + "learning_rate": 2e-05, + "loss": 0.03928834, + "step": 22758 + }, + { + "epoch": 45.518, + "grad_norm": 1.3825864791870117, + "learning_rate": 2e-05, + "loss": 0.05772555, + "step": 22759 + }, + { + "epoch": 45.52, + "grad_norm": 1.626591682434082, + "learning_rate": 2e-05, + "loss": 0.05705488, + "step": 22760 + }, + { + "epoch": 45.522, + "grad_norm": 1.2819281816482544, + "learning_rate": 2e-05, + "loss": 0.06068337, + "step": 22761 + }, + { + "epoch": 45.524, + "grad_norm": 2.8456923961639404, + "learning_rate": 2e-05, + "loss": 0.05577563, + "step": 22762 + }, + 
{ + "epoch": 45.526, + "grad_norm": 1.33793044090271, + "learning_rate": 2e-05, + "loss": 0.05010552, + "step": 22763 + }, + { + "epoch": 45.528, + "grad_norm": 1.059351921081543, + "learning_rate": 2e-05, + "loss": 0.04426561, + "step": 22764 + }, + { + "epoch": 45.53, + "grad_norm": 1.1769965887069702, + "learning_rate": 2e-05, + "loss": 0.03966656, + "step": 22765 + }, + { + "epoch": 45.532, + "grad_norm": 1.2695751190185547, + "learning_rate": 2e-05, + "loss": 0.04549221, + "step": 22766 + }, + { + "epoch": 45.534, + "grad_norm": 1.2788498401641846, + "learning_rate": 2e-05, + "loss": 0.05216346, + "step": 22767 + }, + { + "epoch": 45.536, + "grad_norm": 0.9913758039474487, + "learning_rate": 2e-05, + "loss": 0.03447246, + "step": 22768 + }, + { + "epoch": 45.538, + "grad_norm": 1.0714240074157715, + "learning_rate": 2e-05, + "loss": 0.04273647, + "step": 22769 + }, + { + "epoch": 45.54, + "grad_norm": 1.2439073324203491, + "learning_rate": 2e-05, + "loss": 0.04950628, + "step": 22770 + }, + { + "epoch": 45.542, + "grad_norm": 1.4593321084976196, + "learning_rate": 2e-05, + "loss": 0.03826152, + "step": 22771 + }, + { + "epoch": 45.544, + "grad_norm": 1.562803864479065, + "learning_rate": 2e-05, + "loss": 0.05784161, + "step": 22772 + }, + { + "epoch": 45.546, + "grad_norm": 0.9718835949897766, + "learning_rate": 2e-05, + "loss": 0.04087242, + "step": 22773 + }, + { + "epoch": 45.548, + "grad_norm": 1.1946215629577637, + "learning_rate": 2e-05, + "loss": 0.03955045, + "step": 22774 + }, + { + "epoch": 45.55, + "grad_norm": 5.945659160614014, + "learning_rate": 2e-05, + "loss": 0.05163043, + "step": 22775 + }, + { + "epoch": 45.552, + "grad_norm": 0.9503419995307922, + "learning_rate": 2e-05, + "loss": 0.03446332, + "step": 22776 + }, + { + "epoch": 45.554, + "grad_norm": 2.860668897628784, + "learning_rate": 2e-05, + "loss": 0.06032272, + "step": 22777 + }, + { + "epoch": 45.556, + "grad_norm": 1.0413198471069336, + "learning_rate": 2e-05, + "loss": 0.03605777, + "step": 22778 + }, + { + "epoch": 45.558, + "grad_norm": 1.3972159624099731, + "learning_rate": 2e-05, + "loss": 0.0614309, + "step": 22779 + }, + { + "epoch": 45.56, + "grad_norm": 1.1884814500808716, + "learning_rate": 2e-05, + "loss": 0.04031287, + "step": 22780 + }, + { + "epoch": 45.562, + "grad_norm": 1.1417438983917236, + "learning_rate": 2e-05, + "loss": 0.04732093, + "step": 22781 + }, + { + "epoch": 45.564, + "grad_norm": 1.4334638118743896, + "learning_rate": 2e-05, + "loss": 0.05984236, + "step": 22782 + }, + { + "epoch": 45.566, + "grad_norm": 2.67592453956604, + "learning_rate": 2e-05, + "loss": 0.07209148, + "step": 22783 + }, + { + "epoch": 45.568, + "grad_norm": 1.1358227729797363, + "learning_rate": 2e-05, + "loss": 0.04441652, + "step": 22784 + }, + { + "epoch": 45.57, + "grad_norm": 1.161139726638794, + "learning_rate": 2e-05, + "loss": 0.05294163, + "step": 22785 + }, + { + "epoch": 45.572, + "grad_norm": 4.65118932723999, + "learning_rate": 2e-05, + "loss": 0.03936018, + "step": 22786 + }, + { + "epoch": 45.574, + "grad_norm": 1.0236941576004028, + "learning_rate": 2e-05, + "loss": 0.05270987, + "step": 22787 + }, + { + "epoch": 45.576, + "grad_norm": 1.3920187950134277, + "learning_rate": 2e-05, + "loss": 0.05220519, + "step": 22788 + }, + { + "epoch": 45.578, + "grad_norm": 1.4506982564926147, + "learning_rate": 2e-05, + "loss": 0.05476096, + "step": 22789 + }, + { + "epoch": 45.58, + "grad_norm": 1.6124314069747925, + "learning_rate": 2e-05, + "loss": 0.04182716, + "step": 22790 + }, + { + "epoch": 
45.582, + "grad_norm": 1.353986144065857, + "learning_rate": 2e-05, + "loss": 0.04613897, + "step": 22791 + }, + { + "epoch": 45.584, + "grad_norm": 1.1616084575653076, + "learning_rate": 2e-05, + "loss": 0.06366609, + "step": 22792 + }, + { + "epoch": 45.586, + "grad_norm": 1.0128250122070312, + "learning_rate": 2e-05, + "loss": 0.03711364, + "step": 22793 + }, + { + "epoch": 45.588, + "grad_norm": 1.2062525749206543, + "learning_rate": 2e-05, + "loss": 0.04811207, + "step": 22794 + }, + { + "epoch": 45.59, + "grad_norm": 1.031899094581604, + "learning_rate": 2e-05, + "loss": 0.03934745, + "step": 22795 + }, + { + "epoch": 45.592, + "grad_norm": 1.0343753099441528, + "learning_rate": 2e-05, + "loss": 0.04823088, + "step": 22796 + }, + { + "epoch": 45.594, + "grad_norm": 3.7844700813293457, + "learning_rate": 2e-05, + "loss": 0.05894583, + "step": 22797 + }, + { + "epoch": 45.596, + "grad_norm": 1.1267212629318237, + "learning_rate": 2e-05, + "loss": 0.03568856, + "step": 22798 + }, + { + "epoch": 45.598, + "grad_norm": 1.3324168920516968, + "learning_rate": 2e-05, + "loss": 0.05654506, + "step": 22799 + }, + { + "epoch": 45.6, + "grad_norm": 1.0916633605957031, + "learning_rate": 2e-05, + "loss": 0.04748604, + "step": 22800 + }, + { + "epoch": 45.602, + "grad_norm": 1.7681325674057007, + "learning_rate": 2e-05, + "loss": 0.04749766, + "step": 22801 + }, + { + "epoch": 45.604, + "grad_norm": 1.385416030883789, + "learning_rate": 2e-05, + "loss": 0.05868257, + "step": 22802 + }, + { + "epoch": 45.606, + "grad_norm": 2.559743642807007, + "learning_rate": 2e-05, + "loss": 0.06657992, + "step": 22803 + }, + { + "epoch": 45.608, + "grad_norm": 1.5238560438156128, + "learning_rate": 2e-05, + "loss": 0.05722816, + "step": 22804 + }, + { + "epoch": 45.61, + "grad_norm": 1.215009331703186, + "learning_rate": 2e-05, + "loss": 0.05467505, + "step": 22805 + }, + { + "epoch": 45.612, + "grad_norm": 1.3188890218734741, + "learning_rate": 2e-05, + "loss": 0.07242645, + "step": 22806 + }, + { + "epoch": 45.614, + "grad_norm": 1.3299347162246704, + "learning_rate": 2e-05, + "loss": 0.04678376, + "step": 22807 + }, + { + "epoch": 45.616, + "grad_norm": 1.213125467300415, + "learning_rate": 2e-05, + "loss": 0.05465173, + "step": 22808 + }, + { + "epoch": 45.618, + "grad_norm": 0.9832747578620911, + "learning_rate": 2e-05, + "loss": 0.03593368, + "step": 22809 + }, + { + "epoch": 45.62, + "grad_norm": 1.4957003593444824, + "learning_rate": 2e-05, + "loss": 0.04334493, + "step": 22810 + }, + { + "epoch": 45.622, + "grad_norm": 1.1476776599884033, + "learning_rate": 2e-05, + "loss": 0.05104943, + "step": 22811 + }, + { + "epoch": 45.624, + "grad_norm": 1.1928304433822632, + "learning_rate": 2e-05, + "loss": 0.05153049, + "step": 22812 + }, + { + "epoch": 45.626, + "grad_norm": 3.3560004234313965, + "learning_rate": 2e-05, + "loss": 0.04769015, + "step": 22813 + }, + { + "epoch": 45.628, + "grad_norm": 1.2147530317306519, + "learning_rate": 2e-05, + "loss": 0.04993775, + "step": 22814 + }, + { + "epoch": 45.63, + "grad_norm": 1.3720028400421143, + "learning_rate": 2e-05, + "loss": 0.04633579, + "step": 22815 + }, + { + "epoch": 45.632, + "grad_norm": 1.1289563179016113, + "learning_rate": 2e-05, + "loss": 0.0430528, + "step": 22816 + }, + { + "epoch": 45.634, + "grad_norm": 1.3510793447494507, + "learning_rate": 2e-05, + "loss": 0.04878941, + "step": 22817 + }, + { + "epoch": 45.636, + "grad_norm": 1.4363319873809814, + "learning_rate": 2e-05, + "loss": 0.05465399, + "step": 22818 + }, + { + "epoch": 45.638, + 
"grad_norm": 2.460465669631958, + "learning_rate": 2e-05, + "loss": 0.05995677, + "step": 22819 + }, + { + "epoch": 45.64, + "grad_norm": 1.1262542009353638, + "learning_rate": 2e-05, + "loss": 0.03892493, + "step": 22820 + }, + { + "epoch": 45.642, + "grad_norm": 1.2960659265518188, + "learning_rate": 2e-05, + "loss": 0.0484963, + "step": 22821 + }, + { + "epoch": 45.644, + "grad_norm": 1.1812695264816284, + "learning_rate": 2e-05, + "loss": 0.05082157, + "step": 22822 + }, + { + "epoch": 45.646, + "grad_norm": 1.172400712966919, + "learning_rate": 2e-05, + "loss": 0.04413823, + "step": 22823 + }, + { + "epoch": 45.648, + "grad_norm": 1.1976510286331177, + "learning_rate": 2e-05, + "loss": 0.04225794, + "step": 22824 + }, + { + "epoch": 45.65, + "grad_norm": 1.4453816413879395, + "learning_rate": 2e-05, + "loss": 0.05087319, + "step": 22825 + }, + { + "epoch": 45.652, + "grad_norm": 0.993677020072937, + "learning_rate": 2e-05, + "loss": 0.02660798, + "step": 22826 + }, + { + "epoch": 45.654, + "grad_norm": 1.0433624982833862, + "learning_rate": 2e-05, + "loss": 0.04253418, + "step": 22827 + }, + { + "epoch": 45.656, + "grad_norm": 1.026785969734192, + "learning_rate": 2e-05, + "loss": 0.04078073, + "step": 22828 + }, + { + "epoch": 45.658, + "grad_norm": 1.125288963317871, + "learning_rate": 2e-05, + "loss": 0.04690692, + "step": 22829 + }, + { + "epoch": 45.66, + "grad_norm": 1.1184576749801636, + "learning_rate": 2e-05, + "loss": 0.02984325, + "step": 22830 + }, + { + "epoch": 45.662, + "grad_norm": 1.30687415599823, + "learning_rate": 2e-05, + "loss": 0.05822021, + "step": 22831 + }, + { + "epoch": 45.664, + "grad_norm": 1.2212743759155273, + "learning_rate": 2e-05, + "loss": 0.04939518, + "step": 22832 + }, + { + "epoch": 45.666, + "grad_norm": 1.186759352684021, + "learning_rate": 2e-05, + "loss": 0.05301164, + "step": 22833 + }, + { + "epoch": 45.668, + "grad_norm": 1.2905839681625366, + "learning_rate": 2e-05, + "loss": 0.04207082, + "step": 22834 + }, + { + "epoch": 45.67, + "grad_norm": 1.1900533437728882, + "learning_rate": 2e-05, + "loss": 0.05991127, + "step": 22835 + }, + { + "epoch": 45.672, + "grad_norm": 2.1371090412139893, + "learning_rate": 2e-05, + "loss": 0.05556452, + "step": 22836 + }, + { + "epoch": 45.674, + "grad_norm": 1.6555566787719727, + "learning_rate": 2e-05, + "loss": 0.0666324, + "step": 22837 + }, + { + "epoch": 45.676, + "grad_norm": 1.2142423391342163, + "learning_rate": 2e-05, + "loss": 0.04500743, + "step": 22838 + }, + { + "epoch": 45.678, + "grad_norm": 1.204254150390625, + "learning_rate": 2e-05, + "loss": 0.05000075, + "step": 22839 + }, + { + "epoch": 45.68, + "grad_norm": 1.3275655508041382, + "learning_rate": 2e-05, + "loss": 0.05476985, + "step": 22840 + }, + { + "epoch": 45.682, + "grad_norm": 1.0724529027938843, + "learning_rate": 2e-05, + "loss": 0.03965185, + "step": 22841 + }, + { + "epoch": 45.684, + "grad_norm": 1.363209843635559, + "learning_rate": 2e-05, + "loss": 0.0393138, + "step": 22842 + }, + { + "epoch": 45.686, + "grad_norm": 1.3756287097930908, + "learning_rate": 2e-05, + "loss": 0.0670438, + "step": 22843 + }, + { + "epoch": 45.688, + "grad_norm": 1.252241849899292, + "learning_rate": 2e-05, + "loss": 0.05263539, + "step": 22844 + }, + { + "epoch": 45.69, + "grad_norm": 1.1118369102478027, + "learning_rate": 2e-05, + "loss": 0.05109958, + "step": 22845 + }, + { + "epoch": 45.692, + "grad_norm": 1.2781487703323364, + "learning_rate": 2e-05, + "loss": 0.05006749, + "step": 22846 + }, + { + "epoch": 45.694, + "grad_norm": 
1.309794306755066, + "learning_rate": 2e-05, + "loss": 0.03993253, + "step": 22847 + }, + { + "epoch": 45.696, + "grad_norm": 1.3862566947937012, + "learning_rate": 2e-05, + "loss": 0.03329, + "step": 22848 + }, + { + "epoch": 45.698, + "grad_norm": 1.0639959573745728, + "learning_rate": 2e-05, + "loss": 0.03904609, + "step": 22849 + }, + { + "epoch": 45.7, + "grad_norm": 1.0626970529556274, + "learning_rate": 2e-05, + "loss": 0.03999314, + "step": 22850 + }, + { + "epoch": 45.702, + "grad_norm": 1.2711509466171265, + "learning_rate": 2e-05, + "loss": 0.0375726, + "step": 22851 + }, + { + "epoch": 45.704, + "grad_norm": 0.9790001511573792, + "learning_rate": 2e-05, + "loss": 0.03540372, + "step": 22852 + }, + { + "epoch": 45.706, + "grad_norm": 0.975598931312561, + "learning_rate": 2e-05, + "loss": 0.03365985, + "step": 22853 + }, + { + "epoch": 45.708, + "grad_norm": 1.0220794677734375, + "learning_rate": 2e-05, + "loss": 0.04530801, + "step": 22854 + }, + { + "epoch": 45.71, + "grad_norm": 1.2147080898284912, + "learning_rate": 2e-05, + "loss": 0.06349006, + "step": 22855 + }, + { + "epoch": 45.712, + "grad_norm": 1.1468337774276733, + "learning_rate": 2e-05, + "loss": 0.04973483, + "step": 22856 + }, + { + "epoch": 45.714, + "grad_norm": 1.4470347166061401, + "learning_rate": 2e-05, + "loss": 0.04426311, + "step": 22857 + }, + { + "epoch": 45.716, + "grad_norm": 1.1811226606369019, + "learning_rate": 2e-05, + "loss": 0.04253963, + "step": 22858 + }, + { + "epoch": 45.718, + "grad_norm": 1.0529688596725464, + "learning_rate": 2e-05, + "loss": 0.04702899, + "step": 22859 + }, + { + "epoch": 45.72, + "grad_norm": 1.1447519063949585, + "learning_rate": 2e-05, + "loss": 0.06699675, + "step": 22860 + }, + { + "epoch": 45.722, + "grad_norm": 1.0626235008239746, + "learning_rate": 2e-05, + "loss": 0.04488755, + "step": 22861 + }, + { + "epoch": 45.724, + "grad_norm": 1.3267689943313599, + "learning_rate": 2e-05, + "loss": 0.05437113, + "step": 22862 + }, + { + "epoch": 45.726, + "grad_norm": 1.0401638746261597, + "learning_rate": 2e-05, + "loss": 0.03539321, + "step": 22863 + }, + { + "epoch": 45.728, + "grad_norm": 1.124693512916565, + "learning_rate": 2e-05, + "loss": 0.03614509, + "step": 22864 + }, + { + "epoch": 45.73, + "grad_norm": 1.4313117265701294, + "learning_rate": 2e-05, + "loss": 0.04222115, + "step": 22865 + }, + { + "epoch": 45.732, + "grad_norm": 2.3040075302124023, + "learning_rate": 2e-05, + "loss": 0.03782343, + "step": 22866 + }, + { + "epoch": 45.734, + "grad_norm": 1.12531316280365, + "learning_rate": 2e-05, + "loss": 0.05212967, + "step": 22867 + }, + { + "epoch": 45.736, + "grad_norm": 1.1863642930984497, + "learning_rate": 2e-05, + "loss": 0.04659764, + "step": 22868 + }, + { + "epoch": 45.738, + "grad_norm": 3.5154294967651367, + "learning_rate": 2e-05, + "loss": 0.0452398, + "step": 22869 + }, + { + "epoch": 45.74, + "grad_norm": 1.2669914960861206, + "learning_rate": 2e-05, + "loss": 0.04121526, + "step": 22870 + }, + { + "epoch": 45.742, + "grad_norm": 1.5299526453018188, + "learning_rate": 2e-05, + "loss": 0.06856737, + "step": 22871 + }, + { + "epoch": 45.744, + "grad_norm": 1.943005919456482, + "learning_rate": 2e-05, + "loss": 0.03736391, + "step": 22872 + }, + { + "epoch": 45.746, + "grad_norm": 1.5813325643539429, + "learning_rate": 2e-05, + "loss": 0.05658036, + "step": 22873 + }, + { + "epoch": 45.748, + "grad_norm": 1.1869730949401855, + "learning_rate": 2e-05, + "loss": 0.04118659, + "step": 22874 + }, + { + "epoch": 45.75, + "grad_norm": 
1.1262238025665283, + "learning_rate": 2e-05, + "loss": 0.05163476, + "step": 22875 + }, + { + "epoch": 45.752, + "grad_norm": 1.8035818338394165, + "learning_rate": 2e-05, + "loss": 0.03897575, + "step": 22876 + }, + { + "epoch": 45.754, + "grad_norm": 1.2717819213867188, + "learning_rate": 2e-05, + "loss": 0.06012269, + "step": 22877 + }, + { + "epoch": 45.756, + "grad_norm": 1.081311583518982, + "learning_rate": 2e-05, + "loss": 0.05380928, + "step": 22878 + }, + { + "epoch": 45.758, + "grad_norm": 1.1609777212142944, + "learning_rate": 2e-05, + "loss": 0.04177097, + "step": 22879 + }, + { + "epoch": 45.76, + "grad_norm": 2.053823471069336, + "learning_rate": 2e-05, + "loss": 0.04388625, + "step": 22880 + }, + { + "epoch": 45.762, + "grad_norm": 1.1730915307998657, + "learning_rate": 2e-05, + "loss": 0.03498461, + "step": 22881 + }, + { + "epoch": 45.764, + "grad_norm": 1.1314507722854614, + "learning_rate": 2e-05, + "loss": 0.04693135, + "step": 22882 + }, + { + "epoch": 45.766, + "grad_norm": 1.214177131652832, + "learning_rate": 2e-05, + "loss": 0.05761575, + "step": 22883 + }, + { + "epoch": 45.768, + "grad_norm": 1.2264306545257568, + "learning_rate": 2e-05, + "loss": 0.05583285, + "step": 22884 + }, + { + "epoch": 45.77, + "grad_norm": 1.1142363548278809, + "learning_rate": 2e-05, + "loss": 0.03938077, + "step": 22885 + }, + { + "epoch": 45.772, + "grad_norm": 1.2358603477478027, + "learning_rate": 2e-05, + "loss": 0.05267806, + "step": 22886 + }, + { + "epoch": 45.774, + "grad_norm": 2.0392813682556152, + "learning_rate": 2e-05, + "loss": 0.05020905, + "step": 22887 + }, + { + "epoch": 45.776, + "grad_norm": 1.9601068496704102, + "learning_rate": 2e-05, + "loss": 0.06155182, + "step": 22888 + }, + { + "epoch": 45.778, + "grad_norm": 1.2073707580566406, + "learning_rate": 2e-05, + "loss": 0.04262544, + "step": 22889 + }, + { + "epoch": 45.78, + "grad_norm": 1.61870276927948, + "learning_rate": 2e-05, + "loss": 0.04001756, + "step": 22890 + }, + { + "epoch": 45.782, + "grad_norm": 1.62678062915802, + "learning_rate": 2e-05, + "loss": 0.04077552, + "step": 22891 + }, + { + "epoch": 45.784, + "grad_norm": 1.0999619960784912, + "learning_rate": 2e-05, + "loss": 0.0418282, + "step": 22892 + }, + { + "epoch": 45.786, + "grad_norm": 3.3379061222076416, + "learning_rate": 2e-05, + "loss": 0.06497736, + "step": 22893 + }, + { + "epoch": 45.788, + "grad_norm": 1.6847137212753296, + "learning_rate": 2e-05, + "loss": 0.05242064, + "step": 22894 + }, + { + "epoch": 45.79, + "grad_norm": 1.1532042026519775, + "learning_rate": 2e-05, + "loss": 0.0519854, + "step": 22895 + }, + { + "epoch": 45.792, + "grad_norm": 1.1070436239242554, + "learning_rate": 2e-05, + "loss": 0.04739845, + "step": 22896 + }, + { + "epoch": 45.794, + "grad_norm": 1.0771516561508179, + "learning_rate": 2e-05, + "loss": 0.04649124, + "step": 22897 + }, + { + "epoch": 45.796, + "grad_norm": 1.0672521591186523, + "learning_rate": 2e-05, + "loss": 0.04606741, + "step": 22898 + }, + { + "epoch": 45.798, + "grad_norm": 1.7498153448104858, + "learning_rate": 2e-05, + "loss": 0.0525641, + "step": 22899 + }, + { + "epoch": 45.8, + "grad_norm": 3.885802984237671, + "learning_rate": 2e-05, + "loss": 0.05591378, + "step": 22900 + }, + { + "epoch": 45.802, + "grad_norm": 0.9939262866973877, + "learning_rate": 2e-05, + "loss": 0.0356969, + "step": 22901 + }, + { + "epoch": 45.804, + "grad_norm": 1.0101487636566162, + "learning_rate": 2e-05, + "loss": 0.03555669, + "step": 22902 + }, + { + "epoch": 45.806, + "grad_norm": 
1.3214828968048096, + "learning_rate": 2e-05, + "loss": 0.04506984, + "step": 22903 + }, + { + "epoch": 45.808, + "grad_norm": 1.2031276226043701, + "learning_rate": 2e-05, + "loss": 0.04604453, + "step": 22904 + }, + { + "epoch": 45.81, + "grad_norm": 0.993742048740387, + "learning_rate": 2e-05, + "loss": 0.04115344, + "step": 22905 + }, + { + "epoch": 45.812, + "grad_norm": 1.3306998014450073, + "learning_rate": 2e-05, + "loss": 0.05285704, + "step": 22906 + }, + { + "epoch": 45.814, + "grad_norm": 1.5680806636810303, + "learning_rate": 2e-05, + "loss": 0.06785664, + "step": 22907 + }, + { + "epoch": 45.816, + "grad_norm": 1.360222339630127, + "learning_rate": 2e-05, + "loss": 0.05301679, + "step": 22908 + }, + { + "epoch": 45.818, + "grad_norm": 1.7606545686721802, + "learning_rate": 2e-05, + "loss": 0.05103129, + "step": 22909 + }, + { + "epoch": 45.82, + "grad_norm": 0.9953403472900391, + "learning_rate": 2e-05, + "loss": 0.03279923, + "step": 22910 + }, + { + "epoch": 45.822, + "grad_norm": 2.362177610397339, + "learning_rate": 2e-05, + "loss": 0.04696232, + "step": 22911 + }, + { + "epoch": 45.824, + "grad_norm": 1.3651307821273804, + "learning_rate": 2e-05, + "loss": 0.05074629, + "step": 22912 + }, + { + "epoch": 45.826, + "grad_norm": 1.4231796264648438, + "learning_rate": 2e-05, + "loss": 0.04402232, + "step": 22913 + }, + { + "epoch": 45.828, + "grad_norm": 1.3840693235397339, + "learning_rate": 2e-05, + "loss": 0.05228634, + "step": 22914 + }, + { + "epoch": 45.83, + "grad_norm": 1.2382279634475708, + "learning_rate": 2e-05, + "loss": 0.03767666, + "step": 22915 + }, + { + "epoch": 45.832, + "grad_norm": 1.1215736865997314, + "learning_rate": 2e-05, + "loss": 0.03750758, + "step": 22916 + }, + { + "epoch": 45.834, + "grad_norm": 1.5836503505706787, + "learning_rate": 2e-05, + "loss": 0.05021936, + "step": 22917 + }, + { + "epoch": 45.836, + "grad_norm": 1.175191879272461, + "learning_rate": 2e-05, + "loss": 0.05768834, + "step": 22918 + }, + { + "epoch": 45.838, + "grad_norm": 2.3862361907958984, + "learning_rate": 2e-05, + "loss": 0.06385323, + "step": 22919 + }, + { + "epoch": 45.84, + "grad_norm": 1.1213129758834839, + "learning_rate": 2e-05, + "loss": 0.0474508, + "step": 22920 + }, + { + "epoch": 45.842, + "grad_norm": 1.1670286655426025, + "learning_rate": 2e-05, + "loss": 0.05094395, + "step": 22921 + }, + { + "epoch": 45.844, + "grad_norm": 1.0269542932510376, + "learning_rate": 2e-05, + "loss": 0.03310216, + "step": 22922 + }, + { + "epoch": 45.846, + "grad_norm": 1.220439076423645, + "learning_rate": 2e-05, + "loss": 0.05242105, + "step": 22923 + }, + { + "epoch": 45.848, + "grad_norm": 1.1493821144104004, + "learning_rate": 2e-05, + "loss": 0.05762265, + "step": 22924 + }, + { + "epoch": 45.85, + "grad_norm": 1.2592790126800537, + "learning_rate": 2e-05, + "loss": 0.05583587, + "step": 22925 + }, + { + "epoch": 45.852, + "grad_norm": 1.1037945747375488, + "learning_rate": 2e-05, + "loss": 0.05163608, + "step": 22926 + }, + { + "epoch": 45.854, + "grad_norm": 1.039101243019104, + "learning_rate": 2e-05, + "loss": 0.04163966, + "step": 22927 + }, + { + "epoch": 45.856, + "grad_norm": 1.087335228919983, + "learning_rate": 2e-05, + "loss": 0.04357848, + "step": 22928 + }, + { + "epoch": 45.858, + "grad_norm": 1.1296930313110352, + "learning_rate": 2e-05, + "loss": 0.04551987, + "step": 22929 + }, + { + "epoch": 45.86, + "grad_norm": 1.1426345109939575, + "learning_rate": 2e-05, + "loss": 0.05805442, + "step": 22930 + }, + { + "epoch": 45.862, + "grad_norm": 
2.9095516204833984, + "learning_rate": 2e-05, + "loss": 0.06481321, + "step": 22931 + }, + { + "epoch": 45.864, + "grad_norm": 1.4706426858901978, + "learning_rate": 2e-05, + "loss": 0.06707012, + "step": 22932 + }, + { + "epoch": 45.866, + "grad_norm": 1.0578680038452148, + "learning_rate": 2e-05, + "loss": 0.03972709, + "step": 22933 + }, + { + "epoch": 45.868, + "grad_norm": 1.6905008554458618, + "learning_rate": 2e-05, + "loss": 0.03994934, + "step": 22934 + }, + { + "epoch": 45.87, + "grad_norm": 1.2955090999603271, + "learning_rate": 2e-05, + "loss": 0.06763369, + "step": 22935 + }, + { + "epoch": 45.872, + "grad_norm": 1.0367980003356934, + "learning_rate": 2e-05, + "loss": 0.03746669, + "step": 22936 + }, + { + "epoch": 45.874, + "grad_norm": 1.5050883293151855, + "learning_rate": 2e-05, + "loss": 0.06505072, + "step": 22937 + }, + { + "epoch": 45.876, + "grad_norm": 1.0128837823867798, + "learning_rate": 2e-05, + "loss": 0.03502842, + "step": 22938 + }, + { + "epoch": 45.878, + "grad_norm": 3.16584849357605, + "learning_rate": 2e-05, + "loss": 0.05251692, + "step": 22939 + }, + { + "epoch": 45.88, + "grad_norm": 1.3227653503417969, + "learning_rate": 2e-05, + "loss": 0.02910842, + "step": 22940 + }, + { + "epoch": 45.882, + "grad_norm": 1.2259905338287354, + "learning_rate": 2e-05, + "loss": 0.06085608, + "step": 22941 + }, + { + "epoch": 45.884, + "grad_norm": 1.1765010356903076, + "learning_rate": 2e-05, + "loss": 0.04740379, + "step": 22942 + }, + { + "epoch": 45.886, + "grad_norm": 1.1642879247665405, + "learning_rate": 2e-05, + "loss": 0.05026664, + "step": 22943 + }, + { + "epoch": 45.888, + "grad_norm": 1.3020238876342773, + "learning_rate": 2e-05, + "loss": 0.06652694, + "step": 22944 + }, + { + "epoch": 45.89, + "grad_norm": 1.2435083389282227, + "learning_rate": 2e-05, + "loss": 0.05145294, + "step": 22945 + }, + { + "epoch": 45.892, + "grad_norm": 1.1023855209350586, + "learning_rate": 2e-05, + "loss": 0.03766239, + "step": 22946 + }, + { + "epoch": 45.894, + "grad_norm": 0.9923415780067444, + "learning_rate": 2e-05, + "loss": 0.03449097, + "step": 22947 + }, + { + "epoch": 45.896, + "grad_norm": 1.2809637784957886, + "learning_rate": 2e-05, + "loss": 0.0631137, + "step": 22948 + }, + { + "epoch": 45.898, + "grad_norm": 1.0583648681640625, + "learning_rate": 2e-05, + "loss": 0.04882384, + "step": 22949 + }, + { + "epoch": 45.9, + "grad_norm": 1.2955833673477173, + "learning_rate": 2e-05, + "loss": 0.0419897, + "step": 22950 + }, + { + "epoch": 45.902, + "grad_norm": 1.0099056959152222, + "learning_rate": 2e-05, + "loss": 0.03409104, + "step": 22951 + }, + { + "epoch": 45.904, + "grad_norm": 1.4930360317230225, + "learning_rate": 2e-05, + "loss": 0.06223853, + "step": 22952 + }, + { + "epoch": 45.906, + "grad_norm": 1.2988282442092896, + "learning_rate": 2e-05, + "loss": 0.04434647, + "step": 22953 + }, + { + "epoch": 45.908, + "grad_norm": 1.2352509498596191, + "learning_rate": 2e-05, + "loss": 0.04742352, + "step": 22954 + }, + { + "epoch": 45.91, + "grad_norm": 1.0645052194595337, + "learning_rate": 2e-05, + "loss": 0.03650692, + "step": 22955 + }, + { + "epoch": 45.912, + "grad_norm": 1.607302188873291, + "learning_rate": 2e-05, + "loss": 0.03560692, + "step": 22956 + }, + { + "epoch": 45.914, + "grad_norm": 0.9931079149246216, + "learning_rate": 2e-05, + "loss": 0.03896011, + "step": 22957 + }, + { + "epoch": 45.916, + "grad_norm": 1.081117033958435, + "learning_rate": 2e-05, + "loss": 0.03914691, + "step": 22958 + }, + { + "epoch": 45.918, + "grad_norm": 
2.213714361190796, + "learning_rate": 2e-05, + "loss": 0.07680227, + "step": 22959 + }, + { + "epoch": 45.92, + "grad_norm": 1.328762173652649, + "learning_rate": 2e-05, + "loss": 0.04840521, + "step": 22960 + }, + { + "epoch": 45.922, + "grad_norm": 1.999614953994751, + "learning_rate": 2e-05, + "loss": 0.04826399, + "step": 22961 + }, + { + "epoch": 45.924, + "grad_norm": 1.5177665948867798, + "learning_rate": 2e-05, + "loss": 0.04357665, + "step": 22962 + }, + { + "epoch": 45.926, + "grad_norm": 1.822375774383545, + "learning_rate": 2e-05, + "loss": 0.05791084, + "step": 22963 + }, + { + "epoch": 45.928, + "grad_norm": 1.217348575592041, + "learning_rate": 2e-05, + "loss": 0.03876396, + "step": 22964 + }, + { + "epoch": 45.93, + "grad_norm": 1.5601190328598022, + "learning_rate": 2e-05, + "loss": 0.06204118, + "step": 22965 + }, + { + "epoch": 45.932, + "grad_norm": 2.090487480163574, + "learning_rate": 2e-05, + "loss": 0.0422308, + "step": 22966 + }, + { + "epoch": 45.934, + "grad_norm": 1.2206273078918457, + "learning_rate": 2e-05, + "loss": 0.04056178, + "step": 22967 + }, + { + "epoch": 45.936, + "grad_norm": 1.1585288047790527, + "learning_rate": 2e-05, + "loss": 0.03958793, + "step": 22968 + }, + { + "epoch": 45.938, + "grad_norm": 1.2946537733078003, + "learning_rate": 2e-05, + "loss": 0.04267664, + "step": 22969 + }, + { + "epoch": 45.94, + "grad_norm": 1.4737160205841064, + "learning_rate": 2e-05, + "loss": 0.07163599, + "step": 22970 + }, + { + "epoch": 45.942, + "grad_norm": 1.1721677780151367, + "learning_rate": 2e-05, + "loss": 0.04065248, + "step": 22971 + }, + { + "epoch": 45.944, + "grad_norm": 1.7377208471298218, + "learning_rate": 2e-05, + "loss": 0.05867632, + "step": 22972 + }, + { + "epoch": 45.946, + "grad_norm": 1.7599306106567383, + "learning_rate": 2e-05, + "loss": 0.04502322, + "step": 22973 + }, + { + "epoch": 45.948, + "grad_norm": 1.2876641750335693, + "learning_rate": 2e-05, + "loss": 0.0509927, + "step": 22974 + }, + { + "epoch": 45.95, + "grad_norm": 1.1776494979858398, + "learning_rate": 2e-05, + "loss": 0.05537301, + "step": 22975 + }, + { + "epoch": 45.952, + "grad_norm": 1.3018521070480347, + "learning_rate": 2e-05, + "loss": 0.0640357, + "step": 22976 + }, + { + "epoch": 45.954, + "grad_norm": 1.2216527462005615, + "learning_rate": 2e-05, + "loss": 0.0506551, + "step": 22977 + }, + { + "epoch": 45.956, + "grad_norm": 1.313719630241394, + "learning_rate": 2e-05, + "loss": 0.05280958, + "step": 22978 + }, + { + "epoch": 45.958, + "grad_norm": 1.1301958560943604, + "learning_rate": 2e-05, + "loss": 0.04503809, + "step": 22979 + }, + { + "epoch": 45.96, + "grad_norm": 1.7027695178985596, + "learning_rate": 2e-05, + "loss": 0.05137224, + "step": 22980 + }, + { + "epoch": 45.962, + "grad_norm": 2.217540740966797, + "learning_rate": 2e-05, + "loss": 0.05492135, + "step": 22981 + }, + { + "epoch": 45.964, + "grad_norm": 1.163046956062317, + "learning_rate": 2e-05, + "loss": 0.03381425, + "step": 22982 + }, + { + "epoch": 45.966, + "grad_norm": 1.2386566400527954, + "learning_rate": 2e-05, + "loss": 0.04658375, + "step": 22983 + }, + { + "epoch": 45.968, + "grad_norm": 1.490594506263733, + "learning_rate": 2e-05, + "loss": 0.06036313, + "step": 22984 + }, + { + "epoch": 45.97, + "grad_norm": 1.3342573642730713, + "learning_rate": 2e-05, + "loss": 0.04109773, + "step": 22985 + }, + { + "epoch": 45.972, + "grad_norm": 2.1110265254974365, + "learning_rate": 2e-05, + "loss": 0.04837161, + "step": 22986 + }, + { + "epoch": 45.974, + "grad_norm": 
1.1389340162277222, + "learning_rate": 2e-05, + "loss": 0.04660567, + "step": 22987 + }, + { + "epoch": 45.976, + "grad_norm": 1.324904441833496, + "learning_rate": 2e-05, + "loss": 0.05071661, + "step": 22988 + }, + { + "epoch": 45.978, + "grad_norm": 1.7974005937576294, + "learning_rate": 2e-05, + "loss": 0.05388502, + "step": 22989 + }, + { + "epoch": 45.98, + "grad_norm": 1.4797214269638062, + "learning_rate": 2e-05, + "loss": 0.04529134, + "step": 22990 + }, + { + "epoch": 45.982, + "grad_norm": 2.093899965286255, + "learning_rate": 2e-05, + "loss": 0.04731727, + "step": 22991 + }, + { + "epoch": 45.984, + "grad_norm": 1.1308255195617676, + "learning_rate": 2e-05, + "loss": 0.04801143, + "step": 22992 + }, + { + "epoch": 45.986, + "grad_norm": 1.4428658485412598, + "learning_rate": 2e-05, + "loss": 0.04077057, + "step": 22993 + }, + { + "epoch": 45.988, + "grad_norm": 1.423751950263977, + "learning_rate": 2e-05, + "loss": 0.05186245, + "step": 22994 + }, + { + "epoch": 45.99, + "grad_norm": 1.1405518054962158, + "learning_rate": 2e-05, + "loss": 0.04192315, + "step": 22995 + }, + { + "epoch": 45.992, + "grad_norm": 1.020820140838623, + "learning_rate": 2e-05, + "loss": 0.04345158, + "step": 22996 + }, + { + "epoch": 45.994, + "grad_norm": 1.4293239116668701, + "learning_rate": 2e-05, + "loss": 0.04355782, + "step": 22997 + }, + { + "epoch": 45.996, + "grad_norm": 2.0200562477111816, + "learning_rate": 2e-05, + "loss": 0.06576998, + "step": 22998 + }, + { + "epoch": 45.998, + "grad_norm": 1.1396820545196533, + "learning_rate": 2e-05, + "loss": 0.0453399, + "step": 22999 + }, + { + "epoch": 46.0, + "grad_norm": 1.615691900253296, + "learning_rate": 2e-05, + "loss": 0.04784571, + "step": 23000 + }, + { + "epoch": 46.0, + "eval_performance": { + "AngleClassification_1": 1.0, + "AngleClassification_2": 1.0, + "AngleClassification_3": 0.9840319361277445, + "Equal_1": 0.998, + "Equal_2": 0.9820359281437125, + "Equal_3": 0.9920159680638723, + "LineComparison_1": 1.0, + "LineComparison_2": 1.0, + "LineComparison_3": 1.0, + "Parallel_1": 0.9959919839679359, + "Parallel_2": 0.9939879759519038, + "Parallel_3": 0.994, + "Perpendicular_1": 0.998, + "Perpendicular_2": 0.998, + "Perpendicular_3": 0.905811623246493, + "PointLiesOnCircle_1": 0.9979959919839679, + "PointLiesOnCircle_2": 1.0, + "PointLiesOnCircle_3": 0.992, + "PointLiesOnLine_1": 0.9939879759519038, + "PointLiesOnLine_2": 0.9939879759519038, + "PointLiesOnLine_3": 0.9860279441117764 + }, + "eval_runtime": 324.2615, + "eval_samples_per_second": 32.381, + "eval_steps_per_second": 0.648, + "step": 23000 + }, + { + "epoch": 46.002, + "grad_norm": 1.4302409887313843, + "learning_rate": 2e-05, + "loss": 0.05861004, + "step": 23001 + }, + { + "epoch": 46.004, + "grad_norm": 1.39664626121521, + "learning_rate": 2e-05, + "loss": 0.05934343, + "step": 23002 + }, + { + "epoch": 46.006, + "grad_norm": 1.1906030178070068, + "learning_rate": 2e-05, + "loss": 0.03054802, + "step": 23003 + }, + { + "epoch": 46.008, + "grad_norm": 1.7340916395187378, + "learning_rate": 2e-05, + "loss": 0.05384779, + "step": 23004 + }, + { + "epoch": 46.01, + "grad_norm": 1.309027075767517, + "learning_rate": 2e-05, + "loss": 0.04369318, + "step": 23005 + }, + { + "epoch": 46.012, + "grad_norm": 1.002219796180725, + "learning_rate": 2e-05, + "loss": 0.03218366, + "step": 23006 + }, + { + "epoch": 46.014, + "grad_norm": 1.6894913911819458, + "learning_rate": 2e-05, + "loss": 0.05528177, + "step": 23007 + }, + { + "epoch": 46.016, + "grad_norm": 1.18147873878479, + 
"learning_rate": 2e-05, + "loss": 0.03384054, + "step": 23008 + }, + { + "epoch": 46.018, + "grad_norm": 1.331606149673462, + "learning_rate": 2e-05, + "loss": 0.05830061, + "step": 23009 + }, + { + "epoch": 46.02, + "grad_norm": 1.5706523656845093, + "learning_rate": 2e-05, + "loss": 0.05562139, + "step": 23010 + }, + { + "epoch": 46.022, + "grad_norm": 1.2435215711593628, + "learning_rate": 2e-05, + "loss": 0.05742179, + "step": 23011 + }, + { + "epoch": 46.024, + "grad_norm": 1.2738981246948242, + "learning_rate": 2e-05, + "loss": 0.04086342, + "step": 23012 + }, + { + "epoch": 46.026, + "grad_norm": 1.4569175243377686, + "learning_rate": 2e-05, + "loss": 0.04479681, + "step": 23013 + }, + { + "epoch": 46.028, + "grad_norm": 1.147181749343872, + "learning_rate": 2e-05, + "loss": 0.04108036, + "step": 23014 + }, + { + "epoch": 46.03, + "grad_norm": 1.1991140842437744, + "learning_rate": 2e-05, + "loss": 0.04342316, + "step": 23015 + }, + { + "epoch": 46.032, + "grad_norm": 1.3934859037399292, + "learning_rate": 2e-05, + "loss": 0.05477795, + "step": 23016 + }, + { + "epoch": 46.034, + "grad_norm": 1.2585614919662476, + "learning_rate": 2e-05, + "loss": 0.04793346, + "step": 23017 + }, + { + "epoch": 46.036, + "grad_norm": 1.1003919839859009, + "learning_rate": 2e-05, + "loss": 0.05579524, + "step": 23018 + }, + { + "epoch": 46.038, + "grad_norm": 1.2833575010299683, + "learning_rate": 2e-05, + "loss": 0.05675605, + "step": 23019 + }, + { + "epoch": 46.04, + "grad_norm": 3.453655242919922, + "learning_rate": 2e-05, + "loss": 0.05313053, + "step": 23020 + }, + { + "epoch": 46.042, + "grad_norm": 1.191473126411438, + "learning_rate": 2e-05, + "loss": 0.04780373, + "step": 23021 + }, + { + "epoch": 46.044, + "grad_norm": 1.29380202293396, + "learning_rate": 2e-05, + "loss": 0.04844226, + "step": 23022 + }, + { + "epoch": 46.046, + "grad_norm": 1.3847426176071167, + "learning_rate": 2e-05, + "loss": 0.05267187, + "step": 23023 + }, + { + "epoch": 46.048, + "grad_norm": 2.212348222732544, + "learning_rate": 2e-05, + "loss": 0.0534815, + "step": 23024 + }, + { + "epoch": 46.05, + "grad_norm": 1.3633710145950317, + "learning_rate": 2e-05, + "loss": 0.06192131, + "step": 23025 + }, + { + "epoch": 46.052, + "grad_norm": 1.212673544883728, + "learning_rate": 2e-05, + "loss": 0.03544668, + "step": 23026 + }, + { + "epoch": 46.054, + "grad_norm": 1.0772227048873901, + "learning_rate": 2e-05, + "loss": 0.03711303, + "step": 23027 + }, + { + "epoch": 46.056, + "grad_norm": 2.53438401222229, + "learning_rate": 2e-05, + "loss": 0.05098771, + "step": 23028 + }, + { + "epoch": 46.058, + "grad_norm": 1.814626932144165, + "learning_rate": 2e-05, + "loss": 0.04822916, + "step": 23029 + }, + { + "epoch": 46.06, + "grad_norm": 1.6201591491699219, + "learning_rate": 2e-05, + "loss": 0.0526498, + "step": 23030 + }, + { + "epoch": 46.062, + "grad_norm": 1.464896559715271, + "learning_rate": 2e-05, + "loss": 0.05728604, + "step": 23031 + }, + { + "epoch": 46.064, + "grad_norm": 1.0204657316207886, + "learning_rate": 2e-05, + "loss": 0.04259542, + "step": 23032 + }, + { + "epoch": 46.066, + "grad_norm": 1.1545565128326416, + "learning_rate": 2e-05, + "loss": 0.05285469, + "step": 23033 + }, + { + "epoch": 46.068, + "grad_norm": 1.0158534049987793, + "learning_rate": 2e-05, + "loss": 0.03832763, + "step": 23034 + }, + { + "epoch": 46.07, + "grad_norm": 1.1786905527114868, + "learning_rate": 2e-05, + "loss": 0.03449775, + "step": 23035 + }, + { + "epoch": 46.072, + "grad_norm": 1.2744739055633545, + "learning_rate": 
2e-05, + "loss": 0.04379936, + "step": 23036 + }, + { + "epoch": 46.074, + "grad_norm": 1.868301510810852, + "learning_rate": 2e-05, + "loss": 0.0644619, + "step": 23037 + }, + { + "epoch": 46.076, + "grad_norm": 1.1725568771362305, + "learning_rate": 2e-05, + "loss": 0.04825684, + "step": 23038 + }, + { + "epoch": 46.078, + "grad_norm": 1.6947202682495117, + "learning_rate": 2e-05, + "loss": 0.06334668, + "step": 23039 + }, + { + "epoch": 46.08, + "grad_norm": 1.4566727876663208, + "learning_rate": 2e-05, + "loss": 0.05566946, + "step": 23040 + }, + { + "epoch": 46.082, + "grad_norm": 1.2158526182174683, + "learning_rate": 2e-05, + "loss": 0.03828437, + "step": 23041 + }, + { + "epoch": 46.084, + "grad_norm": 1.1454522609710693, + "learning_rate": 2e-05, + "loss": 0.04942612, + "step": 23042 + }, + { + "epoch": 46.086, + "grad_norm": 1.1992695331573486, + "learning_rate": 2e-05, + "loss": 0.04454256, + "step": 23043 + }, + { + "epoch": 46.088, + "grad_norm": 1.242703914642334, + "learning_rate": 2e-05, + "loss": 0.04919573, + "step": 23044 + }, + { + "epoch": 46.09, + "grad_norm": 1.1189968585968018, + "learning_rate": 2e-05, + "loss": 0.04536124, + "step": 23045 + }, + { + "epoch": 46.092, + "grad_norm": 1.1488397121429443, + "learning_rate": 2e-05, + "loss": 0.05430396, + "step": 23046 + }, + { + "epoch": 46.094, + "grad_norm": 1.281959891319275, + "learning_rate": 2e-05, + "loss": 0.06264284, + "step": 23047 + }, + { + "epoch": 46.096, + "grad_norm": 1.4672093391418457, + "learning_rate": 2e-05, + "loss": 0.06311803, + "step": 23048 + }, + { + "epoch": 46.098, + "grad_norm": 1.124530553817749, + "learning_rate": 2e-05, + "loss": 0.03884341, + "step": 23049 + }, + { + "epoch": 46.1, + "grad_norm": 1.1475732326507568, + "learning_rate": 2e-05, + "loss": 0.05629781, + "step": 23050 + }, + { + "epoch": 46.102, + "grad_norm": 1.4845279455184937, + "learning_rate": 2e-05, + "loss": 0.05716734, + "step": 23051 + }, + { + "epoch": 46.104, + "grad_norm": 0.9782443642616272, + "learning_rate": 2e-05, + "loss": 0.0321027, + "step": 23052 + }, + { + "epoch": 46.106, + "grad_norm": 1.0824551582336426, + "learning_rate": 2e-05, + "loss": 0.04053795, + "step": 23053 + }, + { + "epoch": 46.108, + "grad_norm": 1.1406644582748413, + "learning_rate": 2e-05, + "loss": 0.03841585, + "step": 23054 + }, + { + "epoch": 46.11, + "grad_norm": 1.2297335863113403, + "learning_rate": 2e-05, + "loss": 0.05340029, + "step": 23055 + }, + { + "epoch": 46.112, + "grad_norm": 1.1259406805038452, + "learning_rate": 2e-05, + "loss": 0.05276688, + "step": 23056 + }, + { + "epoch": 46.114, + "grad_norm": 2.047508478164673, + "learning_rate": 2e-05, + "loss": 0.05007156, + "step": 23057 + }, + { + "epoch": 46.116, + "grad_norm": 1.338058352470398, + "learning_rate": 2e-05, + "loss": 0.04387624, + "step": 23058 + }, + { + "epoch": 46.118, + "grad_norm": 1.2544599771499634, + "learning_rate": 2e-05, + "loss": 0.04295979, + "step": 23059 + }, + { + "epoch": 46.12, + "grad_norm": 1.185205101966858, + "learning_rate": 2e-05, + "loss": 0.05217368, + "step": 23060 + }, + { + "epoch": 46.122, + "grad_norm": 1.1495543718338013, + "learning_rate": 2e-05, + "loss": 0.042289, + "step": 23061 + }, + { + "epoch": 46.124, + "grad_norm": 1.7538362741470337, + "learning_rate": 2e-05, + "loss": 0.0680445, + "step": 23062 + }, + { + "epoch": 46.126, + "grad_norm": 1.0621302127838135, + "learning_rate": 2e-05, + "loss": 0.04465366, + "step": 23063 + }, + { + "epoch": 46.128, + "grad_norm": 1.2388436794281006, + "learning_rate": 2e-05, + "loss": 
0.05579409, + "step": 23064 + }, + { + "epoch": 46.13, + "grad_norm": 1.4218683242797852, + "learning_rate": 2e-05, + "loss": 0.0443549, + "step": 23065 + }, + { + "epoch": 46.132, + "grad_norm": 1.1729727983474731, + "learning_rate": 2e-05, + "loss": 0.05611319, + "step": 23066 + }, + { + "epoch": 46.134, + "grad_norm": 1.1419835090637207, + "learning_rate": 2e-05, + "loss": 0.04782356, + "step": 23067 + }, + { + "epoch": 46.136, + "grad_norm": 1.0463011264801025, + "learning_rate": 2e-05, + "loss": 0.04206122, + "step": 23068 + }, + { + "epoch": 46.138, + "grad_norm": 1.1902068853378296, + "learning_rate": 2e-05, + "loss": 0.0524741, + "step": 23069 + }, + { + "epoch": 46.14, + "grad_norm": 1.0246272087097168, + "learning_rate": 2e-05, + "loss": 0.03112867, + "step": 23070 + }, + { + "epoch": 46.142, + "grad_norm": 1.05487859249115, + "learning_rate": 2e-05, + "loss": 0.04408978, + "step": 23071 + }, + { + "epoch": 46.144, + "grad_norm": 1.0214228630065918, + "learning_rate": 2e-05, + "loss": 0.03995924, + "step": 23072 + }, + { + "epoch": 46.146, + "grad_norm": 1.0368109941482544, + "learning_rate": 2e-05, + "loss": 0.04691656, + "step": 23073 + }, + { + "epoch": 46.148, + "grad_norm": 1.1551309823989868, + "learning_rate": 2e-05, + "loss": 0.04764878, + "step": 23074 + }, + { + "epoch": 46.15, + "grad_norm": 1.2423347234725952, + "learning_rate": 2e-05, + "loss": 0.05095094, + "step": 23075 + }, + { + "epoch": 46.152, + "grad_norm": 1.4114395380020142, + "learning_rate": 2e-05, + "loss": 0.05528051, + "step": 23076 + }, + { + "epoch": 46.154, + "grad_norm": 0.9848634600639343, + "learning_rate": 2e-05, + "loss": 0.04308025, + "step": 23077 + }, + { + "epoch": 46.156, + "grad_norm": 1.071808099746704, + "learning_rate": 2e-05, + "loss": 0.0475583, + "step": 23078 + }, + { + "epoch": 46.158, + "grad_norm": 1.093819499015808, + "learning_rate": 2e-05, + "loss": 0.03855433, + "step": 23079 + }, + { + "epoch": 46.16, + "grad_norm": 1.9816279411315918, + "learning_rate": 2e-05, + "loss": 0.04996374, + "step": 23080 + }, + { + "epoch": 46.162, + "grad_norm": 1.2489104270935059, + "learning_rate": 2e-05, + "loss": 0.05128529, + "step": 23081 + }, + { + "epoch": 46.164, + "grad_norm": 1.3830829858779907, + "learning_rate": 2e-05, + "loss": 0.05532495, + "step": 23082 + }, + { + "epoch": 46.166, + "grad_norm": 1.2296466827392578, + "learning_rate": 2e-05, + "loss": 0.04965352, + "step": 23083 + }, + { + "epoch": 46.168, + "grad_norm": 1.0486325025558472, + "learning_rate": 2e-05, + "loss": 0.03422453, + "step": 23084 + }, + { + "epoch": 46.17, + "grad_norm": 1.4689314365386963, + "learning_rate": 2e-05, + "loss": 0.05122601, + "step": 23085 + }, + { + "epoch": 46.172, + "grad_norm": 1.9588061571121216, + "learning_rate": 2e-05, + "loss": 0.04775591, + "step": 23086 + }, + { + "epoch": 46.174, + "grad_norm": 1.2059226036071777, + "learning_rate": 2e-05, + "loss": 0.04841192, + "step": 23087 + }, + { + "epoch": 46.176, + "grad_norm": 1.1085312366485596, + "learning_rate": 2e-05, + "loss": 0.04657616, + "step": 23088 + }, + { + "epoch": 46.178, + "grad_norm": 2.187915563583374, + "learning_rate": 2e-05, + "loss": 0.05317451, + "step": 23089 + }, + { + "epoch": 46.18, + "grad_norm": 1.0216542482376099, + "learning_rate": 2e-05, + "loss": 0.0392207, + "step": 23090 + }, + { + "epoch": 46.182, + "grad_norm": 1.3469269275665283, + "learning_rate": 2e-05, + "loss": 0.05570158, + "step": 23091 + }, + { + "epoch": 46.184, + "grad_norm": 1.4849958419799805, + "learning_rate": 2e-05, + "loss": 0.04750857, + 
"step": 23092 + }, + { + "epoch": 46.186, + "grad_norm": 1.8725115060806274, + "learning_rate": 2e-05, + "loss": 0.06257782, + "step": 23093 + }, + { + "epoch": 46.188, + "grad_norm": 1.5166374444961548, + "learning_rate": 2e-05, + "loss": 0.04668888, + "step": 23094 + }, + { + "epoch": 46.19, + "grad_norm": 1.2905293703079224, + "learning_rate": 2e-05, + "loss": 0.04647534, + "step": 23095 + }, + { + "epoch": 46.192, + "grad_norm": 1.208664894104004, + "learning_rate": 2e-05, + "loss": 0.05198023, + "step": 23096 + }, + { + "epoch": 46.194, + "grad_norm": 1.0935642719268799, + "learning_rate": 2e-05, + "loss": 0.05911352, + "step": 23097 + }, + { + "epoch": 46.196, + "grad_norm": 1.104860782623291, + "learning_rate": 2e-05, + "loss": 0.03793868, + "step": 23098 + }, + { + "epoch": 46.198, + "grad_norm": 1.06515371799469, + "learning_rate": 2e-05, + "loss": 0.04790756, + "step": 23099 + }, + { + "epoch": 46.2, + "grad_norm": 1.6914348602294922, + "learning_rate": 2e-05, + "loss": 0.04272515, + "step": 23100 + }, + { + "epoch": 46.202, + "grad_norm": 1.292098879814148, + "learning_rate": 2e-05, + "loss": 0.04760931, + "step": 23101 + }, + { + "epoch": 46.204, + "grad_norm": 1.197635531425476, + "learning_rate": 2e-05, + "loss": 0.05021203, + "step": 23102 + }, + { + "epoch": 46.206, + "grad_norm": 1.267522931098938, + "learning_rate": 2e-05, + "loss": 0.06113753, + "step": 23103 + }, + { + "epoch": 46.208, + "grad_norm": 0.9828411936759949, + "learning_rate": 2e-05, + "loss": 0.04029978, + "step": 23104 + }, + { + "epoch": 46.21, + "grad_norm": 1.4888331890106201, + "learning_rate": 2e-05, + "loss": 0.05187474, + "step": 23105 + }, + { + "epoch": 46.212, + "grad_norm": 1.055316686630249, + "learning_rate": 2e-05, + "loss": 0.04340184, + "step": 23106 + }, + { + "epoch": 46.214, + "grad_norm": 1.2184720039367676, + "learning_rate": 2e-05, + "loss": 0.0478823, + "step": 23107 + }, + { + "epoch": 46.216, + "grad_norm": 1.2196934223175049, + "learning_rate": 2e-05, + "loss": 0.05798684, + "step": 23108 + }, + { + "epoch": 46.218, + "grad_norm": 1.194522738456726, + "learning_rate": 2e-05, + "loss": 0.05176704, + "step": 23109 + }, + { + "epoch": 46.22, + "grad_norm": 1.0654971599578857, + "learning_rate": 2e-05, + "loss": 0.03646301, + "step": 23110 + }, + { + "epoch": 46.222, + "grad_norm": 1.3132420778274536, + "learning_rate": 2e-05, + "loss": 0.03887681, + "step": 23111 + }, + { + "epoch": 46.224, + "grad_norm": 1.870571255683899, + "learning_rate": 2e-05, + "loss": 0.0637666, + "step": 23112 + }, + { + "epoch": 46.226, + "grad_norm": 1.1391980648040771, + "learning_rate": 2e-05, + "loss": 0.04590259, + "step": 23113 + }, + { + "epoch": 46.228, + "grad_norm": 1.1913096904754639, + "learning_rate": 2e-05, + "loss": 0.03562557, + "step": 23114 + }, + { + "epoch": 46.23, + "grad_norm": 3.3259854316711426, + "learning_rate": 2e-05, + "loss": 0.05996282, + "step": 23115 + }, + { + "epoch": 46.232, + "grad_norm": 1.2023781538009644, + "learning_rate": 2e-05, + "loss": 0.05674819, + "step": 23116 + }, + { + "epoch": 46.234, + "grad_norm": 1.2463189363479614, + "learning_rate": 2e-05, + "loss": 0.05823443, + "step": 23117 + }, + { + "epoch": 46.236, + "grad_norm": 1.352786660194397, + "learning_rate": 2e-05, + "loss": 0.0599811, + "step": 23118 + }, + { + "epoch": 46.238, + "grad_norm": 1.1087812185287476, + "learning_rate": 2e-05, + "loss": 0.05076696, + "step": 23119 + }, + { + "epoch": 46.24, + "grad_norm": 1.2660101652145386, + "learning_rate": 2e-05, + "loss": 0.0315996, + "step": 23120 + }, + 
{ + "epoch": 46.242, + "grad_norm": 1.2009081840515137, + "learning_rate": 2e-05, + "loss": 0.04985572, + "step": 23121 + }, + { + "epoch": 46.244, + "grad_norm": 0.9989933371543884, + "learning_rate": 2e-05, + "loss": 0.04121695, + "step": 23122 + }, + { + "epoch": 46.246, + "grad_norm": 2.8735687732696533, + "learning_rate": 2e-05, + "loss": 0.07063636, + "step": 23123 + }, + { + "epoch": 46.248, + "grad_norm": 1.0923492908477783, + "learning_rate": 2e-05, + "loss": 0.04320874, + "step": 23124 + }, + { + "epoch": 46.25, + "grad_norm": 1.0142641067504883, + "learning_rate": 2e-05, + "loss": 0.03465679, + "step": 23125 + }, + { + "epoch": 46.252, + "grad_norm": 1.1704773902893066, + "learning_rate": 2e-05, + "loss": 0.04421233, + "step": 23126 + }, + { + "epoch": 46.254, + "grad_norm": 1.2731939554214478, + "learning_rate": 2e-05, + "loss": 0.07911802, + "step": 23127 + }, + { + "epoch": 46.256, + "grad_norm": 2.1010546684265137, + "learning_rate": 2e-05, + "loss": 0.04964682, + "step": 23128 + }, + { + "epoch": 46.258, + "grad_norm": 3.078134536743164, + "learning_rate": 2e-05, + "loss": 0.0332694, + "step": 23129 + }, + { + "epoch": 46.26, + "grad_norm": 1.6000622510910034, + "learning_rate": 2e-05, + "loss": 0.05290427, + "step": 23130 + }, + { + "epoch": 46.262, + "grad_norm": 2.217181921005249, + "learning_rate": 2e-05, + "loss": 0.04330307, + "step": 23131 + }, + { + "epoch": 46.264, + "grad_norm": 1.268926739692688, + "learning_rate": 2e-05, + "loss": 0.05655915, + "step": 23132 + }, + { + "epoch": 46.266, + "grad_norm": 5.429689407348633, + "learning_rate": 2e-05, + "loss": 0.05056974, + "step": 23133 + }, + { + "epoch": 46.268, + "grad_norm": 4.090033054351807, + "learning_rate": 2e-05, + "loss": 0.05439257, + "step": 23134 + }, + { + "epoch": 46.27, + "grad_norm": 4.116130352020264, + "learning_rate": 2e-05, + "loss": 0.06643784, + "step": 23135 + }, + { + "epoch": 46.272, + "grad_norm": 1.2675787210464478, + "learning_rate": 2e-05, + "loss": 0.05370174, + "step": 23136 + }, + { + "epoch": 46.274, + "grad_norm": 1.1625384092330933, + "learning_rate": 2e-05, + "loss": 0.04453802, + "step": 23137 + }, + { + "epoch": 46.276, + "grad_norm": 1.1202532052993774, + "learning_rate": 2e-05, + "loss": 0.04463059, + "step": 23138 + }, + { + "epoch": 46.278, + "grad_norm": 1.0193384885787964, + "learning_rate": 2e-05, + "loss": 0.03918165, + "step": 23139 + }, + { + "epoch": 46.28, + "grad_norm": 0.9413808584213257, + "learning_rate": 2e-05, + "loss": 0.02721583, + "step": 23140 + }, + { + "epoch": 46.282, + "grad_norm": 1.1726583242416382, + "learning_rate": 2e-05, + "loss": 0.03906541, + "step": 23141 + }, + { + "epoch": 46.284, + "grad_norm": 1.1463528871536255, + "learning_rate": 2e-05, + "loss": 0.04882437, + "step": 23142 + }, + { + "epoch": 46.286, + "grad_norm": 1.02104651927948, + "learning_rate": 2e-05, + "loss": 0.04882137, + "step": 23143 + }, + { + "epoch": 46.288, + "grad_norm": 2.9313247203826904, + "learning_rate": 2e-05, + "loss": 0.04990475, + "step": 23144 + }, + { + "epoch": 46.29, + "grad_norm": 1.2045042514801025, + "learning_rate": 2e-05, + "loss": 0.04847421, + "step": 23145 + }, + { + "epoch": 46.292, + "grad_norm": 1.4009835720062256, + "learning_rate": 2e-05, + "loss": 0.05024692, + "step": 23146 + }, + { + "epoch": 46.294, + "grad_norm": 1.1895580291748047, + "learning_rate": 2e-05, + "loss": 0.04267966, + "step": 23147 + }, + { + "epoch": 46.296, + "grad_norm": 1.1182502508163452, + "learning_rate": 2e-05, + "loss": 0.05323965, + "step": 23148 + }, + { + "epoch": 
46.298, + "grad_norm": 1.719334363937378, + "learning_rate": 2e-05, + "loss": 0.04970006, + "step": 23149 + }, + { + "epoch": 46.3, + "grad_norm": 2.0725491046905518, + "learning_rate": 2e-05, + "loss": 0.06950231, + "step": 23150 + }, + { + "epoch": 46.302, + "grad_norm": 3.3280606269836426, + "learning_rate": 2e-05, + "loss": 0.066084, + "step": 23151 + }, + { + "epoch": 46.304, + "grad_norm": 1.1809648275375366, + "learning_rate": 2e-05, + "loss": 0.04454755, + "step": 23152 + }, + { + "epoch": 46.306, + "grad_norm": 1.2297167778015137, + "learning_rate": 2e-05, + "loss": 0.06474146, + "step": 23153 + }, + { + "epoch": 46.308, + "grad_norm": 1.6546833515167236, + "learning_rate": 2e-05, + "loss": 0.04665916, + "step": 23154 + }, + { + "epoch": 46.31, + "grad_norm": 0.9898395538330078, + "learning_rate": 2e-05, + "loss": 0.03971004, + "step": 23155 + }, + { + "epoch": 46.312, + "grad_norm": 1.4820255041122437, + "learning_rate": 2e-05, + "loss": 0.04219128, + "step": 23156 + }, + { + "epoch": 46.314, + "grad_norm": 1.7158340215682983, + "learning_rate": 2e-05, + "loss": 0.0606416, + "step": 23157 + }, + { + "epoch": 46.316, + "grad_norm": 1.0219082832336426, + "learning_rate": 2e-05, + "loss": 0.04473948, + "step": 23158 + }, + { + "epoch": 46.318, + "grad_norm": 1.4255625009536743, + "learning_rate": 2e-05, + "loss": 0.04729429, + "step": 23159 + }, + { + "epoch": 46.32, + "grad_norm": 1.1542085409164429, + "learning_rate": 2e-05, + "loss": 0.04332086, + "step": 23160 + }, + { + "epoch": 46.322, + "grad_norm": 1.1608744859695435, + "learning_rate": 2e-05, + "loss": 0.05311103, + "step": 23161 + }, + { + "epoch": 46.324, + "grad_norm": 1.333052396774292, + "learning_rate": 2e-05, + "loss": 0.05798706, + "step": 23162 + }, + { + "epoch": 46.326, + "grad_norm": 1.1479754447937012, + "learning_rate": 2e-05, + "loss": 0.0463843, + "step": 23163 + }, + { + "epoch": 46.328, + "grad_norm": 1.8615238666534424, + "learning_rate": 2e-05, + "loss": 0.04511591, + "step": 23164 + }, + { + "epoch": 46.33, + "grad_norm": 1.2631683349609375, + "learning_rate": 2e-05, + "loss": 0.05475502, + "step": 23165 + }, + { + "epoch": 46.332, + "grad_norm": 1.4224528074264526, + "learning_rate": 2e-05, + "loss": 0.06586679, + "step": 23166 + }, + { + "epoch": 46.334, + "grad_norm": 1.1078559160232544, + "learning_rate": 2e-05, + "loss": 0.03966166, + "step": 23167 + }, + { + "epoch": 46.336, + "grad_norm": 0.9279854893684387, + "learning_rate": 2e-05, + "loss": 0.03631292, + "step": 23168 + }, + { + "epoch": 46.338, + "grad_norm": 2.5856709480285645, + "learning_rate": 2e-05, + "loss": 0.05210607, + "step": 23169 + }, + { + "epoch": 46.34, + "grad_norm": 1.2316886186599731, + "learning_rate": 2e-05, + "loss": 0.04556869, + "step": 23170 + }, + { + "epoch": 46.342, + "grad_norm": 1.1155706644058228, + "learning_rate": 2e-05, + "loss": 0.05066471, + "step": 23171 + }, + { + "epoch": 46.344, + "grad_norm": 2.491442918777466, + "learning_rate": 2e-05, + "loss": 0.05617189, + "step": 23172 + }, + { + "epoch": 46.346, + "grad_norm": 1.3126240968704224, + "learning_rate": 2e-05, + "loss": 0.04718146, + "step": 23173 + }, + { + "epoch": 46.348, + "grad_norm": 1.0606178045272827, + "learning_rate": 2e-05, + "loss": 0.04762572, + "step": 23174 + }, + { + "epoch": 46.35, + "grad_norm": 1.0557273626327515, + "learning_rate": 2e-05, + "loss": 0.03899936, + "step": 23175 + }, + { + "epoch": 46.352, + "grad_norm": 1.1816285848617554, + "learning_rate": 2e-05, + "loss": 0.04925738, + "step": 23176 + }, + { + "epoch": 46.354, + 
"grad_norm": 1.4543360471725464, + "learning_rate": 2e-05, + "loss": 0.04988147, + "step": 23177 + }, + { + "epoch": 46.356, + "grad_norm": 1.1029762029647827, + "learning_rate": 2e-05, + "loss": 0.04883325, + "step": 23178 + }, + { + "epoch": 46.358, + "grad_norm": 1.2085076570510864, + "learning_rate": 2e-05, + "loss": 0.0439341, + "step": 23179 + }, + { + "epoch": 46.36, + "grad_norm": 0.9505863785743713, + "learning_rate": 2e-05, + "loss": 0.03014771, + "step": 23180 + }, + { + "epoch": 46.362, + "grad_norm": 0.9543641805648804, + "learning_rate": 2e-05, + "loss": 0.03300781, + "step": 23181 + }, + { + "epoch": 46.364, + "grad_norm": 0.9900519251823425, + "learning_rate": 2e-05, + "loss": 0.04158117, + "step": 23182 + }, + { + "epoch": 46.366, + "grad_norm": 2.153928279876709, + "learning_rate": 2e-05, + "loss": 0.04417425, + "step": 23183 + }, + { + "epoch": 46.368, + "grad_norm": 2.409182071685791, + "learning_rate": 2e-05, + "loss": 0.06157666, + "step": 23184 + }, + { + "epoch": 46.37, + "grad_norm": 1.057466745376587, + "learning_rate": 2e-05, + "loss": 0.04592596, + "step": 23185 + }, + { + "epoch": 46.372, + "grad_norm": 1.1165255308151245, + "learning_rate": 2e-05, + "loss": 0.04613017, + "step": 23186 + }, + { + "epoch": 46.374, + "grad_norm": 0.9036556482315063, + "learning_rate": 2e-05, + "loss": 0.03239109, + "step": 23187 + }, + { + "epoch": 46.376, + "grad_norm": 1.1022392511367798, + "learning_rate": 2e-05, + "loss": 0.05009172, + "step": 23188 + }, + { + "epoch": 46.378, + "grad_norm": 1.0640677213668823, + "learning_rate": 2e-05, + "loss": 0.04262567, + "step": 23189 + }, + { + "epoch": 46.38, + "grad_norm": 1.11314857006073, + "learning_rate": 2e-05, + "loss": 0.04713321, + "step": 23190 + }, + { + "epoch": 46.382, + "grad_norm": 1.3541191816329956, + "learning_rate": 2e-05, + "loss": 0.04420114, + "step": 23191 + }, + { + "epoch": 46.384, + "grad_norm": 1.3517026901245117, + "learning_rate": 2e-05, + "loss": 0.03507008, + "step": 23192 + }, + { + "epoch": 46.386, + "grad_norm": 1.197635293006897, + "learning_rate": 2e-05, + "loss": 0.04363513, + "step": 23193 + }, + { + "epoch": 46.388, + "grad_norm": 1.3382245302200317, + "learning_rate": 2e-05, + "loss": 0.05595472, + "step": 23194 + }, + { + "epoch": 46.39, + "grad_norm": 1.074084758758545, + "learning_rate": 2e-05, + "loss": 0.05243614, + "step": 23195 + }, + { + "epoch": 46.392, + "grad_norm": 1.0921239852905273, + "learning_rate": 2e-05, + "loss": 0.03115225, + "step": 23196 + }, + { + "epoch": 46.394, + "grad_norm": 2.5413079261779785, + "learning_rate": 2e-05, + "loss": 0.04841086, + "step": 23197 + }, + { + "epoch": 46.396, + "grad_norm": 1.2901527881622314, + "learning_rate": 2e-05, + "loss": 0.05885056, + "step": 23198 + }, + { + "epoch": 46.398, + "grad_norm": 1.157203197479248, + "learning_rate": 2e-05, + "loss": 0.05642793, + "step": 23199 + }, + { + "epoch": 46.4, + "grad_norm": 1.1579469442367554, + "learning_rate": 2e-05, + "loss": 0.04676284, + "step": 23200 + }, + { + "epoch": 46.402, + "grad_norm": 1.3478295803070068, + "learning_rate": 2e-05, + "loss": 0.05287962, + "step": 23201 + }, + { + "epoch": 46.404, + "grad_norm": 0.9474977254867554, + "learning_rate": 2e-05, + "loss": 0.0353672, + "step": 23202 + }, + { + "epoch": 46.406, + "grad_norm": 1.401186466217041, + "learning_rate": 2e-05, + "loss": 0.05350681, + "step": 23203 + }, + { + "epoch": 46.408, + "grad_norm": 1.6736927032470703, + "learning_rate": 2e-05, + "loss": 0.05108627, + "step": 23204 + }, + { + "epoch": 46.41, + "grad_norm": 
1.5280506610870361, + "learning_rate": 2e-05, + "loss": 0.06310539, + "step": 23205 + }, + { + "epoch": 46.412, + "grad_norm": 1.2769758701324463, + "learning_rate": 2e-05, + "loss": 0.04531645, + "step": 23206 + }, + { + "epoch": 46.414, + "grad_norm": 1.0869319438934326, + "learning_rate": 2e-05, + "loss": 0.04933855, + "step": 23207 + }, + { + "epoch": 46.416, + "grad_norm": 1.8246020078659058, + "learning_rate": 2e-05, + "loss": 0.04469419, + "step": 23208 + }, + { + "epoch": 46.418, + "grad_norm": 1.1753547191619873, + "learning_rate": 2e-05, + "loss": 0.03421785, + "step": 23209 + }, + { + "epoch": 46.42, + "grad_norm": 1.1583861112594604, + "learning_rate": 2e-05, + "loss": 0.05171474, + "step": 23210 + }, + { + "epoch": 46.422, + "grad_norm": 1.1681585311889648, + "learning_rate": 2e-05, + "loss": 0.04288517, + "step": 23211 + }, + { + "epoch": 46.424, + "grad_norm": 1.8628696203231812, + "learning_rate": 2e-05, + "loss": 0.06078832, + "step": 23212 + }, + { + "epoch": 46.426, + "grad_norm": 2.069993495941162, + "learning_rate": 2e-05, + "loss": 0.06477488, + "step": 23213 + }, + { + "epoch": 46.428, + "grad_norm": 3.0566177368164062, + "learning_rate": 2e-05, + "loss": 0.05633872, + "step": 23214 + }, + { + "epoch": 46.43, + "grad_norm": 1.5551815032958984, + "learning_rate": 2e-05, + "loss": 0.06199846, + "step": 23215 + }, + { + "epoch": 46.432, + "grad_norm": 1.2828153371810913, + "learning_rate": 2e-05, + "loss": 0.04329269, + "step": 23216 + }, + { + "epoch": 46.434, + "grad_norm": 1.1735610961914062, + "learning_rate": 2e-05, + "loss": 0.05622312, + "step": 23217 + }, + { + "epoch": 46.436, + "grad_norm": 2.0017809867858887, + "learning_rate": 2e-05, + "loss": 0.03825869, + "step": 23218 + }, + { + "epoch": 46.438, + "grad_norm": 1.1812978982925415, + "learning_rate": 2e-05, + "loss": 0.05794305, + "step": 23219 + }, + { + "epoch": 46.44, + "grad_norm": 1.3604928255081177, + "learning_rate": 2e-05, + "loss": 0.05252583, + "step": 23220 + }, + { + "epoch": 46.442, + "grad_norm": 1.1032557487487793, + "learning_rate": 2e-05, + "loss": 0.03681751, + "step": 23221 + }, + { + "epoch": 46.444, + "grad_norm": 1.597011685371399, + "learning_rate": 2e-05, + "loss": 0.03853673, + "step": 23222 + }, + { + "epoch": 46.446, + "grad_norm": 1.193235158920288, + "learning_rate": 2e-05, + "loss": 0.05152307, + "step": 23223 + }, + { + "epoch": 46.448, + "grad_norm": 1.071548581123352, + "learning_rate": 2e-05, + "loss": 0.04468784, + "step": 23224 + }, + { + "epoch": 46.45, + "grad_norm": 1.2598882913589478, + "learning_rate": 2e-05, + "loss": 0.05761843, + "step": 23225 + }, + { + "epoch": 46.452, + "grad_norm": 1.1971979141235352, + "learning_rate": 2e-05, + "loss": 0.05141561, + "step": 23226 + }, + { + "epoch": 46.454, + "grad_norm": 2.1761207580566406, + "learning_rate": 2e-05, + "loss": 0.04906508, + "step": 23227 + }, + { + "epoch": 46.456, + "grad_norm": 1.4585120677947998, + "learning_rate": 2e-05, + "loss": 0.0423772, + "step": 23228 + }, + { + "epoch": 46.458, + "grad_norm": 1.1392666101455688, + "learning_rate": 2e-05, + "loss": 0.04400862, + "step": 23229 + }, + { + "epoch": 46.46, + "grad_norm": 1.4353058338165283, + "learning_rate": 2e-05, + "loss": 0.0567156, + "step": 23230 + }, + { + "epoch": 46.462, + "grad_norm": 1.12968909740448, + "learning_rate": 2e-05, + "loss": 0.03764338, + "step": 23231 + }, + { + "epoch": 46.464, + "grad_norm": 1.2651554346084595, + "learning_rate": 2e-05, + "loss": 0.06105966, + "step": 23232 + }, + { + "epoch": 46.466, + "grad_norm": 
1.1130878925323486, + "learning_rate": 2e-05, + "loss": 0.03365841, + "step": 23233 + }, + { + "epoch": 46.468, + "grad_norm": 1.5134097337722778, + "learning_rate": 2e-05, + "loss": 0.04425268, + "step": 23234 + }, + { + "epoch": 46.47, + "grad_norm": 1.2247387170791626, + "learning_rate": 2e-05, + "loss": 0.04427869, + "step": 23235 + }, + { + "epoch": 46.472, + "grad_norm": 1.1638450622558594, + "learning_rate": 2e-05, + "loss": 0.04198921, + "step": 23236 + }, + { + "epoch": 46.474, + "grad_norm": 1.113433599472046, + "learning_rate": 2e-05, + "loss": 0.03829671, + "step": 23237 + }, + { + "epoch": 46.476, + "grad_norm": 1.3648087978363037, + "learning_rate": 2e-05, + "loss": 0.03742781, + "step": 23238 + }, + { + "epoch": 46.478, + "grad_norm": 1.5586036443710327, + "learning_rate": 2e-05, + "loss": 0.04454248, + "step": 23239 + }, + { + "epoch": 46.48, + "grad_norm": 1.5988510847091675, + "learning_rate": 2e-05, + "loss": 0.04805886, + "step": 23240 + }, + { + "epoch": 46.482, + "grad_norm": 1.0325082540512085, + "learning_rate": 2e-05, + "loss": 0.03924419, + "step": 23241 + }, + { + "epoch": 46.484, + "grad_norm": 1.351863980293274, + "learning_rate": 2e-05, + "loss": 0.05655452, + "step": 23242 + }, + { + "epoch": 46.486, + "grad_norm": 1.4276981353759766, + "learning_rate": 2e-05, + "loss": 0.05009124, + "step": 23243 + }, + { + "epoch": 46.488, + "grad_norm": 1.474223017692566, + "learning_rate": 2e-05, + "loss": 0.04846952, + "step": 23244 + }, + { + "epoch": 46.49, + "grad_norm": 0.9883701205253601, + "learning_rate": 2e-05, + "loss": 0.03598514, + "step": 23245 + }, + { + "epoch": 46.492, + "grad_norm": 1.3215599060058594, + "learning_rate": 2e-05, + "loss": 0.05669083, + "step": 23246 + }, + { + "epoch": 46.494, + "grad_norm": 1.0561680793762207, + "learning_rate": 2e-05, + "loss": 0.04358659, + "step": 23247 + }, + { + "epoch": 46.496, + "grad_norm": 1.151599407196045, + "learning_rate": 2e-05, + "loss": 0.05015893, + "step": 23248 + }, + { + "epoch": 46.498, + "grad_norm": 1.4524661302566528, + "learning_rate": 2e-05, + "loss": 0.04662398, + "step": 23249 + }, + { + "epoch": 46.5, + "grad_norm": 1.5353271961212158, + "learning_rate": 2e-05, + "loss": 0.03597783, + "step": 23250 + }, + { + "epoch": 46.502, + "grad_norm": 2.727353811264038, + "learning_rate": 2e-05, + "loss": 0.03727568, + "step": 23251 + }, + { + "epoch": 46.504, + "grad_norm": 1.1945043802261353, + "learning_rate": 2e-05, + "loss": 0.03968314, + "step": 23252 + }, + { + "epoch": 46.506, + "grad_norm": 1.0994758605957031, + "learning_rate": 2e-05, + "loss": 0.05075628, + "step": 23253 + }, + { + "epoch": 46.508, + "grad_norm": 1.2463784217834473, + "learning_rate": 2e-05, + "loss": 0.05148967, + "step": 23254 + }, + { + "epoch": 46.51, + "grad_norm": 1.2693138122558594, + "learning_rate": 2e-05, + "loss": 0.04996495, + "step": 23255 + }, + { + "epoch": 46.512, + "grad_norm": 1.0111137628555298, + "learning_rate": 2e-05, + "loss": 0.04151868, + "step": 23256 + }, + { + "epoch": 46.514, + "grad_norm": 1.1659035682678223, + "learning_rate": 2e-05, + "loss": 0.03361614, + "step": 23257 + }, + { + "epoch": 46.516, + "grad_norm": 1.3636959791183472, + "learning_rate": 2e-05, + "loss": 0.054345, + "step": 23258 + }, + { + "epoch": 46.518, + "grad_norm": 1.228248953819275, + "learning_rate": 2e-05, + "loss": 0.06052267, + "step": 23259 + }, + { + "epoch": 46.52, + "grad_norm": 1.2913553714752197, + "learning_rate": 2e-05, + "loss": 0.05008456, + "step": 23260 + }, + { + "epoch": 46.522, + "grad_norm": 
1.2020117044448853, + "learning_rate": 2e-05, + "loss": 0.05029326, + "step": 23261 + }, + { + "epoch": 46.524, + "grad_norm": 1.0171692371368408, + "learning_rate": 2e-05, + "loss": 0.03987342, + "step": 23262 + }, + { + "epoch": 46.526, + "grad_norm": 1.2745869159698486, + "learning_rate": 2e-05, + "loss": 0.06460331, + "step": 23263 + }, + { + "epoch": 46.528, + "grad_norm": 1.0925174951553345, + "learning_rate": 2e-05, + "loss": 0.05129455, + "step": 23264 + }, + { + "epoch": 46.53, + "grad_norm": 1.1556943655014038, + "learning_rate": 2e-05, + "loss": 0.0476981, + "step": 23265 + }, + { + "epoch": 46.532, + "grad_norm": 1.4467393159866333, + "learning_rate": 2e-05, + "loss": 0.06611196, + "step": 23266 + }, + { + "epoch": 46.534, + "grad_norm": 1.8696622848510742, + "learning_rate": 2e-05, + "loss": 0.04994733, + "step": 23267 + }, + { + "epoch": 46.536, + "grad_norm": 1.0204404592514038, + "learning_rate": 2e-05, + "loss": 0.04660401, + "step": 23268 + }, + { + "epoch": 46.538, + "grad_norm": 1.2084547281265259, + "learning_rate": 2e-05, + "loss": 0.05732334, + "step": 23269 + }, + { + "epoch": 46.54, + "grad_norm": 1.333622932434082, + "learning_rate": 2e-05, + "loss": 0.05636217, + "step": 23270 + }, + { + "epoch": 46.542, + "grad_norm": 1.1122244596481323, + "learning_rate": 2e-05, + "loss": 0.04155293, + "step": 23271 + }, + { + "epoch": 46.544, + "grad_norm": 1.6370881795883179, + "learning_rate": 2e-05, + "loss": 0.05667079, + "step": 23272 + }, + { + "epoch": 46.546, + "grad_norm": 1.505200743675232, + "learning_rate": 2e-05, + "loss": 0.05249736, + "step": 23273 + }, + { + "epoch": 46.548, + "grad_norm": 1.155645728111267, + "learning_rate": 2e-05, + "loss": 0.05694194, + "step": 23274 + }, + { + "epoch": 46.55, + "grad_norm": 1.1647768020629883, + "learning_rate": 2e-05, + "loss": 0.05435292, + "step": 23275 + }, + { + "epoch": 46.552, + "grad_norm": 1.0532338619232178, + "learning_rate": 2e-05, + "loss": 0.04438793, + "step": 23276 + }, + { + "epoch": 46.554, + "grad_norm": 1.6303856372833252, + "learning_rate": 2e-05, + "loss": 0.05212019, + "step": 23277 + }, + { + "epoch": 46.556, + "grad_norm": 1.217422604560852, + "learning_rate": 2e-05, + "loss": 0.03759392, + "step": 23278 + }, + { + "epoch": 46.558, + "grad_norm": 1.3669236898422241, + "learning_rate": 2e-05, + "loss": 0.05771047, + "step": 23279 + }, + { + "epoch": 46.56, + "grad_norm": 2.1932220458984375, + "learning_rate": 2e-05, + "loss": 0.04165822, + "step": 23280 + }, + { + "epoch": 46.562, + "grad_norm": 1.2292122840881348, + "learning_rate": 2e-05, + "loss": 0.04785195, + "step": 23281 + }, + { + "epoch": 46.564, + "grad_norm": 1.196970820426941, + "learning_rate": 2e-05, + "loss": 0.03689167, + "step": 23282 + }, + { + "epoch": 46.566, + "grad_norm": 1.020137071609497, + "learning_rate": 2e-05, + "loss": 0.03734909, + "step": 23283 + }, + { + "epoch": 46.568, + "grad_norm": 2.541544198989868, + "learning_rate": 2e-05, + "loss": 0.06038415, + "step": 23284 + }, + { + "epoch": 46.57, + "grad_norm": 1.083556890487671, + "learning_rate": 2e-05, + "loss": 0.03873578, + "step": 23285 + }, + { + "epoch": 46.572, + "grad_norm": 1.4096356630325317, + "learning_rate": 2e-05, + "loss": 0.05036553, + "step": 23286 + }, + { + "epoch": 46.574, + "grad_norm": 0.9817960858345032, + "learning_rate": 2e-05, + "loss": 0.03779167, + "step": 23287 + }, + { + "epoch": 46.576, + "grad_norm": 1.2446906566619873, + "learning_rate": 2e-05, + "loss": 0.05619381, + "step": 23288 + }, + { + "epoch": 46.578, + "grad_norm": 
1.0761586427688599, + "learning_rate": 2e-05, + "loss": 0.04057213, + "step": 23289 + }, + { + "epoch": 46.58, + "grad_norm": 1.518162727355957, + "learning_rate": 2e-05, + "loss": 0.03429618, + "step": 23290 + }, + { + "epoch": 46.582, + "grad_norm": 1.1717662811279297, + "learning_rate": 2e-05, + "loss": 0.0466333, + "step": 23291 + }, + { + "epoch": 46.584, + "grad_norm": 3.0718834400177, + "learning_rate": 2e-05, + "loss": 0.04770675, + "step": 23292 + }, + { + "epoch": 46.586, + "grad_norm": 1.1569914817810059, + "learning_rate": 2e-05, + "loss": 0.05019266, + "step": 23293 + }, + { + "epoch": 46.588, + "grad_norm": 1.0586425065994263, + "learning_rate": 2e-05, + "loss": 0.03773466, + "step": 23294 + }, + { + "epoch": 46.59, + "grad_norm": 1.4067730903625488, + "learning_rate": 2e-05, + "loss": 0.06203558, + "step": 23295 + }, + { + "epoch": 46.592, + "grad_norm": 0.9604422450065613, + "learning_rate": 2e-05, + "loss": 0.03419396, + "step": 23296 + }, + { + "epoch": 46.594, + "grad_norm": 1.4648443460464478, + "learning_rate": 2e-05, + "loss": 0.06012858, + "step": 23297 + }, + { + "epoch": 46.596, + "grad_norm": 1.02535080909729, + "learning_rate": 2e-05, + "loss": 0.03938122, + "step": 23298 + }, + { + "epoch": 46.598, + "grad_norm": 1.0963574647903442, + "learning_rate": 2e-05, + "loss": 0.0514155, + "step": 23299 + }, + { + "epoch": 46.6, + "grad_norm": 1.111864447593689, + "learning_rate": 2e-05, + "loss": 0.03777619, + "step": 23300 + }, + { + "epoch": 46.602, + "grad_norm": 1.2542434930801392, + "learning_rate": 2e-05, + "loss": 0.03768213, + "step": 23301 + }, + { + "epoch": 46.604, + "grad_norm": 0.9642258286476135, + "learning_rate": 2e-05, + "loss": 0.040147, + "step": 23302 + }, + { + "epoch": 46.606, + "grad_norm": 1.2112300395965576, + "learning_rate": 2e-05, + "loss": 0.06443147, + "step": 23303 + }, + { + "epoch": 46.608, + "grad_norm": 0.9941690564155579, + "learning_rate": 2e-05, + "loss": 0.04848402, + "step": 23304 + }, + { + "epoch": 46.61, + "grad_norm": 1.9957858324050903, + "learning_rate": 2e-05, + "loss": 0.04321967, + "step": 23305 + }, + { + "epoch": 46.612, + "grad_norm": 0.9449677467346191, + "learning_rate": 2e-05, + "loss": 0.02798765, + "step": 23306 + }, + { + "epoch": 46.614, + "grad_norm": 1.068254828453064, + "learning_rate": 2e-05, + "loss": 0.04512742, + "step": 23307 + }, + { + "epoch": 46.616, + "grad_norm": 1.142982006072998, + "learning_rate": 2e-05, + "loss": 0.04882696, + "step": 23308 + }, + { + "epoch": 46.618, + "grad_norm": 1.2351460456848145, + "learning_rate": 2e-05, + "loss": 0.05512179, + "step": 23309 + }, + { + "epoch": 46.62, + "grad_norm": 2.279547691345215, + "learning_rate": 2e-05, + "loss": 0.03314951, + "step": 23310 + }, + { + "epoch": 46.622, + "grad_norm": 1.0746487379074097, + "learning_rate": 2e-05, + "loss": 0.04710182, + "step": 23311 + }, + { + "epoch": 46.624, + "grad_norm": 1.222518801689148, + "learning_rate": 2e-05, + "loss": 0.06075151, + "step": 23312 + }, + { + "epoch": 46.626, + "grad_norm": 0.9651801586151123, + "learning_rate": 2e-05, + "loss": 0.04442292, + "step": 23313 + }, + { + "epoch": 46.628, + "grad_norm": 0.9503448605537415, + "learning_rate": 2e-05, + "loss": 0.03715204, + "step": 23314 + }, + { + "epoch": 46.63, + "grad_norm": 1.1955111026763916, + "learning_rate": 2e-05, + "loss": 0.0486602, + "step": 23315 + }, + { + "epoch": 46.632, + "grad_norm": 1.020430088043213, + "learning_rate": 2e-05, + "loss": 0.0444079, + "step": 23316 + }, + { + "epoch": 46.634, + "grad_norm": 1.1504944562911987, + 
"learning_rate": 2e-05, + "loss": 0.0579015, + "step": 23317 + }, + { + "epoch": 46.636, + "grad_norm": 1.0248736143112183, + "learning_rate": 2e-05, + "loss": 0.03767063, + "step": 23318 + }, + { + "epoch": 46.638, + "grad_norm": 1.0812240839004517, + "learning_rate": 2e-05, + "loss": 0.03341661, + "step": 23319 + }, + { + "epoch": 46.64, + "grad_norm": 1.3688080310821533, + "learning_rate": 2e-05, + "loss": 0.04959516, + "step": 23320 + }, + { + "epoch": 46.642, + "grad_norm": 1.0399272441864014, + "learning_rate": 2e-05, + "loss": 0.04377398, + "step": 23321 + }, + { + "epoch": 46.644, + "grad_norm": 1.2335994243621826, + "learning_rate": 2e-05, + "loss": 0.06242413, + "step": 23322 + }, + { + "epoch": 46.646, + "grad_norm": 1.2382959127426147, + "learning_rate": 2e-05, + "loss": 0.04701665, + "step": 23323 + }, + { + "epoch": 46.648, + "grad_norm": 1.318410873413086, + "learning_rate": 2e-05, + "loss": 0.03696349, + "step": 23324 + }, + { + "epoch": 46.65, + "grad_norm": 2.3140316009521484, + "learning_rate": 2e-05, + "loss": 0.05928268, + "step": 23325 + }, + { + "epoch": 46.652, + "grad_norm": 1.1803500652313232, + "learning_rate": 2e-05, + "loss": 0.03985074, + "step": 23326 + }, + { + "epoch": 46.654, + "grad_norm": 1.5823149681091309, + "learning_rate": 2e-05, + "loss": 0.06254095, + "step": 23327 + }, + { + "epoch": 46.656, + "grad_norm": 1.4812148809432983, + "learning_rate": 2e-05, + "loss": 0.05604445, + "step": 23328 + }, + { + "epoch": 46.658, + "grad_norm": 2.1350300312042236, + "learning_rate": 2e-05, + "loss": 0.05875448, + "step": 23329 + }, + { + "epoch": 46.66, + "grad_norm": 1.400207757949829, + "learning_rate": 2e-05, + "loss": 0.07081041, + "step": 23330 + }, + { + "epoch": 46.662, + "grad_norm": 1.0994622707366943, + "learning_rate": 2e-05, + "loss": 0.04271101, + "step": 23331 + }, + { + "epoch": 46.664, + "grad_norm": 1.0609034299850464, + "learning_rate": 2e-05, + "loss": 0.04732626, + "step": 23332 + }, + { + "epoch": 46.666, + "grad_norm": 1.0104788541793823, + "learning_rate": 2e-05, + "loss": 0.03551992, + "step": 23333 + }, + { + "epoch": 46.668, + "grad_norm": 1.1824249029159546, + "learning_rate": 2e-05, + "loss": 0.05614135, + "step": 23334 + }, + { + "epoch": 46.67, + "grad_norm": 2.1531567573547363, + "learning_rate": 2e-05, + "loss": 0.04873116, + "step": 23335 + }, + { + "epoch": 46.672, + "grad_norm": 1.3467340469360352, + "learning_rate": 2e-05, + "loss": 0.03813969, + "step": 23336 + }, + { + "epoch": 46.674, + "grad_norm": 1.037055253982544, + "learning_rate": 2e-05, + "loss": 0.04405836, + "step": 23337 + }, + { + "epoch": 46.676, + "grad_norm": 1.868750810623169, + "learning_rate": 2e-05, + "loss": 0.06937762, + "step": 23338 + }, + { + "epoch": 46.678, + "grad_norm": 1.8962420225143433, + "learning_rate": 2e-05, + "loss": 0.04944228, + "step": 23339 + }, + { + "epoch": 46.68, + "grad_norm": 1.170283317565918, + "learning_rate": 2e-05, + "loss": 0.04979547, + "step": 23340 + }, + { + "epoch": 46.682, + "grad_norm": 1.2021749019622803, + "learning_rate": 2e-05, + "loss": 0.05005801, + "step": 23341 + }, + { + "epoch": 46.684, + "grad_norm": 1.635096549987793, + "learning_rate": 2e-05, + "loss": 0.05956845, + "step": 23342 + }, + { + "epoch": 46.686, + "grad_norm": 2.5779504776000977, + "learning_rate": 2e-05, + "loss": 0.05944379, + "step": 23343 + }, + { + "epoch": 46.688, + "grad_norm": 1.169403076171875, + "learning_rate": 2e-05, + "loss": 0.04585174, + "step": 23344 + }, + { + "epoch": 46.69, + "grad_norm": 1.3411033153533936, + 
"learning_rate": 2e-05, + "loss": 0.06493143, + "step": 23345 + }, + { + "epoch": 46.692, + "grad_norm": 1.2918158769607544, + "learning_rate": 2e-05, + "loss": 0.04256565, + "step": 23346 + }, + { + "epoch": 46.694, + "grad_norm": 0.985222578048706, + "learning_rate": 2e-05, + "loss": 0.0372174, + "step": 23347 + }, + { + "epoch": 46.696, + "grad_norm": 1.0924230813980103, + "learning_rate": 2e-05, + "loss": 0.03921843, + "step": 23348 + }, + { + "epoch": 46.698, + "grad_norm": 1.3069814443588257, + "learning_rate": 2e-05, + "loss": 0.04222802, + "step": 23349 + }, + { + "epoch": 46.7, + "grad_norm": 1.1846057176589966, + "learning_rate": 2e-05, + "loss": 0.04912648, + "step": 23350 + }, + { + "epoch": 46.702, + "grad_norm": 1.3495811223983765, + "learning_rate": 2e-05, + "loss": 0.04603204, + "step": 23351 + }, + { + "epoch": 46.704, + "grad_norm": 1.0049281120300293, + "learning_rate": 2e-05, + "loss": 0.04690091, + "step": 23352 + }, + { + "epoch": 46.706, + "grad_norm": 1.2509807348251343, + "learning_rate": 2e-05, + "loss": 0.05150412, + "step": 23353 + }, + { + "epoch": 46.708, + "grad_norm": 1.5264276266098022, + "learning_rate": 2e-05, + "loss": 0.05389606, + "step": 23354 + }, + { + "epoch": 46.71, + "grad_norm": 1.7153115272521973, + "learning_rate": 2e-05, + "loss": 0.06340338, + "step": 23355 + }, + { + "epoch": 46.712, + "grad_norm": 1.010880708694458, + "learning_rate": 2e-05, + "loss": 0.0329822, + "step": 23356 + }, + { + "epoch": 46.714, + "grad_norm": 1.157622218132019, + "learning_rate": 2e-05, + "loss": 0.05921149, + "step": 23357 + }, + { + "epoch": 46.716, + "grad_norm": 0.9842050075531006, + "learning_rate": 2e-05, + "loss": 0.03667116, + "step": 23358 + }, + { + "epoch": 46.718, + "grad_norm": 1.1801451444625854, + "learning_rate": 2e-05, + "loss": 0.04451838, + "step": 23359 + }, + { + "epoch": 46.72, + "grad_norm": 1.2267253398895264, + "learning_rate": 2e-05, + "loss": 0.05214031, + "step": 23360 + }, + { + "epoch": 46.722, + "grad_norm": 1.019111156463623, + "learning_rate": 2e-05, + "loss": 0.04116342, + "step": 23361 + }, + { + "epoch": 46.724, + "grad_norm": 1.3958607912063599, + "learning_rate": 2e-05, + "loss": 0.05170007, + "step": 23362 + }, + { + "epoch": 46.726, + "grad_norm": 1.5343711376190186, + "learning_rate": 2e-05, + "loss": 0.05161713, + "step": 23363 + }, + { + "epoch": 46.728, + "grad_norm": 0.9997346997261047, + "learning_rate": 2e-05, + "loss": 0.0342833, + "step": 23364 + }, + { + "epoch": 46.73, + "grad_norm": 1.421368956565857, + "learning_rate": 2e-05, + "loss": 0.04781113, + "step": 23365 + }, + { + "epoch": 46.732, + "grad_norm": 1.6076453924179077, + "learning_rate": 2e-05, + "loss": 0.04223795, + "step": 23366 + }, + { + "epoch": 46.734, + "grad_norm": 1.1187045574188232, + "learning_rate": 2e-05, + "loss": 0.03693566, + "step": 23367 + }, + { + "epoch": 46.736, + "grad_norm": 1.6310001611709595, + "learning_rate": 2e-05, + "loss": 0.04587207, + "step": 23368 + }, + { + "epoch": 46.738, + "grad_norm": 1.0003893375396729, + "learning_rate": 2e-05, + "loss": 0.02953731, + "step": 23369 + }, + { + "epoch": 46.74, + "grad_norm": 2.397538900375366, + "learning_rate": 2e-05, + "loss": 0.04967777, + "step": 23370 + }, + { + "epoch": 46.742, + "grad_norm": 2.9524292945861816, + "learning_rate": 2e-05, + "loss": 0.06176184, + "step": 23371 + }, + { + "epoch": 46.744, + "grad_norm": 1.3411389589309692, + "learning_rate": 2e-05, + "loss": 0.05731054, + "step": 23372 + }, + { + "epoch": 46.746, + "grad_norm": 0.9929517507553101, + 
"learning_rate": 2e-05, + "loss": 0.04180113, + "step": 23373 + }, + { + "epoch": 46.748, + "grad_norm": 1.270747184753418, + "learning_rate": 2e-05, + "loss": 0.05174856, + "step": 23374 + }, + { + "epoch": 46.75, + "grad_norm": 1.058241605758667, + "learning_rate": 2e-05, + "loss": 0.04240339, + "step": 23375 + }, + { + "epoch": 46.752, + "grad_norm": 1.1424481868743896, + "learning_rate": 2e-05, + "loss": 0.04190686, + "step": 23376 + }, + { + "epoch": 46.754, + "grad_norm": 1.0503220558166504, + "learning_rate": 2e-05, + "loss": 0.05099996, + "step": 23377 + }, + { + "epoch": 46.756, + "grad_norm": 1.255510687828064, + "learning_rate": 2e-05, + "loss": 0.03783688, + "step": 23378 + }, + { + "epoch": 46.758, + "grad_norm": 1.1512470245361328, + "learning_rate": 2e-05, + "loss": 0.04825658, + "step": 23379 + }, + { + "epoch": 46.76, + "grad_norm": 1.2274504899978638, + "learning_rate": 2e-05, + "loss": 0.05738919, + "step": 23380 + }, + { + "epoch": 46.762, + "grad_norm": 2.000230073928833, + "learning_rate": 2e-05, + "loss": 0.05466451, + "step": 23381 + }, + { + "epoch": 46.764, + "grad_norm": 1.1941397190093994, + "learning_rate": 2e-05, + "loss": 0.0490576, + "step": 23382 + }, + { + "epoch": 46.766, + "grad_norm": 1.1872297525405884, + "learning_rate": 2e-05, + "loss": 0.04906765, + "step": 23383 + }, + { + "epoch": 46.768, + "grad_norm": 1.068359136581421, + "learning_rate": 2e-05, + "loss": 0.04766599, + "step": 23384 + }, + { + "epoch": 46.77, + "grad_norm": 1.1551222801208496, + "learning_rate": 2e-05, + "loss": 0.05015551, + "step": 23385 + }, + { + "epoch": 46.772, + "grad_norm": 1.1826947927474976, + "learning_rate": 2e-05, + "loss": 0.04717891, + "step": 23386 + }, + { + "epoch": 46.774, + "grad_norm": 1.3930027484893799, + "learning_rate": 2e-05, + "loss": 0.04331183, + "step": 23387 + }, + { + "epoch": 46.776, + "grad_norm": 1.001609206199646, + "learning_rate": 2e-05, + "loss": 0.03368718, + "step": 23388 + }, + { + "epoch": 46.778, + "grad_norm": 1.371207356452942, + "learning_rate": 2e-05, + "loss": 0.05175086, + "step": 23389 + }, + { + "epoch": 46.78, + "grad_norm": 1.378040075302124, + "learning_rate": 2e-05, + "loss": 0.04431616, + "step": 23390 + }, + { + "epoch": 46.782, + "grad_norm": 1.5609763860702515, + "learning_rate": 2e-05, + "loss": 0.04517556, + "step": 23391 + }, + { + "epoch": 46.784, + "grad_norm": 1.9839285612106323, + "learning_rate": 2e-05, + "loss": 0.05456992, + "step": 23392 + }, + { + "epoch": 46.786, + "grad_norm": 0.9844745993614197, + "learning_rate": 2e-05, + "loss": 0.0431798, + "step": 23393 + }, + { + "epoch": 46.788, + "grad_norm": 1.6684527397155762, + "learning_rate": 2e-05, + "loss": 0.03772101, + "step": 23394 + }, + { + "epoch": 46.79, + "grad_norm": 1.1648509502410889, + "learning_rate": 2e-05, + "loss": 0.05023489, + "step": 23395 + }, + { + "epoch": 46.792, + "grad_norm": 1.1498349905014038, + "learning_rate": 2e-05, + "loss": 0.05116352, + "step": 23396 + }, + { + "epoch": 46.794, + "grad_norm": 2.005505084991455, + "learning_rate": 2e-05, + "loss": 0.04006296, + "step": 23397 + }, + { + "epoch": 46.796, + "grad_norm": 4.387477874755859, + "learning_rate": 2e-05, + "loss": 0.05183373, + "step": 23398 + }, + { + "epoch": 46.798, + "grad_norm": 2.7198569774627686, + "learning_rate": 2e-05, + "loss": 0.05188341, + "step": 23399 + }, + { + "epoch": 46.8, + "grad_norm": 1.1940276622772217, + "learning_rate": 2e-05, + "loss": 0.05574021, + "step": 23400 + }, + { + "epoch": 46.802, + "grad_norm": 1.0419164896011353, + "learning_rate": 
2e-05, + "loss": 0.04050265, + "step": 23401 + }, + { + "epoch": 46.804, + "grad_norm": 3.387681484222412, + "learning_rate": 2e-05, + "loss": 0.04175478, + "step": 23402 + }, + { + "epoch": 46.806, + "grad_norm": 1.285062551498413, + "learning_rate": 2e-05, + "loss": 0.04971816, + "step": 23403 + }, + { + "epoch": 46.808, + "grad_norm": 1.3174958229064941, + "learning_rate": 2e-05, + "loss": 0.04881707, + "step": 23404 + }, + { + "epoch": 46.81, + "grad_norm": 3.6777937412261963, + "learning_rate": 2e-05, + "loss": 0.0468701, + "step": 23405 + }, + { + "epoch": 46.812, + "grad_norm": 1.1515544652938843, + "learning_rate": 2e-05, + "loss": 0.04995457, + "step": 23406 + }, + { + "epoch": 46.814, + "grad_norm": 1.6691585779190063, + "learning_rate": 2e-05, + "loss": 0.04617448, + "step": 23407 + }, + { + "epoch": 46.816, + "grad_norm": 1.3046892881393433, + "learning_rate": 2e-05, + "loss": 0.04137983, + "step": 23408 + }, + { + "epoch": 46.818, + "grad_norm": 1.3128255605697632, + "learning_rate": 2e-05, + "loss": 0.05065984, + "step": 23409 + }, + { + "epoch": 46.82, + "grad_norm": 1.3586230278015137, + "learning_rate": 2e-05, + "loss": 0.05450898, + "step": 23410 + }, + { + "epoch": 46.822, + "grad_norm": 1.2502920627593994, + "learning_rate": 2e-05, + "loss": 0.03689632, + "step": 23411 + }, + { + "epoch": 46.824, + "grad_norm": 1.1424442529678345, + "learning_rate": 2e-05, + "loss": 0.03632981, + "step": 23412 + }, + { + "epoch": 46.826, + "grad_norm": 1.3228057622909546, + "learning_rate": 2e-05, + "loss": 0.03984486, + "step": 23413 + }, + { + "epoch": 46.828, + "grad_norm": 1.0505292415618896, + "learning_rate": 2e-05, + "loss": 0.04190434, + "step": 23414 + }, + { + "epoch": 46.83, + "grad_norm": 1.3199437856674194, + "learning_rate": 2e-05, + "loss": 0.0435336, + "step": 23415 + }, + { + "epoch": 46.832, + "grad_norm": 2.5862584114074707, + "learning_rate": 2e-05, + "loss": 0.06176166, + "step": 23416 + }, + { + "epoch": 46.834, + "grad_norm": 1.1140897274017334, + "learning_rate": 2e-05, + "loss": 0.04548195, + "step": 23417 + }, + { + "epoch": 46.836, + "grad_norm": 1.4556688070297241, + "learning_rate": 2e-05, + "loss": 0.04776576, + "step": 23418 + }, + { + "epoch": 46.838, + "grad_norm": 2.7034659385681152, + "learning_rate": 2e-05, + "loss": 0.04163261, + "step": 23419 + }, + { + "epoch": 46.84, + "grad_norm": 1.0860694646835327, + "learning_rate": 2e-05, + "loss": 0.04266633, + "step": 23420 + }, + { + "epoch": 46.842, + "grad_norm": 1.2049165964126587, + "learning_rate": 2e-05, + "loss": 0.05778315, + "step": 23421 + }, + { + "epoch": 46.844, + "grad_norm": 1.6713166236877441, + "learning_rate": 2e-05, + "loss": 0.06890272, + "step": 23422 + }, + { + "epoch": 46.846, + "grad_norm": 3.021561622619629, + "learning_rate": 2e-05, + "loss": 0.05821927, + "step": 23423 + }, + { + "epoch": 46.848, + "grad_norm": 1.4241578578948975, + "learning_rate": 2e-05, + "loss": 0.0569243, + "step": 23424 + }, + { + "epoch": 46.85, + "grad_norm": 1.1604105234146118, + "learning_rate": 2e-05, + "loss": 0.0533638, + "step": 23425 + }, + { + "epoch": 46.852, + "grad_norm": 1.0262799263000488, + "learning_rate": 2e-05, + "loss": 0.04001687, + "step": 23426 + }, + { + "epoch": 46.854, + "grad_norm": 1.0680865049362183, + "learning_rate": 2e-05, + "loss": 0.04203242, + "step": 23427 + }, + { + "epoch": 46.856, + "grad_norm": 1.7087593078613281, + "learning_rate": 2e-05, + "loss": 0.04629037, + "step": 23428 + }, + { + "epoch": 46.858, + "grad_norm": 1.3922479152679443, + "learning_rate": 2e-05, + 
"loss": 0.05780581, + "step": 23429 + }, + { + "epoch": 46.86, + "grad_norm": 1.3703882694244385, + "learning_rate": 2e-05, + "loss": 0.05306429, + "step": 23430 + }, + { + "epoch": 46.862, + "grad_norm": 1.1331651210784912, + "learning_rate": 2e-05, + "loss": 0.05829866, + "step": 23431 + }, + { + "epoch": 46.864, + "grad_norm": 0.9955253601074219, + "learning_rate": 2e-05, + "loss": 0.03273296, + "step": 23432 + }, + { + "epoch": 46.866, + "grad_norm": 0.9291515946388245, + "learning_rate": 2e-05, + "loss": 0.03978248, + "step": 23433 + }, + { + "epoch": 46.868, + "grad_norm": 1.4390240907669067, + "learning_rate": 2e-05, + "loss": 0.06263384, + "step": 23434 + }, + { + "epoch": 46.87, + "grad_norm": 1.2633771896362305, + "learning_rate": 2e-05, + "loss": 0.05546718, + "step": 23435 + }, + { + "epoch": 46.872, + "grad_norm": 0.8656467199325562, + "learning_rate": 2e-05, + "loss": 0.03080487, + "step": 23436 + }, + { + "epoch": 46.874, + "grad_norm": 1.1050388813018799, + "learning_rate": 2e-05, + "loss": 0.05227588, + "step": 23437 + }, + { + "epoch": 46.876, + "grad_norm": 1.1237446069717407, + "learning_rate": 2e-05, + "loss": 0.06108324, + "step": 23438 + }, + { + "epoch": 46.878, + "grad_norm": 1.0278390645980835, + "learning_rate": 2e-05, + "loss": 0.03754516, + "step": 23439 + }, + { + "epoch": 46.88, + "grad_norm": 1.2690916061401367, + "learning_rate": 2e-05, + "loss": 0.05982792, + "step": 23440 + }, + { + "epoch": 46.882, + "grad_norm": 0.9860858917236328, + "learning_rate": 2e-05, + "loss": 0.04347444, + "step": 23441 + }, + { + "epoch": 46.884, + "grad_norm": 1.0543830394744873, + "learning_rate": 2e-05, + "loss": 0.03453768, + "step": 23442 + }, + { + "epoch": 46.886, + "grad_norm": 1.214526891708374, + "learning_rate": 2e-05, + "loss": 0.04959118, + "step": 23443 + }, + { + "epoch": 46.888, + "grad_norm": 1.9631701707839966, + "learning_rate": 2e-05, + "loss": 0.05660867, + "step": 23444 + }, + { + "epoch": 46.89, + "grad_norm": 1.270965337753296, + "learning_rate": 2e-05, + "loss": 0.05391593, + "step": 23445 + }, + { + "epoch": 46.892, + "grad_norm": 1.369183897972107, + "learning_rate": 2e-05, + "loss": 0.05142535, + "step": 23446 + }, + { + "epoch": 46.894, + "grad_norm": 3.9048008918762207, + "learning_rate": 2e-05, + "loss": 0.041148, + "step": 23447 + }, + { + "epoch": 46.896, + "grad_norm": 2.134962320327759, + "learning_rate": 2e-05, + "loss": 0.03332872, + "step": 23448 + }, + { + "epoch": 46.898, + "grad_norm": 1.895607352256775, + "learning_rate": 2e-05, + "loss": 0.0488515, + "step": 23449 + }, + { + "epoch": 46.9, + "grad_norm": 1.2083396911621094, + "learning_rate": 2e-05, + "loss": 0.04162563, + "step": 23450 + }, + { + "epoch": 46.902, + "grad_norm": 1.1905925273895264, + "learning_rate": 2e-05, + "loss": 0.03953518, + "step": 23451 + }, + { + "epoch": 46.904, + "grad_norm": 1.0462309122085571, + "learning_rate": 2e-05, + "loss": 0.04261171, + "step": 23452 + }, + { + "epoch": 46.906, + "grad_norm": 1.2605634927749634, + "learning_rate": 2e-05, + "loss": 0.04457783, + "step": 23453 + }, + { + "epoch": 46.908, + "grad_norm": 1.0726561546325684, + "learning_rate": 2e-05, + "loss": 0.04535056, + "step": 23454 + }, + { + "epoch": 46.91, + "grad_norm": 1.0684914588928223, + "learning_rate": 2e-05, + "loss": 0.04435501, + "step": 23455 + }, + { + "epoch": 46.912, + "grad_norm": 1.568894386291504, + "learning_rate": 2e-05, + "loss": 0.05053505, + "step": 23456 + }, + { + "epoch": 46.914, + "grad_norm": 1.0680261850357056, + "learning_rate": 2e-05, + "loss": 
0.04221952, + "step": 23457 + }, + { + "epoch": 46.916, + "grad_norm": 1.2090034484863281, + "learning_rate": 2e-05, + "loss": 0.05732998, + "step": 23458 + }, + { + "epoch": 46.918, + "grad_norm": 1.0553538799285889, + "learning_rate": 2e-05, + "loss": 0.03570461, + "step": 23459 + }, + { + "epoch": 46.92, + "grad_norm": 1.3866254091262817, + "learning_rate": 2e-05, + "loss": 0.05774738, + "step": 23460 + }, + { + "epoch": 46.922, + "grad_norm": 1.218993067741394, + "learning_rate": 2e-05, + "loss": 0.05130921, + "step": 23461 + }, + { + "epoch": 46.924, + "grad_norm": 1.7904051542282104, + "learning_rate": 2e-05, + "loss": 0.05767091, + "step": 23462 + }, + { + "epoch": 46.926, + "grad_norm": 1.2114713191986084, + "learning_rate": 2e-05, + "loss": 0.05555228, + "step": 23463 + }, + { + "epoch": 46.928, + "grad_norm": 1.6034760475158691, + "learning_rate": 2e-05, + "loss": 0.056468, + "step": 23464 + }, + { + "epoch": 46.93, + "grad_norm": 1.8197824954986572, + "learning_rate": 2e-05, + "loss": 0.03906733, + "step": 23465 + }, + { + "epoch": 46.932, + "grad_norm": 2.5520434379577637, + "learning_rate": 2e-05, + "loss": 0.05988537, + "step": 23466 + }, + { + "epoch": 46.934, + "grad_norm": 1.1971849203109741, + "learning_rate": 2e-05, + "loss": 0.05080458, + "step": 23467 + }, + { + "epoch": 46.936, + "grad_norm": 1.2232027053833008, + "learning_rate": 2e-05, + "loss": 0.0653313, + "step": 23468 + }, + { + "epoch": 46.938, + "grad_norm": 1.3607845306396484, + "learning_rate": 2e-05, + "loss": 0.04467438, + "step": 23469 + }, + { + "epoch": 46.94, + "grad_norm": 1.129961371421814, + "learning_rate": 2e-05, + "loss": 0.05106883, + "step": 23470 + }, + { + "epoch": 46.942, + "grad_norm": 1.3855695724487305, + "learning_rate": 2e-05, + "loss": 0.0506998, + "step": 23471 + }, + { + "epoch": 46.944, + "grad_norm": 1.0359809398651123, + "learning_rate": 2e-05, + "loss": 0.04331663, + "step": 23472 + }, + { + "epoch": 46.946, + "grad_norm": 0.8526789546012878, + "learning_rate": 2e-05, + "loss": 0.03156386, + "step": 23473 + }, + { + "epoch": 46.948, + "grad_norm": 1.0871293544769287, + "learning_rate": 2e-05, + "loss": 0.04656047, + "step": 23474 + }, + { + "epoch": 46.95, + "grad_norm": 1.156686782836914, + "learning_rate": 2e-05, + "loss": 0.06759372, + "step": 23475 + }, + { + "epoch": 46.952, + "grad_norm": 2.5178990364074707, + "learning_rate": 2e-05, + "loss": 0.07468967, + "step": 23476 + }, + { + "epoch": 46.954, + "grad_norm": 1.1853928565979004, + "learning_rate": 2e-05, + "loss": 0.0469472, + "step": 23477 + }, + { + "epoch": 46.956, + "grad_norm": 1.038954257965088, + "learning_rate": 2e-05, + "loss": 0.03782792, + "step": 23478 + }, + { + "epoch": 46.958, + "grad_norm": 1.2154408693313599, + "learning_rate": 2e-05, + "loss": 0.05507773, + "step": 23479 + }, + { + "epoch": 46.96, + "grad_norm": 1.1402373313903809, + "learning_rate": 2e-05, + "loss": 0.0420175, + "step": 23480 + }, + { + "epoch": 46.962, + "grad_norm": 2.3009064197540283, + "learning_rate": 2e-05, + "loss": 0.04467285, + "step": 23481 + }, + { + "epoch": 46.964, + "grad_norm": 2.1940722465515137, + "learning_rate": 2e-05, + "loss": 0.05642062, + "step": 23482 + }, + { + "epoch": 46.966, + "grad_norm": 1.6159992218017578, + "learning_rate": 2e-05, + "loss": 0.04018041, + "step": 23483 + }, + { + "epoch": 46.968, + "grad_norm": 1.1516873836517334, + "learning_rate": 2e-05, + "loss": 0.04684192, + "step": 23484 + }, + { + "epoch": 46.97, + "grad_norm": 1.2707068920135498, + "learning_rate": 2e-05, + "loss": 0.04745233, + 
"step": 23485 + }, + { + "epoch": 46.972, + "grad_norm": 1.360982060432434, + "learning_rate": 2e-05, + "loss": 0.04974321, + "step": 23486 + }, + { + "epoch": 46.974, + "grad_norm": 1.3548970222473145, + "learning_rate": 2e-05, + "loss": 0.04797807, + "step": 23487 + }, + { + "epoch": 46.976, + "grad_norm": 1.2003024816513062, + "learning_rate": 2e-05, + "loss": 0.04765528, + "step": 23488 + }, + { + "epoch": 46.978, + "grad_norm": 1.032074213027954, + "learning_rate": 2e-05, + "loss": 0.03950176, + "step": 23489 + }, + { + "epoch": 46.98, + "grad_norm": 1.1735929250717163, + "learning_rate": 2e-05, + "loss": 0.05224654, + "step": 23490 + }, + { + "epoch": 46.982, + "grad_norm": 1.1766208410263062, + "learning_rate": 2e-05, + "loss": 0.03919265, + "step": 23491 + }, + { + "epoch": 46.984, + "grad_norm": 1.5051755905151367, + "learning_rate": 2e-05, + "loss": 0.04944252, + "step": 23492 + }, + { + "epoch": 46.986, + "grad_norm": 1.4483623504638672, + "learning_rate": 2e-05, + "loss": 0.04749278, + "step": 23493 + }, + { + "epoch": 46.988, + "grad_norm": 1.991761565208435, + "learning_rate": 2e-05, + "loss": 0.07904573, + "step": 23494 + }, + { + "epoch": 46.99, + "grad_norm": 2.063960552215576, + "learning_rate": 2e-05, + "loss": 0.06021646, + "step": 23495 + }, + { + "epoch": 46.992, + "grad_norm": 2.799022912979126, + "learning_rate": 2e-05, + "loss": 0.04049257, + "step": 23496 + }, + { + "epoch": 46.994, + "grad_norm": 1.7096704244613647, + "learning_rate": 2e-05, + "loss": 0.04271127, + "step": 23497 + }, + { + "epoch": 46.996, + "grad_norm": 0.984707236289978, + "learning_rate": 2e-05, + "loss": 0.03668294, + "step": 23498 + }, + { + "epoch": 46.998, + "grad_norm": 1.7703429460525513, + "learning_rate": 2e-05, + "loss": 0.05086554, + "step": 23499 + }, + { + "epoch": 47.0, + "grad_norm": 1.1229311227798462, + "learning_rate": 2e-05, + "loss": 0.04655139, + "step": 23500 + }, + { + "epoch": 47.0, + "eval_performance": { + "AngleClassification_1": 0.998, + "AngleClassification_2": 1.0, + "AngleClassification_3": 0.9820359281437125, + "Equal_1": 1.0, + "Equal_2": 0.9880239520958084, + "Equal_3": 0.9880239520958084, + "LineComparison_1": 1.0, + "LineComparison_2": 0.998003992015968, + "LineComparison_3": 0.998003992015968, + "Parallel_1": 0.9939879759519038, + "Parallel_2": 0.9939879759519038, + "Parallel_3": 0.992, + "Perpendicular_1": 0.998, + "Perpendicular_2": 0.998, + "Perpendicular_3": 0.905811623246493, + "PointLiesOnCircle_1": 0.9979959919839679, + "PointLiesOnCircle_2": 0.9976666666666667, + "PointLiesOnCircle_3": 0.9936, + "PointLiesOnLine_1": 1.0, + "PointLiesOnLine_2": 0.9939879759519038, + "PointLiesOnLine_3": 0.9860279441117764 + }, + "eval_runtime": 319.7259, + "eval_samples_per_second": 32.841, + "eval_steps_per_second": 0.657, + "step": 23500 + }, + { + "epoch": 47.002, + "grad_norm": 1.2124334573745728, + "learning_rate": 2e-05, + "loss": 0.04620025, + "step": 23501 + }, + { + "epoch": 47.004, + "grad_norm": 1.2111510038375854, + "learning_rate": 2e-05, + "loss": 0.03958354, + "step": 23502 + }, + { + "epoch": 47.006, + "grad_norm": 2.0674355030059814, + "learning_rate": 2e-05, + "loss": 0.06526448, + "step": 23503 + }, + { + "epoch": 47.008, + "grad_norm": 1.139857530593872, + "learning_rate": 2e-05, + "loss": 0.0366558, + "step": 23504 + }, + { + "epoch": 47.01, + "grad_norm": 1.407399296760559, + "learning_rate": 2e-05, + "loss": 0.05581947, + "step": 23505 + }, + { + "epoch": 47.012, + "grad_norm": 1.1468596458435059, + "learning_rate": 2e-05, + "loss": 0.04368159, 
+ "step": 23506 + }, + { + "epoch": 47.014, + "grad_norm": 1.4752362966537476, + "learning_rate": 2e-05, + "loss": 0.03284196, + "step": 23507 + }, + { + "epoch": 47.016, + "grad_norm": 1.083961009979248, + "learning_rate": 2e-05, + "loss": 0.04144663, + "step": 23508 + }, + { + "epoch": 47.018, + "grad_norm": 1.0464342832565308, + "learning_rate": 2e-05, + "loss": 0.04529237, + "step": 23509 + }, + { + "epoch": 47.02, + "grad_norm": 1.1925157308578491, + "learning_rate": 2e-05, + "loss": 0.05837334, + "step": 23510 + }, + { + "epoch": 47.022, + "grad_norm": 1.135394811630249, + "learning_rate": 2e-05, + "loss": 0.04342172, + "step": 23511 + }, + { + "epoch": 47.024, + "grad_norm": 1.2975212335586548, + "learning_rate": 2e-05, + "loss": 0.05698482, + "step": 23512 + }, + { + "epoch": 47.026, + "grad_norm": 0.9662984609603882, + "learning_rate": 2e-05, + "loss": 0.03092372, + "step": 23513 + }, + { + "epoch": 47.028, + "grad_norm": 1.3168891668319702, + "learning_rate": 2e-05, + "loss": 0.05116647, + "step": 23514 + }, + { + "epoch": 47.03, + "grad_norm": 1.0158759355545044, + "learning_rate": 2e-05, + "loss": 0.04270767, + "step": 23515 + }, + { + "epoch": 47.032, + "grad_norm": 1.0800869464874268, + "learning_rate": 2e-05, + "loss": 0.0404634, + "step": 23516 + }, + { + "epoch": 47.034, + "grad_norm": 1.3503354787826538, + "learning_rate": 2e-05, + "loss": 0.06739471, + "step": 23517 + }, + { + "epoch": 47.036, + "grad_norm": 1.1669559478759766, + "learning_rate": 2e-05, + "loss": 0.04633216, + "step": 23518 + }, + { + "epoch": 47.038, + "grad_norm": 1.211911678314209, + "learning_rate": 2e-05, + "loss": 0.05827531, + "step": 23519 + }, + { + "epoch": 47.04, + "grad_norm": 1.1868705749511719, + "learning_rate": 2e-05, + "loss": 0.04901566, + "step": 23520 + }, + { + "epoch": 47.042, + "grad_norm": 0.9578158855438232, + "learning_rate": 2e-05, + "loss": 0.04152099, + "step": 23521 + }, + { + "epoch": 47.044, + "grad_norm": 1.1845344305038452, + "learning_rate": 2e-05, + "loss": 0.04116198, + "step": 23522 + }, + { + "epoch": 47.046, + "grad_norm": 1.7209062576293945, + "learning_rate": 2e-05, + "loss": 0.05426127, + "step": 23523 + }, + { + "epoch": 47.048, + "grad_norm": 1.0993847846984863, + "learning_rate": 2e-05, + "loss": 0.04611755, + "step": 23524 + }, + { + "epoch": 47.05, + "grad_norm": 1.5714466571807861, + "learning_rate": 2e-05, + "loss": 0.06220062, + "step": 23525 + }, + { + "epoch": 47.052, + "grad_norm": 1.5425214767456055, + "learning_rate": 2e-05, + "loss": 0.04734546, + "step": 23526 + }, + { + "epoch": 47.054, + "grad_norm": 1.330129623413086, + "learning_rate": 2e-05, + "loss": 0.0763645, + "step": 23527 + }, + { + "epoch": 47.056, + "grad_norm": 1.205364465713501, + "learning_rate": 2e-05, + "loss": 0.06087791, + "step": 23528 + }, + { + "epoch": 47.058, + "grad_norm": 1.8291528224945068, + "learning_rate": 2e-05, + "loss": 0.05230067, + "step": 23529 + }, + { + "epoch": 47.06, + "grad_norm": 1.2380144596099854, + "learning_rate": 2e-05, + "loss": 0.05399077, + "step": 23530 + }, + { + "epoch": 47.062, + "grad_norm": 1.1345731019973755, + "learning_rate": 2e-05, + "loss": 0.04412289, + "step": 23531 + }, + { + "epoch": 47.064, + "grad_norm": 1.3171319961547852, + "learning_rate": 2e-05, + "loss": 0.05258146, + "step": 23532 + }, + { + "epoch": 47.066, + "grad_norm": 1.1284446716308594, + "learning_rate": 2e-05, + "loss": 0.04565362, + "step": 23533 + }, + { + "epoch": 47.068, + "grad_norm": 1.0812745094299316, + "learning_rate": 2e-05, + "loss": 0.03943041, + "step": 
23534 + }, + { + "epoch": 47.07, + "grad_norm": 1.1932939291000366, + "learning_rate": 2e-05, + "loss": 0.03978365, + "step": 23535 + }, + { + "epoch": 47.072, + "grad_norm": 1.5841753482818604, + "learning_rate": 2e-05, + "loss": 0.04983898, + "step": 23536 + }, + { + "epoch": 47.074, + "grad_norm": 1.2005212306976318, + "learning_rate": 2e-05, + "loss": 0.0624929, + "step": 23537 + }, + { + "epoch": 47.076, + "grad_norm": 1.3606860637664795, + "learning_rate": 2e-05, + "loss": 0.0616478, + "step": 23538 + }, + { + "epoch": 47.078, + "grad_norm": 1.0764256715774536, + "learning_rate": 2e-05, + "loss": 0.04636963, + "step": 23539 + }, + { + "epoch": 47.08, + "grad_norm": 1.1802319288253784, + "learning_rate": 2e-05, + "loss": 0.05296212, + "step": 23540 + }, + { + "epoch": 47.082, + "grad_norm": 1.249846339225769, + "learning_rate": 2e-05, + "loss": 0.05203488, + "step": 23541 + }, + { + "epoch": 47.084, + "grad_norm": 1.3313225507736206, + "learning_rate": 2e-05, + "loss": 0.05261878, + "step": 23542 + }, + { + "epoch": 47.086, + "grad_norm": 1.1203484535217285, + "learning_rate": 2e-05, + "loss": 0.04687518, + "step": 23543 + }, + { + "epoch": 47.088, + "grad_norm": 1.1626969575881958, + "learning_rate": 2e-05, + "loss": 0.04210647, + "step": 23544 + }, + { + "epoch": 47.09, + "grad_norm": 1.0756282806396484, + "learning_rate": 2e-05, + "loss": 0.04296558, + "step": 23545 + }, + { + "epoch": 47.092, + "grad_norm": 1.29530930519104, + "learning_rate": 2e-05, + "loss": 0.05416109, + "step": 23546 + }, + { + "epoch": 47.094, + "grad_norm": 1.05868661403656, + "learning_rate": 2e-05, + "loss": 0.04070142, + "step": 23547 + }, + { + "epoch": 47.096, + "grad_norm": 1.0833359956741333, + "learning_rate": 2e-05, + "loss": 0.0563876, + "step": 23548 + }, + { + "epoch": 47.098, + "grad_norm": 1.1828356981277466, + "learning_rate": 2e-05, + "loss": 0.05474529, + "step": 23549 + }, + { + "epoch": 47.1, + "grad_norm": 1.6338591575622559, + "learning_rate": 2e-05, + "loss": 0.04851716, + "step": 23550 + }, + { + "epoch": 47.102, + "grad_norm": 1.1993439197540283, + "learning_rate": 2e-05, + "loss": 0.05251703, + "step": 23551 + }, + { + "epoch": 47.104, + "grad_norm": 1.1439919471740723, + "learning_rate": 2e-05, + "loss": 0.04419599, + "step": 23552 + }, + { + "epoch": 47.106, + "grad_norm": 1.2255439758300781, + "learning_rate": 2e-05, + "loss": 0.04737487, + "step": 23553 + }, + { + "epoch": 47.108, + "grad_norm": 1.085721492767334, + "learning_rate": 2e-05, + "loss": 0.03265755, + "step": 23554 + }, + { + "epoch": 47.11, + "grad_norm": 1.1767328977584839, + "learning_rate": 2e-05, + "loss": 0.05188051, + "step": 23555 + }, + { + "epoch": 47.112, + "grad_norm": 1.1831814050674438, + "learning_rate": 2e-05, + "loss": 0.06118751, + "step": 23556 + }, + { + "epoch": 47.114, + "grad_norm": 1.093849778175354, + "learning_rate": 2e-05, + "loss": 0.04082596, + "step": 23557 + }, + { + "epoch": 47.116, + "grad_norm": 1.1168532371520996, + "learning_rate": 2e-05, + "loss": 0.05243224, + "step": 23558 + }, + { + "epoch": 47.118, + "grad_norm": 1.548278570175171, + "learning_rate": 2e-05, + "loss": 0.05186, + "step": 23559 + }, + { + "epoch": 47.12, + "grad_norm": 1.2307542562484741, + "learning_rate": 2e-05, + "loss": 0.05896568, + "step": 23560 + }, + { + "epoch": 47.122, + "grad_norm": 1.2599307298660278, + "learning_rate": 2e-05, + "loss": 0.04969592, + "step": 23561 + }, + { + "epoch": 47.124, + "grad_norm": 1.4370466470718384, + "learning_rate": 2e-05, + "loss": 0.06255047, + "step": 23562 + }, + { + 
"epoch": 47.126, + "grad_norm": 1.6804131269454956, + "learning_rate": 2e-05, + "loss": 0.04512757, + "step": 23563 + }, + { + "epoch": 47.128, + "grad_norm": 1.4804704189300537, + "learning_rate": 2e-05, + "loss": 0.04683591, + "step": 23564 + }, + { + "epoch": 47.13, + "grad_norm": 2.8036322593688965, + "learning_rate": 2e-05, + "loss": 0.06730855, + "step": 23565 + }, + { + "epoch": 47.132, + "grad_norm": 0.9568971395492554, + "learning_rate": 2e-05, + "loss": 0.04147626, + "step": 23566 + }, + { + "epoch": 47.134, + "grad_norm": 1.3193092346191406, + "learning_rate": 2e-05, + "loss": 0.05314566, + "step": 23567 + }, + { + "epoch": 47.136, + "grad_norm": 1.122524380683899, + "learning_rate": 2e-05, + "loss": 0.04262283, + "step": 23568 + }, + { + "epoch": 47.138, + "grad_norm": 1.9593148231506348, + "learning_rate": 2e-05, + "loss": 0.05774532, + "step": 23569 + }, + { + "epoch": 47.14, + "grad_norm": 1.1386750936508179, + "learning_rate": 2e-05, + "loss": 0.0475467, + "step": 23570 + }, + { + "epoch": 47.142, + "grad_norm": 1.329964518547058, + "learning_rate": 2e-05, + "loss": 0.04642937, + "step": 23571 + }, + { + "epoch": 47.144, + "grad_norm": 1.109947681427002, + "learning_rate": 2e-05, + "loss": 0.05008635, + "step": 23572 + }, + { + "epoch": 47.146, + "grad_norm": 1.2006001472473145, + "learning_rate": 2e-05, + "loss": 0.05233764, + "step": 23573 + }, + { + "epoch": 47.148, + "grad_norm": 1.4829745292663574, + "learning_rate": 2e-05, + "loss": 0.05814692, + "step": 23574 + }, + { + "epoch": 47.15, + "grad_norm": 1.1940250396728516, + "learning_rate": 2e-05, + "loss": 0.05612388, + "step": 23575 + }, + { + "epoch": 47.152, + "grad_norm": 1.3208078145980835, + "learning_rate": 2e-05, + "loss": 0.04610663, + "step": 23576 + }, + { + "epoch": 47.154, + "grad_norm": 1.1997742652893066, + "learning_rate": 2e-05, + "loss": 0.04106606, + "step": 23577 + }, + { + "epoch": 47.156, + "grad_norm": 1.40494704246521, + "learning_rate": 2e-05, + "loss": 0.06252164, + "step": 23578 + }, + { + "epoch": 47.158, + "grad_norm": 0.9245737791061401, + "learning_rate": 2e-05, + "loss": 0.04033583, + "step": 23579 + }, + { + "epoch": 47.16, + "grad_norm": 1.717743992805481, + "learning_rate": 2e-05, + "loss": 0.06210865, + "step": 23580 + }, + { + "epoch": 47.162, + "grad_norm": 1.206834316253662, + "learning_rate": 2e-05, + "loss": 0.05354768, + "step": 23581 + }, + { + "epoch": 47.164, + "grad_norm": 1.2854136228561401, + "learning_rate": 2e-05, + "loss": 0.05695827, + "step": 23582 + }, + { + "epoch": 47.166, + "grad_norm": 1.6390953063964844, + "learning_rate": 2e-05, + "loss": 0.06032401, + "step": 23583 + }, + { + "epoch": 47.168, + "grad_norm": 1.2514336109161377, + "learning_rate": 2e-05, + "loss": 0.05170215, + "step": 23584 + }, + { + "epoch": 47.17, + "grad_norm": 1.274780035018921, + "learning_rate": 2e-05, + "loss": 0.03985206, + "step": 23585 + }, + { + "epoch": 47.172, + "grad_norm": 1.108774185180664, + "learning_rate": 2e-05, + "loss": 0.03357091, + "step": 23586 + }, + { + "epoch": 47.174, + "grad_norm": 1.0785712003707886, + "learning_rate": 2e-05, + "loss": 0.03320477, + "step": 23587 + }, + { + "epoch": 47.176, + "grad_norm": 1.3981683254241943, + "learning_rate": 2e-05, + "loss": 0.03950857, + "step": 23588 + }, + { + "epoch": 47.178, + "grad_norm": 1.2349064350128174, + "learning_rate": 2e-05, + "loss": 0.06858353, + "step": 23589 + }, + { + "epoch": 47.18, + "grad_norm": 2.030306100845337, + "learning_rate": 2e-05, + "loss": 0.04219834, + "step": 23590 + }, + { + "epoch": 
47.182, + "grad_norm": 1.5019062757492065, + "learning_rate": 2e-05, + "loss": 0.05146636, + "step": 23591 + }, + { + "epoch": 47.184, + "grad_norm": 1.3464275598526, + "learning_rate": 2e-05, + "loss": 0.05433298, + "step": 23592 + }, + { + "epoch": 47.186, + "grad_norm": 1.466103196144104, + "learning_rate": 2e-05, + "loss": 0.06641914, + "step": 23593 + }, + { + "epoch": 47.188, + "grad_norm": 1.3546744585037231, + "learning_rate": 2e-05, + "loss": 0.05045114, + "step": 23594 + }, + { + "epoch": 47.19, + "grad_norm": 1.330115795135498, + "learning_rate": 2e-05, + "loss": 0.0545896, + "step": 23595 + }, + { + "epoch": 47.192, + "grad_norm": 1.141314148902893, + "learning_rate": 2e-05, + "loss": 0.03988175, + "step": 23596 + }, + { + "epoch": 47.194, + "grad_norm": 1.3933604955673218, + "learning_rate": 2e-05, + "loss": 0.05277719, + "step": 23597 + }, + { + "epoch": 47.196, + "grad_norm": 1.3836783170700073, + "learning_rate": 2e-05, + "loss": 0.05862165, + "step": 23598 + }, + { + "epoch": 47.198, + "grad_norm": 1.1174200773239136, + "learning_rate": 2e-05, + "loss": 0.04964589, + "step": 23599 + }, + { + "epoch": 47.2, + "grad_norm": 1.247449278831482, + "learning_rate": 2e-05, + "loss": 0.05483104, + "step": 23600 + }, + { + "epoch": 47.202, + "grad_norm": 1.1020945310592651, + "learning_rate": 2e-05, + "loss": 0.05271735, + "step": 23601 + }, + { + "epoch": 47.204, + "grad_norm": 1.5037002563476562, + "learning_rate": 2e-05, + "loss": 0.06174751, + "step": 23602 + }, + { + "epoch": 47.206, + "grad_norm": 1.4615405797958374, + "learning_rate": 2e-05, + "loss": 0.05728906, + "step": 23603 + }, + { + "epoch": 47.208, + "grad_norm": 1.1932406425476074, + "learning_rate": 2e-05, + "loss": 0.05166669, + "step": 23604 + }, + { + "epoch": 47.21, + "grad_norm": 1.133001685142517, + "learning_rate": 2e-05, + "loss": 0.05504667, + "step": 23605 + }, + { + "epoch": 47.212, + "grad_norm": 1.0907678604125977, + "learning_rate": 2e-05, + "loss": 0.04519046, + "step": 23606 + }, + { + "epoch": 47.214, + "grad_norm": 1.0519095659255981, + "learning_rate": 2e-05, + "loss": 0.0439795, + "step": 23607 + }, + { + "epoch": 47.216, + "grad_norm": 1.272680401802063, + "learning_rate": 2e-05, + "loss": 0.04971275, + "step": 23608 + }, + { + "epoch": 47.218, + "grad_norm": 1.4395430088043213, + "learning_rate": 2e-05, + "loss": 0.05114156, + "step": 23609 + }, + { + "epoch": 47.22, + "grad_norm": 1.1029596328735352, + "learning_rate": 2e-05, + "loss": 0.04121825, + "step": 23610 + }, + { + "epoch": 47.222, + "grad_norm": 1.232066035270691, + "learning_rate": 2e-05, + "loss": 0.04981194, + "step": 23611 + }, + { + "epoch": 47.224, + "grad_norm": 1.2140462398529053, + "learning_rate": 2e-05, + "loss": 0.05153929, + "step": 23612 + }, + { + "epoch": 47.226, + "grad_norm": 1.5724217891693115, + "learning_rate": 2e-05, + "loss": 0.05892528, + "step": 23613 + }, + { + "epoch": 47.228, + "grad_norm": 1.089890480041504, + "learning_rate": 2e-05, + "loss": 0.03622951, + "step": 23614 + }, + { + "epoch": 47.23, + "grad_norm": 1.137673258781433, + "learning_rate": 2e-05, + "loss": 0.05067534, + "step": 23615 + }, + { + "epoch": 47.232, + "grad_norm": 1.6163313388824463, + "learning_rate": 2e-05, + "loss": 0.07416356, + "step": 23616 + }, + { + "epoch": 47.234, + "grad_norm": 1.5656497478485107, + "learning_rate": 2e-05, + "loss": 0.04488578, + "step": 23617 + }, + { + "epoch": 47.236, + "grad_norm": 2.585951566696167, + "learning_rate": 2e-05, + "loss": 0.0466084, + "step": 23618 + }, + { + "epoch": 47.238, + 
"grad_norm": 1.1935782432556152, + "learning_rate": 2e-05, + "loss": 0.03517368, + "step": 23619 + }, + { + "epoch": 47.24, + "grad_norm": 0.9748817682266235, + "learning_rate": 2e-05, + "loss": 0.03498342, + "step": 23620 + }, + { + "epoch": 47.242, + "grad_norm": 1.339208960533142, + "learning_rate": 2e-05, + "loss": 0.04053495, + "step": 23621 + }, + { + "epoch": 47.244, + "grad_norm": 1.0967516899108887, + "learning_rate": 2e-05, + "loss": 0.03619142, + "step": 23622 + }, + { + "epoch": 47.246, + "grad_norm": 1.4304256439208984, + "learning_rate": 2e-05, + "loss": 0.05116164, + "step": 23623 + }, + { + "epoch": 47.248, + "grad_norm": 1.2332102060317993, + "learning_rate": 2e-05, + "loss": 0.05349259, + "step": 23624 + }, + { + "epoch": 47.25, + "grad_norm": 1.084965467453003, + "learning_rate": 2e-05, + "loss": 0.04240631, + "step": 23625 + }, + { + "epoch": 47.252, + "grad_norm": 1.075219750404358, + "learning_rate": 2e-05, + "loss": 0.04627833, + "step": 23626 + }, + { + "epoch": 47.254, + "grad_norm": 1.1400893926620483, + "learning_rate": 2e-05, + "loss": 0.04680464, + "step": 23627 + }, + { + "epoch": 47.256, + "grad_norm": 1.1270958185195923, + "learning_rate": 2e-05, + "loss": 0.04278168, + "step": 23628 + }, + { + "epoch": 47.258, + "grad_norm": 2.2311246395111084, + "learning_rate": 2e-05, + "loss": 0.05921525, + "step": 23629 + }, + { + "epoch": 47.26, + "grad_norm": 1.1150248050689697, + "learning_rate": 2e-05, + "loss": 0.04678026, + "step": 23630 + }, + { + "epoch": 47.262, + "grad_norm": 1.1364946365356445, + "learning_rate": 2e-05, + "loss": 0.05590563, + "step": 23631 + }, + { + "epoch": 47.264, + "grad_norm": 1.2034296989440918, + "learning_rate": 2e-05, + "loss": 0.05708367, + "step": 23632 + }, + { + "epoch": 47.266, + "grad_norm": 1.4349358081817627, + "learning_rate": 2e-05, + "loss": 0.06511645, + "step": 23633 + }, + { + "epoch": 47.268, + "grad_norm": 1.4762065410614014, + "learning_rate": 2e-05, + "loss": 0.06263445, + "step": 23634 + }, + { + "epoch": 47.27, + "grad_norm": 0.9478891491889954, + "learning_rate": 2e-05, + "loss": 0.03476822, + "step": 23635 + }, + { + "epoch": 47.272, + "grad_norm": 1.010331153869629, + "learning_rate": 2e-05, + "loss": 0.02933269, + "step": 23636 + }, + { + "epoch": 47.274, + "grad_norm": 1.250553011894226, + "learning_rate": 2e-05, + "loss": 0.04823877, + "step": 23637 + }, + { + "epoch": 47.276, + "grad_norm": 0.9914244413375854, + "learning_rate": 2e-05, + "loss": 0.02885609, + "step": 23638 + }, + { + "epoch": 47.278, + "grad_norm": 1.766867995262146, + "learning_rate": 2e-05, + "loss": 0.05482132, + "step": 23639 + }, + { + "epoch": 47.28, + "grad_norm": 1.9935588836669922, + "learning_rate": 2e-05, + "loss": 0.05211507, + "step": 23640 + }, + { + "epoch": 47.282, + "grad_norm": 1.3328672647476196, + "learning_rate": 2e-05, + "loss": 0.05550382, + "step": 23641 + }, + { + "epoch": 47.284, + "grad_norm": 1.2937812805175781, + "learning_rate": 2e-05, + "loss": 0.05083098, + "step": 23642 + }, + { + "epoch": 47.286, + "grad_norm": 1.1793773174285889, + "learning_rate": 2e-05, + "loss": 0.04735196, + "step": 23643 + }, + { + "epoch": 47.288, + "grad_norm": 0.9814954400062561, + "learning_rate": 2e-05, + "loss": 0.03630852, + "step": 23644 + }, + { + "epoch": 47.29, + "grad_norm": 1.104866862297058, + "learning_rate": 2e-05, + "loss": 0.04346426, + "step": 23645 + }, + { + "epoch": 47.292, + "grad_norm": 1.7783113718032837, + "learning_rate": 2e-05, + "loss": 0.05869078, + "step": 23646 + }, + { + "epoch": 47.294, + 
"grad_norm": 1.1085009574890137, + "learning_rate": 2e-05, + "loss": 0.05062877, + "step": 23647 + }, + { + "epoch": 47.296, + "grad_norm": 1.106187343597412, + "learning_rate": 2e-05, + "loss": 0.05129405, + "step": 23648 + }, + { + "epoch": 47.298, + "grad_norm": 1.0873476266860962, + "learning_rate": 2e-05, + "loss": 0.03707412, + "step": 23649 + }, + { + "epoch": 47.3, + "grad_norm": 1.7342414855957031, + "learning_rate": 2e-05, + "loss": 0.05554866, + "step": 23650 + }, + { + "epoch": 47.302, + "grad_norm": 0.9549355506896973, + "learning_rate": 2e-05, + "loss": 0.03941218, + "step": 23651 + }, + { + "epoch": 47.304, + "grad_norm": 1.0804848670959473, + "learning_rate": 2e-05, + "loss": 0.04693265, + "step": 23652 + }, + { + "epoch": 47.306, + "grad_norm": 1.6769800186157227, + "learning_rate": 2e-05, + "loss": 0.05842284, + "step": 23653 + }, + { + "epoch": 47.308, + "grad_norm": 1.0862559080123901, + "learning_rate": 2e-05, + "loss": 0.04118923, + "step": 23654 + }, + { + "epoch": 47.31, + "grad_norm": 1.7151005268096924, + "learning_rate": 2e-05, + "loss": 0.07205494, + "step": 23655 + }, + { + "epoch": 47.312, + "grad_norm": 1.1123970746994019, + "learning_rate": 2e-05, + "loss": 0.04154256, + "step": 23656 + }, + { + "epoch": 47.314, + "grad_norm": 1.0019702911376953, + "learning_rate": 2e-05, + "loss": 0.03406595, + "step": 23657 + }, + { + "epoch": 47.316, + "grad_norm": 1.2018071413040161, + "learning_rate": 2e-05, + "loss": 0.04904677, + "step": 23658 + }, + { + "epoch": 47.318, + "grad_norm": 1.2852320671081543, + "learning_rate": 2e-05, + "loss": 0.03975239, + "step": 23659 + }, + { + "epoch": 47.32, + "grad_norm": 1.0827425718307495, + "learning_rate": 2e-05, + "loss": 0.0492649, + "step": 23660 + }, + { + "epoch": 47.322, + "grad_norm": 1.0454564094543457, + "learning_rate": 2e-05, + "loss": 0.03652071, + "step": 23661 + }, + { + "epoch": 47.324, + "grad_norm": 1.3567827939987183, + "learning_rate": 2e-05, + "loss": 0.05563585, + "step": 23662 + }, + { + "epoch": 47.326, + "grad_norm": 2.5465247631073, + "learning_rate": 2e-05, + "loss": 0.06577208, + "step": 23663 + }, + { + "epoch": 47.328, + "grad_norm": 1.4325695037841797, + "learning_rate": 2e-05, + "loss": 0.06627315, + "step": 23664 + }, + { + "epoch": 47.33, + "grad_norm": 1.2820574045181274, + "learning_rate": 2e-05, + "loss": 0.05097567, + "step": 23665 + }, + { + "epoch": 47.332, + "grad_norm": 0.9658817648887634, + "learning_rate": 2e-05, + "loss": 0.03891777, + "step": 23666 + }, + { + "epoch": 47.334, + "grad_norm": 0.9408270716667175, + "learning_rate": 2e-05, + "loss": 0.03226583, + "step": 23667 + }, + { + "epoch": 47.336, + "grad_norm": 1.1844184398651123, + "learning_rate": 2e-05, + "loss": 0.04694714, + "step": 23668 + }, + { + "epoch": 47.338, + "grad_norm": 1.1536945104599, + "learning_rate": 2e-05, + "loss": 0.04273265, + "step": 23669 + }, + { + "epoch": 47.34, + "grad_norm": 1.163088083267212, + "learning_rate": 2e-05, + "loss": 0.03042351, + "step": 23670 + }, + { + "epoch": 47.342, + "grad_norm": 1.5534110069274902, + "learning_rate": 2e-05, + "loss": 0.04814804, + "step": 23671 + }, + { + "epoch": 47.344, + "grad_norm": 1.2012584209442139, + "learning_rate": 2e-05, + "loss": 0.04714233, + "step": 23672 + }, + { + "epoch": 47.346, + "grad_norm": 1.1810247898101807, + "learning_rate": 2e-05, + "loss": 0.03760591, + "step": 23673 + }, + { + "epoch": 47.348, + "grad_norm": 1.1035114526748657, + "learning_rate": 2e-05, + "loss": 0.04680648, + "step": 23674 + }, + { + "epoch": 47.35, + "grad_norm": 
0.9210943579673767, + "learning_rate": 2e-05, + "loss": 0.03328598, + "step": 23675 + }, + { + "epoch": 47.352, + "grad_norm": 1.0501015186309814, + "learning_rate": 2e-05, + "loss": 0.03831122, + "step": 23676 + }, + { + "epoch": 47.354, + "grad_norm": 2.732465982437134, + "learning_rate": 2e-05, + "loss": 0.05034593, + "step": 23677 + }, + { + "epoch": 47.356, + "grad_norm": 0.9773300290107727, + "learning_rate": 2e-05, + "loss": 0.03243728, + "step": 23678 + }, + { + "epoch": 47.358, + "grad_norm": 1.2418444156646729, + "learning_rate": 2e-05, + "loss": 0.05027065, + "step": 23679 + }, + { + "epoch": 47.36, + "grad_norm": 1.1005994081497192, + "learning_rate": 2e-05, + "loss": 0.04392687, + "step": 23680 + }, + { + "epoch": 47.362, + "grad_norm": 1.098421573638916, + "learning_rate": 2e-05, + "loss": 0.05161924, + "step": 23681 + }, + { + "epoch": 47.364, + "grad_norm": 1.4422335624694824, + "learning_rate": 2e-05, + "loss": 0.04385792, + "step": 23682 + }, + { + "epoch": 47.366, + "grad_norm": 1.126664638519287, + "learning_rate": 2e-05, + "loss": 0.04566282, + "step": 23683 + }, + { + "epoch": 47.368, + "grad_norm": 1.4247503280639648, + "learning_rate": 2e-05, + "loss": 0.05417196, + "step": 23684 + }, + { + "epoch": 47.37, + "grad_norm": 1.1334466934204102, + "learning_rate": 2e-05, + "loss": 0.04590701, + "step": 23685 + }, + { + "epoch": 47.372, + "grad_norm": 1.1766184568405151, + "learning_rate": 2e-05, + "loss": 0.04920424, + "step": 23686 + }, + { + "epoch": 47.374, + "grad_norm": 1.0475587844848633, + "learning_rate": 2e-05, + "loss": 0.03809017, + "step": 23687 + }, + { + "epoch": 47.376, + "grad_norm": 1.0179909467697144, + "learning_rate": 2e-05, + "loss": 0.0358481, + "step": 23688 + }, + { + "epoch": 47.378, + "grad_norm": 1.281684398651123, + "learning_rate": 2e-05, + "loss": 0.05288438, + "step": 23689 + }, + { + "epoch": 47.38, + "grad_norm": 1.3577895164489746, + "learning_rate": 2e-05, + "loss": 0.05755025, + "step": 23690 + }, + { + "epoch": 47.382, + "grad_norm": 1.284982681274414, + "learning_rate": 2e-05, + "loss": 0.0437559, + "step": 23691 + }, + { + "epoch": 47.384, + "grad_norm": 0.9902780055999756, + "learning_rate": 2e-05, + "loss": 0.03365954, + "step": 23692 + }, + { + "epoch": 47.386, + "grad_norm": 2.0288844108581543, + "learning_rate": 2e-05, + "loss": 0.03784881, + "step": 23693 + }, + { + "epoch": 47.388, + "grad_norm": 1.6296567916870117, + "learning_rate": 2e-05, + "loss": 0.05740002, + "step": 23694 + }, + { + "epoch": 47.39, + "grad_norm": 1.2057472467422485, + "learning_rate": 2e-05, + "loss": 0.04737182, + "step": 23695 + }, + { + "epoch": 47.392, + "grad_norm": 2.2552409172058105, + "learning_rate": 2e-05, + "loss": 0.05677157, + "step": 23696 + }, + { + "epoch": 47.394, + "grad_norm": 1.9773105382919312, + "learning_rate": 2e-05, + "loss": 0.05616665, + "step": 23697 + }, + { + "epoch": 47.396, + "grad_norm": 1.2296985387802124, + "learning_rate": 2e-05, + "loss": 0.04554373, + "step": 23698 + }, + { + "epoch": 47.398, + "grad_norm": 1.108909010887146, + "learning_rate": 2e-05, + "loss": 0.03959118, + "step": 23699 + }, + { + "epoch": 47.4, + "grad_norm": 1.2280199527740479, + "learning_rate": 2e-05, + "loss": 0.0505251, + "step": 23700 + }, + { + "epoch": 47.402, + "grad_norm": 1.2505056858062744, + "learning_rate": 2e-05, + "loss": 0.04818206, + "step": 23701 + }, + { + "epoch": 47.404, + "grad_norm": 1.2215907573699951, + "learning_rate": 2e-05, + "loss": 0.06155346, + "step": 23702 + }, + { + "epoch": 47.406, + "grad_norm": 
2.548524856567383, + "learning_rate": 2e-05, + "loss": 0.07346775, + "step": 23703 + }, + { + "epoch": 47.408, + "grad_norm": 1.2835642099380493, + "learning_rate": 2e-05, + "loss": 0.03647726, + "step": 23704 + }, + { + "epoch": 47.41, + "grad_norm": 1.0395970344543457, + "learning_rate": 2e-05, + "loss": 0.03743298, + "step": 23705 + }, + { + "epoch": 47.412, + "grad_norm": 1.1787834167480469, + "learning_rate": 2e-05, + "loss": 0.05008036, + "step": 23706 + }, + { + "epoch": 47.414, + "grad_norm": 1.2608765363693237, + "learning_rate": 2e-05, + "loss": 0.05554523, + "step": 23707 + }, + { + "epoch": 47.416, + "grad_norm": 1.1391633749008179, + "learning_rate": 2e-05, + "loss": 0.04239395, + "step": 23708 + }, + { + "epoch": 47.418, + "grad_norm": 1.310951590538025, + "learning_rate": 2e-05, + "loss": 0.05426461, + "step": 23709 + }, + { + "epoch": 47.42, + "grad_norm": 2.4088597297668457, + "learning_rate": 2e-05, + "loss": 0.06128083, + "step": 23710 + }, + { + "epoch": 47.422, + "grad_norm": 1.4820548295974731, + "learning_rate": 2e-05, + "loss": 0.07166282, + "step": 23711 + }, + { + "epoch": 47.424, + "grad_norm": 1.2579067945480347, + "learning_rate": 2e-05, + "loss": 0.05245124, + "step": 23712 + }, + { + "epoch": 47.426, + "grad_norm": 1.2548208236694336, + "learning_rate": 2e-05, + "loss": 0.04808481, + "step": 23713 + }, + { + "epoch": 47.428, + "grad_norm": 1.0751184225082397, + "learning_rate": 2e-05, + "loss": 0.0375668, + "step": 23714 + }, + { + "epoch": 47.43, + "grad_norm": 1.1384459733963013, + "learning_rate": 2e-05, + "loss": 0.03911222, + "step": 23715 + }, + { + "epoch": 47.432, + "grad_norm": 1.2228471040725708, + "learning_rate": 2e-05, + "loss": 0.05863813, + "step": 23716 + }, + { + "epoch": 47.434, + "grad_norm": 3.312714099884033, + "learning_rate": 2e-05, + "loss": 0.06117887, + "step": 23717 + }, + { + "epoch": 47.436, + "grad_norm": 1.1511445045471191, + "learning_rate": 2e-05, + "loss": 0.04497742, + "step": 23718 + }, + { + "epoch": 47.438, + "grad_norm": 1.4445806741714478, + "learning_rate": 2e-05, + "loss": 0.06244376, + "step": 23719 + }, + { + "epoch": 47.44, + "grad_norm": 1.4664429426193237, + "learning_rate": 2e-05, + "loss": 0.03415911, + "step": 23720 + }, + { + "epoch": 47.442, + "grad_norm": 1.0343490839004517, + "learning_rate": 2e-05, + "loss": 0.03623316, + "step": 23721 + }, + { + "epoch": 47.444, + "grad_norm": 1.4916282892227173, + "learning_rate": 2e-05, + "loss": 0.04197778, + "step": 23722 + }, + { + "epoch": 47.446, + "grad_norm": 1.2317581176757812, + "learning_rate": 2e-05, + "loss": 0.05793425, + "step": 23723 + }, + { + "epoch": 47.448, + "grad_norm": 2.5570762157440186, + "learning_rate": 2e-05, + "loss": 0.04695795, + "step": 23724 + }, + { + "epoch": 47.45, + "grad_norm": 0.9492204785346985, + "learning_rate": 2e-05, + "loss": 0.0334493, + "step": 23725 + }, + { + "epoch": 47.452, + "grad_norm": 1.1067724227905273, + "learning_rate": 2e-05, + "loss": 0.03886347, + "step": 23726 + }, + { + "epoch": 47.454, + "grad_norm": 1.2168097496032715, + "learning_rate": 2e-05, + "loss": 0.04640615, + "step": 23727 + }, + { + "epoch": 47.456, + "grad_norm": 1.1498087644577026, + "learning_rate": 2e-05, + "loss": 0.05414462, + "step": 23728 + }, + { + "epoch": 47.458, + "grad_norm": 1.2377431392669678, + "learning_rate": 2e-05, + "loss": 0.05543441, + "step": 23729 + }, + { + "epoch": 47.46, + "grad_norm": 1.3042511940002441, + "learning_rate": 2e-05, + "loss": 0.05268206, + "step": 23730 + }, + { + "epoch": 47.462, + "grad_norm": 
1.4321894645690918, + "learning_rate": 2e-05, + "loss": 0.04099277, + "step": 23731 + }, + { + "epoch": 47.464, + "grad_norm": 1.5580233335494995, + "learning_rate": 2e-05, + "loss": 0.05169383, + "step": 23732 + }, + { + "epoch": 47.466, + "grad_norm": 1.2211647033691406, + "learning_rate": 2e-05, + "loss": 0.03869744, + "step": 23733 + }, + { + "epoch": 47.468, + "grad_norm": 1.683929681777954, + "learning_rate": 2e-05, + "loss": 0.06134295, + "step": 23734 + }, + { + "epoch": 47.47, + "grad_norm": 1.2739115953445435, + "learning_rate": 2e-05, + "loss": 0.0545798, + "step": 23735 + }, + { + "epoch": 47.472, + "grad_norm": 1.387938141822815, + "learning_rate": 2e-05, + "loss": 0.06269515, + "step": 23736 + }, + { + "epoch": 47.474, + "grad_norm": 1.0513253211975098, + "learning_rate": 2e-05, + "loss": 0.03459634, + "step": 23737 + }, + { + "epoch": 47.476, + "grad_norm": 1.5735803842544556, + "learning_rate": 2e-05, + "loss": 0.05392249, + "step": 23738 + }, + { + "epoch": 47.478, + "grad_norm": 1.1136903762817383, + "learning_rate": 2e-05, + "loss": 0.0393229, + "step": 23739 + }, + { + "epoch": 47.48, + "grad_norm": 1.2298059463500977, + "learning_rate": 2e-05, + "loss": 0.05348741, + "step": 23740 + }, + { + "epoch": 47.482, + "grad_norm": 1.1526238918304443, + "learning_rate": 2e-05, + "loss": 0.04516281, + "step": 23741 + }, + { + "epoch": 47.484, + "grad_norm": 1.8501273393630981, + "learning_rate": 2e-05, + "loss": 0.04677322, + "step": 23742 + }, + { + "epoch": 47.486, + "grad_norm": 1.4598199129104614, + "learning_rate": 2e-05, + "loss": 0.03834684, + "step": 23743 + }, + { + "epoch": 47.488, + "grad_norm": 1.539868950843811, + "learning_rate": 2e-05, + "loss": 0.04157211, + "step": 23744 + }, + { + "epoch": 47.49, + "grad_norm": 1.016363501548767, + "learning_rate": 2e-05, + "loss": 0.0401577, + "step": 23745 + }, + { + "epoch": 47.492, + "grad_norm": 1.0004823207855225, + "learning_rate": 2e-05, + "loss": 0.04216732, + "step": 23746 + }, + { + "epoch": 47.494, + "grad_norm": 0.8949176669120789, + "learning_rate": 2e-05, + "loss": 0.03267929, + "step": 23747 + }, + { + "epoch": 47.496, + "grad_norm": 1.092294692993164, + "learning_rate": 2e-05, + "loss": 0.04605146, + "step": 23748 + }, + { + "epoch": 47.498, + "grad_norm": 1.3999691009521484, + "learning_rate": 2e-05, + "loss": 0.05625776, + "step": 23749 + }, + { + "epoch": 47.5, + "grad_norm": 1.136635422706604, + "learning_rate": 2e-05, + "loss": 0.04005692, + "step": 23750 + }, + { + "epoch": 47.502, + "grad_norm": 1.1078810691833496, + "learning_rate": 2e-05, + "loss": 0.04342239, + "step": 23751 + }, + { + "epoch": 47.504, + "grad_norm": 1.279944658279419, + "learning_rate": 2e-05, + "loss": 0.04219215, + "step": 23752 + }, + { + "epoch": 47.506, + "grad_norm": 1.091110110282898, + "learning_rate": 2e-05, + "loss": 0.03880176, + "step": 23753 + }, + { + "epoch": 47.508, + "grad_norm": 4.729748249053955, + "learning_rate": 2e-05, + "loss": 0.05638552, + "step": 23754 + }, + { + "epoch": 47.51, + "grad_norm": 1.0970360040664673, + "learning_rate": 2e-05, + "loss": 0.04364794, + "step": 23755 + }, + { + "epoch": 47.512, + "grad_norm": 1.0389903783798218, + "learning_rate": 2e-05, + "loss": 0.04169441, + "step": 23756 + }, + { + "epoch": 47.514, + "grad_norm": 1.5221785306930542, + "learning_rate": 2e-05, + "loss": 0.06585445, + "step": 23757 + }, + { + "epoch": 47.516, + "grad_norm": 2.1099374294281006, + "learning_rate": 2e-05, + "loss": 0.06468124, + "step": 23758 + }, + { + "epoch": 47.518, + "grad_norm": 
1.0862318277359009, + "learning_rate": 2e-05, + "loss": 0.03993117, + "step": 23759 + }, + { + "epoch": 47.52, + "grad_norm": 1.1114388704299927, + "learning_rate": 2e-05, + "loss": 0.04338128, + "step": 23760 + }, + { + "epoch": 47.522, + "grad_norm": 0.9857814908027649, + "learning_rate": 2e-05, + "loss": 0.03844403, + "step": 23761 + }, + { + "epoch": 47.524, + "grad_norm": 1.3540973663330078, + "learning_rate": 2e-05, + "loss": 0.04602545, + "step": 23762 + }, + { + "epoch": 47.526, + "grad_norm": 1.3859474658966064, + "learning_rate": 2e-05, + "loss": 0.06367904, + "step": 23763 + }, + { + "epoch": 47.528, + "grad_norm": 1.4162659645080566, + "learning_rate": 2e-05, + "loss": 0.05949718, + "step": 23764 + }, + { + "epoch": 47.53, + "grad_norm": 1.3201795816421509, + "learning_rate": 2e-05, + "loss": 0.06164296, + "step": 23765 + }, + { + "epoch": 47.532, + "grad_norm": 1.2807005643844604, + "learning_rate": 2e-05, + "loss": 0.05345618, + "step": 23766 + }, + { + "epoch": 47.534, + "grad_norm": 1.0871586799621582, + "learning_rate": 2e-05, + "loss": 0.04408365, + "step": 23767 + }, + { + "epoch": 47.536, + "grad_norm": 1.0492790937423706, + "learning_rate": 2e-05, + "loss": 0.04874376, + "step": 23768 + }, + { + "epoch": 47.538, + "grad_norm": 1.1767281293869019, + "learning_rate": 2e-05, + "loss": 0.04965818, + "step": 23769 + }, + { + "epoch": 47.54, + "grad_norm": 1.1047857999801636, + "learning_rate": 2e-05, + "loss": 0.05242363, + "step": 23770 + }, + { + "epoch": 47.542, + "grad_norm": 1.1232954263687134, + "learning_rate": 2e-05, + "loss": 0.05646368, + "step": 23771 + }, + { + "epoch": 47.544, + "grad_norm": 1.3574858903884888, + "learning_rate": 2e-05, + "loss": 0.04148731, + "step": 23772 + }, + { + "epoch": 47.546, + "grad_norm": 0.9722660183906555, + "learning_rate": 2e-05, + "loss": 0.0293147, + "step": 23773 + }, + { + "epoch": 47.548, + "grad_norm": 1.1585569381713867, + "learning_rate": 2e-05, + "loss": 0.03967403, + "step": 23774 + }, + { + "epoch": 47.55, + "grad_norm": 1.473108172416687, + "learning_rate": 2e-05, + "loss": 0.06600234, + "step": 23775 + }, + { + "epoch": 47.552, + "grad_norm": 1.2558122873306274, + "learning_rate": 2e-05, + "loss": 0.04727027, + "step": 23776 + }, + { + "epoch": 47.554, + "grad_norm": 1.1267999410629272, + "learning_rate": 2e-05, + "loss": 0.05461453, + "step": 23777 + }, + { + "epoch": 47.556, + "grad_norm": 1.1623164415359497, + "learning_rate": 2e-05, + "loss": 0.04348694, + "step": 23778 + }, + { + "epoch": 47.558, + "grad_norm": 0.9064064621925354, + "learning_rate": 2e-05, + "loss": 0.03024364, + "step": 23779 + }, + { + "epoch": 47.56, + "grad_norm": 1.4345759153366089, + "learning_rate": 2e-05, + "loss": 0.04869916, + "step": 23780 + }, + { + "epoch": 47.562, + "grad_norm": 1.3848798274993896, + "learning_rate": 2e-05, + "loss": 0.06412567, + "step": 23781 + }, + { + "epoch": 47.564, + "grad_norm": 1.2268092632293701, + "learning_rate": 2e-05, + "loss": 0.0550194, + "step": 23782 + }, + { + "epoch": 47.566, + "grad_norm": 1.1246010065078735, + "learning_rate": 2e-05, + "loss": 0.05901127, + "step": 23783 + }, + { + "epoch": 47.568, + "grad_norm": 1.1794065237045288, + "learning_rate": 2e-05, + "loss": 0.04719983, + "step": 23784 + }, + { + "epoch": 47.57, + "grad_norm": 1.2328189611434937, + "learning_rate": 2e-05, + "loss": 0.03939323, + "step": 23785 + }, + { + "epoch": 47.572, + "grad_norm": 1.12712824344635, + "learning_rate": 2e-05, + "loss": 0.04430801, + "step": 23786 + }, + { + "epoch": 47.574, + "grad_norm": 
1.1956403255462646, + "learning_rate": 2e-05, + "loss": 0.05049133, + "step": 23787 + }, + { + "epoch": 47.576, + "grad_norm": 1.102604866027832, + "learning_rate": 2e-05, + "loss": 0.04754536, + "step": 23788 + }, + { + "epoch": 47.578, + "grad_norm": 1.1247801780700684, + "learning_rate": 2e-05, + "loss": 0.04530304, + "step": 23789 + }, + { + "epoch": 47.58, + "grad_norm": 1.1731112003326416, + "learning_rate": 2e-05, + "loss": 0.04685323, + "step": 23790 + }, + { + "epoch": 47.582, + "grad_norm": 1.0738763809204102, + "learning_rate": 2e-05, + "loss": 0.0424612, + "step": 23791 + }, + { + "epoch": 47.584, + "grad_norm": 0.9249600172042847, + "learning_rate": 2e-05, + "loss": 0.03432335, + "step": 23792 + }, + { + "epoch": 47.586, + "grad_norm": 0.9936639666557312, + "learning_rate": 2e-05, + "loss": 0.03993072, + "step": 23793 + }, + { + "epoch": 47.588, + "grad_norm": 1.2670681476593018, + "learning_rate": 2e-05, + "loss": 0.04661375, + "step": 23794 + }, + { + "epoch": 47.59, + "grad_norm": 1.8145248889923096, + "learning_rate": 2e-05, + "loss": 0.05395586, + "step": 23795 + }, + { + "epoch": 47.592, + "grad_norm": 1.2371265888214111, + "learning_rate": 2e-05, + "loss": 0.06457849, + "step": 23796 + }, + { + "epoch": 47.594, + "grad_norm": 1.2411563396453857, + "learning_rate": 2e-05, + "loss": 0.04113891, + "step": 23797 + }, + { + "epoch": 47.596, + "grad_norm": 1.3027358055114746, + "learning_rate": 2e-05, + "loss": 0.04472353, + "step": 23798 + }, + { + "epoch": 47.598, + "grad_norm": 1.9770257472991943, + "learning_rate": 2e-05, + "loss": 0.06172611, + "step": 23799 + }, + { + "epoch": 47.6, + "grad_norm": 1.2722480297088623, + "learning_rate": 2e-05, + "loss": 0.04975735, + "step": 23800 + }, + { + "epoch": 47.602, + "grad_norm": 0.9771064519882202, + "learning_rate": 2e-05, + "loss": 0.03003059, + "step": 23801 + }, + { + "epoch": 47.604, + "grad_norm": 1.6549073457717896, + "learning_rate": 2e-05, + "loss": 0.04169197, + "step": 23802 + }, + { + "epoch": 47.606, + "grad_norm": 0.9755164980888367, + "learning_rate": 2e-05, + "loss": 0.03423417, + "step": 23803 + }, + { + "epoch": 47.608, + "grad_norm": 1.199762225151062, + "learning_rate": 2e-05, + "loss": 0.05413834, + "step": 23804 + }, + { + "epoch": 47.61, + "grad_norm": 1.410494089126587, + "learning_rate": 2e-05, + "loss": 0.04707893, + "step": 23805 + }, + { + "epoch": 47.612, + "grad_norm": 1.3130079507827759, + "learning_rate": 2e-05, + "loss": 0.05722672, + "step": 23806 + }, + { + "epoch": 47.614, + "grad_norm": 1.174035668373108, + "learning_rate": 2e-05, + "loss": 0.05756243, + "step": 23807 + }, + { + "epoch": 47.616, + "grad_norm": 1.7935864925384521, + "learning_rate": 2e-05, + "loss": 0.07460059, + "step": 23808 + }, + { + "epoch": 47.618, + "grad_norm": 1.6251633167266846, + "learning_rate": 2e-05, + "loss": 0.03863, + "step": 23809 + }, + { + "epoch": 47.62, + "grad_norm": 1.1181731224060059, + "learning_rate": 2e-05, + "loss": 0.04170064, + "step": 23810 + }, + { + "epoch": 47.622, + "grad_norm": 1.3144530057907104, + "learning_rate": 2e-05, + "loss": 0.05499358, + "step": 23811 + }, + { + "epoch": 47.624, + "grad_norm": 1.4863946437835693, + "learning_rate": 2e-05, + "loss": 0.07462236, + "step": 23812 + }, + { + "epoch": 47.626, + "grad_norm": 1.226033329963684, + "learning_rate": 2e-05, + "loss": 0.05091349, + "step": 23813 + }, + { + "epoch": 47.628, + "grad_norm": 1.164097785949707, + "learning_rate": 2e-05, + "loss": 0.0417085, + "step": 23814 + }, + { + "epoch": 47.63, + "grad_norm": 
3.6329402923583984, + "learning_rate": 2e-05, + "loss": 0.03966795, + "step": 23815 + }, + { + "epoch": 47.632, + "grad_norm": 1.1707428693771362, + "learning_rate": 2e-05, + "loss": 0.05277382, + "step": 23816 + }, + { + "epoch": 47.634, + "grad_norm": 0.8920641541481018, + "learning_rate": 2e-05, + "loss": 0.02758377, + "step": 23817 + }, + { + "epoch": 47.636, + "grad_norm": 1.1014950275421143, + "learning_rate": 2e-05, + "loss": 0.04578391, + "step": 23818 + }, + { + "epoch": 47.638, + "grad_norm": 1.0088590383529663, + "learning_rate": 2e-05, + "loss": 0.0345577, + "step": 23819 + }, + { + "epoch": 47.64, + "grad_norm": 1.6413283348083496, + "learning_rate": 2e-05, + "loss": 0.04818995, + "step": 23820 + }, + { + "epoch": 47.642, + "grad_norm": 1.2504075765609741, + "learning_rate": 2e-05, + "loss": 0.04787406, + "step": 23821 + }, + { + "epoch": 47.644, + "grad_norm": 2.165092945098877, + "learning_rate": 2e-05, + "loss": 0.03882634, + "step": 23822 + }, + { + "epoch": 47.646, + "grad_norm": 0.9500136971473694, + "learning_rate": 2e-05, + "loss": 0.04030318, + "step": 23823 + }, + { + "epoch": 47.648, + "grad_norm": 1.0927907228469849, + "learning_rate": 2e-05, + "loss": 0.03360862, + "step": 23824 + }, + { + "epoch": 47.65, + "grad_norm": 1.44313645362854, + "learning_rate": 2e-05, + "loss": 0.05937745, + "step": 23825 + }, + { + "epoch": 47.652, + "grad_norm": 1.2534681558609009, + "learning_rate": 2e-05, + "loss": 0.04984522, + "step": 23826 + }, + { + "epoch": 47.654, + "grad_norm": 2.9447219371795654, + "learning_rate": 2e-05, + "loss": 0.05307635, + "step": 23827 + }, + { + "epoch": 47.656, + "grad_norm": 1.3493146896362305, + "learning_rate": 2e-05, + "loss": 0.04327986, + "step": 23828 + }, + { + "epoch": 47.658, + "grad_norm": 1.0709292888641357, + "learning_rate": 2e-05, + "loss": 0.04765247, + "step": 23829 + }, + { + "epoch": 47.66, + "grad_norm": 1.281948447227478, + "learning_rate": 2e-05, + "loss": 0.05185589, + "step": 23830 + }, + { + "epoch": 47.662, + "grad_norm": 1.2605935335159302, + "learning_rate": 2e-05, + "loss": 0.05030036, + "step": 23831 + }, + { + "epoch": 47.664, + "grad_norm": 1.413135051727295, + "learning_rate": 2e-05, + "loss": 0.05803326, + "step": 23832 + }, + { + "epoch": 47.666, + "grad_norm": 1.2617324590682983, + "learning_rate": 2e-05, + "loss": 0.04378517, + "step": 23833 + }, + { + "epoch": 47.668, + "grad_norm": 1.1886942386627197, + "learning_rate": 2e-05, + "loss": 0.05778721, + "step": 23834 + }, + { + "epoch": 47.67, + "grad_norm": 3.0332577228546143, + "learning_rate": 2e-05, + "loss": 0.05872624, + "step": 23835 + }, + { + "epoch": 47.672, + "grad_norm": 1.4225571155548096, + "learning_rate": 2e-05, + "loss": 0.03956306, + "step": 23836 + }, + { + "epoch": 47.674, + "grad_norm": 0.9652089476585388, + "learning_rate": 2e-05, + "loss": 0.02727574, + "step": 23837 + }, + { + "epoch": 47.676, + "grad_norm": 1.103406548500061, + "learning_rate": 2e-05, + "loss": 0.04494239, + "step": 23838 + }, + { + "epoch": 47.678, + "grad_norm": 1.1904319524765015, + "learning_rate": 2e-05, + "loss": 0.04636001, + "step": 23839 + }, + { + "epoch": 47.68, + "grad_norm": 1.2117693424224854, + "learning_rate": 2e-05, + "loss": 0.04273736, + "step": 23840 + }, + { + "epoch": 47.682, + "grad_norm": 1.095442771911621, + "learning_rate": 2e-05, + "loss": 0.04069439, + "step": 23841 + }, + { + "epoch": 47.684, + "grad_norm": 1.315514326095581, + "learning_rate": 2e-05, + "loss": 0.05416162, + "step": 23842 + }, + { + "epoch": 47.686, + "grad_norm": 
1.8992160558700562, + "learning_rate": 2e-05, + "loss": 0.07332712, + "step": 23843 + }, + { + "epoch": 47.688, + "grad_norm": 1.0993719100952148, + "learning_rate": 2e-05, + "loss": 0.04374916, + "step": 23844 + }, + { + "epoch": 47.69, + "grad_norm": 1.2032912969589233, + "learning_rate": 2e-05, + "loss": 0.04704859, + "step": 23845 + }, + { + "epoch": 47.692, + "grad_norm": 5.187289237976074, + "learning_rate": 2e-05, + "loss": 0.04735934, + "step": 23846 + }, + { + "epoch": 47.694, + "grad_norm": 1.0638306140899658, + "learning_rate": 2e-05, + "loss": 0.04312051, + "step": 23847 + }, + { + "epoch": 47.696, + "grad_norm": 1.0537846088409424, + "learning_rate": 2e-05, + "loss": 0.04088642, + "step": 23848 + }, + { + "epoch": 47.698, + "grad_norm": 1.1591740846633911, + "learning_rate": 2e-05, + "loss": 0.04216747, + "step": 23849 + }, + { + "epoch": 47.7, + "grad_norm": 0.8905053734779358, + "learning_rate": 2e-05, + "loss": 0.03059562, + "step": 23850 + }, + { + "epoch": 47.702, + "grad_norm": 0.9576981663703918, + "learning_rate": 2e-05, + "loss": 0.02699701, + "step": 23851 + }, + { + "epoch": 47.704, + "grad_norm": 1.3115862607955933, + "learning_rate": 2e-05, + "loss": 0.04908556, + "step": 23852 + }, + { + "epoch": 47.706, + "grad_norm": 1.1104012727737427, + "learning_rate": 2e-05, + "loss": 0.04494857, + "step": 23853 + }, + { + "epoch": 47.708, + "grad_norm": 1.0533535480499268, + "learning_rate": 2e-05, + "loss": 0.03632095, + "step": 23854 + }, + { + "epoch": 47.71, + "grad_norm": 1.0708428621292114, + "learning_rate": 2e-05, + "loss": 0.04081979, + "step": 23855 + }, + { + "epoch": 47.712, + "grad_norm": 1.3414500951766968, + "learning_rate": 2e-05, + "loss": 0.04155166, + "step": 23856 + }, + { + "epoch": 47.714, + "grad_norm": 1.131571888923645, + "learning_rate": 2e-05, + "loss": 0.03478384, + "step": 23857 + }, + { + "epoch": 47.716, + "grad_norm": 1.0607177019119263, + "learning_rate": 2e-05, + "loss": 0.04390678, + "step": 23858 + }, + { + "epoch": 47.718, + "grad_norm": 1.1620864868164062, + "learning_rate": 2e-05, + "loss": 0.03903388, + "step": 23859 + }, + { + "epoch": 47.72, + "grad_norm": 1.2598408460617065, + "learning_rate": 2e-05, + "loss": 0.04365044, + "step": 23860 + }, + { + "epoch": 47.722, + "grad_norm": 1.0379410982131958, + "learning_rate": 2e-05, + "loss": 0.03826845, + "step": 23861 + }, + { + "epoch": 47.724, + "grad_norm": 1.4189554452896118, + "learning_rate": 2e-05, + "loss": 0.05523844, + "step": 23862 + }, + { + "epoch": 47.726, + "grad_norm": 2.2879295349121094, + "learning_rate": 2e-05, + "loss": 0.04601286, + "step": 23863 + }, + { + "epoch": 47.728, + "grad_norm": 1.9662691354751587, + "learning_rate": 2e-05, + "loss": 0.04415329, + "step": 23864 + }, + { + "epoch": 47.73, + "grad_norm": 1.136714220046997, + "learning_rate": 2e-05, + "loss": 0.04704106, + "step": 23865 + }, + { + "epoch": 47.732, + "grad_norm": 1.8158776760101318, + "learning_rate": 2e-05, + "loss": 0.04839449, + "step": 23866 + }, + { + "epoch": 47.734, + "grad_norm": 1.312970519065857, + "learning_rate": 2e-05, + "loss": 0.05811097, + "step": 23867 + }, + { + "epoch": 47.736, + "grad_norm": 1.4508172273635864, + "learning_rate": 2e-05, + "loss": 0.06293134, + "step": 23868 + }, + { + "epoch": 47.738, + "grad_norm": 1.0996098518371582, + "learning_rate": 2e-05, + "loss": 0.05198246, + "step": 23869 + }, + { + "epoch": 47.74, + "grad_norm": 1.2088953256607056, + "learning_rate": 2e-05, + "loss": 0.05847076, + "step": 23870 + }, + { + "epoch": 47.742, + "grad_norm": 
1.990601897239685, + "learning_rate": 2e-05, + "loss": 0.07238883, + "step": 23871 + }, + { + "epoch": 47.744, + "grad_norm": 0.990237832069397, + "learning_rate": 2e-05, + "loss": 0.03466114, + "step": 23872 + }, + { + "epoch": 47.746, + "grad_norm": 1.187274694442749, + "learning_rate": 2e-05, + "loss": 0.05454275, + "step": 23873 + }, + { + "epoch": 47.748, + "grad_norm": 1.2135357856750488, + "learning_rate": 2e-05, + "loss": 0.05944178, + "step": 23874 + }, + { + "epoch": 47.75, + "grad_norm": 1.147931694984436, + "learning_rate": 2e-05, + "loss": 0.04608715, + "step": 23875 + }, + { + "epoch": 47.752, + "grad_norm": 1.2686227560043335, + "learning_rate": 2e-05, + "loss": 0.03793816, + "step": 23876 + }, + { + "epoch": 47.754, + "grad_norm": 1.2888165712356567, + "learning_rate": 2e-05, + "loss": 0.05994112, + "step": 23877 + }, + { + "epoch": 47.756, + "grad_norm": 1.1328716278076172, + "learning_rate": 2e-05, + "loss": 0.04631865, + "step": 23878 + }, + { + "epoch": 47.758, + "grad_norm": 1.1973130702972412, + "learning_rate": 2e-05, + "loss": 0.04879654, + "step": 23879 + }, + { + "epoch": 47.76, + "grad_norm": 1.3249599933624268, + "learning_rate": 2e-05, + "loss": 0.0531996, + "step": 23880 + }, + { + "epoch": 47.762, + "grad_norm": 0.9521530866622925, + "learning_rate": 2e-05, + "loss": 0.03609097, + "step": 23881 + }, + { + "epoch": 47.764, + "grad_norm": 1.0063776969909668, + "learning_rate": 2e-05, + "loss": 0.04234225, + "step": 23882 + }, + { + "epoch": 47.766, + "grad_norm": 1.1210484504699707, + "learning_rate": 2e-05, + "loss": 0.04907482, + "step": 23883 + }, + { + "epoch": 47.768, + "grad_norm": 1.406660795211792, + "learning_rate": 2e-05, + "loss": 0.05606017, + "step": 23884 + }, + { + "epoch": 47.77, + "grad_norm": 1.1773229837417603, + "learning_rate": 2e-05, + "loss": 0.05414952, + "step": 23885 + }, + { + "epoch": 47.772, + "grad_norm": 2.4284679889678955, + "learning_rate": 2e-05, + "loss": 0.04393204, + "step": 23886 + }, + { + "epoch": 47.774, + "grad_norm": 1.0576744079589844, + "learning_rate": 2e-05, + "loss": 0.03376132, + "step": 23887 + }, + { + "epoch": 47.776, + "grad_norm": 2.838202714920044, + "learning_rate": 2e-05, + "loss": 0.06541966, + "step": 23888 + }, + { + "epoch": 47.778, + "grad_norm": 1.1831326484680176, + "learning_rate": 2e-05, + "loss": 0.04526076, + "step": 23889 + }, + { + "epoch": 47.78, + "grad_norm": 1.4545902013778687, + "learning_rate": 2e-05, + "loss": 0.05634362, + "step": 23890 + }, + { + "epoch": 47.782, + "grad_norm": 1.2465368509292603, + "learning_rate": 2e-05, + "loss": 0.04193708, + "step": 23891 + }, + { + "epoch": 47.784, + "grad_norm": 1.2062345743179321, + "learning_rate": 2e-05, + "loss": 0.05329098, + "step": 23892 + }, + { + "epoch": 47.786, + "grad_norm": 1.221840262413025, + "learning_rate": 2e-05, + "loss": 0.04324333, + "step": 23893 + }, + { + "epoch": 47.788, + "grad_norm": 1.1237281560897827, + "learning_rate": 2e-05, + "loss": 0.03751169, + "step": 23894 + }, + { + "epoch": 47.79, + "grad_norm": 1.1282539367675781, + "learning_rate": 2e-05, + "loss": 0.04889031, + "step": 23895 + }, + { + "epoch": 47.792, + "grad_norm": 1.0426477193832397, + "learning_rate": 2e-05, + "loss": 0.03937999, + "step": 23896 + }, + { + "epoch": 47.794, + "grad_norm": 2.0234551429748535, + "learning_rate": 2e-05, + "loss": 0.04139862, + "step": 23897 + }, + { + "epoch": 47.796, + "grad_norm": 1.0973756313323975, + "learning_rate": 2e-05, + "loss": 0.04653414, + "step": 23898 + }, + { + "epoch": 47.798, + "grad_norm": 
1.1589323282241821, + "learning_rate": 2e-05, + "loss": 0.05205517, + "step": 23899 + }, + { + "epoch": 47.8, + "grad_norm": 1.1792693138122559, + "learning_rate": 2e-05, + "loss": 0.04807966, + "step": 23900 + }, + { + "epoch": 47.802, + "grad_norm": 0.9611416459083557, + "learning_rate": 2e-05, + "loss": 0.03486631, + "step": 23901 + }, + { + "epoch": 47.804, + "grad_norm": 1.1984083652496338, + "learning_rate": 2e-05, + "loss": 0.06004932, + "step": 23902 + }, + { + "epoch": 47.806, + "grad_norm": 1.102168083190918, + "learning_rate": 2e-05, + "loss": 0.05081422, + "step": 23903 + }, + { + "epoch": 47.808, + "grad_norm": 1.1081064939498901, + "learning_rate": 2e-05, + "loss": 0.0446634, + "step": 23904 + }, + { + "epoch": 47.81, + "grad_norm": 0.9922335147857666, + "learning_rate": 2e-05, + "loss": 0.04278713, + "step": 23905 + }, + { + "epoch": 47.812, + "grad_norm": 1.3645964860916138, + "learning_rate": 2e-05, + "loss": 0.06499512, + "step": 23906 + }, + { + "epoch": 47.814, + "grad_norm": 1.6501846313476562, + "learning_rate": 2e-05, + "loss": 0.04862681, + "step": 23907 + }, + { + "epoch": 47.816, + "grad_norm": 1.0368905067443848, + "learning_rate": 2e-05, + "loss": 0.0441791, + "step": 23908 + }, + { + "epoch": 47.818, + "grad_norm": 1.1054444313049316, + "learning_rate": 2e-05, + "loss": 0.04063675, + "step": 23909 + }, + { + "epoch": 47.82, + "grad_norm": 1.1246895790100098, + "learning_rate": 2e-05, + "loss": 0.04597709, + "step": 23910 + }, + { + "epoch": 47.822, + "grad_norm": 1.0721478462219238, + "learning_rate": 2e-05, + "loss": 0.04677291, + "step": 23911 + }, + { + "epoch": 47.824, + "grad_norm": 1.1274585723876953, + "learning_rate": 2e-05, + "loss": 0.05058762, + "step": 23912 + }, + { + "epoch": 47.826, + "grad_norm": 1.4998968839645386, + "learning_rate": 2e-05, + "loss": 0.05139451, + "step": 23913 + }, + { + "epoch": 47.828, + "grad_norm": 0.9521937370300293, + "learning_rate": 2e-05, + "loss": 0.03633155, + "step": 23914 + }, + { + "epoch": 47.83, + "grad_norm": 1.3443994522094727, + "learning_rate": 2e-05, + "loss": 0.06229488, + "step": 23915 + }, + { + "epoch": 47.832, + "grad_norm": 1.165913701057434, + "learning_rate": 2e-05, + "loss": 0.05210435, + "step": 23916 + }, + { + "epoch": 47.834, + "grad_norm": 1.4400709867477417, + "learning_rate": 2e-05, + "loss": 0.0268791, + "step": 23917 + }, + { + "epoch": 47.836, + "grad_norm": 0.978833019733429, + "learning_rate": 2e-05, + "loss": 0.03885253, + "step": 23918 + }, + { + "epoch": 47.838, + "grad_norm": 3.3101084232330322, + "learning_rate": 2e-05, + "loss": 0.06251098, + "step": 23919 + }, + { + "epoch": 47.84, + "grad_norm": 1.6580345630645752, + "learning_rate": 2e-05, + "loss": 0.04920085, + "step": 23920 + }, + { + "epoch": 47.842, + "grad_norm": 1.2298296689987183, + "learning_rate": 2e-05, + "loss": 0.04933652, + "step": 23921 + }, + { + "epoch": 47.844, + "grad_norm": 1.209776520729065, + "learning_rate": 2e-05, + "loss": 0.06445093, + "step": 23922 + }, + { + "epoch": 47.846, + "grad_norm": 1.1187968254089355, + "learning_rate": 2e-05, + "loss": 0.04598414, + "step": 23923 + }, + { + "epoch": 47.848, + "grad_norm": 1.0532989501953125, + "learning_rate": 2e-05, + "loss": 0.03271957, + "step": 23924 + }, + { + "epoch": 47.85, + "grad_norm": 1.2785584926605225, + "learning_rate": 2e-05, + "loss": 0.05796767, + "step": 23925 + }, + { + "epoch": 47.852, + "grad_norm": 1.1597468852996826, + "learning_rate": 2e-05, + "loss": 0.04433453, + "step": 23926 + }, + { + "epoch": 47.854, + "grad_norm": 
1.1068360805511475, + "learning_rate": 2e-05, + "loss": 0.04815186, + "step": 23927 + }, + { + "epoch": 47.856, + "grad_norm": 1.0562989711761475, + "learning_rate": 2e-05, + "loss": 0.039212, + "step": 23928 + }, + { + "epoch": 47.858, + "grad_norm": 1.2464039325714111, + "learning_rate": 2e-05, + "loss": 0.05434022, + "step": 23929 + }, + { + "epoch": 47.86, + "grad_norm": 1.9144781827926636, + "learning_rate": 2e-05, + "loss": 0.0500843, + "step": 23930 + }, + { + "epoch": 47.862, + "grad_norm": 4.982010364532471, + "learning_rate": 2e-05, + "loss": 0.05325773, + "step": 23931 + }, + { + "epoch": 47.864, + "grad_norm": 2.1956326961517334, + "learning_rate": 2e-05, + "loss": 0.04757648, + "step": 23932 + }, + { + "epoch": 47.866, + "grad_norm": 1.3570915460586548, + "learning_rate": 2e-05, + "loss": 0.05788545, + "step": 23933 + }, + { + "epoch": 47.868, + "grad_norm": 1.1021960973739624, + "learning_rate": 2e-05, + "loss": 0.04605621, + "step": 23934 + }, + { + "epoch": 47.87, + "grad_norm": 1.2763595581054688, + "learning_rate": 2e-05, + "loss": 0.04462332, + "step": 23935 + }, + { + "epoch": 47.872, + "grad_norm": 1.2677397727966309, + "learning_rate": 2e-05, + "loss": 0.06458889, + "step": 23936 + }, + { + "epoch": 47.874, + "grad_norm": 1.571900486946106, + "learning_rate": 2e-05, + "loss": 0.04743953, + "step": 23937 + }, + { + "epoch": 47.876, + "grad_norm": 1.2645798921585083, + "learning_rate": 2e-05, + "loss": 0.04014737, + "step": 23938 + }, + { + "epoch": 47.878, + "grad_norm": 1.3748975992202759, + "learning_rate": 2e-05, + "loss": 0.05016341, + "step": 23939 + }, + { + "epoch": 47.88, + "grad_norm": 1.0344460010528564, + "learning_rate": 2e-05, + "loss": 0.03785864, + "step": 23940 + }, + { + "epoch": 47.882, + "grad_norm": 1.15352463722229, + "learning_rate": 2e-05, + "loss": 0.04408693, + "step": 23941 + }, + { + "epoch": 47.884, + "grad_norm": 0.9576112627983093, + "learning_rate": 2e-05, + "loss": 0.0283091, + "step": 23942 + }, + { + "epoch": 47.886, + "grad_norm": 1.1543420553207397, + "learning_rate": 2e-05, + "loss": 0.03587131, + "step": 23943 + }, + { + "epoch": 47.888, + "grad_norm": 1.3030697107315063, + "learning_rate": 2e-05, + "loss": 0.06125759, + "step": 23944 + }, + { + "epoch": 47.89, + "grad_norm": 1.159096598625183, + "learning_rate": 2e-05, + "loss": 0.04972298, + "step": 23945 + }, + { + "epoch": 47.892, + "grad_norm": 1.3995835781097412, + "learning_rate": 2e-05, + "loss": 0.03951022, + "step": 23946 + }, + { + "epoch": 47.894, + "grad_norm": 1.2913930416107178, + "learning_rate": 2e-05, + "loss": 0.04597832, + "step": 23947 + }, + { + "epoch": 47.896, + "grad_norm": 1.0816295146942139, + "learning_rate": 2e-05, + "loss": 0.03365443, + "step": 23948 + }, + { + "epoch": 47.898, + "grad_norm": 1.2228970527648926, + "learning_rate": 2e-05, + "loss": 0.06327063, + "step": 23949 + }, + { + "epoch": 47.9, + "grad_norm": 1.1971608400344849, + "learning_rate": 2e-05, + "loss": 0.04186807, + "step": 23950 + }, + { + "epoch": 47.902, + "grad_norm": 1.3792004585266113, + "learning_rate": 2e-05, + "loss": 0.04573726, + "step": 23951 + }, + { + "epoch": 47.904, + "grad_norm": 1.3499553203582764, + "learning_rate": 2e-05, + "loss": 0.04411193, + "step": 23952 + }, + { + "epoch": 47.906, + "grad_norm": 0.9711533188819885, + "learning_rate": 2e-05, + "loss": 0.04265768, + "step": 23953 + }, + { + "epoch": 47.908, + "grad_norm": 1.329298496246338, + "learning_rate": 2e-05, + "loss": 0.0500191, + "step": 23954 + }, + { + "epoch": 47.91, + "grad_norm": 
1.0052729845046997, + "learning_rate": 2e-05, + "loss": 0.04355924, + "step": 23955 + }, + { + "epoch": 47.912, + "grad_norm": 1.1501191854476929, + "learning_rate": 2e-05, + "loss": 0.03861841, + "step": 23956 + }, + { + "epoch": 47.914, + "grad_norm": 1.1558641195297241, + "learning_rate": 2e-05, + "loss": 0.05884036, + "step": 23957 + }, + { + "epoch": 47.916, + "grad_norm": 1.0510032176971436, + "learning_rate": 2e-05, + "loss": 0.03164276, + "step": 23958 + }, + { + "epoch": 47.918, + "grad_norm": 1.3713427782058716, + "learning_rate": 2e-05, + "loss": 0.05324286, + "step": 23959 + }, + { + "epoch": 47.92, + "grad_norm": 1.4690266847610474, + "learning_rate": 2e-05, + "loss": 0.04128034, + "step": 23960 + }, + { + "epoch": 47.922, + "grad_norm": 2.2947113513946533, + "learning_rate": 2e-05, + "loss": 0.04311752, + "step": 23961 + }, + { + "epoch": 47.924, + "grad_norm": 1.1298458576202393, + "learning_rate": 2e-05, + "loss": 0.04617842, + "step": 23962 + }, + { + "epoch": 47.926, + "grad_norm": 1.2151092290878296, + "learning_rate": 2e-05, + "loss": 0.0448565, + "step": 23963 + }, + { + "epoch": 47.928, + "grad_norm": 1.210721731185913, + "learning_rate": 2e-05, + "loss": 0.03401937, + "step": 23964 + }, + { + "epoch": 47.93, + "grad_norm": 1.6079378128051758, + "learning_rate": 2e-05, + "loss": 0.04983381, + "step": 23965 + }, + { + "epoch": 47.932, + "grad_norm": 0.9416164755821228, + "learning_rate": 2e-05, + "loss": 0.0281193, + "step": 23966 + }, + { + "epoch": 47.934, + "grad_norm": 1.450137972831726, + "learning_rate": 2e-05, + "loss": 0.05513122, + "step": 23967 + }, + { + "epoch": 47.936, + "grad_norm": 1.427017092704773, + "learning_rate": 2e-05, + "loss": 0.06112292, + "step": 23968 + }, + { + "epoch": 47.938, + "grad_norm": 1.1121290922164917, + "learning_rate": 2e-05, + "loss": 0.04755298, + "step": 23969 + }, + { + "epoch": 47.94, + "grad_norm": 1.350212812423706, + "learning_rate": 2e-05, + "loss": 0.02974812, + "step": 23970 + }, + { + "epoch": 47.942, + "grad_norm": 1.009774088859558, + "learning_rate": 2e-05, + "loss": 0.03674398, + "step": 23971 + }, + { + "epoch": 47.944, + "grad_norm": 0.989896297454834, + "learning_rate": 2e-05, + "loss": 0.04537395, + "step": 23972 + }, + { + "epoch": 47.946, + "grad_norm": 1.3564207553863525, + "learning_rate": 2e-05, + "loss": 0.0425001, + "step": 23973 + }, + { + "epoch": 47.948, + "grad_norm": 2.2842447757720947, + "learning_rate": 2e-05, + "loss": 0.04634003, + "step": 23974 + }, + { + "epoch": 47.95, + "grad_norm": 1.2380462884902954, + "learning_rate": 2e-05, + "loss": 0.04033655, + "step": 23975 + }, + { + "epoch": 47.952, + "grad_norm": 1.2136390209197998, + "learning_rate": 2e-05, + "loss": 0.03208754, + "step": 23976 + }, + { + "epoch": 47.954, + "grad_norm": 1.2181363105773926, + "learning_rate": 2e-05, + "loss": 0.05851332, + "step": 23977 + }, + { + "epoch": 47.956, + "grad_norm": 1.1737416982650757, + "learning_rate": 2e-05, + "loss": 0.04869546, + "step": 23978 + }, + { + "epoch": 47.958, + "grad_norm": 1.1523841619491577, + "learning_rate": 2e-05, + "loss": 0.04287051, + "step": 23979 + }, + { + "epoch": 47.96, + "grad_norm": 1.3035601377487183, + "learning_rate": 2e-05, + "loss": 0.05795803, + "step": 23980 + }, + { + "epoch": 47.962, + "grad_norm": 1.4103642702102661, + "learning_rate": 2e-05, + "loss": 0.0504769, + "step": 23981 + }, + { + "epoch": 47.964, + "grad_norm": 1.3686590194702148, + "learning_rate": 2e-05, + "loss": 0.05023767, + "step": 23982 + }, + { + "epoch": 47.966, + "grad_norm": 
1.1425793170928955, + "learning_rate": 2e-05, + "loss": 0.03801563, + "step": 23983 + }, + { + "epoch": 47.968, + "grad_norm": 1.2969728708267212, + "learning_rate": 2e-05, + "loss": 0.04899958, + "step": 23984 + }, + { + "epoch": 47.97, + "grad_norm": 1.312829613685608, + "learning_rate": 2e-05, + "loss": 0.06080573, + "step": 23985 + }, + { + "epoch": 47.972, + "grad_norm": 1.264377236366272, + "learning_rate": 2e-05, + "loss": 0.04977002, + "step": 23986 + }, + { + "epoch": 47.974, + "grad_norm": 2.843947410583496, + "learning_rate": 2e-05, + "loss": 0.06198847, + "step": 23987 + }, + { + "epoch": 47.976, + "grad_norm": 1.4334129095077515, + "learning_rate": 2e-05, + "loss": 0.07072139, + "step": 23988 + }, + { + "epoch": 47.978, + "grad_norm": 1.3132730722427368, + "learning_rate": 2e-05, + "loss": 0.06276097, + "step": 23989 + }, + { + "epoch": 47.98, + "grad_norm": 1.2689872980117798, + "learning_rate": 2e-05, + "loss": 0.04908171, + "step": 23990 + }, + { + "epoch": 47.982, + "grad_norm": 1.4487504959106445, + "learning_rate": 2e-05, + "loss": 0.04362805, + "step": 23991 + }, + { + "epoch": 47.984, + "grad_norm": 1.1339962482452393, + "learning_rate": 2e-05, + "loss": 0.05087781, + "step": 23992 + }, + { + "epoch": 47.986, + "grad_norm": 1.7450958490371704, + "learning_rate": 2e-05, + "loss": 0.05819881, + "step": 23993 + }, + { + "epoch": 47.988, + "grad_norm": 1.1396898031234741, + "learning_rate": 2e-05, + "loss": 0.04882061, + "step": 23994 + }, + { + "epoch": 47.99, + "grad_norm": 1.337996482849121, + "learning_rate": 2e-05, + "loss": 0.04781748, + "step": 23995 + }, + { + "epoch": 47.992, + "grad_norm": 1.5428389310836792, + "learning_rate": 2e-05, + "loss": 0.0593975, + "step": 23996 + }, + { + "epoch": 47.994, + "grad_norm": 1.029953956604004, + "learning_rate": 2e-05, + "loss": 0.03917014, + "step": 23997 + }, + { + "epoch": 47.996, + "grad_norm": 1.3269636631011963, + "learning_rate": 2e-05, + "loss": 0.06252966, + "step": 23998 + }, + { + "epoch": 47.998, + "grad_norm": 1.4018568992614746, + "learning_rate": 2e-05, + "loss": 0.05506238, + "step": 23999 + }, + { + "epoch": 48.0, + "grad_norm": 1.3268723487854004, + "learning_rate": 2e-05, + "loss": 0.04358728, + "step": 24000 + }, + { + "epoch": 48.0, + "eval_performance": { + "AngleClassification_1": 1.0, + "AngleClassification_2": 1.0, + "AngleClassification_3": 0.9840319361277445, + "Equal_1": 0.998, + "Equal_2": 0.9820359281437125, + "Equal_3": 0.9900199600798403, + "LineComparison_1": 1.0, + "LineComparison_2": 1.0, + "LineComparison_3": 1.0, + "Parallel_1": 0.9939879759519038, + "Parallel_2": 0.9939879759519038, + "Parallel_3": 0.994, + "Perpendicular_1": 1.0, + "Perpendicular_2": 1.0, + "Perpendicular_3": 0.9098196392785571, + "PointLiesOnCircle_1": 0.9979959919839679, + "PointLiesOnCircle_2": 1.0, + "PointLiesOnCircle_3": 0.9916, + "PointLiesOnLine_1": 0.9979959919839679, + "PointLiesOnLine_2": 0.9959919839679359, + "PointLiesOnLine_3": 0.9940119760479041 + }, + "eval_runtime": 320.3246, + "eval_samples_per_second": 32.779, + "eval_steps_per_second": 0.656, + "step": 24000 + }, + { + "epoch": 48.002, + "grad_norm": 1.0694226026535034, + "learning_rate": 2e-05, + "loss": 0.03889792, + "step": 24001 + }, + { + "epoch": 48.004, + "grad_norm": 1.1457878351211548, + "learning_rate": 2e-05, + "loss": 0.04411246, + "step": 24002 + }, + { + "epoch": 48.006, + "grad_norm": 1.3555324077606201, + "learning_rate": 2e-05, + "loss": 0.05262123, + "step": 24003 + }, + { + "epoch": 48.008, + "grad_norm": 1.038233995437622, + 
"learning_rate": 2e-05, + "loss": 0.04188913, + "step": 24004 + }, + { + "epoch": 48.01, + "grad_norm": 3.3475518226623535, + "learning_rate": 2e-05, + "loss": 0.05884375, + "step": 24005 + }, + { + "epoch": 48.012, + "grad_norm": 1.470353364944458, + "learning_rate": 2e-05, + "loss": 0.04114807, + "step": 24006 + }, + { + "epoch": 48.014, + "grad_norm": 1.2043060064315796, + "learning_rate": 2e-05, + "loss": 0.0573847, + "step": 24007 + }, + { + "epoch": 48.016, + "grad_norm": 1.1786561012268066, + "learning_rate": 2e-05, + "loss": 0.04742495, + "step": 24008 + }, + { + "epoch": 48.018, + "grad_norm": 1.0251394510269165, + "learning_rate": 2e-05, + "loss": 0.0432762, + "step": 24009 + }, + { + "epoch": 48.02, + "grad_norm": 1.1770861148834229, + "learning_rate": 2e-05, + "loss": 0.04839217, + "step": 24010 + }, + { + "epoch": 48.022, + "grad_norm": 0.9781084060668945, + "learning_rate": 2e-05, + "loss": 0.04288932, + "step": 24011 + }, + { + "epoch": 48.024, + "grad_norm": 1.5834667682647705, + "learning_rate": 2e-05, + "loss": 0.04880485, + "step": 24012 + }, + { + "epoch": 48.026, + "grad_norm": 2.6981749534606934, + "learning_rate": 2e-05, + "loss": 0.03831647, + "step": 24013 + }, + { + "epoch": 48.028, + "grad_norm": 1.1615235805511475, + "learning_rate": 2e-05, + "loss": 0.04750589, + "step": 24014 + }, + { + "epoch": 48.03, + "grad_norm": 1.1639854907989502, + "learning_rate": 2e-05, + "loss": 0.04984055, + "step": 24015 + }, + { + "epoch": 48.032, + "grad_norm": 1.0798165798187256, + "learning_rate": 2e-05, + "loss": 0.05147992, + "step": 24016 + }, + { + "epoch": 48.034, + "grad_norm": 1.0274465084075928, + "learning_rate": 2e-05, + "loss": 0.04617003, + "step": 24017 + }, + { + "epoch": 48.036, + "grad_norm": 1.038852334022522, + "learning_rate": 2e-05, + "loss": 0.0369803, + "step": 24018 + }, + { + "epoch": 48.038, + "grad_norm": 2.6861026287078857, + "learning_rate": 2e-05, + "loss": 0.07091722, + "step": 24019 + }, + { + "epoch": 48.04, + "grad_norm": 1.3858764171600342, + "learning_rate": 2e-05, + "loss": 0.04682039, + "step": 24020 + }, + { + "epoch": 48.042, + "grad_norm": 1.1013208627700806, + "learning_rate": 2e-05, + "loss": 0.04063297, + "step": 24021 + }, + { + "epoch": 48.044, + "grad_norm": 1.4258663654327393, + "learning_rate": 2e-05, + "loss": 0.05036592, + "step": 24022 + }, + { + "epoch": 48.046, + "grad_norm": 1.1322613954544067, + "learning_rate": 2e-05, + "loss": 0.04924812, + "step": 24023 + }, + { + "epoch": 48.048, + "grad_norm": 1.7514662742614746, + "learning_rate": 2e-05, + "loss": 0.06260653, + "step": 24024 + }, + { + "epoch": 48.05, + "grad_norm": 1.3482943773269653, + "learning_rate": 2e-05, + "loss": 0.05003114, + "step": 24025 + }, + { + "epoch": 48.052, + "grad_norm": 1.5502170324325562, + "learning_rate": 2e-05, + "loss": 0.04645883, + "step": 24026 + }, + { + "epoch": 48.054, + "grad_norm": 1.1267354488372803, + "learning_rate": 2e-05, + "loss": 0.04372516, + "step": 24027 + }, + { + "epoch": 48.056, + "grad_norm": 1.668345332145691, + "learning_rate": 2e-05, + "loss": 0.07128799, + "step": 24028 + }, + { + "epoch": 48.058, + "grad_norm": 2.1013059616088867, + "learning_rate": 2e-05, + "loss": 0.04920814, + "step": 24029 + }, + { + "epoch": 48.06, + "grad_norm": 1.272568941116333, + "learning_rate": 2e-05, + "loss": 0.04338931, + "step": 24030 + }, + { + "epoch": 48.062, + "grad_norm": 1.102447748184204, + "learning_rate": 2e-05, + "loss": 0.03770321, + "step": 24031 + }, + { + "epoch": 48.064, + "grad_norm": 1.8323739767074585, + 
"learning_rate": 2e-05, + "loss": 0.05331714, + "step": 24032 + }, + { + "epoch": 48.066, + "grad_norm": 2.065495491027832, + "learning_rate": 2e-05, + "loss": 0.05580588, + "step": 24033 + }, + { + "epoch": 48.068, + "grad_norm": 4.448790073394775, + "learning_rate": 2e-05, + "loss": 0.04371783, + "step": 24034 + }, + { + "epoch": 48.07, + "grad_norm": 1.122683048248291, + "learning_rate": 2e-05, + "loss": 0.05487248, + "step": 24035 + }, + { + "epoch": 48.072, + "grad_norm": 1.2778655290603638, + "learning_rate": 2e-05, + "loss": 0.0487362, + "step": 24036 + }, + { + "epoch": 48.074, + "grad_norm": 1.5660276412963867, + "learning_rate": 2e-05, + "loss": 0.05149665, + "step": 24037 + }, + { + "epoch": 48.076, + "grad_norm": 1.2964324951171875, + "learning_rate": 2e-05, + "loss": 0.04056223, + "step": 24038 + }, + { + "epoch": 48.078, + "grad_norm": 1.1524419784545898, + "learning_rate": 2e-05, + "loss": 0.05273098, + "step": 24039 + }, + { + "epoch": 48.08, + "grad_norm": 1.4630237817764282, + "learning_rate": 2e-05, + "loss": 0.05465612, + "step": 24040 + }, + { + "epoch": 48.082, + "grad_norm": 1.2101308107376099, + "learning_rate": 2e-05, + "loss": 0.04670757, + "step": 24041 + }, + { + "epoch": 48.084, + "grad_norm": 1.1870620250701904, + "learning_rate": 2e-05, + "loss": 0.04672918, + "step": 24042 + }, + { + "epoch": 48.086, + "grad_norm": 1.2451860904693604, + "learning_rate": 2e-05, + "loss": 0.05255879, + "step": 24043 + }, + { + "epoch": 48.088, + "grad_norm": 1.1772750616073608, + "learning_rate": 2e-05, + "loss": 0.05022447, + "step": 24044 + }, + { + "epoch": 48.09, + "grad_norm": 1.4949736595153809, + "learning_rate": 2e-05, + "loss": 0.05462297, + "step": 24045 + }, + { + "epoch": 48.092, + "grad_norm": 1.0446524620056152, + "learning_rate": 2e-05, + "loss": 0.05005912, + "step": 24046 + }, + { + "epoch": 48.094, + "grad_norm": 1.2837433815002441, + "learning_rate": 2e-05, + "loss": 0.04919739, + "step": 24047 + }, + { + "epoch": 48.096, + "grad_norm": 1.2766021490097046, + "learning_rate": 2e-05, + "loss": 0.05800232, + "step": 24048 + }, + { + "epoch": 48.098, + "grad_norm": 1.386711835861206, + "learning_rate": 2e-05, + "loss": 0.05083595, + "step": 24049 + }, + { + "epoch": 48.1, + "grad_norm": 0.9854950308799744, + "learning_rate": 2e-05, + "loss": 0.03358131, + "step": 24050 + }, + { + "epoch": 48.102, + "grad_norm": 1.1516034603118896, + "learning_rate": 2e-05, + "loss": 0.04865019, + "step": 24051 + }, + { + "epoch": 48.104, + "grad_norm": 1.855728030204773, + "learning_rate": 2e-05, + "loss": 0.06086725, + "step": 24052 + }, + { + "epoch": 48.106, + "grad_norm": 1.3824462890625, + "learning_rate": 2e-05, + "loss": 0.03999291, + "step": 24053 + }, + { + "epoch": 48.108, + "grad_norm": 1.1911598443984985, + "learning_rate": 2e-05, + "loss": 0.04892493, + "step": 24054 + }, + { + "epoch": 48.11, + "grad_norm": 1.2059804201126099, + "learning_rate": 2e-05, + "loss": 0.05040662, + "step": 24055 + }, + { + "epoch": 48.112, + "grad_norm": 1.4151570796966553, + "learning_rate": 2e-05, + "loss": 0.05632602, + "step": 24056 + }, + { + "epoch": 48.114, + "grad_norm": 1.101078748703003, + "learning_rate": 2e-05, + "loss": 0.03802776, + "step": 24057 + }, + { + "epoch": 48.116, + "grad_norm": 1.3649169206619263, + "learning_rate": 2e-05, + "loss": 0.06694816, + "step": 24058 + }, + { + "epoch": 48.118, + "grad_norm": 1.1776071786880493, + "learning_rate": 2e-05, + "loss": 0.05049934, + "step": 24059 + }, + { + "epoch": 48.12, + "grad_norm": 1.082025408744812, + "learning_rate": 
2e-05, + "loss": 0.04495796, + "step": 24060 + }, + { + "epoch": 48.122, + "grad_norm": 1.2222503423690796, + "learning_rate": 2e-05, + "loss": 0.04708505, + "step": 24061 + }, + { + "epoch": 48.124, + "grad_norm": 1.5613664388656616, + "learning_rate": 2e-05, + "loss": 0.05574992, + "step": 24062 + }, + { + "epoch": 48.126, + "grad_norm": 1.2608956098556519, + "learning_rate": 2e-05, + "loss": 0.04397441, + "step": 24063 + }, + { + "epoch": 48.128, + "grad_norm": 0.9204988479614258, + "learning_rate": 2e-05, + "loss": 0.02955709, + "step": 24064 + }, + { + "epoch": 48.13, + "grad_norm": 0.984535813331604, + "learning_rate": 2e-05, + "loss": 0.04042712, + "step": 24065 + }, + { + "epoch": 48.132, + "grad_norm": 5.602854251861572, + "learning_rate": 2e-05, + "loss": 0.046597, + "step": 24066 + }, + { + "epoch": 48.134, + "grad_norm": 1.278770089149475, + "learning_rate": 2e-05, + "loss": 0.03210206, + "step": 24067 + }, + { + "epoch": 48.136, + "grad_norm": 0.9865384697914124, + "learning_rate": 2e-05, + "loss": 0.03966504, + "step": 24068 + }, + { + "epoch": 48.138, + "grad_norm": 1.1876673698425293, + "learning_rate": 2e-05, + "loss": 0.0427718, + "step": 24069 + }, + { + "epoch": 48.14, + "grad_norm": 1.1486650705337524, + "learning_rate": 2e-05, + "loss": 0.04623136, + "step": 24070 + }, + { + "epoch": 48.142, + "grad_norm": 1.3332282304763794, + "learning_rate": 2e-05, + "loss": 0.05006913, + "step": 24071 + }, + { + "epoch": 48.144, + "grad_norm": 1.2728177309036255, + "learning_rate": 2e-05, + "loss": 0.05031462, + "step": 24072 + }, + { + "epoch": 48.146, + "grad_norm": 1.2598438262939453, + "learning_rate": 2e-05, + "loss": 0.04445124, + "step": 24073 + }, + { + "epoch": 48.148, + "grad_norm": 1.0530667304992676, + "learning_rate": 2e-05, + "loss": 0.0403815, + "step": 24074 + }, + { + "epoch": 48.15, + "grad_norm": 1.0963428020477295, + "learning_rate": 2e-05, + "loss": 0.0428313, + "step": 24075 + }, + { + "epoch": 48.152, + "grad_norm": 1.0206382274627686, + "learning_rate": 2e-05, + "loss": 0.04635064, + "step": 24076 + }, + { + "epoch": 48.154, + "grad_norm": 1.0546106100082397, + "learning_rate": 2e-05, + "loss": 0.0390348, + "step": 24077 + }, + { + "epoch": 48.156, + "grad_norm": 1.1284135580062866, + "learning_rate": 2e-05, + "loss": 0.03736102, + "step": 24078 + }, + { + "epoch": 48.158, + "grad_norm": 1.389346957206726, + "learning_rate": 2e-05, + "loss": 0.04481775, + "step": 24079 + }, + { + "epoch": 48.16, + "grad_norm": 1.1270530223846436, + "learning_rate": 2e-05, + "loss": 0.04756751, + "step": 24080 + }, + { + "epoch": 48.162, + "grad_norm": 1.2751901149749756, + "learning_rate": 2e-05, + "loss": 0.04584195, + "step": 24081 + }, + { + "epoch": 48.164, + "grad_norm": 1.0802828073501587, + "learning_rate": 2e-05, + "loss": 0.06117968, + "step": 24082 + }, + { + "epoch": 48.166, + "grad_norm": 1.2902870178222656, + "learning_rate": 2e-05, + "loss": 0.05666249, + "step": 24083 + }, + { + "epoch": 48.168, + "grad_norm": 1.07093346118927, + "learning_rate": 2e-05, + "loss": 0.04220451, + "step": 24084 + }, + { + "epoch": 48.17, + "grad_norm": 0.9996381998062134, + "learning_rate": 2e-05, + "loss": 0.03157579, + "step": 24085 + }, + { + "epoch": 48.172, + "grad_norm": 1.4622588157653809, + "learning_rate": 2e-05, + "loss": 0.04022121, + "step": 24086 + }, + { + "epoch": 48.174, + "grad_norm": 1.200003981590271, + "learning_rate": 2e-05, + "loss": 0.03526891, + "step": 24087 + }, + { + "epoch": 48.176, + "grad_norm": 1.3392802476882935, + "learning_rate": 2e-05, + "loss": 
0.03042413, + "step": 24088 + }, + { + "epoch": 48.178, + "grad_norm": 1.0749857425689697, + "learning_rate": 2e-05, + "loss": 0.04232724, + "step": 24089 + }, + { + "epoch": 48.18, + "grad_norm": 1.1630322933197021, + "learning_rate": 2e-05, + "loss": 0.04984683, + "step": 24090 + }, + { + "epoch": 48.182, + "grad_norm": 1.1708325147628784, + "learning_rate": 2e-05, + "loss": 0.04399581, + "step": 24091 + }, + { + "epoch": 48.184, + "grad_norm": 2.851407527923584, + "learning_rate": 2e-05, + "loss": 0.07037769, + "step": 24092 + }, + { + "epoch": 48.186, + "grad_norm": 1.0833169221878052, + "learning_rate": 2e-05, + "loss": 0.04655275, + "step": 24093 + }, + { + "epoch": 48.188, + "grad_norm": 1.372167706489563, + "learning_rate": 2e-05, + "loss": 0.07025443, + "step": 24094 + }, + { + "epoch": 48.19, + "grad_norm": 1.2088648080825806, + "learning_rate": 2e-05, + "loss": 0.04490817, + "step": 24095 + }, + { + "epoch": 48.192, + "grad_norm": 1.134153962135315, + "learning_rate": 2e-05, + "loss": 0.05215061, + "step": 24096 + }, + { + "epoch": 48.194, + "grad_norm": 1.1450132131576538, + "learning_rate": 2e-05, + "loss": 0.05180396, + "step": 24097 + }, + { + "epoch": 48.196, + "grad_norm": 1.1347380876541138, + "learning_rate": 2e-05, + "loss": 0.04732578, + "step": 24098 + }, + { + "epoch": 48.198, + "grad_norm": 1.1348960399627686, + "learning_rate": 2e-05, + "loss": 0.05249636, + "step": 24099 + }, + { + "epoch": 48.2, + "grad_norm": 1.7801083326339722, + "learning_rate": 2e-05, + "loss": 0.05126803, + "step": 24100 + }, + { + "epoch": 48.202, + "grad_norm": 1.2715392112731934, + "learning_rate": 2e-05, + "loss": 0.05465782, + "step": 24101 + }, + { + "epoch": 48.204, + "grad_norm": 0.9739833474159241, + "learning_rate": 2e-05, + "loss": 0.03681491, + "step": 24102 + }, + { + "epoch": 48.206, + "grad_norm": 0.991855800151825, + "learning_rate": 2e-05, + "loss": 0.03075913, + "step": 24103 + }, + { + "epoch": 48.208, + "grad_norm": 1.7213369607925415, + "learning_rate": 2e-05, + "loss": 0.03983373, + "step": 24104 + }, + { + "epoch": 48.21, + "grad_norm": 1.0067009925842285, + "learning_rate": 2e-05, + "loss": 0.04427213, + "step": 24105 + }, + { + "epoch": 48.212, + "grad_norm": 1.8340460062026978, + "learning_rate": 2e-05, + "loss": 0.04303214, + "step": 24106 + }, + { + "epoch": 48.214, + "grad_norm": 1.1613703966140747, + "learning_rate": 2e-05, + "loss": 0.04921772, + "step": 24107 + }, + { + "epoch": 48.216, + "grad_norm": 2.5453414916992188, + "learning_rate": 2e-05, + "loss": 0.05552774, + "step": 24108 + }, + { + "epoch": 48.218, + "grad_norm": 1.020289659500122, + "learning_rate": 2e-05, + "loss": 0.03909015, + "step": 24109 + }, + { + "epoch": 48.22, + "grad_norm": 1.1632663011550903, + "learning_rate": 2e-05, + "loss": 0.05679201, + "step": 24110 + }, + { + "epoch": 48.222, + "grad_norm": 0.987187385559082, + "learning_rate": 2e-05, + "loss": 0.03800602, + "step": 24111 + }, + { + "epoch": 48.224, + "grad_norm": 1.2574355602264404, + "learning_rate": 2e-05, + "loss": 0.05340779, + "step": 24112 + }, + { + "epoch": 48.226, + "grad_norm": 1.076492428779602, + "learning_rate": 2e-05, + "loss": 0.04606473, + "step": 24113 + }, + { + "epoch": 48.228, + "grad_norm": 1.010301947593689, + "learning_rate": 2e-05, + "loss": 0.03338667, + "step": 24114 + }, + { + "epoch": 48.23, + "grad_norm": 1.3499643802642822, + "learning_rate": 2e-05, + "loss": 0.06098785, + "step": 24115 + }, + { + "epoch": 48.232, + "grad_norm": 1.1102203130722046, + "learning_rate": 2e-05, + "loss": 0.04781565, + 
"step": 24116 + }, + { + "epoch": 48.234, + "grad_norm": 0.941985011100769, + "learning_rate": 2e-05, + "loss": 0.03292421, + "step": 24117 + }, + { + "epoch": 48.236, + "grad_norm": 1.1043226718902588, + "learning_rate": 2e-05, + "loss": 0.0464811, + "step": 24118 + }, + { + "epoch": 48.238, + "grad_norm": 0.9736769795417786, + "learning_rate": 2e-05, + "loss": 0.04281894, + "step": 24119 + }, + { + "epoch": 48.24, + "grad_norm": 1.1103366613388062, + "learning_rate": 2e-05, + "loss": 0.05232271, + "step": 24120 + }, + { + "epoch": 48.242, + "grad_norm": 1.2056519985198975, + "learning_rate": 2e-05, + "loss": 0.05404781, + "step": 24121 + }, + { + "epoch": 48.244, + "grad_norm": 1.410598874092102, + "learning_rate": 2e-05, + "loss": 0.06230663, + "step": 24122 + }, + { + "epoch": 48.246, + "grad_norm": 1.136397123336792, + "learning_rate": 2e-05, + "loss": 0.05437019, + "step": 24123 + }, + { + "epoch": 48.248, + "grad_norm": 1.0891176462173462, + "learning_rate": 2e-05, + "loss": 0.04057914, + "step": 24124 + }, + { + "epoch": 48.25, + "grad_norm": 1.1873037815093994, + "learning_rate": 2e-05, + "loss": 0.04513947, + "step": 24125 + }, + { + "epoch": 48.252, + "grad_norm": 1.5150892734527588, + "learning_rate": 2e-05, + "loss": 0.0410989, + "step": 24126 + }, + { + "epoch": 48.254, + "grad_norm": 2.031759738922119, + "learning_rate": 2e-05, + "loss": 0.05381062, + "step": 24127 + }, + { + "epoch": 48.256, + "grad_norm": 1.5702145099639893, + "learning_rate": 2e-05, + "loss": 0.03662251, + "step": 24128 + }, + { + "epoch": 48.258, + "grad_norm": 1.2305831909179688, + "learning_rate": 2e-05, + "loss": 0.06105634, + "step": 24129 + }, + { + "epoch": 48.26, + "grad_norm": 1.1910518407821655, + "learning_rate": 2e-05, + "loss": 0.04331035, + "step": 24130 + }, + { + "epoch": 48.262, + "grad_norm": 1.225128412246704, + "learning_rate": 2e-05, + "loss": 0.05862859, + "step": 24131 + }, + { + "epoch": 48.264, + "grad_norm": 0.9506001472473145, + "learning_rate": 2e-05, + "loss": 0.03767806, + "step": 24132 + }, + { + "epoch": 48.266, + "grad_norm": 1.2454578876495361, + "learning_rate": 2e-05, + "loss": 0.05315366, + "step": 24133 + }, + { + "epoch": 48.268, + "grad_norm": 2.151110887527466, + "learning_rate": 2e-05, + "loss": 0.04462639, + "step": 24134 + }, + { + "epoch": 48.27, + "grad_norm": 1.1853653192520142, + "learning_rate": 2e-05, + "loss": 0.05619586, + "step": 24135 + }, + { + "epoch": 48.272, + "grad_norm": 0.8391315937042236, + "learning_rate": 2e-05, + "loss": 0.02844811, + "step": 24136 + }, + { + "epoch": 48.274, + "grad_norm": 1.102542757987976, + "learning_rate": 2e-05, + "loss": 0.04351805, + "step": 24137 + }, + { + "epoch": 48.276, + "grad_norm": 0.9882407784461975, + "learning_rate": 2e-05, + "loss": 0.0304465, + "step": 24138 + }, + { + "epoch": 48.278, + "grad_norm": 1.748734951019287, + "learning_rate": 2e-05, + "loss": 0.06221665, + "step": 24139 + }, + { + "epoch": 48.28, + "grad_norm": 1.4740245342254639, + "learning_rate": 2e-05, + "loss": 0.06243878, + "step": 24140 + }, + { + "epoch": 48.282, + "grad_norm": 1.324210524559021, + "learning_rate": 2e-05, + "loss": 0.05721395, + "step": 24141 + }, + { + "epoch": 48.284, + "grad_norm": 1.527857780456543, + "learning_rate": 2e-05, + "loss": 0.0590317, + "step": 24142 + }, + { + "epoch": 48.286, + "grad_norm": 1.0875802040100098, + "learning_rate": 2e-05, + "loss": 0.04194984, + "step": 24143 + }, + { + "epoch": 48.288, + "grad_norm": 1.2001830339431763, + "learning_rate": 2e-05, + "loss": 0.05224827, + "step": 24144 + 
}, + { + "epoch": 48.29, + "grad_norm": 1.3207565546035767, + "learning_rate": 2e-05, + "loss": 0.05328258, + "step": 24145 + }, + { + "epoch": 48.292, + "grad_norm": 1.3341400623321533, + "learning_rate": 2e-05, + "loss": 0.03919051, + "step": 24146 + }, + { + "epoch": 48.294, + "grad_norm": 1.3471317291259766, + "learning_rate": 2e-05, + "loss": 0.0625244, + "step": 24147 + }, + { + "epoch": 48.296, + "grad_norm": 0.9852702021598816, + "learning_rate": 2e-05, + "loss": 0.04854707, + "step": 24148 + }, + { + "epoch": 48.298, + "grad_norm": 0.9104224443435669, + "learning_rate": 2e-05, + "loss": 0.03098441, + "step": 24149 + }, + { + "epoch": 48.3, + "grad_norm": 1.0037267208099365, + "learning_rate": 2e-05, + "loss": 0.03725686, + "step": 24150 + }, + { + "epoch": 48.302, + "grad_norm": 1.1196410655975342, + "learning_rate": 2e-05, + "loss": 0.04821742, + "step": 24151 + }, + { + "epoch": 48.304, + "grad_norm": 1.17573082447052, + "learning_rate": 2e-05, + "loss": 0.04223769, + "step": 24152 + }, + { + "epoch": 48.306, + "grad_norm": 1.1599558591842651, + "learning_rate": 2e-05, + "loss": 0.04793893, + "step": 24153 + }, + { + "epoch": 48.308, + "grad_norm": 2.1056137084960938, + "learning_rate": 2e-05, + "loss": 0.04927088, + "step": 24154 + }, + { + "epoch": 48.31, + "grad_norm": 1.2018673419952393, + "learning_rate": 2e-05, + "loss": 0.04321744, + "step": 24155 + }, + { + "epoch": 48.312, + "grad_norm": 1.2046828269958496, + "learning_rate": 2e-05, + "loss": 0.0523455, + "step": 24156 + }, + { + "epoch": 48.314, + "grad_norm": 1.2521579265594482, + "learning_rate": 2e-05, + "loss": 0.04741405, + "step": 24157 + }, + { + "epoch": 48.316, + "grad_norm": 1.046248435974121, + "learning_rate": 2e-05, + "loss": 0.03981638, + "step": 24158 + }, + { + "epoch": 48.318, + "grad_norm": 1.2036545276641846, + "learning_rate": 2e-05, + "loss": 0.04452758, + "step": 24159 + }, + { + "epoch": 48.32, + "grad_norm": 1.224697470664978, + "learning_rate": 2e-05, + "loss": 0.04996022, + "step": 24160 + }, + { + "epoch": 48.322, + "grad_norm": 1.4698989391326904, + "learning_rate": 2e-05, + "loss": 0.06074246, + "step": 24161 + }, + { + "epoch": 48.324, + "grad_norm": 1.205679178237915, + "learning_rate": 2e-05, + "loss": 0.05434452, + "step": 24162 + }, + { + "epoch": 48.326, + "grad_norm": 1.2876218557357788, + "learning_rate": 2e-05, + "loss": 0.04615651, + "step": 24163 + }, + { + "epoch": 48.328, + "grad_norm": 1.218860149383545, + "learning_rate": 2e-05, + "loss": 0.05063464, + "step": 24164 + }, + { + "epoch": 48.33, + "grad_norm": 1.225708246231079, + "learning_rate": 2e-05, + "loss": 0.04554639, + "step": 24165 + }, + { + "epoch": 48.332, + "grad_norm": 1.4999538660049438, + "learning_rate": 2e-05, + "loss": 0.0535819, + "step": 24166 + }, + { + "epoch": 48.334, + "grad_norm": 1.4471505880355835, + "learning_rate": 2e-05, + "loss": 0.05369381, + "step": 24167 + }, + { + "epoch": 48.336, + "grad_norm": 1.0038902759552002, + "learning_rate": 2e-05, + "loss": 0.03418017, + "step": 24168 + }, + { + "epoch": 48.338, + "grad_norm": 1.2840253114700317, + "learning_rate": 2e-05, + "loss": 0.05745913, + "step": 24169 + }, + { + "epoch": 48.34, + "grad_norm": 1.1269090175628662, + "learning_rate": 2e-05, + "loss": 0.04826582, + "step": 24170 + }, + { + "epoch": 48.342, + "grad_norm": 0.9834097623825073, + "learning_rate": 2e-05, + "loss": 0.0321577, + "step": 24171 + }, + { + "epoch": 48.344, + "grad_norm": 1.304100513458252, + "learning_rate": 2e-05, + "loss": 0.05313345, + "step": 24172 + }, + { + "epoch": 
48.346, + "grad_norm": 0.9855166077613831, + "learning_rate": 2e-05, + "loss": 0.04511321, + "step": 24173 + }, + { + "epoch": 48.348, + "grad_norm": 1.117134928703308, + "learning_rate": 2e-05, + "loss": 0.04698523, + "step": 24174 + }, + { + "epoch": 48.35, + "grad_norm": 1.2151545286178589, + "learning_rate": 2e-05, + "loss": 0.03640284, + "step": 24175 + }, + { + "epoch": 48.352, + "grad_norm": 1.690683126449585, + "learning_rate": 2e-05, + "loss": 0.05967022, + "step": 24176 + }, + { + "epoch": 48.354, + "grad_norm": 1.0497173070907593, + "learning_rate": 2e-05, + "loss": 0.03619199, + "step": 24177 + }, + { + "epoch": 48.356, + "grad_norm": 1.0658822059631348, + "learning_rate": 2e-05, + "loss": 0.04109097, + "step": 24178 + }, + { + "epoch": 48.358, + "grad_norm": 1.1863605976104736, + "learning_rate": 2e-05, + "loss": 0.05850657, + "step": 24179 + }, + { + "epoch": 48.36, + "grad_norm": 0.910224974155426, + "learning_rate": 2e-05, + "loss": 0.03389906, + "step": 24180 + }, + { + "epoch": 48.362, + "grad_norm": 1.1010740995407104, + "learning_rate": 2e-05, + "loss": 0.04562913, + "step": 24181 + }, + { + "epoch": 48.364, + "grad_norm": 1.3951750993728638, + "learning_rate": 2e-05, + "loss": 0.04543884, + "step": 24182 + }, + { + "epoch": 48.366, + "grad_norm": 1.33402681350708, + "learning_rate": 2e-05, + "loss": 0.04668194, + "step": 24183 + }, + { + "epoch": 48.368, + "grad_norm": 1.3181812763214111, + "learning_rate": 2e-05, + "loss": 0.05848655, + "step": 24184 + }, + { + "epoch": 48.37, + "grad_norm": 1.1864538192749023, + "learning_rate": 2e-05, + "loss": 0.04627438, + "step": 24185 + }, + { + "epoch": 48.372, + "grad_norm": 1.0719053745269775, + "learning_rate": 2e-05, + "loss": 0.04401678, + "step": 24186 + }, + { + "epoch": 48.374, + "grad_norm": 1.2365782260894775, + "learning_rate": 2e-05, + "loss": 0.04953314, + "step": 24187 + }, + { + "epoch": 48.376, + "grad_norm": 1.362647533416748, + "learning_rate": 2e-05, + "loss": 0.03675044, + "step": 24188 + }, + { + "epoch": 48.378, + "grad_norm": 1.1618704795837402, + "learning_rate": 2e-05, + "loss": 0.05714417, + "step": 24189 + }, + { + "epoch": 48.38, + "grad_norm": 1.2545180320739746, + "learning_rate": 2e-05, + "loss": 0.04899006, + "step": 24190 + }, + { + "epoch": 48.382, + "grad_norm": 1.5738791227340698, + "learning_rate": 2e-05, + "loss": 0.06693644, + "step": 24191 + }, + { + "epoch": 48.384, + "grad_norm": 1.2538057565689087, + "learning_rate": 2e-05, + "loss": 0.04467027, + "step": 24192 + }, + { + "epoch": 48.386, + "grad_norm": 2.4022083282470703, + "learning_rate": 2e-05, + "loss": 0.0603236, + "step": 24193 + }, + { + "epoch": 48.388, + "grad_norm": 1.0010615587234497, + "learning_rate": 2e-05, + "loss": 0.03820972, + "step": 24194 + }, + { + "epoch": 48.39, + "grad_norm": 1.7481197118759155, + "learning_rate": 2e-05, + "loss": 0.06582922, + "step": 24195 + }, + { + "epoch": 48.392, + "grad_norm": 1.1138973236083984, + "learning_rate": 2e-05, + "loss": 0.05272717, + "step": 24196 + }, + { + "epoch": 48.394, + "grad_norm": 1.0726069211959839, + "learning_rate": 2e-05, + "loss": 0.05133822, + "step": 24197 + }, + { + "epoch": 48.396, + "grad_norm": 1.7698593139648438, + "learning_rate": 2e-05, + "loss": 0.05580845, + "step": 24198 + }, + { + "epoch": 48.398, + "grad_norm": 1.3947736024856567, + "learning_rate": 2e-05, + "loss": 0.05010434, + "step": 24199 + }, + { + "epoch": 48.4, + "grad_norm": 1.0563322305679321, + "learning_rate": 2e-05, + "loss": 0.03886336, + "step": 24200 + }, + { + "epoch": 48.402, + 
"grad_norm": 1.6317943334579468, + "learning_rate": 2e-05, + "loss": 0.06757972, + "step": 24201 + }, + { + "epoch": 48.404, + "grad_norm": 1.1341196298599243, + "learning_rate": 2e-05, + "loss": 0.04929399, + "step": 24202 + }, + { + "epoch": 48.406, + "grad_norm": 1.2951228618621826, + "learning_rate": 2e-05, + "loss": 0.04734764, + "step": 24203 + }, + { + "epoch": 48.408, + "grad_norm": 1.2703125476837158, + "learning_rate": 2e-05, + "loss": 0.0539843, + "step": 24204 + }, + { + "epoch": 48.41, + "grad_norm": 1.4762182235717773, + "learning_rate": 2e-05, + "loss": 0.05031619, + "step": 24205 + }, + { + "epoch": 48.412, + "grad_norm": 1.1297892332077026, + "learning_rate": 2e-05, + "loss": 0.05098326, + "step": 24206 + }, + { + "epoch": 48.414, + "grad_norm": 1.776755452156067, + "learning_rate": 2e-05, + "loss": 0.05509105, + "step": 24207 + }, + { + "epoch": 48.416, + "grad_norm": 1.4835596084594727, + "learning_rate": 2e-05, + "loss": 0.05865866, + "step": 24208 + }, + { + "epoch": 48.418, + "grad_norm": 1.1328264474868774, + "learning_rate": 2e-05, + "loss": 0.0568237, + "step": 24209 + }, + { + "epoch": 48.42, + "grad_norm": 1.0949770212173462, + "learning_rate": 2e-05, + "loss": 0.04025354, + "step": 24210 + }, + { + "epoch": 48.422, + "grad_norm": 1.2721264362335205, + "learning_rate": 2e-05, + "loss": 0.05412237, + "step": 24211 + }, + { + "epoch": 48.424, + "grad_norm": 1.6875686645507812, + "learning_rate": 2e-05, + "loss": 0.05133815, + "step": 24212 + }, + { + "epoch": 48.426, + "grad_norm": 1.1843007802963257, + "learning_rate": 2e-05, + "loss": 0.04539281, + "step": 24213 + }, + { + "epoch": 48.428, + "grad_norm": 1.1561273336410522, + "learning_rate": 2e-05, + "loss": 0.05123963, + "step": 24214 + }, + { + "epoch": 48.43, + "grad_norm": 1.2568432092666626, + "learning_rate": 2e-05, + "loss": 0.05854414, + "step": 24215 + }, + { + "epoch": 48.432, + "grad_norm": 1.2234379053115845, + "learning_rate": 2e-05, + "loss": 0.03763745, + "step": 24216 + }, + { + "epoch": 48.434, + "grad_norm": 1.1725529432296753, + "learning_rate": 2e-05, + "loss": 0.05705394, + "step": 24217 + }, + { + "epoch": 48.436, + "grad_norm": 1.598198652267456, + "learning_rate": 2e-05, + "loss": 0.05314296, + "step": 24218 + }, + { + "epoch": 48.438, + "grad_norm": 1.4097914695739746, + "learning_rate": 2e-05, + "loss": 0.04881332, + "step": 24219 + }, + { + "epoch": 48.44, + "grad_norm": 1.0265836715698242, + "learning_rate": 2e-05, + "loss": 0.03001934, + "step": 24220 + }, + { + "epoch": 48.442, + "grad_norm": 1.0836626291275024, + "learning_rate": 2e-05, + "loss": 0.04115789, + "step": 24221 + }, + { + "epoch": 48.444, + "grad_norm": 0.9527786374092102, + "learning_rate": 2e-05, + "loss": 0.04305189, + "step": 24222 + }, + { + "epoch": 48.446, + "grad_norm": 2.3282713890075684, + "learning_rate": 2e-05, + "loss": 0.04773811, + "step": 24223 + }, + { + "epoch": 48.448, + "grad_norm": 1.238843321800232, + "learning_rate": 2e-05, + "loss": 0.05873473, + "step": 24224 + }, + { + "epoch": 48.45, + "grad_norm": 1.050054669380188, + "learning_rate": 2e-05, + "loss": 0.04225109, + "step": 24225 + }, + { + "epoch": 48.452, + "grad_norm": 0.9493594765663147, + "learning_rate": 2e-05, + "loss": 0.03815629, + "step": 24226 + }, + { + "epoch": 48.454, + "grad_norm": 1.4076915979385376, + "learning_rate": 2e-05, + "loss": 0.0571268, + "step": 24227 + }, + { + "epoch": 48.456, + "grad_norm": 1.1200451850891113, + "learning_rate": 2e-05, + "loss": 0.04536561, + "step": 24228 + }, + { + "epoch": 48.458, + 
"grad_norm": 0.9729321002960205, + "learning_rate": 2e-05, + "loss": 0.04120147, + "step": 24229 + }, + { + "epoch": 48.46, + "grad_norm": 1.1512271165847778, + "learning_rate": 2e-05, + "loss": 0.04277606, + "step": 24230 + }, + { + "epoch": 48.462, + "grad_norm": 1.1026575565338135, + "learning_rate": 2e-05, + "loss": 0.0351891, + "step": 24231 + }, + { + "epoch": 48.464, + "grad_norm": 1.0310128927230835, + "learning_rate": 2e-05, + "loss": 0.03461772, + "step": 24232 + }, + { + "epoch": 48.466, + "grad_norm": 1.412767767906189, + "learning_rate": 2e-05, + "loss": 0.04195011, + "step": 24233 + }, + { + "epoch": 48.468, + "grad_norm": 1.1256314516067505, + "learning_rate": 2e-05, + "loss": 0.05557293, + "step": 24234 + }, + { + "epoch": 48.47, + "grad_norm": 1.1693962812423706, + "learning_rate": 2e-05, + "loss": 0.03926221, + "step": 24235 + }, + { + "epoch": 48.472, + "grad_norm": 1.1517102718353271, + "learning_rate": 2e-05, + "loss": 0.05275252, + "step": 24236 + }, + { + "epoch": 48.474, + "grad_norm": 1.3744908571243286, + "learning_rate": 2e-05, + "loss": 0.04760543, + "step": 24237 + }, + { + "epoch": 48.476, + "grad_norm": 1.1861525774002075, + "learning_rate": 2e-05, + "loss": 0.04502533, + "step": 24238 + }, + { + "epoch": 48.478, + "grad_norm": 1.4491164684295654, + "learning_rate": 2e-05, + "loss": 0.05307315, + "step": 24239 + }, + { + "epoch": 48.48, + "grad_norm": 1.0572928190231323, + "learning_rate": 2e-05, + "loss": 0.04607842, + "step": 24240 + }, + { + "epoch": 48.482, + "grad_norm": 1.7253468036651611, + "learning_rate": 2e-05, + "loss": 0.0420462, + "step": 24241 + }, + { + "epoch": 48.484, + "grad_norm": 1.1973940134048462, + "learning_rate": 2e-05, + "loss": 0.06035455, + "step": 24242 + }, + { + "epoch": 48.486, + "grad_norm": 1.123047947883606, + "learning_rate": 2e-05, + "loss": 0.04807914, + "step": 24243 + }, + { + "epoch": 48.488, + "grad_norm": 3.36896014213562, + "learning_rate": 2e-05, + "loss": 0.05288967, + "step": 24244 + }, + { + "epoch": 48.49, + "grad_norm": 1.3150179386138916, + "learning_rate": 2e-05, + "loss": 0.04113879, + "step": 24245 + }, + { + "epoch": 48.492, + "grad_norm": 0.998184084892273, + "learning_rate": 2e-05, + "loss": 0.03896046, + "step": 24246 + }, + { + "epoch": 48.494, + "grad_norm": 2.140963315963745, + "learning_rate": 2e-05, + "loss": 0.05801499, + "step": 24247 + }, + { + "epoch": 48.496, + "grad_norm": 0.9742385149002075, + "learning_rate": 2e-05, + "loss": 0.04312814, + "step": 24248 + }, + { + "epoch": 48.498, + "grad_norm": 0.9801925420761108, + "learning_rate": 2e-05, + "loss": 0.04314513, + "step": 24249 + }, + { + "epoch": 48.5, + "grad_norm": 1.167195200920105, + "learning_rate": 2e-05, + "loss": 0.04025427, + "step": 24250 + }, + { + "epoch": 48.502, + "grad_norm": 1.045845627784729, + "learning_rate": 2e-05, + "loss": 0.03268411, + "step": 24251 + }, + { + "epoch": 48.504, + "grad_norm": 1.4855949878692627, + "learning_rate": 2e-05, + "loss": 0.03678501, + "step": 24252 + }, + { + "epoch": 48.506, + "grad_norm": 1.0382455587387085, + "learning_rate": 2e-05, + "loss": 0.05245716, + "step": 24253 + }, + { + "epoch": 48.508, + "grad_norm": 1.0648964643478394, + "learning_rate": 2e-05, + "loss": 0.0379638, + "step": 24254 + }, + { + "epoch": 48.51, + "grad_norm": 1.1607509851455688, + "learning_rate": 2e-05, + "loss": 0.05345634, + "step": 24255 + }, + { + "epoch": 48.512, + "grad_norm": 1.2925974130630493, + "learning_rate": 2e-05, + "loss": 0.05352456, + "step": 24256 + }, + { + "epoch": 48.514, + "grad_norm": 
1.1393349170684814, + "learning_rate": 2e-05, + "loss": 0.04305478, + "step": 24257 + }, + { + "epoch": 48.516, + "grad_norm": 1.041580319404602, + "learning_rate": 2e-05, + "loss": 0.05400862, + "step": 24258 + }, + { + "epoch": 48.518, + "grad_norm": 1.0252822637557983, + "learning_rate": 2e-05, + "loss": 0.0446092, + "step": 24259 + }, + { + "epoch": 48.52, + "grad_norm": 1.2504663467407227, + "learning_rate": 2e-05, + "loss": 0.06324365, + "step": 24260 + }, + { + "epoch": 48.522, + "grad_norm": 1.246679425239563, + "learning_rate": 2e-05, + "loss": 0.05985711, + "step": 24261 + }, + { + "epoch": 48.524, + "grad_norm": 1.0827921628952026, + "learning_rate": 2e-05, + "loss": 0.04559185, + "step": 24262 + }, + { + "epoch": 48.526, + "grad_norm": 0.9944784045219421, + "learning_rate": 2e-05, + "loss": 0.04224956, + "step": 24263 + }, + { + "epoch": 48.528, + "grad_norm": 1.6731486320495605, + "learning_rate": 2e-05, + "loss": 0.06704316, + "step": 24264 + }, + { + "epoch": 48.53, + "grad_norm": 1.1593559980392456, + "learning_rate": 2e-05, + "loss": 0.04145539, + "step": 24265 + }, + { + "epoch": 48.532, + "grad_norm": 1.5469125509262085, + "learning_rate": 2e-05, + "loss": 0.05441985, + "step": 24266 + }, + { + "epoch": 48.534, + "grad_norm": 1.191585898399353, + "learning_rate": 2e-05, + "loss": 0.04084273, + "step": 24267 + }, + { + "epoch": 48.536, + "grad_norm": 2.4968080520629883, + "learning_rate": 2e-05, + "loss": 0.05761244, + "step": 24268 + }, + { + "epoch": 48.538, + "grad_norm": 1.4555327892303467, + "learning_rate": 2e-05, + "loss": 0.05669381, + "step": 24269 + }, + { + "epoch": 48.54, + "grad_norm": 1.1853517293930054, + "learning_rate": 2e-05, + "loss": 0.03782331, + "step": 24270 + }, + { + "epoch": 48.542, + "grad_norm": 1.0957677364349365, + "learning_rate": 2e-05, + "loss": 0.04011827, + "step": 24271 + }, + { + "epoch": 48.544, + "grad_norm": 1.120521903038025, + "learning_rate": 2e-05, + "loss": 0.0431511, + "step": 24272 + }, + { + "epoch": 48.546, + "grad_norm": 1.3489925861358643, + "learning_rate": 2e-05, + "loss": 0.04053114, + "step": 24273 + }, + { + "epoch": 48.548, + "grad_norm": 1.153000831604004, + "learning_rate": 2e-05, + "loss": 0.05193951, + "step": 24274 + }, + { + "epoch": 48.55, + "grad_norm": 1.4672309160232544, + "learning_rate": 2e-05, + "loss": 0.0470688, + "step": 24275 + }, + { + "epoch": 48.552, + "grad_norm": 1.2560532093048096, + "learning_rate": 2e-05, + "loss": 0.04077678, + "step": 24276 + }, + { + "epoch": 48.554, + "grad_norm": 1.1438381671905518, + "learning_rate": 2e-05, + "loss": 0.04561079, + "step": 24277 + }, + { + "epoch": 48.556, + "grad_norm": 1.103325605392456, + "learning_rate": 2e-05, + "loss": 0.04266635, + "step": 24278 + }, + { + "epoch": 48.558, + "grad_norm": 1.2053292989730835, + "learning_rate": 2e-05, + "loss": 0.05609243, + "step": 24279 + }, + { + "epoch": 48.56, + "grad_norm": 1.2866277694702148, + "learning_rate": 2e-05, + "loss": 0.05822425, + "step": 24280 + }, + { + "epoch": 48.562, + "grad_norm": 1.3046430349349976, + "learning_rate": 2e-05, + "loss": 0.05633568, + "step": 24281 + }, + { + "epoch": 48.564, + "grad_norm": 1.165405035018921, + "learning_rate": 2e-05, + "loss": 0.02543902, + "step": 24282 + }, + { + "epoch": 48.566, + "grad_norm": 1.2302770614624023, + "learning_rate": 2e-05, + "loss": 0.04642507, + "step": 24283 + }, + { + "epoch": 48.568, + "grad_norm": 1.5563623905181885, + "learning_rate": 2e-05, + "loss": 0.05143239, + "step": 24284 + }, + { + "epoch": 48.57, + "grad_norm": 
1.0007959604263306, + "learning_rate": 2e-05, + "loss": 0.03177904, + "step": 24285 + }, + { + "epoch": 48.572, + "grad_norm": 1.5403399467468262, + "learning_rate": 2e-05, + "loss": 0.0840653, + "step": 24286 + }, + { + "epoch": 48.574, + "grad_norm": 2.075601577758789, + "learning_rate": 2e-05, + "loss": 0.04922469, + "step": 24287 + }, + { + "epoch": 48.576, + "grad_norm": 1.155706524848938, + "learning_rate": 2e-05, + "loss": 0.03379347, + "step": 24288 + }, + { + "epoch": 48.578, + "grad_norm": 2.421046018600464, + "learning_rate": 2e-05, + "loss": 0.03684851, + "step": 24289 + }, + { + "epoch": 48.58, + "grad_norm": 1.1234990358352661, + "learning_rate": 2e-05, + "loss": 0.04452331, + "step": 24290 + }, + { + "epoch": 48.582, + "grad_norm": 1.228397011756897, + "learning_rate": 2e-05, + "loss": 0.05986349, + "step": 24291 + }, + { + "epoch": 48.584, + "grad_norm": 1.2848201990127563, + "learning_rate": 2e-05, + "loss": 0.05497922, + "step": 24292 + }, + { + "epoch": 48.586, + "grad_norm": 1.1021811962127686, + "learning_rate": 2e-05, + "loss": 0.04066538, + "step": 24293 + }, + { + "epoch": 48.588, + "grad_norm": 1.184058427810669, + "learning_rate": 2e-05, + "loss": 0.05837499, + "step": 24294 + }, + { + "epoch": 48.59, + "grad_norm": 1.4225780963897705, + "learning_rate": 2e-05, + "loss": 0.04438188, + "step": 24295 + }, + { + "epoch": 48.592, + "grad_norm": 1.1480562686920166, + "learning_rate": 2e-05, + "loss": 0.0400277, + "step": 24296 + }, + { + "epoch": 48.594, + "grad_norm": 1.323900580406189, + "learning_rate": 2e-05, + "loss": 0.05192615, + "step": 24297 + }, + { + "epoch": 48.596, + "grad_norm": 1.441277265548706, + "learning_rate": 2e-05, + "loss": 0.03372613, + "step": 24298 + }, + { + "epoch": 48.598, + "grad_norm": 1.0926908254623413, + "learning_rate": 2e-05, + "loss": 0.05107879, + "step": 24299 + }, + { + "epoch": 48.6, + "grad_norm": 1.2348512411117554, + "learning_rate": 2e-05, + "loss": 0.05852629, + "step": 24300 + }, + { + "epoch": 48.602, + "grad_norm": 1.182897925376892, + "learning_rate": 2e-05, + "loss": 0.06499533, + "step": 24301 + }, + { + "epoch": 48.604, + "grad_norm": 1.1605639457702637, + "learning_rate": 2e-05, + "loss": 0.05055974, + "step": 24302 + }, + { + "epoch": 48.606, + "grad_norm": 1.3561877012252808, + "learning_rate": 2e-05, + "loss": 0.05069784, + "step": 24303 + }, + { + "epoch": 48.608, + "grad_norm": 1.2737542390823364, + "learning_rate": 2e-05, + "loss": 0.04829124, + "step": 24304 + }, + { + "epoch": 48.61, + "grad_norm": 1.2459218502044678, + "learning_rate": 2e-05, + "loss": 0.057886, + "step": 24305 + }, + { + "epoch": 48.612, + "grad_norm": 1.154877781867981, + "learning_rate": 2e-05, + "loss": 0.03690616, + "step": 24306 + }, + { + "epoch": 48.614, + "grad_norm": 1.1142473220825195, + "learning_rate": 2e-05, + "loss": 0.04277994, + "step": 24307 + }, + { + "epoch": 48.616, + "grad_norm": 1.2507994174957275, + "learning_rate": 2e-05, + "loss": 0.05749039, + "step": 24308 + }, + { + "epoch": 48.618, + "grad_norm": 1.2597581148147583, + "learning_rate": 2e-05, + "loss": 0.04754964, + "step": 24309 + }, + { + "epoch": 48.62, + "grad_norm": 1.1352006196975708, + "learning_rate": 2e-05, + "loss": 0.04118899, + "step": 24310 + }, + { + "epoch": 48.622, + "grad_norm": 1.7616618871688843, + "learning_rate": 2e-05, + "loss": 0.0472442, + "step": 24311 + }, + { + "epoch": 48.624, + "grad_norm": 1.2920984029769897, + "learning_rate": 2e-05, + "loss": 0.05620179, + "step": 24312 + }, + { + "epoch": 48.626, + "grad_norm": 
1.0155946016311646, + "learning_rate": 2e-05, + "loss": 0.03928847, + "step": 24313 + }, + { + "epoch": 48.628, + "grad_norm": 1.8907221555709839, + "learning_rate": 2e-05, + "loss": 0.05770548, + "step": 24314 + }, + { + "epoch": 48.63, + "grad_norm": 1.3525524139404297, + "learning_rate": 2e-05, + "loss": 0.05569984, + "step": 24315 + }, + { + "epoch": 48.632, + "grad_norm": 2.0137546062469482, + "learning_rate": 2e-05, + "loss": 0.04367228, + "step": 24316 + }, + { + "epoch": 48.634, + "grad_norm": 1.1896785497665405, + "learning_rate": 2e-05, + "loss": 0.0412126, + "step": 24317 + }, + { + "epoch": 48.636, + "grad_norm": 1.2872071266174316, + "learning_rate": 2e-05, + "loss": 0.05311878, + "step": 24318 + }, + { + "epoch": 48.638, + "grad_norm": 3.229105234146118, + "learning_rate": 2e-05, + "loss": 0.05653278, + "step": 24319 + }, + { + "epoch": 48.64, + "grad_norm": 1.131352186203003, + "learning_rate": 2e-05, + "loss": 0.04109538, + "step": 24320 + }, + { + "epoch": 48.642, + "grad_norm": 1.0072386264801025, + "learning_rate": 2e-05, + "loss": 0.04287854, + "step": 24321 + }, + { + "epoch": 48.644, + "grad_norm": 1.1246684789657593, + "learning_rate": 2e-05, + "loss": 0.04119401, + "step": 24322 + }, + { + "epoch": 48.646, + "grad_norm": 0.9696083664894104, + "learning_rate": 2e-05, + "loss": 0.03462346, + "step": 24323 + }, + { + "epoch": 48.648, + "grad_norm": 1.8664556741714478, + "learning_rate": 2e-05, + "loss": 0.0543664, + "step": 24324 + }, + { + "epoch": 48.65, + "grad_norm": 2.026254653930664, + "learning_rate": 2e-05, + "loss": 0.0509354, + "step": 24325 + }, + { + "epoch": 48.652, + "grad_norm": 1.2250194549560547, + "learning_rate": 2e-05, + "loss": 0.06017506, + "step": 24326 + }, + { + "epoch": 48.654, + "grad_norm": 1.2674254179000854, + "learning_rate": 2e-05, + "loss": 0.04169001, + "step": 24327 + }, + { + "epoch": 48.656, + "grad_norm": 1.2122572660446167, + "learning_rate": 2e-05, + "loss": 0.05367371, + "step": 24328 + }, + { + "epoch": 48.658, + "grad_norm": 1.4815667867660522, + "learning_rate": 2e-05, + "loss": 0.07224436, + "step": 24329 + }, + { + "epoch": 48.66, + "grad_norm": 1.290479302406311, + "learning_rate": 2e-05, + "loss": 0.04883266, + "step": 24330 + }, + { + "epoch": 48.662, + "grad_norm": 1.1489940881729126, + "learning_rate": 2e-05, + "loss": 0.0474559, + "step": 24331 + }, + { + "epoch": 48.664, + "grad_norm": 1.302720069885254, + "learning_rate": 2e-05, + "loss": 0.05755413, + "step": 24332 + }, + { + "epoch": 48.666, + "grad_norm": 0.9729812741279602, + "learning_rate": 2e-05, + "loss": 0.0363071, + "step": 24333 + }, + { + "epoch": 48.668, + "grad_norm": 1.3361284732818604, + "learning_rate": 2e-05, + "loss": 0.04093036, + "step": 24334 + }, + { + "epoch": 48.67, + "grad_norm": 1.046409010887146, + "learning_rate": 2e-05, + "loss": 0.03625423, + "step": 24335 + }, + { + "epoch": 48.672, + "grad_norm": 1.158137321472168, + "learning_rate": 2e-05, + "loss": 0.05011062, + "step": 24336 + }, + { + "epoch": 48.674, + "grad_norm": 1.1454992294311523, + "learning_rate": 2e-05, + "loss": 0.04545939, + "step": 24337 + }, + { + "epoch": 48.676, + "grad_norm": 1.2289396524429321, + "learning_rate": 2e-05, + "loss": 0.06850711, + "step": 24338 + }, + { + "epoch": 48.678, + "grad_norm": 2.0199902057647705, + "learning_rate": 2e-05, + "loss": 0.04739082, + "step": 24339 + }, + { + "epoch": 48.68, + "grad_norm": 1.1302142143249512, + "learning_rate": 2e-05, + "loss": 0.0431182, + "step": 24340 + }, + { + "epoch": 48.682, + "grad_norm": 
1.3505676984786987, + "learning_rate": 2e-05, + "loss": 0.04874783, + "step": 24341 + }, + { + "epoch": 48.684, + "grad_norm": 0.9121742248535156, + "learning_rate": 2e-05, + "loss": 0.02643291, + "step": 24342 + }, + { + "epoch": 48.686, + "grad_norm": 1.3847429752349854, + "learning_rate": 2e-05, + "loss": 0.04750867, + "step": 24343 + }, + { + "epoch": 48.688, + "grad_norm": 1.1395158767700195, + "learning_rate": 2e-05, + "loss": 0.04924957, + "step": 24344 + }, + { + "epoch": 48.69, + "grad_norm": 1.0794190168380737, + "learning_rate": 2e-05, + "loss": 0.03787911, + "step": 24345 + }, + { + "epoch": 48.692, + "grad_norm": 1.1486942768096924, + "learning_rate": 2e-05, + "loss": 0.04261864, + "step": 24346 + }, + { + "epoch": 48.694, + "grad_norm": 1.1748255491256714, + "learning_rate": 2e-05, + "loss": 0.04672168, + "step": 24347 + }, + { + "epoch": 48.696, + "grad_norm": 1.1263988018035889, + "learning_rate": 2e-05, + "loss": 0.04133555, + "step": 24348 + }, + { + "epoch": 48.698, + "grad_norm": 1.2288318872451782, + "learning_rate": 2e-05, + "loss": 0.04745194, + "step": 24349 + }, + { + "epoch": 48.7, + "grad_norm": 0.9768017530441284, + "learning_rate": 2e-05, + "loss": 0.04292785, + "step": 24350 + }, + { + "epoch": 48.702, + "grad_norm": 1.0999313592910767, + "learning_rate": 2e-05, + "loss": 0.05881019, + "step": 24351 + }, + { + "epoch": 48.704, + "grad_norm": 1.878710389137268, + "learning_rate": 2e-05, + "loss": 0.05866309, + "step": 24352 + }, + { + "epoch": 48.706, + "grad_norm": 1.63356614112854, + "learning_rate": 2e-05, + "loss": 0.05248346, + "step": 24353 + }, + { + "epoch": 48.708, + "grad_norm": 1.2071478366851807, + "learning_rate": 2e-05, + "loss": 0.05778679, + "step": 24354 + }, + { + "epoch": 48.71, + "grad_norm": 1.188579797744751, + "learning_rate": 2e-05, + "loss": 0.05696705, + "step": 24355 + }, + { + "epoch": 48.712, + "grad_norm": 0.9021721482276917, + "learning_rate": 2e-05, + "loss": 0.02676656, + "step": 24356 + }, + { + "epoch": 48.714, + "grad_norm": 1.446212887763977, + "learning_rate": 2e-05, + "loss": 0.04261808, + "step": 24357 + }, + { + "epoch": 48.716, + "grad_norm": 0.975503146648407, + "learning_rate": 2e-05, + "loss": 0.03775565, + "step": 24358 + }, + { + "epoch": 48.718, + "grad_norm": 1.1201655864715576, + "learning_rate": 2e-05, + "loss": 0.04218309, + "step": 24359 + }, + { + "epoch": 48.72, + "grad_norm": 1.2044806480407715, + "learning_rate": 2e-05, + "loss": 0.04980427, + "step": 24360 + }, + { + "epoch": 48.722, + "grad_norm": 1.0975890159606934, + "learning_rate": 2e-05, + "loss": 0.04094896, + "step": 24361 + }, + { + "epoch": 48.724, + "grad_norm": 1.5695173740386963, + "learning_rate": 2e-05, + "loss": 0.04596317, + "step": 24362 + }, + { + "epoch": 48.726, + "grad_norm": 1.0056501626968384, + "learning_rate": 2e-05, + "loss": 0.04648286, + "step": 24363 + }, + { + "epoch": 48.728, + "grad_norm": 0.9918159246444702, + "learning_rate": 2e-05, + "loss": 0.03798754, + "step": 24364 + }, + { + "epoch": 48.73, + "grad_norm": 1.287979245185852, + "learning_rate": 2e-05, + "loss": 0.05667548, + "step": 24365 + }, + { + "epoch": 48.732, + "grad_norm": 1.1386852264404297, + "learning_rate": 2e-05, + "loss": 0.05278626, + "step": 24366 + }, + { + "epoch": 48.734, + "grad_norm": 0.996176540851593, + "learning_rate": 2e-05, + "loss": 0.05246001, + "step": 24367 + }, + { + "epoch": 48.736, + "grad_norm": 2.319737434387207, + "learning_rate": 2e-05, + "loss": 0.0373604, + "step": 24368 + }, + { + "epoch": 48.738, + "grad_norm": 
1.7545973062515259, + "learning_rate": 2e-05, + "loss": 0.04798623, + "step": 24369 + }, + { + "epoch": 48.74, + "grad_norm": 1.3103057146072388, + "learning_rate": 2e-05, + "loss": 0.05369485, + "step": 24370 + }, + { + "epoch": 48.742, + "grad_norm": 1.1758966445922852, + "learning_rate": 2e-05, + "loss": 0.05795531, + "step": 24371 + }, + { + "epoch": 48.744, + "grad_norm": 0.9001947641372681, + "learning_rate": 2e-05, + "loss": 0.02734162, + "step": 24372 + }, + { + "epoch": 48.746, + "grad_norm": 1.0777047872543335, + "learning_rate": 2e-05, + "loss": 0.03581995, + "step": 24373 + }, + { + "epoch": 48.748, + "grad_norm": 1.3073005676269531, + "learning_rate": 2e-05, + "loss": 0.04707327, + "step": 24374 + }, + { + "epoch": 48.75, + "grad_norm": 0.9895936250686646, + "learning_rate": 2e-05, + "loss": 0.02827165, + "step": 24375 + }, + { + "epoch": 48.752, + "grad_norm": 1.1249933242797852, + "learning_rate": 2e-05, + "loss": 0.04632932, + "step": 24376 + }, + { + "epoch": 48.754, + "grad_norm": 1.274895191192627, + "learning_rate": 2e-05, + "loss": 0.04302908, + "step": 24377 + }, + { + "epoch": 48.756, + "grad_norm": 1.0535324811935425, + "learning_rate": 2e-05, + "loss": 0.03941915, + "step": 24378 + }, + { + "epoch": 48.758, + "grad_norm": 1.3476455211639404, + "learning_rate": 2e-05, + "loss": 0.05376494, + "step": 24379 + }, + { + "epoch": 48.76, + "grad_norm": 1.2379733324050903, + "learning_rate": 2e-05, + "loss": 0.05499609, + "step": 24380 + }, + { + "epoch": 48.762, + "grad_norm": 1.1249278783798218, + "learning_rate": 2e-05, + "loss": 0.03757162, + "step": 24381 + }, + { + "epoch": 48.764, + "grad_norm": 2.2356626987457275, + "learning_rate": 2e-05, + "loss": 0.0576737, + "step": 24382 + }, + { + "epoch": 48.766, + "grad_norm": 1.1273705959320068, + "learning_rate": 2e-05, + "loss": 0.05674585, + "step": 24383 + }, + { + "epoch": 48.768, + "grad_norm": 1.3228777647018433, + "learning_rate": 2e-05, + "loss": 0.05507819, + "step": 24384 + }, + { + "epoch": 48.77, + "grad_norm": 1.3240513801574707, + "learning_rate": 2e-05, + "loss": 0.05233713, + "step": 24385 + }, + { + "epoch": 48.772, + "grad_norm": 1.2595722675323486, + "learning_rate": 2e-05, + "loss": 0.05647445, + "step": 24386 + }, + { + "epoch": 48.774, + "grad_norm": 1.1670889854431152, + "learning_rate": 2e-05, + "loss": 0.0378773, + "step": 24387 + }, + { + "epoch": 48.776, + "grad_norm": 0.9020729064941406, + "learning_rate": 2e-05, + "loss": 0.03322694, + "step": 24388 + }, + { + "epoch": 48.778, + "grad_norm": 1.0231337547302246, + "learning_rate": 2e-05, + "loss": 0.03719312, + "step": 24389 + }, + { + "epoch": 48.78, + "grad_norm": 1.0719740390777588, + "learning_rate": 2e-05, + "loss": 0.04352896, + "step": 24390 + }, + { + "epoch": 48.782, + "grad_norm": 1.1282035112380981, + "learning_rate": 2e-05, + "loss": 0.05233166, + "step": 24391 + }, + { + "epoch": 48.784, + "grad_norm": 1.2852978706359863, + "learning_rate": 2e-05, + "loss": 0.05125814, + "step": 24392 + }, + { + "epoch": 48.786, + "grad_norm": 1.1023353338241577, + "learning_rate": 2e-05, + "loss": 0.04840891, + "step": 24393 + }, + { + "epoch": 48.788, + "grad_norm": 1.5600016117095947, + "learning_rate": 2e-05, + "loss": 0.04811985, + "step": 24394 + }, + { + "epoch": 48.79, + "grad_norm": 1.1673002243041992, + "learning_rate": 2e-05, + "loss": 0.04379809, + "step": 24395 + }, + { + "epoch": 48.792, + "grad_norm": 1.3947582244873047, + "learning_rate": 2e-05, + "loss": 0.05733298, + "step": 24396 + }, + { + "epoch": 48.794, + "grad_norm": 
1.2544630765914917, + "learning_rate": 2e-05, + "loss": 0.06692179, + "step": 24397 + }, + { + "epoch": 48.796, + "grad_norm": 1.9167871475219727, + "learning_rate": 2e-05, + "loss": 0.06020879, + "step": 24398 + }, + { + "epoch": 48.798, + "grad_norm": 1.5168108940124512, + "learning_rate": 2e-05, + "loss": 0.04920478, + "step": 24399 + }, + { + "epoch": 48.8, + "grad_norm": 1.047117829322815, + "learning_rate": 2e-05, + "loss": 0.03645068, + "step": 24400 + }, + { + "epoch": 48.802, + "grad_norm": 1.2618528604507446, + "learning_rate": 2e-05, + "loss": 0.05060851, + "step": 24401 + }, + { + "epoch": 48.804, + "grad_norm": 1.0257928371429443, + "learning_rate": 2e-05, + "loss": 0.04926369, + "step": 24402 + }, + { + "epoch": 48.806, + "grad_norm": 2.2728285789489746, + "learning_rate": 2e-05, + "loss": 0.05713367, + "step": 24403 + }, + { + "epoch": 48.808, + "grad_norm": 1.2449434995651245, + "learning_rate": 2e-05, + "loss": 0.05508956, + "step": 24404 + }, + { + "epoch": 48.81, + "grad_norm": 1.1658732891082764, + "learning_rate": 2e-05, + "loss": 0.05041445, + "step": 24405 + }, + { + "epoch": 48.812, + "grad_norm": 1.206467866897583, + "learning_rate": 2e-05, + "loss": 0.06293423, + "step": 24406 + }, + { + "epoch": 48.814, + "grad_norm": 1.246690034866333, + "learning_rate": 2e-05, + "loss": 0.05447911, + "step": 24407 + }, + { + "epoch": 48.816, + "grad_norm": 1.204403042793274, + "learning_rate": 2e-05, + "loss": 0.05379122, + "step": 24408 + }, + { + "epoch": 48.818, + "grad_norm": 1.2013062238693237, + "learning_rate": 2e-05, + "loss": 0.05955196, + "step": 24409 + }, + { + "epoch": 48.82, + "grad_norm": 1.0979270935058594, + "learning_rate": 2e-05, + "loss": 0.0413815, + "step": 24410 + }, + { + "epoch": 48.822, + "grad_norm": 1.224603295326233, + "learning_rate": 2e-05, + "loss": 0.05617508, + "step": 24411 + }, + { + "epoch": 48.824, + "grad_norm": 1.262998342514038, + "learning_rate": 2e-05, + "loss": 0.04537391, + "step": 24412 + }, + { + "epoch": 48.826, + "grad_norm": 1.1487383842468262, + "learning_rate": 2e-05, + "loss": 0.03556521, + "step": 24413 + }, + { + "epoch": 48.828, + "grad_norm": 0.9703657031059265, + "learning_rate": 2e-05, + "loss": 0.03395029, + "step": 24414 + }, + { + "epoch": 48.83, + "grad_norm": 1.5587289333343506, + "learning_rate": 2e-05, + "loss": 0.0705722, + "step": 24415 + }, + { + "epoch": 48.832, + "grad_norm": 1.0362234115600586, + "learning_rate": 2e-05, + "loss": 0.04194809, + "step": 24416 + }, + { + "epoch": 48.834, + "grad_norm": 1.3099713325500488, + "learning_rate": 2e-05, + "loss": 0.05039264, + "step": 24417 + }, + { + "epoch": 48.836, + "grad_norm": 2.788674831390381, + "learning_rate": 2e-05, + "loss": 0.05012584, + "step": 24418 + }, + { + "epoch": 48.838, + "grad_norm": 1.322467565536499, + "learning_rate": 2e-05, + "loss": 0.0455, + "step": 24419 + }, + { + "epoch": 48.84, + "grad_norm": 1.230636477470398, + "learning_rate": 2e-05, + "loss": 0.03692626, + "step": 24420 + }, + { + "epoch": 48.842, + "grad_norm": 1.2802784442901611, + "learning_rate": 2e-05, + "loss": 0.05909276, + "step": 24421 + }, + { + "epoch": 48.844, + "grad_norm": 1.3420884609222412, + "learning_rate": 2e-05, + "loss": 0.0529298, + "step": 24422 + }, + { + "epoch": 48.846, + "grad_norm": 1.0645166635513306, + "learning_rate": 2e-05, + "loss": 0.04237661, + "step": 24423 + }, + { + "epoch": 48.848, + "grad_norm": 1.489173173904419, + "learning_rate": 2e-05, + "loss": 0.04594196, + "step": 24424 + }, + { + "epoch": 48.85, + "grad_norm": 1.9845540523529053, + 
"learning_rate": 2e-05, + "loss": 0.06666305, + "step": 24425 + }, + { + "epoch": 48.852, + "grad_norm": 1.2646132707595825, + "learning_rate": 2e-05, + "loss": 0.06317861, + "step": 24426 + }, + { + "epoch": 48.854, + "grad_norm": 1.1693730354309082, + "learning_rate": 2e-05, + "loss": 0.03957154, + "step": 24427 + }, + { + "epoch": 48.856, + "grad_norm": 1.139262080192566, + "learning_rate": 2e-05, + "loss": 0.03852566, + "step": 24428 + }, + { + "epoch": 48.858, + "grad_norm": 1.340086579322815, + "learning_rate": 2e-05, + "loss": 0.06382296, + "step": 24429 + }, + { + "epoch": 48.86, + "grad_norm": 1.404128909111023, + "learning_rate": 2e-05, + "loss": 0.06833863, + "step": 24430 + }, + { + "epoch": 48.862, + "grad_norm": 1.1025450229644775, + "learning_rate": 2e-05, + "loss": 0.05407901, + "step": 24431 + }, + { + "epoch": 48.864, + "grad_norm": 1.1320126056671143, + "learning_rate": 2e-05, + "loss": 0.04277468, + "step": 24432 + }, + { + "epoch": 48.866, + "grad_norm": 1.192533016204834, + "learning_rate": 2e-05, + "loss": 0.04574575, + "step": 24433 + }, + { + "epoch": 48.868, + "grad_norm": 5.2781267166137695, + "learning_rate": 2e-05, + "loss": 0.06549813, + "step": 24434 + }, + { + "epoch": 48.87, + "grad_norm": 1.2239826917648315, + "learning_rate": 2e-05, + "loss": 0.05279471, + "step": 24435 + }, + { + "epoch": 48.872, + "grad_norm": 1.0941675901412964, + "learning_rate": 2e-05, + "loss": 0.04686259, + "step": 24436 + }, + { + "epoch": 48.874, + "grad_norm": 1.5225481986999512, + "learning_rate": 2e-05, + "loss": 0.05451784, + "step": 24437 + }, + { + "epoch": 48.876, + "grad_norm": 1.0985329151153564, + "learning_rate": 2e-05, + "loss": 0.03912208, + "step": 24438 + }, + { + "epoch": 48.878, + "grad_norm": 0.9841903448104858, + "learning_rate": 2e-05, + "loss": 0.03415463, + "step": 24439 + }, + { + "epoch": 48.88, + "grad_norm": 1.519121527671814, + "learning_rate": 2e-05, + "loss": 0.05647705, + "step": 24440 + }, + { + "epoch": 48.882, + "grad_norm": 1.738462209701538, + "learning_rate": 2e-05, + "loss": 0.05705027, + "step": 24441 + }, + { + "epoch": 48.884, + "grad_norm": 1.2349008321762085, + "learning_rate": 2e-05, + "loss": 0.06694788, + "step": 24442 + }, + { + "epoch": 48.886, + "grad_norm": 1.1472513675689697, + "learning_rate": 2e-05, + "loss": 0.04741376, + "step": 24443 + }, + { + "epoch": 48.888, + "grad_norm": 1.7409826517105103, + "learning_rate": 2e-05, + "loss": 0.04635547, + "step": 24444 + }, + { + "epoch": 48.89, + "grad_norm": 1.507589340209961, + "learning_rate": 2e-05, + "loss": 0.04607899, + "step": 24445 + }, + { + "epoch": 48.892, + "grad_norm": 2.211873769760132, + "learning_rate": 2e-05, + "loss": 0.05145214, + "step": 24446 + }, + { + "epoch": 48.894, + "grad_norm": 1.286216139793396, + "learning_rate": 2e-05, + "loss": 0.06177491, + "step": 24447 + }, + { + "epoch": 48.896, + "grad_norm": 1.2425750494003296, + "learning_rate": 2e-05, + "loss": 0.05383733, + "step": 24448 + }, + { + "epoch": 48.898, + "grad_norm": 1.189475655555725, + "learning_rate": 2e-05, + "loss": 0.05400949, + "step": 24449 + }, + { + "epoch": 48.9, + "grad_norm": 1.3408055305480957, + "learning_rate": 2e-05, + "loss": 0.03949302, + "step": 24450 + }, + { + "epoch": 48.902, + "grad_norm": 2.6601455211639404, + "learning_rate": 2e-05, + "loss": 0.06074892, + "step": 24451 + }, + { + "epoch": 48.904, + "grad_norm": 0.9952139854431152, + "learning_rate": 2e-05, + "loss": 0.04453053, + "step": 24452 + }, + { + "epoch": 48.906, + "grad_norm": 1.4619004726409912, + 
"learning_rate": 2e-05, + "loss": 0.04188272, + "step": 24453 + }, + { + "epoch": 48.908, + "grad_norm": 1.2014166116714478, + "learning_rate": 2e-05, + "loss": 0.04575384, + "step": 24454 + }, + { + "epoch": 48.91, + "grad_norm": 2.021064043045044, + "learning_rate": 2e-05, + "loss": 0.04514164, + "step": 24455 + }, + { + "epoch": 48.912, + "grad_norm": 1.2506054639816284, + "learning_rate": 2e-05, + "loss": 0.0620794, + "step": 24456 + }, + { + "epoch": 48.914, + "grad_norm": 1.2398749589920044, + "learning_rate": 2e-05, + "loss": 0.04733563, + "step": 24457 + }, + { + "epoch": 48.916, + "grad_norm": 1.095046043395996, + "learning_rate": 2e-05, + "loss": 0.04853748, + "step": 24458 + }, + { + "epoch": 48.918, + "grad_norm": 2.492252826690674, + "learning_rate": 2e-05, + "loss": 0.04443378, + "step": 24459 + }, + { + "epoch": 48.92, + "grad_norm": 1.4789276123046875, + "learning_rate": 2e-05, + "loss": 0.04718126, + "step": 24460 + }, + { + "epoch": 48.922, + "grad_norm": 1.7125012874603271, + "learning_rate": 2e-05, + "loss": 0.03764442, + "step": 24461 + }, + { + "epoch": 48.924, + "grad_norm": 1.5009963512420654, + "learning_rate": 2e-05, + "loss": 0.02679938, + "step": 24462 + }, + { + "epoch": 48.926, + "grad_norm": 1.1632426977157593, + "learning_rate": 2e-05, + "loss": 0.04027367, + "step": 24463 + }, + { + "epoch": 48.928, + "grad_norm": 1.0739227533340454, + "learning_rate": 2e-05, + "loss": 0.04538012, + "step": 24464 + }, + { + "epoch": 48.93, + "grad_norm": 1.1324819326400757, + "learning_rate": 2e-05, + "loss": 0.05008095, + "step": 24465 + }, + { + "epoch": 48.932, + "grad_norm": 1.2299511432647705, + "learning_rate": 2e-05, + "loss": 0.03868235, + "step": 24466 + }, + { + "epoch": 48.934, + "grad_norm": 1.457084059715271, + "learning_rate": 2e-05, + "loss": 0.06458244, + "step": 24467 + }, + { + "epoch": 48.936, + "grad_norm": 1.4329404830932617, + "learning_rate": 2e-05, + "loss": 0.06372182, + "step": 24468 + }, + { + "epoch": 48.938, + "grad_norm": 1.0780876874923706, + "learning_rate": 2e-05, + "loss": 0.04435993, + "step": 24469 + }, + { + "epoch": 48.94, + "grad_norm": 1.6297838687896729, + "learning_rate": 2e-05, + "loss": 0.05016711, + "step": 24470 + }, + { + "epoch": 48.942, + "grad_norm": 1.1690902709960938, + "learning_rate": 2e-05, + "loss": 0.06398173, + "step": 24471 + }, + { + "epoch": 48.944, + "grad_norm": 1.1354949474334717, + "learning_rate": 2e-05, + "loss": 0.04506849, + "step": 24472 + }, + { + "epoch": 48.946, + "grad_norm": 1.3150396347045898, + "learning_rate": 2e-05, + "loss": 0.06466317, + "step": 24473 + }, + { + "epoch": 48.948, + "grad_norm": 1.104675531387329, + "learning_rate": 2e-05, + "loss": 0.0407197, + "step": 24474 + }, + { + "epoch": 48.95, + "grad_norm": 1.0967339277267456, + "learning_rate": 2e-05, + "loss": 0.04545303, + "step": 24475 + }, + { + "epoch": 48.952, + "grad_norm": 1.0893007516860962, + "learning_rate": 2e-05, + "loss": 0.05033952, + "step": 24476 + }, + { + "epoch": 48.954, + "grad_norm": 1.1199082136154175, + "learning_rate": 2e-05, + "loss": 0.04963399, + "step": 24477 + }, + { + "epoch": 48.956, + "grad_norm": 1.1054890155792236, + "learning_rate": 2e-05, + "loss": 0.04332212, + "step": 24478 + }, + { + "epoch": 48.958, + "grad_norm": 2.2792885303497314, + "learning_rate": 2e-05, + "loss": 0.05663423, + "step": 24479 + }, + { + "epoch": 48.96, + "grad_norm": 1.1706188917160034, + "learning_rate": 2e-05, + "loss": 0.04888928, + "step": 24480 + }, + { + "epoch": 48.962, + "grad_norm": 2.348127841949463, + 
"learning_rate": 2e-05, + "loss": 0.05282357, + "step": 24481 + }, + { + "epoch": 48.964, + "grad_norm": 1.2429864406585693, + "learning_rate": 2e-05, + "loss": 0.05059104, + "step": 24482 + }, + { + "epoch": 48.966, + "grad_norm": 1.1958144903182983, + "learning_rate": 2e-05, + "loss": 0.03917575, + "step": 24483 + }, + { + "epoch": 48.968, + "grad_norm": 1.0921530723571777, + "learning_rate": 2e-05, + "loss": 0.03745528, + "step": 24484 + }, + { + "epoch": 48.97, + "grad_norm": 1.7767388820648193, + "learning_rate": 2e-05, + "loss": 0.06159507, + "step": 24485 + }, + { + "epoch": 48.972, + "grad_norm": 1.1890246868133545, + "learning_rate": 2e-05, + "loss": 0.04934237, + "step": 24486 + }, + { + "epoch": 48.974, + "grad_norm": 1.616060495376587, + "learning_rate": 2e-05, + "loss": 0.04842779, + "step": 24487 + }, + { + "epoch": 48.976, + "grad_norm": 2.0892298221588135, + "learning_rate": 2e-05, + "loss": 0.03829409, + "step": 24488 + }, + { + "epoch": 48.978, + "grad_norm": 1.2111293077468872, + "learning_rate": 2e-05, + "loss": 0.05161772, + "step": 24489 + }, + { + "epoch": 48.98, + "grad_norm": 1.258104920387268, + "learning_rate": 2e-05, + "loss": 0.04110197, + "step": 24490 + }, + { + "epoch": 48.982, + "grad_norm": 1.1039538383483887, + "learning_rate": 2e-05, + "loss": 0.03855299, + "step": 24491 + }, + { + "epoch": 48.984, + "grad_norm": 1.0499625205993652, + "learning_rate": 2e-05, + "loss": 0.04614359, + "step": 24492 + }, + { + "epoch": 48.986, + "grad_norm": 1.2007472515106201, + "learning_rate": 2e-05, + "loss": 0.03996876, + "step": 24493 + }, + { + "epoch": 48.988, + "grad_norm": 1.2354191541671753, + "learning_rate": 2e-05, + "loss": 0.04676054, + "step": 24494 + }, + { + "epoch": 48.99, + "grad_norm": 1.0763860940933228, + "learning_rate": 2e-05, + "loss": 0.04218274, + "step": 24495 + }, + { + "epoch": 48.992, + "grad_norm": 1.1772406101226807, + "learning_rate": 2e-05, + "loss": 0.06190931, + "step": 24496 + }, + { + "epoch": 48.994, + "grad_norm": 2.410313844680786, + "learning_rate": 2e-05, + "loss": 0.05162603, + "step": 24497 + }, + { + "epoch": 48.996, + "grad_norm": 1.293403148651123, + "learning_rate": 2e-05, + "loss": 0.04930715, + "step": 24498 + }, + { + "epoch": 48.998, + "grad_norm": 2.3302156925201416, + "learning_rate": 2e-05, + "loss": 0.05537525, + "step": 24499 + }, + { + "epoch": 49.0, + "grad_norm": 1.106102705001831, + "learning_rate": 2e-05, + "loss": 0.05640403, + "step": 24500 + }, + { + "epoch": 49.0, + "eval_performance": { + "AngleClassification_1": 1.0, + "AngleClassification_2": 1.0, + "AngleClassification_3": 0.9720558882235529, + "Equal_1": 1.0, + "Equal_2": 0.9800399201596807, + "Equal_3": 0.9860279441117764, + "LineComparison_1": 1.0, + "LineComparison_2": 1.0, + "LineComparison_3": 1.0, + "Parallel_1": 0.9939879759519038, + "Parallel_2": 0.9939879759519038, + "Parallel_3": 0.994, + "Perpendicular_1": 1.0, + "Perpendicular_2": 0.998, + "Perpendicular_3": 0.8897795591182365, + "PointLiesOnCircle_1": 1.0, + "PointLiesOnCircle_2": 0.994, + "PointLiesOnCircle_3": 0.9956, + "PointLiesOnLine_1": 1.0, + "PointLiesOnLine_2": 0.9939879759519038, + "PointLiesOnLine_3": 0.9880239520958084 + }, + "eval_runtime": 320.4713, + "eval_samples_per_second": 32.764, + "eval_steps_per_second": 0.655, + "step": 24500 + }, + { + "epoch": 49.002, + "grad_norm": 1.3205698728561401, + "learning_rate": 2e-05, + "loss": 0.0536387, + "step": 24501 + }, + { + "epoch": 49.004, + "grad_norm": 1.0756568908691406, + "learning_rate": 2e-05, + "loss": 0.03509374, + 
"step": 24502 + }, + { + "epoch": 49.006, + "grad_norm": 1.1383591890335083, + "learning_rate": 2e-05, + "loss": 0.05303418, + "step": 24503 + }, + { + "epoch": 49.008, + "grad_norm": 1.2055492401123047, + "learning_rate": 2e-05, + "loss": 0.0511542, + "step": 24504 + }, + { + "epoch": 49.01, + "grad_norm": 1.0886911153793335, + "learning_rate": 2e-05, + "loss": 0.0437855, + "step": 24505 + }, + { + "epoch": 49.012, + "grad_norm": 1.0806195735931396, + "learning_rate": 2e-05, + "loss": 0.04402277, + "step": 24506 + }, + { + "epoch": 49.014, + "grad_norm": 0.97076815366745, + "learning_rate": 2e-05, + "loss": 0.04103847, + "step": 24507 + }, + { + "epoch": 49.016, + "grad_norm": 0.9942542910575867, + "learning_rate": 2e-05, + "loss": 0.03674922, + "step": 24508 + }, + { + "epoch": 49.018, + "grad_norm": 1.3719143867492676, + "learning_rate": 2e-05, + "loss": 0.04641896, + "step": 24509 + }, + { + "epoch": 49.02, + "grad_norm": 1.4681041240692139, + "learning_rate": 2e-05, + "loss": 0.05441123, + "step": 24510 + }, + { + "epoch": 49.022, + "grad_norm": 1.3180499076843262, + "learning_rate": 2e-05, + "loss": 0.05244955, + "step": 24511 + }, + { + "epoch": 49.024, + "grad_norm": 2.063997983932495, + "learning_rate": 2e-05, + "loss": 0.06359348, + "step": 24512 + }, + { + "epoch": 49.026, + "grad_norm": 1.1440930366516113, + "learning_rate": 2e-05, + "loss": 0.04446263, + "step": 24513 + }, + { + "epoch": 49.028, + "grad_norm": 1.2878350019454956, + "learning_rate": 2e-05, + "loss": 0.03576039, + "step": 24514 + }, + { + "epoch": 49.03, + "grad_norm": 0.9617430567741394, + "learning_rate": 2e-05, + "loss": 0.02875559, + "step": 24515 + }, + { + "epoch": 49.032, + "grad_norm": 1.2200411558151245, + "learning_rate": 2e-05, + "loss": 0.04824933, + "step": 24516 + }, + { + "epoch": 49.034, + "grad_norm": 1.4803850650787354, + "learning_rate": 2e-05, + "loss": 0.05635831, + "step": 24517 + }, + { + "epoch": 49.036, + "grad_norm": 1.9696398973464966, + "learning_rate": 2e-05, + "loss": 0.04805397, + "step": 24518 + }, + { + "epoch": 49.038, + "grad_norm": 1.0635892152786255, + "learning_rate": 2e-05, + "loss": 0.04648872, + "step": 24519 + }, + { + "epoch": 49.04, + "grad_norm": 1.0286033153533936, + "learning_rate": 2e-05, + "loss": 0.03219466, + "step": 24520 + }, + { + "epoch": 49.042, + "grad_norm": 1.2040108442306519, + "learning_rate": 2e-05, + "loss": 0.05577746, + "step": 24521 + }, + { + "epoch": 49.044, + "grad_norm": 1.2264634370803833, + "learning_rate": 2e-05, + "loss": 0.05810263, + "step": 24522 + }, + { + "epoch": 49.046, + "grad_norm": 1.1748919486999512, + "learning_rate": 2e-05, + "loss": 0.0435359, + "step": 24523 + }, + { + "epoch": 49.048, + "grad_norm": 1.623011589050293, + "learning_rate": 2e-05, + "loss": 0.06521875, + "step": 24524 + }, + { + "epoch": 49.05, + "grad_norm": 1.593483805656433, + "learning_rate": 2e-05, + "loss": 0.04450686, + "step": 24525 + }, + { + "epoch": 49.052, + "grad_norm": 1.0634883642196655, + "learning_rate": 2e-05, + "loss": 0.0392899, + "step": 24526 + }, + { + "epoch": 49.054, + "grad_norm": 0.962153971195221, + "learning_rate": 2e-05, + "loss": 0.03904355, + "step": 24527 + }, + { + "epoch": 49.056, + "grad_norm": 1.0543261766433716, + "learning_rate": 2e-05, + "loss": 0.03974721, + "step": 24528 + }, + { + "epoch": 49.058, + "grad_norm": 1.3880788087844849, + "learning_rate": 2e-05, + "loss": 0.04019085, + "step": 24529 + }, + { + "epoch": 49.06, + "grad_norm": 1.029956340789795, + "learning_rate": 2e-05, + "loss": 0.04097775, + "step": 24530 + 
}, + { + "epoch": 49.062, + "grad_norm": 1.2715673446655273, + "learning_rate": 2e-05, + "loss": 0.05796398, + "step": 24531 + }, + { + "epoch": 49.064, + "grad_norm": 1.1110681295394897, + "learning_rate": 2e-05, + "loss": 0.04320613, + "step": 24532 + }, + { + "epoch": 49.066, + "grad_norm": 1.2434399127960205, + "learning_rate": 2e-05, + "loss": 0.05722959, + "step": 24533 + }, + { + "epoch": 49.068, + "grad_norm": 1.0647788047790527, + "learning_rate": 2e-05, + "loss": 0.04631557, + "step": 24534 + }, + { + "epoch": 49.07, + "grad_norm": 1.1238845586776733, + "learning_rate": 2e-05, + "loss": 0.04434226, + "step": 24535 + }, + { + "epoch": 49.072, + "grad_norm": 2.4695746898651123, + "learning_rate": 2e-05, + "loss": 0.05840037, + "step": 24536 + }, + { + "epoch": 49.074, + "grad_norm": 1.5649980306625366, + "learning_rate": 2e-05, + "loss": 0.04456908, + "step": 24537 + }, + { + "epoch": 49.076, + "grad_norm": 1.1340800523757935, + "learning_rate": 2e-05, + "loss": 0.02922975, + "step": 24538 + }, + { + "epoch": 49.078, + "grad_norm": 1.6249136924743652, + "learning_rate": 2e-05, + "loss": 0.05591683, + "step": 24539 + }, + { + "epoch": 49.08, + "grad_norm": 1.0379282236099243, + "learning_rate": 2e-05, + "loss": 0.04590633, + "step": 24540 + }, + { + "epoch": 49.082, + "grad_norm": 1.1572468280792236, + "learning_rate": 2e-05, + "loss": 0.05310665, + "step": 24541 + }, + { + "epoch": 49.084, + "grad_norm": 1.3208953142166138, + "learning_rate": 2e-05, + "loss": 0.04799189, + "step": 24542 + }, + { + "epoch": 49.086, + "grad_norm": 1.1476277112960815, + "learning_rate": 2e-05, + "loss": 0.04075008, + "step": 24543 + }, + { + "epoch": 49.088, + "grad_norm": 1.445860505104065, + "learning_rate": 2e-05, + "loss": 0.04546492, + "step": 24544 + }, + { + "epoch": 49.09, + "grad_norm": 1.02455472946167, + "learning_rate": 2e-05, + "loss": 0.04289842, + "step": 24545 + }, + { + "epoch": 49.092, + "grad_norm": 1.0884724855422974, + "learning_rate": 2e-05, + "loss": 0.05313245, + "step": 24546 + }, + { + "epoch": 49.094, + "grad_norm": 1.2504005432128906, + "learning_rate": 2e-05, + "loss": 0.05837908, + "step": 24547 + }, + { + "epoch": 49.096, + "grad_norm": 1.050110101699829, + "learning_rate": 2e-05, + "loss": 0.04570572, + "step": 24548 + }, + { + "epoch": 49.098, + "grad_norm": 1.3742461204528809, + "learning_rate": 2e-05, + "loss": 0.04856823, + "step": 24549 + }, + { + "epoch": 49.1, + "grad_norm": 1.0349839925765991, + "learning_rate": 2e-05, + "loss": 0.04231741, + "step": 24550 + }, + { + "epoch": 49.102, + "grad_norm": 1.7095385789871216, + "learning_rate": 2e-05, + "loss": 0.05369782, + "step": 24551 + }, + { + "epoch": 49.104, + "grad_norm": 1.103822946548462, + "learning_rate": 2e-05, + "loss": 0.04244398, + "step": 24552 + }, + { + "epoch": 49.106, + "grad_norm": 1.2387681007385254, + "learning_rate": 2e-05, + "loss": 0.05892359, + "step": 24553 + }, + { + "epoch": 49.108, + "grad_norm": 1.1725491285324097, + "learning_rate": 2e-05, + "loss": 0.05246655, + "step": 24554 + }, + { + "epoch": 49.11, + "grad_norm": 1.0159834623336792, + "learning_rate": 2e-05, + "loss": 0.04580885, + "step": 24555 + }, + { + "epoch": 49.112, + "grad_norm": 1.1978039741516113, + "learning_rate": 2e-05, + "loss": 0.05257396, + "step": 24556 + }, + { + "epoch": 49.114, + "grad_norm": 1.2739366292953491, + "learning_rate": 2e-05, + "loss": 0.06457013, + "step": 24557 + }, + { + "epoch": 49.116, + "grad_norm": 1.2462983131408691, + "learning_rate": 2e-05, + "loss": 0.05257029, + "step": 24558 + }, + { + 
"epoch": 49.118, + "grad_norm": 1.0246502161026, + "learning_rate": 2e-05, + "loss": 0.03672457, + "step": 24559 + }, + { + "epoch": 49.12, + "grad_norm": 1.0243715047836304, + "learning_rate": 2e-05, + "loss": 0.03848898, + "step": 24560 + }, + { + "epoch": 49.122, + "grad_norm": 1.0890052318572998, + "learning_rate": 2e-05, + "loss": 0.04123619, + "step": 24561 + }, + { + "epoch": 49.124, + "grad_norm": 1.0428330898284912, + "learning_rate": 2e-05, + "loss": 0.04471182, + "step": 24562 + }, + { + "epoch": 49.126, + "grad_norm": 1.534914255142212, + "learning_rate": 2e-05, + "loss": 0.03503163, + "step": 24563 + }, + { + "epoch": 49.128, + "grad_norm": 3.263657808303833, + "learning_rate": 2e-05, + "loss": 0.06536023, + "step": 24564 + }, + { + "epoch": 49.13, + "grad_norm": 0.8561310768127441, + "learning_rate": 2e-05, + "loss": 0.02339423, + "step": 24565 + }, + { + "epoch": 49.132, + "grad_norm": 1.2668267488479614, + "learning_rate": 2e-05, + "loss": 0.05148039, + "step": 24566 + }, + { + "epoch": 49.134, + "grad_norm": 1.4245041608810425, + "learning_rate": 2e-05, + "loss": 0.04260141, + "step": 24567 + }, + { + "epoch": 49.136, + "grad_norm": 1.2553406953811646, + "learning_rate": 2e-05, + "loss": 0.05330418, + "step": 24568 + }, + { + "epoch": 49.138, + "grad_norm": 1.5819698572158813, + "learning_rate": 2e-05, + "loss": 0.0418847, + "step": 24569 + }, + { + "epoch": 49.14, + "grad_norm": 1.157503604888916, + "learning_rate": 2e-05, + "loss": 0.0456868, + "step": 24570 + }, + { + "epoch": 49.142, + "grad_norm": 1.1057533025741577, + "learning_rate": 2e-05, + "loss": 0.04578763, + "step": 24571 + }, + { + "epoch": 49.144, + "grad_norm": 1.3727549314498901, + "learning_rate": 2e-05, + "loss": 0.06042334, + "step": 24572 + }, + { + "epoch": 49.146, + "grad_norm": 1.2736660242080688, + "learning_rate": 2e-05, + "loss": 0.05840679, + "step": 24573 + }, + { + "epoch": 49.148, + "grad_norm": 1.6220810413360596, + "learning_rate": 2e-05, + "loss": 0.04685524, + "step": 24574 + }, + { + "epoch": 49.15, + "grad_norm": 1.0651113986968994, + "learning_rate": 2e-05, + "loss": 0.0451685, + "step": 24575 + }, + { + "epoch": 49.152, + "grad_norm": 1.3879947662353516, + "learning_rate": 2e-05, + "loss": 0.05266923, + "step": 24576 + }, + { + "epoch": 49.154, + "grad_norm": 1.164844036102295, + "learning_rate": 2e-05, + "loss": 0.04246272, + "step": 24577 + }, + { + "epoch": 49.156, + "grad_norm": 1.1619504690170288, + "learning_rate": 2e-05, + "loss": 0.04659127, + "step": 24578 + }, + { + "epoch": 49.158, + "grad_norm": 1.3078287839889526, + "learning_rate": 2e-05, + "loss": 0.04308105, + "step": 24579 + }, + { + "epoch": 49.16, + "grad_norm": 1.6431167125701904, + "learning_rate": 2e-05, + "loss": 0.04275879, + "step": 24580 + }, + { + "epoch": 49.162, + "grad_norm": 0.956802248954773, + "learning_rate": 2e-05, + "loss": 0.03798681, + "step": 24581 + }, + { + "epoch": 49.164, + "grad_norm": 1.2058719396591187, + "learning_rate": 2e-05, + "loss": 0.04575798, + "step": 24582 + }, + { + "epoch": 49.166, + "grad_norm": 0.9033730626106262, + "learning_rate": 2e-05, + "loss": 0.03232187, + "step": 24583 + }, + { + "epoch": 49.168, + "grad_norm": 1.3325138092041016, + "learning_rate": 2e-05, + "loss": 0.03885297, + "step": 24584 + }, + { + "epoch": 49.17, + "grad_norm": 1.1684311628341675, + "learning_rate": 2e-05, + "loss": 0.04936481, + "step": 24585 + }, + { + "epoch": 49.172, + "grad_norm": 0.9386561512947083, + "learning_rate": 2e-05, + "loss": 0.02889916, + "step": 24586 + }, + { + "epoch": 
49.174, + "grad_norm": 1.115565538406372, + "learning_rate": 2e-05, + "loss": 0.04214649, + "step": 24587 + }, + { + "epoch": 49.176, + "grad_norm": 1.8959245681762695, + "learning_rate": 2e-05, + "loss": 0.05413285, + "step": 24588 + }, + { + "epoch": 49.178, + "grad_norm": 1.2707425355911255, + "learning_rate": 2e-05, + "loss": 0.04919063, + "step": 24589 + }, + { + "epoch": 49.18, + "grad_norm": 1.1527551412582397, + "learning_rate": 2e-05, + "loss": 0.04787771, + "step": 24590 + }, + { + "epoch": 49.182, + "grad_norm": 1.5040851831436157, + "learning_rate": 2e-05, + "loss": 0.0654645, + "step": 24591 + }, + { + "epoch": 49.184, + "grad_norm": 1.2260459661483765, + "learning_rate": 2e-05, + "loss": 0.04879155, + "step": 24592 + }, + { + "epoch": 49.186, + "grad_norm": 1.2306815385818481, + "learning_rate": 2e-05, + "loss": 0.03491394, + "step": 24593 + }, + { + "epoch": 49.188, + "grad_norm": 1.220257043838501, + "learning_rate": 2e-05, + "loss": 0.04896056, + "step": 24594 + }, + { + "epoch": 49.19, + "grad_norm": 1.178658366203308, + "learning_rate": 2e-05, + "loss": 0.03527681, + "step": 24595 + }, + { + "epoch": 49.192, + "grad_norm": 1.795593023300171, + "learning_rate": 2e-05, + "loss": 0.07905695, + "step": 24596 + }, + { + "epoch": 49.194, + "grad_norm": 1.1190385818481445, + "learning_rate": 2e-05, + "loss": 0.04868895, + "step": 24597 + }, + { + "epoch": 49.196, + "grad_norm": 1.0901833772659302, + "learning_rate": 2e-05, + "loss": 0.04650778, + "step": 24598 + }, + { + "epoch": 49.198, + "grad_norm": 1.1784275770187378, + "learning_rate": 2e-05, + "loss": 0.03697899, + "step": 24599 + }, + { + "epoch": 49.2, + "grad_norm": 0.9963816404342651, + "learning_rate": 2e-05, + "loss": 0.04271566, + "step": 24600 + }, + { + "epoch": 49.202, + "grad_norm": 1.3513808250427246, + "learning_rate": 2e-05, + "loss": 0.03827177, + "step": 24601 + }, + { + "epoch": 49.204, + "grad_norm": 1.297082781791687, + "learning_rate": 2e-05, + "loss": 0.05176802, + "step": 24602 + }, + { + "epoch": 49.206, + "grad_norm": 1.3902502059936523, + "learning_rate": 2e-05, + "loss": 0.04333564, + "step": 24603 + }, + { + "epoch": 49.208, + "grad_norm": 1.5116182565689087, + "learning_rate": 2e-05, + "loss": 0.06067524, + "step": 24604 + }, + { + "epoch": 49.21, + "grad_norm": 1.116183876991272, + "learning_rate": 2e-05, + "loss": 0.04102014, + "step": 24605 + }, + { + "epoch": 49.212, + "grad_norm": 1.3426848649978638, + "learning_rate": 2e-05, + "loss": 0.06735122, + "step": 24606 + }, + { + "epoch": 49.214, + "grad_norm": 1.37204110622406, + "learning_rate": 2e-05, + "loss": 0.04117677, + "step": 24607 + }, + { + "epoch": 49.216, + "grad_norm": 1.1000261306762695, + "learning_rate": 2e-05, + "loss": 0.03981378, + "step": 24608 + }, + { + "epoch": 49.218, + "grad_norm": 0.9247246980667114, + "learning_rate": 2e-05, + "loss": 0.02853769, + "step": 24609 + }, + { + "epoch": 49.22, + "grad_norm": 1.3698744773864746, + "learning_rate": 2e-05, + "loss": 0.05477355, + "step": 24610 + }, + { + "epoch": 49.222, + "grad_norm": 7.3837504386901855, + "learning_rate": 2e-05, + "loss": 0.06142946, + "step": 24611 + }, + { + "epoch": 49.224, + "grad_norm": 1.7267507314682007, + "learning_rate": 2e-05, + "loss": 0.05547143, + "step": 24612 + }, + { + "epoch": 49.226, + "grad_norm": 1.1767046451568604, + "learning_rate": 2e-05, + "loss": 0.05019828, + "step": 24613 + }, + { + "epoch": 49.228, + "grad_norm": 1.5185580253601074, + "learning_rate": 2e-05, + "loss": 0.05122478, + "step": 24614 + }, + { + "epoch": 49.23, + 
"grad_norm": 1.286940097808838, + "learning_rate": 2e-05, + "loss": 0.05315614, + "step": 24615 + }, + { + "epoch": 49.232, + "grad_norm": 1.2825911045074463, + "learning_rate": 2e-05, + "loss": 0.04709144, + "step": 24616 + }, + { + "epoch": 49.234, + "grad_norm": 1.1364567279815674, + "learning_rate": 2e-05, + "loss": 0.06106904, + "step": 24617 + }, + { + "epoch": 49.236, + "grad_norm": 1.2616055011749268, + "learning_rate": 2e-05, + "loss": 0.05195578, + "step": 24618 + }, + { + "epoch": 49.238, + "grad_norm": 1.2682206630706787, + "learning_rate": 2e-05, + "loss": 0.0589853, + "step": 24619 + }, + { + "epoch": 49.24, + "grad_norm": 1.1738789081573486, + "learning_rate": 2e-05, + "loss": 0.04862665, + "step": 24620 + }, + { + "epoch": 49.242, + "grad_norm": 1.5461301803588867, + "learning_rate": 2e-05, + "loss": 0.05598284, + "step": 24621 + }, + { + "epoch": 49.244, + "grad_norm": 1.336150050163269, + "learning_rate": 2e-05, + "loss": 0.05503397, + "step": 24622 + }, + { + "epoch": 49.246, + "grad_norm": 1.2071928977966309, + "learning_rate": 2e-05, + "loss": 0.05489289, + "step": 24623 + }, + { + "epoch": 49.248, + "grad_norm": 1.186057448387146, + "learning_rate": 2e-05, + "loss": 0.04869892, + "step": 24624 + }, + { + "epoch": 49.25, + "grad_norm": 1.1905598640441895, + "learning_rate": 2e-05, + "loss": 0.04225285, + "step": 24625 + }, + { + "epoch": 49.252, + "grad_norm": 1.1007153987884521, + "learning_rate": 2e-05, + "loss": 0.05079801, + "step": 24626 + }, + { + "epoch": 49.254, + "grad_norm": 1.4382884502410889, + "learning_rate": 2e-05, + "loss": 0.03811382, + "step": 24627 + }, + { + "epoch": 49.256, + "grad_norm": 1.2123656272888184, + "learning_rate": 2e-05, + "loss": 0.05187578, + "step": 24628 + }, + { + "epoch": 49.258, + "grad_norm": 1.222488284111023, + "learning_rate": 2e-05, + "loss": 0.05206077, + "step": 24629 + }, + { + "epoch": 49.26, + "grad_norm": 1.4503012895584106, + "learning_rate": 2e-05, + "loss": 0.04863203, + "step": 24630 + }, + { + "epoch": 49.262, + "grad_norm": 1.278603196144104, + "learning_rate": 2e-05, + "loss": 0.05332947, + "step": 24631 + }, + { + "epoch": 49.264, + "grad_norm": 1.2204537391662598, + "learning_rate": 2e-05, + "loss": 0.0511545, + "step": 24632 + }, + { + "epoch": 49.266, + "grad_norm": 1.0226802825927734, + "learning_rate": 2e-05, + "loss": 0.0365659, + "step": 24633 + }, + { + "epoch": 49.268, + "grad_norm": 0.9728658199310303, + "learning_rate": 2e-05, + "loss": 0.03656091, + "step": 24634 + }, + { + "epoch": 49.27, + "grad_norm": 1.2572722434997559, + "learning_rate": 2e-05, + "loss": 0.05211511, + "step": 24635 + }, + { + "epoch": 49.272, + "grad_norm": 1.153205394744873, + "learning_rate": 2e-05, + "loss": 0.06159973, + "step": 24636 + }, + { + "epoch": 49.274, + "grad_norm": 1.1002930402755737, + "learning_rate": 2e-05, + "loss": 0.04466134, + "step": 24637 + }, + { + "epoch": 49.276, + "grad_norm": 1.2222155332565308, + "learning_rate": 2e-05, + "loss": 0.05036249, + "step": 24638 + }, + { + "epoch": 49.278, + "grad_norm": 3.585354804992676, + "learning_rate": 2e-05, + "loss": 0.03442348, + "step": 24639 + }, + { + "epoch": 49.28, + "grad_norm": 1.2959743738174438, + "learning_rate": 2e-05, + "loss": 0.05108596, + "step": 24640 + }, + { + "epoch": 49.282, + "grad_norm": 1.3540737628936768, + "learning_rate": 2e-05, + "loss": 0.05969388, + "step": 24641 + }, + { + "epoch": 49.284, + "grad_norm": 1.276310920715332, + "learning_rate": 2e-05, + "loss": 0.05358229, + "step": 24642 + }, + { + "epoch": 49.286, + "grad_norm": 
1.1716383695602417, + "learning_rate": 2e-05, + "loss": 0.03722314, + "step": 24643 + }, + { + "epoch": 49.288, + "grad_norm": 1.0765808820724487, + "learning_rate": 2e-05, + "loss": 0.03878718, + "step": 24644 + }, + { + "epoch": 49.29, + "grad_norm": 1.1529293060302734, + "learning_rate": 2e-05, + "loss": 0.05101361, + "step": 24645 + }, + { + "epoch": 49.292, + "grad_norm": 1.209089994430542, + "learning_rate": 2e-05, + "loss": 0.03981159, + "step": 24646 + }, + { + "epoch": 49.294, + "grad_norm": 1.1552914381027222, + "learning_rate": 2e-05, + "loss": 0.03878843, + "step": 24647 + }, + { + "epoch": 49.296, + "grad_norm": 3.3828413486480713, + "learning_rate": 2e-05, + "loss": 0.06451204, + "step": 24648 + }, + { + "epoch": 49.298, + "grad_norm": 1.4841753244400024, + "learning_rate": 2e-05, + "loss": 0.04134528, + "step": 24649 + }, + { + "epoch": 49.3, + "grad_norm": 1.1813883781433105, + "learning_rate": 2e-05, + "loss": 0.04356892, + "step": 24650 + }, + { + "epoch": 49.302, + "grad_norm": 1.3516215085983276, + "learning_rate": 2e-05, + "loss": 0.06003821, + "step": 24651 + }, + { + "epoch": 49.304, + "grad_norm": 1.0883997678756714, + "learning_rate": 2e-05, + "loss": 0.04874385, + "step": 24652 + }, + { + "epoch": 49.306, + "grad_norm": 1.3258439302444458, + "learning_rate": 2e-05, + "loss": 0.06162676, + "step": 24653 + }, + { + "epoch": 49.308, + "grad_norm": 0.9306918382644653, + "learning_rate": 2e-05, + "loss": 0.04051726, + "step": 24654 + }, + { + "epoch": 49.31, + "grad_norm": 1.3069220781326294, + "learning_rate": 2e-05, + "loss": 0.06446021, + "step": 24655 + }, + { + "epoch": 49.312, + "grad_norm": 1.4380950927734375, + "learning_rate": 2e-05, + "loss": 0.06534431, + "step": 24656 + }, + { + "epoch": 49.314, + "grad_norm": 1.293108582496643, + "learning_rate": 2e-05, + "loss": 0.05122873, + "step": 24657 + }, + { + "epoch": 49.316, + "grad_norm": 1.5020869970321655, + "learning_rate": 2e-05, + "loss": 0.04365094, + "step": 24658 + }, + { + "epoch": 49.318, + "grad_norm": 1.0571807622909546, + "learning_rate": 2e-05, + "loss": 0.04433767, + "step": 24659 + }, + { + "epoch": 49.32, + "grad_norm": 1.6660947799682617, + "learning_rate": 2e-05, + "loss": 0.05386006, + "step": 24660 + }, + { + "epoch": 49.322, + "grad_norm": 1.2687824964523315, + "learning_rate": 2e-05, + "loss": 0.05124846, + "step": 24661 + }, + { + "epoch": 49.324, + "grad_norm": 1.3088194131851196, + "learning_rate": 2e-05, + "loss": 0.05232558, + "step": 24662 + }, + { + "epoch": 49.326, + "grad_norm": 1.0739314556121826, + "learning_rate": 2e-05, + "loss": 0.04608125, + "step": 24663 + }, + { + "epoch": 49.328, + "grad_norm": 1.3387324810028076, + "learning_rate": 2e-05, + "loss": 0.0492764, + "step": 24664 + }, + { + "epoch": 49.33, + "grad_norm": 1.2844595909118652, + "learning_rate": 2e-05, + "loss": 0.04417478, + "step": 24665 + }, + { + "epoch": 49.332, + "grad_norm": 1.1230336427688599, + "learning_rate": 2e-05, + "loss": 0.04758064, + "step": 24666 + }, + { + "epoch": 49.334, + "grad_norm": 5.419571876525879, + "learning_rate": 2e-05, + "loss": 0.04511326, + "step": 24667 + }, + { + "epoch": 49.336, + "grad_norm": 1.1522164344787598, + "learning_rate": 2e-05, + "loss": 0.04697948, + "step": 24668 + }, + { + "epoch": 49.338, + "grad_norm": 1.43909752368927, + "learning_rate": 2e-05, + "loss": 0.05124324, + "step": 24669 + }, + { + "epoch": 49.34, + "grad_norm": 1.290650486946106, + "learning_rate": 2e-05, + "loss": 0.03891211, + "step": 24670 + }, + { + "epoch": 49.342, + "grad_norm": 
1.0777543783187866, + "learning_rate": 2e-05, + "loss": 0.03897686, + "step": 24671 + }, + { + "epoch": 49.344, + "grad_norm": 1.2240030765533447, + "learning_rate": 2e-05, + "loss": 0.04273064, + "step": 24672 + }, + { + "epoch": 49.346, + "grad_norm": 1.2230967283248901, + "learning_rate": 2e-05, + "loss": 0.05533133, + "step": 24673 + }, + { + "epoch": 49.348, + "grad_norm": 1.7442550659179688, + "learning_rate": 2e-05, + "loss": 0.04444986, + "step": 24674 + }, + { + "epoch": 49.35, + "grad_norm": 1.193528413772583, + "learning_rate": 2e-05, + "loss": 0.02971295, + "step": 24675 + }, + { + "epoch": 49.352, + "grad_norm": 1.548940896987915, + "learning_rate": 2e-05, + "loss": 0.05257022, + "step": 24676 + }, + { + "epoch": 49.354, + "grad_norm": 1.4281851053237915, + "learning_rate": 2e-05, + "loss": 0.05808748, + "step": 24677 + }, + { + "epoch": 49.356, + "grad_norm": 1.1668903827667236, + "learning_rate": 2e-05, + "loss": 0.039745, + "step": 24678 + }, + { + "epoch": 49.358, + "grad_norm": 2.4779348373413086, + "learning_rate": 2e-05, + "loss": 0.04287686, + "step": 24679 + }, + { + "epoch": 49.36, + "grad_norm": 1.0857728719711304, + "learning_rate": 2e-05, + "loss": 0.03546745, + "step": 24680 + }, + { + "epoch": 49.362, + "grad_norm": 1.758777141571045, + "learning_rate": 2e-05, + "loss": 0.03802776, + "step": 24681 + }, + { + "epoch": 49.364, + "grad_norm": 0.9436732530593872, + "learning_rate": 2e-05, + "loss": 0.03220242, + "step": 24682 + }, + { + "epoch": 49.366, + "grad_norm": 1.0815670490264893, + "learning_rate": 2e-05, + "loss": 0.04280725, + "step": 24683 + }, + { + "epoch": 49.368, + "grad_norm": 1.114959955215454, + "learning_rate": 2e-05, + "loss": 0.04519267, + "step": 24684 + }, + { + "epoch": 49.37, + "grad_norm": 1.2002875804901123, + "learning_rate": 2e-05, + "loss": 0.04737946, + "step": 24685 + }, + { + "epoch": 49.372, + "grad_norm": 1.381123661994934, + "learning_rate": 2e-05, + "loss": 0.0593003, + "step": 24686 + }, + { + "epoch": 49.374, + "grad_norm": 2.0842089653015137, + "learning_rate": 2e-05, + "loss": 0.0516322, + "step": 24687 + }, + { + "epoch": 49.376, + "grad_norm": 1.0045135021209717, + "learning_rate": 2e-05, + "loss": 0.03573417, + "step": 24688 + }, + { + "epoch": 49.378, + "grad_norm": 1.3091294765472412, + "learning_rate": 2e-05, + "loss": 0.04659965, + "step": 24689 + }, + { + "epoch": 49.38, + "grad_norm": 1.3777577877044678, + "learning_rate": 2e-05, + "loss": 0.0343422, + "step": 24690 + }, + { + "epoch": 49.382, + "grad_norm": 1.5019209384918213, + "learning_rate": 2e-05, + "loss": 0.05598423, + "step": 24691 + }, + { + "epoch": 49.384, + "grad_norm": 1.168333888053894, + "learning_rate": 2e-05, + "loss": 0.04606739, + "step": 24692 + }, + { + "epoch": 49.386, + "grad_norm": 1.3140724897384644, + "learning_rate": 2e-05, + "loss": 0.05491744, + "step": 24693 + }, + { + "epoch": 49.388, + "grad_norm": 1.1507341861724854, + "learning_rate": 2e-05, + "loss": 0.04055898, + "step": 24694 + }, + { + "epoch": 49.39, + "grad_norm": 1.2338950634002686, + "learning_rate": 2e-05, + "loss": 0.04385046, + "step": 24695 + }, + { + "epoch": 49.392, + "grad_norm": 1.2507073879241943, + "learning_rate": 2e-05, + "loss": 0.06624331, + "step": 24696 + }, + { + "epoch": 49.394, + "grad_norm": 1.172641634941101, + "learning_rate": 2e-05, + "loss": 0.05282669, + "step": 24697 + }, + { + "epoch": 49.396, + "grad_norm": 1.290877103805542, + "learning_rate": 2e-05, + "loss": 0.0552482, + "step": 24698 + }, + { + "epoch": 49.398, + "grad_norm": 
1.3046849966049194, + "learning_rate": 2e-05, + "loss": 0.05484886, + "step": 24699 + }, + { + "epoch": 49.4, + "grad_norm": 1.0632675886154175, + "learning_rate": 2e-05, + "loss": 0.04052684, + "step": 24700 + }, + { + "epoch": 49.402, + "grad_norm": 1.0683989524841309, + "learning_rate": 2e-05, + "loss": 0.04582777, + "step": 24701 + }, + { + "epoch": 49.404, + "grad_norm": 1.6077747344970703, + "learning_rate": 2e-05, + "loss": 0.04787633, + "step": 24702 + }, + { + "epoch": 49.406, + "grad_norm": 1.207194209098816, + "learning_rate": 2e-05, + "loss": 0.05000635, + "step": 24703 + }, + { + "epoch": 49.408, + "grad_norm": 1.1203255653381348, + "learning_rate": 2e-05, + "loss": 0.05599294, + "step": 24704 + }, + { + "epoch": 49.41, + "grad_norm": 1.4439189434051514, + "learning_rate": 2e-05, + "loss": 0.04717576, + "step": 24705 + }, + { + "epoch": 49.412, + "grad_norm": 1.1799260377883911, + "learning_rate": 2e-05, + "loss": 0.04708055, + "step": 24706 + }, + { + "epoch": 49.414, + "grad_norm": 1.2697405815124512, + "learning_rate": 2e-05, + "loss": 0.05073357, + "step": 24707 + }, + { + "epoch": 49.416, + "grad_norm": 1.1728367805480957, + "learning_rate": 2e-05, + "loss": 0.05580144, + "step": 24708 + }, + { + "epoch": 49.418, + "grad_norm": 1.9259074926376343, + "learning_rate": 2e-05, + "loss": 0.04573488, + "step": 24709 + }, + { + "epoch": 49.42, + "grad_norm": 1.285468578338623, + "learning_rate": 2e-05, + "loss": 0.06362525, + "step": 24710 + }, + { + "epoch": 49.422, + "grad_norm": 1.0827348232269287, + "learning_rate": 2e-05, + "loss": 0.0446627, + "step": 24711 + }, + { + "epoch": 49.424, + "grad_norm": 1.354174256324768, + "learning_rate": 2e-05, + "loss": 0.06585463, + "step": 24712 + }, + { + "epoch": 49.426, + "grad_norm": 1.2439528703689575, + "learning_rate": 2e-05, + "loss": 0.04843699, + "step": 24713 + }, + { + "epoch": 49.428, + "grad_norm": 1.024624228477478, + "learning_rate": 2e-05, + "loss": 0.04217855, + "step": 24714 + }, + { + "epoch": 49.43, + "grad_norm": 1.0700979232788086, + "learning_rate": 2e-05, + "loss": 0.04749613, + "step": 24715 + }, + { + "epoch": 49.432, + "grad_norm": 1.496504306793213, + "learning_rate": 2e-05, + "loss": 0.0489388, + "step": 24716 + }, + { + "epoch": 49.434, + "grad_norm": 1.1693992614746094, + "learning_rate": 2e-05, + "loss": 0.04739952, + "step": 24717 + }, + { + "epoch": 49.436, + "grad_norm": 1.1603397130966187, + "learning_rate": 2e-05, + "loss": 0.04808056, + "step": 24718 + }, + { + "epoch": 49.438, + "grad_norm": 1.1455094814300537, + "learning_rate": 2e-05, + "loss": 0.05102035, + "step": 24719 + }, + { + "epoch": 49.44, + "grad_norm": 1.1179344654083252, + "learning_rate": 2e-05, + "loss": 0.04601201, + "step": 24720 + }, + { + "epoch": 49.442, + "grad_norm": 1.0473073720932007, + "learning_rate": 2e-05, + "loss": 0.04930402, + "step": 24721 + }, + { + "epoch": 49.444, + "grad_norm": 1.236930251121521, + "learning_rate": 2e-05, + "loss": 0.04765532, + "step": 24722 + }, + { + "epoch": 49.446, + "grad_norm": 1.2558659315109253, + "learning_rate": 2e-05, + "loss": 0.04867148, + "step": 24723 + }, + { + "epoch": 49.448, + "grad_norm": 1.1845966577529907, + "learning_rate": 2e-05, + "loss": 0.04930171, + "step": 24724 + }, + { + "epoch": 49.45, + "grad_norm": 1.112603783607483, + "learning_rate": 2e-05, + "loss": 0.04704773, + "step": 24725 + }, + { + "epoch": 49.452, + "grad_norm": 1.188025712966919, + "learning_rate": 2e-05, + "loss": 0.04383368, + "step": 24726 + }, + { + "epoch": 49.454, + "grad_norm": 
1.1775012016296387, + "learning_rate": 2e-05, + "loss": 0.04882312, + "step": 24727 + }, + { + "epoch": 49.456, + "grad_norm": 1.2298845052719116, + "learning_rate": 2e-05, + "loss": 0.04555701, + "step": 24728 + }, + { + "epoch": 49.458, + "grad_norm": 1.373660683631897, + "learning_rate": 2e-05, + "loss": 0.05526736, + "step": 24729 + }, + { + "epoch": 49.46, + "grad_norm": 1.3296241760253906, + "learning_rate": 2e-05, + "loss": 0.06211912, + "step": 24730 + }, + { + "epoch": 49.462, + "grad_norm": 1.1016606092453003, + "learning_rate": 2e-05, + "loss": 0.05170915, + "step": 24731 + }, + { + "epoch": 49.464, + "grad_norm": 1.3367371559143066, + "learning_rate": 2e-05, + "loss": 0.04138402, + "step": 24732 + }, + { + "epoch": 49.466, + "grad_norm": 3.0442302227020264, + "learning_rate": 2e-05, + "loss": 0.07401902, + "step": 24733 + }, + { + "epoch": 49.468, + "grad_norm": 1.304378867149353, + "learning_rate": 2e-05, + "loss": 0.04841861, + "step": 24734 + }, + { + "epoch": 49.47, + "grad_norm": 1.211232304573059, + "learning_rate": 2e-05, + "loss": 0.04923613, + "step": 24735 + }, + { + "epoch": 49.472, + "grad_norm": 1.0207058191299438, + "learning_rate": 2e-05, + "loss": 0.03922756, + "step": 24736 + }, + { + "epoch": 49.474, + "grad_norm": 1.7041534185409546, + "learning_rate": 2e-05, + "loss": 0.04910336, + "step": 24737 + }, + { + "epoch": 49.476, + "grad_norm": 1.1709401607513428, + "learning_rate": 2e-05, + "loss": 0.04106373, + "step": 24738 + }, + { + "epoch": 49.478, + "grad_norm": 2.144960880279541, + "learning_rate": 2e-05, + "loss": 0.05597088, + "step": 24739 + }, + { + "epoch": 49.48, + "grad_norm": 1.052988052368164, + "learning_rate": 2e-05, + "loss": 0.04293213, + "step": 24740 + }, + { + "epoch": 49.482, + "grad_norm": 1.4180502891540527, + "learning_rate": 2e-05, + "loss": 0.04159679, + "step": 24741 + }, + { + "epoch": 49.484, + "grad_norm": 1.2406322956085205, + "learning_rate": 2e-05, + "loss": 0.05517141, + "step": 24742 + }, + { + "epoch": 49.486, + "grad_norm": 1.1686590909957886, + "learning_rate": 2e-05, + "loss": 0.04570251, + "step": 24743 + }, + { + "epoch": 49.488, + "grad_norm": 1.2575324773788452, + "learning_rate": 2e-05, + "loss": 0.03278139, + "step": 24744 + }, + { + "epoch": 49.49, + "grad_norm": 1.0804585218429565, + "learning_rate": 2e-05, + "loss": 0.03636895, + "step": 24745 + }, + { + "epoch": 49.492, + "grad_norm": 1.3195343017578125, + "learning_rate": 2e-05, + "loss": 0.05228486, + "step": 24746 + }, + { + "epoch": 49.494, + "grad_norm": 1.1343023777008057, + "learning_rate": 2e-05, + "loss": 0.04752772, + "step": 24747 + }, + { + "epoch": 49.496, + "grad_norm": 1.1542292833328247, + "learning_rate": 2e-05, + "loss": 0.04346114, + "step": 24748 + }, + { + "epoch": 49.498, + "grad_norm": 1.0267248153686523, + "learning_rate": 2e-05, + "loss": 0.04148855, + "step": 24749 + }, + { + "epoch": 49.5, + "grad_norm": 1.197594404220581, + "learning_rate": 2e-05, + "loss": 0.05338103, + "step": 24750 + }, + { + "epoch": 49.502, + "grad_norm": 1.3752979040145874, + "learning_rate": 2e-05, + "loss": 0.05251706, + "step": 24751 + }, + { + "epoch": 49.504, + "grad_norm": 2.483018636703491, + "learning_rate": 2e-05, + "loss": 0.05616256, + "step": 24752 + }, + { + "epoch": 49.506, + "grad_norm": 0.9340147972106934, + "learning_rate": 2e-05, + "loss": 0.0262505, + "step": 24753 + }, + { + "epoch": 49.508, + "grad_norm": 1.1258265972137451, + "learning_rate": 2e-05, + "loss": 0.03799858, + "step": 24754 + }, + { + "epoch": 49.51, + "grad_norm": 
1.6418697834014893, + "learning_rate": 2e-05, + "loss": 0.06251703, + "step": 24755 + }, + { + "epoch": 49.512, + "grad_norm": 1.4351956844329834, + "learning_rate": 2e-05, + "loss": 0.04956281, + "step": 24756 + }, + { + "epoch": 49.514, + "grad_norm": 1.3971424102783203, + "learning_rate": 2e-05, + "loss": 0.07184118, + "step": 24757 + }, + { + "epoch": 49.516, + "grad_norm": 1.242491602897644, + "learning_rate": 2e-05, + "loss": 0.04963819, + "step": 24758 + }, + { + "epoch": 49.518, + "grad_norm": 1.8168903589248657, + "learning_rate": 2e-05, + "loss": 0.05463743, + "step": 24759 + }, + { + "epoch": 49.52, + "grad_norm": 1.0832699537277222, + "learning_rate": 2e-05, + "loss": 0.04424177, + "step": 24760 + }, + { + "epoch": 49.522, + "grad_norm": 0.9689600467681885, + "learning_rate": 2e-05, + "loss": 0.04252536, + "step": 24761 + }, + { + "epoch": 49.524, + "grad_norm": 1.111344337463379, + "learning_rate": 2e-05, + "loss": 0.04760269, + "step": 24762 + }, + { + "epoch": 49.526, + "grad_norm": 1.120718240737915, + "learning_rate": 2e-05, + "loss": 0.04254458, + "step": 24763 + }, + { + "epoch": 49.528, + "grad_norm": 1.1055079698562622, + "learning_rate": 2e-05, + "loss": 0.04112123, + "step": 24764 + }, + { + "epoch": 49.53, + "grad_norm": 1.8217605352401733, + "learning_rate": 2e-05, + "loss": 0.05603817, + "step": 24765 + }, + { + "epoch": 49.532, + "grad_norm": 1.008670449256897, + "learning_rate": 2e-05, + "loss": 0.03235183, + "step": 24766 + }, + { + "epoch": 49.534, + "grad_norm": 1.1165664196014404, + "learning_rate": 2e-05, + "loss": 0.0455579, + "step": 24767 + }, + { + "epoch": 49.536, + "grad_norm": 1.0880547761917114, + "learning_rate": 2e-05, + "loss": 0.04338773, + "step": 24768 + }, + { + "epoch": 49.538, + "grad_norm": 0.9217278361320496, + "learning_rate": 2e-05, + "loss": 0.03157655, + "step": 24769 + }, + { + "epoch": 49.54, + "grad_norm": 1.095906376838684, + "learning_rate": 2e-05, + "loss": 0.04428188, + "step": 24770 + }, + { + "epoch": 49.542, + "grad_norm": 1.5616658926010132, + "learning_rate": 2e-05, + "loss": 0.03784732, + "step": 24771 + }, + { + "epoch": 49.544, + "grad_norm": 1.0943976640701294, + "learning_rate": 2e-05, + "loss": 0.04282872, + "step": 24772 + }, + { + "epoch": 49.546, + "grad_norm": 1.0541517734527588, + "learning_rate": 2e-05, + "loss": 0.04491699, + "step": 24773 + }, + { + "epoch": 49.548, + "grad_norm": 3.1224377155303955, + "learning_rate": 2e-05, + "loss": 0.04894248, + "step": 24774 + }, + { + "epoch": 49.55, + "grad_norm": 1.100438117980957, + "learning_rate": 2e-05, + "loss": 0.04886558, + "step": 24775 + }, + { + "epoch": 49.552, + "grad_norm": 1.7008203268051147, + "learning_rate": 2e-05, + "loss": 0.03542287, + "step": 24776 + }, + { + "epoch": 49.554, + "grad_norm": 1.2163397073745728, + "learning_rate": 2e-05, + "loss": 0.05112882, + "step": 24777 + }, + { + "epoch": 49.556, + "grad_norm": 1.178572177886963, + "learning_rate": 2e-05, + "loss": 0.05226102, + "step": 24778 + }, + { + "epoch": 49.558, + "grad_norm": 1.07505202293396, + "learning_rate": 2e-05, + "loss": 0.04829258, + "step": 24779 + }, + { + "epoch": 49.56, + "grad_norm": 1.545498013496399, + "learning_rate": 2e-05, + "loss": 0.05802028, + "step": 24780 + }, + { + "epoch": 49.562, + "grad_norm": 1.0516809225082397, + "learning_rate": 2e-05, + "loss": 0.03485567, + "step": 24781 + }, + { + "epoch": 49.564, + "grad_norm": 1.0159010887145996, + "learning_rate": 2e-05, + "loss": 0.04023987, + "step": 24782 + }, + { + "epoch": 49.566, + "grad_norm": 
1.1692757606506348, + "learning_rate": 2e-05, + "loss": 0.04836097, + "step": 24783 + }, + { + "epoch": 49.568, + "grad_norm": 1.2164714336395264, + "learning_rate": 2e-05, + "loss": 0.05162116, + "step": 24784 + }, + { + "epoch": 49.57, + "grad_norm": 1.0095146894454956, + "learning_rate": 2e-05, + "loss": 0.04396236, + "step": 24785 + }, + { + "epoch": 49.572, + "grad_norm": 1.3631678819656372, + "learning_rate": 2e-05, + "loss": 0.05843265, + "step": 24786 + }, + { + "epoch": 49.574, + "grad_norm": 1.3029148578643799, + "learning_rate": 2e-05, + "loss": 0.04724909, + "step": 24787 + }, + { + "epoch": 49.576, + "grad_norm": 1.3042387962341309, + "learning_rate": 2e-05, + "loss": 0.04229445, + "step": 24788 + }, + { + "epoch": 49.578, + "grad_norm": 1.0347017049789429, + "learning_rate": 2e-05, + "loss": 0.03812275, + "step": 24789 + }, + { + "epoch": 49.58, + "grad_norm": 1.7310672998428345, + "learning_rate": 2e-05, + "loss": 0.06088685, + "step": 24790 + }, + { + "epoch": 49.582, + "grad_norm": 0.9168398976325989, + "learning_rate": 2e-05, + "loss": 0.02849315, + "step": 24791 + }, + { + "epoch": 49.584, + "grad_norm": 1.1964490413665771, + "learning_rate": 2e-05, + "loss": 0.0535724, + "step": 24792 + }, + { + "epoch": 49.586, + "grad_norm": 1.1546590328216553, + "learning_rate": 2e-05, + "loss": 0.03581116, + "step": 24793 + }, + { + "epoch": 49.588, + "grad_norm": 1.0855379104614258, + "learning_rate": 2e-05, + "loss": 0.05509954, + "step": 24794 + }, + { + "epoch": 49.59, + "grad_norm": 1.2587318420410156, + "learning_rate": 2e-05, + "loss": 0.04235309, + "step": 24795 + }, + { + "epoch": 49.592, + "grad_norm": 1.2183408737182617, + "learning_rate": 2e-05, + "loss": 0.05718917, + "step": 24796 + }, + { + "epoch": 49.594, + "grad_norm": 1.1804263591766357, + "learning_rate": 2e-05, + "loss": 0.04849926, + "step": 24797 + }, + { + "epoch": 49.596, + "grad_norm": 1.4819769859313965, + "learning_rate": 2e-05, + "loss": 0.05709495, + "step": 24798 + }, + { + "epoch": 49.598, + "grad_norm": 1.1413755416870117, + "learning_rate": 2e-05, + "loss": 0.0554666, + "step": 24799 + }, + { + "epoch": 49.6, + "grad_norm": 1.1274505853652954, + "learning_rate": 2e-05, + "loss": 0.04003352, + "step": 24800 + }, + { + "epoch": 49.602, + "grad_norm": 1.2199150323867798, + "learning_rate": 2e-05, + "loss": 0.04663592, + "step": 24801 + }, + { + "epoch": 49.604, + "grad_norm": 1.1105495691299438, + "learning_rate": 2e-05, + "loss": 0.04194749, + "step": 24802 + }, + { + "epoch": 49.606, + "grad_norm": 1.1821727752685547, + "learning_rate": 2e-05, + "loss": 0.05207427, + "step": 24803 + }, + { + "epoch": 49.608, + "grad_norm": 1.0401041507720947, + "learning_rate": 2e-05, + "loss": 0.04428156, + "step": 24804 + }, + { + "epoch": 49.61, + "grad_norm": 1.840406060218811, + "learning_rate": 2e-05, + "loss": 0.05603269, + "step": 24805 + }, + { + "epoch": 49.612, + "grad_norm": 1.3533574342727661, + "learning_rate": 2e-05, + "loss": 0.03703146, + "step": 24806 + }, + { + "epoch": 49.614, + "grad_norm": 1.0640056133270264, + "learning_rate": 2e-05, + "loss": 0.03546765, + "step": 24807 + }, + { + "epoch": 49.616, + "grad_norm": 1.206377625465393, + "learning_rate": 2e-05, + "loss": 0.05672641, + "step": 24808 + }, + { + "epoch": 49.618, + "grad_norm": 1.0968784093856812, + "learning_rate": 2e-05, + "loss": 0.03934178, + "step": 24809 + }, + { + "epoch": 49.62, + "grad_norm": 1.4784859418869019, + "learning_rate": 2e-05, + "loss": 0.05061619, + "step": 24810 + }, + { + "epoch": 49.622, + "grad_norm": 
1.2458291053771973, + "learning_rate": 2e-05, + "loss": 0.04952991, + "step": 24811 + }, + { + "epoch": 49.624, + "grad_norm": 0.917413055896759, + "learning_rate": 2e-05, + "loss": 0.03567467, + "step": 24812 + }, + { + "epoch": 49.626, + "grad_norm": 1.093457818031311, + "learning_rate": 2e-05, + "loss": 0.04676176, + "step": 24813 + }, + { + "epoch": 49.628, + "grad_norm": 1.19669771194458, + "learning_rate": 2e-05, + "loss": 0.05353126, + "step": 24814 + }, + { + "epoch": 49.63, + "grad_norm": 1.258165955543518, + "learning_rate": 2e-05, + "loss": 0.05881431, + "step": 24815 + }, + { + "epoch": 49.632, + "grad_norm": 1.8352237939834595, + "learning_rate": 2e-05, + "loss": 0.03418227, + "step": 24816 + }, + { + "epoch": 49.634, + "grad_norm": 1.2149909734725952, + "learning_rate": 2e-05, + "loss": 0.0405385, + "step": 24817 + }, + { + "epoch": 49.636, + "grad_norm": 1.1387282609939575, + "learning_rate": 2e-05, + "loss": 0.04924862, + "step": 24818 + }, + { + "epoch": 49.638, + "grad_norm": 1.0884685516357422, + "learning_rate": 2e-05, + "loss": 0.03455062, + "step": 24819 + }, + { + "epoch": 49.64, + "grad_norm": 1.1216976642608643, + "learning_rate": 2e-05, + "loss": 0.04718848, + "step": 24820 + }, + { + "epoch": 49.642, + "grad_norm": 1.1525574922561646, + "learning_rate": 2e-05, + "loss": 0.0507764, + "step": 24821 + }, + { + "epoch": 49.644, + "grad_norm": 1.040356993675232, + "learning_rate": 2e-05, + "loss": 0.03835782, + "step": 24822 + }, + { + "epoch": 49.646, + "grad_norm": 1.6863218545913696, + "learning_rate": 2e-05, + "loss": 0.05974036, + "step": 24823 + }, + { + "epoch": 49.648, + "grad_norm": 1.5963078737258911, + "learning_rate": 2e-05, + "loss": 0.05577124, + "step": 24824 + }, + { + "epoch": 49.65, + "grad_norm": 1.9898076057434082, + "learning_rate": 2e-05, + "loss": 0.04162847, + "step": 24825 + }, + { + "epoch": 49.652, + "grad_norm": 1.7517437934875488, + "learning_rate": 2e-05, + "loss": 0.04477061, + "step": 24826 + }, + { + "epoch": 49.654, + "grad_norm": 1.0563920736312866, + "learning_rate": 2e-05, + "loss": 0.04491702, + "step": 24827 + }, + { + "epoch": 49.656, + "grad_norm": 1.1382147073745728, + "learning_rate": 2e-05, + "loss": 0.04465196, + "step": 24828 + }, + { + "epoch": 49.658, + "grad_norm": 1.1406420469284058, + "learning_rate": 2e-05, + "loss": 0.04676701, + "step": 24829 + }, + { + "epoch": 49.66, + "grad_norm": 0.9442988038063049, + "learning_rate": 2e-05, + "loss": 0.03124127, + "step": 24830 + }, + { + "epoch": 49.662, + "grad_norm": 1.63547682762146, + "learning_rate": 2e-05, + "loss": 0.05501399, + "step": 24831 + }, + { + "epoch": 49.664, + "grad_norm": 0.9990745782852173, + "learning_rate": 2e-05, + "loss": 0.03285347, + "step": 24832 + }, + { + "epoch": 49.666, + "grad_norm": 1.0436017513275146, + "learning_rate": 2e-05, + "loss": 0.0376052, + "step": 24833 + }, + { + "epoch": 49.668, + "grad_norm": 1.4094048738479614, + "learning_rate": 2e-05, + "loss": 0.05660111, + "step": 24834 + }, + { + "epoch": 49.67, + "grad_norm": 1.181092381477356, + "learning_rate": 2e-05, + "loss": 0.04279668, + "step": 24835 + }, + { + "epoch": 49.672, + "grad_norm": 1.2665365934371948, + "learning_rate": 2e-05, + "loss": 0.05205832, + "step": 24836 + }, + { + "epoch": 49.674, + "grad_norm": 1.061452031135559, + "learning_rate": 2e-05, + "loss": 0.03486584, + "step": 24837 + }, + { + "epoch": 49.676, + "grad_norm": 1.312247395515442, + "learning_rate": 2e-05, + "loss": 0.05554381, + "step": 24838 + }, + { + "epoch": 49.678, + "grad_norm": 
1.2573567628860474, + "learning_rate": 2e-05, + "loss": 0.05710577, + "step": 24839 + }, + { + "epoch": 49.68, + "grad_norm": 1.2652338743209839, + "learning_rate": 2e-05, + "loss": 0.04693561, + "step": 24840 + }, + { + "epoch": 49.682, + "grad_norm": 1.076812505722046, + "learning_rate": 2e-05, + "loss": 0.04891666, + "step": 24841 + }, + { + "epoch": 49.684, + "grad_norm": 1.0280793905258179, + "learning_rate": 2e-05, + "loss": 0.03525931, + "step": 24842 + }, + { + "epoch": 49.686, + "grad_norm": 1.2222983837127686, + "learning_rate": 2e-05, + "loss": 0.03659509, + "step": 24843 + }, + { + "epoch": 49.688, + "grad_norm": 1.1637095212936401, + "learning_rate": 2e-05, + "loss": 0.04860913, + "step": 24844 + }, + { + "epoch": 49.69, + "grad_norm": 1.232571005821228, + "learning_rate": 2e-05, + "loss": 0.04756819, + "step": 24845 + }, + { + "epoch": 49.692, + "grad_norm": 1.7061697244644165, + "learning_rate": 2e-05, + "loss": 0.05409347, + "step": 24846 + }, + { + "epoch": 49.694, + "grad_norm": 3.033412218093872, + "learning_rate": 2e-05, + "loss": 0.05069696, + "step": 24847 + }, + { + "epoch": 49.696, + "grad_norm": 1.3082060813903809, + "learning_rate": 2e-05, + "loss": 0.06048907, + "step": 24848 + }, + { + "epoch": 49.698, + "grad_norm": 1.3506269454956055, + "learning_rate": 2e-05, + "loss": 0.04115216, + "step": 24849 + }, + { + "epoch": 49.7, + "grad_norm": 1.5505048036575317, + "learning_rate": 2e-05, + "loss": 0.04728535, + "step": 24850 + }, + { + "epoch": 49.702, + "grad_norm": 1.154044270515442, + "learning_rate": 2e-05, + "loss": 0.03792886, + "step": 24851 + }, + { + "epoch": 49.704, + "grad_norm": 1.337306261062622, + "learning_rate": 2e-05, + "loss": 0.0570382, + "step": 24852 + }, + { + "epoch": 49.706, + "grad_norm": 1.206878900527954, + "learning_rate": 2e-05, + "loss": 0.05738255, + "step": 24853 + }, + { + "epoch": 49.708, + "grad_norm": 1.0860217809677124, + "learning_rate": 2e-05, + "loss": 0.04733476, + "step": 24854 + }, + { + "epoch": 49.71, + "grad_norm": 1.1992672681808472, + "learning_rate": 2e-05, + "loss": 0.05342748, + "step": 24855 + }, + { + "epoch": 49.712, + "grad_norm": 1.2483594417572021, + "learning_rate": 2e-05, + "loss": 0.03885884, + "step": 24856 + }, + { + "epoch": 49.714, + "grad_norm": 1.1189866065979004, + "learning_rate": 2e-05, + "loss": 0.03883381, + "step": 24857 + }, + { + "epoch": 49.716, + "grad_norm": 3.194491386413574, + "learning_rate": 2e-05, + "loss": 0.05132719, + "step": 24858 + }, + { + "epoch": 49.718, + "grad_norm": 2.8470568656921387, + "learning_rate": 2e-05, + "loss": 0.04896916, + "step": 24859 + }, + { + "epoch": 49.72, + "grad_norm": 1.2007850408554077, + "learning_rate": 2e-05, + "loss": 0.04390725, + "step": 24860 + }, + { + "epoch": 49.722, + "grad_norm": 1.5928409099578857, + "learning_rate": 2e-05, + "loss": 0.06557049, + "step": 24861 + }, + { + "epoch": 49.724, + "grad_norm": 1.5397547483444214, + "learning_rate": 2e-05, + "loss": 0.05117305, + "step": 24862 + }, + { + "epoch": 49.726, + "grad_norm": 1.1637266874313354, + "learning_rate": 2e-05, + "loss": 0.0384597, + "step": 24863 + }, + { + "epoch": 49.728, + "grad_norm": 1.3207765817642212, + "learning_rate": 2e-05, + "loss": 0.04964681, + "step": 24864 + }, + { + "epoch": 49.73, + "grad_norm": 1.3087096214294434, + "learning_rate": 2e-05, + "loss": 0.05499934, + "step": 24865 + }, + { + "epoch": 49.732, + "grad_norm": 1.8449457883834839, + "learning_rate": 2e-05, + "loss": 0.053078, + "step": 24866 + }, + { + "epoch": 49.734, + "grad_norm": 
1.1697624921798706, + "learning_rate": 2e-05, + "loss": 0.04434161, + "step": 24867 + }, + { + "epoch": 49.736, + "grad_norm": 1.2733197212219238, + "learning_rate": 2e-05, + "loss": 0.05949679, + "step": 24868 + }, + { + "epoch": 49.738, + "grad_norm": 1.6024494171142578, + "learning_rate": 2e-05, + "loss": 0.05078848, + "step": 24869 + }, + { + "epoch": 49.74, + "grad_norm": 1.3443810939788818, + "learning_rate": 2e-05, + "loss": 0.04602218, + "step": 24870 + }, + { + "epoch": 49.742, + "grad_norm": 1.3979158401489258, + "learning_rate": 2e-05, + "loss": 0.05351568, + "step": 24871 + }, + { + "epoch": 49.744, + "grad_norm": 1.2029260396957397, + "learning_rate": 2e-05, + "loss": 0.0432038, + "step": 24872 + }, + { + "epoch": 49.746, + "grad_norm": 1.2258926630020142, + "learning_rate": 2e-05, + "loss": 0.05626082, + "step": 24873 + }, + { + "epoch": 49.748, + "grad_norm": 1.232932686805725, + "learning_rate": 2e-05, + "loss": 0.05177718, + "step": 24874 + }, + { + "epoch": 49.75, + "grad_norm": 1.1512670516967773, + "learning_rate": 2e-05, + "loss": 0.04957498, + "step": 24875 + }, + { + "epoch": 49.752, + "grad_norm": 1.1977880001068115, + "learning_rate": 2e-05, + "loss": 0.04884469, + "step": 24876 + }, + { + "epoch": 49.754, + "grad_norm": 1.363495111465454, + "learning_rate": 2e-05, + "loss": 0.04705457, + "step": 24877 + }, + { + "epoch": 49.756, + "grad_norm": 1.2528479099273682, + "learning_rate": 2e-05, + "loss": 0.04862735, + "step": 24878 + }, + { + "epoch": 49.758, + "grad_norm": 2.5792670249938965, + "learning_rate": 2e-05, + "loss": 0.0410705, + "step": 24879 + }, + { + "epoch": 49.76, + "grad_norm": 2.7361626625061035, + "learning_rate": 2e-05, + "loss": 0.05311979, + "step": 24880 + }, + { + "epoch": 49.762, + "grad_norm": 2.464141845703125, + "learning_rate": 2e-05, + "loss": 0.05702599, + "step": 24881 + }, + { + "epoch": 49.764, + "grad_norm": 1.099612832069397, + "learning_rate": 2e-05, + "loss": 0.03931236, + "step": 24882 + }, + { + "epoch": 49.766, + "grad_norm": 1.0781171321868896, + "learning_rate": 2e-05, + "loss": 0.04257463, + "step": 24883 + }, + { + "epoch": 49.768, + "grad_norm": 4.444637775421143, + "learning_rate": 2e-05, + "loss": 0.05754374, + "step": 24884 + }, + { + "epoch": 49.77, + "grad_norm": 1.3171765804290771, + "learning_rate": 2e-05, + "loss": 0.05533032, + "step": 24885 + }, + { + "epoch": 49.772, + "grad_norm": 1.1796399354934692, + "learning_rate": 2e-05, + "loss": 0.03754626, + "step": 24886 + }, + { + "epoch": 49.774, + "grad_norm": 1.154632806777954, + "learning_rate": 2e-05, + "loss": 0.04902484, + "step": 24887 + }, + { + "epoch": 49.776, + "grad_norm": 1.1982251405715942, + "learning_rate": 2e-05, + "loss": 0.04432965, + "step": 24888 + }, + { + "epoch": 49.778, + "grad_norm": 1.4120286703109741, + "learning_rate": 2e-05, + "loss": 0.04318996, + "step": 24889 + }, + { + "epoch": 49.78, + "grad_norm": 1.3181231021881104, + "learning_rate": 2e-05, + "loss": 0.04140037, + "step": 24890 + }, + { + "epoch": 49.782, + "grad_norm": 1.1879209280014038, + "learning_rate": 2e-05, + "loss": 0.04180463, + "step": 24891 + }, + { + "epoch": 49.784, + "grad_norm": 1.2485787868499756, + "learning_rate": 2e-05, + "loss": 0.06122434, + "step": 24892 + }, + { + "epoch": 49.786, + "grad_norm": 1.3617092370986938, + "learning_rate": 2e-05, + "loss": 0.04306863, + "step": 24893 + }, + { + "epoch": 49.788, + "grad_norm": 1.1197565793991089, + "learning_rate": 2e-05, + "loss": 0.04167994, + "step": 24894 + }, + { + "epoch": 49.79, + "grad_norm": 
1.2155925035476685, + "learning_rate": 2e-05, + "loss": 0.05426242, + "step": 24895 + }, + { + "epoch": 49.792, + "grad_norm": 1.309443712234497, + "learning_rate": 2e-05, + "loss": 0.05209014, + "step": 24896 + }, + { + "epoch": 49.794, + "grad_norm": 2.3679568767547607, + "learning_rate": 2e-05, + "loss": 0.0490745, + "step": 24897 + }, + { + "epoch": 49.796, + "grad_norm": 1.1244512796401978, + "learning_rate": 2e-05, + "loss": 0.04849419, + "step": 24898 + }, + { + "epoch": 49.798, + "grad_norm": 1.0218008756637573, + "learning_rate": 2e-05, + "loss": 0.04120092, + "step": 24899 + }, + { + "epoch": 49.8, + "grad_norm": 1.2458765506744385, + "learning_rate": 2e-05, + "loss": 0.05435371, + "step": 24900 + }, + { + "epoch": 49.802, + "grad_norm": 1.1713382005691528, + "learning_rate": 2e-05, + "loss": 0.04432591, + "step": 24901 + }, + { + "epoch": 49.804, + "grad_norm": 1.043931007385254, + "learning_rate": 2e-05, + "loss": 0.03500161, + "step": 24902 + }, + { + "epoch": 49.806, + "grad_norm": 1.1524564027786255, + "learning_rate": 2e-05, + "loss": 0.05594937, + "step": 24903 + }, + { + "epoch": 49.808, + "grad_norm": 1.2194099426269531, + "learning_rate": 2e-05, + "loss": 0.06552596, + "step": 24904 + }, + { + "epoch": 49.81, + "grad_norm": 1.5245249271392822, + "learning_rate": 2e-05, + "loss": 0.04009125, + "step": 24905 + }, + { + "epoch": 49.812, + "grad_norm": 1.0253095626831055, + "learning_rate": 2e-05, + "loss": 0.04843939, + "step": 24906 + }, + { + "epoch": 49.814, + "grad_norm": 1.9079041481018066, + "learning_rate": 2e-05, + "loss": 0.05018196, + "step": 24907 + }, + { + "epoch": 49.816, + "grad_norm": 1.1070141792297363, + "learning_rate": 2e-05, + "loss": 0.04317448, + "step": 24908 + }, + { + "epoch": 49.818, + "grad_norm": 1.1777336597442627, + "learning_rate": 2e-05, + "loss": 0.05528118, + "step": 24909 + }, + { + "epoch": 49.82, + "grad_norm": 1.162487268447876, + "learning_rate": 2e-05, + "loss": 0.04349917, + "step": 24910 + }, + { + "epoch": 49.822, + "grad_norm": 1.0818195343017578, + "learning_rate": 2e-05, + "loss": 0.03660588, + "step": 24911 + }, + { + "epoch": 49.824, + "grad_norm": 1.178435206413269, + "learning_rate": 2e-05, + "loss": 0.04270165, + "step": 24912 + }, + { + "epoch": 49.826, + "grad_norm": 1.0772615671157837, + "learning_rate": 2e-05, + "loss": 0.0409967, + "step": 24913 + }, + { + "epoch": 49.828, + "grad_norm": 1.5055285692214966, + "learning_rate": 2e-05, + "loss": 0.0552909, + "step": 24914 + }, + { + "epoch": 49.83, + "grad_norm": 1.929326057434082, + "learning_rate": 2e-05, + "loss": 0.05568858, + "step": 24915 + }, + { + "epoch": 49.832, + "grad_norm": 1.478667974472046, + "learning_rate": 2e-05, + "loss": 0.06444258, + "step": 24916 + }, + { + "epoch": 49.834, + "grad_norm": 2.3012428283691406, + "learning_rate": 2e-05, + "loss": 0.06190757, + "step": 24917 + }, + { + "epoch": 49.836, + "grad_norm": 1.908669114112854, + "learning_rate": 2e-05, + "loss": 0.07899611, + "step": 24918 + }, + { + "epoch": 49.838, + "grad_norm": 1.1932661533355713, + "learning_rate": 2e-05, + "loss": 0.05439082, + "step": 24919 + }, + { + "epoch": 49.84, + "grad_norm": 1.1718835830688477, + "learning_rate": 2e-05, + "loss": 0.04695781, + "step": 24920 + }, + { + "epoch": 49.842, + "grad_norm": 1.2461529970169067, + "learning_rate": 2e-05, + "loss": 0.06060567, + "step": 24921 + }, + { + "epoch": 49.844, + "grad_norm": 1.0214687585830688, + "learning_rate": 2e-05, + "loss": 0.03788566, + "step": 24922 + }, + { + "epoch": 49.846, + "grad_norm": 
1.10343599319458, + "learning_rate": 2e-05, + "loss": 0.05159609, + "step": 24923 + }, + { + "epoch": 49.848, + "grad_norm": 1.017777919769287, + "learning_rate": 2e-05, + "loss": 0.0479437, + "step": 24924 + }, + { + "epoch": 49.85, + "grad_norm": 1.4957668781280518, + "learning_rate": 2e-05, + "loss": 0.04876685, + "step": 24925 + }, + { + "epoch": 49.852, + "grad_norm": 1.3842114210128784, + "learning_rate": 2e-05, + "loss": 0.05079174, + "step": 24926 + }, + { + "epoch": 49.854, + "grad_norm": 1.3459488153457642, + "learning_rate": 2e-05, + "loss": 0.05015388, + "step": 24927 + }, + { + "epoch": 49.856, + "grad_norm": 1.1398237943649292, + "learning_rate": 2e-05, + "loss": 0.04445317, + "step": 24928 + }, + { + "epoch": 49.858, + "grad_norm": 1.344321370124817, + "learning_rate": 2e-05, + "loss": 0.05805134, + "step": 24929 + }, + { + "epoch": 49.86, + "grad_norm": 1.3307359218597412, + "learning_rate": 2e-05, + "loss": 0.04095314, + "step": 24930 + }, + { + "epoch": 49.862, + "grad_norm": 1.3429630994796753, + "learning_rate": 2e-05, + "loss": 0.04923171, + "step": 24931 + }, + { + "epoch": 49.864, + "grad_norm": 1.1111401319503784, + "learning_rate": 2e-05, + "loss": 0.05133491, + "step": 24932 + }, + { + "epoch": 49.866, + "grad_norm": 1.182901382446289, + "learning_rate": 2e-05, + "loss": 0.05458375, + "step": 24933 + }, + { + "epoch": 49.868, + "grad_norm": 0.9391345977783203, + "learning_rate": 2e-05, + "loss": 0.03023691, + "step": 24934 + }, + { + "epoch": 49.87, + "grad_norm": 2.566356658935547, + "learning_rate": 2e-05, + "loss": 0.06601861, + "step": 24935 + }, + { + "epoch": 49.872, + "grad_norm": 1.2246321439743042, + "learning_rate": 2e-05, + "loss": 0.05346267, + "step": 24936 + }, + { + "epoch": 49.874, + "grad_norm": 1.1111116409301758, + "learning_rate": 2e-05, + "loss": 0.04999265, + "step": 24937 + }, + { + "epoch": 49.876, + "grad_norm": 1.101694941520691, + "learning_rate": 2e-05, + "loss": 0.04546491, + "step": 24938 + }, + { + "epoch": 49.878, + "grad_norm": 1.1672554016113281, + "learning_rate": 2e-05, + "loss": 0.0529165, + "step": 24939 + }, + { + "epoch": 49.88, + "grad_norm": 1.219642996788025, + "learning_rate": 2e-05, + "loss": 0.03548888, + "step": 24940 + }, + { + "epoch": 49.882, + "grad_norm": 1.246626853942871, + "learning_rate": 2e-05, + "loss": 0.05844071, + "step": 24941 + }, + { + "epoch": 49.884, + "grad_norm": 1.2160290479660034, + "learning_rate": 2e-05, + "loss": 0.05645484, + "step": 24942 + }, + { + "epoch": 49.886, + "grad_norm": 2.756503105163574, + "learning_rate": 2e-05, + "loss": 0.05539309, + "step": 24943 + }, + { + "epoch": 49.888, + "grad_norm": 2.0938100814819336, + "learning_rate": 2e-05, + "loss": 0.04288248, + "step": 24944 + }, + { + "epoch": 49.89, + "grad_norm": 1.1077803373336792, + "learning_rate": 2e-05, + "loss": 0.04419069, + "step": 24945 + }, + { + "epoch": 49.892, + "grad_norm": 1.1682178974151611, + "learning_rate": 2e-05, + "loss": 0.04008082, + "step": 24946 + }, + { + "epoch": 49.894, + "grad_norm": 1.7397586107254028, + "learning_rate": 2e-05, + "loss": 0.04435687, + "step": 24947 + }, + { + "epoch": 49.896, + "grad_norm": 0.9368448257446289, + "learning_rate": 2e-05, + "loss": 0.04006282, + "step": 24948 + }, + { + "epoch": 49.898, + "grad_norm": 2.1141977310180664, + "learning_rate": 2e-05, + "loss": 0.04719079, + "step": 24949 + }, + { + "epoch": 49.9, + "grad_norm": 1.1511861085891724, + "learning_rate": 2e-05, + "loss": 0.03775914, + "step": 24950 + }, + { + "epoch": 49.902, + "grad_norm": 
1.1330093145370483, + "learning_rate": 2e-05, + "loss": 0.04169647, + "step": 24951 + }, + { + "epoch": 49.904, + "grad_norm": 1.1492831707000732, + "learning_rate": 2e-05, + "loss": 0.04711498, + "step": 24952 + }, + { + "epoch": 49.906, + "grad_norm": 1.6958730220794678, + "learning_rate": 2e-05, + "loss": 0.05036974, + "step": 24953 + }, + { + "epoch": 49.908, + "grad_norm": 1.0818232297897339, + "learning_rate": 2e-05, + "loss": 0.0415853, + "step": 24954 + }, + { + "epoch": 49.91, + "grad_norm": 1.1111513376235962, + "learning_rate": 2e-05, + "loss": 0.04978892, + "step": 24955 + }, + { + "epoch": 49.912, + "grad_norm": 1.2296136617660522, + "learning_rate": 2e-05, + "loss": 0.04639769, + "step": 24956 + }, + { + "epoch": 49.914, + "grad_norm": 1.226378321647644, + "learning_rate": 2e-05, + "loss": 0.05798329, + "step": 24957 + }, + { + "epoch": 49.916, + "grad_norm": 1.0596457719802856, + "learning_rate": 2e-05, + "loss": 0.04839835, + "step": 24958 + }, + { + "epoch": 49.918, + "grad_norm": 1.3038498163223267, + "learning_rate": 2e-05, + "loss": 0.05488671, + "step": 24959 + }, + { + "epoch": 49.92, + "grad_norm": 1.0039633512496948, + "learning_rate": 2e-05, + "loss": 0.03758329, + "step": 24960 + }, + { + "epoch": 49.922, + "grad_norm": 1.0808930397033691, + "learning_rate": 2e-05, + "loss": 0.03469931, + "step": 24961 + }, + { + "epoch": 49.924, + "grad_norm": 1.2433420419692993, + "learning_rate": 2e-05, + "loss": 0.04504021, + "step": 24962 + }, + { + "epoch": 49.926, + "grad_norm": 1.254826545715332, + "learning_rate": 2e-05, + "loss": 0.05490943, + "step": 24963 + }, + { + "epoch": 49.928, + "grad_norm": 0.9941253066062927, + "learning_rate": 2e-05, + "loss": 0.05116001, + "step": 24964 + }, + { + "epoch": 49.93, + "grad_norm": 1.0954303741455078, + "learning_rate": 2e-05, + "loss": 0.0452432, + "step": 24965 + }, + { + "epoch": 49.932, + "grad_norm": 1.0251585245132446, + "learning_rate": 2e-05, + "loss": 0.04928794, + "step": 24966 + }, + { + "epoch": 49.934, + "grad_norm": 1.1478753089904785, + "learning_rate": 2e-05, + "loss": 0.0425126, + "step": 24967 + }, + { + "epoch": 49.936, + "grad_norm": 1.222156047821045, + "learning_rate": 2e-05, + "loss": 0.04921298, + "step": 24968 + }, + { + "epoch": 49.938, + "grad_norm": 1.1650967597961426, + "learning_rate": 2e-05, + "loss": 0.04439209, + "step": 24969 + }, + { + "epoch": 49.94, + "grad_norm": 1.2680879831314087, + "learning_rate": 2e-05, + "loss": 0.05739405, + "step": 24970 + }, + { + "epoch": 49.942, + "grad_norm": 1.1213183403015137, + "learning_rate": 2e-05, + "loss": 0.05398349, + "step": 24971 + }, + { + "epoch": 49.944, + "grad_norm": 1.088508129119873, + "learning_rate": 2e-05, + "loss": 0.04370025, + "step": 24972 + }, + { + "epoch": 49.946, + "grad_norm": 1.314470887184143, + "learning_rate": 2e-05, + "loss": 0.05276281, + "step": 24973 + }, + { + "epoch": 49.948, + "grad_norm": 1.5439000129699707, + "learning_rate": 2e-05, + "loss": 0.04817521, + "step": 24974 + }, + { + "epoch": 49.95, + "grad_norm": 1.5129501819610596, + "learning_rate": 2e-05, + "loss": 0.05268683, + "step": 24975 + }, + { + "epoch": 49.952, + "grad_norm": 1.2225157022476196, + "learning_rate": 2e-05, + "loss": 0.05818931, + "step": 24976 + }, + { + "epoch": 49.954, + "grad_norm": 2.1078433990478516, + "learning_rate": 2e-05, + "loss": 0.05812836, + "step": 24977 + }, + { + "epoch": 49.956, + "grad_norm": 1.2115638256072998, + "learning_rate": 2e-05, + "loss": 0.04945941, + "step": 24978 + }, + { + "epoch": 49.958, + "grad_norm": 
1.2102251052856445, + "learning_rate": 2e-05, + "loss": 0.04883428, + "step": 24979 + }, + { + "epoch": 49.96, + "grad_norm": 1.0502766370773315, + "learning_rate": 2e-05, + "loss": 0.03550196, + "step": 24980 + }, + { + "epoch": 49.962, + "grad_norm": 1.2148598432540894, + "learning_rate": 2e-05, + "loss": 0.048013, + "step": 24981 + }, + { + "epoch": 49.964, + "grad_norm": 1.189860224723816, + "learning_rate": 2e-05, + "loss": 0.06181513, + "step": 24982 + }, + { + "epoch": 49.966, + "grad_norm": 1.195906639099121, + "learning_rate": 2e-05, + "loss": 0.04937382, + "step": 24983 + }, + { + "epoch": 49.968, + "grad_norm": 3.3317697048187256, + "learning_rate": 2e-05, + "loss": 0.05652332, + "step": 24984 + }, + { + "epoch": 49.97, + "grad_norm": 0.9888531565666199, + "learning_rate": 2e-05, + "loss": 0.04144951, + "step": 24985 + }, + { + "epoch": 49.972, + "grad_norm": 1.15919828414917, + "learning_rate": 2e-05, + "loss": 0.04281298, + "step": 24986 + }, + { + "epoch": 49.974, + "grad_norm": 1.744892954826355, + "learning_rate": 2e-05, + "loss": 0.04002345, + "step": 24987 + }, + { + "epoch": 49.976, + "grad_norm": 1.2224351167678833, + "learning_rate": 2e-05, + "loss": 0.04044309, + "step": 24988 + }, + { + "epoch": 49.978, + "grad_norm": 1.1010823249816895, + "learning_rate": 2e-05, + "loss": 0.04040832, + "step": 24989 + }, + { + "epoch": 49.98, + "grad_norm": 1.0111849308013916, + "learning_rate": 2e-05, + "loss": 0.03537663, + "step": 24990 + }, + { + "epoch": 49.982, + "grad_norm": 1.0763754844665527, + "learning_rate": 2e-05, + "loss": 0.0413045, + "step": 24991 + }, + { + "epoch": 49.984, + "grad_norm": 1.0886006355285645, + "learning_rate": 2e-05, + "loss": 0.05110417, + "step": 24992 + }, + { + "epoch": 49.986, + "grad_norm": 1.0088766813278198, + "learning_rate": 2e-05, + "loss": 0.03378244, + "step": 24993 + }, + { + "epoch": 49.988, + "grad_norm": 1.1878137588500977, + "learning_rate": 2e-05, + "loss": 0.05160052, + "step": 24994 + }, + { + "epoch": 49.99, + "grad_norm": 1.188334584236145, + "learning_rate": 2e-05, + "loss": 0.03924968, + "step": 24995 + }, + { + "epoch": 49.992, + "grad_norm": 1.1772879362106323, + "learning_rate": 2e-05, + "loss": 0.05064984, + "step": 24996 + }, + { + "epoch": 49.994, + "grad_norm": 0.7895515561103821, + "learning_rate": 2e-05, + "loss": 0.02828157, + "step": 24997 + }, + { + "epoch": 49.996, + "grad_norm": 0.9504234790802002, + "learning_rate": 2e-05, + "loss": 0.02684129, + "step": 24998 + }, + { + "epoch": 49.998, + "grad_norm": 1.4684773683547974, + "learning_rate": 2e-05, + "loss": 0.05880807, + "step": 24999 + }, + { + "epoch": 50.0, + "grad_norm": 1.0322144031524658, + "learning_rate": 2e-05, + "loss": 0.03352503, + "step": 25000 + }, + { + "epoch": 50.0, + "eval_performance": { + "AngleClassification_1": 0.998, + "AngleClassification_2": 0.998, + "AngleClassification_3": 0.9840319361277445, + "Equal_1": 1.0, + "Equal_2": 0.9840319361277445, + "Equal_3": 0.9940119760479041, + "LineComparison_1": 1.0, + "LineComparison_2": 1.0, + "LineComparison_3": 1.0, + "Parallel_1": 0.9919839679358717, + "Parallel_2": 0.9919839679358717, + "Parallel_3": 0.994, + "Perpendicular_1": 1.0, + "Perpendicular_2": 0.996, + "Perpendicular_3": 0.9078156312625251, + "PointLiesOnCircle_1": 0.9979959919839679, + "PointLiesOnCircle_2": 1.0, + "PointLiesOnCircle_3": 0.99, + "PointLiesOnLine_1": 1.0, + "PointLiesOnLine_2": 0.9959919839679359, + "PointLiesOnLine_3": 0.9880239520958084 + }, + "eval_runtime": 321.2431, + "eval_samples_per_second": 32.686, + 
"eval_steps_per_second": 0.654, + "step": 25000 + }, + { + "epoch": 50.0, + "step": 25000, + "total_flos": 9.912786476979978e+17, + "train_loss": 0.05224685758773238, + "train_runtime": 92925.7559, + "train_samples_per_second": 17.218, + "train_steps_per_second": 0.269 + } + ], + "logging_steps": 1, + "max_steps": 25000, + "num_input_tokens_seen": 0, + "num_train_epochs": 50, + "save_steps": 5000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 9.912786476979978e+17, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}