{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.5839793281653747, "eval_steps": 5000, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.025839793281653745, "grad_norm": 21431.55859375, "learning_rate": 3.4188034188034193e-06, "loss": 5737.0758, "step": 10 }, { "epoch": 0.05167958656330749, "grad_norm": 21565.48046875, "learning_rate": 6.837606837606839e-06, "loss": 5749.3406, "step": 20 }, { "epoch": 0.07751937984496124, "grad_norm": 17265.16796875, "learning_rate": 1.0256410256410256e-05, "loss": 5788.193, "step": 30 }, { "epoch": 0.10335917312661498, "grad_norm": 16618.6875, "learning_rate": 1.3675213675213677e-05, "loss": 5726.6566, "step": 40 }, { "epoch": 0.12919896640826872, "grad_norm": 18382.3359375, "learning_rate": 1.7094017094017095e-05, "loss": 5755.7125, "step": 50 }, { "epoch": 0.15503875968992248, "grad_norm": 1329.3785400390625, "learning_rate": 2.0512820512820512e-05, "loss": 3448.4984, "step": 60 }, { "epoch": 0.18087855297157623, "grad_norm": 982.5582885742188, "learning_rate": 2.393162393162393e-05, "loss": 793.4248, "step": 70 }, { "epoch": 0.20671834625322996, "grad_norm": 441.6934814453125, "learning_rate": 2.7350427350427355e-05, "loss": 664.0245, "step": 80 }, { "epoch": 0.23255813953488372, "grad_norm": 384.9512634277344, "learning_rate": 3.0769230769230774e-05, "loss": 590.2515, "step": 90 }, { "epoch": 0.25839793281653745, "grad_norm": 1645.2818603515625, "learning_rate": 3.418803418803419e-05, "loss": 540.1542, "step": 100 }, { "epoch": 0.2842377260981912, "grad_norm": 656.8567504882812, "learning_rate": 3.760683760683761e-05, "loss": 540.0785, "step": 110 }, { "epoch": 0.31007751937984496, "grad_norm": 209.0157928466797, "learning_rate": 3.999918503621906e-05, "loss": 500.4066, "step": 120 }, { "epoch": 0.3359173126614987, "grad_norm": 312.04351806640625, "learning_rate": 3.9984698638788994e-05, "loss": 476.6278, "step": 130 }, { "epoch": 0.36175710594315247, "grad_norm": 111.13858795166016, "learning_rate": 3.995211703336012e-05, "loss": 457.4648, "step": 140 }, { "epoch": 0.3875968992248062, "grad_norm": 198.55479431152344, "learning_rate": 3.9901469721049156e-05, "loss": 439.1863, "step": 150 }, { "epoch": 0.4134366925064599, "grad_norm": 179.00514221191406, "learning_rate": 3.983280256062371e-05, "loss": 435.1474, "step": 160 }, { "epoch": 0.4392764857881137, "grad_norm": 204.16162109375, "learning_rate": 3.9746177726979355e-05, "loss": 444.1413, "step": 170 }, { "epoch": 0.46511627906976744, "grad_norm": 232.0103759765625, "learning_rate": 3.964167365484312e-05, "loss": 427.4143, "step": 180 }, { "epoch": 0.4909560723514212, "grad_norm": 75.20649719238281, "learning_rate": 3.951938496775456e-05, "loss": 420.458, "step": 190 }, { "epoch": 0.5167958656330749, "grad_norm": 270.50970458984375, "learning_rate": 3.937942239238855e-05, "loss": 406.7704, "step": 200 }, { "epoch": 0.5426356589147286, "grad_norm": 146.1454315185547, "learning_rate": 3.92219126582975e-05, "loss": 407.9724, "step": 210 }, { "epoch": 0.5684754521963824, "grad_norm": 205.81834411621094, "learning_rate": 3.904699838316363e-05, "loss": 416.5542, "step": 220 }, { "epoch": 0.5943152454780362, "grad_norm": 120.94080352783203, "learning_rate": 3.885483794366543e-05, "loss": 415.2502, "step": 230 }, { "epoch": 0.6201550387596899, "grad_norm": 108.84484100341797, "learning_rate": 3.86456053320749e-05, "loss": 401.6582, "step": 240 }, { "epoch": 0.6459948320413437, "grad_norm": 240.042724609375, "learning_rate": 3.841948999871579e-05, "loss": 398.7828, "step": 250 }, { "epoch": 0.6718346253229974, "grad_norm": 74.43096160888672, "learning_rate": 3.817669668042516e-05, "loss": 389.9398, "step": 260 }, { "epoch": 0.6976744186046512, "grad_norm": 209.9810028076172, "learning_rate": 3.7917445215173765e-05, "loss": 389.4235, "step": 270 }, { "epoch": 0.7235142118863049, "grad_norm": 109.08077239990234, "learning_rate": 3.7641970343013115e-05, "loss": 392.0608, "step": 280 }, { "epoch": 0.7493540051679587, "grad_norm": 109.67909240722656, "learning_rate": 3.7350521493529335e-05, "loss": 390.3438, "step": 290 }, { "epoch": 0.7751937984496124, "grad_norm": 248.89028930664062, "learning_rate": 3.704336255999636e-05, "loss": 387.3038, "step": 300 }, { "epoch": 0.8010335917312662, "grad_norm": 119.14262390136719, "learning_rate": 3.672077166043294e-05, "loss": 377.6907, "step": 310 }, { "epoch": 0.8268733850129198, "grad_norm": 142.8026123046875, "learning_rate": 3.638304088577984e-05, "loss": 385.1249, "step": 320 }, { "epoch": 0.8527131782945736, "grad_norm": 173.82833862304688, "learning_rate": 3.603047603542515e-05, "loss": 375.1511, "step": 330 }, { "epoch": 0.8785529715762274, "grad_norm": 164.8625946044922, "learning_rate": 3.566339634031729e-05, "loss": 375.9214, "step": 340 }, { "epoch": 0.9043927648578811, "grad_norm": 117.82234191894531, "learning_rate": 3.528213417391633e-05, "loss": 377.7271, "step": 350 }, { "epoch": 0.9302325581395349, "grad_norm": 41.274600982666016, "learning_rate": 3.488703475124541e-05, "loss": 368.712, "step": 360 }, { "epoch": 0.9560723514211886, "grad_norm": 138.67919921875, "learning_rate": 3.4478455816314724e-05, "loss": 375.8104, "step": 370 }, { "epoch": 0.9819121447028424, "grad_norm": 61.867340087890625, "learning_rate": 3.405676731820106e-05, "loss": 374.8659, "step": 380 }, { "epoch": 1.0077519379844961, "grad_norm": 168.50962829589844, "learning_rate": 3.362235107607629e-05, "loss": 367.1276, "step": 390 }, { "epoch": 1.0335917312661498, "grad_norm": 94.74224853515625, "learning_rate": 3.317560043348795e-05, "loss": 361.7362, "step": 400 }, { "epoch": 1.0594315245478036, "grad_norm": 57.599578857421875, "learning_rate": 3.2716919902205154e-05, "loss": 360.4581, "step": 410 }, { "epoch": 1.0852713178294573, "grad_norm": 127.81239318847656, "learning_rate": 3.224672479595208e-05, "loss": 358.2213, "step": 420 }, { "epoch": 1.1111111111111112, "grad_norm": 262.8865661621094, "learning_rate": 3.176544085436091e-05, "loss": 360.1062, "step": 430 }, { "epoch": 1.1369509043927648, "grad_norm": 247.9445343017578, "learning_rate": 3.127350385748453e-05, "loss": 367.3566, "step": 440 }, { "epoch": 1.1627906976744187, "grad_norm": 204.389892578125, "learning_rate": 3.077135923121809e-05, "loss": 354.9228, "step": 450 }, { "epoch": 1.1886304909560723, "grad_norm": 260.5009460449219, "learning_rate": 3.0259461643986784e-05, "loss": 356.919, "step": 460 }, { "epoch": 1.2144702842377262, "grad_norm": 149.97607421875, "learning_rate": 2.9738274595064845e-05, "loss": 354.5676, "step": 470 }, { "epoch": 1.2403100775193798, "grad_norm": 194.74737548828125, "learning_rate": 2.9208269994898725e-05, "loss": 353.6357, "step": 480 }, { "epoch": 1.2661498708010335, "grad_norm": 200.73731994628906, "learning_rate": 2.8669927737814244e-05, "loss": 363.2115, "step": 490 }, { "epoch": 1.2919896640826873, "grad_norm": 66.00718688964844, "learning_rate": 2.8123735267494826e-05, "loss": 350.5789, "step": 500 }, { "epoch": 1.3178294573643412, "grad_norm": 136.6959991455078, "learning_rate": 2.7570187135624063e-05, "loss": 347.9595, "step": 510 }, { "epoch": 1.3436692506459949, "grad_norm": 40.67978286743164, "learning_rate": 2.7009784554092338e-05, "loss": 351.9972, "step": 520 }, { "epoch": 1.3695090439276485, "grad_norm": 34.59364318847656, "learning_rate": 2.6443034941172962e-05, "loss": 349.3325, "step": 530 }, { "epoch": 1.3953488372093024, "grad_norm": 130.43624877929688, "learning_rate": 2.5870451462078697e-05, "loss": 345.4973, "step": 540 }, { "epoch": 1.421188630490956, "grad_norm": 52.83992004394531, "learning_rate": 2.529255256431472e-05, "loss": 350.3788, "step": 550 }, { "epoch": 1.4470284237726099, "grad_norm": 92.32342529296875, "learning_rate": 2.4709861508248688e-05, "loss": 350.4281, "step": 560 }, { "epoch": 1.4728682170542635, "grad_norm": 56.88703536987305, "learning_rate": 2.4122905893323006e-05, "loss": 349.739, "step": 570 }, { "epoch": 1.4987080103359174, "grad_norm": 73.592529296875, "learning_rate": 2.3532217180338283e-05, "loss": 355.5978, "step": 580 }, { "epoch": 1.524547803617571, "grad_norm": 107.73036193847656, "learning_rate": 2.2938330210240424e-05, "loss": 338.8647, "step": 590 }, { "epoch": 1.550387596899225, "grad_norm": 151.14724731445312, "learning_rate": 2.2341782719847292e-05, "loss": 339.6862, "step": 600 }, { "epoch": 1.5762273901808785, "grad_norm": 109.76298522949219, "learning_rate": 2.174311485495317e-05, "loss": 351.1045, "step": 610 }, { "epoch": 1.6020671834625322, "grad_norm": 75.17061614990234, "learning_rate": 2.1142868681252072e-05, "loss": 344.3581, "step": 620 }, { "epoch": 1.627906976744186, "grad_norm": 43.415557861328125, "learning_rate": 2.0541587693522694e-05, "loss": 346.1661, "step": 630 }, { "epoch": 1.65374677002584, "grad_norm": 95.63790893554688, "learning_rate": 1.99398163235193e-05, "loss": 340.8094, "step": 640 }, { "epoch": 1.6795865633074936, "grad_norm": 173.1468048095703, "learning_rate": 1.9338099447014348e-05, "loss": 344.9255, "step": 650 }, { "epoch": 1.7054263565891472, "grad_norm": 185.2141571044922, "learning_rate": 1.8736981890438973e-05, "loss": 345.9086, "step": 660 }, { "epoch": 1.731266149870801, "grad_norm": 185.66473388671875, "learning_rate": 1.8137007937568198e-05, "loss": 342.1713, "step": 670 }, { "epoch": 1.757105943152455, "grad_norm": 135.51266479492188, "learning_rate": 1.7538720836697505e-05, "loss": 336.7293, "step": 680 }, { "epoch": 1.7829457364341086, "grad_norm": 74.66438293457031, "learning_rate": 1.6942662308756942e-05, "loss": 340.9484, "step": 690 }, { "epoch": 1.8087855297157622, "grad_norm": 76.51793670654297, "learning_rate": 1.6349372056808196e-05, "loss": 332.7376, "step": 700 }, { "epoch": 1.8346253229974159, "grad_norm": 88.05294799804688, "learning_rate": 1.5759387277368817e-05, "loss": 337.1342, "step": 710 }, { "epoch": 1.8604651162790697, "grad_norm": 82.69217681884766, "learning_rate": 1.517324217400589e-05, "loss": 338.4325, "step": 720 }, { "epoch": 1.8863049095607236, "grad_norm": 49.15666580200195, "learning_rate": 1.4591467473639769e-05, "loss": 333.9558, "step": 730 }, { "epoch": 1.9121447028423773, "grad_norm": 100.37080383300781, "learning_rate": 1.4014589945995718e-05, "loss": 339.7346, "step": 740 }, { "epoch": 1.937984496124031, "grad_norm": 64.05115509033203, "learning_rate": 1.3443131926638637e-05, "loss": 336.4353, "step": 750 }, { "epoch": 1.9638242894056848, "grad_norm": 40.96150207519531, "learning_rate": 1.287761084402265e-05, "loss": 344.2413, "step": 760 }, { "epoch": 1.9896640826873386, "grad_norm": 42.53565979003906, "learning_rate": 1.2318538750983903e-05, "loss": 326.7869, "step": 770 }, { "epoch": 2.0155038759689923, "grad_norm": 33.191429138183594, "learning_rate": 1.1766421861100734e-05, "loss": 330.7202, "step": 780 }, { "epoch": 2.041343669250646, "grad_norm": 25.623939514160156, "learning_rate": 1.1221760090340987e-05, "loss": 332.5341, "step": 790 }, { "epoch": 2.0671834625322996, "grad_norm": 41.96136474609375, "learning_rate": 1.068504660441154e-05, "loss": 332.9494, "step": 800 }, { "epoch": 2.0930232558139537, "grad_norm": 27.031042098999023, "learning_rate": 1.0156767372219854e-05, "loss": 325.7135, "step": 810 }, { "epoch": 2.1188630490956073, "grad_norm": 58.57283401489258, "learning_rate": 9.637400725851947e-06, "loss": 331.5063, "step": 820 }, { "epoch": 2.144702842377261, "grad_norm": 54.434139251708984, "learning_rate": 9.127416927465047e-06, "loss": 327.2943, "step": 830 }, { "epoch": 2.1705426356589146, "grad_norm": 36.624176025390625, "learning_rate": 8.627277743487296e-06, "loss": 332.8677, "step": 840 }, { "epoch": 2.1963824289405687, "grad_norm": 53.19684600830078, "learning_rate": 8.137436026509862e-06, "loss": 333.1244, "step": 850 }, { "epoch": 2.2222222222222223, "grad_norm": 37.776268005371094, "learning_rate": 7.65833530525017e-06, "loss": 329.4649, "step": 860 }, { "epoch": 2.248062015503876, "grad_norm": 26.57048797607422, "learning_rate": 7.190409382957408e-06, "loss": 336.0614, "step": 870 }, { "epoch": 2.2739018087855296, "grad_norm": 57.234336853027344, "learning_rate": 6.734081944624027e-06, "loss": 328.4645, "step": 880 }, { "epoch": 2.2997416020671837, "grad_norm": 56.26626205444336, "learning_rate": 6.289766173358826e-06, "loss": 324.8838, "step": 890 }, { "epoch": 2.3255813953488373, "grad_norm": 48.253318786621094, "learning_rate": 5.857864376269051e-06, "loss": 327.6445, "step": 900 }, { "epoch": 2.351421188630491, "grad_norm": 28.283784866333008, "learning_rate": 5.438767620190108e-06, "loss": 326.632, "step": 910 }, { "epoch": 2.3772609819121446, "grad_norm": 41.68576431274414, "learning_rate": 5.032855377592904e-06, "loss": 325.8222, "step": 920 }, { "epoch": 2.4031007751937983, "grad_norm": 22.881074905395508, "learning_rate": 4.64049518298932e-06, "loss": 324.5381, "step": 930 }, { "epoch": 2.4289405684754524, "grad_norm": 26.17159652709961, "learning_rate": 4.262042300146898e-06, "loss": 330.8671, "step": 940 }, { "epoch": 2.454780361757106, "grad_norm": 44.048988342285156, "learning_rate": 3.897839400414187e-06, "loss": 326.6512, "step": 950 }, { "epoch": 2.4806201550387597, "grad_norm": 29.078561782836914, "learning_rate": 3.548216252447867e-06, "loss": 321.7997, "step": 960 }, { "epoch": 2.5064599483204133, "grad_norm": 20.772472381591797, "learning_rate": 3.21348942362272e-06, "loss": 331.5863, "step": 970 }, { "epoch": 2.532299741602067, "grad_norm": 20.700204849243164, "learning_rate": 2.893961993394667e-06, "loss": 326.034, "step": 980 }, { "epoch": 2.558139534883721, "grad_norm": 52.08636474609375, "learning_rate": 2.5899232788765604e-06, "loss": 329.6624, "step": 990 }, { "epoch": 2.5839793281653747, "grad_norm": 26.22823715209961, "learning_rate": 2.3016485728750724e-06, "loss": 324.0566, "step": 1000 } ], "logging_steps": 10, "max_steps": 1161, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.2584857851448525e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }