{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.006028636021100226, "eval_steps": 20, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 6.028636021100226e-05, "grad_norm": 0.41394490003585815, "learning_rate": 0.0001, "loss": 3.7924, "step": 1 }, { "epoch": 6.028636021100226e-05, "eval_loss": 4.150390625, "eval_runtime": 60.2622, "eval_samples_per_second": 4.414, "eval_steps_per_second": 4.414, "step": 1 }, { "epoch": 0.00012057272042200452, "grad_norm": 0.4651585519313812, "learning_rate": 0.0002, "loss": 4.096, "step": 2 }, { "epoch": 0.0001808590806330068, "grad_norm": 0.4591425061225891, "learning_rate": 0.0003, "loss": 4.4366, "step": 3 }, { "epoch": 0.00024114544084400904, "grad_norm": 0.6872814893722534, "learning_rate": 0.0004, "loss": 4.0985, "step": 4 }, { "epoch": 0.0003014318010550113, "grad_norm": 0.7587088346481323, "learning_rate": 0.0005, "loss": 3.6821, "step": 5 }, { "epoch": 0.0003617181612660136, "grad_norm": 0.7434119582176208, "learning_rate": 0.0006, "loss": 3.051, "step": 6 }, { "epoch": 0.00042200452147701583, "grad_norm": 1.2289282083511353, "learning_rate": 0.0007, "loss": 2.6831, "step": 7 }, { "epoch": 0.0004822908816880181, "grad_norm": 1.2558321952819824, "learning_rate": 0.0008, "loss": 2.2848, "step": 8 }, { "epoch": 0.0005425772418990204, "grad_norm": 0.7864887714385986, "learning_rate": 0.0009000000000000001, "loss": 1.9602, "step": 9 }, { "epoch": 0.0006028636021100226, "grad_norm": 0.9119133353233337, "learning_rate": 0.001, "loss": 1.8959, "step": 10 }, { "epoch": 0.0006631499623210249, "grad_norm": 0.6961417198181152, "learning_rate": 0.0009996954135095479, "loss": 1.8307, "step": 11 }, { "epoch": 0.0007234363225320272, "grad_norm": 0.6136600971221924, "learning_rate": 0.0009987820251299122, "loss": 1.6552, "step": 12 }, { "epoch": 0.0007837226827430294, "grad_norm": 0.7051136493682861, "learning_rate": 0.0009972609476841367, "loss": 1.7831, "step": 13 }, { "epoch": 0.0008440090429540317, "grad_norm": 0.5496101379394531, "learning_rate": 0.0009951340343707852, "loss": 1.3238, "step": 14 }, { "epoch": 0.000904295403165034, "grad_norm": 0.60066157579422, "learning_rate": 0.000992403876506104, "loss": 1.4053, "step": 15 }, { "epoch": 0.0009645817633760362, "grad_norm": 0.6970762610435486, "learning_rate": 0.0009890738003669028, "loss": 1.3307, "step": 16 }, { "epoch": 0.0010248681235870383, "grad_norm": 0.529237687587738, "learning_rate": 0.0009851478631379982, "loss": 1.0936, "step": 17 }, { "epoch": 0.0010851544837980408, "grad_norm": 0.5715049505233765, "learning_rate": 0.0009806308479691594, "loss": 1.189, "step": 18 }, { "epoch": 0.001145440844009043, "grad_norm": 0.6311086416244507, "learning_rate": 0.0009755282581475768, "loss": 1.347, "step": 19 }, { "epoch": 0.0012057272042200451, "grad_norm": 0.5899785161018372, "learning_rate": 0.0009698463103929542, "loss": 1.3327, "step": 20 }, { "epoch": 0.0012057272042200451, "eval_loss": 1.1405787467956543, "eval_runtime": 62.5055, "eval_samples_per_second": 4.256, "eval_steps_per_second": 4.256, "step": 20 }, { "epoch": 0.0012660135644310476, "grad_norm": 0.5856114029884338, "learning_rate": 0.0009635919272833937, "loss": 1.2596, "step": 21 }, { "epoch": 0.0013262999246420497, "grad_norm": 0.5270152688026428, "learning_rate": 0.0009567727288213005, "loss": 1.2, "step": 22 }, { "epoch": 0.001386586284853052, "grad_norm": 0.4631257951259613, "learning_rate": 0.0009493970231495835, "loss": 1.0611, "step": 23 }, { "epoch": 0.0014468726450640543, "grad_norm": 0.5318592190742493, "learning_rate": 0.0009414737964294635, "loss": 0.9471, "step": 24 }, { "epoch": 0.0015071590052750565, "grad_norm": 0.45401427149772644, "learning_rate": 0.0009330127018922195, "loss": 1.017, "step": 25 }, { "epoch": 0.0015674453654860587, "grad_norm": 0.4869089126586914, "learning_rate": 0.0009240240480782129, "loss": 1.0759, "step": 26 }, { "epoch": 0.0016277317256970611, "grad_norm": 0.48176413774490356, "learning_rate": 0.0009145187862775209, "loss": 0.9456, "step": 27 }, { "epoch": 0.0016880180859080633, "grad_norm": 0.4359772503376007, "learning_rate": 0.0009045084971874737, "loss": 0.9358, "step": 28 }, { "epoch": 0.0017483044461190655, "grad_norm": 0.5028501152992249, "learning_rate": 0.0008940053768033609, "loss": 1.0855, "step": 29 }, { "epoch": 0.001808590806330068, "grad_norm": 0.4810752868652344, "learning_rate": 0.000883022221559489, "loss": 1.0254, "step": 30 }, { "epoch": 0.0018688771665410701, "grad_norm": 0.5646216869354248, "learning_rate": 0.0008715724127386971, "loss": 1.1916, "step": 31 }, { "epoch": 0.0019291635267520723, "grad_norm": 0.4549255073070526, "learning_rate": 0.0008596699001693256, "loss": 0.8809, "step": 32 }, { "epoch": 0.0019894498869630745, "grad_norm": 0.5244401693344116, "learning_rate": 0.0008473291852294987, "loss": 0.9467, "step": 33 }, { "epoch": 0.0020497362471740767, "grad_norm": 0.5511285066604614, "learning_rate": 0.0008345653031794292, "loss": 0.9638, "step": 34 }, { "epoch": 0.0021100226073850793, "grad_norm": 0.3588230013847351, "learning_rate": 0.0008213938048432696, "loss": 0.6308, "step": 35 }, { "epoch": 0.0021703089675960815, "grad_norm": 0.47416505217552185, "learning_rate": 0.0008078307376628291, "loss": 1.0042, "step": 36 }, { "epoch": 0.0022305953278070837, "grad_norm": 0.5733112692832947, "learning_rate": 0.0007938926261462366, "loss": 1.0075, "step": 37 }, { "epoch": 0.002290881688018086, "grad_norm": 0.3355793058872223, "learning_rate": 0.0007795964517353734, "loss": 0.7105, "step": 38 }, { "epoch": 0.002351168048229088, "grad_norm": 0.4479922950267792, "learning_rate": 0.0007649596321166025, "loss": 0.9258, "step": 39 }, { "epoch": 0.0024114544084400903, "grad_norm": 0.4550175666809082, "learning_rate": 0.00075, "loss": 0.883, "step": 40 }, { "epoch": 0.0024114544084400903, "eval_loss": 0.8111943602561951, "eval_runtime": 56.3176, "eval_samples_per_second": 4.723, "eval_steps_per_second": 4.723, "step": 40 }, { "epoch": 0.0024717407686510925, "grad_norm": 0.41962698101997375, "learning_rate": 0.0007347357813929454, "loss": 0.8918, "step": 41 }, { "epoch": 0.002532027128862095, "grad_norm": 0.39438948035240173, "learning_rate": 0.0007191855733945387, "loss": 0.7145, "step": 42 }, { "epoch": 0.0025923134890730973, "grad_norm": 0.4375174641609192, "learning_rate": 0.0007033683215379002, "loss": 0.8218, "step": 43 }, { "epoch": 0.0026525998492840995, "grad_norm": 0.49163204431533813, "learning_rate": 0.0006873032967079561, "loss": 0.7817, "step": 44 }, { "epoch": 0.0027128862094951017, "grad_norm": 0.5024600028991699, "learning_rate": 0.0006710100716628344, "loss": 0.9008, "step": 45 }, { "epoch": 0.002773172569706104, "grad_norm": 0.49633675813674927, "learning_rate": 0.0006545084971874737, "loss": 0.9048, "step": 46 }, { "epoch": 0.002833458929917106, "grad_norm": 0.4405844807624817, "learning_rate": 0.0006378186779084996, "loss": 0.6957, "step": 47 }, { "epoch": 0.0028937452901281087, "grad_norm": 0.4384251534938812, "learning_rate": 0.0006209609477998338, "loss": 0.9315, "step": 48 }, { "epoch": 0.002954031650339111, "grad_norm": 0.4509848952293396, "learning_rate": 0.0006039558454088796, "loss": 0.7483, "step": 49 }, { "epoch": 0.003014318010550113, "grad_norm": 0.3974878787994385, "learning_rate": 0.0005868240888334653, "loss": 0.7334, "step": 50 }, { "epoch": 0.0030746043707611153, "grad_norm": 0.368656724691391, "learning_rate": 0.0005695865504800327, "loss": 0.7516, "step": 51 }, { "epoch": 0.0031348907309721175, "grad_norm": 0.42397060990333557, "learning_rate": 0.0005522642316338268, "loss": 0.6901, "step": 52 }, { "epoch": 0.0031951770911831196, "grad_norm": 0.42175376415252686, "learning_rate": 0.0005348782368720626, "loss": 0.7027, "step": 53 }, { "epoch": 0.0032554634513941223, "grad_norm": 0.3615281879901886, "learning_rate": 0.0005174497483512506, "loss": 0.6602, "step": 54 }, { "epoch": 0.0033157498116051245, "grad_norm": 0.35693299770355225, "learning_rate": 0.0005, "loss": 0.6363, "step": 55 }, { "epoch": 0.0033760361718161267, "grad_norm": 0.3756321966648102, "learning_rate": 0.0004825502516487497, "loss": 0.6734, "step": 56 }, { "epoch": 0.003436322532027129, "grad_norm": 0.49512147903442383, "learning_rate": 0.00046512176312793734, "loss": 0.7711, "step": 57 }, { "epoch": 0.003496608892238131, "grad_norm": 0.37548142671585083, "learning_rate": 0.00044773576836617336, "loss": 0.6175, "step": 58 }, { "epoch": 0.0035568952524491332, "grad_norm": 0.45658257603645325, "learning_rate": 0.0004304134495199674, "loss": 0.735, "step": 59 }, { "epoch": 0.003617181612660136, "grad_norm": 0.3818158805370331, "learning_rate": 0.00041317591116653486, "loss": 0.6586, "step": 60 }, { "epoch": 0.003617181612660136, "eval_loss": 0.6724220514297485, "eval_runtime": 56.3327, "eval_samples_per_second": 4.722, "eval_steps_per_second": 4.722, "step": 60 }, { "epoch": 0.003677467972871138, "grad_norm": 0.3128480911254883, "learning_rate": 0.0003960441545911204, "loss": 0.5331, "step": 61 }, { "epoch": 0.0037377543330821402, "grad_norm": 0.48710331320762634, "learning_rate": 0.0003790390522001662, "loss": 0.7518, "step": 62 }, { "epoch": 0.0037980406932931424, "grad_norm": 0.34967750310897827, "learning_rate": 0.00036218132209150044, "loss": 0.6341, "step": 63 }, { "epoch": 0.0038583270535041446, "grad_norm": 0.41423478722572327, "learning_rate": 0.00034549150281252633, "loss": 0.6115, "step": 64 }, { "epoch": 0.003918613413715147, "grad_norm": 0.338761568069458, "learning_rate": 0.0003289899283371657, "loss": 0.6564, "step": 65 }, { "epoch": 0.003978899773926149, "grad_norm": 0.2858395576477051, "learning_rate": 0.00031269670329204396, "loss": 0.547, "step": 66 }, { "epoch": 0.004039186134137151, "grad_norm": 0.37567245960235596, "learning_rate": 0.0002966316784621, "loss": 0.5626, "step": 67 }, { "epoch": 0.004099472494348153, "grad_norm": 0.3639447093009949, "learning_rate": 0.00028081442660546124, "loss": 0.6215, "step": 68 }, { "epoch": 0.004159758854559156, "grad_norm": 0.4098832309246063, "learning_rate": 0.00026526421860705474, "loss": 0.7116, "step": 69 }, { "epoch": 0.004220045214770159, "grad_norm": 0.31612440943717957, "learning_rate": 0.0002500000000000001, "loss": 0.5438, "step": 70 }, { "epoch": 0.004280331574981161, "grad_norm": 0.43097761273384094, "learning_rate": 0.0002350403678833976, "loss": 0.717, "step": 71 }, { "epoch": 0.004340617935192163, "grad_norm": 0.32251855731010437, "learning_rate": 0.00022040354826462666, "loss": 0.536, "step": 72 }, { "epoch": 0.004400904295403165, "grad_norm": 0.42903852462768555, "learning_rate": 0.00020610737385376348, "loss": 0.5561, "step": 73 }, { "epoch": 0.004461190655614167, "grad_norm": 0.3536398112773895, "learning_rate": 0.00019216926233717085, "loss": 0.5522, "step": 74 }, { "epoch": 0.00452147701582517, "grad_norm": 0.408915638923645, "learning_rate": 0.0001786061951567303, "loss": 0.6592, "step": 75 }, { "epoch": 0.004581763376036172, "grad_norm": 0.2951997220516205, "learning_rate": 0.00016543469682057105, "loss": 0.4968, "step": 76 }, { "epoch": 0.004642049736247174, "grad_norm": 0.37560755014419556, "learning_rate": 0.00015267081477050133, "loss": 0.6035, "step": 77 }, { "epoch": 0.004702336096458176, "grad_norm": 0.335110604763031, "learning_rate": 0.00014033009983067452, "loss": 0.5496, "step": 78 }, { "epoch": 0.004762622456669178, "grad_norm": 0.29665490984916687, "learning_rate": 0.00012842758726130281, "loss": 0.4494, "step": 79 }, { "epoch": 0.004822908816880181, "grad_norm": 0.34419184923171997, "learning_rate": 0.00011697777844051105, "loss": 0.5594, "step": 80 }, { "epoch": 0.004822908816880181, "eval_loss": 0.5573277473449707, "eval_runtime": 56.342, "eval_samples_per_second": 4.721, "eval_steps_per_second": 4.721, "step": 80 }, { "epoch": 0.004883195177091183, "grad_norm": 0.3764530420303345, "learning_rate": 0.00010599462319663906, "loss": 0.5898, "step": 81 }, { "epoch": 0.004943481537302185, "grad_norm": 0.255892276763916, "learning_rate": 9.549150281252633e-05, "loss": 0.4779, "step": 82 }, { "epoch": 0.005003767897513188, "grad_norm": 0.3388007581233978, "learning_rate": 8.548121372247918e-05, "loss": 0.4146, "step": 83 }, { "epoch": 0.00506405425772419, "grad_norm": 0.2967192828655243, "learning_rate": 7.597595192178702e-05, "loss": 0.4899, "step": 84 }, { "epoch": 0.005124340617935192, "grad_norm": 0.46650180220603943, "learning_rate": 6.698729810778065e-05, "loss": 0.6348, "step": 85 }, { "epoch": 0.005184626978146195, "grad_norm": 0.32702431082725525, "learning_rate": 5.852620357053651e-05, "loss": 0.538, "step": 86 }, { "epoch": 0.005244913338357197, "grad_norm": 0.391559362411499, "learning_rate": 5.060297685041659e-05, "loss": 0.5989, "step": 87 }, { "epoch": 0.005305199698568199, "grad_norm": 0.38837510347366333, "learning_rate": 4.322727117869951e-05, "loss": 0.5236, "step": 88 }, { "epoch": 0.005365486058779201, "grad_norm": 0.26703765988349915, "learning_rate": 3.6408072716606344e-05, "loss": 0.471, "step": 89 }, { "epoch": 0.005425772418990203, "grad_norm": 0.43834730982780457, "learning_rate": 3.0153689607045842e-05, "loss": 0.591, "step": 90 }, { "epoch": 0.0054860587792012055, "grad_norm": 0.3244725167751312, "learning_rate": 2.4471741852423235e-05, "loss": 0.461, "step": 91 }, { "epoch": 0.005546345139412208, "grad_norm": 0.3419345021247864, "learning_rate": 1.9369152030840554e-05, "loss": 0.568, "step": 92 }, { "epoch": 0.00560663149962321, "grad_norm": 0.3026769161224365, "learning_rate": 1.4852136862001764e-05, "loss": 0.4901, "step": 93 }, { "epoch": 0.005666917859834212, "grad_norm": 0.37205642461776733, "learning_rate": 1.0926199633097156e-05, "loss": 0.6391, "step": 94 }, { "epoch": 0.005727204220045215, "grad_norm": 0.2809191048145294, "learning_rate": 7.59612349389599e-06, "loss": 0.4608, "step": 95 }, { "epoch": 0.005787490580256217, "grad_norm": 0.3068303167819977, "learning_rate": 4.865965629214819e-06, "loss": 0.5105, "step": 96 }, { "epoch": 0.00584777694046722, "grad_norm": 0.2759382724761963, "learning_rate": 2.739052315863355e-06, "loss": 0.4257, "step": 97 }, { "epoch": 0.005908063300678222, "grad_norm": 0.3797454833984375, "learning_rate": 1.2179748700879012e-06, "loss": 0.5798, "step": 98 }, { "epoch": 0.005968349660889224, "grad_norm": 0.25835317373275757, "learning_rate": 3.0458649045211895e-07, "loss": 0.4266, "step": 99 }, { "epoch": 0.006028636021100226, "grad_norm": 0.3680518567562103, "learning_rate": 0.0, "loss": 0.5557, "step": 100 }, { "epoch": 0.006028636021100226, "eval_loss": 0.526295006275177, "eval_runtime": 56.3029, "eval_samples_per_second": 4.724, "eval_steps_per_second": 4.724, "step": 100 } ], "logging_steps": 1, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.65790133256192e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }