|
{ |
|
"best_metric": 0.0014008664293214679, |
|
"best_model_checkpoint": "./vit-base-beans/checkpoint-1100", |
|
"epoch": 4.0, |
|
"eval_steps": 100, |
|
"global_step": 1160, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.034482758620689655, |
|
"grad_norm": 1.4541093111038208, |
|
"learning_rate": 0.00019827586206896554, |
|
"loss": 0.9417, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.06896551724137931, |
|
"grad_norm": 0.6665103435516357, |
|
"learning_rate": 0.00019655172413793104, |
|
"loss": 0.3226, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.10344827586206896, |
|
"grad_norm": 0.33402901887893677, |
|
"learning_rate": 0.00019482758620689657, |
|
"loss": 0.1528, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.13793103448275862, |
|
"grad_norm": 0.21010038256645203, |
|
"learning_rate": 0.0001931034482758621, |
|
"loss": 0.1077, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.1724137931034483, |
|
"grad_norm": 0.2470974624156952, |
|
"learning_rate": 0.0001913793103448276, |
|
"loss": 0.0676, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.20689655172413793, |
|
"grad_norm": 0.16962118446826935, |
|
"learning_rate": 0.00018965517241379312, |
|
"loss": 0.0449, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.2413793103448276, |
|
"grad_norm": 0.1308588832616806, |
|
"learning_rate": 0.00018793103448275865, |
|
"loss": 0.0346, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.27586206896551724, |
|
"grad_norm": 0.11898835003376007, |
|
"learning_rate": 0.00018620689655172415, |
|
"loss": 0.0318, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.3103448275862069, |
|
"grad_norm": 0.10036325454711914, |
|
"learning_rate": 0.00018448275862068968, |
|
"loss": 0.0425, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.3448275862068966, |
|
"grad_norm": 0.10104402154684067, |
|
"learning_rate": 0.00018275862068965518, |
|
"loss": 0.0532, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.3448275862068966, |
|
"eval_accuracy": 0.9961315280464217, |
|
"eval_loss": 0.027677614241838455, |
|
"eval_runtime": 9.322, |
|
"eval_samples_per_second": 55.46, |
|
"eval_steps_per_second": 6.973, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.3793103448275862, |
|
"grad_norm": 1.163355827331543, |
|
"learning_rate": 0.0001810344827586207, |
|
"loss": 0.1062, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.41379310344827586, |
|
"grad_norm": 0.08375360816717148, |
|
"learning_rate": 0.0001793103448275862, |
|
"loss": 0.0258, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.4482758620689655, |
|
"grad_norm": 0.09952229261398315, |
|
"learning_rate": 0.00017758620689655173, |
|
"loss": 0.0175, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.4827586206896552, |
|
"grad_norm": 0.6083604097366333, |
|
"learning_rate": 0.00017586206896551723, |
|
"loss": 0.0683, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.5172413793103449, |
|
"grad_norm": 0.09937556833028793, |
|
"learning_rate": 0.00017413793103448276, |
|
"loss": 0.0947, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.5517241379310345, |
|
"grad_norm": 0.08655019849538803, |
|
"learning_rate": 0.00017241379310344826, |
|
"loss": 0.0201, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.5862068965517241, |
|
"grad_norm": 0.059486184269189835, |
|
"learning_rate": 0.0001706896551724138, |
|
"loss": 0.0631, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.6206896551724138, |
|
"grad_norm": 0.07637613266706467, |
|
"learning_rate": 0.00016896551724137932, |
|
"loss": 0.032, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.6551724137931034, |
|
"grad_norm": 10.693470001220703, |
|
"learning_rate": 0.00016724137931034482, |
|
"loss": 0.0542, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.6896551724137931, |
|
"grad_norm": 0.09118778258562088, |
|
"learning_rate": 0.00016551724137931035, |
|
"loss": 0.0518, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.6896551724137931, |
|
"eval_accuracy": 0.9941972920696325, |
|
"eval_loss": 0.03303845226764679, |
|
"eval_runtime": 9.2358, |
|
"eval_samples_per_second": 55.978, |
|
"eval_steps_per_second": 7.038, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.7241379310344828, |
|
"grad_norm": 0.04961516335606575, |
|
"learning_rate": 0.00016379310344827587, |
|
"loss": 0.0708, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.7586206896551724, |
|
"grad_norm": 0.058188311755657196, |
|
"learning_rate": 0.00016206896551724137, |
|
"loss": 0.0392, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.7931034482758621, |
|
"grad_norm": 0.041994284838438034, |
|
"learning_rate": 0.0001603448275862069, |
|
"loss": 0.0116, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.8275862068965517, |
|
"grad_norm": 0.05082221329212189, |
|
"learning_rate": 0.00015862068965517243, |
|
"loss": 0.0376, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.8620689655172413, |
|
"grad_norm": 0.07286480814218521, |
|
"learning_rate": 0.00015689655172413793, |
|
"loss": 0.0382, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.896551724137931, |
|
"grad_norm": 0.03850896656513214, |
|
"learning_rate": 0.00015517241379310346, |
|
"loss": 0.0178, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.9310344827586207, |
|
"grad_norm": 0.031015470623970032, |
|
"learning_rate": 0.00015344827586206899, |
|
"loss": 0.0328, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.9655172413793104, |
|
"grad_norm": 0.03393784165382385, |
|
"learning_rate": 0.00015172413793103449, |
|
"loss": 0.0072, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.09719406068325043, |
|
"learning_rate": 0.00015000000000000001, |
|
"loss": 0.0071, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.0344827586206897, |
|
"grad_norm": 0.02531067281961441, |
|
"learning_rate": 0.00014827586206896554, |
|
"loss": 0.006, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.0344827586206897, |
|
"eval_accuracy": 0.9922630560928434, |
|
"eval_loss": 0.038734398782253265, |
|
"eval_runtime": 9.1377, |
|
"eval_samples_per_second": 56.579, |
|
"eval_steps_per_second": 7.113, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.0689655172413792, |
|
"grad_norm": 0.02636132761836052, |
|
"learning_rate": 0.00014655172413793104, |
|
"loss": 0.0057, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.103448275862069, |
|
"grad_norm": 0.023396922275424004, |
|
"learning_rate": 0.00014482758620689657, |
|
"loss": 0.0053, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.1379310344827587, |
|
"grad_norm": 0.022781934589147568, |
|
"learning_rate": 0.0001431034482758621, |
|
"loss": 0.005, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.1724137931034484, |
|
"grad_norm": 0.020923634991049767, |
|
"learning_rate": 0.0001413793103448276, |
|
"loss": 0.0048, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.206896551724138, |
|
"grad_norm": 0.020241938531398773, |
|
"learning_rate": 0.0001396551724137931, |
|
"loss": 0.0046, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.2413793103448276, |
|
"grad_norm": 0.0188324972987175, |
|
"learning_rate": 0.00013793103448275863, |
|
"loss": 0.0046, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.2758620689655173, |
|
"grad_norm": 0.019414547830820084, |
|
"learning_rate": 0.00013620689655172413, |
|
"loss": 0.0043, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.3103448275862069, |
|
"grad_norm": 0.019972946494817734, |
|
"learning_rate": 0.00013448275862068965, |
|
"loss": 0.0075, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.3448275862068966, |
|
"grad_norm": 0.02629067189991474, |
|
"learning_rate": 0.00013275862068965518, |
|
"loss": 0.0377, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.3793103448275863, |
|
"grad_norm": 0.027980973944067955, |
|
"learning_rate": 0.00013103448275862068, |
|
"loss": 0.004, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.3793103448275863, |
|
"eval_accuracy": 0.9845261121856866, |
|
"eval_loss": 0.0695781335234642, |
|
"eval_runtime": 9.2324, |
|
"eval_samples_per_second": 55.998, |
|
"eval_steps_per_second": 7.04, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.4137931034482758, |
|
"grad_norm": 10.141643524169922, |
|
"learning_rate": 0.0001293103448275862, |
|
"loss": 0.0441, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.4482758620689655, |
|
"grad_norm": 0.017165783792734146, |
|
"learning_rate": 0.00012758620689655174, |
|
"loss": 0.0049, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.4827586206896552, |
|
"grad_norm": 0.01630399189889431, |
|
"learning_rate": 0.00012586206896551724, |
|
"loss": 0.0492, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.5172413793103448, |
|
"grad_norm": 0.018257714807987213, |
|
"learning_rate": 0.00012413793103448277, |
|
"loss": 0.0526, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.5517241379310345, |
|
"grad_norm": 0.01886160485446453, |
|
"learning_rate": 0.00012241379310344827, |
|
"loss": 0.0402, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.5862068965517242, |
|
"grad_norm": 0.01721111685037613, |
|
"learning_rate": 0.0001206896551724138, |
|
"loss": 0.0359, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.6206896551724137, |
|
"grad_norm": 0.020360512658953667, |
|
"learning_rate": 0.00011896551724137932, |
|
"loss": 0.0051, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.6551724137931034, |
|
"grad_norm": 0.01872898079454899, |
|
"learning_rate": 0.00011724137931034482, |
|
"loss": 0.004, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.6896551724137931, |
|
"grad_norm": 0.015705358237028122, |
|
"learning_rate": 0.00011551724137931035, |
|
"loss": 0.0036, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.7241379310344827, |
|
"grad_norm": 0.022712524980306625, |
|
"learning_rate": 0.00011379310344827588, |
|
"loss": 0.0032, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.7241379310344827, |
|
"eval_accuracy": 0.9690522243713733, |
|
"eval_loss": 0.16091904044151306, |
|
"eval_runtime": 9.2258, |
|
"eval_samples_per_second": 56.039, |
|
"eval_steps_per_second": 7.045, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.7586206896551724, |
|
"grad_norm": 0.02154046855866909, |
|
"learning_rate": 0.00011206896551724138, |
|
"loss": 0.0837, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.793103448275862, |
|
"grad_norm": 0.01605401560664177, |
|
"learning_rate": 0.0001103448275862069, |
|
"loss": 0.0033, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.8275862068965516, |
|
"grad_norm": 0.04890372231602669, |
|
"learning_rate": 0.00010862068965517242, |
|
"loss": 0.0431, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.8620689655172413, |
|
"grad_norm": 0.017495185136795044, |
|
"learning_rate": 0.00010689655172413792, |
|
"loss": 0.0038, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.896551724137931, |
|
"grad_norm": 0.017458785325288773, |
|
"learning_rate": 0.00010517241379310345, |
|
"loss": 0.0036, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.9310344827586206, |
|
"grad_norm": 0.01829078048467636, |
|
"learning_rate": 0.00010344827586206898, |
|
"loss": 0.003, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.9655172413793105, |
|
"grad_norm": 0.015696866437792778, |
|
"learning_rate": 0.00010172413793103448, |
|
"loss": 0.0029, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.015601696446537971, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0027, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 2.0344827586206895, |
|
"grad_norm": 0.014859354123473167, |
|
"learning_rate": 9.827586206896552e-05, |
|
"loss": 0.0027, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 2.0689655172413794, |
|
"grad_norm": 0.013295358046889305, |
|
"learning_rate": 9.655172413793105e-05, |
|
"loss": 0.0026, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 2.0689655172413794, |
|
"eval_accuracy": 1.0, |
|
"eval_loss": 0.0025033194106072187, |
|
"eval_runtime": 9.5465, |
|
"eval_samples_per_second": 54.156, |
|
"eval_steps_per_second": 6.809, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 2.103448275862069, |
|
"grad_norm": 0.015505661256611347, |
|
"learning_rate": 9.482758620689656e-05, |
|
"loss": 0.0025, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 2.1379310344827585, |
|
"grad_norm": 0.010766665451228619, |
|
"learning_rate": 9.310344827586207e-05, |
|
"loss": 0.0024, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 2.1724137931034484, |
|
"grad_norm": 0.010864505544304848, |
|
"learning_rate": 9.137931034482759e-05, |
|
"loss": 0.0023, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 2.206896551724138, |
|
"grad_norm": 0.009982941672205925, |
|
"learning_rate": 8.96551724137931e-05, |
|
"loss": 0.0023, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 2.2413793103448274, |
|
"grad_norm": 0.009656297974288464, |
|
"learning_rate": 8.793103448275862e-05, |
|
"loss": 0.0022, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 2.2758620689655173, |
|
"grad_norm": 0.009799299761652946, |
|
"learning_rate": 8.620689655172413e-05, |
|
"loss": 0.0022, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 2.310344827586207, |
|
"grad_norm": 0.01052001304924488, |
|
"learning_rate": 8.448275862068966e-05, |
|
"loss": 0.0022, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 2.344827586206897, |
|
"grad_norm": 0.009396790526807308, |
|
"learning_rate": 8.275862068965517e-05, |
|
"loss": 0.0021, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 2.3793103448275863, |
|
"grad_norm": 0.00975924450904131, |
|
"learning_rate": 8.103448275862069e-05, |
|
"loss": 0.0021, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 2.413793103448276, |
|
"grad_norm": 0.009396582841873169, |
|
"learning_rate": 7.931034482758621e-05, |
|
"loss": 0.0021, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.413793103448276, |
|
"eval_accuracy": 1.0, |
|
"eval_loss": 0.002028640592470765, |
|
"eval_runtime": 9.6653, |
|
"eval_samples_per_second": 53.49, |
|
"eval_steps_per_second": 6.725, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.4482758620689653, |
|
"grad_norm": 0.008813206106424332, |
|
"learning_rate": 7.758620689655173e-05, |
|
"loss": 0.002, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 2.4827586206896552, |
|
"grad_norm": 0.008702595718204975, |
|
"learning_rate": 7.586206896551724e-05, |
|
"loss": 0.002, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 2.5172413793103448, |
|
"grad_norm": 0.010061556473374367, |
|
"learning_rate": 7.413793103448277e-05, |
|
"loss": 0.002, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 2.5517241379310347, |
|
"grad_norm": 0.010726147331297398, |
|
"learning_rate": 7.241379310344828e-05, |
|
"loss": 0.0019, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 2.586206896551724, |
|
"grad_norm": 0.009536409750580788, |
|
"learning_rate": 7.06896551724138e-05, |
|
"loss": 0.0019, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 2.6206896551724137, |
|
"grad_norm": 0.008245566859841347, |
|
"learning_rate": 6.896551724137931e-05, |
|
"loss": 0.0019, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 2.655172413793103, |
|
"grad_norm": 0.010203349404036999, |
|
"learning_rate": 6.724137931034483e-05, |
|
"loss": 0.0019, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 2.689655172413793, |
|
"grad_norm": 0.008181954734027386, |
|
"learning_rate": 6.551724137931034e-05, |
|
"loss": 0.0018, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 2.7241379310344827, |
|
"grad_norm": 0.008396759629249573, |
|
"learning_rate": 6.379310344827587e-05, |
|
"loss": 0.0018, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 2.7586206896551726, |
|
"grad_norm": 0.008574232459068298, |
|
"learning_rate": 6.206896551724138e-05, |
|
"loss": 0.0018, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.7586206896551726, |
|
"eval_accuracy": 1.0, |
|
"eval_loss": 0.0017438590293750167, |
|
"eval_runtime": 9.2473, |
|
"eval_samples_per_second": 55.908, |
|
"eval_steps_per_second": 7.029, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.793103448275862, |
|
"grad_norm": 0.008582869544625282, |
|
"learning_rate": 6.03448275862069e-05, |
|
"loss": 0.0017, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 2.8275862068965516, |
|
"grad_norm": 0.00796087272465229, |
|
"learning_rate": 5.862068965517241e-05, |
|
"loss": 0.0017, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 2.862068965517241, |
|
"grad_norm": 0.00767522631213069, |
|
"learning_rate": 5.689655172413794e-05, |
|
"loss": 0.0017, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 2.896551724137931, |
|
"grad_norm": 0.007729759905487299, |
|
"learning_rate": 5.517241379310345e-05, |
|
"loss": 0.0017, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 2.9310344827586206, |
|
"grad_norm": 0.007915050722658634, |
|
"learning_rate": 5.344827586206896e-05, |
|
"loss": 0.0016, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 2.9655172413793105, |
|
"grad_norm": 0.008189872838556767, |
|
"learning_rate": 5.172413793103449e-05, |
|
"loss": 0.0016, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.007728048134595156, |
|
"learning_rate": 5e-05, |
|
"loss": 0.0016, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 3.0344827586206895, |
|
"grad_norm": 0.007373048458248377, |
|
"learning_rate": 4.827586206896552e-05, |
|
"loss": 0.0016, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 3.0689655172413794, |
|
"grad_norm": 0.00832042470574379, |
|
"learning_rate": 4.655172413793104e-05, |
|
"loss": 0.0016, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 3.103448275862069, |
|
"grad_norm": 0.007547506596893072, |
|
"learning_rate": 4.482758620689655e-05, |
|
"loss": 0.0016, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 3.103448275862069, |
|
"eval_accuracy": 1.0, |
|
"eval_loss": 0.0015662283403798938, |
|
"eval_runtime": 9.1484, |
|
"eval_samples_per_second": 56.513, |
|
"eval_steps_per_second": 7.105, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 3.1379310344827585, |
|
"grad_norm": 0.007507437374442816, |
|
"learning_rate": 4.3103448275862066e-05, |
|
"loss": 0.0016, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 3.1724137931034484, |
|
"grad_norm": 0.008085578680038452, |
|
"learning_rate": 4.1379310344827587e-05, |
|
"loss": 0.0015, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 3.206896551724138, |
|
"grad_norm": 0.007526910398155451, |
|
"learning_rate": 3.965517241379311e-05, |
|
"loss": 0.0015, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 3.2413793103448274, |
|
"grad_norm": 0.007942083291709423, |
|
"learning_rate": 3.793103448275862e-05, |
|
"loss": 0.0015, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 3.2758620689655173, |
|
"grad_norm": 0.007609126623719931, |
|
"learning_rate": 3.620689655172414e-05, |
|
"loss": 0.0015, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 3.310344827586207, |
|
"grad_norm": 0.007188369985669851, |
|
"learning_rate": 3.4482758620689657e-05, |
|
"loss": 0.0015, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 3.344827586206897, |
|
"grad_norm": 0.0077166566625237465, |
|
"learning_rate": 3.275862068965517e-05, |
|
"loss": 0.0015, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 3.3793103448275863, |
|
"grad_norm": 0.007557329256087542, |
|
"learning_rate": 3.103448275862069e-05, |
|
"loss": 0.0015, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 3.413793103448276, |
|
"grad_norm": 0.006876759696751833, |
|
"learning_rate": 2.9310344827586206e-05, |
|
"loss": 0.0015, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 3.4482758620689653, |
|
"grad_norm": 0.007274131290614605, |
|
"learning_rate": 2.7586206896551727e-05, |
|
"loss": 0.0015, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 3.4482758620689653, |
|
"eval_accuracy": 1.0, |
|
"eval_loss": 0.001456769765354693, |
|
"eval_runtime": 9.1926, |
|
"eval_samples_per_second": 56.241, |
|
"eval_steps_per_second": 7.071, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 3.4827586206896552, |
|
"grad_norm": 0.007954578846693039, |
|
"learning_rate": 2.5862068965517244e-05, |
|
"loss": 0.0015, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 3.5172413793103448, |
|
"grad_norm": 0.007275938987731934, |
|
"learning_rate": 2.413793103448276e-05, |
|
"loss": 0.0015, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 3.5517241379310347, |
|
"grad_norm": 0.006545715499669313, |
|
"learning_rate": 2.2413793103448276e-05, |
|
"loss": 0.0014, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 3.586206896551724, |
|
"grad_norm": 0.006517117377370596, |
|
"learning_rate": 2.0689655172413793e-05, |
|
"loss": 0.0014, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 3.6206896551724137, |
|
"grad_norm": 0.006700526457279921, |
|
"learning_rate": 1.896551724137931e-05, |
|
"loss": 0.0014, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 3.655172413793103, |
|
"grad_norm": 0.006489131134003401, |
|
"learning_rate": 1.7241379310344828e-05, |
|
"loss": 0.0014, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 3.689655172413793, |
|
"grad_norm": 0.006455055437982082, |
|
"learning_rate": 1.5517241379310346e-05, |
|
"loss": 0.0014, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 3.7241379310344827, |
|
"grad_norm": 0.006427043117582798, |
|
"learning_rate": 1.3793103448275863e-05, |
|
"loss": 0.0014, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 3.7586206896551726, |
|
"grad_norm": 0.006561470218002796, |
|
"learning_rate": 1.206896551724138e-05, |
|
"loss": 0.0014, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 3.793103448275862, |
|
"grad_norm": 0.007191502954810858, |
|
"learning_rate": 1.0344827586206897e-05, |
|
"loss": 0.0014, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 3.793103448275862, |
|
"eval_accuracy": 1.0, |
|
"eval_loss": 0.0014008664293214679, |
|
"eval_runtime": 9.2274, |
|
"eval_samples_per_second": 56.029, |
|
"eval_steps_per_second": 7.044, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 3.8275862068965516, |
|
"grad_norm": 0.006780336145311594, |
|
"learning_rate": 8.620689655172414e-06, |
|
"loss": 0.0014, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 3.862068965517241, |
|
"grad_norm": 0.006570115685462952, |
|
"learning_rate": 6.896551724137932e-06, |
|
"loss": 0.0014, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 3.896551724137931, |
|
"grad_norm": 0.00782351940870285, |
|
"learning_rate": 5.172413793103448e-06, |
|
"loss": 0.0014, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 3.9310344827586206, |
|
"grad_norm": 0.006399817299097776, |
|
"learning_rate": 3.448275862068966e-06, |
|
"loss": 0.0014, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 3.9655172413793105, |
|
"grad_norm": 0.00720432261005044, |
|
"learning_rate": 1.724137931034483e-06, |
|
"loss": 0.0014, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.006754900328814983, |
|
"learning_rate": 0.0, |
|
"loss": 0.0014, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"step": 1160, |
|
"total_flos": 1.436727240856535e+18, |
|
"train_loss": 0.027367618223170524, |
|
"train_runtime": 673.711, |
|
"train_samples_per_second": 27.519, |
|
"train_steps_per_second": 1.722 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1160, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.436727240856535e+18, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|