|
{ |
|
"best_metric": 0.6337994337081909, |
|
"best_model_checkpoint": "finetuned-bangladeshi-traditional-food/checkpoint-320", |
|
"epoch": 10.0, |
|
"eval_steps": 500, |
|
"global_step": 640, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.15625, |
|
"grad_norm": 260322.40625, |
|
"learning_rate": 0.000196875, |
|
"loss": 0.1664, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.3125, |
|
"grad_norm": 9524.54296875, |
|
"learning_rate": 0.00019375000000000002, |
|
"loss": 0.132, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.46875, |
|
"grad_norm": 232655.25, |
|
"learning_rate": 0.000190625, |
|
"loss": 0.2052, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.625, |
|
"grad_norm": 2797.121337890625, |
|
"learning_rate": 0.0001875, |
|
"loss": 0.1608, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.78125, |
|
"grad_norm": 94306.1328125, |
|
"learning_rate": 0.000184375, |
|
"loss": 0.0879, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.9375, |
|
"grad_norm": 94746.8828125, |
|
"learning_rate": 0.00018125000000000001, |
|
"loss": 0.2159, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.840782122905028, |
|
"eval_loss": 0.6425307393074036, |
|
"eval_runtime": 6.1943, |
|
"eval_samples_per_second": 57.795, |
|
"eval_steps_per_second": 3.713, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 1.09375, |
|
"grad_norm": 224044.515625, |
|
"learning_rate": 0.000178125, |
|
"loss": 0.1128, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 179013.890625, |
|
"learning_rate": 0.000175, |
|
"loss": 0.1238, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.40625, |
|
"grad_norm": 322362.1875, |
|
"learning_rate": 0.00017187500000000002, |
|
"loss": 0.0767, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.5625, |
|
"grad_norm": 486709.90625, |
|
"learning_rate": 0.00016875, |
|
"loss": 0.1934, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.71875, |
|
"grad_norm": 339304.5, |
|
"learning_rate": 0.000165625, |
|
"loss": 0.1545, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.875, |
|
"grad_norm": 400540.96875, |
|
"learning_rate": 0.00016250000000000002, |
|
"loss": 0.1968, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.8547486033519553, |
|
"eval_loss": 0.6727585792541504, |
|
"eval_runtime": 6.1785, |
|
"eval_samples_per_second": 57.943, |
|
"eval_steps_per_second": 3.723, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 2.03125, |
|
"grad_norm": 402626.6875, |
|
"learning_rate": 0.000159375, |
|
"loss": 0.1303, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 2.1875, |
|
"grad_norm": 84834.3515625, |
|
"learning_rate": 0.00015625, |
|
"loss": 0.1479, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 2.34375, |
|
"grad_norm": 120292.5703125, |
|
"learning_rate": 0.000153125, |
|
"loss": 0.1937, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 20186.486328125, |
|
"learning_rate": 0.00015000000000000001, |
|
"loss": 0.0804, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 2.65625, |
|
"grad_norm": 173780.65625, |
|
"learning_rate": 0.000146875, |
|
"loss": 0.0843, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 2.8125, |
|
"grad_norm": 46256.0546875, |
|
"learning_rate": 0.00014375, |
|
"loss": 0.0831, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 2.96875, |
|
"grad_norm": 165973.46875, |
|
"learning_rate": 0.00014062500000000002, |
|
"loss": 0.0686, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.8631284916201117, |
|
"eval_loss": 0.71826171875, |
|
"eval_runtime": 6.475, |
|
"eval_samples_per_second": 55.29, |
|
"eval_steps_per_second": 3.552, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 3.125, |
|
"grad_norm": 811.771240234375, |
|
"learning_rate": 0.0001375, |
|
"loss": 0.0824, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 3.28125, |
|
"grad_norm": 421833.125, |
|
"learning_rate": 0.000134375, |
|
"loss": 0.1451, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 3.4375, |
|
"grad_norm": 347591.78125, |
|
"learning_rate": 0.00013125000000000002, |
|
"loss": 0.1193, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 3.59375, |
|
"grad_norm": 150345.671875, |
|
"learning_rate": 0.000128125, |
|
"loss": 0.1245, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"grad_norm": 370.7461853027344, |
|
"learning_rate": 0.000125, |
|
"loss": 0.0934, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 3.90625, |
|
"grad_norm": 583907.0, |
|
"learning_rate": 0.00012187500000000001, |
|
"loss": 0.0975, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.8463687150837989, |
|
"eval_loss": 0.8238765597343445, |
|
"eval_runtime": 6.2123, |
|
"eval_samples_per_second": 57.628, |
|
"eval_steps_per_second": 3.702, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 4.0625, |
|
"grad_norm": 75910.390625, |
|
"learning_rate": 0.00011875, |
|
"loss": 0.0952, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 4.21875, |
|
"grad_norm": 483136.6875, |
|
"learning_rate": 0.000115625, |
|
"loss": 0.0867, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 4.375, |
|
"grad_norm": 4663.18310546875, |
|
"learning_rate": 0.00011250000000000001, |
|
"loss": 0.0599, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 4.53125, |
|
"grad_norm": 1170.08251953125, |
|
"learning_rate": 0.000109375, |
|
"loss": 0.0671, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 4.6875, |
|
"grad_norm": 286.7016296386719, |
|
"learning_rate": 0.00010625000000000001, |
|
"loss": 0.1158, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 4.84375, |
|
"grad_norm": 237.0006561279297, |
|
"learning_rate": 0.000103125, |
|
"loss": 0.0821, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 137.74867248535156, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0399, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.888268156424581, |
|
"eval_loss": 0.6337994337081909, |
|
"eval_runtime": 6.2754, |
|
"eval_samples_per_second": 57.048, |
|
"eval_steps_per_second": 3.665, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 5.15625, |
|
"grad_norm": 1214.9869384765625, |
|
"learning_rate": 9.687500000000001e-05, |
|
"loss": 0.1064, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 5.3125, |
|
"grad_norm": 534309.6875, |
|
"learning_rate": 9.375e-05, |
|
"loss": 0.0826, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 5.46875, |
|
"grad_norm": 2779.36669921875, |
|
"learning_rate": 9.062500000000001e-05, |
|
"loss": 0.0524, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 5.625, |
|
"grad_norm": 2379.431640625, |
|
"learning_rate": 8.75e-05, |
|
"loss": 0.059, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 5.78125, |
|
"grad_norm": 567636.125, |
|
"learning_rate": 8.4375e-05, |
|
"loss": 0.1303, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 5.9375, |
|
"grad_norm": 582.1480712890625, |
|
"learning_rate": 8.125000000000001e-05, |
|
"loss": 0.0221, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.888268156424581, |
|
"eval_loss": 0.7707696557044983, |
|
"eval_runtime": 6.201, |
|
"eval_samples_per_second": 57.732, |
|
"eval_steps_per_second": 3.709, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 6.09375, |
|
"grad_norm": 60.556556701660156, |
|
"learning_rate": 7.8125e-05, |
|
"loss": 0.0341, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 6.25, |
|
"grad_norm": 59.626529693603516, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 0.047, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 6.40625, |
|
"grad_norm": 303419.21875, |
|
"learning_rate": 7.1875e-05, |
|
"loss": 0.065, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 6.5625, |
|
"grad_norm": 90.21700286865234, |
|
"learning_rate": 6.875e-05, |
|
"loss": 0.0541, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 6.71875, |
|
"grad_norm": 801652.9375, |
|
"learning_rate": 6.562500000000001e-05, |
|
"loss": 0.1373, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 6.875, |
|
"grad_norm": 281023.09375, |
|
"learning_rate": 6.25e-05, |
|
"loss": 0.0202, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.8910614525139665, |
|
"eval_loss": 0.7648816704750061, |
|
"eval_runtime": 6.2429, |
|
"eval_samples_per_second": 57.345, |
|
"eval_steps_per_second": 3.684, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 7.03125, |
|
"grad_norm": 43.21717834472656, |
|
"learning_rate": 5.9375e-05, |
|
"loss": 0.0798, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 7.1875, |
|
"grad_norm": 622758.6875, |
|
"learning_rate": 5.6250000000000005e-05, |
|
"loss": 0.0585, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 7.34375, |
|
"grad_norm": 4335.7919921875, |
|
"learning_rate": 5.3125000000000004e-05, |
|
"loss": 0.0967, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 7.5, |
|
"grad_norm": 56.46971130371094, |
|
"learning_rate": 5e-05, |
|
"loss": 0.0412, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 7.65625, |
|
"grad_norm": 26.425151824951172, |
|
"learning_rate": 4.6875e-05, |
|
"loss": 0.0834, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 7.8125, |
|
"grad_norm": 2260.0234375, |
|
"learning_rate": 4.375e-05, |
|
"loss": 0.0237, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 7.96875, |
|
"grad_norm": 55191.5859375, |
|
"learning_rate": 4.0625000000000005e-05, |
|
"loss": 0.0274, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.9162011173184358, |
|
"eval_loss": 0.7099100351333618, |
|
"eval_runtime": 6.3314, |
|
"eval_samples_per_second": 56.543, |
|
"eval_steps_per_second": 3.633, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 8.125, |
|
"grad_norm": 34213.54296875, |
|
"learning_rate": 3.7500000000000003e-05, |
|
"loss": 0.0295, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 8.28125, |
|
"grad_norm": 35.443115234375, |
|
"learning_rate": 3.4375e-05, |
|
"loss": 0.0814, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 8.4375, |
|
"grad_norm": 141960.859375, |
|
"learning_rate": 3.125e-05, |
|
"loss": 0.0377, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 8.59375, |
|
"grad_norm": 40429.1171875, |
|
"learning_rate": 2.8125000000000003e-05, |
|
"loss": 0.0481, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 8.75, |
|
"grad_norm": 11.297977447509766, |
|
"learning_rate": 2.5e-05, |
|
"loss": 0.0573, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 8.90625, |
|
"grad_norm": 12.592033386230469, |
|
"learning_rate": 2.1875e-05, |
|
"loss": 0.0507, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.9162011173184358, |
|
"eval_loss": 0.7499300837516785, |
|
"eval_runtime": 6.5756, |
|
"eval_samples_per_second": 54.444, |
|
"eval_steps_per_second": 3.498, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 9.0625, |
|
"grad_norm": 38.957550048828125, |
|
"learning_rate": 1.8750000000000002e-05, |
|
"loss": 0.0137, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 9.21875, |
|
"grad_norm": 37808.00390625, |
|
"learning_rate": 1.5625e-05, |
|
"loss": 0.1018, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 9.375, |
|
"grad_norm": 35257.2890625, |
|
"learning_rate": 1.25e-05, |
|
"loss": 0.0305, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 9.53125, |
|
"grad_norm": 14.713729858398438, |
|
"learning_rate": 9.375000000000001e-06, |
|
"loss": 0.0538, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 9.6875, |
|
"grad_norm": 123944.28125, |
|
"learning_rate": 6.25e-06, |
|
"loss": 0.0499, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 9.84375, |
|
"grad_norm": 199116.203125, |
|
"learning_rate": 3.125e-06, |
|
"loss": 0.1264, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 156690.1875, |
|
"learning_rate": 0.0, |
|
"loss": 0.0653, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.9162011173184358, |
|
"eval_loss": 0.7513992786407471, |
|
"eval_runtime": 6.3086, |
|
"eval_samples_per_second": 56.748, |
|
"eval_steps_per_second": 3.646, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"step": 640, |
|
"total_flos": 1.571762912487506e+18, |
|
"train_loss": 0.0904881817754358, |
|
"train_runtime": 704.9996, |
|
"train_samples_per_second": 28.766, |
|
"train_steps_per_second": 0.908 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 640, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.571762912487506e+18, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|