{
"best_metric": 64.42910915934755,
"best_model_checkpoint": "/scratch/camembertv2/runs/results/fquad/camembertav2-base-bf16-p2-17000/max_seq_length-896-doc_stride-128-max_answer_length-30-gradient_accumulation_steps-2-precision-fp32-learning_rate-3e-05-epochs-6-lr_scheduler-cosine-warmup_steps-0/SEED-1/checkpoint-6480",
"epoch": 6.0,
"eval_steps": 500,
"global_step": 7776,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.07716049382716049,
"grad_norm": 23.665807723999023,
"learning_rate": 2.998775977415799e-05,
"loss": 3.705,
"step": 100
},
{
"epoch": 0.15432098765432098,
"grad_norm": 20.01573944091797,
"learning_rate": 2.9951059073049117e-05,
"loss": 1.6565,
"step": 200
},
{
"epoch": 0.23148148148148148,
"grad_norm": 12.386185646057129,
"learning_rate": 2.988995779332273e-05,
"loss": 1.4648,
"step": 300
},
{
"epoch": 0.30864197530864196,
"grad_norm": 18.621143341064453,
"learning_rate": 2.980455565410724e-05,
"loss": 1.243,
"step": 400
},
{
"epoch": 0.38580246913580246,
"grad_norm": 15.412405967712402,
"learning_rate": 2.96949920342655e-05,
"loss": 1.1382,
"step": 500
},
{
"epoch": 0.46296296296296297,
"grad_norm": 18.231019973754883,
"learning_rate": 2.95614457449243e-05,
"loss": 1.1534,
"step": 600
},
{
"epoch": 0.5401234567901234,
"grad_norm": 16.341306686401367,
"learning_rate": 2.940413473764923e-05,
"loss": 1.0392,
"step": 700
},
{
"epoch": 0.6172839506172839,
"grad_norm": 12.782474517822266,
"learning_rate": 2.9223315748741146e-05,
"loss": 1.0394,
"step": 800
},
{
"epoch": 0.6944444444444444,
"grad_norm": 15.871848106384277,
"learning_rate": 2.9019283880234828e-05,
"loss": 0.9878,
"step": 900
},
{
"epoch": 0.7716049382716049,
"grad_norm": 8.874638557434082,
"learning_rate": 2.879237211828353e-05,
"loss": 1.0025,
"step": 1000
},
{
"epoch": 0.8487654320987654,
"grad_norm": 19.52781867980957,
"learning_rate": 2.8542950789715587e-05,
"loss": 0.9396,
"step": 1100
},
{
"epoch": 0.9259259259259259,
"grad_norm": 11.584712982177734,
"learning_rate": 2.8271426957649866e-05,
"loss": 0.9625,
"step": 1200
},
{
"epoch": 1.0,
"eval_exact_match": 61.91969887076537,
"eval_f1": 81.01142429829103,
"eval_runtime": 46.1272,
"eval_samples_per_second": 69.113,
"eval_steps_per_second": 1.084,
"step": 1296
},
{
"epoch": 1.0030864197530864,
"grad_norm": 7.031412601470947,
"learning_rate": 2.7978243757156497e-05,
"loss": 0.9411,
"step": 1300
},
{
"epoch": 1.0802469135802468,
"grad_norm": 11.493968963623047,
"learning_rate": 2.7663879672047095e-05,
"loss": 0.6781,
"step": 1400
},
{
"epoch": 1.1574074074074074,
"grad_norm": 5.261780738830566,
"learning_rate": 2.732884775397477e-05,
"loss": 0.6742,
"step": 1500
},
{
"epoch": 1.2345679012345678,
"grad_norm": 13.310417175292969,
"learning_rate": 2.6973694785118392e-05,
"loss": 0.7004,
"step": 1600
},
{
"epoch": 1.3117283950617284,
"grad_norm": 17.390798568725586,
"learning_rate": 2.65990003858176e-05,
"loss": 0.6512,
"step": 1700
},
{
"epoch": 1.3888888888888888,
"grad_norm": 14.149763107299805,
"learning_rate": 2.620537606861494e-05,
"loss": 0.7074,
"step": 1800
},
{
"epoch": 1.4660493827160495,
"grad_norm": 5.80026912689209,
"learning_rate": 2.5793464240249014e-05,
"loss": 0.6629,
"step": 1900
},
{
"epoch": 1.5432098765432098,
"grad_norm": 17.484006881713867,
"learning_rate": 2.536393715322732e-05,
"loss": 0.6993,
"step": 2000
},
{
"epoch": 1.6203703703703702,
"grad_norm": 21.80438804626465,
"learning_rate": 2.49174958086899e-05,
"loss": 0.6408,
"step": 2100
},
{
"epoch": 1.6975308641975309,
"grad_norm": 19.416994094848633,
"learning_rate": 2.4454868812354406e-05,
"loss": 0.6592,
"step": 2200
},
{
"epoch": 1.7746913580246915,
"grad_norm": 5.628990650177002,
"learning_rate": 2.3976811185409607e-05,
"loss": 0.622,
"step": 2300
},
{
"epoch": 1.8518518518518519,
"grad_norm": 14.69774341583252,
"learning_rate": 2.3484103132298082e-05,
"loss": 0.645,
"step": 2400
},
{
"epoch": 1.9290123456790123,
"grad_norm": 13.895576477050781,
"learning_rate": 2.297754876739905e-05,
"loss": 0.6746,
"step": 2500
},
{
"epoch": 2.0,
"eval_exact_match": 63.26850690087829,
"eval_f1": 82.69346269167056,
"eval_runtime": 45.8773,
"eval_samples_per_second": 69.49,
"eval_steps_per_second": 1.09,
"step": 2592
},
{
"epoch": 2.006172839506173,
"grad_norm": 7.386751651763916,
"learning_rate": 2.2457974802689542e-05,
"loss": 0.6472,
"step": 2600
},
{
"epoch": 2.0833333333333335,
"grad_norm": 12.677722930908203,
"learning_rate": 2.192622919852551e-05,
"loss": 0.4365,
"step": 2700
},
{
"epoch": 2.1604938271604937,
"grad_norm": 7.171934127807617,
"learning_rate": 2.138317977974501e-05,
"loss": 0.4287,
"step": 2800
},
{
"epoch": 2.2376543209876543,
"grad_norm": 10.016780853271484,
"learning_rate": 2.082971281935195e-05,
"loss": 0.4462,
"step": 2900
},
{
"epoch": 2.314814814814815,
"grad_norm": 21.301910400390625,
"learning_rate": 2.0266731592091834e-05,
"loss": 0.4425,
"step": 3000
},
{
"epoch": 2.3919753086419755,
"grad_norm": 21.78326988220215,
"learning_rate": 1.969515490028019e-05,
"loss": 0.425,
"step": 3100
},
{
"epoch": 2.4691358024691357,
"grad_norm": 17.772539138793945,
"learning_rate": 1.9115915574289523e-05,
"loss": 0.4181,
"step": 3200
},
{
"epoch": 2.5462962962962963,
"grad_norm": 7.547439098358154,
"learning_rate": 1.8529958950142064e-05,
"loss": 0.4233,
"step": 3300
},
{
"epoch": 2.623456790123457,
"grad_norm": 9.031538963317871,
"learning_rate": 1.7938241326692906e-05,
"loss": 0.4691,
"step": 3400
},
{
"epoch": 2.700617283950617,
"grad_norm": 9.722735404968262,
"learning_rate": 1.734172840492147e-05,
"loss": 0.4498,
"step": 3500
},
{
"epoch": 2.7777777777777777,
"grad_norm": 9.985281944274902,
"learning_rate": 1.6741393711878455e-05,
"loss": 0.4388,
"step": 3600
},
{
"epoch": 2.8549382716049383,
"grad_norm": 9.514204978942871,
"learning_rate": 1.6138217011860335e-05,
"loss": 0.4501,
"step": 3700
},
{
"epoch": 2.932098765432099,
"grad_norm": 16.88687515258789,
"learning_rate": 1.5533182707404563e-05,
"loss": 0.4172,
"step": 3800
},
{
"epoch": 3.0,
"eval_exact_match": 63.676286072772896,
"eval_f1": 82.60726439387956,
"eval_runtime": 45.9653,
"eval_samples_per_second": 69.357,
"eval_steps_per_second": 1.088,
"step": 3888
},
{
"epoch": 3.009259259259259,
"grad_norm": 4.132925033569336,
"learning_rate": 1.4927278232714974e-05,
"loss": 0.3689,
"step": 3900
},
{
"epoch": 3.0864197530864197,
"grad_norm": 9.779620170593262,
"learning_rate": 1.4321492442139406e-05,
"loss": 0.2905,
"step": 4000
},
{
"epoch": 3.1635802469135803,
"grad_norm": 7.350837230682373,
"learning_rate": 1.371681399632967e-05,
"loss": 0.2937,
"step": 4100
},
{
"epoch": 3.240740740740741,
"grad_norm": 6.923620223999023,
"learning_rate": 1.3114229748717562e-05,
"loss": 0.2922,
"step": 4200
},
{
"epoch": 3.317901234567901,
"grad_norm": 16.84642791748047,
"learning_rate": 1.2514723134940363e-05,
"loss": 0.28,
"step": 4300
},
{
"epoch": 3.3950617283950617,
"grad_norm": 22.180021286010742,
"learning_rate": 1.191927256784427e-05,
"loss": 0.2907,
"step": 4400
},
{
"epoch": 3.4722222222222223,
"grad_norm": 2.5661354064941406,
"learning_rate": 1.1328849840685143e-05,
"loss": 0.2806,
"step": 4500
},
{
"epoch": 3.549382716049383,
"grad_norm": 11.584675788879395,
"learning_rate": 1.0744418541132676e-05,
"loss": 0.2963,
"step": 4600
},
{
"epoch": 3.626543209876543,
"grad_norm": 5.476423740386963,
"learning_rate": 1.0166932478666293e-05,
"loss": 0.3199,
"step": 4700
},
{
"epoch": 3.7037037037037037,
"grad_norm": 9.405366897583008,
"learning_rate": 9.597334127929346e-06,
"loss": 0.3107,
"step": 4800
},
{
"epoch": 3.7808641975308643,
"grad_norm": 8.880900382995605,
"learning_rate": 9.036553090582144e-06,
"loss": 0.2991,
"step": 4900
},
{
"epoch": 3.8580246913580245,
"grad_norm": 3.8892629146575928,
"learning_rate": 8.485504578164017e-06,
"loss": 0.2716,
"step": 5000
},
{
"epoch": 3.935185185185185,
"grad_norm": 5.704967498779297,
"learning_rate": 7.945087918440563e-06,
"loss": 0.2688,
"step": 5100
},
{
"epoch": 4.0,
"eval_exact_match": 64.2409033877039,
"eval_f1": 83.135484930466,
"eval_runtime": 45.9298,
"eval_samples_per_second": 69.41,
"eval_steps_per_second": 1.089,
"step": 5184
},
{
"epoch": 4.012345679012346,
"grad_norm": 16.147579193115234,
"learning_rate": 7.416185087673616e-06,
"loss": 0.2919,
"step": 5200
},
{
"epoch": 4.089506172839506,
"grad_norm": 13.380005836486816,
"learning_rate": 6.899659271209459e-06,
"loss": 0.2068,
"step": 5300
},
{
"epoch": 4.166666666666667,
"grad_norm": 9.491084098815918,
"learning_rate": 6.3963534547343126e-06,
"loss": 0.2009,
"step": 5400
},
{
"epoch": 4.243827160493828,
"grad_norm": 14.11040210723877,
"learning_rate": 5.907089048496351e-06,
"loss": 0.2124,
"step": 5500
},
{
"epoch": 4.320987654320987,
"grad_norm": 12.674304962158203,
"learning_rate": 5.4326645467394085e-06,
"loss": 0.2173,
"step": 5600
},
{
"epoch": 4.398148148148148,
"grad_norm": 5.682621955871582,
"learning_rate": 4.973854224536363e-06,
"loss": 0.213,
"step": 5700
},
{
"epoch": 4.4753086419753085,
"grad_norm": 5.133475303649902,
"learning_rate": 4.5314068741488615e-06,
"loss": 0.2,
"step": 5800
},
{
"epoch": 4.552469135802469,
"grad_norm": 6.370384693145752,
"learning_rate": 4.1060445829758305e-06,
"loss": 0.197,
"step": 5900
},
{
"epoch": 4.62962962962963,
"grad_norm": 16.37765884399414,
"learning_rate": 3.6984615550850894e-06,
"loss": 0.2051,
"step": 6000
},
{
"epoch": 4.70679012345679,
"grad_norm": 11.54761791229248,
"learning_rate": 3.3093229782514023e-06,
"loss": 0.1733,
"step": 6100
},
{
"epoch": 4.783950617283951,
"grad_norm": 22.175281524658203,
"learning_rate": 2.939263938350012e-06,
"loss": 0.2003,
"step": 6200
},
{
"epoch": 4.861111111111111,
"grad_norm": 1.2753137350082397,
"learning_rate": 2.588888382877342e-06,
"loss": 0.194,
"step": 6300
},
{
"epoch": 4.938271604938271,
"grad_norm": 32.319236755371094,
"learning_rate": 2.2587681352905404e-06,
"loss": 0.2149,
"step": 6400
},
{
"epoch": 5.0,
"eval_exact_match": 64.42910915934755,
"eval_f1": 83.36016013340664,
"eval_runtime": 45.8927,
"eval_samples_per_second": 69.466,
"eval_steps_per_second": 1.089,
"step": 6480
},
{
"epoch": 5.015432098765432,
"grad_norm": 9.053484916687012,
"learning_rate": 1.9494419617743312e-06,
"loss": 0.198,
"step": 6500
},
{
"epoch": 5.092592592592593,
"grad_norm": 11.60733699798584,
"learning_rate": 1.6614146919584094e-06,
"loss": 0.1512,
"step": 6600
},
{
"epoch": 5.169753086419753,
"grad_norm": 9.327279090881348,
"learning_rate": 1.3951563950202656e-06,
"loss": 0.167,
"step": 6700
},
{
"epoch": 5.246913580246914,
"grad_norm": 4.551391124725342,
"learning_rate": 1.1511016125181445e-06,
"loss": 0.1315,
"step": 6800
},
{
"epoch": 5.324074074074074,
"grad_norm": 4.2411274909973145,
"learning_rate": 9.296486492061334e-07,
"loss": 0.1532,
"step": 6900
},
{
"epoch": 5.401234567901234,
"grad_norm": 5.1193461418151855,
"learning_rate": 7.311589229888083e-07,
"loss": 0.1624,
"step": 7000
},
{
"epoch": 5.478395061728395,
"grad_norm": 1.735378384590149,
"learning_rate": 5.55956375076332e-07,
"loss": 0.1688,
"step": 7100
},
{
"epoch": 5.555555555555555,
"grad_norm": 3.7359869480133057,
"learning_rate": 4.043269413026429e-07,
"loss": 0.148,
"step": 7200
},
{
"epoch": 5.632716049382716,
"grad_norm": 6.915912628173828,
"learning_rate": 2.7651808546956646e-07,
"loss": 0.1822,
"step": 7300
},
{
"epoch": 5.709876543209877,
"grad_norm": 5.2128448486328125,
"learning_rate": 1.727383954784373e-07,
"loss": 0.163,
"step": 7400
},
{
"epoch": 5.787037037037037,
"grad_norm": 3.4656639099121094,
"learning_rate": 9.315724290836047e-08,
"loss": 0.1716,
"step": 7500
},
{
"epoch": 5.864197530864198,
"grad_norm": 21.396251678466797,
"learning_rate": 3.790450659670097e-08,
"loss": 0.1694,
"step": 7600
},
{
"epoch": 5.9413580246913575,
"grad_norm": 14.200843811035156,
"learning_rate": 7.070360672907228e-09,
"loss": 0.1618,
"step": 7700
},
{
"epoch": 6.0,
"eval_exact_match": 64.0840652446675,
"eval_f1": 83.12314115625247,
"eval_runtime": 45.7565,
"eval_samples_per_second": 69.673,
"eval_steps_per_second": 1.093,
"step": 7776
},
{
"epoch": 6.0,
"step": 7776,
"total_flos": 2.0394634246921464e+16,
"train_loss": 0.5145930189164087,
"train_runtime": 3736.1381,
"train_samples_per_second": 33.293,
"train_steps_per_second": 2.081
}
],
"logging_steps": 100,
"max_steps": 7776,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.0394634246921464e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}