|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.32, |
|
"eval_steps": 500, |
|
"global_step": 200, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.008, |
|
"grad_norm": 0.8485889434814453, |
|
"learning_rate": 0.0001999964908278481, |
|
"loss": 1.2049, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.016, |
|
"grad_norm": 0.47789862751960754, |
|
"learning_rate": 0.00019998596355767805, |
|
"loss": 0.9333, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.024, |
|
"grad_norm": 1.017558217048645, |
|
"learning_rate": 0.00019996841892833, |
|
"loss": 0.8671, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.032, |
|
"grad_norm": 0.6610977053642273, |
|
"learning_rate": 0.00019994385817114646, |
|
"loss": 0.7979, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.6075429320335388, |
|
"learning_rate": 0.00019991228300988585, |
|
"loss": 0.7662, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.048, |
|
"grad_norm": 0.6595763564109802, |
|
"learning_rate": 0.00019987369566060176, |
|
"loss": 0.7929, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.056, |
|
"grad_norm": 0.6968618035316467, |
|
"learning_rate": 0.00019982809883148722, |
|
"loss": 0.7683, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.064, |
|
"grad_norm": 0.4889592230319977, |
|
"learning_rate": 0.00019977549572268468, |
|
"loss": 0.8667, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.072, |
|
"grad_norm": 0.6651108264923096, |
|
"learning_rate": 0.0001997158900260614, |
|
"loss": 0.8446, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.5898510217666626, |
|
"learning_rate": 0.00019964928592495045, |
|
"loss": 0.9051, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.088, |
|
"grad_norm": 0.4398016035556793, |
|
"learning_rate": 0.00019957568809385694, |
|
"loss": 0.7235, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.096, |
|
"grad_norm": 0.6901968121528625, |
|
"learning_rate": 0.00019949510169813003, |
|
"loss": 0.8169, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.104, |
|
"grad_norm": 0.6267213225364685, |
|
"learning_rate": 0.00019940753239360047, |
|
"loss": 0.8266, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.112, |
|
"grad_norm": 0.48524895310401917, |
|
"learning_rate": 0.00019931298632618356, |
|
"loss": 0.758, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.5294132232666016, |
|
"learning_rate": 0.0001992114701314478, |
|
"loss": 0.7759, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.128, |
|
"grad_norm": 0.48957982659339905, |
|
"learning_rate": 0.0001991029909341493, |
|
"loss": 0.7797, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.136, |
|
"grad_norm": 0.645412802696228, |
|
"learning_rate": 0.00019898755634773158, |
|
"loss": 0.7437, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.144, |
|
"grad_norm": 0.43297675251960754, |
|
"learning_rate": 0.0001988651744737914, |
|
"loss": 0.8043, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.152, |
|
"grad_norm": 0.5513920783996582, |
|
"learning_rate": 0.00019873585390151003, |
|
"loss": 0.7701, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.8462435007095337, |
|
"learning_rate": 0.0001985996037070505, |
|
"loss": 0.709, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.168, |
|
"grad_norm": 0.6892585158348083, |
|
"learning_rate": 0.00019845643345292054, |
|
"loss": 0.7377, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.176, |
|
"grad_norm": 0.4617864191532135, |
|
"learning_rate": 0.00019830635318730154, |
|
"loss": 0.8352, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.184, |
|
"grad_norm": 0.6300354599952698, |
|
"learning_rate": 0.0001981493734433433, |
|
"loss": 0.7738, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.192, |
|
"grad_norm": 0.8086859583854675, |
|
"learning_rate": 0.0001979855052384247, |
|
"loss": 0.8067, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.6272985935211182, |
|
"learning_rate": 0.00019781476007338058, |
|
"loss": 0.7456, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.208, |
|
"grad_norm": 0.44750839471817017, |
|
"learning_rate": 0.00019763714993169452, |
|
"loss": 0.758, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.216, |
|
"grad_norm": 0.5053977370262146, |
|
"learning_rate": 0.00019745268727865774, |
|
"loss": 0.7895, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.224, |
|
"grad_norm": 0.41920769214630127, |
|
"learning_rate": 0.00019726138506049438, |
|
"loss": 0.7302, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.232, |
|
"grad_norm": 0.38280290365219116, |
|
"learning_rate": 0.00019706325670345275, |
|
"loss": 0.8152, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.554710865020752, |
|
"learning_rate": 0.0001968583161128631, |
|
"loss": 0.8461, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.248, |
|
"grad_norm": 0.5612509250640869, |
|
"learning_rate": 0.00019664657767216176, |
|
"loss": 0.7787, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.256, |
|
"grad_norm": 0.610614538192749, |
|
"learning_rate": 0.00019642805624188147, |
|
"loss": 0.7574, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.264, |
|
"grad_norm": 0.679517924785614, |
|
"learning_rate": 0.0001962027671586086, |
|
"loss": 0.8487, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.272, |
|
"grad_norm": 0.6685434579849243, |
|
"learning_rate": 0.00019597072623390668, |
|
"loss": 0.6611, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.480293869972229, |
|
"learning_rate": 0.00019573194975320673, |
|
"loss": 0.7802, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.288, |
|
"grad_norm": 0.7727369070053101, |
|
"learning_rate": 0.00019548645447466431, |
|
"loss": 0.6727, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.296, |
|
"grad_norm": 0.6371043920516968, |
|
"learning_rate": 0.00019523425762798329, |
|
"loss": 0.7502, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.304, |
|
"grad_norm": 0.6399966478347778, |
|
"learning_rate": 0.00019497537691320668, |
|
"loss": 0.8401, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.312, |
|
"grad_norm": 0.7263137698173523, |
|
"learning_rate": 0.00019470983049947444, |
|
"loss": 0.7494, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.402416467666626, |
|
"learning_rate": 0.00019443763702374812, |
|
"loss": 0.7842, |
|
"step": 200 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 1875, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.2337729827766272e+16, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|