{ "best_metric": 0.12199707, "best_model_checkpoint": "/data1/tzz/VQA/ckpt/llava_next_video/v2-20250226-080739/checkpoint-185", "epoch": 0.9966329966329966, "eval_steps": 500, "global_step": 185, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0053872053872053875, "grad_norm": 14.159545000064247, "learning_rate": 1.0000000000000002e-06, "loss": 31.90625, "memory(GiB)": 22.53, "step": 1, "train_speed(iter/s)": 0.022985 }, { "epoch": 0.010774410774410775, "grad_norm": 14.616283963493206, "learning_rate": 2.0000000000000003e-06, "loss": 31.5234375, "memory(GiB)": 22.53, "step": 2, "train_speed(iter/s)": 0.028711 }, { "epoch": 0.01616161616161616, "grad_norm": 13.121864716464238, "learning_rate": 3e-06, "loss": 33.6796875, "memory(GiB)": 22.53, "step": 3, "train_speed(iter/s)": 0.031289 }, { "epoch": 0.02154882154882155, "grad_norm": 11.258740067609244, "learning_rate": 4.000000000000001e-06, "loss": 31.8203125, "memory(GiB)": 22.53, "step": 4, "train_speed(iter/s)": 0.032739 }, { "epoch": 0.026936026936026935, "grad_norm": 13.170936715126654, "learning_rate": 5e-06, "loss": 29.2109375, "memory(GiB)": 22.55, "step": 5, "train_speed(iter/s)": 0.033213 }, { "epoch": 0.03232323232323232, "grad_norm": 14.330929445232412, "learning_rate": 6e-06, "loss": 28.078125, "memory(GiB)": 22.55, "step": 6, "train_speed(iter/s)": 0.033152 }, { "epoch": 0.03771043771043771, "grad_norm": 13.548506738998086, "learning_rate": 7e-06, "loss": 27.5078125, "memory(GiB)": 22.55, "step": 7, "train_speed(iter/s)": 0.033486 }, { "epoch": 0.0430976430976431, "grad_norm": 8.666929263748118, "learning_rate": 8.000000000000001e-06, "loss": 24.9609375, "memory(GiB)": 22.56, "step": 8, "train_speed(iter/s)": 0.033686 }, { "epoch": 0.048484848484848485, "grad_norm": 11.066925048714, "learning_rate": 9e-06, "loss": 19.890625, "memory(GiB)": 22.57, "step": 9, "train_speed(iter/s)": 0.0337 }, { "epoch": 0.05387205387205387, "grad_norm": 8.973276554829988, "learning_rate": 1e-05, "loss": 14.328125, "memory(GiB)": 22.58, "step": 10, "train_speed(iter/s)": 0.03368 }, { "epoch": 0.05925925925925926, "grad_norm": 4.92025256078084, "learning_rate": 9.999194339645292e-06, "loss": 11.275390625, "memory(GiB)": 22.58, "step": 11, "train_speed(iter/s)": 0.033773 }, { "epoch": 0.06464646464646465, "grad_norm": 2.5239985209180706, "learning_rate": 9.996777618216608e-06, "loss": 9.6875, "memory(GiB)": 22.58, "step": 12, "train_speed(iter/s)": 0.034222 }, { "epoch": 0.07003367003367003, "grad_norm": 3.3965201854332046, "learning_rate": 9.992750614536606e-06, "loss": 7.869140625, "memory(GiB)": 22.58, "step": 13, "train_speed(iter/s)": 0.034559 }, { "epoch": 0.07542087542087542, "grad_norm": 3.83999730322345, "learning_rate": 9.987114626364172e-06, "loss": 7.22265625, "memory(GiB)": 22.58, "step": 14, "train_speed(iter/s)": 0.034521 }, { "epoch": 0.08080808080808081, "grad_norm": 4.824626769554233, "learning_rate": 9.979871469976197e-06, "loss": 7.0576171875, "memory(GiB)": 22.58, "step": 15, "train_speed(iter/s)": 0.03441 }, { "epoch": 0.0861952861952862, "grad_norm": 3.043760096951554, "learning_rate": 9.971023479582258e-06, "loss": 5.4990234375, "memory(GiB)": 22.58, "step": 16, "train_speed(iter/s)": 0.034356 }, { "epoch": 0.09158249158249158, "grad_norm": 1.3971212531371173, "learning_rate": 9.960573506572391e-06, "loss": 4.044921875, "memory(GiB)": 22.58, "step": 17, "train_speed(iter/s)": 0.03432 }, { "epoch": 0.09696969696969697, "grad_norm": 1.5367962214559587, "learning_rate": 9.948524918598175e-06, "loss": 3.44189453125, "memory(GiB)": 22.58, "step": 18, "train_speed(iter/s)": 0.034225 }, { "epoch": 0.10235690235690235, "grad_norm": 1.2329087385122603, "learning_rate": 9.934881598487478e-06, "loss": 3.4072265625, "memory(GiB)": 22.58, "step": 19, "train_speed(iter/s)": 0.034123 }, { "epoch": 0.10774410774410774, "grad_norm": 0.8648810367159049, "learning_rate": 9.91964794299315e-06, "loss": 3.0048828125, "memory(GiB)": 22.58, "step": 20, "train_speed(iter/s)": 0.03406 }, { "epoch": 0.11313131313131314, "grad_norm": 1.1333084548737522, "learning_rate": 9.902828861376101e-06, "loss": 2.973876953125, "memory(GiB)": 22.58, "step": 21, "train_speed(iter/s)": 0.03407 }, { "epoch": 0.11851851851851852, "grad_norm": 1.67328747436259, "learning_rate": 9.884429773823238e-06, "loss": 2.460693359375, "memory(GiB)": 22.58, "step": 22, "train_speed(iter/s)": 0.033985 }, { "epoch": 0.12390572390572391, "grad_norm": 0.8370283899907709, "learning_rate": 9.864456609700726e-06, "loss": 2.162109375, "memory(GiB)": 22.58, "step": 23, "train_speed(iter/s)": 0.033817 }, { "epoch": 0.1292929292929293, "grad_norm": 0.7984037408374535, "learning_rate": 9.842915805643156e-06, "loss": 2.711669921875, "memory(GiB)": 22.58, "step": 24, "train_speed(iter/s)": 0.033701 }, { "epoch": 0.13468013468013468, "grad_norm": 0.5877571918682093, "learning_rate": 9.819814303479268e-06, "loss": 1.707275390625, "memory(GiB)": 22.58, "step": 25, "train_speed(iter/s)": 0.033872 }, { "epoch": 0.14006734006734006, "grad_norm": 1.4800629465642858, "learning_rate": 9.79515954799483e-06, "loss": 2.813720703125, "memory(GiB)": 22.58, "step": 26, "train_speed(iter/s)": 0.034031 }, { "epoch": 0.14545454545454545, "grad_norm": 2.1222533390916443, "learning_rate": 9.768959484533461e-06, "loss": 3.59912109375, "memory(GiB)": 22.58, "step": 27, "train_speed(iter/s)": 0.034169 }, { "epoch": 0.15084175084175083, "grad_norm": 0.8369490081884605, "learning_rate": 9.741222556436132e-06, "loss": 1.89404296875, "memory(GiB)": 22.58, "step": 28, "train_speed(iter/s)": 0.034295 }, { "epoch": 0.15622895622895622, "grad_norm": 0.5854633514891076, "learning_rate": 9.711957702320176e-06, "loss": 1.986328125, "memory(GiB)": 22.58, "step": 29, "train_speed(iter/s)": 0.034448 }, { "epoch": 0.16161616161616163, "grad_norm": 0.35782476089852655, "learning_rate": 9.681174353198687e-06, "loss": 2.087890625, "memory(GiB)": 22.58, "step": 30, "train_speed(iter/s)": 0.034568 }, { "epoch": 0.16700336700336701, "grad_norm": 0.7861618699933016, "learning_rate": 9.648882429441258e-06, "loss": 2.669921875, "memory(GiB)": 22.58, "step": 31, "train_speed(iter/s)": 0.034675 }, { "epoch": 0.1723905723905724, "grad_norm": 0.536791680824106, "learning_rate": 9.615092337576987e-06, "loss": 2.203125, "memory(GiB)": 22.58, "step": 32, "train_speed(iter/s)": 0.034758 }, { "epoch": 0.17777777777777778, "grad_norm": 1.3726808261834198, "learning_rate": 9.579814966940833e-06, "loss": 2.114501953125, "memory(GiB)": 22.58, "step": 33, "train_speed(iter/s)": 0.034839 }, { "epoch": 0.18316498316498317, "grad_norm": 0.8535138723050261, "learning_rate": 9.543061686164374e-06, "loss": 2.1591796875, "memory(GiB)": 22.58, "step": 34, "train_speed(iter/s)": 0.034969 }, { "epoch": 0.18855218855218855, "grad_norm": 0.6726334477065563, "learning_rate": 9.504844339512096e-06, "loss": 2.35791015625, "memory(GiB)": 22.58, "step": 35, "train_speed(iter/s)": 0.035076 }, { "epoch": 0.19393939393939394, "grad_norm": 0.7227226956981251, "learning_rate": 9.465175243064428e-06, "loss": 2.400390625, "memory(GiB)": 22.58, "step": 36, "train_speed(iter/s)": 0.035195 }, { "epoch": 0.19932659932659932, "grad_norm": 0.7075241914063357, "learning_rate": 9.424067180748692e-06, "loss": 1.476318359375, "memory(GiB)": 22.58, "step": 37, "train_speed(iter/s)": 0.035278 }, { "epoch": 0.2047138047138047, "grad_norm": 0.8285808812880359, "learning_rate": 9.381533400219319e-06, "loss": 2.50634765625, "memory(GiB)": 22.58, "step": 38, "train_speed(iter/s)": 0.035354 }, { "epoch": 0.2101010101010101, "grad_norm": 0.747109858212397, "learning_rate": 9.337587608588588e-06, "loss": 2.397216796875, "memory(GiB)": 22.58, "step": 39, "train_speed(iter/s)": 0.035434 }, { "epoch": 0.21548821548821548, "grad_norm": 0.8997236382866319, "learning_rate": 9.292243968009332e-06, "loss": 2.3466796875, "memory(GiB)": 22.58, "step": 40, "train_speed(iter/s)": 0.035447 }, { "epoch": 0.22087542087542086, "grad_norm": 0.3854506877674985, "learning_rate": 9.24551709111097e-06, "loss": 1.607421875, "memory(GiB)": 22.58, "step": 41, "train_speed(iter/s)": 0.035398 }, { "epoch": 0.22626262626262628, "grad_norm": 0.4259732475000951, "learning_rate": 9.197422036290386e-06, "loss": 1.921630859375, "memory(GiB)": 22.58, "step": 42, "train_speed(iter/s)": 0.035349 }, { "epoch": 0.23164983164983166, "grad_norm": 0.46150408574103824, "learning_rate": 9.147974302859158e-06, "loss": 1.41650390625, "memory(GiB)": 22.58, "step": 43, "train_speed(iter/s)": 0.035321 }, { "epoch": 0.23703703703703705, "grad_norm": 0.5918291232050616, "learning_rate": 9.09718982604866e-06, "loss": 1.58154296875, "memory(GiB)": 22.58, "step": 44, "train_speed(iter/s)": 0.03529 }, { "epoch": 0.24242424242424243, "grad_norm": 1.1984794966626473, "learning_rate": 9.045084971874738e-06, "loss": 2.67236328125, "memory(GiB)": 22.58, "step": 45, "train_speed(iter/s)": 0.035244 }, { "epoch": 0.24781144781144782, "grad_norm": 0.7304425352094286, "learning_rate": 8.991676531863507e-06, "loss": 1.993408203125, "memory(GiB)": 22.58, "step": 46, "train_speed(iter/s)": 0.0352 }, { "epoch": 0.2531986531986532, "grad_norm": 0.8247667804924503, "learning_rate": 8.936981717640061e-06, "loss": 2.8740234375, "memory(GiB)": 22.58, "step": 47, "train_speed(iter/s)": 0.035111 }, { "epoch": 0.2585858585858586, "grad_norm": 1.072788633508109, "learning_rate": 8.881018155381766e-06, "loss": 1.845458984375, "memory(GiB)": 22.58, "step": 48, "train_speed(iter/s)": 0.035139 }, { "epoch": 0.26397306397306397, "grad_norm": 0.6949566674892941, "learning_rate": 8.823803880137993e-06, "loss": 2.345458984375, "memory(GiB)": 22.58, "step": 49, "train_speed(iter/s)": 0.035224 }, { "epoch": 0.26936026936026936, "grad_norm": 0.3214051528089464, "learning_rate": 8.765357330018056e-06, "loss": 1.640869140625, "memory(GiB)": 22.58, "step": 50, "train_speed(iter/s)": 0.035311 }, { "epoch": 0.27474747474747474, "grad_norm": 0.8127331172569063, "learning_rate": 8.705697340249275e-06, "loss": 2.334716796875, "memory(GiB)": 22.58, "step": 51, "train_speed(iter/s)": 0.035368 }, { "epoch": 0.2801346801346801, "grad_norm": 0.6993353179443554, "learning_rate": 8.644843137107058e-06, "loss": 2.2666015625, "memory(GiB)": 22.58, "step": 52, "train_speed(iter/s)": 0.03541 }, { "epoch": 0.2855218855218855, "grad_norm": 0.7930646229400613, "learning_rate": 8.582814331718961e-06, "loss": 1.73876953125, "memory(GiB)": 22.58, "step": 53, "train_speed(iter/s)": 0.035443 }, { "epoch": 0.2909090909090909, "grad_norm": 0.47348696234661886, "learning_rate": 8.519630913744726e-06, "loss": 1.8544921875, "memory(GiB)": 22.58, "step": 54, "train_speed(iter/s)": 0.035485 }, { "epoch": 0.2962962962962963, "grad_norm": 0.5105789152298116, "learning_rate": 8.455313244934324e-06, "loss": 2.10107421875, "memory(GiB)": 22.58, "step": 55, "train_speed(iter/s)": 0.03552 }, { "epoch": 0.30168350168350166, "grad_norm": 0.48874730617457113, "learning_rate": 8.389882052566106e-06, "loss": 2.19189453125, "memory(GiB)": 22.58, "step": 56, "train_speed(iter/s)": 0.035547 }, { "epoch": 0.30707070707070705, "grad_norm": 0.7017590448005361, "learning_rate": 8.32335842276713e-06, "loss": 1.605224609375, "memory(GiB)": 22.58, "step": 57, "train_speed(iter/s)": 0.035484 }, { "epoch": 0.31245791245791243, "grad_norm": 0.7736924894631574, "learning_rate": 8.255763793717868e-06, "loss": 2.123779296875, "memory(GiB)": 22.58, "step": 58, "train_speed(iter/s)": 0.035432 }, { "epoch": 0.3178451178451178, "grad_norm": 0.6091631207035194, "learning_rate": 8.18711994874345e-06, "loss": 1.8798828125, "memory(GiB)": 22.58, "step": 59, "train_speed(iter/s)": 0.035351 }, { "epoch": 0.32323232323232326, "grad_norm": 0.6745360872937951, "learning_rate": 8.117449009293668e-06, "loss": 2.36767578125, "memory(GiB)": 22.58, "step": 60, "train_speed(iter/s)": 0.035291 }, { "epoch": 0.32861952861952864, "grad_norm": 1.1170607516843722, "learning_rate": 8.046773427814043e-06, "loss": 2.153076171875, "memory(GiB)": 22.58, "step": 61, "train_speed(iter/s)": 0.035255 }, { "epoch": 0.33400673400673403, "grad_norm": 0.42517306211931166, "learning_rate": 7.975115980510187e-06, "loss": 1.717041015625, "memory(GiB)": 22.58, "step": 62, "train_speed(iter/s)": 0.035224 }, { "epoch": 0.3393939393939394, "grad_norm": 0.8043024113222557, "learning_rate": 7.902499760007867e-06, "loss": 1.85888671875, "memory(GiB)": 22.58, "step": 63, "train_speed(iter/s)": 0.035142 }, { "epoch": 0.3447811447811448, "grad_norm": 0.9761638945939747, "learning_rate": 7.828948167911073e-06, "loss": 1.906005859375, "memory(GiB)": 22.58, "step": 64, "train_speed(iter/s)": 0.035063 }, { "epoch": 0.3501683501683502, "grad_norm": 0.4137734068293326, "learning_rate": 7.754484907260513e-06, "loss": 2.05712890625, "memory(GiB)": 22.58, "step": 65, "train_speed(iter/s)": 0.034992 }, { "epoch": 0.35555555555555557, "grad_norm": 0.6313489954771672, "learning_rate": 7.679133974894984e-06, "loss": 1.56591796875, "memory(GiB)": 22.58, "step": 66, "train_speed(iter/s)": 0.035062 }, { "epoch": 0.36094276094276095, "grad_norm": 0.7916770866661113, "learning_rate": 7.602919653718044e-06, "loss": 1.32373046875, "memory(GiB)": 22.58, "step": 67, "train_speed(iter/s)": 0.035123 }, { "epoch": 0.36632996632996634, "grad_norm": 0.7005145101509135, "learning_rate": 7.5258665048725065e-06, "loss": 1.677490234375, "memory(GiB)": 22.58, "step": 68, "train_speed(iter/s)": 0.035192 }, { "epoch": 0.3717171717171717, "grad_norm": 0.5600472715983401, "learning_rate": 7.447999359825263e-06, "loss": 1.8934326171875, "memory(GiB)": 22.58, "step": 69, "train_speed(iter/s)": 0.035242 }, { "epoch": 0.3771043771043771, "grad_norm": 0.7799156688047453, "learning_rate": 7.369343312364994e-06, "loss": 1.737060546875, "memory(GiB)": 22.58, "step": 70, "train_speed(iter/s)": 0.035303 }, { "epoch": 0.3824915824915825, "grad_norm": 1.0088361337375438, "learning_rate": 7.289923710515338e-06, "loss": 2.55859375, "memory(GiB)": 22.58, "step": 71, "train_speed(iter/s)": 0.035339 }, { "epoch": 0.3878787878787879, "grad_norm": 0.7778606766770365, "learning_rate": 7.2097661483661355e-06, "loss": 2.3927001953125, "memory(GiB)": 22.58, "step": 72, "train_speed(iter/s)": 0.035398 }, { "epoch": 0.39326599326599326, "grad_norm": 0.7503526567701239, "learning_rate": 7.128896457825364e-06, "loss": 2.4095458984375, "memory(GiB)": 22.58, "step": 73, "train_speed(iter/s)": 0.035435 }, { "epoch": 0.39865319865319865, "grad_norm": 0.9293852718192778, "learning_rate": 7.047340700294454e-06, "loss": 2.0943603515625, "memory(GiB)": 22.58, "step": 74, "train_speed(iter/s)": 0.035473 }, { "epoch": 0.40404040404040403, "grad_norm": 1.2981158494810365, "learning_rate": 6.965125158269619e-06, "loss": 2.36279296875, "memory(GiB)": 22.58, "step": 75, "train_speed(iter/s)": 0.035498 }, { "epoch": 0.4094276094276094, "grad_norm": 0.5915357318010657, "learning_rate": 6.88227632687196e-06, "loss": 1.13037109375, "memory(GiB)": 22.58, "step": 76, "train_speed(iter/s)": 0.035521 }, { "epoch": 0.4148148148148148, "grad_norm": 0.8289109263502568, "learning_rate": 6.798820905309036e-06, "loss": 2.245849609375, "memory(GiB)": 22.58, "step": 77, "train_speed(iter/s)": 0.035549 }, { "epoch": 0.4202020202020202, "grad_norm": 0.7332772758108902, "learning_rate": 6.714785788270658e-06, "loss": 1.794189453125, "memory(GiB)": 22.58, "step": 78, "train_speed(iter/s)": 0.035574 }, { "epoch": 0.4255892255892256, "grad_norm": 0.8695389561000924, "learning_rate": 6.63019805726171e-06, "loss": 2.107177734375, "memory(GiB)": 22.58, "step": 79, "train_speed(iter/s)": 0.035564 }, { "epoch": 0.43097643097643096, "grad_norm": 1.0578963540355828, "learning_rate": 6.545084971874738e-06, "loss": 2.2099609375, "memory(GiB)": 22.58, "step": 80, "train_speed(iter/s)": 0.035518 }, { "epoch": 0.43636363636363634, "grad_norm": 0.5355473518839581, "learning_rate": 6.459473961005168e-06, "loss": 1.679931640625, "memory(GiB)": 22.58, "step": 81, "train_speed(iter/s)": 0.035449 }, { "epoch": 0.4417508417508417, "grad_norm": 0.47562295475695077, "learning_rate": 6.373392614011952e-06, "loss": 1.548828125, "memory(GiB)": 22.58, "step": 82, "train_speed(iter/s)": 0.03541 }, { "epoch": 0.4471380471380471, "grad_norm": 1.1873250939202482, "learning_rate": 6.286868671826513e-06, "loss": 2.3310546875, "memory(GiB)": 22.58, "step": 83, "train_speed(iter/s)": 0.035383 }, { "epoch": 0.45252525252525255, "grad_norm": 0.6325848523967413, "learning_rate": 6.19993001801283e-06, "loss": 1.63232421875, "memory(GiB)": 22.58, "step": 84, "train_speed(iter/s)": 0.035357 }, { "epoch": 0.45791245791245794, "grad_norm": 0.6180246232374331, "learning_rate": 6.112604669781572e-06, "loss": 2.5283203125, "memory(GiB)": 22.58, "step": 85, "train_speed(iter/s)": 0.035328 }, { "epoch": 0.4632996632996633, "grad_norm": 0.9254342636136799, "learning_rate": 6.024920768961153e-06, "loss": 2.09814453125, "memory(GiB)": 22.58, "step": 86, "train_speed(iter/s)": 0.03531 }, { "epoch": 0.4686868686868687, "grad_norm": 1.0220943585915119, "learning_rate": 5.936906572928625e-06, "loss": 1.8603515625, "memory(GiB)": 22.58, "step": 87, "train_speed(iter/s)": 0.035243 }, { "epoch": 0.4740740740740741, "grad_norm": 0.547874150160307, "learning_rate": 5.848590445503345e-06, "loss": 2.2890625, "memory(GiB)": 22.58, "step": 88, "train_speed(iter/s)": 0.03516 }, { "epoch": 0.4794612794612795, "grad_norm": 0.7203446700675221, "learning_rate": 5.760000847806337e-06, "loss": 1.68115234375, "memory(GiB)": 22.58, "step": 89, "train_speed(iter/s)": 0.035117 }, { "epoch": 0.48484848484848486, "grad_norm": 0.7628245708662847, "learning_rate": 5.671166329088278e-06, "loss": 2.126953125, "memory(GiB)": 22.58, "step": 90, "train_speed(iter/s)": 0.035147 }, { "epoch": 0.49023569023569025, "grad_norm": 0.8089999734614459, "learning_rate": 5.582115517529114e-06, "loss": 1.948486328125, "memory(GiB)": 22.58, "step": 91, "train_speed(iter/s)": 0.035179 }, { "epoch": 0.49562289562289563, "grad_norm": 0.5039876551970663, "learning_rate": 5.4928771110122185e-06, "loss": 1.849853515625, "memory(GiB)": 22.58, "step": 92, "train_speed(iter/s)": 0.035212 }, { "epoch": 0.501010101010101, "grad_norm": 0.9008917409254343, "learning_rate": 5.403479867876087e-06, "loss": 2.642578125, "memory(GiB)": 22.58, "step": 93, "train_speed(iter/s)": 0.035235 }, { "epoch": 0.5063973063973064, "grad_norm": 1.1384096826151604, "learning_rate": 5.3139525976465675e-06, "loss": 2.49365234375, "memory(GiB)": 22.58, "step": 94, "train_speed(iter/s)": 0.035265 }, { "epoch": 0.5117845117845118, "grad_norm": 0.7491826485818727, "learning_rate": 5.224324151752575e-06, "loss": 1.88037109375, "memory(GiB)": 22.58, "step": 95, "train_speed(iter/s)": 0.035291 }, { "epoch": 0.5171717171717172, "grad_norm": 0.6169314437426718, "learning_rate": 5.134623414228315e-06, "loss": 1.485595703125, "memory(GiB)": 22.58, "step": 96, "train_speed(iter/s)": 0.035322 }, { "epoch": 0.5225589225589226, "grad_norm": 0.7458411085328407, "learning_rate": 5.04487929240499e-06, "loss": 2.030517578125, "memory(GiB)": 22.58, "step": 97, "train_speed(iter/s)": 0.035358 }, { "epoch": 0.5279461279461279, "grad_norm": 0.36969067992245414, "learning_rate": 4.955120707595011e-06, "loss": 1.82421875, "memory(GiB)": 22.58, "step": 98, "train_speed(iter/s)": 0.03539 }, { "epoch": 0.5333333333333333, "grad_norm": 0.7184526746731991, "learning_rate": 4.865376585771687e-06, "loss": 2.1650390625, "memory(GiB)": 22.58, "step": 99, "train_speed(iter/s)": 0.035417 }, { "epoch": 0.5387205387205387, "grad_norm": 0.5860047275017632, "learning_rate": 4.775675848247427e-06, "loss": 2.016845703125, "memory(GiB)": 22.58, "step": 100, "train_speed(iter/s)": 0.035451 }, { "epoch": 0.5441077441077441, "grad_norm": 0.6740666234718802, "learning_rate": 4.686047402353433e-06, "loss": 1.481689453125, "memory(GiB)": 22.58, "step": 101, "train_speed(iter/s)": 0.035484 }, { "epoch": 0.5494949494949495, "grad_norm": 0.5962985498733315, "learning_rate": 4.596520132123915e-06, "loss": 2.225341796875, "memory(GiB)": 22.58, "step": 102, "train_speed(iter/s)": 0.035522 }, { "epoch": 0.5548821548821549, "grad_norm": 0.6185754487719404, "learning_rate": 4.507122888987782e-06, "loss": 2.630615234375, "memory(GiB)": 22.58, "step": 103, "train_speed(iter/s)": 0.035566 }, { "epoch": 0.5602693602693603, "grad_norm": 0.8891703200104817, "learning_rate": 4.417884482470887e-06, "loss": 1.98291015625, "memory(GiB)": 22.58, "step": 104, "train_speed(iter/s)": 0.03558 }, { "epoch": 0.5656565656565656, "grad_norm": 0.5620520767612842, "learning_rate": 4.3288336709117246e-06, "loss": 1.933349609375, "memory(GiB)": 22.58, "step": 105, "train_speed(iter/s)": 0.035549 }, { "epoch": 0.571043771043771, "grad_norm": 1.3690550098042635, "learning_rate": 4.239999152193664e-06, "loss": 2.217529296875, "memory(GiB)": 22.58, "step": 106, "train_speed(iter/s)": 0.035527 }, { "epoch": 0.5764309764309764, "grad_norm": 0.4160377433886458, "learning_rate": 4.1514095544966556e-06, "loss": 1.737060546875, "memory(GiB)": 22.58, "step": 107, "train_speed(iter/s)": 0.03551 }, { "epoch": 0.5818181818181818, "grad_norm": 0.8209806760015574, "learning_rate": 4.063093427071376e-06, "loss": 2.782470703125, "memory(GiB)": 22.58, "step": 108, "train_speed(iter/s)": 0.035486 }, { "epoch": 0.5872053872053872, "grad_norm": 0.726795857048424, "learning_rate": 3.975079231038848e-06, "loss": 2.009521484375, "memory(GiB)": 22.58, "step": 109, "train_speed(iter/s)": 0.035449 }, { "epoch": 0.5925925925925926, "grad_norm": 1.2624010183388914, "learning_rate": 3.887395330218429e-06, "loss": 2.59814453125, "memory(GiB)": 22.58, "step": 110, "train_speed(iter/s)": 0.035431 }, { "epoch": 0.597979797979798, "grad_norm": 0.7513165711048129, "learning_rate": 3.8000699819871704e-06, "loss": 1.6396484375, "memory(GiB)": 22.58, "step": 111, "train_speed(iter/s)": 0.035402 }, { "epoch": 0.6033670033670033, "grad_norm": 0.4587115862936887, "learning_rate": 3.7131313281734895e-06, "loss": 2.044189453125, "memory(GiB)": 22.58, "step": 112, "train_speed(iter/s)": 0.035373 }, { "epoch": 0.6087542087542087, "grad_norm": 0.41256540620865373, "learning_rate": 3.62660738598805e-06, "loss": 1.9287109375, "memory(GiB)": 22.58, "step": 113, "train_speed(iter/s)": 0.03534 }, { "epoch": 0.6141414141414141, "grad_norm": 0.4286929355926436, "learning_rate": 3.540526038994834e-06, "loss": 1.646728515625, "memory(GiB)": 22.58, "step": 114, "train_speed(iter/s)": 0.035359 }, { "epoch": 0.6195286195286195, "grad_norm": 0.8246295061207459, "learning_rate": 3.4549150281252635e-06, "loss": 1.7587890625, "memory(GiB)": 22.58, "step": 115, "train_speed(iter/s)": 0.035387 }, { "epoch": 0.6249158249158249, "grad_norm": 0.653674454928138, "learning_rate": 3.3698019427382912e-06, "loss": 1.9765625, "memory(GiB)": 22.58, "step": 116, "train_speed(iter/s)": 0.035417 }, { "epoch": 0.6303030303030303, "grad_norm": 0.6402748838297282, "learning_rate": 3.2852142117293435e-06, "loss": 1.94970703125, "memory(GiB)": 22.58, "step": 117, "train_speed(iter/s)": 0.035431 }, { "epoch": 0.6356902356902356, "grad_norm": 0.5582058394376362, "learning_rate": 3.2011790946909673e-06, "loss": 1.9755859375, "memory(GiB)": 22.58, "step": 118, "train_speed(iter/s)": 0.03546 }, { "epoch": 0.641077441077441, "grad_norm": 0.8447371297083311, "learning_rate": 3.11772367312804e-06, "loss": 1.784423828125, "memory(GiB)": 22.58, "step": 119, "train_speed(iter/s)": 0.035493 }, { "epoch": 0.6464646464646465, "grad_norm": 0.7640836687261319, "learning_rate": 3.0348748417303826e-06, "loss": 1.76171875, "memory(GiB)": 22.58, "step": 120, "train_speed(iter/s)": 0.035513 }, { "epoch": 0.6518518518518519, "grad_norm": 0.6689239585125656, "learning_rate": 2.9526592997055488e-06, "loss": 2.076904296875, "memory(GiB)": 22.58, "step": 121, "train_speed(iter/s)": 0.03554 }, { "epoch": 0.6572390572390573, "grad_norm": 0.8205443169011045, "learning_rate": 2.871103542174637e-06, "loss": 2.4423828125, "memory(GiB)": 22.58, "step": 122, "train_speed(iter/s)": 0.035564 }, { "epoch": 0.6626262626262627, "grad_norm": 0.3861380215034983, "learning_rate": 2.790233851633868e-06, "loss": 1.405517578125, "memory(GiB)": 22.58, "step": 123, "train_speed(iter/s)": 0.035589 }, { "epoch": 0.6680134680134681, "grad_norm": 0.9319720706049784, "learning_rate": 2.7100762894846633e-06, "loss": 1.884033203125, "memory(GiB)": 22.58, "step": 124, "train_speed(iter/s)": 0.035611 }, { "epoch": 0.6734006734006734, "grad_norm": 0.4894495365923113, "learning_rate": 2.6306566876350072e-06, "loss": 1.992431640625, "memory(GiB)": 22.58, "step": 125, "train_speed(iter/s)": 0.035618 }, { "epoch": 0.6787878787878788, "grad_norm": 0.5156966779296556, "learning_rate": 2.55200064017474e-06, "loss": 1.7987060546875, "memory(GiB)": 22.58, "step": 126, "train_speed(iter/s)": 0.035593 }, { "epoch": 0.6841750841750842, "grad_norm": 0.39627149470201456, "learning_rate": 2.4741334951274948e-06, "loss": 1.779541015625, "memory(GiB)": 22.58, "step": 127, "train_speed(iter/s)": 0.035563 }, { "epoch": 0.6895622895622896, "grad_norm": 0.7990132228587018, "learning_rate": 2.3970803462819586e-06, "loss": 2.385498046875, "memory(GiB)": 22.58, "step": 128, "train_speed(iter/s)": 0.035533 }, { "epoch": 0.694949494949495, "grad_norm": 0.542336867995926, "learning_rate": 2.320866025105016e-06, "loss": 1.775390625, "memory(GiB)": 22.58, "step": 129, "train_speed(iter/s)": 0.035487 }, { "epoch": 0.7003367003367004, "grad_norm": 0.40553603638944413, "learning_rate": 2.245515092739488e-06, "loss": 1.65771484375, "memory(GiB)": 22.58, "step": 130, "train_speed(iter/s)": 0.035457 }, { "epoch": 0.7057239057239058, "grad_norm": 0.5705311307759141, "learning_rate": 2.171051832088928e-06, "loss": 1.392578125, "memory(GiB)": 22.58, "step": 131, "train_speed(iter/s)": 0.035439 }, { "epoch": 0.7111111111111111, "grad_norm": 0.5637194621292295, "learning_rate": 2.097500239992132e-06, "loss": 1.808349609375, "memory(GiB)": 22.58, "step": 132, "train_speed(iter/s)": 0.035412 }, { "epoch": 0.7164983164983165, "grad_norm": 1.0166298249729564, "learning_rate": 2.0248840194898155e-06, "loss": 1.88232421875, "memory(GiB)": 22.58, "step": 133, "train_speed(iter/s)": 0.035367 }, { "epoch": 0.7218855218855219, "grad_norm": 0.365517442677317, "learning_rate": 1.95322657218596e-06, "loss": 1.8359375, "memory(GiB)": 22.58, "step": 134, "train_speed(iter/s)": 0.035304 }, { "epoch": 0.7272727272727273, "grad_norm": 0.5937921378630181, "learning_rate": 1.8825509907063328e-06, "loss": 2.16162109375, "memory(GiB)": 22.58, "step": 135, "train_speed(iter/s)": 0.035244 }, { "epoch": 0.7326599326599327, "grad_norm": 0.5630691840328598, "learning_rate": 1.8128800512565514e-06, "loss": 1.953369140625, "memory(GiB)": 22.58, "step": 136, "train_speed(iter/s)": 0.035272 }, { "epoch": 0.7380471380471381, "grad_norm": 0.9036946278139879, "learning_rate": 1.7442362062821323e-06, "loss": 3.1923828125, "memory(GiB)": 22.58, "step": 137, "train_speed(iter/s)": 0.035287 }, { "epoch": 0.7434343434343434, "grad_norm": 0.5335511498935785, "learning_rate": 1.6766415772328732e-06, "loss": 1.705322265625, "memory(GiB)": 22.58, "step": 138, "train_speed(iter/s)": 0.035295 }, { "epoch": 0.7488215488215488, "grad_norm": 0.8149099249815346, "learning_rate": 1.610117947433897e-06, "loss": 2.81689453125, "memory(GiB)": 22.58, "step": 139, "train_speed(iter/s)": 0.035308 }, { "epoch": 0.7542087542087542, "grad_norm": 0.5287002241309334, "learning_rate": 1.544686755065677e-06, "loss": 1.266357421875, "memory(GiB)": 22.58, "step": 140, "train_speed(iter/s)": 0.035318 }, { "epoch": 0.7595959595959596, "grad_norm": 0.6139302197140588, "learning_rate": 1.4803690862552755e-06, "loss": 1.817626953125, "memory(GiB)": 22.58, "step": 141, "train_speed(iter/s)": 0.035343 }, { "epoch": 0.764983164983165, "grad_norm": 0.6333656991964685, "learning_rate": 1.4171856682810386e-06, "loss": 2.101806640625, "memory(GiB)": 22.58, "step": 142, "train_speed(iter/s)": 0.035364 }, { "epoch": 0.7703703703703704, "grad_norm": 0.8829740683592863, "learning_rate": 1.3551568628929434e-06, "loss": 2.508056640625, "memory(GiB)": 22.58, "step": 143, "train_speed(iter/s)": 0.03539 }, { "epoch": 0.7757575757575758, "grad_norm": 0.5801508492146695, "learning_rate": 1.2943026597507268e-06, "loss": 1.6142578125, "memory(GiB)": 22.58, "step": 144, "train_speed(iter/s)": 0.035413 }, { "epoch": 0.7811447811447811, "grad_norm": 0.48056036748223746, "learning_rate": 1.234642669981946e-06, "loss": 1.942138671875, "memory(GiB)": 22.58, "step": 145, "train_speed(iter/s)": 0.035431 }, { "epoch": 0.7865319865319865, "grad_norm": 0.5473637984491948, "learning_rate": 1.1761961198620081e-06, "loss": 1.748779296875, "memory(GiB)": 22.58, "step": 146, "train_speed(iter/s)": 0.035455 }, { "epoch": 0.7919191919191919, "grad_norm": 0.7226102542834439, "learning_rate": 1.118981844618236e-06, "loss": 1.657470703125, "memory(GiB)": 22.58, "step": 147, "train_speed(iter/s)": 0.035472 }, { "epoch": 0.7973063973063973, "grad_norm": 0.677002948688539, "learning_rate": 1.06301828235994e-06, "loss": 1.730224609375, "memory(GiB)": 22.58, "step": 148, "train_speed(iter/s)": 0.035492 }, { "epoch": 0.8026936026936027, "grad_norm": 0.4690206204454014, "learning_rate": 1.0083234681364934e-06, "loss": 1.97509765625, "memory(GiB)": 22.58, "step": 149, "train_speed(iter/s)": 0.035513 }, { "epoch": 0.8080808080808081, "grad_norm": 0.38431237166068455, "learning_rate": 9.549150281252633e-07, "loss": 1.977783203125, "memory(GiB)": 22.58, "step": 150, "train_speed(iter/s)": 0.035533 }, { "epoch": 0.8134680134680135, "grad_norm": 1.4318443328161967, "learning_rate": 9.028101739513406e-07, "loss": 2.696533203125, "memory(GiB)": 22.58, "step": 151, "train_speed(iter/s)": 0.035549 }, { "epoch": 0.8188552188552188, "grad_norm": 0.39825000243591335, "learning_rate": 8.520256971408453e-07, "loss": 1.52294921875, "memory(GiB)": 22.58, "step": 152, "train_speed(iter/s)": 0.035566 }, { "epoch": 0.8242424242424242, "grad_norm": 0.403223921534723, "learning_rate": 8.025779637096138e-07, "loss": 2.0869140625, "memory(GiB)": 22.58, "step": 153, "train_speed(iter/s)": 0.035581 }, { "epoch": 0.8296296296296296, "grad_norm": 0.39408518518211616, "learning_rate": 7.544829088890326e-07, "loss": 2.085693359375, "memory(GiB)": 22.58, "step": 154, "train_speed(iter/s)": 0.035601 }, { "epoch": 0.835016835016835, "grad_norm": 0.6580639598152973, "learning_rate": 7.077560319906696e-07, "loss": 1.58740234375, "memory(GiB)": 22.58, "step": 155, "train_speed(iter/s)": 0.035601 }, { "epoch": 0.8404040404040404, "grad_norm": 0.5455643216936216, "learning_rate": 6.624123914114122e-07, "loss": 1.76953125, "memory(GiB)": 22.58, "step": 156, "train_speed(iter/s)": 0.035584 }, { "epoch": 0.8457912457912458, "grad_norm": 0.9580661740362665, "learning_rate": 6.184665997806832e-07, "loss": 2.3505859375, "memory(GiB)": 22.58, "step": 157, "train_speed(iter/s)": 0.035562 }, { "epoch": 0.8511784511784511, "grad_norm": 0.49273093322057226, "learning_rate": 5.759328192513075e-07, "loss": 1.632080078125, "memory(GiB)": 22.58, "step": 158, "train_speed(iter/s)": 0.035543 }, { "epoch": 0.8565656565656565, "grad_norm": 0.5074137587596991, "learning_rate": 5.348247569355736e-07, "loss": 1.71240234375, "memory(GiB)": 22.58, "step": 159, "train_speed(iter/s)": 0.03552 }, { "epoch": 0.8619528619528619, "grad_norm": 0.7185716029221749, "learning_rate": 4.951556604879049e-07, "loss": 2.36669921875, "memory(GiB)": 22.58, "step": 160, "train_speed(iter/s)": 0.035503 }, { "epoch": 0.8673400673400673, "grad_norm": 0.7811542866299452, "learning_rate": 4.569383138356276e-07, "loss": 1.678955078125, "memory(GiB)": 22.58, "step": 161, "train_speed(iter/s)": 0.035485 }, { "epoch": 0.8727272727272727, "grad_norm": 0.5128778192942757, "learning_rate": 4.201850330591678e-07, "loss": 2.072998046875, "memory(GiB)": 22.58, "step": 162, "train_speed(iter/s)": 0.035459 }, { "epoch": 0.8781144781144781, "grad_norm": 0.6851552480944826, "learning_rate": 3.8490766242301356e-07, "loss": 1.55322265625, "memory(GiB)": 22.58, "step": 163, "train_speed(iter/s)": 0.035422 }, { "epoch": 0.8835016835016835, "grad_norm": 1.0656634793505568, "learning_rate": 3.511175705587433e-07, "loss": 2.09228515625, "memory(GiB)": 22.58, "step": 164, "train_speed(iter/s)": 0.035412 }, { "epoch": 0.8888888888888888, "grad_norm": 0.4704867924767551, "learning_rate": 3.18825646801314e-07, "loss": 2.178955078125, "memory(GiB)": 22.58, "step": 165, "train_speed(iter/s)": 0.03542 }, { "epoch": 0.8942760942760942, "grad_norm": 0.3438531193817133, "learning_rate": 2.8804229767982637e-07, "loss": 1.828125, "memory(GiB)": 22.58, "step": 166, "train_speed(iter/s)": 0.035441 }, { "epoch": 0.8996632996632996, "grad_norm": 0.9072486327466182, "learning_rate": 2.587774435638679e-07, "loss": 1.902099609375, "memory(GiB)": 22.58, "step": 167, "train_speed(iter/s)": 0.035458 }, { "epoch": 0.9050505050505051, "grad_norm": 0.40209833194248146, "learning_rate": 2.3104051546654016e-07, "loss": 1.72314453125, "memory(GiB)": 22.58, "step": 168, "train_speed(iter/s)": 0.035472 }, { "epoch": 0.9104377104377105, "grad_norm": 0.6534758670706157, "learning_rate": 2.0484045200517222e-07, "loss": 1.73095703125, "memory(GiB)": 22.58, "step": 169, "train_speed(iter/s)": 0.03548 }, { "epoch": 0.9158249158249159, "grad_norm": 0.36229213242531244, "learning_rate": 1.801856965207338e-07, "loss": 1.954345703125, "memory(GiB)": 22.58, "step": 170, "train_speed(iter/s)": 0.035499 }, { "epoch": 0.9212121212121213, "grad_norm": 0.41462023840060064, "learning_rate": 1.5708419435684463e-07, "loss": 1.726318359375, "memory(GiB)": 22.58, "step": 171, "train_speed(iter/s)": 0.035512 }, { "epoch": 0.9265993265993266, "grad_norm": 0.793282464162017, "learning_rate": 1.3554339029927532e-07, "loss": 2.07861328125, "memory(GiB)": 22.58, "step": 172, "train_speed(iter/s)": 0.035526 }, { "epoch": 0.931986531986532, "grad_norm": 0.4924403822397691, "learning_rate": 1.1557022617676217e-07, "loss": 1.400634765625, "memory(GiB)": 22.58, "step": 173, "train_speed(iter/s)": 0.035541 }, { "epoch": 0.9373737373737374, "grad_norm": 0.41980069690346106, "learning_rate": 9.717113862389993e-08, "loss": 2.12158203125, "memory(GiB)": 22.58, "step": 174, "train_speed(iter/s)": 0.03556 }, { "epoch": 0.9427609427609428, "grad_norm": 0.8809220146060189, "learning_rate": 8.035205700685167e-08, "loss": 2.621826171875, "memory(GiB)": 22.58, "step": 175, "train_speed(iter/s)": 0.035577 }, { "epoch": 0.9481481481481482, "grad_norm": 0.6908254679787823, "learning_rate": 6.511840151252169e-08, "loss": 1.813232421875, "memory(GiB)": 22.58, "step": 176, "train_speed(iter/s)": 0.035597 }, { "epoch": 0.9535353535353536, "grad_norm": 0.49484208186969647, "learning_rate": 5.1475081401825553e-08, "loss": 1.9814453125, "memory(GiB)": 22.58, "step": 177, "train_speed(iter/s)": 0.035578 }, { "epoch": 0.958922558922559, "grad_norm": 0.6989450753180266, "learning_rate": 3.9426493427611177e-08, "loss": 1.78466796875, "memory(GiB)": 22.58, "step": 178, "train_speed(iter/s)": 0.035563 }, { "epoch": 0.9643097643097643, "grad_norm": 0.5543481036485521, "learning_rate": 2.8976520417742794e-08, "loss": 1.727783203125, "memory(GiB)": 22.58, "step": 179, "train_speed(iter/s)": 0.035552 }, { "epoch": 0.9696969696969697, "grad_norm": 0.5545843045026326, "learning_rate": 2.012853002380466e-08, "loss": 1.75634765625, "memory(GiB)": 22.58, "step": 180, "train_speed(iter/s)": 0.035543 }, { "epoch": 0.9750841750841751, "grad_norm": 0.3433152184276571, "learning_rate": 1.2885373635829756e-08, "loss": 1.64208984375, "memory(GiB)": 22.58, "step": 181, "train_speed(iter/s)": 0.035533 }, { "epoch": 0.9804713804713805, "grad_norm": 0.9002229182397717, "learning_rate": 7.249385463395375e-09, "loss": 2.177490234375, "memory(GiB)": 22.58, "step": 182, "train_speed(iter/s)": 0.035517 }, { "epoch": 0.9858585858585859, "grad_norm": 0.5840020558119475, "learning_rate": 3.2223817833931803e-09, "loss": 1.4775390625, "memory(GiB)": 22.58, "step": 183, "train_speed(iter/s)": 0.035499 }, { "epoch": 0.9912457912457913, "grad_norm": 0.31651969118225726, "learning_rate": 8.056603547090813e-10, "loss": 1.804931640625, "memory(GiB)": 22.58, "step": 184, "train_speed(iter/s)": 0.035513 }, { "epoch": 0.9966329966329966, "grad_norm": 0.5699524292753597, "learning_rate": 0.0, "loss": 1.653076171875, "memory(GiB)": 22.58, "step": 185, "train_speed(iter/s)": 0.035529 }, { "epoch": 0.9966329966329966, "eval_loss": 0.12199707329273224, "eval_runtime": 16.4404, "eval_samples_per_second": 1.825, "eval_steps_per_second": 1.825, "step": 185 }, { "epoch": 0.9966329966329966, "eval_loss": 0.12199707329273224, "eval_runtime": 18.3596, "eval_samples_per_second": 1.634, "eval_steps_per_second": 1.634, "step": 185 } ], "logging_steps": 1, "max_steps": 185, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 664501364736.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }