bigscience-bot
commited on
Commit
·
a76e390
1
Parent(s):
1c7d906
new data
Browse files- logs/main_log.txt +69 -0
logs/main_log.txt
CHANGED
@@ -87027,3 +87027,72 @@ time (ms)
|
|
87027 |
time (ms)
|
87028 |
iteration 1325/ 292968 | consumed samples: 2713600 | consumed tokens: 262094848 | elapsed time per iteration (ms): 113595.4 | learning rate: 7.236E-05 | global batch size: 2048 | lm loss: 4.305782E+00 | loss scale: 16384.0 | grad norm: 9792.060 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
87029 |
time (ms)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
87027 |
time (ms)
|
87028 |
iteration 1325/ 292968 | consumed samples: 2713600 | consumed tokens: 262094848 | elapsed time per iteration (ms): 113595.4 | learning rate: 7.236E-05 | global batch size: 2048 | lm loss: 4.305782E+00 | loss scale: 16384.0 | grad norm: 9792.060 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
87029 |
time (ms)
|
87030 |
+
iteration 1326/ 292968 | consumed samples: 2715648 | consumed tokens: 262373376 | elapsed time per iteration (ms): 106966.1 | learning rate: 7.242E-05 | global batch size: 2048 | lm loss: 4.298875E+00 | loss scale: 16384.0 | grad norm: 9256.978 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
87031 |
+
time (ms)
|
87032 |
+
iteration 1327/ 292968 | consumed samples: 2717696 | consumed tokens: 262651904 | elapsed time per iteration (ms): 112772.2 | learning rate: 7.247E-05 | global batch size: 2048 | lm loss: 4.275658E+00 | loss scale: 16384.0 | grad norm: 12353.776 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
87033 |
+
time (ms)
|
87034 |
+
iteration 1328/ 292968 | consumed samples: 2719744 | consumed tokens: 262930432 | elapsed time per iteration (ms): 116094.4 | learning rate: 7.253E-05 | global batch size: 2048 | lm loss: 4.294221E+00 | loss scale: 16384.0 | grad norm: 15819.284 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
87035 |
+
time (ms)
|
87036 |
+
iteration 1329/ 292968 | consumed samples: 2721792 | consumed tokens: 263208960 | elapsed time per iteration (ms): 108861.8 | learning rate: 7.258E-05 | global batch size: 2048 | lm loss: 4.278796E+00 | loss scale: 16384.0 | grad norm: 14416.408 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
87037 |
+
time (ms)
|
87038 |
+
iteration 1330/ 292968 | consumed samples: 2723840 | consumed tokens: 263487488 | elapsed time per iteration (ms): 111717.3 | learning rate: 7.264E-05 | global batch size: 2048 | lm loss: 4.279788E+00 | loss scale: 16384.0 | grad norm: 10858.691 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
87039 |
+
time (ms)
|
87040 |
+
iteration 1331/ 292968 | consumed samples: 2725888 | consumed tokens: 263766016 | elapsed time per iteration (ms): 106840.2 | learning rate: 7.269E-05 | global batch size: 2048 | lm loss: 4.321123E+00 | loss scale: 16384.0 | grad norm: 16413.887 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
87041 |
+
time (ms)
|
87042 |
+
iteration 1332/ 292968 | consumed samples: 2727936 | consumed tokens: 264044544 | elapsed time per iteration (ms): 105046.3 | learning rate: 7.274E-05 | global batch size: 2048 | lm loss: 4.286259E+00 | loss scale: 16384.0 | grad norm: 13602.333 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
87043 |
+
time (ms)
|
87044 |
+
iteration 1333/ 292968 | consumed samples: 2729984 | consumed tokens: 264323072 | elapsed time per iteration (ms): 103539.0 | learning rate: 7.280E-05 | global batch size: 2048 | lm loss: 4.311579E+00 | loss scale: 16384.0 | grad norm: 12268.700 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
87045 |
+
time (ms)
|
87046 |
+
iteration 1334/ 292968 | consumed samples: 2732032 | consumed tokens: 264601600 | elapsed time per iteration (ms): 104597.9 | learning rate: 7.285E-05 | global batch size: 2048 | lm loss: 4.297973E+00 | loss scale: 16384.0 | grad norm: 11817.463 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
87047 |
+
time (ms)
|
87048 |
+
iteration 1335/ 292968 | consumed samples: 2734080 | consumed tokens: 264880128 | elapsed time per iteration (ms): 106853.2 | learning rate: 7.291E-05 | global batch size: 2048 | lm loss: 4.288142E+00 | loss scale: 16384.0 | grad norm: 9158.477 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
87049 |
+
time (ms)
|
87050 |
+
iteration 1336/ 292968 | consumed samples: 2736128 | consumed tokens: 265158656 | elapsed time per iteration (ms): 109768.8 | learning rate: 7.296E-05 | global batch size: 2048 | lm loss: 4.275808E+00 | loss scale: 16384.0 | grad norm: 9550.713 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
87051 |
+
time (ms)
|
87052 |
+
iteration 1337/ 292968 | consumed samples: 2738176 | consumed tokens: 265437184 | elapsed time per iteration (ms): 106402.4 | learning rate: 7.302E-05 | global batch size: 2048 | lm loss: 4.278894E+00 | loss scale: 16384.0 | grad norm: 8149.629 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
87053 |
+
time (ms)
|
87054 |
+
iteration 1338/ 292968 | consumed samples: 2740224 | consumed tokens: 265715712 | elapsed time per iteration (ms): 104883.4 | learning rate: 7.307E-05 | global batch size: 2048 | lm loss: 4.285826E+00 | loss scale: 16384.0 | grad norm: 8283.185 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
87055 |
+
time (ms)
|
87056 |
+
iteration 1339/ 292968 | consumed samples: 2742272 | consumed tokens: 265994240 | elapsed time per iteration (ms): 105272.5 | learning rate: 7.313E-05 | global batch size: 2048 | lm loss: 4.284776E+00 | loss scale: 16384.0 | grad norm: 8637.702 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
87057 |
+
time (ms)
|
87058 |
+
iteration 1340/ 292968 | consumed samples: 2744320 | consumed tokens: 266272768 | elapsed time per iteration (ms): 102678.5 | learning rate: 7.318E-05 | global batch size: 2048 | lm loss: 4.302094E+00 | loss scale: 16384.0 | grad norm: 8230.286 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
87059 |
+
time (ms)
|
87060 |
+
iteration 1341/ 292968 | consumed samples: 2746368 | consumed tokens: 266551296 | elapsed time per iteration (ms): 103750.2 | learning rate: 7.324E-05 | global batch size: 2048 | lm loss: 4.306873E+00 | loss scale: 16384.0 | grad norm: 12167.833 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
87061 |
+
time (ms)
|
87062 |
+
iteration 1342/ 292968 | consumed samples: 2748416 | consumed tokens: 266829824 | elapsed time per iteration (ms): 104922.5 | learning rate: 7.329E-05 | global batch size: 2048 | lm loss: 4.294527E+00 | loss scale: 16384.0 | grad norm: 11905.773 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
87063 |
+
time (ms)
|
87064 |
+
iteration 1343/ 292968 | consumed samples: 2750464 | consumed tokens: 267108352 | elapsed time per iteration (ms): 103900.0 | learning rate: 7.335E-05 | global batch size: 2048 | lm loss: 4.295758E+00 | loss scale: 16384.0 | grad norm: 12966.247 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
87065 |
+
time (ms)
|
87066 |
+
iteration 1344/ 292968 | consumed samples: 2752512 | consumed tokens: 267386880 | elapsed time per iteration (ms): 112773.0 | learning rate: 7.340E-05 | global batch size: 2048 | lm loss: 4.293741E+00 | loss scale: 16384.0 | grad norm: 17679.849 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
87067 |
+
time (ms)
|
87068 |
+
iteration 1345/ 292968 | consumed samples: 2754560 | consumed tokens: 267665408 | elapsed time per iteration (ms): 107333.9 | learning rate: 7.345E-05 | global batch size: 2048 | lm loss: 4.285107E+00 | loss scale: 16384.0 | grad norm: 12319.450 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
87069 |
+
time (ms)
|
87070 |
+
iteration 1346/ 292968 | consumed samples: 2756608 | consumed tokens: 267943936 | elapsed time per iteration (ms): 107084.2 | learning rate: 7.351E-05 | global batch size: 2048 | lm loss: 4.317650E+00 | loss scale: 16384.0 | grad norm: 10941.971 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
87071 |
+
time (ms)
|
87072 |
+
iteration 1347/ 292968 | consumed samples: 2758656 | consumed tokens: 268222464 | elapsed time per iteration (ms): 104355.1 | learning rate: 7.356E-05 | global batch size: 2048 | lm loss: 4.266949E+00 | loss scale: 16384.0 | grad norm: 8940.800 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
87073 |
+
time (ms)
|
87074 |
+
iteration 1348/ 292968 | consumed samples: 2760704 | consumed tokens: 268500992 | elapsed time per iteration (ms): 102429.5 | learning rate: 7.362E-05 | global batch size: 2048 | lm loss: 4.283114E+00 | loss scale: 16384.0 | grad norm: 7895.135 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
87075 |
+
time (ms)
|
87076 |
+
iteration 1349/ 292968 | consumed samples: 2762752 | consumed tokens: 268779520 | elapsed time per iteration (ms): 105154.4 | learning rate: 7.367E-05 | global batch size: 2048 | lm loss: 4.285004E+00 | loss scale: 16384.0 | grad norm: 9430.716 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
87077 |
+
time (ms)
|
87078 |
+
iteration 1350/ 292968 | consumed samples: 2764800 | consumed tokens: 269058048 | elapsed time per iteration (ms): 103674.9 | learning rate: 7.373E-05 | global batch size: 2048 | lm loss: 4.279161E+00 | loss scale: 16384.0 | grad norm: 10926.594 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
87079 |
+
time (ms)
|
87080 |
+
------------------------------------------------------------------------------------------------
|
87081 |
+
validation loss at iteration 1350 | lm loss value: 4.259500E+00 | lm loss PPL: 7.077459E+01 |
|
87082 |
+
------------------------------------------------------------------------------------------------
|
87083 |
+
iteration 1351/ 292968 | consumed samples: 2766848 | consumed tokens: 269336576 | elapsed time per iteration (ms): 274611.4 | learning rate: 7.378E-05 | global batch size: 2048 | lm loss: 4.258837E+00 | loss scale: 16384.0 | grad norm: 10373.234 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
87084 |
+
time (ms)
|
87085 |
+
iteration 1352/ 292968 | consumed samples: 2768896 | consumed tokens: 269615104 | elapsed time per iteration (ms): 106646.8 | learning rate: 7.384E-05 | global batch size: 2048 | lm loss: 4.268482E+00 | loss scale: 16384.0 | grad norm: 9422.137 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
87086 |
+
time (ms)
|
87087 |
+
iteration 1353/ 292968 | consumed samples: 2770944 | consumed tokens: 269893632 | elapsed time per iteration (ms): 109903.2 | learning rate: 7.389E-05 | global batch size: 2048 | lm loss: 4.249788E+00 | loss scale: 16384.0 | grad norm: 9869.253 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
87088 |
+
time (ms)
|
87089 |
+
iteration 1354/ 292968 | consumed samples: 2772992 | consumed tokens: 270172160 | elapsed time per iteration (ms): 104478.9 | learning rate: 7.395E-05 | global batch size: 2048 | lm loss: 4.269929E+00 | loss scale: 16384.0 | grad norm: 14670.245 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
87090 |
+
time (ms)
|
87091 |
+
iteration 1355/ 292968 | consumed samples: 2775040 | consumed tokens: 270450688 | elapsed time per iteration (ms): 104033.5 | learning rate: 7.400E-05 | global batch size: 2048 | lm loss: 4.291121E+00 | loss scale: 16384.0 | grad norm: 17109.005 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
87092 |
+
time (ms)
|
87093 |
+
iteration 1356/ 292968 | consumed samples: 2777088 | consumed tokens: 270729216 | elapsed time per iteration (ms): 103055.2 | learning rate: 7.406E-05 | global batch size: 2048 | lm loss: 4.270620E+00 | loss scale: 16384.0 | grad norm: 11280.739 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
87094 |
+
time (ms)
|
87095 |
+
iteration 1357/ 292968 | consumed samples: 2779136 | consumed tokens: 271007744 | elapsed time per iteration (ms): 102621.3 | learning rate: 7.411E-05 | global batch size: 2048 | lm loss: 4.277614E+00 | loss scale: 16384.0 | grad norm: 9553.789 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
87096 |
+
time (ms)
|
87097 |
+
iteration 1358/ 292968 | consumed samples: 2781184 | consumed tokens: 271286272 | elapsed time per iteration (ms): 103434.3 | learning rate: 7.416E-05 | global batch size: 2048 | lm loss: 4.257460E+00 | loss scale: 16384.0 | grad norm: 12285.977 | num zeros: 0.0 | curriculum seqlen: 136 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
87098 |
+
time (ms)
|