|
5-shot,FineWeb-1.5T,Ours-Base,Ours-Upsampling2,All-Upsampling1
|
|
time: 22 min,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192
|
|
5k,0.0341,0.0416,0.0565,0.0526
|
|
10k,0.0715,,0.0931,0.0767
|
|
15k,0.0765,,0.1061,0.1127
|
|
20k,0.0787,,0.1183,0.1247
|
|
25k,0.0892,0.115,0.1352,0.1343
|
|
30k,0.0911,0.1366,0.1271,0.1421
|
|
35k,0.097,0.1488,0.1485,0.1524
|
|
40k,0.1028,0.1355,0.1488,0.1562
|
|
45k,0.1078,0.1488,0.162,0.1598
|
|
50k,0.105,0.154,0.159,0.1698
|
|
55k,0.1097,0.1607,0.1662,0.1704
|
|
60k,0.1211,0.1654,0.1612,0.1801
|
|
65k,0.1089,0.1573,0.1693,0.1823
|
|
70k,0.1222,0.1634,0.1679,0.1767
|
|
75k,0.1097,0.1709,0.1881,0.1762
|
|
80k,0.1277,0.1573,0.1776,0.1964
|
|
85k,0.128,0.1776,0.1889,0.1889
|
|
90k,0.1158,0.1598,0.1806,0.1773
|
|
95k,0.1235,0.1762,0.1781,0.1917
|
|
100k,0.1258,,0.1928,0.1947
|
|
105k,0.1366,,0.1814,0.2094
|
|
110k,0.1377,0.1756,0.1859,
|
|
115k,0.1346,0.1831,0.1947,0.2119
|
|
120k,0.1402,0.2014,,0.2119
|
|
125k,0.1307,0.203,0.1992,0.1787
|
|
130k,0.1368,0.1997,0.1994,0.2086
|
|
135k,0.1363,,0.2014,0.2069
|
|
140k,0.1435,,0.1986,0.2058
|
|
145k,0.1532,,0.1953,0.2102
|
|
150k,0.1404,,,0.2075
|
|
155k,0.1418,,0.1931,0.2205
|
|
160k,0.1346,,0.2116,0.2208
|
|
165k,0.1524,,0.2139,0.2213
|
|
170k,0.1388,,,0.2169
|
|
175k,0.1438,,0.2222,0.2321
|
|
180k,0.1471,,0.2249,0.236
|
|
185k,0.1499,,0.2222,0.2366
|
|
190k,0.1504,,,0.2274
|
|
195k,0.1554,,,0.2454
|
|
200k,0.1565,,,0.2346
|
|
205k,0.1726,,,0.2316
|
|
210k,0.1623,,,0.2493
|
|
215k,0.1576,,,0.2355
|
|
220k,0.1693,,,0.2427
|
|
225k,0.1596,,,0.244
|
|
230k,0.1693,,,0.2554
|
|
235k,0.172,,,0.2535
|
|
240k,0.1712,,,
|
|
245k,0.1704,,,
|
|
250k,0.1784,,,
|
|
255k,0.174,,,
|
|
260k,0.1756,,,
|
|
265k,0.1886,,,
|
|
270k,0.182,,,
|
|
275k,0.187,,,
|
|
280k,0.1704,,,
|
|
285k,0.1903,,,
|
|
290k,,,,
|
|
300k,,,,
|
|
305k,,,,
|
|
310k,,,,
|
|
315k,,,,
|
|
320k,,,,
|
|
325k,,,,
|
|
330k,,,,
|
|
335k,,,, |