add plots
Browse files- data/plots/c4_filters_hellaswag.json +1 -0
- data/plots/dataset_ablations.json +1 -0
- data/plots/filtering_steps.json +1 -0
- index.html +10 -1
- src/plotting.js +82 -0
data/plots/c4_filters_hellaswag.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"data":{"filtering-baseline-2019-18-60gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[null,null,0.2911666582028071,0.31999999781449634,0.3476666659116745,0.3673333376646042,0.3841666678587596,0.396166667342186,0.40683333575725555,0.4143333335717519,0.42099999884764355,0.4248333275318146,0.4294999986886978,0.4333333323399226,0.4363333334525426],"label":"baseline"},"filtering-c4-curly_bracket":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[null,null,0.29316666225592297,0.32333333293596905,0.34833333392937976,0.37016666928927106,0.3866666704416275,0.39916667342185974,0.4098333368698756,0.4176666686932246,0.422666663924853,0.42466666797796887,0.429666668176651,0.43316666781902313,0.43683333198229474],"label":"curly_bracket filter"},"filtering-c4-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[null,null,0.29383333027362823,0.3306666662295659,0.3608333319425583,0.3838333288828532,0.39299999674161273,0.4049999962250392,0.4166666666666667,0.4331666628519694,0.44066666563351947,0.4479999939600627,0.45033333202203113,0.4536666621764501,0.4533333331346512],"label":"terminal_punct 
filter"},"filtering-c4-word_lengths":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[null,null,0.29099999864896137,0.31966666877269745,0.34183333317438763,0.3605000029007594,0.3793333371480306,0.39800000190734863,0.4116666615009308,0.4183333267768224,0.4231666624546051,0.4298333326975505,0.43566666543483734,0.44033333162466687,0.44200000166893005],"label":"word_lengths filter"},"filtering-c4-all":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[null,null,0.29849999646345776,0.3388333320617676,0.3713333358367284,0.39916666845480603,0.41683333615461987,0.43249999980131787,0.4416666676600774,0.450833335518837,0.4599999984105428,0.47033333281675976,0.4763333300749461,0.4795000006755193,0.4826666663090388],"label":"All filters"},"filtering-c4-all-except-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[null,null,0.2973333348830541,0.33400000631809235,0.36916666726271313,0.39299999674161273,0.40933333337306976,0.41700000564257306,0.4266666720310847,0.4348333328962326,0.4429999937613805,0.44849999248981476,0.453999991218249,0.4598333289225896,0.464999998609225],"label":"All filters except 
terminal_punct"},"sm-baseline-c4":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[null,null,0.29733332329326206,0.3346666594346364,0.36666666467984516,0.39177778032090926,0.4108888937367334,0.42588888936572605,0.4386666648917728,0.44777777459886337,0.45900000135103863,0.46800000137752956,0.4753333330154419,0.4793333311875661,0.4818888869550493],"label":"C4"}},"layout":{"xaxis":{"title":{"text":"Training tokens (Billions)"},"range":[4,29]},"yaxis":{"title":{"text":"agg_score"},"range":[0.3,0.49]},"title":{"text":"C4 filtering effect on HellaSwag"}}}
|
data/plots/dataset_ablations.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"data":{"FineWeb (ours)":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.
795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[null,null,null,null,0.3712157472968102,0.38569323457777493,0.3981285229325294,0.4066198557615279,0.41478234604001035,0.4201014973223208,0.42716352790594103,0.43035290017724037,0.4334479622542857,0.43603434190154067,0.4374171830713749,0.4378406159579754,0.44050431549549096,0.4426967814564704,0.444344089180231,0.44582218378782273,0.44676088243722917,0.44773444831371306,0.44797005578875543,0.4479209095239639,0.4497388891875743,0.451181749254465,0.4514984801411629,0.4518859677016735,0.45252272188663484,0.4525098197162151,0.4530941963195801,0.453959035128355,0.4554740980267525,0.4559641920030117,0.45572500452399256,0.45685316547751426,0.45709227323532103,0.45743352398276327,0.4586012713611126,0.4593494705855846,0.45895241349935534,0.4599407374858856,0.4597075887024403,0.46024790331721305,0.46140901520848276,0.4629077948629856,0.46317790597677233,0.46494788229465484,0.4657444849610329,0.4665262944996357,0.46651043817400933,0.466835780441761,0.46796536445617676,0.4681459695100784,0.4683231070637703,0.46803470253944396,0.4680803991854191,0.46696048602461804,0.46784936264157295,0.4682179480791092,0.46965089738368987,0.4700184591114521,0.47036621868610384,0.4702661596238612,0.4707536585628985,0.47030577659606926,0.47028143927454946,0.4708319529891014,0.47103812247514726,0.46995603293180466,0.4700050979852676,0.47025975957512844,0.47025189697742464,0.4695272326469421,0.4708316929638386,0.47134945169091225,0.47233630791306497,0.472486087679863,0.47400609999895094,0.4740514993667603,0.4746290810406208,0.47478879392147066,0.474917159229517,0.47537324130535125,0.4754243455827236,0.4759679578244686,0.4765262506902218,0.4769704148173
332,0.47606868073344233,0.476014269143343,0.4759118087589741,0.4757449202239513,0.4757299132645129,0.4764128066599368,0.4770635090768337,0.4772775433957577,0.4777563586831093,0.4781574815511703,0.47913932502269735,0.47997388914227485,0.47990029156208036,0.48027274310588836,0.48123554810881614,0.4807776227593422,0.48076377213001253,0.48124308288097384,0.4807186722755432,0.480298426002264,0.48083388805389404,0.48105664253234864,0.48138434141874314,0.4823483981192112,0.4827818602323532,0.4833590790629387,0.4834526889026165,0.4835219897329807,0.48327558264136317,0.4832548059523106,0.482832083106041,0.482999499887228,0.48349839374423026,0.48438283503055574,0.48480270951986315,0.4846198961138725,0.4855451248586178,0.48584607914090155,0.48580994829535484,0.48572852462530136,0.4865334898233414,0.48629426285624505,0.4866053737699986,0.486323406547308,0.4868562504649162,0.48631764724850657,0.48656089380383494,0.4869096249341965,0.4872058629989624,0.4876266010105609,0.48828058391809454,0.48916375041007987,0.4885731689631939,0.48913962915539744,0.48933037295937537,0.4896206259727478,0.4888610139489174,0.4889063358306884,0.48800586611032487,0.48769507482647895,0.4881645493209362,0.4885176382958889,0.4889214225113392,0.489346531778574,0.4897931829094887,0.4895638138055801,0.4895636722445488,0.4894485406577587,0.4895821809768677,0.48963045328855515,0.4895258165895939,0.4892311230301857,0.48935932368040086,0.48930409029126165,0.4892051488161087,0.48934917002916334,0.4892323412001133,0.4888860426843166,0.4898474603891373,0.4898831218481064],"label":"FineWeb 
(ours)"},"C4":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.8
92736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[null,null,null,null,0.37031132541596884,0.3861145623028278,0.39781578034162524,0.4049981914460659,0.41125049293041227,0.4157645933330059,0.4184085942804813,0.42142740786075583,0.42450482919812194,0.4273415431380271,0.429358696192503,0.4319171831011772,0.43310311511158944,0.4356558553874493,0.43729385510087015,0.4398450471460819,0.44034275487065316,0.4422505870461464,0.4438278801739215,0.4442768432199955,0.44453793913125994,0.44682204648852347,0.4492095001041889,0.45029403567314147,0.4518999971449375,0.45325383394956587,0.45350821912288664,0.4530544176697731,0.4531293801963329,0.453867893666029,0.4546523816883564,0.45572223886847496,0.455907317250967,0.4559489533305168,0.45620120167732237,0.4561437442898749,0.4568042539060115,0.45808903500437725,0.45854576379060746,0.4595249958336353,0.46013019010424616,0.4601067304611206,0.4603969775140285,0.46039394065737715,0.46063812673091886,0.4616517499089241,0.462982639670372,0.46384881511330606,0.46470888406038285,0.46438453644514077,0.4641804374754428,0.4632778570055961,0.4631356731057167,0.46262368112802504,0.4630329914391041,0.4633222192525864,0.46437387317419043,0.4648158758878707,0.465447574853897,0.46562537327408793,0.465600273758173,0.46574294194579124,0.46528102830052376,0.4660298578441143,0.4666306748986244,0.46725640147924424,0.4672479182481766,0.46756778135895727,0.46775934025645255,0.46824805736541747,0.46802097260951997,0.46941083669662476,0.47014615312218666,0.470748395472765,0.47020162716507913,0.47148469015955924,0.4708667159080505,0.471061547100544,0.47109143659472463,0.47253392785787585,0.4729177556931973,0.473410177975893,0.47412602305412294,0.47420368120074274,0.474722
96878695486,0.47433724999427795,0.4744605697691441,0.47398262843489647,0.4749260261654854,0.4746309369802475,0.4754087567329407,0.4752944201231002,0.4756043516099452,0.4752288468182086,0.47521871849894526,0.4746076203882694,0.47511918321251867,0.4751265667378902,0.4762439362704754,0.4778266429901123,0.4785951733589172,0.4783878833055496,0.47844565808773043,0.4780256047844887,0.47702656015753736,0.47713900655508035,0.4774845123291015,0.47798100039362906,0.4780901148915291,0.4777564540505409,0.4772858589887619,0.47762828022241594,0.47768432945013045,0.4774010144174098,0.4781514324247837,0.4781773522496223,0.47855629697442054,0.4787003755569458,0.479026710242033,0.47915929853916167,0.48030186966061594,0.48038364499807357,0.480581185221672,0.4808879986405373,0.4813949279487133,0.48143648356199265,0.48228714913129805,0.4826158158481121,0.4831616222858429,0.483425835520029,0.48365990445017804,0.48269245773553837,0.4836991146206856,0.48364961966872216,0.48380382508039477,0.4834172509610652,0.48334112986922256,0.48242905735969543,0.4826629556715488,0.4822983302175999,0.48211922869086266,0.483265259861946,0.48355209007859223,0.4838595129549502,0.48433948382735253,0.4856067016720772,0.4857440173625946,0.4866707265377045,0.4869413435459137,0.4869915746152401,0.4868630483746529,0.4867526046931744,0.4863955594599247,0.48616318702697753,0.4860677346587181,0.48642648234963415,0.48611854612827293,0.4861273929476737,0.4865010216832161,0.4868352599442005,0.48694165945053103,0.48753583505749704,0.48733403384685514,0.4865913726389408],"label":"C4"},"Dolma":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,6
7.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[null,null,null,null,0.36829185895621774,0.3826920263469219,0.3932168565690517,0.4001671388745308,0.40611165
165901186,0.4102663993835449,0.4129279285669326,0.4168520301580429,0.4199860475957394,0.42218567058444023,0.4255398973822594,0.42862062379717825,0.43043847382068623,0.4322855733335017,0.4342906281352043,0.4356040135025978,0.437412865459919,0.4377865843474865,0.4391136698424816,0.4399378590285778,0.44036932438611986,0.4404760181903839,0.44079480320215225,0.44110928773880004,0.4419735036790371,0.4431540474295616,0.44414861872792244,0.44663134664297105,0.447731390595436,0.4486377447843551,0.4490752771496772,0.45021008625626563,0.45042492374777787,0.4503888428211211,0.4511564202606677,0.4517831355333328,0.45151555314660075,0.45143224447965613,0.4509019635617732,0.4508959643542766,0.45099312737584113,0.45197013318538665,0.4533025212585926,0.45523594170808784,0.4563622847199439,0.4563429668545722,0.4569791406393051,0.4572999693453312,0.45812175720930093,0.45867776051163667,0.4597163848578929,0.45905695110559464,0.4590897589921951,0.4591017745435237,0.4589703656733035,0.4600061275064945,0.46125792637467383,0.4612972617149353,0.4610033519566059,0.46131007373332966,0.45974892750382423,0.45977524295449257,0.46031488552689553,0.4609262838959694,0.46078696325421326,0.4631240248680114,0.4636728063225746,0.46505812257528306,0.46523031517863267,0.4652312956750392,0.46456133648753156,0.4649143539369106,0.4650689773261547,0.46563632860779763,0.4664593979716301,0.46731297448277476,0.4678213559091091,0.467756862193346,0.4675082013010979,0.4674791909754276,0.4665357306599617,0.4657153382897377,0.4652122125029564,0.46624193117022517,0.4670622080564499,0.4675221122801304,0.46849180534482004,0.4684284642338753,0.4681626588106155,0.4679159983992577,0.4691922187805176,0.46928863450884817,0.46938162073493006,0.46954184845089914,0.4700673170387744,0.46967210322618475,0.46959259286522864,0.4706240609288216,0.47129315659403803,0.4710762068629265,0.47106821537017823,0.4707925833761692,0.4713354676961899,0.47075842395424844,0.4710302673280239,0.4717977650463581,0.4727449908852576,0.47233551591634
74,0.47267045825719833,0.473048622906208,0.4724260903894901,0.47202426195144653,0.4715518891811371,0.4712790258228779,0.47217001020908356,0.4726972974836826,0.47325685098767273,0.4736775815486908,0.4738755039870739,0.4731722638010979,0.47355266809463503,0.4729598484933376,0.4730292782187462,0.47380238249897955,0.4743541695177555,0.4744453772902489,0.47513795644044876,0.47564751878380773,0.4758379586040974,0.47590363696217536,0.47586077228188517,0.4762671314179897,0.4765605591237545,0.4763809978961945,0.4761639192700386,0.4760676920413971,0.4757721446454525,0.47608443796634675,0.47684395089745524,0.47727004289627073,0.4778515577316284,0.47871544063091276,0.4789580747485161,0.479602736979723,0.4793718926608562,0.48051962405443194,0.480340039730072,0.48023983761668204,0.4793727234005928,0.47987697795033457,0.4789552815258503,0.4794896461069583,0.4793884709477424,0.47952600941061974,0.47939058393239975,0.4796253673732281,0.47967042177915564,0.4800381310284137,0.48009947761893274,0.4808355815708637,0.4803269624710083,0.4804235249757767,0.48066105097532275,0.4810446500778198],"label":"Dolma"},"RefinedWeb":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.217728000
00002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[null,null,null,null,0.36935312487184996,0.38374419510364527,0.39556556120514863,0.40262971073389053,0.40787065848708154,0.41219308972358704,0.4158456213772297,0.4189255848526955,0.4221720516681671,0.4249745920300484,0.4275571651756763,0.4307715676724911,0.43210732862353324,0.4332358233630657,0.43628031089901925,0.4377801202237606,0.4382371336221695,0.4408359855413437,0.443413619697094,0.44420371055603025,0.44484473913908007,0.4471548080444336,0.44681392312049867,0.4467212952673435,0.44725192710757256,0.449183938652277,0.4495456732809543,0.45137858
53981971,0.4531454570591449,0.4544703222811222,0.45540532246232035,0.45598348379135134,0.4556840226054192,0.45618869438767423,0.4562996678054332,0.4565622642636299,0.45782349109649656,0.4579134449362755,0.45743068605661386,0.4575530268251895,0.45722133442759505,0.4563326708972454,0.45796059966087344,0.4588068947196007,0.4595696978271008,0.46085438057780265,0.4618456199765205,0.4616811737418174,0.461778799444437,0.4613799624145031,0.4618814967572689,0.46205456703901293,0.4624464973807335,0.46321221739053725,0.46389560475945474,0.4640473112463951,0.46415690779685975,0.46410940065979955,0.4646980591118336,0.4655677951872349,0.4660304687917233,0.46585372239351264,0.46676014959812157,0.46638399064540864,0.46536661833524706,0.46535017490386965,0.46683350801467893,0.46656889393925666,0.46725284233689307,0.4680892124772072,0.4678822658956051,0.4680094726383686,0.469198103249073,0.470146207511425,0.47041539251804354,0.47116121649742126,0.4711993597447872,0.4709890834987164,0.4708621069788933,0.47159409448504447,0.4708657041192055,0.4702023401856422,0.4696627654135227,0.46882943958044054,0.468185143917799,0.46857124716043463,0.4695053867995739,0.470417944341898,0.4714525043964386,0.47220595851540564,0.4725564628839492,0.47324422001838673,0.47333174645900716,0.4733056709170341,0.4735308401286602,0.4738423585891723,0.47416625097393983,0.4753894440829754,0.47546544298529625,0.4756516933441162,0.4759101323783398,0.47567163929343215,0.47464561983942977,0.47505067735910406,0.47595281824469565,0.47621366158127787,0.47611333057284355,0.47660795897245406,0.47582818642258645,0.4758710920810699,0.4767298653721809,0.4770657457411289,0.47764199823141096,0.4785135343670845,0.47847120761871337,0.47832037433981894,0.47839118018746374,0.4785721957683563,0.4793835572898388,0.4795083418488503,0.4799169234931469,0.4805363215506076,0.4808934584259986,0.4812358051538467,0.4816051431000233,0.4815927021205425,0.4814530000090599,0.48048100918531417,0.4804213002324104,0.48112683445215226,0.48118668422
10293,0.4815112330019474,0.48153620585799217,0.48184896931052207,0.48141826167702667,0.4815475441515445,0.4816103555262088,0.4822722218930721,0.48146815076470373,0.4818138018250465,0.48226772993803024,0.4829028986394405,0.48318603038787844,0.484446120262146,0.4841261185705662,0.4838590003550053,0.48318717777729037,0.48345534726977346,0.4831574305891991,0.4837306492030621,0.48418081998825074,0.4850054018199444,0.48536263331770896,0.4854676000773906,0.4851828873157501,0.4849528044462204,0.4844263069331646,0.4835553206503391,0.48416665121912955,0.4842161864042282,0.4840385474264622,0.4839116208255291,0.4846745267510414,0.4844333350658416],"label":"RefinedWeb"},"SlimPajama":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944
,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[null,null,null,null,0.36013916321098804,0.3709003426134586,0.3800318390130996,0.38614354506134985,0.39255225434899327,0.39765713736414904,0.40228414386510847,0.4059404022991657,0.4092565655708313,0.41099545136094096,0.4136519022285938,0.4162317536771297,0.41780887767672537,0.4196796014904976,0.42196245416998857,0.4236407779157162,0.42431910410523416,0.42467992529273035,0.42509203106164933,0.42523933202028275,0.4256282828748226,0.42816226482391356,0.43013309463858607,0.43194242641329766,0.43289266973733903,0.4347443826496601,0.43465954437851906,0.43606125339865687,0.4370398983359337,0.4380304604768753,0.4384885407984257,0.43862698823213575,0.43828715905547144,0.43891616612672807,0.438679126650095,0.4381975084543228,0.43921599835157393,0.439888284355402,0.4399630777537822,0.4410494461655617,0.44306385442614554,0.4435621775686741,0.4448402144014835,0.4448398597538471,0.4457201905548572,0.4452336251735686,0.44593263119459153,0.4458536572754383,0.4460982523858547,0.445988829433918,
0.44623474702239035,0.44649243652820586,0.4471563212573527,0.4487740397453308,0.4496089734137058,0.44998190701007845,0.4499057076871395,0.44955074265599243,0.4494915328919887,0.4500779673457145,0.4507186159491539,0.4517171747982502,0.45319566056132315,0.45317392349243163,0.45306552946567535,0.45329397842288016,0.4527900494635104,0.4526586286723613,0.45291358828544614,0.45282933190464975,0.45260326787829397,0.45251850560307505,0.4518451876938343,0.4522048644721508,0.45168248265981675,0.45291881933808326,0.4540523044764996,0.45541578307747843,0.45543657541275023,0.4569337382912636,0.4560297131538391,0.45644690096378326,0.4559798687696457,0.45641890689730646,0.45619859024882314,0.4572006665170193,0.45651850402355193,0.457117622345686,0.45780671387910843,0.4594726629555225,0.46010534912347795,0.4611823491752148,0.46068611592054365,0.46045204624533653,0.45969258770346644,0.45951006188988686,0.46077279150485995,0.462513330578804,0.46345093324780456,0.4642737157642841,0.464357141405344,0.4634327881038189,0.46236506626009943,0.4610362842679024,0.4608594886958599,0.4611344300210476,0.46224700808525077,0.46374807581305494,0.46639568656682967,0.46628877371549604,0.4665589049458504,0.4661438174545765,0.4662491038441658,0.4647610723972321,0.465025345236063,0.4655064478516578,0.4654252536594868,0.46531233936548233,0.4655247926712036,0.46629018113017084,0.4662842907011509,0.4662777349352837,0.4664938136935234,0.46739139780402184,0.46779298558831217,0.4682520292699337,0.46970570981502535,0.47001703903079034,0.47097785249352453,0.47020549774169923,0.47066197395324705,0.4706603668630123,0.47171802148222924,0.47115041837096217,0.4720696859061718,0.47241689413785937,0.47169830799102785,0.4712020568549632,0.4716058835387229,0.47207002639770507,0.4718272626399994,0.4730011515319347,0.4735621757805347,0.47323642000555993,0.4731345571577549,0.4738350659608841,0.473908656090498,0.4731449708342552,0.4743215024471283,0.47447666302323344,0.4741019673645496,0.4740337811410427,0.4738099582493305
3,0.473081661015749,0.47391644641757014,0.4742524974048138,0.47385535687208175,0.47428033500909805,0.4746873453259467,0.47334098219871523,0.4730934128165245,0.4729781374335289,0.4734421379864216,0.4738457091152668],"label":"SlimPajama"},"The Pile":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.2
41152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[null,null,null,null,0.35403027236461637,0.36315691284835333,0.3699307084083557,0.3759866751730442,0.3822197556495667,0.38530751317739487,0.3886714994907379,0.3910529114305973,0.39301389157772065,0.39559405893087385,0.39784636646509164,0.40015986263751985,0.4027916923165321,0.40428677424788473,0.4043083056807518,0.40494729280471803,0.4064370468258858,0.407907473295927,0.41011113226413726,0.41224115565419195,0.4144911035895348,0.4152746140956879,0.4157869629561901,0.4168264023959637,0.4170485265552998,0.41803082302212713,0.41931466087698926,0.41980380192399025,0.41905572935938834,0.4204210460186005,0.42110354006290435,0.421506617218256,0.42289810329675676,0.42470798194408416,0.4248323492705822,0.424747234582901,0.42606232687830925,0.42716642618179324,0.4277482956647873,0.4293295681476593,0.4302771858870983,0.429832735657692,0.43018229901790617,0.4304100930690765,0.43095132932066915,0.43160988613963125,0.432910792529583,0.43309193402528756,0.43355609327554695,0.43359937965869894,0.43301409259438517,0.4326248303055763,0.4334431245923042,0.43437108024954796,0.4340257093310356,0.4352980673313141,0.43563542142510414,0.43546789288520815,0.4360013075172901,0.4372364699840546,0.43800699785351754,0.4385517969727516,0.43829558715224265,0.43806290701031686,0.4384324349462986,0.4385986141860485,0.43900027498602856,0.4400079883635043,0.43933424949645994,0.4397112004458904,0.4397529311478138,0.4400021076202393,0.44
107552617788315,0.4427199296653271,0.44177714735269547,0.4420967280864715,0.44206107780337334,0.44179753586649895,0.4418474048376083,0.4423248037695885,0.44180904850363734,0.4422737643122673,0.44300810620188713,0.44330510273575774,0.4441195271909236,0.4447399370372295,0.44552269130945205,0.44593375697731974,0.4462174646556377,0.44630332216620444,0.44741628021001817,0.4480100393295287,0.44780487865209573,0.4480263926088809,0.44870344549417496,0.4489807546138763,0.4478344045579433,0.447753146290779,0.4492069073021411,0.44926297143101684,0.44869897738099096,0.45012065693736075,0.4509387515485286,0.4500133350491523,0.4505564682185649,0.4512146979570389,0.45188284665346146,0.45187010392546656,0.4522735998034477,0.4525675527751446,0.4534406378865242,0.45298373177647583,0.45276769027113906,0.4525142863392829,0.4527019999921322,0.4526311345398426,0.45349918827414504,0.45450910031795494,0.4554465614259243,0.4559020109474659,0.45558119714260104,0.45572791919112204,0.45617976114153863,0.4565602965652943,0.4566966436803341,0.4581799410283566,0.4586497411131858,0.45875198021531105,0.4588025249540806,0.45904205068945886,0.4590006470680237,0.4592668078839779,0.4595161899924277,0.459639312326908,0.45990723744034767,0.45912066772580146,0.45880992859601977,0.45943455025553703,0.4595917724072933,0.45958872437477105,0.4612408824265003,0.4613038249313831,0.46093683913350103,0.46127560213208196,0.46168113946914674,0.4610830768942833,0.4619025319814682,0.4615081101655959,0.46194025799632066,0.4626124188303947,0.4634204514324665,0.463740087300539,0.4655907340347767,0.46584516689181327,0.46609186828136445,0.46637092977762223,0.46627322807908056,0.46496834605932236,0.4654968619346619,0.46497657895088196,0.46417722329497335,0.46418203562498095,0.4641611523926258,0.4636922091245651],"label":"The 
Pile"},"RedPajama2":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584
,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[null,null,null,null,0.36020261645317075,0.37207864299416543,0.38215991929173465,0.3887263186275959,0.3934720449149608,0.3976951420307159,0.4018514782190323,0.40409348607063295,0.4073106154799461,0.4097383216023445,0.412900410592556,0.41463092267513274,0.41752370819449425,0.41950091868638995,0.42179937809705736,0.42283808290958397,0.4250119782984257,0.42555054649710655,0.4271428920328617,0.4278797753155231,0.42892353385686877,0.4300256744027138,0.4309113919734955,0.43137083500623696,0.43263344764709466,0.4347386948764324,0.434593190997839,0.4354085043072701,0.4361723817884922,0.4368736453354359,0.4363872602581978,0.437036245316267,0.43818535283207893,0.43855890408158305,0.4401340030133724,0.4409464053809643,0.44143203869462005,0.44234083741903296,0.44288691133260727,0.44235401451587675,0.4424125269055367,0.4428589724004269,0.4423035465180874,0.4427142545580864,0.4434956908226013,0.4445128194987774,0.44537728279829025,0.4461486995220184,0.4466302402317524,0.44617108479142187,0.446101326495409,0.44619950726628305,0.446491190046072,0.4471056528389454,0.4490526393055916,0.4495409466326237,0.45030279234051707,0.4513317734003066,0.4515420950949192,0.45110466703772545,0.4514125272631645,0.4519342966377734,0.45223737955093374,0.4519515648484229,0.45210928544402124,0.4531633608043194,0.45289948806166647,0.453323532640934,0.4550468109548091,0.4550088882446288,0.45433161333203315,0.4552720218896866,0.45461406633257867,0.45399482920765877,0.45426675528287885,0.4548324435949326,0.4549082241952419,0.4551001012325287,0.45547500252723694,0.45594589933753016,0.4558583088219166,0.45595540255308153,0.45691645964980127,0.4574550613760948,0.458
4540419280529,0.459098344296217,0.45916826501488683,0.45948839783668516,0.46112224757671355,0.4611063919961452,0.46171060875058173,0.4630969136953354,0.46330603063106535,0.4624505050480366,0.4625287853181361,0.46226065307855596,0.4616734065115451,0.46187167763710024,0.46120553836226463,0.46134640350937844,0.4622974008321762,0.4626293145120143,0.46236210465431216,0.46368267834186555,0.4645942062139511,0.46427512764930723,0.4639273129403591,0.464119590818882,0.46363756284117696,0.4630771316587925,0.4631906099617481,0.46360151246190073,0.4643662080168724,0.4643269568681717,0.4646356225013733,0.4653183162212372,0.4648941487073898,0.46508019268512724,0.46560496985912325,0.4656318023800849,0.46511366963386525,0.4662900559604168,0.46538560688495634,0.4643936313688755,0.46428236439824105,0.46445935145020484,0.4639253720641136,0.4648166388273238,0.46585445180535306,0.4660604313015938,0.46592489928007125,0.46676246225833895,0.46687850430607797,0.4671006828546524,0.4672992005944252,0.46727297604084017,0.46671744659543035,0.4663915753364563,0.46620967611670494,0.4658009834587574,0.46597268655896185,0.4661875404417515,0.4664167441427708,0.4664564236998558,0.46663766726851463,0.4669985204935074,0.46679810136556626,0.4673094697296619,0.46738598123192787,0.46778359562158583,0.46763788163661957,0.46792311519384383,0.4677123583853245,0.4680258259177208,0.46751984506845473,0.467444808781147,0.467233594506979,0.4666879989206791,0.4662370540201664,0.4669689737260342,0.46710963621735574,0.4673020221292973,0.46824508905410767,0.4690551429986954],"label":"RedPajama2"}},"layout":{"xaxis":{"title":{"text":"Training tokens (billions)"},"range":[4,355]},"yaxis":{"title":{"text":"Agg Score"},"range":[0.35,0.5]},"title":{"text":"Dataset Ablations"}}}
|
data/plots/filtering_steps.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"data":{"big-run-sampled_full_filtered_no_dedup":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.601280
00000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[null,null,null,null,0.36926820836961266,0.38203409612178796,0.39078781828284265,0.3956208534538746,0.3990963250398636,0.40308327972888947,0.40649700313806536,0.407912353426218,0.411406696587801,0.41404814645648,0.4155092850327492,0.41795957311987875,0.4200871780514717,0.4227150440216064,0.42384095713496206,0.426036062836647,0.4271404132246971,0.42894179224967954,0.4305384442210197,0.4319251500070095,0.4326193243265151,0.43427793607115744,0.4347934126853943,0.4345370322465897,0.4352454699575901,0.43554568514227865,0.4339814603328705,0.4342834234237671,0.43531606569886205,0.43576630502939223,0.4357234910130501,0.4372501514852047,0.43833074644207953,0.43781240433454516,0.4379648305475712,0.4381132386624813,0.438448067009449,0.4384333245456219,0.4389940395951271,0.4400040380656719,0.441561146825552,0.4423880904912948,0.44331216141581525,0.4443627014756202,0.44366268813610077,0.44311698600649835,0.44364576414227486,0.4431915104389191,0.4427203856408596,0.4443676613271236,0.4455058850347995,0.44615875855088233,0.4465731419622898,0.4479690581560135,0.44840321913361547,0.447981271892786,0.4482423685491085,0.44924388974905016,0.44899323880672454,0.4485030435025692,0.45028517991304395,0.4507353216409683,0.45116728320717814,0.45144951716065407,0.4518436208367348,0.4513734854757786,0.4518222324550152,0.45145809948444365,0.4512837529182434,0.4519437663257122,0.4521554559469223,0.45147905945777883,0.4518196329474449,0.45257959216833116,0.4524036094546318,0.4527964904904366,0.4530105598270893,0.45335754454135896,0.45396619141101835,0.45459589511156084,0.4550217859447002,0.4560036838054657,0.4565500751137733,
0.4568891301751137,0.45712930485606196,0.4563898526132107,0.45567619875073423,0.45559246614575377,0.45511320903897284,0.45539349168539045,0.4561412051320076,0.45715235769748686,0.4576687462627887,0.45818602591753,0.458062607049942,0.4581084720790386,0.4570564292371273,0.4567670665681362,0.45731772035360335,0.4576993718743324,0.4577161468565464,0.4591300703585148,0.45967747196555137,0.4598850101232529,0.4607538335025311,0.4612186782062054,0.4619384504854679,0.4626227006316185,0.46234423369169236,0.46176643297076225,0.46136114969849584,0.46150869578123094,0.4619676761329174,0.4622086018323898,0.46235987469553946,0.46299406290054324,0.4630826920270919,0.46264655888080586,0.46258691400289537,0.4632560282945633,0.4634956054389477,0.4633393153548241,0.46364396810531616,0.46416858658194543,0.46397360116243364,0.464645367115736,0.46442538052797316,0.46406724750995637,0.46415518000721934,0.4643302120268345,0.4648954145610332,0.465849281847477,0.4662369966506958,0.4658923275768757,0.466351430863142,0.46581498682498934,0.4653567478060722,0.46588137596845625,0.46648752465844157,0.4664620153605938,0.4667314425110817,0.4669733181595802,0.4670629248023033,0.467547832429409,0.46745155528187754,0.46771074533462526,0.46740650609135626,0.46696603000164033,0.466818867623806,0.467232009768486,0.4671480022370815,0.46770014688372613,0.4685129299759865,0.46813031658530235,0.4682801507413387,0.4687854625284672,0.4682334467768669,0.4680188588798046,0.46834352910518645,0.467834347486496,0.46763256937265396,0.46837265491485597,0.4681259348988533,0.4686297595500945],"label":"FineWeb: base filtering 
only"},"big-run-sampled_full_ind_minhash":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003
,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[null,null,null,null,0.370305210724473,0.38538585379719736,0.39511206299066537,0.40298017859458923,0.409290175139904,0.413666632771492,0.41749832555651667,0.42125996947288513,0.42437445372343063,0.4274251386523247,0.43023875951766966,0.4321109838783741,0.43424192667007444,0.43613557070493697,0.437407860904932,0.43808431178331375,0.43985605910420417,0.44053239896893504,0.44194763004779813,0.4428853578865528,0.44475727826356887,0.44533989727497103,0.4468288891017437,0.44789535403251646,0.44945245534181594,0.45021386444568634,0.45117616206407546,0.45165844038128855,0.451799089461565,0.4520416229963303,0.4520642809569836,0.45239565819501876,0.4518701292574406,0.4518812879920005,0.45248807445168493,0.45262535139918325,0.45352781713008883,0.4553974881768227,0.4566363111138344,0.45700768008828163,0.4576442360877991,0.4585714004933834,0.4587270848453045,0.45955651104450224,0.460345446318388,0.4614695131778717,0.46071490868926046,0.46118568256497383,0.4599429272115231,0.4599632196128368,0.46037245318293574,0.4606971569359303,0.4606951214373112,0.4611978381872177,0.46112299934029577,0.4607429005205631,0.46163453757762907,0.46182278990745546,0.4631089620292187,0.4638592816889286,0.4651353217661381,0.4655767247080803,0.4659814231097698,0.46648655757308,0.4675781108438968,0.4675243757665156,0.46797287091612816,0.46853338107466697,0.4683393523097038,0.4670936234295368,0.4668723739683628,0.4665733970701694,0.4668287120759487,0.46617602929472923,0.467276993393898,0.46755580604076385,0.4675174213945866,0.4671892985701561,0.4676224373281002,0.46739820912480357,0.4674134634435177,0.4675380110740662,0.4687583431601524,0.4
697458289563656,0.47001201286911964,0.4705664157867432,0.47074690759181975,0.47003005519509317,0.4694135203957558,0.46936979740858076,0.4689124390482903,0.4697516694664955,0.4707345478236674,0.4721454992890357,0.4728621408343314,0.47416575327515603,0.47499901577830317,0.4748332165181637,0.4749793991446495,0.47513311058282853,0.4751249924302101,0.47412342876195906,0.4744432017207146,0.474757094681263,0.4747910059988499,0.4747039243578911,0.47488212734460833,0.47521442025899885,0.47504854425787923,0.47552693635225296,0.47552043944597244,0.476516292989254,0.4771891236305237,0.47768160700798035,0.47804755941033356,0.47844709232449534,0.4783868357539177,0.4781750589609146,0.47845144048333166,0.47777490466833117,0.4781319200992584,0.4779041476547718,0.4774407997727394,0.4771588683128357,0.47804371640086174,0.4785159431397915,0.4782356970012188,0.47833485826849936,0.4782362774014473,0.478005338460207,0.47810146361589434,0.47927170172333716,0.4796604186296463,0.4800727739930153,0.48045224100351336,0.48049417063593863,0.4802452601492405,0.47976561784744265,0.48014174327254294,0.4805368520319462,0.480988722294569,0.4814631320536137,0.48263009190559386,0.48285799250006667,0.48333154022693625,0.48357397094368937,0.48365214839577675,0.4836900994181633,0.484160565584898,0.4837813340127468,0.4838929153978824,0.48443443700671185,0.48451621904969217,0.48415875136852266,0.48465299159288405,0.4842782385647297,0.4841688245534897,0.48420060351490973,0.48406358510255804,0.48331916853785517,0.48322339728474617,0.48276304453611374,0.4823808237910271,0.48311666399240494],"label":"FineWeb: independent MinHash (id 
mh)"},"big-run-sampled-fineweb-c4-filters":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.6012800000000
3,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[null,null,null,null,0.3712877172976732,0.38670630827546115,0.3968585044145584,0.40554883033037187,0.4112760901451111,0.4152331717312336,0.4188694812357426,0.422267484664917,0.4243667766451836,0.42823049873113633,0.4314244844019413,0.43514703661203386,0.43815202414989474,0.4403209649026394,0.441806273162365,0.44376508668065073,0.44470142126083373,0.44577157199382783,0.44722985848784447,0.4481617622077465,0.44891021847724916,0.44869395792484285,0.44921630769968035,0.44984524250030516,0.4503038816154003,0.45043965727090834,0.4506390020251274,0.4515539951622486,0.4522737957537174,0.4526932582259178,0.45317558869719504,0.45490805581212046,0.4543730542063713,0.4542605958878994,0.45531259998679163,0.456124572455883,0.45578436031937597,0.4565194480121136,0.4580251507461071,0.458286102861166,0.4588122643530369,0.45901755914092063,0.4594882421195507,0.4588031515479088,0.4597215823829174,0.4601985849440098,0.4617794468998909,0.4625217795372009,0.46424990594387056,0.4641730964183807,0.46495022475719444,0.465314683318138,0.4660008154809475,0.46568036153912545,0.4662005819380283,0.46679292619228363,0.4663336597383022,0.46641242429614066,0.4673418693244457,0.46755558997392654,0.46722459942102434,0.46758766695857046,0.4678506188094616,0.46803232878446577,0.46744388714432716,0.4670224241912365,0.46717201396822927,0.4677027724683285,0.46757384911179545,0.46773228943347933,0.46816296875476837,0.4693212352693081,0.4695054106414318,0.4697939470410347,0.4703126437962055,0.4709296621382236,0.47073659524321554,0.4703500494360924,0.4706985726952553,0.47107653468847277,0.47199371829628944,0.47200043573975564,0.473352938145399
1,0.473261346668005,0.47444241046905516,0.47374596521258355,0.47453284040093424,0.47517829909920695,0.47522192895412446,0.47508981078863144,0.4765052072703838,0.4767457105219364,0.4760475426912308,0.47723821401596067,0.4775520570576191,0.4769199274480343,0.47712810933589933,0.47803026512265206,0.4779872573912144,0.4787396192550659,0.478919418156147,0.47907158359885216,0.478837601095438,0.4786116167902946,0.478425532579422,0.47877322882413864,0.4795349933207035,0.4798359669744968,0.48130340352654455,0.48103265911340715,0.4820302747189998,0.48193556591868403,0.48263323605060576,0.481933069229126,0.4822238326072693,0.48176143541932104,0.48164708986878396,0.4815290026366711,0.48166586011648177,0.48188470155000684,0.4825871430337429,0.4832161031663418,0.48361599147319795,0.48348221480846404,0.4844072677195072,0.48435106575489045,0.483872663974762,0.48382994458079337,0.48410715833306306,0.483836580067873,0.4837873600423336,0.48440969437360765,0.485145579278469,0.48565298914909355,0.4863760128617286,0.48663519993424414,0.4869131810963154,0.48599529191851615,0.4861787423491478,0.4851324915885925,0.48535613566637037,0.48531915694475175,0.48612819239497174,0.4862202785909176,0.4868394561111927,0.48683882877230644,0.48713951632380487,0.4866954326629639,0.4870163291692734,0.4873525105416775,0.48744417652487754,0.48692786917090414,0.4877421960234642,0.4879465028643607,0.4881573535501956,0.48859142065048217,0.4891356773674488,0.4892439402639866,0.4889910131692886,0.48889509662985803,0.48904356732964516,0.48887107595801355,0.4881747141480446,0.48855887949466703],"label":"FineWeb: id mh + C4 
filters"},"big-run-fineweb-v1-all-dumps":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,
295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[null,null,null,null,0.3712157472968102,0.38569323457777493,0.3981285229325294,0.4066198557615279,0.41478234604001035,0.4201014973223208,0.42716352790594103,0.43035290017724037,0.4334479622542857,0.43603434190154067,0.4374171830713749,0.4378406159579754,0.44050431549549096,0.4426967814564704,0.444344089180231,0.44582218378782273,0.44676088243722917,0.44773444831371306,0.44797005578875543,0.4479209095239639,0.4497388891875743,0.451181749254465,0.4514984801411629,0.4518859677016735,0.45252272188663484,0.4525098197162151,0.4530941963195801,0.453959035128355,0.4554740980267525,0.4559641920030117,0.45572500452399256,0.45685316547751426,0.45709227323532103,0.45743352398276327,0.4586012713611126,0.4593494705855846,0.45895241349935534,0.4599407374858856,0.4597075887024403,0.46024790331721305,0.46140901520848276,0.4629077948629856,0.46317790597677233,0.46494788229465484,0.4657444849610329,0.4665262944996357,0.46651043817400933,0.466835780441761,0.46796536445617676,0.4681459695100784,0.4683231070637703,0.46803470253944396,0.4680803991854191,0.46696048602461804,0.46784936264157295,0.4682179480791092,0.46965089738368987,0.4700184591114521,0.47036621868610384,0.4702661596238612,0.4707536585628985,0.47030577659606926,0.47028143927454946,0.4708319529891014,0.47103812247514726,0.46995603293180466,0.4700050979852676,0.47025975957512844,0.47025189697742464,0.4695272326469421,0.4708316929638386,0.47134945169091225,0.47233630791306497,0.472486087679863,0.47400609999895094,0.4740514993667603,0.4746290810406208,0.47478879392147066,0.474917159229517,0.47537324130535125,0.4754243455827236,0.4759679578244686,0.4765262506902218,
0.4769704148173332,0.47606868073344233,0.476014269143343,0.4759118087589741,0.4757449202239513,0.4757299132645129,0.4764128066599368,0.4770635090768337,0.4772775433957577,0.4777563586831093,0.4781574815511703,0.47913932502269735,0.47997388914227485,0.47990029156208036,0.48027274310588836,0.48123554810881614,0.4807776227593422,0.48076377213001253,0.48124308288097384,0.4807186722755432,0.480298426002264,0.48083388805389404,0.48105664253234864,0.48138434141874314,0.4823483981192112,0.4827818602323532,0.4833590790629387,0.4834526889026165,0.4835219897329807,0.48327558264136317,0.4832548059523106,0.482832083106041,0.482999499887228,0.48349839374423026,0.48438283503055574,0.48480270951986315,0.4846198961138725,0.4855451248586178,0.48584607914090155,0.48580994829535484,0.48572852462530136,0.4865334898233414,0.48629426285624505,0.4866053737699986,0.486323406547308,0.4868562504649162,0.48631764724850657,0.48656089380383494,0.4869096249341965,0.4872058629989624,0.4876266010105609,0.48828058391809454,0.48916375041007987,0.4885731689631939,0.48913962915539744,0.48933037295937537,0.4896206259727478,0.4888610139489174,0.4889063358306884,0.48800586611032487,0.48769507482647895,0.4881645493209362,0.4885176382958889,0.4889214225113392,0.489346531778574,0.4897931829094887,0.4895638138055801,0.4895636722445488,0.4894485406577587,0.4895821809768677,0.48963045328855515,0.4895258165895939,0.4892311230301857,0.48935932368040086,0.48930409029126165,0.4892051488161087,0.48934917002916334,0.4892323412001133,0.4888860426843166,0.4898474603891373,0.4898831218481064],"label":"FineWeb: id mh + C4 + custom filters"}},"layout":{"xaxis":{"title":{"text":"Training tokens (billions)"}},"yaxis":{"title":{"text":"agg_score"},"range":[0.44,0.495]},"title":{"text":"The different FineWeb processing steps"}}}
|
index.html
CHANGED
@@ -2,6 +2,10 @@
|
|
2 |
|
3 |
<head>
|
4 |
<script src="https://distill.pub/template.v2.js"></script>
|
|
|
|
|
|
|
|
|
5 |
<meta name="viewport" content="width=device-width, initial-scale=1">
|
6 |
<meta charset="utf8">
|
7 |
<title>FineWeb: 15T tokens of high quality web data</title>
|
@@ -155,6 +159,8 @@
|
|
155 |
<d-contents>
|
156 |
</d-contents>
|
157 |
|
|
|
|
|
158 |
<p>We have recently released 🍷FineWeb, our new large scale
|
159 |
(15T tokens, 44TB disk space) dataset of clean text sourced from the web for LLM pretraining. You can
|
160 |
download it <a href="https://huggingface.co/datasets/HuggingFaceFW/fineweb">here</a>.</p>
|
@@ -495,6 +501,7 @@
|
|
495 |
each of the different filters used in C4 to a baseline of the independently deduped FineWeb 2019-18 dump
|
496 |
(plot smoothed with a 3 checkpoints sliding window):</p>
|
497 |
<figure><img src="plots/c4_filters.png"/></figure>
|
|
|
498 |
<ul>
|
499 |
<li>applying “All filters” (drop lines not ending on punctuation marks,
|
500 |
mentioning javascript and cookie notices + drop documents outside length thresholds, containing “lorem
|
@@ -584,6 +591,8 @@
|
|
584 |
<li>our custom filters (mentioned in the previous section)</li>
|
585 |
</ul>
|
586 |
<figure><img src="plots/fineweb_all_filters.png"/></figure>
|
|
|
|
|
587 |
<p>We compared 🍷 FineWeb with the following datasets:</p>
|
588 |
<ul>
|
589 |
<li><a
|
@@ -617,7 +626,7 @@
|
|
617 |
collection</a>. We have uploaded checkpoints at every 1000 training steps. You will also find our full <a
|
618 |
href="https://huggingface.co/datasets/HuggingFaceFW/fineweb/blob/main/eval_results.csv">evaluation
|
619 |
results here</a>.</p>
|
620 |
-
<figure
|
621 |
<p>Some histogram comparisons of C4, Dolma, RefinedWeb and
|
622 |
FineWeb:</p>
|
623 |
<figure><img src="plots/Untitled%203.png"/></figure>
|
|
|
2 |
|
3 |
<head>
|
4 |
<script src="https://distill.pub/template.v2.js"></script>
|
5 |
+
<script src="https://cdnjs.cloudflare.com/ajax/libs/mathjs/12.4.2/math.min.js" charset="utf-8"></script>
|
6 |
+
<script src="https://cdn.plot.ly/plotly-2.32.0.min.js" charset="utf-8"></script>
|
7 |
+
<script src="https://cdnjs.cloudflare.com/ajax/libs/lodash.js/4.17.21/lodash.min.js" charset="utf-8"></script>
|
8 |
+
<script type="module" src="src/plotting.js"></script>
|
9 |
<meta name="viewport" content="width=device-width, initial-scale=1">
|
10 |
<meta charset="utf8">
|
11 |
<title>FineWeb: 15T tokens of high quality web data</title>
|
|
|
159 |
<d-contents>
|
160 |
</d-contents>
|
161 |
|
162 |
+
<!-- Your JavaScript file -->
|
163 |
+
|
164 |
<p>We have recently released 🍷FineWeb, our new large scale
|
165 |
(15T tokens, 44TB disk space) dataset of clean text sourced from the web for LLM pretraining. You can
|
166 |
download it <a href="https://huggingface.co/datasets/HuggingFaceFW/fineweb">here</a>.</p>
|
|
|
501 |
each of the different filters used in C4 to a baseline of the independently deduped FineWeb 2019-18 dump
|
502 |
(plot smoothed with a 3 checkpoints sliding window):</p>
|
503 |
<figure><img src="plots/c4_filters.png"/></figure>
|
504 |
+
<figure id="plot-c4_filters_hellaswag" style="height:600px;"></figure>
|
505 |
<ul>
|
506 |
<li>applying “All filters” (drop lines not ending on punctuation marks,
|
507 |
mentioning javascript and cookie notices + drop documents outside length thresholds, containing “lorem
|
|
|
591 |
<li>our custom filters (mentioned in the previous section)</li>
|
592 |
</ul>
|
593 |
<figure><img src="plots/fineweb_all_filters.png"/></figure>
|
594 |
+
|
595 |
+
<figure id="plot-filtering_steps" style="height:600px;"></figure>
|
596 |
<p>We compared 🍷 FineWeb with the following datasets:</p>
|
597 |
<ul>
|
598 |
<li><a
|
|
|
626 |
collection</a>. We have uploaded checkpoints at every 1000 training steps. You will also find our full <a
|
627 |
href="https://huggingface.co/datasets/HuggingFaceFW/fineweb/blob/main/eval_results.csv">evaluation
|
628 |
results here</a>.</p>
|
629 |
+
<figure id="plot-dataset_ablations" style="height:600px;"></figure>
|
630 |
<p>Some histogram comparisons of C4, Dolma, RefinedWeb and
|
631 |
FineWeb:</p>
|
632 |
<figure><img src="plots/Untitled%203.png"/></figure>
|
src/plotting.js
ADDED
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/**
 * Baseline Plotly layout for the page's line charts.
 *
 * The title and axis-title strings here are generic placeholders; the loader
 * in this file merges each plot's own `layout` (from its JSON file) on top of
 * these defaults.
 */
const DEFAULT_LAYOUT = (() => {
  const AXIS_TITLE_FONT_SIZE = 15;
  const LABEL_FONT_SIZE = 14;

  // Builds an axis title block; a fresh object per call so axes share nothing.
  const axisTitle = (text) => ({
    text,
    font: { size: AXIS_TITLE_FONT_SIZE },
  });

  // Frame styling common to both axes: no grid, axis line mirrored on the
  // opposite side, tick marks drawn outside the plotting area.
  const framedAxis = {
    showgrid: false,
    mirror: true,
    ticks: 'outside',
    showline: true,
  };

  return {
    title: {
      text: 'Plot Title',
      font: { size: 19 },
      y: 0.87,
    },
    xaxis: {
      title: axisTitle('X-axis'),
      tickfont: { size: LABEL_FONT_SIZE },
      ...framedAxis,
    },
    yaxis: {
      title: axisTitle('Y-axis'),
      range: [0, 1],
      ...framedAxis,
      tickfont: { size: LABEL_FONT_SIZE },
    },
    // Vertical legend anchored by its bottom-right corner at (x: 1, y: 0),
    // with a fully transparent background.
    legend: {
      orientation: 'v',
      xanchor: 'right',
      yanchor: 'bottom',
      x: 1,
      y: 0,
      font: { size: LABEL_FONT_SIZE },
      bgcolor: 'rgba(0,0,0,0)',
    },
  };
})();
|
53 |
+
|
54 |
+
/**
 * Once the DOM is parsed, render a Plotly line chart into every element whose
 * id starts with "plot-".
 *
 * For an element with id "plot-NAME", the file `data/plots/NAME.json` is
 * fetched. It is read as `{ data: { key: { x, y, label } }, layout }`: each
 * entry of `data` becomes one line trace named by its `label`, and `layout`
 * is layered over DEFAULT_LAYOUT. Failures (network, non-2xx status, invalid
 * JSON, Plotly rendering) are logged to the console and do not affect the
 * other plots.
 */
document.addEventListener('DOMContentLoaded', () => {
  const plotElements = document.querySelectorAll('[id^="plot-"]');

  plotElements.forEach((elem) => {
    // The selector guarantees the id begins with "plot-", so strip that prefix.
    const plotName = elem.id.slice('plot-'.length);

    fetch(`data/plots/${plotName}.json`)
      .then((response) => {
        // fetch() only rejects on network errors; surface HTTP errors
        // explicitly instead of as a misleading JSON parse failure.
        if (!response.ok) {
          throw new Error(
            `Request for plot "${plotName}" failed with HTTP status ${response.status}`,
          );
        }
        return response.json();
      })
      .then((data) => {
        const traces = Object.values(data.data).map((series) => ({
          x: series.x,
          y: series.y,
          type: 'scatter',
          mode: 'lines',
          line: {
            width: 2.5,
          },
          name: series.label,
        }));

        // Merge into a fresh object: _.merge mutates and returns its first
        // argument, so merging straight into DEFAULT_LAYOUT would make every
        // plot share (and overwrite) a single layout object.
        const layout = _.merge({}, DEFAULT_LAYOUT, data.layout);

        // Return the promise so rendering errors reach the catch below.
        return Plotly.newPlot(elem, traces, layout);
      })
      .catch((error) => console.error('Error loading the data:', error));
  });
});
|