hynky HF staff commited on
Commit
ea0a273
1 Parent(s): a8eb593

ablations runs with plotly + mobile view

Browse files
data/plots/c4_filters_hellaswag.json CHANGED
@@ -1 +1 @@
1
- {"data":{"filtering-baseline-2019-18-60gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[null,null,0.2911666582028071,0.31999999781449634,0.3476666659116745,0.3673333376646042,0.3841666678587596,0.396166667342186,0.40683333575725555,0.4143333335717519,0.42099999884764355,0.4248333275318146,0.4294999986886978,0.4333333323399226,0.4363333334525426],"label":"baseline"},"filtering-c4-curly_bracket":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[null,null,0.29316666225592297,0.32333333293596905,0.34833333392937976,0.37016666928927106,0.3866666704416275,0.39916667342185974,0.4098333368698756,0.4176666686932246,0.422666663924853,0.42466666797796887,0.429666668176651,0.43316666781902313,0.43683333198229474],"label":"curly_bracket filter"},"filtering-c4-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[null,null,0.29383333027362823,0.3306666662295659,0.3608333319425583,0.3838333288828532,0.39299999674161273,0.4049999962250392,0.4166666666666667,0.4331666628519694,0.44066666563351947,0.4479999939600627,0.45033333202203113,0.4536666621764501,0.4533333331346512],"label":"terminal_punct 
filter"},"filtering-c4-word_lengths":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[null,null,0.29099999864896137,0.31966666877269745,0.34183333317438763,0.3605000029007594,0.3793333371480306,0.39800000190734863,0.4116666615009308,0.4183333267768224,0.4231666624546051,0.4298333326975505,0.43566666543483734,0.44033333162466687,0.44200000166893005],"label":"word_lengths filter"},"filtering-c4-all":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[null,null,0.29849999646345776,0.3388333320617676,0.3713333358367284,0.39916666845480603,0.41683333615461987,0.43249999980131787,0.4416666676600774,0.450833335518837,0.4599999984105428,0.47033333281675976,0.4763333300749461,0.4795000006755193,0.4826666663090388],"label":"All filters"},"filtering-c4-all-except-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[null,null,0.2973333348830541,0.33400000631809235,0.36916666726271313,0.39299999674161273,0.40933333337306976,0.41700000564257306,0.4266666720310847,0.4348333328962326,0.4429999937613805,0.44849999248981476,0.453999991218249,0.4598333289225896,0.464999998609225],"label":"All filters except 
terminal_punct"},"sm-baseline-c4":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[null,null,0.29733332329326206,0.3346666594346364,0.36666666467984516,0.39177778032090926,0.4108888937367334,0.42588888936572605,0.4386666648917728,0.44777777459886337,0.45900000135103863,0.46800000137752956,0.4753333330154419,0.4793333311875661,0.4818888869550493],"label":"C4"}},"layout":{"xaxis":{"title":{"text":"Training tokens (Billions)"},"range":[4,29]},"yaxis":{"title":{"text":"agg_score"},"range":[0.3,0.49]},"title":{"text":"C4 filtering effect on HellaSwag"}}}
 
1
+ {"data":{"agg_score":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308596294373274,0.35654734168201685,0.3758235517889261,0.38752372190356255,0.39841264486312866,0.4040419068187475,0.4097859803587198,0.41541148349642754,0.416892247274518,0.41986062191426754,0.4234193116426468,0.4218583852052688,0.4243287574499845,0.42519346065819263,0.42440339736640453],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308979943394661,0.35727922804653645,0.3758955802768469,0.39312327839434147,0.3984657619148493,0.4037223849445581,0.40907647646963596,0.41408527828752995,0.42114910110831255,0.42039695382118225,0.4248786196112633,0.42590542137622833,0.4263712782412767,0.42797840014100075,0.4277621991932392],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308933284133672,0.35735468938946724,0.3787423223257065,0.391122592613101,0.3976811040192842,0.4041402228176594,0.4110417179763317,0.4150725454092026,0.42221225984394545,0.4235249478369951,0.42567262239754194,0.42764298990368843,0.4280493911355734,0.42981273680925364,0.42845905013382435],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.33087017294019455,0.35839469730854034,0.379800958558917,0.3909519836306572,0.3985003251582384,0.4028578344732523,0.4080309104174375,0.411550747230649,0.4152813777327537,0.41849316097795963,0.42109199613332743,0.4223319999873638,0.42558939941227436,0.42717534117400646,0.426479609683156],"label":"Short lines filter"},"filtering-custom-c4-hynek-filters":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308933284133672,0.36176552809774876,0.3792347256094217,0.3928614556789398,0.40233771689236164,0.4110558107495308,0.4163004532456398,0.4155100043863058,0.42281083948910236,0.424554904922843,0.42792712710797787,0.4278372637927532,0.43066211044788355,0.43145042285323143,0.4331468697637319],"label":"Filters 
combined"}},"commonsense_qa/acc":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.1860000044107437,0.24049999564886088,0.2750000059604645,0.27250000834465027,0.2974999994039535,0.3079999983310699,0.32949998974800104,0.3349999934434891,0.3235000073909759,0.3339999914169311,0.33550000190734863,0.340499997138977,0.3439999967813492,0.34450000524520874,0.3474999964237213],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.1860000044107437,0.2405000030994415,0.2854999899864197,0.2944999933242798,0.30900000035762787,0.32199999690055847,0.3264999985694885,0.33150000870227814,0.35099999606609344,0.346000000834465,0.35850000381469727,0.35599999129772186,0.36149999499320984,0.35549999773502344,0.356000006198883],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.1860000044107437,0.2560000047087669,0.2854999899864197,0.30550000071525574,0.31249999999999994,0.3150000125169754,0.32499998807907104,0.32850000262260437,0.3369999974966049,0.3310000002384186,0.33949999511241913,0.3385000079870224,0.340499997138977,0.341499999165535,0.33650000393390656],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.1860000044107437,0.25050000101327896,0.27250000834465027,0.2939999997615814,0.31849999725818634,0.3205000013113022,0.3244999945163727,0.3295000046491623,0.33500000834465027,0.328000009059906,0.3320000022649765,0.3464999943971634,0.341499999165535,0.34250000119209284,0.34699998795986176],"label":"Short lines filter"},"filtering-custom-c4-hynek-filters":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.1860000044107437,0.2584999948740005,0.2800000011920929,0.2915000021457672,0.3174999952316284,0.328000009059906,0.32899999618530273,0.32899999618530273,0.3429999947547912,0.3465000092983246,0.34800000488758087,0.33900000154972076,0.3449999988079071,0.3409999907016754,0.3425000011920929],"label":"Filters 
combined"}},"commonsense_qa/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2644999921321869,0.2850000113248825,0.2875000089406967,0.31049999594688416,0.3135000020265579,0.3279999941587448,0.32999999821186066,0.32349999248981476,0.3229999989271164,0.32350000739097595,0.32900001108646393,0.3264999985694885,0.3349999934434891,0.32999999821186066],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2644999921321869,0.2750000059604645,0.2989999949932098,0.2974999994039535,0.31599999964237213,0.3149999976158142,0.3199999928474426,0.3244999945163727,0.3269999921321869,0.33550000190734863,0.3275000005960464,0.33599999547004694,0.3349999934434891,0.33849999308586115],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.26349999010562897,0.28849999606609344,0.29600000381469727,0.30650000274181366,0.31900000572204584,0.3229999989271164,0.3150000125169754,0.3244999945163727,0.3310000002384186,0.3310000002384186,0.32999999821186066,0.3334999978542328,0.3344999998807907,0.32999999821186066],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2620000094175339,0.28949999809265137,0.2974999994039535,0.30550000071525574,0.30900000035762787,0.31200000643730164,0.3190000057220459,0.32999999821186066,0.3254999965429306,0.3344999998807907,0.3320000022649765,0.3374999910593033,0.3369999974966049,0.33949999511241913],"label":"Short lines filter"},"filtering-custom-c4-hynek-filters":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2644999921321869,0.2910000085830688,0.3009999990463257,0.31249999999999994,0.3149999976158142,0.3240000009536743,0.3165000081062317,0.3240000009536743,0.33250001072883606,0.3375000059604645,0.330499991774559,0.33949999511241913,0.3334999978542328,0.3400000035762787],"label":"Filters 
combined"}},"hellaswag/acc":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2720000147819519,0.2880000025033951,0.31700000166893,0.3389999866485595,0.34450000524520874,0.35349999368190765,0.35450001060962677,0.36599999666213984,0.37299999594688416,0.3790000081062317,0.3779999911785126,0.37700000405311584,0.3824999928474426,0.3830000013113022,0.3879999965429306],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.27150000631809235,0.29200001060962677,0.3145000040531158,0.3384999930858612,0.3499999940395355,0.35899999737739563,0.36549998819828033,0.3610000014305115,0.36800000071525574,0.375,0.3779999911785126,0.3889999985694885,0.3889999985694885,0.39149999618530273,0.3909999877214432],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2720000147819519,0.2919999957084656,0.32349999248981476,0.3334999978542328,0.3474999964237213,0.3514999896287918,0.36450000107288355,0.37350000441074366,0.3725000023841858,0.3830000013113022,0.3849999904632568,0.390500009059906,0.39199998974800104,0.3930000066757202,0.39149999618530273],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.27150000631809235,0.2889999896287918,0.32199999690055847,0.33550000190734863,0.3445000052452087,0.35300000011920923,0.3579999953508377,0.3695000112056732,0.37249998748302454,0.380500003695488,0.38150000572204584,0.3900000005960464,0.3935000002384186,0.39200000464916224,0.3965000063180923],"label":"Short lines filter"},"filtering-custom-c4-hynek-filters":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2720000147819519,0.291499987244606,0.3149999976158142,0.34199999272823334,0.3499999940395355,0.3680000007152557,0.36499999463558197,0.3755000084638595,0.3830000013113022,0.3840000033378601,0.39049999415874476,0.3889999985694885,0.39399999380111694,0.3904999941587448,0.3920000046491623],"label":"Filters 
combined"}},"hellaswag/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.2900000065565109,0.3279999941587448,0.3550000041723251,0.375,0.38850000500679016,0.40350000560283655,0.41200000047683716,0.4194999933242798,0.42249999940395355,0.4329999983310699,0.43449999392032623,0.43700000643730164,0.4395000040531158,0.43950000405311584],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.28900000452995295,0.3310000002384186,0.3505000025033951,0.3790000081062317,0.39250001311302185,0.40549999475479126,0.4224999994039535,0.4284999966621399,0.43050000071525574,0.43799999356269836,0.4459999948740005,0.4495000094175339,0.4564999938011169,0.4529999941587448],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.29449999332427973,0.33550000190734863,0.34800000488758087,0.3764999955892563,0.3824999928474426,0.3955000042915344,0.41799999773502344,0.4270000010728836,0.43400000035762787,0.44450001418590546,0.45049999654293055,0.45450000464916224,0.45449998974800104,0.4550000131130218],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.3020000010728836,0.3310000002384186,0.357000008225441,0.37899999320507044,0.38850000500679016,0.3994999974966049,0.40349999070167536,0.4175000041723251,0.42400000989437103,0.4245000034570694,0.4335000067949295,0.4360000044107437,0.44750000536441803,0.44200000166893],"label":"Short lines filter"},"filtering-custom-c4-hynek-filters":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.3004999905824661,0.3384999930858612,0.3760000020265579,0.3955000042915344,0.4230000078678131,0.43800000846385956,0.4375,0.45050001144409174,0.460999995470047,0.46400000154972076,0.4724999964237213,0.47599999606609344,0.47699999809265137,0.48049999773502344],"label":"Filters 
combined"}},"openbookqa/acc":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.1659999936819076,0.1290000006556511,0.1389999985694885,0.14100000262260431,0.1610000059008598,0.157999999821186,0.1649999991059303,0.1689999997615814,0.1749999970197677,0.17700000107288355,0.18600000441074366,0.19000000506639475,0.18100000172853464,0.1780000030994415,0.1860000044107437],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.1659999936819076,0.13399999588727945,0.1269999966025352,0.157999999821186,0.1660000011324882,0.1689999997615814,0.1700000017881393,0.1749999970197677,0.18199999630451197,0.1829999983310699,0.17199999839067454,0.18699999898672098,0.18999999761581415,0.18899999558925626,0.18200000375509257],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.1659999936819076,0.12900000438094134,0.1439999938011169,0.14599999785423273,0.1660000011324882,0.1550000011920929,0.1770000010728836,0.17999999970197672,0.1789999976754188,0.18100000172853464,0.18699999898672098,0.17999999970197678,0.18100000172853464,0.18400000035762787,0.18900000303983683],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.1659999936819076,0.14000000059604645,0.14599999785423273,0.1480000019073486,0.15799999982118607,0.16899999976158142,0.17000000178813934,0.1790000051259994,0.17400000244379038,0.19699999690055847,0.18999999761581415,0.1909999996423721,0.1950000002980232,0.1940000057220459,0.18899999558925626],"label":"Short lines filter"},"filtering-custom-c4-hynek-filters":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.1659999936819076,0.1389999985694885,0.14200000464916224,0.14699999988079065,0.15700000524520868,0.1620000004768371,0.18100000172853464,0.1749999970197677,0.1840000003576278,0.19399999827146525,0.19500000029802322,0.19900000095367426,0.18500000238418576,0.19299999624490735,0.1909999996423721],"label":"Filters 
combined"}},"openbookqa/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.25099999457597727,0.2629999965429306,0.2810000032186508,0.2939999997615814,0.2900000065565109,0.3100000023841858,0.3129999935626983,0.3149999976158142,0.3229999989271164,0.3310000002384186,0.32100000977516174,0.32999999821186066,0.32100000977516174,0.32100000977516174],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.26900000870227814,0.2670000046491623,0.306999996304512,0.2939999997615814,0.2999999970197677,0.306999996304512,0.31200000643730164,0.31299999356269836,0.3200000077486038,0.3229999989271164,0.32099999487400055,0.32500000298023224,0.3240000009536743,0.3219999969005584],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.2559999972581863,0.2849999964237213,0.3110000044107437,0.2979999929666519,0.3009999990463257,0.318000003695488,0.3140000104904175,0.32899999618530273,0.32899999618530273,0.3369999974966049,0.33599999547004694,0.32900001108646393,0.3299999982118606,0.3330000042915344],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.25999999046325684,0.28200000524520874,0.28599999845027924,0.289000004529953,0.29999999701976776,0.31300000846385956,0.31900000572204584,0.3149999976158142,0.32099999487400055,0.3139999955892563,0.3190000057220459,0.32200001180171967,0.3229999989271164,0.3240000009536743],"label":"Short lines filter"},"filtering-custom-c4-hynek-filters":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.27600000798702234,0.270000010728836,0.28900000452995295,0.2880000025033951,0.2899999916553497,0.3229999989271164,0.306999996304512,0.3240000009536743,0.3189999908208847,0.3229999989271164,0.32500000298023224,0.3189999908208847,0.32100000977516174,0.3230000138282776],"label":"Filters 
combined"}},"piqa/acc":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5419999957084656,0.6010000109672546,0.6324999928474426,0.6525000035762787,0.6665000021457672,0.6689999997615814,0.6789999902248383,0.6784999966621399,0.6885000169277191,0.68299999833107,0.6884999871253967,0.68299999833107,0.6935000121593475,0.6929999887943268,0.69200000166893],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5419999957084656,0.6049999892711639,0.6340000033378601,0.6464999914169312,0.6629999876022339,0.6660000085830688,0.6730000078678131,0.6824999749660492,0.6819999814033508,0.6940000057220459,0.6915000081062317,0.7014999985694885,0.6990000009536743,0.6969999969005585,0.7024999856948853],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5419999957084656,0.6089999973773956,0.6369999945163727,0.6469999849796295,0.6655000150203705,0.6720000207424164,0.6694999933242798,0.6769999861717224,0.6809999942779541,0.6744999885559082,0.679500013589859,0.6834999918937683,0.6835000216960907,0.6884999871253967,0.6870000064373016],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5410000085830688,0.6140000224113464,0.6304999887943268,0.648499995470047,0.6705000102519989,0.6739999949932098,0.6724999845027924,0.6809999942779541,0.6780000030994415,0.6775000095367432,0.6890000104904175,0.69200000166893,0.6955000162124634,0.7019999921321869,0.6994999945163727],"label":"Short lines filter"},"filtering-custom-c4-hynek-filters":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5419999957084656,0.6054999828338623,0.6330000162124634,0.6509999930858612,0.6644999980926514,0.6790000200271606,0.6749999821186066,0.6889999806880951,0.6955000162124634,0.6930000185966492,0.6939999759197235,0.6989999711513519,0.6959999799728394,0.7019999921321869,0.7055000066757202],"label":"Filters 
combined"}},"piqa/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.613999992609024,0.6504999995231628,0.6649999916553497,0.6870000064373016,0.6915000081062317,0.6974999904632568,0.7035000026226044,0.7129999995231628,0.7055000066757202,0.7080000042915344,0.7084999978542328,0.7114999890327454,0.714000016450882,0.7115000188350677],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.609499990940094,0.652999997138977,0.6744999885559082,0.68299999833107,0.6809999942779541,0.6965000033378601,0.6995000243186951,0.7145000100135803,0.7100000083446503,0.7105000019073486,0.7134999930858612,0.7159999907016754,0.7170000076293945,0.7199999988079071],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6155000030994415,0.648499995470047,0.6649999916553497,0.6865000128746033,0.690500020980835,0.6965000033378601,0.7029999792575836,0.7139999866485596,0.7105000019073486,0.7089999914169312,0.7139999866485596,0.7144999802112579,0.7229999899864197,0.7175000011920929],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6254999935626984,0.6530000269412994,0.6665000021457672,0.6860000193119049,0.6980000138282776,0.695499986410141,0.7084999978542328,0.7080000042915344,0.7064999938011169,0.7095000147819519,0.7129999995231628,0.7159999907016754,0.7179999947547913,0.718500018119812],"label":"Short lines filter"},"filtering-custom-c4-hynek-filters":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6195000112056732,0.6534999907016754,0.6725000143051147,0.69200000166893,0.7099999785423279,0.7005000114440918,0.7139999866485596,0.715499997138977,0.722000002861023,0.7254999876022339,0.7285000085830688,0.7290000021457672,0.7309999763965607,0.7305000126361847],"label":"Filters 
combined"}},"siqa/acc":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.367000013589859,0.3634999990463257,0.37150000035762787,0.38799999654293055,0.3709999918937683,0.38199999928474426,0.37849999964237213,0.3889999985694885,0.3879999965429306,0.3854999989271164,0.3849999904632568,0.39100000262260437,0.39000000059604645,0.3904999941587448,0.3880000114440918],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.367000013589859,0.375,0.3675000071525574,0.37150000035762787,0.3734999895095825,0.3824999928474426,0.37849999964237213,0.3779999911785126,0.3840000033378601,0.3779999911785126,0.3895000070333481,0.39100000262260437,0.3895000070333481,0.3869999945163727,0.3869999945163727],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.367000013589859,0.35999999940395355,0.3735000044107437,0.37499999999999994,0.3739999979734421,0.37949998676776886,0.37649999558925623,0.3869999945163727,0.3865000009536743,0.3830000013113022,0.3860000073909759,0.37849999964237213,0.38349999487400055,0.38450001180171967,0.39100000262260437],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.367000013589859,0.36050000786781305,0.37699998915195465,0.375,0.37849999964237213,0.3790000081062317,0.37899999320507044,0.3799999952316284,0.3765000104904175,0.37949998676776886,0.38449999690055847,0.380500003695488,0.38099999725818634,0.38449999690055847,0.38099999725818634],"label":"Short lines filter"},"filtering-custom-c4-hynek-filters":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.367000013589859,0.37049999833106995,0.36550000309944153,0.3794999867677688,0.37550000846385956,0.367000013589859,0.3789999932050705,0.3720000088214874,0.3769999891519546,0.380500003695488,0.38199999928474426,0.37749999761581415,0.38499999046325684,0.3865000009536743,0.37849999964237213],"label":"Filters 
combined"}},"siqa/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.36149999499320984,0.40049999952316284,0.3959999978542328,0.39800000190734863,0.39800000190734863,0.40299999713897705,0.3969999998807907,0.40549999475479126,0.4054999947547912,0.4155000001192093,0.41800001263618464,0.40949998795986176,0.41099999845027924,0.41200000047683716,0.4065000116825104],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3619999885559082,0.3894999921321869,0.3989999890327453,0.4060000032186508,0.40299999713897705,0.4055000096559524,0.4095000028610229,0.40450000762939453,0.40750001370906824,0.4074999988079071,0.408500000834465,0.41050000488758087,0.40450000762939453,0.40500000119209284,0.4035000056028366],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3619999885559082,0.3930000066757202,0.39750000834465027,0.40049999952316284,0.39849999547004694,0.40449999272823334,0.4054999947547912,0.4020000100135803,0.4115000069141388,0.40800000727176666,0.402999997138977,0.4074999988079071,0.40700000524520874,0.4060000032186508,0.40250000357627863],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3619999885559082,0.40350000560283655,0.403999999165535,0.4004999995231628,0.4010000079870224,0.39899998903274536,0.4015000015497207,0.39750000834465027,0.3969999998807907,0.4030000120401382,0.4055000096559524,0.4010000079870224,0.4020000100135803,0.40299999713897705,0.3990000039339065],"label":"Short lines filter"},"filtering-custom-c4-hynek-filters":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3619999885559082,0.392999991774559,0.39499999582767487,0.39499999582767487,0.3995000123977661,0.3970000147819519,0.40199999511241913,0.39800000190734863,0.39549998939037323,0.3889999985694885,0.39149999618530273,0.390500009059906,0.39900000393390656,0.4025000035762787,0.4035000056028366],"label":"Filters 
combined"}},"winogrande/acc":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.515999972820282,0.5024999976158142,0.5144999921321869,0.5129999816417694,0.5154999792575836,0.5239999890327454,0.523499995470047,0.5304999947547913,0.5239999890327454,0.5264999866485596,0.5194999873638153,0.5209999978542328,0.5289999842643738,0.5275000035762787,0.5315000116825104],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.515999972820282,0.5020000040531158,0.5125000178813934,0.5214999914169312,0.5199999809265137,0.515500009059906,0.5130000114440918,0.523499995470047,0.5194999873638153,0.523499995470047,0.5194999873638153,0.5239999890327454,0.527999997138977,0.5240000188350677,0.5269999802112579],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.515999972820282,0.49549999833106995,0.5164999961853027,0.5209999978542328,0.5125000178813934,0.5165000259876251,0.527999997138977,0.5374999940395355,0.5295000076293945,0.5320000052452087,0.5435000061988831,0.5354999899864197,0.5349999964237213,0.5360000133514404,0.5385000109672546],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.515999972820282,0.49050000309944153,0.5104999840259552,0.5055000185966492,0.5120000243186951,0.511000007390976,0.5200000107288361,0.5259999930858612,0.5215000212192535,0.523499995470047,0.5324999988079071,0.523499995470047,0.5260000228881836,0.5269999802112579,0.5290000140666962],"label":"Short lines filter"},"filtering-custom-c4-hynek-filters":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.515999972820282,0.5020000040531158,0.5010000020265579,0.5085000097751617,0.5169999897480011,0.5379999876022339,0.5169999897480011,0.5185000002384186,0.5230000019073486,0.5225000083446503,0.5300000011920929,0.5284999907016754,0.5270000100135803,0.527999997138977,0.5230000019073486],"label":"Filters 
combined"}},"winogrande/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.4884999990463257,0.49699999392032623,0.5035000145435333,0.5054999887943268,0.5200000107288361,0.5115000009536743,0.5154999792575836,0.507999986410141,0.515500009059906,0.5160000026226044,0.5139999985694885,0.5165000259876251,0.5160000026226044,0.523499995470047],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.48749999701976776,0.5024999976158142,0.5035000145435333,0.5099999904632568,0.5080000162124634,0.5050000250339508,0.5069999992847443,0.5180000066757202,0.5085000097751617,0.515500009059906,0.5165000259876251,0.5080000162124634,0.5090000033378601,0.5104999840259552],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.48899999260902405,0.49300000071525574,0.5030000060796738,0.49799999594688416,0.5024999976158142,0.5150000154972076,0.5259999930858612,0.527999997138977,0.5245000123977661,0.5275000035762787,0.5199999809265137,0.5300000011920929,0.5300000011920929,0.5289999842643738],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.471000000834465,0.494499996304512,0.511000007390976,0.511000007390976,0.5070000290870667,0.50450000166893,0.5060000121593475,0.5074999928474426,0.5169999897480011,0.5264999866485596,0.526500016450882,0.5290000140666962,0.5275000035762787,0.5259999930858612],"label":"Short lines filter"},"filtering-custom-c4-hynek-filters":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.4895000010728836,0.4949999898672104,0.5049999952316284,0.5065000057220459,0.5220000147819519,0.5069999992847443,0.5094999969005585,0.5239999890327454,0.5190000236034393,0.5239999890327454,0.5175000131130219,0.5164999961853027,0.5185000002384186,0.5180000066757202],"label":"Filters 
combined"}},"sciq/acc":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2089999988675117,0.5660000145435333,0.6834999918937683,0.710999995470047,0.7395000159740448,0.7504999935626984,0.7555000185966492,0.7709999978542328,0.7819999754428864,0.7855000197887421,0.7960000038146973,0.7950000166893005,0.7860000133514404,0.8034999966621399,0.7915000021457672],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[null,null,null,0.7240000069141388,null,0.7569999992847443,null,null,null,0.7860000133514404,null,null,null,null,null],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[null,null,null,null,null,null,null,null,null,null,null,0.8080000281333923,null,null,null],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[null,null,null,null,null,null,null,null,null,null,null,null,null,null,null],"label":"Short lines 
filter"},"filtering-custom-c4-hynek-filters":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[null,0.5529999732971191,0.6869999766349792,0.7129999995231628,0.7250000238418579,null,0.7590000033378601,null,null,null,null,null,null,null,null],"label":"Filters combined"}},"sciq/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.202000007033348,0.5009999871253967,0.5934999883174896,0.6264999806880951,0.6234999895095825,0.6369999945163727,0.6624999940395355,0.6679999828338623,0.6785000264644623,0.6910000145435333,0.6915000081062317,0.6875,0.6854999959468842,0.6990000009536743,0.6934999823570251],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[null,null,null,0.6184999942779541,null,0.6559999883174896,null,null,null,0.675000011920929,null,null,null,null,null],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[null,null,null,null,null,null,null,null,null,null,null,0.6899999976158142,null,null,null],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[null,null,null,null,null,null,null,null,null,null,null,null,null,null,null],"label":"Short lines filter"},"filtering-custom-c4-hynek-filters":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[null,0.5019999742507935,0.5759999752044678,0.6079999804496765,0.625,null,0.6574999988079071,null,null,null,null,null,null,null,null],"label":"Filters combined"}},"arc/acc":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2189999967813491,0.26950000226497645,0.3042500019073486,0.32200001180171967,0.3322499990463257,0.3317500054836273,0.3454999923706054,0.35325001180171967,0.35999999940395355,0.3557500094175339,0.36499999463558197,0.37025000154972076,0.3697499930858612,0.3684999942779541,0.3684999942779541],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2195000052452087,0.27124999463558197,0.29475000500679016,0.31700000166893,0.3344999998807907,0.33500000834465027,0.3374999910593033,0.35375000536441803,0.36374999582767487,0.3699999898672104,0.3697500079870224,0.3752500116825104,0.37749999761581415,0.38500000536441803,0.3812499940395355],"label":"Line duplicates 
filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2195000052452087,0.27674999833106995,0.30375000834465027,0.3227500021457672,0.3332500010728836,0.33949999511241913,0.3530000001192093,0.35475000739097595,0.357000008225441,0.36525000631809235,0.3669999986886978,0.3707500100135803,0.37150000035762787,0.3742499947547912,0.3774999976158142],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2192500010132789,0.2730000019073486,0.30249999463558197,0.32099999487400055,0.3365000039339065,0.3449999988079071,0.35324999690055847,0.35950000584125513,0.3610000014305115,0.36775000393390656,0.3645000010728836,0.3665000051259994,0.369499996304512,0.36775000393390656,0.375],"label":"Short lines filter"},"filtering-custom-c4-hynek-filters":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2195000052452087,0.2762499898672104,0.3017500042915344,0.3217500001192093,0.3374999910593033,0.3510000109672546,0.35199999809265137,0.3552500009536743,0.364750012755394,0.3739999979734421,0.37450000643730164,0.37699998915195465,0.3802500069141388,0.37475000321865076,0.38075000047683716],"label":"Filters 
combined"}},"arc/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2512499988079071,0.29224999248981476,0.3267499953508377,0.34375,0.34800000488758087,0.35224999487400055,0.3567499965429306,0.36450000107288355,0.369499996304512,0.3712500035762787,0.3722500056028366,0.37325000762939453,0.377250000834465,0.37624999880790705,0.3764999955892563],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.29474999010562897,0.3184999972581863,0.3392500132322311,0.35074999928474426,0.35300000011920923,0.35750000178813934,0.3684999942779541,0.3817500025033951,0.37800000607967377,0.38199999928474426,0.38600000739097595,0.38525000214576716,0.39000000059604645,0.38850000500679016],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.29100000858306885,0.31949999928474426,0.33675000071525574,0.34524999558925623,0.35850000381469727,0.3557499945163727,0.36124999821186066,0.3599999994039535,0.36800000071525574,0.36775000393390656,0.3770000040531158,0.37025000154972076,0.37424999475479126,0.37299999594688416],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.2892500013113022,0.3190000057220459,0.3385000079870224,0.3449999988079071,0.3495000004768371,0.36374999582767487,0.3604999929666519,0.36549998819828033,0.37074999511241913,0.37150000035762787,0.3722500056028366,0.37774999439716334,0.3774999976158142,0.37899999320507044],"label":"Short lines filter"},"filtering-custom-c4-hynek-filters":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.2952499985694885,0.32774999737739563,0.3400000035762787,0.35400000214576716,0.3605000078678131,0.3577500134706497,0.3610000014305115,0.36724999547004694,0.37199999392032623,0.3722499907016754,0.37174999713897705,0.37825000286102295,0.37825000286102295,0.3800000101327896],"label":"Filters 
combined"}},"mmlu/acc":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.230211392045021,0.24219277501106262,0.2469336539506912,0.2517240643501282,0.25671665370464325,0.2612752318382263,0.26097105443477625,0.2655869126319885,0.2683205902576446,0.27027665078639984,0.268289178609848,0.2714609056711197,0.2713967561721802,0.27323792874813074,0.27401906251907343],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2302158325910568,0.24256114661693567,0.24768169224262235,0.2507193014025688,0.2572980225086212,0.25885994732379913,0.26284952461719513,0.264521986246109,0.2700442671775818,0.27003195881843567,0.26974256336688995,0.27085191011428833,0.2683039605617523,0.26985205709934235,0.2728962004184723],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2302941530942917,0.242008663713932,0.25057119876146317,0.25422972440719604,0.2581385523080826,0.26281121373176575,0.2661672830581665,0.2682761400938034,0.26779243350028986,0.2691957354545593,0.27077533304691315,0.2710053026676178,0.2722867876291275,0.27343617379665375,0.2736435830593109],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.23012810945510864,0.2384799942374229,0.2488720491528511,0.2525842636823654,0.2563225924968719,0.25964072346687317,0.2621316760778427,0.26215170323848724,0.26426468789577484,0.2665379047393799,0.2679245918989181,0.26838323473930353,0.269764631986618,0.2705488502979278,0.27158279716968536],"label":"Short lines filter"},"filtering-custom-c4-hynek-filters":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2302941530942917,0.2428865954279899,0.24688749760389325,0.2509344741702079,0.2560944706201553,0.2591594159603119,0.26104307174682617,0.2641162872314453,0.2664932608604431,0.26827967166900635,0.27037402987480164,0.2704364061355591,0.2743852883577347,0.2725917398929596,0.2734202444553375],"label":"Filters 
combined"}},"mmlu/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501270473003387,0.25162875652313227,0.26033842563629145,0.26643975079059595,0.26930116117000574,0.27358523011207575,0.27403785288333893,0.2792918980121612,0.28113801777362823,0.2826349586248398,0.2856044620275497,0.2851170748472214,0.28488004207611084,0.2877976596355438,0.28672714531421656],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.25018398463726044,0.2544838488101959,0.2611646503210068,0.2652362138032913,0.2704761028289795,0.2737790495157242,0.276611790060997,0.2786822021007538,0.281442791223526,0.2816756069660187,0.2860289514064789,0.28624334931373596,0.2867202013731003,0.28732720017433167,0.28609761595726013],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.2563375234603882,0.26243858039379114,0.26873072981834406,0.27219884097576136,0.27462176978588104,0.27908372879028315,0.2813303619623184,0.28369809687137604,0.28319956362247467,0.28563097119331354,0.28614395856857294,0.28564512729644775,0.2862519174814224,0.2876724004745483],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.24996141344308848,0.25390757620334625,0.26540763676166534,0.27061584591865534,0.27150256931781763,0.2718626409769058,0.27449728548526764,0.2784059643745422,0.28175103664398193,0.28019529581069946,0.2827359586954117,0.2814059555530548,0.2844651788473129,0.28390273451805115,0.2838368713855743],"label":"Short lines filter"},"filtering-custom-c4-hynek-filters":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.25587423145771027,0.26312781870365143,0.26439163088798523,0.27070170640945435,0.2709464728832245,0.27815359830856323,0.2805800437927246,0.28173673152923584,0.28193922340869904,0.2856670469045639,0.28644809126853943,0.2880468964576721,0.28985339403152466,0.28967490792274475],"label":"Filters combined"}}},"layout":{"xaxis":{"title":{"text":"Training tokens (Billions)"}},"title":{"text":"Custom filters performance"}},"defaultWindowSize":3}
data/plots/cross_ind_unfiltered_comparison.json ADDED
The diff for this file is too large to render. See raw diff
 
data/plots/dataset_ablations.json CHANGED
The diff for this file is too large to render. See raw diff
 
data/plots/dededup_difference.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"big-run-fineweb-cross-dedup-fixed":{"x":[0.0,10.48576,20.97152,31.45728,41.94304,52.4288,62.91456,73.40032000000001,83.88608,94.37184,104.8576,115.34336,125.82912,136.31488000000002,146.80064000000002,157.28640000000001,167.77216,178.25792,188.74368,199.22944,209.7152,220.20096,230.68672,241.17248,251.65824,262.144,272.62976000000003,283.11552,293.60128000000003,304.08704,314.57280000000003,325.05856,335.54432,346.03008],"y":[null,null,null,null,0.40171657912433145,0.42239717617630956,0.43069435879588125,0.4351756565272808,0.43896834924817085,0.4424236983060837,0.4452380746603012,0.44781614691019056,0.45025914907455444,0.4521562337875366,0.4531575210392475,0.45397180542349813,0.4548915736377239,0.4563755728304386,0.45696389451622965,0.458776044100523,0.4609984554350376,0.4624955080449581,0.4629682660102844,0.4638278633356093,0.4645016059279441,0.4646032989025116,0.46489162668585776,0.4657001614570618,0.46593172624707224,0.4667894795536995,0.4675446107983589,0.46748293563723564,0.4683080866932869,0.46885923445224764],"label":"FineWeb full 
MinHash"},"big-run-refinedweb":{"x":[0.0,10.48576,20.97152,28.311552000000002,31.45728,41.94304,52.4288,62.91456,73.40032000000001,83.88608,94.37184,104.8576,115.34336,125.82912,136.31488000000002,146.80064000000002,157.28640000000001,167.77216,178.25792,188.74368,199.22944,209.7152,220.20096,230.68672,241.17248,251.65824,262.144,272.62976000000003,283.11552,293.60128000000003,304.08704,314.57280000000003,325.05856,335.54432,346.03008],"y":[null,null,null,null,0.40424661971628667,0.42596163749694826,0.43559565395116806,0.4419388733804226,0.4472432412207127,0.4522114463150501,0.45583397448062896,0.45813767313957215,0.460252707451582,0.4618991769850254,0.46210767328739166,0.46468816623091697,0.46640462651848785,0.46798615977168073,0.4687947325408458,0.4707141913473606,0.47183807417750356,0.4731586426496506,0.474202574789524,0.47580953985452645,0.4768182456493378,0.47721000015735626,0.477897260338068,0.47868331149220467,0.4798942424356937,0.48083210438489904,0.48233432918786995,0.4825453333556652,0.48372062146663664,0.48404486328363416,0.48417936712503434],"label":"RefinedWeb"},"big-run-sampled_full_filtered_no_dedup":{"x":[0.0,10.48576,20.97152,28.311552000000002,31.45728,41.94304,52.4288,62.91456,73.40032000000001,83.88608,94.37184,104.8576,115.34336,125.82912,136.31488000000002,146.80064000000002,157.28640000000001,167.77216,178.25792,188.74368,199.22944,209.7152,220.20096,230.68672,241.17248,251.65824,262.144,272.62976000000003,283.11552,293.60128000000003,304.08704,314.57280000000003,325.05856,335.54432,346.03008],"y":[null,null,null,null,0.39733172245323656,0.4170659720897675,0.42569294571876515,0.42934197112917893,0.4318342722952365,0.43489449843764305,0.43767731785774233,0.43933030366897585,0.4432003878057003,0.44580490812659257,0.44852474182844154,0.4508663788437842,0.45200284123420714,0.45270049944519997,0.45411895886063575,0.45437362268567083,0.4551906920969486,0.45563211515545843,0.4572733923792839,0.45865254402160643,0.4608928956091404,0.46221072375774386,
0.464424304664135,0.4650039754807949,0.465817741304636,0.46602572202682496,0.4663869492709637,0.466600227355957,0.4675856366753578,0.4670651629567145,0.46774301379919053],"label":"FineWeb filtered only"}},"layout":{"xaxis":{"title":{"text":"Training tokens (billions)"}},"yaxis":{"title":{"text":"Agg Score"},"range":[0.35,0.5]},"title":{"text":"Dataset Ablations"}}}
data/plots/dedup_all_dumps_bad.json ADDED
The diff for this file is too large to render. See raw diff
 
data/plots/dedup_attempts.json ADDED
The diff for this file is too large to render. See raw diff
 
data/plots/dedup_ind_dedup_better.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"big-run-fineweb-cross-dedup-fixed":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.601280000
00003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[null,null,null,null,0.3672033607959747,0.38096171133220197,0.3917147450149059,0.39900310933589933,0.4051190324127674,0.4095103912055492,0.41330323591828344,0.4149076819419861,0.41757638081908227,0.41976293846964835,0.42186418995261193,0.4239416286349297,0.4263807497918606,0.4276331432163715,0.4292814388871193,0.4303454652428627,0.43060190230607986,0.4308535061776638,0.43169609159231187,0.4327337495982647,0.4333489783108234,0.4341445043683052,0.4353914998471737,0.4358899496495724,0.43601696118712424,0.4378262721002102,0.43920120820403097,0.4399888008832932,0.44119313880801203,0.4422744408249854,0.44176214635372163,0.4417391061782837,0.4425580069422722,0.4431712307035923,0.4434129290282726,0.4443381614983082,0.44572699517011644,0.4453575126826763,0.4454136833548546,0.44582572504878043,0.44621556624770164,0.4461346596479416,0.44694186076521875,0.44875710010528563,0.449846402555704,0.45069288089871395,0.4508474342525005,0.4521035052835941,0.4518044240772724,0.4524715684354305,0.45248012244701385,0.4536187544465065,0.45318649634718894,0.45328381657600403,0.4527669608592987,0.4528887316584587,0.45273062139749526,0.45357308611273767,0.45381348878145217,0.4547001600265503,0.4546117797493935,0.45545469596982,0.45476473346352575,0.45535570830106736,0.4548635698854923,0.45556106716394423,0.4545843631029129,0.45548452287912367,0.45508705601096155,0.4554302103817463,0.45531417429447174,0.455281724780798,0.4551672667264938,0.4557623341679573,0.4562753804028034,0.4574616298079491,0.45870574563741684,0.4589271329343319,0.4585279285907745,0.45825705826282503,0.45734578669071196,0.45775190740823746,0.4584693379700
184,0.45953013077378274,0.46007281318306914,0.46138040721416473,0.4613276459276676,0.4619451716542244,0.4623797655105591,0.4633366808295249,0.46385016515851013,0.4640972889959811,0.4631796069443226,0.4626925766468048,0.46241471245884896,0.4609967730939388,0.4613745853304862,0.46160615757107726,0.46123209297657014,0.46154375076293946,0.46204839274287224,0.46256714984774583,0.46329036951065056,0.46346144676208495,0.46339531019330027,0.46370579674839973,0.46308762282133104,0.4635562561452389,0.4643616922199726,0.4638181388378143,0.4639813356101513,0.46431541368365287,0.46424245834350586,0.46426875963807107,0.4644449658691883,0.46465888917446135,0.46472015827894203,0.4649341784417628,0.4651320852339267,0.4659062191843987,0.466090302914381,0.46589606553316115,0.46540559232234957,0.4648520238697529,0.4642270617187023,0.46465921476483346,0.4653646066784859,0.4659481205046177,0.46683111041784286,0.46745011433959005,0.4675717860460281,0.46747381910681723,0.4676897287368774,0.46756943613290786,0.4678742937743664,0.4680095374584198,0.46776664555072783,0.4681504011154175,0.468402536213398,0.46808195635676386,0.46761643439531325,0.4683038920164108,0.4683209776878357,0.4682114221155643,0.4685330279171467,0.4685236789286137,0.4682268194854259,0.46757171973586076,0.4671459726989268,0.46695934161543845,0.46684810146689415,0.4667366869747639,0.4670290745794773,0.46758588030934334,0.46814455687999723,0.4684222251176834,0.46921108439564707,0.4692001290619373,0.4696392446756363,0.46900371834635735,0.469001292437315,0.4684843972325325,0.4679580770432949,0.4676573075354099],"label":"FineWeb full 
MinHash"},"big-run-refinedweb":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.601280
00000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[null,null,null,null,0.36935312487184996,0.38374419510364527,0.39556556120514863,0.40262971073389053,0.40787065848708154,0.41219308972358704,0.4158456213772297,0.4189255848526955,0.4221720516681671,0.4249745920300484,0.42687042206525805,0.4291741780936718,0.4310132198035717,0.4323438510298729,0.4334666162729263,0.43628031089901925,0.4377801202237606,0.4382371336221695,0.4408359855413437,0.443413619697094,0.44420371055603025,0.44484473913908007,0.4471548080444336,0.44681392312049867,0.4467212952673435,0.44725192710757256,0.449183938652277,0.44954567328095435,0.4513785853981972,0.453145457059145,0.4544703222811222,0.45540532246232035,0.45598348379135134,0.4556840226054192,0.45618869438767434,0.4562996678054333,0.4565622642636299,0.45782349109649656,0.4579134449362754,0.45743068605661386,0.4575530268251895,0.45722133442759516,0.4563326708972454,0.45796059966087344,0.4588068947196007,0.4595696978271008,0.46085438057780265,0.4618456199765205,0.4616811737418175,0.461778799444437,0.4613799624145031,0.4618814967572689,0.46205456703901293,0.4624464973807335,0.46321221739053725,0.46389560475945474,0.4640473112463951,0.46415690779685975,0.46410940065979955,0.4646980591118336,0.4655677951872348,0.46603046879172316,0.46585372239351264,0.4667601495981216,0.46638399064540864,0.46536661833524706,0.46535017490386965,0.4668335080146789,0.46656889393925666,0.46725284233689307,0.4680892124772072,0.4678822658956051,0.4680094726383686,0.469198103249073,0.470146207511425,0.47041539251804354,0.47116121649742126,0.4711993597447872,0.4709890834987164,0.4708621069788933,0.47159409448504447,0.4708657041192055,0.4702023401
856422,0.4696627654135227,0.46882943958044054,0.468185143917799,0.46857124716043475,0.4695053867995739,0.470417944341898,0.4714525043964386,0.4722059585154056,0.4725564628839492,0.47324422001838684,0.4733317464590073,0.47330567091703407,0.4735308401286601,0.4738423585891723,0.4741662509739399,0.4753894440829754,0.47546544298529625,0.4756516933441162,0.4759101323783397,0.47567163929343215,0.47464561983942977,0.47505067735910417,0.47595281824469565,0.47621366158127787,0.47611333057284344,0.476607958972454,0.47582818642258634,0.4758710920810699,0.4767298653721809,0.4770657457411289,0.47764199823141096,0.4785135343670845,0.47847120761871337,0.47832037433981894,0.47839118018746374,0.4785721957683563,0.4793835572898388,0.4795083418488503,0.4799169234931469,0.48053632155060766,0.4808934584259987,0.48123580515384673,0.4816051431000233,0.4815927021205425,0.48145300000905983,0.48048100918531417,0.4804213002324104,0.48112683445215226,0.4811866842210293,0.4815112330019474,0.48153620585799217,0.481848969310522,0.48141826167702667,0.4815475441515445,0.4816103555262089,0.4822722218930721,0.48146815076470373,0.4818138018250465,0.48226772993803024,0.4829028986394405,0.48318603038787844,0.484446120262146,0.4841261185705662,0.4838590003550053,0.48318717777729037,0.48345534726977346,0.4831574305891991,0.4837306492030621,0.48418081998825074,0.4850054018199444,0.4853626333177089,0.48546760007739065,0.4851828873157501,0.4849528044462204,0.4844263069331646,0.4835553206503391,0.48416665121912955,0.4842161864042282,0.4840385474264622,0.4839116208255291,0.48467452675104133,0.4844333350658417],"label":"RefinedWeb"},"big-run-sampled_full_filtered_no_dedup":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.
234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],
"y":[null,null,null,null,0.36926820836961266,0.38203409612178796,0.39078781828284265,0.3956208534538746,0.3990963250398636,0.40308327972888947,0.40649700313806536,0.407912353426218,0.411406696587801,0.41404814645648,0.4164203993976116,0.41819266974925995,0.42072816491127013,0.422138486802578,0.4239996671676636,0.42384095713496206,0.426036062836647,0.4271404132246971,0.42894179224967954,0.4305384442210197,0.4319251500070095,0.4326193243265152,0.43427793607115744,0.4347934126853943,0.4345370322465897,0.4352454699575901,0.43554568514227865,0.4339814603328705,0.4342834234237671,0.43531606569886205,0.43576630502939223,0.4357234910130501,0.4372501514852047,0.43833074644207953,0.43781240433454516,0.4379648305475712,0.4381132386624813,0.438448067009449,0.4384333245456219,0.4389940395951271,0.4400040380656719,0.4415611468255519,0.4423880904912948,0.44331216141581525,0.4443627014756203,0.44366268813610077,0.44311698600649835,0.44364576414227486,0.44319151043891897,0.4427203856408595,0.44436766132712363,0.4455058850347996,0.44615875855088233,0.4465731419622898,0.4479690581560135,0.4484032191336154,0.447981271892786,0.4482423685491085,0.44924388974905016,0.44899323880672454,0.4485030435025692,0.45028517991304395,0.4507353216409683,0.45116728320717814,0.45144951716065407,0.4518436208367348,0.4513734854757786,0.4518222324550152,0.45145809948444365,0.4512837529182434,0.4519437663257122,0.4521554559469223,0.45147905945777894,0.4518196329474449,0.45257959216833116,0.4524036094546318,0.4527964904904366,0.4530105598270893,0.45335754454135896,0.45396619141101835,0.45459589511156084,0.4550217859447002,0.4560036838054657,0.4565500751137733,0.4568891301751137,0.45712930485606185,0.4563898526132107,0.45567619875073434,0.4555924661457539,0.45511320903897284,0.45539349168539045,0.4561412051320076,0.4571523576974868,0.45766874626278875,0.45818602591753005,0.458062607049942,0.4581084720790386,0.4570564292371273,0.4567670665681362,0.45731772035360335,0.4576993718743324,0.4577161468565464,0.4591
300703585148,0.45967747196555137,0.4598850101232529,0.4607538335025311,0.4612186782062054,0.4619384504854679,0.4626227006316185,0.46234423369169236,0.46176643297076225,0.46136114969849584,0.46150869578123094,0.4619676761329174,0.4622086018323898,0.46235987469553946,0.4629940629005431,0.4630826920270919,0.46264655888080597,0.46258691400289537,0.4632560282945633,0.4634956054389477,0.4633393153548241,0.46364396810531616,0.4641685865819453,0.46397360116243364,0.464645367115736,0.46442538052797316,0.46406724750995637,0.46415518000721934,0.4643302120268345,0.4648954145610332,0.465849281847477,0.46623699665069573,0.4658923275768757,0.466351430863142,0.46581498682498934,0.4653567478060722,0.46588137596845625,0.46648752465844157,0.4664620153605938,0.4667314425110817,0.46697331815958015,0.4670629248023032,0.467547832429409,0.46745155528187754,0.46771074533462526,0.46740650609135626,0.46696603000164033,0.466818867623806,0.467232009768486,0.4671480022370815,0.46770014688372613,0.4685129299759865,0.46813031658530235,0.4682801507413387,0.4687854625284672,0.4682334467768669,0.4680188588798046,0.46834352910518645,0.467834347486496,0.46763256937265396,0.46837265491485597,0.46812593489885324,0.4686297595500945],"label":"FineWeb filtered 
only"},"big-run-sampled_full_ind_minhash":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002,29.360128000000003,31.45728,33.554432000000006,35.651584,37.748736,39.845888,41.94304,44.040192000000005,46.137344000000006,48.234496,50.331648,52.4288,54.525952000000004,56.623104000000005,58.720256000000006,60.817408,62.91456,65.011712,67.10886400000001,69.206016,71.303168,73.40032000000001,75.497472,77.59462400000001,79.691776,81.788928,83.88608,85.983232,88.08038400000001,90.177536,92.27468800000001,94.37184,96.468992,98.56614400000001,100.663296,102.76044800000001,104.8576,106.95475200000001,109.05190400000001,111.149056,113.24620800000001,115.34336,117.44051200000001,119.537664,121.634816,123.73196800000001,125.82912,127.92627200000001,130.023424,132.120576,134.21772800000002,136.31488000000002,138.412032,140.509184,142.606336,144.70348800000002,146.80064000000002,148.897792,150.994944,153.092096,155.18924800000002,157.28640000000001,159.383552,161.480704,163.577856,165.67500800000002,167.77216,169.869312,171.966464,174.06361600000002,176.16076800000002,178.25792,180.355072,182.452224,184.54937600000002,186.64652800000002,188.74368,190.840832,192.937984,195.03513600000002,197.13228800000002,199.22944,201.326592,203.423744,205.52089600000002,207.61804800000002,209.7152,211.812352,213.90950400000003,216.00665600000002,218.10380800000001,220.20096,222.298112,224.39526400000003,226.49241600000002,228.589568,230.68672,232.783872,234.88102400000002,236.97817600000002,239.075328,241.17248,243.269632,245.36678400000002,247.46393600000002,249.561088,251.65824,253.75539200000003,255.85254400000002,257.949696,260.046848,262.144,264.241152,266.338304,268.43545600000004,270.53260800000004,272.62976000000003,274.726912,276.824064,278.921216,281.018368,283.11552,285.212672,287.309824,289.40697600000004,291.50412800000004
,293.60128000000003,295.698432,297.795584,299.892736,301.989888,304.08704,306.184192,308.28134400000005,310.37849600000004,312.47564800000004,314.57280000000003,316.669952,318.767104,320.864256,322.961408,325.05856,327.155712,329.25286400000005,331.35001600000004,333.44716800000003,335.54432,337.641472,339.738624,341.835776,343.932928,346.03008,348.12723200000005,350.22438400000004],"y":[null,null,null,null,0.370305210724473,0.38538585379719736,0.39511206299066537,0.40298017859458923,0.409290175139904,0.413666632771492,0.41749832555651667,0.42125996947288513,0.42437445372343063,0.4274251386523247,0.42931835278868674,0.4312170252203941,0.43252006247639657,0.43441952392458916,0.4355986125767231,0.437407860904932,0.43808431178331375,0.43985605910420417,0.44053239896893504,0.44194763004779813,0.4428853578865528,0.44475727826356887,0.44533989727497103,0.4468288891017437,0.44789535403251646,0.44945245534181594,0.45021386444568634,0.45117616206407546,0.45165844038128855,0.451799089461565,0.4520416229963303,0.4520642809569836,0.45239565819501876,0.4518701292574406,0.4518812879920005,0.45248807445168493,0.45262535139918325,0.45352781713008883,0.4553974881768227,0.4566363111138344,0.45700768008828163,0.4576442360877991,0.4585714004933834,0.4587270848453045,0.45955651104450224,0.460345446318388,0.4614695131778717,0.46071490868926046,0.46118568256497383,0.4599429272115231,0.4599632196128368,0.46037245318293574,0.4606971569359303,0.4606951214373112,0.4611978381872177,0.46112299934029577,0.4607429005205631,0.46163453757762907,0.46182278990745546,0.4631089620292187,0.4638592816889286,0.4651353217661381,0.4655767247080803,0.4659814231097698,0.46648655757308,0.4675781108438968,0.4675243757665156,0.46797287091612816,0.46853338107466697,0.4683393523097038,0.4670936234295368,0.4668723739683628,0.4665733970701694,0.4668287120759487,0.46617602929472923,0.467276993393898,0.46755580604076385,0.4675174213945866,0.4671892985701561,0.4676224373281002,0.46739820912480357,0.4674134634435177,0.4
675380110740662,0.4687583431601524,0.4697458289563656,0.47001201286911964,0.4705664157867432,0.47074690759181975,0.47003005519509317,0.4694135203957558,0.46936979740858076,0.4689124390482903,0.4697516694664955,0.4707345478236674,0.4721454992890357,0.4728621408343314,0.47416575327515603,0.47499901577830317,0.4748332165181637,0.4749793991446495,0.47513311058282853,0.4751249924302101,0.47412342876195906,0.4744432017207146,0.474757094681263,0.4747910059988499,0.4747039243578911,0.47488212734460833,0.47521442025899885,0.47504854425787923,0.47552693635225296,0.47552043944597244,0.476516292989254,0.4771891236305237,0.47768160700798035,0.47804755941033356,0.47844709232449534,0.4783868357539177,0.4781750589609146,0.47845144048333166,0.47777490466833117,0.4781319200992584,0.4779041476547718,0.4774407997727394,0.4771588683128357,0.47804371640086174,0.4785159431397915,0.4782356970012188,0.47833485826849936,0.4782362774014473,0.478005338460207,0.47810146361589434,0.47927170172333716,0.4796604186296463,0.4800727739930153,0.48045224100351336,0.48049417063593863,0.4802452601492405,0.47976561784744265,0.48014174327254294,0.4805368520319462,0.480988722294569,0.4814631320536137,0.48263009190559386,0.48285799250006667,0.48333154022693625,0.48357397094368937,0.48365214839577675,0.4836900994181633,0.484160565584898,0.4837813340127468,0.4838929153978824,0.48443443700671185,0.48451621904969217,0.48415875136852266,0.48465299159288405,0.4842782385647297,0.4841688245534897,0.48420060351490973,0.48406358510255804,0.48331916853785517,0.48322339728474617,0.48276304453611374,0.4823808237910271,0.48311666399240494],"label":"FineWeb independent MinHash"}},"layout":{"xaxis":{"title":{"text":"Training tokens (billions)"},"range":[0.0,367.73560320000007]},"yaxis":{"title":{"text":"Agg Score"}},"title":{"text":"Independent dedup outperforms dedup across dumps"}}}
data/plots/filtering_steps.json CHANGED
The diff for this file is too large to render. See raw diff
 
data/plots/removed_data_cross_dedup.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"agg_score":{"cross_minhash_dump_CC-MAIN-2013-48":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3310184087604284,0.3494944926351309,0.3678930029273033,0.3791136778891086,0.3830251954495907,0.387223158031702,0.3940111547708511,0.3980898857116699,0.398512527346611,0.3974943198263645,0.4026404283940792,0.402598962187767,0.4074418470263481,0.4055770002305507,0.4050002694129944],"label":"Originally kept data"},"deduped_removed_cross":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308933284133672,0.3570646308362484,0.3725825920701027,0.383445743471384,0.39065178856253624,0.3996846079826355,0.4021379072219133,0.4061895925551653,0.41160152666270733,0.4141362868249416,0.4196407739073038,0.4217643104493618,0.4209167677909136,0.42394610680639744,0.4236117731779814],"label":"Originally removed data"}},"commonsense_qa/acc":{"cross_minhash_dump_CC-MAIN-2013-48":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.1860000044107437,0.2169999927282333,0.2450000047683715,0.2689999938011169,0.2770000100135803,0.2899999916553497,0.3030000030994415,0.3160000145435333,0.3260000050067901,0.3100000023841858,0.3210000097751617,0.3179999887943268,0.328000009059906,0.3240000009536743,0.3199999928474426],"label":"Originally kept 
data"},"deduped_removed_cross":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.1860000044107437,0.24449999630451197,0.27199999988079065,0.2944999933242798,0.2965000122785568,0.3154999911785126,0.3205000013113022,0.3215000033378601,0.32900001108646393,0.3360000103712082,0.33449999988079065,0.34049999713897705,0.3449999988079071,0.3474999964237213,0.346000000834465],"label":"Originally removed data"}},"commonsense_qa/acc_norm":{"cross_minhash_dump_CC-MAIN-2013-48":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2479999959468841,0.2800000011920929,0.2910000085830688,0.289000004529953,0.3059999942779541,0.3050000071525574,0.3050000071525574,0.3149999976158142,0.3140000104904175,0.3269999921321869,0.3219999969005584,0.3190000057220459,0.3179999887943268,0.3120000064373016],"label":"Originally kept data"},"deduped_removed_cross":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2580000013113022,0.28649999201297754,0.289000004529953,0.29500000178813934,0.30949999392032623,0.31599999964237213,0.31700000166893005,0.318000003695488,0.32549999654293055,0.32099999487400055,0.33250001072883606,0.32500000298023224,0.3330000042915344,0.32750000059604645],"label":"Originally removed 
data"}},"hellaswag/acc":{"cross_minhash_dump_CC-MAIN-2013-48":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2720000147819519,0.2879999876022339,0.2980000078678131,0.3039999902248382,0.3109999895095825,0.3269999921321869,0.3319999873638153,0.3370000123977661,0.3389999866485595,0.3449999988079071,0.3470000028610229,0.3479999899864197,0.3490000069141388,0.3499999940395355,0.3540000021457672],"label":"Originally kept data"},"deduped_removed_cross":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2720000147819519,0.28649999201297754,0.3079999983310699,0.3230000138282776,0.33149999380111694,0.3339999914169311,0.34699998795986176,0.3530000001192093,0.3564999997615814,0.36400000751018524,0.37299999594688416,0.37199999392032623,0.3734999895095825,0.3774999976158142,0.37800000607967377],"label":"Originally removed data"}},"hellaswag/acc_norm":{"cross_minhash_dump_CC-MAIN-2013-48":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.2860000133514404,0.2879999876022339,0.328000009059906,0.3379999995231628,0.356000006198883,0.356000006198883,0.3589999973773956,0.3720000088214874,0.3740000128746032,0.382999986410141,0.3810000121593475,0.395000010728836,0.3849999904632568,0.3930000066757202],"label":"Originally kept 
data"},"deduped_removed_cross":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.2775000035762787,0.3085000067949295,0.32750000059604645,0.35600000619888306,0.36999998986721033,0.37950000166893005,0.3965000063180923,0.41050000488758087,0.41250000894069666,0.42149999737739563,0.4270000010728836,0.4339999854564667,0.4389999955892563,0.4375],"label":"Originally removed data"}},"openbookqa/acc":{"cross_minhash_dump_CC-MAIN-2013-48":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.1659999936819076,0.1040000021457672,0.1280000060796737,0.1379999965429306,0.1319999992847442,0.1379999965429306,0.1420000046491623,0.1420000046491623,0.1500000059604644,0.1400000005960464,0.1560000032186508,0.1599999964237213,0.1620000004768371,0.1580000072717666,0.1519999951124191],"label":"Originally kept data"},"deduped_removed_cross":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.1659999936819076,0.1269999966025352,0.123999997973442,0.1530000045895576,0.1519999951124191,0.16600000113248825,0.16899999976158137,0.17999999970197675,0.1689999997615814,0.1789999976754188,0.1829999983310699,0.1879999935626983,0.18400000035762787,0.18400000035762787,0.19400000572204584],"label":"Originally removed 
data"}},"openbookqa/acc_norm":{"cross_minhash_dump_CC-MAIN-2013-48":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.2440000027418136,0.2800000011920929,0.2660000026226043,0.2800000011920929,0.2759999930858612,0.2879999876022339,0.3019999861717224,0.2879999876022339,0.2739999890327453,0.2800000011920929,0.2840000092983246,0.2899999916553497,0.2899999916553497,0.2879999876022339],"label":"Originally kept data"},"deduped_removed_cross":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.26899999380111694,0.26899999380111694,0.2870000004768371,0.2849999964237213,0.29200001060962677,0.29900000989437103,0.29900000989437103,0.2980000078678131,0.2939999997615814,0.32199999690055847,0.31700000166893005,0.30799999833106995,0.31700000166893005,0.3260000050067901],"label":"Originally removed data"}},"piqa/acc":{"cross_minhash_dump_CC-MAIN-2013-48":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5429999828338623,0.5839999914169312,0.628000020980835,0.6259999871253967,0.6389999985694885,0.6520000100135803,0.640999972820282,0.6589999794960022,0.6549999713897705,0.6690000295639038,0.671999990940094,0.675000011920929,0.6759999990463257,0.6809999942779541,0.6759999990463257],"label":"Originally kept 
data"},"deduped_removed_cross":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5419999957084656,0.5945000052452087,0.6354999840259552,0.6315000057220459,0.6439999938011169,0.6584999859333038,0.6629999876022339,0.6730000078678131,0.6784999966621399,0.6854999959468842,0.6825000047683716,0.6889999806880951,0.6899999976158142,0.6929999887943268,0.6940000057220459],"label":"Originally removed data"}},"piqa/acc_norm":{"cross_minhash_dump_CC-MAIN-2013-48":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6140000224113464,0.6480000019073486,0.6539999842643738,0.6669999957084656,0.6549999713897705,0.6679999828338623,0.6850000023841858,0.671999990940094,0.6869999766349792,0.6840000152587891,0.6880000233650208,0.6890000104904175,0.6980000138282776,0.6940000057220459],"label":"Originally kept data"},"deduped_removed_cross":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6155000030994415,0.6419999897480011,0.6620000004768372,0.6780000030994415,0.6819999814033508,0.6865000128746033,0.6884999871253967,0.7064999938011169,0.7080000042915344,0.7055000066757202,0.7114999890327454,0.715499997138977,0.7084999978542328,0.7074999809265137],"label":"Originally removed 
data"}},"siqa/acc":{"cross_minhash_dump_CC-MAIN-2013-48":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.367000013589859,0.3659999966621399,0.3659999966621399,0.3619999885559082,0.3610000014305115,0.3650000095367431,0.375,0.3720000088214874,0.3720000088214874,0.363999992609024,0.3759999871253967,0.367000013589859,0.367000013589859,0.3729999959468841,0.367000013589859],"label":"Originally kept data"},"deduped_removed_cross":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.367000013589859,0.3610000014305115,0.3720000088214874,0.37299999594688416,0.3830000013113022,0.3869999945163727,0.37649999558925623,0.3809999972581863,0.3845000118017196,0.39050000905990595,0.3930000066757202,0.4004999995231628,0.38799999654293055,0.39649999141693115,0.39649999141693115],"label":"Originally removed data"}},"siqa/acc_norm":{"cross_minhash_dump_CC-MAIN-2013-48":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3619999885559082,0.3799999952316284,0.382999986410141,0.3939999938011169,0.3930000066757202,0.3889999985694885,0.3970000147819519,0.4009999930858612,0.3959999978542328,0.3919999897480011,0.3970000147819519,0.3869999945163727,0.4070000052452087,0.3959999978542328,0.3959999978542328],"label":"Originally kept 
data"},"deduped_removed_cross":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3619999885559082,0.38750000298023224,0.3945000022649765,0.3974999934434891,0.39699999988079065,0.3994999974966049,0.3974999934434891,0.39149999618530273,0.3975000083446502,0.39699999988079065,0.39499999582767487,0.39750000834465027,0.39399999380111694,0.39399999380111694,0.3919999897480011],"label":"Originally removed data"}},"winogrande/acc":{"cross_minhash_dump_CC-MAIN-2013-48":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5149999856948853,0.4990000128746032,0.5189999938011169,0.5189999938011169,0.5019999742507935,0.5149999856948853,0.5400000214576721,0.531000018119812,0.5320000052452087,0.5289999842643738,0.5289999842643738,0.5379999876022339,0.527999997138977,0.5379999876022339,0.527999997138977],"label":"Originally kept data"},"deduped_removed_cross":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.515999972820282,0.5090000033378601,0.5189999938011169,0.5080000162124634,0.507999986410141,0.5214999914169312,0.510000005364418,0.5195000171661377,0.531499981880188,0.5290000140666962,0.5300000011920929,0.5324999988079071,0.5394999980926514,0.5375000238418579,0.5415000021457672],"label":"Originally removed 
data"}},"winogrande/acc_norm":{"cross_minhash_dump_CC-MAIN-2013-48":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.492000013589859,0.4990000128746032,0.5040000081062317,0.4959999918937683,0.5109999775886536,0.5210000276565552,0.5099999904632568,0.5080000162124634,0.5059999823570251,0.5130000114440918,0.515999972820282,0.5099999904632568,0.5099999904632568,0.5130000114440918],"label":"Originally kept data"},"deduped_removed_cross":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.49900001287460327,0.50450000166893,0.4985000044107437,0.502499982714653,0.5170000195503235,0.5030000060796738,0.5210000276565552,0.5160000026226044,0.5200000107288361,0.527999997138977,0.5224999785423279,0.5259999930858612,0.5240000188350677,0.5214999914169312],"label":"Originally removed data"}},"sciq/acc":{"cross_minhash_dump_CC-MAIN-2013-48":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[null,0.5460000038146973,null,null,null,null,null,null,null,null,null,null,null,null,0.7850000262260437],"label":"Originally kept 
data"},"deduped_removed_cross":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2080000042915344,0.5705000162124634,0.6755000054836273,0.7165000140666962,0.734499990940094,0.7419999837875366,0.7645000219345093,0.7664999961853027,0.7675000131130219,0.7719999849796295,0.7745000123977661,0.8009999990463257,0.7900000214576721,0.8019999861717224,0.7899999916553497],"label":"Originally removed data"}},"sciq/acc_norm":{"cross_minhash_dump_CC-MAIN-2013-48":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[null,0.4839999973773956,null,null,null,null,null,null,null,null,null,null,null,null,0.675000011920929],"label":"Originally kept data"},"deduped_removed_cross":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.202000007033348,0.507999986410141,0.5740000009536743,0.6184999942779541,0.6365000009536743,0.6500000059604645,0.6665000021457672,0.6669999957084656,0.6615000069141388,0.6689999997615814,0.6774999797344208,0.6949999928474426,0.6899999976158142,0.7070000171661377,0.6909999847412109],"label":"Originally removed 
data"}},"arc/acc":{"cross_minhash_dump_CC-MAIN-2013-48":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2195000052452087,0.2565000057220459,0.2845000028610229,0.3034999966621399,0.3104999959468841,0.3190000057220459,0.328000009059906,0.3319999873638153,0.3364999890327453,0.3445000052452087,0.3445000052452087,0.3490000069141388,0.3510000109672546,0.3540000021457672,0.3589999973773956],"label":"Originally kept data"},"deduped_removed_cross":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2195000052452087,0.2707500010728836,0.30649998784065247,0.31075000762939453,0.3317500054836273,0.3429999947547912,0.3489999920129776,0.36050000786781305,0.3542499989271164,0.3617500066757202,0.37049999833106995,0.37324999272823334,0.3669999986886978,0.377250000834465,0.37675000727176666],"label":"Originally removed data"}},"arc/acc_norm":{"cross_minhash_dump_CC-MAIN-2013-48":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2520000040531158,0.277999997138977,0.3115000128746032,0.3334999978542328,0.3375000059604645,0.3379999995231628,0.351500004529953,0.3549999892711639,0.3630000054836273,0.3610000014305115,0.3650000095367431,0.3659999966621399,0.3700000047683716,0.3729999959468841,0.3659999966621399],"label":"Originally kept 
data"},"deduped_removed_cross":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.29625000059604645,0.3167499899864197,0.3412500023841858,0.3410000056028366,0.35625000298023224,0.3604999929666519,0.36050000786781305,0.3652499914169311,0.3722499907016754,0.37925000488758087,0.3789999932050705,0.3782500028610229,0.38750000298023224,0.3879999965429306],"label":"Originally removed data"}},"mmlu/acc":{"cross_minhash_dump_CC-MAIN-2013-48":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2302279472351074,0.2396509051322937,0.2397145926952362,0.2505511939525604,0.2498872578144073,0.2526205778121948,0.2522554993629455,0.2501455843448639,0.2584334015846252,0.2589265406131744,0.2582942545413971,0.2596102058887481,0.2603496015071869,0.2584807872772217,0.2601740062236786],"label":"Originally kept data"},"deduped_removed_cross":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2302941530942917,0.24148745089769358,0.24818557500839228,0.2512914910912514,0.25797079503536224,0.2579677104949951,0.2621956318616867,0.2641489803791046,0.2655077129602432,0.26579149067401886,0.2702796012163162,0.26829390227794647,0.2691189646720886,0.2700086534023285,0.27018964290618896],"label":"Originally removed 
data"}},"mmlu/acc_norm":{"cross_minhash_dump_CC-MAIN-2013-48":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.25014728307724,0.2539559006690979,0.2536440193653106,0.2624094188213348,0.2637015581130981,0.2667853236198425,0.2655892074108124,0.2677191197872162,0.2741002142429352,0.271954596042633,0.2721233963966369,0.2767916917800903,0.2795347571372986,0.2746160328388214,0.2780021429061889],"label":"Originally kept data"},"deduped_removed_cross":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.2537670284509659,0.25891076028347015,0.26481594145298004,0.27071431279182434,0.2712268680334091,0.27510324120521545,0.2755167037248611,0.28106220066547394,0.28384028375148773,0.2848761975765228,0.2871145009994507,0.28658416867256165,0.288568839430809,0.2888942211866379],"label":"Originally removed data"}}},"layout":{"xaxis":{"title":{"text":"Training tokens (billions)"}},"title":{"text":"The originally removed data outperforms the kept data"}}}
data/plots/wet_comparison.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data":{"agg_score":{"ind_minhash-CC-MAIN-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308933284133672,0.35846718959510326,0.37850185949355364,0.39194786734879017,0.39600365422666073,0.40486439503729343,0.4064061753451824,0.41104014590382576,0.41393135115504265,0.41698802448809147,0.42121383734047413,0.4219294786453247,0.4234823901206255,0.42346264235675335,0.42699199728667736],"label":"Extracted from WARC"},"wet-extraction-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308620806783438,0.3493095366284251,0.367914117872715,0.37658837065100664,0.3858313206583261,0.3908915650099516,0.3968510050326586,0.39992102794349194,0.40259181894361973,0.4055726025253534,0.4074157159775495,0.40804907679557795,0.40894216299057007,0.4123705606907606,0.4108315110206604],"label":"WET data"}},"commonsense_qa/acc":{"ind_minhash-CC-MAIN-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.1860000044107437,0.255500003695488,0.2810000032186508,0.29899999499320984,0.3074999898672104,0.3125,0.3264999985694885,0.3229999989271164,0.3340000063180923,0.3275000005960464,0.3410000056028366,0.33900000154972076,0.3425000011920929,0.3479999899864197,0.34849999845027924],"label":"Extracted from 
WARC"},"wet-extraction-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.1860000044107437,0.2275000065565109,0.2554999962449073,0.27199999988079065,0.29200001060962677,0.29950000345706934,0.30650000274181366,0.3110000044107437,0.3100000023841858,0.32150000333786005,0.3200000077486038,0.3214999884366989,0.3199999928474426,0.3179999887943268,0.3255000114440918],"label":"WET data"}},"commonsense_qa/acc_norm":{"ind_minhash-CC-MAIN-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.26000000536441803,0.291499987244606,0.3019999861717224,0.3034999966621399,0.3109999895095825,0.31599999964237213,0.3254999965429306,0.3210000097751617,0.3320000022649765,0.33449999988079065,0.3344999998807907,0.3334999978542328,0.3349999934434891,0.3385000079870224],"label":"Extracted from WARC"},"wet-extraction-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.24249999970197675,0.2705000042915344,0.27549999952316284,0.28700000047683716,0.28449998795986176,0.29099999368190765,0.2979999929666519,0.3075000047683716,0.30550000071525574,0.3079999983310699,0.3110000044107437,0.30949999392032623,0.3114999979734421,0.3100000023841858],"label":"WET 
data"}},"hellaswag/acc":{"ind_minhash-CC-MAIN-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2720000147819519,0.29099999368190765,0.3135000020265579,0.34000000357627863,0.34900000691413874,0.3554999977350235,0.3554999977350235,0.36499999463558197,0.37499999999999994,0.37450000643730164,0.3824999928474426,0.3869999945163727,0.3819999992847442,0.38450001180171967,0.3835000097751617],"label":"Extracted from WARC"},"wet-extraction-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2720000147819519,0.27850000560283655,0.3020000010728836,0.31299999356269836,0.3240000009536743,0.3375000059604645,0.340499997138977,0.3489999920129776,0.35249999165534973,0.35599999129772186,0.35999999940395355,0.35899999737739563,0.3614999949932098,0.36500000953674316,0.3645000010728836],"label":"WET data"}},"hellaswag/acc_norm":{"ind_minhash-CC-MAIN-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.2915000021457672,0.33500000834465027,0.35800001025199885,0.37450000643730164,0.3859999924898147,0.3959999978542328,0.4035000056028366,0.4220000058412552,0.4294999986886978,0.43400000035762787,0.44099999964237213,0.4424999952316284,0.44449999928474426,0.4494999945163727],"label":"Extracted from 
WARC"},"wet-extraction-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.28650000691413874,0.31299999356269836,0.3305000066757202,0.3569999933242798,0.3710000067949295,0.3879999965429306,0.3854999989271164,0.39199998974800104,0.4055000096559524,0.4064999967813492,0.4065000116825104,0.4120000004768371,0.41700001060962677,0.4175000041723251],"label":"WET data"}},"openbookqa/acc":{"ind_minhash-CC-MAIN-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.1659999936819076,0.1270000040531158,0.13499999791383738,0.15399999916553497,0.15300000458955765,0.17999999970197672,0.1789999976754188,0.1780000030994415,0.1909999996423721,0.1950000002980232,0.19799999892711634,0.19600000232458115,0.1960000023245811,0.19500000029802322,0.20199999958276743],"label":"Extracted from WARC"},"wet-extraction-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.1659999936819076,0.1280000060796737,0.12900000065565104,0.13300000131130213,0.15100000053644175,0.14999999850988385,0.1560000032186508,0.1659999936819076,0.17199999839067454,0.16899999976158137,0.1689999997615814,0.16799999773502344,0.17599999904632566,0.1749999970197677,0.1680000051856041],"label":"WET 
data"}},"openbookqa/acc_norm":{"ind_minhash-CC-MAIN-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.26500000059604645,0.24800000339746475,0.28299999237060547,0.28200000524520874,0.30900000035762787,0.3100000023841858,0.3020000010728836,0.3149999976158142,0.3110000044107437,0.32100000977516174,0.31700000166893005,0.31599999964237213,0.31599999964237213,0.31900000572204584],"label":"Extracted from WARC"},"wet-extraction-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.2580000013113022,0.2719999998807907,0.2770000100135803,0.27300000190734863,0.2880000025033951,0.2989999949932098,0.29500000178813934,0.29899999499320984,0.3100000023841858,0.30300000309944153,0.30600000917911524,0.3040000051259994,0.3110000044107437,0.30300000309944153],"label":"WET data"}},"piqa/acc":{"ind_minhash-CC-MAIN-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5419999957084656,0.609499990940094,0.6369999945163727,0.6509999930858612,0.6589999794960022,0.6659999787807465,0.6730000078678131,0.6805000007152557,0.6780000030994415,0.6889999806880951,0.6875,0.6990000009536743,0.6969999969005585,0.6974999904632568,0.6995000243186951],"label":"Extracted from 
WARC"},"wet-extraction-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5419999957084656,0.5834999978542328,0.6269999742507935,0.6359999775886536,0.6525000035762787,0.6499999761581421,0.6565000116825104,0.6660000085830688,0.6630000174045563,0.6620000004768372,0.6689999997615814,0.6680000126361847,0.6755000054836273,0.6744999885559082,0.6684999763965607],"label":"WET data"}},"piqa/acc_norm":{"ind_minhash-CC-MAIN-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6130000054836273,0.6599999964237213,0.6704999804496765,0.6845000088214874,0.6854999959468842,0.6895000040531158,0.7005000114440918,0.6990000009536743,0.7090000212192535,0.707999974489212,0.7125000059604645,0.7114999890327454,0.7094999849796295,0.7150000035762787],"label":"Extracted from WARC"},"wet-extraction-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.5995000004768372,0.6385000050067902,0.6534999907016754,0.6675000190734863,0.6755000054836273,0.6814999878406525,0.6859999895095825,0.6840000152587891,0.6924999952316284,0.6944999992847443,0.69200000166893,0.6995000243186951,0.6960000097751617,0.6979999840259552],"label":"WET 
data"}},"siqa/acc":{"ind_minhash-CC-MAIN-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.367000013589859,0.36200000345706934,0.37549999356269836,0.3665000051259994,0.369499996304512,0.37049999833106995,0.37950000166893005,0.38449999690055847,0.3860000073909759,0.3800000101327896,0.38499999046325684,0.3865000009536743,0.39050000905990595,0.38599999248981476,0.3824999928474426],"label":"Extracted from WARC"},"wet-extraction-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.367000013589859,0.3630000054836273,0.3709999918937683,0.37849999964237213,0.36800000071525574,0.36849999427795405,0.37450000643730164,0.36450000107288355,0.37899999320507044,0.37800000607967377,0.3790000081062317,0.38350000977516174,0.38150000572204584,0.37849999964237213,0.38199999928474426],"label":"WET data"}},"siqa/acc_norm":{"ind_minhash-CC-MAIN-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3619999885559082,0.39000000059604645,0.40549999475479126,0.39900000393390656,0.3969999998807907,0.39800000190734863,0.3955000042915344,0.40549999475479126,0.398499995470047,0.39650000631809235,0.40050001442432404,0.40300001204013824,0.402999997138977,0.40100000798702234,0.4089999943971634],"label":"Extracted from 
WARC"},"wet-extraction-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.36149999499320984,0.39499999582767487,0.39249999821186066,0.39800000190734863,0.4025000035762787,0.3924999982118606,0.39750000834465027,0.3999999910593033,0.403999999165535,0.39750000834465027,0.39800000190734863,0.40049999952316284,0.402999997138977,0.40800000727176666,0.4095000028610229],"label":"WET data"}},"winogrande/acc":{"ind_minhash-CC-MAIN-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.515999972820282,0.5130000114440918,0.5020000040531158,0.5160000026226044,0.5130000114440918,0.5225000083446503,0.5230000019073486,0.5284999907016754,0.523499995470047,0.5250000059604645,0.5340000092983246,0.539000004529953,0.5304999947547913,0.5304999947547913,0.5315000116825104],"label":"Extracted from WARC"},"wet-extraction-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.515999972820282,0.50450000166893,0.5049999952316284,0.5090000182390213,0.5074999928474426,0.5244999825954437,0.5220000147819519,0.5164999961853027,0.5170000195503235,0.5195000171661377,0.5264999866485596,0.5289999842643738,0.5210000276565552,0.5264999866485596,0.5214999914169312],"label":"WET 
data"}},"winogrande/acc_norm":{"ind_minhash-CC-MAIN-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.4960000067949295,0.5010000020265579,0.5055000185966492,0.5119999945163727,0.5230000019073486,0.5149999856948853,0.5180000066757202,0.5145000219345093,0.5139999985694885,0.5200000107288361,0.5140000283718109,0.5220000147819519,0.5195000171661377,0.5194999873638153],"label":"Extracted from WARC"},"wet-extraction-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.49299998581409454,0.494499996304512,0.4989999979734421,0.502000018954277,0.5115000009536743,0.5119999945163727,0.5185000002384186,0.5119999945163727,0.5090000033378601,0.5175000131130219,0.5139999985694885,0.5074999928474426,0.5164999961853027,0.5115000009536743],"label":"WET data"}},"sciq/acc":{"ind_minhash-CC-MAIN-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2080000042915344,0.5795000195503235,0.6650000214576721,0.7175000011920929,0.74549999833107,0.7540000081062317,0.7669999897480011,0.7685000002384186,0.7835000157356262,0.7915000021457672,0.7910000085830688,0.7994999885559082,0.7994999885559082,0.7999999821186066,0.7970000207424164],"label":"Extracted from 
WARC"},"wet-extraction-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2080000042915344,0.5444999933242798,0.6380000114440918,0.6674999892711639,0.6964999735355377,0.7229999899864197,0.7310000061988831,0.7495000064373016,0.7479999959468842,0.7459999918937683,0.7650000154972076,0.7660000026226044,0.7604999840259552,0.7674999833106995,0.7739999890327454],"label":"WET data"}},"sciq/acc_norm":{"ind_minhash-CC-MAIN-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.202000007033348,0.5144999921321869,0.578000009059906,0.6195000112056732,0.6634999811649323,0.656499981880188,0.6635000109672546,0.6605000197887421,0.6694999933242798,0.6810000240802765,0.6885000169277191,0.6969999969005585,0.6910000145435333,0.6935000121593475,0.6994999945163727],"label":"Extracted from WARC"},"wet-extraction-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.202000007033348,0.49549999833106995,0.5710000097751617,0.5699999928474426,0.6069999933242798,0.6215000152587891,0.6235000193119049,0.637499988079071,0.6394999921321869,0.6234999895095825,0.6464999914169312,0.6509999930858612,0.6445000171661377,0.6509999930858612,0.656499981880188],"label":"WET 
data"}},"arc/acc":{"ind_minhash-CC-MAIN-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2195000052452087,0.27875000238418574,0.29625000059604645,0.32250000536441803,0.33999998867511744,0.3452500104904175,0.35175000131130213,0.35224999487400055,0.3577499985694885,0.36249999701976776,0.36675000190734863,0.3662499934434891,0.37424999475479126,0.37325000762939453,0.37574999034404755],"label":"Extracted from WARC"},"wet-extraction-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2195000052452087,0.2539999932050705,0.2867499887943268,0.3050000071525574,0.3192500025033951,0.3237499892711639,0.3242499977350235,0.3344999998807907,0.3422500044107437,0.3464999943971634,0.3434999883174896,0.34675000607967377,0.3487500101327896,0.3527500033378601,0.35424999892711634],"label":"WET data"}},"arc/acc_norm":{"ind_minhash-CC-MAIN-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.2957500070333481,0.32750000059604645,0.3479999899864197,0.3422500044107437,0.3535000085830688,0.35199999809265137,0.3564999997615814,0.36150000989437103,0.36275000870227814,0.36924999952316284,0.3685000091791153,0.37325000762939453,0.3764999955892563,0.3779999911785126],"label":"Extracted from 
WARC"},"wet-extraction-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2512499988079071,0.2689999938011169,0.304749995470047,0.32025000452995295,0.33400000631809235,0.3375000059604645,0.3384999930858612,0.346000000834465,0.34949998557567596,0.3512499928474426,0.3535000085830688,0.3577500134706497,0.35724999010562897,0.35950000584125513,0.35875000059604645],"label":"WET data"}},"mmlu/acc":{"ind_minhash-CC-MAIN-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2302941530942917,0.24134793877601618,0.2466971725225448,0.25406564772129053,0.2581391483545303,0.2584117949008941,0.2615972906351089,0.2637547105550766,0.26588438451290125,0.2674466520547867,0.2716866135597229,0.2710578292608261,0.27132466435432434,0.2711777687072754,0.27193698287010193],"label":"Extracted from WARC"},"wet-extraction-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2302158325910568,0.2384574860334396,0.24489634484052655,0.2443553805351257,0.2494990974664688,0.2522265464067459,0.2537587881088257,0.25265602767467493,0.25639262795448303,0.2572821974754333,0.2601653635501861,0.260224312543869,0.26066550612449646,0.2612243592739105,0.26030270755290985],"label":"WET 
data"}},"mmlu/acc_norm":{"ind_minhash-CC-MAIN-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.25648748874664307,0.2595148831605911,0.2695829570293426,0.27227921783924103,0.27291516959667206,0.2772494107484817,0.27682115137577057,0.2799507677555084,0.28115415573120117,0.28246068954467773,0.28493577241897583,0.2861091196537018,0.2857011407613754,0.28743599355220795],"label":"Extracted from WARC"},"wet-extraction-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.2509763091802597,0.25756295025348663,0.2589569538831711,0.2636505216360092,0.2666325122117996,0.26730807125568384,0.2703682482242584,0.2727345675230026,0.273330807685852,0.2783257067203522,0.27664257586002344,0.278787299990654,0.27946445345878596,0.27840209007263184],"label":"WET data"}}},"layout":{"xaxis":{"title":{"text":"Training tokens (billions)"}},"title":{"text":"WET data is worse than data extracted from WARC"}}}
data/stats/cont/long_line_ratio_chars.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"dedup_minhash_independent_output_CC-MAIN-2013-48": {"summary": {"total": 503675.7632949609, "n": 147079460, "mean": 0.003424514635116019, "variance": 0.0025697256643773384, "std_dev": 0.05069246161291971, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2014-10": {"summary": {"total": 477249.8732939497, "n": 143657153, "mean": 0.003322144865936122, "variance": 0.0024192317318115163, "std_dev": 0.04918568624926886, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2014-15": {"summary": {"total": 468517.8572129764, "n": 132360296, "mean": 0.0035397159977111005, "variance": 0.0026489978777381137, "std_dev": 0.0514684163127069, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2014-23": {"summary": {"total": 546784.0897851108, "n": 155998870, "mean": 0.0035050516057270856, "variance": 0.0026337359768035754, "std_dev": 0.0513199374201058, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2014-35": {"summary": {"total": 462439.50784318714, "n": 142620383, "mean": 0.0032424503294398465, "variance": 0.0023715666360857755, "std_dev": 0.04869873341356811, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2014-41": {"summary": {"total": 490538.38539629214, "n": 147734577, "mean": 0.003320403356868125, "variance": 0.0024591230903567266, "std_dev": 0.04958954618018526, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2014-42": {"summary": {"total": 450102.6547545083, "n": 132613482, "mean": 0.003394094235113352, "variance": 0.0025276948241174956, "std_dev": 0.050276185457107776, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2014-49": {"summary": {"total": 381564.70029478066, "n": 112267003, "mean": 0.003398725271884031, "variance": 0.0024800930139761376, "std_dev": 0.04980053226599227, "min": 0.0, "max": 1.0, "unit": "task"}}, 
"dedup_minhash_independent_output_CC-MAIN-2014-52": {"summary": {"total": 479021.4195261399, "n": 139120486, "mean": 0.00344321266622185, "variance": 0.002511182262691407, "std_dev": 0.05011169786278855, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2015-06": {"summary": {"total": 458767.9491788598, "n": 126844432, "mean": 0.003616776408276715, "variance": 0.002665500420296227, "std_dev": 0.05162848458260447, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2015-11": {"summary": {"total": 483941.3190566223, "n": 127643166, "mean": 0.003791360980944502, "variance": 0.0028157731340798845, "std_dev": 0.053063859019862894, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2015-14": {"summary": {"total": 430001.1293451162, "n": 118475124, "mean": 0.0036294634251247226, "variance": 0.002631370624335548, "std_dev": 0.05129688708231278, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2015-18": {"summary": {"total": 491718.89834296564, "n": 138468137, "mean": 0.003551133921466467, "variance": 0.0025214439225141275, "std_dev": 0.05021398134498128, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2015-22": {"summary": {"total": 451628.632703181, "n": 134167086, "mean": 0.0033661656235358687, "variance": 0.0023407613311427, "std_dev": 0.04838141514200158, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2015-27": {"summary": {"total": 406312.26674646034, "n": 117680691, "mean": 0.0034526672412763123, "variance": 0.002472225793966121, "std_dev": 0.049721482218112935, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2015-32": {"summary": {"total": 437276.81537853123, "n": 123822434, "mean": 0.0035314829571072015, "variance": 0.0025440031555350982, "std_dev": 0.050438112132940686, "min": 0.0, "max": 1.0, "unit": "task"}}, 
"dedup_minhash_independent_output_CC-MAIN-2015-35": {"summary": {"total": 435313.406535138, "n": 125287657, "mean": 0.003474511511817466, "variance": 0.0025227301264488995, "std_dev": 0.050226786941321454, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2015-40": {"summary": {"total": 335877.0474209389, "n": 97610055, "mean": 0.0034410086893295858, "variance": 0.0025122403784736216, "std_dev": 0.05012225432353997, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2015-48": {"summary": {"total": 424636.2947204954, "n": 121770747, "mean": 0.003487178203156589, "variance": 0.0025490425828088204, "std_dev": 0.050488043959028756, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2016-07": {"summary": {"total": 431724.598848168, "n": 117487315, "mean": 0.0036746486107727258, "variance": 0.0027041145881101, "std_dev": 0.0520011017970783, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2016-18": {"summary": {"total": 372260.8337468853, "n": 101598456, "mean": 0.003664040265994645, "variance": 0.0026363101718714544, "std_dev": 0.051345011168286395, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2016-22": {"summary": {"total": 446777.3666128868, "n": 104095051, "mean": 0.004292013523418003, "variance": 0.003045487097677912, "std_dev": 0.055185932063143706, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2016-26": {"summary": {"total": 312118.40920063324, "n": 85658795, "mean": 0.0036437403678236833, "variance": 0.0026001851877553474, "std_dev": 0.05099201101893656, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2016-30": {"summary": {"total": 512501.76048538735, "n": 116150874, "mean": 0.004412379716448683, "variance": 0.003362512028791432, "std_dev": 0.05798717124322786, "min": 0.0, "max": 1.0, "unit": "task"}}, 
"dedup_minhash_independent_output_CC-MAIN-2016-36": {"summary": {"total": 501346.3803307479, "n": 112444756, "mean": 0.004458601700649762, "variance": 0.003419464565568183, "std_dev": 0.058476188021862226, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2016-40": {"summary": {"total": 621159.6135185926, "n": 130199237, "mean": 0.0047708391218805185, "variance": 0.0037445205063846286, "std_dev": 0.06119248733614796, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2016-44": {"summary": {"total": 1055952.5382291991, "n": 179260687, "mean": 0.005890597408171262, "variance": 0.004729650636563256, "std_dev": 0.06877245550773403, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2016-50": {"summary": {"total": 961368.7369086652, "n": 168161644, "mean": 0.005716932316079558, "variance": 0.0045665700170274106, "std_dev": 0.06757640133232466, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2017-04": {"summary": {"total": 1029148.6498264002, "n": 179574881, "mean": 0.005731027881490845, "variance": 0.004594296088489218, "std_dev": 0.06778123699438672, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2017-09": {"summary": {"total": 986219.1923490353, "n": 184211607, "mean": 0.005353729921855774, "variance": 0.004347835212885512, "std_dev": 0.06593811654032523, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2017-13": {"summary": {"total": 1006838.3877869259, "n": 210831052, "mean": 0.004775569719146163, "variance": 0.0037945105915810568, "std_dev": 0.06159959895633296, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2017-17": {"summary": {"total": 901989.5704826788, "n": 212150290, "mean": 0.004251653723794903, "variance": 0.0032366148021543747, "std_dev": 0.05689125417983308, "min": 0.0, "max": 1.0, "unit": "task"}}, 
"dedup_minhash_independent_output_CC-MAIN-2017-22": {"summary": {"total": 723954.7833984483, "n": 158977255, "mean": 0.004553826164619889, "variance": 0.0035502816876231014, "std_dev": 0.05958424026219602, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2017-26": {"summary": {"total": 784596.4819787261, "n": 177870110, "mean": 0.004411064242208694, "variance": 0.0033950943592638374, "std_dev": 0.05826743824181596, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2017-30": {"summary": {"total": 710929.3914884122, "n": 156120923, "mean": 0.004553709892481311, "variance": 0.0034693424702404983, "std_dev": 0.058901124524413775, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2017-34": {"summary": {"total": 898191.5311259428, "n": 187764584, "mean": 0.0047836046180356724, "variance": 0.003626917323301685, "std_dev": 0.06022389329246063, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2017-39": {"summary": {"total": 781385.625545756, "n": 160381272, "mean": 0.004872050307381008, "variance": 0.0037673378974601717, "std_dev": 0.061378643659339455, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2017-43": {"summary": {"total": 880902.8865198086, "n": 195809549, "mean": 0.004498773890336728, "variance": 0.003395797301247199, "std_dev": 0.058273469960584974, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2017-47": {"summary": {"total": 781253.4772761391, "n": 164388521, "mean": 0.0047524819404886525, "variance": 0.003608850956999768, "std_dev": 0.0600737126953193, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2017-51": {"summary": {"total": 793631.4709445388, "n": 162229133, "mean": 0.004892040389222426, "variance": 0.0036606111460592726, "std_dev": 0.060502984604557095, "min": 0.0, "max": 1.0, "unit": "task"}}, 
"dedup_minhash_independent_output_CC-MAIN-2018-05": {"summary": {"total": 939267.8233628391, "n": 172656024, "mean": 0.005440110351219711, "variance": 0.004058909137002564, "std_dev": 0.06370956864555405, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2018-09": {"summary": {"total": 973523.870943135, "n": 177757323, "mean": 0.005476701913108438, "variance": 0.004100443301281274, "std_dev": 0.06403470388220182, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2018-13": {"summary": {"total": 1002655.8024193122, "n": 170088561, "mean": 0.00589490437525255, "variance": 0.004316622584080868, "std_dev": 0.06570100900352191, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2018-17": {"summary": {"total": 900028.6114917084, "n": 155130249, "mean": 0.005801760889919717, "variance": 0.004393816767854762, "std_dev": 0.06628587155536812, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2018-22": {"summary": {"total": 997759.9918591186, "n": 143015146, "mean": 0.0069766036658740925, "variance": 0.005162646918437998, "std_dev": 0.07185156169797563, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2018-26": {"summary": {"total": 1014252.8332074926, "n": 161907214, "mean": 0.0062644079170400165, "variance": 0.004751920007478497, "std_dev": 0.06893417155140473, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2018-30": {"summary": {"total": 1087020.4441681516, "n": 178526282, "mean": 0.006088853876249716, "variance": 0.004538699487335283, "std_dev": 0.06736987076828396, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2018-34": {"summary": {"total": 1108507.8574295447, "n": 140846849, "mean": 0.007870306402307556, "variance": 0.00578969226771958, "std_dev": 0.07609002738677112, "min": 0.0, "max": 1.0, "unit": "task"}}, 
"dedup_minhash_independent_output_CC-MAIN-2018-39": {"summary": {"total": 1035117.5410038651, "n": 150618106, "mean": 0.006872464197656722, "variance": 0.005211673617960884, "std_dev": 0.07219192211017023, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2018-43": {"summary": {"total": 1173722.1659370977, "n": 159866232, "mean": 0.007341901734051622, "variance": 0.0053867209892465425, "std_dev": 0.07339428444536088, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2018-47": {"summary": {"total": 1249978.3146059723, "n": 145300674, "mean": 0.008602701420407529, "variance": 0.006163003573420449, "std_dev": 0.07850479968397123, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2018-51": {"summary": {"total": 1168319.3037019395, "n": 172879777, "mean": 0.006757987105119529, "variance": 0.004972167868400544, "std_dev": 0.07051360059166277, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2019-04": {"summary": {"total": 977965.7039252293, "n": 154269422, "mean": 0.006339336021659746, "variance": 0.004832956760116896, "std_dev": 0.0695194703670626, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2019-09": {"summary": {"total": 988275.6502614045, "n": 159920439, "mean": 0.0061797957561972715, "variance": 0.004655558914722139, "std_dev": 0.06823165625076193, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2019-13": {"summary": {"total": 827668.1030660868, "n": 142662038, "mean": 0.00580160016406107, "variance": 0.004454687491574847, "std_dev": 0.06674344530794651, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2019-18": {"summary": {"total": 854471.737594323, "n": 145713985, "mean": 0.005864033830344578, "variance": 0.004420355586624384, "std_dev": 0.06648575476464401, "min": 0.0, "max": 1.0, "unit": "task"}}, 
"dedup_minhash_independent_output_CC-MAIN-2019-22": {"summary": {"total": 881014.9035341938, "n": 148088651, "mean": 0.005949239847786809, "variance": 0.004579135266965264, "std_dev": 0.0676693081608292, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2019-26": {"summary": {"total": 800557.0594747367, "n": 142252748, "mean": 0.005627708924644023, "variance": 0.0043384435579366465, "std_dev": 0.0658668623659625, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2019-30": {"summary": {"total": 838455.8789033149, "n": 143259044, "mean": 0.0058527256324795406, "variance": 0.004567327401291433, "std_dev": 0.0675820050108861, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2019-35": {"summary": {"total": 955149.576227793, "n": 156831307, "mean": 0.006090299153266592, "variance": 0.004808853342797712, "std_dev": 0.06934589636595458, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2019-39": {"summary": {"total": 820186.322513755, "n": 134745873, "mean": 0.006086912379971414, "variance": 0.0048103471807444675, "std_dev": 0.06935666644775013, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2019-43": {"summary": {"total": 959726.2397462637, "n": 147157039, "mean": 0.006521782758528212, "variance": 0.005209465059366591, "std_dev": 0.07217662405077278, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2019-47": {"summary": {"total": 927854.1858019684, "n": 136120584, "mean": 0.006816413495566316, "variance": 0.005513330232358156, "std_dev": 0.07425180288961444, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2019-51": {"summary": {"total": 860548.9512176219, "n": 123503939, "mean": 0.006967785466483151, "variance": 0.005654462762542109, "std_dev": 0.07519616188704122, "min": 0.0, "max": 1.0, "unit": "task"}}, 
"dedup_minhash_independent_output_CC-MAIN-2020-05": {"summary": {"total": 1081193.1639570629, "n": 164726756, "mean": 0.006563555248772479, "variance": 0.0053661921022507076, "std_dev": 0.07325429750021979, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2020-10": {"summary": {"total": 818791.4592350877, "n": 127628145, "mean": 0.0064154458974161836, "variance": 0.005092760832361699, "std_dev": 0.07136358197541445, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2020-16": {"summary": {"total": 938080.521681098, "n": 154522237, "mean": 0.006070844817507387, "variance": 0.0049151755382179484, "std_dev": 0.07010831290380584, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2020-24": {"summary": {"total": 835475.3434012146, "n": 138096561, "mean": 0.0060499359097089655, "variance": 0.0047562782335996015, "std_dev": 0.06896577581380203, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2020-29": {"summary": {"total": 996104.3818256244, "n": 166375437, "mean": 0.005987087996803421, "variance": 0.00477807106251262, "std_dev": 0.0691235926620761, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2020-34": {"summary": {"total": 757284.570626636, "n": 127949886, "mean": 0.005918602933547253, "variance": 0.004655070584104364, "std_dev": 0.06822807768143818, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2020-40": {"summary": {"total": 1121932.1457166374, "n": 188549619, "mean": 0.005950328362724702, "variance": 0.004610354878849073, "std_dev": 0.06789959409929541, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2020-45": {"summary": {"total": 898136.7545987347, "n": 143621070, "mean": 0.006253516664363625, "variance": 0.004819489000985139, "std_dev": 0.06942253957458729, "min": 0.0, "max": 1.0, "unit": "task"}}, 
"dedup_minhash_independent_output_CC-MAIN-2020-50": {"summary": {"total": 921745.2368867062, "n": 142687320, "mean": 0.006459895924085656, "variance": 0.004858940829807899, "std_dev": 0.0697061032464726, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2021-04": {"summary": {"total": 1380468.3881329126, "n": 188520474, "mean": 0.007322644373008058, "variance": 0.0054924991003183965, "std_dev": 0.07411139656165168, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2021-10": {"summary": {"total": 1380080.997586365, "n": 152186861, "mean": 0.00906833210513728, "variance": 0.006933151395739397, "std_dev": 0.08326554747156471, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2021-17": {"summary": {"total": 1777493.858790605, "n": 179483315, "mean": 0.009903393297536362, "variance": 0.007556649378647187, "std_dev": 0.08692899043844457, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2021-21": {"summary": {"total": 1388873.7347647285, "n": 147621413, "mean": 0.00940834873843627, "variance": 0.007226993128261255, "std_dev": 0.08501172347541988, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2021-25": {"summary": {"total": 1229187.004094604, "n": 138602658, "mean": 0.008868423029049022, "variance": 0.006836825079330768, "std_dev": 0.08268509587181216, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2021-31": {"summary": {"total": 1514567.0610846342, "n": 194355920, "mean": 0.0077927498225144635, "variance": 0.006049983390913356, "std_dev": 0.07778163916319426, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2021-39": {"summary": {"total": 1260318.8726773285, "n": 167937330, "mean": 0.00750469757186999, "variance": 0.005810419525351843, "std_dev": 0.0762261078984874, "min": 0.0, "max": 1.0, "unit": "task"}}, 
"dedup_minhash_independent_output_CC-MAIN-2021-43": {"summary": {"total": 1282284.6833311722, "n": 193371340, "mean": 0.006631203379627882, "variance": 0.005085851350922766, "std_dev": 0.07131515512794434, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2021-49": {"summary": {"total": 887411.7239694647, "n": 133804836, "mean": 0.006632134910052615, "variance": 0.00499948301747581, "std_dev": 0.07070702240566923, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2022-05": {"summary": {"total": 1004964.2492821562, "n": 167645350, "mean": 0.005994584694905976, "variance": 0.004565722887037251, "std_dev": 0.067570133099153, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2022-21": {"summary": {"total": 1344421.0495852856, "n": 207624464, "mean": 0.006475253559644523, "variance": 0.004837947172950345, "std_dev": 0.06955535330188715, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2022-27": {"summary": {"total": 1157428.9405497618, "n": 184273418, "mean": 0.006281041254413393, "variance": 0.004757810216526788, "std_dev": 0.06897688175415578, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2022-33": {"summary": {"total": 957446.7036670311, "n": 139108805, "mean": 0.006882718197938881, "variance": 0.005182069291078837, "std_dev": 0.071986591050548, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2022-40": {"summary": {"total": 1210398.5894698824, "n": 187550803, "mean": 0.006453710515277731, "variance": 0.004958475676888503, "std_dev": 0.07041644464816797, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2022-49": {"summary": {"total": 1162946.8706869408, "n": 196557016, "mean": 0.005916587941520951, "variance": 0.004452045986711361, "std_dev": 0.0667236538771024, "min": 0.0, "max": 1.0, "unit": "task"}}, 
"dedup_minhash_independent_output_CC-MAIN-2023-06": {"summary": {"total": 1044589.5385340602, "n": 195066201, "mean": 0.005355051429612166, "variance": 0.004100268474943806, "std_dev": 0.0640333387771074, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2023-14": {"summary": {"total": 899287.716765282, "n": 188826402, "mean": 0.004762510471206711, "variance": 0.003513188567285435, "std_dev": 0.059272156762559564, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2023-23": {"summary": {"total": 934071.81333711, "n": 194940378, "mean": 0.004791576906335518, "variance": 0.0035248085537592515, "std_dev": 0.059370098145103745, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2023-40": {"summary": {"total": 1103155.9533836418, "n": 208054671, "mean": 0.005302240743172967, "variance": 0.0041191304750644855, "std_dev": 0.06418045243736199, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2023-50": {"summary": {"total": 1116051.0902119053, "n": 202203321, "mean": 0.005519449852220315, "variance": 0.004265091414414218, "std_dev": 0.06530766734782538, "min": 0.0, "max": 1.0, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2024-10": {"summary": {"total": 628757.3635543024, "n": 128549482, "mean": 0.004891169950838865, "variance": 0.0037600369381117675, "std_dev": 0.06131914006337473, "min": 0.0, "max": 1.0, "unit": "task"}}}
data/stats/cont/words_contamination.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"dedup_minhash_independent_output_CC-MAIN-2021-04": {"summary": {"total": 1693.968081984778, "n": 377038104, "mean": 4.49282993950334e-06, "variance": 2.0697388559068213e-08, "std_dev": 0.00014386587002853808, "min": 0.0, "max": 0.05813953488372093, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2021-10": {"summary": {"total": 1348.5632420596241, "n": 304275233, "mean": 4.4320506429770015e-06, "variance": 2.0515744728141895e-08, "std_dev": 0.00014323318305526096, "min": 0.0, "max": 0.052434456928838954, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2021-17": {"summary": {"total": 1623.5421116934626, "n": 358760318, "mean": 4.525422769007163e-06, "variance": 2.056864436263732e-08, "std_dev": 0.00014341772680752306, "min": 0.0, "max": 0.058394160583941604, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2021-21": {"summary": {"total": 1279.688115252939, "n": 295028552, "mean": 4.337506002649248e-06, "variance": 1.990944020795556e-08, "std_dev": 0.0001411008157593554, "min": 0.0, "max": 0.061855670103092786, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2021-25": {"summary": {"total": 1194.8531060882322, "n": 277093169, "mean": 4.31209874426111e-06, "variance": 2.0183571863928246e-08, "std_dev": 0.00014206889829912895, "min": 0.0, "max": 0.061855670103092786, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2021-31": {"summary": {"total": 1766.303254710353, "n": 388199818, "mean": 4.549984757361091e-06, "variance": 2.0992476969644057e-08, "std_dev": 0.00014488780821602642, "min": 0.0, "max": 0.08955223880597014, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2021-39": {"summary": {"total": 1502.681429656869, "n": 335672655, "mean": 4.476627474039729e-06, "variance": 2.0909131188060804e-08, "std_dev": 0.0001445999003736199, "min": 0.0, "max": 0.07194244604316546, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2021-43": {"summary": {"total": 1743.7464460228732, "n": 386186196, 
"mean": 4.515299780479149e-06, "variance": 2.0817148272579713e-08, "std_dev": 0.0001442814897087624, "min": 0.0, "max": 0.061855670103092786, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2021-49": {"summary": {"total": 993.9192572231602, "n": 267807721, "mean": 3.711316662237467e-06, "variance": 1.8462170947701775e-08, "std_dev": 0.0001358755715634778, "min": 0.0, "max": 0.03961748633879782, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2022-05": {"summary": {"total": 1285.264286267883, "n": 335296026, "mean": 3.833222545464518e-06, "variance": 1.862601108388856e-08, "std_dev": 0.00013647714491404252, "min": 0.0, "max": 0.06451612903225806, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2022-21": {"summary": {"total": 1675.009594480409, "n": 415278462, "mean": 4.033461274185702e-06, "variance": 2.0230278810005606e-08, "std_dev": 0.000142233184630049, "min": 0.0, "max": 0.041811846689895474, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2022-27": {"summary": {"total": 1464.0191669746634, "n": 368049792, "mean": 3.977774743517952e-06, "variance": 2.0131638358808316e-08, "std_dev": 0.00014188600480247626, "min": 0.0, "max": 0.057750759878419454, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2022-33": {"summary": {"total": 1039.432169573988, "n": 278230671, "mean": 3.735864798219777e-06, "variance": 1.8675509219710315e-08, "std_dev": 0.00013665836681195308, "min": 0.0, "max": 0.06480117820324006, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2022-40": {"summary": {"total": 1501.82542740878, "n": 375451013, "mean": 4.000056932617072e-06, "variance": 1.9507739251441335e-08, "std_dev": 0.00013967010865407578, "min": 0.0, "max": 0.046875, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2022-49": {"summary": {"total": 1602.648918588473, "n": 393408946, "mean": 4.073748029584546e-06, "variance": 2.030700486062336e-08, "std_dev": 0.00014250264860915168, "min": 0.0, "max": 
0.061855670103092786, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2023-06": {"summary": {"total": 1593.8579804751307, "n": 390237059, "mean": 4.084332699102002e-06, "variance": 2.0331466366637558e-08, "std_dev": 0.0001425884510282567, "min": 0.0, "max": 0.061855670103092786, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2023-14": {"summary": {"total": 1548.6925492595572, "n": 378402849, "mean": 4.092708480795626e-06, "variance": 1.9976227023606516e-08, "std_dev": 0.00014133728108183812, "min": 0.0, "max": 0.061855670103092786, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2023-23": {"summary": {"total": 1946.2760281538951, "n": 390305404, "mean": 4.986546453643025e-06, "variance": 2.3632546588483725e-08, "std_dev": 0.0001537288085834393, "min": 0.0, "max": 0.07647058823529412, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2023-40": {"summary": {"total": 3254.0915876179683, "n": 401479044, "mean": 8.10525888274716e-06, "variance": 3.6937159590745443e-08, "std_dev": 0.0001921904253357733, "min": 0.0, "max": 0.061855670103092786, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2023-50": {"summary": {"total": 4582.481964015081, "n": 404644558, "mean": 1.1324709237817275e-05, "variance": 4.420868206864301e-08, "std_dev": 0.00021025860759703278, "min": 0.0, "max": 0.05263157894736842, "unit": "task"}}, "dedup_minhash_independent_output_CC-MAIN-2024-10": {"summary": {"total": 4235.758994879221, "n": 257452499, "mean": 1.645258450134167e-05, "variance": 6.328120497385323e-08, "std_dev": 0.0002515575579740216, "min": 0.0, "max": 0.041666666666666664, "unit": "task"}}}
index.html CHANGED
@@ -6,6 +6,7 @@
6
  <script src="https://cdn.plot.ly/plotly-2.32.0.min.js" charset="utf-8"></script>
7
  <script src="https://cdnjs.cloudflare.com/ajax/libs/lodash.js/4.17.21/lodash.min.js" charset="utf-8"></script>
8
  <script type="module" src="src/plotting.js"></script>
 
9
  <meta name="viewport" content="width=device-width, initial-scale=1">
10
  <meta charset="utf-8">
11
  <title>FineWeb: 15T tokens of high quality web data</title>
@@ -281,7 +282,10 @@
281
  resulting dataset is considerably larger for the WET data (around 254BT), it proves to be of much worse
282
  quality than the one that used trafilatura to extract text from WARC files (which is around 200BT). Many of
283
  these additional tokens on the WET files are unnecessary page boilerplate.</p>
284
- <figure><img src="plots/wet_comparison.png"/></figure>
 
 
 
285
 
286
  <h3>Base filtering</h3>
287
  <p>Filtering is an important part of the curation process. It
@@ -354,7 +358,10 @@
354
  trillion tokens of data, but, quite surprisingly for us, when training on a randomly sampled 350 billion
355
  tokens subset, the model showed no improvement over one trained on the non-deduplicated data (see orange and
356
  green curve below), scoring far below its predecessor RefinedWeb on our aggregate of tasks.</p>
357
- <figure><img src="plots/dedup_all_dumps_bad.png"/></figure>
 
 
 
358
  <p>This was quite puzzling as our intuition regarding web
359
  data was that more deduplication would always result in improved performance. We decided to take a closer
360
  look at one of the oldest dumps, dump 2013-48:</p>
@@ -379,7 +386,10 @@
379
  iterative dedup process (<em>originally removed data</em>)
380
  </li>
381
  </ul>
382
- <figure><img src="plots/removed_data_cross_dedup.png"/></figure>
 
 
 
383
  <p>These results show that, for this older dump where we were
384
  removing over 90% of the original data, the data that was kept was actually <em>worse</em> than the data
385
  removed (considered independently of all the other dumps).</p>
@@ -389,8 +399,10 @@
389
  tokens of data.</p>
390
  <p>When training on a random sample from this dataset we see
391
  that it now matches RefinedWeb’s performance (blue and red curves below):</p>
392
- <figure><img src="plots/cross_ind_unfiltered_comparison.png"/>
393
- </figure>
 
 
394
  <p>We hypothesize that the main improvement gained from
395
  deduplication is the removal of very large clusters that are present in every single dump (you will find
396
  some examples of these clusters in the RefinedWeb paper, each containing <em>hundreds of thousands</em> of
@@ -477,7 +489,10 @@
477
  <p>The performance of the models trained on each of these was
478
  consistently worse (even if to different degrees) than that of the original independently deduplicated
479
  data:</p>
480
- <figure><img src="plots/Untitled.png"/></figure>
 
 
 
481
  <h3>Additional filtering</h3>
482
  <p>By this point we had reached the same performance as
483
  RefinedWeb, but on our aggregate of tasks, another heavily filtered dataset, the C4 dataset<d-cite bibtex-key="raffel2023exploring"></d-cite>, still showed stronger performance (with
@@ -497,8 +512,10 @@
497
  the relatively recent Llama1 model<d-cite bibtex-key="touvron2023llama"></d-cite>. We experimented applying
498
  each of the different filters used in C4 to a baseline of the independently deduped FineWeb 2019-18 dump
499
  (plot smoothed with a 3 checkpoints sliding window):</p>
500
- <figure><img src="plots/c4_filters.png"/></figure>
501
- <figure id="plot-c4_filters_hellaswag" style="height:600px;"></figure>
 
 
502
  <ul>
503
  <li>applying “All filters” (drop lines not ending on punctuation marks,
504
  mentioning javascript and cookie notices + drop documents outside length thresholds, containing “lorem
@@ -570,7 +587,10 @@
570
  <ul>
571
  <li>When applying the 3 together, ~22% of tokens were removed</li>
572
  </ul>
573
- <figure><img src="plots/Untitled%202.png"/></figure>
 
 
 
574
  <h2>The final dataset</h2>
575
  <p>The final FineWeb dataset comprises 15T tokens and
576
  includes the following previously mentioned steps, in order, each providing a performance boost on our group
@@ -587,9 +607,10 @@
587
  <ul>
588
  <li>our custom filters (mentioned in the previous section)</li>
589
  </ul>
590
- <figure><img src="plots/fineweb_all_filters.png"/></figure>
591
-
592
- <figure id="plot-filtering_steps" style="height:600px;"></figure>
 
593
  <p>We compared 🍷 FineWeb with the following datasets:</p>
594
  <ul>
595
  <li><a
@@ -623,7 +644,10 @@
623
  collection</a>. We have uploaded checkpoints at every 1000 training steps. You will also find our full <a
624
  href="https://huggingface.co/datasets/HuggingFaceFW/fineweb/blob/main/eval_results.csv">evaluation
625
  results here</a>.</p>
626
- <figure id="plot-dataset_ablations" style="height:600px;"></figure>
 
 
 
627
  <p>Some histogram comparisons of C4, Dolma, RefinedWeb and
628
  FineWeb:</p>
629
  <figure><img src="plots/Untitled%203.png"/></figure>
 
6
  <script src="https://cdn.plot.ly/plotly-2.32.0.min.js" charset="utf-8"></script>
7
  <script src="https://cdnjs.cloudflare.com/ajax/libs/lodash.js/4.17.21/lodash.min.js" charset="utf-8"></script>
8
  <script type="module" src="src/plotting.js"></script>
9
+ <link rel="stylesheet" href="style.css">
10
  <meta name="viewport" content="width=device-width, initial-scale=1">
11
  <meta charset="utf-8">
12
  <title>FineWeb: 15T tokens of high quality web data</title>
 
282
  resulting dataset is considerably larger for the WET data (around 254BT), it proves to be of much worse
283
  quality than the one that used trafilatura to extract text from WARC files (which is around 200BT). Many of
284
  these additional tokens on the WET files are unnecessary page boilerplate.</p>
285
+ <div class="main-plot-container">
286
+ <figure><img src="plots/wet_comparison.png"/></figure>
287
+ <div id="plot-wet_comparison"></div>
288
+ </div>
289
 
290
  <h3>Base filtering</h3>
291
  <p>Filtering is an important part of the curation process. It
 
358
  trillion tokens of data, but, quite surprisingly for us, when training on a randomly sampled 350 billion
359
  tokens subset, the model showed no improvement over one trained on the non-deduplicated data (see orange and
360
  green curves below), scoring far below its predecessor RefinedWeb on our aggregate of tasks.</p>
361
+ <div class="main-plot-container">
362
+ <figure><img src="plots/dedup_all_dumps_bad.png"/></figure>
363
+ <div id="plot-dedup_all_dumps_bad"></div>
364
+ </div>
365
  <p>This was quite puzzling as our intuition regarding web
366
  data was that more deduplication would always result in improved performance. We decided to take a closer
367
  look at one of the oldest dumps, dump 2013-48:</p>
 
386
  iterative dedup process (<em>originally removed data</em>)
387
  </li>
388
  </ul>
389
+ <div class="main-plot-container">
390
+ <figure><img src="plots/removed_data_cross_dedup.png"/></figure>
391
+ <div id="plot-removed_data_cross_dedup"></div>
392
+ </div>
393
  <p>These results show that, for this older dump where we were
394
  removing over 90% of the original data, the data that was kept was actually <em>worse</em> than the data
395
  removed (considered independently of all the other dumps).</p>
 
399
  tokens of data.</p>
400
  <p>When training on a random sample from this dataset we see
401
  that it now matches RefinedWeb’s performance (blue and red curves below):</p>
402
+ <div class="main-plot-container">
403
+ <figure><img src="plots/cross_ind_unfiltered_comparison.png"/></figure>
404
+ <div id="plot-cross_ind_unfiltered_comparison"></div>
405
+ </div>
406
  <p>We hypothesize that the main improvement gained from
407
  deduplication is the removal of very large clusters that are present in every single dump (you will find
408
  some examples of these clusters on the RefinedWeb paper, each containing <em>hundreds of thousands</em> of
 
489
  <p>The performance of the models trained on each of these was
490
  consistently worse (even if to different degrees) than that of the original independently deduplicated
491
  data:</p>
492
+ <div class="main-plot-container">
493
+ <figure><img src="plots/dedup_attempts.png"/></figure>
494
+ <div id="plot-dedup_attempts"></div>
495
+ </div>
496
  <h3>Additional filtering</h3>
497
  <p>By this point we had reached the same performance as
498
  RefinedWeb, but on our aggregate of tasks, another heavily filtered dataset, the C4 dataset<d-cite bibtex-key="raffel2023exploring"></d-cite>, still showed stronger performance (with
 
512
  the relatively recent Llama1 model<d-cite bibtex-key="touvron2023llama"></d-cite>. We experimented applying
513
  each of the different filters used in C4 to a baseline of the independently deduped FineWeb 2019-18 dump
514
  (plot smoothed with a 3 checkpoints sliding window):</p>
515
+ <div class="main-plot-container">
516
+ <figure><img src="plots/c4_filters_hellaswag.png"/></figure>
517
+ <div id="plot-c4_filters_hellaswag"></div>
518
+ </div>
519
  <ul>
520
  <li>applying “All filters” (drop lines not ending on punctuation marks,
521
  mentioning javascript and cookie notices + drop documents outside length thresholds, containing “lorem
 
587
  <ul>
588
  <li>When applying the 3 together, ~22% of tokens were removed</li>
589
  </ul>
590
+ <div class="main-plot-container">
591
+ <figure><img src="plots/custom_filters.png"/></figure>
592
+ <div id="plot-custom_filters"></div>
593
+ </div>
594
  <h2>The final dataset</h2>
595
  <p>The final FineWeb dataset comprises 15T tokens and
596
  includes the following previously mentioned steps, in order, each providing a performance boost on our group
 
607
  <ul>
608
  <li>our custom filters (mentioned in the previous section)</li>
609
  </ul>
610
+ <div class="main-plot-container">
611
+ <figure><img src="plots/filtering_steps.png"/></figure>
612
+ <div id="plot-filtering_steps"></div>
613
+ </div>
614
  <p>We compared 🍷 FineWeb with the following datasets:</p>
615
  <ul>
616
  <li><a
 
644
  collection</a>. We have uploaded checkpoints at every 1000 training steps. You will also find our full <a
645
  href="https://huggingface.co/datasets/HuggingFaceFW/fineweb/blob/main/eval_results.csv">evaluation
646
  results here</a>.</p>
647
+ <div class="main-plot-container">
648
+ <figure><img src="plots/dataset_ablations.png"/></figure>
649
+ <div id="plot-dataset_ablations"></div>
650
+ </div>
651
  <p>Some histogram comparisons of C4, Dolma, RefinedWeb and
652
  FineWeb:</p>
653
  <figure><img src="plots/Untitled%203.png"/></figure>
plots/Untitled 2.png DELETED
Binary file (64.3 kB)
 
plots/c4_filters.png DELETED
Binary file (260 kB)
 
plots/c4_filters_hellaswag.png ADDED
plots/cross_ind_unfiltered_comparison.png CHANGED
plots/custom_filters.png ADDED
plots/dataset_ablations.png ADDED
plots/dedup_all_dumps_bad.png CHANGED
plots/dedup_attempts.png ADDED
plots/{fineweb_all_filters.png → filtering_steps.png} RENAMED
File without changes
plots/fineweb_ablations.png DELETED
Binary file (321 kB)
 
plots/removed_data_cross_dedup.png CHANGED
plots/wet_comparison.png CHANGED
src/plotting.js CHANGED
@@ -1,21 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  const DEFAULT_LAYOUT = {
2
  title: {
3
  text: 'Plot Title',
4
  font: {
5
- size: 19
6
- },
7
- y: 0.87
8
-
9
  },
10
  xaxis: {
11
  title: {
12
- text: 'X-axis',
13
  font: {
14
- size: 15
 
 
15
  }
16
  },
17
  tickfont: {
18
- size: 14
 
19
  },
20
  showgrid: false,
21
  mirror: true,
@@ -24,18 +49,21 @@ const DEFAULT_LAYOUT = {
24
  },
25
  yaxis: {
26
  title: {
27
- text: 'Y-axis',
28
  font: {
29
- size: 15
30
- }
 
 
31
  },
32
  showgrid: false,
33
  mirror: true,
34
  ticks: 'outside',
35
  showline: true,
36
  tickfont: {
37
- size: 14
38
- }
 
39
  },
40
  legend: {
41
  orientation: 'v',
@@ -44,38 +72,151 @@ const DEFAULT_LAYOUT = {
44
  x: 1,
45
  y: 0,
46
  font: {
47
- size: 14
 
48
  },
49
  bgcolor: 'rgba(0,0,0,0)',
50
- }
 
 
 
 
 
51
  }
52
 
53
- document.addEventListener('DOMContentLoaded', function() {
54
- const plotElements = document.querySelectorAll('[id^="plot-"]');
55
- plotElements.forEach((elem) => {
56
- const plotName = `${elem.id.replace('plot-', '')}`;
57
- fetch(`data/plots/${plotName}.json`)
58
- .then(response => response.json())
59
- .then(data => {
60
- const traces = [];
61
- for (const key in data.data) {
62
- const trace = {
63
- x: data.data[key].x,
64
- y: data.data[key].y,
65
- type: 'scatter',
66
- mode: 'lines',
67
- line: {
68
- width: 2.5
69
- },
70
- name: data.data[key].label
71
- };
72
- traces.push(trace);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  }
74
- const layout = _.merge(DEFAULT_LAYOUT, data.layout);
75
- console.log(layout);
76
-
77
- Plotly.newPlot(elem, traces, layout);
78
- })
79
- .catch(error => console.error('Error loading the data:', error));
80
- })
81
- });
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ const TASK_ID_TO_NAME = {
2
+ 'agg_score': 'Aggregate Score',
3
+ 'commonsense_qa/acc': 'Commonsense QA Acc',
4
+ 'commonsense_qa/acc_norm': 'Commonsense QA Norm Acc',
5
+ 'hellaswag/acc': 'HellaSwag Acc',
6
+ 'hellaswag/acc_norm': 'HellaSwag Norm Acc',
7
+ 'openbookqa/acc': 'OpenBook QA Acc',
8
+ 'openbookqa/acc_norm': 'OpenBook QA Norm Acc',
9
+ 'piqa/acc': 'PIQA Acc',
10
+ 'piqa/acc_norm': 'PIQA Norm Acc',
11
+ 'siqa/acc': 'Social IQA Acc',
12
+ 'siqa/acc_norm': 'Social IQA Norm Acc',
13
+ 'winogrande/acc': 'WinoGrande Acc',
14
+ 'winogrande/acc_norm': 'WinoGrande Norm Acc',
15
+ 'sciq/acc': 'SciQ Acc',
16
+ 'sciq/acc_norm': 'SciQ Norm Acc',
17
+ 'arc/acc': 'ARC Acc',
18
+ 'arc/acc_norm': 'ARC Norm Acc',
19
+ 'mmlu/acc': 'MMLU Acc',
20
+ 'mmlu/acc_norm': 'MMLU Norm Acc'
21
+ };
22
+
23
+
24
  const DEFAULT_LAYOUT = {
25
  title: {
26
  text: 'Plot Title',
27
  font: {
28
+ size: 19,
29
+ family: "apple-system, Arial, sans-serif"
30
+ }
 
31
  },
32
  xaxis: {
33
  title: {
34
+ text: 'Training tokens (billions)',
35
  font: {
36
+ size: 15,
37
+ family: "apple-system, Arial, sans-serif"
38
+
39
  }
40
  },
41
  tickfont: {
42
+ size: 14,
43
+ family: "apple-system, Arial, sans-serif"
44
  },
45
  showgrid: false,
46
  mirror: true,
 
49
  },
50
  yaxis: {
51
  title: {
52
+ text: "Agg Score",
53
  font: {
54
+ size: 15,
55
+ family: "apple-system, Arial, sans-serif"
56
+ },
57
+ standoff: 10
58
  },
59
  showgrid: false,
60
  mirror: true,
61
  ticks: 'outside',
62
  showline: true,
63
  tickfont: {
64
+ size: 14,
65
+ family: "apple-system, Arial, sans-serif"
66
+ },
67
  },
68
  legend: {
69
  orientation: 'v',
 
72
  x: 1,
73
  y: 0,
74
  font: {
75
+ size: 14,
76
+ family: "apple-system, Arial, sans-serif"
77
  },
78
  bgcolor: 'rgba(0,0,0,0)',
79
+ },
80
+ margin: {
81
+ t: 30,
82
+ b: 50
83
+ },
84
+ height: 400
85
  }
86
 
87
+
88
+
89
/**
 * Builds an interactive Plotly chart inside every element whose id starts
 * with "plot-".
 *
 * For each such element the plot name is the id without the "plot-" prefix;
 * the chart data is fetched from `data/plots/<plotName>.json`. The JSON is
 * read as `{data: {<metric>: {<series>: {x, y, label}}}, layout?,
 * defaultMetric?, defaultWindowSize?}`. A metric dropdown and a rolling-window
 * slider (created by `createPlottingElements`) re-render the chart on change.
 *
 * A failure to load one plot is logged and does not prevent the others from
 * rendering.
 */
const init_plot = function() {
    const plotElements = document.querySelectorAll('[id^="plot-"]');

    plotElements.forEach(async (plotElement) => {
        const plotName = plotElement.id.replace('plot-', '');

        let data;
        try {
            const response = await fetch(`data/plots/${plotName}.json`);
            if (!response.ok) {
                throw new Error(`HTTP ${response.status} ${response.statusText}`);
            }
            data = await response.json();
        } catch (error) {
            // Keep the failure local to this plot instead of leaving an
            // unhandled promise rejection.
            console.error(`Error loading the data for plot "${plotName}":`, error);
            return;
        }

        const {dropdown, slider, plot} = createPlottingElements(
            plotElement,
            data.data,
            data.defaultMetric ?? "agg_score",
            data.defaultWindowSize ?? 0
        );
        plot.id = `graph-${plotName}`;

        const updatePlot = () => {
            const metric = dropdown.value;
            const windowSize = parseInt(slider.value, 10);
            const metricData = data.data[metric];
            const traces = [];
            for (const key in metricData) {
                const y = rollingWindow(metricData[key].y, windowSize);
                // Smoothing may shorten y; trim x to the same number of points.
                const x = metricData[key].x.slice(0, y.length);
                traces.push({
                    x: x,
                    y: y,
                    type: 'scatter',
                    mode: 'lines',
                    line: {
                        width: 2.5
                    },
                    name: metricData[key].label
                });
            }

            const overrides = {
                width: plot.parentElement.offsetWidth,
                // Fall back to the raw metric id so an unmapped metric does not
                // silently keep the default axis title.
                yaxis: {title: {text: TASK_ID_TO_NAME[metric] ?? metric}}
            };
            const allX = traces.flatMap(trace => trace.x);
            if (allX.length > 0) {
                // Only pin the range when there is data; Math.min/max of an
                // empty list would yield [Infinity, -Infinity].
                const minX = Math.min(...allX);
                const maxX = Math.max(...allX);
                overrides.xaxis = {range: [minX * 0.95, maxX * 1.05]};
            }

            // Merge into a fresh object so DEFAULT_LAYOUT is never mutated.
            const layout = _.merge({}, DEFAULT_LAYOUT, overrides, data.layout);
            Plotly.newPlot(plot, traces, layout);
        };

        dropdown.addEventListener('change', updatePlot);

        // Debounce the slider so dragging does not re-render on every step.
        let timeoutId;
        slider.addEventListener('input', () => {
            clearTimeout(timeoutId);
            timeoutId = setTimeout(updatePlot, 500);
        });

        // Plotly does not follow the container width on its own, so the width
        // is pushed explicitly on resize. Registered once per plot (not once
        // per redraw) so listeners do not accumulate.
        window.addEventListener('resize', () => {
            Plotly.relayout(plot, {width: plot.parentElement.offsetWidth});
        });

        // Initial plot
        updatePlot();
    });
};
document.addEventListener('DOMContentLoaded', init_plot);
147
+
148
/**
 * Creates the DOM scaffolding for one interactive plot and appends it to
 * `plotElement`: a `<figure class="plotly">` for the chart followed by a
 * `<div class="plotly_controls">` holding a metric dropdown and a
 * rolling-window slider (range 0-40) with a live value readout.
 *
 * @param {HTMLElement} plotElement container the plot and controls are appended to
 * @param {Object} data object whose keys are the selectable metric ids
 * @param {string} defaultMetric metric id selected initially; if it is not a
 *     key of `data`, the first available metric is selected instead
 * @param {number} defaultWindowSize initial slider value (0 when null/undefined)
 * @returns {{dropdown: HTMLSelectElement, slider: HTMLInputElement, plot: HTMLElement}}
 */
const createPlottingElements = (plotElement, data, defaultMetric, defaultWindowSize) => {
    // Create plot
    const plot = document.createElement('figure');
    const controls = document.createElement('div');
    plot.classList.add('plotly');
    controls.classList.add('plotly_controls');
    plotElement.appendChild(plot);
    plotElement.appendChild(controls);

    // Control ids must NOT start with "plot-": that prefix is what selects
    // plot containers in both the script and the stylesheet.
    const idSuffix = plotElement.id;

    const metricOptions = Object.keys(data);

    // Dropdown
    const dropdownLabel = document.createElement('label');
    dropdownLabel.textContent = 'Metric:';
    const dropdown = document.createElement('select');
    dropdown.id = `metric-${idSuffix}`;
    dropdownLabel.htmlFor = dropdown.id;
    for (const option of metricOptions) {
        // Build options through the DOM rather than innerHTML so metric ids
        // coming from the JSON file are never interpreted as markup.
        const optionElement = document.createElement('option');
        optionElement.value = option;
        // Show the raw id when no display name is registered.
        optionElement.textContent = TASK_ID_TO_NAME[option] ?? option;
        dropdown.appendChild(optionElement);
    }
    // Assigning a value with no matching <option> would leave nothing
    // selected, so fall back to the first metric present in the data.
    dropdown.value = metricOptions.includes(defaultMetric) ? defaultMetric : (metricOptions[0] ?? '');
    const dropdownContainer = document.createElement('div');
    dropdownContainer.classList.add('plotly_input_container');
    dropdownContainer.appendChild(dropdownLabel);
    dropdownContainer.appendChild(dropdown);
    controls.appendChild(dropdownContainer);

    // Slider
    const sliderLabel = document.createElement('label');
    sliderLabel.textContent = 'Rolling window:';
    const slider = document.createElement('input');
    slider.type = 'range';
    slider.id = `window-${idSuffix}`;
    sliderLabel.htmlFor = slider.id;
    slider.min = 0;
    slider.max = 40;
    slider.value = defaultWindowSize ?? 0;

    // Current value readout, kept in sync while the slider moves
    const sliderValue = document.createElement('span');
    sliderValue.textContent = slider.value;
    slider.addEventListener('input', () => {
        sliderValue.textContent = slider.value;
    });
    const sliderInputContainer = document.createElement('div');
    sliderInputContainer.classList.add('plotly_slider');
    sliderInputContainer.appendChild(slider);
    sliderInputContainer.appendChild(sliderValue);

    const sliderContainer = document.createElement('div');
    sliderContainer.classList.add('plotly_input_container');
    sliderContainer.appendChild(sliderLabel);
    sliderContainer.appendChild(sliderInputContainer);
    controls.appendChild(sliderContainer);

    return {dropdown, slider, plot};
}
204
+
205
/**
 * Smooths a series with a moving average over full windows only.
 *
 * Output element k is the mean of data[k .. k + windowSize - 1], so the
 * result has `data.length - windowSize + 1` entries (none when the window is
 * longer than the series). Missing samples (null / undefined / NaN) are
 * skipped rather than counted as zero; a window with no valid sample yields
 * null.
 *
 * @param {Array<number|null>} data samples to smooth
 * @param {number} windowSize number of samples per window; any value that is
 *     not a positive number (0, negative, NaN) disables smoothing
 * @returns {Array<number|null>} the smoothed series, or `data` itself when
 *     smoothing is disabled
 */
const rollingWindow = function(data, windowSize) {
    if (!(windowSize > 0)) {
        return data;
    }
    const rollingData = [];

    // windowEnd is exclusive; running it up to data.length (inclusive) makes
    // the final window contain the last sample.
    for (let windowEnd = windowSize; windowEnd <= data.length; windowEnd++) {
        let sum = 0;
        let count = 0;
        for (let i = windowEnd - windowSize; i < windowEnd; i++) {
            const value = data[i];
            if (typeof value === 'number' && !Number.isNaN(value)) {
                sum += value;
                count++;
            }
        }
        rollingData.push(count > 0 ? sum / count : null);
    }

    return rollingData;
}
style.css CHANGED
@@ -1,3 +1,13 @@
 
 
 
 
 
 
 
 
 
 
1
  body {
2
  padding: 2rem;
3
  font-family: -apple-system, BlinkMacSystemFont, "Arial", sans-serif;
@@ -9,7 +19,7 @@ h1 {
9
  }
10
 
11
  p {
12
- color: rgb(107, 114, 128);
13
  font-size: 15px;
14
  margin-bottom: 10px;
15
  margin-top: 5px;
@@ -19,10 +29,127 @@ p {
19
  max-width: 620px;
20
  margin: 0 auto;
21
  padding: 16px;
22
- border: 1px solid lightgray;
23
  border-radius: 16px;
24
  }
25
 
26
  .card p:last-child {
27
  margin-bottom: 0;
28
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* style.css */
2
+ /* Define colors */
3
+ :root {
4
+ --distill-gray: rgb(107, 114, 128);
5
+ --distill-gray-light: rgb(185, 185, 185);
6
+ --distill-gray-lighter: rgb(228, 228, 228);
7
+ --distill-gray-lightest: rgb(245, 245, 245);
8
+ --distill-blue: #007BFF;
9
+ }
10
+
11
  body {
12
  padding: 2rem;
13
  font-family: -apple-system, BlinkMacSystemFont, "Arial", sans-serif;
 
19
  }
20
 
21
  p {
22
+ color: var(--distill-gray);
23
  font-size: 15px;
24
  margin-bottom: 10px;
25
  margin-top: 5px;
 
29
  max-width: 620px;
30
  margin: 0 auto;
31
  padding: 16px;
32
+ border: 1px solid var(--distill-gray-light);
33
  border-radius: 16px;
34
  }
35
 
36
  .card p:last-child {
37
  margin-bottom: 0;
38
  }
39
+
40
+ /* Container for the controls */
41
+ [id^="plot-"] {
42
+ display: flex;
43
+ flex-direction: column;
44
+ align-items: center;
45
+ gap: 15px; /* Adjust the gap between controls as needed */
46
+ }
47
+ [id^="plot-"] figure {
48
+ margin-bottom: 0px;
49
+ margin-top: 0px;
50
+ padding: 0px;
51
+ }
52
+
53
+ .plotly_controls {
54
+ display: flex;
55
+ flex-wrap: wrap;
56
+ flex-direction: row;
57
+ justify-content: center;
58
+ align-items: flex-start;
59
+ gap: 30px;
60
+ }
61
+
62
+
63
+ .plotly_input_container {
64
+ display: flex;
65
+ align-items: center;
66
+ flex-direction: column;
67
+ gap: 10px;
68
+ }
69
+
70
+ /* Style for the select dropdown */
71
+ .plotly_input_container > select {
72
+ padding: 2px 4px;
73
+ /* border: 1px solid #ccc; */
74
+ line-height: 1.5em;
75
+ text-align: center;
76
+ border-radius: 4px;
77
+ font-size: 12px;
78
+ background-color: var(--distill-gray-lightest);
79
+ outline: none;
80
+ }
81
+
82
+ /* Style for the range input */
83
+
84
+ .plotly_slider {
85
+ display: flex;
86
+ align-items: center;
87
+ gap: 10px;
88
+ }
89
+
90
+ .plotly_slider > input[type="range"] {
91
+ -webkit-appearance: none;
92
+ height: 2px;
93
+ background: var(--distill-gray-light);
94
+ border-radius: 5px;
95
+ outline: none;
96
+ }
97
+
98
+ .plotly_slider > span {
99
+ font-size: 14px;
100
+ line-height: 1.6em;
101
+ min-width: 16px;
102
+ }
103
+
104
+ .plotly_slider > input[type="range"]::-webkit-slider-thumb {
105
+ -webkit-appearance: none;
106
+ appearance: none;
107
+ width: 18px;
108
+ height: 18px;
109
+ border-radius: 50%;
110
+ background: var(--distill-blue);
111
+ cursor: pointer;
112
+ }
113
+
114
+ .plotly_slider > input[type="range"]::-moz-range-thumb {
115
+ width: 18px;
116
+ height: 18px;
117
+ border-radius: 50%;
118
+ background: var(--distill-blue);
119
+ cursor: pointer;
120
+ }
121
+
122
+ /* Style for the labels */
123
+ .plotly_input_container > label {
124
+ font-size: 14px;
125
+ font-weight: bold;
126
+ }
127
+
128
+ .main-plot-container {
129
+ margin-top: 21px;
130
+ margin-bottom: 35px;
131
+ }
132
+
133
+ .main-plot-container > figure {
134
+ display: block !important;
135
+ /* Let this be handled by graph-container */
136
+ margin-bottom: 0px;
137
+ margin-top: 0px;
138
+ }
139
+ .main-plot-container > div {
140
+ display: none !important;
141
+ }
142
+
143
+
144
+ @media (min-width: 768px) {
145
+ .main-plot-container > figure {
146
+ display: none !important;
147
+ }
148
+ .main-plot-container > div {
149
+ display: flex !important;
150
+ }
151
+ }
152
+
153
+
154
+
155
+