update slider values + fix plotly errors + update jsons to exclude sciqa
Browse files- data/plots/c4_filters_hellaswag.json +1 -1
- data/plots/cross_ind_unfiltered_comparison.json +0 -0
- data/plots/custom-_ilters.json +1 -0
- data/plots/custom-filters.json +1 -0
- data/plots/custom_filters.json +1 -0
- data/plots/dataset_ablations.json +0 -0
- data/plots/dedup_all_dumps_bad.json +0 -0
- data/plots/dedup_attempts.json +0 -0
- data/plots/filtering_steps.json +0 -0
- data/plots/removed_data_cross_dedup.json +1 -1
- data/plots/wet_comparison.json +1 -1
- index.html +1 -1
- plots/c4_filters_hellaswag.png +0 -0
- plots/cross_ind_unfiltered_comparison.png +0 -0
- plots/custom_filters.png +0 -0
- plots/dataset_ablations.png +0 -0
- plots/dedup_attempts.png +0 -0
- plots/filtering_steps.png +0 -0
- plots/removed_data_cross_dedup.png +0 -0
- plots/wet_comparison.png +0 -0
- src/plotting.js +29 -20
- src/utils.js +15 -0
data/plots/c4_filters_hellaswag.json
CHANGED
@@ -1 +1 @@
|
|
1 |
-
{"data":{"agg_score":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308596294373274,0.35654734168201685,0.3758235517889261,0.38752372190356255,0.39841264486312866,0.4040419068187475,0.4097859803587198,0.41541148349642754,0.416892247274518,0.41986062191426754,0.4234193116426468,0.4218583852052688,0.4243287574499845,0.42519346065819263,0.42440339736640453],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308979943394661,0.35727922804653645,0.3758955802768469,0.39312327839434147,0.3984657619148493,0.4037223849445581,0.40907647646963596,0.41408527828752995,0.42114910110831255,0.42039695382118225,0.4248786196112633,0.42590542137622833,0.4263712782412767,0.42797840014100075,0.4277621991932392],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308933284133672,0.35735468938946724,0.3787423223257065,0.391122592613101,0.3976811040192842,0.4041402228176594,0.4110417179763317,0.4150725454092026,0.42221225984394545,0.4235249478369951,0.42567262239754194,0.42764298990368843,0.4280493911355734,0.42981273680925364,0.42845905013382435],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.33087017294019455,0.35839469730854034,0.379800958558917,0.3909519836306572,0.3985003251582384,0.4028578344732523,0.4080309104174375,0.411550747230649,0.4152813777327537,0.41849316097795963,0.42109199613332743,0.4223319999873638,0.42558939941227436,0.42717534117400646,0.426479609683156],"label":"Short lines filter"},"filtering-custom-c4-hynek-filters":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308933284133672,0.36176552809774876,0.3792347256094217,0.3928614556789398,0.40233771689236164,0.4110558107495308,0.4163004532456398,0.4155100043863058,0.42281083948910236,0.424554904922843,0.42792712710797787,0.4278372637927532,0.43066211044788355,0.43145042285323143,0.4331468697637319],"label":"Filters 
combined"}},"commonsense_qa/acc":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.1860000044107437,0.24049999564886088,0.2750000059604645,0.27250000834465027,0.2974999994039535,0.3079999983310699,0.32949998974800104,0.3349999934434891,0.3235000073909759,0.3339999914169311,0.33550000190734863,0.340499997138977,0.3439999967813492,0.34450000524520874,0.3474999964237213],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.1860000044107437,0.2405000030994415,0.2854999899864197,0.2944999933242798,0.30900000035762787,0.32199999690055847,0.3264999985694885,0.33150000870227814,0.35099999606609344,0.346000000834465,0.35850000381469727,0.35599999129772186,0.36149999499320984,0.35549999773502344,0.356000006198883],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.1860000044107437,0.2560000047087669,0.2854999899864197,0.30550000071525574,0.31249999999999994,0.3150000125169754,0.32499998807907104,0.32850000262260437,0.3369999974966049,0.3310000002384186,0.33949999511241913,0.3385000079870224,0.340499997138977,0.341499999165535,0.33650000393390656],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.1860000044107437,0.25050000101327896,0.27250000834465027,0.2939999997615814,0.31849999725818634,0.3205000013113022,0.3244999945163727,0.3295000046491623,0.33500000834465027,0.328000009059906,0.3320000022649765,0.3464999943971634,0.341499999165535,0.34250000119209284,0.34699998795986176],"label":"Short lines filter"},"filtering-custom-c4-hynek-filters":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.1860000044107437,0.2584999948740005,0.2800000011920929,0.2915000021457672,0.3174999952316284,0.328000009059906,0.32899999618530273,0.32899999618530273,0.3429999947547912,0.3465000092983246,0.34800000488758087,0.33900000154972076,0.3449999988079071,0.3409999907016754,0.3425000011920929],"label":"Filters 
combined"}},"commonsense_qa/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2644999921321869,0.2850000113248825,0.2875000089406967,0.31049999594688416,0.3135000020265579,0.3279999941587448,0.32999999821186066,0.32349999248981476,0.3229999989271164,0.32350000739097595,0.32900001108646393,0.3264999985694885,0.3349999934434891,0.32999999821186066],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2644999921321869,0.2750000059604645,0.2989999949932098,0.2974999994039535,0.31599999964237213,0.3149999976158142,0.3199999928474426,0.3244999945163727,0.3269999921321869,0.33550000190734863,0.3275000005960464,0.33599999547004694,0.3349999934434891,0.33849999308586115],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.26349999010562897,0.28849999606609344,0.29600000381469727,0.30650000274181366,0.31900000572204584,0.3229999989271164,0.3150000125169754,0.3244999945163727,0.3310000002384186,0.3310000002384186,0.32999999821186066,0.3334999978542328,0.3344999998807907,0.32999999821186066],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2620000094175339,0.28949999809265137,0.2974999994039535,0.30550000071525574,0.30900000035762787,0.31200000643730164,0.3190000057220459,0.32999999821186066,0.3254999965429306,0.3344999998807907,0.3320000022649765,0.3374999910593033,0.3369999974966049,0.33949999511241913],"label":"Short lines filter"},"filtering-custom-c4-hynek-filters":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2644999921321869,0.2910000085830688,0.3009999990463257,0.31249999999999994,0.3149999976158142,0.3240000009536743,0.3165000081062317,0.3240000009536743,0.33250001072883606,0.3375000059604645,0.330499991774559,0.33949999511241913,0.3334999978542328,0.3400000035762787],"label":"Filters 
combined"}},"hellaswag/acc":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2720000147819519,0.2880000025033951,0.31700000166893,0.3389999866485595,0.34450000524520874,0.35349999368190765,0.35450001060962677,0.36599999666213984,0.37299999594688416,0.3790000081062317,0.3779999911785126,0.37700000405311584,0.3824999928474426,0.3830000013113022,0.3879999965429306],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.27150000631809235,0.29200001060962677,0.3145000040531158,0.3384999930858612,0.3499999940395355,0.35899999737739563,0.36549998819828033,0.3610000014305115,0.36800000071525574,0.375,0.3779999911785126,0.3889999985694885,0.3889999985694885,0.39149999618530273,0.3909999877214432],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2720000147819519,0.2919999957084656,0.32349999248981476,0.3334999978542328,0.3474999964237213,0.3514999896287918,0.36450000107288355,0.37350000441074366,0.3725000023841858,0.3830000013113022,0.3849999904632568,0.390500009059906,0.39199998974800104,0.3930000066757202,0.39149999618530273],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.27150000631809235,0.2889999896287918,0.32199999690055847,0.33550000190734863,0.3445000052452087,0.35300000011920923,0.3579999953508377,0.3695000112056732,0.37249998748302454,0.380500003695488,0.38150000572204584,0.3900000005960464,0.3935000002384186,0.39200000464916224,0.3965000063180923],"label":"Short lines filter"},"filtering-custom-c4-hynek-filters":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2720000147819519,0.291499987244606,0.3149999976158142,0.34199999272823334,0.3499999940395355,0.3680000007152557,0.36499999463558197,0.3755000084638595,0.3830000013113022,0.3840000033378601,0.39049999415874476,0.3889999985694885,0.39399999380111694,0.3904999941587448,0.3920000046491623],"label":"Filters 
combined"}},"hellaswag/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.2900000065565109,0.3279999941587448,0.3550000041723251,0.375,0.38850000500679016,0.40350000560283655,0.41200000047683716,0.4194999933242798,0.42249999940395355,0.4329999983310699,0.43449999392032623,0.43700000643730164,0.4395000040531158,0.43950000405311584],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.28900000452995295,0.3310000002384186,0.3505000025033951,0.3790000081062317,0.39250001311302185,0.40549999475479126,0.4224999994039535,0.4284999966621399,0.43050000071525574,0.43799999356269836,0.4459999948740005,0.4495000094175339,0.4564999938011169,0.4529999941587448],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.29449999332427973,0.33550000190734863,0.34800000488758087,0.3764999955892563,0.3824999928474426,0.3955000042915344,0.41799999773502344,0.4270000010728836,0.43400000035762787,0.44450001418590546,0.45049999654293055,0.45450000464916224,0.45449998974800104,0.4550000131130218],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.3020000010728836,0.3310000002384186,0.357000008225441,0.37899999320507044,0.38850000500679016,0.3994999974966049,0.40349999070167536,0.4175000041723251,0.42400000989437103,0.4245000034570694,0.4335000067949295,0.4360000044107437,0.44750000536441803,0.44200000166893],"label":"Short lines filter"},"filtering-custom-c4-hynek-filters":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.3004999905824661,0.3384999930858612,0.3760000020265579,0.3955000042915344,0.4230000078678131,0.43800000846385956,0.4375,0.45050001144409174,0.460999995470047,0.46400000154972076,0.4724999964237213,0.47599999606609344,0.47699999809265137,0.48049999773502344],"label":"Filters 
combined"}},"openbookqa/acc":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.1659999936819076,0.1290000006556511,0.1389999985694885,0.14100000262260431,0.1610000059008598,0.157999999821186,0.1649999991059303,0.1689999997615814,0.1749999970197677,0.17700000107288355,0.18600000441074366,0.19000000506639475,0.18100000172853464,0.1780000030994415,0.1860000044107437],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.1659999936819076,0.13399999588727945,0.1269999966025352,0.157999999821186,0.1660000011324882,0.1689999997615814,0.1700000017881393,0.1749999970197677,0.18199999630451197,0.1829999983310699,0.17199999839067454,0.18699999898672098,0.18999999761581415,0.18899999558925626,0.18200000375509257],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.1659999936819076,0.12900000438094134,0.1439999938011169,0.14599999785423273,0.1660000011324882,0.1550000011920929,0.1770000010728836,0.17999999970197672,0.1789999976754188,0.18100000172853464,0.18699999898672098,0.17999999970197678,0.18100000172853464,0.18400000035762787,0.18900000303983683],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.1659999936819076,0.14000000059604645,0.14599999785423273,0.1480000019073486,0.15799999982118607,0.16899999976158142,0.17000000178813934,0.1790000051259994,0.17400000244379038,0.19699999690055847,0.18999999761581415,0.1909999996423721,0.1950000002980232,0.1940000057220459,0.18899999558925626],"label":"Short lines filter"},"filtering-custom-c4-hynek-filters":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.1659999936819076,0.1389999985694885,0.14200000464916224,0.14699999988079065,0.15700000524520868,0.1620000004768371,0.18100000172853464,0.1749999970197677,0.1840000003576278,0.19399999827146525,0.19500000029802322,0.19900000095367426,0.18500000238418576,0.19299999624490735,0.1909999996423721],"label":"Filters 
combined"}},"openbookqa/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.25099999457597727,0.2629999965429306,0.2810000032186508,0.2939999997615814,0.2900000065565109,0.3100000023841858,0.3129999935626983,0.3149999976158142,0.3229999989271164,0.3310000002384186,0.32100000977516174,0.32999999821186066,0.32100000977516174,0.32100000977516174],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.26900000870227814,0.2670000046491623,0.306999996304512,0.2939999997615814,0.2999999970197677,0.306999996304512,0.31200000643730164,0.31299999356269836,0.3200000077486038,0.3229999989271164,0.32099999487400055,0.32500000298023224,0.3240000009536743,0.3219999969005584],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.2559999972581863,0.2849999964237213,0.3110000044107437,0.2979999929666519,0.3009999990463257,0.318000003695488,0.3140000104904175,0.32899999618530273,0.32899999618530273,0.3369999974966049,0.33599999547004694,0.32900001108646393,0.3299999982118606,0.3330000042915344],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.25999999046325684,0.28200000524520874,0.28599999845027924,0.289000004529953,0.29999999701976776,0.31300000846385956,0.31900000572204584,0.3149999976158142,0.32099999487400055,0.3139999955892563,0.3190000057220459,0.32200001180171967,0.3229999989271164,0.3240000009536743],"label":"Short lines filter"},"filtering-custom-c4-hynek-filters":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.27600000798702234,0.270000010728836,0.28900000452995295,0.2880000025033951,0.2899999916553497,0.3229999989271164,0.306999996304512,0.3240000009536743,0.3189999908208847,0.3229999989271164,0.32500000298023224,0.3189999908208847,0.32100000977516174,0.3230000138282776],"label":"Filters 
combined"}},"piqa/acc":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5419999957084656,0.6010000109672546,0.6324999928474426,0.6525000035762787,0.6665000021457672,0.6689999997615814,0.6789999902248383,0.6784999966621399,0.6885000169277191,0.68299999833107,0.6884999871253967,0.68299999833107,0.6935000121593475,0.6929999887943268,0.69200000166893],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5419999957084656,0.6049999892711639,0.6340000033378601,0.6464999914169312,0.6629999876022339,0.6660000085830688,0.6730000078678131,0.6824999749660492,0.6819999814033508,0.6940000057220459,0.6915000081062317,0.7014999985694885,0.6990000009536743,0.6969999969005585,0.7024999856948853],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5419999957084656,0.6089999973773956,0.6369999945163727,0.6469999849796295,0.6655000150203705,0.6720000207424164,0.6694999933242798,0.6769999861717224,0.6809999942779541,0.6744999885559082,0.679500013589859,0.6834999918937683,0.6835000216960907,0.6884999871253967,0.6870000064373016],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5410000085830688,0.6140000224113464,0.6304999887943268,0.648499995470047,0.6705000102519989,0.6739999949932098,0.6724999845027924,0.6809999942779541,0.6780000030994415,0.6775000095367432,0.6890000104904175,0.69200000166893,0.6955000162124634,0.7019999921321869,0.6994999945163727],"label":"Short lines filter"},"filtering-custom-c4-hynek-filters":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5419999957084656,0.6054999828338623,0.6330000162124634,0.6509999930858612,0.6644999980926514,0.6790000200271606,0.6749999821186066,0.6889999806880951,0.6955000162124634,0.6930000185966492,0.6939999759197235,0.6989999711513519,0.6959999799728394,0.7019999921321869,0.7055000066757202],"label":"Filters 
combined"}},"piqa/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.613999992609024,0.6504999995231628,0.6649999916553497,0.6870000064373016,0.6915000081062317,0.6974999904632568,0.7035000026226044,0.7129999995231628,0.7055000066757202,0.7080000042915344,0.7084999978542328,0.7114999890327454,0.714000016450882,0.7115000188350677],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.609499990940094,0.652999997138977,0.6744999885559082,0.68299999833107,0.6809999942779541,0.6965000033378601,0.6995000243186951,0.7145000100135803,0.7100000083446503,0.7105000019073486,0.7134999930858612,0.7159999907016754,0.7170000076293945,0.7199999988079071],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6155000030994415,0.648499995470047,0.6649999916553497,0.6865000128746033,0.690500020980835,0.6965000033378601,0.7029999792575836,0.7139999866485596,0.7105000019073486,0.7089999914169312,0.7139999866485596,0.7144999802112579,0.7229999899864197,0.7175000011920929],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6254999935626984,0.6530000269412994,0.6665000021457672,0.6860000193119049,0.6980000138282776,0.695499986410141,0.7084999978542328,0.7080000042915344,0.7064999938011169,0.7095000147819519,0.7129999995231628,0.7159999907016754,0.7179999947547913,0.718500018119812],"label":"Short lines filter"},"filtering-custom-c4-hynek-filters":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6195000112056732,0.6534999907016754,0.6725000143051147,0.69200000166893,0.7099999785423279,0.7005000114440918,0.7139999866485596,0.715499997138977,0.722000002861023,0.7254999876022339,0.7285000085830688,0.7290000021457672,0.7309999763965607,0.7305000126361847],"label":"Filters 
combined"}},"siqa/acc":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.367000013589859,0.3634999990463257,0.37150000035762787,0.38799999654293055,0.3709999918937683,0.38199999928474426,0.37849999964237213,0.3889999985694885,0.3879999965429306,0.3854999989271164,0.3849999904632568,0.39100000262260437,0.39000000059604645,0.3904999941587448,0.3880000114440918],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.367000013589859,0.375,0.3675000071525574,0.37150000035762787,0.3734999895095825,0.3824999928474426,0.37849999964237213,0.3779999911785126,0.3840000033378601,0.3779999911785126,0.3895000070333481,0.39100000262260437,0.3895000070333481,0.3869999945163727,0.3869999945163727],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.367000013589859,0.35999999940395355,0.3735000044107437,0.37499999999999994,0.3739999979734421,0.37949998676776886,0.37649999558925623,0.3869999945163727,0.3865000009536743,0.3830000013113022,0.3860000073909759,0.37849999964237213,0.38349999487400055,0.38450001180171967,0.39100000262260437],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.367000013589859,0.36050000786781305,0.37699998915195465,0.375,0.37849999964237213,0.3790000081062317,0.37899999320507044,0.3799999952316284,0.3765000104904175,0.37949998676776886,0.38449999690055847,0.380500003695488,0.38099999725818634,0.38449999690055847,0.38099999725818634],"label":"Short lines filter"},"filtering-custom-c4-hynek-filters":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.367000013589859,0.37049999833106995,0.36550000309944153,0.3794999867677688,0.37550000846385956,0.367000013589859,0.3789999932050705,0.3720000088214874,0.3769999891519546,0.380500003695488,0.38199999928474426,0.37749999761581415,0.38499999046325684,0.3865000009536743,0.37849999964237213],"label":"Filters 
combined"}},"siqa/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.36149999499320984,0.40049999952316284,0.3959999978542328,0.39800000190734863,0.39800000190734863,0.40299999713897705,0.3969999998807907,0.40549999475479126,0.4054999947547912,0.4155000001192093,0.41800001263618464,0.40949998795986176,0.41099999845027924,0.41200000047683716,0.4065000116825104],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3619999885559082,0.3894999921321869,0.3989999890327453,0.4060000032186508,0.40299999713897705,0.4055000096559524,0.4095000028610229,0.40450000762939453,0.40750001370906824,0.4074999988079071,0.408500000834465,0.41050000488758087,0.40450000762939453,0.40500000119209284,0.4035000056028366],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3619999885559082,0.3930000066757202,0.39750000834465027,0.40049999952316284,0.39849999547004694,0.40449999272823334,0.4054999947547912,0.4020000100135803,0.4115000069141388,0.40800000727176666,0.402999997138977,0.4074999988079071,0.40700000524520874,0.4060000032186508,0.40250000357627863],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3619999885559082,0.40350000560283655,0.403999999165535,0.4004999995231628,0.4010000079870224,0.39899998903274536,0.4015000015497207,0.39750000834465027,0.3969999998807907,0.4030000120401382,0.4055000096559524,0.4010000079870224,0.4020000100135803,0.40299999713897705,0.3990000039339065],"label":"Short lines filter"},"filtering-custom-c4-hynek-filters":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3619999885559082,0.392999991774559,0.39499999582767487,0.39499999582767487,0.3995000123977661,0.3970000147819519,0.40199999511241913,0.39800000190734863,0.39549998939037323,0.3889999985694885,0.39149999618530273,0.390500009059906,0.39900000393390656,0.4025000035762787,0.4035000056028366],"label":"Filters 
combined"}},"winogrande/acc":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.515999972820282,0.5024999976158142,0.5144999921321869,0.5129999816417694,0.5154999792575836,0.5239999890327454,0.523499995470047,0.5304999947547913,0.5239999890327454,0.5264999866485596,0.5194999873638153,0.5209999978542328,0.5289999842643738,0.5275000035762787,0.5315000116825104],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.515999972820282,0.5020000040531158,0.5125000178813934,0.5214999914169312,0.5199999809265137,0.515500009059906,0.5130000114440918,0.523499995470047,0.5194999873638153,0.523499995470047,0.5194999873638153,0.5239999890327454,0.527999997138977,0.5240000188350677,0.5269999802112579],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.515999972820282,0.49549999833106995,0.5164999961853027,0.5209999978542328,0.5125000178813934,0.5165000259876251,0.527999997138977,0.5374999940395355,0.5295000076293945,0.5320000052452087,0.5435000061988831,0.5354999899864197,0.5349999964237213,0.5360000133514404,0.5385000109672546],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.515999972820282,0.49050000309944153,0.5104999840259552,0.5055000185966492,0.5120000243186951,0.511000007390976,0.5200000107288361,0.5259999930858612,0.5215000212192535,0.523499995470047,0.5324999988079071,0.523499995470047,0.5260000228881836,0.5269999802112579,0.5290000140666962],"label":"Short lines filter"},"filtering-custom-c4-hynek-filters":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.515999972820282,0.5020000040531158,0.5010000020265579,0.5085000097751617,0.5169999897480011,0.5379999876022339,0.5169999897480011,0.5185000002384186,0.5230000019073486,0.5225000083446503,0.5300000011920929,0.5284999907016754,0.5270000100135803,0.527999997138977,0.5230000019073486],"label":"Filters 
combined"}},"winogrande/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.4884999990463257,0.49699999392032623,0.5035000145435333,0.5054999887943268,0.5200000107288361,0.5115000009536743,0.5154999792575836,0.507999986410141,0.515500009059906,0.5160000026226044,0.5139999985694885,0.5165000259876251,0.5160000026226044,0.523499995470047],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.48749999701976776,0.5024999976158142,0.5035000145435333,0.5099999904632568,0.5080000162124634,0.5050000250339508,0.5069999992847443,0.5180000066757202,0.5085000097751617,0.515500009059906,0.5165000259876251,0.5080000162124634,0.5090000033378601,0.5104999840259552],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.48899999260902405,0.49300000071525574,0.5030000060796738,0.49799999594688416,0.5024999976158142,0.5150000154972076,0.5259999930858612,0.527999997138977,0.5245000123977661,0.5275000035762787,0.5199999809265137,0.5300000011920929,0.5300000011920929,0.5289999842643738],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.471000000834465,0.494499996304512,0.511000007390976,0.511000007390976,0.5070000290870667,0.50450000166893,0.5060000121593475,0.5074999928474426,0.5169999897480011,0.5264999866485596,0.526500016450882,0.5290000140666962,0.5275000035762787,0.5259999930858612],"label":"Short lines filter"},"filtering-custom-c4-hynek-filters":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.4895000010728836,0.4949999898672104,0.5049999952316284,0.5065000057220459,0.5220000147819519,0.5069999992847443,0.5094999969005585,0.5239999890327454,0.5190000236034393,0.5239999890327454,0.5175000131130219,0.5164999961853027,0.5185000002384186,0.5180000066757202],"label":"Filters 
combined"}},"sciq/acc":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2089999988675117,0.5660000145435333,0.6834999918937683,0.710999995470047,0.7395000159740448,0.7504999935626984,0.7555000185966492,0.7709999978542328,0.7819999754428864,0.7855000197887421,0.7960000038146973,0.7950000166893005,0.7860000133514404,0.8034999966621399,0.7915000021457672],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[null,null,null,0.7240000069141388,null,0.7569999992847443,null,null,null,0.7860000133514404,null,null,null,null,null],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[null,null,null,null,null,null,null,null,null,null,null,0.8080000281333923,null,null,null],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[null,null,null,null,null,null,null,null,null,null,null,null,null,null,null],"label":"Short lines 
filter"},"filtering-custom-c4-hynek-filters":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[null,0.5529999732971191,0.6869999766349792,0.7129999995231628,0.7250000238418579,null,0.7590000033378601,null,null,null,null,null,null,null,null],"label":"Filters combined"}},"sciq/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.202000007033348,0.5009999871253967,0.5934999883174896,0.6264999806880951,0.6234999895095825,0.6369999945163727,0.6624999940395355,0.6679999828338623,0.6785000264644623,0.6910000145435333,0.6915000081062317,0.6875,0.6854999959468842,0.6990000009536743,0.6934999823570251],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[null,null,null,0.6184999942779541,null,0.6559999883174896,null,null,null,0.675000011920929,null,null,null,null,null],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[null,null,null,null,null,null,null,null,null,null,null,0.6899999976158142,null,null,null],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[null,null,null,null,null,null,null,null,null,null,null,null,null,null,null],"label":"Short lines filter"},"filtering-custom-c4-hynek-filters":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[null,0.5019999742507935,0.5759999752044678,0.6079999804496765,0.625,null,0.6574999988079071,null,null,null,null,null,null,null,null],"label":"Filters combined"}},"arc/acc":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2189999967813491,0.26950000226497645,0.3042500019073486,0.32200001180171967,0.3322499990463257,0.3317500054836273,0.3454999923706054,0.35325001180171967,0.35999999940395355,0.3557500094175339,0.36499999463558197,0.37025000154972076,0.3697499930858612,0.3684999942779541,0.3684999942779541],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2195000052452087,0.27124999463558197,0.29475000500679016,0.31700000166893,0.3344999998807907,0.33500000834465027,0.3374999910593033,0.35375000536441803,0.36374999582767487,0.3699999898672104,0.3697500079870224,0.3752500116825104,0.37749999761581415,0.38500000536441803,0.3812499940395355],"label":"Line duplicates 
filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2195000052452087,0.27674999833106995,0.30375000834465027,0.3227500021457672,0.3332500010728836,0.33949999511241913,0.3530000001192093,0.35475000739097595,0.357000008225441,0.36525000631809235,0.3669999986886978,0.3707500100135803,0.37150000035762787,0.3742499947547912,0.3774999976158142],"label":"Punctuation filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2192500010132789,0.2730000019073486,0.30249999463558197,0.32099999487400055,0.3365000039339065,0.3449999988079071,0.35324999690055847,0.35950000584125513,0.3610000014305115,0.36775000393390656,0.3645000010728836,0.3665000051259994,0.369499996304512,0.36775000393390656,0.375],"label":"Short lines filter"},"filtering-custom-c4-hynek-filters":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2195000052452087,0.2762499898672104,0.3017500042915344,0.3217500001192093,0.3374999910593033,0.3510000109672546,0.35199999809265137,0.3552500009536743,0.364750012755394,0.3739999979734421,0.37450000643730164,0.37699998915195465,0.3802500069141388,0.37475000321865076,0.38075000047683716],"label":"Filters 
combined"}},"arc/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2512499988079071,0.29224999248981476,0.3267499953508377,0.34375,0.34800000488758087,0.35224999487400055,0.3567499965429306,0.36450000107288355,0.369499996304512,0.3712500035762787,0.3722500056028366,0.37325000762939453,0.377250000834465,0.37624999880790705,0.3764999955892563],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.29474999010562897,0.3184999972581863,0.3392500132322311,0.35074999928474426,0.35300000011920923,0.35750000178813934,0.3684999942779541,0.3817500025033951,0.37800000607967377,0.38199999928474426,0.38600000739097595,0.38525000214576716,0.39000000059604645,0.38850000500679016],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.29100000858306885,0.31949999928474426,0.33675000071525574,0.34524999558925623,0.35850000381469727,0.3557499945163727,0.36124999821186066,0.3599999994039535,0.36800000071525574,0.36775000393390656,0.3770000040531158,0.37025000154972076,0.37424999475479126,0.37299999594688416],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.2892500013113022,0.3190000057220459,0.3385000079870224,0.3449999988079071,0.3495000004768371,0.36374999582767487,0.3604999929666519,0.36549998819828033,0.37074999511241913,0.37150000035762787,0.3722500056028366,0.37774999439716334,0.3774999976158142,0.37899999320507044],"label":"Short lines filter"},"filtering-custom-c4-hynek-filters":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.2952499985694885,0.32774999737739563,0.3400000035762787,0.35400000214576716,0.3605000078678131,0.3577500134706497,0.3610000014305115,0.36724999547004694,0.37199999392032623,0.3722499907016754,0.37174999713897705,0.37825000286102295,0.37825000286102295,0.3800000101327896],"label":"Filters 
combined"}},"mmlu/acc":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.230211392045021,0.24219277501106262,0.2469336539506912,0.2517240643501282,0.25671665370464325,0.2612752318382263,0.26097105443477625,0.2655869126319885,0.2683205902576446,0.27027665078639984,0.268289178609848,0.2714609056711197,0.2713967561721802,0.27323792874813074,0.27401906251907343],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2302158325910568,0.24256114661693567,0.24768169224262235,0.2507193014025688,0.2572980225086212,0.25885994732379913,0.26284952461719513,0.264521986246109,0.2700442671775818,0.27003195881843567,0.26974256336688995,0.27085191011428833,0.2683039605617523,0.26985205709934235,0.2728962004184723],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2302941530942917,0.242008663713932,0.25057119876146317,0.25422972440719604,0.2581385523080826,0.26281121373176575,0.2661672830581665,0.2682761400938034,0.26779243350028986,0.2691957354545593,0.27077533304691315,0.2710053026676178,0.2722867876291275,0.27343617379665375,0.2736435830593109],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.23012810945510864,0.2384799942374229,0.2488720491528511,0.2525842636823654,0.2563225924968719,0.25964072346687317,0.2621316760778427,0.26215170323848724,0.26426468789577484,0.2665379047393799,0.2679245918989181,0.26838323473930353,0.269764631986618,0.2705488502979278,0.27158279716968536],"label":"Short lines filter"},"filtering-custom-c4-hynek-filters":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2302941530942917,0.2428865954279899,0.24688749760389325,0.2509344741702079,0.2560944706201553,0.2591594159603119,0.26104307174682617,0.2641162872314453,0.2664932608604431,0.26827967166900635,0.27037402987480164,0.2704364061355591,0.2743852883577347,0.2725917398929596,0.2734202444553375],"label":"Filters 
combined"}},"mmlu/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501270473003387,0.25162875652313227,0.26033842563629145,0.26643975079059595,0.26930116117000574,0.27358523011207575,0.27403785288333893,0.2792918980121612,0.28113801777362823,0.2826349586248398,0.2856044620275497,0.2851170748472214,0.28488004207611084,0.2877976596355438,0.28672714531421656],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.25018398463726044,0.2544838488101959,0.2611646503210068,0.2652362138032913,0.2704761028289795,0.2737790495157242,0.276611790060997,0.2786822021007538,0.281442791223526,0.2816756069660187,0.2860289514064789,0.28624334931373596,0.2867202013731003,0.28732720017433167,0.28609761595726013],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.2563375234603882,0.26243858039379114,0.26873072981834406,0.27219884097576136,0.27462176978588104,0.27908372879028315,0.2813303619623184,0.28369809687137604,0.28319956362247467,0.28563097119331354,0.28614395856857294,0.28564512729644775,0.2862519174814224,0.2876724004745483],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.24996141344308848,0.25390757620334625,0.26540763676166534,0.27061584591865534,0.27150256931781763,0.2718626409769058,0.27449728548526764,0.2784059643745422,0.28175103664398193,0.28019529581069946,0.2827359586954117,0.2814059555530548,0.2844651788473129,0.28390273451805115,0.2838368713855743],"label":"Short lines filter"},"filtering-custom-c4-hynek-filters":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.25587423145771027,0.26312781870365143,0.26439163088798523,0.27070170640945435,0.2709464728832245,0.27815359830856323,0.2805800437927246,0.28173673152923584,0.28193922340869904,0.2856670469045639,0.28644809126853943,0.2880468964576721,0.28985339403152466,0.28967490792274475],"label":"Filters combined"}}},"layout":{"xaxis":{"title":{"text":"Training tokens (Billions)"}},"title":{"text":"Custom filters performance"}},"defaultWindowSize":3}
|
|
|
1 |
+
{"data":{"agg_score":{"filtering-baseline-2019-18-60gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308296035975218,0.35613923892378807,0.3746252153068781,0.38806260935962195,0.39690930768847466,0.4043668694794178,0.40220927633345127,0.41070565767586226,0.41399387270212173,0.4170555509626865,0.42098715901374817,0.4210818205028772,0.42051274701952934,0.424176013097167,0.4225243702530861],"label":"baseline"},"filtering-c4-curly_bracket":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308933284133672,0.3583905678242445,0.38119001872837543,0.3873079549521208,0.39723034016788,0.4043100867420435,0.40908974781632423,0.4140731003135443,0.41894380562007427,0.41736695170402527,0.4232212919741869,0.4229240976274013,0.4236308634281158,0.42750727012753487,0.4268195778131485],"label":"curly_bracket filter"},"filtering-c4-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308933284133672,0.36182260885834694,0.3764855917543173,0.3928546328097582,0.3978128544986248,0.4073755294084549,0.4112890623509884,0.41486112400889397,0.4196756165474653,0.4235504809767008,0.42218128964304924,0.4228535555303097,0.4249562546610832,0.42740595713257784,0.42711055465042586],"label":"terminal_punct 
filter"},"filtering-c4-word_lengths":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308933284133672,0.36000680737197394,0.37551611103117466,0.38802069239318365,0.3933942876756191,0.4043118376284838,0.40780537389218807,0.4112964067608118,0.4137573726475239,0.41791345551609993,0.4173779133707285,0.42117033526301384,0.42073468305170536,0.42412591539323324,0.4260616712272167],"label":"word_lengths filter"},"filtering-c4-all":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308933284133672,0.36066408455371857,0.3812380563467741,0.394003426656127,0.40062618628144264,0.4117735456675291,0.4165923688560724,0.4175422675907612,0.42100309208035464,0.42246321588754654,0.42360376194119453,0.42823668196797365,0.4299001637846231,0.4302353039383888,0.4310380257666111],"label":"All filters"},"filtering-c4-all-except-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.330924579873681,0.35825083684176207,0.37912008538842196,0.38942993618547916,0.3983491826802492,0.4053049590438604,0.4079726096242666,0.4135104585438967,0.41717425361275673,0.41904263757169247,0.4211529679596424,0.4212619122117758,0.42373160831630224,0.42435371689498425,0.4279126934707165],"label":"All filters except 
terminal_punct"},"sm-baseline-c4":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308933284133672,0.3580685469011466,0.3740996705989043,0.39048008372386295,0.39857714250683784,0.40837346265713376,0.4111154315372308,0.41773712386687595,0.4196594481666882,0.42379963273803395,0.4276047808428605,0.42980752388636273,0.43098293244838715,0.43155378103256226,0.4327609067161878],"label":"C4"}},"commonsense_qa/acc_norm":{"filtering-baseline-2019-18-60gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2584999948740005,0.2850000113248825,0.30850000679492945,0.30149999260902405,0.31049999594688416,0.3079999983310699,0.3150000125169754,0.32199999690055847,0.3244999945163727,0.3205000013113022,0.3244999945163727,0.3279999941587448,0.33149999380111694,0.32850000262260437],"label":"baseline"},"filtering-c4-curly_bracket":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2619999945163727,0.288000002503395,0.29749999940395355,0.30399999022483826,0.3149999976158142,0.3245000094175339,0.3230000138282776,0.3240000009536743,0.3245000094175339,0.33550000190734863,0.335999995470047,0.32999999821186066,0.3375000059604645,0.34049999713897705],"label":"curly_bracket 
filter"},"filtering-c4-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2650000005960464,0.28599999845027924,0.3110000044107437,0.2944999933242798,0.3085000067949295,0.32199999690055847,0.31949999928474426,0.3240000009536743,0.32500000298023224,0.3245000094175339,0.32199999690055847,0.3265000134706497,0.3295000046491623,0.32999999821186066],"label":"terminal_punct filter"},"filtering-c4-word_lengths":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.26349999010562897,0.2824999988079071,0.2985000014305115,0.3050000071525574,0.3119999915361404,0.3110000044107437,0.3164999932050705,0.32199999690055847,0.3279999941587448,0.3365000039339065,0.3375000059604645,0.3384999930858612,0.340499997138977,0.341499999165535],"label":"word_lengths filter"},"filtering-c4-all":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.26299999654293055,0.2864999920129776,0.2944999933242798,0.2985000014305115,0.3165000081062317,0.3194999992847442,0.318000003695488,0.32500000298023224,0.32899999618530273,0.3254999965429306,0.33150000870227814,0.3330000042915344,0.33200000226497645,0.3330000042915344],"label":"All 
filters"},"filtering-c4-all-except-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.25800000131130213,0.2849999964237213,0.29200001060962677,0.289000004529953,0.30349999666213984,0.30400000512599945,0.3139999955892563,0.3139999955892563,0.318000003695488,0.32299999892711634,0.3174999952316284,0.3215000033378601,0.32250000536441803,0.32549999654293055],"label":"All filters except terminal_punct"},"sm-baseline-c4":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.25700000921885174,0.2786666651566823,0.2960000038146972,0.3049999972184499,0.3053333262602488,0.3120000064373016,0.31733333071072894,0.3163333336512248,0.3186666667461395,0.3226666748523712,0.3286666671435038,0.3240000009536743,0.32900000611941016,0.3283333381017049],"label":"C4"}},"hellaswag/acc_norm":{"filtering-baseline-2019-18-60gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.28949999809265137,0.32599999010562897,0.34450000524520874,0.3725000023841858,0.38500000536441803,0.39499999582767487,0.408500000834465,0.41700001060962677,0.4174999892711639,0.4284999966621399,0.42849999666213984,0.43150000274181366,0.4399999976158142,0.4375],"label":"baseline"},"filtering-c4-curly_bracket":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2579999864101
41,0.29749999940395355,0.3240000009536743,0.34849999845027924,0.3725000023841858,0.3895000070333481,0.39800000190734863,0.41000001132488245,0.4214999973773956,0.42149999737739563,0.42499999701976776,0.42750000953674316,0.4364999979734421,0.4354999959468841,0.4385000020265579],"label":"curly_bracket filter"},"filtering-c4-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.2939999997615814,0.3295000046491623,0.3684999942779541,0.38449999690055847,0.398499995470047,0.3959999978542328,0.4204999953508377,0.4335000067949295,0.445499986410141,0.443000003695488,0.455499991774559,0.45250000059604645,0.4529999941587448,0.4545000046491623],"label":"terminal_punct filter"},"filtering-c4-word_lengths":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.29100000858306885,0.32400000095367426,0.3439999967813492,0.3575000017881393,0.3800000101327896,0.40049999952316284,0.4134999960660934,0.42099998891353607,0.4204999953508377,0.4280000030994415,0.44099999964237213,0.43799999356269836,0.44200000166893005,0.44600000977516174],"label":"word_lengths filter"},"filtering-c4-all":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.29950000345706934,0.33799999952316284,0.3789999932050705,0.3970000147819519,0.42149999737739563,0.431999996304512,0.4440000057220459,0.4490000009536743,0.45949999988079065,0.4714999943971634,0.48000000417232513,0.47749999165534973,0.48100000619888306,0.48950000107288355],"label":"All 
filters"},"filtering-c4-all-except-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.2955000102519989,0.3385000079870224,0.36800000071525574,0.40099999308586115,0.4099999964237213,0.41700001060962677,0.42400000989437103,0.4389999955892563,0.4414999932050705,0.4484999924898147,0.455499991774559,0.45799998939037323,0.4660000056028366,0.471000000834465],"label":"All filters except terminal_punct"},"sm-baseline-c4":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.29699999094009394,0.3369999925295512,0.3699999948342641,0.3930000066757202,0.41233333945274353,0.42733333508173627,0.43799999356269836,0.4506666660308838,0.454666664203008,0.47166667381922406,0.47766666611035663,0.476666659116745,0.48366666833559663,0.4853333334128062],"label":"C4"}},"openbookqa/acc_norm":{"filtering-baseline-2019-18-60gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.2559999972581863,0.27699999511241913,0.288000002503395,0.2980000078678131,0.31199999153614044,0.29500000178813934,0.3139999955892563,0.31199999153614044,0.31200000643730164,0.3369999974966049,0.32899999618530273,0.3200000077486038,0.3310000002384186,0.3330000042915344],"label":"baseline"},"filtering-c4-curly_bracket":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.286
0000133514404,0.25800000131130213,0.29899999499320984,0.27900001406669617,0.296999990940094,0.2980000078678131,0.3149999976158142,0.3179999887943268,0.32500000298023224,0.3079999983310699,0.32900001108646393,0.32599999010562897,0.3190000057220459,0.3279999941587448,0.3229999989271164],"label":"curly_bracket filter"},"filtering-c4-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.26900000870227814,0.27400000393390656,0.2929999977350235,0.29600000381469727,0.306999996304512,0.3199999928474426,0.3190000057220459,0.31299999356269836,0.3229999989271164,0.3210000097751617,0.3270000070333481,0.3230000138282776,0.33399999141693115,0.3260000050067901],"label":"terminal_punct filter"},"filtering-c4-word_lengths":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.2690000087022781,0.27300000190734863,0.28599999845027924,0.28299999237060547,0.3050000071525574,0.30900000035762787,0.31199999153614044,0.3200000077486038,0.33200000226497645,0.31200000643730164,0.3230000138282776,0.32299999892711634,0.32899999618530273,0.3320000022649765],"label":"word_lengths 
filter"},"filtering-c4-all":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.2590000033378601,0.278999999165535,0.2979999929666519,0.29899999499320984,0.3270000070333481,0.32800000905990595,0.32899999618530273,0.3369999974966049,0.33200000226497645,0.3260000050067901,0.33599999547004694,0.335999995470047,0.33500000834465027,0.3330000042915344],"label":"All filters"},"filtering-c4-all-except-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.2500000074505806,0.2759999930858612,0.2800000011920929,0.29099999368190765,0.3070000112056732,0.3070000112056732,0.3229999989271164,0.3240000009536743,0.31700000166893005,0.3100000023841858,0.31300000846385956,0.31700000166893005,0.3100000023841858,0.3189999908208847],"label":"All filters except 
terminal_punct"},"sm-baseline-c4":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.2526666720708211,0.26533332467079157,0.26600000262260437,0.29333333174387616,0.3059999942779541,0.30933333436648053,0.31600000460942584,0.31466667850812274,0.32933333516120905,0.3346666693687439,0.3366666634877522,0.3386666675408681,0.33799999952316284,0.33066666126251215],"label":"C4"}},"piqa/acc_norm":{"filtering-baseline-2019-18-60gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6105000078678131,0.6350000202655792,0.6620000004768372,0.675000011920929,0.6940000057220459,0.6974999904632568,0.7054999768733978,0.7060000002384186,0.7059999704360962,0.7084999978542328,0.7060000002384186,0.7084999978542328,0.7144999802112579,0.7134999930858612],"label":"baseline"},"filtering-c4-curly_bracket":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6149999797344208,0.6520000100135803,0.6789999902248383,0.69200000166893,0.6949999928474426,0.6955000162124634,0.7055000066757202,0.7150000035762787,0.7169999778270721,0.7184999883174896,0.7235000133514404,0.7240000069141388,0.723499983549118,0.7249999940395355],"label":"curly_bracket 
filter"},"filtering-c4-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.621999979019165,0.6549999713897705,0.6695000231266022,0.6860000193119049,0.6994999945163727,0.6980000138282776,0.7084999978542328,0.7120000123977661,0.7124999761581421,0.7160000205039978,0.7179999947547913,0.7195000052452087,0.7229999899864197,0.723499983549118],"label":"terminal_punct filter"},"filtering-c4-word_lengths":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6229999959468842,0.6590000092983246,0.6714999973773956,0.6820000112056732,0.6949999928474426,0.6940000057220459,0.7064999938011169,0.7005000114440918,0.6989999711513519,0.7084999978542328,0.7060000002384186,0.7099999785423279,0.7160000205039978,0.7150000035762787],"label":"word_lengths filter"},"filtering-c4-all":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6215000152587891,0.6580000221729279,0.6784999966621399,0.69200000166893,0.703499972820282,0.7029999792575836,0.710999995470047,0.7139999866485596,0.7179999947547913,0.7150000035762787,0.715499997138977,0.7184999883174896,0.7160000205039978,0.7224999964237213],"label":"All 
filters"},"filtering-c4-all-except-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.621999979019165,0.6520000100135803,0.6800000071525574,0.6895000040531158,0.6949999928474426,0.6990000009536743,0.7045000195503235,0.7114999890327454,0.710999995470047,0.7159999907016754,0.7199999988079071,0.7199999988079071,0.7204999923706055,0.7254999876022339],"label":"All filters except terminal_punct"},"sm-baseline-c4":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6196666558583578,0.6583333412806193,0.6833333373069763,0.6829999883969625,0.6983333230018616,0.702999989191691,0.7056666612625122,0.7076666553815206,0.7139999866485596,0.7209999958674113,0.7179999947547913,0.7273333470026652,0.7209999958674113,0.7273333271344503],"label":"C4"}},"siqa/acc_norm":{"filtering-baseline-2019-18-60gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.36149999499320984,0.39800000190734863,0.3970000147819519,0.4000000059604645,0.39799998700618744,0.408500000834465,0.39400000870227814,0.392999991774559,0.40450000762939453,0.4070000052452087,0.39950001239776606,0.3994999974966049,0.3949999958276748,0.398499995470047,0.3920000046491623],"label":"baseline"},"filtering-c4-curly_bracket":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3619999885559082,0.4
009999930858612,0.4024999886751175,0.3935000002384186,0.3904999941587448,0.4040000140666961,0.4035000056028366,0.40449999272823334,0.4079999923706054,0.40049999952316284,0.3985000103712082,0.39750000834465027,0.39799998700618744,0.3995000123977661,0.39699999988079065],"label":"curly_bracket filter"},"filtering-c4-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3619999885559082,0.4010000079870224,0.3974999934434891,0.39499999582767487,0.403999999165535,0.40299999713897705,0.4095000028610229,0.4074999988079071,0.4065000116825104,0.4074999988079071,0.4050000011920929,0.3999999910593033,0.40700000524520874,0.4050000011920929,0.40799999237060547],"label":"terminal_punct filter"},"filtering-c4-word_lengths":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3619999885559082,0.40150000154972076,0.39549998939037323,0.3969999998807907,0.3974999934434891,0.3959999978542328,0.39750000834465027,0.39549998939037323,0.3895000070333481,0.3994999974966049,0.3980000019073486,0.4000000059604645,0.39100000262260437,0.39250001311302185,0.39499999582767487],"label":"word_lengths filter"},"filtering-c4-all":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3619999885559082,0.3955000042915344,0.40049999952316284,0.3945000022649765,0.40700000524520874,0.4010000079870224,0.4025000035762787,0.39650000631809235,0.4004999995231628,0.4020000100135803,0.40150000154972076,0.40950000286102295,0.4080000072717666,0.40600000321865076,0.40750001370906824],"label":"All 
filters"},"filtering-c4-all-except-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3619999885559082,0.3994999974966049,0.392999991774559,0.40350000560283655,0.3939999938011169,0.39650000631809235,0.39450000226497645,0.4025000035762787,0.39900000393390656,0.3985000103712082,0.4030000120401382,0.3969999998807907,0.40150000154972076,0.40049999952316284,0.4025000035762787],"label":"All filters except terminal_punct"},"sm-baseline-c4":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3619999885559082,0.39633333683013916,0.3893333276112874,0.3933333357175191,0.39800000190734863,0.4013333320617676,0.4010000030199687,0.4059999982515971,0.41100000341733295,0.4063333372275035,0.40433333317438763,0.404666672150294,0.3993333379427592,0.4053333302338918,0.40800000230471295],"label":"C4"}},"winogrande/acc_norm":{"filtering-baseline-2019-18-60gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.48950000107288355,0.48950000107288355,0.5049999952316284,0.5125000178813934,0.5004999935626984,0.5065000057220459,0.5055000185966492,0.511000007390976,0.5160000026226044,0.5209999978542328,0.5270000100135803,0.5219999849796295,0.5149999856948853,0.5125000178813934],"label":"baseline"},"filtering-c4-curly_bracket":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970
000088214874,0.48350000381469727,0.5024999976158142,0.5039999932050705,0.5049999952316284,0.5115000009536743,0.50450000166893,0.5120000243186951,0.5144999921321869,0.5194999873638153,0.5250000059604645,0.5170000195503235,0.5180000066757202,0.527999997138977,0.5259999930858612],"label":"curly_bracket filter"},"filtering-c4-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.49050000309944153,0.48900000751018524,0.5080000162124634,0.50450000166893,0.5185000002384186,0.5175000131130219,0.5099999904632568,0.526500016450882,0.5320000052452087,0.5230000019073486,0.5105000138282776,0.5214999914169312,0.523499995470047,0.5264999866485596],"label":"terminal_punct filter"},"filtering-c4-word_lengths":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.49050000309944153,0.49000000953674316,0.4999999850988388,0.4989999830722809,0.5115000009536743,0.5105000138282776,0.5069999992847443,0.5109999775886536,0.5164999961853027,0.5059999823570251,0.5129999816417694,0.5059999823570251,0.5115000009536743,0.5164999961853027],"label":"word_lengths filter"},"filtering-c4-all":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.4884999990463257,0.4989999979734421,0.5064999908208847,0.49800001084804535,0.5040000081062317,0.5139999985694885,0.5160000026226044,0.5109999775886536,0.5070000141859055,0.5115000009536743,0.5105000138282776,0.5175000131130219,0.5200000107288361,0.5135000050067902],"label":"All 
filters"},"filtering-c4-all-except-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.49150000512599945,0.49900001287460327,0.49300000071525574,0.5015000104904175,0.5094999969005585,0.5109999775886536,0.5085000097751617,0.507500022649765,0.5205000042915344,0.5125000178813934,0.5160000026226044,0.5175000131130219,0.5150000154972076,0.5179999768733978],"label":"All filters except terminal_punct"},"sm-baseline-c4":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.4933333396911621,0.48733333746592206,0.5056666731834412,0.5066666503747305,0.5116666754086813,0.5076666871706644,0.5213333169619242,0.5150000055631002,0.5183333357175192,0.5169999996821085,0.515333334604899,0.5193333427111307,0.5143333276112875,0.5196666717529297],"label":"C4"}},"arc/acc_norm":{"filtering-baseline-2019-18-60gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.2905000001192093,0.32549999654293055,0.3307500034570694,0.3467499911785126,0.3500000089406967,0.3452499955892563,0.3622500002384185,0.35999999940395355,0.37024998664855957,0.3684999942779541,0.3675000071525574,0.37249998748302454,0.37675000727176666,0.3760000020265579],"label":"baseline"},"filtering-c4-curly_bracket":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.25099998712539
67,0.29250000417232513,0.3184999972581863,0.3297500014305115,0.34450000524520874,0.3512499928474426,0.35724999010562897,0.36375001072883606,0.3665000051259994,0.3684999942779541,0.3712499886751175,0.37375000119209284,0.37800000607967377,0.3840000033378601,0.37950000166893005],"label":"curly_bracket filter"},"filtering-c4-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.2947500050067901,0.31974999606609344,0.3344999998807907,0.3445000052452087,0.351500004529953,0.35199999809265137,0.35925000905990595,0.3634999990463257,0.36374999582767487,0.36550000309944153,0.36775000393390656,0.3677499890327453,0.36900000274181366,0.36650000512599945],"label":"terminal_punct filter"},"filtering-c4-word_lengths":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.28949999809265137,0.3187499940395355,0.33825001120567316,0.35074999928474426,0.3604999929666519,0.36274999380111694,0.3634999990463257,0.3645000010728836,0.3644999861717224,0.3669999986886978,0.3642500042915344,0.3722499907016754,0.37499999999999994,0.37549999356269836],"label":"word_lengths 
filter"},"filtering-c4-all":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.30024999380111694,0.32724998891353607,0.33374999463558197,0.34574998915195465,0.351749986410141,0.36124999821186066,0.3527500033378601,0.3582500070333481,0.35850000381469727,0.36075000464916224,0.364750012755394,0.37049999833106995,0.3729999959468841,0.36974999308586115],"label":"All filters"},"filtering-c4-all-except-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2512499988079071,0.294500008225441,0.32725000381469727,0.3352499902248382,0.3504999876022339,0.3487499952316284,0.3557500094175339,0.35324999690055847,0.36374999582767487,0.36474999785423273,0.372749999165535,0.36775000393390656,0.3707500100135803,0.3734999895095825,0.375],"label":"All filters except 
terminal_punct"},"sm-baseline-c4":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.2928333381811778,0.3191666702429453,0.3451666633288066,0.342166672150294,0.35983332991600037,0.35483332475026447,0.3643333315849304,0.3631666700045268,0.3698333303133647,0.3696666657924652,0.37433333198229474,0.3805000086625417,0.3800000051657359,0.3798333406448364],"label":"C4"}},"mmlu/acc_norm":{"filtering-baseline-2019-18-60gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.25013685226440424,0.25661391019821167,0.2620016932487488,0.2657508552074432,0.2710244506597519,0.2744349539279938,0.27642421424388885,0.2818952649831772,0.2794509679079056,0.2831944525241852,0.28439727425575256,0.2866545617580414,0.2866020053625107,0.28615814447402954,0.2871949374675751],"label":"baseline"},"filtering-c4-curly_bracket":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.25762456655502314,0.2630201578140259,0.2672136425971985,0.27234274148941034,0.2702306807041168,0.27446796000003815,0.27583475410938263,0.2770504504442215,0.2794356495141983,0.28302033245563507,0.28214274346828455,0.2855468988418579,0.2840581685304642,0.28505663573741913],"label":"curly_bracket 
filter"},"filtering-c4-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.2583308666944504,0.2611347585916519,0.26333703100681305,0.2685028165578842,0.2725042402744293,0.27531248331069946,0.27463899552822113,0.2784048914909363,0.27915388345718384,0.27945026755332947,0.28207844495773315,0.281900018453598,0.2822476774454117,0.28188446164131165],"label":"terminal_punct filter"},"filtering-c4-word_lengths":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.25205445289611816,0.2613788843154907,0.26891554892063135,0.2724043130874634,0.27449470758438105,0.27719296514987946,0.27587129175662994,0.2815589904785156,0.2833077013492584,0.2830233126878738,0.28461267054080963,0.2871275246143341,0.28650729358196253,0.2869933694601059],"label":"word_lengths filter"},"filtering-c4-all":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.25806266069412226,0.26165445148944855,0.26727744936943054,0.2677594721317291,0.2689383774995804,0.2724889665842056,0.27308812737464905,0.27327476441860193,0.27370570600032806,0.277080088853836,0.27814342081546783,0.2782013118267059,0.27888238430023193,0.2795541882514953],"label":"All 
filters"},"filtering-c4-all-except-terminal_punct":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.25500668585300446,0.26221066713333124,0.26368947327136993,0.2702934741973877,0.27218967676162714,0.27553085982799524,0.27833363413810724,0.2786440253257751,0.2810910940170288,0.2834737300872803,0.2833452969789505,0.2836028486490249,0.28682972490787506,0.2868015915155411],"label":"All filters except terminal_punct"},"sm-baseline-c4":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.2557150324185689,0.25763070583343506,0.2643406589825948,0.26745049158732087,0.2721543808778127,0.2737567722797394,0.2732303539911906,0.27877557277679443,0.27923040588696796,0.2798382341861725,0.2831268608570099,0.28203009565671283,0.2810969154040019,0.28292057911554974],"label":"C4"}}},"layout":{"xaxis":{"title":{"text":"Training tokens (Billions)"}},"title":{"text":"C4 filtering effect on HellaSwag"}},"defaultWindowSize":3,"defaultMetric":"hellaswag/acc_norm"}
|
data/plots/cross_ind_unfiltered_comparison.json
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
data/plots/custom-_ilters.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"data":{"agg_score":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308596294373274,0.35654734168201685,0.3758235517889261,0.38752372190356255,0.39841264486312866,0.4040419068187475,0.4097859803587198,0.41541148349642754,0.416892247274518,0.41986062191426754,0.4234193116426468,0.4218583852052688,0.4243287574499845,0.42519346065819263,0.42440339736640453],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308979943394661,0.35727922804653645,0.3758955802768469,0.39312327839434147,0.3984657619148493,0.4037223849445581,0.40907647646963596,0.41408527828752995,0.42114910110831255,0.42039695382118225,0.4248786196112633,0.42590542137622833,0.4263712782412767,0.42797840014100075,0.4277621991932392],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308933284133672,0.35735468938946724,0.3787423223257065,0.391122592613101,0.3976811040192842,0.4041402228176594,0.4110417179763317,0.4150725454092026,0.42221225984394545,0.4235249478369951,0.42567262239754194,0.42764298990368843,0.4280493911355734,0.42981273680925364,0.42845905013382435],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.33087017294019455,0.35839469730854034,0.379800958558917,0.3909519836306572,0.3985003251582384,0.4028578344732523,0.4080309104174375,0.411550747230649,0.4152813777327537,0.41849316097795963,0.42109199613332743,0.4223319999873638,0.42558939941227436,0.42717534117400646,0.426479609683156],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308933284133672,0.35955795273184776,0.3757704347372055,0.3934198468923569,0.398214865475893,0.4062729831784963,0.41363069601356983,0.41463132016360754,0.41851891577243805,0.4239445272833109,0.42439557053148746,0.4273625332862139,0.4289980959147215,0.4327357914298773,0.43017333932220936],"label":"Filters 
combined"}},"commonsense_qa/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2644999921321869,0.2850000113248825,0.2875000089406967,0.31049999594688416,0.3135000020265579,0.3279999941587448,0.32999999821186066,0.32349999248981476,0.3229999989271164,0.32350000739097595,0.32900001108646393,0.3264999985694885,0.3349999934434891,0.32999999821186066],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2644999921321869,0.2750000059604645,0.2989999949932098,0.2974999994039535,0.31599999964237213,0.3149999976158142,0.3199999928474426,0.3244999945163727,0.3269999921321869,0.33550000190734863,0.3275000005960464,0.33599999547004694,0.3349999934434891,0.33849999308586115],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.26349999010562897,0.28849999606609344,0.29600000381469727,0.30650000274181366,0.31900000572204584,0.3229999989271164,0.3150000125169754,0.3244999945163727,0.3310000002384186,0.3310000002384186,0.32999999821186066,0.3334999978542328,0.3344999998807907,0.32999999821186066],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2620000094175339,0.28949999809265137,0.2974999994039535,0.30550000071525574,0.30900000035762787,0.31200000643730164,0.3190000057220459,0.32999999821186066,0.3254999965429306,0.3344999998807907,0.3320000022649765,0.3374999910593033,0.3369999974966049,0.33949999511241913],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2649999856948852,0.2790000140666961,0.29649999737739563,0.3135000020265579,0.3164999932050705,0.32099999487400055,0.3210000097751617,0.3305000066757202,0.3205000013113022,0.32549999654293055,0.3295000046491623,0.33050000667572016,0.335999995470047,0.33200000226497645],"label":"Filters 
combined"}},"hellaswag/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.2900000065565109,0.3279999941587448,0.3550000041723251,0.375,0.38850000500679016,0.40350000560283655,0.41200000047683716,0.4194999933242798,0.42249999940395355,0.4329999983310699,0.43449999392032623,0.43700000643730164,0.4395000040531158,0.43950000405311584],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.28900000452995295,0.3310000002384186,0.3505000025033951,0.3790000081062317,0.39250001311302185,0.40549999475479126,0.4224999994039535,0.4284999966621399,0.43050000071525574,0.43799999356269836,0.4459999948740005,0.4495000094175339,0.4564999938011169,0.4529999941587448],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.29449999332427973,0.33550000190734863,0.34800000488758087,0.3764999955892563,0.3824999928474426,0.3955000042915344,0.41799999773502344,0.4270000010728836,0.43400000035762787,0.44450001418590546,0.45049999654293055,0.45450000464916224,0.45449998974800104,0.4550000131130218],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.3020000010728836,0.3310000002384186,0.357000008225441,0.37899999320507044,0.38850000500679016,0.3994999974966049,0.40349999070167536,0.4175000041723251,0.42400000989437103,0.4245000034570694,0.4335000067949295,0.4360000044107437,0.44750000536441803,0.44200000166893],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.29349999129772186,0.3210000097751617,0.36150000989437103,0.3734999895095825,0.39599999785423273,0.4125000089406967,0.4234999865293503,0.42749999463558197,0.44699999690055847,0.4549999982118606,0.4660000056028366,0.46600000560283655,0.47050000727176666,0.4675000011920929],"label":"Filters 
combined"}},"openbookqa/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.25099999457597727,0.2629999965429306,0.2810000032186508,0.2939999997615814,0.2900000065565109,0.3100000023841858,0.3129999935626983,0.3149999976158142,0.3229999989271164,0.3310000002384186,0.32100000977516174,0.32999999821186066,0.32100000977516174,0.32100000977516174],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.26900000870227814,0.2670000046491623,0.306999996304512,0.2939999997615814,0.2999999970197677,0.306999996304512,0.31200000643730164,0.31299999356269836,0.3200000077486038,0.3229999989271164,0.32099999487400055,0.32500000298023224,0.3240000009536743,0.3219999969005584],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.2559999972581863,0.2849999964237213,0.3110000044107437,0.2979999929666519,0.3009999990463257,0.318000003695488,0.3140000104904175,0.32899999618530273,0.32899999618530273,0.3369999974966049,0.33599999547004694,0.32900001108646393,0.3299999982118606,0.3330000042915344],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.25999999046325684,0.28200000524520874,0.28599999845027924,0.289000004529953,0.29999999701976776,0.31300000846385956,0.31900000572204584,0.3149999976158142,0.32099999487400055,0.3139999955892563,0.3190000057220459,0.32200001180171967,0.3229999989271164,0.3240000009536743],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.2719999998807907,0.277999997138977,0.3039999902248382,0.28199999034404755,0.30200000107288355,0.3050000071525574,0.31299999356269836,0.32099999487400055,0.3269999921321869,0.31599999964237213,0.3260000050067901,0.32600000500679016,0.3299999982118606,0.32500000298023224],"label":"Filters 
combined"}},"piqa/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.613999992609024,0.6504999995231628,0.6649999916553497,0.6870000064373016,0.6915000081062317,0.6974999904632568,0.7035000026226044,0.7129999995231628,0.7055000066757202,0.7080000042915344,0.7084999978542328,0.7114999890327454,0.714000016450882,0.7115000188350677],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.609499990940094,0.652999997138977,0.6744999885559082,0.68299999833107,0.6809999942779541,0.6965000033378601,0.6995000243186951,0.7145000100135803,0.7100000083446503,0.7105000019073486,0.7134999930858612,0.7159999907016754,0.7170000076293945,0.7199999988079071],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6155000030994415,0.648499995470047,0.6649999916553497,0.6865000128746033,0.690500020980835,0.6965000033378601,0.7029999792575836,0.7139999866485596,0.7105000019073486,0.7089999914169312,0.7139999866485596,0.7144999802112579,0.7229999899864197,0.7175000011920929],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6254999935626984,0.6530000269412994,0.6665000021457672,0.6860000193119049,0.6980000138282776,0.695499986410141,0.7084999978542328,0.7080000042915344,0.7064999938011169,0.7095000147819519,0.7129999995231628,0.7159999907016754,0.7179999947547913,0.718500018119812],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6274999976158142,0.656000018119812,0.6665000021457672,0.6854999959468842,0.6895000040531158,0.7035000026226044,0.7060000002384186,0.7100000083446503,0.7195000052452087,0.7159999907016754,0.715499997138977,0.7170000076293945,0.7274999916553497,0.7199999988079071],"label":"Filters 
combined"}},"siqa/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.36149999499320984,0.40049999952316284,0.3959999978542328,0.39800000190734863,0.39800000190734863,0.40299999713897705,0.3969999998807907,0.40549999475479126,0.4054999947547912,0.4155000001192093,0.41800001263618464,0.40949998795986176,0.41099999845027924,0.41200000047683716,0.4065000116825104],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3619999885559082,0.3894999921321869,0.3989999890327453,0.4060000032186508,0.40299999713897705,0.4055000096559524,0.4095000028610229,0.40450000762939453,0.40750001370906824,0.4074999988079071,0.408500000834465,0.41050000488758087,0.40450000762939453,0.40500000119209284,0.4035000056028366],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3619999885559082,0.3930000066757202,0.39750000834465027,0.40049999952316284,0.39849999547004694,0.40449999272823334,0.4054999947547912,0.4020000100135803,0.4115000069141388,0.40800000727176666,0.402999997138977,0.4074999988079071,0.40700000524520874,0.4060000032186508,0.40250000357627863],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3619999885559082,0.40350000560283655,0.403999999165535,0.4004999995231628,0.4010000079870224,0.39899998903274536,0.4015000015497207,0.39750000834465027,0.3969999998807907,0.4030000120401382,0.4055000096559524,0.4010000079870224,0.4020000100135803,0.40299999713897705,0.3990000039339065],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3619999885559082,0.3889999985694885,0.4040000140666961,0.4035000056028366,0.4050000011920929,0.3995000123977661,0.4064999967813492,0.4050000011920929,0.4025000035762787,0.4055000096559524,0.40799999237060547,0.4000000059604645,0.4025000035762787,0.403999999165535,0.40150000154972076],"label":"Filters 
combined"}},"winogrande/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.4884999990463257,0.49699999392032623,0.5035000145435333,0.5054999887943268,0.5200000107288361,0.5115000009536743,0.5154999792575836,0.507999986410141,0.515500009059906,0.5160000026226044,0.5139999985694885,0.5165000259876251,0.5160000026226044,0.523499995470047],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.48749999701976776,0.5024999976158142,0.5035000145435333,0.5099999904632568,0.5080000162124634,0.5050000250339508,0.5069999992847443,0.5180000066757202,0.5085000097751617,0.515500009059906,0.5165000259876251,0.5080000162124634,0.5090000033378601,0.5104999840259552],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.48899999260902405,0.49300000071525574,0.5030000060796738,0.49799999594688416,0.5024999976158142,0.5150000154972076,0.5259999930858612,0.527999997138977,0.5245000123977661,0.5275000035762787,0.5199999809265137,0.5300000011920929,0.5300000011920929,0.5289999842643738],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.471000000834465,0.494499996304512,0.511000007390976,0.511000007390976,0.5070000290870667,0.50450000166893,0.5060000121593475,0.5074999928474426,0.5169999897480011,0.5264999866485596,0.526500016450882,0.5290000140666962,0.5275000035762787,0.5259999930858612],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.4854999929666519,0.4860000014305115,0.5094999969005585,0.5090000033378601,0.5195000171661377,0.5185000002384186,0.5090000182390213,0.5084999799728394,0.5209999978542328,0.5164999961853027,0.5254999995231628,0.5250000059604645,0.5250000059604645,0.5254999995231628],"label":"Filters 
combined"}},"arc/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2512499988079071,0.29224999248981476,0.3267499953508377,0.34375,0.34800000488758087,0.35224999487400055,0.3567499965429306,0.36450000107288355,0.369499996304512,0.3712500035762787,0.3722500056028366,0.37325000762939453,0.377250000834465,0.37624999880790705,0.3764999955892563],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.29474999010562897,0.3184999972581863,0.3392500132322311,0.35074999928474426,0.35300000011920923,0.35750000178813934,0.3684999942779541,0.3817500025033951,0.37800000607967377,0.38199999928474426,0.38600000739097595,0.38525000214576716,0.39000000059604645,0.38850000500679016],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.29100000858306885,0.31949999928474426,0.33675000071525574,0.34524999558925623,0.35850000381469727,0.3557499945163727,0.36124999821186066,0.3599999994039535,0.36800000071525574,0.36775000393390656,0.3770000040531158,0.37025000154972076,0.37424999475479126,0.37299999594688416],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.2892500013113022,0.3190000057220459,0.3385000079870224,0.3449999988079071,0.3495000004768371,0.36374999582767487,0.3604999929666519,0.36549998819828033,0.37074999511241913,0.37150000035762787,0.3722500056028366,0.37774999439716334,0.3774999976158142,0.37899999320507044],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.2905000001192093,0.3199999928474426,0.3397499918937683,0.3467499911785126,0.3540000021457672,0.3662499934434891,0.36374999582767487,0.3647499978542328,0.3675000071525574,0.371749997138977,0.37074999511241913,0.375,0.3787499964237213,0.38099999725818634],"label":"Filters 
combined"}},"mmlu/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501270473003387,0.25162875652313227,0.26033842563629145,0.26643975079059595,0.26930116117000574,0.27358523011207575,0.27403785288333893,0.2792918980121612,0.28113801777362823,0.2826349586248398,0.2856044620275497,0.2851170748472214,0.28488004207611084,0.2877976596355438,0.28672714531421656],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.25018398463726044,0.2544838488101959,0.2611646503210068,0.2652362138032913,0.2704761028289795,0.2737790495157242,0.276611790060997,0.2786822021007538,0.281442791223526,0.2816756069660187,0.2860289514064789,0.28624334931373596,0.2867202013731003,0.28732720017433167,0.28609761595726013],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.2563375234603882,0.26243858039379114,0.26873072981834406,0.27219884097576136,0.27462176978588104,0.27908372879028315,0.2813303619623184,0.28369809687137604,0.28319956362247467,0.28563097119331354,0.28614395856857294,0.28564512729644775,0.2862519174814224,0.2876724004745483],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.24996141344308848,0.25390757620334625,0.26540763676166534,0.27061584591865534,0.27150256931781763,0.2718626409769058,0.27449728548526764,0.2784059643745422,0.28175103664398193,0.28019529581069946,0.2827359586954117,0.2814059555530548,0.2844651788473129,0.28390273451805115,0.2838368713855743],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.2534636557102203,0.2621634304523468,0.2661087810993194,0.2704689502716064,0.27318383753299713,0.2757955640554428,0.2758005559444427,0.28340134024620056,0.2835562080144882,0.28641459345817566,0.28565025329589844,0.28998473286628723,0.29013633728027344,0.2888867110013962],"label":"Filters combined"}}},"layout":{"xaxis":{"title":{"text":"Training tokens (Billions)"}},"title":{"text":"Custom filters performance"},"yaxis":{"range":[0.38,0.44]}},"defaultWindowSize":3}
|
data/plots/custom-filters.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"data":{"agg_score":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308596294373274,0.35654734168201685,0.3758235517889261,0.38752372190356255,0.39841264486312866,0.4040419068187475,0.4097859803587198,0.41541148349642754,0.416892247274518,0.41986062191426754,0.4234193116426468,0.4218583852052688,0.4243287574499845,0.42519346065819263,0.42440339736640453],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308979943394661,0.35727922804653645,0.3758955802768469,0.39312327839434147,0.3984657619148493,0.4037223849445581,0.40907647646963596,0.41408527828752995,0.42114910110831255,0.42039695382118225,0.4248786196112633,0.42590542137622833,0.4263712782412767,0.42797840014100075,0.4277621991932392],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308933284133672,0.35735468938946724,0.3787423223257065,0.391122592613101,0.3976811040192842,0.4041402228176594,0.4110417179763317,0.4150725454092026,0.42221225984394545,0.4235249478369951,0.42567262239754194,0.42764298990368843,0.4280493911355734,0.42981273680925364,0.42845905013382435],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.33087017294019455,0.35839469730854034,0.379800958558917,0.3909519836306572,0.3985003251582384,0.4028578344732523,0.4080309104174375,0.411550747230649,0.4152813777327537,0.41849316097795963,0.42109199613332743,0.4223319999873638,0.42558939941227436,0.42717534117400646,0.426479609683156],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308933284133672,0.35955795273184776,0.3757704347372055,0.3934198468923569,0.398214865475893,0.4062729831784963,0.41363069601356983,0.41463132016360754,0.41851891577243805,0.4239445272833109,0.42439557053148746,0.4273625332862139,0.4289980959147215,0.4327357914298773,0.43017333932220936],"label":"Filters 
combined"}},"commonsense_qa/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2644999921321869,0.2850000113248825,0.2875000089406967,0.31049999594688416,0.3135000020265579,0.3279999941587448,0.32999999821186066,0.32349999248981476,0.3229999989271164,0.32350000739097595,0.32900001108646393,0.3264999985694885,0.3349999934434891,0.32999999821186066],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2644999921321869,0.2750000059604645,0.2989999949932098,0.2974999994039535,0.31599999964237213,0.3149999976158142,0.3199999928474426,0.3244999945163727,0.3269999921321869,0.33550000190734863,0.3275000005960464,0.33599999547004694,0.3349999934434891,0.33849999308586115],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.26349999010562897,0.28849999606609344,0.29600000381469727,0.30650000274181366,0.31900000572204584,0.3229999989271164,0.3150000125169754,0.3244999945163727,0.3310000002384186,0.3310000002384186,0.32999999821186066,0.3334999978542328,0.3344999998807907,0.32999999821186066],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2620000094175339,0.28949999809265137,0.2974999994039535,0.30550000071525574,0.30900000035762787,0.31200000643730164,0.3190000057220459,0.32999999821186066,0.3254999965429306,0.3344999998807907,0.3320000022649765,0.3374999910593033,0.3369999974966049,0.33949999511241913],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2649999856948852,0.2790000140666961,0.29649999737739563,0.3135000020265579,0.3164999932050705,0.32099999487400055,0.3210000097751617,0.3305000066757202,0.3205000013113022,0.32549999654293055,0.3295000046491623,0.33050000667572016,0.335999995470047,0.33200000226497645],"label":"Filters 
combined"}},"hellaswag/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.2900000065565109,0.3279999941587448,0.3550000041723251,0.375,0.38850000500679016,0.40350000560283655,0.41200000047683716,0.4194999933242798,0.42249999940395355,0.4329999983310699,0.43449999392032623,0.43700000643730164,0.4395000040531158,0.43950000405311584],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.28900000452995295,0.3310000002384186,0.3505000025033951,0.3790000081062317,0.39250001311302185,0.40549999475479126,0.4224999994039535,0.4284999966621399,0.43050000071525574,0.43799999356269836,0.4459999948740005,0.4495000094175339,0.4564999938011169,0.4529999941587448],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.29449999332427973,0.33550000190734863,0.34800000488758087,0.3764999955892563,0.3824999928474426,0.3955000042915344,0.41799999773502344,0.4270000010728836,0.43400000035762787,0.44450001418590546,0.45049999654293055,0.45450000464916224,0.45449998974800104,0.4550000131130218],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.3020000010728836,0.3310000002384186,0.357000008225441,0.37899999320507044,0.38850000500679016,0.3994999974966049,0.40349999070167536,0.4175000041723251,0.42400000989437103,0.4245000034570694,0.4335000067949295,0.4360000044107437,0.44750000536441803,0.44200000166893],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.29349999129772186,0.3210000097751617,0.36150000989437103,0.3734999895095825,0.39599999785423273,0.4125000089406967,0.4234999865293503,0.42749999463558197,0.44699999690055847,0.4549999982118606,0.4660000056028366,0.46600000560283655,0.47050000727176666,0.4675000011920929],"label":"Filters 
combined"}},"openbookqa/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.25099999457597727,0.2629999965429306,0.2810000032186508,0.2939999997615814,0.2900000065565109,0.3100000023841858,0.3129999935626983,0.3149999976158142,0.3229999989271164,0.3310000002384186,0.32100000977516174,0.32999999821186066,0.32100000977516174,0.32100000977516174],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.26900000870227814,0.2670000046491623,0.306999996304512,0.2939999997615814,0.2999999970197677,0.306999996304512,0.31200000643730164,0.31299999356269836,0.3200000077486038,0.3229999989271164,0.32099999487400055,0.32500000298023224,0.3240000009536743,0.3219999969005584],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.2559999972581863,0.2849999964237213,0.3110000044107437,0.2979999929666519,0.3009999990463257,0.318000003695488,0.3140000104904175,0.32899999618530273,0.32899999618530273,0.3369999974966049,0.33599999547004694,0.32900001108646393,0.3299999982118606,0.3330000042915344],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.25999999046325684,0.28200000524520874,0.28599999845027924,0.289000004529953,0.29999999701976776,0.31300000846385956,0.31900000572204584,0.3149999976158142,0.32099999487400055,0.3139999955892563,0.3190000057220459,0.32200001180171967,0.3229999989271164,0.3240000009536743],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.2719999998807907,0.277999997138977,0.3039999902248382,0.28199999034404755,0.30200000107288355,0.3050000071525574,0.31299999356269836,0.32099999487400055,0.3269999921321869,0.31599999964237213,0.3260000050067901,0.32600000500679016,0.3299999982118606,0.32500000298023224],"label":"Filters 
combined"}},"piqa/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.613999992609024,0.6504999995231628,0.6649999916553497,0.6870000064373016,0.6915000081062317,0.6974999904632568,0.7035000026226044,0.7129999995231628,0.7055000066757202,0.7080000042915344,0.7084999978542328,0.7114999890327454,0.714000016450882,0.7115000188350677],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.609499990940094,0.652999997138977,0.6744999885559082,0.68299999833107,0.6809999942779541,0.6965000033378601,0.6995000243186951,0.7145000100135803,0.7100000083446503,0.7105000019073486,0.7134999930858612,0.7159999907016754,0.7170000076293945,0.7199999988079071],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6155000030994415,0.648499995470047,0.6649999916553497,0.6865000128746033,0.690500020980835,0.6965000033378601,0.7029999792575836,0.7139999866485596,0.7105000019073486,0.7089999914169312,0.7139999866485596,0.7144999802112579,0.7229999899864197,0.7175000011920929],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6254999935626984,0.6530000269412994,0.6665000021457672,0.6860000193119049,0.6980000138282776,0.695499986410141,0.7084999978542328,0.7080000042915344,0.7064999938011169,0.7095000147819519,0.7129999995231628,0.7159999907016754,0.7179999947547913,0.718500018119812],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6274999976158142,0.656000018119812,0.6665000021457672,0.6854999959468842,0.6895000040531158,0.7035000026226044,0.7060000002384186,0.7100000083446503,0.7195000052452087,0.7159999907016754,0.715499997138977,0.7170000076293945,0.7274999916553497,0.7199999988079071],"label":"Filters 
combined"}},"siqa/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.36149999499320984,0.40049999952316284,0.3959999978542328,0.39800000190734863,0.39800000190734863,0.40299999713897705,0.3969999998807907,0.40549999475479126,0.4054999947547912,0.4155000001192093,0.41800001263618464,0.40949998795986176,0.41099999845027924,0.41200000047683716,0.4065000116825104],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3619999885559082,0.3894999921321869,0.3989999890327453,0.4060000032186508,0.40299999713897705,0.4055000096559524,0.4095000028610229,0.40450000762939453,0.40750001370906824,0.4074999988079071,0.408500000834465,0.41050000488758087,0.40450000762939453,0.40500000119209284,0.4035000056028366],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3619999885559082,0.3930000066757202,0.39750000834465027,0.40049999952316284,0.39849999547004694,0.40449999272823334,0.4054999947547912,0.4020000100135803,0.4115000069141388,0.40800000727176666,0.402999997138977,0.4074999988079071,0.40700000524520874,0.4060000032186508,0.40250000357627863],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3619999885559082,0.40350000560283655,0.403999999165535,0.4004999995231628,0.4010000079870224,0.39899998903274536,0.4015000015497207,0.39750000834465027,0.3969999998807907,0.4030000120401382,0.4055000096559524,0.4010000079870224,0.4020000100135803,0.40299999713897705,0.3990000039339065],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3619999885559082,0.3889999985694885,0.4040000140666961,0.4035000056028366,0.4050000011920929,0.3995000123977661,0.4064999967813492,0.4050000011920929,0.4025000035762787,0.4055000096559524,0.40799999237060547,0.4000000059604645,0.4025000035762787,0.403999999165535,0.40150000154972076],"label":"Filters 
combined"}},"winogrande/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.4884999990463257,0.49699999392032623,0.5035000145435333,0.5054999887943268,0.5200000107288361,0.5115000009536743,0.5154999792575836,0.507999986410141,0.515500009059906,0.5160000026226044,0.5139999985694885,0.5165000259876251,0.5160000026226044,0.523499995470047],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.48749999701976776,0.5024999976158142,0.5035000145435333,0.5099999904632568,0.5080000162124634,0.5050000250339508,0.5069999992847443,0.5180000066757202,0.5085000097751617,0.515500009059906,0.5165000259876251,0.5080000162124634,0.5090000033378601,0.5104999840259552],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.48899999260902405,0.49300000071525574,0.5030000060796738,0.49799999594688416,0.5024999976158142,0.5150000154972076,0.5259999930858612,0.527999997138977,0.5245000123977661,0.5275000035762787,0.5199999809265137,0.5300000011920929,0.5300000011920929,0.5289999842643738],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.471000000834465,0.494499996304512,0.511000007390976,0.511000007390976,0.5070000290870667,0.50450000166893,0.5060000121593475,0.5074999928474426,0.5169999897480011,0.5264999866485596,0.526500016450882,0.5290000140666962,0.5275000035762787,0.5259999930858612],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.4854999929666519,0.4860000014305115,0.5094999969005585,0.5090000033378601,0.5195000171661377,0.5185000002384186,0.5090000182390213,0.5084999799728394,0.5209999978542328,0.5164999961853027,0.5254999995231628,0.5250000059604645,0.5250000059604645,0.5254999995231628],"label":"Filters 
combined"}},"arc/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2512499988079071,0.29224999248981476,0.3267499953508377,0.34375,0.34800000488758087,0.35224999487400055,0.3567499965429306,0.36450000107288355,0.369499996304512,0.3712500035762787,0.3722500056028366,0.37325000762939453,0.377250000834465,0.37624999880790705,0.3764999955892563],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.29474999010562897,0.3184999972581863,0.3392500132322311,0.35074999928474426,0.35300000011920923,0.35750000178813934,0.3684999942779541,0.3817500025033951,0.37800000607967377,0.38199999928474426,0.38600000739097595,0.38525000214576716,0.39000000059604645,0.38850000500679016],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.29100000858306885,0.31949999928474426,0.33675000071525574,0.34524999558925623,0.35850000381469727,0.3557499945163727,0.36124999821186066,0.3599999994039535,0.36800000071525574,0.36775000393390656,0.3770000040531158,0.37025000154972076,0.37424999475479126,0.37299999594688416],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.2892500013113022,0.3190000057220459,0.3385000079870224,0.3449999988079071,0.3495000004768371,0.36374999582767487,0.3604999929666519,0.36549998819828033,0.37074999511241913,0.37150000035762787,0.3722500056028366,0.37774999439716334,0.3774999976158142,0.37899999320507044],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.2905000001192093,0.3199999928474426,0.3397499918937683,0.3467499911785126,0.3540000021457672,0.3662499934434891,0.36374999582767487,0.3647499978542328,0.3675000071525574,0.371749997138977,0.37074999511241913,0.375,0.3787499964237213,0.38099999725818634],"label":"Filters 
combined"}},"mmlu/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501270473003387,0.25162875652313227,0.26033842563629145,0.26643975079059595,0.26930116117000574,0.27358523011207575,0.27403785288333893,0.2792918980121612,0.28113801777362823,0.2826349586248398,0.2856044620275497,0.2851170748472214,0.28488004207611084,0.2877976596355438,0.28672714531421656],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.25018398463726044,0.2544838488101959,0.2611646503210068,0.2652362138032913,0.2704761028289795,0.2737790495157242,0.276611790060997,0.2786822021007538,0.281442791223526,0.2816756069660187,0.2860289514064789,0.28624334931373596,0.2867202013731003,0.28732720017433167,0.28609761595726013],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.2563375234603882,0.26243858039379114,0.26873072981834406,0.27219884097576136,0.27462176978588104,0.27908372879028315,0.2813303619623184,0.28369809687137604,0.28319956362247467,0.28563097119331354,0.28614395856857294,0.28564512729644775,0.2862519174814224,0.2876724004745483],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.24996141344308848,0.25390757620334625,0.26540763676166534,0.27061584591865534,0.27150256931781763,0.2718626409769058,0.27449728548526764,0.2784059643745422,0.28175103664398193,0.28019529581069946,0.2827359586954117,0.2814059555530548,0.2844651788473129,0.28390273451805115,0.2838368713855743],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.2534636557102203,0.2621634304523468,0.2661087810993194,0.2704689502716064,0.27318383753299713,0.2757955640554428,0.2758005559444427,0.28340134024620056,0.2835562080144882,0.28641459345817566,0.28565025329589844,0.28998473286628723,0.29013633728027344,0.2888867110013962],"label":"Filters combined"}}},"layout":{"xaxis":{"title":{"text":"Training tokens (Billions)"}},"title":{"text":"Custom filters performance"},"yaxis":{"range":[0.38,0.44]}},"defaultWindowSize":3}
|
data/plots/custom_filters.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"data":{"agg_score":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308596294373274,0.35654734168201685,0.3758235517889261,0.38752372190356255,0.39841264486312866,0.4040419068187475,0.4097859803587198,0.41541148349642754,0.416892247274518,0.41986062191426754,0.4234193116426468,0.4218583852052688,0.4243287574499845,0.42519346065819263,0.42440339736640453],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308979943394661,0.35727922804653645,0.3758955802768469,0.39312327839434147,0.3984657619148493,0.4037223849445581,0.40907647646963596,0.41408527828752995,0.42114910110831255,0.42039695382118225,0.4248786196112633,0.42590542137622833,0.4263712782412767,0.42797840014100075,0.4277621991932392],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308933284133672,0.35735468938946724,0.3787423223257065,0.391122592613101,0.3976811040192842,0.4041402228176594,0.4110417179763317,0.4150725454092026,0.42221225984394545,0.4235249478369951,0.42567262239754194,0.42764298990368843,0.4280493911355734,0.42981273680925364,0.42845905013382435],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.33087017294019455,0.35839469730854034,0.379800958558917,0.3909519836306572,0.3985003251582384,0.4028578344732523,0.4080309104174375,0.411550747230649,0.4152813777327537,0.41849316097795963,0.42109199613332743,0.4223319999873638,0.42558939941227436,0.42717534117400646,0.426479609683156],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308933284133672,0.35955795273184776,0.3757704347372055,0.3934198468923569,0.398214865475893,0.4062729831784963,0.41363069601356983,0.41463132016360754,0.41851891577243805,0.4239445272833109,0.42439557053148746,0.4273625332862139,0.4289980959147215,0.4327357914298773,0.43017333932220936],"label":"Filters 
combined"}},"commonsense_qa/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2644999921321869,0.2850000113248825,0.2875000089406967,0.31049999594688416,0.3135000020265579,0.3279999941587448,0.32999999821186066,0.32349999248981476,0.3229999989271164,0.32350000739097595,0.32900001108646393,0.3264999985694885,0.3349999934434891,0.32999999821186066],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2644999921321869,0.2750000059604645,0.2989999949932098,0.2974999994039535,0.31599999964237213,0.3149999976158142,0.3199999928474426,0.3244999945163727,0.3269999921321869,0.33550000190734863,0.3275000005960464,0.33599999547004694,0.3349999934434891,0.33849999308586115],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.26349999010562897,0.28849999606609344,0.29600000381469727,0.30650000274181366,0.31900000572204584,0.3229999989271164,0.3150000125169754,0.3244999945163727,0.3310000002384186,0.3310000002384186,0.32999999821186066,0.3334999978542328,0.3344999998807907,0.32999999821186066],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2620000094175339,0.28949999809265137,0.2974999994039535,0.30550000071525574,0.30900000035762787,0.31200000643730164,0.3190000057220459,0.32999999821186066,0.3254999965429306,0.3344999998807907,0.3320000022649765,0.3374999910593033,0.3369999974966049,0.33949999511241913],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2649999856948852,0.2790000140666961,0.29649999737739563,0.3135000020265579,0.3164999932050705,0.32099999487400055,0.3210000097751617,0.3305000066757202,0.3205000013113022,0.32549999654293055,0.3295000046491623,0.33050000667572016,0.335999995470047,0.33200000226497645],"label":"Filters 
combined"}},"hellaswag/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.2900000065565109,0.3279999941587448,0.3550000041723251,0.375,0.38850000500679016,0.40350000560283655,0.41200000047683716,0.4194999933242798,0.42249999940395355,0.4329999983310699,0.43449999392032623,0.43700000643730164,0.4395000040531158,0.43950000405311584],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.28900000452995295,0.3310000002384186,0.3505000025033951,0.3790000081062317,0.39250001311302185,0.40549999475479126,0.4224999994039535,0.4284999966621399,0.43050000071525574,0.43799999356269836,0.4459999948740005,0.4495000094175339,0.4564999938011169,0.4529999941587448],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.29449999332427973,0.33550000190734863,0.34800000488758087,0.3764999955892563,0.3824999928474426,0.3955000042915344,0.41799999773502344,0.4270000010728836,0.43400000035762787,0.44450001418590546,0.45049999654293055,0.45450000464916224,0.45449998974800104,0.4550000131130218],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.3020000010728836,0.3310000002384186,0.357000008225441,0.37899999320507044,0.38850000500679016,0.3994999974966049,0.40349999070167536,0.4175000041723251,0.42400000989437103,0.4245000034570694,0.4335000067949295,0.4360000044107437,0.44750000536441803,0.44200000166893],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.29349999129772186,0.3210000097751617,0.36150000989437103,0.3734999895095825,0.39599999785423273,0.4125000089406967,0.4234999865293503,0.42749999463558197,0.44699999690055847,0.4549999982118606,0.4660000056028366,0.46600000560283655,0.47050000727176666,0.4675000011920929],"label":"Filters 
combined"}},"openbookqa/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.25099999457597727,0.2629999965429306,0.2810000032186508,0.2939999997615814,0.2900000065565109,0.3100000023841858,0.3129999935626983,0.3149999976158142,0.3229999989271164,0.3310000002384186,0.32100000977516174,0.32999999821186066,0.32100000977516174,0.32100000977516174],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.26900000870227814,0.2670000046491623,0.306999996304512,0.2939999997615814,0.2999999970197677,0.306999996304512,0.31200000643730164,0.31299999356269836,0.3200000077486038,0.3229999989271164,0.32099999487400055,0.32500000298023224,0.3240000009536743,0.3219999969005584],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.2559999972581863,0.2849999964237213,0.3110000044107437,0.2979999929666519,0.3009999990463257,0.318000003695488,0.3140000104904175,0.32899999618530273,0.32899999618530273,0.3369999974966049,0.33599999547004694,0.32900001108646393,0.3299999982118606,0.3330000042915344],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.25999999046325684,0.28200000524520874,0.28599999845027924,0.289000004529953,0.29999999701976776,0.31300000846385956,0.31900000572204584,0.3149999976158142,0.32099999487400055,0.3139999955892563,0.3190000057220459,0.32200001180171967,0.3229999989271164,0.3240000009536743],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.2719999998807907,0.277999997138977,0.3039999902248382,0.28199999034404755,0.30200000107288355,0.3050000071525574,0.31299999356269836,0.32099999487400055,0.3269999921321869,0.31599999964237213,0.3260000050067901,0.32600000500679016,0.3299999982118606,0.32500000298023224],"label":"Filters 
combined"}},"piqa/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.613999992609024,0.6504999995231628,0.6649999916553497,0.6870000064373016,0.6915000081062317,0.6974999904632568,0.7035000026226044,0.7129999995231628,0.7055000066757202,0.7080000042915344,0.7084999978542328,0.7114999890327454,0.714000016450882,0.7115000188350677],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.609499990940094,0.652999997138977,0.6744999885559082,0.68299999833107,0.6809999942779541,0.6965000033378601,0.6995000243186951,0.7145000100135803,0.7100000083446503,0.7105000019073486,0.7134999930858612,0.7159999907016754,0.7170000076293945,0.7199999988079071],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6155000030994415,0.648499995470047,0.6649999916553497,0.6865000128746033,0.690500020980835,0.6965000033378601,0.7029999792575836,0.7139999866485596,0.7105000019073486,0.7089999914169312,0.7139999866485596,0.7144999802112579,0.7229999899864197,0.7175000011920929],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6254999935626984,0.6530000269412994,0.6665000021457672,0.6860000193119049,0.6980000138282776,0.695499986410141,0.7084999978542328,0.7080000042915344,0.7064999938011169,0.7095000147819519,0.7129999995231628,0.7159999907016754,0.7179999947547913,0.718500018119812],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6274999976158142,0.656000018119812,0.6665000021457672,0.6854999959468842,0.6895000040531158,0.7035000026226044,0.7060000002384186,0.7100000083446503,0.7195000052452087,0.7159999907016754,0.715499997138977,0.7170000076293945,0.7274999916553497,0.7199999988079071],"label":"Filters 
combined"}},"siqa/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.36149999499320984,0.40049999952316284,0.3959999978542328,0.39800000190734863,0.39800000190734863,0.40299999713897705,0.3969999998807907,0.40549999475479126,0.4054999947547912,0.4155000001192093,0.41800001263618464,0.40949998795986176,0.41099999845027924,0.41200000047683716,0.4065000116825104],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3619999885559082,0.3894999921321869,0.3989999890327453,0.4060000032186508,0.40299999713897705,0.4055000096559524,0.4095000028610229,0.40450000762939453,0.40750001370906824,0.4074999988079071,0.408500000834465,0.41050000488758087,0.40450000762939453,0.40500000119209284,0.4035000056028366],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3619999885559082,0.3930000066757202,0.39750000834465027,0.40049999952316284,0.39849999547004694,0.40449999272823334,0.4054999947547912,0.4020000100135803,0.4115000069141388,0.40800000727176666,0.402999997138977,0.4074999988079071,0.40700000524520874,0.4060000032186508,0.40250000357627863],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3619999885559082,0.40350000560283655,0.403999999165535,0.4004999995231628,0.4010000079870224,0.39899998903274536,0.4015000015497207,0.39750000834465027,0.3969999998807907,0.4030000120401382,0.4055000096559524,0.4010000079870224,0.4020000100135803,0.40299999713897705,0.3990000039339065],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3619999885559082,0.3889999985694885,0.4040000140666961,0.4035000056028366,0.4050000011920929,0.3995000123977661,0.4064999967813492,0.4050000011920929,0.4025000035762787,0.4055000096559524,0.40799999237060547,0.4000000059604645,0.4025000035762787,0.403999999165535,0.40150000154972076],"label":"Filters 
combined"}},"winogrande/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.4884999990463257,0.49699999392032623,0.5035000145435333,0.5054999887943268,0.5200000107288361,0.5115000009536743,0.5154999792575836,0.507999986410141,0.515500009059906,0.5160000026226044,0.5139999985694885,0.5165000259876251,0.5160000026226044,0.523499995470047],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.48749999701976776,0.5024999976158142,0.5035000145435333,0.5099999904632568,0.5080000162124634,0.5050000250339508,0.5069999992847443,0.5180000066757202,0.5085000097751617,0.515500009059906,0.5165000259876251,0.5080000162124634,0.5090000033378601,0.5104999840259552],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.48899999260902405,0.49300000071525574,0.5030000060796738,0.49799999594688416,0.5024999976158142,0.5150000154972076,0.5259999930858612,0.527999997138977,0.5245000123977661,0.5275000035762787,0.5199999809265137,0.5300000011920929,0.5300000011920929,0.5289999842643738],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.471000000834465,0.494499996304512,0.511000007390976,0.511000007390976,0.5070000290870667,0.50450000166893,0.5060000121593475,0.5074999928474426,0.5169999897480011,0.5264999866485596,0.526500016450882,0.5290000140666962,0.5275000035762787,0.5259999930858612],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.4854999929666519,0.4860000014305115,0.5094999969005585,0.5090000033378601,0.5195000171661377,0.5185000002384186,0.5090000182390213,0.5084999799728394,0.5209999978542328,0.5164999961853027,0.5254999995231628,0.5250000059604645,0.5250000059604645,0.5254999995231628],"label":"Filters 
combined"}},"arc/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2512499988079071,0.29224999248981476,0.3267499953508377,0.34375,0.34800000488758087,0.35224999487400055,0.3567499965429306,0.36450000107288355,0.369499996304512,0.3712500035762787,0.3722500056028366,0.37325000762939453,0.377250000834465,0.37624999880790705,0.3764999955892563],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.29474999010562897,0.3184999972581863,0.3392500132322311,0.35074999928474426,0.35300000011920923,0.35750000178813934,0.3684999942779541,0.3817500025033951,0.37800000607967377,0.38199999928474426,0.38600000739097595,0.38525000214576716,0.39000000059604645,0.38850000500679016],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.29100000858306885,0.31949999928474426,0.33675000071525574,0.34524999558925623,0.35850000381469727,0.3557499945163727,0.36124999821186066,0.3599999994039535,0.36800000071525574,0.36775000393390656,0.3770000040531158,0.37025000154972076,0.37424999475479126,0.37299999594688416],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.2892500013113022,0.3190000057220459,0.3385000079870224,0.3449999988079071,0.3495000004768371,0.36374999582767487,0.3604999929666519,0.36549998819828033,0.37074999511241913,0.37150000035762787,0.3722500056028366,0.37774999439716334,0.3774999976158142,0.37899999320507044],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.2905000001192093,0.3199999928474426,0.3397499918937683,0.3467499911785126,0.3540000021457672,0.3662499934434891,0.36374999582767487,0.3647499978542328,0.3675000071525574,0.371749997138977,0.37074999511241913,0.375,0.3787499964237213,0.38099999725818634],"label":"Filters 
combined"}},"mmlu/acc_norm":{"filtering-baseline-2019-18-40gt":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501270473003387,0.25162875652313227,0.26033842563629145,0.26643975079059595,0.26930116117000574,0.27358523011207575,0.27403785288333893,0.2792918980121612,0.28113801777362823,0.2826349586248398,0.2856044620275497,0.2851170748472214,0.28488004207611084,0.2877976596355438,0.28672714531421656],"label":"Baseline"},"filtering-custom-line-char-duplicated-v2-0.01":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.25018398463726044,0.2544838488101959,0.2611646503210068,0.2652362138032913,0.2704761028289795,0.2737790495157242,0.276611790060997,0.2786822021007538,0.281442791223526,0.2816756069660187,0.2860289514064789,0.28624334931373596,0.2867202013731003,0.28732720017433167,0.28609761595726013],"label":"Line duplicates filter"},"filtering-custom-lines-punc-0.12":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.2563375234603882,0.26243858039379114,0.26873072981834406,0.27219884097576136,0.27462176978588104,0.27908372879028315,0.2813303619623184,0.28369809687137604,0.28319956362247467,0.28563097119331354,0.28614395856857294,0.28564512729644775,0.2862519174814224,0.2876724004745483],"label":"Punctuation 
filter"},"filtering-custom-short-line-ratio-0.67":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.24996141344308848,0.25390757620334625,0.26540763676166534,0.27061584591865534,0.27150256931781763,0.2718626409769058,0.27449728548526764,0.2784059643745422,0.28175103664398193,0.28019529581069946,0.2827359586954117,0.2814059555530548,0.2844651788473129,0.28390273451805115,0.2838368713855743],"label":"Short lines filter"},"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.2534636557102203,0.2621634304523468,0.2661087810993194,0.2704689502716064,0.27318383753299713,0.2757955640554428,0.2758005559444427,0.28340134024620056,0.2835562080144882,0.28641459345817566,0.28565025329589844,0.28998473286628723,0.29013633728027344,0.2888867110013962],"label":"Filters combined"}}},"layout":{"xaxis":{"title":{"text":"Training tokens (Billions)"}},"title":{"text":"Custom filters performance"},"yaxis":{"range":[0.4,0.44]}},"defaultWindowSize":3}
|
data/plots/dataset_ablations.json
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
data/plots/dedup_all_dumps_bad.json
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
data/plots/dedup_attempts.json
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
data/plots/filtering_steps.json
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
data/plots/removed_data_cross_dedup.json
CHANGED
@@ -1 +1 @@
|
|
1 |
-
{"data":{"agg_score":{"cross_minhash_dump_CC-MAIN-2013-48":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3310184087604284,0.3494944926351309,0.3678930029273033,0.3791136778891086,0.3830251954495907,0.387223158031702,0.3940111547708511,0.3980898857116699,0.398512527346611,0.3974943198263645,0.4026404283940792,0.402598962187767,0.4074418470263481,0.4055770002305507,0.4050002694129944],"label":"Originally kept data"},"deduped_removed_cross":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308933284133672,0.3570646308362484,0.3725825920701027,0.383445743471384,0.39065178856253624,0.3996846079826355,0.4021379072219133,0.4061895925551653,0.41160152666270733,0.4141362868249416,0.4196407739073038,0.4217643104493618,0.4209167677909136,0.42394610680639744,0.4236117731779814],"label":"Originally removed data"}},"commonsense_qa/acc":{"cross_minhash_dump_CC-MAIN-2013-48":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.1860000044107437,0.2169999927282333,0.2450000047683715,0.2689999938011169,0.2770000100135803,0.2899999916553497,0.3030000030994415,0.3160000145435333,0.3260000050067901,0.3100000023841858,0.3210000097751617,0.3179999887943268,0.328000009059906,0.3240000009536743,0.3199999928474426],"label":"Originally kept 
data"},"deduped_removed_cross":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.1860000044107437,0.24449999630451197,0.27199999988079065,0.2944999933242798,0.2965000122785568,0.3154999911785126,0.3205000013113022,0.3215000033378601,0.32900001108646393,0.3360000103712082,0.33449999988079065,0.34049999713897705,0.3449999988079071,0.3474999964237213,0.346000000834465],"label":"Originally removed data"}},"commonsense_qa/acc_norm":{"cross_minhash_dump_CC-MAIN-2013-48":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2479999959468841,0.2800000011920929,0.2910000085830688,0.289000004529953,0.3059999942779541,0.3050000071525574,0.3050000071525574,0.3149999976158142,0.3140000104904175,0.3269999921321869,0.3219999969005584,0.3190000057220459,0.3179999887943268,0.3120000064373016],"label":"Originally kept data"},"deduped_removed_cross":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2580000013113022,0.28649999201297754,0.289000004529953,0.29500000178813934,0.30949999392032623,0.31599999964237213,0.31700000166893005,0.318000003695488,0.32549999654293055,0.32099999487400055,0.33250001072883606,0.32500000298023224,0.3330000042915344,0.32750000059604645],"label":"Originally removed 
data"}},"hellaswag/acc":{"cross_minhash_dump_CC-MAIN-2013-48":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2720000147819519,0.2879999876022339,0.2980000078678131,0.3039999902248382,0.3109999895095825,0.3269999921321869,0.3319999873638153,0.3370000123977661,0.3389999866485595,0.3449999988079071,0.3470000028610229,0.3479999899864197,0.3490000069141388,0.3499999940395355,0.3540000021457672],"label":"Originally kept data"},"deduped_removed_cross":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2720000147819519,0.28649999201297754,0.3079999983310699,0.3230000138282776,0.33149999380111694,0.3339999914169311,0.34699998795986176,0.3530000001192093,0.3564999997615814,0.36400000751018524,0.37299999594688416,0.37199999392032623,0.3734999895095825,0.3774999976158142,0.37800000607967377],"label":"Originally removed data"}},"hellaswag/acc_norm":{"cross_minhash_dump_CC-MAIN-2013-48":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.2860000133514404,0.2879999876022339,0.328000009059906,0.3379999995231628,0.356000006198883,0.356000006198883,0.3589999973773956,0.3720000088214874,0.3740000128746032,0.382999986410141,0.3810000121593475,0.395000010728836,0.3849999904632568,0.3930000066757202],"label":"Originally kept 
data"},"deduped_removed_cross":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.2775000035762787,0.3085000067949295,0.32750000059604645,0.35600000619888306,0.36999998986721033,0.37950000166893005,0.3965000063180923,0.41050000488758087,0.41250000894069666,0.42149999737739563,0.4270000010728836,0.4339999854564667,0.4389999955892563,0.4375],"label":"Originally removed data"}},"openbookqa/acc":{"cross_minhash_dump_CC-MAIN-2013-48":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.1659999936819076,0.1040000021457672,0.1280000060796737,0.1379999965429306,0.1319999992847442,0.1379999965429306,0.1420000046491623,0.1420000046491623,0.1500000059604644,0.1400000005960464,0.1560000032186508,0.1599999964237213,0.1620000004768371,0.1580000072717666,0.1519999951124191],"label":"Originally kept data"},"deduped_removed_cross":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.1659999936819076,0.1269999966025352,0.123999997973442,0.1530000045895576,0.1519999951124191,0.16600000113248825,0.16899999976158137,0.17999999970197675,0.1689999997615814,0.1789999976754188,0.1829999983310699,0.1879999935626983,0.18400000035762787,0.18400000035762787,0.19400000572204584],"label":"Originally removed 
data"}},"openbookqa/acc_norm":{"cross_minhash_dump_CC-MAIN-2013-48":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.2440000027418136,0.2800000011920929,0.2660000026226043,0.2800000011920929,0.2759999930858612,0.2879999876022339,0.3019999861717224,0.2879999876022339,0.2739999890327453,0.2800000011920929,0.2840000092983246,0.2899999916553497,0.2899999916553497,0.2879999876022339],"label":"Originally kept data"},"deduped_removed_cross":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.26899999380111694,0.26899999380111694,0.2870000004768371,0.2849999964237213,0.29200001060962677,0.29900000989437103,0.29900000989437103,0.2980000078678131,0.2939999997615814,0.32199999690055847,0.31700000166893005,0.30799999833106995,0.31700000166893005,0.3260000050067901],"label":"Originally removed data"}},"piqa/acc":{"cross_minhash_dump_CC-MAIN-2013-48":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5429999828338623,0.5839999914169312,0.628000020980835,0.6259999871253967,0.6389999985694885,0.6520000100135803,0.640999972820282,0.6589999794960022,0.6549999713897705,0.6690000295639038,0.671999990940094,0.675000011920929,0.6759999990463257,0.6809999942779541,0.6759999990463257],"label":"Originally kept 
data"},"deduped_removed_cross":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5419999957084656,0.5945000052452087,0.6354999840259552,0.6315000057220459,0.6439999938011169,0.6584999859333038,0.6629999876022339,0.6730000078678131,0.6784999966621399,0.6854999959468842,0.6825000047683716,0.6889999806880951,0.6899999976158142,0.6929999887943268,0.6940000057220459],"label":"Originally removed data"}},"piqa/acc_norm":{"cross_minhash_dump_CC-MAIN-2013-48":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6140000224113464,0.6480000019073486,0.6539999842643738,0.6669999957084656,0.6549999713897705,0.6679999828338623,0.6850000023841858,0.671999990940094,0.6869999766349792,0.6840000152587891,0.6880000233650208,0.6890000104904175,0.6980000138282776,0.6940000057220459],"label":"Originally kept data"},"deduped_removed_cross":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6155000030994415,0.6419999897480011,0.6620000004768372,0.6780000030994415,0.6819999814033508,0.6865000128746033,0.6884999871253967,0.7064999938011169,0.7080000042915344,0.7055000066757202,0.7114999890327454,0.715499997138977,0.7084999978542328,0.7074999809265137],"label":"Originally removed 
data"}},"siqa/acc":{"cross_minhash_dump_CC-MAIN-2013-48":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.367000013589859,0.3659999966621399,0.3659999966621399,0.3619999885559082,0.3610000014305115,0.3650000095367431,0.375,0.3720000088214874,0.3720000088214874,0.363999992609024,0.3759999871253967,0.367000013589859,0.367000013589859,0.3729999959468841,0.367000013589859],"label":"Originally kept data"},"deduped_removed_cross":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.367000013589859,0.3610000014305115,0.3720000088214874,0.37299999594688416,0.3830000013113022,0.3869999945163727,0.37649999558925623,0.3809999972581863,0.3845000118017196,0.39050000905990595,0.3930000066757202,0.4004999995231628,0.38799999654293055,0.39649999141693115,0.39649999141693115],"label":"Originally removed data"}},"siqa/acc_norm":{"cross_minhash_dump_CC-MAIN-2013-48":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3619999885559082,0.3799999952316284,0.382999986410141,0.3939999938011169,0.3930000066757202,0.3889999985694885,0.3970000147819519,0.4009999930858612,0.3959999978542328,0.3919999897480011,0.3970000147819519,0.3869999945163727,0.4070000052452087,0.3959999978542328,0.3959999978542328],"label":"Originally kept 
data"},"deduped_removed_cross":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3619999885559082,0.38750000298023224,0.3945000022649765,0.3974999934434891,0.39699999988079065,0.3994999974966049,0.3974999934434891,0.39149999618530273,0.3975000083446502,0.39699999988079065,0.39499999582767487,0.39750000834465027,0.39399999380111694,0.39399999380111694,0.3919999897480011],"label":"Originally removed data"}},"winogrande/acc":{"cross_minhash_dump_CC-MAIN-2013-48":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5149999856948853,0.4990000128746032,0.5189999938011169,0.5189999938011169,0.5019999742507935,0.5149999856948853,0.5400000214576721,0.531000018119812,0.5320000052452087,0.5289999842643738,0.5289999842643738,0.5379999876022339,0.527999997138977,0.5379999876022339,0.527999997138977],"label":"Originally kept data"},"deduped_removed_cross":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.515999972820282,0.5090000033378601,0.5189999938011169,0.5080000162124634,0.507999986410141,0.5214999914169312,0.510000005364418,0.5195000171661377,0.531499981880188,0.5290000140666962,0.5300000011920929,0.5324999988079071,0.5394999980926514,0.5375000238418579,0.5415000021457672],"label":"Originally removed 
data"}},"winogrande/acc_norm":{"cross_minhash_dump_CC-MAIN-2013-48":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.492000013589859,0.4990000128746032,0.5040000081062317,0.4959999918937683,0.5109999775886536,0.5210000276565552,0.5099999904632568,0.5080000162124634,0.5059999823570251,0.5130000114440918,0.515999972820282,0.5099999904632568,0.5099999904632568,0.5130000114440918],"label":"Originally kept data"},"deduped_removed_cross":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.49900001287460327,0.50450000166893,0.4985000044107437,0.502499982714653,0.5170000195503235,0.5030000060796738,0.5210000276565552,0.5160000026226044,0.5200000107288361,0.527999997138977,0.5224999785423279,0.5259999930858612,0.5240000188350677,0.5214999914169312],"label":"Originally removed data"}},"sciq/acc":{"cross_minhash_dump_CC-MAIN-2013-48":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[null,0.5460000038146973,null,null,null,null,null,null,null,null,null,null,null,null,0.7850000262260437],"label":"Originally kept 
data"},"deduped_removed_cross":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2080000042915344,0.5705000162124634,0.6755000054836273,0.7165000140666962,0.734499990940094,0.7419999837875366,0.7645000219345093,0.7664999961853027,0.7675000131130219,0.7719999849796295,0.7745000123977661,0.8009999990463257,0.7900000214576721,0.8019999861717224,0.7899999916553497],"label":"Originally removed data"}},"sciq/acc_norm":{"cross_minhash_dump_CC-MAIN-2013-48":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[null,0.4839999973773956,null,null,null,null,null,null,null,null,null,null,null,null,0.675000011920929],"label":"Originally kept data"},"deduped_removed_cross":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.202000007033348,0.507999986410141,0.5740000009536743,0.6184999942779541,0.6365000009536743,0.6500000059604645,0.6665000021457672,0.6669999957084656,0.6615000069141388,0.6689999997615814,0.6774999797344208,0.6949999928474426,0.6899999976158142,0.7070000171661377,0.6909999847412109],"label":"Originally removed 
data"}},"arc/acc":{"cross_minhash_dump_CC-MAIN-2013-48":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2195000052452087,0.2565000057220459,0.2845000028610229,0.3034999966621399,0.3104999959468841,0.3190000057220459,0.328000009059906,0.3319999873638153,0.3364999890327453,0.3445000052452087,0.3445000052452087,0.3490000069141388,0.3510000109672546,0.3540000021457672,0.3589999973773956],"label":"Originally kept data"},"deduped_removed_cross":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2195000052452087,0.2707500010728836,0.30649998784065247,0.31075000762939453,0.3317500054836273,0.3429999947547912,0.3489999920129776,0.36050000786781305,0.3542499989271164,0.3617500066757202,0.37049999833106995,0.37324999272823334,0.3669999986886978,0.377250000834465,0.37675000727176666],"label":"Originally removed data"}},"arc/acc_norm":{"cross_minhash_dump_CC-MAIN-2013-48":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2520000040531158,0.277999997138977,0.3115000128746032,0.3334999978542328,0.3375000059604645,0.3379999995231628,0.351500004529953,0.3549999892711639,0.3630000054836273,0.3610000014305115,0.3650000095367431,0.3659999966621399,0.3700000047683716,0.3729999959468841,0.3659999966621399],"label":"Originally kept 
data"},"deduped_removed_cross":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.29625000059604645,0.3167499899864197,0.3412500023841858,0.3410000056028366,0.35625000298023224,0.3604999929666519,0.36050000786781305,0.3652499914169311,0.3722499907016754,0.37925000488758087,0.3789999932050705,0.3782500028610229,0.38750000298023224,0.3879999965429306],"label":"Originally removed data"}},"mmlu/acc":{"cross_minhash_dump_CC-MAIN-2013-48":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2302279472351074,0.2396509051322937,0.2397145926952362,0.2505511939525604,0.2498872578144073,0.2526205778121948,0.2522554993629455,0.2501455843448639,0.2584334015846252,0.2589265406131744,0.2582942545413971,0.2596102058887481,0.2603496015071869,0.2584807872772217,0.2601740062236786],"label":"Originally kept data"},"deduped_removed_cross":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2302941530942917,0.24148745089769358,0.24818557500839228,0.2512914910912514,0.25797079503536224,0.2579677104949951,0.2621956318616867,0.2641489803791046,0.2655077129602432,0.26579149067401886,0.2702796012163162,0.26829390227794647,0.2691189646720886,0.2700086534023285,0.27018964290618896],"label":"Originally removed 
data"}},"mmlu/acc_norm":{"cross_minhash_dump_CC-MAIN-2013-48":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.25014728307724,0.2539559006690979,0.2536440193653106,0.2624094188213348,0.2637015581130981,0.2667853236198425,0.2655892074108124,0.2677191197872162,0.2741002142429352,0.271954596042633,0.2721233963966369,0.2767916917800903,0.2795347571372986,0.2746160328388214,0.2780021429061889],"label":"Originally kept data"},"deduped_removed_cross":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.2537670284509659,0.25891076028347015,0.26481594145298004,0.27071431279182434,0.2712268680334091,0.27510324120521545,0.2755167037248611,0.28106220066547394,0.28384028375148773,0.2848761975765228,0.2871145009994507,0.28658416867256165,0.288568839430809,0.2888942211866379],"label":"Originally removed data"}}},"layout":{"xaxis":{"title":{"text":"Training tokens (billions)"}},"title":{"text":"The originally removed data outperforms the kept data"}}}
|
|
|
1 |
+
{"data":{"agg_score":{"cross_minhash_dump_CC-MAIN-2013-48":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3310184087604284,0.3494944926351309,0.3678930029273033,0.3791136778891086,0.3830251954495907,0.387223158031702,0.3940111547708511,0.3980898857116699,0.398512527346611,0.3974943198263645,0.4026404283940792,0.402598962187767,0.4074418470263481,0.4055770002305507,0.4050002694129944],"label":"Originally kept data"},"deduped_removed_cross":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308933284133672,0.3570646308362484,0.3725825920701027,0.383445743471384,0.39065178856253624,0.3996846079826355,0.4021379072219133,0.4061895925551653,0.41160152666270733,0.4141362868249416,0.4196407739073038,0.4217643104493618,0.4209167677909136,0.42394610680639744,0.4236117731779814],"label":"Originally removed data"}},"commonsense_qa/acc_norm":{"cross_minhash_dump_CC-MAIN-2013-48":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2479999959468841,0.2800000011920929,0.2910000085830688,0.289000004529953,0.3059999942779541,0.3050000071525574,0.3050000071525574,0.3149999976158142,0.3140000104904175,0.3269999921321869,0.3219999969005584,0.3190000057220459,0.3179999887943268,0.3120000064373016],"label":"Originally kept 
data"},"deduped_removed_cross":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.2580000013113022,0.28649999201297754,0.289000004529953,0.29500000178813934,0.30949999392032623,0.31599999964237213,0.31700000166893005,0.318000003695488,0.32549999654293055,0.32099999487400055,0.33250001072883606,0.32500000298023224,0.3330000042915344,0.32750000059604645],"label":"Originally removed data"}},"hellaswag/acc_norm":{"cross_minhash_dump_CC-MAIN-2013-48":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.2860000133514404,0.2879999876022339,0.328000009059906,0.3379999995231628,0.356000006198883,0.356000006198883,0.3589999973773956,0.3720000088214874,0.3740000128746032,0.382999986410141,0.3810000121593475,0.395000010728836,0.3849999904632568,0.3930000066757202],"label":"Originally kept data"},"deduped_removed_cross":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.2775000035762787,0.3085000067949295,0.32750000059604645,0.35600000619888306,0.36999998986721033,0.37950000166893005,0.3965000063180923,0.41050000488758087,0.41250000894069666,0.42149999737739563,0.4270000010728836,0.4339999854564667,0.4389999955892563,0.4375],"label":"Originally removed 
data"}},"openbookqa/acc_norm":{"cross_minhash_dump_CC-MAIN-2013-48":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.2440000027418136,0.2800000011920929,0.2660000026226043,0.2800000011920929,0.2759999930858612,0.2879999876022339,0.3019999861717224,0.2879999876022339,0.2739999890327453,0.2800000011920929,0.2840000092983246,0.2899999916553497,0.2899999916553497,0.2879999876022339],"label":"Originally kept data"},"deduped_removed_cross":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.26899999380111694,0.26899999380111694,0.2870000004768371,0.2849999964237213,0.29200001060962677,0.29900000989437103,0.29900000989437103,0.2980000078678131,0.2939999997615814,0.32199999690055847,0.31700000166893005,0.30799999833106995,0.31700000166893005,0.3260000050067901],"label":"Originally removed data"}},"piqa/acc_norm":{"cross_minhash_dump_CC-MAIN-2013-48":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6140000224113464,0.6480000019073486,0.6539999842643738,0.6669999957084656,0.6549999713897705,0.6679999828338623,0.6850000023841858,0.671999990940094,0.6869999766349792,0.6840000152587891,0.6880000233650208,0.6890000104904175,0.6980000138282776,0.6940000057220459],"label":"Originally kept 
data"},"deduped_removed_cross":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6155000030994415,0.6419999897480011,0.6620000004768372,0.6780000030994415,0.6819999814033508,0.6865000128746033,0.6884999871253967,0.7064999938011169,0.7080000042915344,0.7055000066757202,0.7114999890327454,0.715499997138977,0.7084999978542328,0.7074999809265137],"label":"Originally removed data"}},"siqa/acc_norm":{"cross_minhash_dump_CC-MAIN-2013-48":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3619999885559082,0.3799999952316284,0.382999986410141,0.3939999938011169,0.3930000066757202,0.3889999985694885,0.3970000147819519,0.4009999930858612,0.3959999978542328,0.3919999897480011,0.3970000147819519,0.3869999945163727,0.4070000052452087,0.3959999978542328,0.3959999978542328],"label":"Originally kept data"},"deduped_removed_cross":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3619999885559082,0.38750000298023224,0.3945000022649765,0.3974999934434891,0.39699999988079065,0.3994999974966049,0.3974999934434891,0.39149999618530273,0.3975000083446502,0.39699999988079065,0.39499999582767487,0.39750000834465027,0.39399999380111694,0.39399999380111694,0.3919999897480011],"label":"Originally removed 
data"}},"winogrande/acc_norm":{"cross_minhash_dump_CC-MAIN-2013-48":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.492000013589859,0.4990000128746032,0.5040000081062317,0.4959999918937683,0.5109999775886536,0.5210000276565552,0.5099999904632568,0.5080000162124634,0.5059999823570251,0.5130000114440918,0.515999972820282,0.5099999904632568,0.5099999904632568,0.5130000114440918],"label":"Originally kept data"},"deduped_removed_cross":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.49900001287460327,0.50450000166893,0.4985000044107437,0.502499982714653,0.5170000195503235,0.5030000060796738,0.5210000276565552,0.5160000026226044,0.5200000107288361,0.527999997138977,0.5224999785423279,0.5259999930858612,0.5240000188350677,0.5214999914169312],"label":"Originally removed data"}},"arc/acc_norm":{"cross_minhash_dump_CC-MAIN-2013-48":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2520000040531158,0.277999997138977,0.3115000128746032,0.3334999978542328,0.3375000059604645,0.3379999995231628,0.351500004529953,0.3549999892711639,0.3630000054836273,0.3610000014305115,0.3650000095367431,0.3659999966621399,0.3700000047683716,0.3729999959468841,0.3659999966621399],"label":"Originally kept 
data"},"deduped_removed_cross":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.29625000059604645,0.3167499899864197,0.3412500023841858,0.3410000056028366,0.35625000298023224,0.3604999929666519,0.36050000786781305,0.3652499914169311,0.3722499907016754,0.37925000488758087,0.3789999932050705,0.3782500028610229,0.38750000298023224,0.3879999965429306],"label":"Originally removed data"}},"mmlu/acc_norm":{"cross_minhash_dump_CC-MAIN-2013-48":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.25014728307724,0.2539559006690979,0.2536440193653106,0.2624094188213348,0.2637015581130981,0.2667853236198425,0.2655892074108124,0.2677191197872162,0.2741002142429352,0.271954596042633,0.2721233963966369,0.2767916917800903,0.2795347571372986,0.2746160328388214,0.2780021429061889],"label":"Originally kept data"},"deduped_removed_cross":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.2537670284509659,0.25891076028347015,0.26481594145298004,0.27071431279182434,0.2712268680334091,0.27510324120521545,0.2755167037248611,0.28106220066547394,0.28384028375148773,0.2848761975765228,0.2871145009994507,0.28658416867256165,0.288568839430809,0.2888942211866379],"label":"Originally removed data"}}},"layout":{"xaxis":{"title":{"text":"Training tokens (billions)"}},"title":{"text":"The originally removed data outperforms the kept data"}}}
|
data/plots/wet_comparison.json
CHANGED
@@ -1 +1 @@
|
|
1 |
-
{"data":{"agg_score":{"ind_minhash-CC-MAIN-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308933284133672,0.35846718959510326,0.37850185949355364,0.39194786734879017,0.39600365422666073,0.40486439503729343,0.4064061753451824,0.41104014590382576,0.41393135115504265,0.41698802448809147,0.42121383734047413,0.4219294786453247,0.4234823901206255,0.42346264235675335,0.42699199728667736],"label":"Extracted from WARC"},"wet-extraction-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308620806783438,0.3493095366284251,0.367914117872715,0.37658837065100664,0.3858313206583261,0.3908915650099516,0.3968510050326586,0.39992102794349194,0.40259181894361973,0.4055726025253534,0.4074157159775495,0.40804907679557795,0.40894216299057007,0.4123705606907606,0.4108315110206604],"label":"WET data"}},"commonsense_qa/acc":{"ind_minhash-CC-MAIN-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.1860000044107437,0.255500003695488,0.2810000032186508,0.29899999499320984,0.3074999898672104,0.3125,0.3264999985694885,0.3229999989271164,0.3340000063180923,0.3275000005960464,0.3410000056028366,0.33900000154972076,0.3425000011920929,0.3479999899864197,0.34849999845027924],"label":"Extracted from 
WARC"},"wet-extraction-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.1860000044107437,0.2275000065565109,0.2554999962449073,0.27199999988079065,0.29200001060962677,0.29950000345706934,0.30650000274181366,0.3110000044107437,0.3100000023841858,0.32150000333786005,0.3200000077486038,0.3214999884366989,0.3199999928474426,0.3179999887943268,0.3255000114440918],"label":"WET data"}},"commonsense_qa/acc_norm":{"ind_minhash-CC-MAIN-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.26000000536441803,0.291499987244606,0.3019999861717224,0.3034999966621399,0.3109999895095825,0.31599999964237213,0.3254999965429306,0.3210000097751617,0.3320000022649765,0.33449999988079065,0.3344999998807907,0.3334999978542328,0.3349999934434891,0.3385000079870224],"label":"Extracted from WARC"},"wet-extraction-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.24249999970197675,0.2705000042915344,0.27549999952316284,0.28700000047683716,0.28449998795986176,0.29099999368190765,0.2979999929666519,0.3075000047683716,0.30550000071525574,0.3079999983310699,0.3110000044107437,0.30949999392032623,0.3114999979734421,0.3100000023841858],"label":"WET 
data"}},"hellaswag/acc":{"ind_minhash-CC-MAIN-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2720000147819519,0.29099999368190765,0.3135000020265579,0.34000000357627863,0.34900000691413874,0.3554999977350235,0.3554999977350235,0.36499999463558197,0.37499999999999994,0.37450000643730164,0.3824999928474426,0.3869999945163727,0.3819999992847442,0.38450001180171967,0.3835000097751617],"label":"Extracted from WARC"},"wet-extraction-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2720000147819519,0.27850000560283655,0.3020000010728836,0.31299999356269836,0.3240000009536743,0.3375000059604645,0.340499997138977,0.3489999920129776,0.35249999165534973,0.35599999129772186,0.35999999940395355,0.35899999737739563,0.3614999949932098,0.36500000953674316,0.3645000010728836],"label":"WET data"}},"hellaswag/acc_norm":{"ind_minhash-CC-MAIN-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.2915000021457672,0.33500000834465027,0.35800001025199885,0.37450000643730164,0.3859999924898147,0.3959999978542328,0.4035000056028366,0.4220000058412552,0.4294999986886978,0.43400000035762787,0.44099999964237213,0.4424999952316284,0.44449999928474426,0.4494999945163727],"label":"Extracted from 
WARC"},"wet-extraction-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.28650000691413874,0.31299999356269836,0.3305000066757202,0.3569999933242798,0.3710000067949295,0.3879999965429306,0.3854999989271164,0.39199998974800104,0.4055000096559524,0.4064999967813492,0.4065000116825104,0.4120000004768371,0.41700001060962677,0.4175000041723251],"label":"WET data"}},"openbookqa/acc":{"ind_minhash-CC-MAIN-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.1659999936819076,0.1270000040531158,0.13499999791383738,0.15399999916553497,0.15300000458955765,0.17999999970197672,0.1789999976754188,0.1780000030994415,0.1909999996423721,0.1950000002980232,0.19799999892711634,0.19600000232458115,0.1960000023245811,0.19500000029802322,0.20199999958276743],"label":"Extracted from WARC"},"wet-extraction-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.1659999936819076,0.1280000060796737,0.12900000065565104,0.13300000131130213,0.15100000053644175,0.14999999850988385,0.1560000032186508,0.1659999936819076,0.17199999839067454,0.16899999976158137,0.1689999997615814,0.16799999773502344,0.17599999904632566,0.1749999970197677,0.1680000051856041],"label":"WET 
data"}},"openbookqa/acc_norm":{"ind_minhash-CC-MAIN-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.26500000059604645,0.24800000339746475,0.28299999237060547,0.28200000524520874,0.30900000035762787,0.3100000023841858,0.3020000010728836,0.3149999976158142,0.3110000044107437,0.32100000977516174,0.31700000166893005,0.31599999964237213,0.31599999964237213,0.31900000572204584],"label":"Extracted from WARC"},"wet-extraction-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.2580000013113022,0.2719999998807907,0.2770000100135803,0.27300000190734863,0.2880000025033951,0.2989999949932098,0.29500000178813934,0.29899999499320984,0.3100000023841858,0.30300000309944153,0.30600000917911524,0.3040000051259994,0.3110000044107437,0.30300000309944153],"label":"WET data"}},"piqa/acc":{"ind_minhash-CC-MAIN-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5419999957084656,0.609499990940094,0.6369999945163727,0.6509999930858612,0.6589999794960022,0.6659999787807465,0.6730000078678131,0.6805000007152557,0.6780000030994415,0.6889999806880951,0.6875,0.6990000009536743,0.6969999969005585,0.6974999904632568,0.6995000243186951],"label":"Extracted from 
WARC"},"wet-extraction-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5419999957084656,0.5834999978542328,0.6269999742507935,0.6359999775886536,0.6525000035762787,0.6499999761581421,0.6565000116825104,0.6660000085830688,0.6630000174045563,0.6620000004768372,0.6689999997615814,0.6680000126361847,0.6755000054836273,0.6744999885559082,0.6684999763965607],"label":"WET data"}},"piqa/acc_norm":{"ind_minhash-CC-MAIN-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6130000054836273,0.6599999964237213,0.6704999804496765,0.6845000088214874,0.6854999959468842,0.6895000040531158,0.7005000114440918,0.6990000009536743,0.7090000212192535,0.707999974489212,0.7125000059604645,0.7114999890327454,0.7094999849796295,0.7150000035762787],"label":"Extracted from WARC"},"wet-extraction-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.5995000004768372,0.6385000050067902,0.6534999907016754,0.6675000190734863,0.6755000054836273,0.6814999878406525,0.6859999895095825,0.6840000152587891,0.6924999952316284,0.6944999992847443,0.69200000166893,0.6995000243186951,0.6960000097751617,0.6979999840259552],"label":"WET 
data"}},"siqa/acc":{"ind_minhash-CC-MAIN-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.367000013589859,0.36200000345706934,0.37549999356269836,0.3665000051259994,0.369499996304512,0.37049999833106995,0.37950000166893005,0.38449999690055847,0.3860000073909759,0.3800000101327896,0.38499999046325684,0.3865000009536743,0.39050000905990595,0.38599999248981476,0.3824999928474426],"label":"Extracted from WARC"},"wet-extraction-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.367000013589859,0.3630000054836273,0.3709999918937683,0.37849999964237213,0.36800000071525574,0.36849999427795405,0.37450000643730164,0.36450000107288355,0.37899999320507044,0.37800000607967377,0.3790000081062317,0.38350000977516174,0.38150000572204584,0.37849999964237213,0.38199999928474426],"label":"WET data"}},"siqa/acc_norm":{"ind_minhash-CC-MAIN-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3619999885559082,0.39000000059604645,0.40549999475479126,0.39900000393390656,0.3969999998807907,0.39800000190734863,0.3955000042915344,0.40549999475479126,0.398499995470047,0.39650000631809235,0.40050001442432404,0.40300001204013824,0.402999997138977,0.40100000798702234,0.4089999943971634],"label":"Extracted from 
WARC"},"wet-extraction-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.36149999499320984,0.39499999582767487,0.39249999821186066,0.39800000190734863,0.4025000035762787,0.3924999982118606,0.39750000834465027,0.3999999910593033,0.403999999165535,0.39750000834465027,0.39800000190734863,0.40049999952316284,0.402999997138977,0.40800000727176666,0.4095000028610229],"label":"WET data"}},"winogrande/acc":{"ind_minhash-CC-MAIN-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.515999972820282,0.5130000114440918,0.5020000040531158,0.5160000026226044,0.5130000114440918,0.5225000083446503,0.5230000019073486,0.5284999907016754,0.523499995470047,0.5250000059604645,0.5340000092983246,0.539000004529953,0.5304999947547913,0.5304999947547913,0.5315000116825104],"label":"Extracted from WARC"},"wet-extraction-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.515999972820282,0.50450000166893,0.5049999952316284,0.5090000182390213,0.5074999928474426,0.5244999825954437,0.5220000147819519,0.5164999961853027,0.5170000195503235,0.5195000171661377,0.5264999866485596,0.5289999842643738,0.5210000276565552,0.5264999866485596,0.5214999914169312],"label":"WET 
data"}},"winogrande/acc_norm":{"ind_minhash-CC-MAIN-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.4960000067949295,0.5010000020265579,0.5055000185966492,0.5119999945163727,0.5230000019073486,0.5149999856948853,0.5180000066757202,0.5145000219345093,0.5139999985694885,0.5200000107288361,0.5140000283718109,0.5220000147819519,0.5195000171661377,0.5194999873638153],"label":"Extracted from WARC"},"wet-extraction-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.49299998581409454,0.494499996304512,0.4989999979734421,0.502000018954277,0.5115000009536743,0.5119999945163727,0.5185000002384186,0.5119999945163727,0.5090000033378601,0.5175000131130219,0.5139999985694885,0.5074999928474426,0.5164999961853027,0.5115000009536743],"label":"WET data"}},"sciq/acc":{"ind_minhash-CC-MAIN-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2080000042915344,0.5795000195503235,0.6650000214576721,0.7175000011920929,0.74549999833107,0.7540000081062317,0.7669999897480011,0.7685000002384186,0.7835000157356262,0.7915000021457672,0.7910000085830688,0.7994999885559082,0.7994999885559082,0.7999999821186066,0.7970000207424164],"label":"Extracted from 
WARC"},"wet-extraction-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2080000042915344,0.5444999933242798,0.6380000114440918,0.6674999892711639,0.6964999735355377,0.7229999899864197,0.7310000061988831,0.7495000064373016,0.7479999959468842,0.7459999918937683,0.7650000154972076,0.7660000026226044,0.7604999840259552,0.7674999833106995,0.7739999890327454],"label":"WET data"}},"sciq/acc_norm":{"ind_minhash-CC-MAIN-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.202000007033348,0.5144999921321869,0.578000009059906,0.6195000112056732,0.6634999811649323,0.656499981880188,0.6635000109672546,0.6605000197887421,0.6694999933242798,0.6810000240802765,0.6885000169277191,0.6969999969005585,0.6910000145435333,0.6935000121593475,0.6994999945163727],"label":"Extracted from WARC"},"wet-extraction-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.202000007033348,0.49549999833106995,0.5710000097751617,0.5699999928474426,0.6069999933242798,0.6215000152587891,0.6235000193119049,0.637499988079071,0.6394999921321869,0.6234999895095825,0.6464999914169312,0.6509999930858612,0.6445000171661377,0.6509999930858612,0.656499981880188],"label":"WET 
data"}},"arc/acc":{"ind_minhash-CC-MAIN-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2195000052452087,0.27875000238418574,0.29625000059604645,0.32250000536441803,0.33999998867511744,0.3452500104904175,0.35175000131130213,0.35224999487400055,0.3577499985694885,0.36249999701976776,0.36675000190734863,0.3662499934434891,0.37424999475479126,0.37325000762939453,0.37574999034404755],"label":"Extracted from WARC"},"wet-extraction-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2195000052452087,0.2539999932050705,0.2867499887943268,0.3050000071525574,0.3192500025033951,0.3237499892711639,0.3242499977350235,0.3344999998807907,0.3422500044107437,0.3464999943971634,0.3434999883174896,0.34675000607967377,0.3487500101327896,0.3527500033378601,0.35424999892711634],"label":"WET data"}},"arc/acc_norm":{"ind_minhash-CC-MAIN-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.2957500070333481,0.32750000059604645,0.3479999899864197,0.3422500044107437,0.3535000085830688,0.35199999809265137,0.3564999997615814,0.36150000989437103,0.36275000870227814,0.36924999952316284,0.3685000091791153,0.37325000762939453,0.3764999955892563,0.3779999911785126],"label":"Extracted from 
WARC"},"wet-extraction-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2512499988079071,0.2689999938011169,0.304749995470047,0.32025000452995295,0.33400000631809235,0.3375000059604645,0.3384999930858612,0.346000000834465,0.34949998557567596,0.3512499928474426,0.3535000085830688,0.3577500134706497,0.35724999010562897,0.35950000584125513,0.35875000059604645],"label":"WET data"}},"mmlu/acc":{"ind_minhash-CC-MAIN-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2302941530942917,0.24134793877601618,0.2466971725225448,0.25406564772129053,0.2581391483545303,0.2584117949008941,0.2615972906351089,0.2637547105550766,0.26588438451290125,0.2674466520547867,0.2716866135597229,0.2710578292608261,0.27132466435432434,0.2711777687072754,0.27193698287010193],"label":"Extracted from WARC"},"wet-extraction-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2302158325910568,0.2384574860334396,0.24489634484052655,0.2443553805351257,0.2494990974664688,0.2522265464067459,0.2537587881088257,0.25265602767467493,0.25639262795448303,0.2572821974754333,0.2601653635501861,0.260224312543869,0.26066550612449646,0.2612243592739105,0.26030270755290985],"label":"WET 
data"}},"mmlu/acc_norm":{"ind_minhash-CC-MAIN-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.25648748874664307,0.2595148831605911,0.2695829570293426,0.27227921783924103,0.27291516959667206,0.2772494107484817,0.27682115137577057,0.2799507677555084,0.28115415573120117,0.28246068954467773,0.28493577241897583,0.2861091196537018,0.2857011407613754,0.28743599355220795],"label":"Extracted from WARC"},"wet-extraction-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.2509763091802597,0.25756295025348663,0.2589569538831711,0.2636505216360092,0.2666325122117996,0.26730807125568384,0.2703682482242584,0.2727345675230026,0.273330807685852,0.2783257067203522,0.27664257586002344,0.278787299990654,0.27946445345878596,0.27840209007263184],"label":"WET data"}}},"layout":{"xaxis":{"title":{"text":"Training tokens (billions)"}},"title":{"text":"WET data is worse than data extracted from WARC"}}}
|
|
|
1 |
+
{"data":{"agg_score":{"ind_minhash-CC-MAIN-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308933284133672,0.35846718959510326,0.37850185949355364,0.39194786734879017,0.39600365422666073,0.40486439503729343,0.4064061753451824,0.41104014590382576,0.41393135115504265,0.41698802448809147,0.42121383734047413,0.4219294786453247,0.4234823901206255,0.42346264235675335,0.42699199728667736],"label":"Extracted from WARC"},"wet-extraction-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3308620806783438,0.3493095366284251,0.367914117872715,0.37658837065100664,0.3858313206583261,0.3908915650099516,0.3968510050326586,0.39992102794349194,0.40259181894361973,0.4055726025253534,0.4074157159775495,0.40804907679557795,0.40894216299057007,0.4123705606907606,0.4108315110206604],"label":"WET data"}},"commonsense_qa/acc_norm":{"ind_minhash-CC-MAIN-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.26000000536441803,0.291499987244606,0.3019999861717224,0.3034999966621399,0.3109999895095825,0.31599999964237213,0.3254999965429306,0.3210000097751617,0.3320000022649765,0.33449999988079065,0.3344999998807907,0.3334999978542328,0.3349999934434891,0.3385000079870224],"label":"Extracted from 
WARC"},"wet-extraction-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.24249999970197675,0.2705000042915344,0.27549999952316284,0.28700000047683716,0.28449998795986176,0.29099999368190765,0.2979999929666519,0.3075000047683716,0.30550000071525574,0.3079999983310699,0.3110000044107437,0.30949999392032623,0.3114999979734421,0.3100000023841858],"label":"WET data"}},"hellaswag/acc_norm":{"ind_minhash-CC-MAIN-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.2915000021457672,0.33500000834465027,0.35800001025199885,0.37450000643730164,0.3859999924898147,0.3959999978542328,0.4035000056028366,0.4220000058412552,0.4294999986886978,0.43400000035762787,0.44099999964237213,0.4424999952316284,0.44449999928474426,0.4494999945163727],"label":"Extracted from WARC"},"wet-extraction-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.257999986410141,0.28650000691413874,0.31299999356269836,0.3305000066757202,0.3569999933242798,0.3710000067949295,0.3879999965429306,0.3854999989271164,0.39199998974800104,0.4055000096559524,0.4064999967813492,0.4065000116825104,0.4120000004768371,0.41700001060962677,0.4175000041723251],"label":"WET 
data"}},"openbookqa/acc_norm":{"ind_minhash-CC-MAIN-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.26500000059604645,0.24800000339746475,0.28299999237060547,0.28200000524520874,0.30900000035762787,0.3100000023841858,0.3020000010728836,0.3149999976158142,0.3110000044107437,0.32100000977516174,0.31700000166893005,0.31599999964237213,0.31599999964237213,0.31900000572204584],"label":"Extracted from WARC"},"wet-extraction-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2860000133514404,0.2580000013113022,0.2719999998807907,0.2770000100135803,0.27300000190734863,0.2880000025033951,0.2989999949932098,0.29500000178813934,0.29899999499320984,0.3100000023841858,0.30300000309944153,0.30600000917911524,0.3040000051259994,0.3110000044107437,0.30300000309944153],"label":"WET data"}},"piqa/acc_norm":{"ind_minhash-CC-MAIN-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.6130000054836273,0.6599999964237213,0.6704999804496765,0.6845000088214874,0.6854999959468842,0.6895000040531158,0.7005000114440918,0.6990000009536743,0.7090000212192535,0.707999974489212,0.7125000059604645,0.7114999890327454,0.7094999849796295,0.7150000035762787],"label":"Extracted from 
WARC"},"wet-extraction-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.5099999904632568,0.5995000004768372,0.6385000050067902,0.6534999907016754,0.6675000190734863,0.6755000054836273,0.6814999878406525,0.6859999895095825,0.6840000152587891,0.6924999952316284,0.6944999992847443,0.69200000166893,0.6995000243186951,0.6960000097751617,0.6979999840259552],"label":"WET data"}},"siqa/acc_norm":{"ind_minhash-CC-MAIN-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.3619999885559082,0.39000000059604645,0.40549999475479126,0.39900000393390656,0.3969999998807907,0.39800000190734863,0.3955000042915344,0.40549999475479126,0.398499995470047,0.39650000631809235,0.40050001442432404,0.40300001204013824,0.402999997138977,0.40100000798702234,0.4089999943971634],"label":"Extracted from WARC"},"wet-extraction-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.36149999499320984,0.39499999582767487,0.39249999821186066,0.39800000190734863,0.4025000035762787,0.3924999982118606,0.39750000834465027,0.3999999910593033,0.403999999165535,0.39750000834465027,0.39800000190734863,0.40049999952316284,0.402999997138977,0.40800000727176666,0.4095000028610229],"label":"WET 
data"}},"winogrande/acc_norm":{"ind_minhash-CC-MAIN-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.4960000067949295,0.5010000020265579,0.5055000185966492,0.5119999945163727,0.5230000019073486,0.5149999856948853,0.5180000066757202,0.5145000219345093,0.5139999985694885,0.5200000107288361,0.5140000283718109,0.5220000147819519,0.5195000171661377,0.5194999873638153],"label":"Extracted from WARC"},"wet-extraction-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.4970000088214874,0.49299998581409454,0.494499996304512,0.4989999979734421,0.502000018954277,0.5115000009536743,0.5119999945163727,0.5185000002384186,0.5119999945163727,0.5090000033378601,0.5175000131130219,0.5139999985694885,0.5074999928474426,0.5164999961853027,0.5115000009536743],"label":"WET data"}},"arc/acc_norm":{"ind_minhash-CC-MAIN-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2509999871253967,0.2957500070333481,0.32750000059604645,0.3479999899864197,0.3422500044107437,0.3535000085830688,0.35199999809265137,0.3564999997615814,0.36150000989437103,0.36275000870227814,0.36924999952316284,0.3685000091791153,0.37325000762939453,0.3764999955892563,0.3779999911785126],"label":"Extracted from 
WARC"},"wet-extraction-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2512499988079071,0.2689999938011169,0.304749995470047,0.32025000452995295,0.33400000631809235,0.3375000059604645,0.3384999930858612,0.346000000834465,0.34949998557567596,0.3512499928474426,0.3535000085830688,0.3577500134706497,0.35724999010562897,0.35950000584125513,0.35875000059604645],"label":"WET data"}},"mmlu/acc_norm":{"ind_minhash-CC-MAIN-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.25648748874664307,0.2595148831605911,0.2695829570293426,0.27227921783924103,0.27291516959667206,0.2772494107484817,0.27682115137577057,0.2799507677555084,0.28115415573120117,0.28246068954467773,0.28493577241897583,0.2861091196537018,0.2857011407613754,0.28743599355220795],"label":"Extracted from WARC"},"wet-extraction-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2501466572284698,0.2509763091802597,0.25756295025348663,0.2589569538831711,0.2636505216360092,0.2666325122117996,0.26730807125568384,0.2703682482242584,0.2727345675230026,0.273330807685852,0.2783257067203522,0.27664257586002344,0.278787299990654,0.27946445345878596,0.27840209007263184],"label":"WET data"}}},"layout":{"xaxis":{"title":{"text":"Training tokens (billions)"}},"title":{"text":"WET data is worse than data extracted from WARC"}}}
|
index.html
CHANGED
@@ -589,7 +589,7 @@
|
|
589 |
</ul>
|
590 |
<div class="main-plot-container">
|
591 |
<figure><img src="plots/custom_filters.png"/></figure>
|
592 |
-
<div id="plot-
|
593 |
</div>
|
594 |
<h2>The final dataset</h2>
|
595 |
<p>The final FineWeb dataset comprises 15T tokens and
|
|
|
589 |
</ul>
|
590 |
<div class="main-plot-container">
|
591 |
<figure><img src="plots/custom_filters.png"/></figure>
|
592 |
+
<div id="plot-custom-filters"></div>
|
593 |
</div>
|
594 |
<h2>The final dataset</h2>
|
595 |
<p>The final FineWeb dataset comprises 15T tokens and
|
plots/c4_filters_hellaswag.png
CHANGED
plots/cross_ind_unfiltered_comparison.png
CHANGED
plots/custom_filters.png
CHANGED
plots/dataset_ablations.png
CHANGED
plots/dedup_attempts.png
CHANGED
plots/filtering_steps.png
CHANGED
plots/removed_data_cross_dedup.png
CHANGED
plots/wet_comparison.png
CHANGED
src/plotting.js
CHANGED
@@ -1,23 +1,14 @@
|
|
1 |
const TASK_ID_TO_NAME = {
|
2 |
'agg_score': 'Aggregate Score',
|
3 |
-
'commonsense_qa/
|
4 |
-
'
|
5 |
-
'
|
6 |
-
'
|
7 |
-
'
|
8 |
-
'
|
9 |
-
'
|
10 |
-
'
|
11 |
-
'
|
12 |
-
'siqa/acc_norm': 'Social IQA Norm Acc',
|
13 |
-
'winogrande/acc': 'WinoGrande Acc',
|
14 |
-
'winogrande/acc_norm': 'WinoGrande Norm Acc',
|
15 |
-
'sciq/acc': 'SciQ Acc',
|
16 |
-
'sciq/acc_norm': 'SciQ Norm Acc',
|
17 |
-
'arc/acc': 'ARC Acc',
|
18 |
-
'arc/acc_norm': 'ARC Norm Acc',
|
19 |
-
'mmlu/acc': 'MMLU Acc',
|
20 |
-
'mmlu/acc_norm': 'MMLU Norm Acc'
|
21 |
};
|
22 |
|
23 |
|
@@ -127,11 +118,14 @@ const init_plot = function() {
|
|
127 |
let minX = Math.min(...traces.flatMap(trace => trace.x));
|
128 |
let maxX = Math.max(...traces.flatMap(trace => trace.x));
|
129 |
const width = plot.parentElement.offsetWidth;
|
130 |
-
console.log(width);
|
131 |
const layout = _.merge({}, DEFAULT_LAYOUT, {width: width, yaxis: {title: {text: TASK_ID_TO_NAME[metric]}}, xaxis: {range: [minX*0.95, maxX*1.05]}}, data.layout);
|
132 |
Plotly.newPlot(plot, traces, layout);
|
133 |
|
134 |
window.addEventListener('resize', () => {
|
|
|
|
|
|
|
|
|
135 |
// For some reason plotly doesn't respect the width :(
|
136 |
console.log(plot.parentElement.offsetWidth);
|
137 |
console.log(plot.id);
|
@@ -145,6 +139,18 @@ const init_plot = function() {
|
|
145 |
};
|
146 |
document.addEventListener('DOMContentLoaded', init_plot);
|
147 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
148 |
const createPlottingElements = (plotElement, data, defaultMetric, defaultWindowSize) => {
|
149 |
// Create plot
|
150 |
const plot = document.createElement('figure');
|
@@ -174,10 +180,13 @@ const createPlottingElements = (plotElement, data, defaultMetric, defaultWindowS
|
|
174 |
const slider = document.createElement('input');
|
175 |
slider.type = 'range';
|
176 |
slider.min = 0;
|
177 |
-
slider.max =
|
178 |
slider.value = defaultWindowSize ?? 0;
|
179 |
|
180 |
|
|
|
|
|
|
|
181 |
// current value
|
182 |
const sliderValue = document.createElement('span');
|
183 |
sliderValue.textContent = slider.value;
|
|
|
1 |
// Human-readable labels for the benchmark metric ids found in the plot JSON.
// The label is looked up by metric id and used as the y-axis title of a plot.
const TASK_ID_TO_NAME = {
    // Aggregate over all tasks
    agg_score: 'Aggregate Score',

    // Individual tasks (normalized accuracy)
    'commonsense_qa/acc_norm': 'Commonsense QA Norm',
    'hellaswag/acc_norm': 'HellaSwag Norm',
    'openbookqa/acc_norm': 'OpenBook QA Norm',
    'piqa/acc_norm': 'PIQA',
    'siqa/acc_norm': 'Social IQA',
    'winogrande/acc_norm': 'WinoGrande',
    'sciq/acc_norm': 'SciQ',
    'arc/acc_norm': 'ARC',
    'mmlu/acc_norm': 'MMLU',
};
|
13 |
|
14 |
|
|
|
118 |
let minX = Math.min(...traces.flatMap(trace => trace.x));
|
119 |
let maxX = Math.max(...traces.flatMap(trace => trace.x));
|
120 |
const width = plot.parentElement.offsetWidth;
|
|
|
121 |
const layout = _.merge({}, DEFAULT_LAYOUT, {width: width, yaxis: {title: {text: TASK_ID_TO_NAME[metric]}}, xaxis: {range: [minX*0.95, maxX*1.05]}}, data.layout);
|
122 |
Plotly.newPlot(plot, traces, layout);
|
123 |
|
124 |
window.addEventListener('resize', () => {
|
125 |
+
// If the window size is smaller than 768, we don't care as it's not shown
|
126 |
+
if (window.innerWidth < 768) {
|
127 |
+
return;
|
128 |
+
}
|
129 |
// For some reason plotly doesn't respect the width :(
|
130 |
console.log(plot.parentElement.offsetWidth);
|
131 |
console.log(plot.id);
|
|
|
139 |
};
|
140 |
document.addEventListener('DOMContentLoaded', init_plot);
|
141 |
|
142 |
+
|
143 |
+
/**
 * Picks the upper bound of the smoothing-window slider for a plot.
 *
 * Only the first series of the first metric is inspected: its number of x
 * samples decides the bound. Series with fewer than 20 samples get a maximum
 * of 10, everything else gets 30.
 *
 * @param {Object} data - Mapping of metric id -> (series id -> {x: Array, ...}).
 * @returns {number} 10 for short series (or when no series is available), otherwise 30.
 */
const getSliderMax = (data) => {
    const SHORT_SERIES_LIMIT = 20;
    const SHORT_SERIES_MAX = 10;
    const LONG_SERIES_MAX = 30;

    const firstMetricData = Object.values(data ?? {})[0];
    const firstSeries = Object.values(firstMetricData ?? {})[0];
    // A missing metric, series or x array counts as zero samples rather than
    // throwing, so the slider can still be built for an empty plot.
    const totalSamples = firstSeries?.x?.length ?? 0;

    return totalSamples < SHORT_SERIES_LIMIT ? SHORT_SERIES_MAX : LONG_SERIES_MAX;
};
|
153 |
+
|
154 |
const createPlottingElements = (plotElement, data, defaultMetric, defaultWindowSize) => {
|
155 |
// Create plot
|
156 |
const plot = document.createElement('figure');
|
|
|
180 |
const slider = document.createElement('input');
|
181 |
slider.type = 'range';
|
182 |
slider.min = 0;
|
183 |
+
slider.max = getSliderMax(data);
|
184 |
slider.value = defaultWindowSize ?? 0;
|
185 |
|
186 |
|
187 |
+
// Get the first example for any metric
|
188 |
+
|
189 |
+
|
190 |
// current value
|
191 |
const sliderValue = document.createElement('span');
|
192 |
sliderValue.textContent = slider.value;
|
src/utils.js
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/**
 * Computes a trailing rolling average of a numeric series.
 *
 * Element `i` of the result is the mean of `data[i - windowSize + 1 .. i]`.
 * Positions that do not yet have a full window behind them are `null`.
 *
 * A window of 0 or 1 (or a negative / non-numeric window) means "no
 * smoothing" and yields a shallow copy of `data`. Previously a window of 0
 * divided 0 by 0 and produced an array of NaN.
 *
 * @param {number[]} data - Series to smooth; not modified.
 * @param {number|string} windowSize - Number of trailing points to average.
 *     Numeric strings are accepted; fractional values are floored.
 * @returns {Array<number|null>} New array with the same length as `data`.
 */
export function calculateRollingAverage(data, windowSize) {
    const size = Math.floor(Number(windowSize));

    if (!Number.isFinite(size) || size <= 1) {
        return [...data];
    }

    return data.map((_, i) => {
        if (i < size - 1) {
            return null; // Not enough data points to calculate average
        }
        // Sum newest-to-oldest so floating-point results match the original loop.
        let sum = 0;
        for (let j = 0; j < size; j++) {
            sum += data[i - j];
        }
        return sum / size;
    });
}
|