blogpost-fineweb-v1 / data /plots /wet_comparison /commonsense_qa_acc_norm.json
hynky's picture
hynky HF staff
new plotting code (JIT)
0932e7b
raw
history blame
1.28 kB
{"data":{"ind_minhash-CC-MAIN-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.26000000536441803,0.291499987244606,0.3019999861717224,0.3034999966621399,0.3109999895095825,0.31599999964237213,0.3254999965429306,0.3210000097751617,0.3320000022649765,0.33449999988079065,0.3344999998807907,0.3334999978542328,0.3349999934434891,0.3385000079870224],"label":"Extracted from WARC"},"wet-extraction-2019-18":{"x":[0.0,2.0971520000000003,4.194304000000001,6.291456,8.388608000000001,10.48576,12.582912,14.680064000000002,16.777216000000003,18.874368,20.97152,23.068672000000003,25.165824,27.262976000000002,28.311552000000002],"y":[0.2329999953508377,0.24249999970197675,0.2705000042915344,0.27549999952316284,0.28700000047683716,0.28449998795986176,0.29099999368190765,0.2979999929666519,0.3075000047683716,0.30550000071525574,0.3079999983310699,0.3110000044107437,0.30949999392032623,0.3114999979734421,0.3100000023841858],"label":"WET data"}},"layout":{"xaxis":{"title":{"text":"Training tokens (billions)"}},"title":{"text":"WET data is worse than data extracted from WARC"}}}