loubnabnl HF Staff committed on
Commit
0df1973
·
2 Parent(s): e6b21ff f7efb82

Merge branch 'main' of https://huggingface.co/spaces/HuggingFaceFW/blogpost-fineweb-v1

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. assets/data/clustering/data.csv +0 -0
  2. assets/data/plots/edu-100k/arc_acc_norm.json +1 -1
  3. assets/data/plots/edu-100k/hellaswag_acc_norm.json +1 -1
  4. assets/data/plots/edu-100k/mmlu_acc_norm.json +1 -1
  5. assets/data/plots/edu-100k/openbookqa_acc_norm.json +1 -1
  6. assets/data/plots/edu-100k/piqa_acc_norm.json +1 -1
  7. assets/data/plots/edu-100k/siqa_acc_norm.json +1 -1
  8. assets/data/plots/edu-100k/winogrande_acc_norm.json +1 -1
  9. assets/images/Untitled 1.png +0 -0
  10. assets/images/Untitled 3.png +0 -0
  11. assets/images/Untitled 4.png +0 -0
  12. assets/images/Untitled 5.png +0 -0
  13. assets/images/Untitled 6.png +0 -0
  14. assets/images/Untitled.png +0 -0
  15. assets/images/c4_filters_hellaswag.png +0 -0
  16. assets/images/clusters.png +0 -0
  17. assets/images/cross_ind_unfiltered_comparison.png +0 -0
  18. assets/images/custom_filters.png +0 -0
  19. assets/images/dataset_ablations.png +0 -0
  20. assets/images/dedup_all_dumps_bad.png +0 -0
  21. assets/images/dedup_attempts.png +0 -0
  22. assets/images/dedup_impact_simulation.png +0 -0
  23. assets/images/duplicates_simul.png +0 -0
  24. assets/images/edu-100k.png +0 -0
  25. assets/images/edu-8k.png +0 -0
  26. assets/images/filtering_steps.png +0 -0
  27. assets/images/minhash_parameters_comparison.png +0 -0
  28. assets/images/minhash_params.png +0 -0
  29. assets/images/removed_data_cross_dedup.png +0 -0
  30. assets/images/score_by_dump.png +0 -0
  31. assets/images/stats.png +0 -0
  32. assets/images/wet_comparison.png +0 -0
  33. dist/assets/data/clustering/data.csv +0 -0
  34. dist/assets/data/plots/edu-100k/arc_acc_norm.json +1 -1
  35. dist/assets/data/plots/edu-100k/hellaswag_acc_norm.json +1 -1
  36. dist/assets/data/plots/edu-100k/mmlu_acc_norm.json +1 -1
  37. dist/assets/data/plots/edu-100k/openbookqa_acc_norm.json +1 -1
  38. dist/assets/data/plots/edu-100k/piqa_acc_norm.json +1 -1
  39. dist/assets/data/plots/edu-100k/siqa_acc_norm.json +1 -1
  40. dist/assets/data/plots/edu-100k/winogrande_acc_norm.json +1 -1
  41. dist/assets/images/Untitled 1.png +0 -0
  42. dist/assets/images/Untitled 3.png +0 -0
  43. dist/assets/images/Untitled 4.png +0 -0
  44. dist/assets/images/Untitled 5.png +0 -0
  45. dist/assets/images/Untitled 6.png +0 -0
  46. dist/assets/images/Untitled.png +0 -0
  47. dist/assets/images/c4_filters_hellaswag.png +0 -0
  48. dist/assets/images/clusters.png +0 -0
  49. dist/assets/images/cross_ind_unfiltered_comparison.png +0 -0
  50. dist/assets/images/custom_filters.png +0 -0
assets/data/clustering/data.csv CHANGED
The diff for this file is too large to render. See raw diff
 
assets/data/plots/edu-100k/arc_acc_norm.json CHANGED
@@ -1 +1 @@
1
- {"data": {"C4": {"x": ["C4"], "y": [0.4390000104904175], "label": "C4"}, "Dolma": {"x": ["Dolma"], "y": [0.4329999983310699], "label": "Dolma"}, "FineWeb": {"x": ["FineWeb"], "y": [0.4404999911785126], "label": "FineWeb"}, "RedPajama2": {"x": ["RedPajama2"], "y": [0.4395000040531158], "label": "RedPajama2"}, "RefinedWeb": {"x": ["RefinedWeb"], "y": [0.4375], "label": "RefinedWeb"}, "SlimPajama": {"x": ["SlimPajama"], "y": [0.4514999985694885], "label": "SlimPajama"}, "The Pile": {"x": ["The Pile"], "y": [0.4090000092983246], "label": "The Pile"}, "FineWeb-Edu": {"x": ["FineWeb-Edu"], "y": [0.5475000143051147], "label": "FineWeb-Edu"}}, "layout": {"showlegend": false, "title": {"text": "Evaluation results at 350B tokens"}, "xaxis": {"title": {"text": "Dataset", "standoff": 30}, "tickangle": 30}, "yaxis": {"range": [0.225, 0.6120000171661377]}, "margin": {"b": 100}}}
 
1
+ {"data": {"C4": {"x": ["C4"], "y": [0.4435000121593475], "label": "C4"}, "Dolma": {"x": ["Dolma"], "y": [0.44200000166893], "label": "Dolma"}, "FineWeb": {"x": ["FineWeb"], "y": [0.4600000083446502], "label": "FineWeb"}, "RedPajama2": {"x": ["RedPajama2"], "y": [0.4494999945163727], "label": "RedPajama2"}, "RefinedWeb": {"x": ["RefinedWeb"], "y": [0.4555000066757202], "label": "RefinedWeb"}, "SlimPajama": {"x": ["SlimPajama"], "y": [0.4605000019073486], "label": "SlimPajama"}, "The Pile": {"x": ["The Pile"], "y": [0.4375], "label": "The Pile"}, "FineWeb-Edu": {"x": ["FineWeb-Edu"], "y": [0.5734999775886536], "label": "FineWeb-Edu"}}, "layout": {"showlegend": false, "title": {"text": "Evaluation results at 350B tokens"}, "xaxis": {"title": {"text": "Dataset", "standoff": 30}, "tickangle": 30}, "yaxis": {"range": [0.225, 0.6431999731063843]}, "margin": {"b": 100}}}
assets/data/plots/edu-100k/hellaswag_acc_norm.json CHANGED
@@ -1 +1 @@
1
- {"data": {"C4": {"x": ["C4"], "y": [0.6010000109672546], "label": "C4"}, "Dolma": {"x": ["Dolma"], "y": [0.5879999995231628], "label": "Dolma"}, "FineWeb": {"x": ["FineWeb"], "y": [0.6060000061988831], "label": "FineWeb"}, "RedPajama2": {"x": ["RedPajama2"], "y": [0.5440000295639038], "label": "RedPajama2"}, "RefinedWeb": {"x": ["RefinedWeb"], "y": [0.5709999799728394], "label": "RefinedWeb"}, "SlimPajama": {"x": ["SlimPajama"], "y": [0.5649999976158142], "label": "SlimPajama"}, "The Pile": {"x": ["The Pile"], "y": [0.515999972820282], "label": "The Pile"}, "FineWeb-Edu": {"x": ["FineWeb-Edu"], "y": [0.5830000042915344], "label": "FineWeb-Edu"}}, "layout": {"showlegend": false, "title": {"text": "Evaluation results at 350B tokens"}, "xaxis": {"title": {"text": "Dataset", "standoff": 30}, "tickangle": 30}, "yaxis": {"range": [0.225, 0.6822000074386597]}, "margin": {"b": 100}}}
 
1
+ {"data": {"C4": {"x": ["C4"], "y": [0.6389999985694885], "label": "C4"}, "Dolma": {"x": ["Dolma"], "y": [0.6159999966621399], "label": "Dolma"}, "FineWeb": {"x": ["FineWeb"], "y": [0.6269999742507935], "label": "FineWeb"}, "RedPajama2": {"x": ["RedPajama2"], "y": [0.5600000023841858], "label": "RedPajama2"}, "RefinedWeb": {"x": ["RefinedWeb"], "y": [0.6019999980926514], "label": "RefinedWeb"}, "SlimPajama": {"x": ["SlimPajama"], "y": [0.5839999914169312], "label": "SlimPajama"}, "The Pile": {"x": ["The Pile"], "y": [0.5569999814033508], "label": "The Pile"}, "FineWeb-Edu": {"x": ["FineWeb-Edu"], "y": [0.597000002861023], "label": "FineWeb-Edu"}}, "layout": {"showlegend": false, "title": {"text": "Evaluation results at 350B tokens"}, "xaxis": {"title": {"text": "Dataset", "standoff": 30}, "tickangle": 30}, "yaxis": {"range": [0.225, 0.7217999982833863]}, "margin": {"b": 100}}}
assets/data/plots/edu-100k/mmlu_acc_norm.json CHANGED
@@ -1 +1 @@
1
- {"data": {"C4": {"x": ["C4"], "y": [0.3100601136684418], "label": "C4"}, "Dolma": {"x": ["Dolma"], "y": [0.3082475662231445], "label": "Dolma"}, "FineWeb": {"x": ["FineWeb"], "y": [0.3212694227695465], "label": "FineWeb"}, "RedPajama2": {"x": ["RedPajama2"], "y": [0.3182428181171417], "label": "RedPajama2"}, "RefinedWeb": {"x": ["RefinedWeb"], "y": [0.3192791938781738], "label": "RefinedWeb"}, "SlimPajama": {"x": ["SlimPajama"], "y": [0.3266949653625488], "label": "SlimPajama"}, "The Pile": {"x": ["The Pile"], "y": [0.3129254281520843], "label": "The Pile"}, "FineWeb-Edu": {"x": ["FineWeb-Edu"], "y": [0.3591959178447723], "label": "FineWeb-Edu"}}, "layout": {"showlegend": false, "title": {"text": "Evaluation results at 350B tokens"}, "xaxis": {"title": {"text": "Dataset", "standoff": 30}, "tickangle": 30}, "yaxis": {"range": [0.225, 0.38603510141372677]}, "margin": {"b": 100}}}
 
1
+ {"data": {"C4": {"x": ["C4"], "y": [0.3162081837654114], "label": "C4"}, "Dolma": {"x": ["Dolma"], "y": [0.3209713697433471], "label": "Dolma"}, "FineWeb": {"x": ["FineWeb"], "y": [0.3296935856342315], "label": "FineWeb"}, "RedPajama2": {"x": ["RedPajama2"], "y": [0.3293801844120025], "label": "RedPajama2"}, "RefinedWeb": {"x": ["RefinedWeb"], "y": [0.3323083519935608], "label": "RefinedWeb"}, "SlimPajama": {"x": ["SlimPajama"], "y": [0.3337143063545227], "label": "SlimPajama"}, "The Pile": {"x": ["The Pile"], "y": [0.3308100700378418], "label": "The Pile"}, "FineWeb-Edu": {"x": ["FineWeb-Edu"], "y": [0.3744533956050873], "label": "FineWeb-Edu"}}, "layout": {"showlegend": false, "title": {"text": "Evaluation results at 350B tokens"}, "xaxis": {"title": {"text": "Dataset", "standoff": 30}, "tickangle": 30}, "yaxis": {"range": [0.225, 0.40434407472610473]}, "margin": {"b": 100}}}
assets/data/plots/edu-100k/openbookqa_acc_norm.json CHANGED
@@ -1 +1 @@
1
- {"data": {"C4": {"x": ["C4"], "y": [0.3740000128746032], "label": "C4"}, "Dolma": {"x": ["Dolma"], "y": [0.3499999940395355], "label": "Dolma"}, "FineWeb": {"x": ["FineWeb"], "y": [0.3680000007152557], "label": "FineWeb"}, "RedPajama2": {"x": ["RedPajama2"], "y": [0.3339999914169311], "label": "RedPajama2"}, "RefinedWeb": {"x": ["RefinedWeb"], "y": [0.3459999859333038], "label": "RefinedWeb"}, "SlimPajama": {"x": ["SlimPajama"], "y": [0.3499999940395355], "label": "SlimPajama"}, "The Pile": {"x": ["The Pile"], "y": [0.3339999914169311], "label": "The Pile"}, "FineWeb-Edu": {"x": ["FineWeb-Edu"], "y": [0.4000000059604645], "label": "FineWeb-Edu"}}, "layout": {"showlegend": false, "title": {"text": "Evaluation results at 350B tokens"}, "xaxis": {"title": {"text": "Dataset", "standoff": 30}, "tickangle": 30}, "yaxis": {"range": [0.225, 0.4350000071525574]}, "margin": {"b": 100}}}
 
1
+ {"data": {"C4": {"x": ["C4"], "y": [0.3720000088214874], "label": "C4"}, "Dolma": {"x": ["Dolma"], "y": [0.3799999952316284], "label": "Dolma"}, "FineWeb": {"x": ["FineWeb"], "y": [0.3959999978542328], "label": "FineWeb"}, "RedPajama2": {"x": ["RedPajama2"], "y": [0.3540000021457672], "label": "RedPajama2"}, "RefinedWeb": {"x": ["RefinedWeb"], "y": [0.356000006198883], "label": "RefinedWeb"}, "SlimPajama": {"x": ["SlimPajama"], "y": [0.3459999859333038], "label": "SlimPajama"}, "The Pile": {"x": ["The Pile"], "y": [0.356000006198883], "label": "The Pile"}, "FineWeb-Edu": {"x": ["FineWeb-Edu"], "y": [0.4180000126361847], "label": "FineWeb-Edu"}}, "layout": {"showlegend": false, "title": {"text": "Evaluation results at 350B tokens"}, "xaxis": {"title": {"text": "Dataset", "standoff": 30}, "tickangle": 30}, "yaxis": {"range": [0.225, 0.45660001516342164]}, "margin": {"b": 100}}}
assets/data/plots/edu-100k/piqa_acc_norm.json CHANGED
@@ -1 +1 @@
1
- {"data": {"C4": {"x": ["C4"], "y": [0.7739999890327454], "label": "C4"}, "Dolma": {"x": ["Dolma"], "y": [0.7549999952316284], "label": "Dolma"}, "FineWeb": {"x": ["FineWeb"], "y": [0.7580000162124634], "label": "FineWeb"}, "RedPajama2": {"x": ["RedPajama2"], "y": [0.7239999771118164], "label": "RedPajama2"}, "RefinedWeb": {"x": ["RefinedWeb"], "y": [0.7710000276565552], "label": "RefinedWeb"}, "SlimPajama": {"x": ["SlimPajama"], "y": [0.7379999756813049], "label": "SlimPajama"}, "The Pile": {"x": ["The Pile"], "y": [0.7179999947547913], "label": "The Pile"}, "FineWeb-Edu": {"x": ["FineWeb-Edu"], "y": [0.7590000033378601], "label": "FineWeb-Edu"}}, "layout": {"showlegend": false, "title": {"text": "Evaluation results at 350B tokens"}, "xaxis": {"title": {"text": "Dataset", "standoff": 30}, "tickangle": 30}, "yaxis": {"range": [0.45, 0.8387999868392945]}, "margin": {"b": 100}}}
 
1
+ {"data": {"C4": {"x": ["C4"], "y": [0.7710000276565552], "label": "C4"}, "Dolma": {"x": ["Dolma"], "y": [0.7689999938011169], "label": "Dolma"}, "FineWeb": {"x": ["FineWeb"], "y": [0.7609999775886536], "label": "FineWeb"}, "RedPajama2": {"x": ["RedPajama2"], "y": [0.7310000061988831], "label": "RedPajama2"}, "RefinedWeb": {"x": ["RefinedWeb"], "y": [0.7730000019073486], "label": "RefinedWeb"}, "SlimPajama": {"x": ["SlimPajama"], "y": [0.7570000290870667], "label": "SlimPajama"}, "The Pile": {"x": ["The Pile"], "y": [0.7200000286102295], "label": "The Pile"}, "FineWeb-Edu": {"x": ["FineWeb-Edu"], "y": [0.7689999938011169], "label": "FineWeb-Edu"}}, "layout": {"showlegend": false, "title": {"text": "Evaluation results at 350B tokens"}, "xaxis": {"title": {"text": "Dataset", "standoff": 30}, "tickangle": 30}, "yaxis": {"range": [0.45, 0.8376000022888184]}, "margin": {"b": 100}}}
assets/data/plots/edu-100k/siqa_acc_norm.json CHANGED
@@ -1 +1 @@
1
- {"data": {"C4": {"x": ["C4"], "y": [0.4009999930858612], "label": "C4"}, "Dolma": {"x": ["Dolma"], "y": [0.4000000059604645], "label": "Dolma"}, "FineWeb": {"x": ["FineWeb"], "y": [0.4059999883174896], "label": "FineWeb"}, "RedPajama2": {"x": ["RedPajama2"], "y": [0.4120000004768371], "label": "RedPajama2"}, "RefinedWeb": {"x": ["RefinedWeb"], "y": [0.4129999876022339], "label": "RefinedWeb"}, "SlimPajama": {"x": ["SlimPajama"], "y": [0.3860000073909759], "label": "SlimPajama"}, "The Pile": {"x": ["The Pile"], "y": [0.3869999945163727], "label": "The Pile"}, "FineWeb-Edu": {"x": ["FineWeb-Edu"], "y": [0.4099999964237213], "label": "FineWeb-Edu"}}, "layout": {"showlegend": false, "title": {"text": "Evaluation results at 350B tokens"}, "xaxis": {"title": {"text": "Dataset", "standoff": 30}, "tickangle": 30}, "yaxis": {"range": [0.29700000000000004, 0.43619998512268066]}, "margin": {"b": 100}}}
 
1
+ {"data": {"C4": {"x": ["C4"], "y": [0.4009999930858612], "label": "C4"}, "Dolma": {"x": ["Dolma"], "y": [0.3989999890327453], "label": "Dolma"}, "FineWeb": {"x": ["FineWeb"], "y": [0.414000004529953], "label": "FineWeb"}, "RedPajama2": {"x": ["RedPajama2"], "y": [0.4059999883174896], "label": "RedPajama2"}, "RefinedWeb": {"x": ["RefinedWeb"], "y": [0.4099999964237213], "label": "RefinedWeb"}, "SlimPajama": {"x": ["SlimPajama"], "y": [0.402999997138977], "label": "SlimPajama"}, "The Pile": {"x": ["The Pile"], "y": [0.4020000100135803], "label": "The Pile"}, "FineWeb-Edu": {"x": ["FineWeb-Edu"], "y": [0.4040000140666961], "label": "FineWeb-Edu"}}, "layout": {"showlegend": false, "title": {"text": "Evaluation results at 350B tokens"}, "xaxis": {"title": {"text": "Dataset", "standoff": 30}, "tickangle": 30}, "yaxis": {"range": [0.29700000000000004, 0.4374000054359436]}, "margin": {"b": 100}}}
assets/data/plots/edu-100k/winogrande_acc_norm.json CHANGED
@@ -1 +1 @@
1
- {"data": {"C4": {"x": ["C4"], "y": [0.5490000247955322], "label": "C4"}, "Dolma": {"x": ["Dolma"], "y": [0.550000011920929], "label": "Dolma"}, "FineWeb": {"x": ["FineWeb"], "y": [0.5559999942779541], "label": "FineWeb"}, "RedPajama2": {"x": ["RedPajama2"], "y": [0.5509999990463257], "label": "RedPajama2"}, "RefinedWeb": {"x": ["RefinedWeb"], "y": [0.5680000185966492], "label": "RefinedWeb"}, "SlimPajama": {"x": ["SlimPajama"], "y": [0.5519999861717224], "label": "SlimPajama"}, "The Pile": {"x": ["The Pile"], "y": [0.5350000262260437], "label": "The Pile"}, "FineWeb-Edu": {"x": ["FineWeb-Edu"], "y": [0.5720000267028809], "label": "FineWeb-Edu"}}, "layout": {"showlegend": false, "title": {"text": "Evaluation results at 350B tokens"}, "xaxis": {"title": {"text": "Dataset", "standoff": 30}, "tickangle": 30}, "yaxis": {"range": [0.45, 0.596400032043457]}, "margin": {"b": 100}}}
 
1
+ {"data": {"C4": {"x": ["C4"], "y": [0.5609999895095825], "label": "C4"}, "Dolma": {"x": ["Dolma"], "y": [0.5550000071525574], "label": "Dolma"}, "FineWeb": {"x": ["FineWeb"], "y": [0.5640000104904175], "label": "FineWeb"}, "RedPajama2": {"x": ["RedPajama2"], "y": [0.5490000247955322], "label": "RedPajama2"}, "RefinedWeb": {"x": ["RefinedWeb"], "y": [0.5540000200271606], "label": "RefinedWeb"}, "SlimPajama": {"x": ["SlimPajama"], "y": [0.5479999780654907], "label": "SlimPajama"}, "The Pile": {"x": ["The Pile"], "y": [0.5590000152587891], "label": "The Pile"}, "FineWeb-Edu": {"x": ["FineWeb-Edu"], "y": [0.578000009059906], "label": "FineWeb-Edu"}}, "layout": {"showlegend": false, "title": {"text": "Evaluation results at 350B tokens"}, "xaxis": {"title": {"text": "Dataset", "standoff": 30}, "tickangle": 30}, "yaxis": {"range": [0.45, 0.6036000108718872]}, "margin": {"b": 100}}}
assets/images/Untitled 1.png DELETED
Binary file (139 kB)
 
assets/images/Untitled 3.png DELETED
Binary file (551 kB)
 
assets/images/Untitled 4.png DELETED
Binary file (483 kB)
 
assets/images/Untitled 5.png DELETED
Binary file (475 kB)
 
assets/images/Untitled 6.png DELETED
Binary file (116 kB)
 
assets/images/Untitled.png DELETED
Binary file (309 kB)
 
assets/images/c4_filters_hellaswag.png CHANGED
assets/images/clusters.png CHANGED
assets/images/cross_ind_unfiltered_comparison.png CHANGED
assets/images/custom_filters.png CHANGED
assets/images/dataset_ablations.png CHANGED
assets/images/dedup_all_dumps_bad.png CHANGED
assets/images/dedup_attempts.png CHANGED
assets/images/dedup_impact_simulation.png DELETED
Binary file (123 kB)
 
assets/images/duplicates_simul.png ADDED
assets/images/edu-100k.png CHANGED
assets/images/edu-8k.png CHANGED
assets/images/filtering_steps.png CHANGED
assets/images/minhash_parameters_comparison.png DELETED
Binary file (35.3 kB)
 
assets/images/minhash_params.png ADDED
assets/images/removed_data_cross_dedup.png CHANGED
assets/images/score_by_dump.png DELETED
Binary file (400 kB)
 
assets/images/stats.png ADDED
assets/images/wet_comparison.png CHANGED
dist/assets/data/clustering/data.csv CHANGED
The diff for this file is too large to render. See raw diff
 
dist/assets/data/plots/edu-100k/arc_acc_norm.json CHANGED
@@ -1 +1 @@
1
- {"data":{"C4":{"x":["C4"],"y":[0.4390000104904175],"label":"C4"},"Dolma":{"x":["Dolma"],"y":[0.4329999983310699],"label":"Dolma"},"FineWeb":{"x":["FineWeb"],"y":[0.4404999911785126],"label":"FineWeb"},"RedPajama2":{"x":["RedPajama2"],"y":[0.4395000040531158],"label":"RedPajama2"},"RefinedWeb":{"x":["RefinedWeb"],"y":[0.4375],"label":"RefinedWeb"},"SlimPajama":{"x":["SlimPajama"],"y":[0.4514999985694885],"label":"SlimPajama"},"The Pile":{"x":["The Pile"],"y":[0.4090000092983246],"label":"The Pile"},"FineWeb-Edu":{"x":["FineWeb-Edu"],"y":[0.5475000143051147],"label":"FineWeb-Edu"}},"layout":{"showlegend":false,"title":{"text":"Evaluation results at 350B tokens"},"xaxis":{"title":{"text":"Dataset","standoff":30},"tickangle":30},"yaxis":{"range":[0.225,0.6120000171661377]},"margin":{"b":100}}}
 
1
+ {"data":{"C4":{"x":["C4"],"y":[0.4435000121593475],"label":"C4"},"Dolma":{"x":["Dolma"],"y":[0.44200000166893],"label":"Dolma"},"FineWeb":{"x":["FineWeb"],"y":[0.4600000083446502],"label":"FineWeb"},"RedPajama2":{"x":["RedPajama2"],"y":[0.4494999945163727],"label":"RedPajama2"},"RefinedWeb":{"x":["RefinedWeb"],"y":[0.4555000066757202],"label":"RefinedWeb"},"SlimPajama":{"x":["SlimPajama"],"y":[0.4605000019073486],"label":"SlimPajama"},"The Pile":{"x":["The Pile"],"y":[0.4375],"label":"The Pile"},"FineWeb-Edu":{"x":["FineWeb-Edu"],"y":[0.5734999775886536],"label":"FineWeb-Edu"}},"layout":{"showlegend":false,"title":{"text":"Evaluation results at 350B tokens"},"xaxis":{"title":{"text":"Dataset","standoff":30},"tickangle":30},"yaxis":{"range":[0.225,0.6431999731063843]},"margin":{"b":100}}}
dist/assets/data/plots/edu-100k/hellaswag_acc_norm.json CHANGED
@@ -1 +1 @@
1
- {"data":{"C4":{"x":["C4"],"y":[0.6010000109672546],"label":"C4"},"Dolma":{"x":["Dolma"],"y":[0.5879999995231628],"label":"Dolma"},"FineWeb":{"x":["FineWeb"],"y":[0.6060000061988831],"label":"FineWeb"},"RedPajama2":{"x":["RedPajama2"],"y":[0.5440000295639038],"label":"RedPajama2"},"RefinedWeb":{"x":["RefinedWeb"],"y":[0.5709999799728394],"label":"RefinedWeb"},"SlimPajama":{"x":["SlimPajama"],"y":[0.5649999976158142],"label":"SlimPajama"},"The Pile":{"x":["The Pile"],"y":[0.515999972820282],"label":"The Pile"},"FineWeb-Edu":{"x":["FineWeb-Edu"],"y":[0.5830000042915344],"label":"FineWeb-Edu"}},"layout":{"showlegend":false,"title":{"text":"Evaluation results at 350B tokens"},"xaxis":{"title":{"text":"Dataset","standoff":30},"tickangle":30},"yaxis":{"range":[0.225,0.6822000074386597]},"margin":{"b":100}}}
 
1
+ {"data":{"C4":{"x":["C4"],"y":[0.6389999985694885],"label":"C4"},"Dolma":{"x":["Dolma"],"y":[0.6159999966621399],"label":"Dolma"},"FineWeb":{"x":["FineWeb"],"y":[0.6269999742507935],"label":"FineWeb"},"RedPajama2":{"x":["RedPajama2"],"y":[0.5600000023841858],"label":"RedPajama2"},"RefinedWeb":{"x":["RefinedWeb"],"y":[0.6019999980926514],"label":"RefinedWeb"},"SlimPajama":{"x":["SlimPajama"],"y":[0.5839999914169312],"label":"SlimPajama"},"The Pile":{"x":["The Pile"],"y":[0.5569999814033508],"label":"The Pile"},"FineWeb-Edu":{"x":["FineWeb-Edu"],"y":[0.597000002861023],"label":"FineWeb-Edu"}},"layout":{"showlegend":false,"title":{"text":"Evaluation results at 350B tokens"},"xaxis":{"title":{"text":"Dataset","standoff":30},"tickangle":30},"yaxis":{"range":[0.225,0.7217999982833863]},"margin":{"b":100}}}
dist/assets/data/plots/edu-100k/mmlu_acc_norm.json CHANGED
@@ -1 +1 @@
1
- {"data":{"C4":{"x":["C4"],"y":[0.3100601136684418],"label":"C4"},"Dolma":{"x":["Dolma"],"y":[0.3082475662231445],"label":"Dolma"},"FineWeb":{"x":["FineWeb"],"y":[0.3212694227695465],"label":"FineWeb"},"RedPajama2":{"x":["RedPajama2"],"y":[0.3182428181171417],"label":"RedPajama2"},"RefinedWeb":{"x":["RefinedWeb"],"y":[0.3192791938781738],"label":"RefinedWeb"},"SlimPajama":{"x":["SlimPajama"],"y":[0.3266949653625488],"label":"SlimPajama"},"The Pile":{"x":["The Pile"],"y":[0.3129254281520843],"label":"The Pile"},"FineWeb-Edu":{"x":["FineWeb-Edu"],"y":[0.3591959178447723],"label":"FineWeb-Edu"}},"layout":{"showlegend":false,"title":{"text":"Evaluation results at 350B tokens"},"xaxis":{"title":{"text":"Dataset","standoff":30},"tickangle":30},"yaxis":{"range":[0.225,0.38603510141372677]},"margin":{"b":100}}}
 
1
+ {"data":{"C4":{"x":["C4"],"y":[0.3162081837654114],"label":"C4"},"Dolma":{"x":["Dolma"],"y":[0.3209713697433471],"label":"Dolma"},"FineWeb":{"x":["FineWeb"],"y":[0.3296935856342315],"label":"FineWeb"},"RedPajama2":{"x":["RedPajama2"],"y":[0.3293801844120025],"label":"RedPajama2"},"RefinedWeb":{"x":["RefinedWeb"],"y":[0.3323083519935608],"label":"RefinedWeb"},"SlimPajama":{"x":["SlimPajama"],"y":[0.3337143063545227],"label":"SlimPajama"},"The Pile":{"x":["The Pile"],"y":[0.3308100700378418],"label":"The Pile"},"FineWeb-Edu":{"x":["FineWeb-Edu"],"y":[0.3744533956050873],"label":"FineWeb-Edu"}},"layout":{"showlegend":false,"title":{"text":"Evaluation results at 350B tokens"},"xaxis":{"title":{"text":"Dataset","standoff":30},"tickangle":30},"yaxis":{"range":[0.225,0.40434407472610473]},"margin":{"b":100}}}
dist/assets/data/plots/edu-100k/openbookqa_acc_norm.json CHANGED
@@ -1 +1 @@
1
- {"data":{"C4":{"x":["C4"],"y":[0.3740000128746032],"label":"C4"},"Dolma":{"x":["Dolma"],"y":[0.3499999940395355],"label":"Dolma"},"FineWeb":{"x":["FineWeb"],"y":[0.3680000007152557],"label":"FineWeb"},"RedPajama2":{"x":["RedPajama2"],"y":[0.3339999914169311],"label":"RedPajama2"},"RefinedWeb":{"x":["RefinedWeb"],"y":[0.3459999859333038],"label":"RefinedWeb"},"SlimPajama":{"x":["SlimPajama"],"y":[0.3499999940395355],"label":"SlimPajama"},"The Pile":{"x":["The Pile"],"y":[0.3339999914169311],"label":"The Pile"},"FineWeb-Edu":{"x":["FineWeb-Edu"],"y":[0.4000000059604645],"label":"FineWeb-Edu"}},"layout":{"showlegend":false,"title":{"text":"Evaluation results at 350B tokens"},"xaxis":{"title":{"text":"Dataset","standoff":30},"tickangle":30},"yaxis":{"range":[0.225,0.4350000071525574]},"margin":{"b":100}}}
 
1
+ {"data":{"C4":{"x":["C4"],"y":[0.3720000088214874],"label":"C4"},"Dolma":{"x":["Dolma"],"y":[0.3799999952316284],"label":"Dolma"},"FineWeb":{"x":["FineWeb"],"y":[0.3959999978542328],"label":"FineWeb"},"RedPajama2":{"x":["RedPajama2"],"y":[0.3540000021457672],"label":"RedPajama2"},"RefinedWeb":{"x":["RefinedWeb"],"y":[0.356000006198883],"label":"RefinedWeb"},"SlimPajama":{"x":["SlimPajama"],"y":[0.3459999859333038],"label":"SlimPajama"},"The Pile":{"x":["The Pile"],"y":[0.356000006198883],"label":"The Pile"},"FineWeb-Edu":{"x":["FineWeb-Edu"],"y":[0.4180000126361847],"label":"FineWeb-Edu"}},"layout":{"showlegend":false,"title":{"text":"Evaluation results at 350B tokens"},"xaxis":{"title":{"text":"Dataset","standoff":30},"tickangle":30},"yaxis":{"range":[0.225,0.45660001516342164]},"margin":{"b":100}}}
dist/assets/data/plots/edu-100k/piqa_acc_norm.json CHANGED
@@ -1 +1 @@
1
- {"data":{"C4":{"x":["C4"],"y":[0.7739999890327454],"label":"C4"},"Dolma":{"x":["Dolma"],"y":[0.7549999952316284],"label":"Dolma"},"FineWeb":{"x":["FineWeb"],"y":[0.7580000162124634],"label":"FineWeb"},"RedPajama2":{"x":["RedPajama2"],"y":[0.7239999771118164],"label":"RedPajama2"},"RefinedWeb":{"x":["RefinedWeb"],"y":[0.7710000276565552],"label":"RefinedWeb"},"SlimPajama":{"x":["SlimPajama"],"y":[0.7379999756813049],"label":"SlimPajama"},"The Pile":{"x":["The Pile"],"y":[0.7179999947547913],"label":"The Pile"},"FineWeb-Edu":{"x":["FineWeb-Edu"],"y":[0.7590000033378601],"label":"FineWeb-Edu"}},"layout":{"showlegend":false,"title":{"text":"Evaluation results at 350B tokens"},"xaxis":{"title":{"text":"Dataset","standoff":30},"tickangle":30},"yaxis":{"range":[0.45,0.8387999868392945]},"margin":{"b":100}}}
 
1
+ {"data":{"C4":{"x":["C4"],"y":[0.7710000276565552],"label":"C4"},"Dolma":{"x":["Dolma"],"y":[0.7689999938011169],"label":"Dolma"},"FineWeb":{"x":["FineWeb"],"y":[0.7609999775886536],"label":"FineWeb"},"RedPajama2":{"x":["RedPajama2"],"y":[0.7310000061988831],"label":"RedPajama2"},"RefinedWeb":{"x":["RefinedWeb"],"y":[0.7730000019073486],"label":"RefinedWeb"},"SlimPajama":{"x":["SlimPajama"],"y":[0.7570000290870667],"label":"SlimPajama"},"The Pile":{"x":["The Pile"],"y":[0.7200000286102295],"label":"The Pile"},"FineWeb-Edu":{"x":["FineWeb-Edu"],"y":[0.7689999938011169],"label":"FineWeb-Edu"}},"layout":{"showlegend":false,"title":{"text":"Evaluation results at 350B tokens"},"xaxis":{"title":{"text":"Dataset","standoff":30},"tickangle":30},"yaxis":{"range":[0.45,0.8376000022888184]},"margin":{"b":100}}}
dist/assets/data/plots/edu-100k/siqa_acc_norm.json CHANGED
@@ -1 +1 @@
1
- {"data":{"C4":{"x":["C4"],"y":[0.4009999930858612],"label":"C4"},"Dolma":{"x":["Dolma"],"y":[0.4000000059604645],"label":"Dolma"},"FineWeb":{"x":["FineWeb"],"y":[0.4059999883174896],"label":"FineWeb"},"RedPajama2":{"x":["RedPajama2"],"y":[0.4120000004768371],"label":"RedPajama2"},"RefinedWeb":{"x":["RefinedWeb"],"y":[0.4129999876022339],"label":"RefinedWeb"},"SlimPajama":{"x":["SlimPajama"],"y":[0.3860000073909759],"label":"SlimPajama"},"The Pile":{"x":["The Pile"],"y":[0.3869999945163727],"label":"The Pile"},"FineWeb-Edu":{"x":["FineWeb-Edu"],"y":[0.4099999964237213],"label":"FineWeb-Edu"}},"layout":{"showlegend":false,"title":{"text":"Evaluation results at 350B tokens"},"xaxis":{"title":{"text":"Dataset","standoff":30},"tickangle":30},"yaxis":{"range":[0.29700000000000004,0.43619998512268066]},"margin":{"b":100}}}
 
1
+ {"data":{"C4":{"x":["C4"],"y":[0.4009999930858612],"label":"C4"},"Dolma":{"x":["Dolma"],"y":[0.3989999890327453],"label":"Dolma"},"FineWeb":{"x":["FineWeb"],"y":[0.414000004529953],"label":"FineWeb"},"RedPajama2":{"x":["RedPajama2"],"y":[0.4059999883174896],"label":"RedPajama2"},"RefinedWeb":{"x":["RefinedWeb"],"y":[0.4099999964237213],"label":"RefinedWeb"},"SlimPajama":{"x":["SlimPajama"],"y":[0.402999997138977],"label":"SlimPajama"},"The Pile":{"x":["The Pile"],"y":[0.4020000100135803],"label":"The Pile"},"FineWeb-Edu":{"x":["FineWeb-Edu"],"y":[0.4040000140666961],"label":"FineWeb-Edu"}},"layout":{"showlegend":false,"title":{"text":"Evaluation results at 350B tokens"},"xaxis":{"title":{"text":"Dataset","standoff":30},"tickangle":30},"yaxis":{"range":[0.29700000000000004,0.4374000054359436]},"margin":{"b":100}}}
dist/assets/data/plots/edu-100k/winogrande_acc_norm.json CHANGED
@@ -1 +1 @@
1
- {"data":{"C4":{"x":["C4"],"y":[0.5490000247955322],"label":"C4"},"Dolma":{"x":["Dolma"],"y":[0.550000011920929],"label":"Dolma"},"FineWeb":{"x":["FineWeb"],"y":[0.5559999942779541],"label":"FineWeb"},"RedPajama2":{"x":["RedPajama2"],"y":[0.5509999990463257],"label":"RedPajama2"},"RefinedWeb":{"x":["RefinedWeb"],"y":[0.5680000185966492],"label":"RefinedWeb"},"SlimPajama":{"x":["SlimPajama"],"y":[0.5519999861717224],"label":"SlimPajama"},"The Pile":{"x":["The Pile"],"y":[0.5350000262260437],"label":"The Pile"},"FineWeb-Edu":{"x":["FineWeb-Edu"],"y":[0.5720000267028809],"label":"FineWeb-Edu"}},"layout":{"showlegend":false,"title":{"text":"Evaluation results at 350B tokens"},"xaxis":{"title":{"text":"Dataset","standoff":30},"tickangle":30},"yaxis":{"range":[0.45,0.596400032043457]},"margin":{"b":100}}}
 
1
+ {"data":{"C4":{"x":["C4"],"y":[0.5609999895095825],"label":"C4"},"Dolma":{"x":["Dolma"],"y":[0.5550000071525574],"label":"Dolma"},"FineWeb":{"x":["FineWeb"],"y":[0.5640000104904175],"label":"FineWeb"},"RedPajama2":{"x":["RedPajama2"],"y":[0.5490000247955322],"label":"RedPajama2"},"RefinedWeb":{"x":["RefinedWeb"],"y":[0.5540000200271606],"label":"RefinedWeb"},"SlimPajama":{"x":["SlimPajama"],"y":[0.5479999780654907],"label":"SlimPajama"},"The Pile":{"x":["The Pile"],"y":[0.5590000152587891],"label":"The Pile"},"FineWeb-Edu":{"x":["FineWeb-Edu"],"y":[0.578000009059906],"label":"FineWeb-Edu"}},"layout":{"showlegend":false,"title":{"text":"Evaluation results at 350B tokens"},"xaxis":{"title":{"text":"Dataset","standoff":30},"tickangle":30},"yaxis":{"range":[0.45,0.6036000108718872]},"margin":{"b":100}}}
dist/assets/images/Untitled 1.png DELETED
Binary file (139 kB)
 
dist/assets/images/Untitled 3.png DELETED
Binary file (551 kB)
 
dist/assets/images/Untitled 4.png DELETED
Binary file (483 kB)
 
dist/assets/images/Untitled 5.png DELETED
Binary file (475 kB)
 
dist/assets/images/Untitled 6.png DELETED
Binary file (116 kB)
 
dist/assets/images/Untitled.png DELETED
Binary file (309 kB)
 
dist/assets/images/c4_filters_hellaswag.png CHANGED
dist/assets/images/clusters.png CHANGED
dist/assets/images/cross_ind_unfiltered_comparison.png CHANGED
dist/assets/images/custom_filters.png CHANGED