<!doctype html>

<head>
    <script src="https://distill.pub/template.v2.js"></script>
    <script src="https://cdnjs.cloudflare.com/ajax/libs/mathjs/12.4.2/math.min.js" charset="utf-8"></script>
    <script src="https://cdn.plot.ly/plotly-2.32.0.min.js" charset="utf-8"></script>
    <script src="https://cdnjs.cloudflare.com/ajax/libs/lodash.js/4.17.21/lodash.min.js" charset="utf-8"></script>
    <script type="module" src="src/plotting.js"></script>
    <link rel="stylesheet" href="style.css">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <meta charset="utf8">
    <title>FineWeb: 15T tokens of high quality web data</title>
    <style>

        /* ****************************************
         * TOC
         ******************************************/
        @media (max-width: 1199px) {
            d-contents {
                display: none;
                justify-self: start;
                align-self: start;
                padding-bottom: 0.5em;
                margin-bottom: 1em;
                padding-left: 0.25em;
                border-bottom: 1px solid rgba(0, 0, 0, 0.1);
                border-bottom-width: 1px;
                border-bottom-style: solid;
                border-bottom-color: rgba(0, 0, 0, 0.1);
            }
        }

        d-contents a:hover {
            border-bottom: none;
        }


        @media (min-width: 1200px) {
            d-article {
                /* Ensure d-article does not prevent sticky positioning */
                overflow: visible;
            }

            d-contents {
                align-self: start;
                grid-column-start: 1 !important;
                grid-column-end: 4 !important;
                grid-row: auto / span 6;
                justify-self: end;
                margin-top: 0em;
                padding-right: 3em;
                padding-left: 2em;
                border-right: 1px solid rgba(0, 0, 0, 0.1);
                border-right-width: 1px;
                border-right-style: solid;
                border-right-color: rgba(0, 0, 0, 0.1);
                position: -webkit-sticky; /* For Safari */
                position: sticky;
                top: 10px; /* Adjust this value if needed */
            }
        }

        d-contents nav h3 {
            margin-top: 0;
            margin-bottom: 1em;
        }

        d-contents nav div {
            color: rgba(0, 0, 0, 0.8);
            font-weight: bold;
        }

        d-contents nav a {
            color: rgba(0, 0, 0, 0.8);
            border-bottom: none;
            text-decoration: none;
        }

        d-contents li {
            list-style-type: none;
        }

        d-contents ul, d-article d-contents ul {
            padding-left: 1em;
        }

        d-contents nav ul li {
            margin-bottom: .25em;
        }

        d-contents nav a:hover {
            text-decoration: underline solid rgba(0, 0, 0, 0.6);
        }

        d-contents nav ul {
            margin-top: 0;
            margin-bottom: 6px;
        }


        d-contents nav > div {
            display: block;
            outline: none;
            margin-bottom: 0.5em;
        }

        d-contents nav > div > a {
            font-size: 13px;
            font-weight: 600;
        }

        d-contents nav > div > a:hover,
        d-contents nav > ul > li > a:hover {
            text-decoration: none;
        }

    </style>
</head>

<body>
<d-front-matter>
    <script id='distill-front-matter' type="text/json">{
    "title": "FineWeb: 15T tokens of high quality web data",
    "description": "This blog covers the FineWeb recipe, why more deduplication is not always better and some interesting findings on the difference in quality of CommonCrawl dumps.",
    "published": "May 28, 2024",
    "authors": [
      {
        "author":"Guilherme Penedo",
        "authorURL":"https://huggingface.co/guipenedo",
        "affiliations": [{"name": "HuggingFace"}]
      },
      {
        "author":"Hynek Kydlíček",
        "authorURL":"https://huggingface.co/hynky"
      },
      {
        "author":"Leandro Werra",
        "authorURL":"https://huggingface.co/lvwerra"
      },
      {
        "author":"Thomas Wolf",
        "authorURL":"https://huggingface.co/thomwolf"
      }
    ],
    "katex": {
      "delimiters": [
        {"left": "$$", "right": "$$", "display": false}
      ]
    }
  }
    </script>
</d-front-matter>
<d-title>
    <figure class="l-page">
        <img src="banner.png" alt="FineWeb">
    </figure>
</d-title>
<d-byline></d-byline>
<d-article>
    <d-contents>
    </d-contents>

    <!-- Your JavaScript file -->

    <p>We have recently released 🍷FineWeb, our new large scale
        (15T tokens, 44TB disk space) dataset of clean text sourced from the web for LLM pretraining. You can
        download it <a href="https://huggingface.co/datasets/HuggingFaceFW/fineweb">here</a>.</p>
    <p>As 🍷FineWeb has gathered a lot of interest from the
        community, we decided to further explain the steps involved in creating it, our processing decisions and
        some lessons learned along the way. Read on for all the juicy details on large text dataset creation!</p>
    <p><strong>TLDR:</strong> This blog covers the FineWeb
        recipe, why more deduplication is not always better and some interesting findings on the difference in
        quality of CommonCrawl dumps.</p>

    <h2>General considerations on web data</h2>
    <h3>Sourcing the data</h3>
    <p>A common question we see asked regarding web datasets used
        to train LLMs is “where do they even get all that data?” There are generally two options:</p>
    <ul>
        <li>you either crawl it yourself, like <a
                href="https://platform.openai.com/docs/gptbot">OpenAI</a> or <a
                href="https://darkvisitors.com/agents/claudebot">Anthropic</a> seem to do
        </li>
    </ul>
    <ul>
        <li>you use a public repository of crawled webpages, like the one maintained by
            the non-profit <a href="https://commoncrawl.org/">CommonCrawl</a></li>
    </ul>
    <p>For FineWeb, similarly to what was done for a large number
        of other public datasets, we used <a href="https://commoncrawl.org/">CommonCrawl</a> as a starting point.
        They have been crawling the web since 2007 (long before LLMs were a thing) and release a new dump usually
        every 1 or 2 months, which can be freely downloaded. </p>
    <p>As an example, their latest crawl (2024-10) contains 3.16
        billion web pages, totaling 424.7 TiB of uncompressed content (the size changes from dump to dump). There
        are 95 dumps since 2013 and 3 dumps from 2008 to 2012, which are in a different (older) format.<d-footnote>We have not processed these 3 older dumps.</d-footnote> </p>
    <h3>Processing at scale</h3>
    <p>Given the sheer size of the data involved, one of the main
        challenges we had to overcome was having a modular, scalable codebase that would allow us to quickly iterate
        on our processing decisions and easily try out new ideas, while appropriately parallelizing our workloads
        and providing clear insights into the data. </p>
    <p>For this purpose, we developed <a
            href="https://github.com/huggingface/datatrove"><code>datatrove</code></a><d-cite bibtex-key="penedo2024datatrove"></d-cite>, an open-source data
        processing library that allowed us to seamlessly scale our filtering and deduplication setup to thousands of
        CPU cores. All the data processing steps involved in the creation of FineWeb used this <a
                href="https://github.com/huggingface/datatrove">library</a>.</p>
    <h3>What is clean, good data?</h3>
    <p>This is probably the main question to keep in mind when
        creating a dataset. A good first lesson is that data that would intuitively be considered high quality by a
        human may not be necessarily the best data (or at least not all that you need) to train a good model on.</p>
    <p>It is still common to train a model on a given corpus
        (Wikipedia, or some other web dataset considered clean) and use it to check the perplexity on the dataset
        that we are trying to curate. Unfortunately, this does not always correlate with performance on downstream
        tasks, so another commonly used approach is to train small models (small because training models is
        expensive and time consuming, and we want to be able to iterate quickly) on our dataset and evaluate them on
        a set of evaluation tasks. As we are curating a dataset for pretraining a generalist LLM, it is important to
        choose a diverse set of tasks and try not to overfit to any one individual benchmark.</p>
    <p>Another way to evaluate different datasets would be to
        train a model on each one and have humans rate and compare the outputs of each one (like on the <a
                href="https://chat.lmsys.org/">LMSYS Chatbot Arena</a>)<d-cite bibtex-key="chiang2024chatbot"></d-cite>. This would arguably provide the most
        reliable results in terms of representing real model usage, but getting ablation results this way is too
        expensive and slow.</p>
    <p>The approach we ultimately went with was to train small
        models and evaluate them on a set of benchmark tasks. We believe this is a reasonable proxy for the quality
        of the data used to train these models.</p>
    <h3>Ablations and evaluation setup</h3>
    <p>To be able to compare the impact of a given processing
        step, we would train 2 models, one where the data included the extra step and another where this step was
        ablated (cut/removed). These 2 models would have the same number of parameters and architecture, and be trained
        on an equal number of tokens with the same hyperparameters — the only difference would be in the
        training data. We would then evaluate each model on the same set of tasks and compare the average
        scores.</p>
    <p>Our ablation models were trained using <a
            href="https://github.com/huggingface/nanotron"><code>nanotron</code></a> with this config [<strong>TODO:
        INSERT SIMPLIFIED NANOTRON CONFIG HERE</strong>]. The models had 1.82B parameters, used the Llama
        architecture with a 2048 sequence length, and a global batch size of ~2 million tokens. For filtering
        ablations we mostly trained on ~28B tokens (which is roughly the Chinchilla optimal training size for this
        model size).</p>
    <p>We evaluated the models using <a
            href="https://github.com/huggingface/lighteval/"><code>lighteval</code></a>. We tried selecting
        benchmarks that would provide good signal at a relatively small scale (small models trained on only a few
        billion tokens). Furthermore, we also used the following criteria when selecting benchmarks:</p>
    <ul>
        <li>small variance between runs trained on different samplings of the same
            dataset: we want our runs on a subset of the data to be representative of the whole dataset, and the
            resulting scores to have as little noise as possible
        </li>
    </ul>
    <ul>
        <li>performance increasing monotonically (or close) over a training run:
            ideally, as the number of seen tokens increases, the performance on this benchmark should not decrease
            (should not be too noisy)
        </li>
    </ul>
    <p>To
        obtain results quickly, we capped longer benchmarks at 1000 samples (the wall-clock evaluation takes less than 5
        min on a single node of 8 GPUs and is run in parallel to the training).</p>
    <aside>You can find the full list of tasks and prompts we used <a
            href="https://huggingface.co/datasets/HuggingFaceFW/fineweb/blob/main/lighteval_tasks.py">here</a>.</aside>
    <h2>The FineWeb recipe</h2>
    <p>In the next subsections we will explain each of the steps
        taken to produce the FineWeb dataset.</p>
    <figure class="l-body">
        <img src="plots/fineweb-recipe.png"/>
    </figure>
    <aside>You can find a fully reproducible <code>datatrove</code> config <a
                href="https://github.com/huggingface/datatrove/blob/main/examples/fineweb.py">here</a>.</aside>
    <h3>Starting point: text extraction</h3>
    <p>CommonCrawl data is available in two main formats: WARC
        and WET. <strong>WARC </strong>(Web ARChive format) files contain the raw data from the crawl, including the
        full page HTML and request metadata. <strong>WET</strong> (WARC Encapsulated Text) files provide a text only
        version of those websites.</p>
    <p>A large number of datasets take the WET files as their
        starting point. In our experience the default text extraction (extracting the main text of a webpage from
        its HTML) used to create these WET files is suboptimal, and there are a variety of open-source libraries that
        provide better text extraction (namely, by keeping less boilerplate content such as navigation menus). We extracted
        the text content from the WARC files using the trafilatura library<d-cite bibtex-key="barbaresi-2021-trafilatura"></d-cite>. It is important to note, however, that text extraction is one of the most costly steps of our
        processing, so we believe that using the readily available WET data could be a reasonable trade-off for
        lower budget teams.</p>
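    <p>For readers who want to reproduce this step outside of <code>datatrove</code>, the sketch below shows the general idea of WARC text extraction with <code>warcio</code> and <code>trafilatura</code>. It is a minimal illustration rather than our exact pipeline code, and the WARC file path is a placeholder:</p>
    <pre><code># Minimal sketch: iterate over a CommonCrawl WARC file and extract the main text
# of each page with trafilatura. The datatrove pipeline wraps this logic in
# dedicated reader/extractor blocks; "example.warc.gz" is a placeholder path.
from warcio.archiveiterator import ArchiveIterator
import trafilatura

def extract_warc_texts(warc_path):
    with open(warc_path, "rb") as f:
        for record in ArchiveIterator(f):
            if record.rec_type != "response":
                continue  # skip request/metadata records
            html = record.content_stream().read().decode("utf-8", errors="ignore")
            # trafilatura strips boilerplate (menus, footers) and keeps the main content
            text = trafilatura.extract(html)
            if text:
                yield record.rec_headers.get_header("WARC-Target-URI"), text

for url, text in extract_warc_texts("example.warc.gz"):
    print(url, text[:200])
</code></pre>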
    <p>To validate this decision, we processed the 2019-18 dump
        directly using the WET files and with text extracted from WARC files using trafilatura. We applied the same
        processing to each one (our base filtering+minhash, detailed below) and trained two models. While the
        resulting dataset is considerably larger for the WET data (around 254BT), it proves to be of much worse
        quality than the one that used trafilatura to extract text from WARC files (which is around 200BT). Many of
        these additional tokens on the WET files are unnecessary page boilerplate.</p>
    <div class="main-plot-container">
        <figure><img src="plots/wet_comparison.png"/></figure>
        <div id="plot-wet_comparison"></div>
    </div>

    <h3>Base filtering</h3>
    <p>Filtering is an important part of the curation process. It
        removes part of the data (be it words, lines, or full documents) that would harm performance and is thus
        deemed to be “lower quality”.</p>
    <p>As a basis for our filtering we used part of the setup
        from RefinedWeb<d-cite bibtex-key="penedo2023refinedweb"></d-cite>. Namely, we:</p>
    <ul>
        <li>Applied URL filtering using a <a
                href="https://dsi.ut-capitole.fr/blacklists/">blocklist</a> to remove adult content
        </li>
    </ul>
    <ul>
        <li>Applied a <a
                href="https://fasttext.cc/docs/en/language-identification.html">fastText language classifier</a><d-cite bibtex-key="joulin2016bag"></d-cite><d-cite bibtex-key="joulin2016fasttext"></d-cite> to
            keep only English text with a score ≥ 0.65
        </li>
    </ul>
    <ul>
        <li>Applied quality and repetition filters from the Gopher<d-cite bibtex-key="rae2022scaling"></d-cite> paper (using the default thresholds)
        </li>
    </ul>
    <p>After applying this filtering to each of the text
        extracted dumps (there are currently 95 dumps) we obtained roughly 36 trillion tokens of data (when
        tokenized with the <code>gpt2</code> tokenizer).</p>
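    <p>As an illustration of the language filtering step, here is a minimal sketch using the fastText <code>lid.176.bin</code> language identification model and the 0.65 English-score threshold mentioned above (the URL blocklist and Gopher filters are applied in a similar document-level fashion; this is not our exact <code>datatrove</code> code):</p>
    <pre><code># Keep only documents classified as English with a fastText score of at least 0.65.
# The "lid.176.bin" model file must be downloaded from the fastText website first.
import fasttext

lang_model = fasttext.load_model("lid.176.bin")

def is_english(text, threshold=0.65):
    # fastText expects a single line of text
    labels, scores = lang_model.predict(text.replace("\n", " "))
    return labels[0] == "__label__en" and scores[0] &gt;= threshold

docs = [
    "The quick brown fox jumps over the lazy dog.",
    "Le renard brun rapide saute par-dessus le chien paresseux.",
]
english_docs = [d for d in docs if is_english(d)]
</code></pre>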
    <h3>Deduplication</h3>
    <p>Deduplication is another important step, especially for web
        datasets, and one of the most important when creating large web datasets for LLMs. Deduplication methods
        attempt to identify and remove redundant/repeated data from the dataset.</p>
    <h4>Why deduplicate?</h4>
    <p>The web has many aggregators, mirrors, templated pages or
        just otherwise repeated content spread over different domains and webpages. Often, these duplicated pages
        can be introduced by the crawler itself, when different links point to the same page. </p>
    <p>Removing these duplicates (deduplicating) has been linked to an improvement in model performance<d-cite bibtex-key="lee2022deduplicating"></d-cite> and a reduction in memorization of pretraining data<d-cite bibtex-key="carlini2023quantifying"></d-cite>, which might
        allow for better generalization. Additionally, the performance uplift can also be tied to increased training
        efficiency: by removing duplicated content, for the same number of training tokens, a model will have seen
        more diverse data.</p>
    <p>There are different ways to identify and even define
        duplicated data. Common approaches rely on hashing techniques to speed up the process, or on building
        efficient data structures to index the data (like suffix arrays). Methods can also be “fuzzy”, by using some
        similarity metric to mark documents as duplicates, or “exact” by checking for exact matches between two
        documents (or lines, paragraphs, or whatever other granularity level being used).</p>
    <h4>Our deduplication parameters</h4>
    <p>Similarly to RefinedWeb, we decided to apply MinHash, a
        fuzzy hash-based deduplication technique. We chose to compute minhashes on each document’s 5-grams, using
        112 hash functions in total, split into 14 buckets of 8 hashes each — targeting documents that are at least
        75% similar. Documents with the same 8 minhashes in any bucket are considered a duplicate of each other.</p>
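    <p>The bucketing scheme can be illustrated with the <code>datasketch</code> library (our actual pipeline uses the distributed implementation in <code>datatrove</code>; this is only a sketch of the same parameters):</p>
    <pre><code># 5-gram shingles, 112 hashes split into 14 buckets (bands) of 8 hashes (rows):
# two documents are flagged as duplicates if they share all 8 hashes in any bucket.
from datasketch import MinHash, MinHashLSH

def minhash(text, num_perm=112):
    words = text.lower().split()
    shingles = {" ".join(words[i:i + 5]) for i in range(len(words) - 4)}
    m = MinHash(num_perm=num_perm)
    for s in shingles:
        m.update(s.encode("utf-8"))
    return m

lsh = MinHashLSH(num_perm=112, params=(14, 8))  # 14 bands of 8 rows each
lsh.insert("doc_a", minhash("some web page text goes here"))
matches = lsh.query(minhash("some web page text goes here"))  # ["doc_a"]
</code></pre>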
    <p>This would mean that for two documents with a similarity (<code>s</code>)
        of 0.7, 0.75, 0.8 and 0.85, the probability that they would be identified as duplicates would be 56%, 77%,
        92% and 98.8% respectively ($$1-(1-s^8)^{14}$$). See the plot below for a match probability
        comparison between our setup with 112 hashes and the one from RefinedWeb, with 9000 hashes, divided into 450
        buckets of 20 hashes (that requires a substantially larger amount of compute resources):</p>
    <figure><img src="plots/minhash_parameters_comparison.png"/>
    </figure>
    <p>While the high number of hash functions in RefinedWeb
        allows for a steeper, more well defined cut off, we believe the compute and storage savings are a reasonable
        trade off.</p>
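    <p>For reference, the match probabilities above can be computed directly from the formula (a small sketch, using our parameters of 14 buckets of 8 hashes and the RefinedWeb parameters of 450 buckets of 20 hashes):</p>
    <pre><code># Probability that two documents with MinHash similarity s share all hashes in at
# least one bucket, for b buckets of r hashes each: 1 - (1 - s**r) ** b
def match_probability(s, buckets=14, hashes_per_bucket=8):
    return 1 - (1 - s ** hashes_per_bucket) ** buckets

for s in (0.70, 0.75, 0.80, 0.85):
    ours = match_probability(s)                      # 112 hashes: 14 x 8
    refinedweb = match_probability(s, 450, 20)       # 9000 hashes: 450 x 20
    print(f"s={s:.2f}  fineweb={ours:.3f}  refinedweb={refinedweb:.3f}")
</code></pre>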
    <h4>More deduplication is always better, right?</h4>
    <p>Our initial approach was to take the entire dataset (all
        95 dumps) and deduplicate them as one big dataset using MinHash.</p>
    <p>We did this in an iterative manner: starting with the most
        recent dump (which at the time was 2023-50) and taking the oldest one last, we would deduplicate each dump
        not only against itself but also by removing any matches with duplicates from the previously processed
        dumps. </p>
    <p>For instance, for the second most recent dump (2023-40 at
        the time), we deduplicated it against the most recent one in addition to itself. In particular, the oldest
        dump was deduplicated against all other dumps. As a result, more data was removed in the oldest dumps (last
        to be deduplicated) than in the most recent ones.</p>
    <p>Deduplicating the dataset in this manner resulted in 4
        trillion tokens of data, but, quite surprisingly for us, when training on a randomly sampled 350 billion
        tokens subset, the model showed no improvement over one trained on the non deduplicated data (see orange and
        green curve below), scoring far below its predecessor RefinedWeb on our aggregate of tasks.</p>
    <div class="main-plot-container">
        <figure><img src="plots/dedup_all_dumps_bad.png"/></figure>
        <div id="plot-dedup_all_dumps_bad"></div>
    </div>
    <p>This was quite puzzling as our intuition regarding web
        data was that more deduplication would always result in improved performance. We decided to take a closer
        look at one of the oldest dumps, dump 2013-48:</p>
    <ul>
        <li>pre deduplication, this dump had ~490 billion tokens</li>
    </ul>
    <ul>
        <li>after our iterative MinHash, ~31 billion tokens remained (94% of data
            removed)
        </li>
    </ul>
    <p>As an experiment, we tried training two models on 28BT
        sampled from the following data from 2013-48:</p>
    <ul>
        <li>the fully deduplicated remaining ~31 billion tokens (<em>originally kept
            data</em>)
        </li>
    </ul>
    <ul>
        <li>171 billion tokens obtained by individually deduplicating (without
            considering the other dumps) the ~460 billion tokens that had been removed from this dump in the
            iterative dedup process (<em>originally removed data</em>)
        </li>
    </ul>
    <div class="main-plot-container">
        <figure><img src="plots/removed_data_cross_dedup.png"/></figure>
        <div id="plot-removed_data_cross_dedup"></div>
    </div>
    <p>These results show that, for this older dump where we were
        removing over 90% of the original data, the data that was kept was actually <em>worse</em> than the data
        removed (considered independently of all the other dumps).</p>
    <h4>Taking a step back: individual dump dedup</h4>
    <p>We then tried an alternative approach: we deduplicated
        each dump with MinHash individually (without considering the other dumps). This resulted in 20 trillion
        tokens of data.</p>
    <p>When training on a random sample from this dataset we see
        that it now matches RefinedWeb’s performance (blue and red curves below):</p>
    <div class="main-plot-container">
        <figure><img src="plots/cross_ind_unfiltered_comparison.png"/></figure>
        <div id="plot-cross_ind_unfiltered_comparison"></div>
    </div>
    <p>We hypothesize that the main improvement gained from
        deduplication is the removal of very large clusters that are present in every single dump (you will find
        some examples of these clusters in the RefinedWeb paper, each containing <em>hundreds of thousands</em> of
        documents) and that further deduplication of documents with a low number of duplicates (fewer than ~100, i.e. the number
        of dumps) actually harms performance: data that does not find a duplicate match in any other dump might
        actually be of worse quality/more out of distribution (as evidenced by the results on the 2013-48 data).</p>
    <p>While you might see some performance improvement when
        deduplicating a few dumps together, at the scale of all the dumps this side effect of upsampling
        lower-quality data seems to have a large impact.</p>
    <p>One possibility to consider is that as filtering quality
        improves, this effect may not be as prevalent, since the filtering might be able to remove some of this
        lower quality data. We also experimented with applying different, and often “lighter”, deduplication
        approaches on top of the individually deduplicated dumps. You can read about them further below.</p>
    <h4>A note on measuring the effect of deduplication</h4>
    <p>Given the nature of deduplication, its effect is not
        always very visible in a smaller slice of the dataset (such as 28B tokens, the size we used for our
        filtering ablations). Furthermore, one must consider the fact that there are specific effects at play when
        deduplicating across all CommonCrawl dumps, as some URLs/pages are recrawled from one dump to the next.</p>
    <p>To visualize the effect of scaling the number of training
        tokens on measuring deduplication impact, we considered the following (very extreme and unrealistic
        regarding the degree of duplication observed) theoretical scenario:</p>
    <ul>
        <li>there are 100 CommonCrawl dumps (actually roughly true)</li>
    </ul>
    <ul>
        <li>each dump has been perfectly individually deduplicated (every single
            document in it is unique)
        </li>
    </ul>
    <ul>
        <li>each dump is a perfect copy of each other (maximum possible duplication
            across dumps, effectively the worst case scenario)
        </li>
    </ul>
    <ul>
        <li>each dump has 200 billion tokens (for a total of 20 trillion, the resulting
            size of our individual dedup above)
        </li>
    </ul>
    <ul>
        <li>each dump is made up of documents of 1k tokens (200M documents per dump)
        </li>
    </ul>
    <p>We then simulated uniformly sampling documents from this
        entire dataset of 20 trillion tokens, to obtain subsets of 1B, 10B, 100B, 350B and 1T tokens. In the image
        below you can see how often each document would be repeated.</p>
    <figure><img src="plots/dedup_impact_simulation.png"/></figure>
    <p>For 1B almost all documents would be unique
        (#duplicates=1), despite the fact that in the entire dataset each document is repeated 100 times (once per
        dump). We start seeing some changes at the 100B scale (0.5% of the total dataset), with a large number of
        documents being repeated twice, and a few even 4-8 times. At the larger scale of 1T (5% of the total
        dataset), the majority of the documents are repeated up to 8 times, with some being repeated up to 16
        times. </p>
    <p>We ran our performance evaluations for the deduplicated
        data at the 350B scale, which would, under this theoretical scenario, be made up of a significant portion of
        documents duplicated up to 8 times. This simulation illustrates the inherent difficulties associated with
        measuring deduplication impact on the training of LLMs, once the biggest document clusters have been
        removed.</p>
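    <p>This scenario can be reproduced with a few lines of code. The sketch below is a simplified version of the simulation, approximating the duplicate count of each document with a binomial draw instead of exhaustively sampling 20 trillion tokens:</p>
    <pre><code># 100 identical dumps of 200M documents (1k tokens each), sampled uniformly to
# different token budgets. The number of sampled copies of a given unique document
# is approximated by a binomial draw; only 1M of the 200M unique documents are
# simulated, which is enough to estimate the distribution.
import numpy as np
from collections import Counter

unique_docs = 200_000_000      # documents per dump
copies_per_doc = 100           # one copy per dump
tokens_per_doc = 1_000

rng = np.random.default_rng(0)
for budget in (1e9, 10e9, 100e9, 350e9, 1e12):
    n_sampled = int(budget / tokens_per_doc)
    p_pick_copy = n_sampled / (unique_docs * copies_per_doc)
    sample = rng.binomial(copies_per_doc, p_pick_copy, size=1_000_000)
    dist = Counter(int(c) for c in sample if c &gt; 0)
    total = sum(dist.values())
    print(f"{budget / 1e9:.0f}B tokens:",
          {k: round(v / total, 3) for k, v in sorted(dist.items())})
</code></pre>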
    <h4>Other (failed) approaches</h4>
    <p>We attempted to improve the performance of the
        independently MinHash-deduplicated 20T of data by further deduplicating it with the following methods:</p>
    <ul>
        <li>URL deduplication, where we only kept one document per normalized
            (lowercased) URL (71.5% of tokens removed, 5.6T left) — <em>FineWeb URL dedup</em></li>
    </ul>
    <ul>
        <li>Line deduplication:
            <ul>
                <li>remove all but 1 occurrence of each duplicated line (77.8% of
                    tokens dropped, 4.4T left) — <em>FineWeb line dedup</em></li>
            </ul>
            <ul>
                <li>same as above, but only removing duplicate lines with at least 10
                    words and dropping documents with fewer than 3 sentences after deduplication (85% of tokens
                    dropped, 2.9T left) — <em>FineWeb line dedup w/ min words</em></li>
            </ul>
            <ul>
                <li>remove all but 1 occurrence of each span of 3 duplicated lines
                    with all numbers replaced by 0 (80.9% of tokens removed, 3.7T left) — <em>FineWeb 3-line
                        dedup</em></li>
            </ul>
        </li>
    </ul>
    <p>The performance of the models trained on each of these was
        consistently worse (even if to different degrees) than that of the original independently deduplicated
        data:</p>
    <div class="main-plot-container">
        <figure><img src="plots/dedup_attempts.png"/></figure>
        <div id="plot-dedup_attempts"></div>
    </div>
    <h3>Additional filtering</h3>
    <p>By this point we had reached the same performance as
        RefinedWeb, but another heavily filtered dataset, the C4 dataset<d-cite bibtex-key="raffel2023exploring"></d-cite>, still showed stronger performance on our aggregate of tasks (with
        the caveat that it is a relatively small dataset by current web-scale standards).</p>
    <p>We therefore set out to find new filtering steps that
        would, at first, allow us to match the performance of C4 and eventually surpass it. A natural starting point
        was to look into the processing of C4 itself.</p>
    <h4>C4: A dataset that has stood the test of time</h4>
    <p>The <a href="https://huggingface.co/datasets/c4">C4
        dataset</a> was first released in 2019. It was obtained from the <code>2019-18</code> CommonCrawl dump by
        removing non-English data, applying some heuristic filters on both the line and document level,
        deduplicating on the line level and removing documents containing words from a word blocklist.</p>
    <p>Despite its age and limited size (around 175B gpt2
        tokens), models trained on this dataset have strong performance, excelling in particular on the HellaSwag
        benchmark, one of the benchmarks in our “early signal” group with the strongest signal and highest
        signal-to-noise ratio. As such, it has remained a common subset of typical LLM training data, used for instance in
        the relatively recent Llama1 model<d-cite bibtex-key="touvron2023llama"></d-cite>. We experimented with applying
        each of the different filters used in C4 to a baseline of the independently deduplicated FineWeb 2019-18 dump
        (plot smoothed with a sliding window of 3 checkpoints):</p>
    <div class="main-plot-container">
        <figure><img src="plots/c4_filters_hellaswag.png"/></figure>
        <div id="plot-c4_filters_hellaswag"></div>
    </div>
    <ul>
        <li>applying “All filters” (drop lines not ending in punctuation marks or
            mentioning javascript and cookie notices, and drop documents outside length thresholds or containing “lorem
            ipsum” or a curly bracket, <code>{</code>) allows us to match C4’s HellaSwag performance (purple versus
            pink curves).
        </li>
    </ul>
    <ul>
        <li>The curly bracket filter and the word length filter only give a small
            boost, removing 2.8% and 4.3% of tokens, respectively
        </li>
    </ul>
    <ul>
        <li>The terminal punctuation filter, by itself, gives the biggest individual
            boost, but removes <em>around 30%</em> of all tokens (!)
        </li>
    </ul>
    <ul>
        <li>The lorem_ipsum, javascript and policy rules each remove &lt;0.5% of
            training tokens, so we did not train on them individually
        </li>
    </ul>
    <ul>
        <li>All filters except the very destructive terminal_punct perform better than
            terminal_punct by itself, while removing less in total (~7%)
        </li>
    </ul>
    <p>We decided to apply all C4 filters mentioned above except
        the terminal punctuation one. We validated these results with a longer run, which you will find in a plot in
        the next section.</p>
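    <p>A simplified sketch of the retained C4-style rules is shown below (everything except the terminal punctuation filter; the string lists are abbreviated here, the full lists and length thresholds are those of the original C4 implementation):</p>
    <pre><code># Simplified sketch of the retained C4-style filters. Length thresholds and the
# full policy/javascript string lists are omitted for brevity.
POLICY_SUBSTRINGS = ("privacy policy", "terms of use", "cookie policy", "uses cookies")

def c4_style_filter(text):
    lower = text.lower()
    # document-level rules
    if "lorem ipsum" in lower or "{" in text:
        return None
    kept_lines = []
    for line in text.split("\n"):
        line_lower = line.lower()
        # line-level rules: javascript warnings and cookie/policy notices
        if "javascript" in line_lower:
            continue
        if any(s in line_lower for s in POLICY_SUBSTRINGS):
            continue
        kept_lines.append(line)
    return "\n".join(kept_lines) if kept_lines else None
</code></pre>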
    <h4>A statistical approach to develop heuristic filters</h4>
    <p>To come up with new possible filtering rules, we collected
        a very large list of statistics — over <strong>50</strong> different metrics — from different reference
        datasets (C4, RefinedWeb, etc) and from a select list of our processed dumps, on both the independently
        minhashed version and the result from the (worse quality) full dedup. This allowed us to compare the
        different datasets at a macro level, by looking at the distribution of these metrics for each one.</p>
    <p>The collected statistics ranged from common document-level
        metrics (e.g. number of lines, avg. line/word length, etc.) to inter-document repetition metrics (Gopher
        inspired). Perhaps not too surprisingly given our findings for deduplication, we found significant
        disparities in most of the metrics between the two deduplication methods. For instance, the <code>line-char-duplicates</code>
        metric (nb. of characters in duplicated lines / nb. of characters) roughly doubled from the independent dedup
        (0.0053 for 2015-22 and 0.0058 for 2013-48) to the full dedup (0.011 for 2015-22 and 0.01 for 2013-48),
        indicating that the latter had higher inter-document repetition.</p>
    <p>Working under the assumption that these differences were
        caused by lower quality data on the full dedup version, we inspected histograms and manually defined
        thresholds for the metrics where these differences were starker. This process yielded 17 candidate
        threshold-filter pairs. In the image below, you can see 3 of these histograms.</p>
    <figure><img src="plots/Untitled%201.png"/></figure>

    <p>To assess the effectiveness of these newly created
        filters, we conducted <strong>28B token</strong> ablation runs on the <strong>2019-18 crawl</strong>. Out
        of all those runs, we identified three filters (the ones based on the histograms above) that demonstrated
        the most significant improvements on the aggregate score:</p>
    <ul>
        <li>Remove documents where the fraction of lines ending with punctuation ≤ 0.12
            (10.14% of tokens removed) — vs the 30% from the original C4 terminal punct filter
        </li>
    </ul>
    <ul>
        <li>Remove documents where the fraction of characters in duplicated lines ≥ 0.1
            (12.47% of tokens removed) — the original Gopher threshold for this ratio is ≥ 0.2
        </li>
    </ul>
    <ul>
        <li>Remove documents where the fraction of lines shorter than 30 characters ≥
            0.67 (3.73% of tokens removed)
        </li>
    </ul>
    <ul>
        <li>When applying the 3 together, ~22% of tokens were removed</li>
    </ul>
    <div class="main-plot-container">
        <figure><img src="plots/custom_filters.png"/></figure>
        <div id="plot-custom-filters"></div>
    </div>
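    <p>In code form, these three filters amount to the following checks (a sketch with the thresholds listed above; the exact punctuation set and line handling of the <code>datatrove</code> implementation may differ slightly):</p>
    <pre><code># Sketch of the three custom heuristic filters with the thresholds listed above.
from collections import Counter

TERMINAL_PUNCTUATION = (".", "!", "?", '"')  # assumed set, for illustration

def passes_custom_filters(text):
    lines = [l for l in text.split("\n") if l.strip()]
    if not lines:
        return False

    # 1) fraction of lines ending with punctuation must be above 0.12
    punct_ratio = sum(l.rstrip().endswith(TERMINAL_PUNCTUATION) for l in lines) / len(lines)
    if punct_ratio &lt;= 0.12:
        return False

    # 2) fraction of characters in duplicated lines must be below 0.1
    # (here every occurrence of a repeated line counts as duplicated)
    counts = Counter(lines)
    dup_chars = sum(len(l) * c for l, c in counts.items() if c &gt; 1)
    if dup_chars / sum(len(l) for l in lines) &gt;= 0.1:
        return False

    # 3) fraction of lines shorter than 30 characters must be below 0.67
    short_ratio = sum(len(l) &lt; 30 for l in lines) / len(lines)
    return short_ratio &lt; 0.67
</code></pre>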
    <h2>The final dataset</h2>
    <p>The final FineWeb dataset comprises 15T tokens and
        includes the following previously mentioned steps, in order, each providing a performance boost on our group
        of benchmark tasks:</p>
    <ul>
        <li>base filtering</li>
    </ul>
    <ul>
        <li>independent MinHash deduplication per dump</li>
    </ul>
    <ul>
        <li>a selection of C4 filters</li>
    </ul>
    <ul>
        <li>our custom filters (mentioned in the previous section)</li>
    </ul>
    <div class="main-plot-container">
        <figure><img src="plots/filtering_steps.png"/></figure>
        <div id="plot-filtering_steps"></div>
    </div>
    <p>We compared 🍷 FineWeb with the following datasets:</p>
    <ul>
        <li><a
                href="https://huggingface.co/datasets/tiiuae/falcon-refinedweb">RefinedWeb</a><d-cite bibtex-key="penedo2023refinedweb"></d-cite>
        </li>
    </ul>
    <ul>
        <li><a href="https://huggingface.co/datasets/allenai/c4">C4</a><d-cite bibtex-key="raffel2023exploring"></d-cite></li>
    </ul>
    <ul>
        <li><a href="https://huggingface.co/datasets/allenai/dolma">Dolma v1.6</a> (the
            CommonCrawl part) <d-cite bibtex-key="dolma"></d-cite>
        </li>
    </ul>
    <ul>
        <li><a href="https://huggingface.co/datasets/EleutherAI/pile">The Pile</a> <d-cite bibtex-key="gao2020pile"></d-cite></li>
    </ul>
    <ul>
        <li><a
                href="https://huggingface.co/datasets/cerebras/SlimPajama-627B">SlimPajama</a> <d-cite bibtex-key="cerebras2023slimpajama"></d-cite>
        </li>
    </ul>
    <ul>
        <li><a
                href="https://huggingface.co/datasets/togethercomputer/RedPajama-Data-V2">RedPajama2</a> <d-cite bibtex-key="together2023redpajama"></d-cite>
            (deduplicated)
        </li>
    </ul>
    <p>You will find these models on <a
            href="https://huggingface.co/collections/HuggingFaceFW/ablation-models-662457b0d213e8c14fe47f32">this
        collection</a>. We have uploaded checkpoints at every 1000 training steps. You will also find our full <a
            href="https://huggingface.co/datasets/HuggingFaceFW/fineweb/blob/main/eval_results.csv">evaluation
        results here</a>.</p>
    <div class="main-plot-container">
        <figure><img src="plots/dataset_ablations.png"/></figure>
        <div id="plot-dataset_ablations"></div>
    </div>
    <p>Some histogram comparisons of C4, Dolma, RefinedWeb and
        FineWeb:</p>
    <figure><img src="plots/Untitled%203.png"/></figure>
    <h2>📚 FineWeb-Edu</h2>
    <p>We are excited to release 📚 FineWeb-Edu, a filtered version of FineWeb for educational content, available in two sizes: 1.2 trillion and 4.5 trillion tokens. FineWeb-Edu outperforms all existing web datasets, with notable improvements on MMLU, ARC, and OpenBookQA benchmarks.</p>
    <p>A new approach has recently emerged for filtering LLM training datasets: using synthetic data to develop classifiers for identifying educational content. This technique was used in the training of <a href="https://ai.meta.com/blog/meta-llama-3-meta-ai-responsibility/">Llama3</a> and <a href="https://arxiv.org/abs/2404.14219">Phi3</a>, but its large-scale impact on web data filtering hasn't been fully explored or published.</p>
    <p>The popular Phi3 models were trained on 3.3 and 4.8 trillion tokens, with the  <a href="https://arxiv.org/abs/2404.14219">paper</a> stating:</p>
    <blockquote>Our training data consists of heavily filtered publicly available web data (according to the 'educational level') from various open internet sources, as well as synthetic LLM-generated data.</blockquote>
    <p>Similarly, the <a href="https://ai.meta.com/blog/meta-llama-3-meta-ai-responsibility/">Llama3 blog post</a> notes:</p>
    <blockquote>We found that previous generations of Llama are good at identifying high-quality data, so we used Llama 2 to help build the text-quality classifiers that are powering Llama 3.</blockquote>
    <p>However, these classifiers and filtered datasets are not publicly available. To enhance FineWeb's quality, we developed an educational quality classifier using annotations generated by <a href="https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct">Llama3-70B-Instruct</a> to create FineWeb-Edu.</p>
    <h3>Annotation</h3>
    <p>We used Llama3-70B-Instruct to annotate 500k samples from the FineWeb dataset, scoring each for its educational quality on a scale from 0 to 5.</p>
    <p>We explored various prompts and found that the additive scale by <a href="https://arxiv.org/pdf/2401.10020">Yuan et al.</a> worked best. This scale allows the LLM to reason about each additional point awarded, unlike the single-rating Likert scale which fits samples into predefined boxes. Then, to avoid the LLM favoring highly technical pages like arXiv abstracts and submissions, we focused on grade-school and middle-school level knowledge. By setting a threshold of 3 (on a scale of 0 to 5) during the filtering process, we were able to also retain some high-level educational pages.</p>
    <div style="text-align: center; margin: 20px 0;">
        <img src="https://cdn-uploads.huggingface.co/production/uploads/61c141342aac764ce1654e43/fjZQ4izIj1rx1xQnBTKKr.png" alt="Prompt for LLM annotation" style="width: 90%; max-width: 800px; height: auto;">
        <figcaption style="font-style: italic; margin-top: 10px;">Prompt used for Llama3 annotations of the educational score</figcaption>
    </div>    
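    <p>Collecting such annotations only requires an inference endpoint serving the model. The sketch below shows the general pattern; <code>EDU_PROMPT</code> stands for the additive-scale prompt shown above (abbreviated to a placeholder here) and the score-parsing regex assumes the model ends its answer with a final "Educational score: N" line:</p>
    <pre><code># Sketch of collecting educational-quality scores with Llama3-70B-Instruct via the
# Hugging Face Inference API. EDU_PROMPT is a placeholder for the prompt above, and
# the final-line score format is an assumption made for this illustration.
import re
from huggingface_hub import InferenceClient

client = InferenceClient("meta-llama/Meta-Llama-3-70B-Instruct")
EDU_PROMPT = "..."  # the additive 0-5 scale prompt shown in the figure above

def annotate(sample_text):
    completion = client.chat_completion(
        messages=[{"role": "user", "content": EDU_PROMPT + "\n\n" + sample_text}],
        max_tokens=512,
    )
    answer = completion.choices[0].message.content
    match = re.search(r"Educational score:\s*([0-5])", answer)
    return int(match.group(1)) if match else None
</code></pre>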
    <p>We also experimented with different LLMs: <a href="https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct">Llama3-70B-Instruct</a>, <a href="https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1">Mixtral-8x-7B-Instruct</a>, and <a href="https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1">Mixtral-8x22B-Instruct</a>. Llama3 and Mixtral-8x22B produced similar scores, while Mixtral-8x7B tended to be more generous, not fully adhering to the score scale. <a href="https://arxiv.org/abs/2404.18796">Verga et al.</a> suggest using multiple LLMs as juries. We tried averaging the scores from the three models, but this shifted the distribution to the right due to the higher scores from Mixtral-8x7B. Training on a dataset filtered with a classifier using jury annotations performed worse than using a classifier based on Llama3 annotations. We hypothesize that the jury-based approach retains more low-quality samples.</p>
    <div style="text-align: center; margin: 20px 0;">
        <img src="https://cdn-uploads.huggingface.co/production/uploads/61c141342aac764ce1654e43/dQskZA-4fsk8aR_8g9evJ.png" style="width: 80%; max-width: 700px; height: auto;"></figure>
    </div>    
    <h3>Classifier Training</h3>
    <p>We added a classification head with a single regression output to <a href="https://huggingface.co/Snowflake/snowflake-arctic-embed-m">Snowflake-arctic-embed</a> and trained it on 450,000 Llama3 annotations for 20 epochs with a learning rate of 3e-4, freezing the embedding and encoder layers. We saved the checkpoint with the highest F1 score on our validation set. After training, we rounded the scores to integers from 0 to 5. This approach resulted in the model achieving an F1 score of 82%, indicating robust performance in distinguishing high-quality educational content.</p>
    <p>The classifier is available at: <a href="https://huggingface.co/HuggingFaceTB/snowflake_m_edu_reg">https://huggingface.co/HuggingFaceTB/snowflake_m_edu_reg</a>. The training and inference code is available on  <a href="https://github.com/huggingface/cosmopedia/tree/edu-classifier/classification">GitHub</a>.</p>
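    <p>Below is a minimal sketch of this classifier setup (the full training script is in the repository linked above; the data loading and training-loop boilerplate is omitted):</p>
    <pre><code># Snowflake-arctic-embed-m with a single regression output. Here the whole
# pretrained backbone is frozen and only the new classification head is trained
# (the post describes freezing the embedding and encoder layers).
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "Snowflake/snowflake-arctic-embed-m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=1, problem_type="regression"
)
for name, param in model.named_parameters():
    if not name.startswith("classifier"):
        param.requires_grad = False

def predict_score(text):
    inputs = tokenizer(text, truncation=True, return_tensors="pt")
    with torch.no_grad():
        raw = model(**inputs).logits.squeeze().item()
    # after training, scores are rounded to integers from 0 to 5
    return int(min(5, max(0, round(raw))))
</code></pre>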
    <p><strong>TODO: fill model card and move the github code to another folder</strong></p>
    <h3>Filtering</h3>
    <p>We applied the classifier to the 15T tokens of FineWeb, a process that required 6,000 H100 GPU hours. To build FineWeb-Edu, we filtered out samples with scores lower than 3. This removed 92% of the dataset, leaving us with 1.2T educational tokens. Here are the key highlights of the ablation results:</p>
    <ul>
        <li>FineWeb-Edu surpasses FineWeb and all other web datasets, with remarkable improvements on educational benchmarks such as MMLU, ARC, and OpenBookQA.</li>
        <li>It achieves the same performance with significantly less data, requiring 10x fewer tokens compared to C4 and Dolma1.7 to match MMLU results.</li>
        <li>It gives strong performance boosts on benchmarks like MMLU and ARC without trying to overfit on them.</li>
        <li>This demonstrates the effectiveness of using classifiers trained on LLM annotations for large-scale data filtering.</li>
    </ul>
    <p>To keep more tokens, we also experimented with a less strict threshold of 2 instead of 3. This approach preserved 4.5T tokens and still outperformed the FineWeb dataset, with performance just slightly below that of threshold 3.</p>
    <p>We release these two datasets as FineWeb-Edu and FineWeb-edu-Large along with the classifier used for the filtering.</p>
    <p><strong>TODO: add ablation results and dataset links, and maybe FineWeb-edu-smol</strong></p>    
    <h2>Just like fine wine, not all crawls are created
        equal</h2>
    <p>During our ablation runs, we observed that certain crawls
        outperformed others by a significant margin. To investigate this phenomenon, we conducted 27B token runs for
        each dump (we used the version with base filtering + ind dedup), with 2 trainings per dump, where each used
        a different data subset. We trained 190 such models, totaling over 60k H100 GPU-hours. We subsequently took
        the last 3 checkpoints for both seeds and plotted the average of these 6 data points per dump. </p>
    <p>The plot below clearly shows that some dumps perform far
        worse than others. Each year has a different color, and the number of crawls per year also changes.</p>

    <div class="main-plot-container l-page">
        <figure><img src="plots/score_by_dump.png"/></figure>
        <div id="plot-score_by_dump"></div>
    </div>

    <p>We identified 5 main relevant time intervals:</p>
    <ul>
        <li>2013 to 2016: relatively stable, average quality</li>
    </ul>
    <ul>
        <li>2017 to 2018: high quality, with a drop by the end of 2018</li>
    </ul>
    <ul>
        <li>2019 to 2021: high quality, with a steady increase</li>
    </ul>
    <ul>
        <li>2021-49 and 2022: very large drop in performance, followed by worse quality
            dumps
        </li>
    </ul>
    <ul>
        <li>2023 and 2024-10: almost exponential improvement. In particular, 2023-50
            and 2024-10 are by far the best dumps
        </li>
    </ul>
    <p>One possibility to improve performance when training
        models on &lt; 15T tokens would be to train on FineWeb while excluding the worst quality CommonCrawl dumps.</p>
    <p>We conducted further analysis to investigate the factors
        causing these differences from dump to dump. In particular, we considered 3 potential causes: </p>
    <ul>
        <li>large sudden changes in the list of crawled URLs;</li>
    </ul>
    <ul>
        <li>synthetic (LLM generated) data;</li>
    </ul>
    <ul>
        <li>benchmark contamination;</li>
    </ul>
    <p>We go over each one in the following sections.</p>
    <h3>Changes in the most frequent URLs [HAVE TO RECHECK]</h3>
    <p>For each crawl from 2021-10 onwards, we gathered a list of
        the 60k most frequent <strong>FQDNs</strong> (fully qualified domain name). We then calculated the <a
                href="https://en.wikipedia.org/wiki/Jaccard_index">Jaccard similarity</a><d-cite bibtex-key="jaccard1912distribution"></d-cite> between consecutive
        crawls. A high value means that a crawl/dump has many of the same FQDNs as the dump immediately preceding
        it, while a small value means that a considerable number of top 60k FQDNs were downsampled or removed, or
        that alternatively new FQDNs were added to the top 60k.</p>
    <figure><img src="plots/Untitled%204.png"/></figure>
    <p>The data indicates three significant changes:
        2021-43/2021-49, 2022-33/2022-40, and 2023-40/2023-50.</p>
    <p>The explanation for the changes between 2022-33/2022-40
        and 2023-40/2023-50 is straightforward: CommonCrawl accidentally did not index several popular suffixes,
        such as .co.uk, as documented on <a href="https://commoncrawl.org/errata/co-uk-cctld-not-included">this
            erratum</a>. This particular change does not seem particularly correlated with the overall dump quality.
    </p>
    <p>As to the shift from 2021-43 to 2021-49, which coincides
        with a sharp performance drop, roughly half (~30k) of the former’s top 60k FQDNs are not present in the
        latter’s list of top 60k FQDNs, and the dump size itself also decreased (19% reduction in WARC size, and a
        28% token reduction after deduplication). </p>
    <p>We were unable to find a clear reason for this drastic
        change, but upon reaching out to CommonCrawl, we were informed that these differences likely stem from a
        major update in adult content and malicious site blocking. It is therefore possible that this updated
        filter could have also removed a large number of high quality domains, resulting in the poor
        performance of this crawl. <strong>[TODO: change this framing a bit, it seems to suggest adult content is
            high quality for LLMs]</strong></p>
    <h3>Synthetic data contamination [HAVE TO RECHECK]</h3>
    <p>Secondly, we wondered if part of the changes in
        performance on recent dumps could be attributed to the presence of a larger quantity of synthetic data (data
        generated by LLMs). Such a change would not be surprising due to the recent increase in popularity of LLMs,
        notably of ChatGPT.</p>
    <p>Since, to the best of our knowledge, there is no fool
        proof method to detect synthetic data, we opted to use a proxy metric: we measured the frequency of the
        following words: <code>delve, as a large language model, it&#x27;s important to note, rich tapestry,
            intertwined, certainly!, dive into</code>, which are words commonly used by ChatGPT.</p>
    <p>It is important to note that not all samples containing
        one of these phrases were necessarily generated by ChatGPT (and also that many ChatGPT generated samples do
        not contain any of these phrases), but assuming that the amount of synthetic data were to not change across
        dumps, one would expect these frequencies to remain approximately constant over time.</p>
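    <p>The proxy metric itself is a simple substring check (sketch below; whether one counts documents containing at least one of the phrases or total phrase occurrences is an implementation detail):</p>
    <pre><code># Fraction of documents in a dump containing at least one ChatGPT-flavoured phrase.
PROXY_PHRASES = (
    "delve", "as a large language model", "it's important to note",
    "rich tapestry", "intertwined", "certainly!", "dive into",
)

def proxy_frequency(documents):
    hits = sum(
        any(phrase in doc.lower() for phrase in PROXY_PHRASES)
        for doc in documents
    )
    return hits / len(documents)
</code></pre>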
    <p>The results are shown in the following graph:</p>
    <figure><img src="plots/Untitled%205.png"/></figure>
    <p>While the frequency remained approximately constant until
        2023-14 (ChatGPT was released at the end of 2022), not only do we find a steep increase of our proxy metric
        in recent crawls, but the proxy metric also correlates well with the aggregate score, with a Pearson correlation of
        <strong>0.590</strong>. It is therefore possible that synthetic data has positively impacted performance on
        our selected tasks for these most recent dumps (with all the usual caveats that apply to a single
        correlation measurement, without any randomization, intervention or other causal tools being used here). In
        particular, it could explain why the 2023-50 and 2024-10 dumps have such strong performance.</p>
    <h3>Benchmarks contamination [HAVE TO RECHECK]</h3>
    <p>Also, most of the benchmarks we used were introduced around
        <strong>2019</strong>. It’s thus possible that the performance increase from 2019-XX to 2021-43 might be caused by
        higher benchmark contamination in those crawls. Similarly, the recent increase in LLM popularity and
        evaluations might have increased the contamination of recent crawls with benchmark data, explaining the score improvements
        of the two most recent crawls. <strong>[NOTE: the plot does not seem to support this at all]</strong></p>

    <figure><img src="plots/Untitled%206.png"/></figure>
    <h2>Next steps</h2>
    <p>We want to continue improving FineWeb and will also
        release a technical report with more details soon.</p>
    <p>Adapting the FineWeb recipe [wip]</p>
</d-article>

<d-appendix>
    <d-bibliography src="bibliography.bib"></d-bibliography>
</d-appendix>

<script>
    const article = document.querySelector('d-article');
    const toc = document.querySelector('d-contents');
    if (toc) {
        const headings = article.querySelectorAll('h2, h3, h4');
        let ToC = `<nav role="navigation" class="l-text figcaption"><h3>Table of contents</h3>`;
        let prevLevel = 0;

        for (const el of headings) {
            // should element be included in TOC?
            const isInTitle = el.parentElement.tagName == 'D-TITLE';
            const isException = el.getAttribute('no-toc');
            if (isInTitle || isException) continue;
            el.setAttribute('id', el.textContent.toLowerCase().replaceAll(" ", "_"))
            const link = '<a href="' + '#' + el.getAttribute('id') + '">' + el.textContent + '</a>';

            const level = el.tagName === 'H2' ? 0 : (el.tagName === 'H3' ? 1 : 2);
            while (prevLevel < level) {
                ToC += '<ul>'
                prevLevel++;
            }
            while (prevLevel > level) {
                ToC += '</ul>'
                prevLevel--;
            }
            if (level === 0)
                ToC += '<div>' + link + '</div>';
            else
                ToC += '<li>' + link + '</li>';
        }

        while (prevLevel > 0) {
            ToC += '</ul>'
            prevLevel--;
        }
        ToC += '</nav>';
        toc.innerHTML = ToC;
        toc.setAttribute('prerendered', 'true');
    }
</script>
</body>