Merge branch 'main' of https://huggingface.co/spaces/HuggingFaceFW/blogpost-fineweb-v1
Browse files
- index.html +5 -4
index.html
CHANGED
@@ -11,6 +11,7 @@
|
|
11 |
<link rel="stylesheet" href="style.css">
|
12 |
<meta name="viewport" content="width=device-width, initial-scale=1">
|
13 |
<meta charset="utf8">
|
|
|
14 |
<title>FineWeb: 15T tokens of high quality web data</title>
|
15 |
<style>
|
16 |
|
@@ -326,7 +327,7 @@
|
|
326 |
</li>
|
327 |
</ul>
|
328 |
<ul>
|
329 |
-
<li>Applied quality and repetition filters from
|
330 |
</li>
|
331 |
</ul>
|
332 |
<p>After applying this filtering to each of the text
|
@@ -577,7 +578,7 @@
|
|
577 |
minhashed version and the result from the (worse quality) full dedup from 2013-48 and 2015-22 crawls (older crawls). We then compared the
|
578 |
statistics at a macro level, by looking at the distribution of these metrics for each one.</p>
|
579 |
<p>The collected statistics ranged from common document-level
|
580 |
-
metrics (e.g. number of lines, avg. line/word length, etc) to inter-document repetition metrics (
|
581 |
inspired). Perhaps not too surprisingly given our findings for deduplication, we found significant
|
582 |
disparities in most of the metrics for the two deduplication methods. For instance, the <code>line-char-duplicates</code>
|
583 |
metric (nb. of characters in duplicated lines / nb. characters), roughly doubled from the independent dedup
|
@@ -604,7 +605,7 @@
|
|
604 |
</ul>
|
605 |
<ul>
|
606 |
<li>Remove documents where the fraction of characters in duplicated lines ≥ 0.1
|
607 |
-
(12.47% of tokens removed) — the original
|
608 |
</li>
|
609 |
</ul>
|
610 |
<ul>
|
@@ -742,7 +743,7 @@
|
|
742 |
const isException = el.getAttribute('no-toc');
|
743 |
if (isInTitle || isException) continue;
|
744 |
el.setAttribute('id', el.textContent.toLowerCase().replaceAll(" ", "_"))
|
745 |
-
const link = '<a href="' + '#' + el.getAttribute('id') + '">' + el.textContent + '</a>';
|
746 |
|
747 |
const level = el.tagName === 'H2' ? 0 : (el.tagName === 'H3' ? 1 : 2);
|
748 |
while (prevLevel < level) {
|
|
|
11 |
<link rel="stylesheet" href="style.css">
|
12 |
<meta name="viewport" content="width=device-width, initial-scale=1">
|
13 |
<meta charset="utf8">
|
14 |
+
<base target="_blank">
|
15 |
<title>FineWeb: 15T tokens of high quality web data</title>
|
16 |
<style>
|
17 |
|
|
|
327 |
</li>
|
328 |
</ul>
|
329 |
<ul>
|
330 |
+
<li>Applied quality and repetition filters from MassiveText<d-cite bibtex-key="rae2022scaling"></d-cite> (using the default thresholds)
|
331 |
</li>
|
332 |
</ul>
|
333 |
<p>After applying this filtering to each of the text
|
|
|
578 |
minhashed version and the result from the (worse quality) full dedup from 2013-48 and 2015-22 crawls (older crawls). We then compared the
|
579 |
statistics at a macro level, by looking at the distribution of these metrics for each one.</p>
|
580 |
<p>The collected statistics ranged from common document-level
|
581 |
+
metrics (e.g. number of lines, avg. line/word length, etc.) to inter-document repetition metrics (MassiveText
|
582 |
inspired). Perhaps not too surprisingly given our findings for deduplication, we found significant
|
583 |
disparities in most of the metrics for the two deduplication methods. For instance, the <code>line-char-duplicates</code>
|
584 |
metric (nb. of characters in duplicated lines / nb. characters), roughly doubled from the independent dedup
|
|
|
605 |
</ul>
|
606 |
<ul>
|
607 |
<li>Remove documents where the fraction of characters in duplicated lines ≥ 0.1
|
608 |
+
(12.47% of tokens removed) — the original MassiveText threshold for this ratio is ≥ 0.2
|
609 |
</li>
|
610 |
</ul>
|
611 |
<ul>
|
|
|
743 |
const isException = el.getAttribute('no-toc');
|
744 |
if (isInTitle || isException) continue;
|
745 |
el.setAttribute('id', el.textContent.toLowerCase().replaceAll(" ", "_"))
|
746 |
+
const link = '<a target="_self" href="' + '#' + el.getAttribute('id') + '">' + el.textContent + '</a>';
|
747 |
|
748 |
const level = el.tagName === 'H2' ? 0 : (el.tagName === 'H3' ? 1 : 2);
|
749 |
while (prevLevel < level) {
|