victormiller
committed on
Update web.py
Browse files
web.py
CHANGED
@@ -9,6 +9,152 @@ from data.url_blocklist import urls_high_matches, urls_false_positives
|
|
9 |
from data.non_web_urls import non_web_urls
|
10 |
from fasthtml.components import D_code
|
11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
|
13 |
def DVS(
|
14 |
left,
|
@@ -216,6 +362,12 @@ def web_data():
|
|
216 |
),
|
217 |
style="margin-top: 20px;",
|
218 |
),
|
|
|
|
|
|
|
|
|
|
|
|
|
219 |
H3("1. Document Preparation"),
|
220 |
|
221 |
H4("1.1 Text Extraction"),
|
@@ -226,6 +378,9 @@ def web_data():
|
|
226 |
we found WET files to include boilerplate content like navigation menus, ads, and other irrelevant texts.
|
227 |
Accordingly, our pipeline starts from raw WARC files, reads with the warcio library, and extracts texts using trafilatura.
|
228 |
"""),
|
|
|
|
|
|
|
229 |
DV2("data/sample_wet.json", "data/sample_warc.json", 3),
|
230 |
|
231 |
H4("1.2 Language Identification"),
|
|
|
9 |
from data.non_web_urls import non_web_urls
|
10 |
from fasthtml.components import D_code
|
11 |
|
12 |
+
data_filtering_table_data = pd.DataFrame(
|
13 |
+
{
|
14 |
+
"Dataset": [
|
15 |
+
"TxT360",
|
16 |
+
"FineWeb",
|
17 |
+
"RefinedWeb",
|
18 |
+
"RedPajamaV2",
|
19 |
+
"C4",
|
20 |
+
"Dolma",
|
21 |
+
"RedPajamaV1",
|
22 |
+
"The Pile",
|
23 |
+
],
|
24 |
+
"Data Reading": [
|
25 |
+
"warc",
|
26 |
+
"warc",
|
27 |
+
"warc",
|
28 |
+
"wet",
|
29 |
+
"wet",
|
30 |
+
"warc",
|
31 |
+
"wet",
|
32 |
+
"warc",
|
33 |
+
],
|
34 |
+
"Text Extraction": [
|
35 |
+
"trafilatura",
|
36 |
+
"trafilatura",
|
37 |
+
"trafilatura",
|
38 |
+
"n/a",
|
39 |
+
"n/a",
|
40 |
+
"?",
|
41 |
+
"n/a",
|
42 |
+
"jusText",
|
43 |
+
],
|
44 |
+
"URL Filtering": [
|
45 |
+
"Yes",
|
46 |
+
"Yes",
|
47 |
+
"Yes",
|
48 |
+
"Yes",
|
49 |
+
"No",
|
50 |
+
"No",
|
51 |
+
"No",
|
52 |
+
"No",
|
53 |
+
],
|
54 |
+
"Language Identification": [
|
55 |
+
"fastText",
|
56 |
+
"fastText",
|
57 |
+
"fastText",
|
58 |
+
"fastText",
|
59 |
+
"langdetect",
|
60 |
+
"fastText",
|
61 |
+
"fastText",
|
62 |
+
"pycld2",
|
63 |
+
],
|
64 |
+
"Line Removal": [
|
65 |
+
"Yes",
|
66 |
+
"Yes",
|
67 |
+
"Yes",
|
68 |
+
"Yes",
|
69 |
+
"Yes",
|
70 |
+
"Yes",
|
71 |
+
"No",
|
72 |
+
"No",
|
73 |
+
],
|
74 |
+
"QF: ML-based": [
|
75 |
+
"No",
|
76 |
+
"No",
|
77 |
+
"No",
|
78 |
+
"Yes",
|
79 |
+
"No",
|
80 |
+
"No",
|
81 |
+
"Yes",
|
82 |
+
"Yes",
|
83 |
+
],
|
84 |
+
"QF: Repetition-based": [
|
85 |
+
"Yes",
|
86 |
+
"Yes",
|
87 |
+
"Yes",
|
88 |
+
"Yes",
|
89 |
+
"No",
|
90 |
+
"Yes",
|
91 |
+
"No",
|
92 |
+
"No",
|
93 |
+
],
|
94 |
+
"QF: Correction-based": [
|
95 |
+
"Yes",
|
96 |
+
"Yes",
|
97 |
+
"Yes",
|
98 |
+
"No",
|
99 |
+
"No",
|
100 |
+
"No",
|
101 |
+
"No",
|
102 |
+
"No",
|
103 |
+
],
|
104 |
+
"QF: Gopher Rules": [
|
105 |
+
"Yes",
|
106 |
+
"Yes",
|
107 |
+
"Yes",
|
108 |
+
"Yes",
|
109 |
+
"No",
|
110 |
+
"Yes",
|
111 |
+
"No",
|
112 |
+
"No",
|
113 |
+
],
|
114 |
+
"QF: C4 Rules": [
|
115 |
+
"Yes",
|
116 |
+
"Yes",
|
117 |
+
"Yes",
|
118 |
+
"Yes",
|
119 |
+
"Yes",
|
120 |
+
"Yes",
|
121 |
+
"No",
|
122 |
+
"No",
|
123 |
+
],
|
124 |
+
"PI Filtering": [
|
125 |
+
"Yes",
|
126 |
+
"Yes",
|
127 |
+
"No",
|
128 |
+
"No",
|
129 |
+
"No",
|
130 |
+
"Yes",
|
131 |
+
"No",
|
132 |
+
"No",
|
133 |
+
],
|
134 |
+
"Exact Deduplication": [
|
135 |
+
"Bloom Filter",
|
136 |
+
"n/a",
|
137 |
+
"ExactSubStr",
|
138 |
+
"Bloom Filter",
|
139 |
+
"n/a",
|
140 |
+
"Bloom Filter",
|
141 |
+
"n/a",
|
142 |
+
"n/a",
|
143 |
+
],
|
144 |
+
"Fuzzy Deduplication": [
|
145 |
+
"Global",
|
146 |
+
"Local",
|
147 |
+
"Local",
|
148 |
+
"Local",
|
149 |
+
"Local",
|
150 |
+
"Local",
|
151 |
+
"Local",
|
152 |
+
"Global",
|
153 |
+
],
|
154 |
+
}
|
155 |
+
)
|
156 |
+
table_html_filter_data = data_filtering_table_data.to_html(index=False, border=0)
|
157 |
+
table_div_filter_data = Div(NotStr(table_html_filter_data), style="margin: 40px;")
|
158 |
|
159 |
def DVS(
|
160 |
left,
|
|
|
362 |
),
|
363 |
style="margin-top: 20px;",
|
364 |
),
|
365 |
+
H2("Web Data Processing Overview"),
|
366 |
+
P("The following section provides explicit details covering the reasoning and decisions behind each of the filters we applied. The table below provides a high-level comparison of TxT360's filtering with that of other commonly used pretraining datasets."),
|
367 |
+
table_div_filter_data,
|
368 |
+
P("Our filtering rate is illustrated below. Before deduplication, our filtering rate is comparable to RefinedWeb. During global deduplication, we removed approximately 85.89% of the data, significantly higher than previous works, indicating a large number of duplicates across dumps. "),
|
369 |
+
Img(src="images/filter_rate.jpg", height = "300", width = "600" ),
|
370 |
+
P("Note: All percentages are based on the number of documents. The gray bars represent the relative percentages of removed documents at each step, while the colorful bars represent the percentages of retained documents relative to the total number of documents in the raw Common Crawl."),
|
371 |
H3("1. Document Preparation"),
|
372 |
|
373 |
H4("1.1 Text Extraction"),
|
|
|
378 |
we found WET files to include boilerplate content like navigation menus, ads, and other irrelevant texts.
|
379 |
Accordingly, our pipeline starts from raw WARC files, reads with the warcio library, and extracts texts using trafilatura.
|
380 |
"""),
|
381 |
+
P("We directly read WARC files instead of WET files and extracted text using Trafilatura. Similar to RefinedWeb, we avoid using Machine Learning (ML)-based metrics for filtering documents to prevent bias introduced by ML models. Importantly, we apply global deduplication across the entire dataset, whereas previous works only use local deduplication. Note that although The Pile also employed global deduplication on its web data (Pile-CC), this accounted for just 0.6% of 74 snapshots."),
|
382 |
+
|
383 |
+
|
384 |
DV2("data/sample_wet.json", "data/sample_warc.json", 3),
|
385 |
|
386 |
H4("1.2 Language Identification"),
|