victormiller
committed on
Update web.py
Browse files
web.py
CHANGED
@@ -254,46 +254,14 @@ def web_data():
|
|
254 |
Li("Local Deduplication", style = "margin-bottom: 5px"),
|
255 |
Li("Each section is complete with code and comparisons to Dolma, DataTrove, and/or RedPajama-V-2", style = "margin-bottom: 5px"),
|
256 |
),
|
257 |
-
),
|
258 |
-
|
259 |
-
Div(
|
260 |
-
H2("Common Crawl Data Processing Summary"),
|
261 |
-
P(
|
262 |
-
"To generate a high-quality dataset from large-scale webpages, we have investigated the processing steps used by the community and made our choices based on careful manual inspection. Starting from ",
|
263 |
-
A("Common Crawl", href="https://commoncrawl.org/"),
|
264 |
-
", our process can be summarized as five main steps: document preparation, line-level removal, document-level filtering, deduplication and PII removal.",
|
265 |
-
),
|
266 |
-
style="margin-top: 20px;",
|
267 |
-
),
|
268 |
-
Div(
|
269 |
-
Ul(
|
270 |
-
Li(
|
271 |
-
A(
|
272 |
-
"Raw Documentation",
|
273 |
-
href="https://drive.google.com/drive/folders/1mIJ-Zx8tRhohFdj4ByMToNz1u_9Saa8W?usp=drive_link",
|
274 |
-
)
|
275 |
-
),
|
276 |
-
Li(
|
277 |
-
A(
|
278 |
-
"Github link of Web Data Pipeline",
|
279 |
-
href="https://github.com/CIAI-LLM/WebDataProcessing.git",
|
280 |
-
)
|
281 |
-
),
|
282 |
-
),
|
283 |
-
style="""
|
284 |
-
background-color: #d4edda; /* Light green background */
|
285 |
-
border: 1px solid #c3e6cb; /* Green border */
|
286 |
-
border-radius: 5px;
|
287 |
-
padding: 15px 15px 0px 15px;
|
288 |
-
margin-bottom: 15px
|
289 |
-
""",
|
290 |
),
|
291 |
id="section1",),
|
292 |
Section(
|
293 |
H3("TxT360 CommonCrawl Filtering vs Other Pretraining Datasets"),
|
294 |
P("The following section provides explicit details covering the reasoning and decisions behind each of the filters we applied. The table below provides a high-level comparison of TxT360's filtering compared to other commonly used pretraining datasets."),
|
295 |
table_div_filter_data,
|
296 |
-
P("The table below provides a comparison of the quality filters that have been applied to each dataset."),
|
297 |
table_div_qf_filter_data,
|
298 |
P("Our filtering rate is illustrated below. Before deduplication, our filtering rate is comparable to RefinedWeb. During global deduplication, we removed approximately 85.89% of the data, significantly higher than previous works, indicating a large number of duplicates across dumps. "),
|
299 |
Img(src="images/filter_rate.jpg", height = "300", width = "600" ),
|
@@ -408,7 +376,7 @@ def web_data():
|
|
408 |
"""),
|
409 |
|
410 |
Details(
|
411 |
-
Summary("
|
412 |
Div (
|
413 |
DVS(urls_high_matches, "24 URL domains with more than 4k matches"),
|
414 |
style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
|
@@ -425,7 +393,7 @@ def web_data():
|
|
425 |
We manually removed the following 6 domains from the UT1 blocklist so that they will not be removed from our dataset.
|
426 |
"""),
|
427 |
Details(
|
428 |
-
Summary("6
|
429 |
Div (
|
430 |
DVS(urls_false_positives, "6 url domains that are removed from the blocklist"),
|
431 |
style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
|
@@ -439,7 +407,7 @@ def web_data():
|
|
439 |
),
|
440 |
|
441 |
Details(
|
442 |
-
Summary("
|
443 |
Div(
|
444 |
DV(
|
445 |
"data/bad_url_doc.jsonl",
|
@@ -460,7 +428,7 @@ def web_data():
|
|
460 |
"""),
|
461 |
|
462 |
Details(
|
463 |
-
Summary("
|
464 |
Div (
|
465 |
DVS(
|
466 |
non_web_urls,
|
@@ -477,7 +445,7 @@ def web_data():
|
|
477 |
),
|
478 |
|
479 |
Details(
|
480 |
-
Summary("
|
481 |
Div (
|
482 |
DV("data/sample_url_exclusion.json", 0, "Sample documents whose urls are in our curated url domain list"),
|
483 |
style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
|
@@ -539,7 +507,7 @@ def web_data():
|
|
539 |
The additional keyword could be any one of “enable” / “disable” / “require” / “activate” / “browser”.
|
540 |
"""),
|
541 |
Details(
|
542 |
-
Summary("Javascript
|
543 |
Div (
|
544 |
DV(
|
545 |
"data/sample_java.jsonl",
|
@@ -589,7 +557,7 @@ def web_data():
|
|
589 |
the bad words from English but also consider the bad words from other languages.
|
590 |
"""),
|
591 |
Details(
|
592 |
-
Summary("
|
593 |
Div (
|
594 |
DVS(
|
595 |
json.load(open("data/toxic_lines.json")),
|
@@ -611,7 +579,7 @@ def web_data():
|
|
611 |
In this section, we introduce each quality signal used to filter out low-quality documents.
|
612 |
"""),
|
613 |
Details(
|
614 |
-
Summary("
|
615 |
Div (
|
616 |
DVS(
|
617 |
json.load(open("data/all_signals.json")),
|
@@ -732,7 +700,6 @@ def web_data():
|
|
732 |
We adjusted the method in Dolma for counting characters within lines by excluding whitespace. This modification
|
733 |
ensures consistency with the overall document character count calculation.
|
734 |
"""),
|
735 |
-
H3("TxT360 Implementation"),
|
736 |
Details(
|
737 |
Summary("TxT360 Implementation"),
|
738 |
Div(
|
@@ -1153,9 +1120,6 @@ def web_data():
|
|
1153 |
margin-bottom: 15px
|
1154 |
""",
|
1155 |
),
|
1156 |
-
H5(
|
1157 |
-
"Sample Documents Filtered by the Fraction of Characters in Duplicated N-grams (n=5,...,10)"
|
1158 |
-
),
|
1159 |
Details(
|
1160 |
Summary("Documents Filtered by Duplicated n-Grams (n=5,...,10)"),
|
1161 |
Div(
|
@@ -1300,13 +1264,22 @@ def web_data():
|
|
1300 |
Li("the words that contain at least one alphabetic character are less than 80% of the whole words", style = "margin-bottom: 5px"),
|
1301 |
Li("it contains less than two of the stop words (the, be, to, of, and, that, have, with", style = "margin-bottom: 5px"),
|
1302 |
),
|
1303 |
-
H3("Word Count"),
|
1304 |
Details(
|
|
|
1305 |
Summary("Implementations from Dolma"),
|
1306 |
D_code("""
|
1307 |
words = text.split()
|
1308 |
word_count = len(words)
|
1309 |
""", block="block", language="python"),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1310 |
),
|
1311 |
Details(
|
1312 |
Summary("Implementations from RedPajama-V2"),
|
|
|
254 |
Li("Local Deduplication", style = "margin-bottom: 5px"),
|
255 |
Li("Each section is complete with code and comparisons to Dolma, DataTrove, and/or RedPajama-V-2", style = "margin-bottom: 5px"),
|
256 |
),
|
257 |
+
P("To generate a high-quality dataset from large-scale webpages, we have investigated the processing steps used by the community and made our choices based on careful manual inspection. Below is a comprehensive list of the datasets we reviewed and a comparison of the filters we have applied."),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
258 |
),
|
259 |
id="section1",),
|
260 |
Section(
|
261 |
H3("TxT360 CommonCrawl Filtering vs Other Pretraining Datasets"),
|
262 |
P("The following section provides explicit details covering the reasoning and decisions behind each of the filters we applied. The table below provides a high-level comparison of TxT360's filtering compared to other commonly used pretraining datasets."),
|
263 |
table_div_filter_data,
|
264 |
+
P("The table below provides a comparison of the quality filters that have been applied to each dataset. Of note, TxT360 does not use any machine learning (ML) based filters. ML filters are a useful and efficient filtering process that should be considered for any filtering project. However, we are leaving that option to TxT360's end users."),
|
265 |
table_div_qf_filter_data,
|
266 |
P("Our filtering rate is illustrated below. Before deduplication, our filtering rate is comparable to RefinedWeb. During global deduplication, we removed approximately 85.89% of the data, significantly higher than previous works, indicating a large number of duplicates across dumps. "),
|
267 |
Img(src="images/filter_rate.jpg", height = "300", width = "600" ),
|
|
|
376 |
"""),
|
377 |
|
378 |
Details(
|
379 |
+
Summary(" List of 24 URLs with 4k+ Matches"),
|
380 |
Div (
|
381 |
DVS(urls_high_matches, "24 URL domains with more than 4k matches"),
|
382 |
style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
|
|
|
393 |
We manually removed the following 6 domains from the UT1 blocklist so that they will not be removed from our dataset.
|
394 |
"""),
|
395 |
Details(
|
396 |
+
Summary("6 URLS Manually Removed from the Blocklist"),
|
397 |
Div (
|
398 |
DVS(urls_false_positives, "6 url domains that are removed from the blocklist"),
|
399 |
style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
|
|
|
407 |
),
|
408 |
|
409 |
Details(
|
410 |
+
Summary("Blocked Document Examples from the URL Blocklist"),
|
411 |
Div(
|
412 |
DV(
|
413 |
"data/bad_url_doc.jsonl",
|
|
|
428 |
"""),
|
429 |
|
430 |
Details(
|
431 |
+
Summary("TxT360 Excluded URLs"),
|
432 |
Div (
|
433 |
DVS(
|
434 |
non_web_urls,
|
|
|
445 |
),
|
446 |
|
447 |
Details(
|
448 |
+
Summary("TxT360 Excluded URLs Example Documents"),
|
449 |
Div (
|
450 |
DV("data/sample_url_exclusion.json", 0, "Sample documents whose urls are in our curated url domain list"),
|
451 |
style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
|
|
|
507 |
The additional keyword could be any one of “enable” / “disable” / “require” / “activate” / “browser”.
|
508 |
"""),
|
509 |
Details(
|
510 |
+
Summary("Javascript Documents Filtered by C4 but Kept in TxT360"),
|
511 |
Div (
|
512 |
DV(
|
513 |
"data/sample_java.jsonl",
|
|
|
557 |
the bad words from English but also consider the bad words from other languages.
|
558 |
"""),
|
559 |
Details(
|
560 |
+
Summary("Toxic Line Examples (WARNING: MAY CONTAIN OFFENSIVE MATERIAL)"),
|
561 |
Div (
|
562 |
DVS(
|
563 |
json.load(open("data/toxic_lines.json")),
|
|
|
579 |
In this section, we introduce each quality signal used to filter out low-quality documents.
|
580 |
"""),
|
581 |
Details(
|
582 |
+
Summary("Quality Signals Used For Filtering"),
|
583 |
Div (
|
584 |
DVS(
|
585 |
json.load(open("data/all_signals.json")),
|
|
|
700 |
We adjusted the method in Dolma for counting characters within lines by excluding whitespace. This modification
|
701 |
ensures consistency with the overall document character count calculation.
|
702 |
"""),
|
|
|
703 |
Details(
|
704 |
Summary("TxT360 Implementation"),
|
705 |
Div(
|
|
|
1120 |
margin-bottom: 15px
|
1121 |
""",
|
1122 |
),
|
|
|
|
|
|
|
1123 |
Details(
|
1124 |
Summary("Documents Filtered by Duplicated n-Grams (n=5,...,10)"),
|
1125 |
Div(
|
|
|
1264 |
Li("the words that contain at least one alphabetic character are less than 80% of the whole words", style = "margin-bottom: 5px"),
|
1265 |
Li("it contains less than two of the stop words (the, be, to, of, and, that, have, with", style = "margin-bottom: 5px"),
|
1266 |
),
|
1267 |
+
H3("Word Count Filters"),
|
1268 |
Details(
|
1269 |
+
Div(
|
1270 |
Summary("Implementations from Dolma"),
|
1271 |
D_code("""
|
1272 |
words = text.split()
|
1273 |
word_count = len(words)
|
1274 |
""", block="block", language="python"),
|
1275 |
+
style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
|
1276 |
+
),
|
1277 |
+
style="""
|
1278 |
+
background-color: #EAFFF1; /* Light green background */
|
1279 |
+
padding: 15px;
|
1280 |
+
border-radius: 12px;
|
1281 |
+
margin-bottom: 15px
|
1282 |
+
""",
|
1283 |
),
|
1284 |
Details(
|
1285 |
Summary("Implementations from RedPajama-V2"),
|