victormiller
committed on
Commit
•
61e28b6
1
Parent(s):
0dbc84f
Update curated.py
Browse files- curated.py +0 -1
curated.py
CHANGED
@@ -445,7 +445,6 @@ filtering_process = Div(
|
|
445 |
Section(
|
446 |
Div(
|
447 |
H3("Wikipedia"),
|
448 |
-
H4("What is Wikipedia:")
|
449 |
P("Wikipedia is an encyclopedia form of high-quality text data used for language modeling. We have included filtered and deduplicated versions of complete Wikipedia data directly provided by the Wikipedia Foundation for more than 350 languages."),
|
450 |
H4("Download and Extraction"),
|
451 |
P("The Wikimedia dataset was downloaded from the official snapshot on Huggingface: ", A("https://huggingface.co/datasets/wikimedia/wikipedia/tree/main", href="https://huggingface.co/datasets/wikimedia/wikipedia/tree/main"), ". The", D_code("huggingface dataset.to_json", language="python"), " function was used to convert the original parqet format to the jsonl format."),
|
|
|
445 |
Section(
|
446 |
Div(
|
447 |
H3("Wikipedia"),
|
|
|
448 |
P("Wikipedia is an encyclopedia form of high-quality text data used for language modeling. We have included filtered and deduplicated versions of complete Wikipedia data directly provided by the Wikipedia Foundation for more than 350 languages."),
|
449 |
H4("Download and Extraction"),
|
450 |
P("The Wikimedia dataset was downloaded from the official snapshot on Huggingface: ", A("https://huggingface.co/datasets/wikimedia/wikipedia/tree/main", href="https://huggingface.co/datasets/wikimedia/wikipedia/tree/main"), ". The", D_code("huggingface dataset.to_json", language="python"), " function was used to convert the original parqet format to the jsonl format."),
|