victormiller committed on
Commit
b961e8c
·
verified ·
1 Parent(s): e636662

Update curated.py

Browse files
Files changed (1) hide show
  1. curated.py +12 -22
curated.py CHANGED
@@ -528,10 +528,9 @@ filtering_process = Div(
528
  Div(
529
  H3("Wikipedia"),
530
  P("Wikipedia is an encyclopedia form of high-quality text data used for language modeling. We have included filtered and deduplicated versions of complete Wikipedia data directly provided by the Wikipedia Foundation for more than 350 languages."),
531
- H4("Download and Extraction"),
532
- P("The Wikimedia dataset was downloaded from the official snapshot on Huggingface: ", A("https://huggingface.co/datasets/wikimedia/wikipedia/tree/main", href="https://huggingface.co/datasets/wikimedia/wikipedia/tree/main"), ". The", D_code("huggingface dataset.to_json", language="python"), " function was used to convert the original parqet format to the jsonl format."),
533
- H4("Filtering"),
534
- P("Manual inspection of the dataset demostrated high quality content. Only one filter was used to remove articles with few words. Based normal sentence constructs, the article was kept if it contained 10 or more words. Any article with fewer than 10 words was removed."),
535
  table_div_wikipedia,
536
  Details(
537
  Summary("Wikipedia Filtering Examples"),
@@ -552,8 +551,8 @@ filtering_process = Div(
552
  Div(
553
  H3("ArXiv"),
554
  P("The ArXiv dataset is a vast collection of preprint research papers primarily in Mathematics, Computer Science, and Physics. Established in 1991, it offers high-quality text and mathematical knowledge, making it an invaluable resource for academic and scientific research. ArXiv papers are typically written in LaTeX, a popular typesetting system for these fields. We have extracted the information from latex and converted it into a text format."),
555
- H4("Download and Extraction"),
556
- P("All the data was downloaded in original latex format from Arxiv official S3 dump ", A("s3://arxic/src", href="s3://arxic/src"), ". We try to encode the downloaded data into utf-8 or guess encoding using chardet library. After that pandoc was used to extract information from the latex files and saved as markdown format", D_code("pandoc -s {tex} -o out/{out_name}.md --wrap=none", language="python"), ". All markdowns were combined to create jsonl files."),
557
  H4("Filtering"),
558
  P("Multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset (citation needed)"),
559
  Ol(
@@ -678,8 +677,7 @@ filtering_process = Div(
678
  Div(
679
  H3("Phil Papers"),
680
  P("Papers from the PhilPapers database, a comprehensive index and bibliography of philosophy research maintained by the Center for Digital Philosophy at the University of Western Ontario."),
681
- H4("Download and Extraction"),
682
- P("Original PDF files download from", A("https://philarchive.org/oai.pl", href="https://philarchive.org/oai.pl"), ". All available PDF's were downloaded. Each PDF was converted to text using java", D_code("-jar ../philpapers_resources/src/pdfbox-app-2.0.21.jar ExtractText {f0} {FOUT.name}", language="java"), ". After converting to text formatting, a language was detected and added using the langdetect (citation needed) library."),
683
  H4("Filtering"),
684
  Ol(
685
  Li("Many filters were used to clean the phil papers like double whitespaces, new lines etc. All filter details are here: https://github.com/thoppe/The-Pile-PhilPapers/blob/master/pdf_filter.py"),
@@ -704,8 +702,7 @@ filtering_process = Div(
704
  Div(
705
  H3("Europarl"),
706
  P("A collection of multilingual parallel corpora of parliamentary debates from the European Parliament. This is a high-quality legacy dataset earlier used for translation tasks."),
707
- H4("Download and Extraction"),
708
- P("Original dataset was downloaded from", A("http://www.statmt.org/europarl/v7/europarl.tgz", href="http://www.statmt.org/europarl/v7/europarl.tgz"),". The files were converted to jsonl lines for filtering."),
709
  H4("Filtering"),
710
  P("EuroParl was initially filtered during the download process. Documents with fewer than 200 characters were removed. The documents also contained 'TAGS' which were removed."),
711
  table_div_up,
@@ -744,8 +741,7 @@ filtering_process = Div(
744
  Div(
745
  H3("USPTO"),
746
  P("Patent documents from the United States Patent and Trademark Office."),
747
- H4("Download and Extraction"),
748
- P("Data was downloaded and extracted using tags from", A("https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/", href="https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/"),". There were three different formats that needed three different functions to download and extract the data based on year: I(Pre_2002), 2002_to_2004, and post_2004."),
749
  H4("Filtering"),
750
  Ol(
751
  Li("Language Filter: English"),
@@ -802,8 +798,7 @@ filtering_process = Div(
802
  Div(
803
  H3("StackExchange"),
804
  P("A network of question-and-answer websites on various subjects, including programming, science, mathematics, and more. This is one of the largest publicly available repositories for question-answer pairs. We have included comments also to include an overall discussion on each post."),
805
- H4("Download and Extraction"),
806
- P("The archive dataset was used to download all data from StackExchange and StackExchange's sub URLs including: ", A("math.stackexchange.com", href="math.stackexchange.com"),". Raw data was extracted an XML format and only two files Posts.xml and Comments.xml were considered. To match the StackExchange hierarchy, each file was parsed using post_id to connect questions to answers and then to comments."),
807
  P("""
808
  1. Questions:
809
  2. Comment1:
@@ -839,8 +834,7 @@ filtering_process = Div(
839
  Div(
840
  H3("Ubuntu IRC"),
841
  P("Chat logs from the Ubuntu Internet Relay Chat (IRC) channels on the Freenode IRC chat server. This data is also another form of dialog dataset on niche topics."),
842
- H4("Download and Extraction"),
843
- P("The dataset was downloaded from:", A("https://irclogs.ubuntu.com/{date.year}/{date.month:02d}/{date.day:02d}/", href="https://irclogs.ubuntu.com/{date.year}/{date.month:02d}/{date.day:02d}/"), " based on the year."),
844
  P("During extraction, the logs were cleaned using following functions:"),
845
  D_code("""
846
  def exclude_system(x):
@@ -867,8 +861,7 @@ filtering_process = Div(
867
  Div(
868
  H3("DM Math"),
869
  P("DeepMind Math dataset with generated questions from various topics like algebra, calculus, geometry, etc. Maths data is included to improve model reasoning abilities in the downstream tasks."),
870
- H4("Download and Extraction"),
871
- P("The dataset was downloaded rirectly downloaded from the Huggingface repo:", A("https://huggingface.co/datasets/deepmind/math_dataset",href="https://huggingface.co/datasets/deepmind/math_dataset"), ". The data was converted to the jsonl format where lines is represented as:"),
872
  D_code("""
873
  Question: TEXT
874
  Answer: TEXT""", block="block", language="python"),
@@ -900,10 +893,7 @@ filtering_process = Div(
900
  Div(
901
  H3("PG-19"),
902
  P("A collection of books from Project Gutenberg, a digital library of public domain works. This contains all the books that were published before 1919."),
903
- H4("Download and Extraction"),
904
- Ol(
905
- Li("The dataset was downloaded directly from Huggingface:", A("https://huggingface.co/datasets/deepmind/pg19", href="https://huggingface.co/datasets/deepmind/pg19"), "."),
906
- ),
907
  H4("Filtering"),
908
  Ol(
909
  Li("Language Filter: ???"),
 
528
  Div(
529
  H3("Wikipedia"),
530
  P("Wikipedia is an encyclopedia form of high-quality text data used for language modeling. We have included filtered and deduplicated versions of complete Wikipedia data directly provided by the Wikipedia Foundation for more than 350 languages."),
531
+ P(B("Download and Extraction: "), "The Wikimedia dataset was downloaded from the official snapshot on Huggingface: ", A("https://huggingface.co/datasets/wikimedia/wikipedia/tree/main", href="https://huggingface.co/datasets/wikimedia/wikipedia/tree/main"), ". The", D_code("huggingface dataset.to_json", language="python"), " function was used to convert the original parquet format to the jsonl format."),
532
+
533
+ P(B("Filtering: "), "Manual inspection of the dataset demonstrated high quality content. Only one filter was used to remove articles with few words. Based on normal sentence constructs, the article was kept if it contained 10 or more words. Any article with fewer than 10 words was removed."),
 
534
  table_div_wikipedia,
535
  Details(
536
  Summary("Wikipedia Filtering Examples"),
 
551
  Div(
552
  H3("ArXiv"),
553
  P("The ArXiv dataset is a vast collection of preprint research papers primarily in Mathematics, Computer Science, and Physics. Established in 1991, it offers high-quality text and mathematical knowledge, making it an invaluable resource for academic and scientific research. ArXiv papers are typically written in LaTeX, a popular typesetting system for these fields. We have extracted the information from latex and converted it into a text format."),
554
+
555
+ P(B("Download and Extraction: "),"All the data was downloaded in original latex format from Arxiv official S3 dump ", A("s3://arxiv/src", href="s3://arxiv/src"), ". We try to encode the downloaded data into utf-8 or guess encoding using chardet library. After that pandoc was used to extract information from the latex files and saved as markdown format", D_code("pandoc -s {tex} -o out/{out_name}.md --wrap=none", language="python"), ". All markdowns were combined to create jsonl files."),
556
  H4("Filtering"),
557
  P("Multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset (citation needed)"),
558
  Ol(
 
677
  Div(
678
  H3("Phil Papers"),
679
  P("Papers from the PhilPapers database, a comprehensive index and bibliography of philosophy research maintained by the Center for Digital Philosophy at the University of Western Ontario."),
680
+ P(B("Download and Extraction: "), "Original PDF files were downloaded from", A("https://philarchive.org/oai.pl", href="https://philarchive.org/oai.pl"), ". All available PDFs were downloaded. Each PDF was converted to text using java", D_code("-jar ../philpapers_resources/src/pdfbox-app-2.0.21.jar ExtractText {f0} {FOUT.name}", language="java"), ". After converting to text formatting, a language was detected and added using the langdetect (citation needed) library."),
 
681
  H4("Filtering"),
682
  Ol(
683
  Li("Many filters were used to clean the phil papers like double whitespaces, new lines etc. All filter details are here: https://github.com/thoppe/The-Pile-PhilPapers/blob/master/pdf_filter.py"),
 
702
  Div(
703
  H3("Europarl"),
704
  P("A collection of multilingual parallel corpora of parliamentary debates from the European Parliament. This is a high-quality legacy dataset earlier used for translation tasks."),
705
+ P(B("Download and Extraction: "), "Original dataset was downloaded from", A("http://www.statmt.org/europarl/v7/europarl.tgz", href="http://www.statmt.org/europarl/v7/europarl.tgz"),". The files were converted to jsonl lines for filtering."),
 
706
  H4("Filtering"),
707
  P("EuroParl was initially filtered during the download process. Documents with fewer than 200 characters were removed. The documents also contained 'TAGS' which were removed."),
708
  table_div_up,
 
741
  Div(
742
  H3("USPTO"),
743
  P("Patent documents from the United States Patent and Trademark Office."),
744
+ P(B("Download and Extraction: "), "Data was downloaded and extracted using tags from", A("https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/", href="https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/"),". There were three different formats that needed three different functions to download and extract the data based on year: I(Pre_2002), 2002_to_2004, and post_2004."),
 
745
  H4("Filtering"),
746
  Ol(
747
  Li("Language Filter: English"),
 
798
  Div(
799
  H3("StackExchange"),
800
  P("A network of question-and-answer websites on various subjects, including programming, science, mathematics, and more. This is one of the largest publicly available repositories for question-answer pairs. We have included comments also to include an overall discussion on each post."),
801
+ P(B("Download and Extraction: "), "The archive dataset was used to download all data from StackExchange and StackExchange's sub URLs including: ", A("math.stackexchange.com", href="math.stackexchange.com"),". Raw data was extracted in XML format and only two files Posts.xml and Comments.xml were considered. To match the StackExchange hierarchy, each file was parsed using post_id to connect questions to answers and then to comments."),
 
802
  P("""
803
  1. Questions:
804
  2. Comment1:
 
834
  Div(
835
  H3("Ubuntu IRC"),
836
  P("Chat logs from the Ubuntu Internet Relay Chat (IRC) channels on the Freenode IRC chat server. This data is also another form of dialog dataset on niche topics."),
837
+ P(B("Download and Extraction: "), "The dataset was downloaded from:", A("https://irclogs.ubuntu.com/{date.year}/{date.month:02d}/{date.day:02d}/", href="https://irclogs.ubuntu.com/{date.year}/{date.month:02d}/{date.day:02d}/"), " based on the year."),
 
838
  P("During extraction, the logs were cleaned using following functions:"),
839
  D_code("""
840
  def exclude_system(x):
 
861
  Div(
862
  H3("DM Math"),
863
  P("DeepMind Math dataset with generated questions from various topics like algebra, calculus, geometry, etc. Maths data is included to improve model reasoning abilities in the downstream tasks."),
864
+ P(B("Download and Extraction: "), "The dataset was downloaded directly from the Huggingface repo:", A("https://huggingface.co/datasets/deepmind/math_dataset",href="https://huggingface.co/datasets/deepmind/math_dataset"), ". The data was converted to the jsonl format where each line is represented as:"),
 
865
  D_code("""
866
  Question: TEXT
867
  Answer: TEXT""", block="block", language="python"),
 
893
  Div(
894
  H3("PG-19"),
895
  P("A collection of books from Project Gutenberg, a digital library of public domain works. This contains all the books that were published before 1919."),
896
+ P(B("Download and Extraction: "), "The dataset was downloaded directly from Huggingface:", A("https://huggingface.co/datasets/deepmind/pg19", href="https://huggingface.co/datasets/deepmind/pg19"), "."),
 
 
 
897
  H4("Filtering"),
898
  Ol(
899
  Li("Language Filter: ???"),