victormiller committed
Commit 913dc7b • 1 Parent: 8acb3f0
Update curated.py
curated.py CHANGED (+63 -31)
@@ -680,17 +680,17 @@ filtering_process = Div(
680       P(B("Download and Extraction: "), "Original PDF files were downloaded from ", A("https://philarchive.org/oai.pl", href="https://philarchive.org/oai.pl"), ". All available PDFs were downloaded. Each PDF was converted to text using java ", D_code("-jar ../philpapers_resources/src/pdfbox-app-2.0.21.jar ExtractText {f0} {FOUT.name}", language="java"), ". After conversion to text, the language was detected and added using the langdetect (citation needed) library."),
681       H4("Filtering"),
682       Ul(
683 -         Li(P(B("Hyphenation Removal:"), D_code("end-of", language="python"), " becomes ", D_code("end of", language="python"))),
684 -         Li(P(B("Newline Filtering:"), D_code("This is\\na sentence.", language="python"), " becomes ", D_code("This is a sentence.", language="python"))),
685 -         Li(P(B("Header/Footer Filtering:"), D_code("(c) 2023 Company Name.", language="python"), " is removed")),
686 -         Li(P(B("Double Whitespace Filtering:"), D_code("This is  a test.", language="python"), " becomes ", D_code("This is a test.", language="python"))),
687 -         Li(P(B("Mean Line Length Check: "), "removes paragraphs with an average line length of < 2.0")),
688 -         Li(P(B("CID Percentage Filter: "), "removes LaTeX-heavy paragraphs in which over 10% of characters are “CID” font artifacts.")),
689 -         Li(P(B("Letterness Filter: "), "discards paragraphs with a low proportion of letters")),
690 -         Li(P(B("Removing Leading/Trailing Numbers: "), "removes numbers at the start or end of paragraphs. ", D_code("1 This is a sentence.", language="python"), " becomes ", D_code("This is a sentence.", language="python"))),
691 -         Li(P(B("Fixing Unicode Issues: "), "fixes Unicode issues.")),
692 -         Li(P(B("Combining Diacritics Correction: "), D_code("a'", language="python"), " becomes ", D_code("å", language="python"))),
693 -         Li(P(B("Unigram Log Probability: "), "the document must have an average unigram log probability higher than -20.")),
694       ),
695       table_div_phil,
696       Details(
@@ -714,7 +714,39 @@ filtering_process = Div(
714       P("A collection of multilingual parallel corpora of parliamentary debates from the European Parliament. This is a high-quality legacy dataset previously used for translation tasks."),
715       P(B("Download and Extraction: "), "The original dataset was downloaded from ", A("http://www.statmt.org/europarl/v7/europarl.tgz", href="http://www.statmt.org/europarl/v7/europarl.tgz"), ". The files were converted to JSON lines for filtering."),
716       H4("Filtering"),
717 -     P("EuroParl was initially filtered during the download process. Documents with fewer than 200 characters were removed. The documents also contained HTML tags, which were removed."),
718       table_div_up,
719       Details(
720       Summary("EuroParl Filtering Examples"),
@@ -736,12 +768,12 @@ filtering_process = Div(
736       H3("HackerNews"),
737       P("A high-quality dialogue-based dataset of user comments on links, aggregated by Y Combinator."),
738       P(B("Download and Extraction: "), "The dataset was downloaded from the HackerNews repo here: ", A("https://hacker-news.firebaseio.com/v0/item/", href="https://hacker-news.firebaseio.com/v0/item/"), ". The dataset was parsed using the Story ID. In this dataset each post is a story, and each reply is considered a subsequent story. Story IDs from 1 to 37500000 were considered. The URL for each Story ID was pinged; if an ID returned an error, it was removed. Each request was given a 2-second wait to account for network time."),
739 -     P("The HackerNews dataset contains a vast number of stories and is known for lively discussions. Due to the number of replies a story may contain, only the longest comment thread for each story was sampled past level 3. All stories included the title (1st level) and all direct replies (2nd level). Replies to the replies (3rd level) are only included for X STORIES."),
740       H4("Filtering"),
741 -     Ol(
742 -         Li("Language Filter: English"),
743 -         Li("Minimum Word Count Filter: 10"),
744 -         Li("Unigram Log Probability"),
745       ),
746       table_div_hn,
747       ),
@@ -750,12 +782,12 @@ filtering_process = Div(
750       Div(
751       H3("USPTO"),
752       P("Patent documents from the United States Patent and Trademark Office."),
753 -     P(B("Download and Extraction: "), "Data was downloaded and extracted using tags from ", A("https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/", href="https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/"), ". There were three different formats that needed three different functions to download and extract the data based on year: I(Pre_2002), 2002_to_2004, and post_2004. We used the exact code used in The Pile (citation needed)."),
754       H4("Filtering"),
755       Ol(
756 -         Li("Language Filter: English"),
757 -         Li("Minimum Word Count Filter: 50"),
758 -         Li("Unigram Log Probability"),
759       ),
760       table_div_uspto,
761       ),
@@ -778,9 +810,9 @@ filtering_process = Div(
778       P("All content was downloaded, leading to a high number of documents being filtered during local deduplication. Following The Pile, priority was given to plain_text first, followed by the columns in the table in reverse order."),
779       H4("Filtering"),
780       Ol(
781 -         Li("Language Filter: English"),
782 -         Li("Minimum Word Count Filter: 50"),
783 -         Li("Unigram Log Probability"),
784       ),
785       H4("Local Deduplication Process"),
786       Ol(
@@ -821,7 +853,7 @@ filtering_process = Div(
821       """),
822       H4("Filtering"),
823       Ol(
824 -         Li("Minimum Word Count Filter: 10"),
825       ),
826       table_div_se,
827       Details(
@@ -859,9 +891,9 @@ filtering_process = Div(
859       """, block="block", language="python"),
860       H4("Filtering"),
861       Ol(
862 -         Li("Language Filter: English"),
863 -         Li("Minimum Word Count Filter: 10"),
864 -         Li("Unigram Log Probability"),
865       ),
866       table_div_uirc,
867       ),
@@ -905,9 +937,9 @@ filtering_process = Div(
905       P(B("Download and Extraction: "), "The dataset was downloaded directly from Hugging Face: ", A("https://huggingface.co/datasets/deepmind/pg19", href="https://huggingface.co/datasets/deepmind/pg19"), "."),
906       H4("Filtering"),
907       Ol(
908 -         Li("Language Filter: ???"),
909 -         Li("Minimum Word Count Filter: 20"),
910 -         Li("Unigram Log Probability"),
911       ),
912       table_div_pg19,
913       Details(

680       P(B("Download and Extraction: "), "Original PDF files were downloaded from ", A("https://philarchive.org/oai.pl", href="https://philarchive.org/oai.pl"), ". All available PDFs were downloaded. Each PDF was converted to text using java ", D_code("-jar ../philpapers_resources/src/pdfbox-app-2.0.21.jar ExtractText {f0} {FOUT.name}", language="java"), ". After conversion to text, the language was detected and added using the langdetect (citation needed) library."),
681       H4("Filtering"),
682       Ul(
683 +         Li(P(B("Hyphenation Removal:"), D_code("end-of", language="python"), " becomes ", D_code("end of", language="python")), style="margin-bottom: 2px"),
684 +         Li(P(B("Newline Filtering:"), D_code("This is\\na sentence.", language="python"), " becomes ", D_code("This is a sentence.", language="python")), style="margin-bottom: 2px"),
685 +         Li(P(B("Header/Footer Filtering:"), D_code("(c) 2023 Company Name.", language="python"), " is removed"), style="margin-bottom: 2px"),
686 +         Li(P(B("Double Whitespace Filtering:"), D_code("This is  a test.", language="python"), " becomes ", D_code("This is a test.", language="python")), style="margin-bottom: 2px"),
687 +         Li(P(B("Mean Line Length Check: "), "removes paragraphs with an average line length of < 2.0"), style="margin-bottom: 2px"),
688 +         Li(P(B("CID Percentage Filter: "), "removes LaTeX-heavy paragraphs in which over 10% of characters are “CID” font artifacts."), style="margin-bottom: 2px"),
689 +         Li(P(B("Letterness Filter: "), "discards paragraphs with a low proportion of letters"), style="margin-bottom: 2px"),
690 +         Li(P(B("Removing Leading/Trailing Numbers: "), "removes numbers at the start or end of paragraphs. ", D_code("1 This is a sentence.", language="python"), " becomes ", D_code("This is a sentence.", language="python")), style="margin-bottom: 2px"),
691 +         Li(P(B("Fixing Unicode Issues: "), "fixes Unicode issues."), style="margin-bottom: 2px"),
692 +         Li(P(B("Combining Diacritics Correction: "), D_code("a'", language="python"), " becomes ", D_code("å", language="python")), style="margin-bottom: 2px"),
693 +         Li(P(B("Unigram Log Probability: "), "the document must have an average unigram log probability higher than -20."), style="margin-bottom: 2px"),
694       ),
695       table_div_phil,
696       Details(
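For reference, the pdfbox-plus-langdetect conversion described above can be sketched as follows. This is a minimal sketch, not the project's actual pipeline: the jar path is taken from the snippet, the langdetect package is assumed to be installed, and pdf_to_record is a hypothetical helper name.

# Minimal sketch of the PDF-to-text step described above. Assumptions:
# pdfbox jar path from the snippet; langdetect installed via pip;
# pdf_to_record is a hypothetical helper, not code from this repo.
import subprocess
import tempfile

from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

PDFBOX_JAR = "../philpapers_resources/src/pdfbox-app-2.0.21.jar"

def pdf_to_record(pdf_path):
    """Convert one PDF to text with pdfbox, then tag its language."""
    with tempfile.NamedTemporaryFile(suffix=".txt") as fout:
        subprocess.run(
            ["java", "-jar", PDFBOX_JAR, "ExtractText", pdf_path, fout.name],
            check=True,
        )
        with open(fout.name, encoding="utf-8", errors="ignore") as fh:
            text = fh.read()
    try:
        language = detect(text)
    except LangDetectException:
        language = "unknown"  # empty or undetectable text
    return {"text": text, "language": language}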
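The "Unigram Log Probability" filter that recurs in the lists below keeps a document only if its average per-token unigram log probability is above -20, as stated in the PhilPapers list above. A minimal sketch under that reading; the toy counts are placeholders, since the reference corpus used to build the unigram table is not specified in this commit.

import math

# Placeholder counts; the real filter would load a table built from a
# large reference corpus (not specified in this commit).
UNIGRAM_COUNTS = {"the": 1_000_000, "of": 600_000, "sentence": 1_200}
TOTAL = sum(UNIGRAM_COUNTS.values())

def avg_unigram_logprob(text):
    tokens = text.lower().split()
    logps = [
        math.log(UNIGRAM_COUNTS.get(tok, 1) / TOTAL)  # unseen words get count 1
        for tok in tokens
    ]
    return sum(logps) / max(len(logps), 1)

def passes_unigram_filter(text, threshold=-20.0):
    return avg_unigram_logprob(text) > threshold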
714       P("A collection of multilingual parallel corpora of parliamentary debates from the European Parliament. This is a high-quality legacy dataset previously used for translation tasks."),
715       P(B("Download and Extraction: "), "The original dataset was downloaded from ", A("http://www.statmt.org/europarl/v7/europarl.tgz", href="http://www.statmt.org/europarl/v7/europarl.tgz"), ". The files were converted to JSON lines for filtering."),
716       H4("Filtering"),
717 +     P("EuroParl was initially filtered during the download process. Documents with fewer than 200 characters were removed. The documents also contained HTML tags, which were removed."),
718 +     D_code("""
719 +     Raw single line in data: <P> Hi I am speaker
720 +     After tag removal: P Hi I am speaker
721 +     We remove everything that starts with ["P", "BRK", "CHAPTER", "/P"]
722 +     and only keep tagname == SPEAKER
723 +     because lines starting with <SPEAKER> TEXT TEXT ....... have the relevant text
724 +     """, style="block", language="python"),
725 +     D_code("""
726 +     def process_tag(original_tag):
727 +         tag = original_tag.strip(">").strip("<")
728 +
729 +         # Skip empty tags
730 +         if not tag:
731 +             return None
732 +
733 +         tagname = tag.split()[0]
734 +
735 +         # Skip paragraph, break, and chapter tags
736 +         if tagname in ["P", "BRK", "CHAPTER", "/P"]:
737 +             return None
738 +
739 +         # For speaker tags, return the name
740 +         if tagname == "SPEAKER":
741 +             soup = bs4.BeautifulSoup(original_tag, "html.parser")
742 +             name = soup.speaker["name"]
743 +             return name
744 +
745 +         # Raise an error here if there is a tag we don't know
746 +         raise ValueError(f"Unknown tag {tag}")
747 +
748 +
749 +     """, style="block", language="python"),
750       table_div_up,
751       Details(
752       Summary("EuroParl Filtering Examples"),
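For reference, process_tag above relies on bs4 (BeautifulSoup), which must be imported in the surrounding module. A small usage sketch with made-up input lines:

import bs4  # required by process_tag above

# Hypothetical raw lines from the EuroParl dump.
for line in ["<P>", "<CHAPTER ID=1>", '<SPEAKER ID=42 NAME="Jane Doe">']:
    print(process_tag(line))  # -> None, None, then "Jane Doe"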
768       H3("HackerNews"),
769       P("A high-quality dialogue-based dataset of user comments on links, aggregated by Y Combinator."),
770       P(B("Download and Extraction: "), "The dataset was downloaded from the HackerNews repo here: ", A("https://hacker-news.firebaseio.com/v0/item/", href="https://hacker-news.firebaseio.com/v0/item/"), ". The dataset was parsed using the Story ID. In this dataset each post is a story, and each reply is considered a subsequent story. Story IDs from 1 to 37500000 were considered. The URL for each Story ID was pinged; if an ID returned an error, it was removed. Each request was given a 2-second wait to account for network time."),
771 +     P("The HackerNews dataset contains a vast number of stories and is known for lively discussions. Due to the number of replies a story may contain, only the longest comment thread for each story was sampled past level 3. All stories included the title (1st level) and all direct replies (2nd level). Replies to the replies (3rd level) are only included for X STORIES."),
772       H4("Filtering"),
773 +     Ul(
774 +         Li("Language Filter: English", style="margin-bottom: 2px"),
775 +         Li("Minimum Word Count Filter: 10", style="margin-bottom: 2px"),
776 +         Li("Unigram Log Probability Threshold: -20", style="margin-bottom: 2px"),
777       ),
778       table_div_hn,
779       ),
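The ID walk described above can be sketched as follows, assuming the requests library and the item endpoint of the linked Firebase API; the fetch_items helper name is an assumption, and the 2-second wait comes from the text.

import time

import requests

BASE = "https://hacker-news.firebaseio.com/v0/item/"

def fetch_items(start=1, end=37_500_000):
    """Yield HackerNews items by ID, skipping IDs that error out."""
    for story_id in range(start, end + 1):
        resp = requests.get(f"{BASE}{story_id}.json", timeout=10)
        if resp.status_code != 200 or resp.json() is None:
            continue  # per the text, erroring IDs were removed
        yield resp.json()
        time.sleep(2)  # 2-second wait to account for network time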
782       Div(
783       H3("USPTO"),
784       P("Patent documents from the United States Patent and Trademark Office."),
785 +     P(B("Download and Extraction: "), "Data was downloaded and extracted using tags from ", A("https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/", href="https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/"), ". There were three different formats that needed three different functions to download and extract the data based on year: ", I("Pre_2002"), ", ", I("2002_to_2004"), ", and ", I("post_2004"), ". We used the exact code used in The Pile (citation needed)."),
786       H4("Filtering"),
787       Ol(
788 +         Li("Language Filter: English", style="margin-bottom: 2px"),
789 +         Li("Minimum Word Count Filter: 50", style="margin-bottom: 2px"),
790 +         Li("Unigram Log Probability", style="margin-bottom: 2px"),
791       ),
792       table_div_uspto,
793       ),
810       P("All content was downloaded, leading to a high number of documents being filtered during local deduplication. Following The Pile, priority was given to plain_text first, followed by the columns in the table in reverse order."),
811       H4("Filtering"),
812       Ol(
813 +         Li("Language Filter: English", style="margin-bottom: 2px"),
814 +         Li("Minimum Word Count Filter: 50", style="margin-bottom: 2px"),
815 +         Li("Unigram Log Probability", style="margin-bottom: 2px"),
816       ),
817       H4("Local Deduplication Process"),
818       Ol(
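A minimal sketch of the column-priority rule described above (plain_text first, then the table's columns in reverse order). Every column name other than plain_text is a hypothetical placeholder, since the table itself is outside this hunk.

# Column names other than plain_text are hypothetical placeholders.
COLUMN_PRIORITY = ["plain_text", "column_c", "column_b", "column_a"]

def pick_text(record):
    """Return the highest-priority non-empty column of a record."""
    for col in COLUMN_PRIORITY:
        value = record.get(col)
        if value:
            return value
    return None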
853       """),
854       H4("Filtering"),
855       Ol(
856 +         Li("Minimum Word Count Filter: 10", style="margin-bottom: 2px"),
857       ),
858       table_div_se,
859       Details(
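The recurring minimum word count filter is a simple whitespace-token threshold; a sketch, with the helper name as an assumption:

def passes_min_word_count(text, minimum=10):
    # Thresholds vary by source in this file: 10, 20, or 50 words.
    return len(text.split()) >= minimum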
891       """, block="block", language="python"),
892       H4("Filtering"),
893       Ol(
894 +         Li("Language Filter: English", style="margin-bottom: 2px"),
895 +         Li("Minimum Word Count Filter: 10", style="margin-bottom: 2px"),
896 +         Li("Unigram Log Probability", style="margin-bottom: 2px"),
897       ),
898       table_div_uirc,
899       ),
937       P(B("Download and Extraction: "), "The dataset was downloaded directly from Hugging Face: ", A("https://huggingface.co/datasets/deepmind/pg19", href="https://huggingface.co/datasets/deepmind/pg19"), "."),
938       H4("Filtering"),
939       Ol(
940 +         Li("Language Filter: ???", style="margin-bottom: 2px"),
941 +         Li("Minimum Word Count Filter: 20", style="margin-bottom: 2px"),
942 +         Li("Unigram Log Probability", style="margin-bottom: 2px"),
943       ),
944       table_div_pg19,
945       Details(