victormiller committed on
Commit
666337a
·
verified ·
1 Parent(s): ff67812

Update curated.py

Browse files
Files changed (1) hide show
  1. curated.py +2 -36
curated.py CHANGED
@@ -450,9 +450,6 @@ filtering_process = Div(
450
  P("The Wikimedia dataset was downloaded from the official snapshot on Huggingface: ", A("https://huggingface.co/datasets/wikimedia/wikipedia/tree/main", href="https://huggingface.co/datasets/wikimedia/wikipedia/tree/main"), ". The", D_code("huggingface dataset.to_json", language="python"), " function was used to convert the original parquet format to the jsonl format."),
451
  H4("Filtering"),
452
  P("Manual inspection of the dataset demonstrated high quality content. Only one filter was used to remove articles with few words. Based on normal sentence constructs, the article was kept if it contained 10 or more words. Any article with fewer than 10 words was removed."),
453
- H4("Local Deduplication Process"),
454
- Ol(
455
- Li("Whole wikipedia was deduped using minhash generation following Slim pajama code"),
456
  ),
457
  table_div_wikipedia,
458
  ),
@@ -470,10 +467,6 @@ filtering_process = Div(
470
  Li("Unigram Log Probability Filter: Documents were kept if their average unigram log probability was higher than -20. To calculate the average log word probability, we use word frequencies extracted from the", A("1T Web-gram corpus", href= "https://catalog.ldc.upenn.edu/LDC2006T13"),". Specifically, we use the list created by", A("Rachel Tatman", href="https://www.kaggle.com/datasets/rtatman/english-word-frequency"),"."),
471
  Li("Note: The Frequency Filter was calculated but not applied. The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
472
  ),
473
- H4("Local Deduplication Process"),
474
- Ol(
475
- Li("Local dedup was done with all papers combined."),
476
- ),
477
  table_div_arx,
478
  ),
479
  ),
@@ -554,10 +547,6 @@ filtering_process = Div(
554
  Ol(
555
  Li("Many filters were used to clean the phil papers like double whitespaces, new lines etc. All filter details are here: https://github.com/thoppe/The-Pile-PhilPapers/blob/master/pdf_filter.py"),
556
  ),
557
- H4("Local Deduplication Process"),
558
- Ol(
559
- Li("Local dedup was done with all papers combined."),
560
- ),
561
  table_div_phil,
562
  ),
563
  ),
@@ -568,10 +557,6 @@ filtering_process = Div(
568
  P("Original dataset was downloaded from", A("http://www.statmt.org/europarl/v7/europarl.tgz", href="http://www.statmt.org/europarl/v7/europarl.tgz"),". The files were converted to jsonl lines for filtering."),
569
  H4("Filtering"),
570
  P("EuroParl was initially filtered during the download process. Documents with fewer than 200 characters were removed. The documents also contained 'TAGS' which were removed."),
571
- H4("Local Deduplication Process"),
572
- Ol(
573
- Li("Local dedup was done within europarl itself"),
574
- ),
575
  table_div_up,
576
  ),
577
  ),
@@ -587,10 +572,6 @@ filtering_process = Div(
587
  Li("Minimum Word Count Filter: 10"),
588
  Li("Unigram Log Probability"),
589
  ),
590
- H4("Local Deduplication Process"),
591
- Ol(
592
- Li("Local dedup was done within hackernews itself"),
593
- ),
594
  table_div_hn,
595
  ),
596
  ),
@@ -605,10 +586,6 @@ filtering_process = Div(
605
  Li("Minimum Word Count Filter: 50"),
606
  Li("Unigram Log Probability"),
607
  ),
608
- H4("Local Deduplication Process"),
609
- Ol(
610
- Li("Local dedup was done within USPTO itself"),
611
- ),
612
  table_div_uspto,
613
  ),
614
  ),
@@ -660,10 +637,6 @@ filtering_process = Div(
660
  Ol(
661
  Li("Minimum Word Count Filter: 10"),
662
  ),
663
- H4("Local Deduplication Process"),
664
- Ol(
665
- Li("Local dedup was done within stackexchange itself"),
666
- ),
667
  table_div_se,
668
  ),
669
  ),
@@ -679,7 +652,8 @@ filtering_process = Div(
679
 
680
  def exclude_select_system(x):
681
  return '\n'.join(line for line in x.split('\n') if not (line.startswith('===')
682
- and any(term in line for term in ['has joined #', 'has left #', 'Topic for #', "Topic (#", "is now known as"]) ))
 
683
 
684
  def clean(x):
685
  return '\n'.join('* ' + line[4:] if line.startswith('===') else line[8:] for line in x.split('\n'))
@@ -690,10 +664,6 @@ filtering_process = Div(
690
  Li("Minimum Word Count Filter: 10"),
691
  Li("Unigram Log Probability"),
692
  ),
693
- H4("Local Deduplication Process"),
694
- Ol(
695
- Li("Local dedup was done within Ubuntu IRC itself"),
696
- ),
697
  table_div_uirc,
698
  ),
699
  ),
@@ -729,10 +699,6 @@ filtering_process = Div(
729
  Li("Minimum Word Count Filter: 20"),
730
  Li("Unigram Log Probability"),
731
  ),
732
- H4("Local Deduplication Process"),
733
- Ol(
734
- Li("Local dedup was done within PG19 itself"),
735
- ),
736
  table_div_pg19,
737
  ),
738
  ),
 
450
  P("The Wikimedia dataset was downloaded from the official snapshot on Huggingface: ", A("https://huggingface.co/datasets/wikimedia/wikipedia/tree/main", href="https://huggingface.co/datasets/wikimedia/wikipedia/tree/main"), ". The", D_code("huggingface dataset.to_json", language="python"), " function was used to convert the original parquet format to the jsonl format."),
451
  H4("Filtering"),
452
  P("Manual inspection of the dataset demonstrated high quality content. Only one filter was used to remove articles with few words. Based on normal sentence constructs, the article was kept if it contained 10 or more words. Any article with fewer than 10 words was removed."),
 
 
 
453
  ),
454
  table_div_wikipedia,
455
  ),
 
467
  Li("Unigram Log Probability Filter: Documents were kept if their average unigram log probability was higher than -20. To calculate the average log word probability, we use word frequencies extracted from the", A("1T Web-gram corpus", href= "https://catalog.ldc.upenn.edu/LDC2006T13"),". Specifically, we use the list created by", A("Rachel Tatman", href="https://www.kaggle.com/datasets/rtatman/english-word-frequency"),"."),
468
  Li("Note: The Frequency Filter was calculated but not applied. The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
469
  ),
 
 
 
 
470
  table_div_arx,
471
  ),
472
  ),
 
547
  Ol(
548
  Li("Many filters were used to clean the phil papers like double whitespaces, new lines etc. All filter details are here: https://github.com/thoppe/The-Pile-PhilPapers/blob/master/pdf_filter.py"),
549
  ),
 
 
 
 
550
  table_div_phil,
551
  ),
552
  ),
 
557
  P("Original dataset was downloaded from", A("http://www.statmt.org/europarl/v7/europarl.tgz", href="http://www.statmt.org/europarl/v7/europarl.tgz"),". The files were converted to jsonl lines for filtering."),
558
  H4("Filtering"),
559
  P("EuroParl was initially filtered during the download process. Documents with fewer than 200 characters were removed. The documents also contained 'TAGS' which were removed."),
 
 
 
 
560
  table_div_up,
561
  ),
562
  ),
 
572
  Li("Minimum Word Count Filter: 10"),
573
  Li("Unigram Log Probability"),
574
  ),
 
 
 
 
575
  table_div_hn,
576
  ),
577
  ),
 
586
  Li("Minimum Word Count Filter: 50"),
587
  Li("Unigram Log Probability"),
588
  ),
 
 
 
 
589
  table_div_uspto,
590
  ),
591
  ),
 
637
  Ol(
638
  Li("Minimum Word Count Filter: 10"),
639
  ),
 
 
 
 
640
  table_div_se,
641
  ),
642
  ),
 
652
 
653
  def exclude_select_system(x):
654
  return '\n'.join(line for line in x.split('\n') if not (line.startswith('===')
655
+ and any(term in line for term in
656
+ ['has joined #', 'has left #', 'Topic for #', "Topic (#", "is now known as"]) ))
657
 
658
  def clean(x):
659
  return '\n'.join('* ' + line[4:] if line.startswith('===') else line[8:] for line in x.split('\n'))
 
664
  Li("Minimum Word Count Filter: 10"),
665
  Li("Unigram Log Probability"),
666
  ),
 
 
 
 
667
  table_div_uirc,
668
  ),
669
  ),
 
699
  Li("Minimum Word Count Filter: 20"),
700
  Li("Unigram Log Probability"),
701
  ),
 
 
 
 
702
  table_div_pg19,
703
  ),
704
  ),