victormiller
committed on
Commit
•
ed0e179
1
Parent(s):
6a7bb93
Update curated.py
Browse files- curated.py +36 -318
curated.py
CHANGED
@@ -455,34 +455,6 @@ data_sources = [
|
|
455 |
"Europarl",
|
456 |
]
|
457 |
|
458 |
-
def get_freelaw_data(data_source: str = "Freelaw", doc_id: int = 3, target: str = "foo"):
|
459 |
-
doc_id = max(0, min(int(doc_id), 9))
|
460 |
-
|
461 |
-
if data_source == "Freelaw":
|
462 |
-
raw_sample_doc = json.load(open("data/curated_samples/freelaw_raw.json"))
|
463 |
-
extracted_sample_doc = json.load(
|
464 |
-
open("data/curated_samples/freelaw_extract.json")
|
465 |
-
)
|
466 |
-
else:
|
467 |
-
raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
|
468 |
-
|
469 |
-
raw_json = raw_sample_doc[doc_id]
|
470 |
-
extracted_json = extracted_sample_doc[doc_id]
|
471 |
-
return view_data(
|
472 |
-
raw_json,
|
473 |
-
extracted_json,
|
474 |
-
doc_id=doc_id,
|
475 |
-
data_source="Freelaw",
|
476 |
-
data_sources="Freelaw",
|
477 |
-
target=target,
|
478 |
-
)
|
479 |
-
|
480 |
-
freelaw_examples = Div(
|
481 |
-
Div(
|
482 |
-
get_freelaw_data(target=gen_random_id()),
|
483 |
-
style="border: 1px solid #ccc; padding: 20px;",
|
484 |
-
),
|
485 |
-
)
|
486 |
|
487 |
|
488 |
def get_wiki_data(data_source: str = "Wikipedia", doc_id: int = 3, target: str = "foo"):
|
@@ -513,261 +485,7 @@ wiki_examples = Div(
|
|
513 |
),
|
514 |
)
|
515 |
|
516 |
-
def get_se_data(data_source: str = "StackExchange", doc_id: int = 3, target: str = "foo"):
|
517 |
-
doc_id = max(0, min(int(doc_id), 9))
|
518 |
-
|
519 |
-
if data_source == "StackExchange":
|
520 |
-
raw_sample_doc = json.load(open("data/curated_samples/stackexchange_raw.json"))
|
521 |
-
extracted_sample_doc = json.load(
|
522 |
-
open("data/curated_samples/stackexchange_extract.json")
|
523 |
-
)
|
524 |
-
else:
|
525 |
-
raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
|
526 |
-
|
527 |
-
raw_json = raw_sample_doc[doc_id]
|
528 |
-
extracted_json = extracted_sample_doc[doc_id]
|
529 |
-
return view_data(
|
530 |
-
raw_json,
|
531 |
-
extracted_json,
|
532 |
-
doc_id=doc_id,
|
533 |
-
data_source="StackExchange",
|
534 |
-
data_sources="StackExchange",
|
535 |
-
target=target,
|
536 |
-
)
|
537 |
-
|
538 |
-
se_examples = Div(
|
539 |
-
Div(
|
540 |
-
get_se_data(target=gen_random_id()),
|
541 |
-
style="border: 1px solid #ccc; padding: 20px;",
|
542 |
-
),
|
543 |
-
)
|
544 |
-
|
545 |
-
def get_phil_data(data_source: str = "PhilPapers", doc_id: int = 3, target: str = "foo"):
|
546 |
-
doc_id = max(0, min(int(doc_id), 9))
|
547 |
-
|
548 |
-
if data_source == "PhilPapers":
|
549 |
-
raw_sample_doc = extracted_sample_doc = json.load(
|
550 |
-
open("data/curated_samples/philpapers_raw.json")
|
551 |
-
)
|
552 |
-
else:
|
553 |
-
raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
|
554 |
-
|
555 |
-
raw_json = raw_sample_doc[doc_id]
|
556 |
-
extracted_json = extracted_sample_doc[doc_id]
|
557 |
-
return view_data(
|
558 |
-
raw_json,
|
559 |
-
extracted_json,
|
560 |
-
doc_id=doc_id,
|
561 |
-
data_source="PhilPapers",
|
562 |
-
data_sources="PhilPapers",
|
563 |
-
target=target,
|
564 |
-
)
|
565 |
-
|
566 |
-
phil_examples = Div(
|
567 |
-
Div(
|
568 |
-
get_phil_data(target=gen_random_id()),
|
569 |
-
style="border: 1px solid #ccc; padding: 20px;",
|
570 |
-
),
|
571 |
-
)
|
572 |
-
|
573 |
-
def get_arx_data(data_source: str = "Arxiv", doc_id: int = 3, target: str = "foo"):
|
574 |
-
doc_id = max(0, min(int(doc_id), 9))
|
575 |
-
|
576 |
-
if data_source == "Arxiv":
|
577 |
-
raw_sample_doc = json.load(open("data/curated_samples/arxiv_raw.json"))
|
578 |
-
extracted_sample_doc = json.load(
|
579 |
-
open("data/curated_samples/arxiv_extract.json")
|
580 |
-
)
|
581 |
-
else:
|
582 |
-
raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
|
583 |
-
|
584 |
-
raw_json = raw_sample_doc[doc_id]
|
585 |
-
extracted_json = extracted_sample_doc[doc_id]
|
586 |
-
return view_data(
|
587 |
-
raw_json,
|
588 |
-
extracted_json,
|
589 |
-
doc_id=doc_id,
|
590 |
-
data_source="Arxiv",
|
591 |
-
data_sources="Arxiv",
|
592 |
-
target=target,
|
593 |
-
)
|
594 |
-
|
595 |
-
arx_examples = Div(
|
596 |
-
Div(
|
597 |
-
get_arx_data(target=gen_random_id()),
|
598 |
-
style="border: 1px solid #ccc; padding: 20px;",
|
599 |
-
),
|
600 |
-
)
|
601 |
-
|
602 |
-
def get_S2ORC_data(data_source: str = "S2ORC", doc_id: int = 3, target: str = "foo"):
|
603 |
-
doc_id = max(0, min(int(doc_id), 9))
|
604 |
-
|
605 |
-
if data_source == "S2ORC":
|
606 |
-
raw_sample_doc = extracted_sample_doc = json.load(
|
607 |
-
open("data/curated_samples/s2orc_raw.json")
|
608 |
-
)
|
609 |
-
else:
|
610 |
-
raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
|
611 |
-
|
612 |
-
raw_json = raw_sample_doc[doc_id]
|
613 |
-
extracted_json = extracted_sample_doc[doc_id]
|
614 |
-
return view_data(
|
615 |
-
raw_json,
|
616 |
-
extracted_json,
|
617 |
-
doc_id=doc_id,
|
618 |
-
data_source="S2ORC",
|
619 |
-
data_sources="S2ORC",
|
620 |
-
target=target,
|
621 |
-
)
|
622 |
-
|
623 |
-
s2o_examples = Div(
|
624 |
-
Div(
|
625 |
-
get_S2ORC_data(target=gen_random_id()),
|
626 |
-
style="border: 1px solid #ccc; padding: 20px;",
|
627 |
-
),
|
628 |
-
)
|
629 |
-
|
630 |
-
def get_S2ORCA_data(data_source: str = "S2ORC Abstract", doc_id: int = 3, target: str = "foo"):
|
631 |
-
doc_id = max(0, min(int(doc_id), 9))
|
632 |
-
|
633 |
-
if data_source == "S2ORC":
|
634 |
-
raw_sample_doc = extracted_sample_doc = json.load(
|
635 |
-
open("data/curated_samples/s2orc_abstract_raw.json")
|
636 |
-
)
|
637 |
-
else:
|
638 |
-
raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
|
639 |
-
|
640 |
-
raw_json = raw_sample_doc[doc_id]
|
641 |
-
extracted_json = extracted_sample_doc[doc_id]
|
642 |
-
return view_data(
|
643 |
-
raw_json,
|
644 |
-
extracted_json,
|
645 |
-
doc_id=doc_id,
|
646 |
-
data_source="S2ORC Abstract",
|
647 |
-
data_sources="S2ORC Abstract",
|
648 |
-
target=target,
|
649 |
-
)
|
650 |
-
|
651 |
-
s2oa_examples = Div(
|
652 |
-
Div(
|
653 |
-
get_S2ORCA_data(target=gen_random_id()),
|
654 |
-
style="border: 1px solid #ccc; padding: 20px;",
|
655 |
-
),
|
656 |
-
)
|
657 |
-
|
658 |
-
def get_pubmed_data(data_source: str = "Pubmed", doc_id: int = 3, target: str = "foo"):
|
659 |
-
doc_id = max(0, min(int(doc_id), 9))
|
660 |
-
|
661 |
-
if data_source == "Pubmed":
|
662 |
-
raw_sample_doc = json.load(open("data/curated_samples/pubmed_raw.json"))
|
663 |
-
extracted_sample_doc = json.load(
|
664 |
-
open("data/curated_samples/pubmed_extract.json")
|
665 |
-
)
|
666 |
-
else:
|
667 |
-
raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
|
668 |
-
|
669 |
-
raw_json = raw_sample_doc[doc_id]
|
670 |
-
extracted_json = extracted_sample_doc[doc_id]
|
671 |
-
return view_data(
|
672 |
-
raw_json,
|
673 |
-
extracted_json,
|
674 |
-
doc_id=doc_id,
|
675 |
-
data_source="Pubmed",
|
676 |
-
data_sources="Pubmed",
|
677 |
-
target=target,
|
678 |
-
)
|
679 |
-
|
680 |
-
pubmed_examples = Div(
|
681 |
-
Div(
|
682 |
-
get_pubmed_data(target=gen_random_id()),
|
683 |
-
style="border: 1px solid #ccc; padding: 20px;",
|
684 |
-
),
|
685 |
-
)
|
686 |
-
|
687 |
-
def get_dmm_data(data_source: str = "DM Math", doc_id: int = 3, target: str = "foo"):
|
688 |
-
doc_id = max(0, min(int(doc_id), 9))
|
689 |
-
|
690 |
-
if data_source == "DM Math":
|
691 |
-
raw_sample_doc = json.load(open("data/curated_samples/dm_maths_raw.json"))
|
692 |
-
extracted_sample_doc = json.load(
|
693 |
-
open("data/curated_samples/dm_maths_extract.json")
|
694 |
-
)
|
695 |
-
else:
|
696 |
-
raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
|
697 |
-
|
698 |
-
raw_json = raw_sample_doc[doc_id]
|
699 |
-
extracted_json = extracted_sample_doc[doc_id]
|
700 |
-
return view_data(
|
701 |
-
raw_json,
|
702 |
-
extracted_json,
|
703 |
-
doc_id=doc_id,
|
704 |
-
data_source="DM Math",
|
705 |
-
data_sources="DM Math",
|
706 |
-
target=target,
|
707 |
-
)
|
708 |
-
|
709 |
-
dmm_examples = Div(
|
710 |
-
Div(
|
711 |
-
get_dmm_data(target=gen_random_id()),
|
712 |
-
style="border: 1px solid #ccc; padding: 20px;",
|
713 |
-
),
|
714 |
-
)
|
715 |
-
|
716 |
-
def get_pg19_data(data_source: str = "PG19", doc_id: int = 3, target: str = "foo"):
|
717 |
-
doc_id = max(0, min(int(doc_id), 9))
|
718 |
-
|
719 |
-
if data_source == "PG19":
|
720 |
-
raw_sample_doc = extracted_sample_doc = json.load(
|
721 |
-
open("data/curated_samples/pg19_raw.json")
|
722 |
-
)
|
723 |
-
else:
|
724 |
-
raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
|
725 |
-
|
726 |
-
raw_json = raw_sample_doc[doc_id]
|
727 |
-
extracted_json = extracted_sample_doc[doc_id]
|
728 |
-
return view_data(
|
729 |
-
raw_json,
|
730 |
-
extracted_json,
|
731 |
-
doc_id=doc_id,
|
732 |
-
data_source="PG19",
|
733 |
-
data_sources="PG19",
|
734 |
-
target=target,
|
735 |
-
)
|
736 |
-
|
737 |
-
pg19_examples = Div(
|
738 |
-
Div(
|
739 |
-
get_pg19_data(target=gen_random_id()),
|
740 |
-
style="border: 1px solid #ccc; padding: 20px;",
|
741 |
-
),
|
742 |
-
)
|
743 |
-
|
744 |
-
def get_eu_data(data_source: str = "Europarl", doc_id: int = 3, target: str = "foo"):
|
745 |
-
doc_id = max(0, min(int(doc_id), 9))
|
746 |
|
747 |
-
if data_source == "Europarl":
|
748 |
-
raw_sample_doc = extracted_sample_doc = json.load(
|
749 |
-
open("data/curated_samples/europarl_raw.json")
|
750 |
-
)
|
751 |
-
else:
|
752 |
-
raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
|
753 |
-
|
754 |
-
raw_json = raw_sample_doc[doc_id]
|
755 |
-
extracted_json = extracted_sample_doc[doc_id]
|
756 |
-
return view_data(
|
757 |
-
raw_json,
|
758 |
-
extracted_json,
|
759 |
-
doc_id=doc_id,
|
760 |
-
data_source="Europarl",
|
761 |
-
data_sources="Europarl",
|
762 |
-
target=target,
|
763 |
-
)
|
764 |
-
|
765 |
-
eu_examples = Div(
|
766 |
-
Div(
|
767 |
-
get_eu_data(target=gen_random_id()),
|
768 |
-
style="border: 1px solid #ccc; padding: 20px;",
|
769 |
-
),
|
770 |
-
)
|
771 |
|
772 |
filtering_process = Div(
|
773 |
Section(
|
@@ -803,10 +521,10 @@ filtering_process = Div(
|
|
803 |
Li("Note: The Frequency Filter was calculated but not applied. The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
|
804 |
),
|
805 |
table_div_arx,
|
806 |
-
|
807 |
-
|
808 |
-
|
809 |
-
|
810 |
),
|
811 |
),
|
812 |
Section(
|
@@ -845,10 +563,10 @@ filtering_process = Div(
|
|
845 |
Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup"),
|
846 |
),
|
847 |
table_div_s2o,
|
848 |
-
|
849 |
-
|
850 |
-
|
851 |
-
|
852 |
),
|
853 |
),
|
854 |
Section(
|
@@ -881,10 +599,10 @@ filtering_process = Div(
|
|
881 |
Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup."),
|
882 |
),
|
883 |
table_div_med,
|
884 |
-
|
885 |
-
|
886 |
-
|
887 |
-
|
888 |
),
|
889 |
),
|
890 |
Section(
|
@@ -898,10 +616,10 @@ filtering_process = Div(
|
|
898 |
Li("Many filters were used to clean the phil papers like double whitespaces, new lines etc. All filter details are here: https://github.com/thoppe/The-Pile-PhilPapers/blob/master/pdf_filter.py"),
|
899 |
),
|
900 |
table_div_phil,
|
901 |
-
|
902 |
-
|
903 |
-
|
904 |
-
|
905 |
),
|
906 |
),
|
907 |
Section(
|
@@ -913,10 +631,10 @@ filtering_process = Div(
|
|
913 |
H4("Filtering"),
|
914 |
P("EuroParl was initially filtered during the download process. Documents with fewer than 200 characters were removed. The documents also contained 'TAGS' which were removed."),
|
915 |
table_div_up,
|
916 |
-
|
917 |
-
|
918 |
-
|
919 |
-
|
920 |
),
|
921 |
),
|
922 |
Section(
|
@@ -977,10 +695,10 @@ filtering_process = Div(
|
|
977 |
Li("Local dedup was done within freelaw itself which removed 90%+ duplicates"),
|
978 |
),
|
979 |
table_div_freelaw,
|
980 |
-
|
981 |
-
|
982 |
-
|
983 |
-
|
984 |
|
985 |
),
|
986 |
),
|
@@ -1006,10 +724,10 @@ filtering_process = Div(
|
|
1006 |
Li("Minimum Word Count Filter: 10"),
|
1007 |
),
|
1008 |
table_div_se,
|
1009 |
-
|
1010 |
-
|
1011 |
-
|
1012 |
-
|
1013 |
),
|
1014 |
),
|
1015 |
Section(
|
@@ -1058,10 +776,10 @@ filtering_process = Div(
|
|
1058 |
Li("None"),
|
1059 |
),
|
1060 |
table_div_dmm,
|
1061 |
-
|
1062 |
-
|
1063 |
-
|
1064 |
-
|
1065 |
),
|
1066 |
),
|
1067 |
Section(
|
@@ -1079,10 +797,10 @@ filtering_process = Div(
|
|
1079 |
Li("Unigram Log Probability"),
|
1080 |
),
|
1081 |
table_div_pg19,
|
1082 |
-
Details(
|
1083 |
-
|
1084 |
-
|
1085 |
-
),
|
1086 |
),
|
1087 |
),
|
1088 |
)
|
|
|
455 |
"Europarl",
|
456 |
]
|
457 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
458 |
|
459 |
|
460 |
def get_wiki_data(data_source: str = "Wikipedia", doc_id: int = 3, target: str = "foo"):
|
|
|
485 |
),
|
486 |
)
|
487 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
488 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
489 |
|
490 |
filtering_process = Div(
|
491 |
Section(
|
|
|
521 |
Li("Note: The Frequency Filter was calculated but not applied. The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
|
522 |
),
|
523 |
table_div_arx,
|
524 |
+
# Details(
|
525 |
+
# Summary("ArXiv Filtering Examples"),
|
526 |
+
# arx_examples,
|
527 |
+
# ),
|
528 |
),
|
529 |
),
|
530 |
Section(
|
|
|
563 |
Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup"),
|
564 |
),
|
565 |
table_div_s2o,
|
566 |
+
# Details(
|
567 |
+
# Summary("FreeLaw Filtering Examples -- need to update"),
|
568 |
+
# freelaw_examples,
|
569 |
+
# ),
|
570 |
),
|
571 |
),
|
572 |
Section(
|
|
|
599 |
Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup."),
|
600 |
),
|
601 |
table_div_med,
|
602 |
+
# Details(
|
603 |
+
# Summary("PubMed Filtering Examples"),
|
604 |
+
# pubmed_examples,
|
605 |
+
# ),
|
606 |
),
|
607 |
),
|
608 |
Section(
|
|
|
616 |
Li("Many filters were used to clean the phil papers like double whitespaces, new lines etc. All filter details are here: https://github.com/thoppe/The-Pile-PhilPapers/blob/master/pdf_filter.py"),
|
617 |
),
|
618 |
table_div_phil,
|
619 |
+
# Details(
|
620 |
+
# Summary("Phil Papers Filtering Examples"),
|
621 |
+
# phil_examples,
|
622 |
+
# ),
|
623 |
),
|
624 |
),
|
625 |
Section(
|
|
|
631 |
H4("Filtering"),
|
632 |
P("EuroParl was initially filtered during the download process. Documents with fewer than 200 characters were removed. The documents also contained 'TAGS' which were removed."),
|
633 |
table_div_up,
|
634 |
+
# Details(
|
635 |
+
# Summary("EuroParl Filtering Examples"),
|
636 |
+
# eu_examples,
|
637 |
+
# ),
|
638 |
),
|
639 |
),
|
640 |
Section(
|
|
|
695 |
Li("Local dedup was done within freelaw itself which removed 90%+ duplicates"),
|
696 |
),
|
697 |
table_div_freelaw,
|
698 |
+
# Details(
|
699 |
+
# Summary("FreeLaw Filtering Examples"),
|
700 |
+
# freelaw_examples,
|
701 |
+
# ),
|
702 |
|
703 |
),
|
704 |
),
|
|
|
724 |
Li("Minimum Word Count Filter: 10"),
|
725 |
),
|
726 |
table_div_se,
|
727 |
+
# Details(
|
728 |
+
# Summary("StackExchange Filtering Examples"),
|
729 |
+
# se_examples,
|
730 |
+
# ),
|
731 |
),
|
732 |
),
|
733 |
Section(
|
|
|
776 |
Li("None"),
|
777 |
),
|
778 |
table_div_dmm,
|
779 |
+
# Details(
|
780 |
+
# Summary("DM Math Filtering Examples"),
|
781 |
+
# dmm_examples,
|
782 |
+
# ),
|
783 |
),
|
784 |
),
|
785 |
Section(
|
|
|
797 |
Li("Unigram Log Probability"),
|
798 |
),
|
799 |
table_div_pg19,
|
800 |
+
#Details(
|
801 |
+
# Summary("PG-19 Filtering Examples"),
|
802 |
+
# pg19_examples,
|
803 |
+
#),
|
804 |
),
|
805 |
),
|
806 |
)
|