omkarenator
committed on
Commit
•
0698fac
1
Parent(s):
9a127b5
fix curated page layout. remove fixed width of the chart
Browse files- curated.py +39 -44
curated.py
CHANGED
@@ -579,54 +579,49 @@ data_preprocessing_div = Div(
|
|
579 |
),
|
580 |
)
|
581 |
|
582 |
-
# Data for the stacked bar chart
|
583 |
-
data = {
|
584 |
-
"Filter": [
|
585 |
-
"Downloaded Lines",
|
586 |
-
"Language Filter",
|
587 |
-
"Min Word Count",
|
588 |
-
"Unigram Log Probability",
|
589 |
-
],
|
590 |
-
"Wikipedia": [61614907, 61614907, 60468491, 60468491],
|
591 |
-
"Freelaw": [75971288, 73690766, 68171834, 68123174],
|
592 |
-
"DM Maths": [112559888, 112559888, 112559888, 112559888],
|
593 |
-
"USPTO": [6880276, 6878964, 6749922, 6749389],
|
594 |
-
"PG19": [28752, 28683, 28682, 28632],
|
595 |
-
"Hackernews": [2064931, 2010802, 2010488, 2003636],
|
596 |
-
"Ubuntu IRC": [37966, 23501, 23468, 23205],
|
597 |
-
"Europarl": [69814, 69814, 69814, 69814],
|
598 |
-
"StackExchange": [23246548, 23246548, 23246352, 23246352],
|
599 |
-
"Arxiv": [1911867, 1869441, 1763840, 1762661],
|
600 |
-
"S2ORC": [12963563, 12963563, 12963563, 12963563],
|
601 |
-
"S2ORC Abstract": [102324176, 83867601, 82889293, 82777912],
|
602 |
-
"Pubmed Central": [5230932, 4830486, 4768310, 4767474],
|
603 |
-
"Pubmed Abstract": [25787474, 25784374, 25747955, 25746724],
|
604 |
-
"Phil Papers": [49389, 39175, 39175, 39128],
|
605 |
-
}
|
606 |
|
607 |
-
|
608 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
609 |
|
610 |
-
|
611 |
-
fig = go.Figure()
|
612 |
|
613 |
-
|
614 |
-
for dataset in df.columns[1:]:
|
615 |
-
fig.add_trace(go.Bar(name=dataset, x=df["Filter"], y=df[dataset]))
|
616 |
|
617 |
-
|
618 |
-
fig.
|
619 |
-
barmode="stack",
|
620 |
-
title="Document Reduction by Filter for Each Dataset",
|
621 |
-
xaxis_title="Filter",
|
622 |
-
yaxis_title="Number of Lines",
|
623 |
-
legend_title="Dataset",
|
624 |
-
height=600,
|
625 |
-
width=1000,
|
626 |
-
)
|
627 |
|
628 |
-
|
629 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
630 |
|
631 |
|
632 |
filtering_process = Div(
|
@@ -635,7 +630,7 @@ filtering_process = Div(
|
|
635 |
P(
|
636 |
"Below is a detail recount of how each dataset was extracted and filtered. If specific challenges were found with a dataset, they are included and discussed to the best of our abilities. The figure below provides a global view of the document filtering results. ~8% of documents were removed during these three steps."
|
637 |
),
|
638 |
-
plotly2fasthtml(diff2_stacked_bar),
|
639 |
H3(
|
640 |
"This section continues below with the specific filtering steps taken for all 14 curated datasets."
|
641 |
),
|
|
|
579 |
),
|
580 |
)
|
581 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
582 |
|
583 |
+
def diff2_stacked_bar():
    """Return a Plotly figure: stacked bars of line counts per filter stage.

    Each curated dataset contributes one bar segment, and the x-axis walks
    the filtering pipeline ("Downloaded Lines" -> "Unigram Log Probability"),
    so the document reduction at every step is visible at a glance. No fixed
    width is set, letting the chart flow with the surrounding page layout.
    """
    # Remaining line counts after each successive filtering step, per dataset.
    data = {
        "Filter": [
            "Downloaded Lines",
            "Language Filter",
            "Min Word Count",
            "Unigram Log Probability",
        ],
        "Wikipedia": [61614907, 61614907, 60468491, 60468491],
        "Freelaw": [75971288, 73690766, 68171834, 68123174],
        "DM Maths": [112559888, 112559888, 112559888, 112559888],
        "USPTO": [6880276, 6878964, 6749922, 6749389],
        "PG19": [28752, 28683, 28682, 28632],
        "Hackernews": [2064931, 2010802, 2010488, 2003636],
        "Ubuntu IRC": [37966, 23501, 23468, 23205],
        "Europarl": [69814, 69814, 69814, 69814],
        "StackExchange": [23246548, 23246548, 23246352, 23246352],
        "Arxiv": [1911867, 1869441, 1763840, 1762661],
        "S2ORC": [12963563, 12963563, 12963563, 12963563],
        "S2ORC Abstract": [102324176, 83867601, 82889293, 82777912],
        "Pubmed Central": [5230932, 4830486, 4768310, 4767474],
        "Pubmed Abstract": [25787474, 25784374, 25747955, 25746724],
        "Phil Papers": [49389, 39175, 39175, 39128],
    }

    counts = pd.DataFrame(data)
    stages = counts["Filter"]

    # One Bar trace per dataset column; the first column is the x-axis labels.
    fig = go.Figure(
        data=[
            go.Bar(name=col, x=stages, y=counts[col])
            for col in counts.columns[1:]
        ]
    )
    fig.update_layout(
        barmode="stack",
        title="Document Reduction by Filter for Each Dataset",
        xaxis_title="Filter",
        yaxis_title="Number of Lines",
        legend_title="Dataset",
        height=600,
    )
    return fig
|
625 |
|
626 |
|
627 |
filtering_process = Div(
|
|
|
630 |
P(
|
631 |
"Below is a detail recount of how each dataset was extracted and filtered. If specific challenges were found with a dataset, they are included and discussed to the best of our abilities. The figure below provides a global view of the document filtering results. ~8% of documents were removed during these three steps."
|
632 |
),
|
633 |
+
plotly2fasthtml(diff2_stacked_bar()),
|
634 |
H3(
|
635 |
"This section continues below with the specific filtering steps taken for all 14 curated datasets."
|
636 |
),
|