Spaces:
Running
Running
victormiller
committed on
Update curated.py
Browse files- curated.py +60 -1
curated.py
CHANGED
@@ -9,7 +9,6 @@ from rich import print
|
|
9 |
import uuid
|
10 |
import plotly.express as px
|
11 |
|
12 |
-
|
13 |
overview = Div(
|
14 |
H2("Curated Source Processing Overview"),
|
15 |
H3("What This Section Contains"),
|
@@ -751,6 +750,65 @@ def update(target: str, request):
|
|
751 |
params.get(f"data_source_{target}"), doc_id, target)
|
752 |
|
753 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
754 |
def curated(request):
|
755 |
|
756 |
# Partial Updates
|
@@ -884,6 +942,7 @@ def curated(request):
|
|
884 |
data_preprocessing_div,
|
885 |
plotly2fasthtml(get_chart_28168342()),
|
886 |
plotly2fasthtml(get_chart_new()),
|
|
|
887 |
H2("Curated Sources Processing"),
|
888 |
filtering_process,
|
889 |
data_preparation_div,
|
|
|
9 |
import uuid
|
10 |
import plotly.express as px
|
11 |
|
|
|
12 |
overview = Div(
|
13 |
H2("Curated Source Processing Overview"),
|
14 |
H3("What This Section Contains"),
|
|
|
750 |
params.get(f"data_source_{target}"), doc_id, target)
|
751 |
|
752 |
|
753 |
+
# Creating the dataframe from the provided table data
|
754 |
+
data = {
|
755 |
+
'Dataset': ['Wikipedia', 'Freelaw', 'DM Maths', 'USPTO', 'PG19', 'Hackernews', 'Ubuntu IRC', 'Europarl',
|
756 |
+
'StackExchange', 'Arxiv', 'S2ORC', 'S2ORC Abstract', 'Pubmed Central', 'Pubmed Abstract', 'Phil Papers'],
|
757 |
+
'Downloaded Lines': [61614907, 75971288, 112559888, 6880276, 28752, 2064931, 37966, 69814, 23246548, 1911867,
|
758 |
+
12963563, 102324176, 5230932, 25787474, 49389],
|
759 |
+
'Language Filter': [0, 2280522, 0, 1312, 69, 54129, 14465, 0, 0, 42426, 0, 18456575, 400446, 3100, 10214],
|
760 |
+
'Min Word Count': [1146416, 5518932, 0, 129042, 1, 314, 33, 0, 196, 105601, 0, 978308, 62176, 36419, 0],
|
761 |
+
'Unigram log probability': [60468491, 68171834, 112559888, 6749922, 28682, 2010488, 23468, 69814, 23246352,
|
762 |
+
1763840, 12963563, 82889293, 4768310, 25747955, 39175],
|
763 |
+
'Total Lines Remaining': [60468491, 68123174, 112559888, 6749389, 28632, 2003636, 23205, 69814, 23246352,
|
764 |
+
1762661, 12963563, 82777912, 4767474, 25746724, 39128]
|
765 |
+
}
|
766 |
+
|
767 |
+
df = pd.DataFrame(data)
|
768 |
+
|
769 |
+
# Create the stacked bar chart
|
770 |
+
fig = go.Figure()
|
771 |
+
|
772 |
+
# Adding traces for each filter stage
|
773 |
+
fig.add_trace(go.Bar(
|
774 |
+
name='Language Filter',
|
775 |
+
x=df['Dataset'],
|
776 |
+
y=df['Language Filter']
|
777 |
+
))
|
778 |
+
|
779 |
+
fig.add_trace(go.Bar(
|
780 |
+
name='Min Word Count Filter',
|
781 |
+
x=df['Dataset'],
|
782 |
+
y=df['Min Word Count']
|
783 |
+
))
|
784 |
+
|
785 |
+
fig.add_trace(go.Bar(
|
786 |
+
name='Unigram log probability Filter',
|
787 |
+
x=df['Dataset'],
|
788 |
+
y=df['Unigram log probability']
|
789 |
+
))
|
790 |
+
|
791 |
+
fig.add_trace(go.Bar(
|
792 |
+
name='Total Lines Remaining',
|
793 |
+
x=df['Dataset'],
|
794 |
+
y=df['Total Lines Remaining']
|
795 |
+
))
|
796 |
+
|
797 |
+
# Update the layout
|
798 |
+
fig.update_layout(
|
799 |
+
barmode='stack',
|
800 |
+
title='Stacked Bar Chart of Line Reductions by Dataset',
|
801 |
+
xaxis_title='Dataset',
|
802 |
+
yaxis_title='Number of Lines',
|
803 |
+
legend_title='Filters',
|
804 |
+
height=600,
|
805 |
+
width=1000
|
806 |
+
)
|
807 |
+
|
808 |
+
# Keep a module-level reference to the figure; curated() embeds it via plotly2fasthtml
|
809 |
+
stacked_bar = fig
|
810 |
+
|
811 |
+
|
812 |
def curated(request):
|
813 |
|
814 |
# Partial Updates
|
|
|
942 |
data_preprocessing_div,
|
943 |
plotly2fasthtml(get_chart_28168342()),
|
944 |
plotly2fasthtml(get_chart_new()),
|
945 |
+
plotly2fasthtml(stacked_bar),
|
946 |
H2("Curated Sources Processing"),
|
947 |
filtering_process,
|
948 |
data_preparation_div,
|