TxT360

Running

App Files Files Community

victormiller committed on Oct 7, 2024

Commit

2c39f2b

verified ·

1 Parent(s): 81bacff

Update curated.py

Browse files

Files changed (1) hide show

curated.py +60 -0

curated.py CHANGED Viewed

@@ -436,6 +436,35 @@ s2o_filter = pd.DataFrame(
 table_html_s2o = s2o_filter.to_html(index=False, border=0)
 table_div_s2o = Div(NotStr(table_html_s2o))
 med_filter = pd.DataFrame(
     {
         "Dataset": [
@@ -465,6 +494,35 @@ med_filter = pd.DataFrame(
 table_html_med = med_filter.to_html(index=False, border=0)
 table_div_med = Div(NotStr(table_html_med))
 phil_filter = pd.DataFrame(
     {
         "Dataset": [
@@ -855,6 +913,7 @@ filtering_process = Div(
                     style="margin-bottom: -3px",
                 ),
             ),
             #Details(
             #    Summary("S2ORC Abstract Filtering Examples "),
            #     Div(
@@ -914,6 +973,7 @@ filtering_process = Div(
                 ),
             ),
             table_div_med,
             Details(
                 Summary("PubMed Filtering Examples"),
                 Div(

 table_html_s2o = s2o_filter.to_html(index=False, border=0)
 table_div_s2o = Div(NotStr(table_html_s2o))
+s2oa_filter = pd.DataFrame(  # one-row table of filtering statistics for the "S2ORC Abstract" dataset
+    {
+        "Dataset": [
+            "S2ORC Abstract",
+        ],
+        "Lines Downloaded": [
+            "102324176",  # kept as a string rather than a number
+        ],
+        "Percent Removed After Language Filter": [
+            "18.04%",
+        ],
+        "Percent Removed After Min Word Count Filter": [
+            "1.17%",
+        ],
+        "Percent Removed After Unigram Probability Filter": [
+            "0.00%",
+        ],
+        "Percent Removed After Local Dedup": [
+            "0.13%",
+        ],
+        "Total Percentage Remaining": [
+            "80.66%",  # matches 100% minus the removal percentages listed above (18.04 + 1.17 + 0.00 + 0.13)
+        ],
+    }
+)
+table_html_s2oa = s2oa_filter.to_html(index=False, border=0)  # render as an HTML table string, without the row index and with border=0
+table_div_s2oa = Div(NotStr(table_html_s2oa))  # wrap the markup in a Div; NotStr presumably keeps the HTML from being escaped (confirm)
 med_filter = pd.DataFrame(
     {
         "Dataset": [
 table_html_med = med_filter.to_html(index=False, border=0)
 table_div_med = Div(NotStr(table_html_med))
+pma_filter = pd.DataFrame(  # one-row table of filtering statistics for the "PubMed - Abstract" dataset
+    {
+        "Dataset": [
+            "PubMed - Abstract",
+        ],
+        "Lines Downloaded": [
+            "25787474",  # kept as a string rather than a number
+        ],
+        "Percent Removed After Language Filter": [
+            "0.01%",
+        ],
+        "Percent Removed After Min Word Count Filter": [
+            "0.14%",
+        ],
+        "Percent Removed After Unigram Probability Filter": [
+            "0.00%",
+        ],
+        "Percent Removed After Local Dedup": [
+            "0.00%",
+        ],
+        "Total Percentage Remaining": [
+            "98.85%",  # NOTE(review): the removal percentages above sum to 0.15%, which would leave 99.85%; confirm against the source statistics
+        ],
+    }
+)
+table_html_pma = pma_filter.to_html(index=False, border=0)  # render as an HTML table string, without the row index and with border=0
+table_div_pma = Div(NotStr(table_html_pma))  # wrap the markup in a Div; NotStr presumably keeps the HTML from being escaped (confirm)
 phil_filter = pd.DataFrame(
     {
         "Dataset": [
                     style="margin-bottom: -3px",
                 ),
             ),
+            table_div_s2oa,
             #Details(
             #    Summary("S2ORC Abstract Filtering Examples "),
            #     Div(
                 ),
             ),
             table_div_med,
+            table_div_pma,
             Details(
                 Summary("PubMed Filtering Examples"),
                 Div(