victormiller
committed on
Commit
•
2c39f2b
1
Parent(s):
81bacff
Update curated.py
Browse files - curated.py +60 -0
curated.py
CHANGED
@@ -436,6 +436,35 @@ s2o_filter = pd.DataFrame(
|
|
436 |
table_html_s2o = s2o_filter.to_html(index=False, border=0)
|
437 |
table_div_s2o = Div(NotStr(table_html_s2o))
|
438 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
439 |
med_filter = pd.DataFrame(
|
440 |
{
|
441 |
"Dataset": [
|
@@ -465,6 +494,35 @@ med_filter = pd.DataFrame(
|
|
465 |
table_html_med = med_filter.to_html(index=False, border=0)
|
466 |
table_div_med = Div(NotStr(table_html_med))
|
467 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
468 |
phil_filter = pd.DataFrame(
|
469 |
{
|
470 |
"Dataset": [
|
@@ -855,6 +913,7 @@ filtering_process = Div(
|
|
855 |
style="margin-bottom: -3px",
|
856 |
),
|
857 |
),
|
|
|
858 |
#Details(
|
859 |
# Summary("S2ORC Abstract Filtering Examples "),
|
860 |
# Div(
|
@@ -914,6 +973,7 @@ filtering_process = Div(
|
|
914 |
),
|
915 |
),
|
916 |
table_div_med,
|
|
|
917 |
Details(
|
918 |
Summary("PubMed Filtering Examples"),
|
919 |
Div(
|
|
|
436 |
table_html_s2o = s2o_filter.to_html(index=False, border=0)
|
437 |
table_div_s2o = Div(NotStr(table_html_s2o))
|
438 |
|
439 |
+
s2oa_filter = pd.DataFrame(
|
440 |
+
{
|
441 |
+
"Dataset": [
|
442 |
+
"S2ORC Abstract",
|
443 |
+
],
|
444 |
+
"Lines Downloaded": [
|
445 |
+
"102324176",
|
446 |
+
],
|
447 |
+
"Percent Removed After Language Filter": [
|
448 |
+
"18.04%",
|
449 |
+
],
|
450 |
+
"Percent Removed After Min Word Count Filter": [
|
451 |
+
"1.17%",
|
452 |
+
],
|
453 |
+
"Percent Removed After Unigram Probability Filter": [
|
454 |
+
"0.00%",
|
455 |
+
],
|
456 |
+
"Percent Removed After Local Dedup": [
|
457 |
+
"0.13%",
|
458 |
+
],
|
459 |
+
"Total Percentage Remaining": [
|
460 |
+
"80.66%",
|
461 |
+
],
|
462 |
+
}
|
463 |
+
)
|
464 |
+
|
465 |
+
table_html_s2oa = s2oa_filter.to_html(index=False, border=0)
|
466 |
+
table_div_s2oa = Div(NotStr(table_html_s2oa))
|
467 |
+
|
468 |
med_filter = pd.DataFrame(
|
469 |
{
|
470 |
"Dataset": [
|
|
|
494 |
table_html_med = med_filter.to_html(index=False, border=0)
|
495 |
table_div_med = Div(NotStr(table_html_med))
|
496 |
|
497 |
+
pma_filter = pd.DataFrame(
|
498 |
+
{
|
499 |
+
"Dataset": [
|
500 |
+
"PubMed - Abstract",
|
501 |
+
],
|
502 |
+
"Lines Downloaded": [
|
503 |
+
"25787474",
|
504 |
+
],
|
505 |
+
"Percent Removed After Language Filter": [
|
506 |
+
"0.01%",
|
507 |
+
],
|
508 |
+
"Percent Removed After Min Word Count Filter": [
|
509 |
+
"0.14%",
|
510 |
+
],
|
511 |
+
"Percent Removed After Unigram Probability Filter": [
|
512 |
+
"0.00%",
|
513 |
+
],
|
514 |
+
"Percent Removed After Local Dedup": [
|
515 |
+
"0.00%",
|
516 |
+
],
|
517 |
+
"Total Percentage Remaining": [
|
518 |
+
"98.85%",
|
519 |
+
],
|
520 |
+
}
|
521 |
+
)
|
522 |
+
|
523 |
+
table_html_pma = pma_filter.to_html(index=False, border=0)
|
524 |
+
table_div_pma = Div(NotStr(table_html_pma))
|
525 |
+
|
526 |
phil_filter = pd.DataFrame(
|
527 |
{
|
528 |
"Dataset": [
|
|
|
913 |
style="margin-bottom: -3px",
|
914 |
),
|
915 |
),
|
916 |
+
table_div_s2oa,
|
917 |
#Details(
|
918 |
# Summary("S2ORC Abstract Filtering Examples "),
|
919 |
# Div(
|
|
|
973 |
),
|
974 |
),
|
975 |
table_div_med,
|
976 |
+
table_div_pma,
|
977 |
Details(
|
978 |
Summary("PubMed Filtering Examples"),
|
979 |
Div(
|