Spaces:
Running
Running
Update pipeline.py
Browse files
- pipeline.py +36 -18
pipeline.py
CHANGED
@@ -403,6 +403,24 @@ def pipeline_with_gemini(accessions,stop_flag=None, niche_cases=None, save_df=No
|
|
403 |
print(f"π Stop processing {accession}, aborting early...")
|
404 |
return {}
|
405 |
# check doi first
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
406 |
print("acc sscore for file all output and chunk: ", acc_score["file_all_output"], acc_score["file_chunk"])
|
407 |
if len(acc_score["file_all_output"]) == 0 and len(acc_score["file_chunk"]) == 0:
|
408 |
if doi != "unknown":
|
@@ -548,24 +566,24 @@ def pipeline_with_gemini(accessions,stop_flag=None, niche_cases=None, save_df=No
|
|
548 |
if stop_flag is not None and stop_flag.value:
|
549 |
print(f"π Stop processing {accession}, aborting early...")
|
550 |
return {}
|
551 |
-
print("chunk filename: ", chunk_filename)
|
552 |
-
if chunk_exists:
|
553 |
-
|
554 |
-
|
555 |
-
|
556 |
-
|
557 |
-
|
558 |
-
|
559 |
-
|
560 |
-
|
561 |
-
if all_exists:
|
562 |
-
|
563 |
-
|
564 |
-
|
565 |
-
|
566 |
-
|
567 |
-
|
568 |
-
|
569 |
if not chunk and not all_output:
|
570 |
print("not chunk and all output")
|
571 |
# else: check if we can reuse these chunk and all output of existed accession to find another
|
|
|
403 |
print(f"π Stop processing {accession}, aborting early...")
|
404 |
return {}
|
405 |
# check doi first
|
406 |
+
print("chunk filename: ", chunk_filename)
|
407 |
+
if chunk_exists:
|
408 |
+
print("File chunk exists!")
|
409 |
+
if not chunk:
|
410 |
+
print("start to get chunk")
|
411 |
+
text, table, document_title = model.read_docx_text(file_chunk_path)
|
412 |
+
chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table))
|
413 |
+
if str(chunk_filename) != "":
|
414 |
+
print("first time have chunk path at chunk exist: ", str(chunk_filename))
|
415 |
+
acc_score["file_chunk"] = str(chunk_filename)
|
416 |
+
if all_exists:
|
417 |
+
print("File all output exists!")
|
418 |
+
if not all_output:
|
419 |
+
text_all, table_all, document_title_all = model.read_docx_text(file_all_path)
|
420 |
+
all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all))
|
421 |
+
if str(all_filename) != "":
|
422 |
+
print("first time have all path at all exist: ", str(all_filename))
|
423 |
+
acc_score["file_all_output"] = str(all_filename)
|
424 |
print("acc sscore for file all output and chunk: ", acc_score["file_all_output"], acc_score["file_chunk"])
|
425 |
if len(acc_score["file_all_output"]) == 0 and len(acc_score["file_chunk"]) == 0:
|
426 |
if doi != "unknown":
|
|
|
566 |
if stop_flag is not None and stop_flag.value:
|
567 |
print(f"π Stop processing {accession}, aborting early...")
|
568 |
return {}
|
569 |
+
# print("chunk filename: ", chunk_filename)
|
570 |
+
# if chunk_exists:
|
571 |
+
# print("File chunk exists!")
|
572 |
+
# if not chunk:
|
573 |
+
# print("start to get chunk")
|
574 |
+
# text, table, document_title = model.read_docx_text(file_chunk_path)
|
575 |
+
# chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table))
|
576 |
+
# if str(chunk_filename) != "":
|
577 |
+
# print("first time have chunk path at chunk exist: ", str(chunk_filename))
|
578 |
+
# acc_score["file_chunk"] = str(chunk_filename)
|
579 |
+
# if all_exists:
|
580 |
+
# print("File all output exists!")
|
581 |
+
# if not all_output:
|
582 |
+
# text_all, table_all, document_title_all = model.read_docx_text(file_all_path)
|
583 |
+
# all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all))
|
584 |
+
# if str(all_filename) != "":
|
585 |
+
# print("first time have all path at all exist: ", str(all_filename))
|
586 |
+
# acc_score["file_all_output"] = str(all_filename)
|
587 |
if not chunk and not all_output:
|
588 |
print("not chunk and all output")
|
589 |
# else: check if we can reuse these chunk and all output of existed accession to find another
|