Spaces:
Running
Running
Update pipeline.py
Browse files- pipeline.py +7 -0
pipeline.py
CHANGED
@@ -548,17 +548,24 @@ def pipeline_with_gemini(accessions,stop_flag=None, niche_cases=None, save_df=No
|
|
548 |
if stop_flag is not None and stop_flag.value:
|
549 |
print(f"🛑 Stop processing {accession}, aborting early...")
|
550 |
return {}
|
|
|
551 |
if chunk_exists:
|
552 |
print("File chunk exists!")
|
553 |
if not chunk:
|
554 |
print("start to get chunk")
|
555 |
text, table, document_title = model.read_docx_text(file_chunk_path)
|
556 |
chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table))
|
|
|
|
|
|
|
557 |
if all_exists:
|
558 |
print("File all output exists!")
|
559 |
if not all_output:
|
560 |
text_all, table_all, document_title_all = model.read_docx_text(file_all_path)
|
561 |
all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all))
|
|
|
|
|
|
|
562 |
if not chunk and not all_output:
|
563 |
print("not chunk and all output")
|
564 |
# else: check if we can reuse these chunk and all output of existed accession to find another
|
|
|
548 |
if stop_flag is not None and stop_flag.value:
|
549 |
print(f"🛑 Stop processing {accession}, aborting early...")
|
550 |
return {}
|
551 |
+
print("chunk filename: ", chunk_filename)
|
552 |
if chunk_exists:
|
553 |
print("File chunk exists!")
|
554 |
if not chunk:
|
555 |
print("start to get chunk")
|
556 |
text, table, document_title = model.read_docx_text(file_chunk_path)
|
557 |
chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table))
|
558 |
+
if str(chunk_filename) != "":
|
559 |
+
print("first time have chunk path at chunk exist: ", str(chunk_filename))
|
560 |
+
acc_score["file_chunk"] = str(chunk_filename)
|
561 |
if all_exists:
|
562 |
print("File all output exists!")
|
563 |
if not all_output:
|
564 |
text_all, table_all, document_title_all = model.read_docx_text(file_all_path)
|
565 |
all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all))
|
566 |
+
if str(all_filename) != "":
|
567 |
+
print("first time have all path at all exist: ", str(all_filename))
|
568 |
+
acc_score["file_all_output"] = str(all_filename)
|
569 |
if not chunk and not all_output:
|
570 |
print("not chunk and all output")
|
571 |
# else: check if we can reuse these chunk and all output of existed accession to find another
|