VyLala committed on
Commit
81e271a
·
verified ·
1 Parent(s): 7a708c4

Update pipeline.py

Browse files
Files changed (1) hide show
  1. pipeline.py +7 -0
pipeline.py CHANGED
@@ -548,17 +548,24 @@ def pipeline_with_gemini(accessions,stop_flag=None, niche_cases=None, save_df=No
548
  if stop_flag is not None and stop_flag.value:
549
  print(f"🛑 Stop processing {accession}, aborting early...")
550
  return {}
 
551
  if chunk_exists:
552
  print("File chunk exists!")
553
  if not chunk:
554
  print("start to get chunk")
555
  text, table, document_title = model.read_docx_text(file_chunk_path)
556
  chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table))
 
 
 
557
  if all_exists:
558
  print("File all output exists!")
559
  if not all_output:
560
  text_all, table_all, document_title_all = model.read_docx_text(file_all_path)
561
  all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all))
 
 
 
562
  if not chunk and not all_output:
563
  print("not chunk and all output")
564
  # else: check if we can reuse these chunk and all output of existed accession to find another
 
548
  if stop_flag is not None and stop_flag.value:
549
  print(f"🛑 Stop processing {accession}, aborting early...")
550
  return {}
551
+ print("chunk filename: ", chunk_filename)
552
  if chunk_exists:
553
  print("File chunk exists!")
554
  if not chunk:
555
  print("start to get chunk")
556
  text, table, document_title = model.read_docx_text(file_chunk_path)
557
  chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table))
558
+ if str(chunk_filename) != "":
559
+ print("first time have chunk path at chunk exist: ", str(chunk_filename))
560
+ acc_score["file_chunk"] = str(chunk_filename)
561
  if all_exists:
562
  print("File all output exists!")
563
  if not all_output:
564
  text_all, table_all, document_title_all = model.read_docx_text(file_all_path)
565
  all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all))
566
+ if str(all_filename) != "":
567
+ print("first time have all path at all exist: ", str(all_filename))
568
+ acc_score["file_all_output"] = str(all_filename)
569
  if not chunk and not all_output:
570
  print("not chunk and all output")
571
  # else: check if we can reuse these chunk and all output of existed accession to find another