VyLala committed on
Commit
816b392
·
verified ·
1 Parent(s): 1ba0b1f

Update pipeline.py

Browse files
Files changed (1) hide show
  1. pipeline.py +36 -18
pipeline.py CHANGED
@@ -403,6 +403,24 @@ def pipeline_with_gemini(accessions,stop_flag=None, niche_cases=None, save_df=No
403
  print(f"🛑 Stop processing {accession}, aborting early...")
404
  return {}
405
  # check doi first
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
406
  print("acc sscore for file all output and chunk: ", acc_score["file_all_output"], acc_score["file_chunk"])
407
  if len(acc_score["file_all_output"]) == 0 and len(acc_score["file_chunk"]) == 0:
408
  if doi != "unknown":
@@ -548,24 +566,24 @@ def pipeline_with_gemini(accessions,stop_flag=None, niche_cases=None, save_df=No
548
  if stop_flag is not None and stop_flag.value:
549
  print(f"🛑 Stop processing {accession}, aborting early...")
550
  return {}
551
- print("chunk filename: ", chunk_filename)
552
- if chunk_exists:
553
- print("File chunk exists!")
554
- if not chunk:
555
- print("start to get chunk")
556
- text, table, document_title = model.read_docx_text(file_chunk_path)
557
- chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table))
558
- if str(chunk_filename) != "":
559
- print("first time have chunk path at chunk exist: ", str(chunk_filename))
560
- acc_score["file_chunk"] = str(chunk_filename)
561
- if all_exists:
562
- print("File all output exists!")
563
- if not all_output:
564
- text_all, table_all, document_title_all = model.read_docx_text(file_all_path)
565
- all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all))
566
- if str(all_filename) != "":
567
- print("first time have all path at all exist: ", str(all_filename))
568
- acc_score["file_all_output"] = str(all_filename)
569
  if not chunk and not all_output:
570
  print("not chunk and all output")
571
  # else: check if we can reuse these chunk and all output of existed accession to find another
 
403
  print(f"🛑 Stop processing {accession}, aborting early...")
404
  return {}
405
  # check doi first
406
+ print("chunk filename: ", chunk_filename)
407
+ if chunk_exists:
408
+ print("File chunk exists!")
409
+ if not chunk:
410
+ print("start to get chunk")
411
+ text, table, document_title = model.read_docx_text(file_chunk_path)
412
+ chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table))
413
+ if str(chunk_filename) != "":
414
+ print("first time have chunk path at chunk exist: ", str(chunk_filename))
415
+ acc_score["file_chunk"] = str(chunk_filename)
416
+ if all_exists:
417
+ print("File all output exists!")
418
+ if not all_output:
419
+ text_all, table_all, document_title_all = model.read_docx_text(file_all_path)
420
+ all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all))
421
+ if str(all_filename) != "":
422
+ print("first time have all path at all exist: ", str(all_filename))
423
+ acc_score["file_all_output"] = str(all_filename)
424
  print("acc sscore for file all output and chunk: ", acc_score["file_all_output"], acc_score["file_chunk"])
425
  if len(acc_score["file_all_output"]) == 0 and len(acc_score["file_chunk"]) == 0:
426
  if doi != "unknown":
 
566
  if stop_flag is not None and stop_flag.value:
567
  print(f"🛑 Stop processing {accession}, aborting early...")
568
  return {}
569
+ # print("chunk filename: ", chunk_filename)
570
+ # if chunk_exists:
571
+ # print("File chunk exists!")
572
+ # if not chunk:
573
+ # print("start to get chunk")
574
+ # text, table, document_title = model.read_docx_text(file_chunk_path)
575
+ # chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table))
576
+ # if str(chunk_filename) != "":
577
+ # print("first time have chunk path at chunk exist: ", str(chunk_filename))
578
+ # acc_score["file_chunk"] = str(chunk_filename)
579
+ # if all_exists:
580
+ # print("File all output exists!")
581
+ # if not all_output:
582
+ # text_all, table_all, document_title_all = model.read_docx_text(file_all_path)
583
+ # all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all))
584
+ # if str(all_filename) != "":
585
+ # print("first time have all path at all exist: ", str(all_filename))
586
+ # acc_score["file_all_output"] = str(all_filename)
587
  if not chunk and not all_output:
588
  print("not chunk and all output")
589
  # else: check if we can reuse these chunk and all output of existed accession to find another