VyLala committed (verified)
Commit ac84bdc · 1 Parent(s): d47e63d

Update pipeline.py

Files changed (1)
  1. pipeline.py +5 -3
pipeline.py CHANGED
@@ -150,6 +150,7 @@ def pipeline_with_gemini(accessions):
     meta = mtdna_classifier.fetch_ncbi_metadata(acc)
     country, spe_loc, ethnic, sample_type, col_date, iso, title, doi, pudID, features = meta["country"], meta["specific_location"], meta["ethnicity"], meta["sample_type"], meta["collection_date"], meta["isolate"], meta["title"], meta["doi"], meta["pubmed_id"], meta["all_features"]
     acc_score["isolate"] = iso
+    print(meta)
     # set up step: create the folder to save document
     chunk, all_output = "",""
     if pudID:
@@ -173,7 +174,7 @@ def pipeline_with_gemini(accessions):
     # Define document names
     chunk_filename = f"{saveTitle}_merged_document.docx"
     all_filename = f"{saveTitle}_all_merged_document.docx"
-
+    print(chunk_filename, all_filename)
     # Define local temp paths for reading/writing
     import tempfile
     tmp_dir = tempfile.mkdtemp()
@@ -183,7 +184,7 @@ def pipeline_with_gemini(accessions):
     # Try to download if already exists on Drive
     chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path)
     all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path)
-
+    print("chunk exist: ", chunk_exists)
     # first way: ncbi method
     if country.lower() != "unknown":
         stand_country = standardize_location.smart_country_lookup(country.lower())
@@ -227,7 +228,7 @@ def pipeline_with_gemini(accessions):
         links.append(link)
     if jsonSM:
         links += sum((jsonSM[key] for key in jsonSM),[])
-    #print(links)
+    print("this is links: ",links)
     links = unique_preserve_order(links)
     acc_score["source"] = links
     # chunk_path = "/"+saveTitle+"_merged_document.docx"
@@ -256,6 +257,7 @@ def pipeline_with_gemini(accessions):
     text_all, table_all, document_title_all = model.read_docx_text(file_all_path)
     all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all))
     if not chunk and not all_output:
+        print("not chunk and all output")
         # else: check if we can reuse these chunk and all output of existed accession to find another
         if links:
             for link in links:
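
The new print("this is links: ",links) lands just before the call to unique_preserve_order, a project helper whose body is not shown in this diff. Judging by its name and its use here, it deduplicates the link list while keeping first-seen order; a minimal sketch of such a helper (an assumption, not the repository's actual code):

    def unique_preserve_order(items):
        # Deduplicate while preserving first-occurrence order
        # (sketch only; the real helper in this repo may differ).
        seen = set()
        ordered = []
        for item in items:
            if item not in seen:
                seen.add(item)
                ordered.append(item)
        return ordered

    print(unique_preserve_order(["a", "b", "a", "c", "b"]))  # ['a', 'b', 'c']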
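All five lines this commit adds are bare print calls used as debug tracing. A hedged alternative, should the tracing outlive debugging, is the standard logging module, which makes the same output switchable by level; the logger name and setup below are assumptions, not taken from pipeline.py:

    import logging

    # Assumed setup; pipeline.py may configure logging differently, if at all.
    logging.basicConfig(level=logging.DEBUG)
    logger = logging.getLogger("pipeline")

    meta = {"country": "unknown", "isolate": "X1"}  # stand-in for fetch_ncbi_metadata output
    logger.debug("fetched metadata: %s", meta)      # plays the role of print(meta)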