Update pipeline.py
pipeline.py  CHANGED  (+5 -3)
@@ -150,6 +150,7 @@ def pipeline_with_gemini(accessions):
     meta = mtdna_classifier.fetch_ncbi_metadata(acc)
     country, spe_loc, ethnic, sample_type, col_date, iso, title, doi, pudID, features = meta["country"], meta["specific_location"], meta["ethnicity"], meta["sample_type"], meta["collection_date"], meta["isolate"], meta["title"], meta["doi"], meta["pubmed_id"], meta["all_features"]
     acc_score["isolate"] = iso
+    print(meta)
     # set up step: create the folder to save document
     chunk, all_output = "",""
     if pudID:
@@ -173,7 +174,7 @@
     # Define document names
     chunk_filename = f"{saveTitle}_merged_document.docx"
     all_filename = f"{saveTitle}_all_merged_document.docx"
-
+    print(chunk_filename, all_filename)
     # Define local temp paths for reading/writing
     import tempfile
     tmp_dir = tempfile.mkdtemp()
@@ -183,7 +184,7 @@
     # Try to download if already exists on Drive
     chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path)
     all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path)
-
+    print("chunk exist: ", chunk_exists)
     # first way: ncbi method
     if country.lower() != "unknown":
         stand_country = standardize_location.smart_country_lookup(country.lower())
@@ -227,7 +228,7 @@
     links.append(link)
     if jsonSM:
         links += sum((jsonSM[key] for key in jsonSM),[])
-
+    print("this is links: ",links)
     links = unique_preserve_order(links)
     acc_score["source"] = links
     # chunk_path = "/"+saveTitle+"_merged_document.docx"
@@ -256,6 +257,7 @@
     text_all, table_all, document_title_all = model.read_docx_text(file_all_path)
     all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all))
     if not chunk and not all_output:
+        print("not chunk and all output")
         # else: check if we can reuse these chunk and all output of existed accession to find another
         if links:
             for link in links:
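Note: the new print("this is links: ",links) lands just before the call to unique_preserve_order(links), which de-duplicates the collected source links while keeping their original order. That helper is defined elsewhere in the repository and is not part of this change; a minimal sketch of what such a function typically looks like, assuming the links are hashable strings, is:

def unique_preserve_order(items):
    # Keep the first occurrence of each item and drop later duplicates,
    # so the priority order of the collected links is preserved.
    seen = set()
    ordered = []
    for item in items:
        if item not in seen:
            seen.add(item)
            ordered.append(item)
    return ordered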