Update pipeline.py

pipeline.py  +125 -25  CHANGED
@@ -2,7 +2,8 @@
 # test2: "A1YU101" thailand cross-ref
 # test3: "EBK109" thailand cross-ref
 # test4: "OQ731952"/"BST115" for search query title: "South Asian maternal and paternal lineages in southern Thailand and"
-
+import data_preprocess
+import model
 import mtdna_classifier
 import app
 import pandas as pd
@@ -17,6 +18,57 @@ import standardize_location
 # Track time
 import time
 import multiprocessing
+import gspread
+from googleapiclient.discovery import build
+from googleapiclient.http import MediaFileUpload, MediaIoBaseDownload
+from oauth2client.service_account import ServiceAccountCredentials
+import io
+#––– Authentication setup –––
+GDRIVE_PARENT_FOLDER_NAME = "mtDNA-Location-Classifier"
+GDRIVE_DATA_FOLDER_NAME = "data"
+GCP_CREDS_DICT = json.loads(os.environ["GCP_CREDS_JSON"])  # from HF secrets
+GDRIVE_CREDS = Credentials.from_service_account_info(GCP_CREDS_DICT, scopes=["https://www.googleapis.com/auth/drive"])
+drive_service = build("drive", "v3", credentials=GDRIVE_CREDS)
+
+def get_or_create_drive_folder(name, parent_id=None):
+    query = f"name='{name}' and mimeType='application/vnd.google-apps.folder'"
+    if parent_id:
+        query += f" and '{parent_id}' in parents"
+    results = drive_service.files().list(q=query, spaces='drive', fields="files(id, name)").execute()
+    items = results.get("files", [])
+    if items:
+        return items[0]["id"]
+    file_metadata = {
+        "name": name,
+        "mimeType": "application/vnd.google-apps.folder"
+    }
+    if parent_id:
+        file_metadata["parents"] = [parent_id]
+    file = drive_service.files().create(body=file_metadata, fields="id").execute()
+    return file["id"]
+
+def upload_file_to_drive(local_path, remote_name, folder_id):
+    file_metadata = {"name": remote_name, "parents": [folder_id]}
+    media = MediaFileUpload(local_path, resumable=True)
+    existing = drive_service.files().list(q=f"name='{remote_name}' and '{folder_id}' in parents", fields="files(id)").execute().get("files", [])
+    if existing:
+        drive_service.files().delete(fileId=existing[0]["id"]).execute()
+    file = drive_service.files().create(body=file_metadata, media_body=media, fields="id").execute()
+    return file["id"]
+
+def download_file_from_drive(remote_name, folder_id, local_path):
+    results = drive_service.files().list(q=f"name='{remote_name}' and '{folder_id}' in parents", fields="files(id)").execute()
+    files = results.get("files", [])
+    if not files:
+        return False
+    file_id = files[0]["id"]
+    request = drive_service.files().get_media(fileId=file_id)
+    fh = io.FileIO(local_path, 'wb')
+    downloader = MediaIoBaseDownload(fh, request)
+    done = False
+    while not done:
+        _, done = downloader.next_chunk()
+    return True
 
 def run_with_timeout(func, args=(), kwargs={}, timeout=20):
     """
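The helpers added in this hunk wrap the Drive v3 files() API: get_or_create_drive_folder resolves (or creates) a folder by name under an optional parent, upload_file_to_drive deletes any same-named file in the target folder before uploading, and download_file_from_drive returns False when the requested file is absent. One caveat: the hunk calls Credentials.from_service_account_info but only imports ServiceAccountCredentials from oauth2client, so Credentials presumably comes from an import elsewhere in the file (google.oauth2.service_account provides it). A minimal, hypothetical round trip with these helpers (folder names and paths below are illustrative, not from the diff):

# Hypothetical usage sketch; assumes the helpers defined in this hunk are in scope.
root_id = get_or_create_drive_folder("mtDNA-Location-Classifier")
data_id = get_or_create_drive_folder("data", parent_id=root_id)
sample_id = get_or_create_drive_folder("12345", parent_id=data_id)        # e.g. a PubMed id

upload_file_to_drive("/tmp/example.docx", "example.docx", sample_id)      # replaces any existing copy
found = download_file_from_drive("example.docx", sample_id, "/tmp/example_copy.docx")
print("cached copy found:", found)                                        # False means nothing cached yet

Because lookups match on name (and optional parent) only, duplicate folder names resolve to whichever entry the API lists first.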
@@ -98,19 +150,37 @@ def pipeline_with_gemini(accessions):
     # set up step: create the folder to save document
     chunk, all_output = "",""
     if pudID:
-        id = pudID
+        id = str(pudID)
         saveTitle = title
     else:
         saveTitle = title + "_" + col_date
         id = "DirectSubmission"
-    folder_path = Path("/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id))
-    if not folder_path.exists():
-        cmd = f'mkdir /content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/{id}'
-        result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
-        print("data/"+str(id) +" created.")
-    else:
-        print("data/"+str(id) +" already exists.")
-    saveLinkFolder = "/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id)
+    # folder_path = Path("/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id))
+    # if not folder_path.exists():
+    #     cmd = f'mkdir /content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/{id}'
+    #     result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+    #     print("data/"+str(id) +" created.")
+    # else:
+    #     print("data/"+str(id) +" already exists.")
+    # saveLinkFolder = "/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id)
+    parent_folder_id = get_or_create_drive_folder(GDRIVE_PARENT_FOLDER_NAME)
+    data_folder_id = get_or_create_drive_folder(GDRIVE_DATA_FOLDER_NAME, parent_id=parent_folder_id)
+    sample_folder_id = get_or_create_drive_folder(str(id), parent_id=data_folder_id)
+
+    # Define document names
+    chunk_filename = f"{saveTitle}_merged_document.docx"
+    all_filename = f"{saveTitle}_all_merged_document.docx"
+
+    # Define local temp paths for reading/writing
+    import tempfile
+    tmp_dir = tempfile.mkdtemp()
+    file_chunk_path = os.path.join(tmp_dir, chunk_filename)
+    file_all_path = os.path.join(tmp_dir, all_filename)
+
+    # Try to download if already exists on Drive
+    chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path)
+    all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path)
+
     # first way: ncbi method
     if country.lower() != "unknown":
         stand_country = standardize_location.smart_country_lookup(country.lower())
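This hunk swaps the mounted-Drive path under /content/drive/MyDrive/... for a chain of Drive API folders plus a local temp directory that holds the working copies. Roughly, the old layout maps onto the new lookups as follows (a sketch using only names from the diff):

# Old mounted path                                     New Drive API lookup
# .../mtDNA-Location-Classifier/                   ->  get_or_create_drive_folder(GDRIVE_PARENT_FOLDER_NAME)
#     data/                                        ->  get_or_create_drive_folder(GDRIVE_DATA_FOLDER_NAME, parent_id=parent_folder_id)
#         <id>/                                    ->  get_or_create_drive_folder(str(id), parent_id=data_folder_id)
#             <saveTitle>_merged_document.docx     ->  download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path)
#             <saveTitle>_all_merged_document.docx ->  download_file_from_drive(all_filename, sample_folder_id, file_all_path)

chunk_exists and all_exists take over from the old os.path.exists checks: existence is now decided by whether the download from Drive succeeded, and the local copies live under tempfile.mkdtemp() rather than on the mounted Drive.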
@@ -154,24 +224,34 @@ def pipeline_with_gemini(accessions):
             links.append(link)
         if jsonSM:
             links += sum((jsonSM[key] for key in jsonSM),[])
-        print(links)
+        #print(links)
         links = unique_preserve_order(links)
        acc_score["source"] = links
-    chunk_path = "/"+saveTitle+"_merged_document.docx"
-    all_path = "/"+saveTitle+"_all_merged_document.docx"
-    # if chunk and all output not exist yet
-    file_chunk_path = saveLinkFolder + chunk_path
-    file_all_path = saveLinkFolder + all_path
-    if os.path.exists(file_chunk_path):
+    # chunk_path = "/"+saveTitle+"_merged_document.docx"
+    # all_path = "/"+saveTitle+"_all_merged_document.docx"
+    # # if chunk and all output not exist yet
+    # file_chunk_path = saveLinkFolder + chunk_path
+    # file_all_path = saveLinkFolder + all_path
+    # if os.path.exists(file_chunk_path):
+    #     print("File chunk exists!")
+    #     if not chunk:
+    #         text, table, document_title = model.read_docx_text(file_chunk_path)
+    #         chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table))
+    # if os.path.exists(file_all_path):
+    #     print("File all output exists!")
+    #     if not all_output:
+    #         text_all, table_all, document_title_all = model.read_docx_text(file_all_path)
+    #         all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all))
+    if chunk_exists:
         print("File chunk exists!")
         if not chunk:
-            text, table, document_title = model.read_docx_text(file_chunk_path)
-            chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table))
-    if os.path.exists(file_all_path):
+            text, table, document_title = model.read_docx_text(file_chunk_path)
+            chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table))
+    if all_exists:
         print("File all output exists!")
         if not all_output:
-            text_all, table_all, document_title_all = model.read_docx_text(file_all_path)
-            all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all))
+            text_all, table_all, document_title_all = model.read_docx_text(file_all_path)
+            all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all))
     if not chunk and not all_output:
         # else: check if we can reuse these chunk and all output of existed accession to find another
         if links:
@@ -233,8 +313,16 @@
             all_output = all_output[:1*1024*1024]
         print("chunk len: ", len(chunk))
         print("all output len: ", len(all_output))
+        # data_preprocess.save_text_to_docx(chunk, file_chunk_path)
+        # data_preprocess.save_text_to_docx(all_output, file_all_path)
+        # Later when saving new files
         data_preprocess.save_text_to_docx(chunk, file_chunk_path)
         data_preprocess.save_text_to_docx(all_output, file_all_path)
+
+        # Upload to Drive
+        upload_file_to_drive(file_chunk_path, chunk_filename, sample_folder_id)
+        upload_file_to_drive(file_all_path, all_filename, sample_folder_id)
+
     # else:
     #     final_input = ""
     #     if all_output:
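Taken together with the previous hunks, the merged-document cache now round-trips through Drive instead of a mounted path. A condensed sketch of the flow for the chunk document, using only names that appear in the diff (the all_output side mirrors it):

# Condensed sketch of the caching flow set up by this commit (chunk side only); assumes the diff's helpers and variables are in scope.
chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path)
if chunk_exists and not chunk:
    text, table, document_title = model.read_docx_text(file_chunk_path)
    chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table))
if not chunk and not all_output:
    pass  # rebuild chunk/all_output from the collected source links (unchanged logic, elided here)
data_preprocess.save_text_to_docx(chunk, file_chunk_path)                  # write to the temp dir
upload_file_to_drive(file_chunk_path, chunk_filename, sample_folder_id)    # then push the cache to Drive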
@@ -253,9 +341,21 @@
         # chunk = data_preprocess.merge_texts_skipping_overlap(chunk, chunkBFS)

     # Define paths for cached RAG assets
-    faiss_index_path = saveLinkFolder+"/faiss_index.bin"
-    document_chunks_path = saveLinkFolder+"/document_chunks.json"
-    structured_lookup_path = saveLinkFolder+"/structured_lookup.json"
+    # faiss_index_path = saveLinkFolder+"/faiss_index.bin"
+    # document_chunks_path = saveLinkFolder+"/document_chunks.json"
+    # structured_lookup_path = saveLinkFolder+"/structured_lookup.json"
+    faiss_filename = "faiss_index.bin"
+    chunks_filename = "document_chunks.json"
+    lookup_filename = "structured_lookup.json"
+
+    # Save in temporary local directory
+    faiss_index_path = os.path.join(tmp_dir, faiss_filename)
+    document_chunks_path = os.path.join(tmp_dir, chunks_filename)
+    structured_lookup_path = os.path.join(tmp_dir, lookup_filename)
+
+    download_file_from_drive(faiss_filename, sample_folder_id, faiss_index_path)
+    download_file_from_drive(chunks_filename, sample_folder_id, document_chunks_path)
+    download_file_from_drive(lookup_filename, sample_folder_id, structured_lookup_path)

     master_structured_lookup, faiss_index, document_chunks = model.load_rag_assets(
         faiss_index_path, document_chunks_path, structured_lookup_path
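The cached RAG assets follow the same pattern: faiss_index.bin, document_chunks.json, and structured_lookup.json are pulled from the sample's Drive folder into tmp_dir before model.load_rag_assets reads them. Within the hunks shown here there is no matching upload once those assets are (re)built; if persisting them is intended, a follow-up along these lines would do it (hypothetical, not part of this diff):

# Hypothetical follow-up, not in this diff: push rebuilt RAG assets back to Drive.
upload_file_to_drive(faiss_index_path, faiss_filename, sample_folder_id)
upload_file_to_drive(document_chunks_path, chunks_filename, sample_folder_id)
upload_file_to_drive(structured_lookup_path, lookup_filename, sample_folder_id)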