VyLala committed on
Commit 4cc7eed · verified · 1 Parent(s): da14ceb

Update pipeline.py

Files changed (1)
  1. pipeline.py +125 -25
pipeline.py CHANGED
@@ -2,7 +2,8 @@
 # test2: "A1YU101" thailand cross-ref
 # test3: "EBK109" thailand cross-ref
 # test4: "OQ731952"/"BST115" for search query title: "South Asian maternal and paternal lineages in southern Thailand and"
-from iterate3 import data_preprocess, model
+import data_preprocess
+import model
 import mtdna_classifier
 import app
 import pandas as pd
@@ -17,6 +18,57 @@ import standardize_location
 # Track time
 import time
 import multiprocessing
+import gspread
+from googleapiclient.discovery import build
+from googleapiclient.http import MediaFileUpload, MediaIoBaseDownload
+from oauth2client.service_account import ServiceAccountCredentials
+import io
+#––– Authentication setup –––
+GDRIVE_PARENT_FOLDER_NAME = "mtDNA-Location-Classifier"
+GDRIVE_DATA_FOLDER_NAME = "data"
+GCP_CREDS_DICT = json.loads(os.environ["GCP_CREDS_JSON"]) # from HF secrets
+GDRIVE_CREDS = Credentials.from_service_account_info(GCP_CREDS_DICT, scopes=["https://www.googleapis.com/auth/drive"])
+drive_service = build("drive", "v3", credentials=GDRIVE_CREDS)
+
+def get_or_create_drive_folder(name, parent_id=None):
+    query = f"name='{name}' and mimeType='application/vnd.google-apps.folder'"
+    if parent_id:
+        query += f" and '{parent_id}' in parents"
+    results = drive_service.files().list(q=query, spaces='drive', fields="files(id, name)").execute()
+    items = results.get("files", [])
+    if items:
+        return items[0]["id"]
+    file_metadata = {
+        "name": name,
+        "mimeType": "application/vnd.google-apps.folder"
+    }
+    if parent_id:
+        file_metadata["parents"] = [parent_id]
+    file = drive_service.files().create(body=file_metadata, fields="id").execute()
+    return file["id"]
+
+def upload_file_to_drive(local_path, remote_name, folder_id):
+    file_metadata = {"name": remote_name, "parents": [folder_id]}
+    media = MediaFileUpload(local_path, resumable=True)
+    existing = drive_service.files().list(q=f"name='{remote_name}' and '{folder_id}' in parents", fields="files(id)").execute().get("files", [])
+    if existing:
+        drive_service.files().delete(fileId=existing[0]["id"]).execute()
+    file = drive_service.files().create(body=file_metadata, media_body=media, fields="id").execute()
+    return file["id"]
+
+def download_file_from_drive(remote_name, folder_id, local_path):
+    results = drive_service.files().list(q=f"name='{remote_name}' and '{folder_id}' in parents", fields="files(id)").execute()
+    files = results.get("files", [])
+    if not files:
+        return False
+    file_id = files[0]["id"]
+    request = drive_service.files().get_media(fileId=file_id)
+    fh = io.FileIO(local_path, 'wb')
+    downloader = MediaIoBaseDownload(fh, request)
+    done = False
+    while not done:
+        _, done = downloader.next_chunk()
+    return True
 
 def run_with_timeout(func, args=(), kwargs={}, timeout=20):
     """
@@ -98,19 +150,37 @@ def pipeline_with_gemini(accessions):
     # set up step: create the folder to save document
     chunk, all_output = "",""
     if pudID:
-        id = pudID
+        id = str(pudID)
         saveTitle = title
     else:
         saveTitle = title + "_" + col_date
         id = "DirectSubmission"
-    folder_path = Path("/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id))
-    if not folder_path.exists():
-        cmd = f'mkdir /content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/{id}'
-        result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
-        print("data/"+str(id) +" created.")
-    else:
-        print("data/"+str(id) +" already exists.")
-    saveLinkFolder = "/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id)
+    # folder_path = Path("/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id))
+    # if not folder_path.exists():
+    # cmd = f'mkdir /content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/{id}'
+    # result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+    # print("data/"+str(id) +" created.")
+    # else:
+    # print("data/"+str(id) +" already exists.")
+    # saveLinkFolder = "/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id)
+    parent_folder_id = get_or_create_drive_folder(GDRIVE_PARENT_FOLDER_NAME)
+    data_folder_id = get_or_create_drive_folder(GDRIVE_DATA_FOLDER_NAME, parent_id=parent_folder_id)
+    sample_folder_id = get_or_create_drive_folder(str(id), parent_id=data_folder_id)
+
+    # Define document names
+    chunk_filename = f"{saveTitle}_merged_document.docx"
+    all_filename = f"{saveTitle}_all_merged_document.docx"
+
+    # Define local temp paths for reading/writing
+    import tempfile
+    tmp_dir = tempfile.mkdtemp()
+    file_chunk_path = os.path.join(tmp_dir, chunk_filename)
+    file_all_path = os.path.join(tmp_dir, all_filename)
+
+    # Try to download if already exists on Drive
+    chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path)
+    all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path)
+
     # first way: ncbi method
     if country.lower() != "unknown":
         stand_country = standardize_location.smart_country_lookup(country.lower())
@@ -154,24 +224,34 @@ def pipeline_with_gemini(accessions):
             links.append(link)
     if jsonSM:
         links += sum((jsonSM[key] for key in jsonSM),[])
-    print(links)
+    #print(links)
     links = unique_preserve_order(links)
    acc_score["source"] = links
-    chunk_path = "/"+saveTitle+"_merged_document.docx"
-    all_path = "/"+saveTitle+"_all_merged_document.docx"
-    # if chunk and all output not exist yet
-    file_chunk_path = saveLinkFolder + chunk_path
-    file_all_path = saveLinkFolder + all_path
-    if os.path.exists(file_chunk_path):
+    # chunk_path = "/"+saveTitle+"_merged_document.docx"
+    # all_path = "/"+saveTitle+"_all_merged_document.docx"
+    # # if chunk and all output not exist yet
+    # file_chunk_path = saveLinkFolder + chunk_path
+    # file_all_path = saveLinkFolder + all_path
+    # if os.path.exists(file_chunk_path):
+    # print("File chunk exists!")
+    # if not chunk:
+    # text, table, document_title = model.read_docx_text(file_chunk_path)
+    # chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table))
+    # if os.path.exists(file_all_path):
+    # print("File all output exists!")
+    # if not all_output:
+    # text_all, table_all, document_title_all = model.read_docx_text(file_all_path)
+    # all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all))
+    if chunk_exists:
         print("File chunk exists!")
         if not chunk:
-            text, table, document_title = model.read_docx_text(file_chunk_path)
-            chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table))
-    if os.path.exists(file_all_path):
+            text, table, document_title = model.read_docx_text(file_chunk_path)
+            chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table))
+    if all_exists:
         print("File all output exists!")
         if not all_output:
-            text_all, table_all, document_title_all = model.read_docx_text(file_all_path)
-            all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all))
+            text_all, table_all, document_title_all = model.read_docx_text(file_all_path)
+            all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all))
     if not chunk and not all_output:
         # else: check if we can reuse these chunk and all output of existed accession to find another
         if links:
@@ -233,8 +313,16 @@ def pipeline_with_gemini(accessions):
         all_output = all_output[:1*1024*1024]
     print("chunk len: ", len(chunk))
     print("all output len: ", len(all_output))
+    # data_preprocess.save_text_to_docx(chunk, file_chunk_path)
+    # data_preprocess.save_text_to_docx(all_output, file_all_path)
+    # Later when saving new files
     data_preprocess.save_text_to_docx(chunk, file_chunk_path)
     data_preprocess.save_text_to_docx(all_output, file_all_path)
+
+    # Upload to Drive
+    upload_file_to_drive(file_chunk_path, chunk_filename, sample_folder_id)
+    upload_file_to_drive(file_all_path, all_filename, sample_folder_id)
+
     # else:
     # final_input = ""
    # if all_output:
@@ -253,9 +341,21 @@ def pipeline_with_gemini(accessions):
     # chunk = data_preprocess.merge_texts_skipping_overlap(chunk, chunkBFS)

     # Define paths for cached RAG assets
-    faiss_index_path = saveLinkFolder+"/faiss_index.bin"
-    document_chunks_path = saveLinkFolder+"/document_chunks.json"
-    structured_lookup_path = saveLinkFolder+"/structured_lookup.json"
+    # faiss_index_path = saveLinkFolder+"/faiss_index.bin"
+    # document_chunks_path = saveLinkFolder+"/document_chunks.json"
+    # structured_lookup_path = saveLinkFolder+"/structured_lookup.json"
+    faiss_filename = "faiss_index.bin"
+    chunks_filename = "document_chunks.json"
+    lookup_filename = "structured_lookup.json"
+
+    # Save in temporary local directory
+    faiss_index_path = os.path.join(tmp_dir, faiss_filename)
+    document_chunks_path = os.path.join(tmp_dir, chunks_filename)
+    structured_lookup_path = os.path.join(tmp_dir, lookup_filename)
+
+    download_file_from_drive(faiss_filename, sample_folder_id, faiss_index_path)
+    download_file_from_drive(chunks_filename, sample_folder_id, document_chunks_path)
+    download_file_from_drive(lookup_filename, sample_folder_id, structured_lookup_path)

     master_structured_lookup, faiss_index, document_chunks = model.load_rag_assets(
         faiss_index_path, document_chunks_path, structured_lookup_path
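
Note on the new Drive layer (an illustrative sketch, not part of the commit): the added authentication block reads the service-account JSON from the GCP_CREDS_JSON secret and calls Credentials.from_service_account_info, which is provided by google.oauth2.service_account rather than by the oauth2client ServiceAccountCredentials import shown in the hunk, so that import (along with json and os) is presumably available elsewhere in pipeline.py. The snippet below is a minimal, self-contained way to exercise the same Drive v3 calls the helpers rely on (files().list/create/get_media with MediaFileUpload and MediaIoBaseDownload); the folder name "drive-helper-smoke-test" and the file hello.txt are placeholders invented for the example, not names used by the pipeline.

# Standalone sanity-check sketch, assuming google-auth and google-api-python-client
# are installed and GCP_CREDS_JSON holds the service-account JSON (as in the HF-secrets setup above).
import io, json, os, tempfile
from google.oauth2.service_account import Credentials  # source of from_service_account_info
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload, MediaIoBaseDownload

creds = Credentials.from_service_account_info(
    json.loads(os.environ["GCP_CREDS_JSON"]),
    scopes=["https://www.googleapis.com/auth/drive"],
)
drive = build("drive", "v3", credentials=creds)

# Find or create a scratch folder (same query pattern as get_or_create_drive_folder).
query = "name='drive-helper-smoke-test' and mimeType='application/vnd.google-apps.folder'"
found = drive.files().list(q=query, spaces="drive", fields="files(id, name)").execute().get("files", [])
if found:
    folder_id = found[0]["id"]
else:
    meta = {"name": "drive-helper-smoke-test", "mimeType": "application/vnd.google-apps.folder"}
    folder_id = drive.files().create(body=meta, fields="id").execute()["id"]

# Upload a small file (same call shape as upload_file_to_drive).
local = os.path.join(tempfile.mkdtemp(), "hello.txt")
with open(local, "w") as f:
    f.write("hello drive")
media = MediaFileUpload(local, resumable=True)
file_id = drive.files().create(
    body={"name": "hello.txt", "parents": [folder_id]}, media_body=media, fields="id"
).execute()["id"]

# Download it back (same call shape as download_file_from_drive).
request = drive.files().get_media(fileId=file_id)
buf = io.BytesIO()
downloader = MediaIoBaseDownload(buf, request)
done = False
while not done:
    _, done = downloader.next_chunk()
print(buf.getvalue().decode())  # expected: hello drive

Deleting any existing copy before re-uploading, as upload_file_to_drive does, keeps a single canonical cached file per name in the sample folder instead of letting Drive accumulate duplicates under the same name.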