Spaces:
Running
Running
Update pipeline.py
Browse files
- pipeline.py +32 -16
pipeline.py
CHANGED
@@ -250,23 +250,39 @@ def pipeline_with_gemini(accessions):
|
|
250 |
# Define local temp paths for reading/writing
|
251 |
# import tempfile
|
252 |
# tmp_dir = tempfile.mkdtemp()
|
253 |
-
|
254 |
-
os.makedirs(
|
255 |
-
file_chunk_path = os.path.join(
|
256 |
-
file_all_path = os.path.join(
|
257 |
# file_chunk_path = os.path.join(tempfile.gettempdir(), chunk_filename)
|
258 |
# file_all_path = os.path.join(tempfile.gettempdir(), all_filename)
|
259 |
print(file_chunk_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
260 |
# 🔥 Remove the local file first if it exists
|
261 |
-
if os.path.exists(file_chunk_path):
|
262 |
-
|
263 |
-
|
264 |
-
if os.path.exists(file_all_path):
|
265 |
-
|
266 |
-
|
267 |
# Try to download if already exists on Drive
|
268 |
-
|
269 |
-
|
270 |
print("chunk exist: ", chunk_exists)
|
271 |
# first way: ncbi method
|
272 |
print("country.lower: ",country.lower())
|
@@ -405,11 +421,11 @@ def pipeline_with_gemini(accessions):
|
|
405 |
all_output = all_output[:1*1024*1024]
|
406 |
print("chunk len: ", len(chunk))
|
407 |
print("all output len: ", len(all_output))
|
408 |
-
|
409 |
-
|
410 |
# Later when saving new files
|
411 |
-
data_preprocess.save_text_to_docx(chunk, chunk_filename, sample_folder_id)
|
412 |
-
data_preprocess.save_text_to_docx(all_output, all_filename, sample_folder_id)
|
413 |
|
414 |
# Upload to Drive
|
415 |
upload_file_to_drive(file_chunk_path, chunk_filename, sample_folder_id)
|
|
|
250 |
# Define local temp paths for reading/writing
|
251 |
# import tempfile
|
252 |
# tmp_dir = tempfile.mkdtemp()
|
253 |
+
LOCAL_TEMP_DIR = "/mnt/data/generated_docs"
|
254 |
+
os.makedirs(LOCAL_TEMP_DIR, exist_ok=True)
|
255 |
+
file_chunk_path = os.path.join(LOCAL_TEMP_DIR, chunk_filename)
|
256 |
+
file_all_path = os.path.join(LOCAL_TEMP_DIR, all_filename)
|
257 |
# file_chunk_path = os.path.join(tempfile.gettempdir(), chunk_filename)
|
258 |
# file_all_path = os.path.join(tempfile.gettempdir(), all_filename)
|
259 |
print(file_chunk_path)
|
260 |
+
chunk_id = find_drive_file(chunk_filename, sample_folder_id)
|
261 |
+
all_id = find_drive_file(all_filename, sample_folder_id)
|
262 |
+
|
263 |
+
if chunk_id and all_id:
|
264 |
+
print("✅ Files already exist in Google Drive. Downloading them...")
|
265 |
+
chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path)
|
266 |
+
all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path)
|
267 |
+
# Read and parse these into `chunk` and `all_output`
|
268 |
+
else:
|
269 |
+
# 🔥 Remove any stale local copies
|
270 |
+
if os.path.exists(file_chunk_path):
|
271 |
+
os.remove(file_chunk_path)
|
272 |
+
print(f"🗑️ Removed stale: {file_chunk_path}")
|
273 |
+
if os.path.exists(file_all_path):
|
274 |
+
os.remove(file_all_path)
|
275 |
+
print(f"🗑️ Removed stale: {file_all_path}")
|
276 |
# 🔥 Remove the local file first if it exists
|
277 |
+
# if os.path.exists(file_chunk_path):
|
278 |
+
# os.remove(file_chunk_path)
|
279 |
+
# print("remove chunk path")
|
280 |
+
# if os.path.exists(file_all_path):
|
281 |
+
# os.remove(file_all_path)
|
282 |
+
# print("remove all path")
|
283 |
# Try to download if already exists on Drive
|
284 |
+
chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path)
|
285 |
+
all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path)
|
286 |
print("chunk exist: ", chunk_exists)
|
287 |
# first way: ncbi method
|
288 |
print("country.lower: ",country.lower())
|
|
|
421 |
all_output = all_output[:1*1024*1024]
|
422 |
print("chunk len: ", len(chunk))
|
423 |
print("all output len: ", len(all_output))
|
424 |
+
data_preprocess.save_text_to_docx(chunk, file_chunk_path)
|
425 |
+
data_preprocess.save_text_to_docx(all_output, file_all_path)
|
426 |
# Later when saving new files
|
427 |
+
# data_preprocess.save_text_to_docx(chunk, chunk_filename, sample_folder_id)
|
428 |
+
# data_preprocess.save_text_to_docx(all_output, all_filename, sample_folder_id)
|
429 |
|
430 |
# Upload to Drive
|
431 |
upload_file_to_drive(file_chunk_path, chunk_filename, sample_folder_id)
|