VyLala committed on
Commit
da14ceb
·
verified ·
1 Parent(s): 2b37483

Update mtdna_backend.py

Files changed (1)
  1. mtdna_backend.py +607 -509
mtdna_backend.py CHANGED
@@ -1,510 +1,608 @@
1
- import gradio as gr
2
- from collections import Counter
3
- import csv
4
- import os
5
- from functools import lru_cache
6
- import mtdna_ui_app
7
- from mtdna_classifier import classify_sample_location
8
- from iterate3 import data_preprocess, model, pipeline
9
- import subprocess
10
- import json
11
- import pandas as pd
12
- import io
13
- import re
14
- import tempfile
15
- import gspread
16
- from oauth2client.service_account import ServiceAccountCredentials
17
- from io import StringIO
18
- import hashlib
19
- import threading
20
-
21
- # @lru_cache(maxsize=3600)
22
- # def classify_sample_location_cached(accession):
23
- # return classify_sample_location(accession)
24
-
25
- @lru_cache(maxsize=3600)
26
- def pipeline_classify_sample_location_cached(accession):
27
- return pipeline.pipeline_with_gemini([accession])
28
-
29
- # Count and suggest final location
30
- # def compute_final_suggested_location(rows):
31
- # candidates = [
32
- # row.get("Predicted Location", "").strip()
33
- # for row in rows
34
- # if row.get("Predicted Location", "").strip().lower() not in ["", "sample id not found", "unknown"]
35
- # ] + [
36
- # row.get("Inferred Region", "").strip()
37
- # for row in rows
38
- # if row.get("Inferred Region", "").strip().lower() not in ["", "sample id not found", "unknown"]
39
- # ]
40
-
41
- # if not candidates:
42
- # return Counter(), ("Unknown", 0)
43
- # # Step 1: Combine into one string and split using regex to handle commas, line breaks, etc.
44
- # tokens = []
45
- # for item in candidates:
46
- # # Split by comma, whitespace, and newlines
47
- # parts = re.split(r'[\s,]+', item)
48
- # tokens.extend(parts)
49
-
50
- # # Step 2: Clean and normalize tokens
51
- # tokens = [word.strip() for word in tokens if word.strip().isalpha()] # Keep only alphabetic tokens
52
-
53
- # # Step 3: Count
54
- # counts = Counter(tokens)
55
-
56
- # # Step 4: Get most common
57
- # top_location, count = counts.most_common(1)[0]
58
- # return counts, (top_location, count)
59
-
60
- # Store feedback (with required fields)
61
-
62
- def store_feedback_to_google_sheets(accession, answer1, answer2, contact=""):
63
- if not answer1.strip() or not answer2.strip():
64
- return "⚠️ Please answer both questions before submitting."
65
-
66
- try:
67
- # ✅ Step: Load credentials from Hugging Face secret
68
- creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
69
- scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]
70
- creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
71
-
72
- # Connect to Google Sheet
73
- client = gspread.authorize(creds)
74
- sheet = client.open("feedback_mtdna").sheet1 # make sure sheet name matches
75
-
76
- # Append feedback
77
- sheet.append_row([accession, answer1, answer2, contact])
78
- return "✅ Feedback submitted. Thank you!"
79
-
80
- except Exception as e:
81
- return f"❌ Error submitting feedback: {e}"
82
-
83
- # helper function to extract accessions
84
- def extract_accessions_from_input(file=None, raw_text=""):
85
- print(f"RAW TEXT RECEIVED: {raw_text}")
86
- accessions = []
87
- seen = set()
88
- if file:
89
- try:
90
- if file.name.endswith(".csv"):
91
- df = pd.read_csv(file)
92
- elif file.name.endswith(".xlsx"):
93
- df = pd.read_excel(file)
94
- else:
95
- return [], "Unsupported file format. Please upload CSV or Excel."
96
- for acc in df.iloc[:, 0].dropna().astype(str).str.strip():
97
- if acc not in seen:
98
- accessions.append(acc)
99
- seen.add(acc)
100
- except Exception as e:
101
- return [], f"Failed to read file: {e}"
102
-
103
- if raw_text:
104
- text_ids = [s.strip() for s in re.split(r"[\n,;\t]", raw_text) if s.strip()]
105
- for acc in text_ids:
106
- if acc not in seen:
107
- accessions.append(acc)
108
- seen.add(acc)
109
-
110
- return list(accessions), None
111
- # ✅ Add a new helper to backend: `filter_unprocessed_accessions()`
112
- def get_incomplete_accessions(file_path):
113
- df = pd.read_excel(file_path)
114
-
115
- incomplete_accessions = []
116
- for _, row in df.iterrows():
117
- sample_id = str(row.get("Sample ID", "")).strip()
118
-
119
- # Skip if no sample ID
120
- if not sample_id:
121
- continue
122
-
123
- # Drop the Sample ID and check if the rest is empty
124
- other_cols = row.drop(labels=["Sample ID"], errors="ignore")
125
- if other_cols.isna().all() or (other_cols.astype(str).str.strip() == "").all():
126
- # Extract the accession number from the sample ID using regex
127
- match = re.search(r"\b[A-Z]{2,4}\d{4,}", sample_id)
128
- if match:
129
- incomplete_accessions.append(match.group(0))
130
- print(len(incomplete_accessions))
131
- return incomplete_accessions
132
-
133
- def summarize_results(accession, KNOWN_OUTPUT_PATH = "/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/iterate3/known_samples.xlsx"):
134
- # try cache first
135
- cached = check_known_output(accession)
136
- if cached:
137
- print(f"✅ Using cached result for {accession}")
138
- return [[
139
- cached["Sample ID"],
140
- cached["Predicted Country"],
141
- cached["Country Explanation"],
142
- cached["Predicted Sample Type"],
143
- cached["Sample Type Explanation"],
144
- cached["Sources"],
145
- cached["Time cost"]
146
- ]]
147
- # only run when nothing in the cache
148
- try:
149
- outputs = pipeline_classify_sample_location_cached(accession)
150
- # outputs = {'KU131308': {'isolate':'BRU18',
151
- # 'country': {'brunei': ['ncbi',
152
- # 'rag_llm-The text mentions "BRU18 Brunei Borneo" in a table listing various samples, and it is not described as ancient or archaeological.']},
153
- # 'sample_type': {'modern':
154
- # ['rag_llm-The text mentions "BRU18 Brunei Borneo" in a table listing various samples, and it is not described as ancient or archaeological.']},
155
- # 'query_cost': 9.754999999999999e-05,
156
- # 'time_cost': '24.776 seconds',
157
- # 'source': ['https://doi.org/10.1007/s00439-015-1620-z',
158
- # 'https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM1_ESM.pdf',
159
- # 'https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM2_ESM.xls']}}
160
- except Exception as e:
161
- return []#, f"Error: {e}", f"Error: {e}", f"Error: {e}"
162
-
163
- if accession not in outputs:
164
- return []#, "Accession not found in results.", "Accession not found in results.", "Accession not found in results."
165
-
166
- row_score = []
167
- rows = []
168
- save_rows = []
169
- for key in outputs:
170
- pred_country, pred_sample, country_explanation, sample_explanation = "unknown","unknown","unknown","unknown"
171
- for section, results in outputs[key].items():
172
- if section == "country" or section =="sample_type":
173
- pred_output = "\n".join(list(results.keys()))
174
- output_explanation = ""
175
- for result, content in results.items():
176
- if len(result) == 0: result = "unknown"
177
- if len(content) == 0: output_explanation = "unknown"
178
- else:
179
- output_explanation += 'Method: ' + "\nMethod: ".join(content) + "\n"
180
- if section == "country":
181
- pred_country, country_explanation = pred_output, output_explanation
182
- elif section == "sample_type":
183
- pred_sample, sample_explanation = pred_output, output_explanation
184
- if outputs[key]["isolate"].lower()!="unknown":
185
- label = key + "(Isolate: " + outputs[key]["isolate"] + ")"
186
- else: label = key
187
- if len(outputs[key]["source"]) == 0: outputs[key]["source"] = ["No Links"]
188
- row = {
189
- "Sample ID": label,
190
- "Predicted Country": pred_country,
191
- "Country Explanation": country_explanation,
192
- "Predicted Sample Type":pred_sample,
193
- "Sample Type Explanation":sample_explanation,
194
- "Sources": "\n".join(outputs[key]["source"]),
195
- "Time cost": outputs[key]["time_cost"]
196
- }
197
- #row_score.append(row)
198
- rows.append(list(row.values()))
199
-
200
- save_row = {
201
- "Sample ID": label,
202
- "Predicted Country": pred_country,
203
- "Country Explanation": country_explanation,
204
- "Predicted Sample Type":pred_sample,
205
- "Sample Type Explanation":sample_explanation,
206
- "Sources": "\n".join(outputs[key]["source"]),
207
- "Query_cost": outputs[key]["query_cost"],
208
- "Time cost": outputs[key]["time_cost"]
209
- }
210
- #row_score.append(row)
211
- save_rows.append(list(save_row.values()))
212
-
213
- # #location_counts, (final_location, count) = compute_final_suggested_location(row_score)
214
- # summary_lines = [f"### 🧭 Location Summary:\n"]
215
- # summary_lines += [f"- **{loc}**: {cnt} times" for loc, cnt in location_counts.items()]
216
- # summary_lines.append(f"\n**Final Suggested Location:** 🗺️ **{final_location}** (mentioned {count} times)")
217
- # summary = "\n".join(summary_lines)
218
-
219
- # save the new running sample to known excel file
220
- try:
221
- df_new = pd.DataFrame(save_rows, columns=["Sample ID", "Predicted Country", "Country Explanation", "Predicted Sample Type", "Sample Type Explanation", "Sources", "Query_cost","Time cost"])
222
- if os.path.exists(KNOWN_OUTPUT_PATH):
223
- df_old = pd.read_excel(KNOWN_OUTPUT_PATH)
224
- df_combined = pd.concat([df_old, df_new]).drop_duplicates(subset="Sample ID")
225
- else:
226
- df_combined = df_new
227
- df_combined.to_excel(KNOWN_OUTPUT_PATH, index=False)
228
- except Exception as e:
229
- print(f"⚠️ Failed to save known output: {e}")
230
-
231
- return rows#, summary, labelAncient_Modern, explain_label
232
-
233
- # save the batch input in excel file
234
- # def save_to_excel(all_rows, summary_text, flag_text, filename):
235
- # with pd.ExcelWriter(filename) as writer:
236
- # # Save table
237
- # df_new = pd.DataFrame(all_rows, columns=["Sample ID", "Predicted Country", "Country Explanation", "Predicted Sample Type", "Sample Type Explanation", "Sources", "Time cost"])
238
- # df.to_excel(writer, sheet_name="Detailed Results", index=False)
239
- # try:
240
- # df_old = pd.read_excel(filename)
241
- # except:
242
- # df_old = pd.DataFrame([[]], columns=["Sample ID", "Predicted Country", "Country Explanation", "Predicted Sample Type", "Sample Type Explanation", "Sources", "Time cost"])
243
- # df_combined = pd.concat([df_old, df_new]).drop_duplicates(subset="Sample ID")
244
- # # if os.path.exists(filename):
245
- # # df_old = pd.read_excel(filename)
246
- # # df_combined = pd.concat([df_old, df_new]).drop_duplicates(subset="Sample ID")
247
- # # else:
248
- # # df_combined = df_new
249
- # df_combined.to_excel(filename, index=False)
250
- # # # Save summary
251
- # # summary_df = pd.DataFrame({"Summary": [summary_text]})
252
- # # summary_df.to_excel(writer, sheet_name="Summary", index=False)
253
-
254
- # # # Save flag
255
- # # flag_df = pd.DataFrame({"Flag": [flag_text]})
256
- # # flag_df.to_excel(writer, sheet_name="Ancient_Modern_Flag", index=False)
257
- # def save_to_excel(all_rows, summary_text, flag_text, filename):
258
- # df_new = pd.DataFrame(all_rows, columns=[
259
- # "Sample ID", "Predicted Country", "Country Explanation",
260
- # "Predicted Sample Type", "Sample Type Explanation",
261
- # "Sources", "Time cost"
262
- # ])
263
-
264
- # try:
265
- # if os.path.exists(filename):
266
- # df_old = pd.read_excel(filename)
267
- # else:
268
- # df_old = pd.DataFrame(columns=df_new.columns)
269
- # except Exception as e:
270
- # print(f"⚠️ Warning reading old Excel file: {e}")
271
- # df_old = pd.DataFrame(columns=df_new.columns)
272
-
273
- # #df_combined = pd.concat([df_new, df_old], ignore_index=True).drop_duplicates(subset="Sample ID", keep="first")
274
- # df_old.set_index("Sample ID", inplace=True)
275
- # df_new.set_index("Sample ID", inplace=True)
276
-
277
- # df_old.update(df_new) # <-- update matching rows in df_old with df_new content
278
-
279
- # df_combined = df_old.reset_index()
280
-
281
- # try:
282
- # df_combined.to_excel(filename, index=False)
283
- # except Exception as e:
284
- # print(f"❌ Failed to write Excel file {filename}: {e}")
285
- def save_to_excel(all_rows, summary_text, flag_text, filename, is_resume=False):
286
- df_new = pd.DataFrame(all_rows, columns=[
287
- "Sample ID", "Predicted Country", "Country Explanation",
288
- "Predicted Sample Type", "Sample Type Explanation",
289
- "Sources", "Time cost"
290
- ])
291
-
292
- if is_resume and os.path.exists(filename):
293
- try:
294
- df_old = pd.read_excel(filename)
295
- except Exception as e:
296
- print(f"⚠️ Warning reading old Excel file: {e}")
297
- df_old = pd.DataFrame(columns=df_new.columns)
298
-
299
- # Set index and update existing rows
300
- df_old.set_index("Sample ID", inplace=True)
301
- df_new.set_index("Sample ID", inplace=True)
302
- df_old.update(df_new)
303
-
304
- df_combined = df_old.reset_index()
305
- else:
306
- # If not resuming or file doesn't exist, just use new rows
307
- df_combined = df_new
308
-
309
- try:
310
- df_combined.to_excel(filename, index=False)
311
- except Exception as e:
312
- print(f"❌ Failed to write Excel file {filename}: {e}")
313
-
314
-
315
- # save the batch input in JSON file
316
- def save_to_json(all_rows, summary_text, flag_text, filename):
317
- output_dict = {
318
- "Detailed_Results": all_rows#, # <-- make sure this is a plain list, not a DataFrame
319
- # "Summary_Text": summary_text,
320
- # "Ancient_Modern_Flag": flag_text
321
- }
322
-
323
- # If all_rows is a DataFrame, convert it
324
- if isinstance(all_rows, pd.DataFrame):
325
- output_dict["Detailed_Results"] = all_rows.to_dict(orient="records")
326
-
327
- with open(filename, "w") as external_file:
328
- json.dump(output_dict, external_file, indent=2)
329
-
330
- # save the batch input in Text file
331
- def save_to_txt(all_rows, summary_text, flag_text, filename):
332
- if isinstance(all_rows, pd.DataFrame):
333
- detailed_results = all_rows.to_dict(orient="records")
334
- output = ""
335
- output += ",".join(list(detailed_results[0].keys())) + "\n\n"
336
- for r in detailed_results:
337
- output += ",".join([str(v) for v in r.values()]) + "\n\n"
338
- with open(filename, "w") as f:
339
- f.write("=== Detailed Results ===\n")
340
- f.write(output + "\n")
341
-
342
- # f.write("\n=== Summary ===\n")
343
- # f.write(summary_text + "\n")
344
-
345
- # f.write("\n=== Ancient/Modern Flag ===\n")
346
- # f.write(flag_text + "\n")
347
-
348
- def save_batch_output(all_rows, output_type, summary_text=None, flag_text=None):
349
- tmp_dir = tempfile.mkdtemp()
350
-
351
- #html_table = all_rows.value # assuming this is stored somewhere
352
-
353
- # Parse back to DataFrame
354
- #all_rows = pd.read_html(all_rows)[0] # [0] because read_html returns a list
355
- all_rows = pd.read_html(StringIO(all_rows))[0]
356
- print(all_rows)
357
-
358
- if output_type == "Excel":
359
- file_path = f"{tmp_dir}/batch_output.xlsx"
360
- save_to_excel(all_rows, summary_text, flag_text, file_path)
361
- elif output_type == "JSON":
362
- file_path = f"{tmp_dir}/batch_output.json"
363
- save_to_json(all_rows, summary_text, flag_text, file_path)
364
- print("Done with JSON")
365
- elif output_type == "TXT":
366
- file_path = f"{tmp_dir}/batch_output.txt"
367
- save_to_txt(all_rows, summary_text, flag_text, file_path)
368
- else:
369
- return gr.update(visible=False) # invalid option
370
-
371
- return gr.update(value=file_path, visible=True)
372
- # save cost by checking the known outputs
373
-
374
- def check_known_output(accession, KNOWN_OUTPUT_PATH = "/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/iterate3/known_samples.xlsx"):
375
- if not os.path.exists(KNOWN_OUTPUT_PATH):
376
- return None
377
-
378
- try:
379
- df = pd.read_excel(KNOWN_OUTPUT_PATH)
380
- match = re.search(r"\b[A-Z]{2,4}\d{4,}", accession)
381
- if match:
382
- accession = match.group(0)
383
-
384
- matched = df[df["Sample ID"].str.contains(accession, case=False, na=False)]
385
- if not matched.empty:
386
- return matched.iloc[0].to_dict() # Return the cached row
387
- except Exception as e:
388
- print(f"⚠️ Failed to load known samples: {e}")
389
- return None
390
-
391
- USER_USAGE_TRACK_FILE = "/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/iterate3/user_usage_log.json"
392
-
393
- def hash_user_id(user_input):
394
- return hashlib.sha256(user_input.encode()).hexdigest()
395
-
396
- # ✅ Load and save usage count
397
-
398
- # def load_user_usage():
399
- # if os.path.exists(USER_USAGE_TRACK_FILE):
400
- # with open(USER_USAGE_TRACK_FILE, "r") as f:
401
- # return json.load(f)
402
- # return {}
403
-
404
- def load_user_usage():
405
- if not os.path.exists(USER_USAGE_TRACK_FILE):
406
- return {}
407
-
408
- try:
409
- with open(USER_USAGE_TRACK_FILE, "r") as f:
410
- content = f.read().strip()
411
- if not content:
412
- return {} # file is empty
413
- return json.loads(content)
414
- except (json.JSONDecodeError, ValueError):
415
- print("⚠️ Warning: user_usage.json is corrupted or invalid. Resetting.")
416
- return {} # fallback to empty dict
417
-
418
-
419
- def save_user_usage(usage):
420
- with open(USER_USAGE_TRACK_FILE, "w") as f:
421
- json.dump(usage, f, indent=2)
422
-
423
- # def increment_usage(user_id, num_samples=1):
424
- # usage = load_user_usage()
425
- # if user_id not in usage:
426
- # usage[user_id] = 0
427
- # usage[user_id] += num_samples
428
- # save_user_usage(usage)
429
- # return usage[user_id]
430
- def increment_usage(email: str, count: int):
431
- usage = load_user_usage()
432
- email_key = email.strip().lower()
433
- usage[email_key] = usage.get(email_key, 0) + count
434
- save_user_usage(usage)
435
- return usage[email_key]
436
-
437
- # run the batch
438
- def summarize_batch(file=None, raw_text="", resume_file=None, user_email="",
439
- stop_flag=None, output_file_path=None,
440
- limited_acc=50, yield_callback=None):
441
- if user_email:
442
- limited_acc += 10
443
- accessions, error = extract_accessions_from_input(file, raw_text)
444
- if error:
445
- #return [], "", "", f"Error: {error}"
446
- return [], f"Error: {error}", 0, "", ""
447
- if resume_file:
448
- accessions = get_incomplete_accessions(resume_file)
449
- tmp_dir = tempfile.mkdtemp()
450
- if not output_file_path:
451
- if resume_file:
452
- output_file_path = os.path.join(tmp_dir, resume_file)
453
- else:
454
- output_file_path = os.path.join(tmp_dir, "batch_output_live.xlsx")
455
-
456
- all_rows = []
457
- # all_summaries = []
458
- # all_flags = []
459
- progress_lines = []
460
- warning = ""
461
- if len(accessions) > limited_acc:
462
- accessions = accessions[:limited_acc]
463
- warning = f"Your number of accessions is more than the {limited_acc}, only handle first {limited_acc} accessions"
464
- for i, acc in enumerate(accessions):
465
- if stop_flag and stop_flag.value:
466
- line = f"🛑 Stopped at {acc} ({i+1}/{len(accessions)})"
467
- progress_lines.append(line)
468
- if yield_callback:
469
- yield_callback(line)
470
- print("🛑 User requested stop.")
471
- break
472
- print(f"[{i+1}/{len(accessions)}] Processing {acc}")
473
- try:
474
- # rows, summary, label, explain = summarize_results(acc)
475
- rows = summarize_results(acc)
476
- all_rows.extend(rows)
477
- # all_summaries.append(f"**{acc}**\n{summary}")
478
- # all_flags.append(f"**{acc}**\n### 🏺 Ancient/Modern Flag\n**{label}**\n\n_Explanation:_ {explain}")
479
- #save_to_excel(all_rows, summary_text="", flag_text="", filename=output_file_path)
480
- save_to_excel(all_rows, summary_text="", flag_text="", filename=output_file_path, is_resume=bool(resume_file))
481
- line = f"✅ Processed {acc} ({i+1}/{len(accessions)})"
482
- progress_lines.append(line)
483
- if yield_callback:
484
- yield_callback(f"✅ Processed {acc} ({i+1}/{len(accessions)})")
485
- except Exception as e:
486
- print(f"❌ Failed to process {acc}: {e}")
487
- continue
488
- #all_summaries.append(f"**{acc}**: Failed - {e}")
489
- #progress_lines.append(f"βœ… Processed {acc} ({i+1}/{len(accessions)})")
490
- limited_acc -= 1
491
- """for row in all_rows:
492
- source_column = row[2] # Assuming the "Source" is in the 3rd column (index 2)
493
-
494
- if source_column.startswith("http"): # Check if the source is a URL
495
- # Wrap it with HTML anchor tags to make it clickable
496
- row[2] = f'<a href="{source_column}" target="_blank" style="color: blue; text-decoration: underline;">{source_column}</a>'"""
497
- if not warning:
498
- warning = f"You only have {limited_acc} left"
499
- if user_email.strip():
500
- user_hash = hash_user_id(user_email)
501
- total_queries = increment_usage(user_hash, len(all_rows))
502
- else:
503
- total_queries = 0
504
- yield_callback("✅ Finished!")
505
-
506
- # summary_text = "\n\n---\n\n".join(all_summaries)
507
- # flag_text = "\n\n---\n\n".join(all_flags)
508
- #return all_rows, summary_text, flag_text, gr.update(visible=True), gr.update(visible=False)
509
- #return all_rows, gr.update(visible=True), gr.update(visible=False)
510
  return all_rows, output_file_path, total_queries, "\n".join(progress_lines), warning
 
1
+ import gradio as gr
2
+ from collections import Counter
3
+ import csv
4
+ import os
5
+ from functools import lru_cache
6
+ import mtdna_ui_app
7
+ from mtdna_classifier import classify_sample_location
8
+ from iterate3 import data_preprocess, model, pipeline
9
+ import subprocess
10
+ import json
11
+ import pandas as pd
12
+ import io
13
+ import re
14
+ import tempfile
15
+ import gspread
16
+ from oauth2client.service_account import ServiceAccountCredentials
17
+ from io import StringIO
18
+ import hashlib
19
+ import threading
20
+
21
+ # @lru_cache(maxsize=3600)
22
+ # def classify_sample_location_cached(accession):
23
+ # return classify_sample_location(accession)
24
+
25
+ @lru_cache(maxsize=3600)
26
+ def pipeline_classify_sample_location_cached(accession):
27
+ return pipeline.pipeline_with_gemini([accession])
28
+
29
+ # Count and suggest final location
30
+ # def compute_final_suggested_location(rows):
31
+ # candidates = [
32
+ # row.get("Predicted Location", "").strip()
33
+ # for row in rows
34
+ # if row.get("Predicted Location", "").strip().lower() not in ["", "sample id not found", "unknown"]
35
+ # ] + [
36
+ # row.get("Inferred Region", "").strip()
37
+ # for row in rows
38
+ # if row.get("Inferred Region", "").strip().lower() not in ["", "sample id not found", "unknown"]
39
+ # ]
40
+
41
+ # if not candidates:
42
+ # return Counter(), ("Unknown", 0)
43
+ # # Step 1: Combine into one string and split using regex to handle commas, line breaks, etc.
44
+ # tokens = []
45
+ # for item in candidates:
46
+ # # Split by comma, whitespace, and newlines
47
+ # parts = re.split(r'[\s,]+', item)
48
+ # tokens.extend(parts)
49
+
50
+ # # Step 2: Clean and normalize tokens
51
+ # tokens = [word.strip() for word in tokens if word.strip().isalpha()] # Keep only alphabetic tokens
52
+
53
+ # # Step 3: Count
54
+ # counts = Counter(tokens)
55
+
56
+ # # Step 4: Get most common
57
+ # top_location, count = counts.most_common(1)[0]
58
+ # return counts, (top_location, count)
59
+
60
+ # Store feedback (with required fields)
61
+
62
+ def store_feedback_to_google_sheets(accession, answer1, answer2, contact=""):
63
+ if not answer1.strip() or not answer2.strip():
64
+ return "⚠️ Please answer both questions before submitting."
65
+
66
+ try:
67
+ # ✅ Step: Load credentials from Hugging Face secret
68
+ creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
69
+ scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]
70
+ creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
71
+
72
+ # Connect to Google Sheet
73
+ client = gspread.authorize(creds)
74
+ sheet = client.open("feedback_mtdna").sheet1 # make sure sheet name matches
75
+
76
+ # Append feedback
77
+ sheet.append_row([accession, answer1, answer2, contact])
78
+ return "✅ Feedback submitted. Thank you!"
79
+
80
+ except Exception as e:
81
+ return f"❌ Error submitting feedback: {e}"
82
+
83
+ # helper function to extract accessions
84
+ def extract_accessions_from_input(file=None, raw_text=""):
85
+ print(f"RAW TEXT RECEIVED: {raw_text}")
86
+ accessions = []
87
+ seen = set()
88
+ if file:
89
+ try:
90
+ if file.name.endswith(".csv"):
91
+ df = pd.read_csv(file)
92
+ elif file.name.endswith(".xlsx"):
93
+ df = pd.read_excel(file)
94
+ else:
95
+ return [], "Unsupported file format. Please upload CSV or Excel."
96
+ for acc in df.iloc[:, 0].dropna().astype(str).str.strip():
97
+ if acc not in seen:
98
+ accessions.append(acc)
99
+ seen.add(acc)
100
+ except Exception as e:
101
+ return [], f"Failed to read file: {e}"
102
+
103
+ if raw_text:
104
+ text_ids = [s.strip() for s in re.split(r"[\n,;\t]", raw_text) if s.strip()]
105
+ for acc in text_ids:
106
+ if acc not in seen:
107
+ accessions.append(acc)
108
+ seen.add(acc)
109
+
110
+ return list(accessions), None
111
+ # ✅ Add a new helper to backend: `filter_unprocessed_accessions()`
112
+ def get_incomplete_accessions(file_path):
113
+ df = pd.read_excel(file_path)
114
+
115
+ incomplete_accessions = []
116
+ for _, row in df.iterrows():
117
+ sample_id = str(row.get("Sample ID", "")).strip()
118
+
119
+ # Skip if no sample ID
120
+ if not sample_id:
121
+ continue
122
+
123
+ # Drop the Sample ID and check if the rest is empty
124
+ other_cols = row.drop(labels=["Sample ID"], errors="ignore")
125
+ if other_cols.isna().all() or (other_cols.astype(str).str.strip() == "").all():
126
+ # Extract the accession number from the sample ID using regex
127
+ match = re.search(r"\b[A-Z]{2,4}\d{4,}", sample_id)
128
+ if match:
129
+ incomplete_accessions.append(match.group(0))
130
+ print(len(incomplete_accessions))
131
+ return incomplete_accessions
132
+
133
+ # GOOGLE_SHEET_NAME = "known_samples"
134
+ # USAGE_DRIVE_FILENAME = "user_usage_log.json"
135
+
136
+ def summarize_results(accession):
137
+ # try cache first
138
+ cached = check_known_output(accession)
139
+ if cached:
140
+ print(f"✅ Using cached result for {accession}")
141
+ return [[
142
+ cached["Sample ID"],
143
+ cached["Predicted Country"],
144
+ cached["Country Explanation"],
145
+ cached["Predicted Sample Type"],
146
+ cached["Sample Type Explanation"],
147
+ cached["Sources"],
148
+ cached["Time cost"]
149
+ ]]
150
+ # only run when nothing in the cache
151
+ try:
152
+ outputs = pipeline_classify_sample_location_cached(accession)
153
+ # outputs = {'KU131308': {'isolate':'BRU18',
154
+ # 'country': {'brunei': ['ncbi',
155
+ # 'rag_llm-The text mentions "BRU18 Brunei Borneo" in a table listing various samples, and it is not described as ancient or archaeological.']},
156
+ # 'sample_type': {'modern':
157
+ # ['rag_llm-The text mentions "BRU18 Brunei Borneo" in a table listing various samples, and it is not described as ancient or archaeological.']},
158
+ # 'query_cost': 9.754999999999999e-05,
159
+ # 'time_cost': '24.776 seconds',
160
+ # 'source': ['https://doi.org/10.1007/s00439-015-1620-z',
161
+ # 'https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM1_ESM.pdf',
162
+ # 'https://static-content.springer.com/esm/art%3A10.1007%2Fs00439-015-1620-z/MediaObjects/439_2015_1620_MOESM2_ESM.xls']}}
163
+ except Exception as e:
164
+ return []#, f"Error: {e}", f"Error: {e}", f"Error: {e}"
165
+
166
+ if accession not in outputs:
167
+ return []#, "Accession not found in results.", "Accession not found in results.", "Accession not found in results."
168
+
169
+ row_score = []
170
+ rows = []
171
+ save_rows = []
172
+ for key in outputs:
173
+ pred_country, pred_sample, country_explanation, sample_explanation = "unknown","unknown","unknown","unknown"
174
+ for section, results in outputs[key].items():
175
+ if section == "country" or section =="sample_type":
176
+ pred_output = "\n".join(list(results.keys()))
177
+ output_explanation = ""
178
+ for result, content in results.items():
179
+ if len(result) == 0: result = "unknown"
180
+ if len(content) == 0: output_explanation = "unknown"
181
+ else:
182
+ output_explanation += 'Method: ' + "\nMethod: ".join(content) + "\n"
183
+ if section == "country":
184
+ pred_country, country_explanation = pred_output, output_explanation
185
+ elif section == "sample_type":
186
+ pred_sample, sample_explanation = pred_output, output_explanation
187
+ if outputs[key]["isolate"].lower()!="unknown":
188
+ label = key + "(Isolate: " + outputs[key]["isolate"] + ")"
189
+ else: label = key
190
+ if len(outputs[key]["source"]) == 0: outputs[key]["source"] = ["No Links"]
191
+ row = {
192
+ "Sample ID": label,
193
+ "Predicted Country": pred_country,
194
+ "Country Explanation": country_explanation,
195
+ "Predicted Sample Type":pred_sample,
196
+ "Sample Type Explanation":sample_explanation,
197
+ "Sources": "\n".join(outputs[key]["source"]),
198
+ "Time cost": outputs[key]["time_cost"]
199
+ }
200
+ #row_score.append(row)
201
+ rows.append(list(row.values()))
202
+
203
+ save_row = {
204
+ "Sample ID": label,
205
+ "Predicted Country": pred_country,
206
+ "Country Explanation": country_explanation,
207
+ "Predicted Sample Type":pred_sample,
208
+ "Sample Type Explanation":sample_explanation,
209
+ "Sources": "\n".join(outputs[key]["source"]),
210
+ "Query_cost": outputs[key]["query_cost"],
211
+ "Time cost": outputs[key]["time_cost"]
212
+ }
213
+ #row_score.append(row)
214
+ save_rows.append(list(save_row.values()))
215
+
216
+ # #location_counts, (final_location, count) = compute_final_suggested_location(row_score)
217
+ # summary_lines = [f"### 🧭 Location Summary:\n"]
218
+ # summary_lines += [f"- **{loc}**: {cnt} times" for loc, cnt in location_counts.items()]
219
+ # summary_lines.append(f"\n**Final Suggested Location:** 🗺️ **{final_location}** (mentioned {count} times)")
220
+ # summary = "\n".join(summary_lines)
221
+
222
+ # save the new running sample to known excel file
223
+ # try:
224
+ # df_new = pd.DataFrame(save_rows, columns=["Sample ID", "Predicted Country", "Country Explanation", "Predicted Sample Type", "Sample Type Explanation", "Sources", "Query_cost","Time cost"])
225
+ # if os.path.exists(KNOWN_OUTPUT_PATH):
226
+ # df_old = pd.read_excel(KNOWN_OUTPUT_PATH)
227
+ # df_combined = pd.concat([df_old, df_new]).drop_duplicates(subset="Sample ID")
228
+ # else:
229
+ # df_combined = df_new
230
+ # df_combined.to_excel(KNOWN_OUTPUT_PATH, index=False)
231
+ # except Exception as e:
232
+ # print(f"⚠️ Failed to save known output: {e}")
233
+ try:
234
+ df_new = pd.DataFrame(save_rows, columns=[
235
+ "Sample ID", "Predicted Country", "Country Explanation",
236
+ "Predicted Sample Type", "Sample Type Explanation",
237
+ "Sources", "Query_cost", "Time cost"
238
+ ])
239
+
240
+ # ✅ Google Sheets API setup
241
+ creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
242
+ scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
243
+ creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
244
+ client = gspread.authorize(creds)
245
+
246
+ # ✅ Open the known_samples sheet
247
+ spreadsheet = client.open("known_samples") # Replace with your sheet name
248
+ sheet = spreadsheet.sheet1
249
+
250
+ # ✅ Read old data
251
+ existing_data = sheet.get_all_values()
252
+ if existing_data:
253
+ df_old = pd.DataFrame(existing_data[1:], columns=existing_data[0])
254
+ else:
255
+ df_old = pd.DataFrame(columns=df_new.columns)
256
+
257
+ # ✅ Combine and remove duplicates
258
+ df_combined = pd.concat([df_old, df_new], ignore_index=True).drop_duplicates(subset="Sample ID")
259
+
260
+ # ✅ Clear and write back
261
+ sheet.clear()
262
+ sheet.update([df_combined.columns.values.tolist()] + df_combined.values.tolist())
263
+
264
+ except Exception as e:
265
+ print(f"⚠️ Failed to save known output to Google Sheets: {e}")
266
+
267
+ return rows#, summary, labelAncient_Modern, explain_label
268
+
269
+ # save the batch input in excel file
270
+ # def save_to_excel(all_rows, summary_text, flag_text, filename):
271
+ # with pd.ExcelWriter(filename) as writer:
272
+ # # Save table
273
+ # df_new = pd.DataFrame(all_rows, columns=["Sample ID", "Predicted Country", "Country Explanation", "Predicted Sample Type", "Sample Type Explanation", "Sources", "Time cost"])
274
+ # df.to_excel(writer, sheet_name="Detailed Results", index=False)
275
+ # try:
276
+ # df_old = pd.read_excel(filename)
277
+ # except:
278
+ # df_old = pd.DataFrame([[]], columns=["Sample ID", "Predicted Country", "Country Explanation", "Predicted Sample Type", "Sample Type Explanation", "Sources", "Time cost"])
279
+ # df_combined = pd.concat([df_old, df_new]).drop_duplicates(subset="Sample ID")
280
+ # # if os.path.exists(filename):
281
+ # # df_old = pd.read_excel(filename)
282
+ # # df_combined = pd.concat([df_old, df_new]).drop_duplicates(subset="Sample ID")
283
+ # # else:
284
+ # # df_combined = df_new
285
+ # df_combined.to_excel(filename, index=False)
286
+ # # # Save summary
287
+ # # summary_df = pd.DataFrame({"Summary": [summary_text]})
288
+ # # summary_df.to_excel(writer, sheet_name="Summary", index=False)
289
+
290
+ # # # Save flag
291
+ # # flag_df = pd.DataFrame({"Flag": [flag_text]})
292
+ # # flag_df.to_excel(writer, sheet_name="Ancient_Modern_Flag", index=False)
293
+ # def save_to_excel(all_rows, summary_text, flag_text, filename):
294
+ # df_new = pd.DataFrame(all_rows, columns=[
295
+ # "Sample ID", "Predicted Country", "Country Explanation",
296
+ # "Predicted Sample Type", "Sample Type Explanation",
297
+ # "Sources", "Time cost"
298
+ # ])
299
+
300
+ # try:
301
+ # if os.path.exists(filename):
302
+ # df_old = pd.read_excel(filename)
303
+ # else:
304
+ # df_old = pd.DataFrame(columns=df_new.columns)
305
+ # except Exception as e:
306
+ # print(f"⚠️ Warning reading old Excel file: {e}")
307
+ # df_old = pd.DataFrame(columns=df_new.columns)
308
+
309
+ # #df_combined = pd.concat([df_new, df_old], ignore_index=True).drop_duplicates(subset="Sample ID", keep="first")
310
+ # df_old.set_index("Sample ID", inplace=True)
311
+ # df_new.set_index("Sample ID", inplace=True)
312
+
313
+ # df_old.update(df_new) # <-- update matching rows in df_old with df_new content
314
+
315
+ # df_combined = df_old.reset_index()
316
+
317
+ # try:
318
+ # df_combined.to_excel(filename, index=False)
319
+ # except Exception as e:
320
+ # print(f"❌ Failed to write Excel file {filename}: {e}")
321
+ def save_to_excel(all_rows, summary_text, flag_text, filename, is_resume=False):
322
+ df_new = pd.DataFrame(all_rows, columns=[
323
+ "Sample ID", "Predicted Country", "Country Explanation",
324
+ "Predicted Sample Type", "Sample Type Explanation",
325
+ "Sources", "Time cost"
326
+ ])
327
+
328
+ if is_resume and os.path.exists(filename):
329
+ try:
330
+ df_old = pd.read_excel(filename)
331
+ except Exception as e:
332
+ print(f"⚠️ Warning reading old Excel file: {e}")
333
+ df_old = pd.DataFrame(columns=df_new.columns)
334
+
335
+ # Set index and update existing rows
336
+ df_old.set_index("Sample ID", inplace=True)
337
+ df_new.set_index("Sample ID", inplace=True)
338
+ df_old.update(df_new)
339
+
340
+ df_combined = df_old.reset_index()
341
+ else:
342
+ # If not resuming or file doesn't exist, just use new rows
343
+ df_combined = df_new
344
+
345
+ try:
346
+ df_combined.to_excel(filename, index=False)
347
+ except Exception as e:
348
+ print(f"❌ Failed to write Excel file {filename}: {e}")
349
+
350
+
351
+ # save the batch input in JSON file
352
+ def save_to_json(all_rows, summary_text, flag_text, filename):
353
+ output_dict = {
354
+ "Detailed_Results": all_rows#, # <-- make sure this is a plain list, not a DataFrame
355
+ # "Summary_Text": summary_text,
356
+ # "Ancient_Modern_Flag": flag_text
357
+ }
358
+
359
+ # If all_rows is a DataFrame, convert it
360
+ if isinstance(all_rows, pd.DataFrame):
361
+ output_dict["Detailed_Results"] = all_rows.to_dict(orient="records")
362
+
363
+ with open(filename, "w") as external_file:
364
+ json.dump(output_dict, external_file, indent=2)
365
+
366
+ # save the batch input in Text file
367
+ def save_to_txt(all_rows, summary_text, flag_text, filename):
368
+ if isinstance(all_rows, pd.DataFrame):
369
+ detailed_results = all_rows.to_dict(orient="records")
370
+ output = ""
371
+ output += ",".join(list(detailed_results[0].keys())) + "\n\n"
372
+ for r in detailed_results:
373
+ output += ",".join([str(v) for v in r.values()]) + "\n\n"
374
+ with open(filename, "w") as f:
375
+ f.write("=== Detailed Results ===\n")
376
+ f.write(output + "\n")
377
+
378
+ # f.write("\n=== Summary ===\n")
379
+ # f.write(summary_text + "\n")
380
+
381
+ # f.write("\n=== Ancient/Modern Flag ===\n")
382
+ # f.write(flag_text + "\n")
383
+
384
+ def save_batch_output(all_rows, output_type, summary_text=None, flag_text=None):
385
+ tmp_dir = tempfile.mkdtemp()
386
+
387
+ #html_table = all_rows.value # assuming this is stored somewhere
388
+
389
+ # Parse back to DataFrame
390
+ #all_rows = pd.read_html(all_rows)[0] # [0] because read_html returns a list
391
+ all_rows = pd.read_html(StringIO(all_rows))[0]
392
+ print(all_rows)
393
+
394
+ if output_type == "Excel":
395
+ file_path = f"{tmp_dir}/batch_output.xlsx"
396
+ save_to_excel(all_rows, summary_text, flag_text, file_path)
397
+ elif output_type == "JSON":
398
+ file_path = f"{tmp_dir}/batch_output.json"
399
+ save_to_json(all_rows, summary_text, flag_text, file_path)
400
+ print("Done with JSON")
401
+ elif output_type == "TXT":
402
+ file_path = f"{tmp_dir}/batch_output.txt"
403
+ save_to_txt(all_rows, summary_text, flag_text, file_path)
404
+ else:
405
+ return gr.update(visible=False) # invalid option
406
+
407
+ return gr.update(value=file_path, visible=True)
408
+ # save cost by checking the known outputs
409
+
410
+ # def check_known_output(accession):
411
+ # if not os.path.exists(KNOWN_OUTPUT_PATH):
412
+ # return None
413
+
414
+ # try:
415
+ # df = pd.read_excel(KNOWN_OUTPUT_PATH)
416
+ # match = re.search(r"\b[A-Z]{2,4}\d{4,}", accession)
417
+ # if match:
418
+ # accession = match.group(0)
419
+
420
+ # matched = df[df["Sample ID"].str.contains(accession, case=False, na=False)]
421
+ # if not matched.empty:
422
+ # return matched.iloc[0].to_dict() # Return the cached row
423
+ # except Exception as e:
424
+ # print(f"⚠️ Failed to load known samples: {e}")
425
+ # return None
426
+
427
+ def check_known_output(accession):
428
+ try:
429
+ # ✅ Load credentials from Hugging Face secret
430
+ creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
431
+ scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
432
+ creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
433
+ client = gspread.authorize(creds)
434
+
435
+ # ✅ Open the known_samples sheet
436
+ spreadsheet = client.open("known_samples") # Replace with your sheet name
437
+ sheet = spreadsheet.sheet1
438
+
439
+ # ✅ Read all rows
440
+ data = sheet.get_all_values()
441
+ if not data:
442
+ return None
443
+
444
+ df = pd.DataFrame(data[1:], columns=data[0]) # Skip header row
445
+
446
+ # ✅ Normalize accession pattern
447
+ match = re.search(r"\b[A-Z]{2,4}\d{4,}", accession)
448
+ if match:
449
+ accession = match.group(0)
450
+
451
+ matched = df[df["Sample ID"].str.contains(accession, case=False, na=False)]
452
+ if not matched.empty:
453
+ return matched.iloc[0].to_dict()
454
+
455
+ except Exception as e:
456
+ print(f"⚠️ Failed to load known samples from Google Sheets: {e}")
457
+ return None
458
+
459
+ def hash_user_id(user_input):
460
+ return hashlib.sha256(user_input.encode()).hexdigest()
461
+
462
+ # ✅ Load and save usage count
463
+
464
+ # def load_user_usage():
465
+ # if not os.path.exists(USER_USAGE_TRACK_FILE):
466
+ # return {}
467
+
468
+ # try:
469
+ # with open(USER_USAGE_TRACK_FILE, "r") as f:
470
+ # content = f.read().strip()
471
+ # if not content:
472
+ # return {} # file is empty
473
+ # return json.loads(content)
474
+ # except (json.JSONDecodeError, ValueError):
475
+ # print("⚠️ Warning: user_usage.json is corrupted or invalid. Resetting.")
476
+ # return {} # fallback to empty dict
477
+ def load_user_usage():
478
+ try:
479
+ creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
480
+ scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
481
+ creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
482
+ client = gspread.authorize(creds)
483
+
484
+ sheet = client.open("user_usage_log").sheet1
485
+ data = sheet.get_all_records() # Assumes columns: email, usage_count
486
+
487
+ usage = {}
488
+ for row in data:
489
+ email = row.get("email", "").strip().lower()
490
+ count = int(row.get("usage_count", 0))
491
+ if email:
492
+ usage[email] = count
493
+ return usage
494
+ except Exception as e:
495
+ print(f"⚠️ Failed to load user usage from Google Sheets: {e}")
496
+ return {}
497
+
498
+
499
+
500
+ # def save_user_usage(usage):
501
+ # with open(USER_USAGE_TRACK_FILE, "w") as f:
502
+ # json.dump(usage, f, indent=2)
503
+
504
+ def save_user_usage(usage_dict):
505
+ try:
506
+ creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
507
+ scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
508
+ creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
509
+ client = gspread.authorize(creds)
510
+
511
+ sheet = client.open("user_usage_log").sheet1
512
+ sheet.clear() # clear old contents first
513
+
514
+ # Write header + rows
515
+ rows = [["email", "usage_count"]] + [[email, count] for email, count in usage_dict.items()]
516
+ sheet.update(rows)
517
+ except Exception as e:
518
+ print(f"❌ Failed to save user usage to Google Sheets: {e}")
519
+
520
+
521
+ # def increment_usage(user_id, num_samples=1):
522
+ # usage = load_user_usage()
523
+ # if user_id not in usage:
524
+ # usage[user_id] = 0
525
+ # usage[user_id] += num_samples
526
+ # save_user_usage(usage)
527
+ # return usage[user_id]
528
+ def increment_usage(email: str, count: int):
529
+ usage = load_user_usage()
530
+ email_key = email.strip().lower()
531
+ usage[email_key] = usage.get(email_key, 0) + count
532
+ save_user_usage(usage)
533
+ return usage[email_key]
534
+
535
+ # run the batch
536
+ def summarize_batch(file=None, raw_text="", resume_file=None, user_email="",
537
+ stop_flag=None, output_file_path=None,
538
+ limited_acc=50, yield_callback=None):
539
+ if user_email:
540
+ limited_acc += 10
541
+ accessions, error = extract_accessions_from_input(file, raw_text)
542
+ if error:
543
+ #return [], "", "", f"Error: {error}"
544
+ return [], f"Error: {error}", 0, "", ""
545
+ if resume_file:
546
+ accessions = get_incomplete_accessions(resume_file)
547
+ tmp_dir = tempfile.mkdtemp()
548
+ if not output_file_path:
549
+ if resume_file:
550
+ output_file_path = os.path.join(tmp_dir, resume_file)
551
+ else:
552
+ output_file_path = os.path.join(tmp_dir, "batch_output_live.xlsx")
553
+
554
+ all_rows = []
555
+ # all_summaries = []
556
+ # all_flags = []
557
+ progress_lines = []
558
+ warning = ""
559
+ if len(accessions) > limited_acc:
560
+ accessions = accessions[:limited_acc]
561
+ warning = f"You provided more than {limited_acc} accessions; only the first {limited_acc} will be processed."
562
+ for i, acc in enumerate(accessions):
563
+ if stop_flag and stop_flag.value:
564
+ line = f"🛑 Stopped at {acc} ({i+1}/{len(accessions)})"
565
+ progress_lines.append(line)
566
+ if yield_callback:
567
+ yield_callback(line)
568
+ print("🛑 User requested stop.")
569
+ break
570
+ print(f"[{i+1}/{len(accessions)}] Processing {acc}")
571
+ try:
572
+ # rows, summary, label, explain = summarize_results(acc)
573
+ rows = summarize_results(acc)
574
+ all_rows.extend(rows)
575
+ # all_summaries.append(f"**{acc}**\n{summary}")
576
+ # all_flags.append(f"**{acc}**\n### 🏺 Ancient/Modern Flag\n**{label}**\n\n_Explanation:_ {explain}")
577
+ #save_to_excel(all_rows, summary_text="", flag_text="", filename=output_file_path)
578
+ save_to_excel(all_rows, summary_text="", flag_text="", filename=output_file_path, is_resume=bool(resume_file))
579
+ line = f"✅ Processed {acc} ({i+1}/{len(accessions)})"
580
+ progress_lines.append(line)
581
+ if yield_callback:
582
+ yield_callback(f"✅ Processed {acc} ({i+1}/{len(accessions)})")
583
+ except Exception as e:
584
+ print(f"❌ Failed to process {acc}: {e}")
585
+ continue
586
+ #all_summaries.append(f"**{acc}**: Failed - {e}")
587
+ #progress_lines.append(f"βœ… Processed {acc} ({i+1}/{len(accessions)})")
588
+ limited_acc -= 1
589
+ """for row in all_rows:
590
+ source_column = row[2] # Assuming the "Source" is in the 3rd column (index 2)
591
+
592
+ if source_column.startswith("http"): # Check if the source is a URL
593
+ # Wrap it with HTML anchor tags to make it clickable
594
+ row[2] = f'<a href="{source_column}" target="_blank" style="color: blue; text-decoration: underline;">{source_column}</a>'"""
595
+ if not warning:
596
+ warning = f"You only have {limited_acc} accessions left"
597
+ if user_email.strip():
598
+ user_hash = hash_user_id(user_email)
599
+ total_queries = increment_usage(user_hash, len(all_rows))
600
+ else:
601
+ total_queries = 0
602
+ if yield_callback:
+ yield_callback("✅ Finished!")
603
+
604
+ # summary_text = "\n\n---\n\n".join(all_summaries)
605
+ # flag_text = "\n\n---\n\n".join(all_flags)
606
+ #return all_rows, summary_text, flag_text, gr.update(visible=True), gr.update(visible=False)
607
+ #return all_rows, gr.update(visible=True), gr.update(visible=False)
608
  return all_rows, output_file_path, total_queries, "\n".join(progress_lines), warning
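For quick orientation, here is a minimal usage sketch of the updated entry point. It assumes this file is importable as `mtdna_backend`, that the `GCP_CREDS_JSON` secret and the "known_samples" / "user_usage_log" sheets referenced in the diff are configured, and that the caller supplies a stop object exposing a `.value` flag; the `StopFlag` class and the example call below are illustrative only, not part of the commit.

# Minimal usage sketch under the assumptions stated above.
import mtdna_backend

class StopFlag:
    # stand-in for whatever stop signal the UI layer passes in
    def __init__(self):
        self.value = False

stop_flag = StopFlag()

rows, out_path, total_queries, progress, warning = mtdna_backend.summarize_batch(
    raw_text="KU131308",            # example accession taken from the comments in the diff
    user_email="user@example.com",  # a non-empty email raises the accession limit by 10
    stop_flag=stop_flag,            # set stop_flag.value = True to stop between accessions
    limited_acc=50,
    yield_callback=print,           # progress lines are streamed to stdout
)
print(warning)
print(progress)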