VyLala committed (verified)
Commit c63e748 · 1 Parent(s): 05b38b7

Update mtdna_classifier.py

Files changed (1):
  1. mtdna_classifier.py (+524 -519)
mtdna_classifier.py CHANGED
@@ -1,519 +1,524 @@
1
- # mtDNA Location Classifier MVP (Google Colab)
2
- # Accepts accession number → Fetches PubMed ID + isolate name → Gets abstract → Predicts location
3
- import os
4
- import subprocess
5
- import re
6
- from Bio import Entrez
7
- import fitz
8
- import spacy
9
- from spacy.cli import download
10
- from NER.PDF import pdf
11
- from NER.WordDoc import wordDoc
12
- from NER.html import extractHTML
13
- from NER.word2Vec import word2vec
14
- from transformers import pipeline
15
- import urllib.parse, requests
16
- from pathlib import Path
17
- from upgradeClassify import filter_context_for_sample, infer_location_for_sample
18
- # Set your email (required by NCBI Entrez)
19
- #Entrez.email = "[email protected]"
20
- import nltk
21
-
22
- nltk.download("stopwords")
23
- nltk.download("punkt")
24
- nltk.download('punkt_tab')
25
- # Step 1: Get PubMed ID from Accession using EDirect
26
-
27
- '''def get_info_from_accession(accession):
28
- cmd = f'{os.environ["HOME"]}/edirect/esummary -db nuccore -id {accession} -format medline | egrep "PUBMED|isolate"'
29
- result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
30
- output = result.stdout
31
- pubmedID, isolate = "", ""
32
- for line in output.split("\n"):
33
- if len(line) > 0:
34
- if "PUBMED" in line:
35
- pubmedID = line.split()[-1]
36
- if "isolate" in line: # Check for isolate information
37
- # Try direct GenBank annotation: /isolate="XXX"
38
- match1 = re.search(r'/isolate\s*=\s*"([^"]+)"', line) # search on current line
39
- if match1:
40
- isolate = match1.group(1)
41
- else:
42
- # Try from DEFINITION line: ...isolate XXX...
43
- match2 = re.search(r'isolate\s+([A-Za-z0-9_-]+)', line) # search on current line
44
- if match2:
45
- isolate = match2.group(1)'''
46
- from Bio import Entrez, Medline
47
- import re
48
-
49
- Entrez.email = "[email protected]"
50
-
51
- def get_info_from_accession(accession):
52
- try:
53
- handle = Entrez.efetch(db="nuccore", id=accession, rettype="medline", retmode="text")
54
- text = handle.read()
55
- handle.close()
56
-
57
- # Extract PUBMED ID from the Medline text
58
- pubmed_match = re.search(r'PUBMED\s+(\d+)', text)
59
- pubmed_id = pubmed_match.group(1) if pubmed_match else ""
60
-
61
- # Extract isolate if available
62
- isolate_match = re.search(r'/isolate="([^"]+)"', text)
63
- if not isolate_match:
64
- isolate_match = re.search(r'isolate\s+([A-Za-z0-9_-]+)', text)
65
- isolate = isolate_match.group(1) if isolate_match else ""
66
-
67
- if not pubmed_id:
68
- print(f"⚠️ No PubMed ID found for accession {accession}")
69
-
70
- return pubmed_id, isolate
71
-
72
- except Exception as e:
73
- print("❌ Entrez error:", e)
74
- return "", ""
75
- # Step 2: Get doi link to access the paper
76
- '''def get_doi_from_pubmed_id(pubmed_id):
77
- cmd = f'{os.environ["HOME"]}/edirect/esummary -db pubmed -id {pubmed_id} -format medline | grep -i "AID"'
78
- result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
79
- output = result.stdout
80
-
81
- doi_pattern = r'10\.\d{4,9}/[-._;()/:A-Z0-9]+(?=\s*\[doi\])'
82
- match = re.search(doi_pattern, output, re.IGNORECASE)
83
-
84
- if match:
85
- return match.group(0)
86
- else:
87
- return None # or raise an Exception with a helpful message'''
88
-
89
- def get_doi_from_pubmed_id(pubmed_id):
90
- try:
91
- handle = Entrez.efetch(db="pubmed", id=pubmed_id, rettype="medline", retmode="text")
92
- records = list(Medline.parse(handle))
93
- handle.close()
94
-
95
- if not records:
96
- return None
97
-
98
- record = records[0]
99
- if "AID" in record:
100
- for aid in record["AID"]:
101
- if "[doi]" in aid:
102
- return aid.split(" ")[0] # extract the DOI
103
-
104
- return None
105
-
106
- except Exception as e:
107
- print(f"❌ Failed to get DOI from PubMed ID {pubmed_id}: {e}")
108
- return None
109
-
110
-
111
- # Step 3: Extract Text: Get the paper (html text), sup. materials (pdf, doc, excel) and do text-preprocessing
112
- # Step 3.1: Extract Text
113
- # sub: download excel file
114
- def download_excel_file(url, save_path="temp.xlsx"):
115
- if "view.officeapps.live.com" in url:
116
- parsed_url = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
117
- real_url = urllib.parse.unquote(parsed_url["src"][0])
118
- response = requests.get(real_url)
119
- with open(save_path, "wb") as f:
120
- f.write(response.content)
121
- return save_path
122
- elif url.startswith("http") and (url.endswith(".xls") or url.endswith(".xlsx")):
123
- response = requests.get(url)
124
- response.raise_for_status() # Raises error if download fails
125
- with open(save_path, "wb") as f:
126
- f.write(response.content)
127
- return save_path
128
- else:
129
- print("URL must point directly to an .xls or .xlsx file\n or it already downloaded.")
130
- return url
131
- def get_paper_text(doi,id,manualLinks=None):
132
- # create the temporary folder to contain the texts
133
- folder_path = Path("data/"+str(id))
134
- if not folder_path.exists():
135
- cmd = f'mkdir data/{id}'
136
- result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
137
- print("data/"+str(id) +" created.")
138
- else:
139
- print("data/"+str(id) +" already exists.")
140
- saveLinkFolder = "data/"+id
141
-
142
- link = 'https://doi.org/' + doi
143
- '''textsToExtract = { "doiLink":"paperText"
144
- "file1.pdf":"text1",
145
- "file2.doc":"text2",
146
- "file3.xlsx":excelText3'''
147
- textsToExtract = {}
148
- # get the file to create listOfFile for each id
149
- html = extractHTML.HTML("",link)
150
- jsonSM = html.getSupMaterial()
151
- text = ""
152
- links = [link] + sum((jsonSM[key] for key in jsonSM),[])
153
- if manualLinks != None:
154
- links += manualLinks
155
- for l in links:
156
- # get the main paper
157
- name = l.split("/")[-1]
158
- file_path = folder_path / name
159
- if l == link:
160
- text = html.getListSection()
161
- textsToExtract[link] = text
162
- elif l.endswith(".pdf"):
163
- if file_path.is_file():
164
- l = saveLinkFolder + "/" + name
165
- print("File exists.")
166
- p = pdf.PDF(l,saveLinkFolder,doi)
167
- f = p.openPDFFile()
168
- pdf_path = saveLinkFolder + "/" + l.split("/")[-1]
169
- doc = fitz.open(pdf_path)
170
- text = "\n".join([page.get_text() for page in doc])
171
- textsToExtract[l] = text
172
- elif l.endswith(".doc") or l.endswith(".docx"):
173
- d = wordDoc.wordDoc(l,saveLinkFolder)
174
- text = d.extractTextByPage()
175
- textsToExtract[l] = text
176
- elif l.split(".")[-1].lower() in "xlsx":
177
- wc = word2vec.word2Vec()
178
- # download excel file if it not downloaded yet
179
- savePath = saveLinkFolder +"/"+ l.split("/")[-1]
180
- excelPath = download_excel_file(l, savePath)
181
- corpus = wc.tableTransformToCorpusText([],excelPath)
182
- text = ''
183
- for c in corpus:
184
- para = corpus[c]
185
- for words in para:
186
- text += " ".join(words)
187
- textsToExtract[l] = text
188
- # delete folder after finishing getting text
189
- #cmd = f'rm -r data/{id}'
190
- #result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
191
- return textsToExtract
192
- # Step 3.2: Extract context
193
- def extract_context(text, keyword, window=500):
194
- # firstly try accession number
195
- idx = text.find(keyword)
196
- if idx == -1:
197
- return "Sample ID not found."
198
- return text[max(0, idx-window): idx+window]
199
- def extract_relevant_paragraphs(text, accession, keep_if=None, isolate=None):
200
- if keep_if is None:
201
- keep_if = ["sample", "method", "mtdna", "sequence", "collected", "dataset", "supplementary", "table"]
202
-
203
- outputs = ""
204
- text = text.lower()
205
-
206
- # If isolate is provided, prioritize paragraphs that mention it
207
- # If isolate is provided, prioritize paragraphs that mention it
208
- if accession and accession.lower() in text:
209
- if extract_context(text, accession.lower(), window=700) != "Sample ID not found.":
210
- outputs += extract_context(text, accession.lower(), window=700)
211
- if isolate and isolate.lower() in text:
212
- if extract_context(text, isolate.lower(), window=700) != "Sample ID not found.":
213
- outputs += extract_context(text, isolate.lower(), window=700)
214
- for keyword in keep_if:
215
- para = extract_context(text, keyword)
216
- if para and para not in outputs:
217
- outputs += para + "\n"
218
- return outputs
219
- # Step 4: Classification for now (demo purposes)
220
- # 4.1: Using a HuggingFace model (question-answering)
221
- def infer_fromQAModel(context, question="Where is the mtDNA sample from?"):
222
- try:
223
- qa = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
224
- result = qa({"context": context, "question": question})
225
- return result.get("answer", "Unknown")
226
- except Exception as e:
227
- return f"Error: {str(e)}"
228
-
229
- # 4.2: Infer from haplogroup
230
- # Load pre-trained spaCy model for NER
231
- try:
232
- nlp = spacy.load("en_core_web_sm")
233
- except OSError:
234
- download("en_core_web_sm")
235
- nlp = spacy.load("en_core_web_sm")
236
-
237
- # Define the haplogroup-to-region mapping (simple rule-based)
238
- import csv
239
-
240
- def load_haplogroup_mapping(csv_path):
241
- mapping = {}
242
- with open(csv_path) as f:
243
- reader = csv.DictReader(f)
244
- for row in reader:
245
- mapping[row["haplogroup"]] = [row["region"],row["source"]]
246
- return mapping
247
-
248
- # Function to extract haplogroup from the text
249
- def extract_haplogroup(text):
250
- match = re.search(r'\bhaplogroup\s+([A-Z][0-9a-z]*)\b', text)
251
- if match:
252
- submatch = re.match(r'^[A-Z][0-9]*', match.group(1))
253
- if submatch:
254
- return submatch.group(0)
255
- else:
256
- return match.group(1) # fallback
257
- fallback = re.search(r'\b([A-Z][0-9a-z]{1,5})\b', text)
258
- if fallback:
259
- return fallback.group(1)
260
- return None
261
-
262
-
263
- # Function to extract location based on NER
264
- def extract_location(text):
265
- doc = nlp(text)
266
- locations = []
267
- for ent in doc.ents:
268
- if ent.label_ == "GPE": # GPE = Geopolitical Entity (location)
269
- locations.append(ent.text)
270
- return locations
271
-
272
- # Function to infer location from haplogroup
273
- def infer_location_from_haplogroup(haplogroup):
274
- haplo_map = load_haplogroup_mapping("data/haplogroup_regions_extended.csv")
275
- return haplo_map.get(haplogroup, ["Unknown","Unknown"])
276
-
277
- # Function to classify the mtDNA sample
278
- def classify_mtDNA_sample_from_haplo(text):
279
- # Extract haplogroup
280
- haplogroup = extract_haplogroup(text)
281
- # Extract location based on NER
282
- locations = extract_location(text)
283
- # Infer location based on haplogroup
284
- inferred_location, sourceHaplo = infer_location_from_haplogroup(haplogroup)[0],infer_location_from_haplogroup(haplogroup)[1]
285
- return {
286
- "source":sourceHaplo,
287
- "locations_found_in_context": locations,
288
- "haplogroup": haplogroup,
289
- "inferred_location": inferred_location
290
-
291
- }
292
- # 4.3 Get from available NCBI
293
- def infer_location_fromNCBI(accession):
294
- try:
295
- handle = Entrez.efetch(db="nuccore", id=accession, rettype="medline", retmode="text")
296
- text = handle.read()
297
- handle.close()
298
- match = re.search(r'/(geo_loc_name|country|location)\s*=\s*"([^"]+)"', text)
299
- if match:
300
- return match.group(2), match.group(0) # This is the value like "Brunei"
301
- return "Not found", "Not found"
302
-
303
- except Exception as e:
304
- print("❌ Entrez error:", e)
305
- return "Not found", "Not found"
306
-
307
- ### ANCIENT/MODERN FLAG
308
- from Bio import Entrez
309
- import re
310
-
311
- def flag_ancient_modern(accession, textsToExtract, isolate=None):
312
- """
313
- Try to classify a sample as Ancient or Modern using:
314
- 1. NCBI accession (if available)
315
- 2. Supplementary text or context fallback
316
- """
317
- context = ""
318
- label, explain = "", ""
319
-
320
- try:
321
- # Check if we can fetch metadata from NCBI using the accession
322
- handle = Entrez.efetch(db="nuccore", id=accession, rettype="medline", retmode="text")
323
- text = handle.read()
324
- handle.close()
325
-
326
- isolate_source = re.search(r'/(isolation_source)\s*=\s*"([^"]+)"', text)
327
- if isolate_source:
328
- context += isolate_source.group(0) + " "
329
-
330
- specimen = re.search(r'/(specimen|specimen_voucher)\s*=\s*"([^"]+)"', text)
331
- if specimen:
332
- context += specimen.group(0) + " "
333
-
334
- if context.strip():
335
- label, explain = detect_ancient_flag(context)
336
- if label!="Unknown":
337
- return label, explain + " from NCBI\n(" + context + ")"
338
-
339
- # If no useful NCBI metadata, check supplementary texts
340
- if textsToExtract:
341
- labels = {"modern": [0, ""], "ancient": [0, ""], "unknown": 0}
342
-
343
- for source in textsToExtract:
344
- text_block = textsToExtract[source]
345
- context = extract_relevant_paragraphs(text_block, accession, isolate=isolate) # Reduce to informative paragraph(s)
346
- label, explain = detect_ancient_flag(context)
347
-
348
- if label == "Ancient":
349
- labels["ancient"][0] += 1
350
- labels["ancient"][1] += f"{source}:\n{explain}\n\n"
351
- elif label == "Modern":
352
- labels["modern"][0] += 1
353
- labels["modern"][1] += f"{source}:\n{explain}\n\n"
354
- else:
355
- labels["unknown"] += 1
356
-
357
- if max(labels["modern"][0],labels["ancient"][0]) > 0:
358
- if labels["modern"][0] > labels["ancient"][0]:
359
- return "Modern", labels["modern"][1]
360
- else:
361
- return "Ancient", labels["ancient"][1]
362
- else:
363
- return "Unknown", "No strong keywords detected"
364
- else:
365
- print("No DOI or PubMed ID available for inference.")
366
- return "", ""
367
-
368
- except Exception as e:
369
- print("Error:", e)
370
- return "", ""
371
-
372
-
373
- def detect_ancient_flag(context_snippet):
374
- context = context_snippet.lower()
375
-
376
- ancient_keywords = [
377
- "ancient", "archaeological", "prehistoric", "neolithic", "mesolithic", "paleolithic",
378
- "bronze age", "iron age", "burial", "tomb", "skeleton", "14c", "radiocarbon", "carbon dating",
379
- "postmortem damage", "udg treatment", "adna", "degradation", "site", "excavation",
380
- "archaeological context", "temporal transect", "population replacement", "cal bp", "calbp", "carbon dated"
381
- ]
382
-
383
- modern_keywords = [
384
- "modern", "hospital", "clinical", "consent","blood","buccal","unrelated", "blood sample","buccal sample","informed consent", "donor", "healthy", "patient",
385
- "genotyping", "screening", "medical", "cohort", "sequencing facility", "ethics approval",
386
- "we analysed", "we analyzed", "dataset includes", "new sequences", "published data",
387
- "control cohort", "sink population", "genbank accession", "sequenced", "pipeline",
388
- "bioinformatic analysis", "samples from", "population genetics", "genome-wide data"
389
- ]
390
-
391
- ancient_hits = [k for k in ancient_keywords if k in context]
392
- modern_hits = [k for k in modern_keywords if k in context]
393
-
394
- if ancient_hits and not modern_hits:
395
- return "Ancient", f"Flagged as ancient due to keywords: {', '.join(ancient_hits)}"
396
- elif modern_hits and not ancient_hits:
397
- return "Modern", f"Flagged as modern due to keywords: {', '.join(modern_hits)}"
398
- elif ancient_hits and modern_hits:
399
- if len(ancient_hits) >= len(modern_hits):
400
- return "Ancient", f"Mixed context, leaning ancient due to: {', '.join(ancient_hits)}"
401
- else:
402
- return "Modern", f"Mixed context, leaning modern due to: {', '.join(modern_hits)}"
403
-
404
- # Fallback to QA
405
- answer = infer_fromQAModel(context, question="Are the mtDNA samples ancient or modern? Explain why.")
406
- if answer.startswith("Error"):
407
- return "Unknown", answer
408
- if "ancient" in answer.lower():
409
- return "Ancient", f"Leaning ancient based on QA: {answer}"
410
- elif "modern" in answer.lower():
411
- return "Modern", f"Leaning modern based on QA: {answer}"
412
- else:
413
- return "Unknown", f"No strong keywords or QA clues. QA said: {answer}"
414
-
415
- # STEP 5: Main pipeline: accession -> 1. get pubmed id and isolate -> 2. get doi -> 3. get text -> 4. prediction -> 5. output: inferred location + explanation + confidence score
416
- def classify_sample_location(accession):
417
- outputs = {}
418
- keyword, context, location, qa_result, haplo_result = "", "", "", "", ""
419
- # Step 1: get pubmed id and isolate
420
- pubmedID, isolate = get_info_from_accession(accession)
421
- '''if not pubmedID:
422
- return {"error": f"Could not retrieve PubMed ID for accession {accession}"}'''
423
- if not isolate:
424
- isolate = "UNKNOWN_ISOLATE"
425
- # Step 2: get doi
426
- doi = get_doi_from_pubmed_id(pubmedID)
427
- '''if not doi:
428
- return {"error": "DOI not found for this accession. Cannot fetch paper or context."}'''
429
- # Step 3: get text
430
- '''textsToExtract = { "doiLink":"paperText"
431
- "file1.pdf":"text1",
432
- "file2.doc":"text2",
433
- "file3.xlsx":excelText3'''
434
- if doi and pubmedID:
435
- textsToExtract = get_paper_text(doi,pubmedID)
436
- else: textsToExtract = {}
437
- '''if not textsToExtract:
438
- return {"error": f"No texts extracted for DOI {doi}"}'''
439
- if isolate not in [None, "UNKNOWN_ISOLATE"]:
440
- label, explain = flag_ancient_modern(accession,textsToExtract,isolate)
441
- else:
442
- label, explain = flag_ancient_modern(accession,textsToExtract)
443
- # Step 4: prediction
444
- outputs[accession] = {}
445
- outputs[isolate] = {}
446
- # 4.0 Infer from NCBI
447
- location, outputNCBI = infer_location_fromNCBI(accession)
448
- NCBI_result = {
449
- "source": "NCBI",
450
- "sample_id": accession,
451
- "predicted_location": location,
452
- "context_snippet": outputNCBI}
453
- outputs[accession]["NCBI"]= {"NCBI": NCBI_result}
454
- if textsToExtract:
455
- long_text = ""
456
- for key in textsToExtract:
457
- text = textsToExtract[key]
458
- # try accession number first
459
- outputs[accession][key] = {}
460
- keyword = accession
461
- context = extract_context(text, keyword, window=500)
462
- # 4.1: Using a HuggingFace model (question-answering)
463
- location = infer_fromQAModel(context, question=f"Where is the mtDNA sample {keyword} from?")
464
- qa_result = {
465
- "source": key,
466
- "sample_id": keyword,
467
- "predicted_location": location,
468
- "context_snippet": context
469
- }
470
- outputs[keyword][key]["QAModel"] = qa_result
471
- # 4.2: Infer from haplogroup
472
- haplo_result = classify_mtDNA_sample_from_haplo(context)
473
- outputs[keyword][key]["haplogroup"] = haplo_result
474
- # try isolate
475
- keyword = isolate
476
- outputs[isolate][key] = {}
477
- context = extract_context(text, keyword, window=500)
478
- # 4.1.1: Using a HuggingFace model (question-answering)
479
- location = infer_fromQAModel(context, question=f"Where is the mtDNA sample {keyword} from?")
480
- qa_result = {
481
- "source": key,
482
- "sample_id": keyword,
483
- "predicted_location": location,
484
- "context_snippet": context
485
- }
486
- outputs[keyword][key]["QAModel"] = qa_result
487
- # 4.2.1: Infer from haplogroup
488
- haplo_result = classify_mtDNA_sample_from_haplo(context)
489
- outputs[keyword][key]["haplogroup"] = haplo_result
490
- # add long text
491
- long_text += text + ". \n"
492
- # 4.3: UpgradeClassify
493
- # try sample_id as accession number
494
- sample_id = accession
495
- if sample_id:
496
- filtered_context = filter_context_for_sample(sample_id.upper(), long_text, window_size=1)
497
- locations = infer_location_for_sample(sample_id.upper(), filtered_context)
498
- if locations!="No clear location found in top matches":
499
- outputs[sample_id]["upgradeClassifier"] = {}
500
- outputs[sample_id]["upgradeClassifier"]["upgradeClassifier"] = {
501
- "source": "From these sources combined: "+ ", ".join(list(textsToExtract.keys())),
502
- "sample_id": sample_id,
503
- "predicted_location": ", ".join(locations),
504
- "context_snippep": "First 1000 words: \n"+ filtered_context[:1000]
505
- }
506
- # try sample_id as isolate name
507
- sample_id = isolate
508
- if sample_id:
509
- filtered_context = filter_context_for_sample(sample_id.upper(), long_text, window_size=1)
510
- locations = infer_location_for_sample(sample_id.upper(), filtered_context)
511
- if locations!="No clear location found in top matches":
512
- outputs[sample_id]["upgradeClassifier"] = {}
513
- outputs[sample_id]["upgradeClassifier"]["upgradeClassifier"] = {
514
- "source": "From these sources combined: "+ ", ".join(list(textsToExtract.keys())),
515
- "sample_id": sample_id,
516
- "predicted_location": ", ".join(locations),
517
- "context_snippep": "First 1000 words: \n"+ filtered_context[:1000]
518
- }
519
- return outputs, label, explain
 
 
 
 
 
 
1
+ # mtDNA Location Classifier MVP (Google Colab)
2
+ # Accepts accession number → Fetches PubMed ID + isolate name → Gets abstract → Predicts location
3
+ import os
4
+ import subprocess
5
+ import re
6
+ from Bio import Entrez
7
+ import fitz
8
+ import spacy
9
+ from spacy.cli import download
10
+ from NER.PDF import pdf
11
+ from NER.WordDoc import wordDoc
12
+ from NER.html import extractHTML
13
+ from NER.word2Vec import word2vec
14
+ from transformers import pipeline
15
+ import urllib.parse, requests
16
+ from pathlib import Path
17
+ from upgradeClassify import filter_context_for_sample, infer_location_for_sample
18
+ # Set your email (required by NCBI Entrez)
19
+ #Entrez.email = "[email protected]"
20
+ import nltk
21
+
22
+ nltk.download("stopwords")
23
+ #nltk.download("punkt")
24
+ nltk.download('punkt', download_dir='/home/user/nltk_data')
25
+
26
+ nltk.download('punkt_tab')
27
+ # Step 1: Get PubMed ID from Accession using EDirect
28
+
29
+ '''def get_info_from_accession(accession):
30
+ cmd = f'{os.environ["HOME"]}/edirect/esummary -db nuccore -id {accession} -format medline | egrep "PUBMED|isolate"'
31
+ result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
32
+ output = result.stdout
33
+ pubmedID, isolate = "", ""
34
+ for line in output.split("\n"):
35
+ if len(line) > 0:
36
+ if "PUBMED" in line:
37
+ pubmedID = line.split()[-1]
38
+ if "isolate" in line: # Check for isolate information
39
+ # Try direct GenBank annotation: /isolate="XXX"
40
+ match1 = re.search(r'/isolate\s*=\s*"([^"]+)"', line) # search on current line
41
+ if match1:
42
+ isolate = match1.group(1)
43
+ else:
44
+ # Try from DEFINITION line: ...isolate XXX...
45
+ match2 = re.search(r'isolate\s+([A-Za-z0-9_-]+)', line) # search on current line
46
+ if match2:
47
+ isolate = match2.group(1)'''
48
+ from Bio import Entrez, Medline
49
+ import re
50
+
51
+ Entrez.email = "[email protected]"
52
+
53
+ def get_info_from_accession(accession):
54
+ try:
55
+ handle = Entrez.efetch(db="nuccore", id=accession, rettype="medline", retmode="text")
56
+ text = handle.read()
57
+ handle.close()
58
+
59
+ # Extract PUBMED ID from the Medline text
60
+ pubmed_match = re.search(r'PUBMED\s+(\d+)', text)
61
+ pubmed_id = pubmed_match.group(1) if pubmed_match else ""
62
+
63
+ # Extract isolate if available
64
+ isolate_match = re.search(r'/isolate="([^"]+)"', text)
65
+ if not isolate_match:
66
+ isolate_match = re.search(r'isolate\s+([A-Za-z0-9_-]+)', text)
67
+ isolate = isolate_match.group(1) if isolate_match else ""
68
+
69
+ if not pubmed_id:
70
+ print(f"⚠️ No PubMed ID found for accession {accession}")
71
+
72
+ return pubmed_id, isolate
73
+
74
+ except Exception as e:
75
+ print("❌ Entrez error:", e)
76
+ return "", ""
77
+ # Step 2: Get DOI link to access the paper
78
+ '''def get_doi_from_pubmed_id(pubmed_id):
79
+ cmd = f'{os.environ["HOME"]}/edirect/esummary -db pubmed -id {pubmed_id} -format medline | grep -i "AID"'
80
+ result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
81
+ output = result.stdout
82
+
83
+ doi_pattern = r'10\.\d{4,9}/[-._;()/:A-Z0-9]+(?=\s*\[doi\])'
84
+ match = re.search(doi_pattern, output, re.IGNORECASE)
85
+
86
+ if match:
87
+ return match.group(0)
88
+ else:
89
+ return None # or raise an Exception with a helpful message'''
90
+
91
+ def get_doi_from_pubmed_id(pubmed_id):
92
+ try:
93
+ handle = Entrez.efetch(db="pubmed", id=pubmed_id, rettype="medline", retmode="text")
94
+ records = list(Medline.parse(handle))
95
+ handle.close()
96
+
97
+ if not records:
98
+ return None
99
+
100
+ record = records[0]
101
+ if "AID" in record:
102
+ for aid in record["AID"]:
103
+ if "[doi]" in aid:
104
+ return aid.split(" ")[0] # extract the DOI
105
+
106
+ return None
107
+
108
+ except Exception as e:
109
+ print(f"❌ Failed to get DOI from PubMed ID {pubmed_id}: {e}")
110
+ return None
111
+
112
+
113
+ # Step 3: Extract text: get the paper (HTML text) and supplementary materials (PDF, DOC, Excel), then do text preprocessing
114
+ # Step 3.1: Extract Text
115
+ # sub: download excel file
116
+ def download_excel_file(url, save_path="temp.xlsx"):
117
+ if "view.officeapps.live.com" in url:
118
+ parsed_url = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
119
+ real_url = urllib.parse.unquote(parsed_url["src"][0])
120
+ response = requests.get(real_url)
121
+ with open(save_path, "wb") as f:
122
+ f.write(response.content)
123
+ return save_path
124
+ elif url.startswith("http") and (url.endswith(".xls") or url.endswith(".xlsx")):
125
+ response = requests.get(url)
126
+ response.raise_for_status() # Raises error if download fails
127
+ with open(save_path, "wb") as f:
128
+ f.write(response.content)
129
+ return save_path
130
+ else:
131
+ print("URL must point directly to an .xls or .xlsx file\n or it already downloaded.")
132
+ return url
133
+ def get_paper_text(doi,id,manualLinks=None):
134
+ # create the temporary folder to contain the texts
135
+ '''folder_path = Path("data/"+str(id))
136
+ if not folder_path.exists():
137
+ cmd = f'mkdir data/{id}'
138
+ result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
139
+ print("data/"+str(id) +" created.")
140
+ else:
141
+ print("data/"+str(id) +" already exists.")'''
142
+
143
+ cmd = f'mkdir data/{id}'
144
+ result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
145
+ saveLinkFolder = "data/"+id
146
+
147
+ link = 'https://doi.org/' + doi
148
+ '''textsToExtract = { "doiLink":"paperText"
149
+ "file1.pdf":"text1",
150
+ "file2.doc":"text2",
151
+ "file3.xlsx":excelText3'''
152
+ textsToExtract = {}
153
+ # gather the list of files/links to process for this id
154
+ html = extractHTML.HTML("",link)
155
+ jsonSM = html.getSupMaterial()
156
+ text = ""
157
+ links = [link] + sum((jsonSM[key] for key in jsonSM),[])
158
+ if manualLinks is not None:
159
+ links += manualLinks
160
+ for l in links:
161
+ # get the main paper
162
+ name = l.split("/")[-1]
163
+ #file_path = folder_path / name
164
+ if l == link:
165
+ text = html.getListSection()
166
+ textsToExtract[link] = text
167
+ elif l.endswith(".pdf"):
168
+ '''if file_path.is_file():
169
+ l = saveLinkFolder + "/" + name
170
+ print("File exists.")'''
171
+ p = pdf.PDF(l,saveLinkFolder,doi)
172
+ f = p.openPDFFile()
173
+ pdf_path = saveLinkFolder + "/" + l.split("/")[-1]
174
+ doc = fitz.open(pdf_path)
175
+ text = "\n".join([page.get_text() for page in doc])
176
+ textsToExtract[l] = text
177
+ elif l.endswith(".doc") or l.endswith(".docx"):
178
+ d = wordDoc.wordDoc(l,saveLinkFolder)
179
+ text = d.extractTextByPage()
180
+ textsToExtract[l] = text
181
+ elif l.split(".")[-1].lower() in "xlsx":
182
+ wc = word2vec.word2Vec()
183
+ # download the excel file if it has not been downloaded yet
184
+ savePath = saveLinkFolder +"/"+ l.split("/")[-1]
185
+ excelPath = download_excel_file(l, savePath)
186
+ corpus = wc.tableTransformToCorpusText([],excelPath)
187
+ text = ''
188
+ for c in corpus:
189
+ para = corpus[c]
190
+ for words in para:
191
+ text += " ".join(words)
192
+ textsToExtract[l] = text
193
+ # delete folder after finishing getting text
194
+ cmd = f'rm -r data/{id}'
195
+ result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
196
+ return textsToExtract
197
+ # Step 3.2: Extract context
198
+ def extract_context(text, keyword, window=500):
199
+ # firstly try accession number
200
+ idx = text.find(keyword)
201
+ if idx == -1:
202
+ return "Sample ID not found."
203
+ return text[max(0, idx-window): idx+window]
204
+ def extract_relevant_paragraphs(text, accession, keep_if=None, isolate=None):
205
+ if keep_if is None:
206
+ keep_if = ["sample", "method", "mtdna", "sequence", "collected", "dataset", "supplementary", "table"]
207
+
208
+ outputs = ""
209
+ text = text.lower()
210
+
211
+ # If isolate is provided, prioritize paragraphs that mention it
213
+ if accession and accession.lower() in text:
214
+ if extract_context(text, accession.lower(), window=700) != "Sample ID not found.":
215
+ outputs += extract_context(text, accession.lower(), window=700)
216
+ if isolate and isolate.lower() in text:
217
+ if extract_context(text, isolate.lower(), window=700) != "Sample ID not found.":
218
+ outputs += extract_context(text, isolate.lower(), window=700)
219
+ for keyword in keep_if:
220
+ para = extract_context(text, keyword)
221
+ if para and para not in outputs:
222
+ outputs += para + "\n"
223
+ return outputs
224
+ # Step 4: Classification for now (demo purposes)
225
+ # 4.1: Using a HuggingFace model (question-answering)
226
+ def infer_fromQAModel(context, question="Where is the mtDNA sample from?"):
227
+ try:
228
+ qa = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
229
+ result = qa({"context": context, "question": question})
230
+ return result.get("answer", "Unknown")
231
+ except Exception as e:
232
+ return f"Error: {str(e)}"
233
+
234
+ # 4.2: Infer from haplogroup
235
+ # Load pre-trained spaCy model for NER
236
+ try:
237
+ nlp = spacy.load("en_core_web_sm")
238
+ except OSError:
239
+ download("en_core_web_sm")
240
+ nlp = spacy.load("en_core_web_sm")
241
+
242
+ # Define the haplogroup-to-region mapping (simple rule-based)
243
+ import csv
244
+
245
+ def load_haplogroup_mapping(csv_path):
246
+ mapping = {}
247
+ with open(csv_path) as f:
248
+ reader = csv.DictReader(f)
249
+ for row in reader:
250
+ mapping[row["haplogroup"]] = [row["region"],row["source"]]
251
+ return mapping
252
+
253
+ # Function to extract haplogroup from the text
254
+ def extract_haplogroup(text):
255
+ match = re.search(r'\bhaplogroup\s+([A-Z][0-9a-z]*)\b', text)
256
+ if match:
257
+ submatch = re.match(r'^[A-Z][0-9]*', match.group(1))
258
+ if submatch:
259
+ return submatch.group(0)
260
+ else:
261
+ return match.group(1) # fallback
262
+ fallback = re.search(r'\b([A-Z][0-9a-z]{1,5})\b', text)
263
+ if fallback:
264
+ return fallback.group(1)
265
+ return None
266
+
267
+
268
+ # Function to extract location based on NER
269
+ def extract_location(text):
270
+ doc = nlp(text)
271
+ locations = []
272
+ for ent in doc.ents:
273
+ if ent.label_ == "GPE": # GPE = Geopolitical Entity (location)
274
+ locations.append(ent.text)
275
+ return locations
276
+
277
+ # Function to infer location from haplogroup
278
+ def infer_location_from_haplogroup(haplogroup):
279
+ haplo_map = load_haplogroup_mapping("data/haplogroup_regions_extended.csv")
280
+ return haplo_map.get(haplogroup, ["Unknown","Unknown"])
281
+
282
+ # Function to classify the mtDNA sample
283
+ def classify_mtDNA_sample_from_haplo(text):
284
+ # Extract haplogroup
285
+ haplogroup = extract_haplogroup(text)
286
+ # Extract location based on NER
287
+ locations = extract_location(text)
288
+ # Infer location based on haplogroup
289
+ inferred_location, sourceHaplo = infer_location_from_haplogroup(haplogroup)
290
+ return {
291
+ "source":sourceHaplo,
292
+ "locations_found_in_context": locations,
293
+ "haplogroup": haplogroup,
294
+ "inferred_location": inferred_location
295
+
296
+ }
297
+ # 4.3 Get from available NCBI
298
+ def infer_location_fromNCBI(accession):
299
+ try:
300
+ handle = Entrez.efetch(db="nuccore", id=accession, rettype="medline", retmode="text")
301
+ text = handle.read()
302
+ handle.close()
303
+ match = re.search(r'/(geo_loc_name|country|location)\s*=\s*"([^"]+)"', text)
304
+ if match:
305
+ return match.group(2), match.group(0) # This is the value like "Brunei"
306
+ return "Not found", "Not found"
307
+
308
+ except Exception as e:
309
+ print("❌ Entrez error:", e)
310
+ return "Not found", "Not found"
311
+
312
+ ### ANCIENT/MODERN FLAG
313
+ from Bio import Entrez
314
+ import re
315
+
316
+ def flag_ancient_modern(accession, textsToExtract, isolate=None):
317
+ """
318
+ Try to classify a sample as Ancient or Modern using:
319
+ 1. NCBI accession (if available)
320
+ 2. Supplementary text or context fallback
321
+ """
322
+ context = ""
323
+ label, explain = "", ""
324
+
325
+ try:
326
+ # Check if we can fetch metadata from NCBI using the accession
327
+ handle = Entrez.efetch(db="nuccore", id=accession, rettype="medline", retmode="text")
328
+ text = handle.read()
329
+ handle.close()
330
+
331
+ isolate_source = re.search(r'/(isolation_source)\s*=\s*"([^"]+)"', text)
332
+ if isolate_source:
333
+ context += isolate_source.group(0) + " "
334
+
335
+ specimen = re.search(r'/(specimen|specimen_voucher)\s*=\s*"([^"]+)"', text)
336
+ if specimen:
337
+ context += specimen.group(0) + " "
338
+
339
+ if context.strip():
340
+ label, explain = detect_ancient_flag(context)
341
+ if label!="Unknown":
342
+ return label, explain + " from NCBI\n(" + context + ")"
343
+
344
+ # If no useful NCBI metadata, check supplementary texts
345
+ if textsToExtract:
346
+ labels = {"modern": [0, ""], "ancient": [0, ""], "unknown": 0}
347
+
348
+ for source in textsToExtract:
349
+ text_block = textsToExtract[source]
350
+ context = extract_relevant_paragraphs(text_block, accession, isolate=isolate) # Reduce to informative paragraph(s)
351
+ label, explain = detect_ancient_flag(context)
352
+
353
+ if label == "Ancient":
354
+ labels["ancient"][0] += 1
355
+ labels["ancient"][1] += f"{source}:\n{explain}\n\n"
356
+ elif label == "Modern":
357
+ labels["modern"][0] += 1
358
+ labels["modern"][1] += f"{source}:\n{explain}\n\n"
359
+ else:
360
+ labels["unknown"] += 1
361
+
362
+ if max(labels["modern"][0],labels["ancient"][0]) > 0:
363
+ if labels["modern"][0] > labels["ancient"][0]:
364
+ return "Modern", labels["modern"][1]
365
+ else:
366
+ return "Ancient", labels["ancient"][1]
367
+ else:
368
+ return "Unknown", "No strong keywords detected"
369
+ else:
370
+ print("No DOI or PubMed ID available for inference.")
371
+ return "", ""
372
+
373
+ except Exception as e:
374
+ print("Error:", e)
375
+ return "", ""
376
+
377
+
378
+ def detect_ancient_flag(context_snippet):
379
+ context = context_snippet.lower()
380
+
381
+ ancient_keywords = [
382
+ "ancient", "archaeological", "prehistoric", "neolithic", "mesolithic", "paleolithic",
383
+ "bronze age", "iron age", "burial", "tomb", "skeleton", "14c", "radiocarbon", "carbon dating",
384
+ "postmortem damage", "udg treatment", "adna", "degradation", "site", "excavation",
385
+ "archaeological context", "temporal transect", "population replacement", "cal bp", "calbp", "carbon dated"
386
+ ]
387
+
388
+ modern_keywords = [
389
+ "modern", "hospital", "clinical", "consent","blood","buccal","unrelated", "blood sample","buccal sample","informed consent", "donor", "healthy", "patient",
390
+ "genotyping", "screening", "medical", "cohort", "sequencing facility", "ethics approval",
391
+ "we analysed", "we analyzed", "dataset includes", "new sequences", "published data",
392
+ "control cohort", "sink population", "genbank accession", "sequenced", "pipeline",
393
+ "bioinformatic analysis", "samples from", "population genetics", "genome-wide data"
394
+ ]
395
+
396
+ ancient_hits = [k for k in ancient_keywords if k in context]
397
+ modern_hits = [k for k in modern_keywords if k in context]
398
+
399
+ if ancient_hits and not modern_hits:
400
+ return "Ancient", f"Flagged as ancient due to keywords: {', '.join(ancient_hits)}"
401
+ elif modern_hits and not ancient_hits:
402
+ return "Modern", f"Flagged as modern due to keywords: {', '.join(modern_hits)}"
403
+ elif ancient_hits and modern_hits:
404
+ if len(ancient_hits) >= len(modern_hits):
405
+ return "Ancient", f"Mixed context, leaning ancient due to: {', '.join(ancient_hits)}"
406
+ else:
407
+ return "Modern", f"Mixed context, leaning modern due to: {', '.join(modern_hits)}"
408
+
409
+ # Fallback to QA
410
+ answer = infer_fromQAModel(context, question="Are the mtDNA samples ancient or modern? Explain why.")
411
+ if answer.startswith("Error"):
412
+ return "Unknown", answer
413
+ if "ancient" in answer.lower():
414
+ return "Ancient", f"Leaning ancient based on QA: {answer}"
415
+ elif "modern" in answer.lower():
416
+ return "Modern", f"Leaning modern based on QA: {answer}"
417
+ else:
418
+ return "Unknown", f"No strong keywords or QA clues. QA said: {answer}"
419
+
420
+ # STEP 5: Main pipeline: accession -> 1. get pubmed id and isolate -> 2. get doi -> 3. get text -> 4. prediction -> 5. output: inferred location + explanation + confidence score
421
+ def classify_sample_location(accession):
422
+ outputs = {}
423
+ keyword, context, location, qa_result, haplo_result = "", "", "", "", ""
424
+ # Step 1: get pubmed id and isolate
425
+ pubmedID, isolate = get_info_from_accession(accession)
426
+ '''if not pubmedID:
427
+ return {"error": f"Could not retrieve PubMed ID for accession {accession}"}'''
428
+ if not isolate:
429
+ isolate = "UNKNOWN_ISOLATE"
430
+ # Step 2: get doi
431
+ doi = get_doi_from_pubmed_id(pubmedID)
432
+ '''if not doi:
433
+ return {"error": "DOI not found for this accession. Cannot fetch paper or context."}'''
434
+ # Step 3: get text
435
+ '''textsToExtract = { "doiLink":"paperText"
436
+ "file1.pdf":"text1",
437
+ "file2.doc":"text2",
438
+ "file3.xlsx":excelText3'''
439
+ if doi and pubmedID:
440
+ textsToExtract = get_paper_text(doi,pubmedID)
441
+ else: textsToExtract = {}
442
+ '''if not textsToExtract:
443
+ return {"error": f"No texts extracted for DOI {doi}"}'''
444
+ if isolate not in [None, "UNKNOWN_ISOLATE"]:
445
+ label, explain = flag_ancient_modern(accession,textsToExtract,isolate)
446
+ else:
447
+ label, explain = flag_ancient_modern(accession,textsToExtract)
448
+ # Step 4: prediction
449
+ outputs[accession] = {}
450
+ outputs[isolate] = {}
451
+ # 4.0 Infer from NCBI
452
+ location, outputNCBI = infer_location_fromNCBI(accession)
453
+ NCBI_result = {
454
+ "source": "NCBI",
455
+ "sample_id": accession,
456
+ "predicted_location": location,
457
+ "context_snippet": outputNCBI}
458
+ outputs[accession]["NCBI"]= {"NCBI": NCBI_result}
459
+ if textsToExtract:
460
+ long_text = ""
461
+ for key in textsToExtract:
462
+ text = textsToExtract[key]
463
+ # try accession number first
464
+ outputs[accession][key] = {}
465
+ keyword = accession
466
+ context = extract_context(text, keyword, window=500)
467
+ # 4.1: Using a HuggingFace model (question-answering)
468
+ location = infer_fromQAModel(context, question=f"Where is the mtDNA sample {keyword} from?")
469
+ qa_result = {
470
+ "source": key,
471
+ "sample_id": keyword,
472
+ "predicted_location": location,
473
+ "context_snippet": context
474
+ }
475
+ outputs[keyword][key]["QAModel"] = qa_result
476
+ # 4.2: Infer from haplogroup
477
+ haplo_result = classify_mtDNA_sample_from_haplo(context)
478
+ outputs[keyword][key]["haplogroup"] = haplo_result
479
+ # try isolate
480
+ keyword = isolate
481
+ outputs[isolate][key] = {}
482
+ context = extract_context(text, keyword, window=500)
483
+ # 4.1.1: Using a HuggingFace model (question-answering)
484
+ location = infer_fromQAModel(context, question=f"Where is the mtDNA sample {keyword} from?")
485
+ qa_result = {
486
+ "source": key,
487
+ "sample_id": keyword,
488
+ "predicted_location": location,
489
+ "context_snippet": context
490
+ }
491
+ outputs[keyword][key]["QAModel"] = qa_result
492
+ # 4.2.1: Infer from haplogroup
493
+ haplo_result = classify_mtDNA_sample_from_haplo(context)
494
+ outputs[keyword][key]["haplogroup"] = haplo_result
495
+ # add long text
496
+ long_text += text + ". \n"
497
+ # 4.3: UpgradeClassify
498
+ # try sample_id as accession number
499
+ sample_id = accession
500
+ if sample_id:
501
+ filtered_context = filter_context_for_sample(sample_id.upper(), long_text, window_size=1)
502
+ locations = infer_location_for_sample(sample_id.upper(), filtered_context)
503
+ if locations!="No clear location found in top matches":
504
+ outputs[sample_id]["upgradeClassifier"] = {}
505
+ outputs[sample_id]["upgradeClassifier"]["upgradeClassifier"] = {
506
+ "source": "From these sources combined: "+ ", ".join(list(textsToExtract.keys())),
507
+ "sample_id": sample_id,
508
+ "predicted_location": ", ".join(locations),
509
+ "context_snippep": "First 1000 words: \n"+ filtered_context[:1000]
510
+ }
511
+ # try sample_id as isolate name
512
+ sample_id = isolate
513
+ if sample_id:
514
+ filtered_context = filter_context_for_sample(sample_id.upper(), long_text, window_size=1)
515
+ locations = infer_location_for_sample(sample_id.upper(), filtered_context)
516
+ if locations!="No clear location found in top matches":
517
+ outputs[sample_id]["upgradeClassifier"] = {}
518
+ outputs[sample_id]["upgradeClassifier"]["upgradeClassifier"] = {
519
+ "source": "From these sources combined: "+ ", ".join(list(textsToExtract.keys())),
520
+ "sample_id": sample_id,
521
+ "predicted_location": ", ".join(locations),
522
+ "context_snippep": "First 1000 words: \n"+ filtered_context[:1000]
523
+ }
524
+ return outputs, label, explain
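
Note on the folder handling changed in this commit: get_paper_text now creates its per-accession scratch folder with a shell mkdir and removes it afterwards with rm -r. For anyone who later wants a shell-free equivalent, a minimal sketch using only the standard library is below; with_scratch_folder is a hypothetical helper name, not part of this module:

    from pathlib import Path
    import shutil

    def with_scratch_folder(id_, work):
        """Create data/<id_>, run work(folder), then always clean the folder up."""
        folder = Path("data") / str(id_)
        folder.mkdir(parents=True, exist_ok=True)  # no error if data/ or the folder already exists
        try:
            return work(folder)
        finally:
            shutil.rmtree(folder, ignore_errors=True)  # shell-free equivalent of rm -r data/<id_>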
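As a quick orientation to the updated module, a minimal usage sketch follows. It assumes mtdna_classifier.py is importable together with its NER/, upgradeClassify, and data/ dependencies, and the accession string is only a placeholder, not a value verified against this repository:

    from mtdna_classifier import classify_sample_location

    accession = "YOUR_ACCESSION"  # placeholder; replace with a real GenBank accession
    outputs, label, explain = classify_sample_location(accession)

    # outputs is keyed by sample identifier (accession or isolate), then by source
    # (NCBI record, DOI landing page, supplementary file), then by method
    # ("NCBI", "QAModel", "haplogroup", "upgradeClassifier").
    for sample_id, per_source in outputs.items():
        for source, per_method in per_source.items():
            print(sample_id, source, per_method)

    print("Ancient/Modern flag:", label)
    print("Explanation:", explain)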