VyLala committed
Commit 6110a0d · verified · 1 Parent(s): 743194b

Update mtdna_classifier.py

Files changed (1)
  1. mtdna_classifier.py +273 -272
mtdna_classifier.py CHANGED
@@ -1,273 +1,274 @@
# mtDNA Location Classifier MVP (Google Colab)
# Accepts an accession number → fetches the PubMed ID + isolate name → gets the abstract → predicts the location
import os
import subprocess
import re
from Bio import Entrez
import fitz
import spacy
from spacy.cli import download
from NER.PDF import pdf
from NER.WordDoc import wordDoc
from NER.html import extractHTML
from NER.word2Vec import word2vec
from transformers import pipeline
# Set your email (required by NCBI Entrez)
#Entrez.email = "[email protected]"
import nltk

nltk.download("stopwords")
nltk.download("punkt")
nltk.download('punkt_tab')

# Step 1: Get the PubMed ID from the accession using EDirect
def get_info_from_accession(accession):
    cmd = f'{os.environ["HOME"]}/edirect/esummary -db nuccore -id {accession} -format medline | egrep "PUBMED|isolate"'
    result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    output = result.stdout
    pubmedID, isolate = "", ""
    for line in output.split("\n"):
        if len(line) > 0:
            if "PUBMED" in line:
                pubmedID = line.split()[-1]
            if "isolate" in line:  # check for isolate information
                # Try the direct GenBank annotation: /isolate="XXX"
                match1 = re.search(r'/isolate\s*=\s*"([^"]+)"', line)  # search the current line
                if match1:
                    isolate = match1.group(1)
                else:
                    # Try the DEFINITION line: ...isolate XXX...
                    match2 = re.search(r'isolate\s+([A-Za-z0-9_-]+)', line)  # search the current line
                    if match2:
                        isolate = match2.group(1)

    # Return the values, even if they are empty strings
    return pubmedID, isolate
+ print(get_info_from_accession("KU131308"))  # smoke test: print the PubMed ID and isolate for a sample accession
# Step 2: Get the DOI link to access the paper
def get_doi_from_pubmed_id(pubmed_id):
    cmd = f'{os.environ["HOME"]}/edirect/esummary -db pubmed -id {pubmed_id} -format medline | grep -i "AID"'
    result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    output = result.stdout

    doi_pattern = r'10\.\d{4,9}/[-._;()/:A-Z0-9]+(?=\s*\[doi\])'
    match = re.search(doi_pattern, output, re.IGNORECASE)

    if match:
        return match.group(0)
    else:
        return None  # or raise an exception with a helpful message
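# Example (illustrative, not from a real record): an AID field such as
#   AID - 10.1000/xyz123 [doi]
# matches doi_pattern above and returns "10.1000/xyz123".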


# Step 3: Extract text: get the paper (HTML) and its supplementary materials (PDF, DOC, Excel), then preprocess the text
# Step 3.1: Extract text
def get_paper_text(doi, id):
    # create a temporary folder to hold the texts
    cmd = f'mkdir -p data/{id}'
    result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    saveLinkFolder = "data/" + id

    link = 'https://doi.org/' + doi
    '''textsToExtract = { "doiLink": "paperText",
                          "file1.pdf": "text1",
                          "file2.doc": "text2",
                          "file3.xlsx": excelText3 }'''
    textsToExtract = {}
    # fetch the landing page to build the list of files for this id
    html = extractHTML.HTML("", link)
    jsonSM = html.getSupMaterial()
    text = ""
    links = [link] + sum((jsonSM[key] for key in jsonSM), [])
    #print(links)
    for l in links:
        # the main paper
        if l == link:
            text = html.getListSection()
            textsToExtract[link] = text
        elif l.endswith(".pdf"):
            p = pdf.PDF(l, saveLinkFolder, doi)
            f = p.openPDFFile()
            pdf_path = saveLinkFolder + "/" + l.split("/")[-1]
            doc = fitz.open(pdf_path)
            text = "\n".join([page.get_text() for page in doc])
            textsToExtract[l] = text
        elif l.endswith(".doc") or l.endswith(".docx"):
            d = wordDoc.wordDoc(l, saveLinkFolder)
            text = d.extractTextByPage()
            textsToExtract[l] = text
        elif l.split(".")[-1].lower() in ("xls", "xlsx"):
            wc = word2vec.word2Vec()
            corpus = wc.tableTransformToCorpusText([], l)
            text = ''
            for c in corpus:
                para = corpus[c]
                for words in para:
                    text += " ".join(words)
            textsToExtract[l] = text
    # delete the folder once the texts have been extracted
    cmd = f'rm -r data/{id}'
    result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    return textsToExtract

# Step 3.2: Extract the context around a keyword
def extract_context(text, keyword, window=500):
    idx = text.find(keyword)
    if idx == -1:
        return "Sample ID not found."
    return text[max(0, idx - window): idx + window]
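# Example (illustrative): with window=500 and the keyword found at index 1200,
# this returns text[700:1700], i.e. up to 500 characters on either side of the match.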

# Step 4: Classification (demo purposes for now)
# 4.1: Use a Hugging Face question-answering model
def infer_location_fromQAModel(context, question="Where is the mtDNA sample from?"):
    try:
        qa = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
        result = qa({"context": context, "question": question})
        return result.get("answer", "Unknown")
    except Exception as e:
        return f"Error: {str(e)}"
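# Note: pipeline(...) reloads the model on every call, which is slow inside the main
# loop below. A minimal caching sketch (an illustration, not part of the original
# pipeline) would build it once and reuse it:
#
#     _qa_pipeline = None
#     def get_qa_pipeline():
#         global _qa_pipeline
#         if _qa_pipeline is None:
#             _qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
#         return _qa_pipeline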

# 4.2: Infer from haplogroup
# Load the pre-trained spaCy model for NER, downloading it on first use
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

# Define the haplogroup-to-region mapping (simple rule-based)
import csv

def load_haplogroup_mapping(csv_path):
    mapping = {}
    with open(csv_path) as f:
        reader = csv.DictReader(f)
        for row in reader:
            mapping[row["haplogroup"]] = [row["region"], row["source"]]
    return mapping
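# The CSV is expected to have a header row with "haplogroup", "region" and "source"
# columns, e.g. (illustrative values):
#   haplogroup,region,source
#   H,Western Eurasia,example-reference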

# Extract the haplogroup from the text
def extract_haplogroup(text):
    match = re.search(r'\bhaplogroup\s+([A-Z][0-9a-z]*)\b', text)
    if match:
        submatch = re.match(r'^[A-Z][0-9]*', match.group(1))
        if submatch:
            return submatch.group(0)
        else:
            return match.group(1)  # fallback
    fallback = re.search(r'\b([A-Z][0-9a-z]{1,5})\b', text)
    if fallback:
        return fallback.group(1)
    return None
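# Example (illustrative): "belongs to haplogroup B4a1a" matches the first regex as
# "B4a1a", and the trimming match reduces it to "B4". Note the bare fallback regex
# accepts any short capitalized token, so it can misfire on text that never
# mentions a haplogroup.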


# Extract locations with NER
def extract_location(text):
    doc = nlp(text)
    locations = []
    for ent in doc.ents:
        if ent.label_ == "GPE":  # GPE = geopolitical entity (location)
            locations.append(ent.text)
    return locations

# Infer a location from the haplogroup
def infer_location_from_haplogroup(haplogroup):
    haplo_map = load_haplogroup_mapping("data/haplogroup_regions_extended.csv")
    return haplo_map.get(haplogroup, ["Unknown", "Unknown"])

# Classify the mtDNA sample
def classify_mtDNA_sample_from_haplo(text):
    # extract the haplogroup
    haplogroup = extract_haplogroup(text)
    # extract locations with NER
    locations = extract_location(text)
    # infer a location from the haplogroup
    inferred_location, sourceHaplo = infer_location_from_haplogroup(haplogroup)
    return {
        "source": sourceHaplo,
        "locations_found_in_context": locations,
        "haplogroup": haplogroup,
        "inferred_location": inferred_location
    }

# 4.3: Use the location metadata NCBI already provides
def infer_location_fromNCBI(accession):
    cmd = f'{os.environ["HOME"]}/edirect/esummary -db nuccore -id {accession} -format medline | egrep "location|country|geo"'
    result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    output, location = "", ""
    output = result.stdout
    if "location" in output or "country" in output or "geo" in output:
        location = output.split('"')[1]
        output = output.split()[0]
    else:
        location = "Unknown"
        output = "No location information found in NCBI."
    return location, output
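# Example (illustrative): a record line such as /geo_loc_name="Viet Nam: Hanoi"
# passes the egrep filter, and the split on '"' extracts "Viet Nam: Hanoi".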

# STEP 5: Main pipeline: accession -> 1. get the PubMed ID and isolate -> 2. get the DOI -> 3. get the text -> 4. predict -> 5. output: inferred location + explanation + confidence score
def classify_sample_location(accession):
    outputs = {}
    keyword, context, location, qa_result, haplo_result = "", "", "", "", ""
    # Step 1: get the PubMed ID and isolate
    pubmedID, isolate = get_info_from_accession(accession)
    if not pubmedID:
        return {"error": f"Could not retrieve PubMed ID for accession {accession}"}
    if not isolate:
        isolate = "UNKNOWN_ISOLATE"
    # Step 2: get the DOI
    doi = get_doi_from_pubmed_id(pubmedID)
    if not doi:
        return {"error": "DOI not found for this accession. Cannot fetch paper or context."}

    # Step 3: get the texts
    '''textsToExtract = { "doiLink": "paperText",
                          "file1.pdf": "text1",
                          "file2.doc": "text2",
                          "file3.xlsx": excelText3 }'''
    textsToExtract = get_paper_text(doi, pubmedID)
    if not textsToExtract:
        return {"error": f"No texts extracted for DOI {doi}"}

    # Step 4: prediction
    outputs[accession] = {}
    outputs[isolate] = {}
    # 4.0: infer from NCBI metadata
    location, outputNCBI = infer_location_fromNCBI(accession)
    NCBI_result = {
        "source": "NCBI",
        "sample_id": accession,
        "predicted_location": location,
        "context_snippet": outputNCBI}
    outputs[accession]["NCBI"] = {"NCBI": NCBI_result}
    for key in textsToExtract:
        text = textsToExtract[key]
        # try the accession number first
        outputs[accession][key] = {}
        keyword = accession
        context = extract_context(text, keyword, window=500)
        # 4.1: question-answering model
        location = infer_location_fromQAModel(context, question=f"Where is the mtDNA sample {keyword} from?")
        qa_result = {
            "source": key,
            "sample_id": keyword,
            "predicted_location": location,
            "context_snippet": context
        }
        outputs[keyword][key]["QAModel"] = qa_result
        # 4.2: infer from haplogroup
        haplo_result = classify_mtDNA_sample_from_haplo(context)
        outputs[keyword][key]["haplogroup"] = haplo_result
        # then try the isolate name
        keyword = isolate
        outputs[isolate][key] = {}
        context = extract_context(text, keyword, window=500)
        # 4.1.1: question-answering model
        location = infer_location_fromQAModel(context, question=f"Where is the mtDNA sample {keyword} from?")
        qa_result = {
            "source": key,
            "sample_id": keyword,
            "predicted_location": location,
            "context_snippet": context
        }
        outputs[keyword][key]["QAModel"] = qa_result
        # 4.2.1: infer from haplogroup
        haplo_result = classify_mtDNA_sample_from_haplo(context)
        outputs[keyword][key]["haplogroup"] = haplo_result
    return outputs
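
A minimal usage sketch (assuming EDirect is installed under $HOME/edirect, the NER helper modules are importable, and the accession resolves to a paper with an accessible DOI):

    results = classify_sample_location("KU131308")
    if "error" in results:
        print(results["error"])
    else:
        for sample_id, sources in results.items():
            for source, methods in sources.items():
                print(sample_id, source, methods)

The output is keyed first by sample identifier (accession or isolate), then by text source (the DOI link, a supplementary file, or "NCBI"), and finally by method ("QAModel", "haplogroup", or "NCBI").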