Upload 2 files
- app.py +1 -1
- mtdna_classifier.py +26 -6
app.py
CHANGED
@@ -136,4 +136,4 @@ with gr.Blocks() as interface:
     submit_feedback.click(fn=store_feedback_to_drive, inputs=[accession, q1, q2, contact], outputs=feedback_status)
     reset_button.click(fn=reset_fields, inputs=[], outputs=[accession, q1, q2, contact, feedback_status, output_table, output_summary, status])
 
-interface.launch()
+interface.launch(share=True)
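Note: `share=True` asks Gradio to open a temporary public *.gradio.live tunnel in addition to the local server. A Hugging Face Space already serves the app publicly, so a possible variant (a sketch only, assuming the Spaces-provided SPACE_ID environment variable) is to request the share link only when running outside Spaces:

    import os

    # Sketch, not part of this commit: only ask for a public share link
    # when the app is not already running on Hugging Face Spaces.
    interface.launch(share=os.getenv("SPACE_ID") is None)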
mtdna_classifier.py
CHANGED
@@ -44,13 +44,19 @@ def get_info_from_accession(accession):
     # Return the values, even if they are empty strings
     return pubmedID, isolate
 # Step 2: Get doi link to access the paper
-def get_doi_from_pubmed_id(
-    cmd = f'{os.environ["HOME"]}/edirect/esummary -db pubmed -id {
+def get_doi_from_pubmed_id(pubmed_id):
+    cmd = f'{os.environ["HOME"]}/edirect/esummary -db pubmed -id {pubmed_id} -format medline | grep -i "AID"'
     result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
     output = result.stdout
+
     doi_pattern = r'10\.\d{4,9}/[-._;()/:A-Z0-9]+(?=\s*\[doi\])'
     match = re.search(doi_pattern, output, re.IGNORECASE)
-
+
+    if match:
+        return match.group(0)
+    else:
+        return None  # or raise an Exception with a helpful message
+
 
 # Step 3: Extract Text: Get the paper (html text), sup. materials (pdf, doc, excel) and do text-preprocessing
 # Step 3.1: Extract Text
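The rewritten `get_doi_from_pubmed_id` pipes the esummary MEDLINE record through `grep -i "AID"` and pulls the DOI out with a lookahead on the trailing `[doi]` tag. A minimal sketch of just the regex step, using an invented AID line, shows the intended behaviour:

    import re

    # Invented example of a MEDLINE AID field; real esummary output will differ.
    sample_output = 'AID - 10.1000/example123 [doi]'

    doi_pattern = r'10\.\d{4,9}/[-._;()/:A-Z0-9]+(?=\s*\[doi\])'
    match = re.search(doi_pattern, sample_output, re.IGNORECASE)
    print(match.group(0) if match else None)  # prints: 10.1000/example123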
@@ -110,9 +116,13 @@ def extract_context(text, keyword, window=500):
 # Step 4: Classification for now (demo purposes)
 # 4.1: Using a HuggingFace model (question-answering)
 def infer_location_fromQAModel(context, question="Where is the mtDNA sample from?"):
-
-
-
+    try:
+        qa = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
+        result = qa({"context": context, "question": question})
+        return result.get("answer", "Unknown")
+    except Exception as e:
+        return f"Error: {str(e)}"
+
 # 4.2: Infer from haplogroup
 # Load pre-trained spaCy model for NER
 try:
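The new `infer_location_fromQAModel` body wraps the Hugging Face question-answering pipeline and falls back to an error string on failure. An illustrative call (the context string is made up) could look like the sketch below; note that constructing the pipeline inside the function reloads the model on every call, so heavy users may prefer to cache it at module level:

    from mtdna_classifier import infer_location_fromQAModel

    # Made-up context for illustration only.
    context = ("Complete mitochondrial genomes were sequenced for 25 individuals "
               "sampled in northern Mongolia.")
    print(infer_location_fromQAModel(context))
    # Expected to return a span copied from the context, e.g. "northern Mongolia"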
@@ -197,14 +207,24 @@ def classify_sample_location(accession):
     keyword, context, location, qa_result, haplo_result = "", "", "", "", ""
     # Step 1: get pubmed id and isolate
     pubmedID, isolate = get_info_from_accession(accession)
+    if not pubmedID:
+        return {"error": f"Could not retrieve PubMed ID for accession {accession}"}
+    if not isolate:
+        isolate = "UNKNOWN_ISOLATE"
     # Step 2: get doi
     doi = get_doi_from_pubmed_id(pubmedID)
+    if not doi:
+        return {"error": "DOI not found for this accession. Cannot fetch paper or context."}
+
     # Step 3: get text
     '''textsToExtract = { "doiLink":"paperText"
                           "file1.pdf":"text1",
                           "file2.doc":"text2",
                           "file3.xlsx":excelText3'''
     textsToExtract = get_paper_text(doi,pubmedID)
+    if not textsToExtract:
+        return {"error": f"No texts extracted for DOI {doi}"}
+
     # Step 4: prediction
     outputs[accession] = {}
     outputs[isolate] = {}
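Because `classify_sample_location` can now return an early error dictionary instead of the usual outputs, a caller might guard on the "error" key; the accession below is only a placeholder:

    from mtdna_classifier import classify_sample_location

    result = classify_sample_location("AB123456")  # placeholder accession
    if isinstance(result, dict) and "error" in result:
        print(result["error"])  # e.g. "DOI not found for this accession. ..."
    else:
        pass  # continue with the normal per-accession / per-isolate outputs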