VyLala committed on
Commit
e9dc740
·
verified ·
1 Parent(s): d8e40f0

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +1 -1
  2. mtdna_classifier.py +26 -6
app.py CHANGED
@@ -136,4 +136,4 @@ with gr.Blocks() as interface:
136
  submit_feedback.click(fn=store_feedback_to_drive, inputs=[accession, q1, q2, contact], outputs=feedback_status)
137
  reset_button.click(fn=reset_fields, inputs=[], outputs=[accession, q1, q2, contact, feedback_status, output_table, output_summary, status])
138
 
139
- interface.launch()
 
136
  submit_feedback.click(fn=store_feedback_to_drive, inputs=[accession, q1, q2, contact], outputs=feedback_status)
137
  reset_button.click(fn=reset_fields, inputs=[], outputs=[accession, q1, q2, contact, feedback_status, output_table, output_summary, status])
138
 
139
+ interface.launch(share=True)
mtdna_classifier.py CHANGED
@@ -44,13 +44,19 @@ def get_info_from_accession(accession):
44
  # Return the values, even if they are empty strings
45
  return pubmedID, isolate
46
  # Step 2: Get doi link to access the paper
47
- def get_doi_from_pubmed_id(id):
48
- cmd = f'{os.environ["HOME"]}/edirect/esummary -db pubmed -id {id} -format medline | grep -i "AID"'
49
  result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
50
  output = result.stdout
 
51
  doi_pattern = r'10\.\d{4,9}/[-._;()/:A-Z0-9]+(?=\s*\[doi\])'
52
  match = re.search(doi_pattern, output, re.IGNORECASE)
53
- return match.group(0)
 
 
 
 
 
54
 
55
  # Step 3: Extract Text: Get the paper (html text), sup. materials (pdf, doc, excel) and do text-preprocessing
56
  # Step 3.1: Extract Text
@@ -110,9 +116,13 @@ def extract_context(text, keyword, window=500):
110
  # Step 4: Classification for now (demo purposes)
111
  # 4.1: Using a HuggingFace model (question-answering)
112
  def infer_location_fromQAModel(context, question="Where is the mtDNA sample from?"):
113
- qa = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
114
- result = qa({"context": context, "question": question})
115
- return result["answer"]
 
 
 
 
116
  # 4.2: Infer from haplogroup
117
  # Load pre-trained spaCy model for NER
118
  try:
@@ -197,14 +207,24 @@ def classify_sample_location(accession):
197
  keyword, context, location, qa_result, haplo_result = "", "", "", "", ""
198
  # Step 1: get pubmed id and isolate
199
  pubmedID, isolate = get_info_from_accession(accession)
 
 
 
 
200
  # Step 2: get doi
201
  doi = get_doi_from_pubmed_id(pubmedID)
 
 
 
202
  # Step 3: get text
203
  '''textsToExtract = { "doiLink":"paperText"
204
  "file1.pdf":"text1",
205
  "file2.doc":"text2",
206
  "file3.xlsx":excelText3'''
207
  textsToExtract = get_paper_text(doi,pubmedID)
 
 
 
208
  # Step 4: prediction
209
  outputs[accession] = {}
210
  outputs[isolate] = {}
 
44
  # Return the values, even if they are empty strings
45
  return pubmedID, isolate
46
  # Step 2: Get doi link to access the paper
47
+ def get_doi_from_pubmed_id(pubmed_id):
48
+ cmd = f'{os.environ["HOME"]}/edirect/esummary -db pubmed -id {pubmed_id} -format medline | grep -i "AID"'
49
  result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
50
  output = result.stdout
51
+
52
  doi_pattern = r'10\.\d{4,9}/[-._;()/:A-Z0-9]+(?=\s*\[doi\])'
53
  match = re.search(doi_pattern, output, re.IGNORECASE)
54
+
55
+ if match:
56
+ return match.group(0)
57
+ else:
58
+ return None # or raise an Exception with a helpful message
59
+
60
 
61
  # Step 3: Extract Text: Get the paper (html text), sup. materials (pdf, doc, excel) and do text-preprocessing
62
  # Step 3.1: Extract Text
 
116
  # Step 4: Classification for now (demo purposes)
117
  # 4.1: Using a HuggingFace model (question-answering)
118
  def infer_location_fromQAModel(context, question="Where is the mtDNA sample from?"):
119
+ try:
120
+ qa = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
121
+ result = qa({"context": context, "question": question})
122
+ return result.get("answer", "Unknown")
123
+ except Exception as e:
124
+ return f"Error: {str(e)}"
125
+
126
  # 4.2: Infer from haplogroup
127
  # Load pre-trained spaCy model for NER
128
  try:
 
207
  keyword, context, location, qa_result, haplo_result = "", "", "", "", ""
208
  # Step 1: get pubmed id and isolate
209
  pubmedID, isolate = get_info_from_accession(accession)
210
+ if not pubmedID:
211
+ return {"error": f"Could not retrieve PubMed ID for accession {accession}"}
212
+ if not isolate:
213
+ isolate = "UNKNOWN_ISOLATE"
214
  # Step 2: get doi
215
  doi = get_doi_from_pubmed_id(pubmedID)
216
+ if not doi:
217
+ return {"error": "DOI not found for this accession. Cannot fetch paper or context."}
218
+
219
  # Step 3: get text
220
  '''textsToExtract = { "doiLink":"paperText"
221
  "file1.pdf":"text1",
222
  "file2.doc":"text2",
223
  "file3.xlsx":excelText3'''
224
  textsToExtract = get_paper_text(doi,pubmedID)
225
+ if not textsToExtract:
226
+ return {"error": f"No texts extracted for DOI {doi}"}
227
+
228
  # Step 4: prediction
229
  outputs[accession] = {}
230
  outputs[isolate] = {}