VyLala committed on
Commit
538d1c5
·
verified ·
1 Parent(s): bd86ca3

Update mtdna_classifier.py

Browse files
Files changed (1) hide show
  1. mtdna_classifier.py +24 -6
mtdna_classifier.py CHANGED
@@ -40,7 +40,7 @@ nltk.download('punkt_tab')
40
  match2 = re.search(r'isolate\s+([A-Za-z0-9_-]+)', line) # search on current line
41
  if match2:
42
  isolate = match2.group(1)'''
43
- from Bio import Entrez
44
  import re
45
 
46
  Entrez.email = "[email protected]"
@@ -69,11 +69,8 @@ def get_info_from_accession(accession):
69
  except Exception as e:
70
  print("❌ Entrez error:", e)
71
  return "", ""
72
-
73
- # Return the values, even if they are empty strings
74
- return pubmedID, isolate
75
  # Step 2: Get doi link to access the paper
76
- def get_doi_from_pubmed_id(pubmed_id):
77
  cmd = f'{os.environ["HOME"]}/edirect/esummary -db pubmed -id {pubmed_id} -format medline | grep -i "AID"'
78
  result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
79
  output = result.stdout
@@ -84,7 +81,28 @@ def get_doi_from_pubmed_id(pubmed_id):
84
  if match:
85
  return match.group(0)
86
  else:
87
- return None # or raise an Exception with a helpful message
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
 
89
 
90
  # Step 3: Extract Text: Get the paper (html text), sup. materials (pdf, doc, excel) and do text-preprocessing
 
40
  match2 = re.search(r'isolate\s+([A-Za-z0-9_-]+)', line) # search on current line
41
  if match2:
42
  isolate = match2.group(1)'''
43
+ from Bio import Entrez, Medline
44
  import re
45
 
46
  Entrez.email = "[email protected]"
 
69
  except Exception as e:
70
  print("❌ Entrez error:", e)
71
  return "", ""
 
 
 
72
  # Step 2: Get doi link to access the paper
73
+ '''def get_doi_from_pubmed_id(pubmed_id):
74
  cmd = f'{os.environ["HOME"]}/edirect/esummary -db pubmed -id {pubmed_id} -format medline | grep -i "AID"'
75
  result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
76
  output = result.stdout
 
81
  if match:
82
  return match.group(0)
83
  else:
84
+ return None # or raise an Exception with a helpful message'''
85
+
86
def get_doi_from_pubmed_id(pubmed_id):
    """Return the DOI listed for a PubMed article, or None if it cannot be found.

    The article's MEDLINE record is fetched with ``Entrez.efetch`` and parsed
    with ``Medline.parse``. The first "AID" entry tagged ``[doi]`` is used and
    the text before the first space is returned as the DOI.

    Args:
        pubmed_id: PubMed identifier of the article to look up.

    Returns:
        The DOI string, or None when ``pubmed_id`` is empty, no record is
        returned, the record carries no ``[doi]`` article identifier, or the
        lookup fails for any reason (the failure is printed, not raised).
    """
    # An empty ID cannot match a record, so skip the network round-trip.
    # NOTE(review): the accession lookup in this file appears to fall back to
    # an empty PubMed ID on failure -- confirm against the callers.
    if not pubmed_id:
        return None

    try:
        handle = Entrez.efetch(db="pubmed", id=pubmed_id, rettype="medline", retmode="text")
        try:
            # Consume the parser fully before the handle is closed.
            records = list(Medline.parse(handle))
        finally:
            # Release the handle even when parsing raises.
            handle.close()

        if not records:
            return None

        # Article identifiers look like "<value> [<type>]"; keep the DOI value.
        for aid in records[0].get("AID", []):
            if "[doi]" in aid:
                return aid.split(" ")[0]

        return None

    except Exception as e:
        # Best-effort lookup: report the problem and let the caller continue.
        print(f"❌ Failed to get DOI from PubMed ID {pubmed_id}: {e}")
        return None
106
 
107
 
108
  # Step 3: Extract Text: Get the paper (html text), sup. materials (pdf, doc, excel) and do text-preprocessing