VyLala committed on
Commit
219b756
·
verified ·
1 Parent(s): 733de13

Update mtdna_classifier.py

Browse files
Files changed (1) hide show
  1. mtdna_classifier.py +31 -2
mtdna_classifier.py CHANGED
@@ -21,7 +21,7 @@ nltk.download("punkt")
21
  nltk.download('punkt_tab')
22
  # Step 1: Get PubMed ID from Accession using EDirect
23
 
24
- def get_info_from_accession(accession):
25
  cmd = f'{os.environ["HOME"]}/edirect/esummary -db nuccore -id {accession} -format medline | egrep "PUBMED|isolate"'
26
  result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
27
  output = result.stdout
@@ -39,7 +39,36 @@ def get_info_from_accession(accession):
39
  # Try from DEFINITION line: ...isolate XXX...
40
  match2 = re.search(r'isolate\s+([A-Za-z0-9_-]+)', line) # search on current line
41
  if match2:
42
- isolate = match2.group(1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
  # Return the values, even if they are empty strings
45
  return pubmedID, isolate
 
21
  nltk.download('punkt_tab')
22
  # Step 1: Get PubMed ID from Accession using EDirect
23
 
24
+ '''def get_info_from_accession(accession):
25
  cmd = f'{os.environ["HOME"]}/edirect/esummary -db nuccore -id {accession} -format medline | egrep "PUBMED|isolate"'
26
  result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
27
  output = result.stdout
 
39
  # Try from DEFINITION line: ...isolate XXX...
40
  match2 = re.search(r'isolate\s+([A-Za-z0-9_-]+)', line) # search on current line
41
  if match2:
42
+ isolate = match2.group(1)'''
43
+ from Bio import Entrez
44
+ import re
45
+
46
+ Entrez.email = "[email protected]"
47
+
48
def get_info_from_accession(accession):
    """Look up the PubMed ID and isolate name for a nucleotide accession.

    The record is fetched from the NCBI ``nuccore`` database with
    ``Entrez.efetch`` and the returned text is scanned with regular
    expressions.

    Args:
        accession: Identifier passed as ``id`` to ``Entrez.efetch``.

    Returns:
        A ``(pubmed_id, isolate)`` tuple of strings. A field that cannot be
        found is ``""``; both are ``""`` when ``accession`` is empty or the
        fetch fails.
    """
    if not accession:
        # Nothing to look up: skip a request that could only fail.
        return "", ""

    try:
        # NOTE(review): the patterns used for parsing ("PUBMED", /isolate="...")
        # look like GenBank flat-file fields -- confirm that rettype="medline"
        # actually returns them for nuccore records.
        handle = Entrez.efetch(db="nuccore", id=accession, rettype="medline", retmode="text")
        try:
            text = handle.read()
        finally:
            # Release the connection even when read() raises.
            handle.close()
    except Exception as e:
        # Deliberate best-effort boundary: callers receive empty strings
        # instead of an exception when the lookup fails.
        print("❌ Entrez error:", e)
        return "", ""

    if isinstance(text, bytes):
        # The str patterns below cannot be applied to a bytes payload.
        text = text.decode("utf-8", errors="replace")

    pubmed_id, isolate = _parse_accession_record(text)

    if not pubmed_id:
        print(f"⚠️ No PubMed ID found for accession {accession}")

    return pubmed_id, isolate


def _parse_accession_record(text):
    """Extract ``(pubmed_id, isolate)`` from fetched record text; ``""`` for a missing field."""
    pubmed_match = re.search(r'PUBMED\s+(\d+)', text)
    pubmed_id = pubmed_match.group(1) if pubmed_match else ""

    # Prefer the quoted /isolate="..." qualifier; otherwise fall back to a
    # bare "isolate XXX" mention anywhere in the text.
    isolate_match = (
        re.search(r'/isolate="([^"]+)"', text)
        or re.search(r'isolate\s+([A-Za-z0-9_-]+)', text)
    )
    isolate = isolate_match.group(1) if isolate_match else ""

    return pubmed_id, isolate