Spaces:
Running
Running
Update mtdna_classifier.py
Browse files- mtdna_classifier.py +31 -2
mtdna_classifier.py
CHANGED
@@ -21,7 +21,7 @@ nltk.download("punkt")
|
|
21 |
nltk.download('punkt_tab')
|
22 |
# Step 1: Get PubMed ID from Accession using EDirect
|
23 |
|
24 |
-
def get_info_from_accession(accession):
|
25 |
cmd = f'{os.environ["HOME"]}/edirect/esummary -db nuccore -id {accession} -format medline | egrep "PUBMED|isolate"'
|
26 |
result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
|
27 |
output = result.stdout
|
@@ -39,7 +39,36 @@ def get_info_from_accession(accession):
|
|
39 |
# Try from DEFINITION line: ...isolate XXX...
|
40 |
match2 = re.search(r'isolate\s+([A-Za-z0-9_-]+)', line) # search on current line
|
41 |
if match2:
|
42 |
-
isolate = match2.group(1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
|
44 |
# Return the values, even if they are empty strings
|
45 |
return pubmedID, isolate
|
|
|
21 |
nltk.download('punkt_tab')
|
22 |
# Step 1: Get PubMed ID from Accession using EDirect
|
23 |
|
24 |
+
'''def get_info_from_accession(accession):
|
25 |
cmd = f'{os.environ["HOME"]}/edirect/esummary -db nuccore -id {accession} -format medline | egrep "PUBMED|isolate"'
|
26 |
result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
|
27 |
output = result.stdout
|
|
|
39 |
# Try from DEFINITION line: ...isolate XXX...
|
40 |
match2 = re.search(r'isolate\s+([A-Za-z0-9_-]+)', line) # search on current line
|
41 |
if match2:
|
42 |
+
isolate = match2.group(1)'''
|
43 |
+
from Bio import Entrez
|
44 |
+
import re
|
45 |
+
|
46 |
+
# Contact address attached to the Entrez requests made below.
# NOTE(review): the value looks like it was redacted/obfuscated when this page
# was captured — confirm the real address in the repository before relying on it.
Entrez.email = "[email protected]"
|
47 |
+
|
48 |
+
def get_info_from_accession(accession):
    """Look up the PubMed ID and isolate name for a nucleotide accession.

    Fetches the record for *accession* from the ``nuccore`` database with
    ``Entrez.efetch`` and scans the returned text with regular expressions.

    Args:
        accession: Accession identifier, passed to ``Entrez.efetch`` as ``id``.

    Returns:
        A ``(pubmed_id, isolate)`` tuple of strings. Each element is ``""``
        when the corresponding field is not found in the record, and
        ``("", "")`` is returned when fetching or parsing raises.
    """
    try:
        handle = Entrez.efetch(db="nuccore", id=accession, rettype="medline", retmode="text")
        try:
            text = handle.read()
        finally:
            # Release the connection even if read() fails part-way through.
            handle.close()

        # NOTE(review): the patterns below look for GenBank-style fields
        # ("PUBMED <digits>", /isolate="...") — confirm that rettype="medline"
        # actually yields that layout for nuccore records.
        pubmed_match = re.search(r'PUBMED\s+(\d+)', text)
        pubmed_id = pubmed_match.group(1) if pubmed_match else ""

        # Prefer the quoted /isolate="..." qualifier; fall back to a bare
        # "isolate XXX" mention anywhere in the record text.
        isolate_match = (
            re.search(r'/isolate="([^"]+)"', text)
            or re.search(r'isolate\s+([A-Za-z0-9_-]+)', text)
        )
        isolate = isolate_match.group(1) if isolate_match else ""

        if not pubmed_id:
            print(f"⚠️ No PubMed ID found for accession {accession}")

        return pubmed_id, isolate

    except Exception as e:
        # Best-effort lookup: report the failure and hand back empty values
        # so the caller can carry on without a PubMed ID or isolate.
        print("❌ Entrez error:", e)
        return "", ""
|