Spaces:

VyLala
/

mtDNALocation

Running

App Files Files Community

VyLala committed on Apr 13

Commit

219b756

verified ·

1 Parent(s): 733de13

Update mtdna_classifier.py

Browse files

Files changed (1) hide show

mtdna_classifier.py +31 -2

mtdna_classifier.py CHANGED Viewed

@@ -21,7 +21,7 @@ nltk.download("punkt")
 nltk.download('punkt_tab')
 # Step 1: Get PubMed ID from Accession using EDirect
-def get_info_from_accession(accession):
     cmd = f'{os.environ["HOME"]}/edirect/esummary -db nuccore -id {accession} -format medline | egrep "PUBMED|isolate"'
     result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
     output = result.stdout
@@ -39,7 +39,36 @@ def get_info_from_accession(accession):
             # Try from DEFINITION line: ...isolate XXX...
             match2 = re.search(r'isolate\s+([A-Za-z0-9_-]+)', line) # search on current line
             if match2:
-              isolate = match2.group(1)
     # Return the values, even if they are empty strings
     return pubmedID, isolate

 nltk.download('punkt_tab')
 # Step 1: Get PubMed ID from Accession using EDirect
+'''def get_info_from_accession(accession):
     cmd = f'{os.environ["HOME"]}/edirect/esummary -db nuccore -id {accession} -format medline | egrep "PUBMED|isolate"'
     result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
     output = result.stdout
             # Try from DEFINITION line: ...isolate XXX...
             match2 = re.search(r'isolate\s+([A-Za-z0-9_-]+)', line) # search on current line
             if match2:
+              isolate = match2.group(1)'''
+from Bio import Entrez
+import re
+Entrez.email = "[email protected]"
def get_info_from_accession(accession):
    """Look up the PubMed ID and isolate name for a nucleotide accession.

    The record is requested from the NCBI ``nuccore`` database through
    ``Entrez.efetch`` and the returned text is scanned with regular
    expressions.

    Args:
        accession: Accession identifier forwarded to Entrez as ``id``.

    Returns:
        A ``(pubmed_id, isolate)`` tuple of strings. Either element is ``""``
        when it is not present in the record; both are ``""`` when
        ``accession`` is empty or the Entrez request/parsing fails.
    """
    if not accession:
        # Nothing to look up, so avoid a pointless network round-trip.
        return "", ""

    try:
        # NOTE(review): the patterns below target GenBank flat-file fields
        # (a ``PUBMED`` reference line and the ``/isolate="..."`` qualifier);
        # confirm that rettype="medline" yields that layout for nuccore.
        handle = Entrez.efetch(db="nuccore", id=accession, rettype="medline", retmode="text")
        try:
            text = handle.read()
        finally:
            # Release the connection even when reading the response fails.
            handle.close()

        # Extract the PubMed ID from the record text.
        pubmed_match = re.search(r'PUBMED\s+(\d+)', text)
        pubmed_id = pubmed_match.group(1) if pubmed_match else ""

        # Prefer the quoted /isolate="..." qualifier; otherwise fall back to a
        # bare "isolate XXX" mention (e.g. in a definition line).
        isolate_match = re.search(r'/isolate="([^"]+)"', text)
        if not isolate_match:
            isolate_match = re.search(r'isolate\s+([A-Za-z0-9_-]+)', text)
        isolate = isolate_match.group(1) if isolate_match else ""

        if not pubmed_id:
            print(f"⚠️ No PubMed ID found for accession {accession}")
        return pubmed_id, isolate
    except Exception as e:
        # Deliberate best-effort boundary: report the failure and hand back
        # empty values so callers can continue without a PubMed ID.
        print("❌ Entrez error:", e)
        return "", ""