Spaces:
No application file
No application file
File size: 7,455 Bytes
b7731cd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 |
# Copyright 2018 by Adhemar Zerlotini. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Bio.SearchIO parser for InterProScan XML output formats."""
# for more info: https://github.com/ebi-pf-team/interproscan/wiki/OutputFormats
import re
from xml.etree import ElementTree
from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment
# Lookup tables used while populating the SearchIO model objects. Each one maps
# an XML attribute name to a ``(model attribute name, converter)`` pair.

# attributes of the signature element -> Hit attributes
_ELEM_HIT = {
    "name": ("accession", str),
    "ac": ("id", str),
    "desc": ("description", str),
}

# attributes of a location element -> HSP attributes
_ELEM_HSP = {
    "score": ("bitscore", float),
    "evalue": ("evalue", float),
}

# attributes of a location element -> HSPFragment coordinates
_ELEM_FRAG = {
    "start": ("query_start", int),
    "end": ("query_end", int),
    "hmm-start": ("hit_start", int),
    "hmm-end": ("hit_end", int),
}
class InterproscanXmlParser:
    """Parser for the InterProScan XML format.

    The document is read incrementally with ``ElementTree.iterparse``.
    Iterating over the parser yields one ``QueryResult`` for every ``protein``
    element found in the document.
    """

    def __init__(self, handle):
        """Initialize the class.

        ``handle`` is passed straight to ``ElementTree.iterparse``. The root
        element is consumed immediately to collect the run metadata.
        """
        self.xml_iter = iter(ElementTree.iterparse(handle, events=("start", "end")))
        self._meta = self._parse_header()

    def __iter__(self):
        """Iterate qresults."""
        yield from self._parse_qresult()

    def _parse_header(self):
        """Parse the header for the InterProScan version (PRIVATE).

        Consumes the first event of ``self.xml_iter`` (the start of the root
        element), stores the namespace prefix of the document in ``self.NS``
        and returns the metadata that is later copied onto every qresult.

        Raises ``KeyError`` if the root element has no ``interproscan-version``
        attribute.
        """
        _, root = next(self.xml_iter)
        meta = {
            "target": "InterPro",
            "program": "InterProScan",
            "version": root.attrib["interproscan-version"],
        }
        # ElementTree spells qualified tags as "{uri}local-name". Keep only the
        # "{uri}" prefix instead of deleting a hard-coded root element name, so
        # the namespace is right whatever the root element is called.
        tag = root.tag
        if tag.startswith("{") and "}" in tag:
            self.NS = tag[: tag.index("}") + 1]
        else:
            self.NS = ""
        return meta

    def _parse_qresult(self):
        """Parse query results (PRIVATE).

        Yields one ``QueryResult`` per completed ``protein`` element. The query
        id and description are taken from the first ``xref`` child.
        """
        ns = self.NS
        protein_tag = ns + "protein"
        for event, elem in self.xml_iter:
            if event != "end" or elem.tag != protein_tag:
                continue

            # store the query sequence; it is optional for the hit/hsp parsers
            seq_elem = elem.find(ns + "sequence")
            query_seq = seq_elem.text if seq_elem is not None else None

            # store the query id and description
            xref = elem.find(ns + "xref")
            query_id = xref.attrib["id"]
            query_desc = xref.attrib.get("name")

            # interproscan results contain duplicate hits rather than a single
            # hit with multiple hsps. The hsps of a duplicate hit are appended
            # to the hit seen first; the dict keeps first-seen order.
            hits_by_id = {}
            for hit_new in self._parse_hit(
                elem.find(ns + "matches"), query_id, query_seq
            ):
                if hit_new.id in hits_by_id:
                    hits_by_id[hit_new.id].hsps.extend(hit_new.hsps)
                else:
                    hits_by_id[hit_new.id] = hit_new

            # create qresult and assign attributes
            qresult = QueryResult(list(hits_by_id.values()), query_id)
            if query_desc is not None:
                qresult.description = query_desc
            for key, value in self._meta.items():
                setattr(qresult, key, value)
            yield qresult

            # everything needed was copied out of the element above, so its
            # subtree can be released to keep memory flat on large files
            elem.clear()

    def _parse_hit(self, root_hit_elem, query_id, query_seq=None):
        """Parse hit (PRIVATE).

        ``root_hit_elem`` is the ``matches`` element of a protein, or ``None``
        when the protein has none, in which case nothing is yielded. Yields one
        ``Hit`` per child element.
        """
        if root_hit_elem is None:
            return

        ns = self.NS
        # the namespace is literal text, so escape it before using it in a regex
        match_tag = re.compile(re.escape(ns) + r"(\w+)-match")

        for hit_elem in root_hit_elem:
            # store the match/location type
            hit_type = match_tag.sub(r"\1", hit_elem.tag)
            # store the hit id
            signature = hit_elem.find(ns + "signature")
            hit_id = signature.attrib["ac"]
            # store xrefs and alt_descs
            xrefs = self._parse_xrefs(signature.find(ns + "entry"))
            # parse each hsp
            hsps = list(
                self._parse_hsp(
                    hit_elem.find(ns + "locations"), query_id, hit_id, query_seq
                )
            )

            # create hit and assign attributes
            hit = Hit(hsps, hit_id)
            hit.dbxrefs = xrefs
            for key, (attr, caster) in _ELEM_HIT.items():
                value = signature.attrib.get(key)
                if value is not None:
                    setattr(hit, attr, caster(value))

            # format specific attributes
            hit.attributes["Hit type"] = hit_type
            signature_lib = signature.find(ns + "signature-library-release")
            hit.attributes["Target"] = str(signature_lib.attrib.get("library"))
            hit.attributes["Target version"] = str(signature_lib.attrib.get("version"))
            yield hit

    def _parse_hsp(self, root_hsp_elem, query_id, hit_id, query_seq=None):
        """Parse hsp (PRIVATE).

        ``root_hsp_elem`` is the ``locations`` element of a match, or ``None``
        when the match has none, in which case nothing is yielded. Yields one
        single-fragment ``HSP`` per child element.
        """
        if root_hsp_elem is None:
            return

        for hsp_elem in root_hsp_elem:
            # create frag and assign attributes
            frag = HSPFragment(hit_id, query_id)
            frag.molecule_type = "protein"
            if query_seq is not None:
                frag.query = query_seq

            # reset for every location so that one without coordinates can
            # never inherit the span of the location parsed before it
            query_start = query_end = None
            for key, (attr, caster) in _ELEM_FRAG.items():
                raw = hsp_elem.attrib.get(key)
                if raw is None:
                    continue
                value = caster(raw)
                # start should be 0-based
                if attr.endswith("start"):
                    value -= 1
                # remember query start and end to calculate aln_span
                if attr == "query_start":
                    query_start = value
                elif attr == "query_end":
                    query_end = value
                setattr(frag, attr, value)

            # calculate aln_span and store; needs both query coordinates
            if query_start is not None and query_end is not None:
                frag.aln_span = query_end - query_start

            # create hsp and assign attributes
            hsp = HSP([frag])
            hsp.query_id = query_id
            hsp.hit_id = hit_id
            for key, (attr, caster) in _ELEM_HSP.items():
                value = hsp_elem.attrib.get(key)
                if value is not None:
                    setattr(hsp, attr, caster(value))
            yield hsp

    def _parse_xrefs(self, root_entry_elem):
        """Parse xrefs (PRIVATE).

        ``root_entry_elem`` is the ``entry`` element of a signature, or
        ``None``. Returns a list of ``"DB:ID"`` strings: the entry accession
        prefixed with ``IPR:``, then its ``go-xref`` ids, then its
        ``pathway-xref`` ids. Returns an empty list when there is no entry.
        """
        if root_entry_elem is None:
            return []

        # store entry id
        xrefs = ["IPR:" + root_entry_elem.attrib["ac"]]
        # store go-xrefs and pathway-xrefs ids
        for tag in ("go-xref", "pathway-xref"):
            for entry in root_entry_elem.findall(self.NS + tag):
                xref = entry.attrib["id"]
                # ids that do not already carry a "db:" prefix get one
                if ":" not in xref:
                    xref = entry.attrib["db"] + ":" + xref
                xrefs.append(xref)
        return xrefs
# When executed directly rather than imported, run this module's doctests.
if __name__ == "__main__":
    from Bio import _utils

    _utils.run_doctest()
|