File size: 7,455 Bytes
b7731cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
# Copyright 2018 by Adhemar Zerlotini. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.

"""Bio.SearchIO parser for InterProScan XML output formats."""
# for more info: https://github.com/ebi-pf-team/interproscan/wiki/OutputFormats

import re
from xml.etree import ElementTree

from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment


# Lookup tables used while parsing. Each key is the name of an XML attribute
# and each value is a pair: (name of the SearchIO object attribute to set,
# callable used to cast the raw XML string).

# XML attribute -> Hit attribute
_ELEM_HIT = {
    "name": ("accession", str),
    "ac": ("id", str),
    "desc": ("description", str),
}

# XML attribute -> HSP attribute
_ELEM_HSP = {
    "score": ("bitscore", float),
    "evalue": ("evalue", float),
}

# XML attribute -> HSPFragment coordinate attribute
_ELEM_FRAG = {
    "start": ("query_start", int),
    "end": ("query_end", int),
    "hmm-start": ("hit_start", int),
    "hmm-end": ("hit_end", int),
}


class InterproscanXmlParser:
    """Parser for the InterProScan XML format.

    Iterating over an instance yields one QueryResult object for every
    ``protein`` element read from the XML handle.
    """

    def __init__(self, handle):
        """Initialize the class.

        The root element is read right away so that the run metadata and the
        XML namespace are known before any query is parsed.

        :param handle: source passed on to ``ElementTree.iterparse``
        """
        self.xml_iter = iter(ElementTree.iterparse(handle, events=("start", "end")))
        self._meta = self._parse_header()

    def __iter__(self):
        """Iterate qresults."""
        yield from self._parse_qresult()

    def _parse_header(self):
        """Parse the header for the InterProScan version (PRIVATE).

        Consumes the first event of the XML iterator (the start of the root
        element), stores the namespace prefix in ``self.NS`` and returns a
        dictionary with the ``target``, ``program`` and ``version`` values
        that are later copied onto every QueryResult.

        Raises KeyError if the root element has no ``interproscan-version``
        attribute.
        """
        event, elem = next(self.xml_iter)
        meta = {
            "target": "InterPro",
            "program": "InterProScan",
            "version": elem.attrib["interproscan-version"],
        }
        # store the namespace value: whatever is left of the root tag once
        # the local name "protein-matches" has been removed
        self.NS = re.sub("protein-matches", "", elem.tag)
        return meta

    def _parse_qresult(self):
        """Parse query results (PRIVATE).

        Yields one QueryResult per completed ``protein`` element. The
        element is cleared once its QueryResult has been built so that
        already-parsed proteins do not accumulate in memory.
        """
        protein_tag = self.NS + "protein"
        for event, elem in self.xml_iter:
            # a protein is only complete once its closing tag has been read
            if event != "end" or elem.tag != protein_tag:
                continue

            # store the query sequence
            query_seq = elem.find(self.NS + "sequence").text

            # store the query id and description
            xref = elem.find(self.NS + "xref")
            query_id = xref.attrib["id"]
            query_desc = xref.attrib["name"]

            # interproscan results contain duplicate hits rather than
            # a single hit with multiple hsps. In this case the hsps
            # of a duplicate hit are appended to the already existing
            # hit. The dict keeps the first-seen order of the hit ids.
            hits_by_id = {}
            for hit_new in self._parse_hit(
                elem.find(self.NS + "matches"), query_id, query_seq
            ):
                hit = hits_by_id.setdefault(hit_new.id, hit_new)
                if hit is not hit_new:
                    for hsp in hit_new.hsps:
                        hit.hsps.append(hsp)

            # create qresult and assign attributes
            qresult = QueryResult(list(hits_by_id.values()), query_id)
            qresult.description = query_desc
            for key, value in self._meta.items():
                setattr(qresult, key, value)

            # everything needed has been copied out of the element as plain
            # strings and numbers, so its subtree can be released
            elem.clear()
            yield qresult

    def _parse_hit_type(self, tag):
        """Return the match type encoded in a match element tag (PRIVATE).

        For a tag of the form ``<namespace><type>-match`` the ``<type>`` part
        is returned; a tag that does not follow this form is returned
        unchanged. The namespace is escaped because it is arbitrary text
        (typically a URI) and must be matched literally.
        """
        return re.sub(r"%s(\w+)-match" % re.escape(self.NS), r"\1", tag)

    def _parse_hit(self, root_hit_elem, query_id, query_seq=None):
        """Parse hit (PRIVATE).

        :param root_hit_elem: element whose children are the match elements,
            or None, in which case nothing is yielded
        :param query_id: id of the query the hits belong to
        :param query_seq: query sequence handed on to the fragments

        Yields one Hit per child element, duplicates included.
        """
        if root_hit_elem is None:
            return

        for hit_elem in root_hit_elem:
            # store the match/location type
            hit_type = self._parse_hit_type(hit_elem.tag)
            # store the hit id
            signature = hit_elem.find(self.NS + "signature")
            hit_id = signature.attrib["ac"]

            # store xrefs and alt_descs
            xrefs = self._parse_xrefs(signature.find(self.NS + "entry"))

            # parse each hsp
            hsps = list(
                self._parse_hsp(
                    hit_elem.find(self.NS + "locations"), query_id, hit_id, query_seq
                )
            )

            # create hit and assign attributes
            hit = Hit(hsps, hit_id)
            hit.dbxrefs = xrefs
            for key, (attr, caster) in _ELEM_HIT.items():
                value = signature.attrib.get(key)
                if value is not None:
                    setattr(hit, attr, caster(value))

            # format specific attributes
            hit.attributes["Hit type"] = hit_type
            signature_lib = signature.find(self.NS + "signature-library-release")
            # a missing element is handled like a missing attribute: the
            # stored value is the string "None"
            lib_attrib = {} if signature_lib is None else signature_lib.attrib
            hit.attributes["Target"] = str(lib_attrib.get("library"))
            hit.attributes["Target version"] = str(lib_attrib.get("version"))

            yield hit

    def _parse_hsp(self, root_hsp_elem, query_id, hit_id, query_seq=None):
        """Parse hsp (PRIVATE).

        :param root_hsp_elem: element whose children are the location
            elements, or None, in which case nothing is yielded
        :param query_id: id of the query
        :param hit_id: id of the hit the locations belong to
        :param query_seq: query sequence stored on each fragment when given

        Yields one HSP, holding a single HSPFragment, per child element.
        """
        if root_hsp_elem is None:
            return

        for hsp_elem in root_hsp_elem:
            # create frag and assign attributes
            frag = HSPFragment(hit_id, query_id)
            frag.molecule_type = "protein"
            if query_seq is not None:
                frag.query = query_seq

            # coordinates found on THIS element only; collecting them per
            # element prevents values of a previous location from leaking
            # into the aln_span of the current one
            coords = {}
            for key, (attr, caster) in _ELEM_FRAG.items():
                raw = hsp_elem.attrib.get(key)
                if raw is None:
                    continue
                value = caster(raw)
                # start should be 0-based
                if attr.endswith("start"):
                    value -= 1
                coords[attr] = value
                setattr(frag, attr, value)

            # calculate aln_span and store, only when both ends are known
            if "query_start" in coords and "query_end" in coords:
                frag.aln_span = coords["query_end"] - coords["query_start"]

            # create hsp and assign attributes
            hsp = HSP([frag])
            hsp.query_id = query_id
            hsp.hit_id = hit_id
            for key, (attr, caster) in _ELEM_HSP.items():
                value = hsp_elem.attrib.get(key)
                if value is not None:
                    setattr(hsp, attr, caster(value))
            yield hsp

    def _parse_xrefs(self, root_entry_elem):
        """Parse xrefs (PRIVATE).

        :param root_entry_elem: the ``entry`` element of a signature, or None

        Returns a list of cross-reference strings: first the entry accession
        prefixed with ``IPR:``, then the ids of the ``go-xref`` children
        followed by those of the ``pathway-xref`` children. An id without a
        ``:`` is prefixed with the value of its ``db`` attribute. An empty
        list is returned when there is no entry element.
        """
        if root_entry_elem is None:
            return []

        # store entry id
        xrefs = ["IPR:" + root_entry_elem.attrib["ac"]]

        # store go-xrefs and pathway-xrefs ids
        xref_elems = root_entry_elem.findall(
            self.NS + "go-xref"
        ) + root_entry_elem.findall(self.NS + "pathway-xref")
        for entry in xref_elems:
            xref = entry.attrib["id"]
            if ":" not in xref:
                xref = entry.attrib["db"] + ":" + xref
            xrefs.append(xref)
        return xrefs


# when executed directly as a script, run the doctests
if __name__ == "__main__":
    from Bio import _utils

    _utils.run_doctest()