# Spaces:
#
# aakash0017
# /
#
# DrVai-Rag-Testing
#
# No application file
#
# File size: 7,455 Bytes
#
# b7731cd

# Copyright 2018 by Adhemar Zerlotini. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.

"""Bio.SearchIO parser for InterProScan XML output formats."""
# for more info: https://github.com/ebi-pf-team/interproscan/wiki/OutputFormats

import re
from xml.etree import ElementTree

from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment


# Each table below maps an XML attribute name to a pair of
# (attribute name set on the model object, converter applied to the raw text).

# attributes read from the signature element -> Hit attributes
_ELEM_HIT = {
    "name": ("accession", str),
    "ac": ("id", str),
    "desc": ("description", str),
}

# attributes read from a location element -> HSP attributes
_ELEM_HSP = {
    "score": ("bitscore", float),
    "evalue": ("evalue", float),
}

# attributes read from a location element -> HSPFragment attributes
# (the parser walks this table in order when assigning coordinates)
_ELEM_FRAG = {
    "start": ("query_start", int),
    "end": ("query_end", int),
    "hmm-start": ("hit_start", int),
    "hmm-end": ("hit_end", int),
}


class InterproscanXmlParser:
    """Parser for the InterProScan XML format.

    Iterating over an instance yields one QueryResult per ``protein``
    element found in the parsed document.
    """

    def __init__(self, handle):
        """Initialize the class.

        Arguments:
         - handle - source of the InterProScan XML output; it is passed
           unchanged to ``ElementTree.iterparse``.

        The first XML event (start of the root element) is consumed right
        away to read the header information.
        """
        self.xml_iter = iter(ElementTree.iterparse(handle, events=("start", "end")))
        self._meta = self._parse_header()

    def __iter__(self):
        """Iterate qresults."""
        yield from self._parse_qresult()

    def _parse_header(self):
        """Parse the header for the InterProScan version (PRIVATE).

        Consumes the first event of the XML iterator, stores the namespace
        prefix of the root tag in ``self.NS`` and returns a dict holding the
        ``target``, ``program`` and ``version`` values that are later copied
        onto every query result.

        Raises KeyError if the root element carries no
        ``interproscan-version`` attribute.
        """
        _, root = next(self.xml_iter)
        meta = {
            "target": "InterPro",
            "program": "InterProScan",
            "version": root.attrib["interproscan-version"],
        }
        # the root tag is the namespace prefix followed by "protein-matches";
        # stripping the local name leaves the prefix used for all child tags
        self.NS = re.sub("protein-matches", "", root.tag)
        return meta

    def _parse_qresult(self):
        """Parse query results (PRIVATE).

        Yields one QueryResult for every completed ``protein`` element.
        Hits that share an ID are merged: the HSPs of later duplicates are
        appended to the first hit seen with that ID.
        """
        protein_tag = self.NS + "protein"
        for event, elem in self.xml_iter:
            # only a fully parsed protein element has all of its children
            if event != "end" or elem.tag != protein_tag:
                continue

            # store the query sequence (None when the element is absent)
            seq = elem.find(self.NS + "sequence")
            query_seq = None if seq is None else seq.text

            # store the query id and, when present, the description
            xref = elem.find(self.NS + "xref")
            query_id = xref.attrib["id"]
            query_desc = xref.attrib.get("name")

            # interproscan results contain duplicate hits rather than a
            # single hit with multiple hsps; group them by hit id (the dict
            # keeps first-seen order) instead of rescanning a list per hit
            hits_by_id = {}
            for hit_new in self._parse_hit(
                elem.find(self.NS + "matches"), query_id, query_seq
            ):
                known = hits_by_id.get(hit_new.id)
                # compare against None explicitly: an empty hit may be falsy
                if known is None:
                    hits_by_id[hit_new.id] = hit_new
                else:
                    known.hsps.extend(hit_new.hsps)

            # create qresult and assign attributes
            qresult = QueryResult(list(hits_by_id.values()), query_id)
            if query_desc is not None:
                qresult.description = query_desc
            for key, value in self._meta.items():
                setattr(qresult, key, value)

            # everything needed was copied into the model objects, so drop
            # the element's children to keep memory flat on large files
            elem.clear()
            yield qresult

    def _parse_hit(self, root_hit_elem, query_id, query_seq=None):
        """Parse hit (PRIVATE).

        Arguments:
         - root_hit_elem - the ``matches`` element, or None when the protein
           has no such element (nothing is yielded in that case).
         - query_id - ID of the query the hits belong to.
         - query_seq - query sequence string, or None.

        Yields one Hit per child element of ``root_hit_elem``.
        """
        if root_hit_elem is None:
            return

        # the namespace contains regex metacharacters, so escape it before
        # building the pattern that extracts "<type>" from "<type>-match"
        type_pattern = re.compile(re.escape(self.NS) + r"(\w+)-match")

        for hit_elem in root_hit_elem:
            # store the match/location type
            hit_type = type_pattern.sub(r"\1", hit_elem.tag)
            # store the hit id
            signature = hit_elem.find(self.NS + "signature")
            hit_id = signature.attrib["ac"]

            # store xrefs and alt_descs
            xrefs = self._parse_xrefs(signature.find(self.NS + "entry"))

            # parse each hsp
            hsps = list(
                self._parse_hsp(
                    hit_elem.find(self.NS + "locations"), query_id, hit_id, query_seq
                )
            )

            # create hit and assign attributes
            hit = Hit(hsps, hit_id)
            hit.dbxrefs = xrefs
            for key, (attr, caster) in _ELEM_HIT.items():
                value = signature.attrib.get(key)
                if value is not None:
                    setattr(hit, attr, caster(value))

            # format specific attributes
            hit.attributes["Hit type"] = hit_type
            signature_lib = signature.find(self.NS + "signature-library-release")
            # a missing release element is treated like missing attributes,
            # which are stored as the string "None"
            lib_attrib = {} if signature_lib is None else signature_lib.attrib
            hit.attributes["Target"] = str(lib_attrib.get("library"))
            hit.attributes["Target version"] = str(lib_attrib.get("version"))

            yield hit

    def _parse_hsp(self, root_hsp_elem, query_id, hit_id, query_seq=None):
        """Parse hsp (PRIVATE).

        Arguments:
         - root_hsp_elem - the ``locations`` element, or None when the match
           has no such element (nothing is yielded in that case).
         - query_id - ID of the query.
         - hit_id - ID of the hit the locations belong to.
         - query_seq - query sequence string, or None.

        Yields one HSP, holding a single HSPFragment, per child element of
        ``root_hsp_elem``.
        """
        if root_hsp_elem is None:
            return

        for hsp_elem in root_hsp_elem:
            # create frag and assign attributes
            frag = HSPFragment(hit_id, query_id)
            frag.molecule_type = "protein"
            if query_seq is not None:
                frag.query = query_seq

            # coordinates found on THIS location only; kept per iteration so
            # a location lacking start/end never reuses a previous one's values
            coords = {}
            for key, (attr, caster) in _ELEM_FRAG.items():
                raw = hsp_elem.attrib.get(key)
                if raw is None:
                    continue
                value = caster(raw)
                # start should be 0-based
                if attr.endswith("start"):
                    value -= 1
                coords[attr] = value
                setattr(frag, attr, value)

            # calculate aln_span and store, only when both ends are known
            if "query_start" in coords and "query_end" in coords:
                frag.aln_span = coords["query_end"] - coords["query_start"]

            # create hsp and assign attributes
            hsp = HSP([frag])
            hsp.query_id = query_id
            hsp.hit_id = hit_id
            for key, (attr, caster) in _ELEM_HSP.items():
                value = hsp_elem.attrib.get(key)
                if value is not None:
                    setattr(hsp, attr, caster(value))
            yield hsp

    def _parse_xrefs(self, root_entry_elem):
        """Parse xrefs (PRIVATE).

        Arguments:
         - root_entry_elem - the ``entry`` element of a signature, or None.

        Returns a list of cross-reference strings: the entry accession
        prefixed with ``IPR:``, followed by all ``go-xref`` IDs and then all
        ``pathway-xref`` IDs. An ID without a colon is prefixed with its
        ``db`` attribute. Returns an empty list when the entry is None.
        """
        if root_entry_elem is None:
            return []

        # store entry id
        xrefs = ["IPR:" + root_entry_elem.attrib["ac"]]

        # store go-xrefs first, then pathway-xrefs
        for tag in ("go-xref", "pathway-xref"):
            for entry in root_entry_elem.findall(self.NS + tag):
                xref = entry.attrib["id"]
                if ":" not in xref:
                    xref = entry.attrib["db"] + ":" + xref
                xrefs.append(xref)
        return xrefs


# when executed directly as a script, run this module's doctests
if __name__ == "__main__":
    from Bio import _utils

    _utils.run_doctest()