File size: 9,261 Bytes
b7731cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
# Copyright 2019 by Jens Thomas.  All rights reserved.
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Bio.SearchIO parser for HHSUITE version 2 and 3 plain text output format."""

import re
import warnings

from Bio.SearchIO._utils import read_forward
from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment

# Public names exported by this module.
__all__ = ("Hhsuite2TextParser",)

# precompile regex patterns for faster processing
# regex for query name capture: a line starting with "Query"; group 1 is the
# rest of the line after the separating whitespace
_RE_QUERY = re.compile(r"^Query\s+(.+)\s?$")

# regex for the first line of a hit block ("No <number>"); group 1 is the
# number. The pattern requires trailing whitespace, so it is meant to be
# applied to lines that still carry their line ending.
_RE_HIT_BLOCK_START = re.compile(r"^No +(\d+)\s+$")

# hit description line (">id description"): group 1 is the id, group 2 the
# full description
_RE_HIT_BLOCK_DESC = re.compile(r">(\S+)\s+(.*)$")

# sequence alignment line, e.g.
# Q sp|Q9BSU1|CP07  229 DAKMRVFERSVYFGDSCQDVLSMLGSPHKV  258 (422)
# groups: 1 = sequence id column, 2 = start, 3 = aligned sequence, 4 = end.
# The "Q" pattern is used for query rows and the "T" pattern for hit rows.
_RE_MATCH_BLOCK_QUERY_SEQ = re.compile(r"^Q\s+(.+) +(\d+) +([A-Z-]+) +(\d+) +\(\d+\)$")
_RE_MATCH_BLOCK_HIT_SEQ = re.compile(r"^T\s+(.+) +(\d+) +([A-Z-]+) +(\d+) +\(\d+\)$")

# A line starting with this text ends parsing (the marker isn't always present).
_END_OF_FILE_MARKER = "Done!"

# Value stored in the ``program`` attribute of the generated QueryResult.
_PROGRAM = "HHSUITE"

# Maximum number of lines to read before expecting a hit block
# This determines the maximum number of hits that would be allowed in
# the initial hit table.
MAX_READ_UNTIL = 5000


class Hhsuite2TextParser:
    """Parser for the HHSUITE version 2 and 3 text output."""

    def __init__(self, handle):
        """Store the handle, reset the parser state and read the first line.

        ``self.line`` always holds the line currently being examined; it is
        primed here with the result of ``read_forward`` on the handle.
        """
        # Parser state filled in while the file is consumed.
        self.done = False
        self.query_id = None
        self.seq_len = None
        self.handle = handle
        self.line = read_forward(handle)

    def __iter__(self):
        """Yield the parsed query results - there will only ever be one."""
        # Parsing is deferred until the first item is requested.
        for qresult in self._parse_qresult():
            yield qresult

    def _read_until(self, bool_func, stop_on_blank=True, max_read_until=MAX_READ_UNTIL):
        """Advance ``self.line`` until ``bool_func`` accepts it (PRIVATE).

        The current line is tested first, so nothing is read when it already
        satisfies ``bool_func``.

        Arguments:
         - bool_func - callable taking a line; a truthy result stops reading.
         - stop_on_blank - if True, also stop as soon as the current line is
           empty.
         - max_read_until - number of reads after which RuntimeError is raised.

        """
        reads_done = 0
        # The blank-line test comes first so bool_func is never called on a
        # line that already ends the search.
        while not (stop_on_blank and not self.line) and not bool_func(self.line):
            self.line = read_forward(self.handle)
            reads_done += 1
            if reads_done >= max_read_until:
                raise RuntimeError("Exceeded max_read_until in _read_until")

    def _parse_qresult(self):
        """Parse a whole HHSUITE output file (PRIVATE).

        Reads the preamble, skips forward to the first hit block and then
        collects one dict per hit block until parsing is flagged as done.
        Returns whatever ``_create_qresult`` builds from those dicts.
        """
        self._parse_preamble()
        # Blank lines must not stop the search for the first hit block, so
        # stop_on_blank is disabled; _read_until raises RuntimeError if no hit
        # block is found within its read limit.
        self._read_until(_RE_HIT_BLOCK_START.search, stop_on_blank=False)
        parsed_blocks = []
        while not self.done:
            parsed_blocks.append(self._parse_hit_block())
        return self._create_qresult(parsed_blocks)

    def _parse_preamble(self):
        """Parse metadata about query (PRIVATE).

        Consumes lines from the current one up to the first blank line (or the
        end of the file). The query name is stored in ``self.query_id`` and
        the integer following ``Match_columns`` in ``self.seq_len``.

        Every line is stripped before it is inspected. The first line still
        carries its line ending and any padding, which would otherwise end up
        in ``query_id``, while all following lines are read already stripped.

        Returns an empty dict. Nothing is stored in it; the return value is
        kept so existing callers see the same result as before.
        """
        meta = {}
        while self.line:
            line = self.line.strip()
            query_match = _RE_QUERY.search(line)
            if query_match:
                self.query_id = query_match.group(1)
            if line.startswith("Match_columns"):
                # The value is the second whitespace-separated field.
                self.seq_len = int(line.split()[1])
            # A blank line becomes "" after stripping and ends the loop.
            self.line = self.handle.readline().strip()
        return meta

    def _parse_hit_block(self):
        """Parse one hit block and return its data as a dict (PRIVATE).

        Expects the next non-blank line of the handle to be the ``>id
        description`` line, followed directly by the score line and then by
        the alignment blocks. Parsing of the block stops at the next hit block
        header, or at the end-of-file marker / end of input, in which case
        ``self.done`` is set.

        Raises RuntimeError if the description line has an unexpected format.
        """
        self.line = read_forward(self.handle)
        desc_match = _RE_HIT_BLOCK_DESC.search(self.line)
        if desc_match is None:
            raise RuntimeError(
                f"Unexpected content in HIT_BLOCK_DESC line'{self.line}'"
            )
        hit_id, raw_description = desc_match.groups()
        hit_data = dict(
            hit_id=hit_id,
            description=raw_description.lstrip(" ;"),
            evalue=None,
            hit_start=None,
            hit_end=None,
            hit_seq="",
            prob=None,
            query_start=None,
            query_end=None,
            query_seq="",
            score=None,
        )

        # The score line directly follows the description line.
        self.line = self.handle.readline()
        self._process_score_line(self.line, hit_data)

        self.line = read_forward(self.handle)
        while self.line.strip() and not self.line.startswith(_END_OF_FILE_MARKER):
            if _RE_HIT_BLOCK_START.search(self.line):
                # Header of the next hit: this block is complete.
                return hit_data
            self._parse_hit_match_block(hit_data)
            self.line = read_forward(self.handle)

        # Nothing left to read, or the end marker was reached
        # (_END_OF_FILE_MARKER isn't always present).
        self.done = True
        return hit_data
    @staticmethod
    def _process_score_line(line, hit_data):
        """Parse the scores from the line and populate hit_data dict (PRIVATE).

        Lines are of the form:
        Probab=99.95  E-value=3.7e-34  Score=210.31  Aligned_cols=171  Identities=100%  Similarity=2.050  Sum_probs=166.9

        E-value could be in decimal or scientific notation, so split the string rather then use regexp - this
        also means we should be tolerant of additional fields being added/removed
        """
        score_map = {"E-value": "evalue", "Score": "score", "Probab": "prob"}
        for score_pair in line.strip().split():
            key, value = score_pair.split("=")
            if key in score_map:
                try:
                    hit_data[score_map[key]] = float(value)
                except KeyError:
                    # We trigger warnings here as it's not a big enough problem to crash, but indicates something unexpected.
                    warnings.warn(
                        f"HHsuite parser: unable to extract {key} from line: {line}"
                    )

    def _parse_hit_match_block(self, hit_match_data):
        """Parse a single block of hit sequence data (PRIVATE).

        Starting from the current line, reads up to the next blank line and
        accumulates the query and hit rows into ``hit_match_data``: the
        aligned sequence is appended to ``*_seq``, ``*_start`` is set only the
        first time a row is seen and ``*_end`` is overwritten by every row.

        Parses block such as ::

            Q ss_pred             ceecchHHHHHHHHHHHHHHHHHHHhhhhhcCCCCccc
            Q 4P79:A|PDBID|C  160 YELGPALYLGWSASLLSILGGICVFSTAAASSKEEPAT  197 (198)
            Q Consensus       160 ~~~g~sf~l~~~~~~l~~~~~~l~~~~~~~~~~~~~~~  197 (198)
                                  .++|||||++|++.++.+++++++++..+..++++..+
            T Consensus       327 ~~~GwS~~l~~~s~~l~lia~~l~~~~~~~~~~~~~~~  364 (364)
            T 5B2G_A          327 REMGASLYVGWAASGLLLLGGGLLCCSGPSSGENLYFQ  364 (364)
            T ss_dssp             EEECTHHHHHHHHHHHHHHHHHHHHCC-----------
            T ss_pred             cccchHHHHHHHHHHHHHHHHHHHHhcCCCCCCccccC

        """
        # Pattern for each kind of row, with the keys it feeds
        # (sequence, start, end). Query rows are tried first.
        row_kinds = (
            (_RE_MATCH_BLOCK_QUERY_SEQ, ("query_seq", "query_start", "query_end")),
            (_RE_MATCH_BLOCK_HIT_SEQ, ("hit_seq", "hit_start", "hit_end")),
        )

        # blank lines indicate the end of a hit block
        while self.line.strip():
            for pattern, (seq_key, start_key, end_key) in row_kinds:
                found = pattern.match(self.line)
                # A Consensus row can look exactly like a sequence row, so it
                # has to be recognised by its ID column rather than the regexp.
                if found is None or found.group(1).strip() == "Consensus":
                    continue
                hit_match_data[seq_key] += found.group(3).strip()
                if hit_match_data[start_key] is None:
                    hit_match_data[start_key] = int(found.group(2))
                hit_match_data[end_key] = int(found.group(4))
                break
            self.line = self.handle.readline()

    def _create_qresult(self, hit_blocks):
        """Build the Biopython objects for the parsed hit blocks (PRIVATE).

        Each dict in ``hit_blocks`` becomes one HSP holding a single
        HSPFragment. HSPs sharing a hit id are gathered in one Hit, whose
        description, evalue and score come from the first block seen for
        that id.

        Returns a list containing the single QueryResult.
        """
        query_id = self.query_id
        hits_by_id = {}
        # Every parsed block is currently flagged as included.
        included = True

        for position, data in enumerate(hit_blocks):
            hit_id = data["hit_id"]
            description = data["description"]
            evalue = data["evalue"]
            score = data["score"]

            fragment = HSPFragment(hit_id, query_id)
            fragment.molecule_type = "protein"
            # Start coordinates are shifted down by one; end coordinates are
            # stored exactly as parsed.
            fragment.query_start = data["query_start"] - 1
            fragment.query_end = data["query_end"]
            fragment.hit_start = data["hit_start"] - 1
            fragment.hit_end = data["hit_end"]
            fragment.hit = data["hit_seq"]
            fragment.query = data["query_seq"]

            hsp = HSP([fragment])
            hsp.hit_id = hit_id
            hsp.output_index = position
            hsp.query_id = query_id
            hsp.hit_description = description
            hsp.is_included = included
            hsp.evalue = evalue
            hsp.score = score
            hsp.prob = data["prob"]

            known_hit = hits_by_id.get(hit_id)
            if known_hit is not None:
                known_hit.append(hsp)
                continue

            new_hit = Hit([hsp], hit_id)
            new_hit.description = description
            new_hit.is_included = included
            new_hit.evalue = evalue
            new_hit.score = score
            hits_by_id[hit_id] = new_hit

        qresult = QueryResult(hits_by_id.values(), query_id)
        qresult.program = _PROGRAM
        qresult.seq_len = self.seq_len
        return [qresult]