File size: 18,733 Bytes
b7731cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
# Copyright 2012 by Wibowo Arindrarto.  All rights reserved.
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Bio.SearchIO parser for HMMER plain text output format."""

import re

from Bio.SearchIO._utils import read_forward
from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment

from ._base import _BaseHmmerTextIndexer

__all__ = ("Hmmer3TextParser", "Hmmer3TextIndexer")


# All patterns below are compiled once at import time, since they are applied
# line by line while parsing.
# Program name in the preamble: group 1 is the word containing "hmm" that
# precedes " :: " on a line starting with "# ".
_RE_PROGRAM = re.compile(r"^# (\w*hmm\w+) :: .*$")
# Version string in the preamble: group 1 is the run of word, "+" and "."
# characters following the first word, on a line ending in "; http...".
_RE_VERSION = re.compile(r"# \w+ ([\w+\.]+) .*; http.*$")
# Option line in the preamble: group 1 is the option name (text before the
# colon), group 2 is its value.
_RE_OPT = re.compile(r"^# (.+):\s+(.+)$")
# Query line: group 1 is the query ID, group 2 the digits inside the trailing
# "[X=<digits>]" field (stored by the parser as the query's seq_len).
# The pattern is also kept as a plain string so that the indexer can compile
# a bytes version of it.
_QRE_ID_LEN_PTN = r"^Query:\s*(.*)\s+\[\w=(\d+)\]"
_QRE_ID_LEN = re.compile(_QRE_ID_LEN_PTN)
# HSP validation: group 1 is the bit score, group 2 the text after "value:".
# NOTE(review): within this file it is only referenced from commented-out
# code in _parse_aln_block.
_HRE_VALIDATE = re.compile(r"score:\s(-?\d+\.?\d+)\sbits.*value:\s(.*)")
# HSP alignment blocks.
# Annotation line: group 1 is the leading whitespace, group 2 the annotation
# string, group 3 the trailing annotation name.
_HRE_ANNOT_LINE = re.compile(r"^(\s+)(.+)\s(\w+)")
# Hit/query sequence line: group 1 is the leading ID plus start coordinate,
# group 2 the aligned sequence, group 3 the end coordinate.
_HRE_ID_LINE = re.compile(r"^(\s+\S+\s+[0-9-]+ )(.+?)(\s+[0-9-]+)")


class Hmmer3TextParser:
    """Parser for the HMMER 3.0 text output."""

    def __init__(self, handle):
        """Store the handle, read its first line and parse the preamble."""
        self.handle = handle
        # prime the one-line look-ahead buffer shared by all parsing methods
        first_line = read_forward(handle)
        self.line = first_line
        # the preamble is consumed right away, so the metadata is available
        # before any query result is iterated over
        self._meta = self._parse_preamble()

    def __iter__(self):
        """Iterate over query results."""
        yield from self._parse_qresult()

    def _read_until(self, bool_func):
        """Read the file handle until the given function returns True (PRIVATE)."""
        while True:
            if not self.line or bool_func(self.line):
                return
            else:
                self.line = read_forward(self.handle)

    def _parse_preamble(self):
        """Parse HMMER preamble (lines beginning with '#') (PRIVATE)."""
        meta = {}
        # bool flag for storing state ~ whether we are parsing the option
        # lines or not
        has_opts = False
        while True:
            # no pound sign means we've left the preamble
            if not self.line.startswith("#"):
                break
            # dashes could either mean we are entering or leaving the options
            # section ~ so it's a switch for the has_opts flag
            elif "- - -" in self.line:
                if not has_opts:
                    # if flag is false, that means we're entering opts
                    # so switch the flag accordingly
                    has_opts = True
                else:
                    # if flag is true, that means we've reached the end of opts
                    # so we can break out of the function
                    break
            elif not has_opts:
                # try parsing program
                regx = re.search(_RE_PROGRAM, self.line)
                if regx:
                    meta["program"] = regx.group(1)
                # try parsing version
                regx = re.search(_RE_VERSION, self.line)
                if regx:
                    meta["version"] = regx.group(1)
            elif has_opts:
                regx = re.search(_RE_OPT, self.line)
                # if target in regx.group(1), then we store the key as target
                if "target" in regx.group(1):
                    meta["target"] = regx.group(2).strip()
                else:
                    meta[regx.group(1)] = regx.group(2)

            self.line = read_forward(self.handle)

        return meta

    def _parse_qresult(self):
        """Parse a HMMER3 query block (PRIVATE)."""
        self._read_until(lambda line: line.startswith("Query:"))

        while self.line:

            regx = re.search(_QRE_ID_LEN, self.line)

            while not regx:
                self.line = read_forward(self.handle)
                regx = re.search(_QRE_ID_LEN, self.line)

            # get query id and length
            qid = regx.group(1).strip()
            # store qresult attributes
            qresult_attrs = {
                "seq_len": int(regx.group(2)),
                "program": self._meta.get("program"),
                "version": self._meta.get("version"),
                "target": self._meta.get("target"),
            }

            # get description and accession, if they exist
            qdesc = "<unknown description>"  # placeholder
            while not self.line.startswith("Scores for "):
                self.line = read_forward(self.handle)

                if self.line.startswith("Accession:"):
                    acc = self.line.strip().split(" ", 1)[1]
                    qresult_attrs["accession"] = acc.strip()
                elif self.line.startswith("Description:"):
                    qdesc = self.line.strip().split(" ", 1)[1].strip()
                    qresult_attrs["description"] = qdesc

            # parse the query hits
            while self.line and "//" not in self.line:
                hit_list = self._parse_hit(qid, qdesc)
                # read through the statistics summary
                # TODO: parse and store this information?
                if self.line.startswith("Internal pipeline"):
                    while self.line and "//" not in self.line:
                        self.line = read_forward(self.handle)

            # create qresult, set its attributes and yield
            # not initializing hit_list directly to handle empty hits
            # (i.e. need to set its query description manually)
            qresult = QueryResult(id=qid, hits=hit_list)
            for attr, value in qresult_attrs.items():
                setattr(qresult, attr, value)
            yield qresult
            self.line = read_forward(self.handle)

            # Skip line beginning with '# Alignment of', which are output
            # when running phmmer with the '-A' flag.
            if self.line.startswith("#"):
                self.line = self.handle.readline()

            # HMMER >= 3.1 outputs '[ok]' at the end of all results file,
            # which means we can break the main loop when we see the line
            if "[ok]" in self.line:
                break

    def _parse_hit(self, qid, qdesc):
        """Parse a HMMER3 hit block, beginning with the hit table (PRIVATE).

        :param qid: ID of the query the hits belong to
        :param qdesc: description of the query, passed on to ``_create_hits``

        Returns the list built by ``_create_hits`` once the 'Domain annotation
        for each' line is reached. Returns an empty list when the table
        reports no hits or when the input ends inside the table.
        """
        # get to the end of the hit table delimiter and read one more line
        self._read_until(lambda line: line.startswith("    ------- ------ -----"))
        self.line = read_forward(self.handle)

        # assume every hit is in inclusion threshold until the inclusion
        # threshold line is encountered
        is_included = True

        # parse the hit table
        hit_attr_list = []
        while True:
            if not self.line:
                return []
            if self.line.startswith("  ------ inclusion"):
                is_included = False
                self.line = read_forward(self.handle)
                # run the checks above and below on the new line as well,
                # instead of assuming it is a hit row
                continue
            # if there are no hits, then there are no hsps
            # so we forward-read until 'Internal pipeline..'
            if self.line.startswith("   [No hits detected that satisfy reporting"):
                # also stop on an empty line so that input ending here does
                # not loop forever
                while self.line and not self.line.startswith("Internal pipeline"):
                    self.line = read_forward(self.handle)
                assert len(hit_attr_list) == 0
                return []
            if self.line.startswith("Domain annotation for each "):
                return self._create_hits(hit_attr_list, qid, qdesc)

            # entering hit results row
            # parse the columns into a list
            row = [x for x in self.line.strip().split(" ") if x]
            # join the description words if it's >1 word
            if len(row) > 10:
                row[9] = " ".join(row[9:])
            # if there's no description, set it to an empty string
            elif len(row) < 10:
                row.append("")
                assert len(row) == 10
            # collect the hit attributes; the Hit object itself is created
            # later, in _create_hits
            hit_attr_list.append(
                {
                    "id": row[8],
                    "query_id": qid,
                    "evalue": float(row[0]),
                    "bitscore": float(row[1]),
                    "bias": float(row[2]),
                    # row[3:6] is not parsed, since the info is available
                    # at the HSP level
                    "domain_exp_num": float(row[6]),
                    "domain_obs_num": int(row[7]),
                    "description": row[9],
                    "is_included": is_included,
                }
            )

            self.line = read_forward(self.handle)

    def _create_hits(self, hit_attrs, qid, qdesc):
        """Parse the HMMER3 hsp blocks, beginning with the hsp table (PRIVATE).

        :param hit_attrs: list of hit attribute dictionaries, in the order of
            the hit table; one entry is popped from the front of the list for
            every '>>' block parsed
        :param qid: ID of the query
        :param qdesc: description of the query

        Returns the list of Hit objects, one per '>>' block, once the
        'Internal pipeline' line is reached.
        """
        # read through until the beginning of the hsp block
        self._read_until(
            lambda line: line.startswith("Internal pipeline") or line.startswith(">>")
        )

        # depending on whether the program is hmmscan, hmmsearch, or phmmer
        # {hmm,ali}{from,to} can either be hit_{from,to} or query_{from,to};
        # for hmmscan, hit is the hmm profile, query is the sequence.
        # The program name is looked up once instead of once per table row.
        program = self._meta.get("program")
        hmm_is_hit = program == "hmmscan"
        hmm_is_query = program in ("hmmsearch", "phmmer")

        # any of these prefixes ends the hsp table of the current hit
        table_end_markers = (
            "   [No targets detected that satisfy",
            "   [No individual domains",
            "Internal pipeline statistics summary:",
            "  Alignments for each domain:",
            ">>",
        )

        # start parsing the hsp block
        hit_list = []
        while True:
            if self.line.startswith("Internal pipeline"):
                # by this time we should've emptied the hit attr list
                assert len(hit_attrs) == 0
                return hit_list
            assert self.line.startswith(">>")
            # partition() rather than a two-way unpack of split(), so that a
            # header without the double-space separator gives an empty
            # description instead of an unpacking error
            hid, _, hdesc = self.line[len(">> ") :].partition("  ")
            # only needed in the no-separator case, where the ID would
            # otherwise keep the trailing newline
            hid = hid.rstrip()
            hdesc = hdesc.strip()

            # read through the hsp table header and move one more line
            self._read_until(
                lambda line: line.startswith(" ---   ------ ----- --------")
                or line.startswith("   [No individual domains")
            )
            self.line = read_forward(self.handle)

            # parse the hsp table for the current hit
            hsp_list = []
            while True:
                # break out of hsp parsing if there are no hits, it's the last hsp
                # or it's the start of a new hit
                if self.line.startswith(table_end_markers):
                    hit_attr = hit_attrs.pop(0)
                    hit = Hit(hsp_list)
                    for attr, value in hit_attr.items():
                        if attr == "description":
                            # keep the description the hit already has when
                            # the one from the hit table is merely a prefix
                            # of it
                            cur_val = getattr(hit, attr)
                            if cur_val and value and cur_val.startswith(value):
                                continue
                        setattr(hit, attr, value)
                    if not hit:
                        hit.query_description = qdesc
                    hit_list.append(hit)
                    break

                parsed = [x for x in self.line.strip().split(" ") if x]
                assert len(parsed) == 16
                # parsed column order, by index:
                #  0 index, 1 is_included, 2 bitscore, 3 bias, 4 evalue_cond,
                #  5 evalue, 6 hmmfrom, 7 hmmto, 8 hmm endtype, 9 alifrom,
                #  10 alito, 11 ali endtype, 12 envfrom, 13 envto,
                #  14 env endtype, 15 acc_avg
                frag = HSPFragment(hid, qid)
                # set query and hit descriptions if they are defined / nonempty string
                if qdesc:
                    frag.query_description = qdesc
                if hdesc:
                    frag.hit_description = hdesc
                # HMMER3 results are always protein
                frag.molecule_type = "protein"
                # adjust 'from' and 'to' coordinates to 0-based ones
                if hmm_is_hit:
                    frag.hit_start = int(parsed[6]) - 1
                    frag.hit_end = int(parsed[7])
                    frag.query_start = int(parsed[9]) - 1
                    frag.query_end = int(parsed[10])
                elif hmm_is_query:
                    frag.hit_start = int(parsed[9]) - 1
                    frag.hit_end = int(parsed[10])
                    frag.query_start = int(parsed[6]) - 1
                    frag.query_end = int(parsed[7])
                # strand is always 0, since HMMER now only handles protein
                frag.hit_strand = frag.query_strand = 0

                hsp = HSP([frag])
                hsp.domain_index = int(parsed[0])
                hsp.is_included = parsed[1] == "!"
                hsp.bitscore = float(parsed[2])
                hsp.bias = float(parsed[3])
                hsp.evalue_cond = float(parsed[4])
                hsp.evalue = float(parsed[5])
                # the end type columns follow the same hit/query assignment
                # as the coordinates above
                if hmm_is_hit:
                    hsp.hit_endtype = parsed[8]
                    hsp.query_endtype = parsed[11]
                elif hmm_is_query:
                    hsp.hit_endtype = parsed[11]
                    hsp.query_endtype = parsed[8]
                # adjust 'from' and 'to' coordinates to 0-based ones
                hsp.env_start = int(parsed[12]) - 1
                hsp.env_end = int(parsed[13])
                hsp.env_endtype = parsed[14]
                hsp.acc_avg = float(parsed[15])

                hsp_list.append(hsp)
                self.line = read_forward(self.handle)

            # parse the hsp alignments
            if self.line.startswith("  Alignments for each domain:"):
                self._parse_aln_block(hid, hit.hsps)

    def _parse_aln_block(self, hid, hsp_list):
        """Parse a HMMER3 HSP alignment block (PRIVATE).

        :param hid: ID of the hit the alignments belong to (not used by the
            parsing itself)
        :param hsp_list: HSPs of the current hit, in domain order; the first
            fragment of each is updated in place with the aligned sequences
            and the alignment annotation

        Returns ``hsp_list`` when a line starting with '>>' or 'Internal
        pipeline' is reached, or when the input ends.
        """
        self.line = read_forward(self.handle)
        # looked up once; decides which sequence is the hit and which the query
        program = self._meta.get("program")
        dom_counter = 0
        while True:
            # an empty line here means the input has ended (readline() and
            # read_forward are assumed to return an empty string at that point)
            if (
                not self.line
                or self.line.startswith(">>")
                or self.line.startswith("Internal pipeline")
            ):
                return hsp_list
            assert self.line.startswith("  == domain %i" % (dom_counter + 1))
            # alias the fragment to a local var
            # but note that we're still changing the attrs of the actual
            # hsp inside the qresult as we're not creating a copy
            frag = hsp_list[dom_counter][0]
            # NOTE: the score and E-value on the '== domain' line are not
            # validated against the values parsed from the domain table
            hmmseq = ""
            aliseq = ""
            annot = {}
            self.line = self.handle.readline()

            # parse all the alignment blocks in the hsp
            while True:
                # check for hit or query line
                # we don't check for the hit or query id specifically
                # to anticipate special cases where query id == hit id
                regx = _HRE_ID_LINE.search(self.line)
                if regx:
                    # the first hit/query line we encounter is the hmmseq
                    if len(hmmseq) == len(aliseq):
                        hmmseq += regx.group(2)
                    # and for subsequent lines, len(hmmseq) is either
                    # > or == len(aliseq)
                    elif len(hmmseq) > len(aliseq):
                        aliseq += regx.group(2)
                    assert len(hmmseq) >= len(aliseq)
                # check for start of new domain, end of the block, or end of
                # input; without the empty-line check this loop would never
                # terminate when the input ends inside an alignment
                elif (
                    not self.line
                    or self.line.startswith("  == domain")
                    or self.line.startswith(">>")
                    or self.line.startswith("Internal pipeline")
                ):
                    frag.aln_annotation = annot
                    if program == "hmmscan":
                        frag.hit = hmmseq
                        frag.query = aliseq
                    elif program in ("hmmsearch", "phmmer"):
                        frag.hit = aliseq
                        frag.query = hmmseq
                    dom_counter += 1
                    # hmmseq, aliseq and annot are re-initialised at the top
                    # of the outer loop before the next domain is parsed
                    break
                # otherwise check if it's an annotation line and parse it
                # len(hmmseq) is only != len(aliseq) when the cursor is parsing
                # the similarity character. Since we're not parsing that, we
                # check for when the condition is False (i.e. when it's ==)
                elif len(hmmseq) == len(aliseq):
                    regx = _HRE_ANNOT_LINE.search(self.line)
                    if regx:
                        annot_name = regx.group(3)
                        if annot_name in annot:
                            annot[annot_name] += regx.group(2)
                        else:
                            annot[annot_name] = regx.group(2)

                self.line = self.handle.readline()


class Hmmer3TextIndexer(_BaseHmmerTextIndexer):
    """Indexer class for HMMER plain text output."""

    _parser = Hmmer3TextParser
    # byte markers for the first and last line of a query result block
    qresult_start = b"Query: "
    qresult_end = b"//"

    def __iter__(self):
        """Iterate over Hmmer3TextIndexer; yields query results' key, offsets, 0.

        The handle is scanned from its beginning. For every block that starts
        with a ``qresult_start`` line and ends with a ``qresult_end`` line, a
        tuple of the decoded query ID, the offset of the start line, and 0 is
        yielded.

        An end marker that is not preceded by a start line is skipped.

        Raises ValueError if a start line does not match the query ID pattern.
        """
        handle = self._handle
        handle.seek(0)
        start_offset = handle.tell()
        regex_id = re.compile(_QRE_ID_LEN_PTN.encode())
        # key of the block being scanned; None while outside of a block, so
        # that a stray end marker cannot yield an unset or stale key
        qresult_key = None

        while True:
            line = read_forward(handle)
            end_offset = handle.tell()

            if not line:
                break
            if line.startswith(self.qresult_start):
                regx = regex_id.search(line)
                if regx is None:
                    raise ValueError("Unexpected query line format: %r" % line)
                qresult_key = regx.group(1).strip()
                # qresult start offset is the offset of this line
                # (starts with the start mark)
                start_offset = end_offset - len(line)
            elif line.startswith(self.qresult_end):
                if qresult_key is not None:
                    yield qresult_key.decode(), start_offset, 0
                    qresult_key = None
                start_offset = end_offset


# When this file is executed as a script rather than imported as a module,
# call Biopython's run_doctest helper.
if __name__ == "__main__":
    # imported here so that the helper is only loaded for script execution
    from Bio._utils import run_doctest

    run_doctest()