File size: 12,818 Bytes
b7731cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
# Copyright 2012 by Wibowo Arindrarto.  All rights reserved.
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Bio.SearchIO parser for HMMER table output format."""

from itertools import chain

from Bio.SearchIO._index import SearchIndexer
from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment


__all__ = ("Hmmer3TabParser", "Hmmer3TabIndexer", "Hmmer3TabWriter")


class Hmmer3TabParser:
    """Parser for the HMMER table format.

    Rows are read one at a time from the handle; consecutive rows that share
    the same query name are collected into a single QueryResult.
    """

    def __init__(self, handle):
        """Initialize the class.

        The first line is read right away so that iteration can start by
        inspecting ``self.line``.
        """
        self.handle = handle
        self.line = self.handle.readline()

    def __iter__(self):
        """Iterate over Hmmer3TabParser, yields query results."""
        header_mark = "#"
        # read through the header if it exists
        while self.line.startswith(header_mark):
            self.line = self.handle.readline()
        # if we have result rows, parse them
        if self.line:
            yield from self._parse_qresult()

    def _parse_row(self):
        """Return a dictionary of parsed row values (PRIVATE).

        The current line is split on spaces. The first 18 columns are the
        fixed fields; everything after them is the free-text description of
        the target.

        Returns a dictionary with the keys ``qresult``, ``hit``, ``hsp`` and
        ``frag``, each holding a dictionary of attribute names and values.

        Raises ValueError if the row has fewer than 18 columns or if one of
        the numeric columns can not be converted.
        """
        cols = [x for x in self.line.strip().split(" ") if x]
        if len(cols) < 18:
            raise ValueError("Less columns than expected, only %i" % len(cols))
        # the description may itself contain spaces, so glue all trailing
        # columns back together; a row with exactly 18 columns has no
        # description and yields an empty string instead of an IndexError
        description = " ".join(cols[18:])

        # assign parsed column data into qresult, hit, and hsp dicts
        qresult = {
            "id": cols[2],  # query name
            "accession": cols[3],  # query accession
        }
        hit = {
            "id": cols[0],  # target name
            "accession": cols[1],  # target accession
            "evalue": float(cols[4]),  # evalue (full sequence)
            "bitscore": float(cols[5]),  # score (full sequence)
            "bias": float(cols[6]),  # bias (full sequence)
            "domain_exp_num": float(cols[10]),  # exp
            "region_num": int(cols[11]),  # reg
            "cluster_num": int(cols[12]),  # clu
            "overlap_num": int(cols[13]),  # ov
            "env_num": int(cols[14]),  # env
            "domain_obs_num": int(cols[15]),  # dom
            "domain_reported_num": int(cols[16]),  # rep
            "domain_included_num": int(cols[17]),  # inc
            "description": description,  # description of target
        }
        hsp = {
            "evalue": float(cols[7]),  # evalue (best 1 domain)
            "bitscore": float(cols[8]),  # score (best 1 domain)
            "bias": float(cols[9]),  # bias (best 1 domain)
        }
        # strand is always 0, since HMMER now only handles protein
        frag = {
            "hit_strand": 0,
            "query_strand": 0,
            "molecule_type": "protein",
        }

        return {"qresult": qresult, "hit": hit, "hsp": hsp, "frag": frag}

    def _parse_qresult(self):
        """Yield QueryResult objects (PRIVATE).

        Every row becomes one Hit holding one HSP with one HSPFragment.
        A QueryResult is yielded whenever the query name of the next row
        differs from the one before it, and once more when the rows end.
        Rows end at the end of the file or at the first line starting
        with '#'.
        """
        # parsed values and query IDs of the previous and the current row
        prev, cur = None, None
        prev_qid, cur_qid = None, None
        # container for Hit objects, used to create QueryResult
        hit_list = []
        while True:
            # store previous line's parsed values for all lines after the first
            if cur is not None:
                prev = cur
                prev_qid = cur_qid
            # only parse the result row if it's not EOF
            # NOTE: we are not parsing the extra '#' lines appended to the end
            # of hmmer31b1 tabular results since storing them in qresult
            # objects means we can not do a single-pass parsing
            at_eof = not self.line or self.line.startswith("#")
            if at_eof:
                # mock value for cur_qid, since we have nothing to parse
                cur_qid = None
            else:
                cur = self._parse_row()
                cur_qid = cur["qresult"]["id"]

            # the row before this one closes its QueryResult when the query
            # name changes or when there are no more rows
            is_last_of_qresult = at_eof or prev_qid != cur_qid

            if prev is not None:
                # since the tab format only has 1 Hit per line
                # we always create HSPFragment, HSP, and Hit per line
                prev_hid = prev["hit"]["id"]

                # create fragment and HSP and set their attributes
                frag = HSPFragment(prev_hid, prev_qid)
                for attr, value in prev["frag"].items():
                    setattr(frag, attr, value)
                hsp = HSP([frag])
                for attr, value in prev["hsp"].items():
                    setattr(hsp, attr, value)

                # create Hit and set its attributes
                hit = Hit([hsp])
                for attr, value in prev["hit"].items():
                    setattr(hit, attr, value)
                hit_list.append(hit)

                # create qresult and yield if we're at a new qresult or at EOF
                if is_last_of_qresult:
                    qresult = QueryResult(hit_list, prev_qid)
                    for attr, value in prev["qresult"].items():
                        setattr(qresult, attr, value)
                    yield qresult
                    hit_list = []

            # nothing left to read; also guards against looping forever when
            # this method is entered with no row to parse at all
            if at_eof:
                break

            self.line = self.handle.readline()


class Hmmer3TabIndexer(SearchIndexer):
    """Indexer class for HMMER table output.

    Works on the raw bytes of the file; rows are assigned to a query by the
    value of their query name column.
    """

    _parser = Hmmer3TabParser
    # denotes column location for query identifier
    _query_id_idx = 2

    def __iter__(self):
        """Iterate over the file handle; yields key, start offset, and length.

        One tuple is yielded per run of consecutive rows sharing the same
        query name. Indexing stops at the end of the file or at the first
        line starting with '#' that follows the rows.
        """
        handle = self._handle
        handle.seek(0)
        query_id_idx = self._query_id_idx
        qresult_key = None
        header_mark = b"#"
        split_mark = b" "
        # set line with initial mock value, to emulate header
        line = header_mark

        # read through header; start_offset ends up at the first row
        while line.startswith(header_mark):
            start_offset = handle.tell()
            line = handle.readline()

        # and index the qresults; a '#' line after the rows is treated as the
        # end of the data, the same point at which the parser stops reading
        while line and not line.startswith(header_mark):
            # offset just past the row currently being inspected
            end_offset = handle.tell()

            cols = [x for x in line.strip().split(split_mark) if x]
            curr_key = cols[query_id_idx]
            if qresult_key is None:
                qresult_key = curr_key
            elif curr_key != qresult_key:
                # the current row already belongs to the next query, so the
                # finished block ends where this row begins
                adj_end = end_offset - len(line)
                yield (qresult_key.decode(), start_offset, adj_end - start_offset)
                qresult_key = curr_key
                start_offset = adj_end

            line = handle.readline()

        # the last block has no following row to close it
        if qresult_key is not None:
            yield (qresult_key.decode(), start_offset, end_offset - start_offset)

    def get_raw(self, offset):
        """Return the raw bytes string of a QueryResult object from the given offset.

        Lines are collected from ``offset`` until the query name changes, a
        line starting with '#' is found, or the file ends.
        """
        handle = self._handle
        handle.seek(offset)
        query_id_idx = self._query_id_idx
        qresult_key = None
        raw_lines = []
        header_mark = b"#"
        split_mark = b" "

        while True:
            line = handle.readline()
            # stop at EOF and at comment lines that follow the rows
            if not line or line.startswith(header_mark):
                break
            cols = [x for x in line.strip().split(split_mark) if x]
            curr_key = cols[query_id_idx]
            if qresult_key is None:
                qresult_key = curr_key
            elif curr_key != qresult_key:
                break
            raw_lines.append(line)

        return b"".join(raw_lines)


class Hmmer3TabWriter:
    """Writer for hmmer3-tab output format."""

    def __init__(self, handle):
        """Initialize the class."""
        self.handle = handle

    def write_file(self, qresults):
        """Write to the handle.

        ``qresults`` may be any iterable of QueryResult objects. QueryResult
        objects without hits produce no rows and are not counted. The header
        is always written, with column widths taken from the first
        QueryResult that has hits, or default widths if there is none.

        Returns a tuple of how many QueryResult, Hit, HSP, and HSPFragment
        objects were written.

        """
        handle = self.handle
        qresult_counter, hit_counter, hsp_counter, frag_counter = 0, 0, 0, 0

        # lazily drop QueryResult objects without hits; this also lets plain
        # lists be passed in, not only iterators
        nonempty = (qresult for qresult in qresults if qresult)
        first_qresult = next(nonempty, None)

        # write header
        handle.write(self._build_header(first_qresult))
        # and then the qresults
        if first_qresult is not None:
            for qresult in chain([first_qresult], nonempty):
                handle.write(self._build_row(qresult))
                qresult_counter += 1
                hit_counter += len(qresult)
                hsp_counter += sum(len(hit) for hit in qresult)
                frag_counter += sum(len(hit.fragments) for hit in qresult)

        return qresult_counter, hit_counter, hsp_counter, frag_counter

    def _build_header(self, first_qresult=None):
        """Return the header string of a HMMER table output (PRIVATE).

        Column widths are derived from ``first_qresult`` and its first hit.
        If it is None or has no hits, the minimum widths are used.
        """
        # calculate whitespace required
        # adapted from HMMER's source: src/p7_tophits.c#L1083
        if first_qresult:
            # qnamew = max(20, len(first_qresult.id))
            qnamew = 20  # why doesn't the above work?
            tnamew = max(20, len(first_qresult[0].id))
            qaccw = max(10, len(first_qresult.accession))
            taccw = max(10, len(first_qresult[0].accession))
        else:
            qnamew, tnamew, qaccw, taccw = 20, 20, 10, 10
        # Turn black code style off
        # fmt: off
        header = ("#%*s %22s %22s %33s\n"
                  % (tnamew + qnamew + taccw + qaccw + 2, "",
                     "--- full sequence ----", "--- best 1 domain ----",
                     "--- domain number estimation ----"))
        header += ("#%-*s %-*s %-*s %-*s %9s %6s %5s %9s %6s %5s %5s %3s "
                   "%3s %3s %3s %3s %3s %3s %s\n"
                   % (tnamew - 1, " target name",
                      taccw, "accession", qnamew, "query name", qaccw,
                      "accession", "  E-value", " score", " bias",
                      "  E-value", " score", " bias", "exp",
                      "reg", "clu", " ov", "env", "dom", "rep",
                      "inc", "description of target"))
        header += ("#%*s %*s %*s %*s %9s %6s %5s %9s %6s %5s %5s %3s %3s "
                   "%3s %3s %3s %3s %3s %s\n"
                   % (tnamew - 1, "-------------------",
                      taccw, "----------", qnamew, "--------------------", qaccw,
                      "----------", "---------", "------", "-----", "---------",
                      "------", "-----", "---", "---", "---", "---", "---", "---",
                      "---", "---", "---------------------"))
        # Turn black code style on
        # fmt: on
        return header

    def _build_row(self, qresult):
        """Return a string of one row or more of the QueryResult object (PRIVATE).

        One line is produced per hit; the best-domain columns are taken from
        the first HSP of each hit. A QueryResult without hits gives an empty
        string.
        """
        if not qresult:
            return ""

        # calculate whitespace required
        # adapted from HMMER's source: src/p7_tophits.c#L1083
        qnamew = max(20, len(qresult.id))
        tnamew = max(20, len(qresult[0].id))
        qaccw = max(10, len(qresult.accession))
        taccw = max(10, len(qresult[0].accession))

        row_format = (
            "%-*s %-*s %-*s %-*s %9.2g %6.1f %5.1f %9.2g %6.1f"
            " %5.1f %5.1f %3d %3d %3d %3d %3d %3d %3d %s\n"
        )

        return "".join(
            row_format
            % (
                tnamew,
                hit.id,
                taccw,
                hit.accession,
                qnamew,
                qresult.id,
                qaccw,
                qresult.accession,
                hit.evalue,
                hit.bitscore,
                hit.bias,
                hit.hsps[0].evalue,
                hit.hsps[0].bitscore,
                hit.hsps[0].bias,
                hit.domain_exp_num,
                hit.region_num,
                hit.cluster_num,
                hit.overlap_num,
                hit.env_num,
                hit.domain_obs_num,
                hit.domain_reported_num,
                hit.domain_included_num,
                hit.description,
            )
            for hit in qresult
        )


# when this file is executed as a script rather than imported, run the doctest
if __name__ == "__main__":
    from Bio import _utils

    _utils.run_doctest()