File size: 13,956 Bytes
b7731cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
# Copyright 2012 by Wibowo Arindrarto.  All rights reserved.
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Bio.SearchIO parser for HMMER domain table output format."""

from itertools import chain

from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment

from .hmmer3_tab import Hmmer3TabParser, Hmmer3TabIndexer

# Names exported by ``from <this module> import *``: the concrete parser,
# indexer and writer classes. The shared base class ``Hmmer3DomtabParser``
# is not listed here.
__all__ = (
    "Hmmer3DomtabHmmhitParser",
    "Hmmer3DomtabHmmqueryParser",
    "Hmmer3DomtabHmmhitIndexer",
    "Hmmer3DomtabHmmqueryIndexer",
    "Hmmer3DomtabHmmhitWriter",
    "Hmmer3DomtabHmmqueryWriter",
)


class Hmmer3DomtabParser(Hmmer3TabParser):
    """Base hmmer3-domtab iterator.

    ``_parse_row`` reads ``self.hmm_as_hit`` to decide whether the
    "hmm coord" columns are stored as hit coordinates (True) or as query
    coordinates (False). This class does not define that attribute itself;
    the ``Hmmer3DomtabHmmhitParser`` and ``Hmmer3DomtabHmmqueryParser``
    subclasses in this module set it.
    """

    def _parse_row(self):
        """Return a dictionary of parsed row values (PRIVATE).

        The current ``self.line`` is split on spaces and its columns are
        distributed over four dictionaries, returned under the keys
        ``"qresult"``, ``"hit"``, ``"hsp"`` and ``"frag"``. All "from"
        coordinates are converted to zero-based starts (value - 1) while the
        "to" coordinates are kept as they are.
        """
        assert self.line
        cols = [x for x in self.line.strip().split(" ") if x]
        # if len(cols) > 23, we have extra description columns
        # combine them all into one string in the 23rd column (index 22)
        if len(cols) > 23:
            cols[22] = " ".join(cols[22:])
        elif len(cols) < 23:
            # a row without a description has 22 columns; pad it with an
            # empty description so that cols[22] exists
            cols.append("")
            assert len(cols) == 23

        # assign parsed column data into qresult, hit, and hsp dicts
        # (key order is kept since the values are later applied in order)
        qresult = {
            "id": cols[3],  # query name
            "accession": cols[4],  # query accession
            "seq_len": int(cols[5]),  # qlen
        }
        hit = {
            "id": cols[0],  # target name
            "accession": cols[1],  # target accession
            "seq_len": int(cols[2]),  # tlen
            "evalue": float(cols[6]),  # evalue
            "bitscore": float(cols[7]),  # score
            "bias": float(cols[8]),  # bias
            "description": cols[22],  # description of target
        }
        # not parsing cols[10] since it's basically len(hit)
        hsp = {
            "domain_index": int(cols[9]),  # # (domain number)
            "evalue_cond": float(cols[11]),  # c-evalue
            "evalue": float(cols[12]),  # i-evalue
            "bitscore": float(cols[13]),  # score
            "bias": float(cols[14]),  # bias
            "env_start": int(cols[19]) - 1,  # env from
            "env_end": int(cols[20]),  # env to
            "acc_avg": float(cols[21]),  # acc
        }
        frag = {
            # strand is always 0, since HMMER now only handles protein
            "hit_strand": 0,
            "query_strand": 0,
            "hit_start": int(cols[15]) - 1,  # hmm from
            "hit_end": int(cols[16]),  # hmm to
            "query_start": int(cols[17]) - 1,  # ali from
            "query_end": int(cols[18]),  # ali to
            # HMMER results are always protein
            "molecule_type": "protein",
        }

        # switch hmm<-->ali coordinates if hmm is not hit
        if not self.hmm_as_hit:
            frag["hit_end"], frag["query_end"] = (frag["query_end"], frag["hit_end"])
            frag["hit_start"], frag["query_start"] = (
                frag["query_start"],
                frag["hit_start"],
            )

        return {"qresult": qresult, "hit": hit, "hsp": hsp, "frag": frag}

    def _parse_qresult(self):
        """Return QueryResult objects (PRIVATE).

        Generator. Starting from the current ``self.line`` it parses one row
        per iteration and reads the next line from ``self.handle``. Objects
        are always built from the *previous* row, once the current row shows
        whether the hit and/or the query changed:

        - every row becomes one HSP holding one HSPFragment,
        - consecutive rows with the same query and hit IDs are grouped into
          one Hit,
        - consecutive Hits with the same query ID are grouped into one
          QueryResult, which is then yielded.

        Parsing stops at the first empty line read (end of file) or at the
        first line starting with ``#``. If no data row is available at all,
        the generator finishes without yielding anything.
        """
        # state values, determines what to do for each line
        state_EOF = 0
        state_QRES_NEW = 1
        state_QRES_SAME = 3
        state_HIT_NEW = 2
        state_HIT_SAME = 4
        # dummies for initial states
        qres_state = None
        hit_state = None
        file_state = None
        # dummies for initial id caches
        prev_qid = None
        prev_hid = None
        # dummies for initial parsed value containers
        cur, prev = None, None
        hit_list, hsp_list = [], []
        cur_qid = None
        cur_hid = None
        while True:
            # store previous line's parsed values, for every line after the 1st
            if cur is not None:
                prev = cur
                prev_qid = cur_qid
                prev_hid = cur_hid
            # only parse the line if it's a data row; an empty string (EOF)
            # or a '#' comment line both mark the end of the data
            if self.line and not self.line.startswith("#"):
                cur = self._parse_row()
                cur_qid = cur["qresult"]["id"]
                cur_hid = cur["hit"]["id"]
            else:
                file_state = state_EOF
                # mock ID values since the line is empty
                cur_qid, cur_hid = None, None
                # no row was ever parsed, so there is nothing to flush;
                # stop here instead of reading the handle forever
                if prev is None:
                    return

            # get the state of hit and qresult
            if prev_qid != cur_qid:
                qres_state = state_QRES_NEW
            else:
                qres_state = state_QRES_SAME
            # new hits are hits with different ids or hits in a new qresult
            if prev_hid != cur_hid or qres_state == state_QRES_NEW:
                hit_state = state_HIT_NEW
            else:
                hit_state = state_HIT_SAME

            # start creating objects after the first line (i.e. prev is filled)
            if prev is not None:
                # each line is basically an HSP with one HSPFragment
                frag = HSPFragment(prev_hid, prev_qid)
                for attr, value in prev["frag"].items():
                    setattr(frag, attr, value)
                hsp = HSP([frag])
                for attr, value in prev["hsp"].items():
                    setattr(hsp, attr, value)
                hsp_list.append(hsp)

                # create hit object when we've finished parsing all its hsps
                # i.e. when hit state is state_HIT_NEW
                if hit_state == state_HIT_NEW:
                    hit = Hit(hsp_list)
                    for attr, value in prev["hit"].items():
                        setattr(hit, attr, value)
                    hit_list.append(hit)
                    hsp_list = []

                # create qresult and yield if we're at a new qresult or EOF
                if qres_state == state_QRES_NEW or file_state == state_EOF:
                    qresult = QueryResult(hit_list, prev_qid)
                    for attr, value in prev["qresult"].items():
                        setattr(qresult, attr, value)
                    yield qresult
                    # if current line is EOF, break
                    if file_state == state_EOF:
                        break
                    hit_list = []

            self.line = self.handle.readline()


class Hmmer3DomtabHmmhitParser(Hmmer3DomtabParser):
    """Domain table parser that treats the HMM profile as the hit.

    With ``hmm_as_hit`` set to True, the "hmm coord" columns of each row
    are stored as hit coordinates and the "ali coord" columns as query
    coordinates.
    """

    # read by Hmmer3DomtabParser._parse_row; True means no coordinate swap
    hmm_as_hit = True


class Hmmer3DomtabHmmqueryParser(Hmmer3DomtabParser):
    """Domain table parser that treats the HMM profile as the query.

    With ``hmm_as_hit`` set to False, the "hmm coord" columns of each row
    are stored as query coordinates and the "ali coord" columns as hit
    coordinates.
    """

    # read by Hmmer3DomtabParser._parse_row; False swaps hit/query coordinates
    hmm_as_hit = False


class Hmmer3DomtabHmmhitIndexer(Hmmer3TabIndexer):
    """Domain table indexer paired with the HMM-as-hit parser.

    Indexer for HMMER domain table output whose records are parsed with
    ``Hmmer3DomtabHmmhitParser``, i.e. HMM profile coordinates are taken
    to be hit coordinates.
    """

    # zero-based column holding the query name in a domain table row
    _query_id_idx = 3
    # parser class associated with this indexer
    _parser = Hmmer3DomtabHmmhitParser


class Hmmer3DomtabHmmqueryIndexer(Hmmer3TabIndexer):
    """Domain table indexer paired with the HMM-as-query parser.

    Indexer for HMMER domain table output whose records are parsed with
    ``Hmmer3DomtabHmmqueryParser``, i.e. HMM profile coordinates are taken
    to be query coordinates.
    """

    # zero-based column holding the query name in a domain table row
    _query_id_idx = 3
    # parser class associated with this indexer
    _parser = Hmmer3DomtabHmmqueryParser


class Hmmer3DomtabHmmhitWriter:
    """HMMER domain table writer using hit coordinates.

    Writer for hmmer3-domtab output format which writes hit coordinates
    as HMM profile coordinates.
    """

    # True: hit coordinates go to the "hmm coord" columns and query
    # coordinates to the "ali coord" columns; False: the other way around
    hmm_as_hit = True

    def __init__(self, handle):
        """Initialize the class.

        ``handle`` is any object with a ``write`` method accepting a string.
        """
        self.handle = handle

    def write_file(self, qresults):
        """Write to the handle.

        ``qresults`` is an iterable (iterator, generator, list, ...) of
        query result objects. A three-line header is always written; empty
        query results are skipped and not counted.

        Returns a tuple of how many QueryResult, Hit, HSP, and HSPFragment
        objects were written.

        """
        handle = self.handle
        qresult_counter, hit_counter, hsp_counter, frag_counter = 0, 0, 0, 0

        # next() needs an iterator; iter() leaves iterators untouched and
        # additionally allows plain sequences to be passed in
        qresults = iter(qresults)
        try:
            first_qresult = next(qresults)
        except StopIteration:
            handle.write(self._build_header())
        else:
            # write header
            handle.write(self._build_header(first_qresult))
            # and then the qresults
            for qresult in chain([first_qresult], qresults):
                if qresult:
                    handle.write(self._build_row(qresult))
                    qresult_counter += 1
                    hit_counter += len(qresult)
                    hsp_counter += sum(len(hit) for hit in qresult)
                    frag_counter += sum(len(hit.fragments) for hit in qresult)

        return qresult_counter, hit_counter, hsp_counter, frag_counter

    def _build_header(self, first_qresult=None):
        """Return the header string of a domain HMMER table output (PRIVATE).

        The header consists of three ``#``-prefixed lines (column groups,
        column names, dashes). Column widths default to 20 for names and 10
        for accessions; the target name width grows with the ID of the first
        hit of ``first_qresult`` when one is given.
        """
        # calculate whitespace required
        # adapted from HMMER's source: src/p7_tophits.c#L1157
        if first_qresult:
            # qnamew = max(20, len(first_qresult.id))
            qnamew = 20
            tnamew = max(20, len(first_qresult[0].id))
            # NOTE(review): this reads `.acc`, while the rest of this module
            # uses `.accession`. For objects without an `acc` attribute the
            # AttributeError branch is taken and both widths stay at 10.
            # Confirm whether that is intended before changing it, since it
            # affects the header layout.
            try:
                qaccw = max(10, len(first_qresult.acc))
                taccw = max(10, len(first_qresult[0].acc))
            except AttributeError:
                qaccw, taccw = 10, 10
        else:
            qnamew, tnamew, qaccw, taccw = 20, 20, 10, 10
        # Turn black code style off
        # fmt: off
        header = ("#%*s %22s %40s %11s %11s %11s\n"
                  % (tnamew + qnamew - 1 + 15 + taccw + qaccw, "", "--- full sequence ---",
                     "-------------- this domain -------------", "hmm coord",
                     "ali coord", "env coord"))
        header += ("#%-*s %-*s %5s %-*s %-*s %5s %9s %6s %5s %3s %3s %9s "
                   "%9s %6s %5s %5s %5s %5s %5s %5s %5s %4s %s\n"
                   % (tnamew - 1,
                      " target name", taccw, "accession", "tlen", qnamew,
                      "query name", qaccw, "accession", "qlen", "E-value", "score",
                      "bias", "#", "of", "c-Evalue", "i-Evalue", "score", "bias",
                      "from", "to", "from", "to", "from", "to", "acc",
                      "description of target"))
        header += ("#%*s %*s %5s %*s %*s %5s %9s %6s %5s %3s %3s %9s %9s "
                   "%6s %5s %5s %5s %5s %5s %5s %5s %4s %s\n"
                   % (tnamew - 1,
                      "-------------------", taccw, "----------", "-----",
                      qnamew, "--------------------", qaccw, "----------",
                      "-----", "---------", "------", "-----", "---", "---",
                      "---------", "---------", "------", "-----", "-----", "-----",
                      "-----", "-----", "-----", "-----", "----",
                      "---------------------"))
        # Turn black code style on
        # fmt: on
        return header

    def _build_row(self, qresult):
        """Return a string of one or more rows for the QueryResult object (PRIVATE).

        One line is produced per HSP of every hit in ``qresult``. Start
        coordinates are written one-based (stored value + 1). A missing
        ``accession`` attribute on the query or on a hit is written as ``-``.
        An empty ``qresult`` gives an empty string.
        """
        # nothing to write (and no first hit to size the columns from)
        if not qresult:
            return ""

        # calculate whitespace required
        # adapted from HMMER's source: src/p7_tophits.c#L1083
        qnamew = max(20, len(qresult.id))
        tnamew = max(20, len(qresult[0].id))
        # look up the two accessions independently, so that a first hit
        # lacking an accession does not discard the query's accession
        qresult_acc = getattr(qresult, "accession", "-")
        qaccw = max(10, len(qresult_acc))
        taccw = max(10, len(getattr(qresult[0], "accession", "-")))

        rows = []
        for hit in qresult:
            # try to get hit accession
            hit_acc = getattr(hit, "accession", "-")

            for hsp in hit.hsps:
                if self.hmm_as_hit:
                    hmm_to = hsp.hit_end
                    hmm_from = hsp.hit_start + 1
                    ali_to = hsp.query_end
                    ali_from = hsp.query_start + 1
                else:
                    hmm_to = hsp.query_end
                    hmm_from = hsp.query_start + 1
                    ali_to = hsp.hit_end
                    ali_from = hsp.hit_start + 1

                rows.append(
                    "%-*s %-*s %5d %-*s %-*s %5d %9.2g %6.1f %5.1f %3d"
                    " %3d %9.2g %9.2g %6.1f %5.1f %5d %5d %5ld %5ld"
                    " %5d %5d %4.2f %s\n"
                    % (
                        tnamew,
                        hit.id,
                        taccw,
                        hit_acc,
                        hit.seq_len,
                        qnamew,
                        qresult.id,
                        qaccw,
                        qresult_acc,
                        qresult.seq_len,
                        hit.evalue,
                        hit.bitscore,
                        hit.bias,
                        hsp.domain_index,
                        len(hit.hsps),
                        hsp.evalue_cond,
                        hsp.evalue,
                        hsp.bitscore,
                        hsp.bias,
                        hmm_from,
                        hmm_to,
                        ali_from,
                        ali_to,
                        hsp.env_start + 1,
                        hsp.env_end,
                        hsp.acc_avg,
                        hit.description,
                    )
                )

        return "".join(rows)


class Hmmer3DomtabHmmqueryWriter(Hmmer3DomtabHmmhitWriter):
    """Domain table writer that treats the HMM profile as the query.

    Same output as ``Hmmer3DomtabHmmhitWriter`` except that query
    coordinates are written to the "hmm coord" columns and hit coordinates
    to the "ali coord" columns.
    """

    # checked in _build_row to pick which coordinates fill the hmm columns
    hmm_as_hit = False


# when executed directly as a script, run the doctests
if __name__ == "__main__":
    from Bio import _utils

    _utils.run_doctest()