File size: 20,434 Bytes
b7731cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
# Copyright 2012 by Wibowo Arindrarto.  All rights reserved.
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Bio.SearchIO parser for Exonerate plain text output format."""

import re
from itertools import chain


from ._base import (
    _BaseExonerateParser,
    _BaseExonerateIndexer,
    _STRAND_MAP,
    _parse_hit_or_query_line,
)
from .exonerate_vulgar import _RE_VULGAR


__all__ = ("ExonerateTextParser", "ExonerateTextIndexer")


# for capturing sequences in alignment blocks
# e.g. ' 529 : ATCCCTTATCTCTTTATCTTGTA :    472'
_RE_ALN_ROW = re.compile(r"\s*\d+\s+: (.*) :\s+\d+")
# for splitting the line based on intron annotations
# e.g. '  >>>> Target Intron 1 >>>>  ' or 'gt.........................ag'
_RE_EXON = re.compile(
    r"[atgc ]{2}?(?:(?:[<>]+ \w+ Intron \d+ [<>]+)|(?:\.+))[atgc ]{2}?"
)
# captures the intron length
# from e.g. '61 bp // 154295 bp' (joint intron lengths) or '177446 bp'
_RE_EXON_LEN = re.compile(r"(?:(\d+) bp // (\d+) bp)|(?:(\d+) bp)")
# for splitting lines in the NER model
_RE_NER = re.compile(r"--<\s+\d+\s+>--")
# for capturing NER gap lengths
_RE_NER_LEN = re.compile(r"--<\s+(\d+)\s+>--")
# regexes for capturing the letters inside curly braces
# no. of letters is either 1 or 2, since they are split codons
_RE_SCODON_START = re.compile(r"\{(\w{1,2})\}$")
_RE_SCODON_END = re.compile(r"^\{(\w{1,2})\}")


def _flip_codons(codon_seq, target_seq):
    """Flips the codon characters from one seq to another (PRIVATE)."""
    a, b = "", ""
    for char1, char2 in zip(codon_seq, target_seq):
        # no need to do anything if the codon seq line has nothing
        if char1 == " ":
            a += char1
            b += char2
        else:
            a += char2
            b += char1

    return a, b


def _get_block_coords(parsed_seq, row_dict, has_ner=False):
    """Return a list of start, end coordinates for each given block in the sequence (PRIVATE)."""
    start = 0
    coords = []
    if not has_ner:
        splitter = _RE_EXON
    else:
        splitter = _RE_NER

    # use the query line for reference
    seq = parsed_seq[row_dict["query"]]

    for block in re.split(splitter, seq):
        start += seq[start:].find(block)
        end = start + len(block)
        coords.append((start, end))

    return coords


def _get_inter_coords(coords, strand=1):
    """Return list of pairs covering intervening ranges (PRIVATE).

    From the given pairs of coordinates, returns a list of pairs
    covering the intervening ranges.
    """
    # adapted from Python's itertools guide
    # if strand is -1, adjust coords to the ends and starts are chained
    if strand == -1:
        sorted_coords = [(max(a, b), min(a, b)) for a, b in coords]
        inter_coords = list(chain(*sorted_coords))[1:-1]
        return list(zip(inter_coords[1::2], inter_coords[::2]))
    else:
        inter_coords = list(chain(*coords))[1:-1]
        return list(zip(inter_coords[::2], inter_coords[1::2]))


def _stitch_rows(raw_rows):
    """Stitches together the parsed alignment rows and returns them in a list (PRIVATE)."""
    # deal with possible codon surprise!
    # (i.e. alignments with codons using cdna2genome model)
    # by creating additional rows to contain the codons
    try:
        max_len = max(len(x) for x in raw_rows)
        for row in raw_rows:
            assert len(row) == max_len
    except AssertionError:
        for idx, row in enumerate(raw_rows):
            if len(row) != max_len:
                # codons must be present in the query and hit (so +2)
                assert len(row) + 2 == max_len
                # add additional empty lines to contain codons
                raw_rows[idx] = [" " * len(row[0])] + row + [" " * len(row[0])]

    cmbn_rows = []
    for idx, row in enumerate(raw_rows[0]):
        cmbn_row = "".join(aln_row[idx] for aln_row in raw_rows)
        cmbn_rows.append(cmbn_row)

    # the real aligned sequence is always the 'outer' one, so we want
    # to flip them with their 'inner' pairs
    if len(cmbn_rows) == 5:
        # flip query sequence
        cmbn_rows[0], cmbn_rows[1] = _flip_codons(cmbn_rows[0], cmbn_rows[1])
        # flip hit sequence
        cmbn_rows[4], cmbn_rows[3] = _flip_codons(cmbn_rows[4], cmbn_rows[3])

    return cmbn_rows


def _get_row_dict(row_len, model):
    """Return a dictionary of row indices for parsing alignment blocks (PRIVATE)."""
    idx = {}
    # 3 lines, usually in dna vs dna models
    if row_len == 3:
        idx["query"] = 0
        idx["midline"] = 1
        idx["hit"] = 2
        idx["qannot"], idx["hannot"] = None, None
    # 4 lines, in protein vs dna models or dna vs protein models
    # TODO: currently we check this from the model string; is there
    # a better way to do it?
    elif row_len == 4:
        if "protein2" in model:
            idx["query"] = 0
            idx["midline"] = 1
            idx["hit"] = 2
            idx["hannot"] = 3
            idx["qannot"] = None
        elif "2protein" in model:
            idx["query"] = 1
            idx["midline"] = 2
            idx["hit"] = 3
            idx["hannot"] = None
            idx["qannot"] = 0
        else:
            raise ValueError("Unexpected model: " + model)
    # 5 lines, translated dna vs translated dna
    elif row_len == 5:
        # set sequence indexes
        idx["qannot"] = 0
        idx["query"] = 1
        idx["midline"] = 2
        idx["hit"] = 3
        idx["hannot"] = 4
    else:
        raise ValueError("Unexpected row count in alignment block: %i" % row_len)
    return idx


def _get_blocks(rows, coords, idx):
    """Return a list of dictionaries of sequences split by the coordinates (PRIVATE)."""
    for idx_name in ("query", "hit", "midline", "qannot", "hannot"):
        assert idx_name in idx
    blocks = []
    for start, end in coords:
        block = {}
        # get seqs according to index
        block["query"] = rows[idx["query"]][start:end]
        block["hit"] = rows[idx["hit"]][start:end]
        block["similarity"] = rows[idx["midline"]][start:end]
        if idx["qannot"] is not None:
            block["query_annotation"] = rows[idx["qannot"]][start:end]
        if idx["hannot"] is not None:
            block["hit_annotation"] = rows[idx["hannot"]][start:end]
        blocks.append(block)

    return blocks


def _get_scodon_moves(tmp_seq_blocks):
    """Get a dictionary of split codon locations relative to each fragment end (PRIVATE)."""
    scodon_moves = {"query": [], "hit": []}
    for seq_type in scodon_moves:
        scoords = []
        for block in tmp_seq_blocks:
            # check both ends of the sequence for residues in curly braces
            m_start = re.search(_RE_SCODON_START, block[seq_type])
            m_end = re.search(_RE_SCODON_END, block[seq_type])
            if m_start:
                m_start = len(m_start.group(1))
                scoords.append((m_start, 0))
            else:
                scoords.append((0, 0))
            if m_end:
                m_end = len(m_end.group(1))
                scoords.append((0, m_end))
            else:
                scoords.append((0, 0))
        scodon_moves[seq_type] = scoords

    return scodon_moves


def _clean_blocks(tmp_seq_blocks):
    """Remove curly braces (split codon markers) from the given sequences (PRIVATE)."""
    seq_blocks = []
    for seq_block in tmp_seq_blocks:
        for line_name in seq_block:
            seq_block[line_name] = (
                seq_block[line_name].replace("{", "").replace("}", "")
            )
        seq_blocks.append(seq_block)

    return seq_blocks


def _comp_intron_lens(seq_type, inter_blocks, raw_inter_lens):
    """Return the length of introns between fragments (PRIVATE)."""
    # set opposite type, for setting introns
    opp_type = "hit" if seq_type == "query" else "query"
    # list of flags to denote if an intron follows a block
    # it reads e.g. this line:
    # "ATGTT{TT}  >>>> Target Intron 1 >>>>  {G}TGTGTGTACATT"
    # and sets the opposing sequence type's intron (since this
    # line is present on the opposite sequence type line)
    has_intron_after = ["Intron" in x[seq_type] for x in inter_blocks]
    assert len(has_intron_after) == len(raw_inter_lens)
    # create list containing coord adjustments incorporating
    # intron lengths
    inter_lens = []
    for flag, parsed_len in zip(has_intron_after, raw_inter_lens):
        if flag:
            # joint introns
            if all(parsed_len[:2]):
                # intron len is [0] if opp_type is query, otherwise it's [1]
                intron_len = (
                    int(parsed_len[0]) if opp_type == "query" else int(parsed_len[1])
                )
            # single hit/query introns
            elif parsed_len[2]:
                intron_len = int(parsed_len[2])
            else:
                raise ValueError("Unexpected intron parsing result: %r" % parsed_len)
        else:
            intron_len = 0

        inter_lens.append(intron_len)

    return inter_lens


def _comp_coords(hsp, seq_type, inter_lens):
    """Fill the block coordinates of the given hsp dictionary (PRIVATE)."""
    assert seq_type in ("hit", "query")
    # manually fill the first coord
    seq_step = 1 if hsp["%s_strand" % seq_type] >= 0 else -1
    fstart = hsp["%s_start" % seq_type]
    # fend is fstart + number of residues in the sequence, minus gaps
    fend = (
        fstart
        + len(hsp[seq_type][0].replace("-", "").replace(">", "").replace("<", ""))
        * seq_step
    )
    coords = [(fstart, fend)]
    # and start from the second block, after the first inter seq
    for idx, block in enumerate(hsp[seq_type][1:]):
        bstart = coords[-1][1] + inter_lens[idx] * seq_step
        bend = bstart + seq_step * len(block.replace("-", ""))
        coords.append((bstart, bend))

    # adjust the coords so the smallest is [0], if strand is -1
    # couldn't do this in the previous steps since we need the initial
    # block ordering
    if seq_step != 1:
        for idx, coord in enumerate(coords):
            coords[idx] = coords[idx][1], coords[idx][0]

    return coords


def _comp_split_codons(hsp, seq_type, scodon_moves):
    """Compute positions of split codons, store in given HSP dictionary (PRIVATE)."""
    scodons = []
    for idx in range(len(scodon_moves[seq_type])):
        pair = scodon_moves[seq_type][idx]
        if not any(pair):
            continue
        else:
            assert not all(pair)
        a, b = pair
        anchor_pair = hsp["%s_ranges" % seq_type][idx // 2]
        strand = 1 if hsp["%s_strand" % seq_type] >= 0 else -1

        if a:
            func = max if strand == 1 else min
            anchor = func(anchor_pair)
            start_c, end_c = anchor + a * strand * -1, anchor
        elif b:
            func = min if strand == 1 else max
            anchor = func(anchor_pair)
            start_c, end_c = anchor + b * strand, anchor
        scodons.append((min(start_c, end_c), max(start_c, end_c)))

    return scodons


class ExonerateTextParser(_BaseExonerateParser):
    """Parser for Exonerate plain text output."""

    _ALN_MARK = "C4 Alignment:"

    def parse_alignment_block(self, header):
        """Parse alignment block, return query result, hits, hsps."""
        qresult = header["qresult"]
        hit = header["hit"]
        hsp = header["hsp"]
        # check for values that must have been set by previous methods
        for val_name in (
            "query_start",
            "query_end",
            "hit_start",
            "hit_end",
            "query_strand",
            "hit_strand",
        ):
            assert val_name in hsp, hsp

        # get the alignment rows
        # and stitch them so we have the full sequences in single strings
        raw_aln_blocks, vulgar_comp = self._read_alignment()
        # cmbn_rows still has split codon markers (curly braces)
        cmbn_rows = _stitch_rows(raw_aln_blocks)
        row_dict = _get_row_dict(len(cmbn_rows), qresult["model"])
        # get the sequence blocks
        has_ner = "NER" in qresult["model"].upper()
        seq_coords = _get_block_coords(cmbn_rows, row_dict, has_ner)
        tmp_seq_blocks = _get_blocks(cmbn_rows, seq_coords, row_dict)
        # get split codon temp coords for later use
        # this result in pairs of base movement for both ends of each row
        scodon_moves = _get_scodon_moves(tmp_seq_blocks)
        # remove the split codon markers
        seq_blocks = _clean_blocks(tmp_seq_blocks)

        # adjust strands
        hsp["query_strand"] = _STRAND_MAP[hsp["query_strand"]]
        hsp["hit_strand"] = _STRAND_MAP[hsp["hit_strand"]]
        # cast coords into ints
        hsp["query_start"] = int(hsp["query_start"])
        hsp["query_end"] = int(hsp["query_end"])
        hsp["hit_start"] = int(hsp["hit_start"])
        hsp["hit_end"] = int(hsp["hit_end"])
        # cast score into ints
        hsp["score"] = int(hsp["score"])
        # set sequences
        hsp["query"] = [x["query"] for x in seq_blocks]
        hsp["hit"] = [x["hit"] for x in seq_blocks]
        hsp["aln_annotation"] = {}
        # set the molecule type
        # currently only limited to models with protein queries
        if (
            "protein2" in qresult["model"]
            or "coding2" in qresult["model"]
            or "2protein" in qresult["model"]
        ):
            hsp["molecule_type"] = "protein"
        # get the annotations if they exist
        for annot_type in ("similarity", "query_annotation", "hit_annotation"):
            try:
                hsp["aln_annotation"][annot_type] = [x[annot_type] for x in seq_blocks]
            except KeyError:
                pass

        # use vulgar coordinates if vulgar line is present and return
        # if vulgar_comp is not None:
        #    hsp = parse_vulgar_comp(hsp, vulgar_comp)

        #    return {'qresult': qresult, 'hit': hit, 'hsp': hsp}

        # otherwise we need to get the coordinates from the alignment
        # get the intervening blocks first, so we can use them
        # to adjust the coordinates
        if not has_ner:
            # get intervening coordinates and blocks, only if model is not ner
            # ner models have a much more simple coordinate calculation
            inter_coords = _get_inter_coords(seq_coords)
            inter_blocks = _get_blocks(cmbn_rows, inter_coords, row_dict)
            # returns a three-component tuple of intron lengths
            # first two component filled == intron in hit and query
            # last component filled == intron in hit or query
            raw_inter_lens = re.findall(_RE_EXON_LEN, cmbn_rows[row_dict["midline"]])

        # compute start and end coords for each block
        for seq_type in ("query", "hit"):

            # ner blocks and intron blocks require different adjustments
            if not has_ner:
                opp_type = "hit" if seq_type == "query" else "query"
                inter_lens = _comp_intron_lens(seq_type, inter_blocks, raw_inter_lens)
            else:
                # for NER blocks, the length of the inter-fragment gaps is
                # written on the same strand, so opp_type is seq_type
                opp_type = seq_type
                inter_lens = [
                    int(x)
                    for x in re.findall(_RE_NER_LEN, cmbn_rows[row_dict[seq_type]])
                ]

            # check that inter_lens's length is len opp_type block - 1
            if len(inter_lens) != len(hsp[opp_type]) - 1:
                raise ValueError(
                    "Length mismatch: %r vs %r"
                    % (len(inter_lens), len(hsp[opp_type]) - 1)
                )
            # fill the hsp query and hit coordinates
            hsp["%s_ranges" % opp_type] = _comp_coords(hsp, opp_type, inter_lens)
            # and fill the split codon coordinates, if model != ner
            # can't do this in the if-else clause above since we need to
            # compute the ranges first
            if not has_ner:
                hsp["%s_split_codons" % opp_type] = _comp_split_codons(
                    hsp, opp_type, scodon_moves
                )

        # now that we've finished parsing coords, we can set the hit and start
        # coord according to Biopython's convention (start <= end)
        for seq_type in ("query", "hit"):
            if hsp["%s_strand" % seq_type] == -1:
                n_start = "%s_start" % seq_type
                n_end = "%s_end" % seq_type
                hsp[n_start], hsp[n_end] = hsp[n_end], hsp[n_start]

        return {"qresult": qresult, "hit": hit, "hsp": hsp}

    def _read_alignment(self):
        """Read the raw alignment block strings, returns them in a list (PRIVATE)."""
        raw_aln_blocks = []
        # flag to check whether we're in an alignment row
        in_aln_row = False
        # flag for vulgar line, if present, we can parse coordinates from it
        vulgar_comp = None
        while True:

            match = re.search(_RE_ALN_ROW, self.line.strip())
            # if we have a match, set flags and values
            if match and not in_aln_row:
                start_idx = self.line.index(match.group(1))
                row_len = len(match.group(1))
                in_aln_row = True
                raw_aln_block = []
            # if we're in an alignment row, grab the sequence
            if in_aln_row:
                raw_aln_block.append(self.line[start_idx : start_idx + row_len])
            # reset flags and values if the line matches, we're in an alignment
            # row, and there are more than 1 line in rows
            if match and in_aln_row and len(raw_aln_block) > 1:
                raw_aln_blocks.append(raw_aln_block)
                start_idx = None
                row_len = None
                in_aln_row = False

            self.line = self.handle.readline()
            # try to parse vulgar line if present
            if self.line.startswith("vulgar"):
                vulgar = re.search(_RE_VULGAR, self.line)
                vulgar_comp = vulgar.group(10)
            if not self.line or self.line.startswith(self._ALN_MARK):
                # HACK: this is so that the parse_qresult method does not
                # yield the objects before appending the last HSP. We are doing
                # this to keep the parser compatible with outputs without
                # human-readable alignment outputs. This also relies on the
                # fact that repeated readline() always returns '' on EOF.
                if not self.line:
                    self.line = "mock"
                break

        return raw_aln_blocks, vulgar_comp


class ExonerateTextIndexer(_BaseExonerateIndexer):
    """Indexer class for Exonerate plain text."""

    _parser = ExonerateTextParser
    _query_mark = b"C4 Alignment"

    def get_qresult_id(self, pos):
        """Return the query ID from the nearest "Query:" line."""
        handle = self._handle
        handle.seek(pos)
        sentinel = b"Query:"

        while True:
            line = handle.readline().strip()
            if line.startswith(sentinel):
                break
            if not line:
                raise StopIteration
        qid, desc = _parse_hit_or_query_line(line.decode())

        return qid

    def get_raw(self, offset):
        """Return the raw string of a QueryResult object from the given offset."""
        handle = self._handle
        handle.seek(offset)
        qresult_key = None
        qresult_raw = b""

        while True:
            line = handle.readline()
            if not line:
                break
            elif line.startswith(self._query_mark):
                cur_pos = handle.tell()
                if qresult_key is None:
                    qresult_key = self.get_qresult_id(cur_pos)
                else:
                    curr_key = self.get_qresult_id(cur_pos)
                    if curr_key != qresult_key:
                        break
                handle.seek(cur_pos)
            qresult_raw += line

        return qresult_raw


# if not used as a module, run the doctest
if __name__ == "__main__":
    from Bio._utils import run_doctest

    run_doctest()