File size: 10,247 Bytes
b7731cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
# Copyright 2008-2016 by Peter Cock.  All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Bio.Align support for "emboss" alignment output from EMBOSS tools.

This module contains a parser for the EMBOSS srspair/pair/simple file format,
for example from the needle, water, and stretcher tools.
"""
from Bio.Align import Alignment
from Bio.Align import interfaces
from Bio.Seq import Seq, reverse_complement
from Bio.SeqRecord import SeqRecord


class AlignmentIterator(interfaces.AlignmentIterator):
    """Emboss alignment iterator.

    For reading the (pairwise) alignments from EMBOSS tools in what they
    call the "pairs" and "simple" formats.

    The file-level header is parsed by ``_read_header`` and stored in
    ``self.metadata``; each call to ``_read_next_alignment`` consumes one
    alignment (its own metadata block followed by the aligned sequence
    blocks) from the stream.
    """

    fmt = "EMBOSS"

    def _read_header(self, stream):
        """Parse the file header and store its fields in ``self.metadata``.

        The header is expected to start and end with a line consisting of 40
        ``#`` characters. In between, ``# key: value`` lines are read; the
        keys ``Program``, ``Rundate``, ``Report_file`` and ``Align_format``
        are stored under the same name, and ``Commandline`` (together with
        its ``#    `` continuation lines) is stored under ``"Command line"``.
        Other keys are ignored.

        Raises ValueError if the stream is empty or if a header line does not
        have the expected form.
        """
        try:
            line = next(stream)
        except StopIteration:
            raise ValueError("Empty file.") from None
        if line.rstrip() != "########################################":
            raise ValueError("Unexpected line: %s" % line)

        # assume srspair format (default) if not specified explicitly in
        # the output file
        self.metadata = {}
        self.metadata["Align_format"] = "srspair"
        # Holds a command line that may still be extended by continuation
        # lines; None when no command line is pending.
        commandline = None
        for line in stream:
            if line.rstrip() == "########################################":
                break
            if not line.startswith("# "):
                raise ValueError("Unexpected line: %s" % line)
            if commandline is not None:
                if line.startswith("#    "):
                    # continuation line of a multi-line command line
                    commandline += " " + line[1:].strip()
                    continue
                # a new "key: value" line ends the pending command line
                self.metadata["Command line"] = commandline
                commandline = None
            key, value = line[2:].split(":", 1)
            if key == "Program":
                self.metadata["Program"] = value.strip()
            elif key == "Rundate":
                self.metadata["Rundate"] = value.strip()
            elif key == "Report_file":
                self.metadata["Report_file"] = value.strip()
            elif key == "Align_format":
                self.metadata["Align_format"] = value.strip()
            elif key == "Commandline":
                commandline = value.strip()
        if commandline is not None:
            # The command line was the last header field, so no subsequent
            # "key: value" line flushed it; store it now so it is not lost.
            self.metadata["Command line"] = commandline

    def _read_next_alignment(self, stream):
        """Read one alignment from the stream and return it.

        First skips to the ``#===...`` line that opens the per-alignment
        metadata block, then parses that block (sequence identifiers,
        alignment length, and annotations such as ``Matrix``, ``Identity``
        and ``Score``), and finally reads the aligned sequence blocks until
        the number of columns announced by ``Length`` has been consumed.

        Returns an Alignment object, or None if the stream is exhausted
        before the closing line of an alignment metadata block is found.

        Raises ValueError on lines that do not have the expected form, or if
        the metadata block lacks the sequence identifiers or the alignment
        length.
        """
        number_of_sequences = None
        annotations = {}
        for line in stream:
            line = line.rstrip("\r\n")
            if not line:
                continue
            elif line.startswith("#---------------------------------------"):
                # may appear between alignments
                continue
            elif line.startswith("#======================================="):
                # found the alignment metadata start
                identifiers = []
                ncols = None
                sequences = None
                break
            else:
                raise ValueError("Unexpected line: %s" % line)
        for line in stream:
            line = line.rstrip("\r\n")
            if line == "#=======================================":
                # reached the end of alignment metadata
                break
            elif line.strip() == "#":
                continue
            elif not line.startswith("# "):
                raise ValueError("Unexpected line: %s" % line)
            try:
                key, value = line[2:].split(":", 1)
            except ValueError:
                # An equal sign is used for Longest_Identity,
                # Longest_Similarity, Shortest_Identity, and
                # Shortest_Similarity, which are included if command line
                # argument -nobrief was used.
                key, value = line[2:].split(" = ", 1)
            if key == "Aligned_sequences":
                number_of_sequences = int(value.strip())
                assert len(identifiers) == 0
                # Should now expect the record identifiers...
                for i, line in enumerate(stream):
                    if not line.startswith("# "):
                        raise ValueError("Unexpected line: %s" % line)
                    number, identifier = line[2:].split(":")
                    # identifier lines are numbered from 1, in order
                    assert i + 1 == int(number)
                    identifiers.append(identifier.strip())
                    if len(identifiers) == number_of_sequences:
                        break
            elif key == "Matrix":
                annotations[key] = value.strip()
            elif key == "Gap_penalty":
                annotations[key] = float(value.strip())
            elif key == "Extend_penalty":
                annotations[key] = float(value.strip())
            elif key == "Length":
                ncols = int(value.strip())
            elif key == "Identity":
                # keep only the count before the "/"
                annotations[key] = int(value.strip().split("/")[0])
            elif key == "Similarity":
                annotations[key] = int(value.strip().split("/")[0])
            elif key == "Gaps":
                annotations[key] = int(value.strip().split("/")[0])
            elif key == "Score":
                annotations[key] = float(value.strip())
            # TODO:
            # The following are generated if the -nobrief command line
            # argument used. We could simply calculate them from the
            # alignment, but then we have to define what we mean by
            # "similar". For now, simply store them as an annotation.
            elif key == "Longest_Identity":
                annotations[key] = value.strip()
            elif key == "Longest_Similarity":
                annotations[key] = value.strip()
            elif key == "Shortest_Identity":
                annotations[key] = value.strip()
            elif key == "Shortest_Similarity":
                annotations[key] = value.strip()
            else:
                raise ValueError("Failed to parse line '%s'" % line)
        else:
            # stream ran out before the metadata block was closed:
            # there is no (further) alignment to return
            return
        if len(identifiers) == 0:
            raise ValueError("Number of sequences missing!")
        if ncols is None:
            raise ValueError("Length of alignment missing!")
        # Per-sequence accumulators: ungapped text, gapped text, and the
        # first start / most recent end coordinate seen in the file.
        sequences = [""] * number_of_sequences
        aligned_sequences = [""] * number_of_sequences
        consensus = ""
        starts = [0] * number_of_sequences
        ends = [0] * number_of_sequences
        column = 0  # number of alignment columns read so far
        index = 0  # which sequence the next sequence line belongs to
        for line in stream:
            line = line.rstrip("\r\n")
            # parse the sequences
            if not line:
                # empty line
                if index == number_of_sequences:
                    # reached the end of an alignment block
                    if column == ncols:
                        # reached the end of the sequences
                        break
                    index = 0
                continue
            # The first 21 characters hold the identifier and start position
            # on sequence lines, and are blank on the match line.
            prefix = line[:21].strip()
            if prefix == "":
                # match line
                consensus += line[21:71]
            else:
                identifier, start = prefix.split(None, 1)
                # the identifier may be truncated in the alignment block
                assert identifiers[index].startswith(identifier)
                aligned_sequence, end = line[21:].split(None, 1)
                start = int(start)
                end = int(end)
                length = len(sequences[index])
                sequence = aligned_sequence.replace("-", "")
                if length == 0 and len(sequence) > 0:
                    # first line contributing letters for this sequence
                    if start < end:
                        start -= 1  # Python counting
                        assert end == start + len(sequence)
                    else:
                        end -= 1  # Python counting
                        assert end == start - len(sequence)
                    # Record the start
                    starts[index] = start
                else:
                    if starts[index] <= ends[index]:
                        # forward strand
                        if (
                            self.metadata["Align_format"] == "srspair"
                            and len(sequence) == 0
                        ):
                            # gap-only line: coordinates stay at the
                            # previous end
                            assert start == ends[index]
                            assert end == start
                        else:
                            start -= 1
                            assert end == start + len(sequence)
                    else:
                        # reverse strand
                        if (
                            self.metadata["Align_format"] == "srspair"
                            and len(sequence) == 0
                        ):
                            assert start - 1 == ends[index]
                            assert end == start
                        else:
                            end -= 1
                            assert end == start - len(sequence)
                # Record the end
                ends[index] = end
                sequences[index] += sequence
                aligned_sequences[index] += aligned_sequence
                if index == 0:
                    # the first sequence drives the column count
                    column += len(aligned_sequence)
                else:
                    assert column == len(aligned_sequences[index])
                index += 1
        coordinates = Alignment.infer_coordinates(aligned_sequences)
        records = []
        n = len(sequences)
        for i in range(n):
            start = starts[i]
            end = ends[i]
            if start < end:
                # forward strand: shift coordinates by the start offset
                coordinates[i, :] += start
                data = sequences[i]
            else:
                # reverse strand: order the bounds, mirror the coordinates
                # and store the reverse complement of the letters read
                start, end = end, start
                coordinates[i, :] = end - coordinates[i, :]
                data = reverse_complement(sequences[i])
            if start == 0:
                sequence = Seq(data)
            else:
                # create a partially defined sequence
                sequence = Seq({start: data}, length=end)
            record = SeqRecord(sequence, identifiers[i])
            records.append(record)
        alignment = Alignment(records, coordinates)
        if annotations:
            alignment.annotations = annotations
        if consensus:
            alignment.column_annotations = {"emboss_consensus": consensus}
        return alignment