File size: 16,530 Bytes
b7731cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
# Copyright 2021 by Michiel de Hoon.  All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Bio.Align support for tabular output from BLAST or FASTA.

This module contains a parser for tabular output from BLAST run with the
'-outfmt 7' argument, as well as tabular output from William Pearson's
FASTA alignment tools using the '-m 8CB' or '-m 8CC' arguments.
"""
import re
import enum
import numpy
from Bio.Align import Alignment
from Bio.Align import interfaces
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord


class State(enum.Enum):
    """Enumerate alignment states needed when parsing a BTOP string."""

    MATCH = enum.auto()
    QUERY_GAP = enum.auto()
    TARGET_GAP = enum.auto()
    NONE = enum.auto()


class AlignmentIterator(interfaces.AlignmentIterator):
    """Alignment iterator for tabular output from BLAST or FASTA.

    For reading (pairwise) alignments from tabular output generated by BLAST
    run with the '-outfmt 7' argument, as well as tabular output generated by
    William Pearson's FASTA alignment programs with the '-m 8CB' or '-m 8CC'
    output formats.
    """

    fmt = "Tabular"

    def _read_header(self, stream):
        try:
            line = next(stream)
        except StopIteration:
            raise ValueError("Empty file.") from None
        if not line.startswith("# "):
            raise ValueError("Missing header.")
        line = line.rstrip()
        self._parse_header(stream, line)

    def _parse_header(self, stream, line):
        metadata = {}
        blast_programs = (
            "BLASTN",
            "BLASTP",
            "BLASTX",
            "TBLASTN",
            "TBLASTX",
            "DELTABLAST",
            "PSIBLAST",
            "RPSBLAST",
            "RPSTBLASTN",
        )
        try:
            program, version = line[2:].split(None, 1)
            if program not in blast_programs:
                raise ValueError("Not a BLAST program")
        except ValueError:
            # FASTA
            metadata["Command line"] = line[2:]
            line = next(stream)
            assert line.startswith("# ")
            metadata["Program"], metadata["Version"] = line[2:].rstrip().split(None, 1)
            self._final_prefix = "# FASTA processed "
        else:
            # BLAST
            metadata["Program"], metadata["Version"] = program, version
            self._final_prefix = "# BLAST processed "
        for line in stream:
            line = line.strip()
            assert line.startswith("# ")
            try:
                prefix, value = line[2:].split(": ")
            except ValueError:
                suffix = " hits found"
                assert line.endswith(suffix)
                hits = int(line[2 : -len(suffix)])
                break
            if prefix == "Query":
                if metadata["Program"] == "FASTA":
                    query_line, query_size = value.rsplit(" - ", 1)
                    query_size, unit = query_size.split()
                    self._query_size = int(query_size)
                    assert unit in ("nt", "aa")
                else:
                    query_line = value
                    self._query_size = None
                try:
                    self._query_id, self._query_description = query_line.split(None, 1)
                except ValueError:
                    self._query_id = query_line.strip()
                    self._query_description = None
            elif prefix == "Database":
                metadata["Database"] = value
            elif prefix == "Fields":
                self._fields = value.split(", ")
            elif prefix == "RID":
                metadata["RID"] = value
        self.metadata = metadata

    def _read_next_alignment(self, stream):
        for line in stream:
            line = line.rstrip()
            if line.startswith("# "):
                if line.startswith(self._final_prefix) and line.endswith(" queries"):
                    del self._fields
                    del self._query_id
                    del self._query_description
                    del self._query_size
                    del self._final_prefix
                    return
                self._parse_header(stream, line)
            else:
                break
        alignment_length = None
        identical = None
        btop = None
        cigar = None
        score = None
        query_id = None
        target_id = None
        query_start = None
        query_end = None
        target_start = None
        target_end = None
        query_sequence = None
        target_sequence = None
        target_length = None
        coordinates = None
        query_size = self._query_size
        columns = line.split("\t")
        assert len(columns) == len(self._fields)
        annotations = {}
        query_annotations = {}
        target_annotations = {}
        for column, field in zip(columns, self._fields):
            if field == "query id":
                query_id = column
                if self._query_id is not None:
                    assert query_id == self._query_id
            elif field == "subject id":
                target_id = column
            elif field == "% identity":
                annotations[field] = float(column)
            elif field == "alignment length":
                alignment_length = int(column)
            elif field == "mismatches":
                annotations[field] = int(column)
            elif field == "gap opens":
                annotations[field] = int(column)
            elif field == "q. start":
                query_start = int(column)
            elif field == "q. end":
                query_end = int(column)
            elif field == "s. start":
                target_start = int(column)
            elif field == "s. end":
                target_end = int(column)
            elif field == "evalue":
                annotations["evalue"] = float(column)
            elif field == "bit score":
                annotations["bit score"] = float(column)
            elif field == "BTOP":
                coordinates = self.parse_btop(column)
            elif field == "aln_code":
                coordinates = self.parse_cigar(column)
            elif field == "query gi":
                query_annotations["gi"] = column
            elif field == "query acc.":
                query_annotations["acc."] = column
            elif field == "query acc.ver":
                query_annotations["acc.ver"] = column
                if query_id is None:
                    query_id = column
            elif field == "query length":
                if query_size is None:
                    query_size = int(column)
                else:
                    assert query_size == int(column)
            elif field == "subject ids":
                target_annotations["ids"] = column
            elif field == "subject gi":
                target_annotations["gi"] = column
            elif field == "subject gis":
                target_annotations["gis"] = column
            elif field == "subject acc.":
                target_annotations["acc."] = column
            elif field == "subject accs.":
                target_annotations["accs."] = column
            elif field == "subject tax ids":
                target_annotations["tax ids"] = column
            elif field == "subject sci names":
                target_annotations["sci names"] = column
            elif field == "subject com names":
                target_annotations["com names"] = column
            elif field == "subject blast names":
                target_annotations["blast names"] = column
            elif field == "subject super kingdoms":
                target_annotations["super kingdoms"] = column
            elif field == "subject title":
                target_annotations["title"] = column
            elif field == "subject titles":
                target_annotations["titles"] = column
            elif field == "subject strand":
                target_annotations["strand"] = column
            elif field == "% subject coverage":
                target_annotations["% coverage"] = float(column)
            elif field == "subject acc.ver":
                target_annotations["acc.ver"] = column
                if target_id is None:
                    target_id = column
            elif field == "subject length":
                target_length = int(column)
            elif field == "query seq":
                query_sequence = column
            elif field == "subject seq":
                target_sequence = column
            elif field == "score":
                score = int(column)
            elif field == "identical":
                identical = int(column)
                annotations[field] = identical
            elif field == "positives":
                annotations[field] = int(column)
            elif field == "gaps":
                annotations[field] = int(column)
            elif field == "% positives":
                annotations[field] = float(column)
            elif field == "% hsp coverage":
                annotations[field] = float(column)
            elif field == "query/sbjct frames":
                annotations[field] = column
            elif field == "query frame":
                query_annotations["frame"] = column
            elif field == "sbjct frame":
                target_annotations["frame"] = column
            else:
                raise ValueError("Unexpected field '%s'" % field)
        program = self.metadata["Program"]
        if coordinates is None:
            if alignment_length is not None:
                annotations["alignment length"] = alignment_length
                # otherwise, get it from alignment.shape
        if query_start is not None and query_end is not None:
            if query_start < query_end:
                query_start -= 1
            else:
                query_end -= 1
        if target_start is not None and target_end is not None:
            if target_start < target_end:
                target_start -= 1
            else:
                target_end -= 1
        if coordinates is None or program in ("BLASTX", "TBLASTX"):
            if query_start is not None:
                query_annotations["start"] = query_start
            if query_end is not None:
                query_annotations["end"] = query_end
        elif coordinates is not None:
            if query_start < query_end:
                coordinates[1, :] += query_start
            else:
                # mapped to reverse strand
                coordinates[1, :] = query_start - coordinates[1, :]
        if coordinates is None or program in ("TBLASTN", "TBLASTX"):
            if target_start is not None:
                target_annotations["start"] = target_start
            if target_end is not None:
                target_annotations["end"] = target_end
        elif coordinates is not None:
            coordinates[0, :] += target_start
        if query_sequence is None:
            if query_size is None:
                query_seq = None
            else:
                query_seq = Seq(None, length=query_size)
        else:
            query_sequence = query_sequence.replace("-", "")
            if program == "TBLASTN":
                assert len(query_sequence) == query_end - query_start
                query_seq = Seq({query_start: query_sequence}, length=query_size)
            elif program == "TBLASTX":
                query_annotations["start"] = query_start
                query_annotations["end"] = query_end
                query_seq = Seq(query_sequence)
            else:
                raise Exception("Unknown program %s" % program)
        query = SeqRecord(query_seq, id=query_id)
        if self._query_description is not None:
            query.description = self._query_description
        if query_annotations:
            query.annotations = query_annotations
        if self.metadata["Program"] in ("TBLASTN", "TBLASTX"):
            target_annotations["length"] = target_length
            if target_sequence is None:
                target_seq = None
            else:
                target_sequence = target_sequence.replace("-", "")
                target_seq = Seq(target_sequence)
        else:
            if target_sequence is None:
                if target_end is None:
                    target_seq = None
                else:
                    target_seq = Seq(None, length=target_end)
            else:
                target_sequence = target_sequence.replace("-", "")
                if target_start is not None and target_end is not None:
                    assert len(target_sequence) == target_end - target_start
                    target_seq = Seq({target_start: target_sequence}, length=target_end)
        target = SeqRecord(target_seq, id=target_id)
        if target_annotations:
            target.annotations = target_annotations
        records = [target, query]
        alignment = Alignment(records, coordinates)
        alignment.annotations = annotations
        if score is not None:
            alignment.score = score
        return alignment

    def parse_btop(self, btop):
        """Parse a BTOP string and return alignment coordinates.

        A BTOP (Blast trace-back operations) string is used by BLAST to
        describe a sequence alignment.
        """
        target_coordinates = []
        query_coordinates = []
        target_coordinates.append(0)
        query_coordinates.append(0)
        state = State.NONE
        tokens = re.findall("([A-Z-*]{2}|\\d+)", btop)
        # each token is now
        # - an integer
        # - a pair of characters, which may include dashes
        for token in tokens:
            if token.startswith("-"):
                if state != State.QUERY_GAP:
                    target_coordinates.append(target_coordinates[-1])
                    query_coordinates.append(query_coordinates[-1])
                    state = State.QUERY_GAP
                target_coordinates[-1] += 1
            elif token.endswith("-"):
                if state != State.TARGET_GAP:
                    target_coordinates.append(target_coordinates[-1])
                    query_coordinates.append(query_coordinates[-1])
                    state = State.TARGET_GAP
                query_coordinates[-1] += 1
            else:
                try:
                    length = int(token)
                except ValueError:
                    # pair of mismatched letters
                    length = 1
                if state == State.MATCH:
                    target_coordinates[-1] += length
                    query_coordinates[-1] += length
                else:
                    target_coordinates.append(target_coordinates[-1] + length)
                    query_coordinates.append(query_coordinates[-1] + length)
                    state = State.MATCH
        coordinates = numpy.array([target_coordinates, query_coordinates])
        return coordinates

    def parse_cigar(self, cigar):
        """Parse a CIGAR string and return alignment coordinates.

        A CIGAR string, as defined by the SAM Sequence Alignment/Map format,
        describes a sequence alignment as a series of lengths and operation
        (alignment/insertion/deletion) codes.
        """
        target_coordinates = []
        query_coordinates = []
        target_coordinate = 0
        query_coordinate = 0
        target_coordinates.append(target_coordinate)
        query_coordinates.append(query_coordinate)
        state = State.NONE
        tokens = re.findall("(M|D|I|\\d+)", cigar)
        # each token is now
        # - the length of the operation
        # - the operation
        for length, operation in zip(tokens[::2], tokens[1::2]):
            length = int(length)
            if operation == "M":
                target_coordinate += length
                query_coordinate += length
            elif operation == "I":
                target_coordinate += length
            elif operation == "D":
                query_coordinate += length
            target_coordinates.append(target_coordinate)
            query_coordinates.append(query_coordinate)
        coordinates = numpy.array([target_coordinates, query_coordinates])
        return coordinates