# Copyright 2021 by Michiel de Hoon.  All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Bio.Align support for tabular output from BLAST or FASTA.

This module contains a parser for tabular output from BLAST run with the
'-outfmt 7' argument, as well as tabular output from William Pearson's
FASTA alignment tools using the '-m 8CB' or '-m 8CC' arguments.
"""
import re
import enum

import numpy

from Bio.Align import Alignment
from Bio.Align import interfaces
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord


class State(enum.Enum):
    """Enumerate alignment states needed when parsing a BTOP string."""

    MATCH = enum.auto()
    QUERY_GAP = enum.auto()
    TARGET_GAP = enum.auto()
    NONE = enum.auto()


class AlignmentIterator(interfaces.AlignmentIterator):
    """Alignment iterator for tabular output from BLAST or FASTA.

    For reading (pairwise) alignments from tabular output generated by BLAST
    run with the '-outfmt 7' argument, as well as tabular output generated by
    William Pearson's FASTA alignment programs with the '-m 8CB' or '-m 8CC'
    output formats.
    """

    fmt = "Tabular"

    # Columns stored in the alignment annotations under the field name itself,
    # converted to int.
    _INT_ANNOTATION_FIELDS = frozenset(
        ("mismatches", "gap opens", "identical", "positives", "gaps")
    )
    # Columns stored in the alignment annotations under the field name itself,
    # converted to float.
    _FLOAT_ANNOTATION_FIELDS = frozenset(
        ("% identity", "evalue", "bit score", "% positives", "% hsp coverage")
    )
    # Columns stored as text in the query annotations: field name -> key.
    _QUERY_TEXT_FIELDS = {
        "query gi": "gi",
        "query acc.": "acc.",
        "query acc.ver": "acc.ver",
        "query frame": "frame",
    }
    # Columns stored as text in the target annotations: field name -> key.
    _TARGET_TEXT_FIELDS = {
        "subject ids": "ids",
        "subject gi": "gi",
        "subject gis": "gis",
        "subject acc.": "acc.",
        "subject accs.": "accs.",
        "subject tax ids": "tax ids",
        "subject sci names": "sci names",
        "subject com names": "com names",
        "subject blast names": "blast names",
        "subject super kingdoms": "super kingdoms",
        "subject title": "title",
        "subject titles": "titles",
        "subject strand": "strand",
        "subject acc.ver": "acc.ver",
        "sbjct frame": "frame",
    }
    # Attributes holding per-query parser state; discarded at the final line.
    _QUERY_STATE_ATTRIBUTES = (
        "_fields",
        "_query_id",
        "_query_description",
        "_query_size",
        "_final_prefix",
    )

    def _read_header(self, stream):
        """Read the first line of the stream and parse the header it starts.

        Raises ValueError if the stream is empty or if its first line does
        not start with "# ".
        """
        try:
            line = next(stream)
        except StopIteration:
            raise ValueError("Empty file.") from None
        if not line.startswith("# "):
            raise ValueError("Missing header.")
        line = line.rstrip()
        self._parse_header(stream, line)

    def _parse_header(self, stream, line):
        """Parse one block of "# " comment lines preceding the data rows.

        Arguments:
         - stream - iterator over the remaining lines of the file.
         - line   - the first line of the header block, already read from
                    the stream.

        Lines are consumed from the stream up to and including the
        "# N hits found" line. The program name and version, database, and
        RID are stored in self.metadata; the query and field information is
        stored in private attributes used by _read_next_alignment.
        """
        metadata = {}
        blast_programs = (
            "BLASTN",
            "BLASTP",
            "BLASTX",
            "TBLASTN",
            "TBLASTX",
            "DELTABLAST",
            "PSIBLAST",
            "RPSBLAST",
            "RPSTBLASTN",
        )
        try:
            program, version = line[2:].split(None, 1)
            if program not in blast_programs:
                raise ValueError("Not a BLAST program")
        except ValueError:
            # FASTA: the first line is the command line, and the program
            # name and version follow on the next line.
            metadata["Command line"] = line[2:]
            line = next(stream)
            assert line.startswith("# ")
            metadata["Program"], metadata["Version"] = line[2:].rstrip().split(None, 1)
            self._final_prefix = "# FASTA processed "
        else:
            # BLAST
            metadata["Program"], metadata["Version"] = program, version
            self._final_prefix = "# BLAST processed "
        for line in stream:
            line = line.strip()
            assert line.startswith("# ")
            try:
                # Split on the first ": " only, so that values that themselves
                # contain ": " (such as a query description) stay intact.
                prefix, value = line[2:].split(": ", 1)
            except ValueError:
                # No "prefix: value" pair; this must be the line closing the
                # header block.
                suffix = " hits found"
                assert line.endswith(suffix)
                int(line[2 : -len(suffix)])  # check that the hit count is a number
                break
            if prefix == "Query":
                if metadata["Program"] == "FASTA":
                    query_line, query_size = value.rsplit(" - ", 1)
                    query_size, unit = query_size.split()
                    self._query_size = int(query_size)
                    assert unit in ("nt", "aa")
                else:
                    query_line = value
                    self._query_size = None
                try:
                    self._query_id, self._query_description = query_line.split(None, 1)
                except ValueError:
                    self._query_id = query_line.strip()
                    self._query_description = None
            elif prefix == "Database":
                metadata["Database"] = value
            elif prefix == "Fields":
                self._fields = value.split(", ")
            elif prefix == "RID":
                metadata["RID"] = value
        self.metadata = metadata

    def _clear_query_state(self):
        """Delete the per-query parser attributes that are currently set."""
        for name in self._QUERY_STATE_ATTRIBUTES:
            try:
                delattr(self, name)
            except AttributeError:
                # Not every attribute is always set; for example, no
                # "Fields" line was seen if _fields is missing.
                pass

    def _read_next_alignment(self, stream):
        """Read the next data row and return it as an Alignment.

        Header blocks found before the data row are parsed on the way.
        Returns None when the final "processed N queries" line is found, or
        when the stream ends before another data row is found.
        """
        for line in stream:
            line = line.rstrip()
            if not line.startswith("# "):
                break  # found a data row
            if line.startswith(self._final_prefix) and line.endswith(" queries"):
                self._clear_query_state()
                return None
            self._parse_header(stream, line)
        else:
            # The stream ended without another data row.
            return None
        alignment_length = None
        score = None
        query_id = None
        target_id = None
        query_start = None
        query_end = None
        target_start = None
        target_end = None
        query_sequence = None
        target_sequence = None
        target_length = None
        coordinates = None
        query_size = self._query_size
        columns = line.split("\t")
        assert len(columns) == len(self._fields)
        annotations = {}
        query_annotations = {}
        target_annotations = {}
        for column, field in zip(columns, self._fields):
            if field == "query id":
                query_id = column
                if self._query_id is not None:
                    assert query_id == self._query_id
            elif field == "subject id":
                target_id = column
            elif field in self._INT_ANNOTATION_FIELDS:
                annotations[field] = int(column)
            elif field in self._FLOAT_ANNOTATION_FIELDS:
                annotations[field] = float(column)
            elif field == "query/sbjct frames":
                annotations[field] = column
            elif field in self._QUERY_TEXT_FIELDS:
                query_annotations[self._QUERY_TEXT_FIELDS[field]] = column
                if field == "query acc.ver" and query_id is None:
                    # use the versioned accession as the ID if none was found
                    query_id = column
            elif field in self._TARGET_TEXT_FIELDS:
                target_annotations[self._TARGET_TEXT_FIELDS[field]] = column
                if field == "subject acc.ver" and target_id is None:
                    # use the versioned accession as the ID if none was found
                    target_id = column
            elif field == "% subject coverage":
                target_annotations["% coverage"] = float(column)
            elif field == "alignment length":
                alignment_length = int(column)
            elif field == "q. start":
                query_start = int(column)
            elif field == "q. end":
                query_end = int(column)
            elif field == "s. start":
                target_start = int(column)
            elif field == "s. end":
                target_end = int(column)
            elif field == "BTOP":
                coordinates = self.parse_btop(column)
            elif field == "aln_code":
                coordinates = self.parse_cigar(column)
            elif field == "query length":
                if query_size is None:
                    query_size = int(column)
                else:
                    assert query_size == int(column)
            elif field == "subject length":
                target_length = int(column)
            elif field == "query seq":
                query_sequence = column
            elif field == "subject seq":
                target_sequence = column
            elif field == "score":
                score = int(column)
            else:
                raise ValueError("Unexpected field '%s'" % field)
        program = self.metadata["Program"]
        if coordinates is None:
            if alignment_length is not None:
                annotations["alignment length"] = alignment_length
                # otherwise, get it from alignment.shape
        # Decrement the smaller of each start/end pair by one, turning the
        # interval as given in the file into a half-open one.
        if query_start is not None and query_end is not None:
            if query_start < query_end:
                query_start -= 1
            else:
                query_end -= 1
        if target_start is not None and target_end is not None:
            if target_start < target_end:
                target_start -= 1
            else:
                target_end -= 1
        if coordinates is None or program in ("BLASTX", "TBLASTX"):
            if query_start is not None:
                query_annotations["start"] = query_start
            if query_end is not None:
                query_annotations["end"] = query_end
        elif coordinates is not None:
            if query_start < query_end:
                coordinates[1, :] += query_start
            else:
                # mapped to reverse strand
                coordinates[1, :] = query_start - coordinates[1, :]
        if coordinates is None or program in ("TBLASTN", "TBLASTX"):
            if target_start is not None:
                target_annotations["start"] = target_start
            if target_end is not None:
                target_annotations["end"] = target_end
        elif coordinates is not None:
            coordinates[0, :] += target_start
        if query_sequence is None:
            if query_size is None:
                query_seq = None
            else:
                query_seq = Seq(None, length=query_size)
        else:
            query_sequence = query_sequence.replace("-", "")
            if program == "TBLASTN":
                assert len(query_sequence) == query_end - query_start
                query_seq = Seq({query_start: query_sequence}, length=query_size)
            elif program == "TBLASTX":
                query_annotations["start"] = query_start
                query_annotations["end"] = query_end
                query_seq = Seq(query_sequence)
            else:
                raise Exception("Unknown program %s" % program)
        query = SeqRecord(query_seq, id=query_id)
        if self._query_description is not None:
            query.description = self._query_description
        if query_annotations:
            query.annotations = query_annotations
        if program in ("TBLASTN", "TBLASTX"):
            target_annotations["length"] = target_length
            if target_sequence is None:
                target_seq = None
            else:
                target_sequence = target_sequence.replace("-", "")
                target_seq = Seq(target_sequence)
        else:
            if target_sequence is None:
                if target_end is None:
                    target_seq = None
                else:
                    target_seq = Seq(None, length=target_end)
            else:
                target_sequence = target_sequence.replace("-", "")
                if target_start is not None and target_end is not None:
                    assert len(target_sequence) == target_end - target_start
                target_seq = Seq({target_start: target_sequence}, length=target_end)
        target = SeqRecord(target_seq, id=target_id)
        if target_annotations:
            target.annotations = target_annotations
        records = [target, query]
        alignment = Alignment(records, coordinates)
        alignment.annotations = annotations
        if score is not None:
            alignment.score = score
        return alignment

    def parse_btop(self, btop):
        """Parse a BTOP string and return alignment coordinates.

        A BTOP (Blast trace-back operations) string is used by BLAST to
        describe a sequence alignment.

        Returns a numpy array with two rows, holding the target coordinates
        and the query coordinates respectively; both start at 0.
        """
        target_coordinates = [0]
        query_coordinates = [0]
        state = State.NONE
        tokens = re.findall("([A-Z-*]{2}|\\d+)", btop)
        # each token is now
        # - an integer
        # - a pair of characters, which may include dashes
        for token in tokens:
            if token.startswith("-"):
                # dash in the first position: only the target advances
                if state != State.QUERY_GAP:
                    target_coordinates.append(target_coordinates[-1])
                    query_coordinates.append(query_coordinates[-1])
                    state = State.QUERY_GAP
                target_coordinates[-1] += 1
            elif token.endswith("-"):
                # dash in the second position: only the query advances
                if state != State.TARGET_GAP:
                    target_coordinates.append(target_coordinates[-1])
                    query_coordinates.append(query_coordinates[-1])
                    state = State.TARGET_GAP
                query_coordinates[-1] += 1
            else:
                try:
                    length = int(token)
                except ValueError:
                    # pair of mismatched letters
                    length = 1
                if state == State.MATCH:
                    # extend the current aligned segment
                    target_coordinates[-1] += length
                    query_coordinates[-1] += length
                else:
                    target_coordinates.append(target_coordinates[-1] + length)
                    query_coordinates.append(query_coordinates[-1] + length)
                    state = State.MATCH
        coordinates = numpy.array([target_coordinates, query_coordinates])
        return coordinates

    def parse_cigar(self, cigar):
        """Parse a CIGAR string and return alignment coordinates.

        A CIGAR string, as defined by the SAM Sequence Alignment/Map format,
        describes a sequence alignment as a series of lengths and operation
        (alignment/insertion/deletion) codes.

        Returns a numpy array with two rows, holding the target coordinates
        and the query coordinates respectively; both start at 0, and one
        column is added for each operation in the string.
        """
        target_coordinate = 0
        query_coordinate = 0
        target_coordinates = [target_coordinate]
        query_coordinates = [query_coordinate]
        tokens = re.findall("(M|D|I|\\d+)", cigar)
        # each token is now
        # - the length of the operation
        # - the operation
        for length, operation in zip(tokens[::2], tokens[1::2]):
            length = int(length)
            if operation == "M":
                target_coordinate += length
                query_coordinate += length
            elif operation == "I":
                target_coordinate += length
            elif operation == "D":
                query_coordinate += length
            target_coordinates.append(target_coordinate)
            query_coordinates.append(query_coordinate)
        coordinates = numpy.array([target_coordinates, query_coordinates])
        return coordinates