# Copyright 2022 by Michiel de Hoon. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Bio.Align support for the "sam" pairwise alignment format.

The Sequence Alignment/Map (SAM) format, created by Heng Li and Richard Durbin
at the Wellcome Trust Sanger Institute, stores a series of alignments to the
genome in a single file. Typically they are used for next-generation sequencing
data. SAM files store the alignment positions for mapped sequences, and may
also store the aligned sequences and other information associated with the
sequence.

See http://www.htslib.org/ for more information.

You are expected to use this module via the Bio.Align functions.

Coordinates in the SAM format are defined in terms of one-based start
positions; the parser converts these to zero-based coordinates to be consistent
with Python and other alignment formats.
"""
from itertools import chain
import copy

try:
    import numpy
except ImportError:
    from Bio import MissingPythonDependencyError

    raise MissingPythonDependencyError(
        "Please install numpy if you want to use Bio.Align. "
        "See http://www.numpy.org/"
    ) from None

from Bio.Align import Alignment
from Bio.Align import interfaces
from Bio.Seq import Seq, reverse_complement, UndefinedSequenceError
from Bio.SeqRecord import SeqRecord


# Two-letter @SQ header tags and the SeqRecord annotation key each is stored
# under.  The reader and the writer both use this table so they stay in sync.
_SQ_TAG_TO_ANNOTATION = {
    "AH": "alternate_locus",
    "AN": "names",
    "AS": "assembly",
    "M5": "MD5",
    "SP": "species",
    "TP": "topology",
    "UR": "URI",
}
_SQ_ANNOTATION_TO_TAG = {name: tag for tag, name in _SQ_TAG_TO_ANNOTATION.items()}

# Subtype letters of the "B" (numeric array) optional field type.
_ARRAY_DTYPES = {
    "c": numpy.int8,
    "C": numpy.uint8,
    "s": numpy.int16,
    "S": numpy.uint16,
    "i": numpy.int32,
    "I": numpy.uint32,
    "f": numpy.float32,
}

# CIGAR operation letters recognized by the parser.
_CIGAR_LETTERS = "MIDNSHP=X"


def _array_type_letter(dtype):
    """Return the "B" subtype letter for a numpy dtype, or None if unsupported."""
    if dtype.kind == "i":
        return {1: "c", 2: "s"}.get(dtype.itemsize, "i")
    if dtype.kind == "u":
        return {1: "C", 2: "S"}.get(dtype.itemsize, "I")
    if dtype.kind == "f":
        return "f"
    return None


def _format_optional_field(key, value):
    """Return one optional alignment field formatted as TAG:TYPE:VALUE.

    Raises ValueError if the value has a type that cannot be represented.
    """
    if isinstance(value, (int, numpy.integer)):
        datatype = "i"
        value = str(int(value))
    elif isinstance(value, (float, numpy.floating)):
        datatype = "f"
        value = str(value)
    elif isinstance(value, str):
        datatype = "A" if len(value) == 1 else "Z"
    elif isinstance(value, bytes):
        # byte array as pairs of hexadecimal digits
        datatype = "H"
        value = value.hex().upper()
    elif isinstance(value, numpy.ndarray):
        datatype = "B"
        letter = _array_type_letter(value.dtype)
        if letter is None:
            raise ValueError(
                f"Array of incompatible data type {value.dtype} in annotation '{key}'"
            )
        # subtype letter followed by the comma-separated numbers
        value = ",".join([letter, *map(str, value.flat)])
    else:
        raise ValueError(
            f"Unsupported data type {type(value).__name__} in annotation '{key}'"
        )
    return f"{key}:{datatype}:{value}"


def _parse_optional_value(datatype, value, field):
    """Convert the VALUE string of an optional field according to its TYPE.

    Values of an unrecognized type are returned unchanged as a string.
    """
    if datatype == "i":
        return int(value)
    if datatype == "f":
        return float(value)
    if datatype == "H":
        return bytes.fromhex(value)
    if datatype == "B":
        letter, _, numbers = value.partition(",")
        try:
            dtype = _ARRAY_DTYPES[letter]
        except KeyError:
            raise ValueError(
                f"Unknown number type '{letter}' in tag '{field}'"
            ) from None
        convert = float if letter == "f" else int
        items = [convert(item) for item in numbers.split(",")] if numbers else []
        return numpy.array(items, dtype)
    # "A", "Z": string
    return value


def _parse_sq_fields(fields):
    """Return a SeqRecord built from the TAG:VALUE fields of one @SQ line."""
    rname = None
    length = None
    description = None
    annotations = {}
    for field in fields:
        key, value = field.split(":", 1)
        assert len(key) == 2
        if key == "SN":
            rname = value
        elif key == "LN":
            length = int(value)
        elif key == "DS":
            description = value
        else:
            if key == "AN":
                value = value.split(",")
            elif key == "TP":
                assert value in ("linear", "circular")
            annotations[_SQ_TAG_TO_ANNOTATION.get(key, key)] = value
    if rname is None or length is None:
        raise ValueError("@SQ header line is missing the required SN or LN field")
    # only the length of the reference sequence is known from the header
    sequence = Seq(None, length=length)
    record = SeqRecord(sequence, id=rname, description="", annotations=annotations)
    if description is not None:
        record.description = description
    return record


def _iter_cigar(cigar):
    """Yield (length, letter) for each operation in a CIGAR string.

    A CIGAR of "*" yields nothing.  Characters that are not operation letters
    accumulate into the length, so malformed input raises ValueError when the
    length is converted.
    """
    number = ""
    for letter in cigar:
        if letter in _CIGAR_LETTERS:
            yield int(number), letter
            number = ""
        else:
            number += letter


def _calculate_md(target, query, coordinates, operations):
    """Return the MD tag value for an alignment.

    Arguments:
     - target      - target sequence (sliceable, yielding single letters)
     - query       - query sequence as a string, in the orientation written
                     to the SAM line
     - coordinates - array with one (target, query) coordinate pair per row
     - operations  - CIGAR operations (as byte values) for each step, or None
                     if not available
    """
    steps = coordinates[1:, :]
    if operations is None:
        operations = [None] * len(steps)
    tStart, qStart = coordinates[0, :]
    number = 0  # number of consecutive matching letters seen so far
    parts = []
    for operation, (tEnd, qEnd) in zip(operations, steps):
        tCount = tEnd - tStart
        qCount = qEnd - qStart
        if tCount == 0:
            # insertion to the reference; not represented in the MD tag
            qStart = qEnd
        elif qCount == 0:
            if operation != ord("N"):
                # deletion from the reference (skipped regions are ignored)
                if number:
                    parts.append(str(number))
                    number = 0
                parts.append("^" + str(target[tStart:tEnd]))
            tStart = tEnd
        else:
            # alignment match
            if tCount != qCount:
                raise ValueError("Unequal step sizes in alignment")
            for tc, qc in zip(target[tStart:tEnd], query[qStart:qEnd]):
                if tc == qc:
                    number += 1
                else:
                    parts.append(str(number) + tc)
                    number = 0
            tStart = tEnd
            qStart = qEnd
    if number:
        parts.append(str(number))
    return "".join(parts)


def _apply_md(aligned, md):
    """Return the target letters reconstructed from query letters and MD.

    Arguments:
     - aligned - the query letters in aligned (M, =, X) CIGAR segments
     - md      - value of the MD tag

    Numbers in MD copy that many letters from the query; a letter replaces
    one query letter (mismatch); letters following '^' are reference letters
    deleted from the query and are added without consuming the query.
    """
    seq = aligned
    target = ""
    number = ""
    letters = iter(md)
    for letter in letters:
        if letter.isalpha():
            if number:
                count = int(number)
                target += seq[:count]
                seq = seq[count:]
                number = ""
            target += letter
            seq = seq[1:]
        elif letter == "^":
            if number:
                count = int(number)
                target += seq[:count]
                seq = seq[count:]
                number = ""
            for letter in letters:
                if not letter.isalpha():
                    break
                target += letter
            else:
                # MD ended inside the deletion
                break
            # the character that ended the deletion starts the next number
            number = letter
        else:
            number += letter
    if number:
        target += seq[: int(number)]
    return target


class AlignmentWriter(interfaces.AlignmentWriter):
    """Alignment file writer for the Sequence Alignment/Map (SAM) file format."""

    fmt = "SAM"

    def __init__(self, target, md=False):
        """Create an AlignmentWriter object.

        Arguments:
         - target - output destination; passed unchanged to the constructor
                    of interfaces.AlignmentWriter.
         - md     - If True, calculate the MD tag from the alignment and
                    include it in the output.
                    If False (default), do not include the MD tag in the
                    output.
        """
        super().__init__(target)
        self.md = md

    def write_header(self, alignments):
        """Write the SAM header.

        The @HD line is taken from alignments.metadata["HD"], one @SQ line is
        written for each record in alignments.targets, and all other entries
        in alignments.metadata are written as one header line per row.
        Missing metadata or targets attributes are treated as empty.
        """
        try:
            metadata = alignments.metadata
        except AttributeError:
            metadata = {}
        try:
            targets = alignments.targets
        except AttributeError:
            targets = {}
        values = metadata.get("HD")
        if values is not None:
            # if HD is present, then VN is required and must come first
            fields = ["@HD", "VN:%s" % values["VN"]]
            for key, value in values.items():
                if key == "VN":
                    continue
                fields.append("%s:%s" % (key, value))
            line = "\t".join(fields) + "\n"
            self.stream.write(line)
        for record in targets:
            fields = ["@SQ"]
            fields.append("SN:%s" % record.id)
            length = len(record.seq)
            fields.append("LN:%d" % length)
            for key, value in record.annotations.items():
                if key == "names":
                    value = ",".join(value)
                elif key == "topology":
                    assert value in ("linear", "circular")
                # unknown annotation keys are written under their first two
                # letters
                tag = _SQ_ANNOTATION_TO_TAG.get(key, key[:2])
                fields.append("%s:%s" % (tag, value))
            try:
                description = record.description
            except AttributeError:
                pass
            else:
                if description != "":
                    fields.append("DS:%s" % description)
            line = "\t".join(fields) + "\n"
            self.stream.write(line)
        for tag, rows in metadata.items():
            if tag == "HD":  # already written
                continue
            for row in rows:
                fields = ["@" + tag]
                if isinstance(row, str):
                    # free-text line such as @CO
                    fields.append(row)
                else:
                    for key, value in row.items():
                        fields.append("%s:%s" % (key, value))
                line = "\t".join(fields) + "\n"
                self.stream.write(line)

    def format_alignment(self, alignment, md=None):
        """Return a string with a single alignment formatted as one SAM line.

        Arguments:
         - alignment - the Alignment object to format
         - md        - If True, calculate the MD tag and include it; if None
                       (default), use the md setting of the writer.

        Raises TypeError if alignment is not an Alignment object, and
        ValueError if the alignment cannot be represented (unequal step
        sizes, an unexpected operation, an annotation of unsupported type,
        or an MD tag requested for an undefined query sequence).
        """
        if not isinstance(alignment, Alignment):
            raise TypeError("Expected an Alignment object")
        coordinates = alignment.coordinates.transpose()
        target, query = alignment.sequences
        hard_clip_left = None
        hard_clip_right = None
        try:
            qName = query.id
        except AttributeError:
            qName = "query"
            qual = "*"
        else:
            try:
                hard_clip_left = query.annotations["hard_clip_left"]
            except (AttributeError, KeyError):
                pass
            try:
                hard_clip_right = query.annotations["hard_clip_right"]
            except (AttributeError, KeyError):
                pass
            try:
                qual = query.letter_annotations["phred_quality"]
            except (AttributeError, KeyError):
                qual = "*"
            query = query.seq
        qSize = len(query)
        try:
            rName = target.id
        except AttributeError:
            rName = "target"
        else:
            target = target.seq
        if coordinates[0, 1] < coordinates[-1, 1]:  # mapped to forward strand
            flag = 0
        else:  # mapped to reverse strand
            flag = 16
            query = reverse_complement(query, inplace=False)
            # work on a copy so the coordinates of the alignment are untouched
            coordinates = numpy.array(coordinates)
            coordinates[:, 1] = qSize - coordinates[:, 1]
            hard_clip_left, hard_clip_right = hard_clip_right, hard_clip_left
        try:
            query = bytes(query)
        except TypeError:  # string
            pass
        except UndefinedSequenceError:
            query = "*"
        else:
            query = str(query, "ASCII")
        tStart, qStart = coordinates[0, :]
        pos = tStart
        cigar = []
        if hard_clip_left is not None:
            cigar.append("%dH" % hard_clip_left)
        if qStart > 0:
            cigar.append("%dS" % qStart)
        try:
            operations = alignment.operations
        except AttributeError:
            operations = None
        steps = coordinates[1:, :]
        if operations is None:
            step_operations = [None] * len(steps)
        else:
            step_operations = operations
        qEnd = qStart  # stays here if the alignment has no steps
        for operation, (tEnd, qEnd) in zip(step_operations, steps):
            tCount = tEnd - tStart
            qCount = qEnd - qStart
            if tCount == 0:
                assert operation is None or operation == ord("I")
                cigar.append("%dI" % qCount)  # insertion to the reference
                qStart = qEnd
            elif qCount == 0:
                if operation is None or operation == ord("D"):
                    cigar.append("%dD" % tCount)  # deletion from the reference
                elif operation == ord("N"):
                    cigar.append("%dN" % tCount)  # skipped region from the reference
                else:
                    raise ValueError(f"Unexpected operation {operation}")
                tStart = tEnd
            else:
                if tCount != qCount:
                    raise ValueError("Unequal step sizes in alignment")
                if operation is None:
                    letter = "M"
                else:
                    # keep "=" and "X" as stored by the parser
                    letter = chr(operation)
                    if letter not in "M=X":
                        raise ValueError(f"Unexpected operation {operation}")
                cigar.append("%d%s" % (tCount, letter))
                tStart = tEnd
                qStart = qEnd
        if qEnd < qSize:
            cigar.append("%dS" % (qSize - qEnd))
        if hard_clip_right is not None:
            cigar.append("%dH" % hard_clip_right)
        cigar = "".join(cigar) or "*"
        try:
            mapq = alignment.mapq
        except AttributeError:
            mapq = 255  # not available
        rNext = "*"
        pNext = 0
        tLen = 0
        fields = [
            qName,
            str(flag),
            rName,
            str(pos + 1),  # 1-based coordinates
            str(mapq),
            cigar,
            rNext,
            str(pNext),
            str(tLen),
            query,
            qual,
        ]
        if md is None:
            md = self.md
        if md is True:
            if query == "*":
                raise ValueError("requested MD tag with undefined sequence")
            # calculate the MD tag from the alignment coordinates and sequences
            value = _calculate_md(target, query, coordinates, operations)
            fields.append("MD:Z:%s" % value)
        try:
            score = alignment.score
        except AttributeError:
            pass
        else:
            fields.append("AS:i:%d" % int(round(score)))
        try:
            annotations = alignment.annotations
        except AttributeError:
            pass
        else:
            for key, value in annotations.items():
                fields.append(_format_optional_field(key, value))
        line = "\t".join(fields) + "\n"
        return line


class AlignmentIterator(interfaces.AlignmentIterator):
    """Alignment iterator for Sequence Alignment/Map (SAM) files.

    Each line in the file contains one genomic alignment, which are loaded
    and returned incrementally.  The following columns are stored as attributes
    of the alignment:

      - flag: The FLAG combination of bitwise flags;
      - mapq: Mapping Quality (only stored if available)
      - rnext: Reference sequence name of the primary alignment of the next read
               in the alignment (only stored if available)
      - pnext: Zero-based position of the primary alignment of the next read in
               the template (only stored if available)
      - tlen: signed observed template length (only stored if available)

    Other information associated with the alignment by its tags are stored in
    the annotations attribute of each alignment.

    Any hard clipping (clipped sequences not present in the query sequence)
    are stored as 'hard_clip_left' and 'hard_clip_right' in the annotations
    dictionary attribute of the query sequence record.

    The sequence quality, if available, is stored as 'phred_quality' in the
    letter_annotations dictionary attribute of the query sequence record.
    """

    fmt = "SAM"

    def _read_header(self, stream):
        """Read the header lines (those starting with '@') from the stream.

        Sets self.targets to a list with one SeqRecord per @SQ line and
        self.metadata to a dictionary with the contents of all other header
        lines: a dictionary for @HD, a list of strings for @CO, and a list of
        dictionaries for any other tag.  The first non-header line, if any,
        is kept in self._line for _read_next_alignment.
        """
        self.metadata = {}
        self.targets = []
        for line in stream:
            if not line.startswith("@"):
                self._line = line
                break
            fields = line[1:].strip().split("\t")
            tag = fields[0]
            if tag == "SQ":
                self.targets.append(_parse_sq_fields(fields[1:]))
            elif tag == "CO":
                # one-line text comment; it has no TAG:VALUE structure
                self.metadata.setdefault(tag, []).append("\t".join(fields[1:]))
            else:
                values = {}
                for field in fields[1:]:
                    key, value = field.split(":", 1)
                    assert len(key) == 2
                    values[key] = value
                if tag == "HD":
                    self.metadata[tag] = values
                else:
                    self.metadata.setdefault(tag, []).append(values)
        self._target_indices = {
            record.id: index for index, record in enumerate(self.targets)
        }

    def _read_next_alignment(self, stream):
        """Parse the next line of the stream and return it as an Alignment.

        Returns None if there are no more lines.  Raises ValueError if the
        line has fewer than 11 columns or refers to a target that is missing
        from a non-empty header, and NotImplementedError if the CIGAR string
        contains the padding operator.
        """
        try:
            line = self._line
        except AttributeError:
            lines = stream
        else:
            # first alignment line was already consumed while reading the header
            lines = chain([line], stream)
            del self._line
        for line in lines:
            # SAM columns are tab-delimited; tag values may contain spaces
            fields = line.rstrip("\r\n").split("\t")
            if len(fields) < 11:
                raise ValueError(
                    "line has %d columns; expected at least 11" % len(fields)
                )
            qname = fields[0]
            flag = int(fields[1])
            rname = fields[2]
            target_pos = int(fields[3]) - 1
            mapq = int(fields[4])
            cigar = fields[5]
            rnext = fields[6]
            pnext = int(fields[7]) - 1
            tlen = int(fields[8])
            query = fields[9]
            qual = fields[10]
            md = None
            score = None
            annotations = {}
            for field in fields[11:]:
                tag, datatype, value = field.split(":", 2)
                if tag == "AS":
                    assert datatype == "i"
                    score = int(value)
                elif tag == "MD":
                    assert datatype == "Z"
                    md = value
                else:
                    annotations[tag] = _parse_optional_value(datatype, value, field)
            if flag & 0x10:
                strand = "-"
            else:
                strand = "+"
            hard_clip_left = None
            hard_clip_right = None
            store_operations = False
            query_pos = 0
            operations = bytearray()
            if flag & 0x4:  # unmapped
                target = None
                coordinates = None
            else:
                use_md = md is not None
                coordinates = [[target_pos, query_pos]]
                # The following are only used if the MD tag is present:
                aligned = []  # query letters in aligned segments
                starts = [target_pos]  # start of each known target segment
                sizes = []  # size of each known target segment
                size = 0
                for length, letter in _iter_cigar(cigar):
                    if letter in "M=X":
                        # M: alignment match
                        # =: sequence match
                        # X: sequence mismatch
                        if use_md:
                            aligned.append(query[query_pos : query_pos + length])
                            size += length
                        target_pos += length
                        query_pos += length
                        if letter != "M":
                            store_operations = True
                    elif letter == "I":
                        # I: insertion to the reference
                        query_pos += length
                    elif letter == "S":
                        # S: soft clipping
                        if query_pos == 0:
                            coordinates[0][1] += length
                        query_pos += length
                        continue
                    elif letter == "D":
                        # D: deletion from the reference
                        target_pos += length
                        if use_md:
                            # the deleted letters are provided by the MD tag
                            size += length
                            starts.append(target_pos)
                            sizes.append(size)
                            size = 0
                    elif letter == "N":
                        # N: skipped region from the reference
                        target_pos += length
                        store_operations = True
                        if use_md:
                            starts.append(target_pos)
                            sizes.append(size)
                            size = 0
                    elif letter == "H":
                        # H: hard clipping (clipped sequences not present in
                        # the sequence)
                        if query_pos == 0:
                            hard_clip_left = length
                        else:
                            hard_clip_right = length
                        continue
                    else:
                        # P: padding
                        raise NotImplementedError(
                            "padding operator is not yet implemented"
                        )
                    coordinates.append([target_pos, query_pos])
                    operations.append(ord(letter))
                index = self._target_indices.get(rname)
                if index is None and self.targets:
                    raise ValueError(f"Found target {rname} missing from header")
                if not use_md:
                    if index is None:
                        target = SeqRecord(None, id=rname, description="")
                    else:
                        target = self.targets[index]
                else:
                    sizes.append(size)
                    seq = _apply_md("".join(aligned), md)
                    if index is None:
                        # No header: the target length is unknown, so use the
                        # end of the aligned region as the minimum length.
                        target = SeqRecord(None, id=rname, description="")
                        length = target_pos
                    else:
                        # copy, as the record in the header is shared
                        target = copy.deepcopy(self.targets[index])
                        length = len(target.seq)
                    data = {}
                    offset = 0
                    for start, size in zip(starts, sizes):
                        data[start] = seq[offset : offset + size]
                        offset += size
                    target.seq = Seq(data, length=length)
            if coordinates is not None:
                coordinates = numpy.array(coordinates).transpose()
                if strand == "-":
                    coordinates[1, :] = query_pos - coordinates[1, :]
            if query == "*":
                length = query_pos
                sequence = Seq(None, length=length)
            else:
                sequence = Seq(query)
                if not (flag & 0x4):  # not unmapped
                    assert len(query) == query_pos
                if strand == "-":
                    sequence = sequence.reverse_complement()
            query = SeqRecord(sequence, id=qname, description="")
            if strand == "-":
                hard_clip_left, hard_clip_right = hard_clip_right, hard_clip_left
            if hard_clip_left is not None:
                query.annotations["hard_clip_left"] = hard_clip_left
            if hard_clip_right is not None:
                query.annotations["hard_clip_right"] = hard_clip_right
            if qual != "*":
                query.letter_annotations["phred_quality"] = qual
            records = [target, query]
            alignment = Alignment(records, coordinates)
            alignment.flag = flag
            if mapq != 255:
                alignment.mapq = mapq
            if rnext == "=":
                alignment.rnext = rname
            elif rnext != "*":
                alignment.rnext = rnext
            if pnext >= 0:
                alignment.pnext = pnext
            if tlen != 0:
                alignment.tlen = tlen
            if score is not None:
                alignment.score = score
            if annotations:
                alignment.annotations = annotations
            if hard_clip_left is not None:
                alignment.hard_clip_left = hard_clip_left
            if hard_clip_right is not None:
                alignment.hard_clip_right = hard_clip_right
            if store_operations:
                alignment.operations = operations
            return alignment