# Copyright 2022 by Michiel de Hoon. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Bio.Align support for alignment files in the bigBed format.

The bigBed format stores a series of pairwise alignments in a single indexed
binary file. Typically they are used for transcript to genome alignments. As
in the BED format, the alignment positions and alignment scores are stored,
but the aligned sequences are not.

See http://genome.ucsc.edu/goldenPath/help/bigBed.html for more information.

You are expected to use this module via the Bio.Align functions.
"""

# This parser was written based on the description of the bigBed file format in
# W. J. Kent, A. S. Zweig, G. Barber, A. S. Hinrichs, and D. Karolchik:
# "BigWig and BigBed: enabling browsing of large distributed datasets."
# Bioinformatics 26(17): 2204–2207 (2010)
# in particular the tables in the supplemental materials listing the contents
# of a bigBed file byte-by-byte.
import numpy
import struct
import zlib
from collections import namedtuple

from Bio.Align import Alignment
from Bio.Align import interfaces
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord


# One column of an autoSql declaration: its declared type, its name, and the
# free-text comment attached to it.
Field = namedtuple("Field", ("type", "name", "comment"))


class AutoSQLTable:
    """AutoSQL table describing the columns of an (possibly extended) BED format."""

    def __init__(self, text=None):
        """Create an AutoSQL table describing the columns of an (extended) BED format.

        Arguments:
         - text - the autoSql declaration as a NULL-terminated string, or
           None (default) to create an empty table.
        """
        if text is None:
            self.name = None
            self.comment = None
            self.fields = []
            return
        assert text.endswith(chr(0))  # NULL-terminated string
        word, text = text[:-1].split(None, 1)
        assert word == "table"
        name, text = text.split(None, 1)
        assert len(name.split()) == 1
        self.name = name
        # The table comment is the first double-quoted string after the name.
        assert text.startswith('"')
        i = text.find('"', 1)
        self.comment = text[1:i]
        text = text[i + 1 :].strip()
        # The field definitions are enclosed in parentheses.
        assert text.startswith("(")
        assert text.endswith(")")
        text = text[1:-1].strip()
        fields = []
        while text:
            # Each field definition looks like:  type name ; "comment"
            i = text.index('"')
            j = text.index('"', i + 1)
            field_comment = text[i + 1 : j]
            definition = text[:i].strip()
            assert definition.endswith(";")
            field_type, field_name = definition[:-1].rsplit(None, 1)
            # Strip a fixed-size array suffix such as "[12]" before
            # validating the base data type.
            if field_type.endswith("]"):
                i = field_type.index("[")
                data_type = field_type[:i]
            else:
                data_type = field_type
            assert data_type in (
                "int",
                "uint",
                "short",
                "ushort",
                "byte",
                "ubyte",
                "float",
                "char",
                "string",
                "lstring",
            )
            fields.append(Field(field_type, field_name, field_comment))
            text = text[j + 1 :].strip()
        self.fields = fields

    def __str__(self):
        """Return the autoSql declaration as a formatted multi-line string."""
        type_width = max(len(str(field.type)) for field in self.fields)
        name_width = max(len(field.name) for field in self.fields) + 1
        lines = []
        lines.append("table %s\n" % self.name)
        lines.append('"%s"\n' % self.comment)
        lines.append("(\n")
        for field in self.fields:
            name = field.name + ";"
            lines.append(
                ' %s %s "%s"\n'
                % (field.type.ljust(type_width), name.ljust(name_width), field.comment)
            )
        lines.append(")\n")
        return "".join(lines)


class AlignmentIterator(interfaces.AlignmentIterator):
    """Alignment iterator for bigBed files.

    The pairwise alignments stored in the bigBed file are loaded and returned
    incrementally.  Additional alignment information is stored as attributes
    of each alignment.
    """

    fmt = "bigBed"
    mode = "b"  # bigBed files are binary

    @property
    def _byteorder_char(self):
        """Return the struct byte-order prefix ('<' or '>') for this file."""
        byteorder = self.byteorder
        if byteorder == "little":
            return "<"
        if byteorder == "big":
            return ">"
        raise ValueError("Unexpected byteorder '%s'" % byteorder)

    def _read_header(self, stream):
        """Read the common header and initialize the parser state.

        Sets self.byteorder, self.declaration (the autoSql table),
        self._length (number of alignments), self._compressed,
        self.targets (one SeqRecord per chromosome, indexed by chromId),
        self.tree (the R-tree index), and self._data (a generator over all
        data rows in file order).

        Raises ValueError if the stream is not a bigBed file.
        """
        # Supplemental Table 5: Common header
        # magic                 4 bytes
        # version               2 bytes
        # zoomLevels            2 bytes
        # chromosomeTreeOffset  8 bytes
        # fullDataOffset        8 bytes; points to dataCount
        # fullIndexOffset       8 bytes
        # fieldCount            2 bytes
        # definedFieldCount     2 bytes
        # autoSqlOffset         8 bytes
        # totalSummaryOffset    8 bytes
        # uncompressBufSize     4 bytes
        # reserved              8 bytes
        signature = 0x8789F2EB
        magic = stream.read(4)
        # The magic number also reveals the byte order of the file.
        for byteorder in ("little", "big"):
            if int.from_bytes(magic, byteorder=byteorder) == signature:
                break
        else:
            raise ValueError("not a bigBed file")
        self.byteorder = byteorder
        byteorder_char = self._byteorder_char
        (
            version,
            zoomLevels,
            chromosomeTreeOffset,
            fullDataOffset,
            fullIndexOffset,
            fieldCount,
            definedFieldCount,
            autoSqlOffset,
            totalSummaryOffset,
            uncompressBufSize,
        ) = struct.unpack(byteorder_char + "hhqqqhhqqixxxxxxxx", stream.read(60))
        # The autoSql declaration occupies the bytes between autoSqlOffset
        # and totalSummaryOffset.
        autoSqlSize = totalSummaryOffset - autoSqlOffset
        self.declaration = self._read_autosql(
            stream, autoSqlOffset, autoSqlSize, fieldCount, definedFieldCount
        )
        stream.seek(fullDataOffset)
        dataCount = int.from_bytes(stream.read(8), byteorder=byteorder)
        self._length = dataCount
        # uncompressBufSize == 0 means the data blocks are not compressed.
        self._compressed = uncompressBufSize > 0
        self.targets = self._read_chromosomes(stream, chromosomeTreeOffset)
        self.tree = self._read_index(stream, fullIndexOffset)
        self._data = self._iterate_index(stream)

    def _read_autosql(self, stream, pos, size, fieldCount, definedFieldCount):
        """Read and parse the autoSql declaration; return an AutoSQLTable.

        Also sets self.bedN (number of standard BED columns) and, via
        _analyze_fields, self._custom_fields for any extra columns.
        Raises ValueError if definedFieldCount is outside the BED3..BED12 range.
        """
        if definedFieldCount < 3 or definedFieldCount > 12:
            raise ValueError(
                "expected between 3 and 12 columns, found %d" % definedFieldCount
            )
        self.bedN = definedFieldCount
        stream.seek(pos)
        data = stream.read(size)
        declaration = AutoSQLTable(data.decode())
        self._analyze_fields(declaration.fields, fieldCount, definedFieldCount)
        return declaration

    def _analyze_fields(self, fields, fieldCount, definedFieldCount):
        """Validate the standard BED column names and set up custom-field converters.

        The first definedFieldCount columns must carry the standard BED
        names; any additional columns get a (name, converter) pair appended
        to self._custom_fields, chosen from the declared autoSql type.
        """
        names = (
            "chrom",
            "chromStart",
            "chromEnd",
            "name",
            "score",
            "strand",
            "thickStart",
            "thickEnd",
            "reserved",
            "blockCount",
            "blockSizes",
            "chromStarts",
        )
        for i in range(self.bedN):
            name = fields[i].name
            if name != names[i]:
                raise ValueError(
                    "Expected field name '%s'; found '%s'" % (names[i], name)
                )
        if fieldCount > definedFieldCount:
            self._custom_fields = []
            for i in range(definedFieldCount, fieldCount):
                field_name = fields[i].name
                field_type = fields[i].type
                # An array type such as "int[4]" converts each
                # comma-separated element individually.
                if "[" in field_type and "]" in field_type:
                    make_array = True
                    field_type, _ = field_type.split("[")
                    field_type = field_type.strip()
                else:
                    make_array = False
                if field_type in ("int", "uint", "short", "ushort"):
                    converter = int
                elif field_type in ("byte", "ubyte"):
                    # NOTE(review): bytes(str) raises TypeError at conversion
                    # time; kept as in the original — confirm intended
                    # handling of byte-typed custom fields.
                    converter = bytes
                elif field_type == "float":
                    converter = float
                elif field_type in ("char", "string", "lstring"):
                    # (original tuple also listed "float", which was dead:
                    # the preceding branch already handled it)
                    converter = str
                else:
                    raise Exception("Unknown field type %s" % field_type)
                if make_array:
                    item_converter = converter

                    # Bind item_converter as a default argument so each
                    # array field keeps its own element converter.
                    def converter(data, item_converter=item_converter):
                        values = data.rstrip(",").split(",")
                        return [item_converter(value) for value in values]

                self._custom_fields.append([field_name, converter])

    def _read_chromosomes(self, stream, pos):
        """Read the chromosome B+ tree; return a list of SeqRecords by chromId.

        Each SeqRecord has the chromosome name as its id and an undefined
        sequence of the chromosome's length.
        """
        byteorder = self.byteorder
        byteorder_char = self._byteorder_char
        # Supplemental Table 8: Chromosome B+ tree header
        # magic      4 bytes
        # blockSize  4 bytes
        # keySize    4 bytes
        # valSize    4 bytes
        # itemCount  8 bytes
        # reserved   8 bytes
        stream.seek(pos)
        signature = 0x78CA8C91
        magic = int.from_bytes(stream.read(4), byteorder=byteorder)
        assert magic == signature
        blockSize, keySize, valSize, itemCount = struct.unpack(
            byteorder_char + "iiiqxxxxxxxx", stream.read(28)
        )
        assert valSize == 8
        # NOTE(review): from here to the end of this method the original
        # source text was corrupted in transit; the traversal below was
        # reconstructed from Supplemental Tables 9-11 of the bigBed paper.
        # Depth-first walk of the B+ tree using an explicit stack of node
        # offsets; leaves carry (name, chromId, chromSize) records.
        records = {}
        offsets = [stream.tell()]  # the root node follows the header
        while offsets:
            stream.seek(offsets.pop())
            # Supplemental Table 9: Chromosome B+ tree node
            # isLeaf    1 byte
            # reserved  1 byte
            # count     2 bytes
            isLeaf, count = struct.unpack(byteorder_char + "?xh", stream.read(4))
            if isLeaf:
                for _ in range(count):
                    # Supplemental Table 10: Chromosome B+ tree leaf item
                    # key        keySize bytes
                    # chromId    4 bytes
                    # chromSize  4 bytes
                    key = stream.read(keySize)
                    name = key.rstrip(b"\x00").decode()
                    chromId, chromSize = struct.unpack(
                        byteorder_char + "II", stream.read(8)
                    )
                    sequence = Seq(None, length=chromSize)
                    records[chromId] = SeqRecord(sequence, id=name)
            else:
                children = []
                for _ in range(count):
                    # Supplemental Table 11: Chromosome B+ tree non-leaf item
                    # key          keySize bytes (first key in child; unused)
                    # childOffset  8 bytes
                    stream.read(keySize)
                    (childOffset,) = struct.unpack(
                        byteorder_char + "q", stream.read(8)
                    )
                    children.append(childOffset)
                # Reverse so the stack pops children in file order.
                offsets.extend(reversed(children))
        assert len(records) == itemCount
        # Index the records by chromId so self.targets[chromId] works.
        return [records[i] for i in range(itemCount)]

    def _read_index(self, stream, pos):
        """Read the R-tree data index; return its root node.

        Internal nodes are namedtuples with a ``children`` list; leaves lack
        ``children`` and instead carry ``dataOffset``/``dataSize`` of a data
        block.  Both carry their bounding (chromIx, base) interval and a
        ``parent`` link, as required by _iterate_index and _search_index.
        """
        byteorder = self.byteorder
        byteorder_char = self._byteorder_char
        # NOTE(review): this method was destroyed in the corrupted span of
        # the original source; reconstructed from Supplemental Tables 14-16
        # of the bigBed paper.  Verify against upstream before release.
        # Supplemental Table 14: R tree index header
        # magic          4 bytes
        # blockSize      4 bytes
        # itemCount      8 bytes
        # startChromIx   4 bytes
        # startBase      4 bytes
        # endChromIx     4 bytes
        # endBase        4 bytes
        # endFileOffset  8 bytes
        # itemsPerSlot   4 bytes
        # reserved       4 bytes
        stream.seek(pos)
        signature = 0x2468ACE0
        magic = int.from_bytes(stream.read(4), byteorder=byteorder)
        assert magic == signature
        (
            blockSize,
            itemCount,
            startChromIx,
            startBase,
            endChromIx,
            endBase,
            endFileOffset,
            itemsPerSlot,
        ) = struct.unpack(byteorder_char + "iqiiiiqixxxx", stream.read(44))
        Node = namedtuple(
            "Node",
            ("parent", "startChromIx", "startBase", "endChromIx", "endBase", "children"),
        )
        Leaf = namedtuple(
            "Leaf",
            (
                "parent",
                "startChromIx",
                "startBase",
                "endChromIx",
                "endBase",
                "dataOffset",
                "dataSize",
            ),
        )

        def read_node(parent, startChromIx, startBase, endChromIx, endBase):
            # Supplemental Table 15: R tree node format
            # isLeaf    1 byte
            # reserved  1 byte
            # count     2 bytes
            isLeaf, count = struct.unpack(byteorder_char + "?xh", stream.read(4))
            children = []
            node = Node(parent, startChromIx, startBase, endChromIx, endBase, children)
            if isLeaf:
                for _ in range(count):
                    # Supplemental Table 16: R tree leaf item:
                    # startChromIx, startBase, endChromIx, endBase (4 bytes
                    # each), dataOffset (8 bytes), dataSize (8 bytes)
                    values = struct.unpack(byteorder_char + "IIIIQQ", stream.read(32))
                    children.append(Leaf(node, *values))
            else:
                # Read all child items first; recursing seeks elsewhere.
                items = [
                    struct.unpack(byteorder_char + "IIIIQ", stream.read(24))
                    for _ in range(count)
                ]
                for sIx, sBase, eIx, eBase, childOffset in items:
                    stream.seek(childOffset)
                    children.append(read_node(node, sIx, sBase, eIx, eBase))
            return node

        return read_node(None, startChromIx, startBase, endChromIx, endBase)

    @staticmethod
    def _advance(node):
        """Return the next sibling of node (climbing as needed), or None at the root.

        Siblings are located by identity in the parent's children list; a
        RuntimeError indicates a corrupted tree.
        """
        while True:
            parent = node.parent
            if parent is None:
                return None
            for index, child in enumerate(parent.children):
                if id(node) == id(child):
                    break
            else:
                raise RuntimeError("Failed to find child node")
            try:
                return parent.children[index + 1]
            except IndexError:
                # Last child: climb one level and try the parent's sibling.
                node = parent

    def _iterate_index(self, stream):
        """Yield (chromId, chromStart, chromEnd, rest) for each row, in file order."""
        byteorder_char = self._byteorder_char
        node = self.tree
        while node is not None:
            try:
                children = node.children
            except AttributeError:
                # Leaf: read its (possibly compressed) block of data rows.
                stream.seek(node.dataOffset)
                data = stream.read(node.dataSize)
                if self._compressed:
                    data = zlib.decompress(data)
                while data:
                    # Supplemental Table 12: Binary BED-data format
                    # chromId     4 bytes
                    # chromStart  4 bytes
                    # chromEnd    4 bytes
                    # rest        zero-terminated string in tab-separated format
                    chromId, chromStart, chromEnd = struct.unpack(
                        byteorder_char + "III", data[:12]
                    )
                    rest, data = data[12:].split(b"\00", 1)
                    yield (chromId, chromStart, chromEnd, rest)
                node = self._advance(node)
            else:
                # Internal node: descend into its first child.
                node = children[0]

    def _search_index(self, stream, chromIx, start, end):
        """Yield data rows on chromosome chromIx overlapping [start, end).

        Zero-length rows that abut the requested region are also reported.
        """
        byteorder_char = self._byteorder_char
        # Pad by one base so zero-length features touching the region are
        # not pruned away at the node level.
        padded_start = start - 1
        padded_end = end + 1
        node = self.tree
        while node is not None:
            try:
                children = node.children
            except AttributeError:
                # Leaf: scan its data block, filtering rows by region.
                stream.seek(node.dataOffset)
                data = stream.read(node.dataSize)
                if self._compressed:
                    data = zlib.decompress(data)
                while data:
                    # Supplemental Table 12: Binary BED-data format
                    # chromId     4 bytes
                    # chromStart  4 bytes
                    # chromEnd    4 bytes
                    # rest        zero-terminated string in tab-separated format
                    child_chromIx, child_chromStart, child_chromEnd = struct.unpack(
                        byteorder_char + "III", data[:12]
                    )
                    rest, data = data[12:].split(b"\00", 1)
                    if child_chromIx != chromIx:
                        continue
                    if end <= child_chromStart or child_chromEnd <= start:
                        # No overlap; still report zero-length features
                        # that touch the region boundary.
                        if child_chromStart != child_chromEnd:
                            continue
                        if child_chromStart != end and child_chromEnd != start:
                            continue
                    yield (child_chromIx, child_chromStart, child_chromEnd, rest)
                node = self._advance(node)
            else:
                # Internal node: descend into the first child whose
                # bounding interval intersects the padded region.
                for child in children:
                    if (child.endChromIx, child.endBase) < (chromIx, padded_start):
                        continue
                    if (chromIx, padded_end) < (child.startChromIx, child.startBase):
                        continue
                    node = child
                    break
                else:
                    node = self._advance(node)

    def _read_next_alignment(self, stream):
        """Return the next alignment from the sequential row generator."""
        chunk = next(self._data)
        return self._create_alignment(chunk)

    def _create_alignment(self, chunk):
        """Build an Alignment from one (chromId, chromStart, chromEnd, rest) row.

        The BED columns beyond the first three are decoded according to
        self.bedN; extra columns are converted via self._custom_fields and
        stored in alignment.annotations.
        """
        chromId, chromStart, chromEnd, rest = chunk
        if rest:
            words = rest.decode().split("\t")
        else:
            words = []
        target_record = self.targets[chromId]
        if self.bedN > 3:
            name = words[0]
        else:
            name = None
        if self.bedN > 5:
            strand = words[2]
        else:
            strand = "+"
        if self.bedN > 9:
            # BED12: reconstruct block structure (words 6..8 are
            # blockCount, blockSizes, chromStarts).
            blockCount = int(words[6])
            blockSizes = [
                int(blockSize) for blockSize in words[7].rstrip(",").split(",")
            ]
            blockStarts = [
                int(blockStart) for blockStart in words[8].rstrip(",").split(",")
            ]
            if len(blockSizes) != blockCount:
                raise ValueError(
                    "Inconsistent number of block sizes (%d found, expected %d)"
                    % (len(blockSizes), blockCount)
                )
            if len(blockStarts) != blockCount:
                raise ValueError(
                    "Inconsistent number of block start positions (%d found, expected %d)"
                    % (len(blockStarts), blockCount)
                )
            blockSizes = numpy.array(blockSizes)
            blockStarts = numpy.array(blockStarts)
            tPosition = 0
            qPosition = 0
            coordinates = [[tPosition, qPosition]]
            for blockSize, blockStart in zip(blockSizes, blockStarts):
                # A jump in the target position opens a gap in the query.
                if blockStart != tPosition:
                    coordinates.append([blockStart, qPosition])
                    tPosition = blockStart
                tPosition += blockSize
                qPosition += blockSize
                coordinates.append([tPosition, qPosition])
            coordinates = numpy.array(coordinates).transpose()
            qSize = sum(blockSizes)
        else:
            # No block structure: a single ungapped block.
            blockSize = chromEnd - chromStart
            coordinates = numpy.array([[0, blockSize], [0, blockSize]])
            qSize = blockSize
        # Shift target coordinates from chromStart-relative to absolute.
        coordinates[0, :] += chromStart
        query_sequence = Seq(None, length=qSize)
        query_record = SeqRecord(query_sequence, id=name)
        records = [target_record, query_record]
        if strand == "-":
            coordinates[1, :] = qSize - coordinates[1, :]
        if chromStart != coordinates[0, 0]:
            raise ValueError(
                "Inconsistent chromStart found (%d, expected %d)"
                % (chromStart, coordinates[0, 0])
            )
        if chromEnd != coordinates[0, -1]:
            raise ValueError(
                "Inconsistent chromEnd found (%d, expected %d)"
                % (chromEnd, coordinates[0, -1])
            )
        alignment = Alignment(records, coordinates)
        if len(words) > self.bedN - 3:
            # Extra (non-standard) columns become annotations.
            alignment.annotations = {}
            for word, custom_field in zip(words[self.bedN - 3 :], self._custom_fields):
                name, converter = custom_field
                alignment.annotations[name] = converter(word)
        if self.bedN <= 4:
            return alignment
        # Store the score as int when it is a whole number.
        score = words[1]
        try:
            score = float(score)
        except ValueError:
            pass
        else:
            if score.is_integer():
                score = int(score)
        alignment.score = score
        if self.bedN <= 6:
            return alignment
        alignment.thickStart = int(words[3])
        if self.bedN <= 7:
            return alignment
        alignment.thickEnd = int(words[4])
        if self.bedN <= 8:
            return alignment
        alignment.itemRgb = words[5]
        return alignment

    def __len__(self):
        """Return the number of alignments in the file."""
        return self._length

    def search(self, chromosome=None, start=None, end=None):
        """Iterate over alignments overlapping the specified chromosome region.

        This method searches the index to find alignments to the specified
        chromosome that fully or partially overlap the chromosome region
        between start and end.

        Arguments:
         - chromosome - chromosome name. If None (default value), include
           all alignments.
         - start - starting position on the chromosome. If None (default
           value), use 0 as the starting position.
         - end - end position on the chromosome. If None (default value),
           use the length of the chromosome as the end position.
        """
        stream = self._stream
        if chromosome is None:
            if start is not None or end is not None:
                raise ValueError(
                    "start and end must both be None if chromosome is None"
                )
            # Walk the whole index.  (The original code fell through here
            # with chromIx undefined and raised NameError.)
            chunks = self._iterate_index(stream)
        else:
            for chromIx, target in enumerate(self.targets):
                if target.id == chromosome:
                    break
            else:
                raise ValueError("Failed to find %s in alignments" % chromosome)
            if start is None:
                if end is None:
                    start = 0
                    end = len(target)
                else:
                    raise ValueError("end must be None if start is None")
            elif end is None:
                # A bare start position selects that single base.
                end = start + 1
            chunks = self._search_index(stream, chromIx, start, end)
        for chunk in chunks:
            yield self._create_alignment(chunk)