# Copyright 2019 by Michiel de Hoon. All rights reserved. # Based on code contributed and copyright 2016 by Peter Cock. # # This file is part of the Biopython distribution and governed by your # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". # Please see the LICENSE file that should have been included as part of this # package. """Bio.SeqIO support for the UCSC nib file format. Nib stands for nibble (4 bit) representation of nucleotide sequences. The two nibbles in a byte each store one nucleotide, represented numerically as follows: - ``0`` - T - ``1`` - C - ``2`` - A - ``3`` - G - ``4`` - N (unknown) As the first bit in a nibble is set if the nucleotide is soft-masked, we additionally have: - ``8`` - t - ``9`` - c - ``a`` - a - ``b`` - g - ``c`` - n (unknown) A nib file contains only one sequence record. You are expected to use this module via the Bio.SeqIO functions under the format name "nib": >>> from Bio import SeqIO >>> record = SeqIO.read("Nib/test_even_bigendian.nib", "nib") >>> print("%i %s..." % (len(record), record.seq[:20])) 50 nAGAAGagccgcNGgCActt... For detailed information on the file format, please see the UCSC description at https://genome.ucsc.edu/FAQ/FAQformat.html. """ import binascii import struct import sys from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord from .Interfaces import SequenceIterator from .Interfaces import SequenceWriter class NibIterator(SequenceIterator): """Parser for nib files.""" def __init__(self, source): """Iterate over a nib file and yield a SeqRecord. - source - a file-like object or a path to a file in the nib file format as defined by UCSC; the file must be opened in binary mode. Note that a nib file always contains only one sequence record. The sequence of the resulting SeqRecord object should match the sequence generated by Jim Kent's nibFrag utility run with the -masked option. This function is used internally via the Bio.SeqIO functions: >>> from Bio import SeqIO >>> record = SeqIO.read("Nib/test_even_bigendian.nib", "nib") >>> print("%s %i" % (record.seq, len(record))) nAGAAGagccgcNGgCActtGAnTAtCGTCgcCacCaGncGncTtGNtGG 50 You can also call it directly: >>> with open("Nib/test_even_bigendian.nib", "rb") as handle: ... for record in NibIterator(handle): ... print("%s %i" % (record.seq, len(record))) ... nAGAAGagccgcNGgCActtGAnTAtCGTCgcCacCaGncGncTtGNtGG 50 """ super().__init__(source, mode="b", fmt="Nib") def parse(self, handle): """Start parsing the file, and return a SeqRecord generator.""" word = handle.read(4) if not word: raise ValueError("Empty file.") signature = word.hex() if signature == "3a3de96b": byteorder = "little" # little-endian elif signature == "6be93d3a": byteorder = "big" # big-endian else: raise ValueError("unexpected signature in nib header") records = self.iterate(handle, byteorder) return records def iterate(self, handle, byteorder): """Iterate over the records in the nib file.""" number = handle.read(4) length = int.from_bytes(number, byteorder) data = handle.read() indices = binascii.hexlify(data) if length % 2 == 0: if len(indices) != length: raise ValueError("Unexpected file size") elif length % 2 == 1: if len(indices) != length + 1: raise ValueError("Unexpected file size") indices = indices[:length] if not set(indices).issubset(b"0123489abc"): raise ValueError("Unexpected sequence data found in file") table = bytes.maketrans(b"0123489abc", b"TCAGNtcagn") nucleotides = indices.translate(table) sequence = Seq(nucleotides) record = SeqRecord(sequence) yield record class NibWriter(SequenceWriter): """Nib file writer.""" def __init__(self, target): """Initialize a Nib writer object. Arguments: - target - output stream opened in binary mode, or a path to a file """ super().__init__(target, mode="wb") def write_header(self): """Write the file header.""" super().write_header() handle = self.handle byteorder = sys.byteorder if byteorder == "little": # little-endian signature = "3a3de96b" elif byteorder == "big": # big-endian signature = "6be93d3a" else: raise RuntimeError(f"unexpected system byte order {byteorder}") handle.write(bytes.fromhex(signature)) def write_record(self, record): """Write a single record to the output file.""" handle = self.handle sequence = record.seq nucleotides = bytes(sequence) length = len(sequence) handle.write(struct.pack("i", length)) table = bytes.maketrans(b"TCAGNtcagn", b"0123489abc") padding = length % 2 suffix = padding * b"T" nucleotides += suffix if not set(nucleotides).issubset(b"ACGTNacgtn"): raise ValueError("Sequence should contain A,C,G,T,N,a,c,g,t,n only") indices = nucleotides.translate(table) handle.write(binascii.unhexlify(indices)) def write_file(self, records): """Write the complete file with the records, and return the number of records.""" count = super().write_file(records, mincount=1, maxcount=1) return count if __name__ == "__main__": from Bio._utils import run_doctest run_doctest(verbose=0)