Spaces:
No application file
No application file
# Copyright 2019 by Michiel de Hoon. All rights reserved. | |
# Based on code contributed and copyright 2016 by Peter Cock. | |
# | |
# This file is part of the Biopython distribution and governed by your | |
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
# Please see the LICENSE file that should have been included as part of this | |
# package. | |
"""Bio.SeqIO support for the UCSC nib file format. | |
Nib stands for nibble (4 bit) representation of nucleotide sequences. | |
The two nibbles in a byte each store one nucleotide, represented numerically | |
as follows: | |
- ``0`` - T | |
- ``1`` - C | |
- ``2`` - A | |
- ``3`` - G | |
- ``4`` - N (unknown) | |
As the first bit in a nibble is set if the nucleotide is soft-masked, we | |
additionally have: | |
- ``8`` - t | |
- ``9`` - c | |
- ``a`` - a | |
- ``b`` - g | |
- ``c`` - n (unknown) | |
A nib file contains only one sequence record. | |
You are expected to use this module via the Bio.SeqIO functions under | |
the format name "nib": | |
>>> from Bio import SeqIO | |
>>> record = SeqIO.read("Nib/test_even_bigendian.nib", "nib") | |
>>> print("%i %s..." % (len(record), record.seq[:20])) | |
50 nAGAAGagccgcNGgCActt... | |
For detailed information on the file format, please see the UCSC | |
description at https://genome.ucsc.edu/FAQ/FAQformat.html. | |
""" | |
import binascii | |
import struct | |
import sys | |
from Bio.Seq import Seq | |
from Bio.SeqRecord import SeqRecord | |
from .Interfaces import SequenceIterator | |
from .Interfaces import SequenceWriter | |
class NibIterator(SequenceIterator):
    """Parser for nib files."""

    def __init__(self, source):
        """Iterate over a nib file and yield a SeqRecord.

           - source - a file-like object or a path to a file in the nib file
             format as defined by UCSC; the file must be opened in binary mode.

        Note that a nib file always contains only one sequence record.
        The sequence of the resulting SeqRecord object should match the sequence
        generated by Jim Kent's nibFrag utility run with the -masked option.

        This function is used internally via the Bio.SeqIO functions:

        >>> from Bio import SeqIO
        >>> record = SeqIO.read("Nib/test_even_bigendian.nib", "nib")
        >>> print("%s %i" % (record.seq, len(record)))
        nAGAAGagccgcNGgCActtGAnTAtCGTCgcCacCaGncGncTtGNtGG 50

        You can also call it directly:

        >>> with open("Nib/test_even_bigendian.nib", "rb") as handle:
        ...     for record in NibIterator(handle):
        ...         print("%s %i" % (record.seq, len(record)))
        ...
        nAGAAGagccgcNGgCActtGAnTAtCGTCgcCacCaGncGncTtGNtGG 50

        """
        super().__init__(source, mode="b", fmt="Nib")

    def parse(self, handle):
        """Start parsing the file, and return a SeqRecord generator.

        Reads the 4-byte signature at the start of the file to determine the
        byte order used for the length field that follows.

        Raises ValueError if the file is empty or the signature is not one
        of the two recognized nib signatures.
        """
        word = handle.read(4)
        if not word:
            raise ValueError("Empty file.")
        signature = word.hex()
        # The same magic number appears byte-swapped depending on the byte
        # order of the machine that wrote the file.
        if signature == "3a3de96b":
            byteorder = "little"  # little-endian
        elif signature == "6be93d3a":
            byteorder = "big"  # big-endian
        else:
            raise ValueError("unexpected signature in nib header")
        records = self.iterate(handle, byteorder)
        return records

    def iterate(self, handle, byteorder):
        """Iterate over the records in the nib file.

        Reads the 4-byte sequence length (in the given byte order), followed
        by the packed nucleotide data, and yields a single SeqRecord.

        Raises ValueError if the length field is truncated, if the amount of
        sequence data does not match the stored length, or if the data
        contains nibble values that do not correspond to a nucleotide.
        """
        number = handle.read(4)
        if len(number) != 4:
            # A truncated length field would otherwise be decoded silently by
            # int.from_bytes as a (wrong) smaller number.
            raise ValueError("Unexpected file size")
        length = int.from_bytes(number, byteorder)
        data = handle.read()
        # Each byte holds two nibbles; hexlify gives one hex digit per nibble.
        indices = binascii.hexlify(data)
        # An odd-length sequence is stored with one padding nibble at the end.
        expected = length + length % 2
        if len(indices) != expected:
            raise ValueError("Unexpected file size")
        indices = indices[:length]  # drop the padding nibble, if any
        if not set(indices).issubset(b"0123489abc"):
            raise ValueError("Unexpected sequence data found in file")
        table = bytes.maketrans(b"0123489abc", b"TCAGNtcagn")
        nucleotides = indices.translate(table)
        sequence = Seq(nucleotides)
        record = SeqRecord(sequence)
        yield record
class NibWriter(SequenceWriter):
    """Nib file writer."""

    def __init__(self, target):
        """Initialize a Nib writer object.

        Arguments:
         - target - output stream opened in binary mode, or a path to a file

        """
        super().__init__(target, mode="wb")

    def write_header(self):
        """Write the file header.

        The header is the 4-byte nib signature, written in the byte order of
        the machine running this code.

        Raises RuntimeError if sys.byteorder is neither "little" nor "big".
        """
        super().write_header()
        handle = self.handle
        byteorder = sys.byteorder
        if byteorder == "little":  # little-endian
            signature = "3a3de96b"
        elif byteorder == "big":  # big-endian
            signature = "6be93d3a"
        else:
            raise RuntimeError(f"unexpected system byte order {byteorder}")
        handle.write(bytes.fromhex(signature))

    def write_record(self, record):
        """Write a single record to the output file.

        Writes the sequence length as a native-order integer, followed by the
        nucleotides packed two per byte.

        Raises ValueError if the sequence contains any letter other than
        A, C, G, T, N (upper or lower case). Validation happens before
        anything is written, so an invalid record does not leave a partial
        record in the output.
        """
        handle = self.handle
        sequence = record.seq
        nucleotides = bytes(sequence)
        length = len(sequence)
        # Validate first so that nothing is written for an invalid sequence.
        if not set(nucleotides).issubset(b"ACGTNacgtn"):
            raise ValueError("Sequence should contain A,C,G,T,N,a,c,g,t,n only")
        # Native byte order, consistent with the signature chosen in
        # write_header from sys.byteorder.
        packed_length = struct.pack("i", length)
        # Two nucleotides are stored per byte; pad odd-length sequences with
        # one extra "T" (nibble value 0) to fill the final byte.
        padding = length % 2
        nucleotides += padding * b"T"
        table = bytes.maketrans(b"TCAGNtcagn", b"0123489abc")
        indices = nucleotides.translate(table)
        handle.write(packed_length)
        handle.write(binascii.unhexlify(indices))

    def write_file(self, records):
        """Write the complete file with the records, and return the number of records.

        Exactly one record is requested from the base-class writer
        (mincount=1, maxcount=1).
        """
        count = super().write_file(records, mincount=1, maxcount=1)
        return count
if __name__ == "__main__":
    # Run the doctests embedded in this module's docstrings, quietly.
    from Bio import _utils

    _utils.run_doctest(verbose=0)