aakash0017's picture
Upload folder using huggingface_hub
b7731cd
# Copyright 2019 by Michiel de Hoon. All rights reserved.
# Based on code contributed and copyright 2016 by Peter Cock.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Bio.SeqIO support for the UCSC nib file format.
Nib stands for nibble (4 bit) representation of nucleotide sequences.
The two nibbles in a byte each store one nucleotide, represented numerically
as follows:
- ``0`` - T
- ``1`` - C
- ``2`` - A
- ``3`` - G
- ``4`` - N (unknown)
As the first bit in a nibble is set if the nucleotide is soft-masked, we
additionally have:
- ``8`` - t
- ``9`` - c
- ``a`` - a
- ``b`` - g
- ``c`` - n (unknown)
A nib file contains only one sequence record.
You are expected to use this module via the Bio.SeqIO functions under
the format name "nib":
>>> from Bio import SeqIO
>>> record = SeqIO.read("Nib/test_even_bigendian.nib", "nib")
>>> print("%i %s..." % (len(record), record.seq[:20]))
50 nAGAAGagccgcNGgCActt...
For detailed information on the file format, please see the UCSC
description at https://genome.ucsc.edu/FAQ/FAQformat.html.
"""
import binascii
import struct
import sys
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from .Interfaces import SequenceIterator
from .Interfaces import SequenceWriter
class NibIterator(SequenceIterator):
"""Parser for nib files."""
def __init__(self, source):
"""Iterate over a nib file and yield a SeqRecord.
- source - a file-like object or a path to a file in the nib file
format as defined by UCSC; the file must be opened in binary mode.
Note that a nib file always contains only one sequence record.
The sequence of the resulting SeqRecord object should match the sequence
generated by Jim Kent's nibFrag utility run with the -masked option.
This function is used internally via the Bio.SeqIO functions:
>>> from Bio import SeqIO
>>> record = SeqIO.read("Nib/test_even_bigendian.nib", "nib")
>>> print("%s %i" % (record.seq, len(record)))
nAGAAGagccgcNGgCActtGAnTAtCGTCgcCacCaGncGncTtGNtGG 50
You can also call it directly:
>>> with open("Nib/test_even_bigendian.nib", "rb") as handle:
... for record in NibIterator(handle):
... print("%s %i" % (record.seq, len(record)))
...
nAGAAGagccgcNGgCActtGAnTAtCGTCgcCacCaGncGncTtGNtGG 50
"""
super().__init__(source, mode="b", fmt="Nib")
def parse(self, handle):
"""Start parsing the file, and return a SeqRecord generator."""
word = handle.read(4)
if not word:
raise ValueError("Empty file.")
signature = word.hex()
if signature == "3a3de96b":
byteorder = "little" # little-endian
elif signature == "6be93d3a":
byteorder = "big" # big-endian
else:
raise ValueError("unexpected signature in nib header")
records = self.iterate(handle, byteorder)
return records
def iterate(self, handle, byteorder):
"""Iterate over the records in the nib file."""
number = handle.read(4)
length = int.from_bytes(number, byteorder)
data = handle.read()
indices = binascii.hexlify(data)
if length % 2 == 0:
if len(indices) != length:
raise ValueError("Unexpected file size")
elif length % 2 == 1:
if len(indices) != length + 1:
raise ValueError("Unexpected file size")
indices = indices[:length]
if not set(indices).issubset(b"0123489abc"):
raise ValueError("Unexpected sequence data found in file")
table = bytes.maketrans(b"0123489abc", b"TCAGNtcagn")
nucleotides = indices.translate(table)
sequence = Seq(nucleotides)
record = SeqRecord(sequence)
yield record
class NibWriter(SequenceWriter):
"""Nib file writer."""
def __init__(self, target):
"""Initialize a Nib writer object.
Arguments:
- target - output stream opened in binary mode, or a path to a file
"""
super().__init__(target, mode="wb")
def write_header(self):
"""Write the file header."""
super().write_header()
handle = self.handle
byteorder = sys.byteorder
if byteorder == "little": # little-endian
signature = "3a3de96b"
elif byteorder == "big": # big-endian
signature = "6be93d3a"
else:
raise RuntimeError(f"unexpected system byte order {byteorder}")
handle.write(bytes.fromhex(signature))
def write_record(self, record):
"""Write a single record to the output file."""
handle = self.handle
sequence = record.seq
nucleotides = bytes(sequence)
length = len(sequence)
handle.write(struct.pack("i", length))
table = bytes.maketrans(b"TCAGNtcagn", b"0123489abc")
padding = length % 2
suffix = padding * b"T"
nucleotides += suffix
if not set(nucleotides).issubset(b"ACGTNacgtn"):
raise ValueError("Sequence should contain A,C,G,T,N,a,c,g,t,n only")
indices = nucleotides.translate(table)
handle.write(binascii.unhexlify(indices))
def write_file(self, records):
"""Write the complete file with the records, and return the number of records."""
count = super().write_file(records, mincount=1, maxcount=1)
return count
if __name__ == "__main__":
from Bio._utils import run_doctest
run_doctest(verbose=0)