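# NOTE: the three lines originally here ("aakash0017's picture", "Upload folder
# using huggingface_hub", "b7731cd") were web-page residue from the hosting
# site, not part of the module; kept as a comment so the file parses as Python.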
# Copyright 2020 by Michiel de Hoon
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Bio.SeqIO support for UCSC's "twoBit" (.2bit) file format.
This parser reads the index stored in the twoBit file, as well as the masked
regions and the N's for each sequence. It also creates sequence data objects
(_TwoBitSequenceData objects), which support only two methods: __len__ and
__getitem__. The former will return the length of the sequence, while the
latter returns the sequence (as a bytes object) for the requested region.
Using the information in the index, the __getitem__ method calculates the file
position at which the requested region starts, and only reads the requested
sequence region. Note that the full sequence of a record is loaded only if
specifically requested, making the parser memory-efficient.
The TwoBitIterator object implements the __getitem__, keys, and __len__
methods that allow it to be used as a dictionary.
"""
# The .2bit file format is defined by UCSC as follows
# (see http://genome.ucsc.edu/FAQ/FAQformat.html#format7):
#
#
# A .2bit file stores multiple DNA sequences (up to 4 Gb total) in a compact
# randomly-accessible format. The file contains masking information as well
# as the DNA itself.
#
# The file begins with a 16-byte header containing the following fields:
#
# signature - the number 0x1A412743 in the architecture of the machine that
# created the file
# version - zero for now. Readers should abort if they see a version number
# higher than 0
# sequenceCount - the number of sequences in the file
# reserved - always zero for now
#
# All fields are 32 bits unless noted. If the signature value is not as
# given, the reader program should byte-swap the signature and check if the
# swapped version matches. If so, all multiple-byte entities in the file
# will have to be byte-swapped. This enables these binary files to be used
# unchanged on different architectures.
#
# The header is followed by a file index, which contains one entry for each
# sequence. Each index entry contains three fields:
#
# nameSize - a byte containing the length of the name field
# name - the sequence name itself (in ASCII-compatible byte string), of
# variable length depending on nameSize
# offset - the 32-bit offset of the sequence data relative to the start of
# the file, not aligned to any 4-byte padding boundary
#
# The index is followed by the sequence records, which contain nine fields:
#
# dnaSize - number of bases of DNA in the sequence
# nBlockCount - the number of blocks of Ns in the file (representing unknown
# sequence)
# nBlockStarts - an array of length nBlockCount of 32 bit integers
# indicating the (0-based) starting position of a block of Ns
# nBlockSizes - an array of length nBlockCount of 32 bit integers indicating
# the length of a block of Ns
# maskBlockCount - the number of masked (lower-case) blocks
# maskBlockStarts - an array of length maskBlockCount of 32 bit integers
# indicating the (0-based) starting position of a masked block
# maskBlockSizes - an array of length maskBlockCount of 32 bit integers
# indicating the length of a masked block
# reserved - always zero for now
# packedDna - the DNA packed to two bits per base, represented as so:
# T - 00, C - 01, A - 10, G - 11. The first base is in the most
# significant 2-bit byte; the last base is in the least significant
# 2 bits. For example, the sequence TCAG is represented as 00011011.
import numpy
from Bio.Seq import Seq
from Bio.Seq import SequenceDataAbstractBaseClass
from Bio.SeqRecord import SeqRecord
from . import _twoBitIO
from .Interfaces import SequenceIterator
class _TwoBitSequenceData(SequenceDataAbstractBaseClass):
    """Stores information needed to retrieve sequence data from a .2bit file (PRIVATE).

    Objects of this class store the file position at which the sequence data
    start, the sequence length, and the start and end position of unknown (N)
    and masked (lowercase) letters in the sequence.

    ``__len__`` returns the length of the sequence, while ``__getitem__``
    returns the sequence (as a bytes object) for the requested region, reading
    from the file position computed for that region. ``upper`` and ``lower``
    return new objects with the mask removed or extended to the full sequence,
    respectively, without reading any sequence data.
    """

    __slots__ = ("stream", "offset", "length", "nBlocks", "maskBlocks")

    def __init__(self, stream, offset, length):
        """Initialize the file stream and file position of the sequence data.

        The ``nBlocks`` and ``maskBlocks`` attributes are not set here; they
        must be assigned by the caller before the object is indexed.
        """
        self.stream = stream
        self.offset = offset
        self.length = length
        super().__init__()

    def __getitem__(self, key):
        """Return the sequence contents (as a bytes object) for the requested region.

        For a slice, the converted sequence is returned (``b""`` if the slice
        selects nothing). For a single index, the integer code of that letter
        is returned.

        Raises IndexError if an integer index lies outside the sequence, and
        ValueError if the underlying file has been closed.
        """
        length = self.length
        if isinstance(key, slice):
            start, end, step = key.indices(length)
            if len(range(start, end, step)) == 0:
                return b""
        else:
            if key < 0:
                key += length
            # Reject indices on either side of the sequence; without the upper
            # bound we would seek past the packed DNA and decode unrelated bytes.
            if not 0 <= key < length:
                raise IndexError("index out of range")
            start = key
            end = key + 1
            step = 1
        # Four bases are packed per byte: find the bytes covering [start, end).
        # NOTE(review): for a negative step start > end, so byteSize is negative
        # here; presumably _twoBitIO.convert accounts for this -- confirm.
        byteStart = start // 4
        byteEnd = (end + 3) // 4
        byteSize = byteEnd - byteStart
        stream = self.stream
        try:
            stream.seek(self.offset + byteStart)
        except ValueError as exception:
            if str(exception) == "seek of closed file":
                raise ValueError("cannot retrieve sequence: file is closed") from None
            raise
        data = numpy.fromfile(stream, dtype="uint8", count=byteSize)
        sequence = _twoBitIO.convert(
            data, start, end, step, self.nBlocks, self.maskBlocks
        )
        if isinstance(key, slice):
            return sequence
        # single nucleotide
        return ord(sequence)

    def __len__(self):
        """Get the sequence length."""
        return self.length

    def _with_mask(self, maskBlocks):
        """Return a new object for the same sequence data using the given mask (PRIVATE)."""
        data = _TwoBitSequenceData(self.stream, self.offset, self.length)
        # Slicing yields a new array object for the same N-block table.
        data.nBlocks = self.nBlocks[:, :]
        data.maskBlocks = maskBlocks
        return data

    def upper(self):
        """Remove the sequence mask."""
        return self._with_mask(numpy.empty((0, 2), dtype="uint32"))

    def lower(self):
        """Extend the sequence mask to the full sequence."""
        return self._with_mask(numpy.array([[0, self.length]], dtype="uint32"))
class TwoBitIterator(SequenceIterator):
    """Parser for UCSC twoBit (.2bit) files.

    The file header, the index, and the N-block and mask-block tables of each
    sequence are read when the object is created; the packed DNA itself is
    not read here. ``parse`` yields one SeqRecord per sequence, and
    ``__getitem__``, ``keys`` and ``__len__`` allow lookup by sequence name.
    """

    def __init__(self, source):
        """Read the file index.

        Raises ValueError if the file is empty or truncated, has an unknown
        signature or an unsupported version, or contains a non-zero reserved
        field.
        """
        super().__init__(source, mode="b", fmt="twoBit")
        # wait to close the file until the TwoBitIterator goes out of scope:
        self.should_close_stream = False
        stream = self.stream
        data = stream.read(4)
        if not data:
            raise ValueError("Empty file.")
        # The signature determines the byte order of every multi-byte field.
        for byteorder, dtype in (("little", "<u4"), ("big", ">u4")):
            if int.from_bytes(data, byteorder) == 0x1A412743:
                break
        else:
            raise ValueError("Unknown signature")
        self.byteorder = byteorder

        def read_exactly(size):
            # A short read would otherwise be decoded as a (wrong) smaller value.
            chunk = stream.read(size)
            if len(chunk) != size:
                raise ValueError("Unexpected end of file; twoBit file is truncated")
            return chunk

        def read_uint32():
            return int.from_bytes(read_exactly(4), byteorder, signed=False)

        def read_blocks():
            # A block table is a count followed by `count` starts and `count`
            # sizes; it is returned as rows of (start, end).
            count = read_uint32()
            starts = numpy.fromfile(stream, dtype=dtype, count=count)
            sizes = numpy.fromfile(stream, dtype=dtype, count=count)
            # numpy.fromfile returns fewer items at end of file; a length-1
            # result would silently broadcast into the table below.
            if len(starts) != count or len(sizes) != count:
                raise ValueError("Unexpected end of file; twoBit file is truncated")
            blocks = numpy.empty((count, 2), dtype="uint32")
            blocks[:, 0] = starts
            blocks[:, 1] = starts + sizes
            return blocks

        version = read_uint32()
        if version == 1:
            raise ValueError(
                "version-1 twoBit files with 64-bit offsets for index are currently not supported"
            )
        if version != 0:
            raise ValueError("Found unexpected file version %u; aborting" % version)
        sequenceCount = read_uint32()
        reserved = read_uint32()
        if reserved != 0:
            raise ValueError("Found non-zero reserved field; aborting")

        # File index: one (nameSize, name, offset) entry per sequence.
        offsets = {}
        for _ in range(sequenceCount):
            nameSize = read_exactly(1)[0]
            name = read_exactly(nameSize).decode("ASCII")
            offsets[name] = read_uint32()

        sequences = {}
        self.sequences = sequences
        for name, offset in offsets.items():
            stream.seek(offset)
            dnaSize = read_uint32()
            nBlocks = read_blocks()
            maskBlocks = read_blocks()
            reserved = read_uint32()
            if reserved != 0:
                raise ValueError("Found non-zero reserved field %u" % reserved)
            # The packed DNA starts immediately after the reserved field.
            sequence = _TwoBitSequenceData(stream, stream.tell(), dnaSize)
            sequence.nBlocks = nBlocks
            sequence.maskBlocks = maskBlocks
            sequences[name] = sequence

    def parse(self, stream):
        """Iterate over the sequences in the file.

        The ``stream`` argument is not used; records are built from the
        sequence data objects created when the index was read.
        """
        for name, sequence in self.sequences.items():
            yield SeqRecord(Seq(sequence), id=name)

    def __getitem__(self, name):
        """Return sequence associated with given name as a SeqRecord object.

        Raises KeyError if no sequence with that name is in the file.
        """
        # The dict lookup itself raises KeyError(name) for unknown names.
        sequence = self.sequences[name]
        return SeqRecord(Seq(sequence), id=name)

    def keys(self):
        """Return a view of the names of the sequences in the file."""
        return self.sequences.keys()

    def __len__(self):
        """Return number of sequences."""
        return len(self.sequences)