Spaces:

aakash0017
/

DrVai-Rag-Testing

No application file

App Files Files Community

DrVai-Rag-Testing / myenv /lib /python3.10 /site-packages /Bio /SeqIO /TwoBitIO.py

aakash0017

Upload folder using huggingface_hub

b7731cd over 1 year ago

raw

history blame contribute delete

11.1 kB

	# Copyright 2020 by Michiel de Hoon
	#
	# This file is part of the Biopython distribution and governed by your
	# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
	# Please see the LICENSE file that should have been included as part of this
	# package.
	"""Bio.SeqIO support for UCSC's "twoBit" (.2bit) file format.

	This parser reads the index stored in the twoBit file, as well as the masked
	regions and the N's for each sequence. It also creates sequence data objects
	(_TwoBitSequenceData objects), which support only two methods: __len__ and
	__getitem__. The former will return the length of the sequence, while the
	latter returns the sequence (as a bytes object) for the requested region.

	Using the information in the index, the __getitem__ method calculates the file
	position at which the requested region starts, and only reads the requested
	sequence region. Note that the full sequence of a record is loaded only if
	specifically requested, making the parser memory-efficient.

	The TwoBitIterator object implements the __getitem__, keys, and __len__
	methods that allow it to be used as a dictionary.
	"""
	# The .2bit file format is defined by UCSC as follows
	# (see http://genome.ucsc.edu/FAQ/FAQformat.html#format7):
	#
	#
	# A .2bit file stores multiple DNA sequences (up to 4 Gb total) in a compact
	# randomly-accessible format. The file contains masking information as well
	# as the DNA itself.
	#
	# The file begins with a 16-byte header containing the following fields:
	#
	# signature - the number 0x1A412743 in the architecture of the machine that
	# created the file
	# version - zero for now. Readers should abort if they see a version number
	# higher than 0
	# sequenceCount - the number of sequences in the file
	# reserved - always zero for now
	#
	# All fields are 32 bits unless noted. If the signature value is not as
	# given, the reader program should byte-swap the signature and check if the
	# swapped version matches. If so, all multiple-byte entities in the file
	# will have to be byte-swapped. This enables these binary files to be used
	# unchanged on different architectures.
	#
	# The header is followed by a file index, which contains one entry for each
	# sequence. Each index entry contains three fields:
	#
	# nameSize - a byte containing the length of the name field
	# name - the sequence name itself (in ASCII-compatible byte string), of
	# variable length depending on nameSize
	# offset - the 32-bit offset of the sequence data relative to the start of
	# the file, not aligned to any 4-byte padding boundary
	#
	# The index is followed by the sequence records, which contain nine fields:
	#
	# dnaSize - number of bases of DNA in the sequence
	# nBlockCount - the number of blocks of Ns in the file (representing unknown
	# sequence)
	# nBlockStarts - an array of length nBlockCount of 32 bit integers
	# indicating the (0-based) starting position of a block of Ns
	# nBlockSizes - an array of length nBlockCount of 32 bit integers indicating
	# the length of a block of Ns
	# maskBlockCount - the number of masked (lower-case) blocks
	# maskBlockStarts - an array of length maskBlockCount of 32 bit integers
	# indicating the (0-based) starting position of a masked block
	# maskBlockSizes - an array of length maskBlockCount of 32 bit integers
	# indicating the length of a masked block
	# reserved - always zero for now
	# packedDna - the DNA packed to two bits per base, represented as so:
	# T - 00, C - 01, A - 10, G - 11. The first base is in the most
	# significant 2-bit byte; the last base is in the least significant
	# 2 bits. For example, the sequence TCAG is represented as 00011011.
	import numpy

	from Bio.Seq import Seq
	from Bio.Seq import SequenceDataAbstractBaseClass
	from Bio.SeqRecord import SeqRecord

	from . import _twoBitIO
	from .Interfaces import SequenceIterator


	class _TwoBitSequenceData(SequenceDataAbstractBaseClass):
	    """Stores information needed to retrieve sequence data from a .2bit file (PRIVATE).

	    Objects of this class store the file position at which the sequence data
	    start, the sequence length, and the start and end position of unknown (N)
	    and masked (lowercase) letters in the sequence.

	    The sequence itself is not kept in memory: __getitem__ seeks to the
	    requested region in the stream and reads only the bytes that cover it.
	    The upper and lower methods return new objects that use the same stream
	    but a different mask.

	    The nBlocks and maskBlocks attributes are not set by __init__; they must
	    be assigned before __getitem__, upper, or lower is called.
	    """

	    __slots__ = ("stream", "offset", "length", "nBlocks", "maskBlocks")

	    def __init__(self, stream, offset, length):
	        """Initialize the file stream and file position of the sequence data.

	        Arguments:
	         - stream - the stream from which the packed sequence data are read
	         - offset - file position used as the start of the packed data
	         - length - the sequence length, in bases

	        """
	        self.stream = stream
	        self.offset = offset
	        self.length = length
	        super().__init__()

	    def __getitem__(self, key):
	        """Return the sequence contents for the requested region.

	        Arguments:
	         - key - either a slice or an integer index; negative integers
	           count from the end of the sequence

	        For a slice, the result of _twoBitIO.convert is returned (an empty
	        bytes object if the slice selects nothing). For an integer index,
	        the value of ord() applied to that result is returned.

	        Raises IndexError if an integer index falls outside the sequence,
	        and ValueError if the underlying file has been closed.
	        """
	        length = self.length
	        is_slice = isinstance(key, slice)
	        if is_slice:
	            start, end, step = key.indices(length)
	            if len(range(start, end, step)) == 0:
	                return b""
	            # NOTE(review): for a negative step, start > end here, so byteSize
	            # below is not positive -- confirm that numpy.fromfile and
	            # _twoBitIO.convert handle that case as intended.
	        else:
	            if key < 0:
	                key += length
	            # Reject indices on either side of the sequence; without the
	            # upper bound an index >= length would read past the sequence
	            # instead of raising IndexError.
	            if not 0 <= key < length:
	                raise IndexError("index out of range")
	            start, end, step = key, key + 1, 1
	        # Four bases are packed per byte; read every byte that overlaps the
	        # requested region.
	        byteStart = start // 4
	        byteEnd = (end + 3) // 4
	        byteSize = byteEnd - byteStart
	        stream = self.stream
	        try:
	            stream.seek(self.offset + byteStart)
	        except ValueError as exception:
	            # Replace the generic closed-file message by a more helpful one;
	            # any other ValueError is propagated unchanged.
	            if str(exception) == "seek of closed file":
	                raise ValueError("cannot retrieve sequence: file is closed") from None
	            raise
	        data = numpy.fromfile(stream, dtype="uint8", count=byteSize)
	        sequence = _twoBitIO.convert(
	            data, start, end, step, self.nBlocks, self.maskBlocks
	        )
	        if is_slice:
	            return sequence
	        # single nucleotide
	        return ord(sequence)

	    def __len__(self):
	        """Get the sequence length."""
	        return self.length

	    def _with_mask(self, maskBlocks):
	        """Return a new object for the same data using maskBlocks as mask (PRIVATE)."""
	        data = _TwoBitSequenceData(self.stream, self.offset, self.length)
	        data.nBlocks = self.nBlocks[:, :]
	        data.maskBlocks = maskBlocks
	        return data

	    def upper(self):
	        """Remove the sequence mask.

	        Returns a new _TwoBitSequenceData object with an empty mask; this
	        object is not modified.
	        """
	        return self._with_mask(numpy.empty((0, 2), dtype="uint32"))

	    def lower(self):
	        """Extend the sequence mask to the full sequence.

	        Returns a new _TwoBitSequenceData object whose mask is the single
	        block from 0 to the sequence length; this object is not modified.
	        """
	        return self._with_mask(numpy.array([[0, self.length]], dtype="uint32"))


	class TwoBitIterator(SequenceIterator):
	    """Parser for UCSC twoBit (.2bit) files.

	    The header, the index, and the N-block and mask-block tables of every
	    sequence are read when the object is created. The sequence data are
	    represented by _TwoBitSequenceData objects, which are stored by name in
	    the sequences attribute.

	    In addition to iteration, the __getitem__, keys, and __len__ methods give
	    access to the sequences by name.
	    """

	    def __init__(self, source):
	        """Read the file header, the index, and the block tables.

	        Arguments:
	         - source - passed on to SequenceIterator, which provides the
	           binary stream as self.stream

	        Raises ValueError if the file is empty or truncated, if the
	        signature is not recognized in either byte order, if the version is
	        not 0, or if a reserved field is not zero.
	        """
	        super().__init__(source, mode="b", fmt="twoBit")
	        # wait to close the file until the TwoBitIterator goes out of scope:
	        self.should_close_stream = False
	        stream = self.stream
	        data = stream.read(4)
	        if not data:
	            raise ValueError("Empty file.")
	        # The signature tells us the byte order used for all multi-byte
	        # fields in the file.
	        for byteorder, dtype in (("little", "<u4"), ("big", ">u4")):
	            if int.from_bytes(data, byteorder) == 0x1A412743:
	                break
	        else:
	            raise ValueError("Unknown signature")
	        self.byteorder = byteorder
	        version = self._read_uint(stream, 4, byteorder)
	        if version == 1:
	            raise ValueError(
	                "version-1 twoBit files with 64-bit offsets for index are currently not supported"
	            )
	        if version != 0:
	            raise ValueError("Found unexpected file version %u; aborting" % version)
	        sequenceCount = self._read_uint(stream, 4, byteorder)
	        reserved = self._read_uint(stream, 4, byteorder)
	        if reserved != 0:
	            raise ValueError("Found non-zero reserved field; aborting")
	        # Read the complete index first, as the sequence records are visited
	        # by seeking to the offsets stored in it.
	        offsets = {}
	        for _ in range(sequenceCount):
	            nameSize = self._read_uint(stream, 1, byteorder)
	            data = stream.read(nameSize)
	            if len(data) != nameSize:
	                raise ValueError("Premature end of file")
	            name = data.decode("ASCII")
	            offsets[name] = self._read_uint(stream, 4, byteorder)
	        sequences = {}
	        self.sequences = sequences
	        for name, offset in offsets.items():
	            stream.seek(offset)
	            dnaSize = self._read_uint(stream, 4, byteorder)
	            sequence = _TwoBitSequenceData(stream, offset, dnaSize)
	            nBlockCount = self._read_uint(stream, 4, byteorder)
	            sequence.nBlocks = self._read_blocks(stream, nBlockCount, dtype)
	            maskBlockCount = self._read_uint(stream, 4, byteorder)
	            sequence.maskBlocks = self._read_blocks(stream, maskBlockCount, dtype)
	            reserved = self._read_uint(stream, 4, byteorder)
	            if reserved != 0:
	                raise ValueError("Found non-zero reserved field %u" % reserved)
	            # The packed DNA starts right after the reserved field.
	            sequence.offset = stream.tell()
	            sequences[name] = sequence

	    @staticmethod
	    def _read_uint(stream, size, byteorder):
	        """Read an unsigned integer of size bytes from the stream (PRIVATE).

	        Raises ValueError if fewer than size bytes could be read; otherwise
	        a truncated field would silently be interpreted as a smaller number
	        (an empty read as zero).
	        """
	        data = stream.read(size)
	        if len(data) != size:
	            raise ValueError("Premature end of file")
	        return int.from_bytes(data, byteorder, signed=False)

	    @staticmethod
	    def _read_blocks(stream, count, dtype):
	        """Read a table of count block starts and sizes from the stream (PRIVATE).

	        The file stores count start positions followed by count sizes. The
	        return value is a uint32 array with count rows, with the block start
	        in the first column and the block end (start + size) in the second.
	        """
	        starts = numpy.fromfile(stream, dtype=dtype, count=count)
	        sizes = numpy.fromfile(stream, dtype=dtype, count=count)
	        blocks = numpy.empty((count, 2), dtype="uint32")
	        blocks[:, 0] = starts
	        blocks[:, 1] = starts + sizes
	        return blocks

	    def parse(self, stream):
	        """Iterate over the sequences in the file.

	        Yields one SeqRecord per entry in self.sequences, with the sequence
	        name as its id. The stream argument is not used here, as the index
	        was already read by __init__.
	        """
	        for name, sequence in self.sequences.items():
	            yield SeqRecord(Seq(sequence), id=name)

	    def __getitem__(self, name):
	        """Return sequence associated with given name as a SeqRecord object.

	        Raises KeyError if there is no sequence with this name.
	        """
	        # The dictionary lookup raises KeyError(name) by itself for an
	        # unknown name.
	        sequence = self.sequences[name]
	        return SeqRecord(Seq(sequence), id=name)

	    def keys(self):
	        """Return the names of the sequences in the file (as a dictionary keys view)."""
	        return self.sequences.keys()

	    def __len__(self):
	        """Return number of sequences."""
	        return len(self.sequences)