Spaces:

aakash0017
/

DrVai-Rag-Testing

No application file

App Files Files Community

DrVai-Rag-Testing / myenv /lib /python3.10 /site-packages /Bio /SeqIO /NibIO.py

aakash0017

Upload folder using huggingface_hub

b7731cd over 1 year ago

raw

history blame contribute delete

5.84 kB

	# Copyright 2019 by Michiel de Hoon. All rights reserved.
	# Based on code contributed and copyright 2016 by Peter Cock.
	#
	# This file is part of the Biopython distribution and governed by your
	# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
	# Please see the LICENSE file that should have been included as part of this
	# package.
	"""Bio.SeqIO support for the UCSC nib file format.

	Nib stands for nibble (4 bit) representation of nucleotide sequences.
	The two nibbles in a byte each store one nucleotide, represented numerically
	as follows:

	- ``0`` - T
	- ``1`` - C
	- ``2`` - A
	- ``3`` - G
	- ``4`` - N (unknown)

	As the first bit in a nibble is set if the nucleotide is soft-masked, we
	additionally have:

	- ``8`` - t
	- ``9`` - c
	- ``a`` - a
	- ``b`` - g
	- ``c`` - n (unknown)

	A nib file contains only one sequence record.
	You are expected to use this module via the Bio.SeqIO functions under
	the format name "nib":

	>>> from Bio import SeqIO
	>>> record = SeqIO.read("Nib/test_even_bigendian.nib", "nib")
	>>> print("%i %s..." % (len(record), record.seq[:20]))
	50 nAGAAGagccgcNGgCActt...

	For detailed information on the file format, please see the UCSC
	description at https://genome.ucsc.edu/FAQ/FAQformat.html.
	"""
	import binascii
	import struct
	import sys

	from Bio.Seq import Seq
	from Bio.SeqRecord import SeqRecord

	from .Interfaces import SequenceIterator
	from .Interfaces import SequenceWriter


	class NibIterator(SequenceIterator):
	    """Parser for nib files.

	    A nib file consists of a 4-byte signature (which also reveals the byte
	    order used by the writer), a 4-byte sequence length, and the sequence
	    itself packed two nucleotides per byte.
	    """

	    def __init__(self, source):
	        """Iterate over a nib file and yield a SeqRecord.

	        - source - a file-like object or a path to a file in the nib file
	          format as defined by UCSC; the file must be opened in binary mode.

	        Note that a nib file always contains only one sequence record.
	        The sequence of the resulting SeqRecord object should match the sequence
	        generated by Jim Kent's nibFrag utility run with the -masked option.

	        This function is used internally via the Bio.SeqIO functions:

	        >>> from Bio import SeqIO
	        >>> record = SeqIO.read("Nib/test_even_bigendian.nib", "nib")
	        >>> print("%s %i" % (record.seq, len(record)))
	        nAGAAGagccgcNGgCActtGAnTAtCGTCgcCacCaGncGncTtGNtGG 50

	        You can also call it directly:

	        >>> with open("Nib/test_even_bigendian.nib", "rb") as handle:
	        ...     for record in NibIterator(handle):
	        ...         print("%s %i" % (record.seq, len(record)))
	        ...
	        nAGAAGagccgcNGgCActtGAnTAtCGTCgcCacCaGncGncTtGNtGG 50

	        """
	        super().__init__(source, mode="b", fmt="Nib")

	    def parse(self, handle):
	        """Start parsing the file, and return a SeqRecord generator.

	        Reads the 4-byte signature from ``handle`` to determine the byte
	        order of the file, then hands the rest of the stream to ``iterate``.

	        Raises ValueError if the file is empty or if the signature is not
	        one of the two recognized nib signatures.
	        """
	        word = handle.read(4)
	        if not word:
	            raise ValueError("Empty file.")
	        signature = word.hex()
	        if signature == "3a3de96b":
	            byteorder = "little"  # little-endian
	        elif signature == "6be93d3a":
	            byteorder = "big"  # big-endian
	        else:
	            raise ValueError("unexpected signature in nib header")
	        return self.iterate(handle, byteorder)

	    def iterate(self, handle, byteorder):
	        """Iterate over the records in the nib file.

	        - handle - binary stream positioned just after the 4-byte signature.
	        - byteorder - "little" or "big", as detected from the signature.

	        Yields exactly one SeqRecord holding the decoded sequence.

	        Raises ValueError if the length field is truncated, if the amount of
	        sequence data does not match the declared length, or if the data
	        contains a nibble value that does not encode a nucleotide.
	        """
	        number = handle.read(4)
	        # int.from_bytes accepts short (even empty) input, so a truncated
	        # length field would otherwise be decoded as a bogus length - in the
	        # worst case 0, yielding an empty record for a corrupt file.
	        if len(number) != 4:
	            raise ValueError("Unexpected file size")
	        length = int.from_bytes(number, byteorder)
	        data = handle.read()
	        # One hexadecimal digit per nibble, i.e. one digit per nucleotide.
	        indices = binascii.hexlify(data)
	        # Nucleotides are stored two per byte, so a sequence of odd length
	        # is followed by a single padding nibble.
	        if len(indices) != length + length % 2:
	            raise ValueError("Unexpected file size")
	        indices = indices[:length]  # drop the padding nibble, if any
	        if not set(indices).issubset(b"0123489abc"):
	            raise ValueError("Unexpected sequence data found in file")
	        # Digits 0-4 map to upper case letters; 8-c (high bit of the nibble
	        # set) map to the corresponding lower case, soft-masked letters.
	        table = bytes.maketrans(b"0123489abc", b"TCAGNtcagn")
	        nucleotides = indices.translate(table)
	        yield SeqRecord(Seq(nucleotides))


	class NibWriter(SequenceWriter):
	    """Writer that stores a single sequence record in nib format."""

	    def __init__(self, target):
	        """Set up a writer for the nib format.

	        Arguments:
	         - target - output stream opened in binary mode, or a path to a file

	        """
	        super().__init__(target, mode="wb")

	    def write_header(self):
	        """Write the nib signature in the byte order of this machine.

	        Raises RuntimeError if ``sys.byteorder`` is neither "little" nor
	        "big".
	        """
	        super().write_header()
	        stream = self.handle
	        order = sys.byteorder
	        # Hexadecimal form of the signature for each supported byte order.
	        magic = {"little": "3a3de96b", "big": "6be93d3a"}.get(order)
	        if magic is None:
	            raise RuntimeError(f"unexpected system byte order {order}")
	        stream.write(bytes.fromhex(magic))

	    def write_record(self, record):
	        """Write the length and packed sequence of one record.

	        The sequence length is written first, followed by the nucleotides
	        packed two per byte. A sequence of odd length is padded with one
	        extra "T" so that it fills a whole number of bytes.

	        Raises ValueError if the sequence contains any letter other than
	        A, C, G, T, N in upper or lower case.
	        """
	        stream = self.handle
	        seq = record.seq
	        residues = bytes(seq)
	        size = len(seq)
	        stream.write(struct.pack("i", size))
	        if size % 2:
	            # Pad to an even number of nibbles; "T" is encoded as 0.
	            residues += b"T"
	        if set(residues).difference(b"ACGTNacgtn"):
	            raise ValueError("Sequence should contain A,C,G,T,N,a,c,g,t,n only")
	        # Map each letter to the hexadecimal digit of its nibble value, then
	        # let unhexlify pack every pair of digits into one byte.
	        to_hex_digit = bytes.maketrans(b"TCAGNtcagn", b"0123489abc")
	        stream.write(binascii.unhexlify(residues.translate(to_hex_digit)))

	    def write_file(self, records):
	        """Write the complete file, and return the number of records written.

	        Exactly one record is required (``mincount=1, maxcount=1``).
	        """
	        return super().write_file(records, mincount=1, maxcount=1)


	if __name__ == "__main__":
	    # When executed as a script, run the doctests embedded in this module's
	    # docstrings via Biopython's helper.
	    from Bio._utils import run_doctest

	    run_doctest(verbose=0)