Spaces:

aakash0017
/

DrVai-Rag-Testing

No application file

File size: 5,841 Bytes

b7731cd

# Copyright 2019 by Michiel de Hoon.  All rights reserved.
# Based on code contributed and copyright 2016 by Peter Cock.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Bio.SeqIO support for the UCSC nib file format.

Nib stands for nibble (4 bit) representation of nucleotide sequences.
The two nibbles in a byte each store one nucleotide, represented numerically
as follows:

    - ``0`` - T
    - ``1`` - C
    - ``2`` - A
    - ``3`` - G
    - ``4`` - N (unknown)

As the first bit in a nibble is set if the nucleotide is soft-masked, we
additionally have:

    - ``8`` - t
    - ``9`` - c
    - ``a`` - a
    - ``b`` - g
    - ``c`` - n (unknown)

A nib file contains only one sequence record.
You are expected to use this module via the Bio.SeqIO functions under
the format name "nib":

    >>> from Bio import SeqIO
    >>> record = SeqIO.read("Nib/test_even_bigendian.nib", "nib")
    >>> print("%i %s..." % (len(record), record.seq[:20]))
    50 nAGAAGagccgcNGgCActt...

For detailed information on the file format, please see the UCSC
description at https://genome.ucsc.edu/FAQ/FAQformat.html.
"""
import binascii
import struct
import sys

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

from .Interfaces import SequenceIterator
from .Interfaces import SequenceWriter


class NibIterator(SequenceIterator):
    """Parser for nib files."""

    def __init__(self, source):
        """Iterate over a nib file and yield a SeqRecord.

            - source - a file-like object or a path to a file in the nib file
              format as defined by UCSC; the file must be opened in binary mode.

        Note that a nib file always contains only one sequence record.
        The sequence of the resulting SeqRecord object should match the sequence
        generated by Jim Kent's nibFrag utility run with the -masked option.

        This function is used internally via the Bio.SeqIO functions:

        >>> from Bio import SeqIO
        >>> record = SeqIO.read("Nib/test_even_bigendian.nib", "nib")
        >>> print("%s %i" % (record.seq, len(record)))
        nAGAAGagccgcNGgCActtGAnTAtCGTCgcCacCaGncGncTtGNtGG 50

        You can also call it directly:

        >>> with open("Nib/test_even_bigendian.nib", "rb") as handle:
        ...     for record in NibIterator(handle):
        ...         print("%s %i" % (record.seq, len(record)))
        ...
        nAGAAGagccgcNGgCActtGAnTAtCGTCgcCacCaGncGncTtGNtGG 50

        """
        super().__init__(source, mode="b", fmt="Nib")

    def parse(self, handle):
        """Start parsing the file, and return a SeqRecord generator."""
        word = handle.read(4)
        if not word:
            raise ValueError("Empty file.")
        signature = word.hex()
        if signature == "3a3de96b":
            byteorder = "little"  # little-endian
        elif signature == "6be93d3a":
            byteorder = "big"  # big-endian
        else:
            raise ValueError("unexpected signature in nib header")
        records = self.iterate(handle, byteorder)
        return records

    def iterate(self, handle, byteorder):
        """Iterate over the records in the nib file."""
        number = handle.read(4)
        length = int.from_bytes(number, byteorder)
        data = handle.read()
        indices = binascii.hexlify(data)
        if length % 2 == 0:
            if len(indices) != length:
                raise ValueError("Unexpected file size")
        elif length % 2 == 1:
            if len(indices) != length + 1:
                raise ValueError("Unexpected file size")
            indices = indices[:length]
        if not set(indices).issubset(b"0123489abc"):
            raise ValueError("Unexpected sequence data found in file")
        table = bytes.maketrans(b"0123489abc", b"TCAGNtcagn")
        nucleotides = indices.translate(table)
        sequence = Seq(nucleotides)
        record = SeqRecord(sequence)
        yield record


class NibWriter(SequenceWriter):
    """Nib file writer."""

    def __init__(self, target):
        """Initialize a Nib writer object.

        Arguments:
         - target - output stream opened in binary mode, or a path to a file

        """
        super().__init__(target, mode="wb")

    def write_header(self):
        """Write the file header."""
        super().write_header()
        handle = self.handle
        byteorder = sys.byteorder
        if byteorder == "little":  # little-endian
            signature = "3a3de96b"
        elif byteorder == "big":  # big-endian
            signature = "6be93d3a"
        else:
            raise RuntimeError(f"unexpected system byte order {byteorder}")
        handle.write(bytes.fromhex(signature))

    def write_record(self, record):
        """Write a single record to the output file."""
        handle = self.handle
        sequence = record.seq
        nucleotides = bytes(sequence)
        length = len(sequence)
        handle.write(struct.pack("i", length))
        table = bytes.maketrans(b"TCAGNtcagn", b"0123489abc")
        padding = length % 2
        suffix = padding * b"T"
        nucleotides += suffix
        if not set(nucleotides).issubset(b"ACGTNacgtn"):
            raise ValueError("Sequence should contain A,C,G,T,N,a,c,g,t,n only")
        indices = nucleotides.translate(table)
        handle.write(binascii.unhexlify(indices))

    def write_file(self, records):
        """Write the complete file with the records, and return the number of records."""
        count = super().write_file(records, mincount=1, maxcount=1)
        return count


if __name__ == "__main__":
    from Bio._utils import run_doctest

    run_doctest(verbose=0)