File size: 5,841 Bytes
b7731cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
# Copyright 2019 by Michiel de Hoon.  All rights reserved.
# Based on code contributed and copyright 2016 by Peter Cock.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Bio.SeqIO support for the UCSC nib file format.

Nib stands for nibble (4 bit) representation of nucleotide sequences.
The two nibbles in a byte each store one nucleotide, represented numerically
as follows:

    - ``0`` - T
    - ``1`` - C
    - ``2`` - A
    - ``3`` - G
    - ``4`` - N (unknown)

As the first bit in a nibble is set if the nucleotide is soft-masked, we
additionally have:

    - ``8`` - t
    - ``9`` - c
    - ``a`` - a
    - ``b`` - g
    - ``c`` - n (unknown)

A nib file contains only one sequence record.
You are expected to use this module via the Bio.SeqIO functions under
the format name "nib":

    >>> from Bio import SeqIO
    >>> record = SeqIO.read("Nib/test_even_bigendian.nib", "nib")
    >>> print("%i %s..." % (len(record), record.seq[:20]))
    50 nAGAAGagccgcNGgCActt...

For detailed information on the file format, please see the UCSC
description at https://genome.ucsc.edu/FAQ/FAQformat.html.
"""
import binascii
import struct
import sys

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

from .Interfaces import SequenceIterator
from .Interfaces import SequenceWriter


class NibIterator(SequenceIterator):
    """Parser for nib files."""

    def __init__(self, source):
        """Iterate over a nib file and yield a SeqRecord.

            - source - a file-like object or a path to a file in the nib file
              format as defined by UCSC; the file must be opened in binary mode.

        Note that a nib file always contains only one sequence record.
        The sequence of the resulting SeqRecord object should match the sequence
        generated by Jim Kent's nibFrag utility run with the -masked option.

        This function is used internally via the Bio.SeqIO functions:

        >>> from Bio import SeqIO
        >>> record = SeqIO.read("Nib/test_even_bigendian.nib", "nib")
        >>> print("%s %i" % (record.seq, len(record)))
        nAGAAGagccgcNGgCActtGAnTAtCGTCgcCacCaGncGncTtGNtGG 50

        You can also call it directly:

        >>> with open("Nib/test_even_bigendian.nib", "rb") as handle:
        ...     for record in NibIterator(handle):
        ...         print("%s %i" % (record.seq, len(record)))
        ...
        nAGAAGagccgcNGgCActtGAnTAtCGTCgcCacCaGncGncTtGNtGG 50

        """
        super().__init__(source, mode="b", fmt="Nib")

    def parse(self, handle):
        """Start parsing the file, and return a SeqRecord generator.

        The first four bytes of the file are the signature; the order in
        which they appear determines the byte order used to decode the
        sequence length that follows.

        Raises ValueError if the file is empty or if the signature is not
        one of the two recognized byte sequences.
        """
        word = handle.read(4)
        if not word:
            raise ValueError("Empty file.")
        signature = word.hex()
        if signature == "3a3de96b":
            byteorder = "little"  # little-endian
        elif signature == "6be93d3a":
            byteorder = "big"  # big-endian
        else:
            raise ValueError("unexpected signature in nib header")
        return self.iterate(handle, byteorder)

    def iterate(self, handle, byteorder):
        """Iterate over the records in the nib file.

        Arguments:
         - handle - binary stream positioned just after the 4-byte signature
         - byteorder - "little" or "big", as determined from the signature

        Yields a single SeqRecord holding the decoded sequence.

        Raises ValueError if the length field is truncated, if the amount of
        sequence data does not agree with the stored length, or if the data
        contains a nibble value that does not encode a nucleotide.
        """
        number = handle.read(4)
        if len(number) != 4:
            # Without this check a file cut off inside the length field would
            # be decoded from the partial bytes; in particular, a file holding
            # only the signature would be accepted as an empty sequence.
            raise ValueError("Truncated nib header: sequence length is missing")
        length = int.from_bytes(number, byteorder)
        data = handle.read()
        # One hexadecimal digit per nibble, i.e. one digit per nucleotide.
        indices = binascii.hexlify(data)
        # Nucleotides are stored two per byte, so an odd-length sequence is
        # followed by one padding nibble.
        expected = length + length % 2
        if len(indices) != expected:
            raise ValueError("Unexpected file size")
        # Drop the padding nibble (no-op for even lengths) before validating,
        # so that its value is never interpreted as sequence data.
        indices = indices[:length]
        if not set(indices).issubset(b"0123489abc"):
            raise ValueError("Unexpected sequence data found in file")
        table = bytes.maketrans(b"0123489abc", b"TCAGNtcagn")
        nucleotides = indices.translate(table)
        sequence = Seq(nucleotides)
        record = SeqRecord(sequence)
        yield record


class NibWriter(SequenceWriter):
    """Nib file writer."""

    def __init__(self, target):
        """Initialize a Nib writer object.

        Arguments:
         - target - output stream opened in binary mode, or a path to a file

        """
        super().__init__(target, mode="wb")

    def write_header(self):
        """Write the file header.

        The header is the 4-byte nib signature, written in the byte order of
        the machine running this code (``sys.byteorder``).

        Raises RuntimeError if ``sys.byteorder`` is neither "little" nor "big".
        """
        super().write_header()
        handle = self.handle
        byteorder = sys.byteorder
        if byteorder == "little":  # little-endian
            signature = "3a3de96b"
        elif byteorder == "big":  # big-endian
            signature = "6be93d3a"
        else:
            raise RuntimeError(f"unexpected system byte order {byteorder}")
        handle.write(bytes.fromhex(signature))

    def write_record(self, record):
        """Write a single record to the output file.

        The sequence length is written as a native-byte-order integer,
        followed by the nucleotides packed two per byte.

        Arguments:
         - record - the record to write; ``record.seq`` must be convertible
           to bytes and may contain only A, C, G, T, N, a, c, g, t, n.

        Raises ValueError if the sequence contains any other letter. The
        sequence is validated and encoded before anything is written, so an
        invalid record does not leave a partial record in the output.
        """
        handle = self.handle
        sequence = record.seq
        nucleotides = bytes(sequence)
        length = len(sequence)
        # Validate first: nothing must reach the output for a bad sequence.
        if not set(nucleotides).issubset(b"ACGTNacgtn"):
            raise ValueError("Sequence should contain A,C,G,T,N,a,c,g,t,n only")
        if length % 2:
            # Two nucleotides are stored per byte; pad odd-length sequences
            # with a single T (nibble value 0) to fill the last byte.
            nucleotides += b"T"
        table = bytes.maketrans(b"TCAGNtcagn", b"0123489abc")
        # Each nucleotide becomes one hexadecimal digit; unhexlify then packs
        # every pair of digits into one byte.
        indices = nucleotides.translate(table)
        packed = binascii.unhexlify(indices)
        handle.write(struct.pack("i", length))
        handle.write(packed)

    def write_file(self, records):
        """Write the complete file with the records, and return the number of records.

        The base class is called with ``mincount=1`` and ``maxcount=1``, as a
        nib file contains only one sequence record.
        """
        return super().write_file(records, mincount=1, maxcount=1)


if __name__ == "__main__":
    # When executed as a script, run the doctests found in this module.
    from Bio import _utils

    _utils.run_doctest(verbose=0)