Spaces:
No application file
No application file
File size: 5,841 Bytes
b7731cd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 |
# Copyright 2019 by Michiel de Hoon. All rights reserved.
# Based on code contributed and copyright 2016 by Peter Cock.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Bio.SeqIO support for the UCSC nib file format.

Nib stands for nibble (4 bit) representation of nucleotide sequences.
The two nibbles in a byte each store one nucleotide, represented numerically
as follows:

    - ``0`` - T
    - ``1`` - C
    - ``2`` - A
    - ``3`` - G
    - ``4`` - N (unknown)

As the first bit in a nibble is set if the nucleotide is soft-masked, we
additionally have:

    - ``8`` - t
    - ``9`` - c
    - ``a`` - a
    - ``b`` - g
    - ``c`` - n (unknown)

A nib file contains only one sequence record.
You are expected to use this module via the Bio.SeqIO functions under
the format name "nib":

    >>> from Bio import SeqIO
    >>> record = SeqIO.read("Nib/test_even_bigendian.nib", "nib")
    >>> print("%i %s..." % (len(record), record.seq[:20]))
    50 nAGAAGagccgcNGgCActt...

For detailed information on the file format, please see the UCSC
description at https://genome.ucsc.edu/FAQ/FAQformat.html.
"""
import binascii
import struct
import sys
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from .Interfaces import SequenceIterator
from .Interfaces import SequenceWriter
class NibIterator(SequenceIterator):
    """Parser for nib files.

    The parser reads a 4-byte signature, a 4-byte sequence length, and then
    the sequence data packed as two nucleotides per byte. A nib file always
    contains exactly one sequence record.
    """

    def __init__(self, source):
        """Iterate over a nib file and yield a SeqRecord.

         - source - a file-like object or a path to a file in the nib file
           format as defined by UCSC; the file must be opened in binary mode.

        Note that a nib file always contains only one sequence record.
        The sequence of the resulting SeqRecord object should match the sequence
        generated by Jim Kent's nibFrag utility run with the -masked option.

        This function is used internally via the Bio.SeqIO functions:

        >>> from Bio import SeqIO
        >>> record = SeqIO.read("Nib/test_even_bigendian.nib", "nib")
        >>> print("%s %i" % (record.seq, len(record)))
        nAGAAGagccgcNGgCActtGAnTAtCGTCgcCacCaGncGncTtGNtGG 50

        You can also call it directly:

        >>> with open("Nib/test_even_bigendian.nib", "rb") as handle:
        ...     for record in NibIterator(handle):
        ...         print("%s %i" % (record.seq, len(record)))
        ...
        nAGAAGagccgcNGgCActtGAnTAtCGTCgcCacCaGncGncTtGNtGG 50

        """
        super().__init__(source, mode="b", fmt="Nib")

    def parse(self, handle):
        """Start parsing the file, and return a SeqRecord generator.

        Reads the 4-byte signature from ``handle`` to determine the byte
        order used for the length field that follows.

        Raises ValueError if the file is empty or the signature is not one
        of the two recognized byte sequences.
        """
        word = handle.read(4)
        if not word:
            raise ValueError("Empty file.")
        signature = word.hex()
        if signature == "3a3de96b":
            byteorder = "little"  # little-endian
        elif signature == "6be93d3a":
            byteorder = "big"  # big-endian
        else:
            raise ValueError("unexpected signature in nib header")
        records = self.iterate(handle, byteorder)
        return records

    def iterate(self, handle, byteorder):
        """Iterate over the records in the nib file.

        Arguments:
         - handle - binary stream positioned just after the signature.
         - byteorder - "little" or "big"; byte order of the length field.

        Yields a single SeqRecord. As this is a generator, the ValueError
        raised for a truncated, oversized or otherwise invalid file is only
        raised once iteration starts.
        """
        number = handle.read(4)
        if len(number) != 4:
            # A short read would otherwise be decoded as a smaller number;
            # in particular an empty read decodes to 0 and would silently
            # produce an empty sequence from a truncated file.
            raise ValueError("Unexpected file size")
        length = int.from_bytes(number, byteorder)
        data = handle.read()
        # One hexadecimal digit per nibble, i.e. one digit per nucleotide.
        indices = binascii.hexlify(data)
        # An odd-length sequence occupies a whole number of bytes, so the
        # data carries one extra padding nibble.
        if len(indices) != length + length % 2:
            raise ValueError("Unexpected file size")
        indices = indices[:length]  # drop the padding nibble, if any
        if not set(indices).issubset(b"0123489abc"):
            raise ValueError("Unexpected sequence data found in file")
        table = bytes.maketrans(b"0123489abc", b"TCAGNtcagn")
        nucleotides = indices.translate(table)
        sequence = Seq(nucleotides)
        record = SeqRecord(sequence)
        yield record
class NibWriter(SequenceWriter):
    """Nib file writer.

    Writes a signature, the sequence length, and the sequence packed as two
    nucleotides per byte. Exactly one record is written per file.
    """

    def __init__(self, target):
        """Initialize a Nib writer object.

        Arguments:
         - target - output stream opened in binary mode, or a path to a file

        """
        super().__init__(target, mode="wb")

    def write_header(self):
        """Write the file header.

        The signature is written in the byte order of the running system.

        Raises RuntimeError if ``sys.byteorder`` is neither "little" nor
        "big".
        """
        super().write_header()
        handle = self.handle
        byteorder = sys.byteorder
        if byteorder == "little":  # little-endian
            signature = "3a3de96b"
        elif byteorder == "big":  # big-endian
            signature = "6be93d3a"
        else:
            raise RuntimeError(f"unexpected system byte order {byteorder}")
        handle.write(bytes.fromhex(signature))

    def write_record(self, record):
        """Write a single record to the output file.

        Arguments:
         - record - object whose ``seq`` attribute holds the sequence to
           write.

        Raises ValueError if the sequence contains any letter other than
        A, C, G, T, N, a, c, g, t, n. The sequence is validated before
        anything is written, so an invalid sequence does not leave a
        dangling length field in the output.
        """
        handle = self.handle
        sequence = record.seq
        nucleotides = bytes(sequence)
        if not set(nucleotides).issubset(b"ACGTNacgtn"):
            raise ValueError("Sequence should contain A,C,G,T,N,a,c,g,t,n only")
        length = len(sequence)
        if length % 2 == 1:
            # Pad to a whole number of bytes; "T" maps to nibble 0.
            nucleotides += b"T"
        table = bytes.maketrans(b"TCAGNtcagn", b"0123489abc")
        indices = nucleotides.translate(table)
        # The struct format has no prefix, so the length is packed in native
        # byte order, like the signature written by write_header.
        handle.write(struct.pack("i", length))
        handle.write(binascii.unhexlify(indices))

    def write_file(self, records):
        """Write the complete file with the records, and return the number of records.

        Exactly one record is requested from the base class writer
        (``mincount=1, maxcount=1``).
        """
        count = super().write_file(records, mincount=1, maxcount=1)
        return count
if __name__ == "__main__":
    # When executed as a script, call Biopython's run_doctest helper with
    # verbose output disabled.
    from Bio import _utils

    _utils.run_doctest(verbose=0)
|