Spaces:
No application file
No application file
File size: 8,069 Bytes
b7731cd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 |
# Copyright 2019 Damien Goutte-Gattat. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Bio.SeqIO support for the "gck" file format.

The GCK binary format is generated by the Gene Construction Kit software
from Textco BioSoftware, Inc.
"""
from struct import unpack
from Bio.Seq import Seq
from Bio.SeqFeature import SimpleLocation
from Bio.SeqFeature import SeqFeature
from Bio.SeqRecord import SeqRecord
from .Interfaces import SequenceIterator
def _read(handle, length):
    """Fetch exactly ``length`` bytes from ``handle``.

    Returns the bytes read. Raises ValueError when the handle yields
    fewer bytes than requested (e.g. on a truncated file).
    """
    chunk = handle.read(length)
    if len(chunk) >= length:
        return chunk
    raise ValueError(f"Cannot read {length} bytes from handle")
def _read_packet(handle):
    """Read one length-prefixed packet from the handle.

    A GCK "packet" is a 4-byte big-endian unsigned size followed by that
    many bytes of data. Packets carry no type tag: what a packet contains
    is determined only by where it sits in the file.

    Returns a ``(data, length)`` tuple. Raises ValueError (from ``_read``)
    if the handle runs out of bytes.
    """
    (size,) = unpack(">I", _read(handle, 4))
    payload = _read(handle, size)
    return (payload, size)
def _read_pstring(handle):
    """Read a Pascal string and return it as ``str``.

    The first byte gives the string's length; the string itself follows
    and is decoded as ASCII.
    """
    (size,) = unpack(">B", _read(handle, 1))
    raw = _read(handle, size)
    return raw.decode("ASCII")
def _read_p4string(handle):
    """Read a "32-bit Pascal string" and return it as ``str``.

    Same layout as a Pascal string, except that the length prefix is a
    4-byte big-endian unsigned integer. The payload is decoded as ASCII.
    """
    (size,) = unpack(">I", _read(handle, 4))
    raw = _read(handle, size)
    return raw.decode("ASCII")
def _read_sequence(handle):
    """Read the sequence packet and return the sequence as a ``str``.

    The body of the sequence packet starts with a 32-bit integer giving
    the length of the sequence, followed by the sequence itself.

    Raises ValueError if the packet is too short to hold that integer or
    if the declared length exceeds the room left in the packet.
    """
    packet, length = _read_packet(handle)
    # Without this guard a short packet would surface as struct.error
    # instead of the ValueError used for every other malformed input.
    if length < 4:
        raise ValueError("Sequence packet too short")
    seq_length = unpack(">I", packet[:4])[0]
    # This length should not be larger than the length of the
    # sequence packet.
    if seq_length > length - 4:
        raise ValueError("Conflicting sequence length values")
    return packet[4:].decode("ASCII")


def _read_features(handle, sequence_length):
    """Read the features packet, then each feature's name and comment.

    ``sequence_length`` is the length of the sequence already read; the
    features packet repeats it and both values must agree.

    Returns a list of SeqFeature objects, keeping only the most recent
    version of each feature. Raises ValueError on inconsistent sizes.
    """
    packet, length = _read_packet(handle)
    if length < 6:
        raise ValueError("Features packet too short")
    (declared_length, num_features) = unpack(">IH", packet[:6])
    # Check that length in the features packet matches the actual
    # length of the sequence
    if declared_length != sequence_length:
        raise ValueError("Conflicting sequence length values")
    # Each feature is stored in a 92-bytes structure.
    if length - 6 != num_features * 92:
        raise ValueError("Features packet size inconsistent with number of features")

    features = []
    for i in range(num_features):
        offset = 6 + i * 92
        # There's probably more stuff to unpack in that structure,
        # but those values are the only ones understood so far.
        (start, end, kind, strand, has_name, has_comment, version) = unpack(
            ">II6xH14xB17xII35xB", packet[offset : offset + 92]
        )

        # 1 means reverse strand. Other possible values are 0 (no strand
        # specified), 2 (forward strand), and 3 (both strands); all of
        # them are treated as a forward strand.
        strand = -1 if strand == 1 else 1
        location = SimpleLocation(start, end, strand=strand)

        # It looks like any value > 0 indicates a CDS...
        feature_type = "CDS" if kind > 0 else "misc_feature"

        # Each feature may have a name and a comment, which are stored
        # immediately after the features packet: names as Pascal strings,
        # comments as "32-bit Pascal strings". They must be consumed from
        # the handle even for features that end up being discarded.
        qualifiers = {}
        if has_name > 0:
            qualifiers["label"] = [_read_pstring(handle)]
        if has_comment > 0:
            qualifiers["note"] = [_read_p4string(handle)]

        # Each feature may exist in several "versions". We keep only
        # the most recent version.
        if version > 0:
            continue

        features.append(
            SeqFeature(location, type=feature_type, qualifiers=qualifiers)
        )
    return features


def _skip_sites(handle):
    """Consume the restriction sites packet and the strings that follow it.

    Restriction sites are not used, but the packet must still be read so
    that the name and comment of each site (stored after the packet, as
    for features) can be skipped.
    """
    packet, length = _read_packet(handle)
    if length < 6:
        raise ValueError("Sites packet too short")
    num_sites = unpack(">IH", packet[:6])[1]
    # Each site is stored in a 88-bytes structure
    if length - 6 != num_sites * 88:
        raise ValueError("Sites packet size inconsistent with number of sites")
    for i in range(num_sites):
        offset = 6 + i * 88
        # Only the two "has name" / "has comment" fields matter here; the
        # leading start/end integers are covered by the 32 pad bytes.
        (has_name, has_comment) = unpack(">32xII48x", packet[offset : offset + 88])
        if has_name:
            _read_pstring(handle)
        if has_comment:
            _read_p4string(handle)


def _skip_versions(handle):
    """Consume the "version packets" and their trailing comments.

    These are not properly formatted packets: there is no size prefix.
    Instead a short integer gives the number of versions, followed by as
    many 260-bytes blocks. Comments, if any, come after all the blocks.
    """
    num_versions = unpack(">H", _read(handle, 2))[0]
    versions = _read(handle, num_versions * 260)
    for i in range(num_versions):
        block_end = (i + 1) * 260
        # The last 4 bytes of each block tell whether a comment exists.
        has_comment = unpack(">I", versions[block_end - 4 : block_end])[0]
        if has_comment > 0:
            _read_p4string(handle)


def _parse(handle):
    """Parse a GCK file and yield its single SeqRecord.

    ``handle`` is a binary file-like object positioned at the start of
    the file. Raises ValueError if the file is empty, truncated, or
    internally inconsistent.
    """
    # Skip file header
    # GCK files start with a 24-bytes header. Bytes 4 and 8 seem to
    # always be 12, maybe this could act as a magic cookie. Bytes
    # 17-20 and 21-24 contain variable values of unknown meaning.
    header = handle.read(24)
    if not header:
        raise ValueError("Empty file.")
    if len(header) < 24:
        raise ValueError("Improper header, cannot read 24 bytes from handle")

    # Read the actual sequence data
    sequence = _read_sequence(handle)
    record = SeqRecord(Seq(sequence))

    # Skip unknown packet
    _read_packet(handle)

    # Features, followed by their names and comments
    record.features.extend(_read_features(handle, len(sequence)))

    # Restriction sites (ignored), followed by their names and comments
    _skip_sites(handle)

    # Skip unknown packet
    _read_packet(handle)

    # Version blocks (ignored), followed by their comments
    _skip_versions(handle)

    # Skip unknown fixed-size block
    # Whatever this block contains, it is not preceded by any length
    # indicator, so its size is hoped to be constant in all files...
    _read(handle, 706)

    # Read the construct's name
    name = _read_pstring(handle)
    record.name = record.id = name.split(" ")[0]
    record.description = name

    # Circularity byte
    # There may be other flags in that block, but their meaning
    # is unknown.
    flags = _read(handle, 17)
    circularity = unpack(">16xB", flags)[0]
    record.annotations["topology"] = "circular" if circularity > 0 else "linear"

    yield record
class GckIterator(SequenceIterator):
    """Iterator over the record stored in a GCK file."""

    def __init__(self, source):
        """Set up parsing of ``source`` as a binary GCK file."""
        super().__init__(source, mode="b", fmt="GCK")

    def parse(self, handle):
        """Return a generator of SeqRecord objects read from ``handle``.

        A GCK file holds exactly one sequence, so the generator produces
        a single record.
        """
        return _parse(handle)
|