Spaces:
No application file
No application file
# Copyright 2019 Damien Goutte-Gattat. All rights reserved. | |
# | |
# This file is part of the Biopython distribution and governed by your | |
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
# Please see the LICENSE file that should have been included as part of this | |
# package. | |
"""Bio.SeqIO support for the "gck" file format. | |
The GCK binary format is generated by the Gene Construction Kit software | |
from Textco BioSoftware, Inc. | |
""" | |
from struct import unpack | |
from Bio.Seq import Seq | |
from Bio.SeqFeature import SimpleLocation | |
from Bio.SeqFeature import SeqFeature | |
from Bio.SeqRecord import SeqRecord | |
from .Interfaces import SequenceIterator | |
def _read(handle, length): | |
"""Read the specified number of bytes from the given handle.""" | |
data = handle.read(length) | |
if len(data) < length: | |
raise ValueError(f"Cannot read {length} bytes from handle") | |
return data | |
def _read_packet(handle): | |
"""Read a length-prefixed packet. | |
Parts of a GCK file are made of "packets" comprising of 4 bytes | |
giving the packet's size, followed by the packet's data. | |
There is no type tag. The type of a packet, and thus the type of data | |
it contains, is solely indicated by the position of the packet within | |
the GCK file. | |
""" | |
length = _read(handle, 4) | |
length = unpack(">I", length)[0] | |
data = _read(handle, length) | |
return (data, length) | |
def _read_pstring(handle): | |
"""Read a Pascal string. | |
A Pascal string is one byte for length followed by the actual string. | |
""" | |
length = _read(handle, 1) | |
length = unpack(">B", length)[0] | |
data = _read(handle, length).decode("ASCII") | |
return data | |
def _read_p4string(handle): | |
"""Read a 32-bit Pascal string. | |
Similar to a Pascal string but length is encoded on 4 bytes. | |
""" | |
length = _read(handle, 4) | |
length = unpack(">I", length)[0] | |
data = _read(handle, length).decode("ASCII") | |
return data | |
def _parse(handle): | |
# Skip file header | |
# GCK files start with a 24-bytes header. Bytes 4 and 8 seem to | |
# always be 12, maybe this could act as a magic cookie. Bytes | |
# 17-20 and 21-24 contain variable values of unknown meaning. | |
# check if file is empty | |
data = handle.read(24) | |
if not data: | |
raise ValueError("Empty file.") | |
if len(data) < 24: | |
raise ValueError("Improper header, cannot read 24 bytes from handle") | |
# Read the actual sequence data | |
packet, length = _read_packet(handle) | |
# The body of the sequence packet starts with a 32-bit integer | |
# representing the length of the sequence. | |
seq_length = unpack(">I", packet[:4])[0] | |
# This length should not be larger than the length of the | |
# sequence packet. | |
if seq_length > length - 4: | |
raise ValueError("Conflicting sequence length values") | |
sequence = packet[4:].decode("ASCII") | |
record = SeqRecord(Seq(sequence)) | |
# Skip unknown packet | |
_read_packet(handle) | |
# Read features packet | |
packet, length = _read_packet(handle) | |
(seq_length, num_features) = unpack(">IH", packet[:6]) | |
# Check that length in the features packet matches the actual | |
# length of the sequence | |
if seq_length != len(sequence): | |
raise ValueError("Conflicting sequence length values") | |
# Each feature is stored in a 92-bytes structure. | |
if length - 6 != num_features * 92: | |
raise ValueError("Features packet size inconsistent with number of features") | |
for i in range(0, num_features): | |
offset = 6 + i * 92 | |
feature_data = packet[offset : offset + 92] | |
# There's probably more stuff to unpack in that structure, | |
# but those values are the only ones I understand. | |
(start, end, type, strand, has_name, has_comment, version) = unpack( | |
">II6xH14xB17xII35xB", feature_data | |
) | |
if strand == 1: # Reverse strand | |
strand = -1 | |
else: | |
# Other possible values are 0 (no strand specified), | |
# 2 (forward strand), and 3 (both strands). All are | |
# treated as a forward strand. | |
strand = 1 | |
location = SimpleLocation(start, end, strand=strand) | |
# It looks like any value > 0 indicates a CDS... | |
if type > 0: | |
type = "CDS" | |
else: | |
type = "misc_feature" | |
# Each feature may have a name and a comment, which are then | |
# stored immediately after the features packet. Names are | |
# stored as Pascal strings (1 length byte followed by the | |
# string itself), comments are stored as "32-bit Pascal strings" | |
# (4 length bytes followed by the string). | |
qualifiers = {} | |
if has_name > 0: | |
name = _read_pstring(handle) | |
qualifiers["label"] = [name] | |
if has_comment > 0: | |
comment = _read_p4string(handle) | |
qualifiers["note"] = [comment] | |
# Each feature may exist in several "versions". We keep only | |
# the most recent version. | |
if version > 0: | |
continue | |
feature = SeqFeature(location, type=type, qualifiers=qualifiers) | |
record.features.append(feature) | |
# Read restriction sites packet | |
# We are not interested in restriction sites, but we must still read | |
# that packet so that we can skip the names and comments for each | |
# site, which are stored after that packet in a similar way as for | |
# the features above. | |
packet, length = _read_packet(handle) | |
(seq_length, num_sites) = unpack(">IH", packet[:6]) | |
# Each site is stored in a 88-bytes structure | |
if length - 6 != num_sites * 88: | |
raise ValueError("Sites packet size inconsistent with number of sites") | |
for i in range(0, num_sites): | |
offset = 6 + i * 88 | |
site_data = packet[offset : offset + 88] | |
(start, end, has_name, has_comment) = unpack(">II24xII48x", site_data) | |
# Skip names and comments | |
if has_name: | |
_read_pstring(handle) | |
if has_comment: | |
_read_p4string(handle) | |
# Skip unknown packet | |
_read_packet(handle) | |
# Next in the file are "version packets". | |
# However they are not properly formatted "packets" as they are not | |
# preceded by an integer giving their size. Instead we have a | |
# short integer indicating how many versions are there, and then | |
# as many 260-bytes block as we have versions. | |
num_versions = _read(handle, 2) | |
num_versions = unpack(">H", num_versions)[0] | |
versions = _read(handle, num_versions * 260) | |
for i in range(0, num_versions): | |
offset = i * 260 | |
version_data = versions[offset : offset + 260] | |
# Each version may have a comment, which is then stored | |
# after all the "version packets". | |
has_comment = unpack(">I", version_data[-4:])[0] | |
if has_comment > 0: | |
_read_p4string(handle) | |
# Skip unknown fixed-size block | |
# Whatever this block contains, it is not preceded by any length | |
# indicator, so I hope its size is indeed constant in all files... | |
_read(handle, 706) | |
# Read the construct's name | |
name = _read_pstring(handle) | |
record.name = record.id = name.split(" ")[0] | |
record.description = name | |
# Circularity byte | |
# There may be other flags in that block, but their meaning | |
# is unknown to me. | |
flags = _read(handle, 17) | |
circularity = unpack(">16xB", flags)[0] | |
if circularity > 0: | |
record.annotations["topology"] = "circular" | |
else: | |
record.annotations["topology"] = "linear" | |
yield record | |
class GckIterator(SequenceIterator): | |
"""Parser for GCK files.""" | |
def __init__(self, source): | |
"""Break up a GCK file into SeqRecord objects.""" | |
super().__init__(source, mode="b", fmt="GCK") | |
def parse(self, handle): | |
"""Start parsing the file, and return a SeqRecord generator. | |
Note that a GCK file can only contain one sequence, so this | |
iterator will always return a single record. | |
""" | |
records = _parse(handle) | |
return records | |