Spaces:
No application file
No application file
# Copyright 2017-2019 Damien Goutte-Gattat. All rights reserved. | |
# | |
# This file is part of the Biopython distribution and governed by your | |
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
# Please see the LICENSE file that should have been included as part of this | |
# package. | |
"""Bio.SeqIO support for the SnapGene file format. | |
The SnapGene binary format is the native format used by the SnapGene program | |
from GSL Biotech LLC. | |
""" | |
from datetime import datetime | |
from re import sub | |
from struct import unpack | |
from xml.dom.minidom import parseString | |
from Bio.Seq import Seq | |
from Bio.SeqFeature import SimpleLocation | |
from Bio.SeqFeature import SeqFeature | |
from Bio.SeqRecord import SeqRecord | |
from .Interfaces import SequenceIterator | |
def _iterate(handle): | |
"""Iterate over the packets of a SnapGene file. | |
A SnapGene file is made of packets, each packet being a TLV-like | |
structure comprising: | |
- 1 single byte indicating the packet's type; | |
- 1 big-endian long integer (4 bytes) indicating the length of the | |
packet's data; | |
- the actual data. | |
""" | |
while True: | |
packet_type = handle.read(1) | |
if len(packet_type) < 1: # No more packet | |
return | |
packet_type = unpack(">B", packet_type)[0] | |
length = handle.read(4) | |
if len(length) < 4: | |
raise ValueError("Unexpected end of packet") | |
length = unpack(">I", length)[0] | |
data = handle.read(length) | |
if len(data) < length: | |
raise ValueError("Unexpected end of packet") | |
yield (packet_type, length, data) | |
def _parse_dna_packet(length, data, record): | |
"""Parse a DNA sequence packet. | |
A DNA sequence packet contains a single byte flag followed by the | |
sequence itself. | |
""" | |
if record.seq: | |
raise ValueError("The file contains more than one DNA packet") | |
flags, sequence = unpack(">B%ds" % (length - 1), data) | |
record.seq = Seq(sequence.decode("ASCII")) | |
record.annotations["molecule_type"] = "DNA" | |
if flags & 0x01: | |
record.annotations["topology"] = "circular" | |
else: | |
record.annotations["topology"] = "linear" | |
def _parse_notes_packet(length, data, record): | |
"""Parse a 'Notes' packet. | |
This type of packet contains some metadata about the sequence. They | |
are stored as a XML string with a 'Notes' root node. | |
""" | |
xml = parseString(data.decode("UTF-8")) | |
type = _get_child_value(xml, "Type") | |
if type == "Synthetic": | |
record.annotations["data_file_division"] = "SYN" | |
else: | |
record.annotations["data_file_division"] = "UNC" | |
date = _get_child_value(xml, "LastModified") | |
if date: | |
record.annotations["date"] = datetime.strptime(date, "%Y.%m.%d") | |
acc = _get_child_value(xml, "AccessionNumber") | |
if acc: | |
record.id = acc | |
comment = _get_child_value(xml, "Comments") | |
if comment: | |
record.name = comment.split(" ", 1)[0] | |
record.description = comment | |
if not acc: | |
record.id = record.name | |
def _parse_cookie_packet(length, data, record): | |
"""Parse a SnapGene cookie packet. | |
Every SnapGene file starts with a packet of this type. It acts as | |
a magic cookie identifying the file as a SnapGene file. | |
""" | |
cookie, seq_type, exp_version, imp_version = unpack(">8sHHH", data) | |
if cookie.decode("ASCII") != "SnapGene": | |
raise ValueError("The file is not a valid SnapGene file") | |
def _parse_location(rangespec, strand, record, is_primer=False): | |
start, end = (int(x) for x in rangespec.split("-")) | |
# Account for SnapGene's 1-based coordinates | |
start = start - 1 | |
if is_primer: | |
# Primers' coordinates in SnapGene files are shifted by -1 | |
# for some reasons | |
start += 1 | |
end += 1 | |
if start > end: | |
# Range wrapping the end of the sequence | |
l1 = SimpleLocation(start, len(record), strand=strand) | |
l2 = SimpleLocation(0, end, strand=strand) | |
location = l1 + l2 | |
else: | |
location = SimpleLocation(start, end, strand=strand) | |
return location | |
def _parse_features_packet(length, data, record): | |
"""Parse a sequence features packet. | |
This packet stores sequence features (except primer binding sites, | |
which are in a dedicated Primers packet). The data is a XML string | |
starting with a 'Features' root node. | |
""" | |
xml = parseString(data.decode("UTF-8")) | |
for feature in xml.getElementsByTagName("Feature"): | |
quals = {} | |
type = _get_attribute_value(feature, "type", default="misc_feature") | |
strand = +1 | |
directionality = int( | |
_get_attribute_value(feature, "directionality", default="1") | |
) | |
if directionality == 2: | |
strand = -1 | |
location = None | |
subparts = [] | |
n_parts = 0 | |
for segment in feature.getElementsByTagName("Segment"): | |
if _get_attribute_value(segment, "type", "standard") == "gap": | |
continue | |
rng = _get_attribute_value(segment, "range") | |
n_parts += 1 | |
next_location = _parse_location(rng, strand, record) | |
if not location: | |
location = next_location | |
elif strand == -1: | |
# Reverse segments order for reverse-strand features | |
location = next_location + location | |
else: | |
location = location + next_location | |
name = _get_attribute_value(segment, "name") | |
if name: | |
subparts.append([n_parts, name]) | |
if len(subparts) > 0: | |
# Add a "parts" qualifiers to represent "named subfeatures" | |
if strand == -1: | |
# Reverse segment indexes and order for reverse-strand features | |
subparts = reversed([[n_parts - i + 1, name] for i, name in subparts]) | |
quals["parts"] = [";".join(f"{i}:{name}" for i, name in subparts)] | |
if not location: | |
raise ValueError("Missing feature location") | |
for qualifier in feature.getElementsByTagName("Q"): | |
qname = _get_attribute_value( | |
qualifier, "name", error="Missing qualifier name" | |
) | |
qvalues = [] | |
for value in qualifier.getElementsByTagName("V"): | |
if value.hasAttribute("text"): | |
qvalues.append(_decode(value.attributes["text"].value)) | |
elif value.hasAttribute("predef"): | |
qvalues.append(_decode(value.attributes["predef"].value)) | |
elif value.hasAttribute("int"): | |
qvalues.append(int(value.attributes["int"].value)) | |
quals[qname] = qvalues | |
name = _get_attribute_value(feature, "name") | |
if name: | |
if "label" not in quals: | |
# No explicit label attribute, use the SnapGene name | |
quals["label"] = [name] | |
elif name not in quals["label"]: | |
# The SnapGene name is different from the label, | |
# add a specific attribute to represent it | |
quals["name"] = [name] | |
feature = SeqFeature(location, type=type, qualifiers=quals) | |
record.features.append(feature) | |
def _parse_primers_packet(length, data, record): | |
"""Parse a Primers packet. | |
A Primers packet is similar to a Features packet but specifically | |
stores primer binding features. The data is a XML string starting | |
with a 'Primers' root node. | |
""" | |
xml = parseString(data.decode("UTF-8")) | |
for primer in xml.getElementsByTagName("Primer"): | |
quals = {} | |
name = _get_attribute_value(primer, "name") | |
if name: | |
quals["label"] = [name] | |
locations = [] | |
for site in primer.getElementsByTagName("BindingSite"): | |
rng = _get_attribute_value( | |
site, "location", error="Missing binding site location" | |
) | |
strand = int(_get_attribute_value(site, "boundStrand", default="0")) | |
if strand == 1: | |
strand = -1 | |
else: | |
strand = +1 | |
location = _parse_location(rng, strand, record, is_primer=True) | |
simplified = int(_get_attribute_value(site, "simplified", default="0")) == 1 | |
if simplified and location in locations: | |
# Duplicate "simplified" binding site, ignore | |
continue | |
locations.append(location) | |
feature = SeqFeature( | |
location, | |
type="primer_bind", | |
qualifiers=quals, | |
) | |
record.features.append(feature) | |
_packet_handlers = { | |
0x00: _parse_dna_packet, | |
0x05: _parse_primers_packet, | |
0x06: _parse_notes_packet, | |
0x0A: _parse_features_packet, | |
} | |
# Helper functions to process the XML data in | |
# some of the segments | |
def _decode(text): | |
# Get rid of HTML tags in some values | |
return sub("<[^>]+>", "", text) | |
def _get_attribute_value(node, name, default=None, error=None): | |
if node.hasAttribute(name): | |
return _decode(node.attributes[name].value) | |
elif error: | |
raise ValueError(error) | |
else: | |
return default | |
def _get_child_value(node, name, default=None, error=None): | |
children = node.getElementsByTagName(name) | |
if ( | |
children | |
and children[0].childNodes | |
and children[0].firstChild.nodeType == node.TEXT_NODE | |
): | |
return _decode(children[0].firstChild.data) | |
elif error: | |
raise ValueError(error) | |
else: | |
return default | |
class SnapGeneIterator(SequenceIterator): | |
"""Parser for SnapGene files.""" | |
def __init__(self, source): | |
"""Parse a SnapGene file and return a SeqRecord object. | |
Argument source is a file-like object or a path to a file. | |
Note that a SnapGene file can only contain one sequence, so this | |
iterator will always return a single record. | |
""" | |
super().__init__(source, mode="b", fmt="SnapGene") | |
def parse(self, handle): | |
"""Start parsing the file, and return a SeqRecord generator.""" | |
records = self.iterate(handle) | |
return records | |
def iterate(self, handle): | |
"""Iterate over the records in the SnapGene file.""" | |
record = SeqRecord(None) | |
packets = _iterate(handle) | |
try: | |
packet_type, length, data = next(packets) | |
except StopIteration: | |
raise ValueError("Empty file.") from None | |
if packet_type != 0x09: | |
raise ValueError("The file does not start with a SnapGene cookie packet") | |
_parse_cookie_packet(length, data, record) | |
for (packet_type, length, data) in packets: | |
handler = _packet_handlers.get(packet_type) | |
if handler is not None: | |
handler(length, data, record) | |
if not record.seq: | |
raise ValueError("No DNA packet in file") | |
yield record | |