File size: 10,978 Bytes
b7731cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
# Copyright 2017-2019 Damien Goutte-Gattat.  All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Bio.SeqIO support for the SnapGene file format.

The SnapGene binary format is the native format used by the SnapGene program
from GSL Biotech LLC.
"""
from datetime import datetime
from re import sub
from struct import unpack
from xml.dom.minidom import parseString

from Bio.Seq import Seq
from Bio.SeqFeature import SimpleLocation
from Bio.SeqFeature import SeqFeature
from Bio.SeqRecord import SeqRecord

from .Interfaces import SequenceIterator


def _iterate(handle):
    """Yield the successive packets found in a SnapGene file.

    Each packet in the file is a TLV-like record laid out as follows:

      - a single byte giving the type of the packet;
      - a big-endian unsigned integer (4 bytes) giving the length of the
        packet's data;
      - the data itself.

    Every packet is yielded as a ``(packet_type, length, data)`` tuple.
    Iteration stops when no more bytes are available at a packet boundary;
    a ValueError is raised if the input ends in the middle of a packet.
    """

    def read_exactly(size):
        # Anything shorter than the requested size means the input
        # stopped in the middle of a packet
        chunk = handle.read(size)
        if len(chunk) < size:
            raise ValueError("Unexpected end of packet")
        return chunk

    while True:
        tag = handle.read(1)
        if not tag:  # Nothing left to read, no more packet
            return
        (packet_type,) = unpack(">B", tag)
        (length,) = unpack(">I", read_exactly(4))
        data = read_exactly(length)

        yield (packet_type, length, data)


def _parse_dna_packet(length, data, record):
    """Store the contents of a DNA sequence packet into the record.

    The packet's data starts with a one-byte flags field; all remaining
    bytes hold the ASCII-encoded sequence. The lowest bit of the flags
    field tells whether the sequence is circular (set) or linear (unset).

    The sequence is assigned to ``record.seq`` and the "molecule_type" and
    "topology" annotations are filled in. A ValueError is raised if the
    record already holds a sequence, as a file is expected to contain a
    single DNA packet.
    """
    if record.seq:
        raise ValueError("The file contains more than one DNA packet")

    # One flags byte, then (length - 1) bytes of sequence
    flags, raw_sequence = unpack(f">B{length - 1}s", data)
    is_circular = bool(flags & 0x01)

    record.seq = Seq(raw_sequence.decode("ASCII"))
    annotations = record.annotations
    annotations["molecule_type"] = "DNA"
    annotations["topology"] = "circular" if is_circular else "linear"


def _parse_notes_packet(length, data, record):
    """Extract sequence metadata from a 'Notes' packet.

    The packet's data is a UTF-8 encoded XML document. The following child
    elements are looked up and, when they hold a value, copied to the record:

      - 'Type': sets the "data_file_division" annotation to "SYN" when the
        value is "Synthetic", and to "UNC" in all other cases;
      - 'LastModified': parsed as a ``YYYY.MM.DD`` date and stored in the
        "date" annotation;
      - 'AccessionNumber': used as the record's identifier;
      - 'Comments': used as the record's description, its first word being
        used as the record's name (and as the record's identifier when no
        accession number is available).
    """
    notes = parseString(data.decode("UTF-8"))

    is_synthetic = _get_child_value(notes, "Type") == "Synthetic"
    record.annotations["data_file_division"] = "SYN" if is_synthetic else "UNC"

    last_modified = _get_child_value(notes, "LastModified")
    if last_modified:
        record.annotations["date"] = datetime.strptime(last_modified, "%Y.%m.%d")

    accession = _get_child_value(notes, "AccessionNumber")
    if accession:
        record.id = accession

    comments = _get_child_value(notes, "Comments")
    if not comments:
        return

    # The name is whatever precedes the first space of the comments
    record.name = comments.partition(" ")[0]
    record.description = comments
    if not accession:
        record.id = record.name


def _parse_cookie_packet(length, data, record):
    """Parse a SnapGene cookie packet.

    Every SnapGene file starts with a packet of this type. It acts as
    a magic cookie identifying the file as a SnapGene file.

    The packet's data must be exactly 14 bytes long: the 8-byte string
    ``SnapGene`` followed by three big-endian unsigned 16-bit integers.
    Those integers are decoded but not used (presumably a sequence type
    and export/import version numbers -- to be confirmed against the file
    format). The ``length`` and ``record`` arguments are unused; they are
    accepted so that the signature matches the other packet parsers.

    Raises ValueError if the data has an unexpected size or does not start
    with the expected magic string.
    """
    # Check the size up front so that a malformed cookie is reported as a
    # ValueError, like any other malformed input, rather than as a
    # struct.error (8-byte magic string + 3 x 2-byte integers = 14 bytes)
    if len(data) != 14:
        raise ValueError("The file is not a valid SnapGene file")

    cookie, _seq_type, _exp_version, _imp_version = unpack(">8sHHH", data)
    if cookie != b"SnapGene":
        raise ValueError("The file is not a valid SnapGene file")


def _parse_location(rangespec, strand, record, is_primer=False):
    """Convert a SnapGene range specification into a location object.

    Arguments:
     - rangespec - the range as found in the file, made of two integers
       separated by a dash (``start-end``);
     - strand - the strand value passed on to the created location(s);
     - record - the record being built, only used for its length when the
       range wraps around the end of the sequence;
     - is_primer - whether the range comes from a primer binding site,
       whose coordinates are shifted compared to those of other features.

    Returns a SimpleLocation or, when the start of the range lies beyond
    its end, the sum of two SimpleLocation objects: one from the start to
    the end of the sequence, and one from the beginning of the sequence
    to the end of the range.
    """
    first, last = map(int, rangespec.split("-"))

    if is_primer:
        # Primers' coordinates in SnapGene files are shifted by -1 for
        # some reasons: undoing that shift cancels out the conversion of
        # the start position from 1-based coordinates
        start, end = first, last + 1
    else:
        # Account for SnapGene's 1-based coordinates
        start, end = first - 1, last

    if start <= end:
        return SimpleLocation(start, end, strand=strand)

    # Range wrapping the end of the sequence
    before_origin = SimpleLocation(start, len(record), strand=strand)
    after_origin = SimpleLocation(0, end, strand=strand)
    return before_origin + after_origin


def _parse_features_packet(length, data, record):
    """Parse a sequence features packet.

    This packet stores sequence features (except primer binding sites,
    which are in a dedicated Primers packet). The data is a XML string
    starting with a 'Features' root node.

    Every 'Feature' element is converted into a SeqFeature which is
    appended to ``record.features``:

      - the feature type comes from the 'type' attribute ("misc_feature"
        when absent);
      - the location is assembled from the 'Segment' child elements,
        skipping those of type "gap";
      - named segments are listed in a "parts" qualifier, as
        ``index:name`` items separated by semicolons;
      - every 'Q' child element becomes a qualifier, whose values are read
        from its 'V' child elements;
      - the feature name is stored in the "label" qualifier, or in a
        "name" qualifier if an explicit and different label exists.

    Raises ValueError if a feature has no (non-gap) segment, if a segment
    has no range, or if a qualifier has no name.
    """
    xml = parseString(data.decode("UTF-8"))
    for feature_node in xml.getElementsByTagName("Feature"):
        quals = {}

        feature_type = _get_attribute_value(
            feature_node, "type", default="misc_feature"
        )

        # Only a directionality of 2 yields a reverse-strand feature;
        # any other value is treated as forward strand
        directionality = int(
            _get_attribute_value(feature_node, "directionality", default="1")
        )
        strand = -1 if directionality == 2 else +1

        location = None
        subparts = []
        n_parts = 0
        for segment in feature_node.getElementsByTagName("Segment"):
            if _get_attribute_value(segment, "type", "standard") == "gap":
                continue
            # Fail with an explicit error instead of letting a missing
            # range surface as an AttributeError in _parse_location
            rng = _get_attribute_value(
                segment, "range", error="Missing feature segment range"
            )
            n_parts += 1
            next_location = _parse_location(rng, strand, record)
            # Compare against None rather than relying on truthiness, so
            # that the test does not depend on how location objects
            # evaluate in a boolean context
            if location is None:
                location = next_location
            elif strand == -1:
                # Reverse segments order for reverse-strand features
                location = next_location + location
            else:
                location = location + next_location

            segment_name = _get_attribute_value(segment, "name")
            if segment_name:
                subparts.append((n_parts, segment_name))

        if subparts:
            # Add a "parts" qualifiers to represent "named subfeatures"
            if strand == -1:
                # Reverse segment indexes and order for reverse-strand features
                subparts = [
                    (n_parts - index + 1, part_name)
                    for index, part_name in reversed(subparts)
                ]
            quals["parts"] = [
                ";".join(f"{index}:{part_name}" for index, part_name in subparts)
            ]

        if location is None:
            raise ValueError("Missing feature location")

        for qualifier in feature_node.getElementsByTagName("Q"):
            qname = _get_attribute_value(
                qualifier, "name", error="Missing qualifier name"
            )
            qvalues = []
            for value in qualifier.getElementsByTagName("V"):
                # A value is carried by one of three attributes; values
                # with none of them are ignored
                if value.hasAttribute("text"):
                    qvalues.append(_decode(value.attributes["text"].value))
                elif value.hasAttribute("predef"):
                    qvalues.append(_decode(value.attributes["predef"].value))
                elif value.hasAttribute("int"):
                    qvalues.append(int(value.attributes["int"].value))
            quals[qname] = qvalues

        name = _get_attribute_value(feature_node, "name")
        if name:
            if "label" not in quals:
                # No explicit label attribute, use the SnapGene name
                quals["label"] = [name]
            elif name not in quals["label"]:
                # The SnapGene name is different from the label,
                # add a specific attribute to represent it
                quals["name"] = [name]

        record.features.append(
            SeqFeature(location, type=feature_type, qualifiers=quals)
        )


def _parse_primers_packet(length, data, record):
    """Parse a Primers packet.

    A Primers packet is similar to a Features packet but specifically
    stores primer binding features. The data is a XML string starting
    with a 'Primers' root node.

    Every 'BindingSite' element found in a 'Primer' element is converted
    into a "primer_bind" SeqFeature appended to ``record.features``. The
    name of the primer, if any, is stored in the "label" qualifier of each
    of its binding site features. Binding sites flagged as "simplified"
    are skipped when a site with the same location has already been seen
    for the same primer.

    Raises ValueError if a binding site has no location.
    """
    xml = parseString(data.decode("UTF-8"))
    for primer in xml.getElementsByTagName("Primer"):
        name = _get_attribute_value(primer, "name")

        locations = []
        for site in primer.getElementsByTagName("BindingSite"):
            rng = _get_attribute_value(
                site, "location", error="Missing binding site location"
            )
            # A boundStrand of 1 yields a reverse-strand feature; any
            # other value is treated as forward strand
            bound_strand = int(_get_attribute_value(site, "boundStrand", default="0"))
            strand = -1 if bound_strand == 1 else +1

            location = _parse_location(rng, strand, record, is_primer=True)
            simplified = int(_get_attribute_value(site, "simplified", default="0")) == 1
            if simplified and location in locations:
                # Duplicate "simplified" binding site, ignore
                continue

            locations.append(location)
            # Give each feature its own qualifiers dictionary, so that
            # modifying the qualifiers of one binding site does not
            # affect the other binding sites of the same primer
            quals = {"label": [name]} if name else {}
            record.features.append(
                SeqFeature(location, type="primer_bind", qualifiers=quals)
            )


# Dispatch table used by SnapGeneIterator.iterate: it maps the type byte of
# a packet to the function in charge of parsing that packet. Packets whose
# type is not listed here are ignored. The cookie packet (type 0x09) is not
# listed because it is checked separately, as the first packet of the file.
_packet_handlers = {
    0x00: _parse_dna_packet,  # DNA sequence
    0x05: _parse_primers_packet,  # Primer binding sites (XML)
    0x06: _parse_notes_packet,  # Sequence metadata (XML)
    0x0A: _parse_features_packet,  # Sequence features (XML)
}

# Helper functions to process the XML data found in
# some of the packets


def _decode(text):
    """Return the text with anything looking like a HTML tag removed.

    Some values in SnapGene files contain HTML markup; every ``<...>``
    span enclosing at least one character is dropped from the text.
    """
    return sub(pattern="<[^>]+>", repl="", string=text)


def _get_attribute_value(node, name, default=None, error=None):
    """Return the value of an attribute of a XML node, without HTML tags.

    If the node has no such attribute, a ValueError carrying the message
    given in error is raised when such a message is provided; otherwise
    the default value is returned.
    """
    if not node.hasAttribute(name):
        if error:
            raise ValueError(error)
        return default

    return _decode(node.attributes[name].value)


def _get_child_value(node, name, default=None, error=None):
    """Return the text of the first element with the given tag name.

    The element is searched for among the descendants of node. Its text is
    returned, without HTML tags, provided the first child of the element is
    a text node. In all other cases (no such element, empty element, or
    element not starting with text), a ValueError carrying the message
    given in error is raised when such a message is provided; otherwise
    the default value is returned.
    """
    matches = node.getElementsByTagName(name)
    if matches:
        content = matches[0].childNodes
        if content and content[0].nodeType == node.TEXT_NODE:
            return _decode(content[0].data)

    if error:
        raise ValueError(error)
    return default


class SnapGeneIterator(SequenceIterator):
    """Parser for SnapGene files."""

    def __init__(self, source):
        """Parse a SnapGene file and return a SeqRecord object.

        Argument source is a file-like object or a path to a file; it is
        handled in binary mode.

        Note that a SnapGene file can only contain one sequence, so this
        iterator will always return a single record.
        """
        super().__init__(source, mode="b", fmt="SnapGene")

    def parse(self, handle):
        """Start parsing the file, and return a SeqRecord generator."""
        return self.iterate(handle)

    def iterate(self, handle):
        """Iterate over the records in the SnapGene file.

        A single record is yielded, filled from the packets of the file.
        A ValueError is raised if the file is empty, if it does not start
        with a cookie packet, or if it does not provide a sequence.
        """
        record = SeqRecord(None)
        packets = _iterate(handle)

        # The very first packet must be the cookie identifying the format
        try:
            first_type, first_length, first_data = next(packets)
        except StopIteration:
            raise ValueError("Empty file.") from None
        if first_type != 0x09:
            raise ValueError("The file does not start with a SnapGene cookie packet")
        _parse_cookie_packet(first_length, first_data, record)

        # Remaining packets are dispatched on their type; packets of a
        # type without a registered handler are skipped
        for packet_type, length, data in packets:
            if packet_type in _packet_handlers:
                _packet_handlers[packet_type](length, data, record)

        if not record.seq:
            raise ValueError("No DNA packet in file")

        yield record